def test_batch_segment():
    import time

    texts = [
        "挖基坑土方 1.部位:沉箱 2.挖土深度:550mm",
        "零星砌砖 1.LC15陶粒混凝土填充层 2.15厚1:3水泥砂浆保护层 3.钢筋混凝土楼板扫水泥浆一道 4.部位:沉箱",
        "砌块墙 1.砌块品种、规格、强度等级:蒸压加气混凝土砌体 2.墙体类型:内墙 3.砂浆强度等级:预拌水泥砂浆M5.0 4.部位:变形缝",
        "栏板 1.部位:盥洗池 2.混凝土强度等级:C20商品混凝土",
        "现浇构件钢筋 1.钢筋种类、规格:圆钢φ10内",
        "金属(塑钢)门 1.门代号及洞口尺寸:M1(900*2100) 2.门框、扇材质:不锈钢扣板、磨砂玻璃门 3.玻璃品种、厚度:磨砂钢化玻璃8mm",
        "屋面卷材防水 1.卷材品种、规格、厚度:1.5厚合成高分子防水卷材 2.部位:屋面",
        "楼面变形缝 1.嵌缝材料种类:聚乙烯泡沫塑料棒 2.阻火带:不燃材料阻火带 3.标准图集:中南标11ZJ111(5/A-8) 4.50*50*3mm不锈钢角钢 5.0.8mm不锈钢接水槽",
        "保温隔热屋面 1.保温隔热材料品种、规格、厚度:SGK防水型隔热板(B节能型333×333×60mm厚度28mm30kg/m3) 2.结合层厚度、砂浆配合比:25mm1:4干硬性水泥砂浆",
        "保温隔热屋面 1.找坡:30厚(最薄处)1:8憎水性膨胀珍珠岩找坡 2.工程部位:屋面",
    ]

    tokenizer = MyTokenizer()
    t0 = time.time()
    for text in texts:
        tokenizer.segment(text)
    print("[INFO] Segment %d documents took %f seconds" % (len(texts), time.time() - t0))
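# A minimal companion check (a sketch, not part of the original test suite). It assumes
# MyTokenizer.segment returns a whitespace-joined token string, which matches how its
# output is fed to the TF-IDF vectorizers below; adjust if the tokenizer's contract differs.
def test_segment_returns_token_string():
    tokenizer = MyTokenizer()
    segmented = tokenizer.segment("挖基坑土方 1.部位:沉箱 2.挖土深度:550mm")
    assert isinstance(segmented, str)
    assert len(segmented.strip()) > 0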
class NmslibBillSearcher(object):
    """Approximate nearest-neighbour bill search backed by an nmslib HNSW index."""

    def __init__(self) -> None:
        self._db_df = pd.read_csv(BILL_DATA_FILEPATH)
        self._db_vects = joblib.load(DATABASE_VECTORS_FILEPATH).toarray().astype('float32')
        self._texts_df = self._generate_text_dataframe()
        self._tokenizer = MyTokenizer()
        self._vectorizer = joblib.load(T2_VECTORIZER_FILEPATH)
        self._ordinal_2_id = joblib.load(ORDINAL_2_ID_DICT_FILEPATH)
        self._d = self._db_vects.shape[1]

        # Build the HNSW index over the dense database vectors.
        self._index = nmslib.init(method="hnsw", space="l2",
                                  data_type=nmslib.DataType.DENSE_VECTOR)
        self._index.addDataPointBatch(self._db_vects)
        self._index.createIndex(INDEX_TIME_PARAMS)
        # efSearch is set to the vector dimensionality.
        self._index.setQueryTimeParams({"efSearch": self._d})

    def _generate_text_dataframe(self) -> pd.DataFrame:
        """Concatenate name, description and unit into a single `bill_text` column."""
        feature_cols = ['bill_name', 'bill_desc', 'unit']
        texts_df = self._db_df.copy()
        texts_df['bill_text'] = texts_df[feature_cols[0]].str.cat(
            texts_df[feature_cols[1:]], sep=' ')
        texts_df.drop(columns=feature_cols, inplace=True)
        return texts_df

    def find_k_nearest_bills(self, query_texts: List[str], k: int = 5,
                             num_threads: int = 4) -> List[pd.DataFrame]:
        text_segmented = [self._tokenizer.segment(text) for text in query_texts]
        query_vects = self._vectorizer.transform(text_segmented).toarray().astype('float32')
        nbrs = self._index.knnQueryBatch(query_vects, k=k, num_threads=num_threads)

        results = []
        for i, text in enumerate(query_texts):
            ordinals, distances = nbrs[i]
            distances = list(distances)
            ids = [self._ordinal_2_id[ordinal] for ordinal in ordinals]

            k_nearest_bills = pd.DataFrame()
            # If the query matches a database record exactly, put that record first with distance 0.
            if text in self._texts_df.bill_text.unique():
                bill_id = int(self._db_df.loc[self._texts_df.bill_text == text].bill_id.iloc[0])
                distances = [0] + distances
                k_nearest_bills = pd.concat(
                    [k_nearest_bills, self._db_df.loc[self._db_df.bill_id == bill_id]], axis=0)

            for _id in ids:
                k_nearest_bills = pd.concat(
                    [k_nearest_bills, self._db_df.loc[self._db_df.bill_id == _id]], axis=0)

            k_nearest_bills['distance'] = distances
            k_nearest_bills.drop_duplicates(['bill_name', 'bill_desc', 'unit'],
                                            keep='first', inplace=True)
            k_nearest_bills = k_nearest_bills.iloc[:k]
            assert len(k_nearest_bills) == k
            results.append(k_nearest_bills)
        return results

    @property
    def d(self):
        return self._d
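# Usage sketch (illustrative, not part of the original module): query the HNSW index for
# the k closest bills to two free-text queries. It assumes the CSV / joblib artifacts
# behind the *_FILEPATH constants exist on disk; the query strings are reused from the
# test fixtures above.
def demo_nmslib_search():
    searcher = NmslibBillSearcher()
    queries = [
        "栏板 1.部位:盥洗池 2.混凝土强度等级:C20商品混凝土",
        "现浇构件钢筋 1.钢筋种类、规格:圆钢φ10内",
    ]
    for query, bills in zip(queries, searcher.find_k_nearest_bills(queries, k=5)):
        print("[QUERY]", query)
        print(bills[['bill_id', 'distance']])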
class BillClassifier(object):
    """Predict the bill type for raw bill texts using a pretrained T1 model."""

    def __init__(self):
        self._tokenizer = MyTokenizer()
        self._vectorizer = joblib.load(T1_VECTORIZER_FILEPATH)
        self._model = joblib.load(T1_MODEL_FILEPATH)
        self._label_2_type = joblib.load(LABEL_2_TYPE_DICT_FILEPATH)

    def _classify(self, texts: List[str]) -> List[int]:
        texts_segmented = [self._tokenizer.segment(text) for text in texts]
        return list(
            self._model.predict(self._vectorizer.transform(texts_segmented)))

    def classify_bill(self, texts: List[str]) -> List[str]:
        labels = self._classify(texts)
        return [self._label_2_type[label] for label in labels]
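# Usage sketch (illustrative): map raw bill texts to their predicted bill types. Assumes
# the T1 vectorizer/model and the label-to-type dictionary are available at the configured
# file paths; the sample texts are reused from the fixtures above.
def demo_classify_bills():
    classifier = BillClassifier()
    texts = [
        "屋面卷材防水 1.卷材品种、规格、厚度:1.5厚合成高分子防水卷材 2.部位:屋面",
        "砌块墙 1.砌块品种、规格、强度等级:蒸压加气混凝土砌体 2.墙体类型:内墙 3.砂浆强度等级:预拌水泥砂浆M5.0 4.部位:变形缝",
    ]
    for text, bill_type in zip(texts, classifier.classify_bill(texts)):
        print("[INFO] %s -> %s" % (text, bill_type))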
class FaissBillSearcher(object):
    """Exact nearest-neighbour bill search backed by a faiss IndexFlatL2 index."""

    def __init__(self) -> None:
        self._db_df = pd.read_csv(BILL_DATA_FILEPATH)
        self._db_vects = joblib.load(DATABASE_VECTORS_FILEPATH).toarray().astype('float32')
        self._texts_df = self._generate_text_dataframe()
        self._tokenizer = MyTokenizer()
        self._vectorizer = joblib.load(T2_VECTORIZER_FILEPATH)
        self._ordinal_2_id = joblib.load(ORDINAL_2_ID_DICT_FILEPATH)
        self._d = self._db_vects.shape[1]

        # Flat (brute-force) L2 index over the dense database vectors.
        self._index = faiss.IndexFlatL2(self._d)
        self._index.add(self._db_vects)

    def _generate_text_dataframe(self) -> pd.DataFrame:
        """Concatenate name, description and unit into a single `bill_text` column."""
        feature_cols = ['bill_name', 'bill_desc', 'unit']
        texts_df = self._db_df.copy()
        texts_df['bill_text'] = texts_df[feature_cols[0]].str.cat(
            texts_df[feature_cols[1:]], sep=' ')
        texts_df.drop(columns=feature_cols, inplace=True)
        return texts_df

    def _find_k_nearest_indexes(self, query_texts: List[str],
                                k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
        text_segmented = [self._tokenizer.segment(text) for text in query_texts]
        query_vects = self._vectorizer.transform(text_segmented).toarray().astype('float32')
        D, I = self._index.search(query_vects, k)
        return D, I

    def find_k_nearest_texts(self, query_texts: List[str],
                             k: int = 5) -> List[List[tuple]]:
        _, I = self._find_k_nearest_indexes(query_texts, k)
        ans = []
        for text, ordinals in zip(query_texts, I):
            ids = [self._ordinal_2_id[ordinal] for ordinal in ordinals]
            k_nearest_texts = []
            for _id in ids:
                record = tuple(self._db_df.loc[self._db_df.bill_id == _id].values.ravel())
                k_nearest_texts.append(record)
            ans.append(k_nearest_texts)
        return ans

    def find_k_nearest_bills(self, query_texts: List[str],
                             k: int = 5) -> List[pd.DataFrame]:
        D, I = self._find_k_nearest_indexes(query_texts, k)
        results = []
        for i, text in enumerate(query_texts):
            ordinals, distances = I[i], list(D[i])
            ids = [self._ordinal_2_id[ordinal] for ordinal in ordinals]

            k_nearest_bills = pd.DataFrame()
            # If the query matches a database record exactly, put that record first with distance 0.
            if text in self._texts_df.bill_text.unique():
                bill_id = int(self._db_df.loc[self._texts_df.bill_text == text].bill_id.iloc[0])
                distances = [0] + distances
                k_nearest_bills = pd.concat(
                    [k_nearest_bills, self._db_df.loc[self._db_df.bill_id == bill_id]], axis=0)

            for _id in ids:
                k_nearest_bills = pd.concat(
                    [k_nearest_bills, self._db_df.loc[self._db_df.bill_id == _id]], axis=0)

            k_nearest_bills['distance'] = distances
            k_nearest_bills.drop_duplicates(['bill_name', 'bill_desc', 'unit'],
                                            keep='first', inplace=True)
            k_nearest_bills = k_nearest_bills.iloc[:k]
            assert len(k_nearest_bills) == k
            results.append(k_nearest_bills)
        return results

    @property
    def d(self):
        return self._d
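# Cross-check sketch (illustrative): both searchers expose the same find_k_nearest_bills
# interface, so the exact IndexFlatL2 results can serve as ground truth for the approximate
# HNSW index. Assumes both searchers load their artifacts from the same *_FILEPATH constants.
def demo_compare_searchers(queries: List[str], k: int = 5) -> None:
    exact = FaissBillSearcher()
    approx = NmslibBillSearcher()
    for query, exact_df, approx_df in zip(queries,
                                          exact.find_k_nearest_bills(queries, k),
                                          approx.find_k_nearest_bills(queries, k)):
        overlap = len(set(exact_df.bill_id) & set(approx_df.bill_id))
        print("[INFO] query=%r recall@%d=%.2f" % (query, k, overlap / k))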