Example #1
def test_batch_segment():
    """Rough timing check: segment a small batch of bill texts and report the elapsed time."""
    import time
    texts = [
        "挖基坑土方 1.部位:沉箱 2.挖土深度:550mm",
        "零星砌砖 1.LC15陶粒混凝土填充层 2.15厚1:3水泥砂浆保护层 3.钢筋混凝土楼板扫水泥浆一道 4.部位:沉箱",
        "砌块墙 1.砌块品种、规格、强度等级:蒸压加气混凝土砌体 2.墙体类型:内墙 3.砂浆强度等级:预拌水泥砂浆M5.0 4.部位:变形缝",
        "栏板 1.部位:盥洗池 2.混凝土强度等级:C20商品混凝土",
        "现浇构件钢筋 1.钢筋种类、规格:圆钢φ10内",
        "金属(塑钢)门 1.门代号及洞口尺寸:M1(900*2100) 2.门框、扇材质:不锈钢扣板、磨砂玻璃门 3.玻璃品种、厚度:磨砂钢化玻璃8mm",
        "屋面卷材防水 1.卷材品种、规格、厚度:1.5厚合成高分子防水卷材 2.部位:屋面",
        "楼面变形缝 1.嵌缝材料种类:聚乙烯泡沫塑料棒 2.阻火带:不燃材料阻火带 3.标准图集:中南标11ZJ111(5/A-8) 4.50*50*3mm不锈钢角钢 5.0.8mm不锈钢接水槽",
        "保温隔热屋面 1.保温隔热材料品种、规格、厚度:SGK防水型隔热板(B节能型333×333×60mm厚度28mm30kg/m3) 2.结合层厚度、砂浆配合比:25mm1:4干硬性水泥砂浆",
        "保温隔热屋面 1.找坡:30厚(最薄处)1:8憎水性膨胀珍珠岩找坡 2.工程部位:屋面"
    ]
    tokenizer = MyTokenizer()
    t0 = time.time()
    for text in texts:
        tokenizer.segment(text)
    print("[INFO] Segment %d documents took %f seconds" % (len(texts), time.time() - t0))
Example #2
import joblib
import nmslib
import pandas as pd
from typing import List

# Filepath constants (BILL_DATA_FILEPATH, T2_VECTORIZER_FILEPATH, ...) and
# INDEX_TIME_PARAMS are assumed to be defined elsewhere in the project, as is
# MyTokenizer.

class NmslibBillSearcher(object):
    def __init__(self) -> None:
        # Pre-computed artifacts: the bill database, its dense text vectors,
        # the fitted vectorizer, and the row-ordinal -> bill_id mapping.
        self._db_df = pd.read_csv(BILL_DATA_FILEPATH)
        self._db_vects = joblib.load(DATABASE_VECTORS_FILEPATH).toarray().astype('float32')
        self._texts_df = self._generate_text_dataframe()
        self._tokenizer = MyTokenizer()
        self._vectorizer = joblib.load(T2_VECTORIZER_FILEPATH)
        self._ordinal_2_id = joblib.load(ORDINAL_2_ID_DICT_FILEPATH)
        self._d = self._db_vects.shape[1]
        # Build an approximate HNSW index over the vectors, using L2 distance.
        self._index = nmslib.init(method="hnsw", space="l2", data_type=nmslib.DataType.DENSE_VECTOR)
        self._index.addDataPointBatch(self._db_vects)
        self._index.createIndex(INDEX_TIME_PARAMS)
        self._index.setQueryTimeParams({"efSearch": self._d})

    def _generate_text_dataframe(self) -> pd.DataFrame:
        # Concatenate name, description, and unit into one searchable string per bill.
        feature_cols = ['bill_name', 'bill_desc', 'unit']
        texts_df = self._db_df.copy()
        texts_df['bill_text'] = texts_df[feature_cols[0]].str.cat(
            texts_df[feature_cols[1:]], sep=' '
        )
        texts_df.drop(columns=feature_cols, inplace=True)
        return texts_df

    def find_k_nearest_bills(self, query_texts: List[str], k: int = 5, num_threads: int = 4) -> List[pd.DataFrame]:
        text_segmented = [self._tokenizer.segment(text) for text in query_texts]
        query_vects = self._vectorizer.transform(text_segmented).toarray().astype('float32')
        nbrs = self._index.knnQueryBatch(query_vects, k=k, num_threads=num_threads)
        results = []
        for i, text in enumerate(query_texts):
            ordinals, distances = nbrs[i]
            distances = list(distances)
            ids = [self._ordinal_2_id[ordinal] for ordinal in ordinals]
            k_nearest_bills = pd.DataFrame()
            if text in self._texts_df.bill_text.unique():
                # Exact match in the database: prepend it with distance 0.
                bill_id = int(self._db_df.loc[self._texts_df.bill_text == text, 'bill_id'].iloc[0])
                distances = [0] + distances
                k_nearest_bills = pd.concat([k_nearest_bills, self._db_df.loc[self._db_df.bill_id == bill_id]], axis=0)
            for _id in ids:
                k_nearest_bills = pd.concat([k_nearest_bills, self._db_df.loc[self._db_df.bill_id == _id]], axis=0)
            k_nearest_bills['distance'] = distances
            # Deduplicate bills sharing name/description/unit (keeping the
            # closest occurrence), then trim back down to k rows.
            k_nearest_bills.drop_duplicates(['bill_name', 'bill_desc', 'unit'], keep='first', inplace=True)
            k_nearest_bills = k_nearest_bills.iloc[:k]
            assert len(k_nearest_bills) == k
            results.append(k_nearest_bills)
        return results

    @property
    def d(self):
        return self._d
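
A minimal usage sketch for NmslibBillSearcher, assuming the serialized artifacts behind the filepath constants exist; the query string is reused from the segmentation test above:

searcher = NmslibBillSearcher()
queries = ["挖基坑土方 1.部位:沉箱 2.挖土深度:550mm"]
for result in searcher.find_k_nearest_bills(queries, k=5):
    # Each result is a DataFrame of k candidate bills with a 'distance' column.
    print(result[['bill_id', 'distance']])
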
Example #3
import joblib
from typing import List

class BillClassifier(object):
    def __init__(self):
        self._tokenizer = MyTokenizer()
        self._vectorizer = joblib.load(T1_VECTORIZER_FILEPATH)
        self._model = joblib.load(T1_MODEL_FILEPATH)
        self._label_2_type = joblib.load(LABEL_2_TYPE_DICT_FILEPATH)

    def _classify(self, texts: List[str]) -> List[int]:
        # Segment, vectorize, then predict integer labels.
        texts_segmented = [self._tokenizer.segment(text) for text in texts]
        return list(
            self._model.predict(self._vectorizer.transform(texts_segmented)))

    def classify_bill(self, texts: List[str]) -> List[str]:
        # Map predicted labels back to human-readable bill types.
        labels = self._classify(texts)
        return [self._label_2_type[label] for label in labels]
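
A usage sketch for BillClassifier under the same assumption that the pickled vectorizer, model, and label map are available; the input text is again taken from the segmentation test:

clf = BillClassifier()
bill_types = clf.classify_bill(["栏板 1.部位:盥洗池 2.混凝土强度等级:C20商品混凝土"])
print(bill_types)  # one type string per input; values depend on LABEL_2_TYPE_DICT_FILEPATH
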
Example #4
import faiss
import joblib
import numpy as np
import pandas as pd
from typing import List, Tuple

class FaissBillSearcher(object):
    def __init__(self) -> None:
        # Same pre-computed artifacts as NmslibBillSearcher above.
        self._db_df = pd.read_csv(BILL_DATA_FILEPATH)
        self._db_vects = joblib.load(DATABASE_VECTORS_FILEPATH).toarray().astype('float32')
        self._texts_df = self._generate_text_dataframe()
        self._tokenizer = MyTokenizer()
        self._vectorizer = joblib.load(T2_VECTORIZER_FILEPATH)
        self._ordinal_2_id = joblib.load(ORDINAL_2_ID_DICT_FILEPATH)
        self._d = self._db_vects.shape[1]
        # Exact (brute-force) L2 index over the database vectors.
        self._index = faiss.IndexFlatL2(self._d)
        self._index.add(self._db_vects)

    def _generate_text_dataframe(self) -> pd.DataFrame:
        feature_cols = ['bill_name', 'bill_desc', 'unit']
        texts_df = self._db_df.copy()
        texts_df['bill_text'] = texts_df[feature_cols[0]].str.cat(
            texts_df[feature_cols[1:]], sep=' '
        )
        texts_df.drop(columns=feature_cols, inplace=True)
        return texts_df

    def _find_k_nearest_indexes(self, query_texts: List[str], k: int = 5) -> Tuple[np.ndarray, np.ndarray]:
        text_segmented = [self._tokenizer.segment(text) for text in query_texts]
        query_vects = self._vectorizer.transform(text_segmented).toarray().astype('float32')
        # D holds the (squared) L2 distances, I the matching row ordinals.
        D, I = self._index.search(query_vects, k)
        return D, I

    def find_k_nearest_texts(self, query_texts: List[str], k: int = 5) -> List[List[tuple]]:
        _, I = self._find_k_nearest_indexes(query_texts, k)
        ans = []
        for text, ordinals in zip(query_texts, I):
            ids = [self._ordinal_2_id[ordinal] for ordinal in ordinals]
            k_nearest_texts = []
            for _id in ids:
                record = tuple(self._db_df.loc[self._db_df.bill_id == _id].values.ravel())
                k_nearest_texts.append(record)
            ans.append(k_nearest_texts)
        return ans

    def find_k_nearest_bills(self, query_texts: List[str], k: int = 5) -> List[pd.DataFrame]:
        D, I = self._find_k_nearest_indexes(query_texts, k)
        results = []
        for i, text in enumerate(query_texts):
            ordinals, distances = I[i], list(D[i])
            ids = [self._ordinal_2_id[ordinal] for ordinal in ordinals]
            k_nearest_bills = pd.DataFrame()
            if text in self._texts_df.bill_text.unique():
                # Exact match in the database: prepend it with distance 0.
                bill_id = int(self._db_df.loc[self._texts_df.bill_text == text, 'bill_id'].iloc[0])
                distances = [0] + distances
                k_nearest_bills = pd.concat(
                    [k_nearest_bills, self._db_df.loc[self._db_df.bill_id == bill_id]],
                    axis=0
                )
            for _id in ids:
                k_nearest_bills = pd.concat(
                    [k_nearest_bills, self._db_df.loc[self._db_df.bill_id == _id]],
                    axis=0
                )
            k_nearest_bills['distance'] = distances
            # Deduplicate bills sharing name/description/unit (keeping the
            # closest occurrence), then trim back down to k rows.
            k_nearest_bills.drop_duplicates(['bill_name', 'bill_desc', 'unit'], keep='first', inplace=True)
            k_nearest_bills = k_nearest_bills.iloc[:k]
            assert len(k_nearest_bills) == k
            results.append(k_nearest_bills)
        return results

    @property
    def d(self):
        return self._d
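
Because IndexFlatL2 is a brute-force exact index, FaissBillSearcher can serve as ground truth when tuning the approximate HNSW searcher above. A hypothetical recall check, assuming both classes can load their artifacts:

exact = FaissBillSearcher()
approx = NmslibBillSearcher()
query = ["屋面卷材防水 1.卷材品种、规格、厚度:1.5厚合成高分子防水卷材 2.部位:屋面"]
exact_ids = set(exact.find_k_nearest_bills(query, k=5)[0].bill_id)
approx_ids = set(approx.find_k_nearest_bills(query, k=5)[0].bill_id)
print("recall@5 =", len(exact_ids & approx_ids) / 5)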