def data_process(self, data, model_data_file):
    '''
    @description: Check whether each inquiry contains a business keyword;
                  label it 1 if so, otherwise 0, then write it out in the
                  format fastText expects:
                  "__label__" + label + "\t" + content + "\n"
    @param {type}
    data: DataFrame with a 'custom' column holding customer inquiries
    model_data_file: path where the model training data is saved
    @return:
    '''
    logging.info('processing data: %s.' % model_data_file)
    data['custom'] = data['custom'].fillna('')
    examples = []
    for sentence in data['custom'].values:
        if sentence == '':
            # Skip empty inquiries; otherwise `label` would carry over
            # from the previous iteration (or be undefined on the first).
            continue
        label = 1 if any(kw in sentence for kw in self.keywords) else 0
        examples.append('\t'.join(['__label__%s' % label, clean(sentence)]))

    with open(model_data_file, 'w', encoding='utf-8') as f:
        for text in examples:
            f.write(text + '\n')

    logging.info('Processing data, finished!')
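
# A minimal sketch (not from the original source) of how the file written by
# data_process would feed fastText's supervised trainer; the paths and
# hyperparameters below are assumptions.
import fasttext

ft_model = fasttext.train_supervised(
    input='data/model_train.txt',   # file produced by data_process
    lr=0.5, epoch=10, wordNgrams=2)
ft_model.save_model('model/fasttext.bin')
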
def read_data(file_path):
    '''
    @description: Read the raw data and clean it.
    @param {type}
    file_path: path to the data file
    @return: DataFrame of training samples with a cleaned 'clean' column.
    '''
    data = read_file(file_path, is_train=True)
    data = pd.DataFrame(data, columns=['session_id', 'role', 'content'])
    data['clean'] = data['content'].apply(lambda x: clean(x))
    return data
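
# `read_file` is assumed to parse the raw chat log into (session_id, role,
# content) rows. A hypothetical stand-in for a tab-separated log, so the
# function above can be run in isolation:
def read_file(file_path, is_train=False):
    rows = []
    with open(file_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) == 3:     # keep only well-formed rows
                rows.append(parts)
    return rows
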
def predict(self, text):
    '''
    @description: Predict the label of a piece of text.
    @param {type}
    text: input text
    @return: label, score
    '''
    logging.info('Predicting.')
    clean_text = clean(filter_content(text))
    logging.info('text: %s' % text)
    logging.info('clean text: %s' % clean_text)
    start_time = time.time()
    label, score = self.fast.predict(clean_text)
    logging.info('used time: {:.4f}s'.format(time.time() - start_time))
    return label, score
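
# fastText's predict returns a tuple of labels plus an array of
# probabilities, e.g. (('__label__1',), array([0.97])). A usage sketch with
# the raw fastText model; the model path is an assumption:
import fasttext

ft = fasttext.load_model('model/fasttext.bin')
labels, probs = ft.predict('物流 几天 能 到')
print(labels[0], float(probs[0]))
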
def search(self, text, k=5):
    '''
    @description: Retrieve the k nearest neighbours from the HNSW index.
    @param {type}
    text: query sentence
    k: number of results to return
    @return: DataFrame containing the customer input, assistance response
             and the distance to the query.
    '''
    test_vec = wam(clean(text), self.w2v_model)
    # faiss expects a float32 matrix of shape (n_queries, dim)
    test_vec = test_vec.reshape(1, -1).astype('float32')

    D, I = self.index.search(test_vec, k)
    logging.info("D: {}".format(D))
    logging.info("I: {}".format(I))

    return pd.concat(
        (self.data.iloc[I[0]]['custom'].reset_index(),
         self.data.iloc[I[0]]['assistance'].reset_index(drop=True),
         pd.DataFrame(D.reshape(-1, 1), columns=['q_distance'])),
        axis=1)
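
# A minimal sketch of the faiss HNSW index that `self.index` is assumed to
# be; the M and efSearch values here are illustrative, not the original
# configuration.
import faiss
import numpy as np

def build_hnsw(vecs, m=64, ef=64):
    dim = vecs.shape[1]
    index = faiss.IndexHNSWFlat(dim, m)   # HNSW graph over raw vectors
    index.hnsw.efConstruction = ef        # build-time candidate list size
    index.hnsw.efSearch = ef              # query-time candidate list size
    index.add(np.ascontiguousarray(vecs, dtype='float32'))
    return index
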
def load_data(self, data_path):
    '''
    @description: Read the data and build sentence vectors.
    @param {type}
    data_path: path to the question-answer pair data
    @return: DataFrame with a sentence-vector column.
    '''
    pkl_path = data_path.replace('.csv', '_for_hnsw.pkl')
    if os.path.exists(pkl_path):
        logging.info("Reading data from %s" % pkl_path)
        data = pd.read_pickle(pkl_path)
        logging.info("data: %s" % data.head(5))
    else:
        logging.info("Reading data from %s" % data_path)
        data = pd.read_csv(data_path, header=0)
        # progress_apply requires tqdm.pandas() to have been called
        data['custom_vec'] = data['custom'].progress_apply(
            lambda s: wam(clean(s), self.w2v_model))
        data = data.dropna()
        logging.info("data: %s" % data.head(5))
        data.to_pickle(pkl_path)
    return data
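
# `wam` ("word average model") is assumed to average the word2vec vectors of
# a sentence's tokens; a minimal sketch under that assumption, using jieba
# for tokenisation:
import jieba
import numpy as np

def wam(sentence, w2v_model):
    tokens = jieba.lcut(sentence)
    vecs = [w2v_model.wv[t] for t in tokens if t in w2v_model.wv]
    if not vecs:    # no known token: fall back to a zero vector
        return np.zeros(w2v_model.vector_size, dtype='float32')
    return np.mean(vecs, axis=0)
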
# %% [markdown]
# ## Final Preprocessed Data

# %%
train_x[0]


# %%
X_train[0]

# %% [markdown]
# ### Clean the data

# %%
%load_ext autoreload
%autoreload 2
from utils.preprocessing import clean

# `clean` operates on a single string (see read_data above), so map it over
# each sample rather than passing the whole split:
clean_train_x = [clean(x) for x in train_x]
clean_valid_x = [clean(x) for x in valid_x]
clean_test_x = [clean(x) for x in test_x]

# %% [markdown]
# ### Stopwords

# %%
%load_ext autoreload
%autoreload 2
from utils.preprocessing import filter_stop_words
filtered_train_x = filter_stop_words(clean_train_x)
filtered_valid_x = filter_stop_words(clean_valid_x)
filtered_test_x = filter_stop_words(clean_test_x)

# %%
# Tokenizer is assumed to be the Keras text tokenizer; `max_words` caps the
# vocabulary size.
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=max_words)
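
# %%
# A sketch of the usual next steps (fitting the tokenizer, converting the
# filtered texts to integer sequences, and padding); `max_len` is an assumed
# hyperparameter, not from the original notebook.
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer.fit_on_texts(filtered_train_x)
train_seq = tokenizer.texts_to_sequences(filtered_train_x)
X_train_pad = pad_sequences(train_seq, maxlen=max_len, padding='post')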