def _get_w2v(self, data, col, size=300):
    """Write one averaged word vector per row of ``data[col]`` to a cache CSV.

    For each cell: tokenize with jieba, clean the tokens, look each token up
    in ``self.w2v_model``, average the found vectors and unit-normalize the
    mean.  Rows whose cell is the string ``'null'``, falsy, or has no token
    in the vocabulary are written as ``size`` empty fields.

    :param data: object whose ``data[col].items()`` yields (index, text)
                 pairs (presumably a pandas DataFrame — TODO confirm).
    :param col: column name; also used in the cache file name and header.
    :param size: number of header columns written.
                 NOTE(review): the mean vector's length is the w2v model's
                 dimensionality, which is assumed to equal ``size`` — confirm.
    """
    file_name = './cache/{col}_w2v_df.csv'.format(col=col)
    columns = ['{}_w2v_{}'.format(col, i) for i in range(size)]
    # 'w' (not 'a'): appending to a pre-existing cache file would duplicate
    # the header and stale rows on a second run, corrupting the cache.
    with open(file_name, 'w', encoding='utf-8') as f:
        # write columns
        f.write(','.join(columns) + '\n')
        for idx, item in data[col].items():
            if item == 'null' or not item:
                # Missing / sentinel text -> empty row.
                item_list = [''] * size
            else:
                # Tokenize, then clean the token list.
                seg_cut = jieba.lcut(item)
                seg_cut = char_list_cheaner(seg_cut)
                w2v_array = list()
                for word in seg_cut:
                    try:
                        w2v_array.append(self.w2v_model[word])
                    except KeyError:
                        # Out-of-vocabulary token: skip it.
                        pass
                if not w2v_array:
                    item_list = [''] * size
                else:
                    # Unit-normalized mean of the token vectors.
                    item_list = matutils.unitvec(
                        np.array(w2v_array, dtype=np.float32).mean(axis=0))
            f.write(','.join(map(str, item_list)) + '\n')
def __iter__(self):
    """Yield one cleaned jieba token list per sentence read from self.fname.

    Sentences whose token list is empty after cleaning are skipped.
    """
    for sentence in get_sentence(self.fname):
        tokens = char_list_cheaner(list(jieba.cut(sentence)))
        if tokens:
            yield tokens
def _get_prefix_w2v(item):
    """Return the w2v ``n_similarity`` between the tokenized prefix and title.

    :param item: mapping with at least the keys ``"prefix"`` and ``"title"``.
    :return: similarity float, or ``None`` when either token list has no
             in-vocabulary word (KeyError) or is empty (ZeroDivisionError).
    """
    prefix = item["prefix"]
    title = item["title"]
    # Missing values presumably arrive as non-str (e.g. pandas NaN floats) —
    # normalize to the "null" sentinel.  The original only guarded prefix;
    # a non-str title would crash jieba.cut with an AttributeError, so guard
    # it symmetrically.
    if not isinstance(prefix, str):
        prefix = "null"
    if not isinstance(title, str):
        title = "null"
    prefix_cut = char_list_cheaner(list(jieba.cut(prefix)))
    title_cut = char_list_cheaner(list(jieba.cut(title)))
    try:
        w2v_similar = w2v_model.n_similarity(prefix_cut, title_cut)
    except (KeyError, ZeroDivisionError):
        # OOV tokens or an empty token list: no similarity defined.
        w2v_similar = None
    return w2v_similar
def _get_w2v_similar(item):
    """Compute max / mean / probability-weighted w2v similarity between the
    title and every predicted query.

    :param item: mapping with ``"query_prediction"`` (query -> weight) and
                 ``"title"`` keys.
    :return: dict with ``max_similar``, ``mean_similar``, ``weight_similar``
             (all ``None`` when there are no predicted queries).  Per-query
             failures contribute NaN, which nanmax/nanmean/nansum ignore.
    """
    result = dict()
    query_predict = item["query_prediction"]
    title = item["title"]

    # No predictions at all: nothing to aggregate.
    if not query_predict:
        result["max_similar"] = None
        result["mean_similar"] = None
        result["weight_similar"] = None
        return result

    title_cut = char_list_cheaner(list(jieba.cut(title)))

    similars = list()
    weighted = list()
    for query, prob in query_predict.items():
        query_cut = char_list_cheaner(list(jieba.cut(query)))
        try:
            sim = w2v_model.n_similarity(query_cut, title_cut)
        except (KeyError, ZeroDivisionError):
            # OOV tokens or empty token list: mark as NaN, not an error.
            sim = np.nan
        similars.append(sim)
        # Weights presumably are probability strings — TODO confirm.
        weighted.append(sim * float(prob))

    result["max_similar"] = np.nanmax(similars)
    result["mean_similar"] = np.nanmean(similars)
    result["weight_similar"] = np.nansum(weighted)
    return result
def _get_jieba_array(words, size=500):
    """Tokenize ``words`` with jieba and return the unit-normalized mean of
    the tokens' word vectors.

    :param words: raw text to tokenize.
    :param size: length of the ``[None] * size`` placeholder returned when no
                 token is in the vocabulary.
                 NOTE(review): the real vector's length is the model's
                 dimensionality, assumed to match ``size`` — confirm.
    """
    tokens = char_list_cheaner(jieba.lcut(words))
    vectors = list()
    for token in tokens:
        try:
            vectors.append(w2v_model[token])
        except KeyError:
            # Out-of-vocabulary token: skip it.
            continue
    if not vectors:
        return [None] * size
    return matutils.unitvec(np.array(vectors).mean(axis=0))
def _to_csv(df, col, size):
    """Stream one averaged word vector per row of ``df[col]`` to
    ``<col>_w2v.csv`` under ``TEMP_DATA_PATH``.

    Rows whose cell is ``'null'``, falsy, or fully out-of-vocabulary are
    written as ``size`` empty fields and their indices collected.

    :return: set of indices that produced an empty row.
    """
    file_path = os.path.join(TEMP_DATA_PATH,
                             '{col}_w2v.csv'.format(col=col))
    # Start from a clean file so the header is written exactly once.
    if os.path.exists(file_path):
        os.remove(file_path)
    header = ['{}_w2v_{}'.format(col, i) for i in range(size)]
    empty_rows = set()
    with open(file_path, 'a', encoding='utf-8') as out:
        # write columns
        out.write(','.join(header) + '\n')
        for idx, text in tqdm(df[col].items()):
            row = None
            if text != 'null' and text:
                vectors = list()
                for token in char_list_cheaner(jieba.lcut(text)):
                    try:
                        vectors.append(w2v_model[token])
                    except KeyError:
                        # Out-of-vocabulary token: skip it.
                        pass
                if vectors:
                    # Unit-normalized mean of the token vectors.
                    row = matutils.unitvec(np.array(vectors).mean(axis=0))
            if row is None:
                row = [''] * size
                empty_rows.add(idx)
            out.write(','.join(map(str, row)) + '\n')
    return empty_rows
def _get_jieba_array(self, words, size=300):
    """Tokenize ``words`` with jieba, look each token up in
    ``self.w2v_model``, and return the unit-normalized mean vector.

    :param words: raw text to tokenize.
    :param size: length of the ``[None] * size`` placeholder returned when no
                 token is in the vocabulary.
                 NOTE(review): the real vector's length is the model's
                 dimensionality, assumed to match ``size`` — confirm.
    """
    vectors = list()
    for token in char_list_cheaner(jieba.lcut(words)):
        try:
            vectors.append(self.w2v_model[token])
        except KeyError:
            # Out-of-vocabulary token: skip it.
            continue
    if not vectors:
        return [None] * size
    return matutils.unitvec(np.array(vectors).mean(axis=0))