Example #1
    def _get_w2v(self, data, col, size=300):
        '''
        Get the word vectors for column `col` of `data` and write them to a cache CSV.
        '''
        file_name = './cache/{col}_w2v_df.csv'.format(col=col)
        columns = ['{}_w2v_{}'.format(col, i) for i in range(size)]

        with open(file_name, 'w', encoding='utf-8') as f:
            # write the header row; 'w' avoids appending to a stale cache file
            f.write(','.join(columns) + '\n')
            for idx, item in data[col].items():
                if item == 'null' or not item:
                    item_list = [''] * size
                else:
                    # clean the tokens after jieba word segmentation
                    seg_cut = jieba.lcut(item)
                    seg_cut = char_list_cheaner(seg_cut)

                    w2v_array = list()
                    for word in seg_cut:
                        try:
                            word_vec = self.w2v_model[word]
                            w2v_array.append(word_vec)
                        except KeyError:
                            # skip out-of-vocabulary words
                            pass

                    if not w2v_array:
                        item_list = [''] * size
                    else:
                        # take the mean of the word vectors (unit-normalized)
                        item_list = matutils.unitvec(
                            np.array(w2v_array, dtype=np.float32).mean(axis=0))
                f.write(','.join(map(str, item_list)) + '\n')
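
A hedged usage sketch for this method, assuming a `./cache` directory exists and the instance holds a loaded `w2v_model`. The instance name `extractor`, the DataFrame `df`, and `import pandas as pd` are assumptions; the cache path comes from the code above:

    extractor._get_w2v(df, col='title', size=300)
    title_w2v = pd.read_csv('./cache/title_w2v_df.csv')  # one 300-column row per input row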
Example #2
    def __iter__(self):
        # stream one cleaned, tokenized sentence at a time
        for sentence in get_sentence(self.fname):
            seg_list = list(jieba.cut(sentence))
            seg_list = char_list_cheaner(seg_list)
            if seg_list:
                yield seg_list
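
This `__iter__` makes the enclosing class a restartable corpus iterable, which is the shape gensim's `Word2Vec` expects for training. A minimal sketch of the wiring, assuming gensim 4.x parameter names and an invented class name `SentenceIterator` that takes the corpus path (the snippet does not show the class definition):

    from gensim.models import Word2Vec

    sentences = SentenceIterator('corpus.txt')  # yields cleaned token lists
    model = Word2Vec(sentences, vector_size=300, window=5, min_count=5, workers=4)
    model.wv.save('w2v.kv')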
Example #3
    def _get_prefix_w2v(item):
        prefix = item["prefix"]
        title = item["title"]
        if not isinstance(prefix, str):
            prefix = "null"

        prefix_cut = list(jieba.cut(prefix))
        title_cut = list(jieba.cut(title))

        prefix_cut = char_list_cheaner(prefix_cut)
        title_cut = char_list_cheaner(title_cut)

        try:
            # n_similarity raises KeyError for OOV tokens and
            # ZeroDivisionError for empty token lists
            w2v_similar = w2v_model.n_similarity(prefix_cut, title_cut)
        except (KeyError, ZeroDivisionError):
            w2v_similar = None

        return w2v_similar
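
A hedged usage sketch (the example values are invented):

    item = {"prefix": "深圳", "title": "深圳天气预报"}
    sim = _get_prefix_w2v(item)  # cosine similarity in [-1, 1], or None on OOV/empty input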
Example #4
    def _get_w2v_similar(item):
        item_dict = dict()

        query_predict = item["query_prediction"]
        title = item["title"]

        if not query_predict:
            item_dict["max_similar"] = None
            item_dict["mean_similar"] = None
            item_dict["weight_similar"] = None
            return item_dict

        similar_list = list()
        weight_similar_list = list()

        title_cut = list(jieba.cut(title))
        title_cut = char_list_cheaner(title_cut)
        for key, value in query_predict.items():
            query_cut = list(jieba.cut(key))
            query_cut = char_list_cheaner(query_cut)

            try:
                w2v_similar = w2v_model.n_similarity(query_cut, title_cut)
            except (KeyError, ZeroDivisionError):
                w2v_similar = np.nan

            similar_list.append(w2v_similar)
            # weight the similarity by the query's predicted probability
            weight_w2v_similar = w2v_similar * float(value)
            weight_similar_list.append(weight_w2v_similar)

        # aggregate, ignoring NaN entries from failed lookups
        max_similar = np.nanmax(similar_list)
        mean_similar = np.nanmean(similar_list)
        weight_similar = np.nansum(weight_similar_list)

        item_dict["max_similar"] = max_similar
        item_dict["mean_similar"] = mean_similar
        item_dict["weight_similar"] = weight_similar
        return item_dict
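
A hedged usage sketch. The `query_prediction` payload maps candidate queries to predicted probabilities stored as strings (inferred from the `float(value)` cast above); the values here are invented:

    item = {
        "title": "深圳天气",
        "query_prediction": {"深圳天气预报": "0.6", "深圳天气怎么样": "0.4"},
    }
    features = _get_w2v_similar(item)
    # features: {"max_similar": ..., "mean_similar": ..., "weight_similar": ...}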
Example #5
    def _get_jieba_array(words, size=500):
        seg_cut = jieba.lcut(words)
        seg_cut = char_list_cheaner(seg_cut)

        w2v_array = list()
        for word in seg_cut:
            try:
                word_vec = w2v_model[word]
                w2v_array.append(word_vec)
            except KeyError:
                # skip out-of-vocabulary words
                continue

        if not w2v_array:
            # no in-vocabulary tokens: fall back to a row of Nones
            w2v_array = [None] * size
        else:
            # mean of the word vectors, normalized to unit length
            w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))

        return w2v_array
Example #6
    def _to_csv(df, col, size):
        file_name = '{col}_w2v.csv'.format(col=col)
        file_path = os.path.join(TEMP_DATA_PATH, file_name)
        if os.path.exists(file_path):
            os.remove(file_path)

        columns = ['{}_w2v_{}'.format(col, i) for i in range(size)]
        none_index_set = set()

        with open(file_path, 'a', encoding='utf-8') as f:
            # write columns
            f.write(','.join(columns) + '\n')

            for idx, item in tqdm(df[col].items()):
                if item == 'null' or not item:
                    item_list = [''] * size
                    none_index_set.add(idx)
                else:
                    seg_cut = jieba.lcut(item)
                    seg_cut = char_list_cheaner(seg_cut)

                    w2v_array = list()
                    for word in seg_cut:
                        try:
                            word_vec = w2v_model[word]
                            w2v_array.append(word_vec)
                        except KeyError:
                            # skip out-of-vocabulary words
                            pass

                    if not w2v_array:
                        item_list = [''] * size
                        none_index_set.add(idx)
                    else:
                        item_list = matutils.unitvec(np.array(w2v_array).mean(axis=0))

                f.write(','.join(map(str, item_list)) + '\n')

        return none_index_set
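
A hedged sketch of consuming the written file, assuming `import pandas as pd` and that `df`, `TEMP_DATA_PATH`, and the loaded `w2v_model` exist as in the code above:

    none_idx = _to_csv(df, 'title', size=300)
    w2v_df = pd.read_csv(os.path.join(TEMP_DATA_PATH, 'title_w2v.csv'))
    # rows whose index landed in none_idx were written as empty strings (NaN after read_csv)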
Example #7
    def _get_jieba_array(self, words, size=300):
        '''
        Segment the input `words` with jieba, look up each token's word vector, and average them into a single vector for `words`.
        '''
        seg_cut = jieba.lcut(words)
        seg_cut = char_list_cheaner(seg_cut)

        w2v_array = list()
        for word in seg_cut:
            try:
                word_vec = self.w2v_model[word]
                w2v_array.append(word_vec)
            except KeyError:
                # skip out-of-vocabulary words
                continue

        if not w2v_array:
            w2v_array = [None] * size
        else:
            w2v_array = matutils.unitvec(np.array(w2v_array).mean(axis=0))

        return w2v_array
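
A hedged usage sketch (`fe` stands in for whatever object carries `w2v_model`; the input string is illustrative):

    vec = fe._get_jieba_array('深圳天气预报', size=300)
    # either a unit-length 300-d numpy array, or [None] * 300 when every token is OOV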