Example 1
    def get_similarity(self, sen1, sen2):
        """Default function for computing the similarity of two sentences.

        Arguments:
        sen1, sen2 -- the two sentences; each is segmented into a word list internally.
        """
        word_list1 = tools.seperate(sen1)
        word_list2 = tools.seperate(sen2)

        words = list(set(word_list1 + word_list2))
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]

        # number of distinct words that occur in both sentences
        vector3 = [vector1[x] * vector2[x] for x in range(len(vector1))]
        vector4 = [1 for num in vector3 if num > 0.]
        co_occur_num = sum(vector4)

        if abs(co_occur_num) <= 1e-12:
            return 0.

        denominator = math.log(float(len(word_list1))) + math.log(float(len(word_list2)))  # denominator

        if abs(denominator) < 1e-12:
            return 0.

        return co_occur_num / denominator
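The value returned above is the TextRank sentence similarity: the number of distinct shared words divided by log(len1) + log(len2). A minimal self-contained sketch of the same computation, assuming the sentences are already tokenized (no dependency on tools.seperate; the function name is illustrative):

import math

def cooccurrence_similarity(words1, words2):
    # words1, words2: tokenized sentences (lists of words)
    if not words1 or not words2:
        return 0.0
    shared = len(set(words1) & set(words2))  # distinct words occurring in both
    if shared == 0:
        return 0.0
    denominator = math.log(len(words1)) + math.log(len(words2))
    if abs(denominator) < 1e-12:  # both sentences consist of a single word
        return 0.0
    return shared / denominator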
Example 2
def data_filter(news, abstract):
    """Return -2 if the news has too few or too many sentences, -1 if the
    abstract is too short, otherwise the average best word-bigram overlap
    between each abstract sentence and the news sentences."""
    if len(news) < 10 or len(news) >= 80:
        return -2
    if len(abstract) < 2 or len(''.join(abstract)) < 50:
        return -1

    news_gram2, abstract_gram2 = [], []
    for i in range(len(news)):
        tmp = tools.seperate(news[i])
        news_gram2.append(
            set([tmp[k] + tmp[k + 1] for k in range(len(tmp) - 1)]))
        # news_gram2.append(set(tmp))
    result = 0
    for i in range(len(abstract)):
        tmp = tools.seperate(abstract[i])
        abstract_gram2.append(
            set([tmp[k] + tmp[k + 1] for k in range(len(tmp) - 1)]))
        # abstract_gram2.append(set(tmp))
        value = 0
        for j in range(len(news_gram2)):
            v = len(abstract_gram2[i].intersection(news_gram2[j]))
            if v > value:
                value = v
        result += value
    result /= sum(len(gram) for gram in abstract_gram2)
    return result
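For each abstract sentence, the score above is the largest word-bigram overlap with any news sentence, normalized by the total number of abstract bigrams. A compact sketch of the same scoring on pre-tokenized input (the function name and inputs are hypothetical, with bigrams as tuples instead of concatenated strings):

def bigram_overlap_score(news_tokens, abstract_tokens):
    # news_tokens / abstract_tokens: lists of tokenized sentences
    news_bigrams = [set(zip(s, s[1:])) for s in news_tokens]
    abstract_bigrams = [set(zip(s, s[1:])) for s in abstract_tokens]
    total_bigrams = sum(len(a) for a in abstract_bigrams)
    if total_bigrams == 0:
        return 0.0
    score = 0
    for a in abstract_bigrams:
        score += max((len(a & n) for n in news_bigrams), default=0)
    return score / total_bigrams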
Example 3
    def get_similarity_(self, sen1, sen2):
        word_list1 = tools.seperate(sen1)
        word_list2 = tools.seperate(sen2)

        words = list(set(word_list1 + word_list2))
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]

        # np.asarray replaces the deprecated np.mat; note this returns a Euclidean distance, not a similarity
        vector1, vector2 = np.asarray(vector1), np.asarray(vector2)
        dist = np.sqrt(np.sum(np.square(vector1 - vector2)))
        return dist
Example 4
 def sim(self, sen1, sen2):
     """Fraction of the words in sen1 that also appear in sen2."""
     words1 = tools.seperate(sen1)
     words2 = tools.seperate(sen2)
     if len(words1) == 0:
         # avoid the division by zero the original debug prompt ran into
         return 0.0
     count = 0
     for var in words1:
         if var in words2:
             count += 1
     return count / len(words1)
Example 5
    def weighted_vectorize(self, text):
        """Vectorize each sentence as a TextRank-weighted average of its word vectors."""
        res = []
        sentences = tools.seperate_sentences(text)
        tr_text = self.tr.textrank(text)
        for sen in sentences:
            tmp = []
            tmp_weight = []
            sen_words = tools.seperate(sen)
            for w in sen_words:
                if w in self.model.wv.vocab:
                    tmp.append(self.model[w])
                    if w in tr_text:
                        tmp_weight.append(tr_text[w])
                    else:
                        tmp_weight.append(1 / len(sen_words))
                else:
                    tmp.append([0] * self.vec_length)
                    tmp_weight.append(1 / len(sen_words))
            for i in range(len(tmp)):
                tmp[i] = tools.vector_multi(tmp[i],
                                            tmp_weight[i] / sum(tmp_weight))

            sen_vec = tools.vector_add_multi(tmp)
            if len(sen_vec) == 0:
                print(sen)
            res.append(sen_vec)
        return res
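The central step above is a weighted average of word vectors, with TextRank scores as weights and 1/len(sen_words) as the fallback weight. A minimal sketch of that averaging on plain lists, assuming vectors and weights have already been looked up (hypothetical helper, not part of tools):

def weighted_average(vectors, weights):
    # vectors: equal-length word vectors; weights: one weight per vector
    total = float(sum(weights))
    out = [0.0] * len(vectors[0])
    for vec, w in zip(vectors, weights):
        for i, value in enumerate(vec):
            out[i] += value * (w / total)
    return out

# weighted_average([[1.0, 0.0], [0.0, 1.0]], [3.0, 1.0]) -> [0.75, 0.25]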
Example 6
 def textrank(self, text):
     sentences = tools.seperate_sentences(text)
     words = {}
     words_list = []
     res = {}
     sen_words = []
     for sen in sentences:
         ws = tools.seperate(sen)
         sen_words.append(ws)
         for w in ws:
             if w not in words.keys():
                 words_list.append(w)
                 words[w] = len(words)
     matrix = np.zeros((len(words), len(words)))
     # matrix = [[0] * len(words) for var in range(len(words))]
     # accumulate symmetric within-sentence co-occurrence counts
     for sen_w in sen_words:
         for i in range(len(sen_w)):
             for j in range(i, len(sen_w)):
                 # print(words[sen_w[i]],words[sen_w[j]],len(words))
                 matrix[words[sen_w[i]], words[sen_w[j]]] += 1
                 matrix[words[sen_w[j]], words[sen_w[i]]] += 1
     # from_numpy_matrix was removed in newer networkx; from_numpy_array is equivalent here
     nx_graph = nx.from_numpy_array(matrix)
     score = nx.pagerank(nx_graph, alpha=0.85)
     sorted_score = sorted(score.items(),
                           key=lambda item: item[1],
                           reverse=True)
     for index, value in sorted_score:
         if words_list[index] not in res.keys():
             res[words_list[index]] = value
     return res
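The same word-graph PageRank in a self-contained form; the function name, tokenized-sentence input, and the choice to skip self co-occurrence are illustrative, not taken from the original:

import numpy as np
import networkx as nx

def keyword_scores(tokenized_sentences, alpha=0.85):
    # assign every word an index in the co-occurrence matrix
    index = {}
    for sen in tokenized_sentences:
        for w in sen:
            index.setdefault(w, len(index))
    matrix = np.zeros((len(index), len(index)))
    for sen in tokenized_sentences:
        for i in range(len(sen)):
            for j in range(i + 1, len(sen)):
                matrix[index[sen[i]], index[sen[j]]] += 1
                matrix[index[sen[j]], index[sen[i]]] += 1
    graph = nx.from_numpy_array(matrix)
    score = nx.pagerank(graph, alpha=alpha)
    words = list(index)
    return {words[i]: s for i, s in
            sorted(score.items(), key=lambda kv: kv[1], reverse=True)}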
Example 7
 def load_data(self, path=Dir.res + "/cleandata_604/news/"):
     """Load every line of every news file as a TaggedDocument tagged "sen_<n>"."""
     flist = ftools.get_files(path)
     data = []
     count = 0
     for name in flist:
         filepath = path + name
         lines = ftools.read_lines(filepath)
         for line in lines:
             words = tools.seperate(line)
             data.append(TaggedDocument(words, ["sen_" + str(count)]))
             self.sen_dict[''.join(words)] = "sen_" + str(count)
             count += 1
     return data
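A hedged usage sketch for the TaggedDocument list built above: training a gensim Doc2Vec model on it. The tiny in-memory corpus and the hyperparameter values are illustrative only (gensim 4.x parameter names):

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

docs = [TaggedDocument(["这", "是", "例子"], ["sen_0"]),
        TaggedDocument(["另", "一", "句", "话"], ["sen_1"])]
model = Doc2Vec(docs, vector_size=50, min_count=1, epochs=20)
vec = model.infer_vector(["这", "是", "例子"])  # embed a tokenized sentence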
Example 8
def transfer(cleandata_root=Dir.res + "/cleandata_1189/news/",
             save_path=Dir.res + "/sen_data/1189_corpus.txt"):
    """Write one space-separated, word-segmented line per input line, mapping digit tokens to "num"."""
    filelist = os.listdir(cleandata_root)
    lines = []
    for name in filelist:
        filepath = cleandata_root+name
        for line in ftools.read_lines(filepath):
            words = tools.seperate(line)
            for i in range(len(words)):
                if words[i].isdigit():
                    words[i] = "num"
            lines.append(' '.join(words))

    ftools.write_list(save_path, lines)
Example 9
    def get_similarity_(self, sen1, sen2):
        word_list1 = tools.seperate(sen1)
        word_list2 = tools.seperate(sen2)

        words = list(set(word_list1 + word_list2))
        vector1 = [float(word_list1.count(word)) for word in words]
        vector2 = [float(word_list2.count(word)) for word in words]
        if len(vector1) == 0:
            return 0
        res = self.dist.sim(vector1, vector2, self.dist_type)
        if self.dist_type == Distance.EUD:
            # convert a Euclidean distance into a similarity-like score
            res = 1 / res - 1
        if self.dist_type == Distance.OCCLOSE:
            res *= 10
        return res
Example 10
 def unweighted_vectorize(self, text):
     """Vectorize each sentence as the plain average of its word vectors."""
     res = []
     sentences = tools.seperate_sentences(text)
     for line in sentences:
         tmp = []
         for word in tools.seperate(line):
             if word in self.model.wv.vocab:
                 tmp.append(self.model[word])
             else:
                 tmp.append([0] * self.vec_length)
         word_count = len(tmp)  # number of words, taken before summing the vectors
         tmp = tools.vector_add_multi(tmp)
         tmp = tools.vector_multi(tmp, 1 / word_count)
         res.append(tmp)
     return res
Example 11
def loaddata(path):
    trainformat_sentences = []
    content = ftools.read_lines(path)
    for line in content:
        article = line[line.rindex(",") + 1:]
        sentences = tools.seperate_sentences(article)
        for sen in sentences:
            trainformat_sentences.append(tools.seperate(sen))
    return trainformat_sentences
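A hedged usage sketch: the word lists returned by loaddata are in the format gensim's Word2Vec expects. The stand-in corpus and hyperparameters are illustrative (gensim 4.x parameter names):

from gensim.models import Word2Vec

# sentences = loaddata(Dir.res + "/sen_data/1189_corpus.txt")  # hypothetical path
sentences = [["这", "是", "例子"], ["另", "一", "句", "话"]]  # tiny stand-in corpus
model = Word2Vec(sentences, vector_size=50, min_count=1, epochs=10)
print(model.wv["例子"])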
Example 12
 def load_data(self, path=Dir.res + "/cleandata_604/news/"):
     """Load each news file as a single TaggedDocument of all its words, tagged "text_<n>"."""
     flist = ftools.get_files(path)
     data = []
     count = 0
     for name in flist:
         filepath = path + name
         lines = ftools.read_lines(filepath)
         essay = ""
         tmp = []
         for line in lines:
             words = tools.seperate(line)
             tmp.extend(words)
             essay += ''.join(words)
         data.append(TaggedDocument(tmp, ["text_" + str(count)]))
         self.sen_dict[essay] = "text_" + str(count)
         count += 1
     return data
Example 13
 def vectorize(self, text):
     """Build a bag-of-words count vector for each sentence over the text's vocabulary."""
     sentences = tools.seperate_sentences(text)
     res = []
     words = {}
     sen_w = []
     for i in range(len(sentences)):
         sen_words = tools.seperate(sentences[i])
         sen_w.append(sen_words)
         for w in sen_words:
             if w not in words.keys():
                 words[w] = len(words)
     for i in range(len(sen_w)):
         tmp = [0] * len(words)
         for var in sen_w[i]:
             tmp[words[var]] += 1
         res.append(tmp)
     return res
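The vectors returned above are bag-of-words counts over a shared vocabulary, so any standard vector similarity applies; for instance, cosine similarity (the helper below is not part of the original code):

import math

def cosine(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm = math.sqrt(sum(a * a for a in u)) * math.sqrt(sum(b * b for b in v))
    return 0.0 if norm == 0 else dot / norm

# vectors = vectorizer.vectorize(text)   # hypothetical instance of the class above
# print(cosine(vectors[0], vectors[1]))  # similarity of the first two sentences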
Example 14
def get_clue_words(path=Dir.res + "/extradata/",
                   savepath=Dir.res +
                   "/parameter/summarization_parameter/clue_words",
                   word_index=3):
    """Count the first `word_index` words of each sentence (skipping the first
    sentence of every item in res_sen) and write the counts to savepath."""
    _, res_sen = filter(path)
    words = {}
    for var in res_sen:
        for sen in var[1:]:
            ws = tools.seperate(sen)
            for w in ws[:word_index]:
                if w not in words.keys():
                    words[w] = 0
                words[w] += 1

    content = ""
    for w in words.keys():
        content += w + "," + str(words[w]) + "\n"
    ftools.write(savepath + str(word_index), content)
Example 15
        sens_vect = []
        essay_key = []
        for sen in sens:
            essay_key.extend(sen)
            vec = self.sen2v.get_sen_vec(sen)
            # if vec == None:
            #     input()
            sens_vect.append(vec)
        essay_vector = self.doc2v.get_sen_vec(essay_key)
        return sens_vect, essay_vector


if __name__ == "__main__":

    # sen2v = Sen2Vec()
    # sen2v.train()
    # doc2v= Doc2Vec()
    # doc2v.train()

    sens = ftools.read_lines(Dir.res + "/cleandata_604/news/training_4.txt")
    pvdm_v = pvdm_vectorize()
    text = []
    for line in sens:
        text.append(tools.seperate(line))
    sens, essay = pvdm_v.vectorize(text)
    print(sens[0])
    for ss in sens:
        print(ss)

    # print(essay)
Example 16
def filter_craw_data(data_dir=Dir.res + "/craw_data/data/",
                     save_dir=Dir.res + "/cleandata_none"):
    """Split each crawled line into (abstract, news), keep sufficiently long and
    well-aligned samples, and write them under save_dir."""
    if os.path.lexists(save_dir):
        shutil.rmtree(save_dir)

    files = tools.get_files(data_dir)
    cleandata = []
    count = 0
    bad_sample = []
    for i in range(len(files)):
        print(i, len(files), len(cleandata))
        fname = files[i]
        path = data_dir + fname
        lines = tools.read_lines(path)
        for line in lines:
            line = line.strip()

            try:
                last_ = line.rindex(",")
                first_ = line.index(",")
                if first_ == last_:
                    continue
                tmp = [line[:first_], line[first_ + 1:last_], line[last_ + 1:]]
                abstracts = tls.seperate_sentences(tmp[1])
                news = tls.seperate_sentences(tmp[2])

                tmp = get_abstract_index(news, abstracts)

                count += 1
                if len(tmp) != len(abstracts):
                    continue
                w_count = 0
                for li in news:
                    w_count += len(tls.seperate(li))
                if w_count < 520:
                    continue

                if sum(tmp[:3]) <= 3:
                    continue
                cleandata.append([abstracts, news])
                tools.write(
                    save_dir + "/abstract/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(abstracts))
                tools.write(
                    save_dir + "/news/trainning_" + str(len(cleandata)) +
                    ".txt", '\n'.join(news))
            except Exception as e:
                print("error:", str(e))
                print(line)
                bad_sample.append(line)
    print(count, len(bad_sample), len(cleandata))