def main():
    vocab = calcu_voacb()  # word -> index mapping built elsewhere in this module

    sents1 = load_sents()
    sents2 = load_corpus()
    out = []
    for i in range(len(sents1)):
        tokens1 = segment(sents1[i])
        tokens2 = segment(sents2[i])
        # One extra slot at the end collects out-of-vocabulary words,
        # so the OOV count cannot collide with a real vocabulary index.
        vec1 = [0] * (len(vocab) + 1)
        vec2 = [0] * (len(vocab) + 1)
        for word in tokens1:
            if word in vocab:
                vec1[vocab[word]] += 1
            else:
                vec1[-1] += 1
        for word in tokens2:
            if word in vocab:
                vec2[vocab[word]] += 1
            else:
                vec2[-1] += 1
        # Drop the OOV slot before computing cosine similarity.
        out.append(cos_sim(vec1[:-1], vec2[:-1]))
    with open("res.txt", "w") as f:
        for score in out:
            f.write(str(score) + '\n')
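# The cos_sim helper called above is not part of this listing; a minimal
# sketch of the usual cosine similarity over two equal-length count vectors
# (the name matches the call above, the body is an assumption) could be:
import math

def cos_sim(vec1, vec2):
    dot = sum(a * b for a, b in zip(vec1, vec2))
    norm1 = math.sqrt(sum(a * a for a in vec1))
    norm2 = math.sqrt(sum(b * b for b in vec2))
    if norm1 == 0.0 or norm2 == 0.0:
        return 0.0  # a sentence with no in-vocabulary words scores 0
    return dot / (norm1 * norm2)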
def get_similarity_score():
    params = request.get_json(force=True)

    wiki_content = params['wiki']
    exp_content = params['exp']

    # Expand the wiki text with the nearest neighbours of its TF-IDF keywords.
    wiki_keyword = tfidf.get_keyword(wiki_content)
    wiki_keyword_simi = []
    for keyword in wiki_keyword.split():
        try:
            key_simi = model.most_similar([keyword], topn=15)
            for word, _score in key_simi:
                wiki_keyword_simi.append(word)
        except KeyError:
            # Keyword not in the word-vector vocabulary; skip it.
            continue

    wiki_cut = segmenter.segment(wiki_content).split()
    wiki_keyword_set = set(wiki_cut + wiki_keyword_simi)

    exp_cut = segmenter.segment(exp_content).split()
    exp_set = set(exp_cut)

    # Score = fraction of the experience's words covered by the expanded wiki set.
    intersection = len(wiki_keyword_set & exp_set)
    if len(exp_set) == 0:
        return str(0)
    return str(intersection / len(exp_set))
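# get_similarity_score reads its inputs from a JSON request body, which
# suggests it is registered as a web endpoint. A minimal sketch under the
# assumption that Flask is the framework (the route path is made up):
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/similarity', view_func=get_similarity_score, methods=['POST'])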
    def __iter__(self):
        with open(self.filepath, 'r') as input_file:
            for line in input_file:
                # Each line is "<id>\t<sentence>"; keep only the sentence.
                raw_sentence = line.strip().split('\t')[1]
                sentence = segmenter.segment(raw_sentence)
                yield sentence.split()  # ['word_1', 'word_2', 'word_3']
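# This iterator yields one token list per line, which is the input shape
# gensim's Word2Vec expects. A hypothetical usage (the class and file names
# are assumptions, parameter names follow gensim 4.x; model.most_similar in
# the example above suggests gensim is the library in use):
from gensim.models import Word2Vec

sentences = SentenceIterator('corpus.tsv')  # hypothetical class wrapping the __iter__ above
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
model.save('word2vec.model')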
Example #4
def submit():
    text = request.form.get('text')

    # Segment the same text with three tools: THULAC, our segmenter, and jieba.
    thu_segmenter = thulac.thulac(seg_only=True)
    thu_list = thu_segmenter.cut(text)
    thu_array = np.array(thu_list)  # rows of [word, tag]; tag is empty with seg_only=True
    seg_list, keyword = segmenter.segment(text, keywords)
    jieba_list = jieba.cut(text)

    seg_result = "/".join(seg_list)
    jieba_result = "/".join(jieba_list)
    thu_result = "/".join(thu_array[:, 0])

    # Compare the other segmenters' output against ours.
    jieba_similarity, jieba_result = test.similarity(seg_list,
                                                     jieba_result.split("/"))
    thulac_similarity, thu_result = test.similarity(seg_list,
                                                    thu_result.split("/"))
    jieba_similarity = util.float2percentage(jieba_similarity)
    thulac_similarity = util.float2percentage(thulac_similarity)

    keyword = list(keyword.keys())

    return render_template("submit.html",
                           seg=seg_list,
                           jieba=jieba_result,
                           thulac=thu_result,
                           jieba_similarity=jieba_similarity,
                           thulac_similarity=thulac_similarity,
                           keyword_first=keyword[0:10],
                           keyword_second=keyword[10:20])
Example #5
def get_idf(inputfile, idffile):  # idf generator
    doc = []
    with open(inputfile, 'r') as ins:
        for line in ins:
            # Each line is "<id>\t<text>"; segment the text part.
            line = line.strip().split('\t')[1]
            doc.append(segment(line))

    id_freq = {}
    i = 0
    for cut_doc in doc:
        # Accumulate per-word counts used as the idf denominator.
        for x in cut_doc.split():
            id_freq[x] = id_freq.get(x, 0) + 1
        if i % 1000 == 0:
            print('Documents processed: ', i, ', time: ',
                  datetime.datetime.now())
        i += 1

    with open(idffile, 'w') as f:
        for key, value in id_freq.items():
            # idf = log2(N / count + 1); the file is opened in text mode,
            # so the key is written directly without manual encoding.
            f.write(key + '\t' + str(math.log(i / value + 1, 2)) + '\n')
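# The idf file written above is consumed by the extract_keywords methods
# further down through self.idf_freq and self.mean_idf. A minimal loader
# sketch (the function name and the "word\tidf" layout are assumptions
# based on the writer above):
def load_idf(idffile):
    idf_freq = {}
    with open(idffile, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                idf_freq[parts[0]] = float(parts[1])
    # The mean idf is the fallback for words missing from the table.
    mean_idf = sum(idf_freq.values()) / len(idf_freq) if idf_freq else 0.0
    return idf_freq, mean_idf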
Example #6
    def __iter__(self):
        for dirfile in os.walk(self.dirname):
            for fname in dirfile[2]:
                # dirfile[2] is the list of file names in the current directory.
                with open(os.path.join(dirfile[0], fname),
                          'r',
                          encoding='utf-8',
                          errors='ignore') as f:
                    text = f.read()
                yield segment(text)  # time consuming
Example #7
    def __iter__(self):
        for fname in self.fname:
            # Read the JSON documents in this file and yield each segmented body.
            docs = self.read_file(self.folderName + '/' + fname, 'json')
            for doc in docs:
                yield segment(doc['content'])
Example #8
def generate(source='articles/cnbeta'):
    'combines cleaner and segmenter'
    import cleaner, segmenter

    items = cleaner.clean(source)
    documents = segmenter.segment(items)
    publish(documents, source)

    return documents
Example #9
    def __iter__(self):
        for dirfile in os.walk(self.dirname):
            for fname in dirfile[2]:
                with open(os.path.join(dirfile[0], fname),
                          'r',
                          encoding='utf-8',
                          errors='ignore') as f:
                    # Segment the file in chunks of 1000 lines to bound memory use.
                    text = ''
                    for i, line in enumerate(f):
                        text = text + line
                        if i % 1000 == 0 and i != 0:
                            yield segment(text)  # time consuming
                            text = ''
                    if text:
                        # Don't drop the trailing chunk of fewer than 1000 lines.
                        yield segment(text)
Example #10
def get_similarity_score(wiki, exp):
    wiki_content = wiki
    exp_content = exp

    # Expand the wiki text with the nearest neighbours of its TF-IDF keywords.
    wiki_keyword = tfidf.get_keyword(wiki_content)
    wiki_keyword_simi = []
    for keyword in wiki_keyword.split():
        try:
            key_simi = model.most_similar([keyword], topn=15)
            for word, _score in key_simi:
                wiki_keyword_simi.append(word)
        except KeyError:
            # Keyword not in the word-vector vocabulary; skip it.
            continue

    wiki_cut = segmenter.segment(wiki_content).split()
    wiki_keyword_set = set(wiki_cut + wiki_keyword_simi)

    exp_cut = segmenter.segment(exp_content).split()
    exp_set = set(exp_cut)

    # Score = fraction of the experience's words covered by the expanded wiki set.
    intersection = len(wiki_keyword_set & exp_set)
    if len(exp_set) == 0:
        return 0
    return intersection / len(exp_set)
Example #11
    def extract_keywords(self, sentence, topK=20):    # extract keywords
        # filtering
        seg_list = segment(sentence)

        # Term frequency within this sentence.
        freq = {}
        for w in seg_list:
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())

        # TF-IDF weight; missing idf values fall back to the mean idf.
        for k in freq:
            freq[k] *= self.idf_freq.get(k, self.mean_idf) / total

        # Sort by weight, descending.
        tags = sorted(freq, key=freq.__getitem__, reverse=True)

        if topK:
            return tags[:topK]
        else:
            return tags
Example #12
    def extract_keywords(self, sentence, topK=20):    # extract keywords
        # filtering
        seg_list = segment(sentence)

        freq = {}  # term frequency (TF) of each word in the sentence
        for w in seg_list:
            freq[w] = freq.get(w, 0.0) + 1.0
        total = sum(freq.values())  # total number of words in the text

        for k in freq:   # TF-IDF = TF * IDF
            freq[k] *= self.idf_freq.get(k, self.mean_idf) / total

        tags = sorted(freq, key=freq.__getitem__, reverse=True)  # descending order: a larger TF-IDF value means a more important word

        if topK:
            return tags[:topK]   # keep only the top topK keywords
        else:
            return tags
Example #13
def main():  # idf generator
    sents = load_sents()
    sents = [segment(x) for x in sents]
    ignored = {'', ' ', '', '。', ':', ',', ')', '(', '!', '?', '”', '“'}
    id_freq = {}
    i = 0
    for doc in sents:
        # Count each word at most once per document (document frequency).
        doc = set(x for x in doc if x not in ignored)
        for x in doc:
            id_freq[x] = id_freq.get(x, 0) + 1
        if i % 1000 == 0:
            print('Documents processed: ', i, ', time: ',
                  datetime.datetime.now())
        i += 1

    with open("idf.txt", 'w', encoding='utf-8') as f:
        # idf = log2(N / df); i holds the total number of documents N.
        for key, value in id_freq.items():
            f.write(key + ' ' + str(math.log(i / value, 2)) + '\n')
Example #14
    def extract_keywords(self, sentence, topK=15):  # extract keywords
        # filtering
        seg_list = segment(sentence)
        freq = {}
        for w in seg_list.split():
            freq[w] = freq.get(w, 0.0) + 1.0
        if '' in freq:
            del freq['']
        total = sum(freq.values())

        for k in freq:  # TF-IDF; no exception can occur here, because a missing idf falls back to the mean idf
            freq[k] *= self.idf_freq.get(k, self.mean_idf) / total

        tags = sorted(freq, key=freq.__getitem__, reverse=True)  # sort by weight, descending

        if topK:
            return tags[:topK]
        else:
            return tags
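# A self-contained toy run of the same TF-IDF weighting used above
# (the idf table and the word list are made up for illustration):
idf_freq = {'segmentation': 3.5, 'chinese': 2.5}
mean_idf = 3.0
words = ['chinese', 'segmentation', 'chinese', 'tool']
freq = {}
for w in words:
    freq[w] = freq.get(w, 0.0) + 1.0
total = sum(freq.values())                      # 4 words in total
for k in freq:
    freq[k] *= idf_freq.get(k, mean_idf) / total
print(sorted(freq, key=freq.__getitem__, reverse=True))
# ['chinese', 'segmentation', 'tool'] -- 'chinese' ranks first: 2 * 2.5 / 4 = 1.25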
Example #15
    def __iter__(self):
        # Yield the segmented title and body of every document in turn.
        for doc in self.docs:
            yield segment(doc['title'])
            yield segment(doc['content'])
Example #16
# -*- coding: utf-8 -*-
# @Author  : lin.xiong

from __future__ import division
from functools import reduce  # required on Python 3; a built-in on Python 2
import segmenter

out_file = open('C:/Users/lin/Desktop/out_uid_content.data', 'w')
with open('C:/Users/lin/Desktop/uid_content.data', 'r') as uid_content_file:
    for line in uid_content_file:
        line = line.strip()
        uid = line.split('\t')[0]
        comment_list = line.split('\t')[1].split('|')
        comment_arr = []
        union_comment = set()
        if len(comment_list) < 2:
            # A single comment cannot be compared with anything; mark it with -1.
            out_file.write(uid + '\t' + '-1' + '\t' + '1' + '\n')
        else:
            for comment in comment_list:
                comment_set = set(segmenter.segment(comment).split())
                comment_arr.append(comment_set)
                union_comment |= comment_set

            # Self-similarity of a user's comments: shared words / all words.
            inter_comment = reduce(lambda x, y: x & y, comment_arr)
            if len(union_comment) == 0:
                simi = 1
            else:
                simi = len(inter_comment) / len(union_comment)
            out_file.write(uid + '\t' + str(simi) + '\t' + str(len(comment_list)) + '\n')
out_file.close()
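# A toy run of the same intersection-over-union measure (the two word sets
# stand in for two segmented comments and are made up for illustration):
a = {'快递', '很快'}
b = {'快递', '太慢'}
inter = a & b                   # {'快递'}
union = a | b                   # {'快递', '很快', '太慢'}
print(len(inter) / len(union))  # 0.333... -- more repetitive users score higher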
Example #17
    def __iter__(self):
        with open(self.inputfile, 'r') as f:
            for line in f:
                # Each line is "<id>\t<text>"; segment the text part.
                line = line.strip().split('\t')[1]
                yield segment(line)  # time consuming