Code Example #1
import math

def stdOut(sentence_result_lists, dicts, scope):
    lists = list()
    j = 0
    sentence_nbr = len(dicts)
    # Damp each sentence score by its position: later sentences lose more.
    sentence_result_lists_tmp = map(
        lambda (key, score): (key, score * (1 - math.log(
            (key + 1)) / math.log(sentence_nbr))), sentence_result_lists)
    # Rank by damped score, best first.
    sentence_result_lists2 = sorted(sentence_result_lists_tmp,
                                    key=lambda (key, score): score,
                                    reverse=True)
    try:
        for distattr3 in sentence_result_lists2:
            sentence_id = distattr3[0]
            tmp = dicts[sentence_id]
            # Skip sentences with fewer than 8 Chinese characters or with
            # blacklisted boilerplate phrases.
            tmp2 = filter(lambda x: is_chinese(x), tmp)
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            # Strip spaces and a leading closing quote (Python 2 str -> unicode).
            result_str = removePrefix(tmp.strip(" "), "”".decode("utf8"))
            result = distattr2(sentence_id, result_str)
            lists.append(result)
            if (j >= scope):
                break
        # Favor longer sentences that appear earlier in the document.
        std = sorted(lists,
                     key=lambda x: 0.5 * len(x.strs) / (x.ids + 1),
                     reverse=True)
    except Exception:
        std = lists
    return std
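
These snippets call helpers such as removePrefix, is_chinese, contain_redundant, and distattr2 that are not shown on this page. A minimal sketch of plausible stand-ins for two of them, inferred from how the code above uses them (assumptions, not the project's actual definitions):

def removePrefix(text, prefix):
    # Drop prefix from text if present; otherwise return text unchanged.
    return text[len(prefix):] if text.startswith(prefix) else text

def is_chinese(ch):
    # True if ch falls in the CJK Unified Ideographs block.
    return u'\u4e00' <= ch <= u'\u9fff'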
Code Example #2
def evaluate_words(dictssentence, keyword, new=True):
    # Word-level evaluation: tokenize each sentence and keep only non-empty
    # token lists.
    dictsword_tmp = {key: sentence_evaluate(value)
                     for key, value in dictssentence.items()
                     if len(sentence_evaluate(value)) > 0}
    dictsword_tmp2 = map(lambda z: z[1], dictsword_tmp.items())
    # Join tokens into space-separated strings to score the corpus.
    corpus = map(lambda z: " ".join(z), dictsword_tmp2)
    wordlists = top50words(corpus)
    # sentence lists
    if new:
        if keyword[0] is None:
            # No query keyword: keep the 100 strongest corpus words.
            words_final = map(lambda (word, importance): word, wordlists)[:100]
        else:
            # Score each distinct word against the keyword and keep the 100
            # closest matches (a score of -1 means no match).
            dictsword_tmp3 = list(set(flatten(dictsword_tmp2)))
            word_potential = map(lambda t: (t, similar_check_higher(t, keyword)),
                                 dictsword_tmp3)
            words_2 = sorted(word_potential, key=lambda (word, score): score,
                             reverse=True)
            words_3 = filter(lambda (key, score): score != -1, words_2)
            words_final = map(lambda (key, score): key, words_3)[:100]
        # Word importance based on word2vec: distattr2 holds ids = word,
        # score = its word2vec vector (words with empty vectors are dropped).
        word_vector_dicts = [distattr2(word, word2vec_evaluate(word))
                             for word in words_final
                             if len(word2vec_evaluate(word)) != 0]
        try:
            final_list = map(lambda x: (x.ids,x.score),textrankgetter(word_vector_dicts, False))
            return final_list
        except Exception:
            return wordlists
    else:
        return wordlists
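
flatten and similar_check_higher are likewise external helpers. A one-level flatten consistent with how this snippet uses it (an assumed stand-in, not the project's definition):

def flatten(nested):
    # Collapse one level of nesting: [[a, b], [c]] -> [a, b, c].
    return [item for sub in nested for item in sub]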
Code Example #3
import math

def evaluate_sentence_tradition(dictssentence, words_importance):
    result = list()
    # Map each word to its importance score for O(1) lookup.
    words_lookup = dict(words_importance)
    sentence_nbr = len(dictssentence)
    for key, value in dictssentence.items():
        cut_sentence = sentence_evaluate(value)
        if len(cut_sentence) == 0:
            continue
        sentence_id = key
        # Sentence score: summed word importances, damped by position.
        importance_list = map(lambda x: words_lookup.get(x, 0), cut_sentence)
        score = reduce(lambda x, y: x + y, importance_list) * (1 - math.log(
            (key + 1)) / math.log(sentence_nbr))
        result.append(distattr2(sentence_id, score))
    # distattr2's second field holds the score here; rank best first.
    result_final = sorted(result, key=lambda x: x.strs, reverse=True)
    return result_final
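
Code Examples #1 and #3 both damp a sentence's score by its position with the factor 1 - log(position + 1) / log(n). A quick standalone check of what that curve does (plain Python, no project code involved):

import math

def damp(score, position, n_sentences):
    # First sentence keeps its full score; the last is damped to zero.
    return score * (1 - math.log(position + 1) / math.log(n_sentences))

print(damp(1.0, 0, 100))   # 1.0
print(damp(1.0, 9, 100))   # 0.5, since log(10)/log(100) = 0.5
print(damp(1.0, 99, 100))  # 0.0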
Code Example #4
def evaluate_sentence(dictssentence):
    # Tokenize each sentence; keep only sentences longer than 6 words.
    dictsword = {
        key: sentence_evaluate(value)
        for key, value in dictssentence.items()
        if len(sentence_evaluate(value)) > 6
    }
    if len(dictsword) == 0:
        result_list_final = []
        return result_list_final
    else:
        # Infer a vector per sentence with the abstract Doc2Vec model.
        sentence_vector_dicts = [
            distattr2(key, model_abstract.infer_vector(value))
            for key, value in dictsword.items()
        ]
        try:
            # Rank sentences by TextRank over the inferred vectors.
            result_list = textrankgetter(sentence_vector_dicts)
            result_list_final = map(lambda x: (x.ids, x.score), result_list)
        except Exception:
            result_list_final = []
        return result_list_final
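
textrankgetter is another helper that is not shown. A minimal sketch of TextRank over sentence vectors, the technique these snippets appear to rely on (a cosine-similarity graph plus PageRank-style power iteration; an assumption, not the project's implementation):

import numpy as np
from sklearn.metrics import pairwise_distances

def textrank_scores(vectors, d=0.85, iters=100):
    # Similarity graph: 1 - cosine distance, with self-loops removed.
    sim = 1.0 - pairwise_distances(np.array(vectors, dtype='float32'),
                                   metric='cosine')
    np.fill_diagonal(sim, 0.0)
    # Row-normalize so each node spreads its score across its neighbors.
    norm = sim.sum(axis=1, keepdims=True)
    norm[norm == 0] = 1.0
    M = sim / norm
    n = sim.shape[0]
    r = np.full(n, 1.0 / n)
    for _ in range(iters):
        r = (1 - d) / n + d * M.T.dot(r)
    return r  # one centrality score per input vector; higher = more central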
Code Example #5
def stdOut(self, rank, dicts, top):
    lists = list()
    j = 0
    try:
        for sentence_id in rank:
            tmp = dicts[sentence_id]
            # Skip sentences with fewer than 8 Chinese characters or with
            # blacklisted boilerplate phrases.
            tmp2 = filter(lambda x: is_chinese(x), tmp)
            if (len(tmp2) < 8 or contain_redundant(
                    redundant_dict='../resource/redundant_dict.txt',
                    string_with_redundant=tmp)):
                continue
            j += 1
            result_str = removePrefix(tmp.replace(" ", ""),
                                      "”".decode("utf8"))
            result = distattr2(sentence_id, result_str)
            lists.append(result)
            if (j >= top):
                break
        # Restore original document order for output.
        std = sorted(lists, key=lambda x: x.ids)
    except Exception:
        std = lists
    return std
Code Example #6
import MySQLdb
import jieba
import numpy as np
from collections import defaultdict
from bs4 import BeautifulSoup
from gensim.models import doc2vec
from jieba import analyse
from sklearn.metrics import pairwise_distances


def main():
    # load data
    conn = MySQLdb.connect(host=args.host,
                           user=args.user,
                           passwd=args.passwd,
                           db=args.db,
                           charset='utf8')
    cur = conn.cursor()
    # Parameterized LIMIT avoids building the query by string concatenation.
    cur.execute('select id, content_html from t_crawler_obj limit %s, %s',
                (int(args.file[0]), int(args.file[1])))
    data = cur.fetchall()

    # load model
    model = doc2vec.Doc2Vec.load(args.model)
    # parse data by beautiful soup
    dicts1 = dict()
    for line in data:
        ids, content_html = line
        content = BeautifulSoup(content_html, "html.parser")
        dicts1[ids] = content.get_text()

    # Split sentences. dicts2 is a nested dict:
    #   key: paper id -> {sentence id: sentence string}
    dicts2 = defaultdict(dict)
    for key, value in dicts1.items():
        lists = cut_sentence_new(value)
        dicttmp = dict()
        for key2, value2 in enumerate(lists):
            dicttmp[key2] = value2
        dicts2[key] = dicttmp

    # Split words. dicts3: key: paper id -> {sentence id: token list}
    dicts3 = defaultdict(dict)
    analyse.set_stop_words('../resource/stop_words.txt')
    for key, value in dicts2.items():
        dicttmp = dict()
        for key2, value2 in value.items():
            seg_list = jieba.cut(
                string_parser(punc_file='../resource/punc_file.txt',
                              string_with_punc=value2))
            seg_list = filter(lambda x: x != " ", seg_list)
            lists = list(seg_list)
            if (len(lists) >= 3):  #save sentence with length greater than 3
                dicttmp[key2] = lists
        dicts3[key] = dicttmp


    # Vectorization and TextRank.
    for key, value in dicts3.items():
        dictrember = dict()
        X = list()
        i = 0
        for key2, value2 in value.items():
            dictrember[i] = key2  # i: X index; key2: sentence order
            X.append(model.infer_vector(value2))
            i += 1
        X = np.array(X, dtype='float32')
        distance_matrix = pairwise_distances(X, metric='cosine')
        rank = rankgetter(distance_matrix=distance_matrix,
                          dictrember=dictrember)
        j = 0
        try:
            lists = list()
            for info in rank:
                ind = info.ids  # sentence order
                tmp = dicts2[key][ind]
                # Skip sentences with fewer than 8 Chinese characters or
                # with blacklisted boilerplate phrases.
                tmp2 = filter(lambda x: is_chinese(x), tmp)
                if (len(tmp2) < 8 or contain_redundant(
                        redundant_dict='../resource/redundant_dict.txt',
                        string_with_redundant=dicts2[key][ind])):
                    continue
                j += 1
                result_str = removePrefix(dicts2[key][ind].replace(" ", ""),
                                          "”".decode("utf8"))
                result = distattr2(ind, result_str)
                lists.append(result)
                if (j >= args.top):
                    break

            # Emit the selected sentences in their original document order.
            stdOut = sorted(lists, key=lambda x: x.ids)
            for key3, sentence3 in enumerate(stdOut):
                print str(key) + " " + str(key3 + 1) + ": " + sentence3.strs

        except Exception:
            print("No More Qualified Sentence!")