Example #1
    # writer.writerow([row[0],
    #                  row[1],
    #                  properties_dict['node_num'],
    #                  properties_dict['edge_num'],
    #                  properties_dict['max_in_degree'],
    #                  properties_dict['min_in_degree'],
    #                  properties_dict['avg_in_degree'],
    #                  properties_dict['max_out_degree'],
    #                  properties_dict['min_out_degree'],
    #                  properties_dict['avg_out_degree'],
    #                  properties_dict['density'],
    #                  properties_dict['num_components'],
    #                  properties_dict['max_component_diameter'],
    #                  properties_dict['max_component_node_num'],
    #                  properties_dict['max_component_edge_num'],
    #                  properties_dict['max_component_max_in_degree'],
    #                  properties_dict['max_component_min_in_degree'],
    #                  properties_dict['max_component_avg_in_degree'],
    #                  properties_dict['max_component_max_out_degree'],
    #                  properties_dict['max_component_min_out_degree'],
    #                  properties_dict['max_component_avg_out_degree']])


if __name__ == '__main__':
    con = get_db_connection('meme')
    meme = 'continual reassessment method'
    run(con, r'../results/meme/path', limit=999999)
    # gen_meme_path_v2(con, 102699, r'../results/meme/path/pajek')
    con.close()
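
The commented-out writerow call above lists the whole-graph and largest-component statistics the script records. As a rough illustration (not the original code), here is a minimal sketch of how such a properties_dict could be assembled, assuming the graph is a networkx.DiGraph; the function name is hypothetical:

import networkx as nx

def graph_properties(G):
    """Collect the statistics referenced in the commented writerow call (sketch)."""
    in_degrees = [d for _, d in G.in_degree()]
    out_degrees = [d for _, d in G.out_degree()]
    props = {
        'node_num': G.number_of_nodes(),
        'edge_num': G.number_of_edges(),
        'max_in_degree': max(in_degrees),
        'min_in_degree': min(in_degrees),
        'avg_in_degree': sum(in_degrees) / len(in_degrees),
        'max_out_degree': max(out_degrees),
        'min_out_degree': min(out_degrees),
        'avg_out_degree': sum(out_degrees) / len(out_degrees),
        'density': nx.density(G),
    }
    components = list(nx.weakly_connected_components(G))
    props['num_components'] = len(components)
    largest = G.subgraph(max(components, key=len))
    props['max_component_node_num'] = largest.number_of_nodes()
    props['max_component_edge_num'] = largest.number_of_edges()
    # diameter is computed here on the undirected view of the component
    props['max_component_diameter'] = nx.diameter(largest.to_undirected())
    return props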
Example #2
def gen_n_gram_v3(ngram=3):
    """
    生成N-Gram并存入数据库,先从内部引证表中读取有效的目标文献及其标题摘要,
    将标题摘要按以下操作步骤进行处理:小写化、(分句、分词、去停)、对每一句形成
    1到N-gram,建立N-gram序号字典,将处理好的结果存入数据库
    :param ngram: 最高需要多少-gram
    :return: None
    """
    stopwords = get_stopwords()

    query_title_abs_keyword_keywordplus = 'SELECT c.*, GROUP_CONCAT(d.KEYWORD) AS `keyword` FROM ' \
                                          '(SELECT a.`paper_id`, b.`title`, b.`abs` FROM ' \
                                          '(SELECT `paper_id` FROM `cndblp_paper` WHERE `paper_id` IN ' \
                                          '(SELECT DISTINCT `paper_id` FROM `cndblp_inner_reference`) OR `paper_id` IN ' \
                                          '(SELECT DISTINCT `cited_paper_id` FROM `cndblp_inner_reference`)) AS a ' \
                                          'LEFT JOIN `cndblp_paper` AS b ON a.`paper_id` = b.`paper_id`) AS c ' \
                                          'LEFT JOIN ' \
                                          '(SELECT PAPER_ID,KEYWORD FROM `cndblp_keyword_plus` ' \
                                          'UNION ALL ' \
                                          'SELECT PAPER_ID,KEYWORD FROM cndblp_keyword) AS d ON c.`paper_id` = d.`PAPER_ID` ' \
                                          'GROUP BY c.paper_id'
    # query_title_abs = 'SELECT a.paper_id, b.title, b.abs FROM ' \
    #                   '(SELECT `paper_id` FROM `cndblp_paper` ' \
    #                   'WHERE `paper_id` IN (SELECT DISTINCT `paper_id` FROM `cndblp_inner_reference`) ' \
    #                   'OR `paper_id` IN (SELECT DISTINCT `cited_paper_id` FROM `cndblp_inner_reference`)) AS a ' \
    #                   'LEFT JOIN `cndblp_paper` AS b ' \
    #                   'ON a.paper_id = b.paper_id'
    # insert_title = 'INSERT INTO `cndblp_title_ngram` VALUES(%s, %s)'
    insert_abs = 'INSERT INTO `cndblp_abs_ngram` VALUES(%s, %s)'
    # insert_keyword = 'INSERT INTO `cndblp_keyword_ngram` VALUES(%s, %s)'
    insert_word = 'INSERT INTO `cndblp_ngram_list` VALUES(%s, %s)'
    insert_title_keyword = 'INSERT INTO `cndblp_title_keyword_ngram` VALUES(%s, %s)'

    title_list = []
    abs_list = []
    keyword_list = []
    word_dict = {}
    key = 1

    # Open two connections: one for the batched read, the other for
    # inserting rows while the read cursor is still streaming results
    con = get_db_connection('patent_thesis')
    con2 = get_db_connection('patent_thesis')

    cursor = con.cursor()
    cursor2 = con2.cursor()  # fixed: was con.cursor(), which left con2 unused

    # Fetch and process 1,000 rows per batch
    batch_size = 1000
    cursor.execute(query_title_abs_keyword_keywordplus)
    result = cursor.fetchmany(batch_size)

    i = len(result)
    while len(result) > 0:
        print('Processing records up to #{}'.format(i))

        # Process each batch of 1,000 rows one at a time
        for row in result:
            paper_id = row[0]
            title = row[1].lower()
            abstract = row[2]  # renamed from `abs`, which shadows the builtin
            keyword = row[3]

            # Some papers have no abstract or keywords; fall back to ''
            if abstract is not None:
                abstract = abstract.lower()
            else:
                abstract = ''

            if keyword is not None:
                keyword = keyword.lower()
            else:
                keyword = ''

            # Tokenize the title and remove stopwords (no sentence splitting needed)
            title = title.replace('"', '')
            title = [
                word for word in word_tokenize(title)
                if word not in stopwords and len(word) > 1
            ]
            title_ngram = []

            for n in range(1, ngram + 1):
                tmp = [' '.join(grams) for grams in ngrams(title, n)]
                title_ngram += tmp

            # Split the abstract into sentences, then tokenize and remove
            # stopwords; abstract n-grams must stay within sentence boundaries
            abs_ngram = []

            abstract = abstract.replace('"', '')
            for sentence in split_sentence(abstract):
                s = [
                    word for word in word_tokenize(sentence)
                    if word not in stopwords and len(word) > 1
                ]
                for n in range(1, ngram + 1):
                    tmp2 = [' '.join(grams) for grams in ngrams(s, n)]
                    abs_ngram += tmp2

            # Split the keyword field on commas; keywords need no n-gram expansion
            keyword = keyword.split(',')
            for _keyword in keyword:
                if _keyword not in word_dict:
                    word_dict[_keyword] = key
                    key += 1
                _keyword_index = word_dict[_keyword]
                keyword_list.append((paper_id, _keyword_index))

            # Deduplicate the n-grams
            title_ngram = set(title_ngram)
            abs_ngram = set(abs_ngram)

            for t_ngram in title_ngram:
                if t_ngram not in word_dict:
                    word_dict[t_ngram] = key
                    key += 1
                t_index = word_dict[t_ngram]
                title_list.append((paper_id, t_index))
            for a_ngram in abs_ngram:
                if a_ngram not in word_dict:
                    word_dict[a_ngram] = key
                    key += 1
                a_index = word_dict[a_ngram]
                abs_list.append((paper_id, a_index))

        cursor2.executemany(insert_title_keyword, keyword_list)
        cursor2.executemany(insert_title_keyword, title_list)
        cursor2.executemany(insert_abs, abs_list)

        # Clear the pending-insert lists before the next batch
        title_list.clear()
        abs_list.clear()
        keyword_list.clear()

        result = cursor.fetchmany(batch_size)
        i += len(result)

    print('Processing finished; inserting the n-gram index')
    word_list = []
    for key, value in word_dict.items():
        word_list.append((value, key))
    del word_dict

    cursor2.executemany(insert_word, word_list)
    con2.commit()  # persist the inserts in case autocommit is disabled

    con.close()
    con2.close()
    print('All done')
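
For reference, the per-sentence n-gram step above reduces to tokenizing, dropping stopwords, and joining sliding windows of 1 to ngram tokens. A minimal standalone sketch with NLTK (assuming its punkt tokenizer data is installed; the stopword set here is a placeholder, not the script's full list):

from nltk import word_tokenize
from nltk.util import ngrams

stopwords = {'a', 'the', 'of'}  # placeholder set
sentence = 'The continual reassessment method of dose finding'
tokens = [w for w in word_tokenize(sentence.lower())
          if w not in stopwords and len(w) > 1]
all_grams = []
for n in range(1, 4):  # 1-grams through 3-grams, matching ngram=3
    all_grams += [' '.join(g) for g in ngrams(tokens, n)]
print(all_grams)
# ['continual', 'reassessment', ..., 'continual reassessment method', ...]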
Example #3
                    dm_to_dm += 1
                    d_to_dm += 1
                else:
                    dm_to_d += 1
                    d_to_d += 1
            else:
                if candidate[0] in cited_paper_n_gram:
                    d_to_dm += 1
                else:
                    d_to_d += 1

        # result = cursor.fetchmany(1000)
        # meme propagation score: sticking factor over smoothed sparking factor
        cur_meme_score = (dm_to_dm / (d_to_dm + delta)) / ((dm_to_d + delta) /
                                                           (d_to_d + delta))
        meme_score[candidate[0]] = (cur_meme_score, candidate[1])

        tok = time.time()
        if cur_meme_score != 0:
            print(candidate, dm_to_dm, d_to_dm, dm_to_d, d_to_d,
                  cur_meme_score, tok - tick)

    return meme_score


if __name__ == '__main__':
    con = get_db_connection('patent_thesis')
    # candidates, total_frequency = get_candidates(con)
    # print(total_frequency)
    score = cal_meme_score(con, ['planar'], 3)
    con.close()
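
The score returned above follows the propagation-score form of the meme score: a sticking factor dm_to_dm / (d_to_dm + delta) divided by a sparking factor (dm_to_d + delta) / (d_to_d + delta). A worked toy example with made-up counts that respect the code's invariants (every dm_to_* increment also increments the matching d_to_* total):

delta = 3  # smoothing constant, as passed to cal_meme_score above
dm_to_dm, d_to_dm = 40, 50   # citations to meme-carrying papers: from carriers / from anyone
dm_to_d, d_to_d = 10, 900    # citations to meme-free papers: from carriers / from anyone
sticking = dm_to_dm / (d_to_dm + delta)          # 40 / 53  ~ 0.755
sparking = (dm_to_d + delta) / (d_to_d + delta)  # 13 / 903 ~ 0.0144
print(sticking / sparking)                       # ~ 52.4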
Example #4
                    file.write('"')
                    file.write('","'.join(list(map(str, value))))
                    file.write('"\n')
            score_result.clear()

        i += 1
    with open(output_path, 'a', encoding='utf-8') as file:
        for value in score_result:
            file.write('"')
            file.write('","'.join(list(map(str, value))))
            file.write('"\n')


if __name__ == '__main__':
    conf = 'health_statistics'
    con = get_db_connection(conf)
    print('Fetching candidate list')
    candidates = get_candidates(con)
    print(candidates[:5])
    print('Candidate list ready: {} candidates in total; computing meme scores'.format(len(candidates)))

    filename = '../results/meme/{}.csv'.format(conf)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(
            '"nid","ngram","dm_to_dm","d_to_dm","dm_to_d","d_to_d","ngram_occur_num","total_paper_num","meme_score"\n'
        )

    cal_meme_score(con, candidates, filename, delta=3)

    con.close()
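
The manual quoting in the loop above ('"' + '","'.join(...) + '"') reproduces what csv.writer does with QUOTE_ALL. A sketch of the equivalent, assuming score_result holds the same tuples the script writes:

import csv

score_result = [(1, 'planar', 40, 50, 10, 900, 7, 1000, 52.4)]  # made-up row
with open('scores.csv', 'a', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
    writer.writerows(score_result)  # emits "1","planar","40",... per row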