Example #1
import sys
from collections import defaultdict

_indexed_weights = None  # module-level cache (lives alongside wordmap/HYPERPARAMETERS in the original module)


def indexed_weights():
    """Return one noise-sampling weight per vocabulary word, computed once and cached."""
    global _indexed_weights
    if _indexed_weights is not None:
        return _indexed_weights
    print(len(wordmap.map), "=?=", HYPERPARAMETERS["VOCABULARY_SIZE"], file=sys.stderr)
    assert len(wordmap.map) == HYPERPARAMETERS["VOCABULARY_SIZE"]
    if HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 0:
        # Uniform noise: every word gets the same weight.
        _indexed_weights = [1 for word_id in range(len(wordmap.map))]
    elif HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"] == 1:
        # Unigram noise: weight each word by its smoothed corpus count.
        from common.json import load
        from common.file import myopen
        ngrams_file = HYPERPARAMETERS["NGRAMS"][(
            HYPERPARAMETERS["NGRAM_FOR_TRAINING_NOISE"],
            HYPERPARAMETERS["VOCABULARY_SIZE"])]
        print("Reading ngrams from", ngrams_file, "...", file=sys.stderr)
        ngramcnt = defaultdict(int)
        for (ngram, cnt) in load(myopen(ngrams_file)):
            assert len(ngram) == 1  # only unigram counts are expected here
            ngramcnt[ngram[0]] = cnt + HYPERPARAMETERS["TRAINING_NOISE_SMOOTHING_ADDITION"]
        _indexed_weights = [ngramcnt[wordmap.str(word_id)]
                            for word_id in range(len(wordmap.map))]
        # build() comes from the surrounding module in the original code.
        _indexed_weights = build(_indexed_weights)
    else:
        assert 0, "only 0-gram (uniform) and 1-gram noise are supported"
    return _indexed_weights
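
The weights returned above are typically consumed by a noise sampler that draws words in proportion to their smoothed unigram counts. A minimal sketch of that step, assuming the weights come back as a plain list of counts (the `build()` branch may wrap them differently) and using the standard-library `random.choices`; this is illustrative, not part of the original module:

import random

# Draw 5 noise-word ids, each with probability proportional to its weight.
weights = indexed_weights()
noise_ids = random.choices(range(len(weights)), weights=weights, k=5)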
Example #3

import json
import os
import re
import sys

# Latin ordinal suffixes used in French article numbering (bis, ter, quater, ...).
bister = '(un|duo|tre|bis|qua|quin[tqu]*|sex|sept|octo?|novo?|non|dec|vic|ter|ies)+'
# Matches paragraphs flagged as deleted ("supprimé(s)").
re_alin_sup = re.compile(r'supprimés?\)$', re.I)
# Strips leading enumeration markers (roman numerals, digits, letters, ordinals).
re_clean_alin = re.compile(r'^"?([IVXCDLM]+|\d+|[a-z]|[°)\-\.\s]+)+\s*((%s|[A-Z]+)[°)\-\.\s]+)*' % bister)
# Captures the first character so it can be upper-cased.
re_upper_first = re.compile(r'^(.)(.*)$')
step_id = ''
old_step_id = ''
for nstep, step in enumerate(steps):
    if 'resulting_text_directory' not in step:
        if step['stage'] not in ["promulgation", "constitutionnalité"]:
            sys.stderr.write("WARNING no directory found for step %s\n" % step['stage'])
        continue
    try:
        path = os.path.join(sourcedir, step['resulting_text_directory'])
        step_id = "%02d%s" % (nstep, step['directory'][2:])
        with open(os.path.join(path, 'texte.json'), "r") as texte:
            data = json.load(texte)

        # A step counts as a failure unless it was merely sent back to committee.
        echec = (step['echec'] and step['echec'] != "renvoi en commission")
        if echec:
            if 'echec' not in out['articles']:
                out['articles']['echec'] = {'id': 'echec', 'titre': step['echec'], 'section': 'echec', 'steps': []}
            next_step = create_step(step_id, step['directory'], echec_type=step['echec'])
            out['articles']['echec']['steps'].append(next_step)
            if 'echec' not in out['sections']:
                out['sections']['echec'] = {}
            out['sections']['echec'][step_id] = {'title': data['expose'], 'type': step['echec'].upper()}
            continue
        for section in data['sections']:
            if section['id'] not in out['sections']:
                out['sections'][section['id']] = {}
            out['sections'][section['id']][step_id] = {
                'title': section['titre'],
                'type': re_upper_first.sub(lambda x: x.group(1).upper() + x.group(2), section['type_section']),
            }
    # (Snippet truncated in the source; the except clause matching this try is not shown.)
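
For reference, a quick hypothetical demonstration of the helper patterns above (the sample strings are illustrative, not taken from the corpus):

re_alin_sup.search('(Alinéa supprimé)')    # matches: paragraph is flagged as deleted
re_clean_alin.sub('', '"II bis. - Après l\'article 2')
# -> "Après l'article 2" (leading enumeration marker stripped)
re_upper_first.sub(lambda x: x.group(1).upper() + x.group(2), 'chapitre')
# -> 'Chapitre' (first character upper-cased)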
    r'^"?([IVXCDLM]+|\d+|[a-z]|[°)\-\.\s]+)+\s*((%s|[A-Z]+)[°)\-\.\s]+)*' %
    bister)
re_upper_first = re.compile(r'^(.)(.*)$')
step_id = ''
old_step_id = ''
for nstep, step in enumerate(steps):
    if not 'resulting_text_directory' in step:
        if step['stage'] not in [u"promulgation", u"constitutionnalité"]:
            sys.stderr.write("WARNING no directory found for step %s\n" %
                             step['stage'])
        continue
    try:
        path = os.path.join(sourcedir, step['resulting_text_directory'])
        step_id = "%02d%s" % (nstep, step['directory'][2:])
        with open(os.path.join(path, 'texte.json'), "r") as texte:
            data = json.load(texte)

        echec = (step['echec'] and step['echec'] != "renvoi en commission")
        if echec:
            if not 'echec' in out['articles']:
                out['articles']['echec'] = {
                    'id': 'echec',
                    'titre': step['echec'],
                    'section': 'echec',
                    'steps': []
                }
            next_step = create_step(step_id,
                                    step['directory'],
                                    echec_type=step['echec'])
            out['articles']['echec']['steps'].append(next_step)
            if not 'echec' in out['sections']:
Example #5
            sims = index[tfidf[bow_corpus]]  # compute the similarity matrix
            i_cluster = graph_sim_matrix(sims, corr)
            # Map cluster indices back to paper ids (idx avoids shadowing the
            # gensim similarity `index` used above).
            author_cluster = [[author_papers[idx]['id'] for idx in l_inside]
                              for l_inside in i_cluster]
            # res_realx = {}
            # res_realx[author] = res_real[author]
            # print(author, 'pairwise-f1', pairwise_f1(res_realx, {author: author_cluster}))
            print(i, author, 'num papers', len(author_papers),
                  'num authors after disambiguation', len(author_cluster))
            res_dict[author] = author_cluster
    return res_dict
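
The `tfidf` model and similarity `index` consumed above are built earlier in the original script, outside this excerpt. A minimal sketch of how they are typically constructed with gensim (the variable `texts`, a list of token lists, is an assumption):

from gensim import corpora, models, similarities

# texts: list of token lists, one per paper (assumed input).
dictionary = corpora.Dictionary(texts)                 # token -> integer id
bow_corpus = [dictionary.doc2bow(t) for t in texts]    # bag-of-words vectors
tfidf = models.TfidfModel(bow_corpus)                  # tf-idf weighting
index = similarities.MatrixSimilarity(tfidf[bow_corpus],
                                      num_features=len(dictionary))  # cosine-similarity index
sims = index[tfidf[bow_corpus]]                        # pairwise similarity matrix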

if __name__ == "__main__":
    # read the test-set data
    train_author_data = json.load(
        open(train_author_path, 'r', encoding='utf-8'))
    train_pub_data = json.load(open(train_pub_path, 'r', encoding='utf-8'))

    # select a subset of the data for testing
    author_list = list(train_author_data.keys())
    author_selects = author_list[0:200]

    # manually labeled clustering results (ground truth)
    res_real = {}
    for author in author_selects:
        p_merge = []
        for plist in train_author_data[author].values():
            p_merge.append(plist)
        res_real[author] = p_merge

    papers = {}
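
The commented-out check in the function above refers to a pairwise_f1 helper that is not shown in this excerpt. A self-contained sketch of pairwise F1 between a true and a predicted clustering (the function name and single-author signature are assumptions):

from itertools import combinations

def pairwise_f1_single(true_clusters, pred_clusters):
    # A "pair" is any two papers placed in the same cluster.
    true_pairs = {frozenset(p) for c in true_clusters for p in combinations(c, 2)}
    pred_pairs = {frozenset(p) for c in pred_clusters for p in combinations(c, 2)}
    hit = len(true_pairs & pred_pairs)
    if not hit:
        return 0.0
    precision = hit / len(pred_pairs)
    recall = hit / len(true_pairs)
    return 2 * precision * recall / (precision + recall)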