Code example #1
0
def rewrite_topic(args):
    """Re-emit per-time-slice topic listings for results found under args.odirs.

    For each output directory (filtered by ``select_outputs`` against
    ``args.target``) that contains a pickled ``result`` file, writes the
    topics of every time slice to ``topics_.txt`` inside that directory.
    Only the top level of each directory tree is scanned (``break`` after
    the first ``os.walk`` iteration).

    Args:
        args: namespace with ``odirs`` (roots to scan), ``target``
            (directory-name filter) and ``tags`` (used only to probe that
            the result object carries a ``tag`` attribute).
    """
    for dirname in get_dirnames(args.odirs):
        for root, output_dirs, _files in os.walk(dirname):
            for output_dir in output_dirs:
                if not select_outputs(output_dir, args.target):
                    continue
                output_path = os.path.join(root, output_dir)
                result_path = os.path.join(output_path, 'result')
                if not os.path.isfile(result_path):
                    continue
                result = loadpk(result_path)
                # Skip results saved without a ``tag`` attribute (or when
                # ``args.tags`` is not a container) — mirrors the original
                # intent but without a bare except.
                try:
                    result.tag in args.tags
                except (AttributeError, TypeError):
                    continue
                retopic_file = os.path.join(output_path, 'topics_.txt')
                for i, (tw_list, cohs) in enumerate(
                        zip(result.twlist, result.COHs)):
                    seg = '---------- topics in time {}  ----------'.format(i)
                    display_topics(tw_list=tw_list, cohs=cohs,
                                   head='topic', seg=seg,
                                   file=retopic_file)
            break  # only the top-level directory, do not descend
Code example #2
0
def get_all_results(dirnames, target=None, result_tags=('complete',)):
    """Collect ``(output_path, result)`` pairs with an accepted result tag.

    Scans the top level of each directory in ``dirnames`` (subdirectories
    are not descended into), keeps output directories accepted by
    ``select_outputs``, unpickles their ``result`` file, and retains the
    result when ``result.tag`` is one of ``result_tags``.

    Args:
        dirnames: iterable of root directories to scan.
        target: directory-name filter passed to ``select_outputs``.
        result_tags: accepted values of ``result.tag`` (immutable default
            replaces the original mutable-list default).

    Returns:
        list of ``(output_path, result)`` tuples; also prints a summary.
    """
    res = []
    for dirname in dirnames:
        for root, output_dirs, _files in os.walk(dirname):
            for output_dir in output_dirs:
                if not select_outputs(output_dir, target):
                    continue
                output_path = os.path.join(root, output_dir)
                result_path = os.path.join(output_path, 'result')
                if not os.path.isfile(result_path):
                    continue
                result = loadpk(result_path)
                try:
                    keep = result.tag in result_tags
                except AttributeError:
                    continue  # result has no ``tag`` attribute
                if keep:
                    res.append((output_path, result))
            break  # top level only, do not descend into subtrees
    print('get {} results as below.'.format(len(res)))
    for path, _ in res:
        print(path)
    print('  ')
    return res
Code example #3
0
def pickle2pdf(target_category='cs.LG'):
    """Download PDFs for papers pickled in ``data/arxiv/{year}_papers.pkl``.

    Papers are routed into ``data/arxiv/{LG|CL|LG_CL}/pdf/`` according to
    their own category list; download outcomes are appended to
    ``suc_ids.txt`` / ``faild_ids.txt`` and counted.

    NOTE(review): ``target_category`` is currently unused — routing is
    derived per paper; confirm whether it was meant to filter downloads.
    """
    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2020)
    ]
    print('filelist ready')
    num_suc = 0
    num_fail = 0
    # ``c`` is the per-paper target folder; it must stay global because
    # ``custom_slugify`` (called back by ax.download) reads it.
    global c
    c = ''
    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled paper loaded {}'.format(f_i))

        def custom_slugify(obj):
            # Name the downloaded file after its arXiv id, under the folder
            # chosen for the paper currently being processed.
            name = obj.get('id').split('/')[-1]
            return 'data/arxiv/{}/pdf/'.format(c) + name

        for paper in papers:
            arxiv_id = paper['arxivid']
            category = paper['categories']
            # Route into LG / CL / LG_CL buckets; a paper in neither
            # category keeps the previous ``c`` (original behavior).
            if 'cs.LG' in category:
                c = 'LG_CL' if 'cs.CL' in category else 'LG'
            elif 'cs.CL' in category:
                c = 'CL'

            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
                print('download {} {} succeed.'.format(arxiv_id, c))
                # ``with`` closes the file; the original's explicit
                # w.close() inside the block was redundant.
                with open('suc_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_suc += 1
            except Exception:
                print('----------download {} {} failed'.format(arxiv_id, c))
                with open('faild_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_fail += 1
    print('num_suc: {} , num_fail: {}'.format(num_suc, num_fail))
    return
Code example #4
0
def pickle2txt(target_category='cs.LG'):
    """Convert pickled arXiv metadata into a tab-separated training file.

    Appends one ``date<TAB>categories<TAB>abstract`` line per kept paper to
    ``data/arxiv/train_{target_category}.txt``.  Papers outside
    ``target_category`` and withdrawn papers are skipped and counted
    together in the "throw" tally.  Per-file and overall tallies are
    logged via ``my_print`` to ``data/arxiv/log.txt``.
    """
    flag = 'This article has been withdrawn'
    num_withdraw = 0
    num_abstract = 0
    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2020)
    ]
    data_path = os.path.join(os.getcwd(), 'data/arxiv')
    w = os.path.join(data_path, 'train_{}.txt'.format(target_category))
    log = os.path.join(data_path, 'log.txt')
    for file in filelist:
        per_num_withdraw = 0  # counts off-category AND withdrawn papers
        per_num_abstract = 0
        papers = loadpk(file)
        # ``with`` closes the file; the original's trailing writer.close()
        # on the already-closed handle was redundant.
        with open(w, 'a') as writer:
            for paper in papers:
                created = paper['created']
                day = datetime.date(created.year, created.month, created.day)

                category = paper['categories']
                if target_category not in category:
                    per_num_withdraw += 1
                    continue

                text = paper['abstract']
                # Withdrawn papers start with the standard notice;
                # startswith replaces the brittle text[:31] slice.
                if text.startswith(flag):
                    per_num_withdraw += 1
                    continue
                text = rm_seg(text)
                writer.write('{}\t{}\t{}\n'.format(day, category, text))
                per_num_abstract += 1
            my_print(
                'harvest {} papers and throw {} papers from {}.'.format(
                    per_num_abstract, per_num_withdraw, file), log)
            num_withdraw += per_num_withdraw
            num_abstract += per_num_abstract
    my_print(
        'harvest {} papers and throw {} papers from all file.'.format(
            num_abstract, num_withdraw), log)
Code example #5
0
def pickle2pdf(target_category='cs.LG'):
    """Smoke-test download of arXiv PDFs (at most 11 papers in total).

    Reads ``data/arxiv/2012_papers.pkl`` and downloads each paper's PDF
    into ``data/arxiv/{target_category}/pdf/``, naming files after the
    arXiv id.  The ``n > 10`` guard caps attempts across all pickle files
    so the routine stays a quick test.
    """
    filelist = [
        os.path.join(os.getcwd(), 'data/arxiv/{}_papers.pkl'.format(year))
        for year in range(2012, 2013)
    ]
    print('filelist ready')
    n = 0  # total papers attempted so far, across every pickle file
    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled paper loaded {}'.format(f_i))

        def custom_slugify(obj):
            # Target path: data/arxiv/<target_category>/pdf/<arxiv-id>
            name = obj.get('id').split('/')[-1]
            res = 'data/arxiv/{}/pdf/'.format(target_category) + name
            print(res)
            return res

        for paper in papers:
            if n > 10:
                break
            arxiv_id = paper['arxivid']
            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
            except Exception:
                print('download {} failed'.format(arxiv_id))
            n += 1
    return