import os
import datetime

import arxiv as ax  # old arxiv.py (pre-1.0) API: ax.query / ax.download

# Project-local helpers assumed to be defined elsewhere in this repo:
# get_dirnames, select_outputs, loadpk, display_topics, rm_seg, my_print.


def rewrite_topic(args):
    """Re-render the per-time-slice topic lists for every matching result."""
    dirnames = get_dirnames(args.odirs)
    for dirname in dirnames:
        for root, output_dirs, files in os.walk(dirname):
            for output_dir in output_dirs:
                if not select_outputs(output_dir, args.target):
                    continue
                output_path = os.path.join(root, output_dir)
                result_path = os.path.join(output_path, 'result')
                if not os.path.isfile(result_path):
                    continue
                result = loadpk(result_path)
                try:
                    if result.tag not in args.tags:
                        continue
                except AttributeError:
                    # Result was pickled without a tag attribute; skip it.
                    continue
                retopic_file = os.path.join(output_path, 'topics_.txt')
                for i, (tw_list, cohs) in enumerate(zip(result.twlist,
                                                        result.COHs)):
                    seg = '---------- topics in time {} ----------'.format(i)
                    display_topics(tw_list=tw_list, cohs=cohs, head='topic',
                                   seg=seg, file=retopic_file)
            # Only scan the top level of each dirname.
            break
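# --- Hedged usage sketch (assumption, not part of the original pipeline) ---
# rewrite_topic(args) only reads args.odirs, args.target and args.tags, so a
# minimal argparse front end could look like this; the flag names and
# defaults below are illustrative guesses.
def build_rewrite_parser():
    import argparse
    parser = argparse.ArgumentParser(description='re-render topic files')
    parser.add_argument('--odirs', nargs='+', default=['outputs'],
                        help='directories to scan for output folders')
    parser.add_argument('--target', default=None,
                        help='passed through to select_outputs')
    parser.add_argument('--tags', nargs='+', default=['complete'],
                        help='result.tag values to process')
    return parser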
def get_all_results(dirnames, target=None, result_tags=('complete',)):
    """Collect (output_path, result) pairs whose result.tag is in result_tags."""
    res = []
    for dirname in dirnames:
        for root, output_dirs, files in os.walk(dirname):
            for output_dir in output_dirs:
                if not select_outputs(output_dir, target):
                    continue
                output_path = os.path.join(root, output_dir)
                result_path = os.path.join(output_path, 'result')
                if not os.path.isfile(result_path):
                    # No result file in this output directory.
                    continue
                result = loadpk(result_path)
                try:
                    if result.tag in result_tags:
                        res.append((output_path, result))
                except AttributeError:
                    # Result was pickled without a tag attribute; skip it.
                    pass
            # Only scan the top level of each dirname.
            break
    print('got {} results as below.'.format(len(res)))
    for output_path, _ in res:
        print(output_path)
    print('')
    return res
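# --- Hedged usage sketch (assumption) ---
# Typical call pattern for get_all_results; 'outputs', 'arxiv' and the tag
# tuple are illustrative values, not fixed anywhere in this module.
def _demo_collect_results():
    dirnames = get_dirnames(['outputs'])
    results = get_all_results(dirnames, target='arxiv',
                              result_tags=('complete',))
    for output_path, result in results:
        print('{}: {} time slices'.format(output_path, len(result.twlist)))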
def pickle2pdf(target_category='cs.LG'):
    """Download PDFs for pickled arXiv papers, bucketed by category overlap."""
    filelist = []
    for year in range(2012, 2020):
        file = os.path.join(os.getcwd(),
                            'data/arxiv/{}_papers.pkl'.format(year))
        filelist.append(file)
    print('filelist ready')
    num_suc = 0
    num_fail = 0
    c = ''  # current category bucket, read by custom_slugify via closure

    def custom_slugify(obj):
        # Save each PDF under data/arxiv/<bucket>/pdf/<arxiv id>.
        name = obj.get('id').split('/')[-1]
        return 'data/arxiv/{}/pdf/'.format(c) + name

    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled papers loaded {}'.format(f_i))
        for paper in papers:
            arxiv_id = paper['arxivid']
            category = paper['categories']
            if 'cs.LG' in category:
                c = 'LG_CL' if 'cs.CL' in category else 'LG'
            elif 'cs.CL' in category:
                c = 'CL'
            else:
                # Neither cs.LG nor cs.CL; skip this paper.
                continue
            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
                print('download {} {} succeeded.'.format(arxiv_id, c))
                with open('suc_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_suc += 1
            except Exception:
                print('----------download {} {} failed'.format(arxiv_id, c))
                with open('failed_ids.txt', 'a') as w:
                    w.write('{}\t{}\n'.format(arxiv_id, c))
                num_fail += 1
    print('num_suc: {} , num_fail: {}'.format(num_suc, num_fail))
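# --- Hedged setup sketch (assumption) ---
# The old arxiv.py download() writes to the path returned by the slugify
# callback, so the bucket directories used by pickle2pdf presumably need to
# exist beforehand. This helper is not in the original code.
def ensure_pdf_dirs(buckets=('LG', 'CL', 'LG_CL')):
    for bucket in buckets:
        os.makedirs('data/arxiv/{}/pdf'.format(bucket), exist_ok=True)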
def pickle2txt(target_category='cs.LG'):
    """Dump abstracts of target-category papers to a tab-separated text file."""
    flag = 'This article has been withdrawn'
    num_withdraw = 0
    num_abstract = 0
    filelist = []
    for year in range(2012, 2020):
        file = os.path.join(os.getcwd(),
                            'data/arxiv/{}_papers.pkl'.format(year))
        filelist.append(file)
    data_path = os.path.join(os.getcwd(), 'data/arxiv')
    w = os.path.join(data_path, 'train_{}.txt'.format(target_category))
    log = os.path.join(data_path, 'log.txt')
    for file in filelist:
        per_num_withdraw = 0
        per_num_abstract = 0
        papers = loadpk(file)
        with open(w, 'a') as writer:
            for paper in papers:
                time = paper['created']
                time = datetime.date(time.year, time.month, time.day)
                category = paper['categories']
                if target_category not in category:
                    per_num_withdraw += 1
                    continue
                text = paper['abstract']
                if text.startswith(flag):
                    # Withdrawn paper; its abstract is just the notice.
                    per_num_withdraw += 1
                    continue
                text = rm_seg(text)
                writer.write('{}\t{}\t{}\n'.format(time, category, text))
                per_num_abstract += 1
        my_print('harvested {} papers and discarded {} papers from {}.'.format(
            per_num_abstract, per_num_withdraw, file), log)
        num_withdraw += per_num_withdraw
        num_abstract += per_num_abstract
    my_print('harvested {} papers and discarded {} papers from all files.'.format(
        num_abstract, num_withdraw), log)
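# Each line written by pickle2txt has the form (illustrative example):
#   <created date>\t<categories>\t<cleaned abstract>
#   2015-06-04\tcs.LG stat.ML\tWe propose a method for ...
# where rm_seg (project helper) presumably strips line breaks from the
# abstract so that each record stays on a single line.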
def pickle2pdf_test(target_category='cs.LG'):
    """Small-sample variant of pickle2pdf: at most ~10 downloads from 2012.

    Renamed from pickle2pdf so it no longer shadows the full version above.
    """
    filelist = []
    for year in range(2012, 2013):
        file = os.path.join(os.getcwd(),
                            'data/arxiv/{}_papers.pkl'.format(year))
        filelist.append(file)
    print('filelist ready')
    n = 0

    def custom_slugify(obj):
        # Save each PDF under data/arxiv/<target_category>/pdf/<arxiv id>.
        name = obj.get('id').split('/')[-1]
        res = 'data/arxiv/{}/pdf/'.format(target_category) + name
        print(res)
        return res

    for f_i, file in enumerate(filelist):
        papers = loadpk(file)
        print('pickled papers loaded {}'.format(f_i))
        for paper in papers:
            if n > 10:
                # Cap the number of downloads for this smoke test.
                break
            arxiv_id = paper['arxivid']
            try:
                d_paper = ax.query(id_list=[arxiv_id])[0]
                ax.download(d_paper, slugify=custom_slugify)
            except Exception:
                print('download {} failed'.format(arxiv_id))
            n += 1
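# --- Hedged driver (assumption): wiring the pieces above together. All of
# these hit the arXiv API and write under data/arxiv/, so the long-running
# steps are left commented out by default.
if __name__ == '__main__':
    ensure_pdf_dirs()
    pickle2pdf_test()       # smoke-test the download path on ~10 papers
    # pickle2pdf()          # full download across 2012-2019 (long-running)
    # pickle2txt('cs.LG')   # abstracts -> data/arxiv/train_cs.LG.txt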