-
Notifications
You must be signed in to change notification settings - Fork 0
/
step4_build_bulk_for_es.py
172 lines (132 loc) · 6.18 KB
/
step4_build_bulk_for_es.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
from utils import read_pdf, listdir, org_years_dict, curlmd5, time_out_wrap, gen_files_path_from_dict, split_path, clean_text
import json, os, re, shutil, collections
from tqdm import tqdm
from multiprocessing import Pool
# from pathos.multiprocessing import Pool
import time, traceback
manual_processing_folder = '/home/d/github_work/find_papers_by_datasets/papers_data_by_hand'
cant_dw_list = ['Binarized Back-Propagation Training Binarized Neural Networks with Binarized Gradients.pdf',
'TV Advertisement Scheduling by Learning Expert Intentions.pdf',
'Whole Page Optimization with Global Constraints.pdf']
def _dumps_pdf(file_path):
def __dumps_pdf(file_path):
try:
json_body = {}
json_body['Organization'] = file_path.split('/')[4]
json_body['Name'] = file_path.split('/')[-1].split('.pdf')[0]
json_body['Year'] = file_path.split('/')[5].split('_')[1]
json_body['Text'] = read_pdf(file_path)
if json_body['Text'].startswith('ERROR '): return json_body['Text']
json_start = '{"index":{"_id":\"' + curlmd5(json_body['Name']) + '\"}}'
json_body = json_start + '\n' + json.dumps(json_body) + '\n'
return json_body
except:
traceback.print_exc()
return f'ERROR : {file_path}'
# add timeout err
return time_out_wrap(100, __dumps_pdf, file_path)
def dumps_pdf(org, years, file_paths, folder_name = 'pdf_es_json', use_multi = True, auto_gen = False):
file_paths = [i for i in file_paths if i.split('/')[-3] == org]
start_time = time.time()
res_json_text = []
res_err = []
if not os.path.exists(folder_name): os.mkdir(folder_name)
if use_multi:
pbar = tqdm(total = len(file_paths), ncols = 100, desc = f'multi mode: {org}')
with Pool(4) as pool:
pool_iter = pool.imap(_dumps_pdf, file_paths)
for i,r in enumerate(pool_iter):
if not r.startswith('ERROR '):
res_json_text.append(r)
else:
res_err.append(r)
print(r)
pbar.update()
# for test
else:
for file_path in tqdm(file_paths, ncols = 100, desc = f'single mode: {org}'):
r = _dumps_pdf(file_path)
if not r.startswith('ERROR '):
res_json_text.append(r)
else:
print(r)
# res
_years = '_'.join(years)
with open(f'./{folder_name}/{org}_{_years}.json', 'w', encoding='utf8') as f:
for i in res_json_text:
f.write(i)
# err
_years = '_'.join(years)
with open(f'./{folder_name}/ERR_{org}_{_years}.txt', 'w', encoding='utf8') as f:
for i in res_err:
f.write(i)
end_time = time.time()
print(f'time cost: {end_time - start_time:.1f}s.')
# auto-gen err pdf-txt pair files for mamul processing/
if auto_gen:
with open(f'./{folder_name}/ERR_{org}_{_years}.txt', 'r', encoding='utf8') as f:
t = f.read()
patten = re.compile(r'/home/d/(.*?).pdf') # papers_data/ACL/ACL_2018/xxxxx
res1 = patten.findall(t)
for path in res1:
folder, org, org_year, name = path.split('/')
pdf_name = name + '.pdf'
if pdf_name in cant_dw_list: continue
new_foolder = folder + '_by_hand'
if not os.path.exists(f'{new_foolder}'): os.makedirs(f'{new_foolder}')
_file_from = f'/home/d/{folder}/{org}/{org_year}/{pdf_name}'
_file_to = f'{new_foolder}/{org_year}@@@{pdf_name}'
shutil.copy(_file_from, _file_to)
txt_name = name + '.txt'
with open(f'{new_foolder}/{org_year}@@@{txt_name}', 'w') as f:
f.write('')
return None
def add_from_txt(txt_folder_name = r'..\..\papers_data'):
file_paths = []
listdir(txt_folder_name, file_paths, ['.txt'])
start_time = time.time()
res_json_text = []
org_years = collections.defaultdict(set) # for json name.
for txt_file_path in tqdm(file_paths, ncols = 100, desc = f'process txts'):
folder_name, org, org_year, file_name = split_path(txt_file_path)
name = file_name.split('.txt')[0]
year = org_year.split('_')[1]
json_body = {}
json_body['Organization'] = org
json_body['Name'] = name
json_body['Year'] = year
with open(txt_file_path, 'r', encoding='utf8') as f:
json_body['Text'] = clean_text(f.read())
json_start = '{"index":{"_id":\"' + curlmd5(json_body['Name']) + '\"}}'
json_body = json_start + '\n' + json.dumps(json_body) + '\n'
res_json_text.append([org, json_body])
org_years[org].add(year)
out_folder = 'txt2ES_json'
if not os.path.exists(out_folder): os.mkdir(out_folder)
for _org in org_years.keys():
org_years[_org] = '_'.join(sorted(org_years[_org]))
with open(f'./{out_folder}/{_org}_{org_years[_org]}.json', 'w', encoding='utf8') as f:
for __org, _json_body in res_json_text:
if __org == _org: f.write(_json_body)
end_time = time.time()
print(f'time cost: {end_time - start_time:.2f}s.')
return None
if __name__ == "__main__":
# check first
# from utils import check_all_pdfs
# check_all_pdfs(path = '/home/d/papers_data/')
# exit()
# from utils import check_all_pdfs
# bad_count = check_all_pdfs('/home/d/papers_data/')
# print(f'bad_count: {bad_count}')
# # all files' path
# file_paths = gen_files_path_from_dict(json_files_path = './url_json/', base_url = '../../papers_data')
# dumps_pdf('ACL', org_years_dict['ACL'], file_paths)
# dumps_pdf('AAAI', org_years_dict['AAAI'], file_paths)
# dumps_pdf('EMNLP', org_years_dict['EMNLP'], file_paths)
# dumps_pdf('ICLR', org_years_dict['ICLR'], file_paths)
# dumps_pdf('ICML', org_years_dict['ICML'], file_paths)
# dumps_pdf('KDD', org_years_dict['KDD'], file_paths)
# dumps_pdf('NIPS', org_years_dict['NIPS'], file_paths)
# dumps_pdf('IJCAI', org_years_dict['IJCAI'], file_paths)
add_from_txt()