def get_all_chapter_text(link_data, out_dir, tsleep=1, nproc=3, nretry=10):
    '''Get text of all chapters'''
    os.makedirs(out_dir, exist_ok=True)
    args_list = []
    for x in link_data:
        in_url = x['url']
        out_json = '{}/book{}.chapter{}.json'.format(
            out_dir, x['book'], x['chapter'])
        args_list.append([in_url, out_json, tsleep])
    utils.parallel_call(get_chapter_text, args_list, nproc=nproc,
                        nretry=nretry)
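# Each link_data record must carry the keys read above ('url', 'book',
# 'chapter'), e.g. (hypothetical values):
#   {'url': 'http://example.org/...', 'book': 1, 'chapter': 3}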
def get_all_journal_data(all_journal_dict, out_dir, nproc=3, nretry=10,
                         tsleep=1):
    '''Get data of all journals'''
    os.makedirs(out_dir, exist_ok=True)
    args_list = []
    for url in sorted(all_journal_dict):
        journal_abbrev = re.findall(
            r'/impact-factor-of-(.*?)\.shtml', url)[0].lower()
        out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
        args_list.append([url, out_json, tsleep])
    # keep a deterministic, url-sorted order for the workers
    args_list = sorted(args_list, key=lambda x: x[0])
    utils.parallel_call(get_journal_data, args_list, nproc, nretry)
def get_all_data(in_oid_list, out_dir, nproc=3, nretry=10, tsleep=1,
                 proxy_list=None):
    '''Get data of all oids'''
    # load proxies (first tab-separated column of each line), if a proxy
    # file was given
    proxy_set = set()
    if proxy_list is not None and os.path.isfile(proxy_list):
        for line in open(proxy_list):
            proxy_set.add(line.rstrip().split('\t')[0])
    # load oid list, one oid per line
    oid_list = []
    for line in open(in_oid_list):
        oid_list.append(line.rstrip())
    args_list = []
    for oid in oid_list:
        oid_name = oid.replace(':', '.').lower()
        out_json = '{}/{}.json.gz'.format(out_dir, oid_name)
        args_list.append([oid, out_json, proxy_set, tsleep])
    data = utils.parallel_call(get_data, args_list, nproc, nretry)
    return data
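# Example invocation (hypothetical file names), assuming a tab-separated proxy
# file whose first column is the proxy address:
#   get_all_data('oid_list.txt', 'data/', nproc=4, proxy_list='proxies.tsv')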
def get_all_journal_data(all_journal_dict, out_dir, nproc=3, nretry=10,
                         tsleep=1):
    '''Get data of all journals'''
    os.makedirs(out_dir, exist_ok=True)
    args_list = []
    for subject in sorted(all_journal_dict):
        for url in sorted(all_journal_dict[subject]):
            journal_abbrev = re.findall(
                r'/if/html/(.*?)\.html', url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])
    args_list = sorted(args_list, key=lambda x: x[0])
    # crawl in cycles: each call may report journal pages not seen before
    # ('todo'); keep going until no new journal pages are discovered
    done_set = set()
    cycle = 0
    while True:
        tmp = utils.parallel_call(get_journal_data, args_list, nproc, nretry)
        cycle += 1
        todo_set = set()
        for x in tmp:
            for url in x['done']:
                done_set.add(url)
            for url in x['todo']:
                todo_set.add(url)
        todo_set = todo_set - done_set
        if len(todo_set) == 0:
            break
        args_list = []
        for url in sorted(todo_set):
            journal_abbrev = re.findall(
                r'/if/html/(.*?)\.html', url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])
        utils.qprint('after cycle {}, get {} new journals'.format(
            cycle, len(args_list)))
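# The cycle loop above implies that get_journal_data returns a dict with at
# least the keys 'done' (urls fetched in this call) and 'todo' (newly
# discovered journal urls), e.g. (hypothetical values):
#   {'done': ['.../if/html/nature.html'], 'todo': ['.../if/html/science.html']}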
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):
    '''Get links of all chapters'''
    data = []
    args_list = []
    # one request per template id (1-73)
    for i in range(1, 74):
        url = '{}/index.php/?m=bible&template={}'.format(host, i)
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links, args_list, nproc=nproc,
                              nretry=nretry)
    # flatten the per-page lists of chapter links
    for x in tmp:
        for y in x:
            data.append(y)
    utils.write_json(data, out_json)
    return data
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):
    '''Get links of all chapters'''
    data = []
    args_list = []
    # one request per book id (1-73)
    for i in range(1, 74):
        url = ('{}/pls/labibbia_new/Bibbia_Utils.elenco_capitoli?'
               'origine=cei2008&idlibroz={}'.format(host, i))
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links, args_list, nproc=nproc,
                              nretry=nretry)
    # flatten the per-book lists of chapter links
    for x in tmp:
        for y in x:
            data.append(y)
    utils.write_json(data, out_json)
    return data
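# Typical pipeline (hypothetical paths), chaining the two steps defined above:
#   link_data = get_all_chapter_links('chapter_links.json')
#   get_all_chapter_text(link_data, 'chapters/')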
def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''
    subject_names = [
        'agriculture-and-forestry', 'astronomy', 'biology', 'chemistry',
        'engineering', 'environmental-science', 'geoscience', 'medicine',
        'math', 'management-science', 'physics', 'social-science',
    ]
    subject_list = [
        '{}/{}-journal-impact-factor-list.shtml'.format(host, x)
        for x in subject_names
    ]
    number_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in range(1, 91)
    ]
    alphabet_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in ascii_uppercase
    ]
    url_set = set(subject_list)
    url_set.update(set(number_list))
    url_set.update(set(alphabet_list))
    args_list = []
    for url in sorted(url_set):
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)
    data = {}
    for x in tmp:
        for k, v in x.items():
            data[k] = v
    utils.qprint('get urls of {} journals'.format(len(data)))
    return data
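# Assuming the returned dict is keyed by the per-journal
# impact-factor-of-*.shtml urls parsed by the flat get_all_journal_data
# variant above, the two can be chained (hypothetical output directory):
#   all_journal_dict = get_all_journal_links()
#   get_all_journal_data(all_journal_dict, 'journals/')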
def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''
    subject_list = [
        'biology', 'medicine', 'agriculture', 'chemistry', 'geoscience',
        'astronomy', 'engineering', 'management', 'environmental', 'math',
        'physics', 'social',
    ]
    data = {}
    for subject in subject_list:
        # each subject listing is paginated; fetch every page
        npage, error = get_subject_npage(subject)
        args_list = []
        for page in range(1, npage + 1):
            url = ('http://www.bioxbio.com/if/subject/{}-{}.html'.format(
                subject, page))
            args_list.append([url, tsleep])
        tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)
        data[subject] = {}
        for x in tmp:
            for k, v in x.items():
                data[subject][k] = v
        utils.qprint('get urls of {} journals of subject {}'.format(
            len(data[subject]), subject))
    return data
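# All of the functions above rely on utils.parallel_call(func, args_list,
# nproc, nretry), whose implementation is not shown here. The sketch below is
# only a hypothetical stand-in that matches how it is called above (a pool of
# nproc worker processes, each argument list unpacked into func, failed calls
# retried up to nretry times, results returned in submission order); it is not
# the project's actual utils module.
import multiprocessing


def _call_with_retry(func, args, nretry):
    # call func(*args), retrying on any exception up to nretry times
    for attempt in range(nretry):
        try:
            return func(*args)
        except Exception:
            if attempt == nretry - 1:
                raise


def parallel_call_sketch(func, args_list, nproc=3, nretry=10):
    # run func over every argument list in args_list using nproc processes
    with multiprocessing.Pool(nproc) as pool:
        results = [
            pool.apply_async(_call_with_retry, (func, args, nretry))
            for args in args_list
        ]
        return [r.get() for r in results]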