def get_chapter_links(in_url, tsleep):
    '''Get book/chapter/title/url for every chapter linked from in_url.'''
    data = []
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        # each chapter link encodes the book (template) and chapter number
        p = (r'<a href="/index.php/\?m=bible&template=(.*?)&chapter=(.*?)" '
             '>(.*?)</a>')
        for x in re.findall(p, r.text):
            book, chapter, title = x
            url = ('{}/index.php/?m=bible&template={}&chapter={}'.format(
                host, book, chapter))
            out = {
                'book': book,
                'chapter': chapter,
                'title': title,
                'url': url,
            }
            data.append(out)
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = []
        # raise
    return data, error


def get_chapter_links(in_url, tsleep):
    '''Get book/chapter/title/url for every chapter linked from in_url.'''
    data = []
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        # the book id is carried in the query string of the input URL
        book = re.findall('&idlibroz=(.*?)$', in_url)[0]
        p = (r'<a id="ext" href="(.*?)" alt="altri capitoli del '
             'libro">(.*?)</a>')
        for x in re.findall(p, r.text):
            url, chapter = x
            y = re.findall(r'Libro=(.*?)&capitolo=(.*?)&', url)
            title, chapter = y[0]
            url = host + url
            out = {
                'book': book,
                'chapter': chapter,
                'title': title,
                'url': url,
            }
            data.append(out)
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = []
        # raise
    return data, error


def get_journal_links(in_url, tsleep):
    '''Get all journal links under input URL
    https://www.scijournal.org/list-of-impact-factor-journal_A.shtml
    '''
    data = {}
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        for a in text.xpath('//center/h2/a'):
            title = a.text
            url = a.get('href')
            if not url.startswith('impact-factor-of-'):
                continue
            url = urljoin(host, url)
            data[url] = title
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
    return data, error


def get_journal_links(in_url, tsleep):
    '''Get all journal links under input URL
    http://www.bioxbio.com/if/subject/{subject}-{n}.html
    '''
    data = {}
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        for a in text.xpath('//tr/td/a'):
            title = a.text
            url = a.get('href')
            url = urljoin(in_url, url)
            data[url] = title
    except Exception as e:
        error = '*ERROR* ' + str(e)
        # raise
        data = {}
    return data, error


def get_chapter_text(in_url, out_json, tsleep):
    '''Get volume/book/chapter and verse text from in_url, save to out_json.'''
    data = {}
    error = None
    if os.path.isfile(out_json):
        return data, error
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        data['url'] = in_url
        # breadcrumb "volume > book > chapter" in the page header
        pattern = (r'<span class="chapter_title"> (.*?) <i class="glyphicon '
                   'glyphicon-chevron-right "> </i> (.*?) <i class="glyphicon '
                   'glyphicon-chevron-right"> </i> (.*?)</span>')
        x = re.findall(pattern, r.text)
        if x:
            data['volume'] = x[0][0]
            data['book'] = x[0][1]
            data['chapter'] = x[0][2]
        else:
            data['volume'] = None
            data['book'] = None
            data['chapter'] = None
        phrase_set = set()
        data['content'] = []
        for x in text.xpath('//div[@id="bible_chapter_content"]/*'):
            if x.tag == 'p':
                t = {'vers': x.get('value'), 'text': x.text.split(' ', 1)[-1]}
            else:
                t = {'vers': '', 'text': x.text}
            if t['vers'] is None and t['text'] is None:
                raise Exception(
                    'can not extract content from "{}" (url={})'.format(
                        x.text, in_url))
            # avoid duplicate entry
            phrase = '|{}|{}|'.format(t['vers'], t['text'])
            if phrase not in phrase_set:
                data['content'].append(t)
                phrase_set.add(phrase)
        utils.write_json(data, out_json)
        data = {}
    except Exception as e:
        error = '*ERROR* ' + str(e)
        # raise
        data = {}
    return data, error


def print_csv_short_list():
    data = utils.load_json('../data/short_list/omim_diseases.json')
    fname_out = 'csv/omim_diseases.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')
    print('oid,name_cn,name_en', file=fout)
    for k, v in sorted(data.items()):
        print('OMIM:' + k,
              v['name_cn'].replace(',', ''),
              v['name_en'].replace(',', ''),
              sep=',', file=fout)
    fout.close()


def get_chapter_text(in_url, out_json, tsleep):
    '''Get book/chapter and verse text from in_url, save to out_json.'''
    data = {}
    error = None
    # if os.path.isfile(out_json):
    #     return data, error
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        data['url'] = in_url
        x = text.xpath('//div[@class="libro"]')
        data['book'] = x[0].text if x else None
        x = text.xpath('//div[@id="capitolo"]')
        data['chapter'] = x[0].text.strip() if x else None
        data['content'] = []
        el = text.xpath('//div[@class="testidellibro"]')[0]
        s = html.unescape(etree.tostring(el).decode('utf-8'))
        for line in s.split('\n'):
            # verse lines start with a <sup> verse-number anchor
            if not line.startswith('<sup>'):
                continue
            # x = re.findall(r'<sup>.*?<a .*?>(.*?)</a></sup>(.*?)$', line)
            x = re.findall(r'<a (.*?)>(.*?)</a></sup>(.*?)$', line)
            if not x:
                continue
            atag, vers, phrase = x[0]
            vers = re.findall(r'name="VER_(.*?)"', atag)[0]
            phrase = re.sub(r'<.*?>', ' ', phrase)
            phrase = re.sub(r'\t', ' ', phrase)
            phrase = re.sub(r' +', ' ', phrase)
            t = {'vers': vers, 'text': phrase.strip()}
            data['content'].append(t)
        utils.write_json(data, out_json)
        data = {}
    except Exception as e:
        error = '*ERROR* ' + str(e)
        # raise
        data = {}
    return data, error


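# Hedged usage sketch (not part of the original source): one way the chapter
# scraping could be driven end to end with either site's get_chapter_links /
# get_chapter_text pair (both pairs share the same signatures).  It assumes
# the utils.parallel_call(func, args_list, nproc, nretry) helper used by the
# journal scrapers below; `index_url` and `out_dir` are hypothetical inputs.
def crawl_all_chapters_sketch(index_url, out_dir, nproc=3, nretry=10,
                              tsleep=1):
    os.makedirs(out_dir, exist_ok=True)
    links, error = get_chapter_links(index_url, tsleep)
    if error:
        raise Exception(error)
    args_list = []
    for x in links:
        # one JSON file per chapter, named after book and chapter number
        out_json = '{}/{}_{}.json'.format(out_dir, x['book'], x['chapter'])
        args_list.append([x['url'], out_json, tsleep])
    # fetch every chapter in parallel, retrying failed downloads
    utils.parallel_call(get_chapter_text, args_list, nproc, nretry)

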
def get_all_journal_data(all_journal_dict, out_dir, nproc=3, nretry=10,
                         tsleep=1):
    '''Get data of all journals'''
    os.makedirs(out_dir, exist_ok=True)
    args_list = []
    for subject in sorted(all_journal_dict):
        for url in sorted(all_journal_dict[subject]):
            journal_abbrev = re.findall(r'/if/html/(.*?)\.html',
                                        url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])
            # if len(args_list) > 10:
            #     break
        # if len(args_list) > 10:
        #     break
    args_list = sorted(args_list, key=lambda x: x[0])
    # follow the next/previous links returned by get_journal_data() until no
    # new journal pages are discovered
    done_set = set()
    cycle = 0
    while True:
        tmp = utils.parallel_call(get_journal_data, args_list, nproc, nretry)
        cycle += 1
        todo_set = set()
        for x in tmp:
            for url in x['done']:
                done_set.add(url)
            for url in x['todo']:
                todo_set.add(url)
        todo_set = todo_set - done_set
        if len(todo_set) == 0:
            break
        args_list = []
        for url in sorted(todo_set):
            journal_abbrev = re.findall(r'/if/html/(.*?)\.html',
                                        url)[0].lower()
            out_json = '{}/{}.json'.format(out_dir, journal_abbrev)
            args_list.append([url, out_json, tsleep])
        utils.qprint('after cycle {}, get {} new journals'.format(
            cycle, len(args_list)))


def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    https://www.scijournal.org/impact-factor-of-ACM-SIGPLAN-NOTICES.shtml
    '''
    data = {}
    error = None
    if os.path.isfile(out_json):
        return data, error
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)
        data = {}
        data['url'] = in_url
        x = root.xpath('//h1')[0].text
        data['title'] = x.replace('Impact Factor', '').strip()
        x = re.findall(r'Journal Abbreviation: (.*?)<br>', r.text)
        data['title_abbrev'] = x[0] if x else None
        x = re.findall('Journal ISSN: (.*?)$', r.text, re.MULTILINE)
        data['issn'] = x[0] if x else None
        # impact factors: 2016/2017 plus the years 2008-2015
        data['impact'] = {}
        years = ['2016/2017']
        for i in range(2008, 2016):
            years.append(str(i))
        for year in years:
            x = re.findall(r'{} Impact Factor : (.*?)<br>'.format(year),
                           r.text)
            data['impact'][year] = x[0] if x else '-NA-'
        utils.write_json(data, out_json)
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
    return data, error


def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''
    subject_names = [
        'agriculture-and-forestry', 'astronomy', 'biology', 'chemistry',
        'engineering', 'environmental-science', 'geoscience', 'medicine',
        'math', 'management-science', 'physics', 'social-science',
    ]
    subject_list = [
        '{}/{}-journal-impact-factor-list.shtml'.format(host, x)
        for x in subject_names
    ]
    number_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in range(1, 91)
    ]
    alphabet_list = [
        '{}/list-of-impact-factor-journal_{}.shtml'.format(host, x)
        for x in ascii_uppercase
    ]
    url_set = set(subject_list)
    url_set.update(set(number_list))
    url_set.update(set(alphabet_list))
    args_list = []
    for url in sorted(url_set):
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_journal_links, args_list, nproc, nretry)
    data = {}
    for x in tmp:
        for k, v in x.items():
            data[k] = v
    utils.qprint('get urls of {} journals'.format(len(data)))
    return data


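# Hedged usage sketch (assumption, not from the original source): wiring the
# scijournal.org helpers above together.  get_all_journal_links() collects the
# journal URLs from the listing pages, then get_journal_data() is called once
# per journal through utils.parallel_call; `out_dir` is a hypothetical output
# directory.
def crawl_scijournal_sketch(out_dir, nproc=3, nretry=10, tsleep=1):
    os.makedirs(out_dir, exist_ok=True)
    links = get_all_journal_links(nproc, nretry, tsleep)
    args_list = []
    for url in sorted(links):
        # derive a per-journal file name from the URL basename
        journal = url.split('/')[-1].replace('.shtml', '').lower()
        out_json = '{}/{}.json'.format(out_dir, journal)
        args_list.append([url, out_json, tsleep])
    utils.parallel_call(get_journal_data, args_list, nproc, nretry)

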
def print_csv_full_data():
    fname_out = 'csv/full_list.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')
    print('oid,name_cn,name_en', file=fout)
    in_dir = '../data/full_data'
    for fname in sorted(os.listdir(in_dir)):
        oid_fname = fname.split('.')[1]
        db_type = None
        if fname.startswith('hp.'):
            db_type = 'HP'
        elif fname.startswith('omim.'):
            db_type = 'OMIM'
        fname = in_dir + '/' + fname
        data = utils.load_json(fname, verbose=False)
        oid, name_cn, name_en = None, None, None
        if db_type == 'HP':
            for x in data['results']:
                if x['hpoId'] != 'HP:' + oid_fname:
                    continue
                oid = x['hpoId']
                name_cn = ('{};{}'.format(x['name_cn'], x['definition_cn'])
                           .replace(',', ''))
                name_en = ('{};{}'.format(x['name_en'], x['definition_en'])
                           .replace(',', ''))
        elif db_type == 'OMIM':
            for x in data['results']:
                if str(x['mimNumber']) != oid_fname:
                    continue
                oid = 'OMIM:' + str(x['mimNumber'])
                name_cn = x['cnTitle'].replace(',', '')
                name_en = x['preTitle'].replace(',', '')
        if oid:
            print(oid, name_cn, name_en, sep=',', file=fout)
    fout.close()


def format_csv(timestamp):
    data_json = '../data/{}.subject.json'.format(timestamp)
    data_dir = '../data/{}'.format(timestamp)
    out_csv = '../data/{}.csv'.format(timestamp)
    data = utils.load_json(data_json, verbose=True)
    # map journal url -> subject
    url_with_subj = dict()
    for subj in data:
        for url in data[subj]:
            url_with_subj[url] = subj
    utils.qprint('writing ' + out_csv)
    fout = open(out_csv, 'w')
    for fname in os.listdir(data_dir):
        journal = fname.split('/')[-1].replace('.json', '').lower()
        fname = join(data_dir, fname)
        jdata = utils.load_json(fname, verbose=False)
        if jdata['url'] in url_with_subj:
            subj = url_with_subj[jdata['url']]
        else:
            subj = 'unknown'
        print(subj, journal,
              jdata['title'].replace(',', ''),
              jdata['impact']['2016/2017']['ifact'],
              jdata['impact']['2015']['ifact'],
              jdata['impact']['2014']['ifact'],
              jdata['impact']['2016/2017']['npub'],
              jdata['impact']['2015']['npub'],
              jdata['impact']['2014']['npub'],
              jdata['title_abbrev'].replace(',', ''),
              jdata['issn'].replace(',', ''),
              jdata['url'],
              sep=',', file=fout)
    fout.close()


def get_all_journal_links(nproc=3, nretry=10, tsleep=1):
    '''Get urls of all journals'''
    subject_list = [
        'biology', 'medicine', 'agriculture', 'chemistry', 'geoscience',
        'astronomy', 'engineering', 'management', 'environmental', 'math',
        'physics', 'social',
    ]
    data = {}
    for subject in subject_list:
        npage, error = get_subject_npage(subject)
        args_list = []
        for page in range(1, npage + 1):
            url = ('http://www.bioxbio.com/if/subject/{}-{}.html'.format(
                subject, page))
            args_list.append([url, tsleep])
        tmp = utils.parallel_call(get_journal_links, args_list, nproc,
                                  nretry)
        data[subject] = {}
        for x in tmp:
            for k, v in x.items():
                data[subject][k] = v
        utils.qprint('get urls of {} journals of subject {}'.format(
            len(data[subject]), subject))
    return data


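# Hedged usage sketch (assumption, not from the original source): the
# bioxbio.com crawl implied by the functions above -- collect the per-subject
# listing pages with get_all_journal_links(), then let get_all_journal_data()
# follow next/previous links until no new journals appear.  `out_dir` is a
# hypothetical output directory.
def crawl_bioxbio_sketch(out_dir, nproc=3, nretry=10, tsleep=1):
    all_journal_dict = get_all_journal_links(nproc, nretry, tsleep)
    get_all_journal_data(all_journal_dict, out_dir, nproc, nretry, tsleep)

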
def get_data(oid, out_json, proxy_set, tsleep):
    '''input Ontology ID, output in ../data
    '''
    if os.path.isfile(out_json):
        return None, None
    out_dir = abspath(dirname(out_json))
    os.makedirs(out_dir, exist_ok=True)
    # pick a random proxy for this request, if any are available
    proxies = None
    if len(proxy_set):
        proxy = random.choice(list(proxy_set))
        proxies = {'http': proxy}
    headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/39.0.2171.95 Safari/537.36'),
        'Authorization': token,
    }
    timeout_connect = 10
    timeout_read = 30
    timeout = (timeout_connect, timeout_read)
    # need to change
    url = None
    if oid.startswith('HP:'):
        url = host + '/hpo/?search={}&type=0&page=1'.format(oid)
    elif oid.startswith('OMIM:'):
        oid = oid.split(':')[1]
        url = host + '/omim/?search={}&type=1&page=1'.format(oid)
    else:
        raise Exception('input ID is not HP or OMIM')
    data = dict()
    error = None
    try:
        utils.qprint('get ' + url)
        r = requests.get(url, headers=headers, proxies=proxies,
                         timeout=timeout)
        time.sleep(tsleep)
        if not r.ok:
            raise Exception('can not get url: ' + url)
        data = json.loads(r.text)
        if len(data) != 4:
            raise Exception('output json seems incorrect (missing keys)')
        utils.write_json(data, out_json)
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = dict()
        # print(error)
        # raise
    return None, error


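# Hedged usage sketch (assumption, not from the original source): fetching a
# batch of ontology IDs with get_data() through utils.parallel_call, the same
# pattern the journal scrapers use.  `oid_list`, `out_dir` and `proxy_set` are
# hypothetical inputs; `host` and `token` must already be set at module level,
# as get_data() expects.  File names follow the '<db>.<id>.json' convention
# that print_csv_full_data() reads.
def crawl_ids_sketch(oid_list, out_dir, proxy_set=None, nproc=3, nretry=10,
                     tsleep=1):
    proxy_set = proxy_set or set()
    args_list = []
    for oid in sorted(oid_list):
        db, num = oid.lower().split(':')
        out_json = '{}/{}.{}.json'.format(out_dir, db, num)
        args_list.append([oid, out_json, proxy_set, tsleep])
    utils.parallel_call(get_data, args_list, nproc, nretry)

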
def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    http://www.bioxbio.com/if/html/{journal}.html
    and get URL of next/previous journals
    '''
    data = {'done': [], 'todo': []}
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)
        if not os.path.isfile(out_json):
            jdata = {}
            jdata['url'] = in_url
            jdata['title'] = root.xpath('//div/h1')[0].text
            x = re.findall(r'<p>Journal Abbreviation: (.*?)<br>', r.text)
            jdata['title_abbrev'] = x[0] if x else None
            x = re.findall(r'Journal ISSN: (.*?)</p>', r.text)
            jdata['issn'] = x[0] if x else None
            # impact-factor table: one row per year with (year, if, npub)
            jdata['impact'] = {}
            for tr in root.xpath('//table/tr'):
                td_list = tr.xpath('./td')
                if len(td_list) != 3:
                    continue
                year, ifact, npub = [x.text for x in td_list]
                if year == 'Year':
                    continue
                try:
                    ifact = float(ifact)
                except (TypeError, ValueError):
                    ifact = -1
                try:
                    npub = int(npub)
                except (TypeError, ValueError):
                    npub = -1
                jdata['impact'][year] = {
                    'ifact': ifact,
                    'npub': npub,
                }
            utils.write_json(jdata, out_json)
        data['done'].append(in_url)
        # get prev and next url
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-left"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' +
                            a[0].get('href'))
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-right"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' +
                            a[0].get('href'))
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {'done': [], 'todo': []}
    return data, error