def fix_it(timestamp, book_chapter, content_err_list, content_fix_list):
    fix_dir = '../data/{}.fix'.format(timestamp)
    os.makedirs(fix_dir, exist_ok=True)
    fin_json = '../data/{}/{}.json'.format(timestamp, book_chapter)
    fold_json = '{}/{}.old.json'.format(fix_dir, book_chapter)
    fnew_json = '{}/{}.new.json'.format(fix_dir, book_chapter)
    data = utils.load_json(fin_json)
    new_content = []
    content_changed = False
    for x in data['content']:
        has_issue = False
        for y, zz in zip(content_err_list, content_fix_list):
            if x['vers'] == y['vers'] and x['text'] == y['text']:
                has_issue = True
                for z in zz:
                    new_content.append(z)
                content_changed = True
        if not has_issue:
            new_content.append(x)
    if not content_changed:
        print(':: target problem is not found in {}'.format(fin_json),
              file=sys.stderr, flush=True)
        return
    qcmd('cp {} {}'.format(fin_json, fold_json))
    data['content'] = new_content
    utils.write_json(data, fnew_json)
    qcmd('cp {} {}'.format(fnew_json, fin_json))

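# Usage sketch for fix_it (the chapter name, verse number, and strings below
# are made-up placeholders, not real corrections): each entry in
# content_err_list is the exact {'vers', 'text'} record to replace, and the
# matching entry in content_fix_list is the list of records that replace it.
#
# fix_it('201804140057', 'Gen_1',
#        content_err_list=[{'vers': '1', 'text': 'verse with a typo'}],
#        content_fix_list=[[{'vers': '1', 'text': 'corrected verse'}]])
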
def get_chapter_text(in_url, out_json, tsleep):
    data = {}
    error = None
    if os.path.isfile(out_json):
        return data, error
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        data['url'] = in_url
        pattern = (r'<span class="chapter_title"> (.*?) <i class="glyphicon '
                   r'glyphicon-chevron-right "> </i> (.*?) <i class="glyphicon '
                   r'glyphicon-chevron-right"> </i> (.*?)</span>')
        x = re.findall(pattern, r.text)
        if x:
            data['volume'] = x[0][0]
            data['book'] = x[0][1]
            data['chapter'] = x[0][2]
        else:
            data['volume'] = None
            data['book'] = None
            data['chapter'] = None
        phrase_set = set()
        data['content'] = []
        for x in text.xpath('//div[@id="bible_chapter_content"]/*'):
            t = {'vers': None, 'text': None}
            if x.tag == 'p':
                t = {'vers': x.get('value'),
                     'text': x.text.split(' ', 1)[-1]}
            else:
                t = {'vers': '', 'text': x.text}
            if t['vers'] is None and t['text'] is None:
                raise Exception(
                    'can not extract content from "{}" (url={})'.format(
                        x.text, in_url))
            # avoid duplicate entry
            phrase = '|{}|{}|'.format(t['vers'], t['text'])
            if phrase not in phrase_set:
                data['content'].append(t)
                phrase_set.add(phrase)
        utils.write_json(data, out_json)
        data = {}
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        raise
    return data, error

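# Note (assumption about the page markup): each <p value="N"> inside
# bible_chapter_content appears to carry "N verse text", so split(' ', 1)[-1]
# above drops the leading verse number, e.g.
#   '1 In principio Dio creò il cielo e la terra.'.split(' ', 1)[-1]
#   -> 'In principio Dio creò il cielo e la terra.'
# The '|vers|text|' key then collapses verses repeated in the markup into a
# single content entry.
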
def get_chapter_text(in_url, out_json, tsleep):
    data = {}
    error = None
    # if os.path.isfile(out_json):
    #     return data, error
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        text = etree.HTML(r.text)
        data['url'] = in_url
        x = text.xpath('//div[@class="libro"]')
        data['book'] = x[0].text if x else None
        x = text.xpath('//div[@id="capitolo"]')
        data['chapter'] = x[0].text.strip() if x else None
        data['content'] = []
        el = text.xpath('//div[@class="testidellibro"]')[0]
        s = html.unescape(etree.tostring(el).decode('utf-8'))
        for line in s.split('\n'):
            if not line.startswith('<sup>'):
                continue
            # x = re.findall(r'<sup>.*?<a .*?>(.*?)</a></sup>(.*?)$', line)
            x = re.findall(r'<a (.*?)>(.*?)</a></sup>(.*?)$', line)
            if not x:
                continue
            atag, vers, phrase = x[0]
            vers = re.findall(r'name="VER_(.*?)"', atag)[0]
            phrase = re.sub(r'<.*?>', ' ', phrase)
            phrase = re.sub(r'\t', ' ', phrase)
            phrase = re.sub(r' +', ' ', phrase)
            t = {'vers': vers, 'text': phrase.strip()}
            data['content'].append(t)
        utils.write_json(data, out_json)
        data = {}
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
        raise
    return data, error

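# Quick check (sketch) of the verse-line regex above, run on a made-up line
# shaped like the CEI chapter markup this parser expects; only the regex
# behaviour is illustrated here, not the real page contents.
#
# line = ('<sup><a name="VER_3">3</a></sup>Dio disse: <i>Sia la luce!</i>'
#         ' E la luce fu.')
# atag, vers, phrase = re.findall(r'<a (.*?)>(.*?)</a></sup>(.*?)$', line)[0]
# # atag   -> 'name="VER_3"' (verse number pulled out via name="VER_...")
# # vers   -> '3'
# # phrase -> 'Dio disse: <i>Sia la luce!</i> E la luce fu.'  (tags stripped next)
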
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):
    data = []
    args_list = []
    for i in range(1, 74):
        url = '{}/index.php/?m=bible&template={}'.format(host, i)
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links, args_list,
                              nproc=nproc, nretry=nretry)
    for x in tmp:
        for y in x:
            data.append(y)
    utils.write_json(data, out_json)
    return data

def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from an input URL such as
    https://www.scijournal.org/impact-factor-of-ACM-SIGPLAN-NOTICES.shtml
    '''
    data = {}
    error = None
    if os.path.isfile(out_json):
        return data, error
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)
        data = {}
        data['url'] = in_url
        x = root.xpath('//h1')[0].text
        data['title'] = x.replace('Impact Factor', '').strip()
        x = re.findall(r'Journal Abbreviation: (.*?)<br>', r.text)
        data['title_abbrev'] = x[0] if x else None
        x = re.findall(r'Journal ISSN: (.*?)$', r.text, re.MULTILINE)
        data['issn'] = x[0] if x else None
        data['impact'] = {}
        years = ['2016/2017']
        for i in range(2008, 2016):
            years.append(str(i))
        for year in years:
            x = re.findall(r'{} Impact Factor : (.*?)<br>'.format(year), r.text)
            data['impact'][year] = x[0] if x else '-NA-'
        utils.write_json(data, out_json)
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = {}
    return data, error

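# Sanity check (sketch) of the scijournal regexes above on a hypothetical HTML
# fragment; the impact-factor figure is invented for illustration, not real
# data.
#
# fragment = ('Journal Abbreviation: ACM SIGPLAN NOTICES<br>'
#             'Journal ISSN: 0362-1340\n'
#             '2015 Impact Factor : 1.825<br>')
# re.findall(r'Journal Abbreviation: (.*?)<br>', fragment)     # ['ACM SIGPLAN NOTICES']
# re.findall(r'Journal ISSN: (.*?)$', fragment, re.MULTILINE)  # ['0362-1340']
# re.findall(r'2015 Impact Factor : (.*?)<br>', fragment)      # ['1.825']
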
def get_all_chapter_links(out_json, tsleep=1, nproc=3, nretry=10):
    data = []
    args_list = []
    for i in range(1, 74):
        url = ('{}/pls/labibbia_new/Bibbia_Utils.elenco_capitoli?'
               'origine=cei2008&idlibroz={}'.format(host, i))
        args_list.append([url, tsleep])
    tmp = utils.parallel_call(get_chapter_links, args_list,
                              nproc=nproc, nretry=nretry)
    for x in tmp:
        for y in x:
            data.append(y)
    utils.write_json(data, out_json)
    return data

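# Usage sketch: idlibroz runs 1..73, presumably one chapter-index page per
# book of the CEI 2008 canon; `host` and get_chapter_links are defined
# elsewhere in this script, and the output path below is illustrative.
#
# links = get_all_chapter_links('../data/201804140057.url.json', tsleep=1)
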
def test():
    # data, error = get_journal_links(
    #     # 'https://www.scijournal.org/list-of-impact-factor-journal_Z.shtml',
    #     'https://www.scijournal.org/agriculture-and-forestry-journal-impact-factor-list.shtml',
    #     0)
    # url = 'https://www.scijournal.org/impact-factor-of-NAT-REV-CANCER.shtml'
    # url = 'https://www.scijournal.org/impact-factor-of-HEALTH-INFORM-J.shtml'
    # data, error = get_journal_data(
    #     url,
    #     # 'https://www.scijournal.org/impact-factor-of-NATURE.shtml',
    #     'foo.json', 0)
    # print(json.dumps(data, sort_keys=True, indent=2))
    # print(len(data))
    # print(error)
    data = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
    utils.write_json(data, 'foo.json')

def get_data(oid, out_json, proxy_set, tsleep):
    '''input Ontology ID, output in ../data
    '''
    if os.path.isfile(out_json):
        return None, None
    out_dir = abspath(dirname(out_json))
    os.makedirs(out_dir, exist_ok=True)
    proxies = None
    if len(proxy_set):
        proxy = random.choice(list(proxy_set))
        proxies = {'http': proxy}
    headers = {
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/39.0.2171.95 Safari/537.36'),
        'Authorization': token,
    }
    timeout_connect = 10
    timeout_read = 30
    timeout = (timeout_connect, timeout_read)
    # need to change
    url = None
    if oid.startswith('HP:'):
        url = host + '/hpo/?search={}&type=0&page=1'.format(oid)
    elif oid.startswith('OMIM:'):
        oid = oid.split(':')[1]
        url = host + '/omim/?search={}&type=1&page=1'.format(oid)
    else:
        raise Exception('input ID is not HP or OMIM')
    data = dict()
    error = None
    try:
        utils.qprint('get ' + url)
        r = requests.get(url, headers=headers, proxies=proxies,
                         timeout=timeout)
        time.sleep(tsleep)
        if not r.ok:
            raise Exception('can not get url: ' + url)
        data = json.loads(r.text)
        if len(data) != 4:
            raise Exception('output json seems incorrect (missing keys)')
        utils.write_json(data, out_json)
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = dict()
        # print(error)
        # raise
    return None, error

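# Usage sketch for get_data: `host` and `token` (API endpoint and
# Authorization header) are module-level values defined elsewhere; the IDs and
# output paths below are illustrative. Results are written to out_json, and
# only an error string (or None) is returned.
#
# _, error = get_data('HP:0000118', '../data/HP_0000118.json',
#                     proxy_set=set(), tsleep=1)
# _, error = get_data('OMIM:143100', '../data/OMIM_143100.json',
#                     proxy_set=set(), tsleep=1)
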
def get_journal_data(in_url, out_json, tsleep):
    '''Get journal data (name, issn, impact factors) from input URL
    http://www.bioxbio.com/if/html/{journal}.html
    and get URL of next/previous journals
    '''
    data = {'done': [], 'todo': []}
    error = None
    try:
        utils.qprint('get ' + in_url)
        r = requests.get(in_url)
        if not r.ok:
            raise Exception('can not get url ' + in_url)
        time.sleep(tsleep)
        root = etree.HTML(r.text)
        if not os.path.isfile(out_json):
            jdata = {}
            jdata['url'] = in_url
            jdata['title'] = root.xpath('//div/h1')[0].text
            x = re.findall(r'<p>Journal Abbreviation: (.*?)<br>', r.text)
            jdata['title_abbrev'] = x[0] if x else None
            x = re.findall(r'Journal ISSN: (.*?)</p>', r.text)
            jdata['issn'] = x[0] if x else None
            jdata['impact'] = {}
            for tr in root.xpath('//table/tr'):
                td_list = tr.xpath('./td')
                if len(td_list) != 3:
                    continue
                year, ifact, npub = [x.text for x in td_list]
                if year == 'Year':
                    continue
                try:
                    ifact = float(ifact)
                except (TypeError, ValueError):
                    ifact = -1
                try:
                    npub = int(npub)
                except (TypeError, ValueError):
                    npub = -1
                jdata['impact'][year] = {
                    'ifact': ifact,
                    'npub': npub,
                }
            utils.write_json(jdata, out_json)
        data['done'].append(in_url)
        # get prev and next url
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-left"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' + a[0].get('href'))
        a = root.xpath('//div[@class="col-md-6 col-sm-12 text-right"]/a')
        data['todo'].append('http://www.bioxbio.com/if/html/' + a[0].get('href'))
    except Exception as e:
        error = '*ERROR* ' + str(e)
        data = []
    return data, error

def test():
    data, error = get_journal_data(
        'http://www.bioxbio.com/if/html/Z-PADAGOGIK.html', 'foo.json', 0)
    print(data)
    print(error)


if __name__ == '__main__':
    # timestamp = '201804140057'
    # catch_missing(timestamp)
    # test()
    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]
    out_dir = '../data/{}'.format(timestamp)
    subject_json = '../data/{}.subject.json'.format(timestamp)
    data = {}
    if os.path.isfile(subject_json):
        data = utils.load_json(subject_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, subject_json)
    get_all_journal_data(data, out_dir)

if __name__ == '__main__':
    # test()
    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]
    out_dir = '../data/{}'.format(timestamp)
    url_json = '../data/{}.url.json'.format(timestamp)
    data = {}
    if os.path.isfile(url_json):
        data = utils.load_json(url_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, url_json)
    get_all_journal_data(data, out_dir)