def get_captcha(session, captcha_url):
    resp = WebRequest.get_response(captcha_url, session=session)
    im = Image.open(io.BytesIO(resp.content))
    im = im.convert('RGB')

    # binarize the image: keep strongly red pixels as black, everything else as white
    pixdata = im.load()
    width, height = im.size
    for x in range(width):
        for y in range(height):
            rgb = pixdata[x, y]
            if (rgb[0] - rgb[1] > 73) and (rgb[0] - rgb[2] > 73):
                pixdata[x, y] = (0, 0, 0)
            else:
                pixdata[x, y] = (255, 255, 255)

    captcha = pytesseract.image_to_string(im).strip()
    if len(captcha) != 4:
        return get_captcha(session, captcha_url)

    # verify the OCR result against the server
    payload = util.query_payload(tryCode=captcha)
    funding_url = 'http://output.nsfc.gov.cn/baseQuery/data/supportQueryResultsData'
    resp = WebRequest.get_response(funding_url, method='POST', session=session, json=payload)
    if resp.json()['message'] != '验证码错误':    # '验证码错误' means "wrong captcha"
        click.secho('right captcha: {}'.format(captcha), fg='green', bold=True)
        return captcha

    click.secho('wrong captcha: {}'.format(captcha), fg='yellow')
    return get_captcha(session, captcha_url)
def deal_captcha(self, url, outdir, filename, chunk_size):
    """deal with the captcha"""
    soup = WR.get_soup(url)
    img = soup.select_one('#captcha').attrs['src']
    img_url = url.rsplit('/', 3)[0] + img
    self.logger.warning(f'need to type in the captcha: {img_url}')

    if not os.getenv('DISPLAY'):
        # no display available, so the image cannot be shown locally
        self.logger.info(f'failed to open the picture, you can open it in your browser: {img_url}')
    else:
        content = WR.get_response(img_url, max_try=1).content
        im = Image.open(io.BytesIO(content))
        # im.show()  # this would block the program
        im.save('tmp.png')

        # show the captcha in non-blocking mode
        pylab.ion()
        img = pylab.imread('tmp.png')
        pylab.imshow(img)
        pylab.show()

    while True:
        answer = click.prompt('please input the captcha')
        if answer == 'new':
            return self.download(url, outdir=outdir, filename=filename, chunk_size=chunk_size)
        payload = {'id': img_url.split('/')[-1].split('.')[0], 'answer': answer}
        self.logger.debug(payload)
        resp = WR.get_response(url, method='POST', stream=True, data=payload)
        if resp.headers['Content-Type'] == 'application/pdf':
            pylab.close()
            return resp
        self.logger.warning('bad captcha, try again!')
def do_search(self, url, requires, **kwargs):
    """search the funding database, yielding results page by page"""
    if not kwargs.get('ratifyNo'):
        # prompt for any required field that was not supplied
        for key, name in requires.items():
            if not kwargs.get(key):
                kwargs[key] = click.prompt('请输入{}'.format(name))

    payload = util.query_payload(**kwargs)
    data = WebRequest.get_response(url, method='POST', json=payload,
                                   session=self.session, max_try=20).json()
    if data['code'] != 200:
        print(payload)
        click.secho('error code: {}'.format(json.dumps(data, ensure_ascii=False)), fg='red')
        exit()
        # retry path, unreachable while the exit() above is in place
        for each in self.do_search(url, requires, **payload):
            yield each
    else:
        yield data
        # more results than one page can hold: move on to the next page
        total = data['data']['iTotalRecords']
        if (payload['pageNum'] + 1) * payload['pageSize'] < total:
            payload['pageNum'] += 1
            click.secho('>>> crawling {code}-{projectType}-{ratifyYear} page {page} ...'.format(
                page=payload['pageNum'] + 1, **payload), fg='cyan')
            for each in self.do_search(url, requires, **payload):
                yield each
def elink(self, ids, dbfrom='pubmed', cmd='neighbor', **kwargs):
    """
    https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink

    > - get cited (`{"linkname": "pubmed_pubmed_citedin"}`)
    >> elink.fcgi?dbfrom=pubmed&db=pubmed&id=20210808&cmd=neighbor&retmode=json
    > - get pdf url
    >> elink.fcgi?dbfrom=pubmed&db=pubmed&cmd=prlinks&id=10210801

    cmds:
        - neighbor (default)
        - neighbor_score
        - neighbor_history
        - acheck
        - ncheck
        - lcheck
        - llinks
        - llinkslib
        - prlinks
    """
    url = self.base_url + 'elink.fcgi'
    params = self.parse_params(retmode='json', id=ids, dbfrom=dbfrom, cmd=cmd, **kwargs)
    result = WebRequest.get_response(url, params=params).json()
    return result
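# A minimal usage sketch for elink() above. The client class name `Eutils` is
# an assumption (any class exposing base_url/parse_params would do); the
# 'linksets'/'linksetdbs' keys follow the E-utilities JSON response layout.
client = Eutils()
result = client.elink('20210808', db='pubmed', linkname='pubmed_pubmed_citedin')
for linkset in result.get('linksets', []):
    for linksetdb in linkset.get('linksetdbs', []):
        # e.g. 'pubmed_pubmed_citedin' -> PMIDs of citing articles
        print(linksetdb['linkname'], linksetdb.get('links', []))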
def conclusion_project(self, projectid):
    """details of a concluded project (结题项目详情)"""
    url = self.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
    data = WebRequest.get_response(url).json()['data']
    return data
def support_types(self):
    """funding support types (资助类别)"""
    # url = self.base_url + '/common/data/supportTypeData'  # includes sub-types
    url = self.base_url + '/common/data/supportTypeClassOneData'  # top-level types only
    return WebRequest.get_response(url).json()['data']
def get_mim2gene(self, outfile=None):
    url = self.omim_url + '/static/omim/data/mim2gene.txt'
    resp = WR.get_response(url, stream=True)
    if outfile:
        with open(outfile, 'wb') as out:
            for chunk in resp.iter_content(chunk_size=512):
                out.write(chunk)
        self.logger.debug(f'save file: {outfile}')
    else:
        return resp.text
def list_provinces(self):
    """list of provinces (省份列表)"""
    self.logger.debug('list provinces ...')
    soup = WR.get_soup(self.index_url)
    province_list = [
        each.attrs['value']
        for each in soup.select('#province_main option[value!=""]')
    ]
    return province_list
def search(self, term, max_try=3):
    """
    term: URL, PMID, DOI or search string
    return: the url of the pdf
    """
    soup = WR.get_soup(self.url)
    form = soup.select_one('form[method="POST"]')
    post_url = self.url if form.attrs['action'] == '/' else form.attrs['action']
    payload = {'sci-hub-plugin-check': '', 'request': term}
    self.logger.debug(f'search pdf url for: {term}')

    while max_try:
        max_try -= 1
        soup = WR.get_soup(post_url, method='POST', data=payload, timeout=self.timeout)
        pdf = soup.select_one('#pdf')
        if 'article not found' in soup.text:
            self.logger.warning(f'article not found [{term}]')
            return
        elif not pdf:
            continue
        pdf_url = pdf.attrs['src']
        if pdf_url.startswith('//'):
            pdf_url = post_url.split(':')[0] + f':{pdf_url}'
        self.logger.info(f'pdf url: {pdf_url}')
        return pdf_url

    self.logger.error(f'your search has no result, please check! [{term}]')
def fetch_factor(*kws):
    for kw in kws:
        url = BASE_URL + kw
        resp = WebRequest.get_response(url)
        soup = WebRequest.get_soup(resp)
        context = {}
        trs = soup.select('table tr')
        if len(trs) > 2:
            print('multiple results for kw: {}'.format(kw))
        elif len(trs) < 2:
            print('no result for kw: {}'.format(kw))
        else:
            # skip the first two columns; the rest map year -> impact factor
            title = [th.text for th in trs[0].find_all('th')[2:]]
            values = [td.text for td in trs[1].find_all('td')[2:]]
            if values[-1]:
                context['factor_history'] = json.dumps(dict(zip(title, values)))
                context['factor'] = values[-1]
                context['kw'] = kw
        # return as soon as one keyword yields a result
        if context:
            return context
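# A hypothetical call of fetch_factor() above, assuming BASE_URL points at an
# impact-factor lookup page that renders each hit as an HTML table row (as the
# parser expects). The returned dict holds 'kw', 'factor' and 'factor_history'.
info = fetch_factor('Bioinformatics', 'Bioinformatics (Oxford, England)')
if info:
    print(info['kw'], info['factor'])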
def download(self, url, outdir='.', filename=None, chunk_size=512,
             overwrite=None, show_progress=True):
    """download pdf from url"""
    filename = filename or os.path.basename(url).split('#')[0]
    if outdir != '.' and not os.path.exists(outdir):
        os.makedirs(outdir)
    outfile = os.path.join(outdir, filename)

    if os.path.isfile(outfile) and os.stat(outfile).st_size > 0:
        if not isinstance(overwrite, bool):
            overwrite = click.confirm('The file already exists, do you want to overwrite it?')
        if overwrite:
            self.logger.debug(f'overwriting the file: {outfile}')
        else:
            self.logger.debug(f'skip downloading file: {outfile}')
            return True

    resp = WR.get_response(url, stream=True)
    if resp.headers['Content-Type'] != 'application/pdf':
        # not a pdf: the site is asking for a captcha
        resp = self.deal_captcha(url, outdir, filename, chunk_size)

    length = int(resp.headers.get('Content-Length'))
    self.logger.info(f'downloading pdf: {outfile} [{length/1024/1024:.2f} M]')
    bar = click.progressbar(length=length, label='downloading',
                            show_percent=True, show_pos=True, show_eta=True)
    with open(outfile, 'wb') as out, bar:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            out.write(chunk)
            if show_progress:
                bar.update(chunk_size)
    self.logger.info(f'save file: {outfile}')
    return True
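# How search() and download() above might be chained; a sketch only -- the
# class name `SciHub` and its constructor arguments are assumptions here.
sh = SciHub()
pdf_url = sh.search('10.1038/nbt.3820')   # a DOI, PMID, URL or free-text query
if pdf_url:
    sh.download(pdf_url, outdir='pdf', filename='nbt.3820.pdf', overwrite=True)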
def einfo(self, **kwargs):
    """
    https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EInfo

    > - show all database list
    >> einfo.fcgi?db=
    > - show dbinfo for given database
    >> einfo.fcgi?db=pubmed
    """
    url = self.base_url + 'einfo.fcgi'
    params = self.parse_params(retmode='json', **kwargs)
    info = WebRequest.get_response(url, params=params, allowed_codes=[200, 400]).json()
    return info
def check_host():
    """checking available urls of Sci-Hub"""
    url = 'https://lovescihub.wordpress.com/'
    soup = WR.get_soup(url)
    text = soup.select_one('.entry-content p:nth-child(2)').text
    update_text = soup.select_one('.entry-title').text
    update_time = date_parse(re.findall(r'Last check time:(.+?)\)', update_text)[0])
    hosts = dict(re.findall(r'(http.+?)\s+(.+?s)', text))
    return hosts, update_time
def list_support_types(self):
    """list of support types (项目类别列表)

    Bug: the page displays '应急管理项目',
         but the real value should be '科学部主任基金项目/应急管理项目'
    """
    self.logger.debug('list support types ...')
    soup = WR.get_soup(self.index_url)
    subcategory_list = []
    for option in soup.select('select#subcategory option')[1:]:
        if option.text == '应急管理项目':
            text = '科学部主任基金项目/应急管理项目'
        else:
            text = option.text
        subcategory_list.append(text)
    return subcategory_list
def search_page(self, params, payload):
    """query one result page (查询页面)"""
    self.logger.debug(f'searching for: {payload} [page: {params["currentpage"]}]')
    while True:
        soup = WR.get_soup(self.search_url, method='POST', params=params, data=payload)
        if not soup.select_one('#dict div b'):
            self.logger.warning(f'{soup.text}')
            if '需要先注册登录' in soup.text:    # "login required"
                exit()
            time.sleep(30)
            continue
        time.sleep(random.randint(5, 10))
        return soup
def efetch(self, ids, batch_size=5, **kwargs):
    """
    https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch

    > - fetch from a database for given ids
    >> efetch.fcgi?db=pubmed&id=1,2,3
    """
    url = self.base_url + 'efetch.fcgi'
    self.logger.info('fetching start: total {}, batch_size: {}'.format(len(ids), batch_size))
    for n in range(0, len(ids), batch_size):
        _id = ','.join(ids[n:n + batch_size])
        self.logger.debug(f'fetching xml: {n+1} - {n+batch_size}')
        params = self.parse_params(id=_id, retmode='xml')
        xml = WebRequest.get_response(url, params=params).text
        self.logger.debug(f'parsing xml: {n+1} - {n+batch_size}')
        for context in xml_parser.parse(xml):
            article = Article(**context)
            yield article
def list_codes(self):
    self.logger.debug('list subject codes ...')
    url = self.base_url + '/js/nsfctags2019multiple.js'
    resp = WR.get_response(url)
    codes = defaultdict(list)
    for line in resp.text.split('\n'):
        if line.startswith('subtag['):
            linelist = line.strip().split("', '")
            subject = linelist[2]    # division (学部)
            code1 = linelist[3]      # level-1 code, e.g. A01
            code2 = linelist[4]      # level-2 code, e.g. A0101
            code3 = linelist[5]      # level-3 code, e.g. A010101
            # name = linelist[6].split("'")[0]    # subject name
            if code1 not in codes[subject]:
                codes[subject].append(code1)
            if code2 not in codes[code1]:
                codes[code1].append(code2)
            if code3 not in codes[code2]:
                codes[code2].append(code3)
    return dict(codes)
def field_codes(self):
    """application (field) codes (申请代码)"""
    url = self.base_url + '/common/data/fieldCode'
    return WebRequest.get_response(url).json()['data']
        # tail of the parse() generator: finish building each article's context
        context['author_first'] = context['author_last'] = '.'
        if authors:
            context['author_first'] = authors[0]
            if len(authors) > 1:
                context['author_last'] = authors[-1]
        context['authors'] = ',\n'.join(author_list)
        context['pub_types'] = Article.xpath('PublicationTypeList/PublicationType/text()')
        context['doi'] = PubmedArticle.findtext('PubmedData/ArticleIdList/ArticleId[@IdType="doi"]')
        context['pmc'] = PubmedArticle.findtext('PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]')
        yield context


if __name__ == '__main__':
    import json
    from webrequests import WebRequest

    pmid = '17284678,9997'
    pmid = '28143587'
    pmid = '33577981'
    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&retmode=xml'
    resp = WebRequest.get_response(url)
    for context in parse(resp.text):
        print(json.dumps(context, indent=2))
def esearch(self, term, retstart=0, retmax=250, head=False, limit=None, **kwargs):
    """
    https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

    > - search a database with a given term
    >> - esearch.fcgi?db=pubmed&term=ngs
    >> - esearch.fcgi?db=pubmed&term=ngs&retmode=xml&field=TIAB
    >> - esearch.fcgi?db=pubmed&term=ngs[Title/Abstract]&retmode=xml
    """
    url = self.base_url + 'esearch.fcgi'
    params = self.parse_params(term=term, retmode='json', retstart=retstart, retmax=retmax, **kwargs)
    result = WebRequest.get_response(url, params=params).json()['esearchresult']
    if head:
        return result

    self.logger.info('{count} articles found with term: {querytranslation}'.format(**result))
    if limit is None and int(result['count']) > 250:
        self.logger.warning(
            'too many results, you can limit output with option "-l/--limit N", '
            'or simplify your input with sub-command "advance-search"')
        exit(1)

    idlist = result['idlist']
    # page through the remaining results
    while int(result['retstart']) + int(result['retmax']) < int(result['count']):
        if limit and len(idlist) >= limit:
            break
        retstart = int(result['retstart']) + int(result['retmax'])
        params = self.parse_params(term=term, retmode='json', retstart=retstart, retmax=retmax, **kwargs)
        result = WebRequest.get_response(url, params=params).json()['esearchresult']
        idlist += result['idlist']

    if limit:
        self.logger.info('limit {} from {}'.format(limit, result['count']))
        idlist = idlist[:limit]

    if idlist:
        self.logger.debug('idlist: {} ...'.format(', '.join(idlist[:10])))
    else:
        self.logger.warning('no result for term: {}'.format(term))

    return idlist
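# A hedged end-to-end sketch combining esearch() and efetch() above; the
# client class name `Eutils` is illustrative only, assuming both methods live
# on the same client class.
client = Eutils()
idlist = client.esearch('CRISPR[Title/Abstract]', limit=10)
for article in client.efetch(idlist, batch_size=5):
    print(article)   # Article objects built from the parsed XML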
def get_soup(self, url):
    soup = WR.get_soup(url)
    return soup
def list_projects(cls):
    soup = WR.get_soup(cls.url)
    for box in soup.select('.input-area .ms-checkbox input'):
        yield box.attrs['value']
class Official(object):

    base_url = 'http://output.nsfc.gov.cn'
    logger = SimpleLogger('Official')
    # fetched once at class-definition time
    field_codes = WR.get_response(base_url + '/common/data/fieldCode').json()['data']

    @classmethod
    def get_field_codes(cls):
        """all field (subject) codes (所有的学科代码)"""
        url = cls.base_url + '/common/data/fieldCode'
        print(url)
        return WR.get_response(url).json()['data']

    @classmethod
    def list_root_codes(cls):
        """get all top-level subject codes (获取所有的学科分类代码)"""
        root_codes = {}
        for context in cls.field_codes:
            if len(context['code']) == 1:
                root_codes[context['code']] = context['name']
        return root_codes

    @classmethod
    def list_child_codes(cls, keys):
        """get the lowest-level subject codes (获取最低级的学科代码)

        C01 --> C010101, C010102, ...
        H10 --> H1001, H1002, ...
        """
        child_codes = {}
        for key in keys.split(','):
            for context in cls.field_codes:
                code = context['code']
                if len(code) == 1:
                    continue
                if code.startswith(key):
                    child_codes[code] = context['name']
                    # drop the parent once a longer child code is present
                    if code[:-2] in child_codes:
                        del child_codes[code[:-2]]
        return child_codes

    @classmethod
    def get_conclusion_data(cls, ratify_number, detail=True):
        """get the conclusion data for a given ratify number (获取指定项目批准号的结题数据)"""
        url = cls.base_url + '/baseQuery/data/conclusionQueryResultsData'
        payload = {
            'ratifyNo': ratify_number,
            'queryType': 'input',
            'complete': 'true',
        }
        result = WR.get_response(url, method='POST', json=payload).json()['data']['resultsData']
        data = {}
        if result:
            data['projectid'] = result[0][0]
            data['project_type'] = result[0][3]
            data['result_stat'] = result[0][10]
        if detail and data.get('projectid'):
            detail_data = cls.get_detail_data(data['projectid'])
            data.update(detail_data)
        return data

    @classmethod
    def get_detail_data(cls, projectid):
        url = cls.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
        data = WR.get_response(url).json()['data']
        return data

    @classmethod
    def get_conclusion_report(cls, ratify_number, tmpdir='tmp', pdf=True, outfile=None):
        data = cls.get_conclusion_data(ratify_number, detail=False)
        if not data:
            cls.logger.warning(f'no conclusion result for: {ratify_number}')
            return

        images = list(cls.get_conclusion_report_images(data['projectid']))
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        pngs = []
        for n, url in enumerate(images, 1):
            name = os.path.basename(url)
            png = f'{tmpdir}/{name}.png'
            pngs.append(png)
            cls.logger.debug(f'[{n}/{len(images)}] download png: {url} => {png}')
            resp = WR.get_response(url, stream=True)
            with open(png, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            cls.logger.debug(f'save png: {png}')

        if pdf:
            cls.logger.debug('converting *png to pdf')
            outfile = outfile or f'{ratify_number}.pdf'
            with open(outfile, 'wb') as out:
                out.write(img2pdf.convert(pngs))
            size = human_readable.file_size(os.stat(outfile).st_size)
            cls.logger.info(f'save pdf: {outfile} [{size}]')
        return True

    @classmethod
    def get_conclusion_report_images(cls, projectid):
        url = cls.base_url + '/baseQuery/data/completeProjectReport'
        index = 1
        while True:
            payload = {'id': projectid, 'index': index}
            res = WR.get_response(url, method='POST', data=payload).json()['data']
            if not res['hasnext']:
                break
            yield cls.base_url + res['url']
            index += 1
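# A short usage sketch for the Official helper above; the ratify number below
# is a placeholder, not a real project.
print(Official.list_root_codes())        # {'A': '...', 'B': '...', ...}
print(Official.list_child_codes('C01'))  # {'C010101': '...', ...}

data = Official.get_conclusion_data('81900000')   # placeholder ratify number
if data:
    Official.get_conclusion_report('81900000', tmpdir='tmp', pdf=True)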
def ncbi_citations(pmid, fmt=None):
    url = 'https://pubmed.ncbi.nlm.nih.gov/{}/citations/'.format(pmid)
    data = WebRequest.get_response(url).json()
    if fmt in data:
        return data[fmt]
    return data
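# Example call of ncbi_citations() above: with fmt=None the raw JSON from the
# PubMed citation endpoint is returned; which format keys it contains
# (e.g. 'ama', 'mla', 'apa', 'nlm') is an assumption about that endpoint.
print(ncbi_citations('28143587'))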
def get_detail(cls, url):
    soup = WR.get_soup(url)
    for column in soup.select('.journal-content .journal-content-column'):
        key = column.select_one('.column-label').text
        value = column.select_one('.font-black').text.strip()
        yield key, value