def deal_captcha(self, url, outdir, filename, chunk_size):
    """Deal with the captcha."""
    soup = WR.get_soup(url)
    img = soup.select_one('#captcha').attrs['src']
    img_url = url.rsplit('/', 3)[0] + img
    self.logger.warning(f'need to type in the captcha: {img_url}')

    if not os.getenv('DISPLAY'):
        self.logger.info(
            f'cannot display the picture locally, please open it in your browser: {img_url}'
        )
    else:
        content = WR.get_response(img_url, max_try=1).content
        im = Image.open(io.BytesIO(content))
        # im.show()  # this would block the program
        im.save('tmp.png')

        # show the captcha in non-blocking mode
        pylab.ion()
        img = pylab.imread('tmp.png')
        pylab.imshow(img)
        pylab.show()

    while True:
        answer = click.prompt('please input the captcha')
        if answer == 'new':
            return self.download(url,
                                 outdir=outdir,
                                 filename=filename,
                                 chunk_size=chunk_size)
        payload = {
            'id': img_url.split('/')[-1].split('.')[0],
            'answer': answer,
        }
        # eg. payload = {'id': '6058249282282', 'answer': 'manila'}
        self.logger.debug(payload)
        resp = WR.get_response(url, method='POST', stream=True, data=payload)
        if resp.headers['Content-Type'] == 'application/pdf':
            pylab.close()
            return resp
        self.logger.warning('bad captcha, try again!')
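# Hedged usage sketch (not part of the original code): how deal_captcha is
# typically reached from a download step. The helper name save_pdf and the
# chunk_size default are assumptions; only deal_captcha's signature and the
# fact that it returns a PDF response come from the method above.
def save_pdf(self, pdf_url, outdir, filename, chunk_size=1024):
    resp = WR.get_response(pdf_url, stream=True)
    if resp.headers.get('Content-Type') != 'application/pdf':
        # the server returned a captcha page instead of the PDF
        resp = self.deal_captcha(pdf_url, outdir, filename, chunk_size)
    with open(os.path.join(outdir, filename), 'wb') as out:
        for chunk in resp.iter_content(chunk_size=chunk_size):
            out.write(chunk)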
def list_provinces(self):
    """List the available provinces."""
    self.logger.debug('list provinces ...')
    soup = WR.get_soup(self.index_url)
    province_list = [
        each.attrs['value']
        for each in soup.select('#province_main option[value!=""]')
    ]
    return province_list
def search(self, term, max_try=3):
    """
    term: URL, PMID, DOI or search string

    return: the url of the pdf
    """
    soup = WR.get_soup(self.url)
    form = soup.select_one('form[method="POST"]')
    post_url = self.url if form.attrs['action'] == '/' else form.attrs['action']
    payload = {'sci-hub-plugin-check': '', 'request': term}
    self.logger.debug(f'search pdf url for: {term}')

    while max_try:
        max_try -= 1
        soup = WR.get_soup(post_url,
                           method='POST',
                           data=payload,
                           timeout=self.timeout)
        pdf = soup.select_one('#pdf')
        if 'article not found' in soup.text:
            self.logger.warning(f'article not found [{term}]')
            return
        elif not pdf:
            # print(soup.select('title'))
            continue
        pdf_url = pdf.attrs['src']
        if pdf_url.startswith('//'):
            pdf_url = post_url.split(':')[0] + f':{pdf_url}'
        self.logger.info(f'pdf url: {pdf_url}')
        return pdf_url

    self.logger.error(f'the search returned no result, please check! [{term}]')
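# Hedged usage sketch: the class name SciHub and its constructor are
# assumptions; only search(term, max_try=3) and its return value (a pdf url,
# or None when nothing is found) come from the method above.
sci = SciHub()  # hypothetical class providing self.url and self.timeout
pdf_url = sci.search('10.1038/nature12373')  # a DOI, PMID, URL or search string
if pdf_url:
    resp = WR.get_response(pdf_url, stream=True)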
def check_host():
    """Check the currently available Sci-Hub host URLs."""
    url = 'https://lovescihub.wordpress.com/'
    soup = WR.get_soup(url)
    text = soup.select_one('.entry-content p:nth-child(2)').text
    update_text = soup.select_one('.entry-title').text
    update_time = date_parse(
        re.findall(r'Last check time:(.+?)\)', update_text)[0])
    hosts = dict(re.findall(r'(http.+?)\s+(.+?s)', text))
    return hosts, update_time
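# Hedged usage sketch: check_host returns a mapping of mirror URL to the
# response time reported on the page (assumed to look like '1.23s') plus the
# last check time; picking the fastest mirror is one possible use.
def fastest_host():
    hosts, update_time = check_host()
    best = min(hosts, key=lambda h: float(hosts[h].rstrip('s')))
    print(f'checked at {update_time}: fastest mirror is {best}')
    return best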
def list_support_types(self):
    """List the support types (项目类别).

    Bug: the page shows '应急管理项目', but it should actually be
    '科学部主任基金项目/应急管理项目'.
    """
    self.logger.debug('list support types ...')
    soup = WR.get_soup(self.index_url)
    subcategory_list = []
    for option in soup.select('select#subcategory option')[1:]:
        if option.text == '应急管理项目':
            text = '科学部主任基金项目/应急管理项目'
        else:
            text = option.text
        subcategory_list.append(text)
    return subcategory_list
def search_page(self, params, payload):
    """Query one page of search results."""
    self.logger.debug(
        f'searching for: {payload} [page: {params["currentpage"]}]')
    while True:
        soup = WR.get_soup(self.search_url,
                           method='POST',
                           params=params,
                           data=payload)
        if not soup.select_one('#dict div b'):
            self.logger.warning(f'{soup.text}')
            if '需要先注册登录' in soup.text:  # the site requires registration/login
                exit()
            time.sleep(30)
            continue
        time.sleep(random.randint(5, 10))
        return soup
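# Hedged usage sketch: a hypothetical caller that walks pages via search_page.
# The payload fields and the total_pages argument are assumptions; 'currentpage'
# is the only params key search_page itself reads.
def iter_pages(self, payload, total_pages):
    for n in range(1, total_pages + 1):
        params = {'currentpage': n}
        yield self.search_page(params, payload)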
def fetch_factor(*kws):
    """Fetch the impact factor for the first keyword that returns a result."""
    for kw in kws:
        url = BASE_URL + kw
        resp = WebRequest.get_response(url)
        soup = WebRequest.get_soup(resp)
        context = {}
        trs = soup.select('table tr')
        if len(trs) > 2:
            print('multiple results for kw: {}'.format(kw))
        elif len(trs) < 2:
            print('no result for kw: {}'.format(kw))
        else:
            title = [th.text for th in trs[0].find_all('th')[2:]]
            values = [td.text for td in trs[1].find_all('td')[2:]]
            if values[-1]:
                context['factor_history'] = json.dumps(dict(zip(title, values)))
                context['factor'] = values[-1]
                context['kw'] = kw
        if context:
            return context
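# Hedged usage sketch: fetch_factor returns the first non-empty context dict
# ({'kw', 'factor', 'factor_history'}) among the given keywords, or None; the
# journal name below is just an example.
context = fetch_factor('Nature Communications')
if context:
    print(context['kw'], context['factor'])
    history = json.loads(context['factor_history'])  # column title -> value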
def get_soup(self, url):
    soup = WR.get_soup(url)
    return soup
def list_projects(cls):
    soup = WR.get_soup(cls.url)
    for box in soup.select('.input-area .ms-checkbox input'):
        yield box.attrs['value']
def get_detail(cls, url):
    soup = WR.get_soup(url)
    for column in soup.select('.journal-content .journal-content-column'):
        key = column.select_one('.column-label').text
        value = column.select_one('.font-black').text.strip()
        yield key, value
def search(cls, page=1, txtitle='', project_classname_list='',
           date_begin='', date_end='', **kwargs):
    params = {
        'txtitle': txtitle,
        'page': page,
        'project_classname_list': project_classname_list,
        'cost_begin': '',
        'cost_end': '',
        'date_begin': date_begin,
        'date_end': date_end,
        'sort_type': '3',
    }
    soup = WR.get_soup(cls.url, params=params)
    total_count = int(
        re.findall(r'\d+', soup.select_one('.list-result').text)[0])
    total_page = math.ceil(total_count / 15.)
    click.secho(f'total page: {total_page}, total count: {total_count}',
                err=True, fg='yellow')

    if total_count == 500:
        click.secho(f'too many results: {params}, searching by each project ...',
                    err=True, fg='yellow')
        for project in cls.list_projects():
            if params['project_classname_list']:
                click.secho(f'still too many results: {params} ...',
                            err=True, fg='red')
                exit(1)
            params['project_classname_list'] = project
            yield from cls.search(**params)

    for page in range(1, total_page + 1):
        click.secho(f'>>> crawling page: {page}/{total_page}',
                    err=True, fg='green')
        params['page'] = page
        soup = WR.get_soup(cls.url, params=params)
        for a in soup.select('#journalList .journal-item strong a'):
            click.secho(str(a), err=True, fg='white')
            context = {}
            href = a.attrs['href']
            data = dict(list(cls.get_detail(href)))
            context['title'] = data['项目名称']
            context['project_id'] = data['项目批准号']
            context['project_type'] = data['资助类型']
            context['person'] = data['负责人']
            context['institution'] = data['依托单位']
            context['money'] = data['批准金额'].strip('万元')
            context['approval_year'] = data['批准年份']
            context['subject_code'] = data['学科分类'].split()[0]
            context['start_time'], context['end_time'] = data['起止时间'].split('-')
            yield context
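# Hedged usage sketch: the class name Project is an assumption, as are the
# txtitle/date formats passed to the site; the field names of each yielded
# context come from the method above.
import json

with open('nsfc_projects.jsonl', 'w', encoding='utf-8') as out:
    for context in Project.search(txtitle='单细胞', date_begin='2020', date_end='2021'):
        out.write(json.dumps(context, ensure_ascii=False) + '\n')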