Example #1
    def deal_captcha(self, url, outdir, filename, chunk_size):
        """deal with the captcha
        """
        soup = WR.get_soup(url)
        img = soup.select_one('#captcha').attrs['src']
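        # the captcha src is a relative path; build its absolute URL from the page URL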
        img_url = url.rsplit('/', 3)[0] + img
        # print(img_url)

        self.logger.warning(f'need to type in the captcha: {img_url}')
        if not os.getenv('DISPLAY'):
            self.logger.info(
                f'no display available, you can open the picture in your browser: {img_url}'
            )
        else:
            content = WR.get_response(img_url, max_try=1).content
            im = Image.open(io.BytesIO(content))

            # im.show()  # this will block the program
            im.save('tmp.png')

            # ****************************
            # ***** non-blocked mode *****
            # ****************************
            pylab.ion()
            img = pylab.imread('tmp.png')

            pylab.imshow(img)
            pylab.show()

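        # prompt until the captcha is accepted; entering 'new' re-runs the download to get a fresh captcha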
        while True:
            answer = click.prompt('please input the captcha')

            if answer == 'new':
                return self.download(url,
                                     outdir=outdir,
                                     filename=filename,
                                     chunk_size=chunk_size)

            payload = {
                'id': img_url.split('/')[-1].split('.')[0],
                'answer': answer
            }

            # payload = {'id': '6058249282282', 'answer': 'manila'}
            self.logger.debug(payload)

            resp = WR.get_response(url,
                                   method='POST',
                                   stream=True,
                                   data=payload)

            if resp.headers['Content-Type'] == 'application/pdf':
                pylab.close()
                return resp

            self.logger.warning('bad captcha, try again!')
Example #2
    def list_provinces(self):
        """List provinces.
        """
        self.logger.debug('list provinces ...')
        soup = WR.get_soup(self.index_url)
        province_list = [
            each.attrs['value']
            for each in soup.select('#province_main option[value!=""]')
        ]
        return province_list
Example #3
    def search(self, term, max_try=3):
        """
            term: URL, PMID, DOI or search string

            return: the URL of the PDF
        """
        soup = WR.get_soup(self.url)
        form = soup.select_one('form[method="POST"]')
        action = form.attrs['action']
        post_url = self.url if action == '/' else action

        payload = {'sci-hub-plugin-check': '', 'request': term}

        self.logger.debug(f'search pdf url for: {term}')

        while max_try:
            max_try -= 1

            soup = WR.get_soup(post_url,
                               method='POST',
                               data=payload,
                               timeout=self.timeout)

            pdf = soup.select_one('#pdf')

            if 'article not found' in soup.text:
                self.logger.warning(f'article not found [{term}]')
                return
            elif not pdf:
                # print(soup.select('title'))
                continue

            pdf_url = pdf.attrs['src']

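            # a '//' prefix is a scheme-relative URL; reuse the scheme of post_url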
            if pdf_url.startswith('//'):
                pdf_url = post_url.split(':')[0] + f':{pdf_url}'

            self.logger.info(f'pdf url: {pdf_url}')
            return pdf_url

        self.logger.error(
            f'your search returned no result, please check! [{term}]')
Example #4
def check_host():
    """
        Check the currently available Sci-Hub hosts.
    """
    url = 'https://lovescihub.wordpress.com/'

    soup = WR.get_soup(url)

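    # the post's second paragraph appears to list each host with its response time,
    # and the post title carries the last check time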
    text = soup.select_one('.entry-content p:nth-child(2)').text
    update_text = soup.select_one('.entry-title').text
    update_time = date_parse(
        re.findall(r'Last check time:(.+?)\)', update_text)[0])

    hosts = dict(re.findall(r'(http.+?)\s+(.+?s)', text))

    return hosts, update_time
Example #5
    def list_support_types(self):
        """项目类别列表

            Bug: 网页显示:应急管理项目
                 实际应该:科学部主任基金项目/应急管理项目
        """
        self.logger.debug('list support types ...')
        soup = WR.get_soup(self.index_url)
        subcategory_list = []
        for option in soup.select('select#subcategory option')[1:]:
            if option.text == '应急管理项目':
                text = '科学部主任基金项目/应急管理项目'
            else:
                text = option.text
            subcategory_list.append(text)

        return subcategory_list
Example #6
    def search_page(self, params, payload):
        """查询页面
        """
        self.logger.debug(
            f'searching for: {payload} [page: {params["currentpage"]}]')
        while True:
            soup = WR.get_soup(self.search_url,
                               method='POST',
                               params=params,
                               data=payload)
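            # no result table in the response: log it, stop if a login is required, otherwise wait and retry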
            if not soup.select_one('#dict div b'):
                self.logger.warning(f'{soup.text}')
                if '需要先注册登录' in soup.text:
                    exit()
                time.sleep(30)
                continue

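            # wait a random 5-10 seconds before returning, to space out consecutive queries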
            time.sleep(random.randint(5, 10))
            return soup
Example #7
def fetch_factor(*kws):
    for kw in kws:
        url = BASE_URL + kw
        resp = WebRequest.get_response(url)
        soup = WebRequest.get_soup(resp)

        context = {}

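        # expect exactly two table rows: a header row and one data row with the factor values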
        trs = soup.select('table tr')
        if len(trs) > 2:
            print('multiple results for kw: {}'.format(kw))
        elif len(trs) < 2:
            print('no result for kw: {}'.format(kw))
        else:
            title = [th.text for th in trs[0].find_all('th')[2:]]
            values = [td.text for td in trs[1].find_all('td')[2:]]
            if values[-1]:
                context['factor_history'] = json.dumps(dict(zip(title,
                                                                values)))
                context['factor'] = values[-1]
                context['kw'] = kw
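        # return as soon as one keyword yields a usable result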
        if context:
            return context
Example #8
    def get_soup(self, url):
        """Thin wrapper around WR.get_soup."""
        soup = WR.get_soup(url)
        return soup
Example #9
    def list_projects(cls):
        soup = WR.get_soup(cls.url)
        for box in soup.select('.input-area .ms-checkbox input'):
            yield box.attrs['value']
Example #10
    def get_detail(cls, url):
        soup = WR.get_soup(url)
        # each content column holds one label/value pair of the project detail
        for column in soup.select('.journal-content .journal-content-column'):
            key = column.select_one('.column-label').text
            value = column.select_one('.font-black').text.strip()
            yield key, value
Example #11
    def search(cls,
               page=1,
               txtitle='',
               project_classname_list='',
               date_begin='',
               date_end='',
               **kwargs):
        params = {
            'txtitle': txtitle,
            'page': page,
            'project_classname_list': project_classname_list,
            'cost_begin': '',
            'cost_end': '',
            'date_begin': date_begin,
            'date_end': date_end,
            'sort_type': '3',
        }
        soup = WR.get_soup(cls.url, params=params)
        total_count = int(
            re.findall(r'\d+',
                       soup.select_one('.list-result').text)[0])
        total_page = math.ceil(total_count / 15.)
        click.secho(f'total page: {total_page}, total count: {total_count}',
                    err=True,
                    fg='yellow')

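        # exactly 500 results suggests the site's result cap was hit; refine the search per project type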
        if total_count == 500:
            click.secho(
                f'too many results: {params}, searching by each project ...',
                err=True,
                fg='yellow')
            for project in cls.list_projects():
                if params['project_classname_list']:
                    click.secho(f'still too many results: {params} ...',
                                err=True,
                                fg='red')
                    exit(1)
                params['project_classname_list'] = project
                yield from cls.search(**params)

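        # walk every result page (15 items per page) and parse each project's detail page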
        for page in range(1, total_page + 1):
            click.secho(f'>>> crawling page: {page}/{total_page}',
                        err=True,
                        fg='green')
            params['page'] = page
            soup = WR.get_soup(cls.url, params=params)
            for a in soup.select('#journalList .journal-item strong a'):
                click.secho(str(a), err=True, fg='white')
                context = {}
                href = a.attrs['href']
                data = dict(list(cls.get_detail(href)))
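                # map the Chinese field labels from the detail page to English context keys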
                context['title'] = data['项目名称']
                context['project_id'] = data['项目批准号']
                context['project_type'] = data['资助类型']
                context['person'] = data['负责人']
                context['institution'] = data['依托单位']
                context['money'] = data['批准金额'].strip('万元')
                context['approval_year'] = data['批准年份']

                context['subject_code'] = data['学科分类'].split()[0]

                context['start_time'], context['end_time'] = data[
                    '起止时间'].split('-')

                yield context