Example #1
def get_captcha(session, captcha_url):

    resp = WebRequest.get_response(captcha_url, session=session)
    im = Image.open(io.BytesIO(resp.content))

    im = im.convert('RGB')

    pixdata = im.load()
    width, height = im.size
    for x in range(width):
        for y in range(height):
            rgb = pixdata[x, y]
            if (rgb[0] - rgb[1] > 73) and (rgb[0] - rgb[2] > 73):
                pixdata[x, y] = (0, 0, 0)
            else:
                pixdata[x, y] = (255, 255, 255)

    captcha = pytesseract.image_to_string(im).strip()

    if len(captcha) != 4:
        return get_captcha(session, captcha_url)

    payload = util.query_payload(tryCode=captcha)
    funding_url = 'http://output.nsfc.gov.cn/baseQuery/data/supportQueryResultsData'
    resp = WebRequest.get_response(funding_url,
                                   method='POST',
                                   session=session,
                                   json=payload)

    if resp.json()['message'] != '验证码错误':
        click.secho('right captcha: {}'.format(captcha), fg='green', bold=True)
        return captcha

    click.secho('wrong captcha: {}'.format(captcha), fg='yellow')
    return get_captcha(session, captcha_url)
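The red-channel thresholding in Example #1 can be tried in isolation. A minimal sketch, assuming a hypothetical local file 'captcha.png' and the same threshold of 73 as above:

from PIL import Image

im = Image.open('captcha.png').convert('RGB')   # hypothetical local copy of the captcha
pixels = im.load()
width, height = im.size
for x in range(width):
    for y in range(height):
        r, g, b = pixels[x, y]
        # keep strongly red pixels as black text, blank out everything else
        if r - g > 73 and r - b > 73:
            pixels[x, y] = (0, 0, 0)
        else:
            pixels[x, y] = (255, 255, 255)
im.save('captcha_bw.png')   # this cleaned image is what pytesseract reads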
Example #2
    def deal_captcha(self, url, outdir, filename, chunk_size):
        """deal with the captcha
        """
        soup = WR.get_soup(url)
        img = soup.select_one('#captcha').attrs['src']
        img_url = url.rsplit('/', 3)[0] + img
        # print(img_url)

        self.logger.warning(f'need to type in the captcha: {img_url}')
        if not os.getenv('DISPLAY'):
            self.logger.info(
                f'cannot show the picture without a display, you can open it in your browser: {img_url}'
            )
        else:
            content = WR.get_response(img_url, max_try=1).content
            im = Image.open(io.BytesIO(content))

            # im.show()  # this will block the program
            im.save('tmp.png')

            # ****************************
            # ***** non-blocked mode *****
            # ****************************
            pylab.ion()
            img = pylab.imread('tmp.png')

            pylab.imshow(img)
            pylab.show()

        while True:
            answer = click.prompt('please input the captcha')

            if answer == 'new':
                return self.download(url,
                                     outdir=outdir,
                                     filename=filename,
                                     chunk_size=chunk_size)

            payload = {
                'id': img_url.split('/')[-1].split('.')[0],
                'answer': answer
            }

            # payload = {'id': '6058249282282', 'answer': 'manila'}
            self.logger.debug(payload)

            resp = WR.get_response(url,
                                   method='POST',
                                   stream=True,
                                   data=payload)

            if resp.headers['Content-Type'] == 'application/pdf':
                pylab.close()
                return resp

            self.logger.warning('bad captcha, try again!')
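The pylab.ion() call above matters because a plain show() blocks until the window is closed, which would keep click.prompt from ever running. A small sketch of the same non-blocking idea with matplotlib.pyplot, assuming 'tmp.png' already exists:

import matplotlib.pyplot as plt

plt.ion()                          # interactive mode: show() returns immediately
plt.imshow(plt.imread('tmp.png'))
plt.show()
answer = input('type the captcha while the window stays open: ')
plt.close()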
Example #3
    def do_search(self, url, requires, **kwargs):
        """
        
        """
        if not kwargs.get('ratifyNo'):
            for key, name in requires.items():
                if not kwargs.get(key):
                    kwargs[key] = click.prompt('please input {}'.format(name))

        
        payload = util.query_payload(**kwargs)

        data = WebRequest.get_response(url, method='POST', json=payload, session=self.session, max_try=20).json()

        if data['code'] != 200:
            print(payload)
            click.secho('error code: {}'.format(json.dumps(data, ensure_ascii=False)), fg='red')
            exit()
        else:
            yield data
            # more results than one page
            total = data['data']['iTotalRecords']
            if (payload['pageNum'] + 1) * payload['pageSize'] < total:
                payload['pageNum'] += 1
                click.secho('>>> crawling {code}-{projectType}-{ratifyYear} page {page} ...'.format(page=payload['pageNum'] + 1, **payload), fg='cyan')
                for each in self.do_search(url, requires, **payload):
                    yield each
Example #4
    def get_field_codes(cls):
        """
            All field (subject) codes
        """
        url = cls.base_url + '/common/data/fieldCode'
        print(url)
        return WR.get_response(url).json()['data']
Example #5
    def elink(self, ids, dbfrom='pubmed', cmd='neighbor', **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ELink

            > - get cited (`{"linkname": "pubmed_pubmed_citedin"}`)
            >> elink.fcgi?dbfrom=pubmed&db=pubmed&id=20210808&cmd=neighbor&retmode=json
            > - get pdf url
            >> elink.fcgi?dbfrom=pubmed&db=pubmed&cmd=prlinks&id=10210801

            cmds:
                - neighbor (default)
                - neighbor_score
                - neighbor_history
                - acheck
                - ncheck
                - lcheck
                - llinks
                - llinkslib
                - prlinks
        """
        url = self.base_url + 'elink.fcgi'
        params = self.parse_params(retmode='json',
                                   id=ids,
                                   dbfrom=dbfrom,
                                   cmd=cmd,
                                   **kwargs)
        result = WebRequest.get_response(url, params=params).json()
        return result
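For reference, the raw ELink request that elink() wraps can be reproduced with requests alone; the id below is the one from the docstring, and this helper-free call is only a sketch:

import requests

params = {
    'dbfrom': 'pubmed',
    'db': 'pubmed',
    'id': '20210808',
    'cmd': 'neighbor',
    'retmode': 'json',
}
resp = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi', params=params)
print(list(resp.json()))   # top-level keys of the ELink JSON result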
Example #6
    def conclusion_project(self, projectid):
        """
            Detail data of a concluded project
        """
        url = self.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
        data = WebRequest.get_response(url).json()['data']
        return data
Example #7
    def support_types(self):
        """
            Support (funding) categories
        """
        # url = self.base_url + '/common/data/supportTypeData'        # includes sub-categories
        url = self.base_url + '/common/data/supportTypeClassOneData'  # top-level categories only
        return WebRequest.get_response(url).json()['data']
Example #8
    def get_mim2gene(self, outfile=None):
        url = self.omim_url + '/static/omim/data/mim2gene.txt'
        resp = WR.get_response(url, stream=True)
        if outfile:
            with open(outfile, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            self.logger.debug(f'save file: {outfile}')
        else:
            return resp.text
Example #9
    def list_provinces(self):
        """list of provinces
        """
        self.logger.debug('list provinces ...')
        soup = WR.get_soup(self.index_url)
        province_list = [
            each.attrs['value']
            for each in soup.select('#province_main option[value!=""]')
        ]
        return province_list
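The option[value!=""] selector above (a non-standard extension supported by soupsieve) is what skips the blank placeholder entry. A self-contained sketch with an inline snippet instead of the live index page:

from bs4 import BeautifulSoup

html = '''
<select id="province_main">
  <option value="">--please select--</option>
  <option value="北京">北京</option>
  <option value="上海">上海</option>
</select>
'''
soup = BeautifulSoup(html, 'html.parser')
provinces = [o.attrs['value'] for o in soup.select('#province_main option[value!=""]')]
print(provinces)   # ['北京', '上海']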
Example #10
    def search(self, term, max_try=3):
        """
            term: URL, PMID, DOI or search string

            return: the url of pdf
        """
        soup = WR.get_soup(self.url)
        form = soup.select_one('form[method="POST"]')
        post_url = self.url if form.attrs['action'] == '/' else form.attrs[
            'action']

        payload = {'sci-hub-plugin-check': '', 'request': term}

        self.logger.debug(f'search pdf url for: {term}')

        while max_try:
            max_try -= 1

            soup = WR.get_soup(post_url,
                               method='POST',
                               data=payload,
                               timeout=self.timeout)

            pdf = soup.select_one('#pdf')

            if 'article not found' in soup.text:
                self.logger.warning(f'article not found [{term}]')
                return
            elif not pdf:
                # print(soup.select('title'))
                continue

            pdf_url = pdf.attrs['src']

            if pdf_url.startswith('//'):
                pdf_url = post_url.split(':')[0] + f':{pdf_url}'

            self.logger.info(f'pdf url: {pdf_url}')
            return pdf_url

        self.logger.error(
            f'your search returned no result, please check! [{term}]')
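The startswith('//') branch above rebuilds protocol-relative pdf URLs by hand; urllib.parse.urljoin handles the same case. A sketch with hypothetical mirror and src values:

from urllib.parse import urljoin

post_url = 'https://sci-hub.se/'                                    # hypothetical mirror
pdf_src = '//dacemirror.sci-hub.se/journal-article/xxx/paper.pdf'   # hypothetical #pdf src
print(urljoin(post_url, pdf_src))
# -> https://dacemirror.sci-hub.se/journal-article/xxx/paper.pdf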
Example #11
    def get_conclusion_report_images(cls, projectid):
        url = cls.base_url + '/baseQuery/data/completeProjectReport'
        index = 1
        while True:
            payload = {'id': projectid, 'index': index}
            res = WR.get_response(url, method='POST',
                                  data=payload).json()['data']
            if not res['hasnext']:
                break
            yield cls.base_url + res['url']
            index += 1
Example #12
def fetch_factor(*kws):
    for kw in kws:
        url = BASE_URL + kw
        resp = WebRequest.get_response(url)
        soup = WebRequest.get_soup(resp)

        context = {}

        trs = soup.select('table tr')
        if len(trs) > 2:
            print('multiple result for kw: {}'.format(kw))
        elif len(trs) < 2:
            print('no result for kw: {}'.format(kw))
        else:
            title = [th.text for th in trs[0].find_all('th')[2:]]
            values = [td.text for td in trs[1].find_all('td')[2:]]
            if values[-1]:
                context['factor_history'] = json.dumps(dict(zip(title,
                                                                values)))
                context['factor'] = values[-1]
                context['kw'] = kw
        if context:
            return context
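The core of fetch_factor() is zipping the table header row against the value row. A minimal sketch against an inline, made-up table instead of a live page:

import json
from bs4 import BeautifulSoup

html = '''
<table>
  <tr><th>id</th><th>journal</th><th>2019</th><th>2020</th></tr>
  <tr><td>1</td><td>Some Journal</td><td>3.1</td><td>3.4</td></tr>
</table>
'''
soup = BeautifulSoup(html, 'html.parser')
trs = soup.select('table tr')
title = [th.text for th in trs[0].find_all('th')[2:]]
values = [td.text for td in trs[1].find_all('td')[2:]]
print(json.dumps(dict(zip(title, values))))   # {"2019": "3.1", "2020": "3.4"}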
Example #13
    def download(self,
                 url,
                 outdir='.',
                 filename=None,
                 chunk_size=512,
                 overwrite=None,
                 show_progress=True):
        """download pdf from url
        """
        filename = filename or os.path.basename(url).split('#')[0]
        if outdir != '.' and not os.path.exists(outdir):
            os.makedirs(outdir)

        outfile = os.path.join(outdir, filename)
        if os.path.isfile(outfile) and os.stat(outfile).st_size > 0:
            if not isinstance(overwrite, bool):
                overwrite = click.confirm(
                    'The file already exists, do you want to overwrite it?')

            if overwrite:
                self.logger.debug(f'overwriting the file: {outfile}')
            else:
                self.logger.debug(f'skip downloading file: {outfile}')
                return True

        resp = WR.get_response(url, stream=True)

        if resp.headers['Content-Type'] != 'application/pdf':
            resp = self.deal_captcha(url, outdir, filename, chunk_size)

        length = int(resp.headers.get('Content-Length'))

        # if os.path.isfile(outfile) and os.stat(outfile).st_size == length:

        self.logger.info(
            f'downloading pdf: {outfile} [{length/1024/1024:.2f} M]')

        bar = click.progressbar(length=length,
                                label='downloading',
                                show_percent=True,
                                show_pos=True,
                                show_eta=True)
        with open(outfile, 'wb') as out, bar:
            for chunk in resp.iter_content(chunk_size=chunk_size):
                out.write(chunk)
                if show_progress:
                    bar.update(chunk_size)

        self.logger.info(f'save file: {outfile}')
        return True
Example #14
    def einfo(self, **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EInfo

            > - show all database list
            >> einfo.fcgi?db=
            > - show dbinfo for given database
            >> einfo.fcgi?db=pubmed
        """
        url = self.base_url + 'einfo.fcgi'
        params = self.parse_params(retmode='json', **kwargs)
        info = WebRequest.get_response(url,
                                       params=params,
                                       allowed_codes=[200, 400]).json()
        return info
Example #15
def check_host():
    """
        check the available Sci-Hub URLs
    """
    url = 'https://lovescihub.wordpress.com/'

    soup = WR.get_soup(url)

    text = soup.select_one('.entry-content p:nth-child(2)').text
    update_text = soup.select_one('.entry-title').text
    update_time = date_parse(
        re.findall(r'Last check time:(.+?)\)', update_text)[0])

    hosts = dict(re.findall(r'(http.+?)\s+(.+?s)', text))

    return hosts, update_time
Example #16
    def list_support_types(self):
        """项目类别列表

            Bug: 网页显示:应急管理项目
                 实际应该:科学部主任基金项目/应急管理项目
        """
        self.logger.debug('list support types ...')
        soup = WR.get_soup(self.index_url)
        subcategory_list = []
        for option in soup.select('select#subcategory option')[1:]:
            if option.text == '应急管理项目':
                text = '科学部主任基金项目/应急管理项目'
            else:
                text = option.text
            subcategory_list.append(text)

        return subcategory_list
Example #17
    def search_page(self, params, payload):
        """查询页面
        """
        self.logger.debug(
            f'searching for: {payload} [page: {params["currentpage"]}]')
        while True:
            soup = WR.get_soup(self.search_url,
                               method='POST',
                               params=params,
                               data=payload)
            if not soup.select_one('#dict div b'):
                self.logger.warning(f'{soup.text}')
                if '需要先注册登录' in soup.text:
                    exit()
                time.sleep(30)
                continue

            time.sleep(random.randint(5, 10))
            return soup
Example #18
    def get_conclusion_data(cls, ratify_number, detail=True):
        """
            Get the conclusion data for a given ratify (approval) number
        """
        url = cls.base_url + '/baseQuery/data/conclusionQueryResultsData'
        payload = {
            'ratifyNo': ratify_number,
            'queryType': 'input',
            'complete': 'true',
        }
        result = WR.get_response(url, method='POST',
                                 json=payload).json()['data']['resultsData']
        data = {}
        if result:
            data['projectid'] = result[0][0]
            data['project_type'] = result[0][3]
            data['result_stat'] = result[0][10]

        if detail and data.get('projectid'):
            detail_data = cls.get_detail_data(data['projectid'])
            data.update(detail_data)
        return data
Example #19
    def get_conclusion_report(cls,
                              ratify_number,
                              tmpdir='tmp',
                              pdf=True,
                              outfile=None):
        data = cls.get_conclusion_data(ratify_number, detail=False)
        if not data:
            cls.logger.warning(f'no conclusion result for: {ratify_number}')
            return

        images = list(cls.get_conclusion_report_images(data['projectid']))

        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        pngs = []
        for n, url in enumerate(images, 1):
            name = os.path.basename(url)
            png = f'{tmpdir}/{name}.png'
            pngs.append(png)
            cls.logger.debug(
                f'[{n}/{len(images)}] download png: {url} => {png}')

            resp = WR.get_response(url, stream=True)
            with open(png, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            cls.logger.debug(f'save png: {png}')

        if pdf:
            cls.logger.debug('converting *png to pdf')
            outfile = outfile or f'{ratify_number}.pdf'
            with open(outfile, 'wb') as out:
                out.write(img2pdf.convert(pngs))

            size = human_readable.file_size(os.stat(outfile).st_size)
            cls.logger.info(f'save pdf: {outfile} [{size}]')
        return True
Example #20
    def efetch(self, ids, batch_size=5, **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch

            > - fetch from a database for given ids
            >> efetch.fcgi?db=pubmed&id=1,2,3
        """
        url = self.base_url + 'efetch.fcgi'

        self.logger.info('fetching start: total {}, batch_size: {}'.format(
            len(ids), batch_size))

        for n in range(0, len(ids), batch_size):
            _id = ','.join(ids[n:n + batch_size])

            self.logger.debug(f'fetching xml: {n+1} - {n+batch_size}')
            params = self.parse_params(id=_id, retmode='xml')
            xml = WebRequest.get_response(url, params=params).text

            self.logger.debug(f'parsing xml: {n+1} - {n+batch_size}')
            for context in xml_parser.parse(xml):
                article = Article(**context)
                yield article
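efetch() batches ids so that each request stays small; the slicing itself is independent of the E-utilities call. A tiny sketch with made-up PMIDs:

ids = ['17284678', '9997', '28143587', '33577981', '20210808']   # made-up batch of PMIDs
batch_size = 2
for n in range(0, len(ids), batch_size):
    _id = ','.join(ids[n:n + batch_size])
    print(f'batch starting at {n}: id={_id}')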
Example #21
    def list_codes(self):
        self.logger.debug('list subject codes ...')
        url = self.base_url + '/js/nsfctags2019multiple.js'
        resp = WR.get_response(url)

        codes = defaultdict(list)
        for line in resp.text.split('\n'):
            if line.startswith('subtag['):
                linelist = line.strip().split("', '")
                subject = linelist[2]  # division
                code1 = linelist[3]  # level-1 code, e.g. A01
                code2 = linelist[4]  # level-2 code, e.g. A0101
                code3 = linelist[5]  # level-3 code, e.g. A010101
                # name = linelist[6].split("'")[0]        # subject name

                if code1 not in codes[subject]:
                    codes[subject].append(code1)
                if code2 not in codes[code1]:
                    codes[code1].append(code2)
                if code3 not in codes[code2]:
                    codes[code2].append(code3)

                # print(subject, name, code1, code2, code3)
        return dict(codes)
Example #22
    def field_codes(self):
        """
            Application (field) codes
        """
        url = self.base_url + '/common/data/fieldCode'
        return WebRequest.get_response(url).json()['data']
Example #23
            context['author_first'] = context['author_last'] = '.'
            if authors:
                context['author_first'] = authors[0]
                if len(authors) > 1:
                    context['author_last'] = authors[-1]

            context['authors'] = ',\n'.join(author_list)

            context['pub_types'] = Article.xpath(
                'PublicationTypeList/PublicationType/text()')
            context['doi'] = PubmedArticle.findtext(
                'PubmedData/ArticleIdList/ArticleId[@IdType="doi"]')
            context['pmc'] = PubmedArticle.findtext(
                'PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]')

            yield context


if __name__ == '__main__':
    import json
    from webrequests import WebRequest

    pmid = '17284678,9997'
    pmid = '28143587'
    pmid = '33577981'

    url = f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={pmid}&retmode=xml'
    resp = WebRequest.get_response(url)
    for context in parse(resp.text):
        print(json.dumps(context, indent=2))
Example #24
    def esearch(self,
                term,
                retstart=0,
                retmax=250,
                head=False,
                limit=None,
                **kwargs):
        """
            https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch

            > - search a database for a given term
            >> - esearch.fcgi?db=pubmed&term=ngs
            >> - esearch.fcgi?db=pubmed&term=ngs&retmode=xml&field=TIAB
            >> - esearch.fcgi?db=pubmed&term=ngs[Title/Abstract]&retmode=xml
        """
        url = self.base_url + 'esearch.fcgi'
        params = self.parse_params(term=term,
                                   retmode='json',
                                   retstart=retstart,
                                   retmax=retmax,
                                   **kwargs)

        # print(params)

        result = WebRequest.get_response(url,
                                         params=params).json()['esearchresult']

        if head:
            return result

        self.logger.info(
            '{count} articles found with term: {querytranslation}'.format(
                **result))

        if limit is None and int(result['count']) > 250:
            self.logger.warning(
                'too many results, you can limit output with option "-l/--limit N", '
                'or simplify your input with sub-command "advance-search" ')
            exit(1)

        idlist = result['idlist']

        while int(result['retstart']) + int(result['retmax']) < int(
                result['count']):
            if limit and len(idlist) >= limit:
                break
            retstart = int(result['retstart']) + int(result['retmax'])
            params = self.parse_params(term=term,
                                       retmode='json',
                                       retstart=retstart,
                                       retmax=retmax,
                                       **kwargs)
            result = WebRequest.get_response(
                url, params=params).json()['esearchresult']
            idlist += result['idlist']

        if limit:
            self.logger.info('limit {} from {}'.format(limit, result['count']))
            idlist = idlist[:limit]

        if idlist:
            self.logger.debug('idlist: {} ...'.format(', '.join(idlist[:10])))
        else:
            self.logger.warning('no result for term: {}'.format(term))

        return idlist
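The retstart/retmax loop in esearch() is plain ESearch pagination. A helper-free sketch against the public endpoint, using only keys (count, retstart, retmax, idlist) that the method above already relies on:

import requests

url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
params = {'db': 'pubmed', 'term': 'ngs[Title/Abstract]',
          'retmode': 'json', 'retstart': 0, 'retmax': 100}

idlist = []
while True:
    result = requests.get(url, params=params).json()['esearchresult']
    idlist += result['idlist']
    if int(result['retstart']) + int(result['retmax']) >= int(result['count']):
        break
    params['retstart'] = int(result['retstart']) + int(result['retmax'])

print(len(idlist))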
Example #25
    def get_soup(self, url):
        soup = WR.get_soup(url)
        return soup
Example #26
    def list_projects(cls):
        soup = WR.get_soup(cls.url)
        for box in soup.select('.input-area .ms-checkbox input'):
            yield box.attrs['value']
Example #27
class Official(object):
    base_url = 'http://output.nsfc.gov.cn'
    logger = SimpleLogger('Official')

    field_codes = WR.get_response(base_url +
                                  '/common/data/fieldCode').json()['data']

    @classmethod
    def get_field_codes(cls):
        """
            All field (subject) codes
        """
        url = cls.base_url + '/common/data/fieldCode'
        print(url)
        return WR.get_response(url).json()['data']

    @classmethod
    def list_root_codes(cls):
        """
            Get all root subject category codes
        """
        root_codes = {}
        for context in cls.field_codes:
            if len(context['code']) == 1:
                root_codes[context['code']] = context['name']
        return root_codes

    @classmethod
    def list_child_codes(cls, keys):
        """
            Get the lowest-level subject codes
                C01  -->  C010101, C010102, ...
                H10  -->  H1001, H1002, ...
        """
        child_codes = {}
        for key in keys.split(','):
            for context in cls.field_codes:
                code = context['code']
                if len(code) == 1:
                    continue
                if code.startswith(key):
                    child_codes[code] = context['name']
                    if code[:-2] in child_codes:
                        del child_codes[code[:-2]]
        return child_codes

    @classmethod
    def get_conclusion_data(cls, ratify_number, detail=True):
        """
            Get the conclusion data for a given ratify (approval) number
        """
        url = cls.base_url + '/baseQuery/data/conclusionQueryResultsData'
        payload = {
            'ratifyNo': ratify_number,
            'queryType': 'input',
            'complete': 'true',
        }
        result = WR.get_response(url, method='POST',
                                 json=payload).json()['data']['resultsData']
        data = {}
        if result:
            data['projectid'] = result[0][0]
            data['project_type'] = result[0][3]
            data['result_stat'] = result[0][10]

        if detail and data.get('projectid'):
            detail_data = cls.get_detail_data(data['projectid'])
            data.update(detail_data)
        return data

    @classmethod
    def get_detail_data(cls, projectid):
        url = cls.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
        data = WR.get_response(url).json()['data']
        return data

    @classmethod
    def get_conclusion_report(cls,
                              ratify_number,
                              tmpdir='tmp',
                              pdf=True,
                              outfile=None):
        data = cls.get_conclusion_data(ratify_number, detail=False)
        if not data:
            cls.logger.warning(f'no conclusion result for: {ratify_number}')
            return

        images = list(cls.get_conclusion_report_images(data['projectid']))

        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        pngs = []
        for n, url in enumerate(images, 1):
            name = os.path.basename(url)
            png = f'{tmpdir}/{name}.png'
            pngs.append(png)
            cls.logger.debug(
                f'[{n}/{len(images)}] download png: {url} => {png}')

            resp = WR.get_response(url, stream=True)
            with open(png, 'wb') as out:
                for chunk in resp.iter_content(chunk_size=512):
                    out.write(chunk)
            cls.logger.debug(f'save png: {png}')

        if pdf:
            cls.logger.debug('converting *png to pdf')
            outfile = outfile or f'{ratify_number}.pdf'
            with open(outfile, 'wb') as out:
                out.write(img2pdf.convert(pngs))

            size = human_readable.file_size(os.stat(outfile).st_size)
            cls.logger.info(f'save pdf: {outfile} [{size}]')
        return True

    @classmethod
    def get_conclusion_report_images(cls, projectid):
        url = cls.base_url + '/baseQuery/data/completeProjectReport'
        index = 1
        while True:
            payload = {'id': projectid, 'index': index}
            res = WR.get_response(url, method='POST',
                                  data=payload).json()['data']
            if not res['hasnext']:
                break
            yield cls.base_url + res['url']
            index += 1
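Hypothetical usage of the Official class above, assuming it is importable and the NSFC endpoints respond as the methods expect; the ratify number is a placeholder:

root_codes = Official.list_root_codes()            # {'A': '...', 'B': '...', ...}
child_codes = Official.list_child_codes('C01')     # lowest-level codes under C01
data = Official.get_conclusion_data('81770000', detail=False)   # placeholder ratify number
print(len(root_codes), len(child_codes), data.get('projectid'))

Note that field_codes is populated at class-definition time, so merely importing this module already hits the /common/data/fieldCode endpoint.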
Example #28
    def get_detail_data(cls, projectid):
        url = cls.base_url + '/baseQuery/data/conclusionProjectInfo/' + projectid
        data = WR.get_response(url).json()['data']
        return data
Example #29
def ncbi_citations(pmid, fmt=None):
    url = 'https://pubmed.ncbi.nlm.nih.gov/{}/citations/'.format(pmid)
    data = WebRequest.get_response(url).json()
    if fmt in data:
        return data[fmt]
    return data
Example #30
    def get_detail(cls, url):
        soup = WR.get_soup(url)
        for column in soup.select('.journal-content .journal-content-column'):
            key = column.select_one('.column-label').text
            value = column.select_one('.font-black').text.strip()
            yield key, value