コード例 #1
0
 def _parse_basic(self, record_list):
     if not record_list:
         return None
     result = []
     try:
         for record in record_list:
             basic = {}
             basic['nrdAn'] = record.get('fieldMap').get('AP')
             basic['nrdPn'] = record.get('fieldMap').get('PN')
             basic['patent_id'] = record.get('fieldMap').get('ID')
             basic['request_number'] = record.get('fieldMap').get('APO')
             basic['request_date'] = record.get('fieldMap').get('APD')
             basic['publish_number'] = record.get('fieldMap').get('PN')
             basic['publish_date'] = record.get('fieldMap').get('PD')
             basic['invention_name'] = record.get('fieldMap').get('TIVIEW')
             basic['inventor'] = record.get('fieldMap').get('INVIEW')
             basic['proposer'] = record.get('fieldMap').get('PAVIEW')
             basic['agent'] = record.get('fieldMap').get('AGT')
             basic['agency'] = record.get('fieldMap').get('AGY')
             # 去除<FONT>和</FONT>格式
             for key, value in basic.items():
                 basic[key] = re.sub(r'</{0,1}FONT>', '', value)
             result.append(basic)
         return result
     except Exception as e:
         logger.error(e)
         return None
コード例 #2
0
def login(username=None, password=None):
    """
    Login API: sign in to the site, retrying a bounded number of times.

    When either credential is missing, both are taken from ``account``.
    ``ctrl.BEING_LOG`` is set to True while the login is in progress and is
    always reset to False before this function exits.

    :param username: account name; ``account.username`` is used when either
        credential is None
    :param password: account password; ``account.password`` is used when
        either credential is None
    :return: True: login succeeded (or a session was already active);
        False: login failed
    """
    if username is None or password is None:
        username = account.username
        password = account.password

    # Give up after this many unsuccessful attempts. Exceptions count as
    # failures too, so a persistent error cannot spin forever.
    max_attempts = 7

    ctrl.BEING_LOG = True
    try:
        if check_login_status():
            return True

        for _ in range(max_attempts):
            try:
                update_proxy()
                update_cookies()
                busername = change_to_base64(username)
                bpassword = change_to_base64(password)
                captcha = get_captcha()
                logger.info('验证码识别结果:%s' % captcha)
                form_data = url_login.get('form_data')
                form_data['j_validation_code'] = captcha
                form_data['j_username'] = busername
                form_data['j_password'] = bpassword

                resp = requests.post(url=url_login.get('url'),
                                     headers=url_login.get('headers'),
                                     data=form_data,
                                     cookies=ctrl.COOKIES,
                                     proxies=ctrl.PROXIES,
                                     timeout=TIMEOUT)
                if username + ',欢迎访问' in resp.text:
                    # Replace the JSESSIONID of the response with the one
                    # already held in ctrl.COOKIES before storing the
                    # response cookies.
                    jsession = ctrl.COOKIES.get('JSESSIONID')
                    del resp.cookies['JSESSIONID']
                    resp.cookies.set('JSESSIONID',
                                     jsession,
                                     domain='www.pss-system.gov.cn')
                    update_cookies(resp.cookies)
                    # Follow-up request after a successful login; bounded by
                    # the same timeout so it cannot block indefinitely.
                    requests.post(
                        'http://www.pss-system.gov.cn/sipopublicsearch/patentsearch/showViewList-jumpToView.shtml',
                        cookies=ctrl.COOKIES,
                        proxies=ctrl.PROXIES,
                        timeout=TIMEOUT)
                    logger.info('登录成功')
                    return True
                logger.error('登录失败')
            except Exception as e:
                logger.error(e)
        return False
    finally:
        # Clear the "login in progress" flag on every exit path.
        ctrl.BEING_LOG = False
コード例 #3
0
 def parse_full_text(self, response):
     """
     Extract the claims and the description from a full-text response.

     The response body is parsed as JSON, ``fullTextDTO['literaInfohtml']``
     is read from it and passed to ``self.clear_claim``.

     :param response: object providing ``status_code``, ``text`` and
         ``headers``
     :return: the two values produced by ``self.clear_claim``;
         ``(None, None)`` when the status code is not 200 or the body
         cannot be processed (the failure is logged)
     """
     if response.status_code != 200:
         return None, None
     try:
         full_text = json.loads(response.text)
         full_text_dto = full_text.get('fullTextDTO')
         claim_instruct_str = full_text_dto['literaInfohtml']
         claim, instruction = self.clear_claim(claim_instruct_str)
         return claim, instruction
     except Exception as e:
         logger.error('{},\n{}'.format(response.text, response.headers))
         logger.error(e)
         # The result is unpacked into two names by the caller, so a failure
         # must still produce a pair rather than a bare None.
         return None, None
コード例 #4
0
 def write_patent_item(self, count, patent_item):
     """
     Write the title, abstract, claims and description to a text file.

     The file is named ``<request_number>.txt`` inside ``PATENT_TEXT_DIR``
     and contains the four sections separated by newlines.

     :param count: sequence number of the patent, used in the log message
     :param patent_item: mapping with the keys 'request_number', 'title',
         'abstract', 'claim' and 'instructions'; a falsy value only logs an
         error
     :return: None
     """
     if not patent_item:
         logger.error('专利全文写到本地失败!')
         return

     # Build the path of the patent full-text file.
     file_name = patent_item['request_number'] + '.txt'
     patent_path = os.path.join(PATENT_TEXT_DIR, file_name)
     with open(patent_path, 'w', encoding='utf-8') as out_file:
         sections = [
             patent_item[key]
             for key in ('title', 'abstract', 'claim', 'instructions')
         ]
         out_file.write("{}\n{}\n{}\n{}".format(*sections))
     logger.info('第{}篇专利全文写入{} 完成!'.format(count, patent_path))
コード例 #5
0
 def parseFirstPage(self, response):
     """
     Return the patent id of the first record in a search response.

     The response body is parsed as JSON and
     ``searchResultDTO.searchResultRecord`` is passed to
     ``self._parse_basic``; only the first parsed record is used.

     :param response: object providing ``status_code``, ``text`` and
         ``headers``
     :return: the 'patent_id' of the first record, or None when the status
         code is not 200, no record is available, or the body cannot be
         processed
     """
     if response.status_code != 200:
         return None
     try:
         result = json.loads(response.text)
         records = result['searchResultDTO']['searchResultRecord']
         if not records:
             logger.error('检索列表出错了!')
             return None
         result_list = self._parse_basic(records)
         # _parse_basic yields None when it fails, so test truthiness instead
         # of calling len() on the result.
         if not result_list:
             logger.info('无记录!')
             return None
         # Only the first item is used.
         return result_list[0].get('patent_id')
     except Exception as e:
         logger.error('{},\n{}'.format(response.text, response.headers))
         logger.error(e)
         return None
コード例 #6
0
 def parsePatentDetail(self, response):
     """
     Extract the abstract text and the invention title from a detail response.

     The response body is parsed as JSON. The abstract is the text content
     of the first ``abstractInfoDTO.abIndexList`` entry's 'value' (parsed
     with BeautifulSoup, newlines removed, surrounding whitespace stripped);
     the title is ``abstractInfoDTO.tioIndex``'s 'value'.

     :param response: object providing ``status_code``, ``text`` and
         ``headers``
     :return: ``(abstract, invention_name)``; ``(None, None)`` when the
         status code is not 200 or the body cannot be processed (the
         failure is logged)
     """
     if response.status_code != 200:
         logger.error('解析专利标题和摘要出错!')
         # Report the status through the logger rather than printing it.
         logger.error('status_code: {}'.format(response.status_code))
         return None, None
     try:
         detail = json.loads(response.text)
         abstract_info = detail.get('abstractInfoDTO')
         abstract_html = abstract_info.get('abIndexList')[0].get('value')
         abstract = BeautifulSoup(
             abstract_html, 'lxml').text.replace('\n', '').strip()
         invention_name = abstract_info.get('tioIndex').get('value')
         return abstract, invention_name
     except Exception as e:
         logger.error('{},\n{}'.format(response.text, response.headers))
         logger.error(e)
         return None, None
コード例 #7
0
    def start_requests(self):
        """
        Download the full text of every application number in ``self.sipoList``.

        Numbers already present in ``self.down_set`` are skipped. For each
        remaining number three POST requests are issued in order: the main
        search (to obtain the patent id), the detail search (abstract and
        title) and the full-text search (claims and description). The
        collected fields are then handed to ``self.write_patent_item``.
        Whenever a step yields nothing, the cookies are refreshed with
        ``self.get_cookies()`` and the record is abandoned.

        :return: None
        """
        main_search = url_config.mainSearch
        headers = main_search.get('headers')
        self.get_cookies()

        def post_by_patent_id(endpoint, patent_id):
            # The detail and full-text endpoints take the same three
            # id-derived form fields; fill them in and send the request.
            form_data = endpoint.get('form_data')
            form_data['nrdAn'] = str(patent_id).split('.')[0]
            form_data['cid'] = str(patent_id)
            form_data['sid'] = str(patent_id)
            return requests.post(
                url=endpoint.get('url'),
                headers=endpoint.get('headers'),
                cookies=self.cookies,
                data=form_data)

        for count, request_number in enumerate(self.sipoList, start=1):
            if request_number in self.down_set:
                logger.info('{}已经存在,跳过下载'.format(request_number))
                continue

            # A fresh dict per record, so fields of a previous record can
            # never leak into the current one.
            patent_item = {'request_number': request_number}

            # Example expression: "申请号=(CN201410811795+)"
            searchExpCn = self.componet_search(request_number)
            logger.info('第{}个检索表达式--- {}'.format(count, searchExpCn))
            form_data = main_search.get('form_data')
            form_data['searchCondition.searchExp'] = searchExpCn
            # Search for the patent id.
            first_response = requests.post(
                url=main_search.get('url'),
                headers=headers,
                cookies=self.cookies,
                data=form_data)
            # Extract the patent id.
            patent_id = self.parseFirstPage(first_response)
            if not patent_id:
                logger.error('patent_id is {}'.format(patent_id))
                # Refresh the cookies once; the current record is
                # incomplete, so drop it and move on to the next one.
                self.get_cookies()
                continue
            patent_item['patent_id'] = patent_id
            logger.info('获取专利ID:{}\n'.format(patent_id))

            # Fetch and parse the abstract and the title.
            abstract_title_response = post_by_patent_id(
                url_config.detailSearch, patent_id)
            abstract, title = self.parsePatentDetail(abstract_title_response)
            if not abstract and not title:
                logger.error('abstract is {}, title is {}'.format(
                    abstract, title))
                # Refresh the cookies once; the current record is
                # incomplete, so drop it and move on to the next one.
                self.get_cookies()
                continue
            patent_item['abstract'] = abstract
            patent_item['title'] = title

            # Fetch and parse the claims and the description.
            full_text_response = post_by_patent_id(
                url_config.full_text, patent_id)
            claim, instructions = self.parse_full_text(full_text_response)
            if not claim and not instructions:
                logger.error('claim is {}, instructions is {}'.format(
                    claim, instructions))
                # Refresh the cookies once; the current record is
                # incomplete, so drop it and move on to the next one.
                self.get_cookies()
                continue
            patent_item['claim'] = claim
            patent_item['instructions'] = instructions
            # Write the record to local storage.
            self.write_patent_item(count, patent_item)