def test_remove_comments(self):
    # text with comments
    self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
    self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
    self.assertEqual(remove_comments(u'Hello<!--My\nWorld-->'), u'Hello')
    self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"),
                     u'test  whatever')
    self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"),
                     u'test  whatever')
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_comments(b'without comments'), six.text_type)
    assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
    assert isinstance(remove_comments(u'without comments'), six.text_type)
    assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
def test_remove_comments(self):
    # make sure it always returns unicode
    assert isinstance(remove_comments('without comments'), unicode)
    assert isinstance(remove_comments('<!-- with comments -->'), unicode)
    # text without comments
    self.assertEqual(remove_comments(u'text without comments'),
                     u'text without comments')
    # text with comments
    self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
    self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
def normalize_web_content(x,
                          keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Normalize web content.

    Parameters
    ----------
    x : str
        Web content to normalize.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags.
        Do not replace if `None`.
    """
    try:
        x = strip_html5_whitespace(x)
        x = remove_comments(x)
        x = remove_tags(x, keep=keep)
        if token:
            x = replace_tags(x, token=token)
        x = replace_entities(x)
        x = replace_escape_chars(x)
    except (TypeError, AttributeError):
        pass
    for part in _rx_web_sectionize.split(x):
        if part:
            yield part
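A minimal usage sketch for normalize_web_content above. The real _rx_web_sectionize is not shown in this snippet, so a plain whitespace splitter stands in for it here; the sample markup is invented.

import re
from w3lib.html import (strip_html5_whitespace, remove_comments, remove_tags,
                        replace_tags, replace_entities, replace_escape_chars)

_rx_web_sectionize = re.compile(r'\s+')  # hypothetical stand-in for the real splitter

sample = u'<h2>Heading</h2> <!-- nav --> Body &amp; text'
print(list(normalize_web_content(sample)))
# With the stand-in splitter this yields
# ['____SECTION____Heading____SECTION____', 'Body', '&', 'text'];
# the exact sectioning depends on the real _rx_web_sectionize pattern.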
def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE):
    fragments = getattr(region, 'parsed_fragments', None)
    if fragments is None:
        yield textf(region)
        return
    fiter = iter(fragments)
    for fragment in fiter:
        if isinstance(fragment, HtmlTag):
            # skip forward to closing script tags
            tag = fragment.tag
            if tag in tags_to_purge:
                # if opening, keep going until closed
                if fragment.tag_type == HtmlTagType.OPEN_TAG:
                    for probe in fiter:
                        if isinstance(probe, HtmlTag) and \
                                probe.tag == tag and \
                                probe.tag_type == HtmlTagType.CLOSE_TAG:
                            break
            else:
                output = tagf(fragment)
                if output:
                    yield output
        else:
            text = region.htmlpage.fragment_data(fragment)
            text = remove_comments(text)
            text = textf(text)
            if text:
                yield text
def get_base_url(response):
    """Return the base url of the given response, joined with the response url"""
    if response not in _baseurl_cache:
        text = response.text[0:4096]
        text = html.remove_comments(text, response.encoding)
        _baseurl_cache[response] = html.get_base_url(text, response.url,
                                                     response.encoding)
    return _baseurl_cache[response]
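A hedged illustration of the cached helper above (it matches the one in scrapy.utils.response); the response object is built purely for the demo.

from scrapy.http import HtmlResponse

resp = HtmlResponse(
    url='http://example.com/a/',
    body=b'<html><head><!-- <base href="http://evil/"> -->'
         b'<base href="http://example.com/base/"></head></html>',
    encoding='utf-8')
print(get_base_url(resp))  # http://example.com/base/
# The commented-out base tag is ignored because comments are stripped
# before the real <base href> is looked up.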
def filteHTML(string):
    content = remove_comments(string)  # strip HTML comments
    content = html.unescape(content)   # decode entity characters
    # strip non-breaking, ASCII and full-width space characters
    content = content.replace('\xa0', '')
    content = content.replace(' ', '')
    content = content.replace('\u3000', '')
    return content
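A quick check of filteHTML, assuming the three replace calls strip the non-breaking, ASCII and full-width space characters as written above; the input is made up.

print(filteHTML('<!-- ad -->A&nbsp;B C'))
# -> 'ABC': the comment is removed, &nbsp; is unescaped to '\xa0',
#    and all three space variants are stripped.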
def getPageText(self):
    # fetch the main text of the page
    try:
        content = self.browser.find_element_by_css_selector(
            'lph-article-comView').get_attribute('innerHTML')
        html = remove_comments(content)
    except NoSuchElementException:
        html = self.browser.page_source
    return html
def parse_detail(self, response):
    try:
        html = Selector(text=remove_comments(response.text))
        yield cdeItem
    except Exception as e:
        self.logger.error('parse error: %s?%s', response.url,
                          response.request.body)
        self.logger.error(e)
def getPageText(self):
    # fetch the main text of the page
    pageTitle = self.browser.find_element_by_css_selector(
        'div.container div.row > div[ng-show="showTitle"]').get_attribute(
        'outerHTML')
    pageHTML = self.browser.find_element_by_css_selector(
        'div.container div.row > div#wenzhang-content').get_attribute(
        'innerHTML')
    pureHTML = remove_comments(pageHTML)
    html = pageTitle + pureHTML
    return html
def getOilFundData(self):
    try:
        html = requests.get(url=self.url).content.decode('utf-8')
    except:
        wxPusher = Helper.WxPusher()
        wxPusher.sendMessage(title='发生未知错误!',
                             text='访问haoETF网站获取数据失败,请检查代码接口!')
        raise
    # strip unneeded HTML comments
    web_content = remove_comments(html)
    soup = BeautifulSoup(web_content, features='lxml')
    # table header
    thead = soup.body.table.thead.tr.find_all('th')
    thead_list = []
    for child in thead:
        thead_list.append(child.text)
    # table body
    tbody = soup.body.table.tbody.find_all('tr')
    tr_list = []
    for tr in tbody:
        td_list = []
        for td in tr:
            if td.string != '\n':
                td_list.append(td.string)
        tr_list.append(td_list)
    # build the DataFrame
    table = pd.DataFrame(tr_list, columns=thead_list)
    # column-name variables
    discount_rt = thead_list[5]  # premium rate (溢价率)
    volume = thead_list[9]       # turnover (成交额)
    limit = thead_list[-1]       # subscription limit (申购限额)
    # strip the percent sign
    table[discount_rt] = table[discount_rt].str.replace('%', '')
    # convert strings to numeric
    table[discount_rt] = pd.to_numeric(table[discount_rt], errors='ignore')
    table[volume] = pd.to_numeric(table[volume], errors='ignore')
    # keep funds with a premium rate of at least 4%, turnover above 500 (万元),
    # and subscriptions not suspended
    table = table[(table[discount_rt] >= 4)
                  & (table[limit].str.contains('暂停') == False)
                  & (table[volume] > 500)].sort_values(discount_rt,
                                                       ascending=False)
    # keep only the columns we need
    selected = table.loc[:, ['代码', '名称', discount_rt, '现价', 'T-1估值']]
    return selected
async def sql(self, a):
    all_urls = []
    conn, cursor = POOL_DB().create_conn()
    item = ED_SQL(cursor, a)
    POOL_DB().close_conn(conn, cursor)
    argument = argument_get(item[0])
    argument['info_page'] = 1
    all_urls = Handle_url(all_urls, **argument)
    for j in all_urls:
        kwargs = j[5]
        # "test" distinguishes test crawls from normal ones; the returned data differs
        kwargs['judge_model'] = "test"
        if int(j[3]) == 1:
            # POST crawl
            url = j[0]
            html = await self.AIO_POST(argument, j, url, logger)
        else:
            if int(j[5]['immit_js']) == 1:
                # Ruishu JS protection
                html = await Ray_html(j[2], argument['res_headers'], logger)
            else:
                # GET crawl
                url = j[2]
                html = await self.AIO_GET(argument, url, logger)
        first_url = kwargs['url']
        title_tag = kwargs['title_tag']
        title_re = kwargs['title_re']
        xpath_list = kwargs['xpath_list']
        if "http://tjj.gz.gov.cn/zwgk/gfxwj" in first_url:
            pass
        else:
            html = remove_comments(html)
        kwargs['html'] = html
        if title_tag:
            json_list = Handle_tttt(logger, **kwargs)
        elif title_re:
            json_list = Handle_title_re(**kwargs)
        elif xpath_list:
            json_list = Handle_xpath(**kwargs)
    all = {'info': json_list}
    ck = json.dumps(all)
    now_time = datetime.now()
    m = hashlib.md5()
    pp = str(first_url) + str(now_time)
    m.update(str(pp).encode('utf-8'))
    md = m.hexdigest()  # hash the url into an md5 digest
    self.sql_insert(md, ck)
    POOL_DB().close_db()
    return md
def test_remove_comments(self): # text with comments self.assertEqual(remove_comments("<!--text with comments-->"), "") self.assertEqual(remove_comments("Hello<!--World-->"), "Hello") self.assertEqual(remove_comments("Hello<!--My\nWorld-->"), "Hello") self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), "test whatever") self.assertEqual( remove_comments(b"test <!--\ntextcoment\n--> whatever"), "test whatever") self.assertEqual(remove_comments(b"test <!--"), "test ")
def _safe_html(html_part):
    '''Strip some tags (script, input, etc.) from the given html.

    @type html_part: unicode
    @param html_part: a fragment of html, or a piece of text
    @return: the html with the unwanted tags stripped out
    '''
    if html_part is None:
        return None
    # remove_tags_with_content and remove_tags are used together to drop the tags
    value = remove_tags_with_content(html_part, which_ones=_REMOVE_TAGS)
    value = remove_tags(value, which_ones=_REMOVE_TAGS)
    # drop comments
    value = remove_comments(value)
    return value
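An illustrative call for _safe_html; _REMOVE_TAGS is not defined in the snippet, so a plausible value is assumed for the demo.

_REMOVE_TAGS = ('script', 'input')  # assumed value, only for this demo

print(_safe_html(u'<p>hi<script>x()</script><!-- note --><input/></p>'))
# -> '<p>hi</p>': the script is dropped together with its content,
#    the input tag and the comment are removed, the rest is kept.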
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if "fragment" not in text:
        return False
    if "content" not in text:
        return False
    text = html.remove_tags_with_content(text, ("script", "noscript"))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
def clean_html(text):
    '''
    :param text:
    :return:
    Version : 2020-01-17_ver
    '''
    if text is not None:
        # replace_entities: drops entities such as &nbsp; (spacing), &lsquo;, ...
        body = replace_entities(text)
        # remove_tags: library function that strips tags from the input text
        # remove_tags_with_content: library function that deletes the selected
        # tags together with their contents
        body = replace_entities(
            remove_tags_with_content(body, ('script', 'a', 'h4')))
        body = remove_comments(body)
        body = remove_tags(body)
        # body = re.sub('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', body)  # strip http urls from the text
        # body = re.sub('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', '', body)  # strip email addresses
        # body = re.sub('[\{\}\[\]\/?;:|\)*~`!^\-_+<>@\#$%&\\\=\(]', ' ', body)  # strip special characters
        # body = re.sub('([ㄱ-ㅎㅏ-ㅣ]+)', '', body)  # strip isolated Hangul consonants/vowels
        body_split = body.split()  # split the string into a list
        body = " ".join(body_split)
        return body
    else:
        return text
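A sketch of what clean_html does to a small made-up fragment.

print(clean_html('<div><script>x()</script><a href="#">link</a>'
                 ' hello&nbsp;world <!-- c --></div>'))
# -> 'hello world': scripts and links are dropped with their content,
#    the comment and remaining tags are removed, entities are decoded,
#    and whitespace (including the non-breaking space) is normalized.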
def clear_soup_w3lib(bs4_find_data_group):
    cleared_data_script = remove_tags_with_content(bs4_find_data_group,
                                                   ('script', 'iframe'))
    cleared_data = remove_comments(cleared_data_script)
    return cleared_data
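A small check of clear_soup_w3lib with an invented fragment.

print(clear_soup_w3lib('<div><script>x</script><!-- ad -->ok</div>'))
# -> '<div>ok</div>': the script block goes first, then the comment.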
def parse(self, response):
    global epsilon
    shouldCrawlRule = True
    crawlingRule = response.meta["crawlingRuleEntry"]
    # Figure out if now is the time to crawl this rule and whether this is
    # the first crawl for the rule
    currDateTime = datetime.datetime.now()
    isFirstCrawl = True  # Assume this is the first time we check this crawling rule
    lastCrawlTimestamp = 0
    if crawlingRule.lastcrawltime:
        # The rule was used before
        isFirstCrawl = False
        lastCrawlTimestamp = crawlingRule.lastcrawltime.timestamp()
    deltaTimestamp = currDateTime.timestamp() - lastCrawlTimestamp + epsilon
    self.log("currDateTime=" + str(currDateTime), logging.INFO)
    if isFirstCrawl:
        self.log("lastcrawltime=Never", logging.INFO)
    else:
        self.log("lastcrawltime=" + str(crawlingRule.lastcrawltime),
                 logging.INFO)
    self.log("deltaTimestamp+epsilon=" + str(deltaTimestamp), logging.INFO)
    # Check whether the wait interval between two consecutive crawls has passed
    if (deltaTimestamp / 60) < crawlingRule.crawlperiod:
        shouldCrawlRule = False
        return
    if shouldCrawlRule:
        crawlingRule.lastcrawltime = currDateTime  # A new crawl will begin
        selector = crawlingRule.selectionrule.replace('::text', '').strip()
        # Extract all the content + tags using the selector
        currContent = "".join(response.css(selector).extract())
        if webDiffCrawler.TEXT_ONLY or '::text' in crawlingRule.selectionrule:
            # Ditch the script tags' content and then extract the text
            self.log("Extracting the text from the HTML...", logging.INFO)
            currContent = remove_tags(
                remove_tags_with_content(currContent, ('script', )),
                keep=webDiffCrawler.keptTags)
            currContent = remove_comments(currContent)
            currContent = webDiffCrawler.cleanHtmlContent(currContent)
        # Convert relative URLs to absolute URLs
        currContent = webDiffCrawler.makeURLsAbsolute(response.url,
                                                      currContent)
        currContent = remove_tags(
            remove_tags_with_content(currContent, ('script', )),
            keep=webDiffCrawler.keptTags)
        # Extract URLs to downloadable documents
        currLinks = webDiffCrawler.extractURLsToDocuments(currContent)
        # currContent = html.escape(currContent)
        # currContent = currContent.replace("'", "\\'")
        # currContent = currContent.replace('"', '\\"')
        currContent = currContent.strip()
        # currContent = currContent.encode('unicode-escape').decode()  # Escape special chars like \n \t
        self.log("currContent = " + currContent)
        oldContent = crawlingRule.content
        oldLinks = crawlingRule.docslinks
        # oldContent = oldContent.encode('unicode-escape').decode()
        self.log("oldContent = " + oldContent)
        if not isFirstCrawl:
            # If there is some old content to compare the new content to
            self.sequenceMatcher.set_seqs(oldContent, currContent)
            operations = []
            newContentTagsIntervals = extractTagsIntervals(currContent)
            oldContentTagsIntervals = extractTagsIntervals(oldContent)
            if oldContent:
                operations = self.sequenceMatcher.get_opcodes()
            if len(operations) == 1 and operations[0][0] == 'equal':
                self.log(
                    "The content for id_crawlingrules=" +
                    str(crawlingRule.id_crawlingrules) +
                    " hasn't changed so no new Notification was issued",
                    logging.INFO)
            else:
                self.log(
                    "The content for id_crawlingrules=" +
                    str(crawlingRule.id_crawlingrules) +
                    " has changed => New notification issued", logging.INFO)
                # Update the operations interval indices in order for all the
                # intervals to be closed
                for operation in operations:
                    operation = list(operation)
                    self.log("Initial Operation: " + str(operation),
                             logging.DEBUG)
                    operation[2] -= 1
                    if operation[2] < operation[1]:
                        operation[2] = operation[1]
                    operation[4] -= 1
                    if operation[4] < operation[3]:
                        operation[4] = operation[3]
                    self.log("Final Operation: " + str(operation),
                             logging.DEBUG)
                # Generate colored HTML code
                # coloredCurrContent, detecte = colorDifferences(currContent, operations, tagsIntervals)
                coloredCurrContent, detectedReplacedOrInserted, \
                    coloredOldContent, detectedDeleted = colorDifferences(
                        currContent, oldContent, operations,
                        newContentTagsIntervals, oldContentTagsIntervals)
                # Create a new notification and add it to the 'notifications' table
                recipients = ["all"]
                newNotification = mappedClasses.Notifications(
                    address=crawlingRule.address,
                    id_matchingrule=crawlingRule.id_crawlingrules,
                    modifytime=crawlingRule.lastcrawltime,
                    currcontent=currContent,
                    coloredcurrcontent=coloredCurrContent,
                    currdocslinks=json.dumps(currLinks),
                    detectedreplacedorinserted=detectedReplacedOrInserted,
                    oldcontenttime=crawlingRule.lastmodifytime,
                    oldcontent=oldContent,
                    coloredoldcontent=coloredOldContent,
                    detecteddeleted=detectedDeleted,
                    olddocslinks=oldLinks,
                    changes=json.dumps(operations),
                    recipients=recipients,
                    ackers=[])
                self.session.add(newNotification)
                crawlingRule.content = currContent
                crawlingRule.docslinks = json.dumps(currLinks)
                crawlingRule.lastmodifytime = datetime.datetime.now()
        else:
            # This is the first content we ever get for this rule
            self.log(
                "This is the first crawl for id_crawlingrules=" +
                str(crawlingRule.id_crawlingrules) +
                " so no new Notification was issued", logging.INFO)
            crawlingRule.content = currContent
            crawlingRule.docslinks = json.dumps(currLinks)
            crawlingRule.lastmodifytime = datetime.datetime.now()
        self.session.add(crawlingRule)
        self.session.commit()
def test_no_comments(self):
    # text without comments
    self.assertEqual(remove_comments('text without comments'),
                     'text without comments')
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_comments(b"without comments"), str)
    assert isinstance(remove_comments(b"<!-- with comments -->"), str)
    assert isinstance(remove_comments("without comments"), str)
    assert isinstance(remove_comments("<!-- with comments -->"), str)
def remove_comments(self, html):
    return remove_comments(html)
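For reference, a sketch of how the wrapped w3lib function itself behaves; the unclosed-comment case matches the test elsewhere in this section.

from w3lib.html import remove_comments

print(remove_comments(u'Hello<!--World-->'))         # 'Hello'
print(remove_comments(b'test <!--\nmulti\n--> ok'))  # 'test  ok' (bytes decoded to text)
print(remove_comments(u'test <!--'))                 # 'test ': an unclosed comment is dropped to the end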
async def next_one(j, rizhi_q):
    async with sem:
        html = ""
        kwargs = j[5]
        rizhi_q.put((kwargs['cid'], 3))
        num = 0
        print(j)
        if int(j[3]) == 1:
            # POST crawl
            url = j[0]
            while num < 3:
                try:
                    if num > 0:
                        proxies = await Aiohttp_ip()
                    else:
                        proxies = None
                    html, pd = await Aiohttp_post(url, kwargs['cookie'],
                                                  proxies, j[2])
                    if int(pd) == 1:
                        break
                    elif int(pd) == 2:
                        url = html
                        num += 1
                    elif int(pd) == 3:
                        return "附件"
                except:
                    num += 1
        else:
            if int(j[5]['immit_js']) == 1:
                html = Ray_html(j[2])
            else:
                url = j[2]
                while num < 3:
                    try:
                        if num > 0:
                            proxies = await Aiohttp_ip()
                        else:
                            proxies = None
                        html, pd = await Aiohttp_get(url, kwargs['cookie'],
                                                     proxies)
                        if int(pd) == 1:
                            break
                        elif int(pd) == 2:
                            url = html
                            num += 1
                        elif int(pd) == 3:
                            break
                    except:
                        num += 1
        first_url = kwargs['url']
        title_tag = kwargs['title_tag']
        title_re = kwargs['title_re']
        xpath_list = kwargs['xpath_list']
        cid = kwargs['cid']
        host_name = kwargs['host_name']
        name = kwargs['name']
        if len(html) < 5:
            # write a log entry: the page could not be fetched
            tuple_one = (cid, first_url, 1, str(host_name + name), 1)
            rizhi_q.put((tuple_one, 2))
        else:
            if "http://tjj.gz.gov.cn/zwgk/gfxwj" in first_url:
                pass
            else:
                html = remove_comments(html)
            kwargs['html'] = html
            json_list = []
            if title_tag:
                json_list = Handle_tttt(**kwargs)
            elif title_re:
                json_list = Handle_title_re(**kwargs)
            elif xpath_list:
                json_list = Handle_xpath(**kwargs)
            if json_list:
                for i in json_list:
                    rizhi_q.put((i, 1))
            else:
                # write a log entry: problem with the extraction tags
                tuple_one = (cid, first_url, 3, str(host_name + name), 1)
                rizhi_q.put((tuple_one, 2))
def test_no_comments(self):
    # text without comments
    self.assertEqual(remove_comments("text without comments"),
                     "text without comments")
async def sql(self, a, b):
    all_urls = []
    if b == "zc":
        conn, cursor = create_conn()
        item = ED_SQL(cursor, a)
        close_conn(conn, cursor)
    elif b == "hy":
        conn, cursor = create_conn()
        item = HY_SQL(cursor, a)
        close_conn(conn, cursor)
    argument = argument_get(item[0])
    argument['info_page'] = 1
    all_urls = Handle_url(all_urls, **argument)
    for j in all_urls:
        kwargs = j[5]
        num = 0
        if int(j[3]) == 1:
            # POST crawl
            url = j[0]
            while num < 3:
                try:
                    if num > 0:
                        proxies = await Aiohttp_ip()
                    else:
                        proxies = None
                    html, pd = await Aiohttp_post(url, argument['cookie'],
                                                  proxies, j[2])
                    if int(pd) == 1:
                        break
                    elif int(pd) == 2:
                        url = html
                        num += 1
                    elif int(pd) == 3:
                        return "附件"
                except:
                    num += 1
        else:
            if int(j[5]['immit_js']) == 1:
                html = Ray_html(j[2])
            else:
                url = j[2]
                while num < 3:
                    try:
                        if num > 0:
                            proxies = await Aiohttp_ip()
                        else:
                            proxies = None
                        html, pd = await Aiohttp_get(url, argument['cookie'],
                                                     proxies)
                        if int(pd) == 1:
                            break
                        elif int(pd) == 2:
                            url = html
                            num += 1
                        elif int(pd) == 3:
                            return "附件"
                    except:
                        num += 1
        first_url = kwargs['url']
        title_tag = kwargs['title_tag']
        title_re = kwargs['title_re']
        xpath_list = kwargs['xpath_list']
        if "http://tjj.gz.gov.cn/zwgk/gfxwj" in first_url:
            pass
        else:
            html = remove_comments(html)
        kwargs['html'] = html
        print(html)
        # "test" distinguishes test crawls from normal ones; the returned data differs
        kwargs['judge_model'] = "test"
        if title_tag:
            json_list = Handle_tttt(**kwargs)
        elif title_re:
            json_list = Handle_title_re(**kwargs)
        elif xpath_list:
            json_list = Handle_xpath(**kwargs)
        print(json_list)
    all = {'info': json_list}
    ck = json.dumps(all)
    now_time = datetime.now()
    m = hashlib.md5()
    pp = str(first_url) + str(now_time)
    m.update(str(pp).encode('utf-8'))
    md = m.hexdigest()  # hash the url into an md5 digest
    self.sql_insert(md, ck)
    POOL.close()
    return md
def parse_detail(self, response):
    try:
        html = Selector(text=remove_comments(response.text))
        cdeItem = CdeItem()
        cdeContainer = html.xpath('//*[@id="div_open_close_01"]')
        projectItem = ProjectItem()
        projectMainContainer = html.css(
            '.register_mainB>.apply_zhgl>.cxtj_tm')
        cdeItem['_id'] = projectMainContainer.xpath(
            'table//tr[1]/td[2]/text()').extract_first(default='').strip()
        # registration number
        projectItem['registrationNo'] = cdeItem['_id']
        # trial status
        projectItem['studyStatus'] = projectMainContainer.xpath(
            'table//tr[1]/td[4]/text()').extract_first(default='').strip()
        # sponsor contact
        projectItem['sponsorConcatName'] = projectMainContainer.xpath(
            'table//tr[2]/td[2]/text()').extract_first(default='').strip()
        # date the information was first published
        projectItem['firstPublishDate'] = projectMainContainer.xpath(
            'table//tr[2]/td[4]/text()').extract_first(default='').strip()
        # indication
        projectItem['indication'] = cdeContainer.xpath(
            'table//tr[2]/td[2]/text()').extract_first(default='').strip()
        # popular title of the trial
        projectItem['popularTitle'] = cdeContainer.xpath(
            'table//tr[3]/td[2]/text()').extract_first(default='').strip()
        # scientific title of the trial
        projectItem['studyTitle'] = cdeContainer.xpath(
            'table//tr[4]/td[2]/text()').extract_first(default='').strip()
        # protocol number
        projectItem['protocolNo'] = cdeContainer.xpath(
            'table//tr[5]/td[2]/text()').extract_first(default='').strip()
        # clinical application acceptance number (chemical drug filing number)
        projectItem['acceptNo'] = cdeContainer.xpath(
            'table//tr[6]/td[2]/text()').extract_first(default='').strip()
        # drug name
        projectItem['drugName'] = cdeContainer.xpath(
            'table//tr[7]/td[2]/text()').extract_first(default='').strip()
        # drug type
        projectItem['drugClassification'] = cdeContainer.xpath(
            'table//tr[8]/td[2]/text()').extract_first(default='').strip()
        # trial-related information
        projectItem['otherInfo'] = '<div>{}</div>'.format(
            html.css('.register_main>.register_mainB>.apply_zhgl').xpath(
                './table').extract_first())
        # first subject enrollment date
        # projectItem['firstSubjectEncroEnrollmentDate'] = cdeContainer.xpath('.//table[4]//tr/td/text()').extract_first(default='').strip()
        projectItem['firstSubjectEncroEnrollmentDate'] = cdeContainer.xpath(
            ".//div[@class='STYLE2'][contains(., '第一例受试者入组日期')]/following-sibling::table[1]//td/text()"
        ).extract_first(default='').strip()
        # trial termination date
        # projectItem['testStopDate'] = cdeContainer.xpath('.//table[5]//tr/td/text()').extract_first(default='').strip()
        projectItem['testStopDate'] = cdeContainer.xpath(
            ".//div[@class='STYLE2'][contains(., '试验终止日期')]/following-sibling::table[1]//td/text()"
        ).extract_first(default='').strip()
        # VIII. trial status
        # projectItem['studyStatus2'] = cdeContainer.xpath('.//table[8]//tr/td').extract_first(default='').strip()
        projectItem['studyStatus2'] = re.sub(
            r"\s+", "",
            cdeContainer.xpath(
                ".//div[@class='STYLE2'][contains(., '试验状态')]/following-sibling::table[1]//td/text()"
            ).extract_first(default='').strip())
        cdeItem['Project'] = dict(projectItem)

        ## sponsor information
        sponsorInfoItem = SponsorInfoItem()
        sponsorContainer = cdeContainer.xpath('./table[2]')
        # sponsor names
        sponsorInfoItem['sponsorNames'] = []
        for tr in sponsorContainer.xpath('.//tr[1]/td[2]/table/tr'):
            sponsorInfoItem['sponsorNames'].append(
                tr.xpath('td[2]/text()').extract_first(default='').strip('/'))
        # contact name
        sponsorInfoItem['concatName'] = sponsorContainer.xpath(
            'tr[2]/td[2]/text()').extract_first(default='').strip()
        # phone
        sponsorInfoItem['tel'] = sponsorContainer.xpath(
            'tr[3]/td[2]/text()').extract_first(default='').strip()
        # Email
        sponsorInfoItem['email'] = sponsorContainer.xpath(
            'tr[3]/td[4]/text()').extract_first(default='').strip()
        # address
        sponsorInfoItem['address'] = sponsorContainer.xpath(
            'tr[4]/td[2]/text()').extract_first(default='').strip()
        # zip code
        sponsorInfoItem['zipCode'] = sponsorContainer.xpath(
            'tr[4]/td[4]/text()').extract_first(default='').strip()
        # funding source
        # sponsorInfoItem['costFrom'] = sponsorContainer.xpath('.//tr[5]/td[2]/text()').extract_first(default='').strip()
        sponsorInfoItem['costFrom'] = ''.join([
            item.strip() for item in sponsorContainer.xpath(
                'tr[5]/td[2]/text()').extract()
        ])
        cdeItem['SponsorInfo'] = dict(sponsorInfoItem)

        ## trial design information
        clinicalTrialInfomation = ClinicalTrialInformationItem()
        clinicalTrialContainer = cdeContainer.xpath('./table[3]')
        # trial purpose
        clinicalTrialInfomation[
            'testPurpose'] = clinicalTrialContainer.xpath(
                'tr[2]/td/text()').extract_first(default='').strip()
        # trial classification
        clinicalTrialInfomation['testType'] = clinicalTrialContainer.xpath(
            'tr[4]/td/table//tr[1]/td[3]/text()').extract_first(
                default='').strip()
        # trial phase
        clinicalTrialInfomation[
            'testStaging'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[2]/td[3]/text()').extract_first(
                    default='').strip()
        # design type
        clinicalTrialInfomation[
            'testDesignType'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[3]/td[3]/text()').extract_first(
                    default='').strip()
        # randomization
        clinicalTrialInfomation[
            'testRandomization'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[4]/td[3]/text()').extract_first(
                    default='').strip()
        # blinding
        clinicalTrialInfomation[
            'testBlind'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[5]/td[3]/text()').extract_first(
                    default='').strip()
        # trial scope
        clinicalTrialInfomation[
            'testRange'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[6]/td[3]/text()').extract_first(
                    default='').strip()
        ## 3. subject information
        # age -- strip \t\n\r from the content
        clinicalTrialInfomation['subjectAge'] = re.sub(
            r"\s+", "",
            clinicalTrialContainer.xpath(
                'tr[6]/td[2]/text()').extract_first(default='').strip())
        # gender
        clinicalTrialInfomation[
            'subjectGeneder'] = clinicalTrialContainer.xpath(
                'tr[7]/td[2]/text()').extract_first(default='').strip()
        # healthy subjects
        clinicalTrialInfomation[
            'subjectHealth'] = clinicalTrialContainer.xpath(
                'tr[8]/td[2]/text()').extract_first(default='').strip()
        # target enrollment
        clinicalTrialInfomation[
            'subjectTargetEnrollment'] = clinicalTrialContainer.xpath(
                'tr[11]/td[2]/text()').extract_first(default='').strip()
        # actual enrollment
        clinicalTrialInfomation[
            'subjectActualEnrollment'] = clinicalTrialContainer.xpath(
                'tr[12]/td[2]/text()').extract_first(default='').strip()
        # data monitoring committee
        clinicalTrialInfomation[
            'subjectDMC'] = clinicalTrialContainer.xpath(
                'tr[19]/td/text()').re_first(r'([有|无])')
        # trial injury insurance bought for the subjects
        clinicalTrialInfomation[
            'subjectInjuryInsurance'] = clinicalTrialContainer.xpath(
                'tr[20]/td/text()').re_first(r'([有|无])')
        cdeItem['ClinicalTrialInformation'] = dict(clinicalTrialInfomation)

        ## principal investigator information
        cdeItem['MainInvestigators'] = []
        for table in cdeContainer.xpath('table[6]//tr[2]/td/table'):
            mainInvestigator = MainInvestigatorItem()
            # name -- strip extras from the name field, e.g. "(叶定伟,医学博士)"
            # tempName = table.xpath('tr[1]/td[2]/text()').extract_first(default='').strip()
            tempNames = re.split(
                '[,,]',
                table.xpath(
                    './/td[contains(.,"姓名")]//following-sibling::td[1]/text()'
                ).extract_first(default='').strip())
            mainInvestigator['name'] = tempNames[0] if len(
                tempNames) > 0 else ''
            # professional certification, parsed from the name field, e.g. "(叶定伟,医学博士)"
            mainInvestigator['certification'] = tempNames[1] if len(
                tempNames) > 1 else ''
            # job title
            mainInvestigator['jobTitle'] = table.xpath(
                './/td[contains(.,"职称")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # phone
            mainInvestigator['tel'] = table.xpath(
                './/td[contains(.,"电话")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # Email
            mainInvestigator['email'] = table.xpath(
                './/td[contains(.,"Email")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # address
            mainInvestigator['address'] = table.xpath(
                './/td[contains(.,"邮政地址")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # zip code
            mainInvestigator['zipCode'] = table.xpath(
                './/td[contains(.,"邮编")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # institution name
            mainInvestigator['companyName'] = table.xpath(
                './/td[contains(.,"单位名称")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            cdeItem['MainInvestigators'].append(dict(mainInvestigator))

        ## participating institutions
        cdeItem['Hospitals'] = []
        for tr in cdeContainer.xpath(
                '//*[@id="hspTable"]//tr[position()>1]'):
            hospital = HospitalItem()
            # index
            hospital['no'] = tr.xpath('td[1]/text()').extract_first(
                default='').strip()
            # institution name
            hospital['name'] = tr.xpath('td[2]/text()').extract_first(
                default='').strip()
            # principal investigator
            hospital['mainSponsorName'] = tr.xpath(
                'td[3]/text()').extract_first(default='').strip()
            # country
            hospital['state'] = tr.xpath('td[4]/text()').extract_first(
                default='').strip()
            # province
            hospital['province'] = tr.xpath('td[5]/text()').extract_first(
                default='').strip()
            # city
            hospital['city'] = tr.xpath('td[6]/text()').extract_first(
                default='').strip()
            cdeItem['Hospitals'].append(dict(hospital))

        ## ethics committee information
        cdeItem['ECs'] = []
        for tr in cdeContainer.xpath(
                '//*[@id="div_open_close_01"]/table[7]//tr[position()>1]'):
            ec = ECItem()
            # index
            ec['no'] = tr.xpath('td[1]/text()').extract_first(
                default='').strip()
            # name
            ec['name'] = tr.xpath('td[2]/text()').extract_first(
                default='').strip()
            # review conclusion
            ec['approveResult'] = tr.xpath('td[3]/text()').extract_first(
                default='').strip()
            # review date
            ec['approveDate'] = tr.xpath('td[4]/text()').extract_first(
                default='').strip()
            cdeItem['ECs'].append(dict(ec))
        yield cdeItem
    except Exception as e:
        self.logger.error('parse error: %s?%s', response.url,
                          response.request.body)
        self.logger.error(e)