def test_remove_comments(self):
    # text with comments
    self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
    self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
    self.assertEqual(remove_comments(u'Hello<!--My\nWorld-->'), u'Hello')
    self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"),
                     u'test  whatever')
    self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"),
                     u'test  whatever')
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_comments(b'without comments'), six.text_type)
    assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
    assert isinstance(remove_comments(u'without comments'), six.text_type)
    assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
def test_remove_comments(self):
    # make sure it always returns unicode
    assert isinstance(remove_comments('without comments'), unicode)
    assert isinstance(remove_comments('<!-- with comments -->'), unicode)
    # text without comments
    self.assertEqual(remove_comments(u'text without comments'),
                     u'text without comments')
    # text with comments
    self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
    self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
def normalize_web_content(x,
                          keep=('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'strong'),
                          token='____SECTION____'):
    """Normalize web content.

    Parameters
    ----------
    x : str
        Web content to normalize.
    keep : tuple
        HTML tags to keep.
    token : str or None
        Token to use for replacing kept HTML tags.
        Do not replace if `None`.
    """
    try:
        x = strip_html5_whitespace(x)
        x = remove_comments(x)
        x = remove_tags(x, keep=keep)
        if token:
            x = replace_tags(x, token=token)
        x = replace_entities(x)
        x = replace_escape_chars(x)
    except (TypeError, AttributeError):
        pass
    for part in _rx_web_sectionize.split(x):
        if part:
            yield part
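A minimal usage sketch for normalize_web_content above. The real _rx_web_sectionize is not shown in this snippet, so a plain whitespace splitter stands in for it here; the sample markup is invented.

import re
from w3lib.html import (strip_html5_whitespace, remove_comments, remove_tags,
                        replace_tags, replace_entities, replace_escape_chars)

_rx_web_sectionize = re.compile(r'\s+')  # hypothetical stand-in for the real splitter

sample = u'<h2>Heading</h2> <!-- nav --> Body &amp; text'
print(list(normalize_web_content(sample)))
# With the stand-in splitter this yields
# ['____SECTION____Heading____SECTION____', 'Body', '&', 'text'];
# the exact sectioning depends on the real _rx_web_sectionize pattern.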
def _process_markup(region, textf, tagf, tags_to_purge=_TAGS_TO_PURGE):
    fragments = getattr(region, 'parsed_fragments', None)
    if fragments is None:
        yield textf(region)
        return
    fiter = iter(fragments)
    for fragment in fiter:
        if isinstance(fragment, HtmlTag):
            # skip forward to closing script tags
            tag = fragment.tag
            if tag in tags_to_purge:
                # if opening, keep going until closed
                if fragment.tag_type == HtmlTagType.OPEN_TAG:
                    for probe in fiter:
                        if isinstance(probe, HtmlTag) and \
                                probe.tag == tag and \
                                probe.tag_type == HtmlTagType.CLOSE_TAG:
                            break
            else:
                output = tagf(fragment)
                if output:
                    yield output
        else:
            text = region.htmlpage.fragment_data(fragment)
            text = remove_comments(text)
            text = textf(text)
            if text:
                yield text
def get_base_url(response):
    """Return the base url of the given response, joined with the response url"""
    if response not in _baseurl_cache:
        text = response.text[0:4096]
        text = html.remove_comments(text, response.encoding)
        _baseurl_cache[response] = html.get_base_url(text, response.url,
                                                     response.encoding)
    return _baseurl_cache[response]
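A hedged illustration of the cached helper above (it matches the one in scrapy.utils.response); the response object is built purely for the demo.

from scrapy.http import HtmlResponse

resp = HtmlResponse(
    url='http://example.com/a/',
    body=b'<html><head><!-- <base href="http://evil/"> -->'
         b'<base href="http://example.com/base/"></head></html>',
    encoding='utf-8')
print(get_base_url(resp))  # http://example.com/base/
# The commented-out base tag is ignored because comments are stripped
# before the real <base href> is looked up.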
def filteHTML(string):
    content = remove_comments(string)  # strip HTML comments
    content = html.unescape(content)   # decode entity characters
    # strip non-breaking, ASCII and full-width space characters
    content = content.replace('\xa0', '')
    content = content.replace(' ', '')
    content = content.replace('\u3000', '')
    return content
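A quick check of filteHTML, assuming the three replace calls strip the non-breaking, ASCII and full-width space characters as written above; the input is made up.

print(filteHTML('<!-- ad -->A&nbsp;B C'))
# -> 'ABC': the comment is removed, &nbsp; is unescaped to '\xa0',
#    and all three space variants are stripped.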
def getPageText(self):
    # fetch the main text of the page
    try:
        content = self.browser.find_element_by_css_selector(
            'lph-article-comView').get_attribute('innerHTML')
        html = remove_comments(content)
    except NoSuchElementException:
        html = self.browser.page_source
    return html
def parse_detail(self, response):
    try:
        html = Selector(text=remove_comments(response.text))
        yield cdeItem
    except Exception as e:
        self.logger.error('parse error: %s?%s', response.url,
                          response.request.body)
        self.logger.error(e)
def getPageText(self):
    # fetch the main text of the page
    pageTitle = self.browser.find_element_by_css_selector(
        'div.container div.row > div[ng-show="showTitle"]').get_attribute(
        'outerHTML')
    pageHTML = self.browser.find_element_by_css_selector(
        'div.container div.row > div#wenzhang-content').get_attribute(
        'innerHTML')
    pureHTML = remove_comments(pageHTML)
    html = pageTitle + pureHTML
    return html
def getOilFundData(self):
    try:
        html = requests.get(url=self.url).content.decode('utf-8')
    except:
        wxPusher = Helper.WxPusher()
        wxPusher.sendMessage(title='发生未知错误!',
                             text='访问haoETF网站获取数据失败,请检查代码接口!')
        raise
    # strip unneeded HTML comments
    web_content = remove_comments(html)
    soup = BeautifulSoup(web_content, features='lxml')
    # table header
    thead = soup.body.table.thead.tr.find_all('th')
    thead_list = []
    for child in thead:
        thead_list.append(child.text)
    # table body
    tbody = soup.body.table.tbody.find_all('tr')
    tr_list = []
    for tr in tbody:
        td_list = []
        for td in tr:
            if td.string != '\n':
                td_list.append(td.string)
        tr_list.append(td_list)
    # build the DataFrame
    table = pd.DataFrame(tr_list, columns=thead_list)
    # column-name variables
    discount_rt = thead_list[5]  # premium rate (溢价率)
    volume = thead_list[9]       # turnover (成交额)
    limit = thead_list[-1]       # subscription limit (申购限额)
    # strip the percent sign
    table[discount_rt] = table[discount_rt].str.replace('%', '')
    # convert strings to numeric
    table[discount_rt] = pd.to_numeric(table[discount_rt], errors='ignore')
    table[volume] = pd.to_numeric(table[volume], errors='ignore')
    # keep funds with a premium rate of at least 4%, turnover above 500 (万元),
    # and subscriptions not suspended
    table = table[(table[discount_rt] >= 4)
                  & (table[limit].str.contains('暂停') == False)
                  & (table[volume] > 500)].sort_values(discount_rt,
                                                       ascending=False)
    # keep only the columns we need
    selected = table.loc[:, ['代码', '名称', discount_rt, '现价', 'T-1估值']]
    return selected
async def sql(self, a):
    all_urls = []
    conn, cursor = POOL_DB().create_conn()
    item = ED_SQL(cursor, a)
    POOL_DB().close_conn(conn, cursor)
    argument = argument_get(item[0])
    argument['info_page'] = 1
    all_urls = Handle_url(all_urls, **argument)
    for j in all_urls:
        kwargs = j[5]
        # "test" distinguishes test crawls from normal ones; the returned data differs
        kwargs['judge_model'] = "test"
        if int(j[3]) == 1:
            # POST crawl
            url = j[0]
            html = await self.AIO_POST(argument, j, url, logger)
        else:
            if int(j[5]['immit_js']) == 1:
                # Ruishu JS protection
                html = await Ray_html(j[2], argument['res_headers'], logger)
            else:
                # GET crawl
                url = j[2]
                html = await self.AIO_GET(argument, url, logger)
        first_url = kwargs['url']
        title_tag = kwargs['title_tag']
        title_re = kwargs['title_re']
        xpath_list = kwargs['xpath_list']
        if "http://tjj.gz.gov.cn/zwgk/gfxwj" in first_url:
            pass
        else:
            html = remove_comments(html)
        kwargs['html'] = html
        if title_tag:
            json_list = Handle_tttt(logger, **kwargs)
        elif title_re:
            json_list = Handle_title_re(**kwargs)
        elif xpath_list:
            json_list = Handle_xpath(**kwargs)
    all = {'info': json_list}
    ck = json.dumps(all)
    now_time = datetime.now()
    m = hashlib.md5()
    pp = str(first_url) + str(now_time)
    m.update(str(pp).encode('utf-8'))
    md = m.hexdigest()  # hash the url into an md5 digest
    self.sql_insert(md, ck)
    POOL_DB().close_db()
    return md
def test_remove_comments(self): # text with comments self.assertEqual(remove_comments("<!--text with comments-->"), "") self.assertEqual(remove_comments("Hello<!--World-->"), "Hello") self.assertEqual(remove_comments("Hello<!--My\nWorld-->"), "Hello") self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), "test whatever") self.assertEqual( remove_comments(b"test <!--\ntextcoment\n--> whatever"), "test whatever") self.assertEqual(remove_comments(b"test <!--"), "test ")
def _safe_html(html_part):
    '''Strip some tags (script, input, etc.) from the given html.

    @type html_part: unicode
    @param html_part: a fragment of html, or a piece of text
    @return: the html with the unwanted tags stripped out
    '''
    if html_part is None:
        return None
    # remove_tags_with_content and remove_tags are used together to drop the tags
    value = remove_tags_with_content(html_part, which_ones=_REMOVE_TAGS)
    value = remove_tags(value, which_ones=_REMOVE_TAGS)
    # drop comments
    value = remove_comments(value)
    return value
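An illustrative call for _safe_html; _REMOVE_TAGS is not defined in the snippet, so a plausible value is assumed for the demo.

_REMOVE_TAGS = ('script', 'input')  # assumed value, only for this demo

print(_safe_html(u'<p>hi<script>x()</script><!-- note --><input/></p>'))
# -> '<p>hi</p>': the script is dropped together with its content,
#    the input tag and the comment are removed, the rest is kept.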
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment" content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment" content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """
    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if "fragment" not in text:
        return False
    if "content" not in text:
        return False
    text = html.remove_tags_with_content(text, ("script", "noscript"))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
def clean_html(text):
    '''
    :param text:
    :return:
    Version : 2020-01-17_ver
    '''
    if text is not None:
        # replace_entities: drops entities such as &nbsp; (spacing), &lsquo;, ...
        body = replace_entities(text)
        # remove_tags: library function that strips tags from the input text
        # remove_tags_with_content: library function that deletes the selected
        # tags together with their contents
        body = replace_entities(
            remove_tags_with_content(body, ('script', 'a', 'h4')))
        body = remove_comments(body)
        body = remove_tags(body)
        # body = re.sub('(http|ftp|https)://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', '', body)  # strip http urls from the text
        # body = re.sub('([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', '', body)  # strip email addresses
        # body = re.sub('[\{\}\[\]\/?;:|\)*~`!^\-_+<>@\#$%&\\\=\(]', ' ', body)  # strip special characters
        # body = re.sub('([ㄱ-ㅎㅏ-ㅣ]+)', '', body)  # strip isolated Hangul consonants/vowels
        body_split = body.split()  # split the string into a list
        body = " ".join(body_split)
        return body
    else:
        return text
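A sketch of what clean_html does to a small made-up fragment.

print(clean_html('<div><script>x()</script><a href="#">link</a>'
                 ' hello&nbsp;world <!-- c --></div>'))
# -> 'hello world': scripts and links are dropped with their content,
#    the comment and remaining tags are removed, entities are decoded,
#    and whitespace (including the non-breaking space) is normalized.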
def clear_soup_w3lib(bs4_find_data_group):
    cleared_data_script = remove_tags_with_content(bs4_find_data_group,
                                                   ('script', 'iframe'))
    cleared_data = remove_comments(cleared_data_script)
    return cleared_data
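A small check of clear_soup_w3lib with an invented fragment.

print(clear_soup_w3lib('<div><script>x</script><!-- ad -->ok</div>'))
# -> '<div>ok</div>': the script block goes first, then the comment.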
def parse(self, response):
    global epsilon
    shouldCrawlRule = True
    crawlingRule = response.meta["crawlingRuleEntry"]
    # Figure out if now is the time to crawl this rule and whether this is
    # the first crawl for the rule
    currDateTime = datetime.datetime.now()
    isFirstCrawl = True  # Assume this is the first time we check this crawling rule
    lastCrawlTimestamp = 0
    if crawlingRule.lastcrawltime:
        # The rule was used before
        isFirstCrawl = False
        lastCrawlTimestamp = crawlingRule.lastcrawltime.timestamp()
    deltaTimestamp = currDateTime.timestamp() - lastCrawlTimestamp + epsilon
    self.log("currDateTime=" + str(currDateTime), logging.INFO)
    if isFirstCrawl:
        self.log("lastcrawltime=Never", logging.INFO)
    else:
        self.log("lastcrawltime=" + str(crawlingRule.lastcrawltime),
                 logging.INFO)
    self.log("deltaTimestamp+epsilon=" + str(deltaTimestamp), logging.INFO)
    # Check whether the wait interval between two consecutive crawls has passed
    if (deltaTimestamp / 60) < crawlingRule.crawlperiod:
        shouldCrawlRule = False
        return
    if shouldCrawlRule:
        crawlingRule.lastcrawltime = currDateTime  # A new crawl will begin
        selector = crawlingRule.selectionrule.replace('::text', '').strip()
        # Extract all the content + tags using the selector
        currContent = "".join(response.css(selector).extract())
        if webDiffCrawler.TEXT_ONLY or '::text' in crawlingRule.selectionrule:
            # Ditch the script tags' content and then extract the text
            self.log("Extracting the text from the HTML...", logging.INFO)
            currContent = remove_tags(
                remove_tags_with_content(currContent, ('script', )),
                keep=webDiffCrawler.keptTags)
            currContent = remove_comments(currContent)
            currContent = webDiffCrawler.cleanHtmlContent(currContent)
        # Convert relative URLs to absolute URLs
        currContent = webDiffCrawler.makeURLsAbsolute(response.url,
                                                      currContent)
        currContent = remove_tags(
            remove_tags_with_content(currContent, ('script', )),
            keep=webDiffCrawler.keptTags)
        # Extract URLs to downloadable documents
        currLinks = webDiffCrawler.extractURLsToDocuments(currContent)
        # currContent = html.escape(currContent)
        # currContent = currContent.replace("'", "\\'")
        # currContent = currContent.replace('"', '\\"')
        currContent = currContent.strip()
        # currContent = currContent.encode('unicode-escape').decode()  # Escape special chars like \n \t
        self.log("currContent = " + currContent)
        oldContent = crawlingRule.content
        oldLinks = crawlingRule.docslinks
        # oldContent = oldContent.encode('unicode-escape').decode()
        self.log("oldContent = " + oldContent)
        if not isFirstCrawl:
            # If there is some old content to compare the new content to
            self.sequenceMatcher.set_seqs(oldContent, currContent)
            operations = []
            newContentTagsIntervals = extractTagsIntervals(currContent)
            oldContentTagsIntervals = extractTagsIntervals(oldContent)
            if oldContent:
                operations = self.sequenceMatcher.get_opcodes()
            if len(operations) == 1 and operations[0][0] == 'equal':
                self.log(
                    "The content for id_crawlingrules=" +
                    str(crawlingRule.id_crawlingrules) +
                    " hasn't changed so no new Notification was issued",
                    logging.INFO)
            else:
                self.log(
                    "The content for id_crawlingrules=" +
                    str(crawlingRule.id_crawlingrules) +
                    " has changed => New notification issued", logging.INFO)
                # Update the operations interval indices in order for all the
                # intervals to be closed
                for operation in operations:
                    operation = list(operation)
                    self.log("Initial Operation: " + str(operation),
                             logging.DEBUG)
                    operation[2] -= 1
                    if operation[2] < operation[1]:
                        operation[2] = operation[1]
                    operation[4] -= 1
                    if operation[4] < operation[3]:
                        operation[4] = operation[3]
                    self.log("Final Operation: " + str(operation),
                             logging.DEBUG)
                # Generate colored HTML code
                # coloredCurrContent, detecte = colorDifferences(currContent, operations, tagsIntervals)
                coloredCurrContent, detectedReplacedOrInserted, \
                    coloredOldContent, detectedDeleted = colorDifferences(
                        currContent, oldContent, operations,
                        newContentTagsIntervals, oldContentTagsIntervals)
                # Create a new notification and add it to the 'notifications' table
                recipients = ["all"]
                newNotification = mappedClasses.Notifications(
                    address=crawlingRule.address,
                    id_matchingrule=crawlingRule.id_crawlingrules,
                    modifytime=crawlingRule.lastcrawltime,
                    currcontent=currContent,
                    coloredcurrcontent=coloredCurrContent,
                    currdocslinks=json.dumps(currLinks),
                    detectedreplacedorinserted=detectedReplacedOrInserted,
                    oldcontenttime=crawlingRule.lastmodifytime,
                    oldcontent=oldContent,
                    coloredoldcontent=coloredOldContent,
                    detecteddeleted=detectedDeleted,
                    olddocslinks=oldLinks,
                    changes=json.dumps(operations),
                    recipients=recipients,
                    ackers=[])
                self.session.add(newNotification)
                crawlingRule.content = currContent
                crawlingRule.docslinks = json.dumps(currLinks)
                crawlingRule.lastmodifytime = datetime.datetime.now()
        else:
            # This is the first content we ever get for this rule
            self.log(
                "This is the first crawl for id_crawlingrules=" +
                str(crawlingRule.id_crawlingrules) +
                " so no new Notification was issued", logging.INFO)
            crawlingRule.content = currContent
            crawlingRule.docslinks = json.dumps(currLinks)
            crawlingRule.lastmodifytime = datetime.datetime.now()
        self.session.add(crawlingRule)
        self.session.commit()
def test_no_comments(self):
    # text without comments
    self.assertEqual(remove_comments('text without comments'),
                     'text without comments')
def test_returns_unicode(self):
    # make sure it always returns unicode
    assert isinstance(remove_comments(b"without comments"), str)
    assert isinstance(remove_comments(b"<!-- with comments -->"), str)
    assert isinstance(remove_comments("without comments"), str)
    assert isinstance(remove_comments("<!-- with comments -->"), str)
def remove_comments(self, html):
    return remove_comments(html)
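For reference, a sketch of how the wrapped w3lib function itself behaves; the unclosed-comment case matches the test elsewhere in this section.

from w3lib.html import remove_comments

print(remove_comments(u'Hello<!--World-->'))         # 'Hello'
print(remove_comments(b'test <!--\nmulti\n--> ok'))  # 'test  ok' (bytes decoded to text)
print(remove_comments(u'test <!--'))                 # 'test ': an unclosed comment is dropped to the end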
async def next_one(j, rizhi_q):
    async with sem:
        html = ""
        kwargs = j[5]
        rizhi_q.put((kwargs['cid'], 3))
        num = 0
        print(j)
        if int(j[3]) == 1:
            # POST crawl
            url = j[0]
            while num < 3:
                try:
                    if num > 0:
                        proxies = await Aiohttp_ip()
                    else:
                        proxies = None
                    html, pd = await Aiohttp_post(url, kwargs['cookie'],
                                                  proxies, j[2])
                    if int(pd) == 1:
                        break
                    elif int(pd) == 2:
                        url = html
                        num += 1
                    elif int(pd) == 3:
                        return "附件"
                except:
                    num += 1
        else:
            if int(j[5]['immit_js']) == 1:
                html = Ray_html(j[2])
            else:
                url = j[2]
                while num < 3:
                    try:
                        if num > 0:
                            proxies = await Aiohttp_ip()
                        else:
                            proxies = None
                        html, pd = await Aiohttp_get(url, kwargs['cookie'],
                                                     proxies)
                        if int(pd) == 1:
                            break
                        elif int(pd) == 2:
                            url = html
                            num += 1
                        elif int(pd) == 3:
                            break
                    except:
                        num += 1
        first_url = kwargs['url']
        title_tag = kwargs['title_tag']
        title_re = kwargs['title_re']
        xpath_list = kwargs['xpath_list']
        cid = kwargs['cid']
        host_name = kwargs['host_name']
        name = kwargs['name']
        if len(html) < 5:
            # write a log entry: the page could not be fetched
            tuple_one = (cid, first_url, 1, str(host_name + name), 1)
            rizhi_q.put((tuple_one, 2))
        else:
            if "http://tjj.gz.gov.cn/zwgk/gfxwj" in first_url:
                pass
            else:
                html = remove_comments(html)
            kwargs['html'] = html
            json_list = []
            if title_tag:
                json_list = Handle_tttt(**kwargs)
            elif title_re:
                json_list = Handle_title_re(**kwargs)
            elif xpath_list:
                json_list = Handle_xpath(**kwargs)
            if json_list:
                for i in json_list:
                    rizhi_q.put((i, 1))
            else:
                # write a log entry: problem with the extraction tags
                tuple_one = (cid, first_url, 3, str(host_name + name), 1)
                rizhi_q.put((tuple_one, 2))
def test_no_comments(self):
    # text without comments
    self.assertEqual(remove_comments("text without comments"),
                     "text without comments")
async def sql(self, a, b):
    all_urls = []
    if b == "zc":
        conn, cursor = create_conn()
        item = ED_SQL(cursor, a)
        close_conn(conn, cursor)
    elif b == "hy":
        conn, cursor = create_conn()
        item = HY_SQL(cursor, a)
        close_conn(conn, cursor)
    argument = argument_get(item[0])
    argument['info_page'] = 1
    all_urls = Handle_url(all_urls, **argument)
    for j in all_urls:
        kwargs = j[5]
        num = 0
        if int(j[3]) == 1:
            # POST crawl
            url = j[0]
            while num < 3:
                try:
                    if num > 0:
                        proxies = await Aiohttp_ip()
                    else:
                        proxies = None
                    html, pd = await Aiohttp_post(url, argument['cookie'],
                                                  proxies, j[2])
                    if int(pd) == 1:
                        break
                    elif int(pd) == 2:
                        url = html
                        num += 1
                    elif int(pd) == 3:
                        return "附件"
                except:
                    num += 1
        else:
            if int(j[5]['immit_js']) == 1:
                html = Ray_html(j[2])
            else:
                url = j[2]
                while num < 3:
                    try:
                        if num > 0:
                            proxies = await Aiohttp_ip()
                        else:
                            proxies = None
                        html, pd = await Aiohttp_get(url, argument['cookie'],
                                                     proxies)
                        if int(pd) == 1:
                            break
                        elif int(pd) == 2:
                            url = html
                            num += 1
                        elif int(pd) == 3:
                            return "附件"
                    except:
                        num += 1
        first_url = kwargs['url']
        title_tag = kwargs['title_tag']
        title_re = kwargs['title_re']
        xpath_list = kwargs['xpath_list']
        if "http://tjj.gz.gov.cn/zwgk/gfxwj" in first_url:
            pass
        else:
            html = remove_comments(html)
        kwargs['html'] = html
        print(html)
        # "test" distinguishes test crawls from normal ones; the returned data differs
        kwargs['judge_model'] = "test"
        if title_tag:
            json_list = Handle_tttt(**kwargs)
        elif title_re:
            json_list = Handle_title_re(**kwargs)
        elif xpath_list:
            json_list = Handle_xpath(**kwargs)
        print(json_list)
    all = {'info': json_list}
    ck = json.dumps(all)
    now_time = datetime.now()
    m = hashlib.md5()
    pp = str(first_url) + str(now_time)
    m.update(str(pp).encode('utf-8'))
    md = m.hexdigest()  # hash the url into an md5 digest
    self.sql_insert(md, ck)
    POOL.close()
    return md
def parse_detail(self, response):
    try:
        html = Selector(text=remove_comments(response.text))
        cdeItem = CdeItem()
        cdeContainer = html.xpath('//*[@id="div_open_close_01"]')
        projectItem = ProjectItem()
        projectMainContainer = html.css(
            '.register_mainB>.apply_zhgl>.cxtj_tm')
        cdeItem['_id'] = projectMainContainer.xpath(
            'table//tr[1]/td[2]/text()').extract_first(default='').strip()
        # registration number
        projectItem['registrationNo'] = cdeItem['_id']
        # trial status
        projectItem['studyStatus'] = projectMainContainer.xpath(
            'table//tr[1]/td[4]/text()').extract_first(default='').strip()
        # sponsor contact
        projectItem['sponsorConcatName'] = projectMainContainer.xpath(
            'table//tr[2]/td[2]/text()').extract_first(default='').strip()
        # date the information was first published
        projectItem['firstPublishDate'] = projectMainContainer.xpath(
            'table//tr[2]/td[4]/text()').extract_first(default='').strip()
        # indication
        projectItem['indication'] = cdeContainer.xpath(
            'table//tr[2]/td[2]/text()').extract_first(default='').strip()
        # popular title of the trial
        projectItem['popularTitle'] = cdeContainer.xpath(
            'table//tr[3]/td[2]/text()').extract_first(default='').strip()
        # scientific title of the trial
        projectItem['studyTitle'] = cdeContainer.xpath(
            'table//tr[4]/td[2]/text()').extract_first(default='').strip()
        # protocol number
        projectItem['protocolNo'] = cdeContainer.xpath(
            'table//tr[5]/td[2]/text()').extract_first(default='').strip()
        # clinical application acceptance number (chemical drug filing number)
        projectItem['acceptNo'] = cdeContainer.xpath(
            'table//tr[6]/td[2]/text()').extract_first(default='').strip()
        # drug name
        projectItem['drugName'] = cdeContainer.xpath(
            'table//tr[7]/td[2]/text()').extract_first(default='').strip()
        # drug type
        projectItem['drugClassification'] = cdeContainer.xpath(
            'table//tr[8]/td[2]/text()').extract_first(default='').strip()
        # trial-related information
        projectItem['otherInfo'] = '<div>{}</div>'.format(
            html.css('.register_main>.register_mainB>.apply_zhgl').xpath(
                './table').extract_first())
        # first subject enrollment date
        # projectItem['firstSubjectEncroEnrollmentDate'] = cdeContainer.xpath('.//table[4]//tr/td/text()').extract_first(default='').strip()
        projectItem['firstSubjectEncroEnrollmentDate'] = cdeContainer.xpath(
            ".//div[@class='STYLE2'][contains(., '第一例受试者入组日期')]/following-sibling::table[1]//td/text()"
        ).extract_first(default='').strip()
        # trial termination date
        # projectItem['testStopDate'] = cdeContainer.xpath('.//table[5]//tr/td/text()').extract_first(default='').strip()
        projectItem['testStopDate'] = cdeContainer.xpath(
            ".//div[@class='STYLE2'][contains(., '试验终止日期')]/following-sibling::table[1]//td/text()"
        ).extract_first(default='').strip()
        # VIII. trial status
        # projectItem['studyStatus2'] = cdeContainer.xpath('.//table[8]//tr/td').extract_first(default='').strip()
        projectItem['studyStatus2'] = re.sub(
            r"\s+", "",
            cdeContainer.xpath(
                ".//div[@class='STYLE2'][contains(., '试验状态')]/following-sibling::table[1]//td/text()"
            ).extract_first(default='').strip())
        cdeItem['Project'] = dict(projectItem)

        ## sponsor information
        sponsorInfoItem = SponsorInfoItem()
        sponsorContainer = cdeContainer.xpath('./table[2]')
        # sponsor names
        sponsorInfoItem['sponsorNames'] = []
        for tr in sponsorContainer.xpath('.//tr[1]/td[2]/table/tr'):
            sponsorInfoItem['sponsorNames'].append(
                tr.xpath('td[2]/text()').extract_first(default='').strip('/'))
        # contact name
        sponsorInfoItem['concatName'] = sponsorContainer.xpath(
            'tr[2]/td[2]/text()').extract_first(default='').strip()
        # phone
        sponsorInfoItem['tel'] = sponsorContainer.xpath(
            'tr[3]/td[2]/text()').extract_first(default='').strip()
        # Email
        sponsorInfoItem['email'] = sponsorContainer.xpath(
            'tr[3]/td[4]/text()').extract_first(default='').strip()
        # address
        sponsorInfoItem['address'] = sponsorContainer.xpath(
            'tr[4]/td[2]/text()').extract_first(default='').strip()
        # zip code
        sponsorInfoItem['zipCode'] = sponsorContainer.xpath(
            'tr[4]/td[4]/text()').extract_first(default='').strip()
        # funding source
        # sponsorInfoItem['costFrom'] = sponsorContainer.xpath('.//tr[5]/td[2]/text()').extract_first(default='').strip()
        sponsorInfoItem['costFrom'] = ''.join([
            item.strip() for item in sponsorContainer.xpath(
                'tr[5]/td[2]/text()').extract()
        ])
        cdeItem['SponsorInfo'] = dict(sponsorInfoItem)

        ## trial design information
        clinicalTrialInfomation = ClinicalTrialInformationItem()
        clinicalTrialContainer = cdeContainer.xpath('./table[3]')
        # trial purpose
        clinicalTrialInfomation[
            'testPurpose'] = clinicalTrialContainer.xpath(
                'tr[2]/td/text()').extract_first(default='').strip()
        # trial classification
        clinicalTrialInfomation['testType'] = clinicalTrialContainer.xpath(
            'tr[4]/td/table//tr[1]/td[3]/text()').extract_first(
                default='').strip()
        # trial phase
        clinicalTrialInfomation[
            'testStaging'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[2]/td[3]/text()').extract_first(
                    default='').strip()
        # design type
        clinicalTrialInfomation[
            'testDesignType'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[3]/td[3]/text()').extract_first(
                    default='').strip()
        # randomization
        clinicalTrialInfomation[
            'testRandomization'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[4]/td[3]/text()').extract_first(
                    default='').strip()
        # blinding
        clinicalTrialInfomation[
            'testBlind'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[5]/td[3]/text()').extract_first(
                    default='').strip()
        # trial scope
        clinicalTrialInfomation[
            'testRange'] = clinicalTrialContainer.xpath(
                'tr[4]/td/table//tr[6]/td[3]/text()').extract_first(
                    default='').strip()
        ## 3. subject information
        # age -- strip \t\n\r from the content
        clinicalTrialInfomation['subjectAge'] = re.sub(
            r"\s+", "",
            clinicalTrialContainer.xpath(
                'tr[6]/td[2]/text()').extract_first(default='').strip())
        # gender
        clinicalTrialInfomation[
            'subjectGeneder'] = clinicalTrialContainer.xpath(
                'tr[7]/td[2]/text()').extract_first(default='').strip()
        # healthy subjects
        clinicalTrialInfomation[
            'subjectHealth'] = clinicalTrialContainer.xpath(
                'tr[8]/td[2]/text()').extract_first(default='').strip()
        # target enrollment
        clinicalTrialInfomation[
            'subjectTargetEnrollment'] = clinicalTrialContainer.xpath(
                'tr[11]/td[2]/text()').extract_first(default='').strip()
        # actual enrollment
        clinicalTrialInfomation[
            'subjectActualEnrollment'] = clinicalTrialContainer.xpath(
                'tr[12]/td[2]/text()').extract_first(default='').strip()
        # data monitoring committee
        clinicalTrialInfomation[
            'subjectDMC'] = clinicalTrialContainer.xpath(
                'tr[19]/td/text()').re_first(r'([有|无])')
        # trial injury insurance bought for the subjects
        clinicalTrialInfomation[
            'subjectInjuryInsurance'] = clinicalTrialContainer.xpath(
                'tr[20]/td/text()').re_first(r'([有|无])')
        cdeItem['ClinicalTrialInformation'] = dict(clinicalTrialInfomation)

        ## principal investigator information
        cdeItem['MainInvestigators'] = []
        for table in cdeContainer.xpath('table[6]//tr[2]/td/table'):
            mainInvestigator = MainInvestigatorItem()
            # name -- strip extras from the name field, e.g. "(叶定伟,医学博士)"
            # tempName = table.xpath('tr[1]/td[2]/text()').extract_first(default='').strip()
            tempNames = re.split(
                '[,,]',
                table.xpath(
                    './/td[contains(.,"姓名")]//following-sibling::td[1]/text()'
                ).extract_first(default='').strip())
            mainInvestigator['name'] = tempNames[0] if len(
                tempNames) > 0 else ''
            # professional certification, parsed from the name field, e.g. "(叶定伟,医学博士)"
            mainInvestigator['certification'] = tempNames[1] if len(
                tempNames) > 1 else ''
            # job title
            mainInvestigator['jobTitle'] = table.xpath(
                './/td[contains(.,"职称")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # phone
            mainInvestigator['tel'] = table.xpath(
                './/td[contains(.,"电话")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # Email
            mainInvestigator['email'] = table.xpath(
                './/td[contains(.,"Email")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # address
            mainInvestigator['address'] = table.xpath(
                './/td[contains(.,"邮政地址")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # zip code
            mainInvestigator['zipCode'] = table.xpath(
                './/td[contains(.,"邮编")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            # institution name
            mainInvestigator['companyName'] = table.xpath(
                './/td[contains(.,"单位名称")]//following-sibling::td[1]/text()'
            ).extract_first(default='').strip()
            cdeItem['MainInvestigators'].append(dict(mainInvestigator))

        ## participating institutions
        cdeItem['Hospitals'] = []
        for tr in cdeContainer.xpath(
                '//*[@id="hspTable"]//tr[position()>1]'):
            hospital = HospitalItem()
            # index
            hospital['no'] = tr.xpath('td[1]/text()').extract_first(
                default='').strip()
            # institution name
            hospital['name'] = tr.xpath('td[2]/text()').extract_first(
                default='').strip()
            # principal investigator
            hospital['mainSponsorName'] = tr.xpath(
                'td[3]/text()').extract_first(default='').strip()
            # country
            hospital['state'] = tr.xpath('td[4]/text()').extract_first(
                default='').strip()
            # province
            hospital['province'] = tr.xpath('td[5]/text()').extract_first(
                default='').strip()
            # city
            hospital['city'] = tr.xpath('td[6]/text()').extract_first(
                default='').strip()
            cdeItem['Hospitals'].append(dict(hospital))

        ## ethics committee information
        cdeItem['ECs'] = []
        for tr in cdeContainer.xpath(
                '//*[@id="div_open_close_01"]/table[7]//tr[position()>1]'):
            ec = ECItem()
            # index
            ec['no'] = tr.xpath('td[1]/text()').extract_first(
                default='').strip()
            # name
            ec['name'] = tr.xpath('td[2]/text()').extract_first(
                default='').strip()
            # review conclusion
            ec['approveResult'] = tr.xpath('td[3]/text()').extract_first(
                default='').strip()
            # review date
            ec['approveDate'] = tr.xpath('td[4]/text()').extract_first(
                default='').strip()
            cdeItem['ECs'].append(dict(ec))
        yield cdeItem
    except Exception as e:
        self.logger.error('parse error: %s?%s', response.url,
                          response.request.body)
        self.logger.error(e)