Example #1
    def test_with_tags(self):
        # text with tags
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')

        self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>')
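For reference, a minimal standalone sketch of the call these tests exercise: w3lib's remove_tags_with_content(text, which_ones=()) drops the listed tags together with everything inside them and leaves all other markup in place.

from w3lib.html import remove_tags_with_content

sample = '<div>keep me <script>drop_me();</script><p>and me</p></div>'
# Only <script> elements (and their contents) are removed; other tags survive.
print(remove_tags_with_content(sample, which_ones=('script',)))
# -> '<div>keep me <p>and me</p></div>'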
Example #3
 def parse_modal_infor(self,response):
     res = response.text
     result = response.meta.get("item")
     res_convert = json.loads(res)
     res_convert = res_convert['spec']
     res_convert_del_header = re.sub(r'<li><label>([a-zA-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳýỵỷỹ\s\.&,.-])+<\/label><\/li>',"",res_convert)
     res_key = remove_tags_with_content(res_convert_del_header, which_ones=('div',))
     res_key_replace_tags = replace_tags(res_key,'|','utf-8')
     res_key_array = list()
     res_key_gen = (value for value in res_key_replace_tags.split("||||"))
     for val in res_key_gen:
         res_key_array.append(val.replace("||",""))
     res_val = remove_tags_with_content(res_convert_del_header,which_ones=('span',))
     res_val_remove_tags = remove_tags(res_val,which_ones = ('a','li',))  
     res_val = replace_tags(res_val_remove_tags,'|','utf-8')
     res_val = res_val.split("||")
     res_val_array = list()
     res_val_gen = (val for val in res_val)
     for val in res_val_gen:
         res_val_array.append(val.replace("|",""))
     res_modal = dict(zip(res_key_array,res_val_array))
     result['data'] = res_modal
     # spec = Selector(text=json.loads(response.text)['spec'])
     # spec_values_container = list(filter(lambda x: len(x.xpath('./@class')) != 0, spec.css('li')))
     # spec_values_dict_keys = [ x.xpath('./span/text()').get() if x.xpath('./span/text()').get().split() else x.xpath('./span/div/text()').get() for x in spec_values_container ]
     # spec_values_dict_values = [remove_tags(x.xpath('./div').get()) for x in spec_values_container]
     yield result
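The commented-out lines above hint at a Selector-based version of the same key/value extraction. A hedged sketch of how that might be completed, assuming the 'spec' HTML really is a list of <li> rows holding a <span> key and a <div> value (the layout is taken from the commented-out XPaths, not verified):

import json
from scrapy import Selector
from w3lib.html import remove_tags

def parse_spec(response):
    # Illustrative only; mirrors the commented-out approach above.
    spec = Selector(text=json.loads(response.text)['spec'])
    rows = [li for li in spec.css('li') if li.xpath('./@class')]
    keys = []
    for li in rows:
        key = (li.xpath('./span/text()').get() or '').strip()
        keys.append(key or (li.xpath('./span/div/text()').get() or '').strip())
    values = [remove_tags(li.xpath('./div').get() or '').strip() for li in rows]
    return dict(zip(keys, values))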
Example #4
 def test_without_tags(self):
     # text without tags
     self.assertEqual(remove_tags_with_content('no tags'), 'no tags')
     self.assertEqual(
         remove_tags_with_content('no tags', which_ones=(
             'p',
             'b',
         )), 'no tags')
Example #5
 def test_without_tags(self):
     # text without tags
     self.assertEqual(remove_tags_with_content("no tags"), "no tags")
     self.assertEqual(
         remove_tags_with_content(
             "no tags",
             which_ones=(
                 "p",
                 "b",
             ),
         ),
         "no tags",
     )
Example #6
    def test_with_tags(self):
        # text with tags
        self.assertEqual(remove_tags_with_content("<p>one p tag</p>"),
                         "<p>one p tag</p>")
        self.assertEqual(
            remove_tags_with_content("<p>one p tag</p>", which_ones=("p", )),
            "")

        self.assertEqual(
            remove_tags_with_content(
                "<b>not will removed</b><i>i will removed</i>",
                which_ones=("i", )),
            "<b>not will removed</b>",
        )
Example #7
    def process_item(self, item, spider):
        """
        Process content based on its type.
        """
        content_type = item.get('content_type', 'UNKNOWN')
        log = structlog.get_logger().bind(
            event = 'PROCESS_ITEM',
            content_type = content_type,
            source_url = item['source_url'])

        if content_type == 'HTML':
            plain_content = html.replace_escape_chars(
                html.remove_tags(
                    html.remove_tags_with_content(
                        item['content'],
                        which_ones = ('script',)
                    )
                ),
                which_ones = ('\n','\t','\r','   '),
                replace_by = '')
            item['content'] = plain_content
            log.info(message = 'HTML content extracted')
        # @TODO
        elif content_type in ['PDF','MS_WORD', 'LIBREOFFICE', 'POWERPOINT', 'CSV', 'XLSX', 'XLS']:
            log.info(
                event = 'QUEUE_CONTENT',
                message = 'Pushing content for deferred processing')
        elif content_type in [None, 'UNKNOWN']:
            log.warn(error = 'UNRECOGNIZED_CONTENT_TYPE')

        return item
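A small illustration of what the HTML branch above produces, as a sketch using the same w3lib helpers and the same which_ones arguments as the pipeline:

from w3lib import html

raw = '<body>\n\t<script>alert(1)</script><p>Hello,\nworld</p></body>'
plain = html.replace_escape_chars(
    html.remove_tags(html.remove_tags_with_content(raw, which_ones=('script',))),
    which_ones=('\n', '\t', '\r', '   '),
    replace_by='')
# <script> and its contents are dropped, the remaining tags are stripped,
# and newlines/tabs/triple spaces are removed:
print(plain)  # -> 'Hello,world'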
Example #8
 def test_tags_with_shared_prefix(self):
     # https://github.com/scrapy/w3lib/issues/114
     self.assertEqual(
         remove_tags_with_content("<span></span><s></s>",
                                  which_ones=("s", )),
         "<span></span>",
     )
Example #9
    def process(self, data, url_object):
        """Process HTML data.

        Replaces entities and removes tags (except comments) before
        processing with TextProcessor.
        """
        logging.info("Process HTML %s" % url_object.url)
        try:
            encoding, data = get_codec_and_string(data)
            # Remove style tags to avoid false positives from inline styles
            data = remove_tags_with_content(data, which_ones=('style',))
        except UnicodeDecodeError as ude:
            logging.error('UnicodeDecodeError in handle_error_method: {}'.format(ude))
            logging.error('Error happened for file: {}'.format(url_object.url))
            return False

        # Convert HTML entities to their unicode representation
        entity_replaced_html = replace_entities(data)

        # Collapse whitespace (including newlines), since extra whitespace is
        # not significant in HTML (except inside comment tags)
        collapsed_html = _whitespace_re.sub(' ', entity_replaced_html)

        # Replace tags with <> character to make sure text processor
        # doesn't match across tag boundaries.
        replace_tags_text = _html_tag_re.sub('<>', collapsed_html)

        return self.text_processor.process(replace_tags_text, url_object)
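The method above relies on two module-level regexes that are not shown in the snippet. A plausible sketch of them, as an assumption rather than the project's actual definitions:

import re

# Assumed helpers referenced by process() above (illustrative only).
_whitespace_re = re.compile(r'\s+')           # collapse runs of whitespace, including newlines
_html_tag_re = re.compile(r'<(?!!--)[^>]*>')  # match whole tags, but leave <!-- comments --> alone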
Example #10
    def parse_item(self, response):
        sel = Selector(response)
        filename = "test/english/T" + str(self.count) + ".txt"
        tags_removed_text = remove_tags(
            remove_tags_with_content(
                sel.xpath('//*[@id="ja-content"]').extract()[0],
                which_ones=('script', )))
        tabs_removed_text = tags_removed_text.replace("\t",
                                                      '').replace('\r', '')
        newLineRemovedText = re.sub(r'(\s*\n\s*)\1*', '\n', tabs_removed_text)
        # with open(filename, "w") as out_file:
        #     for item in newLineRemovedText.split('\n'):
        #         out_file.write("%s\n" % item.strip())
        #
        # followed_urls = "followedTestURLs/englishUrls.txt"
        # with open(followed_urls, "a") as out_file:
        #     out_file.write("%s\n" % response.request.url)
        # docum = OrgItem()
        # docum['text'] = newLineRemovedText
        # docum['link'] = response.request.url

        self.writer.writerow({
            'text': newLineRemovedText,
            'link': response.request.url
        })  # writing data into file.
        self.count += 1
        print(self.count)
Example #11
    def handle_blog(self, response):
        hxs = HtmlXPathSelector(response)
        item = BuzzCrawlerItem()

        item['url'] = response.url
        item['date'] = dateutil.parser.parse(hxs.xpath(".//li[@class='entryDate']/time/@datetime").extract()[0])
        item['title'] = hxs.xpath(".//h1[@id='headline']/text()").extract()[0].strip()
        item['blurb'] = ""

        unprocessed_content = hxs.xpath(".//span[@itemprop='articleBody']").extract()[0]

        sane_html = remove_tags_with_content(unprocessed_content,("noscript","div","h6"))

        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True

        processed_content = h.handle(sane_html)

        if "noscript" in unprocessed_content:
            print sane_html.encode("iso-8859-15", "replace")
            print "*"*98

        item['content'] = markdown(processed_content)
        item['source'] = 'wired.com'
        yield item
Example #12
    def handle_blog(self, response):
        hxs = HtmlXPathSelector(response)
        item = BuzzCrawlerItem()

        item['url'] = response.url
        item['date'] = dateutil.parser.parse(
            hxs.xpath(".//li[@class='entryDate']/time/@datetime").extract()[0])
        item['title'] = hxs.xpath(
            ".//h1[@id='headline']/text()").extract()[0].strip()
        item['blurb'] = ""

        unprocessed_content = hxs.xpath(
            ".//span[@itemprop='articleBody']").extract()[0]

        sane_html = remove_tags_with_content(unprocessed_content,
                                             ("noscript", "div", "h6"))

        h = html2text.HTML2Text()
        h.ignore_links = True
        h.ignore_images = True

        processed_content = h.handle(sane_html)

        if "noscript" in unprocessed_content:
            print sane_html.encode("iso-8859-15", "replace")
            print "*" * 98

        item['content'] = markdown(processed_content)
        item['source'] = 'wired.com'
        yield item
Example #13
    def parse_condition_page(self, response):
        """
        Verify that 1 item is returned and 3 additional requests are made (one
        for each one of the tab items). Also verify that the expected fields
        are contained in the returned item.

        @url http://www.nhs.uk/conditions/Food-poisoning/Pages/Introduction.aspx
        @returns requests 3 3
        @returns items 1
        @scrapes main_content url title

        """
        main_content = response.css('.healthaz-content').extract_first()
        main_content_text = remove_tags(
            remove_tags_with_content(main_content, ('script', 'noscript')))

        yield {
            "main_content": main_content_text,
            "url": response.url,
            "title": response.css('.healthaz-header h1::text').extract_first()
        }

        for tab_href in response.css(
                '#ctl00_PlaceHolderMain_articles a::attr(href)'):
            yield response.follow(tab_href, self.parse_condition_page)
Example #14
    def parse_with_term(self, response, term, newspaper):
        # clean response from scripts
        response_content = remove_tags_with_content(response.text, (
            'script',
            'a',
        ))
        selector = Selector(text=response_content)
        term_query = '//body//*[contains(text(), "%s")]/text()' % self.term
        term_nodes = selector.select(term_query).extract()
        if not term_nodes:
            return

        item = {
            'url': response.url,
            'newspaper': newspaper,
            'term': term,
            'response_content': response.text,
            'timestamp': time.time()
        }

        related_terms = self.get_related_terms(term_nodes)
        if term in related_terms:
            related_terms.pop(term)
        item['related_terms'] = dict(related_terms)

        #with open(self.term, 'a') as content_file:
        #    content_file.write("%s\n" % item)
        cb_client.insert(str(uuid.uuid4()), item)

        # update scraper process
        self.update_scraper_summary(item)
        return item
Example #15
 def parse_details_page_right_pane_kv(self, selector):
     for item in selector.css('div > div'):
         key = item.css('span ::text').re_first('(.+):\Z').strip()
         value = Selector(text=remove_tags_with_content(item.get(),
                                                        which_ones=['span'
                                                                    ]),
                          type='html').css('html > body > div')
         yield (key, value)
Example #16
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
     assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags_with_content(b'<a>link</a>', which_ones=('b',)), six.text_type)
     assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
     assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
     assert isinstance(remove_tags_with_content(u'<a>link</a>', which_ones=('b',)), six.text_type)
Example #18
    def parse(self, response):
        """
        Parse the response page
        """
        # Skip error URLs
        if response.status != 200:
            return

        data = json.loads(response.text)

        title = data['title']
        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        description = data['description']
        data = data['content']

        # Remove <script>, <sup>, <math>, <style> tags along with their content
        paragraph = remove_tags_with_content(data,
                                             which_ones=('script', 'sup',
                                                         'math', 'style'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)

        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace &#39; with '
        paragraph = paragraph.replace('&#39;', "'")
        paragraph = paragraph.replace('&rsquo;', "'")
        paragraph = paragraph.replace('&ldquo;', "'")
        paragraph = paragraph.replace('&rdquo;', "'")
        # Replace &nbsp; with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')

        # Some more replacements to improve the default tokenization
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\t', '')

        text = title + '\n\n' + description + '\n\n' + paragraph

        # Create the directory
        dirname = 'data/qplum'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the title and the text both
        filename = '{}/{}'.format(dirname, title)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #19
    def process_response(self, request, response, spider):
        # clean body
        orig_body = response.body_as_unicode()
        body = remove_tags_with_content(orig_body, which_ones=('script', 'head'))
        body = remove_tags(remove_comments(body))
        terms = tokenize(body.lower())
        request.meta['terms'] = terms
        request.meta['body'] = body

        return response
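tokenize() is a project helper that is not shown here; a minimal assumption of what it might do:

import re

def tokenize(text):
    # Assumed helper (illustrative only): split lower-cased text into word tokens.
    return re.findall(r'[a-z0-9]+', text)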
Example #20
def clean_tags_from_affiliations(value):
    """Clean the affiliaton string for an author."""
    for affiliation in value.get('affiliations', []):
        # Remove tag AND content of any prefix like <label><sup>1</sup></label>
        affiliation['value'] = remove_tags_with_content(affiliation['value'], ('label',))
        # Now remove all tags but KEEP content
        affiliation['value'] = remove_tags(affiliation['value'])
        # Remove random whitespaces
        affiliation['value'] = clean_whitespace_characters(affiliation['value'])
    return value
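A short usage sketch for the function above; the record shape is taken from the code, and the behaviour of clean_whitespace_characters (collapsing runs of whitespace) is assumed:

author = {
    'affiliations': [
        {'value': '<label><sup>1</sup></label><i>Department of Physics,  CERN</i>'},
    ],
}
author = clean_tags_from_affiliations(author)
# The <label> prefix and its content are gone, the remaining tags are stripped, and
# (assuming clean_whitespace_characters collapses double spaces) the value is now
# 'Department of Physics, CERN'.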
Example #21
    def parse(self, response):
        """
        Parse the response page
        """
        # Skip error URLs
        if response.status != 200:
            return

        data = json.loads(response.text)

        title = data['title']
        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        description = data['description']
        data = data['content']

        # Remove <script>, <sup>, <math>, <style> tags along with their content
        paragraph = remove_tags_with_content(data, which_ones=('script', 'sup', 'math', 'style'))
        # Remove the rest of the tags without removing the content
        paragraph = remove_tags(paragraph)

        # Replace &amp; with &
        paragraph = paragraph.replace('&amp;', '&')
        # Replace &#39; with '
        paragraph = paragraph.replace('&#39;', "'")
        paragraph = paragraph.replace('&rsquo;', "'")
        paragraph = paragraph.replace('&ldquo;', "'")
        paragraph = paragraph.replace('&rdquo;', "'")
        # Replace &nbsp; with a space
        paragraph = re.sub("&.....;", ' ', paragraph)
        paragraph = re.sub("&....;", ' ', paragraph)

        # Replace 'U.S.' with 'US':
        paragraph = paragraph.replace('U.S.', 'US')

        # Some more replacements to improve the default tokenization
        paragraph = paragraph.replace('\r', '')
        paragraph = paragraph.replace('\t', '')

        text = title + '\n\n' + description + '\n\n' + paragraph

        # Create the directory
        dirname = 'data/qplum'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the title and the text both
        filename = '{}/{}'.format(dirname, title)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #22
class ScienceDailyArticleLoader(ArticleLoader):
    content_in = Compose(
        Join('\n\n'),
        lambda x: remove_tags_with_content(x, ('div',)),  # there are "div"s used for advertisements
        ArticleLoader.default_input_processor,
    )

    date_out = Compose(
        TakeFirst(),
        lambda date_str: datetime.strptime(date_str, "%B %d, %Y"),
    )
Example #23
    def parse(self, response):
        """Parse the response.
		* Scrapes last updated, description, title and licence information from dataset links
		* Skips scraping and follows all links on page links
		* Yields dictionary containing page and response info to pipelines.py
		"""
        if self.stats:
            self.output_stats(response)

        depth = response.meta['depth']
        page = {
            'url': response.url,
            'updated': '',
            'license': '',
            'title': '',
            'summary': ''
        }

        # Handle bad urls
        if response.status in self.handle_httpstatus_list:
            bad_url = self.process_bad_url(response)
            yield bad_url if bad_url != None else {
                'page': page,
                'response': response
            }
            return

        if not response.meta['pagelink']:
            # Remove all javascript and style content from html body
            response_plain = scrapy.Selector(
                text=remove_tags_with_content(response.text, (
                    'script',
                    'style',
                )))

            page['updated'] = self.update.search_pattern(response_plain)
            page['license'] = self.licence.search_pattern(response_plain)
            page['summary'] = self.description.search_pattern(response_plain)
            page['title'] = self.title.search_pattern(response_plain)
        elif depth < self.maxdepth:
            # Get all links on the page
            links = self.extractor.extract_links(response)

            for link in links:
                yield self.new_request(url=link.url,
                                       depth=depth + 1,
                                       retry=False,
                                       pagelink=self.is_pagelink(link.url),
                                       country=response.meta['country'],
                                       territory=response.meta['territory'],
                                       retries=0)

        # Print crawled information to file or upload
        yield {'page': page, 'response': response}
Example #24
    def test_remove_tags_with_content(self):
        # make sure it always returns unicode
        assert isinstance(remove_tags_with_content('no tags'), unicode)
        assert isinstance(remove_tags_with_content('no tags', which_ones=('p',)), unicode)
        assert isinstance(remove_tags_with_content('<p>one tag</p>', which_ones=('p',)), unicode)
        assert isinstance(remove_tags_with_content('<a>link</a>', which_ones=('b',)), unicode)

        # text without tags
        self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
        self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')

        # text with tags
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'), u'<p>one p tag</p>')
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p',)), u'')

        self.assertEqual(remove_tags_with_content(u'<b>not will removed</b><i>i will removed</i>', which_ones=('i',)),
                         u'<b>not will removed</b>')

        # text with empty tags
        self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')
Example #25
    def test_remove_tags_with_content(self):
        # make sure it always returns unicode
        assert isinstance(remove_tags_with_content("no tags"), unicode)
        assert isinstance(remove_tags_with_content("no tags", which_ones=("p",)), unicode)
        assert isinstance(remove_tags_with_content("<p>one tag</p>", which_ones=("p",)), unicode)
        assert isinstance(remove_tags_with_content("<a>link</a>", which_ones=("b",)), unicode)

        # text without tags
        self.assertEqual(remove_tags_with_content(u"no tags"), u"no tags")
        self.assertEqual(remove_tags_with_content(u"no tags", which_ones=("p", "b")), u"no tags")

        # text with tags
        self.assertEqual(remove_tags_with_content(u"<p>one p tag</p>"), u"<p>one p tag</p>")
        self.assertEqual(remove_tags_with_content(u"<p>one p tag</p>", which_ones=("p",)), u"")

        self.assertEqual(
            remove_tags_with_content(u"<b>not will removed</b><i>i will removed</i>", which_ones=("i",)),
            u"<b>not will removed</b>",
        )

        # text with empty tags
        self.assertEqual(remove_tags_with_content(u"<br/>a<br />", which_ones=("br",)), u"a")
Example #26
    def process_item(self, item, spider):
        ps = [
            remove_tags(remove_tags_with_content(
                p, ('script', ))).strip().replace(u'\xa0', u' ')
            for p in item['text']
        ]
        item['text'] = '\n'.join(ps)

        # additional stripping for description
        if item['description']:
            item['description'] = item['description'].strip()

        return item
Example #27
def ins_one(item, verdict):
    """
    Update the intermediate (staging) table record for a detail page.

    :param source: source
    :param c_time: date
    :param content: body content
    :param md: list of attachment MD5 values
    :param title: title
    :return:
    """
    source = item[0]
    c_time = item[1]
    content = item[2]
    md = item[3]
    title = item[4]
    url = item[5]
    cid = item[6]
    a = time.time()
    try:
        soup = BeautifulSoup(content, 'html.parser')
        content = soup.prettify()
    except:
        content = item[2]

    try:
        content = html.remove_tags(content,
                                   which_ones=('font', 'frame', 'iframe',
                                               'span', 'o:p', 'form', 'link'))
        content = html.remove_tags_with_content(content,
                                                which_ones=('script', 'form'))
    except:
        content = item[2]
    if cid in verdict:
        try:
            img_md = fir_img_ins(content)
        except:
            img_md = None
    else:
        img_md = None
    try:
        conn, cursor = create_conn()
        sql = " UPDATE  tbl_article_info_temp  SET release_at=%s,source=%s,content=%s,attachment_id=%s,type=0,title=%s,status=1,img_md5=%s  WHERE url=%s"
        cursor.execute(
            sql, (c_time, source, content, ",".join(md), title, img_md, url))
        conn.commit()
        print(" 更新详情表成功" + str(title) + str(c_time) + "  " +
              str(time.time() - a))
        close_conn(conn, cursor)
    except Exception as err:
        print(str(err) + "定位3")
Example #28
def crawl():
    global visited_urls
    global visited_hashes
    global queue
    pages_crawled, words_found = 0, 0

    # Set depth limit - may want to crawl less than 3 levels
    global depth_limit
    depth = request.args.get('depth')
    if depth:
        depth_limit = int(depth)

    start_url = request.args.get('url')
    if start_url:
        queue.append((start_url, 1))

        while queue:
            url, depth = queue.popleft()
            if url not in visited_urls:
                visited_urls.add(url)

                try:
                    response = requests.get(url, timeout=5)
                except Exception:
                    print('Exception')
                    response = None

                if (response is not None
                        and response.status_code == requests.codes.ok
                        and 'text/html' in response.headers['Content-Type']):
                    # Need to create and check hash since multiple urls could have identical content
                    page_content = BeautifulSoup(
                        remove_tags_with_content(response.content,
                                                 ('script', 'style')),
                        'html.parser')
                    hash = hashlib.md5(
                        page_content.get_text().encode()).hexdigest()
                    if hash not in visited_hashes:
                        visited_hashes.add(hash)
                        if page_content:
                            pages_crawled += 1
                            words_on_page = create_index(page_content, url)
                            words_found += words_on_page
                            add_pages_to_queue(page_content, url, depth)

    return jsonify({
        'pages_crawled': pages_crawled,
        'words_found': words_found,
    })
Example #29
    def clean_content(self, text):
        """
        Return a string of text cleaned of tags, entities,
        escape chars, quotes and extra spaces.
        """

        temp = remove_tags_with_content(text,
                                        which_ones=('style', 'script',
                                                    'figcaption'))
        temp = remove_tags(temp)
        temp = remove_entities(temp)
        temp = replace_escape_chars(temp)
        temp = unquote_markup(temp)
        temp = " ".join(temp.split())
        return temp
Example #30
    def parse_news(self, response):
        input = ''.join(
            response.xpath('//*[@id="story_body_content"]/span/p').extract())
        content = remove_tags(remove_tags_with_content(input,
                                                       ('div', 'figure')),
                              keep=('p', ))

        item = ArticleItem()
        item['a_title'] = response.css('h1.story_art_title::text').get()
        item['a_datetime'] = response.css(
            'div.shareBar__info--author span::text').get()
        item['a_source'] = response.css(
            'div.shareBar__info--author::text').get()
        item['a_content'] = content
        yield item
Example #31
    def test_remove_tags_with_content(self):
        # make sure it always returns unicode
        assert isinstance(remove_tags_with_content('no tags'), unicode)
        assert isinstance(
            remove_tags_with_content('no tags', which_ones=('p', )), unicode)
        assert isinstance(
            remove_tags_with_content('<p>one tag</p>', which_ones=('p', )),
            unicode)
        assert isinstance(
            remove_tags_with_content('<a>link</a>', which_ones=('b', )),
            unicode)

        # text without tags
        self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
        self.assertEqual(
            remove_tags_with_content(u'no tags', which_ones=(
                'p',
                'b',
            )), u'no tags')

        # text with tags
        self.assertEqual(remove_tags_with_content(u'<p>one p tag</p>'),
                         u'<p>one p tag</p>')
        self.assertEqual(
            remove_tags_with_content(u'<p>one p tag</p>', which_ones=('p', )),
            u'')

        self.assertEqual(
            remove_tags_with_content(
                u'<b>not will removed</b><i>i will removed</i>',
                which_ones=('i', )), u'<b>not will removed</b>')

        # text with empty tags
        self.assertEqual(
            remove_tags_with_content(u'<br/>a<br />', which_ones=('br', )),
            u'a')
Example #32
    def _parse_topic_response(self, response):
        """
        Parses various topics
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        # Get the first div with class content
        content = response.css('div.content')
        if isinstance(content, list) and len(content) > 0:
            content = content[0]
        else:
            content = response.css('div.roth__content')[0]

        text = title + '\n\n'
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove tags including <p> and <a>
            paragraph = remove_tags(
                remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Add to the file
            text += paragraph + '\n'

        # Create the directory
        dirname = 'data/investopedia'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the text
        name = response.url.split('/')[-1]
        filename = '{}/{}'.format(dirname, name)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #33
    def _parse_topic_response(self, response):
        """
        Parses various topics
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        # Get the first div with class content
        content = response.css('div.content')
        if isinstance(content, list) and len(content) > 0:
            content = content[0]
        else:
            content = response.css('div.roth__content')[0]

        text = title + '\n\n'
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove tags including <p> and <a>
            paragraph = remove_tags(remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Add to the file
            text += paragraph + '\n'

        # Create the directory
        dirname = 'data/investopedia'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the text
        name = response.url.split('/')[-1]
        filename = '{}/{}'.format(dirname, name)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #34
 def test_returns_unicode(self):
     # make sure it always returns unicode
     assert isinstance(remove_tags_with_content(b"no tags"), str)
     assert isinstance(
         remove_tags_with_content(b"no tags", which_ones=("p", )), str)
     assert isinstance(
         remove_tags_with_content(b"<p>one tag</p>", which_ones=("p", )),
         str)
     assert isinstance(
         remove_tags_with_content(b"<a>link</a>", which_ones=("b", )), str)
     assert isinstance(remove_tags_with_content("no tags"), str)
     assert isinstance(
         remove_tags_with_content("no tags", which_ones=("p", )), str)
     assert isinstance(
         remove_tags_with_content("<p>one tag</p>", which_ones=("p", )),
         str)
     assert isinstance(
         remove_tags_with_content("<a>link</a>", which_ones=("b", )), str)
Example #35
    def parse(self, response):
        # remove <script> tags from <p> elements
        for text in response.css('p'):
            yield {
                'text':
                remove_tags(
                    remove_tags_with_content(text.extract(), ('script', )))
            }

        # add new URLs that are descendants of the request URL (same domain)
        for next_page in response.css('div > a'):
            a_tag = next_page.extract()
            if "href=" in a_tag:
                link = (a_tag.split('href="')[1]).split('"')[0]
                if link.count("/") > 2:
                    if link.split("/")[2] in response.request.url:
                        yield response.follow(next_page, self.parse)
Example #36
    def parse_item(self, response):
        item = ScifibotItem()
        # clean body
        orig_body = response.body_as_unicode()
        body = remove_tags_with_content(orig_body,
            which_ones=('script', 'head'))
        body = remove_tags(remove_comments(body))
        tokens = tokenize(body.lower())
        # decide if the page is interesting
        if not is_relevant(tokens):
            stats.inc_value('scifi/filtered_out') # probably not scifi page
            return

        item['keywords'] = tokens
        item['page'] = orig_body
        item['url'] = response.url
        return item
Example #37
    def parse_topic_response(self, response):
        """
        Parse the content
        """

        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        content = response.css('div#mw-content-text')

        # Just extract all the '<p></p>' children from this
        text = title + '\n\n'
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove <script>, <sup>, <math> tags along with their content
            paragraph = remove_tags_with_content(paragraph, which_ones=('script', 'sup', 'math'))
            # Remove the rest of the tags without removing the content
            paragraph = remove_tags(paragraph)

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Add to the file
            text += paragraph + '\n'

        # Create the directory
        dirname = 'data/wikipedia'
        if not os.path.exists(dirname):
            os.makedirs(dirname, exist_ok=True)
        elif not os.path.isdir(dirname):
            os.remove(dirname)
            os.makedirs(dirname, exist_ok=True)

        # Save the text
        name = response.url.split('/')[-1]
        filename = '{}/{}'.format(dirname, name)
        f = open(filename, 'w')
        f.write(text)
        f.close()
Example #38
    def parse_topic_response(self, response):
        """
        Parse the content
        """

        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        content = response.css('div#mw-content-text')

        # Just extract all the '<p></p>' children from this
        text = ''
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove <script>, <sup>, <math> tags along with their content
            paragraph = remove_tags_with_content(paragraph,
                                                 which_ones=('script', 'sup',
                                                             'math'))
            # Remove the rest of the tags without removing the content
            paragraph = remove_tags(paragraph)

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US':
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@?':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Add to the file
            text += paragraph.lower() + '\n'

        filename = 'wiki_data.txt'
        f = open(filename, 'a')
        f.write(text)
        f.close()
Example #39
def download(self, url: AnyStr) -> Union[AnyStr, None]:
    headers = {
        "User-Agent":
        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:64.0) Gecko/20100101 Firefox/64.0",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    try:
        response = requests.get(url=url, headers=headers)
        response.raise_for_status()
    except Exception:
        return

    return remove_tags_with_content(text=response.text,
                                    which_ones=("script", ))
Example #40
    def _parse_topic_response(self, response):
        """
        Parses various topics
        e.g. www.investopedia.com/terms/o/oddlottheory.asp
        """
        # Get the title first
        title = response.css('title::text').extract_first()

        # Replace / with a space - creates issues with writing to file
        title = title.replace('/', ' ')

        # Get the first div with id Content
        content = response.css('div#Content')[0]
        content = content.css('div.content-box')

        text = ''
        for child in content.xpath('//p'):

            # Get the text from this child <p></p> tag
            paragraph = child.extract()

            # Remove tags including <p> and <a>
            paragraph = remove_tags(
                remove_tags_with_content(paragraph, ('script', ))).strip()

            # Replace '&amp;' with '&'
            paragraph = paragraph.replace('&amp;', '&')

            # Replace 'U.S.' with 'US':
            paragraph = paragraph.replace('U.S.', 'US')

            # Some more replacements to improve the default tokenization
            for c in '();.,[]"\'-:/%$+@?':
                paragraph = paragraph.replace(c, ' {} '.format(c))

            # Add to the file
            text += paragraph.lower() + '\n'

        # Save the title and the text both
        filename = 'investopedia_data.txt'
        f = open(filename, 'a')
        f.write(text)
        f.close()
Example #41
def _has_ajaxcrawlable_meta(text):
    """
    >>> _has_ajaxcrawlable_meta('<html><head><meta name="fragment"  content="!"/></head><body></body></html>')
    True
    >>> _has_ajaxcrawlable_meta("<html><head><meta name='fragment' content='!'></head></html>")
    True
    >>> _has_ajaxcrawlable_meta('<html><head><!--<meta name="fragment"  content="!"/>--></head><body></body></html>')
    False
    >>> _has_ajaxcrawlable_meta('<html></html>')
    False
    """

    # Stripping scripts and comments is slow (about 20x slower than
    # just checking if a string is in text); this is a quick fail-fast
    # path that should work for most pages.
    if 'fragment' not in text:
        return False
    if 'content' not in text:
        return False

    text = html.remove_tags_with_content(text, ('script', 'noscript'))
    text = html.replace_entities(text)
    text = html.remove_comments(text)
    return _ajax_crawlable_re.search(text) is not None
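_ajax_crawlable_re is defined elsewhere in the module; an illustrative approximation consistent with the doctests above (an assumption, not necessarily Scrapy's exact pattern):

import re

# Assumed pattern: a <meta> tag whose name is "fragment" and whose content is "!",
# with either quoting style and an optional self-closing slash.
_ajax_crawlable_re = re.compile(
    r'<meta\s+name=["\']fragment["\']\s+content=["\']!["\']\s*/?>',
    re.IGNORECASE,
)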
Example #42
 def test_tags_with_shared_prefix(self):
     # https://github.com/scrapy/w3lib/issues/114
     self.assertEqual(remove_tags_with_content(u'<span></span><s></s>', which_ones=('s',)), u'<span></span>')
Example #43
 def test_without_tags(self):
     # text without tags
     self.assertEqual(remove_tags_with_content(u'no tags'), u'no tags')
     self.assertEqual(remove_tags_with_content(u'no tags', which_ones=('p', 'b',)), u'no tags')
Example #44
 def test_empty_tags(self):
     # text with empty tags
     self.assertEqual(remove_tags_with_content(u'<br/>a<br />', which_ones=('br',)), u'a')