Example #1
    def _run_extraction(self, name, templates, page, descriptor, expected_output):
        self.trace = None
        template_pages = [HtmlPage(None, {}, t) for t in templates]
        # extract with trace enabled in order to capture the extraction trace
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages], True)
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is not None:
            actual_output = actual_output[0]
            self.trace = ["Extractor:\n%s" % extractor] + actual_output.pop('trace')
        # extract again with trace disabled in order to get the pure output
        extractor = InstanceBasedLearningExtractor([(t, descriptor) for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))
        if actual_output is None:
            if expected_output is None:
                return
            assert False, "failed to extract data for test '%s'" % name
        else:
            actual_output = actual_output[0]
        expected_names = set(expected_output.keys())
        actual_names = set(actual_output.keys())
        
        missing_in_output = expected_names - actual_names
        error = "attributes '%s' were expected but were not present in test '%s'" % \
                ("', '".join(missing_in_output), name)
        assert len(missing_in_output) == 0, error

        unexpected = actual_names - expected_names
        error = "unexpected attributes %s in test '%s'" % \
                (', '.join(unexpected), name)
        assert len(unexpected) == 0, error

        for k, v in expected_output.items():
            extracted = actual_output[k]
            assert v == extracted, "in test '%s' for attribute '%s', " \
                "expected value '%s' but got '%s'" % (name, k, v, extracted)
Example #2
 def test_annotation(self):
     html_page = HtmlPage(body=TEST_PAGE)
     template = {
         'original_body': html_page.body
     }
     data = {
         'extracts': [
             {
                 'annotations': {'href': 'origin'},
                 'id': 'test-id-123',
                 'required': [],
                 'tagid': 123,
                 'variant': 0
             }
         ]
     }
     annotations = Annotations()
     annotations.save_extraction_data(data, template)
     sample = HtmlPage(body=add_tagids(template['annotated_body']))
     for element in sample.parsed_body:
         if isinstance(element, HtmlTag):
             tagid = element.attributes.get(TAGID, None)
             if tagid and int(tagid) == data['extracts'][0]['tagid']:
                 annotation = element.attributes.get('data-scrapy-annotate')
                 self.assertTrue(annotation)
                 self.assertTrue('"id": "test-id-123"')
Example #3
def _open_sample_and_page(name):
    sample_spec = _open_spec(name)
    annotations = sample_spec['plugins']['annotations-plugin']['extracts']
    annotated = apply_annotations(_clean_annotation_data(annotations),
                                  sample_spec['original_body'])
    url = sample_spec['url']
    return (HtmlPage(url=url, body=annotated),
            HtmlPage(url=url, body=sample_spec['original_body']))
Example #4
    def test_extraction(self, name, templates, page, descriptor,
                        expected_output):
        template_pages = [HtmlPage(None, {}, t) for t in templates]

        extractor = InstanceBasedLearningExtractor([(t, descriptor)
                                                    for t in template_pages])
        actual_output, _ = extractor.extract(HtmlPage(None, {}, page))

        self.assertEqual(expected_output, actual_output and actual_output[0])
Example #5
def parse_strings(template_html, extraction_html):
    """Create a template and extraction page from raw strings

    this is useful for testing purposes
    """
    t = TokenDict()
    template_page = HtmlPage(body=template_html)
    extraction_page = HtmlPage(body=extraction_html)
    return (parse_template(t, template_page),
            parse_extraction_page(t, extraction_page))
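A quick usage sketch with hypothetical markup; the returned objects are whatever `parse_template` and `parse_extraction_page` produce for the extraction code to consume:

template, page = parse_strings(
    u'<html><body><h1>title</h1></body></html>',
    u'<html><body><h1>other title</h1></body></html>')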
Example #6
    def test_copy(self):
        """Test copy/deepcopy"""
        page = HtmlPage(url='http://www.example.com', body=PAGE)
        region = page.subregion(10, 15)

        regioncopy = copy.copy(region)
        self.assertEqual(regioncopy.start_index, 10)
        self.assertEqual(regioncopy.end_index, 15)
        self.assertFalse(region is regioncopy)
        self.assertTrue(region.htmlpage is regioncopy.htmlpage)

        regiondeepcopy = copy.deepcopy(region)
        self.assertEqual(regiondeepcopy.start_index, 10)
        self.assertEqual(regiondeepcopy.end_index, 15)
        self.assertFalse(region is regiondeepcopy)
        self.assertFalse(region.htmlpage is regiondeepcopy.htmlpage)
Example #7
    def url_to_page_mod(self,
                        website,
                        encoding=None,
                        default_encoding='utf-8'):
        """Take a website object and return an HtmlPage object.

        This function has been modified to take a website object instead of
        fetching the URL itself. The original fetched a `url` (a string or a
        `urllib2.Request`) with python urllib2, followed redirects, and
        guessed the page encoding with `w3lib.encoding.html_to_unicode`.

        The `encoding` argument can be used to force the interpretation of
        the page encoding; if unspecified, `default_encoding` is used.
        """

        #fh = urllib2.urlopen(url)
        #info = fh.info()
        info = website.info
        headers_dict = dict(info.headers)
        #body_str = fh.read()
        body = website.browser.page_source
        # fall back to the default encoding if none was specified
        if encoding is None:
            encoding = default_encoding
        return HtmlPage(website.url,
                        headers=headers_dict,
                        body=body,
                        encoding=encoding)
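The `website` argument is not defined in this snippet. A hypothetical wrapper exposing just the attributes the function touches might look like this (an assumption, not part of the original code):

class Website(object):
    """Hypothetical container pairing a Selenium driver with response info."""
    def __init__(self, browser, url, info):
        self.browser = browser  # needs .page_source, e.g. a Selenium webdriver
        self.url = url          # final URL after any redirects
        self.info = info        # needs .headers, as returned by urllib2's fh.info()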
Example #8
    def test_get_base_url(self):
        """Basic get_base_url test"""
        html = u'<html><head><base href="http://example.com/products/" />\
<body></body></html>'

        page = HtmlPage("http://example.com/products/p19.html", body=html)
        self.assertEqual(get_base_url(page), "http://example.com/products/")
Example #9
    def test_spider_with_link_template(self):
        name = "seedsofchange"
        spider = self.smanager.create(name)
        spec = self.smanager._specs["spiders"][name]
        t1, t2 = spec["templates"]
        target1, target2 = [HtmlPage(url=t["url"], body=t["original_body"]) for t in spec["templates"]]

        items, link_regions = spider.plugins['Annotations'].extract_items(target1)
        self.assertEqual(items, [])
        self.assertEqual(len(list(spider.plugins['Annotations']._process_link_regions(target1, link_regions))), 104)

        items, link_regions = spider.plugins['Annotations'].extract_items(target2)
        self.assertEqual(items[0], {
                '_template': u'4fac3b47688f920c7800000f',
                '_type': u'default',
                u'category': [u'Winter Squash'],
                u'days': [None],
                u'description': [u'1-2 lbs. (75-95 days)&nbsp;This early, extremely productive, compact bush variety is ideal for small gardens.&nbsp; Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh.&nbsp; Great for stuffing, soups and pies.'],
                u'lifecycle': [u'Tender Annual'],
                u'name': [u'Gold Nugget'],
                u'price': [u'3.49'],
                u'product_id': [u'01593'],
                u'species': [u'Cucurbita maxima'],
                'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
                u'weight': [None]}
        )
        self.assertEqual(link_regions, [])
        self.assertEqual(len(list(spider.plugins['Annotations']._process_link_regions(target2, link_regions))), 0)
Example #11
    def _requests_to_follow(self, htmlpage):
        if self._links_ibl_extractor is not None:
            # bugfix: self._links_ibl_extractor.extract will find a series of
            # links when the template was created by repeated annotations.
            extracted_list = self._links_ibl_extractor.extract(htmlpage)[0]
            if extracted_list is not None:
                seen = set()
                for extracted in extracted_list:
                    # every key that doesn't start with '_' is a link attribute
                    for key in extracted.keys():
                        if not str(key).startswith('_'):
                            extracted_regions = extracted.get(key, [])
                            for region in extracted_regions:
                                #if isinstance(region, six.string_types) :
                                #    region = region.decode(htmlpage.encoding)

                                htmlregion = HtmlPage(
                                    htmlpage.url,
                                    htmlpage.headers,
                                    region,
                                    encoding=htmlpage.encoding)
                                for request in self._request_to_follow_from_region(
                                        htmlregion):
                                    if request.url in seen:
                                        continue
                                    seen.add(request.url)
                                    yield request
        else:
            for request in self._request_to_follow_from_region(htmlpage):
                yield request
Example #12
    def test_spider_with_link_region_but_not_link_template(self):
        name = "seedsofchange2"
        spider = self.smanager.create(name)
        spec = self.smanager._specs["spiders"][name]
        t1, t2 = spec["templates"]

        target1, target2 = [
            HtmlPage(url=t["url"], body=t["original_body"])
            for t in spec["templates"]
        ]
        items, link_regions = spider.plugins['Annotations'].extract_items(
            target1)
        self.assertEqual(
            items[0], {
                '_template': u'4fad6a7c688f922437000014',
                '_type': u'default',
                u'category': [u'Onions'],
                u'days': [None],
                u'description': [
                    u'(110-120 days)&nbsp; Midsized Italian variety.&nbsp; Long to intermediate day red onion that tolerates cool climates.&nbsp; Excellent keeper.&nbsp; We have grown out thousands of bulbs and re-selected this variety to be the top quality variety that it once was.&nbsp; 4-5&quot; bulbs are top-shaped, uniformly colored, and have tight skins.'
                ],
                u'lifecycle': [u'Heirloom/Rare'],
                u'name': [u'Rossa Di Milano Onion'],
                u'price': [u'3.49'],
                u'species': [u'Alium cepa'],
                u'type': [u'Heirloom/Rare'],
                'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS15978'
            })
        self.assertEqual(link_regions, [])

        items, link_regions = spider.plugins['Annotations'].extract_items(
            target2)
        self.assertEqual(
            items[0], {
                '_template': u'4fad6a7d688f922437000017',
                '_type': u'default',
                u'category': [u'Winter Squash'],
                u'days': [None],
                u'description': [
                    u'1-2 lbs. (75-95 days)&nbsp;This early, extremely productive, compact bush variety is ideal for small gardens.&nbsp; Miniature pumpkin-shaped fruits have pale red-orange skin and dry, sweet, dark orange flesh.&nbsp; Great for stuffing, soups and pies.'
                ],
                u'lifecycle': [u'Tender Annual'],
                u'name': [u'Gold Nugget'],
                u'price': [u'3.49'],
                u'species': [u'Cucurbita maxima'],
                'url': u'http://www.seedsofchange.com/garden_center/product_details.aspx?item_no=PS14165',
                u'weight': [None]
            })
        self.assertEqual(len(link_regions), 1)
        self.assertEqual(
            len(
                list(spider.plugins['Annotations']._process_link_regions(
                    target1, link_regions))), 25)
Example #13
def htmlpage_from_response(response, _add_tagids=False):
    body = response.body_as_unicode()
    if _add_tagids:
        body = add_tagids(body)
    return HtmlPage(response.url,
                    response.headers,
                    body,
                    encoding=response.encoding)
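A sketch of calling this helper from a Scrapy spider callback; `self.extractor` is a hypothetical stand-in for whatever extractor consumes the page (`body_as_unicode()` above is the old Scrapy spelling of `response.text`):

    def parse(self, response):
        page = htmlpage_from_response(response, _add_tagids=True)
        # page.body now carries the tagid attributes that the annotation
        # tooling in the other examples relies on
        for item in self.extractor.extract(page)[0] or []:  # hypothetical extractor
            yield item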
Example #14
 def _load_templates(self):
     if not os.path.exists(self.filename):
         return []
     with open(self.filename) as f:
         templates = json.load(f)['templates']
         templates = [HtmlPage(t['url'], body=t['body'], encoding=t['encoding']) \
             for t in templates]
         return templates
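The loader implies a file layout like the following, a sketch inferred from the keys it reads (each entry supplies HtmlPage constructor arguments):

{
    "templates": [
        {"url": "http://example.com/item/1",
         "body": "<html>...</html>",
         "encoding": "utf-8"}
    ]
}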
Example #15
 def handle_html(self, response):
     htmlpage = HtmlPage(response.url, response.headers, \
                 response.body_as_unicode(), encoding=response.encoding)
     items, link_regions = self.extract_items(htmlpage)
     for item in items:
         yield item
     for request in self._process_link_regions(htmlpage, link_regions):
         yield request
Example #16
    def test_get_base_url_empty_basehref(self):
        """Base tag exists but href is empty"""
        html = u'<html><head><base href="" />\
<body></body></html>'

        url = "http://example.com/products/p19.html"
        page = HtmlPage(url, body=html)
        self.assertEqual(get_base_url(page), url)
Example #17
 def test_not_standard_chars_in_url(self):
     body = u'<html><meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" /></html>'
     url = u'fotos/produtos/Mam\xe3e noel.jpg'
     htmlpage = HtmlPage(url=u"http://www.example.com/",
                         body=body,
                         encoding='cp1252')
     processor = UrlFieldTypeProcessor()
     self.assertEqual(
         processor.adapt(url, htmlpage),
         u'http://www.example.com/fotos/produtos/Mam%C3%A3e%20noel.jpg')
Example #18
 def _process_link_regions(self, htmlpage, link_regions):
     """Process link regions if any, and generate requests"""
     if link_regions:
         for link_region in link_regions:
             htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, \
                     link_region, encoding=htmlpage.encoding)
             for request in self._requests_to_follow(htmlregion):
                 yield request
     else:
         for request in self._requests_to_follow(htmlpage):
             yield request
Example #19
    def test_extraction(self):

        samples_encoding = 'latin1'
        [(html1, data1), (html2, data2)] = list(iter_samples(
            'scraper_loadstore', html_encoding=samples_encoding))
        sc = Scraper()
        page1 = HtmlPage(body=html1, encoding=samples_encoding)
        sc.train_from_htmlpage(page1, data1)

        page2 = HtmlPage(body=html2, encoding=samples_encoding)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)

        # check that it still works after a serialize/deserialize round trip
        f = StringIO()
        sc.tofile(f)
        f.seek(0)
        sc = Scraper.fromfile(f)
        extracted_data = sc.scrape_page(page2)
        self._assert_extracted(extracted_data, data2)
Example #20
def load_templates(fn, site_id):
    try:
        with open(fn, 'r') as f:
            obj = json.load(f)
    except IOError:
        return list()

    tmpl_ids = obj['sites'].get(unicode(site_id))
    if not tmpl_ids:
        return list()
    return list(
        HtmlPage(**x) for x in obj['templates'] if x['page_id'] in tmpl_ids)
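Here the implied layout adds a `sites` index mapping a site id to the page ids of its templates; again a sketch inferred from the lookups above, with hypothetical ids (each template entry is unpacked straight into `HtmlPage(**x)`, so every key must be a keyword `HtmlPage` accepts, `page_id` included):

{
    "sites": {"1": ["p1"]},
    "templates": [
        {"page_id": "p1",
         "url": "http://example.com/",
         "body": "<html>...</html>"}
    ]
}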
Example #21
    def test_variants(self):
        """Ensure variants are extracted as list of dicts"""

        name = "networkhealth.com"
        spider = self.smanager.create(name)
        spec = self.smanager._specs["spiders"][name]
        template, = spec["templates"]
        target = HtmlPage(url=template["url"], body=template["original_body"])
        items, link_regions = spider.plugins['Annotations'].extract_items(target)
        for item in items:
            for variant in item["variants"]:
                self.assertEqual(type(variant), dict)
Example #22
def remove_tagids(source):
    """remove from the given page, all tagids previously added by add_tagids()
    """
    output = []
    if not isinstance(source, HtmlPage):
        source = HtmlPage(body=source)
    for element in source.parsed_body:
        if _must_add_tagid(element):
            element.attributes.pop(TAGID, None)
            output.append(serialize_tag(element))
        else:
            output.append(source.body[element.start:element.end])
    return ''.join(output)
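Note that the function accepts either an `HtmlPage` or a raw body string. A small sketch, assuming `TAGID` is the `data-tagid` attribute used by this tooling:

clean = remove_tagids(u'<p data-tagid="0">hi</p>')                 # from a raw string
clean = remove_tagids(HtmlPage(body=u'<p data-tagid="0">hi</p>'))  # from a page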
Example #23
 def test_uri_stripped_of_whitespace_and_quote_characters_correctly(self):
     urls = [u' image.jpg ', u"    '/data.jpg'", u'\n\t"file.jpg"\n\t\t']
     results = ['http://www.example.com/images/image.jpg',
                'http://www.example.com/data.jpg',
                'http://www.example.com/images/file.jpg']
     htmlpage = HtmlPage(url=u"http://www.example.com/images/",
                         body=u'<html><body></body></html>',
                         encoding='utf-8')
     url_p = UrlFieldTypeProcessor()
     img_p = ImagesFieldTypeProcessor()
     for text, url in zip(urls, results):
         self.assertEqual(img_p.adapt(img_p.extract(text), htmlpage), url)
         self.assertEqual(url_p.adapt(url_p.extract(text), htmlpage), url)
Example #24
 def test_uri_with_illegal_html_entities(self):
     urls = [u'&#9;&#10 image.jpg ', u"    '/&#11;&#0;data.jpg'",
             u'&#15;\n\t"&#14;file.jpg"\n\t\t']
     results = ['http://www.example.com/images/image.jpg',
                'http://www.example.com/data.jpg',
                'http://www.example.com/images/file.jpg']
     htmlpage = HtmlPage(url=u"http://www.example.com/images/",
                         body=u'<html><body></body></html>',
                         encoding='utf-8')
     url_p = UrlFieldTypeProcessor()
     img_p = ImagesFieldTypeProcessor()
     for text, url in zip(urls, results):
         self.assertEqual(img_p.adapt(img_p.extract(text), htmlpage), url)
         self.assertEqual(url_p.adapt(url_p.extract(text), htmlpage), url)
Example #25
def _get_cleansing(target_html, annotations):
    """
    Gets relevant pieces of text affected by browser cleansing.
    """

    numbered_html = add_tagids(target_html)
    target = HtmlPage(body=numbered_html)
    element = target.parsed_body[0]

    all_cleansing = {}
    for annotation in annotations:
        if isinstance(annotation, list):  # partial annotation

            # search insert point we are interested on
            target_it = iter(target.parsed_body)
            for p in annotation:
                if isinstance(p, HtmlTag) and "insert-after" in p.attributes:
                    insert_after = p.attributes["insert-after"]
                    break
            while not (isinstance(element, HtmlTag) and
                       element.attributes.get(TAGID) == insert_after):
                element = next(target_it)

            # 1. browser removes tags inside <option>...</option>
            # 2. browser adds </option> if it is not present
            if element.tag == "option" and \
                    element.tag_type == HtmlTagType.OPEN_TAG:
                cached = []
                add_cached = False
                closed_option = False
                element = next(target_it)
                while not (isinstance(element, HtmlTag) and
                           element.tag in ["option", "select"]):
                    cached.append(element)
                    if hasattr(element, 'tag'):
                        add_cached = True
                    element = next(target_it)

                if (element.tag == "option" and
                    element.tag_type == HtmlTagType.OPEN_TAG) or \
                        (element.tag == "select" and
                         element.tag_type == HtmlTagType.CLOSE_TAG):
                    closed_option = True

                if add_cached or closed_option:
                    out = "".join([numbered_html[e.start:e.end]
                                  for e in cached])
                    all_cleansing[insert_after] = out

    return all_cleansing
Example #26
 def _requests_to_follow(self, htmlpage):
     if self._links_ibl_extractor is not None:
         extracted = self._links_ibl_extractor.extract(htmlpage)[0]
         if extracted:
             extracted_regions = extracted[0].get('_links', [])
             seen = set()
             for region in extracted_regions:
                 htmlregion = HtmlPage(htmlpage.url, htmlpage.headers, region, encoding=htmlpage.encoding)
                 for request in self._request_to_follow_from_region(htmlregion):
                     if request.url in seen:
                         continue
                     seen.add(request.url)
                     yield request
     else:
         for request in self._request_to_follow_from_region(htmlpage):
             yield request
Example #27
 def test_site_pages(self):
     """
     Tests from real pages. More reliable and easy to build for more complicated structures
     """
     for source, annotations in iter_samples('pageparsing'):
         template = HtmlPage(body=source)
         parser = TemplatePageParser(TokenDict())
         parser.feed(template)
         for annotation in parser.annotations:
             test_annotation = annotations.pop(0)
             for s in annotation.__slots__:
                 if s == "tag_attributes":
                     for pair in getattr(annotation, s):
                         self.assertEqual(list(pair), test_annotation[s].pop(0))
                 else:
                     self.assertEqual(getattr(annotation, s), test_annotation[s])
         self.assertEqual(annotations, [])
Example #28
    def _process_link_regions(self, htmlpage, link_regions):
        """Process link regions if any, and generate requests"""
        if link_regions:
            for link_region in link_regions:

                #if isinstance(link_region, six.string_types) :
                #    link_region = link_region.decode(htmlpage.encoding)

                htmlregion = HtmlPage(htmlpage.url,
                                      htmlpage.headers,
                                      link_region,
                                      encoding=htmlpage.encoding)
                for request in self._requests_to_follow(htmlregion):
                    yield request
        else:
            for request in self._requests_to_follow(htmlpage):
                yield request
Example #29
def add_tagids(source):
    """
    Applies a unique attribute code number for each tag element in order to be
    identified later in the process of apply annotation"""
    output = []
    tagcount = 0
    if not isinstance(source, HtmlPage):
        source = HtmlPage(body=source)
    for element in source.parsed_body:
        if _must_add_tagid(element):
            element.attributes[TAGID] = str(tagcount)
            tagcount += 1
            output.append(serialize_tag(element))
        else:
            output.append(source.body[element.start:element.end])

    return ''.join(output)
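A before/after sketch of the numbering, paired with Example #22's `remove_tagids` for a round trip. It assumes `TAGID == 'data-tagid'`; `serialize_tag` may normalize attribute order or quoting, so the restored markup is equivalent rather than byte-identical:

numbered = add_tagids(u'<div><p>x</p></div>')
# numbered is now roughly u'<div data-tagid="0"><p data-tagid="1">x</p></div>'
restored = remove_tagids(numbered)  # tagid attributes stripped again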
Example #30
def _modify_tagids(source, add=True):
    """Add or remove tags ids to/from HTML document"""
    output = []
    tagcount = 0
    if not isinstance(source, HtmlPage):
        source = HtmlPage(body=source)
    for element in source.parsed_body:
        if _must_add_tagid(element):
            if add:
                element.attributes[TAGID] = str(tagcount)
                tagcount += 1
            else:  # Remove previously added tagid
                element.attributes.pop(TAGID, None)
            output.append(serialize_tag(element))
        else:
            output.append(source.body[element.start:element.end])
    return u''.join(output)
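This generalizes Examples #22 and #29: both standalone functions can be expressed as thin wrappers over this helper (a sketch; the behavior matches):

def add_tagids(source):
    return _modify_tagids(source, add=True)

def remove_tagids(source):
    return _modify_tagids(source, add=False)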
Example #31
 def spider_opened(self, spider):
     try:
         clustering = spider.plugins['Annotations'].clustering
         assert clustering
         self.clustering_enabled = True
     except (KeyError, AttributeError, AssertionError):
         logging.warning('Persistent page clustering has not been enabled '
                         'because page clustering is not enabled for this '
                         'spider')
         return
     if not os.path.exists(self.directory):
         os.makedirs(self.directory)
     dbpath = os.path.join(self.directory, spider.name)
     flag = 'n' if self.reset else 'c'
     self.db = self.dbmodule.open(dbpath, flag=flag)
     for data in getattr(self.db, 'itervalues', self.db.values)():
         page, encoding = json.loads(data)
         clustering.add_page(HtmlPage(body=page.decode(encoding)))
Example #32
 def test_empty_subregion(self):
     htmlpage = HtmlPage(body=u"")
     self.assertEqual(htmlpage.subregion(), u"")
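Taken together with Example #19, the end-to-end scrapely workflow these examples exercise is short. A minimal sketch; `TRAIN_HTML` and `TARGET_HTML` are hypothetical markup, and the field data is invented:

from scrapely import Scraper
from scrapely.htmlpage import HtmlPage

s = Scraper()
train_page = HtmlPage(url=u'http://example.com/item/1', body=TRAIN_HTML)  # hypothetical markup
s.train_from_htmlpage(train_page, {'name': 'Gold Nugget', 'price': '3.49'})

target_page = HtmlPage(url=u'http://example.com/item/2', body=TARGET_HTML)  # hypothetical markup
print(s.scrape_page(target_page))  # -> e.g. [{u'name': [u'...'], u'price': [u'...']}]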