Example #1
    def test_schemaorg_CreativeWork(self):
        body = get_testdata('schema.org', 'CreativeWork.001.html')
        expected = json.loads(get_testdata('schema.org', 'CreativeWork.001.jsonld').decode('UTF-8'))

        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
Example #2
def metadata_from_url(url, microdata=True, jsonld=True, rdfa=True):
    resp = requests.get(url, timeout=30)
    result = {
        'url': url,
        'status': '{} {}'.format(resp.status_code, resp.reason)
    }
    try:
        resp.raise_for_status()
    except requests.exceptions.HTTPError:
        return result

    parser = XmlDomHTMLParser(encoding=resp.encoding)
    tree = lxml.html.fromstring(resp.content, parser=parser)

    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(tree, resp.url)

    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(tree, resp.url)

    if rdfa:
        rdfae = RDFaExtractor()
        result['rdfa'] = rdfae.extract_items(tree, resp.url)

    return result
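
A minimal usage sketch for the helper above; the URL is illustrative, and the imports (requests, lxml.html, XmlDomHTMLParser and the three extruct extractors) are assumed to match those the function relies on:

# Hypothetical call; requires network access.
meta = metadata_from_url('https://example.com/', rdfa=False)
print(meta['status'])             # e.g. '200 OK'
print(meta.get('json-ld', []))    # JSON-LD items, present only on success
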
Example #3
def extract_all_json_ld(html_text):
    try:
        jslde = JsonLdExtractor()
        data = jslde.extract(html_text)
        return data
    except Exception:  # swallow extraction errors and fall back to an empty dict
        return {}
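
For instance, with JsonLdExtractor imported from extruct, the wrapper above can be fed an HTML string directly; the markup here is illustrative:

html = ('<html><head><script type="application/ld+json">'
        '{"@type": "Person", "name": "Ada"}</script></head></html>')
print(extract_all_json_ld(html))  # [{'@type': 'Person', 'name': 'Ada'}]
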
Example #4
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))
Example #5
    def test_schemaorg_CreativeWork(self):
        for i in [1]:
            body = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(i))
            expected = json.loads(get_testdata('schema.org', 'CreativeWork.{:03d}.jsonld'.format(i)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
Example #6
    def test_schemaorg_CreativeWork(self):
        for i in [1]:
            body = get_testdata('schema.org', 'CreativeWork.{:03d}.html'.format(i))
            expected = json.loads(get_testdata('schema.org', 'CreativeWork.{:03d}.jsonld'.format(i)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertDictEqual(data, expected)
Example #7
    def test_songkick(self):
        page = "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015"
        body = get_testdata('songkick', '{}.html'.format(page))
        expected = json.loads(get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))

        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
Example #8
    def test_null(self):
        page = "null_ld_mock"
        body = get_testdata('misc', '{}.html'.format(page))
        expected = json.loads(
            get_testdata('misc', '{}.jsonld'.format(page)).decode('UTF-8'))

        jsonlde = JsonLdExtractor()
        data = jsonlde.extract(body)
        self.assertEqual(data, expected)
Example #9
    def parse_item(self, response):
        self.logger.info('Hi, this is an item page! %s', response.url)

        # Fields to output for each tool
        my_keys = [
            '@id', '@type', 'applicationCategory', 'name', 'alternateName',
            'description', 'url', 'sameAs', 'image', 'genre',
            'softwareVersion', 'softwareRequirements', 'operatingSystem',
            'downloadUrl', 'installUrl'
        ]

        #oururl= urlopen(url).read()
        #print(oururl)
        extractor = JsonLdExtractor()
        #with urllib.request.urlopen(url) as response:
        #html_text = response.read().decode('utf-8')

        my_items = extractor.extract(response.body_as_unicode(), response.url)

        # If this is a -tool page only:
        #my_items = extractor.extract(html_text)
        this_item = my_items['items'][0]['@graph'][0]
        my_item = {}

        # output all basic items
        for this_key in my_keys:
            #      print(this_key,'\t',this_item[this_key])
            my_item[this_key] = this_item[this_key]

        # get license
        license_type = this_item['license']['@type']
        license_text = self.removeNonAscii(this_item['license']['text'])
        #   print('license_type','\t',license_type)
        #   print('license_text','\t',license_text)
        my_item['license_type'] = license_type
        my_item['license_text'] = license_text

        # Get pmcrefcount of first only
        Entrez.email = "*****@*****.**"
        first_pub = this_item['publication'][0]
        pmcrefcount = 0
        if 'pubmed' in first_pub['url']:
            this_pmid = first_pub['url'].split('/')[-1:]
            pmcrefcount = Entrez.read(
                Entrez.efetch(db="pubmed", id=this_pmid,
                              rettype="docsum"))[0]['PmcRefCount']
        #   print('primary_pub','\t',first_pub['name'])
        #   print('primary_pub_url','\t',first_pub['url'])
        #   print('primary_pub_pmcrefcount','\t',pmcrefcount)
        my_item['primary_pub'] = first_pub['name']
        my_item['primary_pub_url'] = first_pub['url']
        my_item['primary_pub_pmcrefcount'] = pmcrefcount


        yield my_item
Example #10
    def test_jsonld_with_comments(self):
        for prefix in ['JoinAction.001',
                       'AllocateAction.001',
                       ]:
            body = get_testdata('schema.org.invalid', '{}.html'.format(prefix))
            expected = json.loads(get_testdata('schema.org.invalid', '{}.jsonld'.format(prefix)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
Example #11
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients."""
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        # print('response.body:', response.body)
        # print('data:', data)
        if len(data) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            recipe = data[0]['properties']
            # recipe_output_item = RecipeItem()
            # recipe_output_item['recipe_name'] = recipe['name']
            # recipe_output_item['ingredients'] = [
            #     ingredient for ingredient in recipe['ingredients']
            #     if ingredient not in ['', 'Add all ingredients to list']
            # ]
            # recipe_output_item['tags'] = [tag['properties']['title']
            #                               for tag in data['items'][1:]]
            # try:
            #   recipe_output_item['description'] = recipe['description']
            # except KeyError:
            #   recipe_output_item['description'] = None
            # recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            # recipe_output_item = RecipeItem()
            # recipe_output_item['recipe_name'] = recipe['name']
            # recipe_output_item['ingredients'] = recipe['ingredients']
            # recipe_output_item['tags'] = [tag['properties']['title']
            #                               for tag in data['items'][1:]]
            # try:
            #   recipe_output_item['description'] = recipe['description']
            # except KeyError:
            #   recipe_output_item['description'] = None
            # recipe_output_item['url'] = recipe['url']

        properties = [
            'totalTime', 'nutrition', 'name', 'author', 'url', 'image',
            'recipeIngredient', 'aggregateRating', 'recipeYield',
            'recipeInstructions', 'video', 'mainEntityOfPage', 'cookTime',
            'recipeCategory', 'review', 'prepTime', 'description'
        ]
        recipe_output_item = RecipeItem()
        for prop in properties:
            try:
                recipe_output_item[prop] = recipe[prop]
            except KeyError:
                recipe_output_item[prop] = None

        yield recipe_output_item
Example #12
    def test_songkick(self):
        for page in [
                "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
                #"Maxïmo Park Gigography, Tour History & Past Concerts",
                #"Years & Years Tickets, Tour Dates 2015 & Concerts",
            ]:
            body = get_testdata('songkick', '{}.html'.format(page))
            expected = json.loads(get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertEqual(data, expected)
Example #13
 def set_data(self):
     """ Extract JSON-LD data from self.page and store as a string in self.data. Return True if data is present, False if not. Raise RuntimeError if JSON-LD cannot be extracted (e.g. if it is malformed)."""
     jslde = JsonLdExtractor()
     try:
         self.data = dumps(jslde.extract(self.page, base_url=self.base_url))
         if self.data == "[]":
             return False
         else:
             return True
     except Exception:
         msg = "Error extracting data from page"
         raise RuntimeError(msg)
Example #14
    def test_songkick(self):
        for page in [
                "Elysian Fields Brooklyn Tickets, The Owl Music Parlor, 31 Oct 2015",
                #"Maxïmo Park Gigography, Tour History & Past Concerts",
                #"Years & Years Tickets, Tour Dates 2015 & Concerts",
            ]:
            body = get_testdata('songkick', '{}.html'.format(page))
            expected = json.loads(get_testdata('songkick', '{}.jsonld'.format(page)).decode('UTF-8'))

            jsonlde = JsonLdExtractor()
            data = jsonlde.extract(body)
            self.assertDictEqual(data, expected)
Example #15
def extract_json_ld(html, typ_str):
    try:
        jslde = JsonLdExtractor()
        items = jslde.extract(html)
        for item in items:
            item_context = item.get('@context', '').rstrip(' /')
            if (item_context == 'http://schema.org' or item_context == 'https://schema.org') \
                    and item.get('@type', '') == typ_str:
                return item

        return None
    except Exception:
        return None
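
A quick illustration of the @type filter above; the markup and the 'Product' type are examples only:

html = ('<html><head><script type="application/ld+json">'
        '{"@context": "https://schema.org", "@type": "Product", '
        '"name": "Widget"}</script></head></html>')
print(extract_json_ld(html, 'Product'))  # the Product dict
print(extract_json_ld(html, 'Recipe'))   # None: no matching @type
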
Example #16
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                microdata_base_url: str = "",
                extract_json_ld: bool = False,
                extract_rdfa: bool = False,
                rdfa_base_url: str = "") \
            -> List[Extraction]:
        """
        Args:
            html_text (str): input html string to be extracted
            extract_title (bool): True if string of 'title' tag needs to be extracted, return as { "title": "..." }
            extract_meta (bool): True if string of 'meta' tags needs to be extracted, return as { "meta": { "author": "...", ...}}
            extract_microdata (bool): True if microdata needs to be extracted, returns as { "microdata": [...] }
            microdata_base_url (str): base namespace url for microdata, empty string if no base url is specified
            extract_json_ld (bool): True if json-ld needs to be extracted, return as { "json-ld": [...] }
            extract_rdfa (bool): True if rdfa needs to be extracted, returns as { "rdfa": [...] }
            rdfa_base_url (str): base namespace url for rdfa, empty string if no base url is specified

        Returns:
            List[Extraction]: the list of extractions, or an empty list if there are no matches.
        """
        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self._wrap_data("title", soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if soup.title and extract_meta:
            meta_content = self._wrap_meta_content(soup.find_all("meta"))
            meta_data = self._wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self._wrap_data("microdata", mde.extract(html_text, microdata_base_url))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self._wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self._wrap_data("rdfa", rdfae.extract(html_text, rdfa_base_url))
            res.append(rdfae_data)

        return res
Example #17
    def parse_item(self, response):
        items = []
        def microdata2jsonld(md):
            if md.get('properties'):
                item = md['properties']
                item['@type'] = md.get('type')
                return item
        items += map(microdata2jsonld, MicrodataExtractor().extract(
            response.body_as_unicode(), response.url)['items'])
        items += JsonLdExtractor().extract(
            response.body_as_unicode(), response.url)['items']

        if not items:
            self.logger.debug("No Microdata items found for %s", response.url)

        self.logger.debug("Checking URL for item: %s" , items)

        for item in items:
            if not item or not item.get('url'):
                self.logger.debug("No URL for item: %s" , item)
                continue

            if item['url'] != response.url:
                self.logger.debug("Not in main URL, go there..")
                yield Request(item['url'], callback=self.parse_item)
            else:
                item['@type'] = item.get('type')
                self.logger.debug("Parsed microdata: %s" % item)
                yield item
Example #18
def extract(htmlstring, url='http://www.example.com/', encoding="UTF-8"):
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    return {
        name: extractor.extract_items(tree, url=url)
        for name, extractor in (('json-ld', JsonLdExtractor()),
                                ('microdata', MicrodataExtractor()),
                                ('rdfa', RDFaExtractor()))
    }
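
A sketch of calling this helper, assuming fromstring, XmlDomHTMLParser and the extractor classes are imported as the function requires; extract_items may return lazy iterators depending on the extruct version, so list() is used to materialize them:

html = ('<html><head><script type="application/ld+json">'
        '{"@type": "Thing", "name": "A thing"}</script></head></html>')
result = extract(html)
print(list(result['json-ld']))  # [{'@type': 'Thing', 'name': 'A thing'}]
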
Example #19
def loadSOGraphFromHtml(html, url):
    """
    Extract jsonld entries from provided HTML text

    Args:
        html(string): HTML text to be parsed
        url(string): publicID assigned to each graph loaded from the html

    Returns:
        ConjunctiveGraph: Graph loaded from html

    """
    jslde = JsonLdExtractor()
    json_content = jslde.extract(html)
    g = ConjunctiveGraph()
    for json_data in json_content:
        g_data = loadSOGraph(data=json.dumps(json_data), publicID=url)
        g += g_data
    return g
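
A usage sketch, assuming the companion loadSOGraph helper and rdflib's ConjunctiveGraph are importable; the page and URL are placeholders:

html = ('<html><head><script type="application/ld+json">'
        '{"@context": "https://schema.org", "@type": "Dataset", '
        '"name": "Example"}</script></head></html>')
g = loadSOGraphFromHtml(html, 'https://example.com/page')
print(len(g))  # number of triples loaded from the page
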
Example #20
def async_extruct(url, microdata=True, jsonld=True):
    # `response` here is assumed to be the web framework's thread-local
    # response object (e.g. bottle's); the endpoint replies with JSON.
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)

    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)

    result = {'url': url, 'status': 'ok'}

    if microdata:
        mde = MicrodataExtractor(nested=True)
        result['microdata'] = mde.extract_items(lxmldoc, url)

    if jsonld:
        jsonlde = JsonLdExtractor()
        result['json-ld'] = jsonlde.extract_items(lxmldoc)

    return result
Example #21
    def parse_scene(self, response):
        jslde = JsonLdExtractor()
        data_ld = jslde.extract(response.text)  # renamed from `json` to avoid shadowing the stdlib module
        data = {}
        for obj in data_ld:
            if obj['@type'] == 'VideoObject':
                data = obj
                break

        item = SceneItem()
        item['title'] = self.cleanup_title(data['name'])
        item['description'] = self.cleanup_description(data['description'])
        item['image'] = data['thumbnail']
        item['image_blob'] = None
        item['id'] = self.get_id(response)
        item['trailer'] = data['contentUrl']
        item['url'] = response.url
        item['date'] = self.parse_date(data['datePublished']).isoformat()
        item['site'] = data['author']['name']
        item['network'] = self.network
        item['parent'] = item['site']

        item['performers'] = []
        for model in data['actor']:
            item['performers'].append(model['name'])

        item['tags'] = self.get_tags(response)
        days = int(self.days)
        # ~75 years in days; treated as "no date filter"
        if days > 27375:
            filterdate = "0000-00-00"
        else:
            filterdate = date.today() - timedelta(days)
            filterdate = filterdate.strftime('%Y-%m-%d')

        if self.debug:
            if not item['date'] > filterdate:
                item['filtered'] = "Scene filtered due to date restraint"
            print(item)
        else:
            if filterdate:
                if item['date'] > filterdate:
                    yield item
            else:
                yield item
Example #22
    def parse_item(self, response):
        """Parse the recipe to get title and ingredients."""
        schema_type = "mde"
        mde = MicrodataExtractor()
        data = mde.extract(response.body)
        if len(data['items']) == 0:
            jslde = JsonLdExtractor()
            data = jslde.extract(response.body)
            schema_type = "jsonld"

        if schema_type == "mde":
            recipe = data['items'][2]['properties']
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = [
                ingredient for ingredient in recipe['ingredients']
                if ingredient not in ['', 'Add all ingredients to list']
            ]
            recipe_tags = recipe['recipeCategory']
            if 'recipeCuisine' in recipe.keys():
                recipe_tags.append(recipe['recipeCuisine'])
            recipe_output_item['tags'] = recipe_tags
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']
        elif schema_type == "jsonld":
            recipe = data['items'][0]
            recipe_output_item = RecipeItem()
            recipe_output_item['recipe_name'] = recipe['name']
            recipe_output_item['ingredients'] = recipe['ingredients']
            recipe_output_item['tags'] = [
                tag['properties']['title'] for tag in data['items'][1:]
            ]
            try:
                recipe_output_item['description'] = recipe['description']
            except KeyError:
                recipe_output_item['description'] = None
            recipe_output_item['url'] = recipe['url']

        yield recipe_output_item
Example #23
  def parse_item(self, response):
    """Parse the recipe to get title and ingredients."""
    schema_type = "mde"
    mde = MicrodataExtractor()
    data = mde.extract(response.body)
    if len(data['items']) == 0:
      jslde = JsonLdExtractor()
      data = jslde.extract(response.body)
      schema_type = "jsonld"

    if schema_type == "mde":
      recipe = data['items'][2]['properties']
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = [
          ingredient for ingredient in recipe['ingredients']
          if ingredient not in ['', 'Add all ingredients to list']
      ]
      recipe_tags = recipe['recipeCategory']
      if 'recipeCuisine' in recipe.keys():
        recipe_tags.append(recipe['recipeCuisine'])
      recipe_output_item['tags'] = recipe_tags
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']
    elif schema_type == "jsonld":
      recipe = data['items'][0]
      recipe_output_item = RecipeItem()
      recipe_output_item['recipe_name'] = recipe['name']
      recipe_output_item['ingredients'] = recipe['ingredients']
      recipe_output_item['tags'] = [tag['properties']['title']
                                    for tag in data['items'][1:]]
      try:
        recipe_output_item['description'] = recipe['description']
      except KeyError:
        recipe_output_item['description'] = None
      recipe_output_item['url'] = recipe['url']

    yield recipe_output_item
Example #24
def async_extruct(url, microdata=True, jsonld=True):
    resp = requests.get(url, timeout=30)

    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)

    result = {"url": url, "status": "ok"}

    if microdata:
        mde = MicrodataExtractor(nested=True)
        microdata = mde.extract_items(lxmldoc, url)
        if microdata.get("items", []):
            result["microdata"] = microdata

    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get("items", []):
            result["json-ld"] = jsonldata

    return result
Example #25
def async_extruct(url, microdata=True, jsonld=True):
    # As above, `response` is assumed to be the web framework's thread-local
    # response object (e.g. bottle's).
    response.content_type = 'application/json'
    resp = requests.get(url, timeout=30)

    parser = lxml.html.HTMLParser(encoding=resp.encoding)
    lxmldoc = lxml.html.fromstring(resp.content, parser=parser)

    result = {'url': url, 'status': 'ok'}

    if microdata:
        mde = MicrodataExtractor(nested=True)
        microdata = mde.extract_items(lxmldoc, url)
        if microdata.get('items', []):
            result['microdata'] = microdata

    if jsonld:
        jsonlde = JsonLdExtractor()
        jsonldata = jsonlde.extract_items(lxmldoc)
        if jsonldata.get('items', []):
            result['json-ld'] = jsonldata

    return result
Example #26
    def parse_job(self, response):
        # Create the loader using the response
        # E.G. : l.add_xpath('item', '*xpath*', re='*expression*')
        jslde = JsonLdExtractor()
        data = jslde.extract(response.text)
        data = data[0]

        l = ItemLoader(item=JobsjsonItem(), response=response)
        l.add_value('search_postcode', self.search_postcode)
        l.add_value('search_radius', self.search_radius)
        l.add_value('date_scraped', time.strftime("%Y-%m-%d %H:%M:%S"))
        l.add_value('date_posted', data['datePosted'])
        l.add_value('valid_until', data['validThrough'])
        l.add_value('job_id', response.url, re=r'\d{7}')
        l.add_value('job_title', data['title'])
        l.add_value('job_type', data['employmentType'])
        l.add_value('location', ",".join([
            data['jobLocation']['address']['addressLocality'],
            data['jobLocation']['address']['addressRegion'],
            data['jobLocation']['address']['postalCode'],
            data['jobLocation']['address']['addressCountry'],
        ]))
        l.add_xpath('contact_name', '//table[@class="job-listing-table"]//tr[8]//td//text()')
        l.add_xpath('start_date', '//table[@class="job-listing-table"]//tr[6]//td//text()')
        try:
            l.add_value('salary_min', data['baseSalary']['value']['value'])
        except (KeyError, TypeError):
            l.add_value('salary_min', 'NA')
        l.add_value('listed_on', data['datePosted'])
        l.add_value('recruiter', data['hiringOrganization']['name'])
        l.add_value('recruiter_url', data['hiringOrganization']['sameAs'])
        try:
            l.add_value('job_reference', data['identifier']['value'])
        except (KeyError, TypeError):
            l.add_value('job_reference', 'NA')
        l.add_value('url', response.url)
        l.add_value('job_description', data['description'])
        l.add_value('job_skills', data['skills'])
        l.add_value('addressLocality', data['jobLocation']['address']['addressLocality'])
        l.add_value('addressRegion', data['jobLocation']['address']['addressRegion'])
        l.add_value('postalCode', data['jobLocation']['address']['postalCode'])
        l.add_value('addressCountry', data['jobLocation']['address']['addressCountry'])
        return l.load_item()
Example #27
    def extract(self, html_text: str,
                extract_title: bool = False,
                extract_meta: bool = False,
                extract_microdata: bool = False,
                extract_json_ld: bool = False,
                extract_rdfa: bool = False) \
            -> List[Extraction]:

        res = list()
        soup = BeautifulSoup(html_text, 'html.parser')

        if soup.title and extract_title:
            title = self.wrap_data(
                "title",
                soup.title.string.encode('utf-8').decode('utf-8'))
            res.append(title)

        if soup.title and extract_meta:
            meta_content = self.wrap_meta_content(soup.find_all("meta"))
            meta_data = self.wrap_data("meta", meta_content)
            res.append(meta_data)

        if extract_microdata:
            mde = MicrodataExtractor()
            mde_data = self.wrap_data("microdata", mde.extract(html_text))
            res.append(mde_data)

        if extract_json_ld:
            jslde = JsonLdExtractor()
            jslde_data = self.wrap_data("json-ld", jslde.extract(html_text))
            res.append(jslde_data)

        if extract_rdfa:
            rdfae = RDFaExtractor()
            rdfae_data = self.wrap_data("rdfa", rdfae.extract(html_text))
            res.append(rdfae_data)

        return res
Example #28
    def extract_jsonld(self, response, _id=None):
        """
        Scrapy Spider Request Callback Function

        * Inject an _id field for database pipeline
        * Use response URL as default _id

        """

        jslds = JsonLdExtractor().extract(response.body)

        for jsld in jslds:
            if _id:
                jsld['_id'] = _id
            else:
                jsld['_id'] = response.url

            logging.debug(jsld)
            yield jsld
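
In a Scrapy spider this callback might be wired up with cb_kwargs (Scrapy 1.7+) to supply the database key; the URL and key below are placeholders:

# Sketch, inside another spider callback:
yield scrapy.Request('https://example.com/item',
                     callback=self.extract_jsonld,
                     cb_kwargs={'_id': 'item-42'})
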
Example #29
class RISJMetadataExtractor(object):
    """An extruct-based metadata extractor"""

    # TODO: Extend to microdata and RDFa, replacing bespoke xpath code. Then
    #       test on body of crawlers!
    def __init__(self, response, microdata=False, jsonld=False, rdfa=False):
        self.response = response
        self.microdata = microdata
        self.jsonld = jsonld
        self.rdfa = rdfa

        if rdfa:
            try:
                self.rdfae = RDFaExtractor()
                self.rdfadata = self.rdfae.extract(self.response.text,
                                                   url=self.response.url)
            except JSONDecodeError:
                pass
        if microdata:
            try:
                self.mde = MicrodataExtractor()
                self.mdedata = self.mde.extract(self.response.text)
            except JSONDecodeError:
                pass
        if jsonld:
            try:
                self.jlde = JsonLdExtractor()
                self.jldata = self.jlde.extract(self.response.text)
            except (JSONDecodeError, TypeError):
                self.jldata = []
            finally:
                # Sometimes we get this in the meta dict from RISJExtractJSONLD
                self.jldata.extend(self.response.meta.get('json-ld', []))

    def extract_newsarticle_schemaorg(self,
                                      microdata=None,
                                      jsonld=None,
                                      rdfa=None):
        """Extract schema.org NewsArticle metadata, encoded using any
           supported metadata format. Note that we only try to extract the
           *first* block of NewsArticle data for each method (which is then
           combined with the first extracted from other methods if more than
           one is selected."""
        if microdata is None:
            microdata = self.microdata
        if jsonld is None:
            jsonld = self.jsonld
        if rdfa is None:
            rdfa = self.rdfa

        outd = {}
        if jsonld:
            for d in self.jldata:
                #                logger.debug('Analysing JSON-LD data: '+pformat(d))
                try:
                    if (re.match(r'https?://schema.org/?', d['@context'])
                            and d['@type'] == 'NewsArticle'):
                        outd.update(d)
                except (KeyError, TypeError):
                    continue
        if microdata:
            for d in self.mdedata:
                logger.debug('Analysing W3C microdata: ' + pformat(d))
                if re.match(r'https?://schema.org/NewsArticle/?',
                            d.get('type', '')):
                    outd.update(d)
        if rdfa:
            raise NotImplementedError
#        logger.debug('Returning schema.org NewsArticle: '+pformat(outd))
        return outd
Example #30
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            return_html_node=False,
            schema_context='http://schema.org',
            with_og_array=False,
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of dicts.
                Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       return_html_node: if True, it includes into the result a HTML node of
                         respective embedded metadata under 'htmlNode' key.
                         The feature is supported only by microdata syntax.
                         Each node is of `lxml.etree.Element` type.
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning,
                      stacklevel=2)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    try:
        tree = parse_xmldom_html(htmlstring, encoding=encoding)
    except Exception as e:
        if errors == 'ignore':
            return {}
        if errors == 'log':
            logger.exception('Failed to parse html, raises {}'.format(e))
            return {}
        if errors == 'strict':
            raise
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata',
             MicrodataExtractor(add_html_node=return_html_node).extract_items,
             tree))
    if 'json-ld' in syntaxes:
        processors.append((
            'json-ld',
            JsonLdExtractor().extract_items,
            tree,
        ))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append((
            'rdfa',
            RDFaExtractor().extract_items,
            tree,
        ))
    if 'dublincore' in syntaxes:
        processors.append((
            'dublincore',
            DublinCoreExtractor().extract_items,
            tree,
        ))
    output = {}
    for syntax, extract, document in processors:
        try:
            output[syntax] = list(extract(document, base_url=base_url))
        except Exception as e:
            if errors == 'log':
                logger.exception('Failed to extract {}, raises {}'.format(
                    syntax, e))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise
    if uniform:
        uniform_processors = []
        if 'microdata' in syntaxes:
            uniform_processors.append((
                'microdata',
                _umicrodata_microformat,
                output['microdata'],
                schema_context,
            ))
        if 'microformat' in syntaxes:
            uniform_processors.append((
                'microformat',
                _umicrodata_microformat,
                output['microformat'],
                'http://microformats.org/wiki/',
            ))
        if 'opengraph' in syntaxes:
            uniform_processors.append((
                'opengraph',
                _uopengraph,
                output['opengraph'],
                None,
            ))
        if 'dublincore' in syntaxes:
            uniform_processors.append((
                'dublincore',
                _udublincore,
                output['dublincore'],
                None,
            ))

        for syntax, uniform, raw, schema_context in uniform_processors:
            try:
                if syntax == 'opengraph':
                    output[syntax] = uniform(raw, with_og_array=with_og_array)
                elif syntax == 'dublincore':
                    output[syntax] = uniform(raw)
                else:
                    output[syntax] = uniform(raw, schema_context)
            except Exception as e:
                if errors == 'ignore':
                    output[syntax] = []
                if errors == 'log':
                    output[syntax] = []
                    logger.exception(
                        'Failed to uniform extracted for {}, raises {}'.format(
                            syntax, e))
                if errors == 'strict':
                    raise

    return output
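
A usage sketch for this top-level extract(); the document and syntax selection are illustrative and assume the module-level names (SYNTAXES, parse_xmldom_html, the extractor classes) that the function relies on:

html = ('<html><body itemscope itemtype="http://schema.org/Product">'
        '<span itemprop="name">Widget</span></body></html>')
data = extract(html, base_url='http://www.example.com/',
               syntaxes=['microdata'], uniform=True)
print(data['microdata'])
# e.g. [{'@context': 'http://schema.org', '@type': 'Product', 'name': 'Widget'}]
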
Example #31
 def _extract_json_data(self, blob):
     html = blob.download_as_string().decode()
     jslde = JsonLdExtractor()
     data = jslde.extract(html)
     return json.dumps(strip_strings(data))
Example #32
def extract(htmlstring,
            base_url=None,
            encoding="UTF-8",
            syntaxes=SYNTAXES,
            errors='strict',
            uniform=False,
            schema_context='http://schema.org',
            **kwargs):
    """htmlstring: string with valid html document;
       base_url: base url of the html document
       encoding: encoding of the html document
       syntaxes: list of syntaxes to extract, default SYNTAXES
       errors: set to 'log' to log the exceptions, 'ignore' to ignore them
               or 'strict'(default) to raise them
       uniform: if True uniform output format of all syntaxes to a list of dicts.
                Returned dicts structure:
                {'@context': 'http://example.com',
                 '@type': 'example_type',
                 /* All other the properties in keys here */
                 }
       schema_context: schema's context for current page"""
    if base_url is None and 'url' in kwargs:
        warnings.warn('"url" argument is deprecated, please use "base_url"',
                      DeprecationWarning)
        base_url = kwargs.pop('url')
    if kwargs:
        raise TypeError('Unexpected keyword arguments')
    if not (isinstance(syntaxes, list) and all(v in SYNTAXES
                                               for v in syntaxes)):
        raise ValueError("syntaxes must be a list with any or all (default) of"
                         "these values: {}".format(SYNTAXES))
    if errors not in ['log', 'ignore', 'strict']:
        raise ValueError('Invalid error command, valid values are either "log"'
                         ', "ignore" or "strict"')
    domparser = XmlDomHTMLParser(encoding=encoding)
    tree = fromstring(htmlstring, parser=domparser)
    processors = []
    if 'microdata' in syntaxes:
        processors.append(
            ('microdata', MicrodataExtractor().extract_items, tree))
    if 'json-ld' in syntaxes:
        processors.append(('json-ld', JsonLdExtractor().extract_items, tree))
    if 'opengraph' in syntaxes:
        processors.append(
            ('opengraph', OpenGraphExtractor().extract_items, tree))
    if 'microformat' in syntaxes:
        processors.append(
            ('microformat', MicroformatExtractor().extract_items, htmlstring))
    if 'rdfa' in syntaxes:
        processors.append(('rdfa', RDFaExtractor().extract_items, tree))
    output = {}
    for label, extract, document in processors:
        try:
            output[label] = list(extract(document, base_url=base_url))
        except Exception:
            if errors == 'log':
                logger.exception('Failed to extract {}'.format(label))
            if errors == 'ignore':
                pass
            if errors == 'strict':
                raise

    if uniform:
        if 'microdata' in syntaxes:
            output['microdata'] = _umicrodata_microformat(
                output['microdata'], schema_context=schema_context)
        if 'microformat' in syntaxes:
            output['microformat'] = _umicrodata_microformat(
                output['microformat'],
                schema_context='http://microformats.org/wiki/')
        if 'opengraph' in syntaxes:
            output['opengraph'] = _uopengraph(output['opengraph'])
    return output
Example #33
    def parse_scene(self, response):
        data = response.css('script:contains("dataLayer =")::text').get()
        data2 = response.xpath(
            "//script[contains(text(), 'ScenePlayerId = \"player\"')] | //script[contains(text(), 'ScenePlayerId = \"scenePlayer\"')] | //script[contains(text(), 'sceneReleaseDate')]"
        ).get()
        data3 = response.xpath(
            '//script[@type="application/ld+json"]/text()').get()
        if data3:
            data3 = json.loads(data3)
            data3 = data3[0]
        else:
            data3 = []

        if len(chompjs.parse_js_object(data)):
            json_data = chompjs.parse_js_object(data)[0]

            jslde = JsonLdExtractor().extract(response.text)
            jsonlde = {}
            for obj in jslde:
                jsonlde.update(obj)

            item = SceneItem()

            if 'name' in jsonlde:
                item['title'] = jsonlde['name']
            elif 'sceneDetails' in json_data and 'sceneTitle' in json_data[
                    'sceneDetails']:
                item['title'] = json_data['sceneDetails']['sceneTitle']
            else:
                item['title'] = self.get_title(response)

            if item['title']:
                if ", scene #01" in item['title'].lower():
                    item['title'] = item['title'].replace(
                        ", Scene #01", "").replace(", scene #01", "")

            if 'sceneDetails' in json_data and 'sceneDescription' in json_data[
                    'sceneDetails']:
                item['description'] = json_data['sceneDetails'][
                    'sceneDescription']
            elif 'description' in jsonlde:
                item['description'] = jsonlde['description']
            else:
                item['description'] = self.get_description(response)

            if 'site' in response.meta:
                item['site'] = response.meta['site']
            elif 'productionCompany' in data3:
                item['site'] = data3['productionCompany']['name']
            elif 'siteName_pretty' in json_data:
                item['site'] = json_data['siteName_pretty']
            elif 'siteName' in json_data:
                item['site'] = json_data['siteName']

            if item['site']:
                item['site'] = match_site(item['site'])

            if not item['site']:
                item['site'] = self.get_site(response)

            if 'date' in response.meta:
                item['date'] = response.meta['date']
            elif 'dateCreated' in jsonlde and 'nudefightclub' not in response.url and '0000-00-00' not in jsonlde[
                    'dateCreated']:
                item['date'] = self.parse_date(jsonlde['dateCreated'],
                                               date_formats=['%Y-%m-%d'
                                                             ]).isoformat()
            elif 'datePublished' in jsonlde and 'nudefightclub' not in response.url and '0000-00-00' not in jsonlde[
                    'datePublished']:
                item['date'] = self.parse_date(jsonlde['datePublished'],
                                               date_formats=['%Y-%m-%d'
                                                             ]).isoformat()
            elif 'nudefightclub' in response.url:
                date1 = response.xpath(
                    '//div[@class="updatedDate"]/b/following-sibling::text()'
                ).get()
                item['date'] = self.parse_date(date1.strip()).isoformat()
            else:
                item['date'] = self.get_date(response)

            if not item['date']:
                item['date'] = self.get_date(response)

            if data2:
                date2 = re.search(r'sceneReleaseDate\":\"(\d{4}-\d{2}-\d{2})',
                                  data2)
                if date2:
                    date2 = date2.group(1)
                    date2 = self.parse_date(date2.strip(),
                                            date_formats=['%Y-%m-%d'
                                                          ]).isoformat()
                    if item['date'] and date2 > item['date']:
                        item['date'] = date2

            if 'image' in response.meta:
                item['image'] = response.meta['image']
            else:
                item['image'] = self.get_image(response)

            item['image_blob'] = None

            if 'performers' in response.meta:
                item['performers'] = response.meta['performers']
            elif 'actor' in jsonlde:
                item['performers'] = list(
                    map(lambda x: x['name'].strip(), jsonlde['actor']))
            else:
                item['performers'] = self.get_performers(response)

            if 'tags' in response.meta:
                item['tags'] = response.meta['tags']
            elif 'keywords' in jsonlde:
                item['tags'] = jsonlde['keywords'].split(',')
            else:
                item['tags'] = self.get_tags(response)

            if item['tags']:
                item['tags'] = list(
                    map(lambda x: string.capwords(x.strip()), item['tags']))

            if 'id' in response.meta:
                item['id'] = response.meta['id']
            else:
                item['id'] = self.get_id(response)

            if 'trailer' in response.meta:
                item['trailer'] = response.meta['trailer']
            else:
                item['trailer'] = self.get_trailer(response)

            item['url'] = self.get_url(response)

            if hasattr(self, 'network'):
                item['network'] = self.network
            else:
                item['network'] = self.get_network(response)

            if hasattr(self, 'parent'):
                item['parent'] = self.parent
            else:
                item['parent'] = self.get_parent(response)

            if item['title']:
                item['title'] = self.cleanup_title(item['title'])

            if item['description']:
                item['description'] = self.cleanup_description(
                    item['description'])

            if item['id'] and item['title']:
                days = int(self.days)
                # ~75 years in days; treated as "no date filter"
                if days > 27375:
                    filterdate = "0000-00-00"
                else:
                    filterdate = date.today() - timedelta(days)
                    filterdate = filterdate.strftime('%Y-%m-%d')

                if self.debug:
                    if not item['date'] > filterdate:
                        item[
                            'filtered'] = "Scene filtered due to date restraint"
                    print(item)
                else:
                    if filterdate:
                        if item['date'] > filterdate:
                            yield item
                    else:
                        yield item

        else:
            super().parse_scene(response)
Example #34
def getJsonLdFromHTML(html_text):
    """
    Returns an array of json_ld structures found in the provided html_text
    """
    jslde = JsonLdExtractor()
    return jslde.extract(html_text)
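
For example (the markup is illustrative), a page carrying two ld+json blocks yields a two-element list:

html_text = (
    '<html><head>'
    '<script type="application/ld+json">{"@type": "Person", "name": "Ada"}</script>'
    '<script type="application/ld+json">{"@type": "Book", "name": "Notes"}</script>'
    '</head></html>')
print(getJsonLdFromHTML(html_text))
# [{'@type': 'Person', 'name': 'Ada'}, {'@type': 'Book', 'name': 'Notes'}]
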
Example #35
 def _check_jsonld(self, body, expected):
     jsonlde = JsonLdExtractor()
     data = jsonlde.extract(body)
     self.assertEqual(data, expected)