Example #1
 def get(self, url):
     '''Yield new items from an RSS feed, skipping documents that already exist.'''
     feed = requests.get(url)
     parsed = feedparser.parse(feed.content)
     for item in parsed['entries']:
         # skip entries that are already stored (keyed by id, falling back to link)
         if check_exists(item.get('id', item.get('link', 'True')))[0]:
             continue
         item['feed'] = dict(parsed['feed'])
         item['links'] = self.follow_links(item['links'])
         item['_id'] = item.pop('id')
         yield self.feedparser_to_dict(item)
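For context, a minimal consumption sketch of the generator above; the class name RSSScraper and the feed URL are illustrative assumptions, not taken from the examples:

    # Hypothetical usage: iterate over newly seen feed items only; entries that
    # already exist (as reported by check_exists) are skipped inside get().
    scraper = RSSScraper()  # assumed class name
    for doc in scraper.get("http://www.example.com/rss"):  # illustrative feed URL
        print(doc["_id"])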
Example #2
    def run(self, document, field, new_field, save=False, force=False):
        old_field = field

        logger.debug("tring to process: ", document)
        if not (type(document) == dict and '_source' in document.keys()):
            logger.debug("input not a document")
            if check_exists(document):
                document = get_document(document)
            else:
                logger.debug(
                    "document retrieval failure {document}".format(**locals()))
                return document

        if old_field not in document['_source'].keys():
            logger.debug("Source field missing: ignoring rename")
            return document

        elif (new_field in document['_source'].keys()
              and new_field in document['_source']['META'].keys()):
            logger.info(
                "Existing *original* (non moved) field: ignoring rename!")
            return document

        elif new_field not in document['_source'].keys():
            document['_source'][new_field] = document['_source'][old_field]
            document['_source']['META'][new_field] = document['_source']['META'][old_field]
            document['_source']['META'][new_field]['moved_from'] = old_field

        elif 'moved_from' in document['_source']['META'].keys():
            logger.info(
                "Moving to existing field (which was itself a result of moving)"
            )
            document['_source'][new_field] = document['_source'][old_field]
            document['_source']['META'][new_field] = document['_source']['META'][old_field]
            document['_source']['META'][new_field]['MOVED_FROM'] = old_field

        self._verify(document['_source'])
        if save: update_document(document, force=True)
        return document
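To illustrate the rename semantics above, a minimal before/after sketch; the processor instance renamer, the field names, and the META content are illustrative assumptions:

    # Hypothetical Elasticsearch-style document with per-field META
    doc = {"_source": {"old_name": "value",
                       "META": {"old_name": {"ADDED": "2020-01-01"}}}}
    doc = renamer.run(doc, field="old_name", new_field="new_name")
    # doc["_source"]["new_name"]                       -> "value"
    # doc["_source"]["META"]["new_name"]["moved_from"] -> "old_name"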
Example #3
 def _check_exists(self, doc_id):
     '''Check whether a document already exists; can be overridden for testing etc.'''
     return check_exists(doc_id)
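The thin wrapper above exists so the existence check can be stubbed out in tests; a minimal sketch of such an override, assuming a Scraper base class and that check_exists returns an (exists, document) tuple as the other examples index it:

    class ScraperUnderTest(Scraper):  # assumed base class name
        def _check_exists(self, doc_id):
            # pretend the database is empty, so every document gets processed
            return False, None  # assumed (exists, document) shape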
Example #4
    def get(self, **kwargs):
        '''Document collected via {} feed reader'''.format(self.doctype)

        # This RSS-scraper is a generic fallback option in case we do not have
        # any specific one. Therefore, only use the following generic values
        # if we do not have any more specific info already
        if 'rss_url' in kwargs:
            RSS_URL = kwargs['rss_url']
        else:
            try:
                RSS_URL = self.rss_url
            except:
                RSS_URL = 'N/A'

        assert RSS_URL != 'N/A', 'You need to specify the feed URL. Example: rss_url="http://www.nu.nl/rss"'

        if type(RSS_URL) is str:
            RSS_URL = [RSS_URL]

        for thisurl in RSS_URL:
            rss_body = self.get_page_body(thisurl)
            d = feedparser.parse(rss_body)
            for post in d.entries:
                try:
                    _id = post.id
                except:
                    _id = post.link

                link = re.sub("/$", "", self.getlink(post.link))

                if self.database == False or check_exists(_id)[0] == False:
                    try:
                        req = urllib2.Request(
                            link, headers={'User-Agent': "Wget/1.9"})
                        htmlsource = urllib2.urlopen(req).read().decode(
                            encoding="utf-8", errors="ignore")
                    except:
                        htmlsource = None
                        logger.info(
                            'Could not open link - will not retrieve full article, but will give it another try with different User Agent'
                        )
                    # Some (few) sites seem to block certain user agents. Therefore, if the
                    # request above did not succeed, retry pretending to be Firefox on Windows.
                    if not htmlsource or htmlsource == "":
                        try:
                            req = urllib2.Request(
                                link,
                                headers={
                                    'User-Agent':
                                    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
                                })
                            htmlsource = urllib2.urlopen(req).read().decode(
                                encoding="utf-8", errors="ignore")
                        except:
                            htmlsource = None
                            logger.info(
                                'Could not open link - will not retrieve full article'
                            )

                    try:
                        teaser = re.sub(r"\n|\r|\t", " ", post.description)
                    except:
                        teaser = ""
                    try:
                        datum = datetime.datetime(
                            *feedparser._parse_date(post.published)[:6])
                    except:
                        try:
                            # alternative date format as used by nos.nl
                            datum = datetime.datetime(*feedparser._parse_date(
                                post.published[5:16])[:6])
                        except:
                            #print("Couldn't parse publishing date")
                            datum = None
                    doc = {
                        "_id": _id,
                        "title_rss": post.title,
                        "teaser_rss": teaser,
                        "publication_date": datum,
                        "htmlsource": htmlsource,
                        "feedurl": thisurl,
                        "url": re.sub("/$", "", post.link)
                    }
                    if htmlsource is not None:
                        # TODO: CHECK IF PARSEHTML returns None, if so, raise custom exception
                        parsed = self.parsehtml(doc['htmlsource'])
                        if parsed is None or parsed == {}:
                            try:
                                raise UnparsableException
                            except UnparsableException:
                                pass
                        else:
                            doc.update(parsed)
                    parsedurl = self.parseurl(link)
                    doc.update(parsedurl)
                    docnoemptykeys = {k: v for k, v in doc.items() if v}
                    yield docnoemptykeys
Example #5
    def run(self,
            document,
            field,
            new_key=None,
            save=False,
            force=False,
            *args,
            **kwargs):
        '''
        Run a processor.

        Input
        ---
        document: dict or str
            document to be processed
        field: str
            key of the field to be processed
        new_key: str
            if specified, this key will be used as name for new field (instead of field_processorname)
        save: boolean
            indicates whether the result will be stored in the database
        force: boolean
            indicates whether the document should replace (true) or only
            expand existing documents (false). Note that partial updates
            are not supported when forcing.
        '''

        # 1. check if document or id --> return doc
        logger.debug("trying to process: ", document)
        masked = False  # expect a document to be processed as-is (assumes ES origin)
        if not (type(document) == dict):
            logger.debug("input not a document")
            if field == None:  # This path is used to run examples (ignores save)
                return self.process(document, *args, **kwargs)
            elif check_exists(document):
                document = get_document(document)
            else:
                logger.debug(
                    "document retrieval failure {document}".format(**locals()))
                return document
        if not "_source" in document:
            masked = True  #mask documents to ES expectations
            document = {'_source': document}
        if not new_key:
            new_key = "%s_%s" % (field, self.__name__)
        # 2. check whether processing can be skipped
        if not force and new_key in document['_source'].keys(): return document
        # 3. return None if key is missing
        if field not in document['_source'].keys():
            print(document['_source'].keys())
            logger.warning("Key not found in document")
            if masked:
                document = document['_source']
            return document
        # 4. process document
        document['_source'][new_key] = self.process(document['_source'][field],
                                                    *args, **kwargs)
        # 5. add metadata
        document['_source'] = self._add_metadata(document['_source'])
        # 6. check metadata
        self._verify(document['_source'])
        # 7. save if requested
        if save: update_document(document, force=force)
        # 8. emit dotkey-field
        if masked:
            document = document['_source']
        return document
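Finally, a minimal usage sketch of the generic run() wrapper above, assuming a Processor base class that provides _add_metadata and _verify; the subclass, its process() behaviour, and the field names are illustrative:

    class LowercaseProcessor(Processor):  # assumed base class
        __name__ = "lowercase"  # used by run() to build the new key name

        def process(self, value, *args, **kwargs):
            return value.lower()

    proc = LowercaseProcessor()
    result = proc.run({"title": "Hello World"}, field="title")
    # The plain dict is wrapped as {'_source': ...} ("masked"), processed,
    # and unwrapped again before return: result["title_lowercase"] -> "hello world"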