# Imports assumed by the excerpted methods below; logger, check_exists,
# get_document, update_document and UnparsableException are provided by the
# surrounding package.
import datetime
import re

import feedparser
import requests
import urllib2  # Python 2; use urllib.request on Python 3


def get(self, url):
    '''RSS feed item'''
    feed = requests.get(url)
    parsed = feedparser.parse(feed.content)
    for item in parsed['entries']:
        # Skip items already in the database; fall back to the link
        # (or the string 'True') when the feed provides no id.
        if check_exists(item.get('id', item.get('link', 'True')))[0]:
            continue
        item['feed'] = dict(parsed['feed'])
        item['links'] = self.follow_links(item['links'])
        # NOTE: pop('id') raises KeyError for items without an id, even
        # though the existence check above tolerates a missing one.
        item['_id'] = item.pop('id')
        yield self.feedparser_to_dict(item)
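
# Usage sketch for the generator above (illustrative only): `MyRSSScraper`
# is a hypothetical subclass that supplies follow_links() and
# feedparser_to_dict(); the feed URL is an arbitrary example.
def _example_rss_get():
    scraper = MyRSSScraper()  # hypothetical subclass
    for doc in scraper.get('http://www.nu.nl/rss'):
        # '_id' is set before feedparser_to_dict() and assumed preserved
        print(doc['_id'])
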
def run(self, document, field, new_field, save=False, force=False):
    old_field = field
    logger.debug("trying to process: %s", document)
    if not (isinstance(document, dict) and '_source' in document.keys()):
        logger.debug("input not a document")
        if check_exists(document):
            document = get_document(document)
        else:
            logger.debug(
                "document retrieval failure {document}".format(**locals()))
            return document
    if old_field not in document['_source'].keys():
        logger.debug("Source field missing: ignoring rename")
        return document
    elif new_field in document['_source'].keys(
    ) and new_field in document['_source']['META'].keys():
        logger.info(
            "Existing *original* (non-moved) field: ignoring rename!")
        return document
    elif new_field not in document['_source'].keys():
        document['_source'][new_field] = document['_source'][old_field]
        document['_source']['META'][new_field] = document['_source'][
            'META'][old_field]
        document['_source']['META'][new_field]['moved_from'] = old_field
    elif 'moved_from' in document['_source']['META'].keys():
        logger.info(
            "Moving to existing field (which was itself a result of moving)")
        document['_source'][new_field] = document['_source'][old_field]
        document['_source']['META'][new_field] = document['_source'][
            'META'][old_field]
        # was 'MOVED_FROM'; use the same lowercase key as the branch above
        document['_source']['META'][new_field]['moved_from'] = old_field
    self._verify(document['_source'])
    if save:
        # NOTE: the `force` argument is not forwarded here; saves always force
        update_document(document, force=True)
    return document
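
# Illustrative call of the rename step above (hypothetical names): `renamer`
# stands for an instance of the processor class this method belongs to.
def _example_rename():
    doc = {'_source': {'title': 'example', 'META': {'title': {}}}}
    doc = renamer.run(doc, field='title', new_field='headline')
    # doc['_source']['headline'] now exists and its META entry records
    # 'moved_from': 'title'
    return doc
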
def _check_exists(self, doc_id):
    '''Checks whether a document already exists,
    can be overwritten for testing etc
    '''
    return check_exists(doc_id)
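
# Sketch of the testing hook the docstring mentions. `BaseScraper` is a
# hypothetical parent class, and the (exists, document) return shape is an
# assumption inferred from the check_exists(...)[0] call sites above.
class NeverSeenScraper(BaseScraper):
    def _check_exists(self, doc_id):
        # pretend nothing is in the database so every item is (re)fetched
        return (False, None)
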
def get(self, **kwargs):
    # NOTE: calling .format() on the docstring literal would silently turn it
    # into a plain expression, so a static docstring is used instead.
    '''Document collected via RSS feed reader'''
    # This RSS-scraper is a generic fallback option in case we do not have
    # any specific one. Therefore, only use the following generic values
    # if we do not have any more specific info already
    if 'rss_url' in kwargs:
        RSS_URL = kwargs['rss_url']
    else:
        try:
            RSS_URL = self.rss_url
        except AttributeError:
            RSS_URL = 'N/A'
    assert RSS_URL != 'N/A', 'You need to specify the feed URL. Example: rss_url="http://www.nu.nl/rss"'

    if isinstance(RSS_URL, str):
        RSS_URL = [RSS_URL]
    for thisurl in RSS_URL:
        rss_body = self.get_page_body(thisurl)
        d = feedparser.parse(rss_body)
        for post in d.entries:
            try:
                _id = post.id
            except AttributeError:
                _id = post.link
            link = re.sub("/$", "", self.getlink(post.link))
            if not self.database or not check_exists(_id)[0]:
                try:
                    req = urllib2.Request(
                        link, headers={'User-Agent': "Wget/1.9"})
                    htmlsource = urllib2.urlopen(req).read().decode(
                        encoding="utf-8", errors="ignore")
                except Exception:
                    htmlsource = None
                    logger.info(
                        'Could not open link - will not retrieve full '
                        'article, but will give it another try with a '
                        'different User-Agent')
                # Some (few) sites seem to block certain user agents.
                # Therefore, if the code above did not succeed, try fetching
                # the article pretending to use Firefox on Windows
                if not htmlsource:
                    try:
                        req = urllib2.Request(
                            link,
                            headers={
                                'User-Agent':
                                "Mozilla/5.0 (Windows NT 6.1; Win64; x64; "
                                "rv:47.0) Gecko/20100101 Firefox/47.0"
                            })
                        htmlsource = urllib2.urlopen(req).read().decode(
                            encoding="utf-8", errors="ignore")
                    except Exception:
                        htmlsource = None
                        logger.info(
                            'Could not open link - will not retrieve '
                            'full article')
                try:
                    # was r"\n|\r\|\t": stray backslash before the pipe
                    teaser = re.sub(r"\n|\r|\t", " ", post.description)
                except AttributeError:
                    teaser = ""
                try:
                    datum = datetime.datetime(
                        *feedparser._parse_date(post.published)[:6])
                except Exception:
                    try:
                        # alternative date format as used by nos.nl
                        datum = datetime.datetime(*feedparser._parse_date(
                            post.published[5:16])[:6])
                    except Exception:
                        # publishing date could not be parsed
                        datum = None
                doc = {
                    "_id": _id,
                    "title_rss": post.title,
                    "teaser_rss": teaser,
                    "publication_date": datum,
                    "htmlsource": htmlsource,
                    "feedurl": thisurl,
                    "url": re.sub("/$", "", post.link)
                }
                if htmlsource is not None:
                    # TODO: CHECK IF PARSEHTML returns None, if so, raise custom exception
                    parsed = self.parsehtml(doc['htmlsource'])
                    if parsed is None or parsed == {}:
                        try:
                            raise UnparsableException
                        except UnparsableException:
                            # swallowed on purpose for now; see TODO above
                            pass
                    else:
                        doc.update(parsed)
                parsedurl = self.parseurl(link)
                doc.update(parsedurl)
                docnoemptykeys = {k: v for k, v in doc.items() if v}
                yield docnoemptykeys
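
# Minimal sketch of a concrete scraper built on the fallback above. All names
# here are illustrative assumptions: `GenericRSSScraper` stands for the base
# class that provides get_page_body() and getlink(), and the parsehtml/parseurl
# bodies are placeholders a real scraper would replace.
class ExampleScraper(GenericRSSScraper):
    rss_url = 'http://www.nu.nl/rss'
    database = False  # skip the check_exists() lookup while experimenting

    def parsehtml(self, htmlsource):
        # a real scraper would extract title/text with e.g. lxml or bs4
        return {'text': htmlsource and htmlsource[:200]}

    def parseurl(self, link):
        # nothing extra to derive from the URL in this sketch
        return {}


def _example_fallback_get():
    for doc in ExampleScraper().get():
        print(doc['_id'], doc.get('publication_date'))
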
def run(self, document, field, new_key=None, save=False, force=False,
        *args, **kwargs):
    '''
    Run a processor.

    Input
    ---
    document: dict or str
        document to be processed
    field: str
        key of the field to be processed
    new_key: str
        if specified, this key will be used as name for the new field
        (instead of field_processorname)
    save: boolean
        indicates whether the result will be stored in the database
    force: boolean
        indicates whether the document should replace (True) or only
        expand (False) existing documents. Note that partial updates
        are not supported when forcing.
    '''
    # 1. check if input is a document or an id --> return doc
    logger.debug("trying to process: %s", document)
    masked = False  # expect a document to be processed as-is (assumes ES origin)
    if not isinstance(document, dict):
        logger.debug("input not a document")
        if field is None:
            # This path is used to run examples (ignores save)
            return self.process(document, *args, **kwargs)
        elif check_exists(document):
            document = get_document(document)
        else:
            logger.debug(
                "document retrieval failure {document}".format(**locals()))
            return document
    if "_source" not in document:
        masked = True  # mask documents to ES expectations
        document = {'_source': document}
    if not new_key:
        new_key = "%s_%s" % (field, self.__name__)
    # 2. check whether processing can be skipped
    if not force and new_key in document['_source'].keys():
        return document
    # 3. return the document unchanged if the key is missing
    if field not in document['_source'].keys():
        logger.debug("document keys: %s", list(document['_source'].keys()))
        logger.warning("Key not found in document")
        if masked:
            document = document['_source']
        return document
    # 4. process document
    document['_source'][new_key] = self.process(document['_source'][field],
                                                *args, **kwargs)
    # 5. add metadata
    document['_source'] = self._add_metadata(document['_source'])
    # 6. check metadata
    self._verify(document['_source'])
    # 7. save if requested
    if save:
        update_document(document, force=force)
    # 8. emit dotkey-field
    if masked:
        document = document['_source']
    return document
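
# Illustrative use of run() (hypothetical names): `lowercase` stands in for
# any processor instance whose process() lowercases a string.
def _example_run():
    doc = {'title': 'Hello World', 'META': {}}
    out = lowercase.run(doc, field='title', new_key='title_lower')
    # the plain dict is masked into an ES-style {'_source': ...} wrapper
    # internally and unmasked again before it is returned
    return out['title_lower']  # 'hello world'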