def process_item(self, item, spider):
    """Benchmark four content-extraction backends on the item's raw HTML.

    Runs each extractor 11 times on item.rawHtml and logs the mean and
    standard deviation of the timings for every backend.  Garbage
    collection is disabled while measuring so GC pauses do not skew the
    timings, and restored afterwards if it was enabled on entry.

    @type  item: bibcrawl.model.postitem.PostItem
    @param item: the item to process
    @type  spider: scrapy.spider.BaseSpider
    @param spider: the spider that emitted this item
    @return: the unmodified item, per the Scrapy pipeline contract
    """
    gcenabled = gc.isenabled()
    gc.disable()
    try:
        contentExtractor = lambda _: spider.contentExtractor(parseHTML(_))
        boilerpipeExtractor = lambda _: Extractor(html=_).getText()
        gooseExtractor = lambda _: Goose().extract(raw_html=_).cleaned_text
        readabilityExtractor = lambda _: cleanTags(Document(_).summary())
        # CE, BP, GO, RE
        ntimes = range(11)
        # Materialize the timings as lists: mean() and std() each iterate
        # the sequence, and a map() iterator would already be exhausted on
        # the second pass (std would see no data on Python 3).
        contents = [timeMeThis(partial(contentExtractor, item.rawHtml))
                    for _ in ntimes]
        boilerpipes = [timeMeThis(partial(boilerpipeExtractor, item.rawHtml))
                       for _ in ntimes]
        gooses = [timeMeThis(partial(gooseExtractor, item.rawHtml))
                  for _ in ntimes]
        readabilitys = [timeMeThis(partial(readabilityExtractor, item.rawHtml))
                        for _ in ntimes]
        log.msg("{} {} {} {} {} {} {} {}".format(
            mean(contents), std(contents),
            mean(boilerpipes), std(boilerpipes),
            mean(gooses), std(gooses),
            mean(readabilitys), std(readabilitys)))
    finally:
        # Re-enable GC only if it was on when we started.
        if gcenabled:
            gc.enable()
    # Scrapy pipelines must return the item so later stages receive it;
    # the original fell through and returned None.
    return item
def process_item(self, item, spider):
    """Dump the item's cleaned title and content to disk.

    Writes two UTF-8 files, "titles/<url>" and "articles/<url>", where
    every "/" in the url is replaced by "{" so it forms a single valid
    file name.

    @type  item: bibcrawl.model.postitem.PostItem
    @param item: the item to process
    @type  spider: scrapy.spider.BaseSpider
    @param spider: the spider that emitted this item
    @return: the unmodified item, per the Scrapy pipeline contract
    """
    import codecs

    def saveClean(directory, rawText):
        # Shared write path for title and content: strip the markup,
        # repair mis-decoded unicode, then write the result as UTF-8.
        path = directory + item.url.replace("/", "{")
        spider.logInfo(path)
        nicecontent = fix_bad_unicode(cleanTags(rawText))
        with codecs.open(path, "w", encoding="utf-8") as out:
            out.write(nicecontent)

    saveClean("titles/", item.title)
    saveClean("articles/", item.content)
    spider.logInfo("DONE:" + item.url)
    # Return the item so downstream pipeline stages still receive it;
    # the original implicitly returned None.
    return item
def process_item(self, item, spider):
    """Saves the item as Invenio records

    @type  item: bibcrawl.model.postitem.PostItem
    @param item: the item to process
    @type  spider: scrapy.spider.BaseSpider
    @param spider: the spider that emitted this item
    @rtype: bibcrawl.model.postitem.PostItem
    @return: the processed item
    """
    # Fetch and parse the comment feed; only the first feed url is used.
    item.comments = commentsFromFeed(
        feedparser.parse(first(item.commentFeedUrls)))
    item.title = cleanTags(item.title)
    item.author = cleanTags(item.author)
    spider.logInfo(
        ("""Completed %(url)s %(title)s {0} """ % item)
        .format(len(item.comments)))
    # The docstring promises the processed item, but the original had no
    # return statement and so handed None to the next pipeline stage.
    return item