class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # file = open('%s_ixbt_jokes.json' % spider.name, 'w+b')
        file = open('ixbt_jokes.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class MonitorPipeline(object):

    def open_spider(self, spider):
        self.cols = spider.cols
        self.start_urls = spider.start_urls
        self.file = open('test.json', 'w+b')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        try:
            index = self.start_urls.index(item['surl'])
            # Integer division gives the group id, the remainder the position
            # within the group.
            group_id, r = divmod(index, self.cols)
            if r in (0, 1, 2):
                item['main'] = r
            item['gid'] = group_id
        except ValueError:
            # item['surl'] is not one of the start URLs; export it unchanged.
            pass
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class GovbuyPipeline(object):

    def __init__(self):
        self.titles_seen = set()
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # Deduplicate by title: drop any item whose title was already exported.
        if item['title'] in self.titles_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.titles_seen.add(item['title'])
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):
    """Store every item (scraped by all spiders) in a single
    AtaBlogItems.json file, one item serialized as JSON per line.

    JSON is a simple and flexible format, but it does not scale well to
    large amounts of data, since the whole object is built in memory here.
    If you need JSON output that is both robust and simple, consider
    JsonLinesItemExporter, or split the output into multiple chunks.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%sItems.json' % spider.name, 'a+')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
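# Following up on the docstring above: a minimal sketch of the suggested
# JsonLinesItemExporter variant. This is an illustration, not one of the
# original pipelines; the class and file names are made up, and the import
# path assumes a recent Scrapy (older versions used scrapy.contrib.exporter).
# JsonLinesItemExporter writes one JSON object per line and never buffers the
# whole result set, so it scales to large crawls.
from scrapy import signals
from scrapy.exporters import JsonLinesItemExporter


class JsonLinesExportPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One .jl file per spider; each exported item becomes a single line.
        self.file = open('%sItems.jl' % spider.name, 'wb')
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item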
class SpidercrawlerPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        # The exporter is created in spider_opened, once the output file is
        # known (the original initialised it here from an undefined `file`).
        self.exporter = None

    def spider_opened(self, spider):
        if spider.name == 'timesnews':
            file = open('TodaysToiScrapedItems.json', 'w+b')
        else:
            file = open('TodaysHtScrapedItems.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ExportJSON(object):
    """Export to export/json/spider-name.json file."""

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_to_save = open(GLOBAL_PATH + 'exports/json/%s.json' % spider.name, 'w+b')
        self.files[spider] = file_to_save
        self.exporter = JsonItemExporter(file_to_save)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_to_save = self.files.pop(spider)
        file_to_save.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):

    def __init__(self):
        log.msg('JsonExportPipeline.init....', level=log.INFO)
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        log.msg('JsonExportPipeline.from_crawler....', level=log.INFO)
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        log.msg('JsonExportPipeline.spider_opened....', level=log.INFO)
        file = open('%s.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        log.msg('JsonExportPipeline.spider_closed....', level=log.INFO)
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        log.msg('JsonExportPipeline.process_item....', level=log.INFO)
        self.exporter.export_item(item)
        return item
class AppsPipeline(object):

    def __init__(self, spider):
        self.file = open(
            'data/{category}-{today}.json'.format(
                today=date.today().strftime('%d-%m-%Y'),
                category=spider.category),
            'wb')
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        if crawler.spider is not None:
            return cls(spider=crawler.spider)

    def spider_opened(self, spider):
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if spider.name == 'apps':
            self.exporter.export_item(item)
        return item
class ExportJSON(object):
    """Export to export/json/spider-name.json file."""

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_to_save = open('exports/json/%s.json' % spider.name, 'w+b')
        self.files[spider] = file_to_save
        self.exporter = JsonItemExporter(file_to_save)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_to_save = self.files.pop(spider)
        file_to_save.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class DatesPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.json' % spider.name, 'w+b')
        self.files[spider] = file
        # This should also pass ensure_ascii=False, but that alone did not work ;P
        self.exporter = JsonItemExporter(file, indent=4)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
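# Regarding the "should be ensure_ascii=False but it doesn't work" note above:
# a known pitfall is that with ensure_ascii=False the encoder produces
# non-ASCII text while the exporter writes to a binary file, so an explicit
# output encoding is needed as well. A minimal sketch of the usual fix,
# assuming a Scrapy version whose JsonItemExporter accepts the standard
# `encoding` option (the file name and sample item are illustrative):
from scrapy.exporters import JsonItemExporter

with open('dates_products.json', 'wb') as f:
    exporter = JsonItemExporter(f, encoding='utf-8', ensure_ascii=False, indent=4)
    exporter.start_exporting()
    exporter.export_item({'name': u'zażółć gęślą jaźń'})  # stays readable UTF-8
    exporter.finish_exporting()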
class YxreviewPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('items.json', 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.checkData(item, "title")
        self.checkData(item, "summary")
        self.checkData(item, "cover_image")
        self.checkData(item, "score")
        self.exporter.export_item(item)
        return item

    def checkData(self, item, field):
        # Flatten the extracted list to a single stripped (byte) string.
        if len(item[field]) > 0:
            newText = item[field][0].encode("utf-8")
            item[field] = newText.strip()
        else:
            item[field] = ""
class CrawlerPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.json' % spider.name, 'w+b')
        self.files[spider] = file
        # This should also pass ensure_ascii=False, but that alone did not work ;P
        self.exporter = JsonItemExporter(file, indent=4)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class AppsPipeline(object):

    def __init__(self, spider):
        self.file = open('{category}-{today}.json'.format(
            today=date.today().strftime('%d-%m-%Y'),
            category=spider.category), 'wb')
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(spider=crawler.spider)

    def spider_opened(self, spider):
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if spider.name == 'apps':
            self.exporter.export_item(item)
        return item
class SaveNewItems(object):

    def __init__(self):
        self.files = []
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def process_item(self, item, spider):
        self.new_file_exporter.export_item(item)
        print "Save " + item["title"][0]
        return item

    def spider_opened(self, spider):
        self.new_item_file = open("storage\\%s_new_items.json" % spider.name, "w")
        self.new_file_exporter = JsonItemExporter(self.new_item_file)
        self.new_file_exporter.start_exporting()

    def spider_closed(self, spider):
        # Re-export the previously collected items to the main file.
        # incomingData is expected to be populated elsewhere (e.g. loaded from
        # an earlier run); it is not defined in this class.
        with open("storage\\%s_items.json" % spider.name, "w") as items_file:
            self.exporter = JsonItemExporter(items_file)
            self.exporter.start_exporting()
            for item in incomingData:
                self.exporter.export_item(item)
            self.exporter.finish_exporting()
        # The with-block already closed items_file; only the new-items file
        # still needs to be finished and closed.
        self.new_file_exporter.finish_exporting()
        self.new_item_file.close()
class CLPipe(object):
    """A pipeline for writing results to json."""

    def __init__(self, **kwargs):
        self.files = {}
        self.AppID = kwargs.get('AppID')
        self.ApiKey = kwargs.get('ApiKey')
        # object.__init__ takes no keyword arguments, so don't forward them.
        super(CLPipe, self).__init__()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Open a static/dynamic file to read from and write to.
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        # Reopen the exported file to parse it for sending.
        new = open('%s_items.json' % spider.name)
        data = json.load(new)
        # Do NOT loop here and POST once per item (that once produced almost
        # 500k requests in a single day). Instead send one load and process it
        # on the other end; not sure if this is efficient, but it works.
        connection = httplib.HTTPSConnection('api.parse.com', 443)
        connection.connect()
        connection.request('POST', '/1/functions/scrapeSaver', json.dumps({
            # the per-item version sent:
            # "email": data[i]["email"], "referer": data[i]["referer"], "scrapeID": data[i]["id"]
            "data": data
        }), {
            "X-Parse-Application-Id": self.AppID,
            "X-Parse-REST-API-Key": self.ApiKey,
            "Content-Type": "application/json"
        })
        result = json.loads(connection.getresponse().read())
        print "Sending load ", result
        # Done with the new file, close it.
        new.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ExportJSON(ExportData):
    """Export to export/json/spider-name.json file."""

    def spider_opened(self, spider):
        file_to_save = open('exports/json/%s.json' % spider.name, 'w+b')
        self.files[spider] = file_to_save
        self.exporter = JsonItemExporter(file_to_save)
        self.exporter.start_exporting()
class JSONExportPipeline(object):

    def __init__(self):
        self.file = open('items.json', 'w')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class JsonItemPipeline(object):

    def open_spider(self, spider):
        self.file = open('test.json', 'w+b')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
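# For context: a pipeline class such as JsonItemPipeline above only runs once
# it is registered in the project's settings.py. A minimal sketch, assuming
# the project package is called `myproject` (module path and priority value
# are illustrative):
ITEM_PIPELINES = {
    'myproject.pipelines.JsonItemPipeline': 300,  # lower values run earlier
}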
class JsonPipeline(object): def __init__(self): self.file = open("./collected.json", 'wb') self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False) self.exporter.start_exporting() def close_spider(self, spider): self.exporter.finish_exporting() self.file.close() def process_item(self, item, spider): self.exporter.export_item(item)
class DoubanSpiderPipeline(object):

    def __init__(self):
        # The exporter expects a binary file and handles the encoding itself;
        # codecs.open with mode 'w+b' plus an encoding is contradictory, so a
        # plain binary open is enough.
        self.file = open('books2.json', 'w+b')
        self.exporter = JsonItemExporter(self.file, encoding='utf-8', ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Export the item itself; pre-encoding it with json.JSONEncoder would
        # serialize it twice.
        self.exporter.export_item(item)
        return item
class JsonPipeline(object):
    """Uses Scrapy's built-in JSON exporter."""

    def __init__(self):
        self.f = open('news.json', 'wb')
        self.exporter = JsonItemExporter(self.f, encoding='utf-8')
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.f.close()
class Pipeline(object):
    """Pipeline for exporting scraped items into JSON format."""

    def __init__(self):
        self._file = open('wholefoods.json', 'wb')
        self._exporter = JsonItemExporter(self._file)

    def open_spider(self, spider):
        self._exporter.start_exporting()

    def close_spider(self, spider):
        self._exporter.finish_exporting()
        self._file.close()

    def process_item(self, item, spider):
        self._exporter.export_item(item)
        return item
class DoubanJsonWrite(object):

    def __init__(self):
        # dispatcher.connect(self.open_spider, signals.spider_opened)
        # dispatcher.connect(self.close_spider, signals.spider_closed)
        self.itemsfile = open('items.jl', 'w')

    def open_spider(self, spider):
        self.exporter = JsonItemExporter(self.itemsfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.itemsfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def parse(self, response):
    # Only handles top-level categories: http://www.peapod.com/processShowBrowseAisles.jhtml
    # There is only one such page; each top-level category is then requested
    # again and its subcategories are processed by parse_category.
    # Top-level categories already sit in a list on the left, each link having
    # class 'mainCat'.
    hxs = HtmlXPathSelector(response)
    big_categories = hxs.select('//a[@class="mainCat"]')
    big_category_objects = []
    # Extract categories one by one. We are now in the scope of each <a>:
    # <a href="?cnid=2098" target="_self" class="mainCat">Produce</a>
    for cat in big_categories:
        # Get the category id (cnid) by stripping everything except the digits
        # from the href that links to the category.
        cnid_href = cat.select('@href').extract()[0]
        cnid = re.sub('\D', '', cnid_href)  # remove all non-digits
        name = cat.select('text()').extract()[0].rstrip()  # remove the trailing \n
        new_cat = ShopCategory(name=name, cnid=cnid, parent='')
        big_category_objects.append(new_cat)
        self.all_categories.append(new_cat)
    # unicodedata.normalize('NFKD', title).encode('ascii', 'ignore') would
    # turn the text fields into ASCII.
    # Prepare requests to parse subcategories.
    subcategory_requests = []
    for cat in big_category_objects:
        url = "http://www.peapod.com/processShowBrowseAisles.jhtml?cnid=" + cat['cnid']
        request = Request(url=url, callback=self.parse_category)
        request.meta['parent_cnid'] = cat['cnid']
        subcategory_requests.append(request)
    # Export the top-level categories.
    print big_category_objects, '\n\n'
    file = open("categories_top.txt", 'wb')
    exporter = JsonItemExporter(file)
    exporter.start_exporting()
    for cat in big_category_objects:
        exporter.export_item(cat)
    exporter.finish_exporting()
    file.close()
    # Move on to parsing subcategories.
    print '\n\n\n\n\ndone with top level category request. moving on to next\n\n\n'
    return subcategory_requests
class VisionsJsonPipeline(object):
    """Writes category and product data to a JSON file
    (data/category.json or data/product.json)."""

    def __init__(self):
        self.exporter = None
        self.file = None

    def open_spider(self, spider):
        # Keep a handle to the file so it can be closed when the spider closes.
        self.file = open('data/%s.json' % spider.name, 'w')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class DceChicangSave(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print(get_project_settings().get('JASONFILE_PATH') + 'items.json')
        self.file = open('items' + datetime.datetime.today().strftime('%Y-%m-%d') + '.json', 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # print(get_project_settings().get('JASONFILE_PATH') + 'items.json')
        # datestampStr = datetime.datetime.today().strftime('%Y-%m-%d')
        # if os.path.exists(get_project_settings().get('JASONFILE_PATH') + 'items_end' + datestampStr + '.json'):
        #     os.remove(get_project_settings().get('JASONFILE_PATH') + 'items_end' + datestampStr + '.json')
        # print(get_project_settings().get('JASONFILE_PATH') + 'items_end.json')
        # os.rename(get_project_settings().get('JASONFILE_PATH') + 'items.json',
        #           get_project_settings().get('JASONFILE_PATH') + 'items_end' + datestampStr + '.json')

        # Read the FTP settings.
        settings = get_project_settings()
        ftp_host = settings.get('FTP_HOST')
        ftp_username = settings.get('FTP_USER')
        ftp_password = settings.get('FTP_PASSWORD')
        ftp_path = settings.get('FTP_PATH')
        filenametosave = 'items' + datetime.datetime.today().strftime('%Y-%m-%d') + '.json'
        # uploadfile returns 1 on success, 0 on failure.
        # result = uploadfile(ftp_host, ftp_username, ftp_password, ftp_path, filenametosave, os.getcwd())
        # print result

    def process_item(self, item, spider):
        if isinstance(item, DceChicangItem):
            self.exporter.export_item(item)
        return item
class JsonWriterPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('%s_output.json' % spider.name, 'w+b')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('web/fefelinks.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonWriterPipeline(object):

    def __init__(self):
        self.first_item = True

    def process_item(self, item, spider):
        if self.first_item:
            # The spider name is only known here, so the file and exporter are
            # created lazily on the first item. The file handle must be stored
            # on self so that close_spider can close it.
            self.first_item = False
            self.file = open('%s_items.json' % spider.name, 'wb')
            # By default the item exporter escapes non-ASCII text as \u
            # sequences. To write readable UTF-8 (e.g. Chinese) instead, set
            # the JSONEncoder's ensure_ascii parameter to False, i.e. pass
            # ensure_ascii=False to JsonItemExporter; see:
            # http://stackoverflow.com/questions/18337407/saving-utf-8-texts-in-json-dumps-as-utf8-not-as-u-escape-sequence
            self.exporter = JsonItemExporter(self.file, ensure_ascii=False)
            self.exporter.start_exporting()
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
class JSONExportPipeline(object):
    directory = '/home/ubuntu/dealscrape/output/'

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open(self.directory + '%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # Items were accumulated on the spider; export them all in one pass
        # when it closes.
        file = self.files.pop(spider)
        for k, v in spider.crawled_items.items():
            self.exporter.export_item(v)
        self.exporter.finish_exporting()
        file.close()
class GamecrawlerPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('gameCrawlerItems.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # Filter out everything except boxscore (TSN) and HTML report (NHL) links.
        if item['link']:
            boxscoreLinks = []
            for link in item['link']:
                # TSN boxscore
                if "/nhl/scores/boxscore" in link:
                    boxscoreLinks.append(link)
                # NHL HTML game report
                if "/scores/htmlreports" in link:
                    boxscoreLinks.append(link)
            item['link'] = boxscoreLinks
        # Keep only strings that actually look like dates, e.g. "Mar 13 '14"
        # or "Monday, March 13, 2014", using a regex.
        if item['date']:
            dates = []
            for date in item['date']:
                if re.match(DATE_PATTERN_STRING, date) is not None:
                    dates.append(date)
            item['date'] = dates
        self.exporter.export_item(item)
        return item
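# DATE_PATTERN_STRING above is a constant defined elsewhere in that project.
# Purely as an illustration (an assumption, not the original value), a pattern
# accepting the two formats named in the comments could look like this:
import re

DATE_PATTERN_STRING = r"(?:[A-Z][a-z]{2} \d{1,2} '\d{2}|[A-Z][a-z]+, [A-Z][a-z]+ \d{1,2}, \d{4})"

assert re.match(DATE_PATTERN_STRING, "Mar 13 '14")
assert re.match(DATE_PATTERN_STRING, "Monday, March 13, 2014")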
class JsonExportPipeline(object):

    def __init__(self):
        log.start()
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.fjsons = {}

    def spider_opened(self, spider):
        fjson = open('output/%s_%s_items.json' % (spider.name, str(int(time.mktime(time.gmtime())))), 'wb')
        self.fjsons[spider] = fjson
        self.exporter = JsonItemExporter(fjson)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        fjson = self.fjsons.pop(spider)
        fjson.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class StatsPipeline(object):

    def __init__(self):
        self.files = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        file = open('stats.json', 'wb')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # Remove a leading "* " marker from the name, if present. Note that
        # str.replace returns a new string, so the result must be assigned back.
        if item['name']:
            if "* " in item['name']:
                item['name'] = item['name'].replace("* ", "", 1)
        self.exporter.export_item(item)
        return item
class ManningPipeline(object):

    def __init__(self):
        self.fields_to_export = {
            "list": ["title", "url"],
            "all": ["isbn", "title", "url", "year", "authors", "image_url", "ebook_price"],
            "parse": ["isbn", "title", "url", "year", "authors", "image_url", "ebook_price"],
        }
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        dispatcher.connect(self.engine_started, signals.engine_started)
        log.msg(message="ManningPipeline, __init__", _level=log.INFO)

    def spider_opened(self, spider):
        self.spider = spider

    def engine_started(self):
        self.json_file = open("result.json", "w")
        self.json_exporter = JsonItemExporter(
            self.json_file,
            fields_to_export=self.fields_to_export[self.spider._crawler.settings["CommandLineParameter"][0]],
        )
        self.json_exporter.start_exporting()
        log.msg(
            message="ManningPipeline, engine_started, mode=%s"
            % self.spider._crawler.settings["CommandLineParameter"][0]
        )

    def process_item(self, item, spider):
        log.msg(message="ManningPipeline, process_item", _level=log.INFO)
        self.json_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.json_exporter.finish_exporting()
        self.json_file.close()
        log.msg(message="ManningPipeline, spider_closed", _level=log.INFO)
class CategoryPipeline(object):

    def __init__(self, spider):
        if spider.name == 'categories':
            self.file = open('categories.json', 'wb')
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        if crawler.spider is not None:
            return cls(spider=crawler.spider)

    def spider_opened(self, spider):
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if spider.name == 'categories':
            self.exporter.export_item(item)
        return item
class CategoryPipeline(object):

    def __init__(self, spider):
        if spider.name == 'categories':
            self.file = open('categories.json', 'wb')
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(spider=crawler.spider)

    def spider_opened(self, spider):
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if spider.name == 'categories':
            self.exporter.export_item(item)
        return item
def spider_closed(self, spider):
    # Now dump all items and categories out; close each file once it is written.
    file = open("all_items.txt", 'wb')
    exporter = JsonItemExporter(file)
    exporter.start_exporting()
    for item in self.all_items:
        exporter.export_item(item)
    exporter.finish_exporting()
    file.close()

    file3 = open("all_items_detail.txt", 'wb')
    exporter = JsonItemExporter(file3)
    exporter.start_exporting()
    for item in self.all_items_detail:
        exporter.export_item(item)
    exporter.finish_exporting()
    file3.close()

    file2 = open("all_categories.txt", 'wb')
    exporter2 = JsonItemExporter(file2)
    exporter2.start_exporting()
    for cat in self.all_categories:
        exporter2.export_item(cat)
    exporter2.finish_exporting()
    file2.close()
class CrawlerPipeline(object):
    """Pipeline to alter scraped items."""

    # Real durations for episodes whose feed entries carry an empty duration
    # (a lookup table replacing a ~50-branch elif chain with the same values).
    DURATIONS = {
        '000': '01:46:01', '001': '01:39:34', '002': '01:55:14',
        '003': '02:02:28', '004': '02:17:18', '005': '02:08:03',
        '006': '02:29:09', '007': '02:42:58', '008': '02:21:36',
        '009': '02:07:42', '010': '02:13:07', '011': '02:19:37',
        '012a': '00:59:06', '012b': '00:41:33', '013': '02:43:32',
        '014': '01:59:41', '015': '02:30:15', '016': '02:46:53',
        '017': '02:29:01', '018': '02:44:20', '019': '02:18:56',
        '020': '02:27:10', '021': '02:51:22', '022': '02:18:16',
        '023': '02:49:37', '024': '02:37:09', '025': '02:34:52',
        '026': '02:44:25', '027': '02:37:43', '028': '02:56:38',
        '029': '03:14:28', '030': '02:19:35', '031': '02:55:49',
        '032': '03:12:45', '033': '02:17:02', '034': '02:52:31',
        '035': '02:36:32', '036': '03:40:17', '037': '03:07:41',
        '038': '02:50:01', '039': '03:01:35', '040': '03:39:16',
        '041': '01:48:45', '042': '03:37:21', '043': '02:46:26',
        '044': '02:46:05', '045': '03:08:51', '046': '02:59:17',
        '047': '02:46:31', '048': '03:14:36', '049': '03:21:24',
        '077': '03:13:03',
    }

    def __init__(self):
        """Initialise Pipeline."""
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        """Open Spider."""
        file = open('../website/data/complete.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Close Spider."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Process Items."""
        # Process field 'number'
        if item['number']:
            if item['number'][0] in ('12a', '12b'):
                # Add a leading 0 to 12a and 12b
                # (an alternative once considered: '12a' -> '012.1', '12b' -> '012.2')
                item['number'] = '0' + item['number'][0]
            else:
                # Pad with leading zeros to a total length of 3
                item['number'] = item['number'][0].zfill(3)
        else:
            # Drop items without a number
            raise DropItem(item['number'])

        # Process field 'pubdate': shift from GMT to CET
        if item['pubdate']:
            alterToCET = datetime.datetime.strptime(item['pubdate'][0],
                                                    '%a, %d %b %Y %H:%M:%S')
            alterToCET = alterToCET + datetime.timedelta(hours=1)
            item['pubdate'] = alterToCET.strftime('%Y-%m-%d %H:%M:%S.000000')

        # Process field 'pubday': translate the day name to German
        if item['pubday']:
            if any('Mon' in s for s in item['pubday']):
                item['pubday'] = 'Montag'
            elif any('Tue' in s for s in item['pubday']):
                item['pubday'] = 'Dienstag'
            elif any('Wed' in s for s in item['pubday']):
                item['pubday'] = 'Mittwoch'
            elif any('Thu' in s for s in item['pubday']):
                item['pubday'] = 'Donnerstag'
            elif any('Fri' in s for s in item['pubday']):
                item['pubday'] = 'Freitag'
            elif any('Sat' in s for s in item['pubday']):
                item['pubday'] = 'Samstag'
            elif any('Sun' in s for s in item['pubday']):
                item['pubday'] = 'Sonntag'
            else:
                item['pubday'] = 'WRONG DAY FORMAT'

        # Process field 'pubtime': shift from GMT to CET
        if item['pubtime']:
            alterTimeToCET = datetime.datetime.strptime(item['pubtime'][0],
                                                        '%H:%M:%S')
            alterTimeToCET = alterTimeToCET + datetime.timedelta(hours=1)
            item['pubtime'] = alterTimeToCET.strftime('%H:%M:%S')
            # Save pubtime as seconds since midnight. Note: item['pubtime'] is
            # a plain string by now, so use it directly (indexing [0] would
            # only yield its first character).
            tpub = item['pubtime']
            item['pubtime_integer'] = sum(
                int(x) * 60**i for i, x in enumerate(reversed(tpub.split(':'))))

        # Replace empty durations with the real value. Assigning a fresh
        # one-element list also works when the feed delivered no value at all.
        if item['number'] in self.DURATIONS:
            item['duration'] = [self.DURATIONS[item['number']]]

        # Process field 'duration'
        if item['duration']:
            # Save the duration as seconds
            tdur = item['duration'][0]
            item['duration_integer'] = sum(
                int(x) * 60**i for i, x in enumerate(reversed(tdur.split(':'))))
            # Normalise the duration to the HH:MM:SS time format
            alterDuration = datetime.datetime.strptime(item['duration'][0],
                                                       '%H:%M:%S')
            item['duration'] = alterDuration.strftime('%H:%M:%S')

        # Export and return the crawled item
        self.exporter.export_item(item)
        return item