class NordstromPipeline(object):

    def __init__(self):
        self.files = {}
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def process_item(self, item, spider):
        if item['product_item_num'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['product_item_num'])
            self.exporter.export_item(item)
            return item

    def spider_opened(self, spider):
        out_file = open('%s_products.jl' % spider.name, 'w+b')
        self.files[spider] = out_file
        self.exporter = JsonLinesItemExporter(out_file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out_file = self.files.pop(spider)
        out_file.close()
class FeedWriterPipeline(object):

    def __init__(self):
        log.msg('FeedWriterPipeline.__init__()')
        self.file = None
        self.item_exporter = None
        self.count = 0

    def open_spider(self, spider):
        if FeedSpider.is_feed_op(spider):
            spider.make_sure_path_exists(spider.get_output_dir_path())
            file_name = spider.get_feed_output_file_path()
            self.file = open(file_name, 'a')
            self.item_exporter = JsonLinesItemExporter(self.file)
            log.msg('FeedWriterPipeline, opened file %s to append.' % file_name)

    def process_item(self, item, spider):
        if FeedSpider.is_feed_op(spider) and isinstance(item, FeedItem):
            self.item_exporter.export_item(item)
            self.count += 1
            spider.check_max_limit(self.count)
            raise DropItem('Save item success')
        else:
            return item

    def close_spider(self, spider):
        if FeedSpider.is_feed_op(spider):
            self.file.write('Parsed %i feed items.%s' % (self.count, os.linesep))
            self.file.close()
            log.msg('closed file, appended %i items.' % self.count)
class PerispiderPipeline(object):

    def open_spider(self, spider):
        name = "%s.json" % spider.name
        self.file = open(name, 'w')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def get_exporter(self, item):
    exporter = None
    if item.__class__ in self.exporters:
        exporter = self.exporters[item.__class__]
    else:
        if item.__class__ == items.unused_genotype_data:
            exporter = JsonLinesItemExporter(open(_class_to_file(item.__class__), 'w+b'))
        else:
            exporter = CsvItemExporter(open(_class_to_file(item.__class__), 'w+b'))
        self.exporters[item.__class__] = exporter
        exporter.start_exporting()
    return exporter
class JsonLinesItemPipeline(object):

    def open_spider(self, spider):
        self.file = open('test.json', 'w+b')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()  # added: begin the export session

    def close_spider(self, spider):
        self.exporter.finish_exporting()  # added: finish before closing the file
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
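A pipeline like the one above only runs once it is registered in the project settings. A minimal sketch, assuming the class lives in a hypothetical myproject/pipelines.py module (the path and priority are illustrative, not taken from any of the snippets):

# settings.py -- enable the pipeline; 300 is an arbitrary priority between 0 and 1000
ITEM_PIPELINES = {
    'myproject.pipelines.JsonLinesItemPipeline': 300,
}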
def spider_opened(self, spider):
    if not os.path.exists('./json/'):
        os.makedirs('./json/')
    if isinstance(spider, MSPCrawler):
        MSPFile = open('json/msps.json', 'w+b')
        self.files['msps'] = MSPFile
        self.MSPExporter = JsonLinesItemExporter(MSPFile)
        self.MSPExporter.start_exporting()
    elif isinstance(spider, VoteCrawler):
        VoteFile = open('json/votes-' + spider.mspid + '.json', 'w+b')
        self.files['votes'] = VoteFile
        self.VoteExporter = JsonLinesItemExporter(VoteFile)
        self.VoteExporter.start_exporting()
class MoviesPipeline(object):

    def __init__(self):
        self.field_to_export = []
        self.file = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # signals start of export
        print "Spider opened...\nPreparing to crawl..."
        self.json_exporter = JsonLinesItemExporter(open('movies.json', 'wb'))
        self.json_exporter.start_exporting()
        # Since the charts frequently change, we need to deal with differences
        # in the cached data and current data.
        # For now, we'll just truncate the table when the spider opens
        # and dump everything in.
        cursor = connection.cursor()
        sql = 'truncate table %s' % MYSQL_TABLE
        try:
            cursor.execute(sql)
            connection.commit()
            print "*** Truncated %s Table ***" % MYSQL_TABLE
        except Exception as e:  # bind the exception so its error code and message can be printed
            print "Error %d %s" % (e.args[0], e.args[1])
            connection.rollback()

    def process_item(self, item, spider):
        # store the item in the database
        insert_database(item)
        # Write to JSON file
        self.json_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        # signal end of export
        self.json_exporter.finish_exporting()
def open_spider(self, spider):
    if FeedSpider.is_feed_op(spider):
        spider.make_sure_path_exists(spider.get_output_dir_path())
        file_name = spider.get_feed_output_file_path()
        self.file = open(file_name, 'a')
        self.item_exporter = JsonLinesItemExporter(self.file)
        log.msg('FeedWriterPipeline, opened file %s to append.' % file_name)
class FmlPipeline(object):
    '''
    def __init__(self):
        self.file = open('data2.json', 'w')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()
    '''

    def open_spider(self, spider):
        name = "%s.json" % spider.name
        self.file = open(name, 'w')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def spider_opened(self, spider):
    file = open('%s/%s/%s.json' % (settings.DATA_DIR, spider.name, datetime.date.today().isoformat()), 'w+b')
    self.files[spider] = file
    self.exporter = JsonLinesItemExporter(file)
    self.exporter.start_exporting()
def process_spider_output(self, response, result, spider):
    items = []
    for r in result:
        if isinstance(r, Item):
            items.append(r)
        yield r
    cca = response2cca(response, base64=True)
    cca['features'] = {'items': items}
    cca_item = self.create_item(cca)
    cca_path = self.get_cca_path(spider)
    if cca_path is None:
        yield cca_item
    else:
        exporter = self.exporters_by_path.get(cca_path)
        if exporter is None:
            exporter = JsonLinesItemExporter(open(cca_path, 'a+'))
            self.exporters_by_path[cca_path] = exporter
        exporter.export_item(cca_item)
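Unlike the item pipelines elsewhere in this collection, process_spider_output is a spider-middleware hook, so a class containing it is enabled through SPIDER_MIDDLEWARES rather than ITEM_PIPELINES. A minimal sketch, with an illustrative module and class name since the enclosing class is not shown:

# settings.py -- the module path and class name are placeholders
SPIDER_MIDDLEWARES = {
    'myproject.middlewares.CcaExportMiddleware': 543,
}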
def process_item(self, item, spider):
    """ Writes the item to output """
    # create the output file for a new class of item per spider
    settings = spider.crawler.settings
    if item.__class__ not in self.xporters[spider.name]:
        filename = '%s.json' % item.export_filename
        dirpath = path.join(settings.get('IO_PATH', 'io'), settings['DATA_SET'])
        _mkdir_p(dirpath)
        file_h = open(path.join(dirpath, filename), 'w')
        xporter = JsonLinesItemExporter(file=file_h)
        xporter.start_exporting()
        self.xporters[spider.name][item.__class__] = (file_h, xporter)
    xporter = self.xporters[spider.name][item.__class__][1]
    xporter.export_item(item)
    return item
class AdbPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        self.seen = set([])

    def spider_opened(self, spider):
        file = open('%s/%s/%s.json' % (settings.DATA_DIR, spider.name, datetime.date.today().isoformat()), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        if self.seen_before(item):
            raise DropItem
        self.exporter.export_item(item)
        return item

    def seen_before(self, item):
        if item['product'] in self.seen:
            return True
        else:
            self.seen.add(item['product'])
            return False
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        json_path = os.path.join('data', '%s.json' % spider.name)
        file = open(json_path, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        log.msg("process_item", level=log.DEBUG)
        return item
class JsonLinesExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out_file = open('%s_pics.json' % spider.name, 'a')
        self.files[spider] = out_file
        self.exporter = JsonLinesItemExporter(out_file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out_file = self.files.pop(spider)
        out_file.close()

    def process_item(self, item, spider):
        if item.get("image_urls"):
            self.exporter.export_item(item)
        return item
class ValidatorPipeline(object):
    """ Exports items in a temporary JSON file. Unnecessary fields are excluded. """

    def __init__(self):
        self.exporter = None
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        fname = open(_get_spider_output_filename(spider), 'wb')
        self.files[spider] = fname
        self.exporter = JsonLinesItemExporter(fname)
        self.exporter.fields_to_export = _get_fields_to_check(ProductItem)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
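The field filtering used above can also be passed straight to the exporter constructor via fields_to_export. A small self-contained sketch; the item class and field names here are made up for illustration, and the import path is the pre-1.0 one used throughout these snippets (newer Scrapy exposes the same class from scrapy.exporters):

# standalone sketch of field filtering with JsonLinesItemExporter
from scrapy.item import Item, Field
from scrapy.contrib.exporter import JsonLinesItemExporter  # scrapy.exporters in newer Scrapy

class ProductStub(Item):
    title = Field()
    price = Field()
    internal_note = Field()  # deliberately excluded from the export below

with open('validated.jl', 'wb') as f:
    exporter = JsonLinesItemExporter(f, fields_to_export=['title', 'price'])
    exporter.start_exporting()
    exporter.export_item(ProductStub(title=u'example', price=u'9.99', internal_note=u'skipped'))
    exporter.finish_exporting()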
class PajandanPipeline(object):

    def __init__(self):
        self.files = {}  # may be more than one spider

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # write utf-8 file
        f = codecs.open('articles.json', 'w+', encoding='utf-8')
        self.files[spider] = f
        self.exporter = JsonLinesItemExporter(f, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ScrippaPipeline(object):

    def __init__(self):
        # self.files = {}
        # self.log("MMMMMMMMMMMMMMMMMMMMMMMMMMAAAAAAAAAAAATE", level=log.WARNING)
        print "DDDDDDDDDDDDDDDDDDDDDDDDDUUUUUUUUUUUUUUUUUUUUUUUUUUUDE"
        # file = open('1_reports.json', 'w+b')

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('2_reports.json', 'w+b')
        # self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        # file = self.files.pop(spider)
        # file.close()

    def process_item(self, item, spider):
        print "ScrippaPipeline: exporting item ============================== "
        self.exporter.export_item(item)
        return item
class TibiaPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.json' % (spider.name + datetime.datetime.now().isoformat()), 'a+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_Joke.txt' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class PlayerPipeline(object):

    def __init__(self, *args, **kwargs):
        self.player_info_file = None
        self.player_info_exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.player_info_file = open("%s/output/player_info.json" % settings.PROJECT_ROOT, 'wb')
        self.player_info_exporter = JsonLinesItemExporter(self.player_info_file)
        self.player_info_exporter.start_exporting()

    def spider_closed(self, spider):
        self.player_info_exporter.finish_exporting()
        self.player_info_file.close()

    def process_item(self, item, spider):
        if isinstance(item, PlayerInfoItem):
            self.player_info_exporter.export_item(item)
        return item
def spider_opened(self, spider):
    self.csv_exporter = CsvItemExporter(open(spider.name + ".csv", "w"),
                                        fields_to_export=self.fields_to_export,
                                        quoting=csv.QUOTE_ALL)
    self.json_exporter = TffdatapullJsonItemExporter(open(spider.name + ".json", "w"),
                                                     fields_to_export=self.fields_to_export,
                                                     sort_keys=True, indent=4)
    self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name + ".linejson", "w"),
                                                    fields_to_export=self.fields_to_export)
    self.xml_exporter = TffdatapullXmlItemExporter(open(spider.name + ".xml", "w"),
                                                   fields_to_export=self.fields_to_export,
                                                   root_element="match_code", item_element="match_code")
    # Make a quick copy of the list
    self.csv_exporter.start_exporting()
    self.json_exporter.start_exporting()
    self.jsonlines_exporter.start_exporting()
    self.xml_exporter.start_exporting()
class TffdatapullPipeline(object):

    def __init__(self):
        self.fields_to_export = [
            'match_code',
            'match_href',
            'home_team_id',
            'home_team_name',
            'score',
            'guest_team_id',
            'guest_team_name',
            'date_of_match',
            'time_of_match',
            'stadium_name',
            'organization_name'
        ]
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        self.csv_exporter = CsvItemExporter(open(spider.name + ".csv", "w"),
                                            fields_to_export=self.fields_to_export,
                                            quoting=csv.QUOTE_ALL)
        self.json_exporter = TffdatapullJsonItemExporter(open(spider.name + ".json", "w"),
                                                         fields_to_export=self.fields_to_export,
                                                         sort_keys=True, indent=4)
        self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name + ".linejson", "w"),
                                                        fields_to_export=self.fields_to_export)
        self.xml_exporter = TffdatapullXmlItemExporter(open(spider.name + ".xml", "w"),
                                                       fields_to_export=self.fields_to_export,
                                                       root_element="match_code", item_element="match_code")
        # Make a quick copy of the list
        self.csv_exporter.start_exporting()
        self.json_exporter.start_exporting()
        self.jsonlines_exporter.start_exporting()
        self.xml_exporter.start_exporting()

    def process_item(self, item, spider):
        self.csv_exporter.export_item(item)
        self.json_exporter.export_item(item)
        self.jsonlines_exporter.export_item(item)
        self.xml_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.csv_exporter.finish_exporting()
        self.json_exporter.finish_exporting()
        self.jsonlines_exporter.finish_exporting()
        self.xml_exporter.finish_exporting()
def spider_opened(self, spider):
    self.csv_exporter = CsvItemExporter(open(spider.name + ".csv", "w"),
                                        fields_to_export=self.fields_to_export,
                                        quoting=csv.QUOTE_ALL)
    self.json_exporter = MTQInfraJsonItemExporter(open(spider.name + ".json", "w"),
                                                  fields_to_export=self.fields_to_export,
                                                  sort_keys=True, indent=4)
    self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name + ".linejson", "w"),
                                                    fields_to_export=self.fields_to_export)
    self.xml_exporter = MTQInfraXmlItemExporter(open(spider.name + ".xml", "w"),
                                                fields_to_export=self.fields_to_export,
                                                root_element="structures", item_element="structure")
    # Make a quick copy of the list
    kml_fields = self.fields_to_export[:]
    kml_fields.append('fusion_marker')
    self.kml_exporter = MTQInfraKmlItemExporter(spider.name + ".kml", fields_to_export=kml_fields)
    self.csv_exporter.start_exporting()
    self.json_exporter.start_exporting()
    self.jsonlines_exporter.start_exporting()
    self.xml_exporter.start_exporting()
    self.kml_exporter.start_exporting()
class JsonWriterPipeline2(object):

    def __init__(self):  # fixed: was `_init_`, which Python never calls as a constructor
        self.fields_to_export = [
            'title',
            'link'
        ]
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name + ".linejson", "w"),
                                                        fields_to_export=self.fields_to_export)
        self.jsonlines_exporter.start_exporting()

    def process_item(self, item, spider):
        self.jsonlines_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.jsonlines_exporter.finish_exporting()
def spider_opened(self, spider):
    # signals start of export
    print "Spider opened...\nPreparing to crawl..."
    self.json_exporter = JsonLinesItemExporter(open('movies.json', 'wb'))
    self.json_exporter.start_exporting()
    # Since the charts frequently change, we need to deal with differences
    # in the cached data and current data.
    # For now, we'll just truncate the table when the spider opens
    # and dump everything in.
    cursor = connection.cursor()
    sql = 'truncate table %s' % MYSQL_TABLE
    try:
        cursor.execute(sql)
        connection.commit()
        print "*** Truncated %s Table ***" % MYSQL_TABLE
    except Exception as e:  # bind the exception so its error code and message can be printed
        print "Error %d %s" % (e.args[0], e.args[1])
        connection.rollback()
class JsonExportExternalIdPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('scraped/%s.json' % spider.external_id, 'w')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item  # added: return the item so later pipelines still receive it
class JsonLinesExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        self.first_item = True

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class RecipescraperPipeline(object):

    def __init__(self):
        self.file = open('recipes.jl', 'wb')

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        # file = self.files.pop(spider)
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonWithEncodingPipeline(NumbeoPipeline):

    def __init__(self):
        super(JsonWithEncodingPipeline, self).__init__()
        self.suffix = 'json'
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('{0}_{1}.{2}'.format(spider.name, self.proj, self.suffix), 'w+b')
        self.files[spider] = file
        # self.exporter = JsonItemExporter(file)
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
def open_spider(self, spider):
    self.startTime = datetime.datetime.now()
    nowStr = self.startTime.strftime("%Y-%m-%d %H:%M")
    scrapeDir = ""
    if spider.scrapeMode == "FIXERRORS":
        scrapeDir = spider.fixpath + "_FIX"
    else:
        if spider.scrapeMode == "FULL":
            if not os.path.exists("FullScrapes"):
                os.makedirs("FullScrapes")
            typeDir = "FullScrapes/"
        elif spider.scrapeMode == "INCREMENTAL":
            if not os.path.exists("IncrementalScrapes"):
                os.makedirs("IncrementalScrapes")
            typeDir = "IncrementalScrapes/"
        else:
            if not os.path.exists("TestScrapes"):
                os.makedirs("TestScrapes")
            typeDir = "TestScrapes/"
        scrapeDir = typeDir + nowStr
    if not os.path.exists(scrapeDir):
        os.makedirs(scrapeDir)
    spider.setScrapePath(scrapeDir)
    self.tendersfile = open(scrapeDir + "/" + "tenders.json", 'wb')
    self.procuringEntitiesfile = open(scrapeDir + "/" + 'organisations.json', 'wb')
    self.tenderBiddersFile = open(scrapeDir + "/" + 'tenderBidders.json', 'wb')
    self.tenderAgreementsFile = open(scrapeDir + "/" + 'tenderAgreements.json', 'wb')
    self.tenderDocumentationFile = open(scrapeDir + "/" + 'tenderDocumentation.json', 'wb')
    self.tenderCPVCodeFile = open(scrapeDir + "/" + 'tenderCPVCode.json', 'wb')
    self.whiteListFile = open(scrapeDir + "/" + 'whiteList.json', 'wb')
    self.blackListFile = open(scrapeDir + "/" + 'blackList.json', 'wb')
    self.complaintFile = open(scrapeDir + "/" + 'complaints.json', 'wb')
    self.bidderResultFile = open(scrapeDir + "/" + 'bidderResult.json', 'wb')
    self.tenderExporter = JsonLinesItemExporter(self.tendersfile)
    self.procurerExporter = JsonLinesItemExporter(self.procuringEntitiesfile)
    self.biddersExporter = JsonLinesItemExporter(self.tenderBiddersFile)
    self.agreementExporter = JsonLinesItemExporter(self.tenderAgreementsFile)
    self.documentationExporter = JsonLinesItemExporter(self.tenderDocumentationFile)
    self.cpvCodeExporter = JsonLinesItemExporter(self.tenderCPVCodeFile)
    self.whiteListExporter = JsonLinesItemExporter(self.whiteListFile)
    self.blackListExporter = JsonLinesItemExporter(self.blackListFile)
    self.complaintExporter = JsonLinesItemExporter(self.complaintFile)
    self.bidderResultExporter = JsonLinesItemExporter(self.bidderResultFile)
    self.tenderExporter.start_exporting()
    self.procurerExporter.start_exporting()
    self.biddersExporter.start_exporting()
    self.agreementExporter.start_exporting()
    self.documentationExporter.start_exporting()
    self.cpvCodeExporter.start_exporting()
    self.whiteListExporter.start_exporting()
    self.blackListExporter.start_exporting()
    self.complaintExporter.start_exporting()
    self.bidderResultExporter.start_exporting()
    self.infoFile = open(scrapeDir + "/" + 'scrapeInfo.txt', 'wb')
    self.infoFile.write("StartTime: " + nowStr + "\n")
def open_spider(self, spider):
    self.exporter = JsonLinesItemExporter(self.file)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    fname = open(_get_spider_output_filename(spider), 'wb')
    self.files[spider] = fname
    self.exporter = JsonLinesItemExporter(fname)
    self.exporter.fields_to_export = _get_fields_to_check(SiteProductItem)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    file = open('%s_items.json' % spider.name, 'w+b')
    self.files[spider] = file
    self.exporter = JsonLinesItemExporter(file)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    file = open('{0}_{1}.{2}'.format(spider.name, self.proj, self.suffix), 'w+b')
    self.files[spider] = file
    # self.exporter = JsonItemExporter(file)
    self.exporter = JsonLinesItemExporter(file)
    self.exporter.start_exporting()
def open_spider(self, spider):
    self.file = open('data/' + spider.name + '.jsonlines', 'a')
    self.exporter = JsonLinesItemExporter(self.file)
    self.exporter.start_exporting()
class DamPipeline(object):

    def check_item(self, item):
        if (item['MaximumCapacity'] != "--"):
            a = datetime.strptime(item['TimeStamp'], '%Y-%m-%d')
            b = datetime.strptime("2017-01-01", '%Y-%m-%d')
            if (a < b):
                if (float(item['EffectiveWaterStorageCapacity']) > (float(item['MaximumCapacity']) * 1.1)):
                    # filter with a factor of 1.1 to allow for siltation
                    item['EffectiveWaterStorageCapacity'] = "-888"
            else:
                if (float(item['EffectiveWaterStorageCapacity']) > float(item['MaximumCapacity'])):
                    # from 2017 on, exclude any value above the maximum capacity
                    item['EffectiveWaterStorageCapacity'] = "-888"
        else:
            if item['EffectiveWaterStorageCapacity'] and not re.match('^\d+?\.\d+?$', item['EffectiveWaterStorageCapacity']):
                # check format
                item['EffectiveWaterStorageCapacity'] = "-999"
            else:
                if (float(item['EffectiveWaterStorageCapacity']) > 100000):
                    item['EffectiveWaterStorageCapacity'] = "-888"
        for key, val in item.items():
            # select keys from the item; could also add re.match('^Effective', key)
            if (re.match('^Max', key) or re.match('^Percentage', key) or re.match('^Water', key)):
                if val and not re.match('^\d+?\.\d+?$', val):  # check format
                    item[key] = "-999"  # can use None or NULL
            if re.match('^TimeStamp', key):
                if (val == "--\r\n " or val == "--"):
                    raise DropItem("Missing value in %s" % item)
        return item

    def __init__(self):
        path = os.path.abspath("dir.txt").replace("dir.txt", "")  # To find the path
        print("Download file is set to be this path: ", path,
              "\nIf it's not right please make sure dir.txt file is at the same location of the execute file.py")
        if (os.path.isfile(path + "ReservoirState_items1.json") and os.path.isfile(path + "ReservoirState_items2.json")):
            print('Err: ReservoirState_items1.json & ReservoirState_items2.json are already exist.')
        elif (os.path.isfile(path + 'ReservoirPastState_items1.json') and os.path.isfile(path + 'ReservoirPastState_items2.json')):
            print('Err: ReservoirPastState_items1.json & ReservoirPastState_items2.json are already exist.')
        else:
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}

    def spider_opened(self, spider):
        path = os.path.abspath("dir.txt").replace("dir.txt", "")
        # For checking usage
        if (os.path.isfile(path + 'check_item1.txt')):
            if (os.path.isfile(path + 'check_item2.txt')):
                print('check_item2 is already exist.')
            else:
                file = open('%s_items2.json' % spider.name, 'w+b')
                self.files[spider] = file
                self.exporter = JsonLinesItemExporter(file)
                self.exporter.start_exporting()
            print('check_item1 is already exist.')
        else:
            file = open('%s_items1.json' % spider.name, 'w+b')
            self.files[spider] = file
            self.exporter = JsonLinesItemExporter(file)
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        path = os.path.abspath("dir.txt").replace("dir.txt", "")
        print("Download file is set to be this path: ", path,
              "\nIf it's not right please make sure dir.txt file is at the same location of the execute file.py")
        if (os.path.isfile(path + 'check_item1.txt') and os.path.isfile(path + 'check_item2.txt')):
            print('check_item1 and check_item2 are already exist.', "\nNothing change in DB.")
        else:
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()
            if (os.path.isfile(path + 'ReservoirState_items1.json') or os.path.isfile(path + 'ReservoirPastState_items1.json')):
                file = open('check_item1.txt', 'w')
                file.write("This is for scrapy to check item accuracy")
                file.close()
            if (os.path.isfile(path + 'ReservoirState_items2.json') or os.path.isfile(path + 'ReservoirPastState_items2.json')):
                file = open('check_item2.txt', 'w')
                file.write("This is for scrapy to check item accuracy")
                file.close()
    def process_item(self, item, spider):
        path = os.path.abspath("dir.txt").replace("dir.txt", "")
        item = self.check_item(item)
        if (os.path.isfile(path + 'check_item1.txt') and os.path.isfile(path + 'check_item2.txt')):
            print('File exist!!')
        else:
            self.exporter.export_item(item)
        return item

    # Earlier MySQL-based variant of this pipeline, kept for reference:
    # def open_spider(self, spider):
    #     self.conn = MySQLdb.connect(host='localhost',
    #                                 user='******',
    #                                 passwd='demo1234',
    #                                 db='demo',
    #                                 charset='utf8')
    #     self.cursor = self.conn.cursor()
    # def check_item(self, item):
    #     for key, val in item.items():
    #         if re.match('^M', key) or re.match('^Percentage', key):  # pick out keys starting with M
    #             if val and not re.match('^\d+?\.\d+?$', val):
    #                 item[key] = None
    #     return item
    # def re_run(self, item):
    #     count = 0
    #     for key, val in item.items():
    #         if (count == 3):
    #             item[key] = '---'
    #             print('error in the website server')
    #             break
    #         if re.match('^TimeStamp', key) or re.match('^R_ID', key):
    #             if (val == '--'):
    #                 count = count + 1
    #                 os.system("scrapy crawl damwra -o dam2.json")
    #                 break
    #     return item
    # def process_item(self, item, spider):
    #     item = self.check_item(item)
    #     # self.re_run(item)
    #     self.cursor.execute("""INSERT INTO ReservoirState (R_ID, Reservoir, TimeStamp, WaterLevel, EffectiveWaterStorageCapacity, PercentageUsedInReservoirCapacity, MaximumCapacity) VALUES (%s, %s, %s, %s, %s, %s, %s)""", (
    #         item['R_ID'],
    #         item['Reservoir'],
    #         item['TimeStamp'],
    #         item['WaterLevel'],
    #         item['EffectiveWaterStorageCapacity'],
    #         item['PercentageUsedInReservoirCapacity'],
    #         item['MaximumCapacity']))
    #     self.conn.commit()
    #     return item
    # def close_spider(self, spider):
    #     # self.conn.close()
    #     pass
class ProcurementscrapePipeline(object):

    def open_spider(self, spider):
        self.startTime = datetime.datetime.now()
        nowStr = self.startTime.strftime("%Y-%m-%d %H:%M")
        scrapeDir = ""
        if spider.scrapeMode == "FIXERRORS":
            scrapeDir = spider.fixpath + "_FIX"
        else:
            if spider.scrapeMode == "FULL":
                if not os.path.exists("FullScrapes"):
                    os.makedirs("FullScrapes")
                typeDir = "FullScrapes/"
            elif spider.scrapeMode == "INCREMENTAL":
                if not os.path.exists("IncrementalScrapes"):
                    os.makedirs("IncrementalScrapes")
                typeDir = "IncrementalScrapes/"
            else:
                if not os.path.exists("TestScrapes"):
                    os.makedirs("TestScrapes")
                typeDir = "TestScrapes/"
            scrapeDir = typeDir + nowStr
        if not os.path.exists(scrapeDir):
            os.makedirs(scrapeDir)
        spider.setScrapePath(scrapeDir)
        self.tendersfile = open(scrapeDir + "/" + "tenders.json", 'wb')
        self.procuringEntitiesfile = open(scrapeDir + "/" + 'organisations.json', 'wb')
        self.tenderBiddersFile = open(scrapeDir + "/" + 'tenderBidders.json', 'wb')
        self.tenderAgreementsFile = open(scrapeDir + "/" + 'tenderAgreements.json', 'wb')
        self.tenderDocumentationFile = open(scrapeDir + "/" + 'tenderDocumentation.json', 'wb')
        self.tenderCPVCodeFile = open(scrapeDir + "/" + 'tenderCPVCode.json', 'wb')
        self.whiteListFile = open(scrapeDir + "/" + 'whiteList.json', 'wb')
        self.blackListFile = open(scrapeDir + "/" + 'blackList.json', 'wb')
        self.complaintFile = open(scrapeDir + "/" + 'complaints.json', 'wb')
        self.bidderResultFile = open(scrapeDir + "/" + 'bidderResult.json', 'wb')
        self.tenderExporter = JsonLinesItemExporter(self.tendersfile)
        self.procurerExporter = JsonLinesItemExporter(self.procuringEntitiesfile)
        self.biddersExporter = JsonLinesItemExporter(self.tenderBiddersFile)
        self.agreementExporter = JsonLinesItemExporter(self.tenderAgreementsFile)
        self.documentationExporter = JsonLinesItemExporter(self.tenderDocumentationFile)
        self.cpvCodeExporter = JsonLinesItemExporter(self.tenderCPVCodeFile)
        self.whiteListExporter = JsonLinesItemExporter(self.whiteListFile)
        self.blackListExporter = JsonLinesItemExporter(self.blackListFile)
        self.complaintExporter = JsonLinesItemExporter(self.complaintFile)
        self.bidderResultExporter = JsonLinesItemExporter(self.bidderResultFile)
        self.tenderExporter.start_exporting()
        self.procurerExporter.start_exporting()
        self.biddersExporter.start_exporting()
        self.agreementExporter.start_exporting()
        self.documentationExporter.start_exporting()
        self.cpvCodeExporter.start_exporting()
        self.whiteListExporter.start_exporting()
        self.blackListExporter.start_exporting()
        self.complaintExporter.start_exporting()
        self.bidderResultExporter.start_exporting()
        self.infoFile = open(scrapeDir + "/" + 'scrapeInfo.txt', 'wb')
        self.infoFile.write("StartTime: " + nowStr + "\n")

    def process_item(self, item, spider):
        itemClassName = item.__class__.__name__
        if (itemClassName == "Tender"):
            self.tenderExporter.export_item(item)
        elif (itemClassName == "Organisation"):
            self.procurerExporter.export_item(item)
        elif (itemClassName == "TenderBidder"):
            self.biddersExporter.export_item(item)
        elif (itemClassName == "TenderAgreement"):
            self.agreementExporter.export_item(item)
        elif (itemClassName == "TenderDocument"):
            self.documentationExporter.export_item(item)
        elif (itemClassName == "CPVCode"):
            self.cpvCodeExporter.export_item(item)
        elif (itemClassName == "WhiteListObject"):
            self.whiteListExporter.export_item(item)
        elif (itemClassName == "BlackListObject"):
            self.blackListExporter.export_item(item)
        elif (itemClassName == "Complaint"):
            self.complaintExporter.export_item(item)
        elif (itemClassName == "BidderResult"):
            self.bidderResultExporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.endTime = datetime.datetime.now()
        endTimeStr = self.endTime.strftime("%Y-%m-%d %H:%M")
        self.infoFile.write("End Time: " + endTimeStr + "\n")
        timeTaken = self.endTime - self.startTime
        minutes = int(timeTaken.seconds / 60)
        seconds = timeTaken.seconds % 60
        self.infoFile.write("Time Taken: Days: %d Minutes: %d Seconds %d \n" % (timeTaken.days, minutes, seconds))
        self.infoFile.write("Tenders scraped: %d \n" % (spider.tenderCount))
        self.infoFile.write("Orgs scraped: %d \n" % (spider.orgCount))
        self.infoFile.write("bidders scraped: %d \n" % (spider.bidderCount))
        self.infoFile.write("agreements scraped: %d \n" % (spider.agreementCount))
        self.infoFile.write("documents scraped: %d \n" % (spider.docCount))
        print spider.firstTender
        self.infoFile.write("firstTenderURL: %d" % int(spider.firstTender))
        self.infoFile.close()
        self.tenderExporter.finish_exporting()
        self.procurerExporter.finish_exporting()
        self.biddersExporter.finish_exporting()
        self.agreementExporter.finish_exporting()
        self.documentationExporter.finish_exporting()
        self.cpvCodeExporter.finish_exporting()  # added: this exporter was never finished
        self.whiteListExporter.finish_exporting()
        self.blackListExporter.finish_exporting()
        self.complaintExporter.finish_exporting()
        self.bidderResultExporter.finish_exporting()
        self.tendersfile.close()
        self.procuringEntitiesfile.close()
        self.tenderBiddersFile.close()
        self.tenderAgreementsFile.close()
        self.tenderDocumentationFile.close()
        self.tenderCPVCodeFile.close()
        self.whiteListFile.close()
        self.blackListFile.close()
        self.complaintFile.close()
        self.bidderResultFile.close()
def _get_exporter(self, **kwargs): return JsonLinesItemExporter(self.output, **kwargs)
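The factory method above is typically the only piece that changes when swapping output formats. A rough sketch of how such a hook might be wired into a pipeline follows; the class name, file name, and the use of self.output are assumptions for illustration, since the enclosing class is not shown:

# hypothetical pipeline built around a pluggable _get_exporter() hook
from scrapy.contrib.exporter import JsonLinesItemExporter  # scrapy.exporters in newer Scrapy

class ExportPipeline(object):

    def open_spider(self, spider):
        # one output file per spider; subclasses can override _get_exporter()
        self.output = open('%s.jl' % spider.name, 'wb')
        self.exporter = self._get_exporter()
        self.exporter.start_exporting()

    def _get_exporter(self, **kwargs):
        return JsonLinesItemExporter(self.output, **kwargs)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.output.close()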
def spider_opened(self, spider):
    # write utf-8 file
    f = codecs.open('articles.json', 'w+', encoding='utf-8')
    self.files[spider] = f
    self.exporter = JsonLinesItemExporter(f, ensure_ascii=False)
    self.exporter.start_exporting()
def open_spider(self, spider):
    self.file = open('test.json', 'w+b')
    self.exporter = JsonLinesItemExporter(self.file)
def open_spider(self, spider):
    print "===open_spider==="
    file = open('data/%s_products_%s.json' % (spider.name, datetime.now().strftime("%Y%m%d%H%M%S")), 'w+b')
    self.files[spider] = file
    self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
    self.exporter.start_exporting()