Example #1
class NordstromPipeline(object):

  def __init__(self):
    self.files = {}
    self.ids_seen = set()

  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def process_item(self, item, spider):
    if item['product_item_num'] in self.ids_seen:
      raise DropItem("Duplicate item found: %s" % item)
    else:
      self.ids_seen.add(item['product_item_num'])
      self.exporter.export_item(item)
      return item

  def spider_opened(self, spider):
    out_file = open('%s_products.jl' % spider.name, 'w+b')
    self.files[spider] = out_file
    self.exporter = JsonLinesItemExporter(out_file)
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    out_file = self.files.pop(spider)
    out_file.close()
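
None of the snippets in this listing include their imports. As a point of reference, a self-contained sketch of the same pattern as Example #1 might look like the following; it assumes current Scrapy import paths (scrapy.exporters, scrapy.exceptions; older releases exposed the exporter under scrapy.contrib.exporter), and the class name and the product_item_num field are placeholders taken from Example #1.

from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import JsonLinesItemExporter


class DedupJsonLinesPipeline(object):
    """Write each item as one JSON line and drop duplicates (sketch only)."""

    def __init__(self):
        self.files = {}
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # JsonLinesItemExporter expects a file opened in binary mode
        out_file = open('%s_products.jl' % spider.name, 'wb')
        self.files[spider] = out_file
        self.exporter = JsonLinesItemExporter(out_file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        # 'product_item_num' is the placeholder field used in Example #1
        if item['product_item_num'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['product_item_num'])
        self.exporter.export_item(item)
        return item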
Example #2
class FeedWriterPipeline(object):
    def __init__(self):
        log.msg('FeedWriterPipeline.__init__()')
        self.file = None
        self.item_exporter = None
        self.count = 0

    def open_spider(self, spider):
        if FeedSpider.is_feed_op(spider):
            spider.make_sure_path_exists(spider.get_output_dir_path())
            file_name = spider.get_feed_output_file_path()
            self.file = open(file_name, 'a')
            self.item_exporter = JsonLinesItemExporter(self.file)
            log.msg('FeedWriterPipeline, opened file %s to append.' % file_name)

    def process_item(self, item, spider):
        if FeedSpider.is_feed_op(spider) and isinstance(item, FeedItem):
            self.item_exporter.export_item(item)
            self.count += 1
            spider.check_max_limit(self.count)
            raise DropItem('Save item success')
        else:
            return item

    def close_spider(self, spider):
        if FeedSpider.is_feed_op(spider):
            self.file.write('Parsed %i feed items.%s' % (self.count, os.linesep))
            self.file.close()
            log.msg('closed file, appended %i items.' % self.count)
Example #3
class PerispiderPipeline(object):
	def open_spider(self, spider):
		name = "%s.json" % spider.name
		self.file = open(name, 'w')
		self.exporter = JsonLinesItemExporter(self.file)
		self.exporter.start_exporting()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Example #4
 def get_exporter(self, item):
     exporter = None
     if item.__class__ in self.exporters:
         exporter = self.exporters[item.__class__]
     else:
         if item.__class__ == items.unused_genotype_data:
             exporter = JsonLinesItemExporter(open(_class_to_file(item.__class__), 'w+b'))
         else:
             exporter = CsvItemExporter(open(_class_to_file(item.__class__), 'w+b'))
         self.exporters[item.__class__] = exporter
         exporter.start_exporting()
     return exporter
Example #5
class JsonLinesItemPipeline(object):

	def open_spider(self, spider):
		self.file = open('test.json', 'w+b')
		self.exporter = JsonLinesItemExporter(self.file)

	def close_spider(self, spider):
		self.file.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Example #6
 def spider_opened(self, spider):
     if not os.path.exists('./json/'):
         os.makedirs('./json/')
     if isinstance(spider, MSPCrawler):
         MSPFile = open('json/msps.json', 'w+b')
         self.files['msps'] = MSPFile
         self.MSPExporter = JsonLinesItemExporter(MSPFile)
         self.MSPExporter.start_exporting()
     elif isinstance(spider, VoteCrawler):
         VoteFile = open('json/votes-' + spider.mspid + '.json', 'w+b')
         self.files['votes'] = VoteFile
         self.VoteExporter = JsonLinesItemExporter(VoteFile)
         self.VoteExporter.start_exporting()
Example #7
class MoviesPipeline(object):
    
    def __init__(self):
        self.field_to_export = []
        self.file = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # signals start of export

        print "Spider opened...\nPreparing to crawl..."

        self.json_exporter = JsonLinesItemExporter(open('movies.json', 'wb'))
        self.json_exporter.start_exporting()

        # Since the charts frequently change, we need to deal with differences
        # in the cached data and current data. 
        # For now, we'll just truncate the table when the spider opens
        # and dump everything in.

        cursor = connection.cursor()

        sql = 'truncate table %s' % MYSQL_TABLE

        try:
            cursor.execute(sql)
            connection.commit()
            print "*** Truncated %s Table ***" % MYSQL_TABLE
        except Exception as e:
            print "Error: %s" % e
            connection.rollback()

    def process_item(self, item, spider):
        # store the item in the database
        insert_database(item)

        # Write to JSON file
        self.json_exporter.export_item(item)

        return item

    def spider_closed(self, spider):
        # signal end of export
        self.json_exporter.finish_exporting()
Example #8
 def open_spider(self, spider):
     if FeedSpider.is_feed_op(spider):
         spider.make_sure_path_exists(spider.get_output_dir_path())
         file_name = spider.get_feed_output_file_path()
         self.file = open(file_name, 'a')
         self.item_exporter = JsonLinesItemExporter(self.file)
         log.msg('FeedWriterPipeline, opened file %s to append.' % file_name)
Example #9
class FmlPipeline(object):
	'''
	def __init__(self):
		self.file = open('data2.json', 'w')
		self.exporter = JsonLinesItemExporter(self.file)
		self.exporter.start_exporting()
	'''
	def open_spider(self, spider):
		name = "%s.json" % spider.name
		self.file = open(name, 'w')
		self.exporter = JsonLinesItemExporter(self.file)
		self.exporter.start_exporting()
	
	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Example #10
 def spider_opened(self, spider):
     file = open('%s/%s/%s.json'% (settings.DATA_DIR,
                                   spider.name,
                                   datetime.date.today().isoformat()),
                 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file)
     self.exporter.start_exporting()
Example #11
 def process_spider_output(self, response, result, spider):
     items = []
     for r in result:
         if isinstance(r, Item):
             items.append(r)
         yield r
     cca = response2cca(response, base64=True)
     cca['features'] = {'items': items}
     cca_item = self.create_item(cca)
     cca_path = self.get_cca_path(spider)
     if cca_path is None:
         yield cca_item
     else:
         exporter = self.exporters_by_path.get(cca_path)
         if exporter is None:
             exporter = JsonLinesItemExporter(open(cca_path, 'a+'))
             self.exporters_by_path[cca_path] = exporter
         exporter.export_item(cca_item)
Example #12
    def process_item(self, item, spider):
        """
        Writes the item to output
        """

        # create the output file for a new class of item per spider
        settings = spider.crawler.settings
        if item.__class__ not in self.xporters[spider.name]:
            filename = '%s.json' % item.export_filename
            dirpath = path.join(settings.get('IO_PATH', 'io'), settings['DATA_SET'])
            _mkdir_p(dirpath)
            file_h = open(path.join(dirpath, filename), 'w')
            xporter = JsonLinesItemExporter(file=file_h)
            xporter.start_exporting()
            self.xporters[spider.name][item.__class__] = (file_h, xporter)

        xporter = self.xporters[spider.name][item.__class__][1]
        xporter.export_item(item)
        return item
Example #13
class AdbPipeline(object):
    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        self.seen = set([])

    def spider_opened(self, spider):
        file = open('%s/%s/%s.json'% (settings.DATA_DIR,
                                      spider.name,
                                      datetime.date.today().isoformat()),
                    'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        if self.seen_before(item):
            raise DropItem
        self.exporter.export_item(item)
        return item

    def seen_before(self, item):
        if item['product'] in self.seen:
            return True
        else:
            self.seen.add(item['product'])
            return False
Example #14
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        json_path = os.path.join('data', '%s.json' % spider.name)
        file = open(json_path, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        log.msg("process_item", level=log.DEBUG)
        return item
Example #15
class JsonLinesExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out_file = open('%s_pics.json' % spider.name, 'a')
        self.files[spider] = out_file
        self.exporter = JsonLinesItemExporter(out_file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        out_file = self.files.pop(spider)
        out_file.close()

    def process_item(self, item, spider):
        if item.get("image_urls"):
            self.exporter.export_item(item)
        return item
Example #16
class ValidatorPipeline(object):
    """ Exports items in a temporary JSON file.
        Unnecessary fields are excluded. """

    def __init__(self):
        self.exporter = None
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        fname = open(_get_spider_output_filename(spider), 'wb')
        self.files[spider] = fname
        self.exporter = JsonLinesItemExporter(fname)
        self.exporter.fields_to_export = _get_fields_to_check(ProductItem)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #17
class PajandanPipeline(object):
    def __init__(self):
        self.files = {} # may be more than one spider

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # write utf-8 file
        f = codecs.open('articles.json', 'w+', encoding='utf-8')
        self.files[spider] = f
        self.exporter = JsonLinesItemExporter(f, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #18
class ScrippaPipeline(object):
    
    def __init__(self):
        #self.files = {}
        #self.log("MMMMMMMMMMMMMMMMMMMMMMMMMMAAAAAAAAAAAATE", level=log.WARNING)
        print "DDDDDDDDDDDDDDDDDDDDDDDDDUUUUUUUUUUUUUUUUUUUUUUUUUUUDE"
        #file = open('1_reports.json', 'w+b')
        
    
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('2_reports.json', 'w+b')
        #self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        #file = self.files.pop(spider)
        #file.close()

    def process_item(self, item, spider):
        print "ScrippaPipeline: exporting item ============================== "
        self.exporter.export_item(item)
        return item
Example #19
class TibiaPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.json' % (spider.name + datetime.datetime.now().isoformat()), 'a+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #20
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_Joke.txt' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #21
class PlayerPipeline(object):
    def __init__(self, *args, **kwargs):
        self.player_info_file = None
        self.player_info_exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.player_info_file = open("%s/output/player_info.json" % settings.PROJECT_ROOT, 'wb')
        self.player_info_exporter = JsonLinesItemExporter(self.player_info_file)
        self.player_info_exporter.start_exporting()

    def spider_closed(self, spider):
        self.player_info_exporter.finish_exporting()
        self.player_info_file.close()

    def process_item(self, item, spider):
        if isinstance(item, PlayerInfoItem):
            self.player_info_exporter.export_item(item)
        return item
Example #22
    def spider_opened(self, spider):
        self.csv_exporter = CsvItemExporter(open(spider.name+".csv", "w"),
                                            fields_to_export=self.fields_to_export,
                                            quoting=csv.QUOTE_ALL)
        self.json_exporter = TffdatapullJsonItemExporter(open(spider.name+".json", "w"),
                                                      fields_to_export=self.fields_to_export,
                                                      sort_keys=True, indent=4)
        self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name+".linejson", "w"),
                                                        fields_to_export=self.fields_to_export)
 
        self.xml_exporter = TffdatapullXmlItemExporter(open(spider.name+".xml", "w"),
                                                    fields_to_export=self.fields_to_export,
                                                    root_element="match_code", item_element="match_code")
        # Make a quick copy of the list
        self.csv_exporter.start_exporting()
        self.json_exporter.start_exporting()
        self.jsonlines_exporter.start_exporting()
        self.xml_exporter.start_exporting()
Example #23
class TffdatapullPipeline(object):
    def __init__(self):
        self.fields_to_export = [
            'match_code',
            'match_href',
            'home_team_id',
            'home_team_name',
            'score',
            'guest_team_id',
            'guest_team_name',
            'date_of_match',
            'time_of_match',
            'stadium_name',
            'organization_name'
        ]
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
 
    def spider_opened(self, spider):
        self.csv_exporter = CsvItemExporter(open(spider.name+".csv", "w"),
                                            fields_to_export=self.fields_to_export,
                                            quoting=csv.QUOTE_ALL)
        self.json_exporter = TffdatapullJsonItemExporter(open(spider.name+".json", "w"),
                                                      fields_to_export=self.fields_to_export,
                                                      sort_keys=True, indent=4)
        self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name+".linejson", "w"),
                                                        fields_to_export=self.fields_to_export)
 
        self.xml_exporter = TffdatapullXmlItemExporter(open(spider.name+".xml", "w"),
                                                    fields_to_export=self.fields_to_export,
                                                    root_element="match_code", item_element="match_code")
        # Make a quick copy of the list
        self.csv_exporter.start_exporting()
        self.json_exporter.start_exporting()
        self.jsonlines_exporter.start_exporting()
        self.xml_exporter.start_exporting()
 
    def process_item(self, item, spider):
        self.csv_exporter.export_item(item)
        self.json_exporter.export_item(item)
        self.jsonlines_exporter.export_item(item)
        self.xml_exporter.export_item(item)
        return item
 
    def spider_closed(self, spider):
        self.csv_exporter.finish_exporting()
        self.json_exporter.finish_exporting()
        self.jsonlines_exporter.finish_exporting()
        self.xml_exporter.finish_exporting()
Example #24
    def spider_opened(self, spider):
        self.csv_exporter = CsvItemExporter(open(spider.name+".csv", "w"),
                                            fields_to_export=self.fields_to_export, quoting=csv.QUOTE_ALL)
        self.json_exporter = MTQInfraJsonItemExporter(open(spider.name+".json", "w"),
                                                      fields_to_export=self.fields_to_export,
                                                      sort_keys=True, indent=4)
        self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name+".linejson", "w"),
                                                        fields_to_export=self.fields_to_export)

        self.xml_exporter = MTQInfraXmlItemExporter(open(spider.name+".xml", "w"),
                                                    fields_to_export=self.fields_to_export,
                                                    root_element="structures", item_element="structure")
        # Make a quick copy of the list
        kml_fields = self.fields_to_export[:]
        kml_fields.append('fusion_marker')
        self.kml_exporter = MTQInfraKmlItemExporter(spider.name+".kml", fields_to_export=kml_fields)
        self.csv_exporter.start_exporting()
        self.json_exporter.start_exporting()
        self.jsonlines_exporter.start_exporting()
        self.xml_exporter.start_exporting()
        self.kml_exporter.start_exporting()
Example #25
class JsonWriterPipeline2(object):
  def __init__(self):
    self.fields_to_export = [
      'title',
      'link'
    ]
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
  
  def spider_opened(self,spider):
    self.jsonlines_exporter = JsonLinesItemExporter(open(spider.name+".linejson", "w"), fields_to_export=self.fields_to_export)
    self.jsonlines_exporter.start_exporting()

  def process_item(self,item,spider):
    self.jsonlines_exporter.export_item(item)
    return item
  def spider_closed(self, spider):
    self.jsonlines_exporter.finish_exporting()
Example #26
    def spider_opened(self, spider):
        # signals start of export

        print "Spider opened...\nPreparing to crawl..."

        self.json_exporter = JsonLinesItemExporter(open('movies.json', 'wb'))
        self.json_exporter.start_exporting()

        # Since the charts frequently change, we need to deal with differences
        # in the cached data and current data. 
        # For now, we'll just truncate the table when the spider opens
        # and dump everything in.

        cursor = connection.cursor()

        sql = 'truncate table %s' % MYSQL_TABLE

        try:
            cursor.execute(sql)
            connection.commit()
            print "*** Truncated %s Table ***" % MYSQL_TABLE
        except Exception as e:
            print "Error: %s" % e
            connection.rollback()
Example #27
class JsonExportExternalIdPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('scraped/%s.json' % spider.external_id, 'w')
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #28
class JsonLinesExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        self.first_item = True

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #29
class RecipescraperPipeline(object):
  def __init__(self):
    self.file=open('recipes.jl','wb')

  @classmethod
  def from_crawler(cls,crawler):
    pipeline=cls()
    crawler.signals.connect(pipeline.spider_opened,signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed,signals.spider_closed)
    return pipeline

  def spider_opened(self,spider):
    self.exporter=JsonLinesItemExporter(self.file)
    self.exporter.start_exporting()

  def spider_closed(self,spider):
    self.exporter.finish_exporting()
   # file=self.files.pop(spider)
    self.file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
Example #30
class JsonWithEncodingPipeline(NumbeoPipeline):

    def __init__(self):
        super(JsonWithEncodingPipeline, self).__init__()
        self.suffix = 'json'
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('{0}_{1}.{2}'.format(spider.name, self.proj, self.suffix), 'w+b')
        self.files[spider] = file
        #self.exporter = JsonItemExporter(file)
        self.exporter = JsonLinesItemExporter(file)        
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #31
    def open_spider(self, spider):

        self.startTime = datetime.datetime.now()
        nowStr = self.startTime.strftime("%Y-%m-%d %H:%M")
        scrapeDir = ""
        if spider.scrapeMode == "FIXERRORS":
            scrapeDir = spider.fixpath + "_FIX"
        else:
            if spider.scrapeMode == "FULL":
                if not os.path.exists("FullScrapes"):
                    os.makedirs("FullScrapes")
                typeDir = "FullScrapes/"
            elif spider.scrapeMode == "INCREMENTAL":
                if not os.path.exists("IncrementalScrapes"):
                    os.makedirs("IncrementalScrapes")
                typeDir = "IncrementalScrapes/"
            else:
                if not os.path.exists("TestScrapes"):
                    os.makedirs("TestScrapes")
                typeDir = "TestScrapes/"
            scrapeDir = typeDir + nowStr

        if not os.path.exists(scrapeDir):
            os.makedirs(scrapeDir)

        spider.setScrapePath(scrapeDir)
        self.tendersfile = open(scrapeDir + "/" + "tenders.json", 'wb')
        self.procuringEntitiesfile = open(
            scrapeDir + "/" + 'organisations.json', 'wb')
        self.tenderBiddersFile = open(scrapeDir + "/" + 'tenderBidders.json',
                                      'wb')
        self.tenderAgreementsFile = open(
            scrapeDir + "/" + 'tenderAgreements.json', 'wb')
        self.tenderDocumentationFile = open(
            scrapeDir + "/" + 'tenderDocumentation.json', 'wb')
        self.tenderCPVCodeFile = open(scrapeDir + "/" + 'tenderCPVCode.json',
                                      'wb')
        self.whiteListFile = open(scrapeDir + "/" + 'whiteList.json', 'wb')
        self.blackListFile = open(scrapeDir + "/" + 'blackList.json', 'wb')
        self.complaintFile = open(scrapeDir + "/" + 'complaints.json', 'wb')
        self.bidderResultFile = open(scrapeDir + "/" + 'bidderResult.json',
                                     'wb')

        self.tenderExporter = JsonLinesItemExporter(self.tendersfile)
        self.procurerExporter = JsonLinesItemExporter(
            self.procuringEntitiesfile)
        self.biddersExporter = JsonLinesItemExporter(self.tenderBiddersFile)
        self.agreementExporter = JsonLinesItemExporter(
            self.tenderAgreementsFile)
        self.documentationExporter = JsonLinesItemExporter(
            self.tenderDocumentationFile)
        self.cpvCodeExporter = JsonLinesItemExporter(self.tenderCPVCodeFile)
        self.whiteListExporter = JsonLinesItemExporter(self.whiteListFile)
        self.blackListExporter = JsonLinesItemExporter(self.blackListFile)
        self.complaintExporter = JsonLinesItemExporter(self.complaintFile)
        self.bidderResultExporter = JsonLinesItemExporter(
            self.bidderResultFile)

        self.tenderExporter.start_exporting()
        self.procurerExporter.start_exporting()
        self.biddersExporter.start_exporting()
        self.agreementExporter.start_exporting()
        self.documentationExporter.start_exporting()
        self.cpvCodeExporter.start_exporting()
        self.whiteListExporter.start_exporting()
        self.blackListExporter.start_exporting()
        self.complaintExporter.start_exporting()
        self.bidderResultExporter.start_exporting()

        self.infoFile = open(scrapeDir + "/" + 'scrapeInfo.txt', 'wb')
        self.infoFile.write("StartTime: " + nowStr + "\n")
Example #32
 def open_spider(self, spider):
     self.exporter = JsonLinesItemExporter(self.file)
     self.exporter.start_exporting()
Example #33
 def spider_opened(self, spider):
     fname = open(_get_spider_output_filename(spider), 'wb')
     self.files[spider] = fname
     self.exporter = JsonLinesItemExporter(fname)
     self.exporter.fields_to_export = _get_fields_to_check(SiteProductItem)
     self.exporter.start_exporting()
Example #34
 def spider_opened(self, spider):
     file = open('%s_items.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file)
     self.exporter.start_exporting()
Example #35
 def spider_opened(self, spider):
     file = open('{0}_{1}.{2}'.format(spider.name, self.proj, self.suffix), 'w+b')
     self.files[spider] = file
     #self.exporter = JsonItemExporter(file)
     self.exporter = JsonLinesItemExporter(file)        
     self.exporter.start_exporting()
Example #36
 def open_spider(self, spider):
     self.file = open('data/' + spider.name+'.jsonlines', 'a')
     self.exporter = JsonLinesItemExporter(self.file) 
     self.exporter.start_exporting()
Example #37
class DamPipeline(object):
    def check_item(self,item):
        if (item['MaximumCapacity'] != "--"):
            a = datetime.strptime(item['TimeStamp'], '%Y-%m-%d')
            b = datetime.strptime("2017-01-01", '%Y-%m-%d')
            if(a<b):
                if(float(item['EffectiveWaterStorageCapacity']) > (float(item['MaximumCapacity'])*1.1)): # filter with a 1.1x factor, to allow for sedimentation
                    item['EffectiveWaterStorageCapacity'] = "-888"
            else:
                if(float(item['EffectiveWaterStorageCapacity']) > float(item['MaximumCapacity'])): # after 2017, exclude whenever it exceeds the maximum capacity
                    item['EffectiveWaterStorageCapacity'] = "-888"  
        else:
            if item['EffectiveWaterStorageCapacity'] and not re.match('^\d+?\.\d+?$', item['EffectiveWaterStorageCapacity']): #check format
                item['EffectiveWaterStorageCapacity'] = "-999"
            else:
                if(float(item['EffectiveWaterStorageCapacity']) > 100000):
                    item['EffectiveWaterStorageCapacity'] = "-888"
        
        for key, val in item.items():
            if( re.match('^Max', key) or re.match('^Percentage', key) or re.match('^Water', key) ):   #select key from item   or re.match('^Effective', key)
                if val and not re.match('^\d+?\.\d+?$', val): #check format
                    item[key] = "-999"  # can use None or NULL
            if re.match('^TimeStamp',key):
                if(val == "--\r\n      " or val == "--"):
                    raise DropItem("Missing value in %s" % item)
        return item
    
    def __init__(self):
        path = os.path.abspath("dir.txt").replace("dir.txt","") # To find the path
        print("Download file is set to be this path: ",path,"\nIf it's not right please make sure dir.txt file is at the same location of the execute file.py")
        
        if(os.path.isfile(path + "ReservoirState_items1.json") and os.path.isfile(path + "ReservoirState_items2.json")):
            print('Err: ReservoirState_items1.json & ReservoirState_items2.json are already exist.')
        elif(os.path.isfile(path +'ReservoirPastState_items1.json') and os.path.isfile(path +'ReservoirPastState_items2.json')):
            print('Err: ReservoirPastState_items1.json & ReservoirPastState_items2.json are already exist.')
        else:
            dispatcher.connect(self.spider_opened, signals.spider_opened)
            dispatcher.connect(self.spider_closed, signals.spider_closed)
            self.files = {}
            
    def spider_opened(self, spider):
        path = os.path.abspath("dir.txt").replace("dir.txt","")
        # For checking usage
        if(os.path.isfile(path +'check_item1.txt')):
            if(os.path.isfile(path +'check_item2.txt')):
                print('check_item2 is already exist.')
            else:
                file = open('%s_items2.json' % spider.name, 'w+b')
                self.files[spider] = file
                self.exporter = JsonLinesItemExporter(file)
                self.exporter.start_exporting()
                print('check_item1 is already exist.')
        else:
            file = open('%s_items1.json' % spider.name, 'w+b')
            self.files[spider] = file
            self.exporter = JsonLinesItemExporter(file)
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        path = os.path.abspath("dir.txt").replace("dir.txt","")
        print("Download file is set to be this path: ",path,"\nIf it's not right please make sure dir.txt file is at the same location of the execute file.py")
        if(os.path.isfile(path +'check_item1.txt') and os.path.isfile(path +'check_item2.txt')):
            print('check_item1 and check_item2 are already exist.',"\nNothing change in DB.")
        else:
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()
            
        if(os.path.isfile(path +'ReservoirState_items1.json') or os.path.isfile(path +'ReservoirPastState_items1.json')):
            file = open('check_item1.txt', 'w')
            file.write("This is for scrapy to check item accuracy")
            file.close()
        
        if(os.path.isfile(path +'ReservoirState_items2.json') or os.path.isfile(path +'ReservoirPastState_items2.json')):
            file = open('check_item2.txt', 'w')
            file.write("This is for scrapy to check item accuracy")
            file.close()

    def process_item(self, item, spider):
        path = os.path.abspath("dir.txt").replace("dir.txt","")
        item = self.check_item(item)
        if(os.path.isfile(path +'check_item1.txt') and os.path.isfile(path +'check_item2.txt')):
            print('File exist!!')
        else:
            self.exporter.export_item(item)
            return item
        
    
    
    
    
#     def open_spider(self, spider):
#         self.conn = MySQLdb.connect(host='localhost',
#                                     user='******',
#                                     passwd='demo1234',
#                                     db='demo',
#                                     charset='utf8')
#         self.cursor = self.conn.cursor()
    
#     def check_item(self, item):
#         for key, val in item.items():
#             if re.match('^M', key) or re.match('^Percentage', key):   # pick out the keys that start with M
#                 if val and not re.match('^\d+?\.\d+?$', val): #???
#                     item[key] = None
#         return item
        
#     def re_run(self, item):
#         count = 0
#         for key, val in item.items():
#             if(count==3):
#                 item[key] = '---'
#                 print('error in the website server')
#                 break
            
#             if re.match('^TimeStamp', key) or re.match('^R_ID', key):   # pick out the TimeStamp / R_ID keys
#                 if(val == '--'): 
#                     count = count + 1
# #                    process = CrawlerProcess({
# #                    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
# #                    })
# #                    process.crawl(DamwraSpider)
# #                    process.start()
                    
#                     os.system("scrapy crawl damwra -o dam2.json")
#                     break
#         return item
        
#     def process_item(self, item, spider):
#         item = self.check_item(item)
#         #self.re_run(item)
#         self.cursor.execute("""INSERT INTO ReservoirState (R_ID, Reservoir, TimeStamp, WaterLevel, EffectiveWaterStorageCapacity, PercentageUsedInReservoirCapacity, MaximumCapacity) VALUES (%s, %s, %s, %s, %s, %s, %s)""",(
#             item['R_ID'],
#             item['Reservoir'],
#             item['TimeStamp'],
#             item['WaterLevel'],
#             item['EffectiveWaterStorageCapacity'],
#             item['PercentageUsedInReservoirCapacity'],
#             item['MaximumCapacity']
#         ))  
#         self.conn.commit()
#         return item
        
    # def close_spider(self, spider):
    #     #self.conn.close() 
    #     pass
Example #38
class ProcurementscrapePipeline(object):
    def open_spider(self, spider):

        self.startTime = datetime.datetime.now()
        nowStr = self.startTime.strftime("%Y-%m-%d %H:%M")
        scrapeDir = ""
        if spider.scrapeMode == "FIXERRORS":
            scrapeDir = spider.fixpath + "_FIX"
        else:
            if spider.scrapeMode == "FULL":
                if not os.path.exists("FullScrapes"):
                    os.makedirs("FullScrapes")
                typeDir = "FullScrapes/"
            elif spider.scrapeMode == "INCREMENTAL":
                if not os.path.exists("IncrementalScrapes"):
                    os.makedirs("IncrementalScrapes")
                typeDir = "IncrementalScrapes/"
            else:
                if not os.path.exists("TestScrapes"):
                    os.makedirs("TestScrapes")
                typeDir = "TestScrapes/"
            scrapeDir = typeDir + nowStr

        if not os.path.exists(scrapeDir):
            os.makedirs(scrapeDir)

        spider.setScrapePath(scrapeDir)
        self.tendersfile = open(scrapeDir + "/" + "tenders.json", 'wb')
        self.procuringEntitiesfile = open(
            scrapeDir + "/" + 'organisations.json', 'wb')
        self.tenderBiddersFile = open(scrapeDir + "/" + 'tenderBidders.json',
                                      'wb')
        self.tenderAgreementsFile = open(
            scrapeDir + "/" + 'tenderAgreements.json', 'wb')
        self.tenderDocumentationFile = open(
            scrapeDir + "/" + 'tenderDocumentation.json', 'wb')
        self.tenderCPVCodeFile = open(scrapeDir + "/" + 'tenderCPVCode.json',
                                      'wb')
        self.whiteListFile = open(scrapeDir + "/" + 'whiteList.json', 'wb')
        self.blackListFile = open(scrapeDir + "/" + 'blackList.json', 'wb')
        self.complaintFile = open(scrapeDir + "/" + 'complaints.json', 'wb')
        self.bidderResultFile = open(scrapeDir + "/" + 'bidderResult.json',
                                     'wb')

        self.tenderExporter = JsonLinesItemExporter(self.tendersfile)
        self.procurerExporter = JsonLinesItemExporter(
            self.procuringEntitiesfile)
        self.biddersExporter = JsonLinesItemExporter(self.tenderBiddersFile)
        self.agreementExporter = JsonLinesItemExporter(
            self.tenderAgreementsFile)
        self.documentationExporter = JsonLinesItemExporter(
            self.tenderDocumentationFile)
        self.cpvCodeExporter = JsonLinesItemExporter(self.tenderCPVCodeFile)
        self.whiteListExporter = JsonLinesItemExporter(self.whiteListFile)
        self.blackListExporter = JsonLinesItemExporter(self.blackListFile)
        self.complaintExporter = JsonLinesItemExporter(self.complaintFile)
        self.bidderResultExporter = JsonLinesItemExporter(
            self.bidderResultFile)

        self.tenderExporter.start_exporting()
        self.procurerExporter.start_exporting()
        self.biddersExporter.start_exporting()
        self.agreementExporter.start_exporting()
        self.documentationExporter.start_exporting()
        self.cpvCodeExporter.start_exporting()
        self.whiteListExporter.start_exporting()
        self.blackListExporter.start_exporting()
        self.complaintExporter.start_exporting()
        self.bidderResultExporter.start_exporting()

        self.infoFile = open(scrapeDir + "/" + 'scrapeInfo.txt', 'wb')
        self.infoFile.write("StartTime: " + nowStr + "\n")

    def process_item(self, item, spider):
        itemClassName = item.__class__.__name__
        if (itemClassName == "Tender"):
            self.tenderExporter.export_item(item)
        elif (itemClassName == "Organisation"):
            self.procurerExporter.export_item(item)
        elif (itemClassName == "TenderBidder"):
            self.biddersExporter.export_item(item)
        elif (itemClassName == "TenderAgreement"):
            self.agreementExporter.export_item(item)
        elif (itemClassName == "TenderDocument"):
            self.documentationExporter.export_item(item)
        elif (itemClassName == "CPVCode"):
            self.cpvCodeExporter.export_item(item)
        elif (itemClassName == "WhiteListObject"):
            self.whiteListExporter.export_item(item)
        elif (itemClassName == "BlackListObject"):
            self.blackListExporter.export_item(item)
        elif (itemClassName == "Complaint"):
            self.complaintExporter.export_item(item)
        elif (itemClassName == "BidderResult"):
            self.bidderResultExporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.endTime = datetime.datetime.now()
        endTimeStr = self.endTime.strftime("%Y-%m-%d %H:%M")
        self.infoFile.write("End Time: " + endTimeStr + "\n")
        timeTaken = self.endTime - self.startTime

        minutes = int(timeTaken.seconds / 60)
        seconds = timeTaken.seconds % 60
        self.infoFile.write(
            "Time Taken:    Days: %d    Minutes:    %d    Seconds    %d \n" %
            (timeTaken.days, minutes, seconds))
        self.infoFile.write("Tenders scraped: %d \n" % (spider.tenderCount))
        self.infoFile.write("Orgs scraped: %d \n" % (spider.orgCount))
        self.infoFile.write("bidders scraped: %d \n" % (spider.bidderCount))
        self.infoFile.write("agreements scraped: %d \n" %
                            (spider.agreementCount))
        self.infoFile.write("documents scraped: %d \n" % (spider.docCount))
        print spider.firstTender
        self.infoFile.write("firstTenderURL: %d" % int(spider.firstTender))
        self.infoFile.close()

        self.tenderExporter.finish_exporting()
        self.procurerExporter.finish_exporting()
        self.biddersExporter.finish_exporting()
        self.agreementExporter.finish_exporting()
        self.documentationExporter.finish_exporting()
        self.whiteListExporter.finish_exporting()
        self.blackListExporter.finish_exporting()
        self.complaintExporter.finish_exporting()
        self.bidderResultExporter.finish_exporting()

        self.tendersfile.close()
        self.procuringEntitiesfile.close()
        self.tenderBiddersFile.close()
        self.tenderAgreementsFile.close()
        self.tenderDocumentationFile.close()
        self.tenderCPVCodeFile.close()
        self.whiteListFile.close()
        self.blackListFile.close()
        self.complaintFile.close()
        self.bidderResultFile.close()
Example #39
 def _get_exporter(self, **kwargs):
     return JsonLinesItemExporter(self.output, **kwargs)
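
Example #39 only constructs the exporter; the object it returns is also usable on its own, outside of any pipeline, through start_exporting(), export_item() and finish_exporting(). A minimal standalone sketch (the file name and the literal dicts are placeholders; recent Scrapy versions accept plain dicts as items, older ones require Item instances):

from scrapy.exporters import JsonLinesItemExporter

# write two placeholder items as JSON lines; the exporter needs a binary file
with open('items.jl', 'wb') as fp:
    exporter = JsonLinesItemExporter(fp)
    exporter.start_exporting()
    exporter.export_item({'title': 'first example', 'link': 'http://example.com/1'})
    exporter.export_item({'title': 'second example', 'link': 'http://example.com/2'})
    exporter.finish_exporting()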
Example #40
 def spider_opened(self, spider):
     # write utf-8 file
     f = codecs.open('articles.json', 'w+', encoding='utf-8')
     self.files[spider] = f
     self.exporter = JsonLinesItemExporter(f, ensure_ascii=False)
     self.exporter.start_exporting()
Example #41
 def open_spider(self, spider):
     self.file = open('test.json', 'w+b')
     self.exporter = JsonLinesItemExporter(self.file)
Example #42
 def open_spider(self, spider):
     print "===open_spider==="
     file = open('data/%s_products_%s.json' % (spider.name, datetime.now().strftime("%Y%m%d%H%M%S")), 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
     self.exporter.start_exporting()
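
One thing none of these examples show is how a pipeline gets switched on. That happens in the project's settings.py through the ITEM_PIPELINES setting; a sketch, where the dotted path and the priority value 300 are placeholders for your own project layout:

# settings.py (sketch) -- module path and priority are placeholders
ITEM_PIPELINES = {
    'myproject.pipelines.JsonExportPipeline': 300,
}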