Code example #1
File: pipelines.py Project: 2khc/Python-Projects
class CsvExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_societies.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['name', 'president', 'email', 'url', 'facebook', 'membership', 'about',
                                          'date_established']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
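
A pipeline like the ones in this gallery only runs once it is registered in the project settings. A minimal sketch, assuming a hypothetical project package named myproject:

# settings.py (hypothetical package name "myproject")
ITEM_PIPELINES = {
    # the number (0-1000) sets this pipeline's order relative to others
    'myproject.pipelines.CsvExportPipeline': 300,
}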
Code example #2
File: pipelines.py Project: uuhako/myScrapy
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        import os
        # write the output next to this module, under ./output/
        filePath = os.path.dirname(__file__)
        outputDir = filePath + '/output/'
        if not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        file = open(outputDir + '%s_data.csv' % spider.name, 'w+b')
        self.files[spider] = file
        # originally a JsonItemExporter writing '%s_data.xml'; switched to CSV output
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #3
class BitcoinTalkCrawlerPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = \
            ['timestamp', 'category_id', 'topic_id', 'topic_title',
             'message_number', 'message_author', 'message_text']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #4
class catalogscraperPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open("%s_items.csv" % spider.name, "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ["title"]
        # additional fields, currently not exported: 'subject', 'description', 'creator',
        # 'source', 'published', 'rights', 'citation', 'url'
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #5
File: pipelines.py Project: yujiaxinlong/Crawlers
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, delimiter='\t')
        self.exporter.fields_to_export = ['userId', 'bookId', 'name', 'rating', 'relativeRating', 'booklistNum']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #6
class CsvExportPipeline(object):
    """
    app.pipelines.exporter_csv.CsvExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_csv = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file_csv
        self.exporter = CsvItemExporter(file_csv)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_csv = self.files.pop(spider)
        file_csv.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #7
class CSVWriterPipeline(object):
    
    def __init__(self, filename):
        self.filename = filename
        
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        filename = settings.get('OUTPUT_FILE')
        pipeline = cls(filename)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open(self.filename, 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.encoding = 'utf-8'
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
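
This variant reads its output path from the crawler settings. OUTPUT_FILE is the project's own key rather than a built-in Scrapy setting, so something like the following would be expected in settings.py:

# settings.py (custom key read by CSVWriterPipeline.from_crawler)
OUTPUT_FILE = 'scraped_items.csv'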
Code example #8
File: test_exporters.py Project: Rokicto/scrapy
 def assertExportResult(self, item, expected, **kwargs):
     fp = BytesIO()
     ie = CsvItemExporter(fp, **kwargs)
     ie.start_exporting()
     ie.export_item(item)
     ie.finish_exporting()
     self.assertCsvEqual(fp.getvalue(), expected)
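
The same BytesIO pattern works outside tests for serializing plain dicts to CSV bytes; a small self-contained sketch:

from io import BytesIO

from scrapy.exporters import CsvItemExporter

def dicts_to_csv_bytes(rows):
    # rows: list of dicts with the same keys; the header comes from the first item
    buf = BytesIO()
    exporter = CsvItemExporter(buf)
    exporter.start_exporting()
    for row in rows:
        exporter.export_item(row)
    exporter.finish_exporting()
    return buf.getvalue()

print(dicts_to_csv_bytes([{'age': 22, 'name': 'John'}]))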
Code example #9
File: pipelines.py Project: mart2010/brd
class DumpToFile(object):
    """
    Dump harvested data into flat file, no other logic is implemented here
    (it's "Dump" :-)
    """
    def __init__(self):
        self.files = {}
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # TODO: verify if still needed for registration of spider_closed/opened event?
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = spider.get_dump_filepath()
        f = open(filename, 'w')
        self.files[spider.name] = f
        # by default csv module uses Windows-style line terminators (\r\n)
        self.exporter = CsvItemExporter(f, include_headers_line=True, delimiter='|', lineterminator='\n')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider.name)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # for counter, could set att in spider at closing
        self.counter += 1
        return item
Code example #10
File: pipelines.py Project: eadebruijn/Webscraping
class CSVPipeline(object):

  def __init__(self):
    self.files = {}

  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def spider_opened(self, spider):
    file = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = file
    self.exporter = CsvItemExporter(file)
    self.exporter.fields_to_export = ["filename", "titel", "publicatie", "dossiernummer", "organisatie", "publicatiedatum", "publicatietype", "file_urls"]
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
Code example #11
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        self.exporter.fields_to_export = ['originalString', 'translatedString']

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #12
class CsvExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('vagas.csv', 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #13
File: pipelines.py Project: Joker-Cch/Amazon
class AmazonCsvPipeline(object):
    def open_spider(self, spider):
        # file object the CSV data is written to (binary mode for CsvItemExporter)
        self.f = open("Amazon_goods_crawl.csv", "wb")
        # create the CSV exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # begin the CSV export
        self.csv_exporter.start_exporting()
        # set of seen titles, used to drop duplicate products
        self.add_title = set()

    def process_item(self, item, spider):
        if item['title'] in self.add_title:
            print('[ERROR] already saved, skipping duplicate %s' % item['title'])
        else:
            self.add_title.add(item['title'])
            # write one item at a time
            # print('[INFO] writing %s to the CSV file' % item['title'])
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the CSV export
        # print('[INFO] finished writing the CSV file')
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Code example #14
class WebcrawlerPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open("%s_urls.txt" % spider.name, "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, include_headers_line=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #15
 def test_header_export_two_items(self):
     for item in [self.i, dict(self.i)]:
         output = BytesIO()
         ie = CsvItemExporter(output)
         ie.start_exporting()
         ie.export_item(item)
         ie.export_item(item)
         ie.finish_exporting()
         self.assertCsvEqual(output.getvalue(), b'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
Code example #16
File: pipelines.py Project: kellyho15/capstone
class FashionnovaPipeline(object):
    def __init__(self):
        self.filename = 'fashionnova.csv'
    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #17
class TsvPipeline(object):
    def __init__(self):
        self.files = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # the trailing '*' marks the file as in-progress; it is stripped when
        # the file is renamed in spider_closed
        file = open(spider.name + '-' + datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + '.tsv*', 'wb')
        self.files[spider] = file

        self.exporter = CsvItemExporter(file, include_headers_line=True, join_multivalued=';', encoding="utf-8", delimiter='\t')
        if spider.name=='user':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'joindate', 'activedate']
        elif spider.name=='subject':
            self.exporter.fields_to_export = ['subjectid', 'order', 'subjectname', 'subjecttype', 'rank', 'date', 'votenum', 'favnum', 'staff', 'relations']
        elif spider.name=='record':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'iid', 'typ', 'state', 'adddate', 'rate', 'tags', 'comment']
        elif spider.name=='index':
            self.exporter.fields_to_export = ['indexid', 'creator', 'favourite', 'date', 'items']
        elif spider.name=='friends':
            self.exporter.fields_to_export = ['user', 'friend']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        filename = file.name
        newname = filename[:-5]+'-'+datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")+'.tsv'
        file.close()
        os.rename(filename, newname)
        if UPLOAD_TO_AZURE_STORAGE:
            block_blob_service = BlockBlobService(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)
            block_blob_service.create_blob_from_path(
                AZURE_CONTAINER,
                newname,
                newname,
                content_settings=ContentSettings(content_type='text/tab-separated-values'))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #18
File: pipelines.py Project: kellyho15/capstone
class MacystopPipeline(object):
    def __init__(self):
        self.filename = 'topallproduct_all.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #19
class RegistryScraperPipeline(object):

	def __init__(self):
		self.filename = 'registry_scraper/output/employment_site.csv'

	def open_spider(self, spider):
		self.csvfile = open(self.filename, 'wb')
		self.exporter = CsvItemExporter(self.csvfile)
		self.exporter.start_exporting()

	def close_spider(self, spider):
		self.exporter.finish_exporting()
		self.csvfile.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Code example #20
File: pipelines.py Project: Joker-Cch/AQI
class AqiCsvPipeline(object):
    def open_spider(self, spider):
        # file object the CSV data is written to (binary mode for CsvItemExporter)
        self.f = open('aqi.csv', 'wb')
        # create the CSV exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # begin the CSV export
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # write one item at a time
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the CSV export
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Code example #21
class CsvExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_jobs.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #22
class CrawlerPipeline(object):
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        path = CrawlerPipeline.EXPORT_PATH + "/" + spider.spider_id + "_export.csv"
        export_file = open(path, "ab" if os.path.isfile(path) else "wb")

        self.files[spider.spider_id] = export_file
        self.exporter = CsvItemExporter(export_file)
        self.exporter.fields_to_export = [
            "item_id",
            "url",
            "num_links",
            "num_images",
            "num_scripts",
            "num_styles",
            "headers",
            "text",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        export_file = self.files.pop(spider.spider_id)
        export_file.close()

    def process_item(self, item, spider):
        # This is a common path among ALL crawlers
        self.exporter.export_item(item)
        return item
Code example #23
class AnnonceGumtreePipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,
                                        delimiter=';',
                                        quotechar='"',
                                        quoting=QUOTE_ALL)
        self.exporter.start_exporting()
        self.exporter.fields_to_export = [
            "ANNONCE_LINK", "ANNONCE_DATE", "ID_CLIENT", "GARAGE_ID", "TYPE",
            "SITE", "MARQUE", "MODELE", "ANNEE", "MOIS", "NOM", "CARROSSERIE",
            "OPTIONS", "CARBURANT", "CYLINDRE", "PUISSANCE", "PORTE", "BOITE",
            "NB_VITESSE", "PRIX", "KM", "PLACE", "COULEUR", "PHOTO", "LITRE",
            "IMMAT", "NO_CHASSIS", "VN_IND", "CONTACT", "CONTACT_PRENOM",
            "CONTACT_NOM", "GARAGE_NAME", "ADRESSE", "VILLE", "CP",
            "DEPARTEMENT", "PROVINCE", "COUNTRY", "TELEPHONE", "TELEPHONE_2",
            "TELEPHONE_3", "TELEPHONE_4", "TELEFAX", "EMAIL", "MINI_SITE"
        ]
        # additional fields, currently not exported: 'motor', 'valve_cylind', 'color_caoutchouc',
        # 'prop_uniq', 'plaque', 'hauteur', 'large', 'largeur', 'distance_axe', 'cp_reservoir',
        # 'direction', 'control_traction'

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #24
class CsvPipeline(object):  #

    stats_name = 'csvpipeline'

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.stats = crawler.stats
        self.stats.set_value('done', 0)
        self.settings = crawler.settings

    def open_spider(self, spider):
        if not os.path.exists(spider.scraped_key):
            os.makedirs(spider.scraped_key)

        self.file = open(f'{spider.scraped_key}_result.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        print('\n\n')

        logger.info(f'===> RESULT AT:{spider.scraped_key}_result.csv')

    # @defer.inlineCallbacks
    def process_item(self, item, spider):
        self.stats.inc_value('done')
        self.exporter.export_item(item)
        if self.stats.get_value('done') % 100 == 0:
            logger.info("--> CSV pipeline: Done %s/ %s",
                        self.stats.get_value('done'),
                        self.stats.get_value('total'))

        return item
Code example #25
class ExpertsExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = 'UMass Amherst.csv'
        self.export_fields = [
            "name", "title_1", "title_2", "title_3", "title_4", "title_5",
            "phone", "email", "biography", "headshot", "faculty_page",
            "areas_of_expertise_1", "areas_of_expertise_2",
            "areas_of_expertise_3", "areas_of_expertise_4",
            "areas_of_expertise_5", "areas_of_expertise_6",
            "areas_of_expertise_7", "areas_of_expertise_8",
            "areas_of_expertise_9", "areas_of_expertise_10",
            "areas_of_expertise_11"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        output_file = open(self.file_name, 'w+b')
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #26
class TedEuropaEuPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('search_result_{}.csv'.format(spider.name), 'wb')
        self.file_details = open('details_result_{}.csv'.format(spider.name),
                                 'wb')

        self.exporter = CsvItemExporter(self.file)
        self.details_exporter = CsvItemExporter(self.file_details)

        self.exporter.fields_to_export = [
            'document_id', 'description', 'country', 'publication_date',
            'deadline'
        ]
        self.details_exporter.fields_to_export = [
            'url', 'document_id', 'name', 'value'
        ]

        self.exporter.start_exporting()
        self.details_exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

        self.details_exporter.finish_exporting()
        self.file_details.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        self.details_exporter.export_item(item)
        return item
Code example #27
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = 'cnhcregister_data.csv'
        self.export_fields = [
            'name',
            'registration',
            'telephone',
            'address',
            'discipline',
            'website',
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output_file = open(
            self.file_name,
            'w+b',
        )
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #28
    def get_exporter(self, item):
        """
		Finds / creates an exporter for a given item
		Takes item as an argument and returns an exporter
		:param item: Pass an item to get its exporter
		"""
        #  Extract key from item using ItemAdapter
        adapter = ItemAdapter(item=item)
        key = adapter[self.key_field]

        # Create an exporter for the key if one doesn't exist yet
        if key not in self.exporters:
            # Open a CSV file to back the new exporter
            exporter = CsvItemExporter(
                open(f'output/{self.out_dir}/{key}.csv', mode='ab'),
                include_headers_line=False)

            # Construct the header row from self.fields
            header = {field: field for field in self.fields}

            # Configure the fields to export
            exporter.fields_to_export = self.fields

            # Start exporting the file
            exporter.start_exporting()

            # Export the header row
            exporter.export_item(item=header)

            # Add exporter to the dictionary
            self.exporters[key] = exporter

        # Return the corresponding exporter
        return self.exporters[key]
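
The snippet above shows only the exporter factory; a companion close hook is still needed to finalize every per-key exporter when the spider stops. A sketch under that assumption (get_exporter does not keep the file objects, so this version reaches them through a hypothetical self.files dict filled in alongside self.exporters):

    def close_spider(self, spider):
        # finalize every per-key exporter created by get_exporter()
        for key, exporter in self.exporters.items():
            exporter.finish_exporting()
            # hypothetical bookkeeping dict mapping key -> open file object
            self.files[key].close()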
Code example #29
class UserPipeline:
    def __init__(self):
        self.file = open('user_comment.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = ['userID', 'time', 'comment', 'url']
        # self.exporter.fields_to_export = ['userID', 'url', 'comment', 'articleID', 'categoryID']
        self.exporter.start_exporting()

    def process_item(self, item, spider):

        print('process from pipeline ', len(item))
        try:
            # fan the aggregated item out into one row per comment;
            # the IndexError handler below ends the loop when the lists run out
            for idx in range(5000):
                it = UserItem()
                it['userID'] = item['userID']
                it['time'] = item['time'][idx]
                it['comment'] = item['comment'][idx]
                it['url'] = item['url'][idx]
                self.exporter.export_item(it)
        except IndexError:
            return item

        return item
Code example #30
class CsvPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):

        # drop whitespace-only strings from the scraped content
        item['content'] = [s for s in item['content'] if not s.isspace()]
        self.exporter.export_item(item)
        return item
Code example #31
File: pipelines.py Project: shayongithub/Assignment1
class CryptocurrencyPipeline:
    def __init__(self, file_name):
        # name the output file after the spider (file_name comes from from_crawler)
        self.file = open("%s.csv" % file_name, "wb")
        self.exporter = CsvItemExporter(self.file)

    @classmethod
    def from_crawler(cls, crawler):
        file_name = getattr(crawler.spider, "name")
        return cls(file_name)

    def open_spider(self, spider):

        self.exporter.start_exporting()

    def process_item(self, item, spider):

        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):

        self.exporter.finish_exporting()
        self.file.close()
Code example #32
File: pipelines.py Project: mobigen/MSF_V2
class CSVWriterPipeline(object):
    def open_spider(self, spider):
        file = open(spider.output_filename, 'wb')
        self.file_handle = file
        self.exporter = CsvItemExporter(file, delimiter='\t')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file_handle.close()
        full_path = os.getcwd() + os.sep + spider.output_filename
        sys.stdout.write(full_path)
        sys.stdout.flush()

    def process_item(self, item, spider):
        item.setdefault('uuid', str(uuid.uuid1()))
        item.setdefault('date', datetime.datetime.now().strftime("%Y%m%d%H%M"))
        self.exporter.fields_to_export = spider.fields_to_export
        for field in item.keys():
            if field not in self.exporter.fields_to_export:
                self.exporter.fields_to_export.append(field)
        self.exporter.export_item(item)
        return item
Code example #33
class WriteItemsPipeline(object):
    def open_spider(self, spider):
        self.csvfile1 = open('beer_info.csv', 'wb')
        self.exporter1 = CsvItemExporter(self.csvfile1)
        self.exporter1.start_exporting()

        self.csvfile2 = open('beer_reviews.csv', 'wb')
        self.exporter2 = CsvItemExporter(self.csvfile2)
        self.exporter2.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BeerItem):
            self.exporter1.export_item(item)
        elif isinstance(item, ReviewItem):
            self.exporter2.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter1.finish_exporting()
        self.csvfile1.close()

        self.exporter2.finish_exporting()
        self.csvfile2.close()
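
The two-way routing above generalizes to a mapping from item class to exporter; a sketch reusing BeerItem and ReviewItem from this example:

class MultiCsvExportPipeline(object):
    # (item class, filename) pairs; extend to route more item types
    ROUTES = [(BeerItem, 'beer_info.csv'), (ReviewItem, 'beer_reviews.csv')]

    def open_spider(self, spider):
        self.exporters = {}
        self.files = []
        for item_cls, filename in self.ROUTES:
            f = open(filename, 'wb')
            exporter = CsvItemExporter(f)
            exporter.start_exporting()
            self.exporters[item_cls] = exporter
            self.files.append(f)

    def process_item(self, item, spider):
        exporter = self.exporters.get(type(item))
        if exporter is not None:
            exporter.export_item(item)
        return item

    def close_spider(self, spider):
        for exporter in self.exporters.values():
            exporter.finish_exporting()
        for f in self.files:
            f.close()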
Code example #34
File: pipelines.py Project: Min-An-Nhuien/dou_jobs
class DouJobsPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('dou_jobs.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'title', 'city', 'salary', 'description', 'company', 'date', 'url'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #35
File: pipelines.py Project: adriengb/maminova
class MyPipeline(ImagesPipeline):
    def __init__(self, store_uri, download_func=None, settings=None):
        super(MyPipeline, self).__init__(store_uri,
                                         settings=settings,
                                         download_func=download_func)
        self.file = open("data/raw_labels.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def file_path(self, request, response=None, info=None):
        #item=request.meta['item'] # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return image_guid

    def close_spider(self, spider):
        #super(ImagesPipeline, self).close_spider(spider)
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        super(ImagesPipeline, self).process_item(item, spider)
        self.exporter.export_item(item)
        return item
Code example #36
File: pipelines.py Project: pekshechka/dzen
class DzenPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        #self.bot = Bot(token=TOKEN)
        self.file = open('four_results.csv', 'a+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()
        #self.bot.send_message(chat_id=ID, text="Starting")

    def spider_closed(self, spider):
        #self.bot.send_message(chat_id=ID, text="It's broken!!!!!!!!!!!!!!")
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #37
class TortuPipeline:
    def __init__(self):
        self.file = open("./path/data.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'Account Owner', 'Account Owner ID', 'Account Name', 'Phone',
            'Account Site', 'Fax', 'Parent Account', 'Parent Account ID',
            'Account Number', 'Account Type', 'Industry', 'Annual Revenue',
            'Created By', 'Created by ID', 'Modified By', 'Modified by ID',
            'Created Time', 'Modified Time', 'Billing Street', 'Billing City',
            'Billing State', 'Billing Code', 'Billing Country', 'Description',
            'Last Activity Time', 'Layout', 'Layout ID', 'Tag',
            'Water System No', 'Website URL', 'Principal Country Served'
        ]
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #38
class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("CsvWriterPipeline spider has been open")
        self.file = open('output.csv', 'a+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        print("CsvWriterPipeline spider has been close")
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        #print(item)
        self.exporter.export_item(item)
        return item
Code example #39
class CSVPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'date',
            'url',
            'category',
            'keywords',
            'title',
            'author',
            'text',
            'title_latin',
            'author_latin',
            'text_latin',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #40
class VnexpressPipeline:
    def __init__(self):
        self.file = open('items2.csv', 'ab+')
        self.seen = set()
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_duplicate(self, item):
        link = item['link']

        # if link in self.seen:
        #     raise DropItem('Duplicate item %s' % link)

        self.seen.add(link)

    def pre_process(self, item):
        item['category'] = str(item['category'])
        item['date'] = str(item['date'])
        item['title'] = str(item['title'])
        item['body'] = str(item['body'])
        item['comment'] = str(item['comment'])
        item['link'] = str(item['link'])

        # get id user comment
        user = str(item['user'])
        user = str(re.findall(r'\d+', user))
        print(type(user))
        item['user'] = user

    def process_item(self, item, spider):
        self.pre_process(item)
        self.exporter.export_item(item)

        return item
Code example #41
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        # print( str(spider) )
        # pdb.set_trace()

        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        # each spider has a different item layout
        if spider.name == 'fleetintel_list':
            self.exporter.fields_to_export = ['Company', 'Model', 'MSN', 'YoM', 'Reg', 'Comments']
        elif spider.name == 'Available_assets':
            self.exporter.fields_to_export = ['Category', 'Company', 'Contact_webPage', 'Contact_email', 'Contact_phone', 'Model', 'YoM', 'MSN', 'TFHs_TFCs', 'Engines', 'F_B_E', 'OL_A_S', 'LU', 'AD', 'ESN', 'L_E_S']

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #42
class VivanunciosPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'idPropiedad', 'category_id', 'agent_id', 'user_id',
            'type_popiedad', 'title', 'slug', 'body', 'image_name',
            'image_ext', 'meta_keywords', 'meta_desc', 'status', 'create_date',
            'updated_at', 'address', 'city', 'state', 'zip_propiedad',
            'country', 'latitude', 'longitude', 'price', 'beds', 'services',
            'characteristics', 'bath', 'year', 'features', 'is_delete',
            'featured', 'size', 'related', 'disponible', 'tipoLetra',
            'tipoPublicado', 'url_pagina', 'url_vendedor', 'nombre_vendedor',
            'id_anuncio', 'leyenda', 'sitio'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # build your row to export, then export the row
        self.exporter.export_item(item)
        return item
Code example #43
class BigMLPipeline(BigMLAPIMixIn):

    AUTH_ERRMSG = (
        "{errtype:s} BigML credentials. Please supply BIGML_USERNAME"
        " and BIGML_API_KEY as either Scrapy settings or environment"
        " variables."
    )

    def __init__(self, username=None, api_key=None, source_name=None, dev_mode=None):
        self.source_name = source_name
        self.get_bigml_api(username, api_key, dev_mode=dev_mode)

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(
            username=crawler.settings["BIGML_USERNAME"],
            api_key=crawler.settings["BIGML_API_KEY"],
            source_name=crawler.settings.get("BIGML_SOURCE_NAME", "Scrapy"),
            dev_mode=crawler.settings.getbool("BIGML_DEVMODE", False),
        )
        o.crawler = crawler
        o.settings = crawler.settings
        return o

    def open_spider(self, spider):
        self.tempfile = TemporaryFile(prefix="bigml-feed-")
        self.exporter = CsvItemExporter(self.tempfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.tempfile.seek(0)
        self.export_to_bigml(self.tempfile, self.source_name)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #44
File: pipelines.py Project: alanrhannah/harvestman
class HarvestmanPipeline(object):
    """Pipepline definition for spiders in the harvestman_spider project"""

    def __init__(self):
        """__init__, innit."""
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        csv_file = settings.CSV_FILE_OUTPUT_DIR.format(
            spider.base_url.split('/')[2],
            datetime.date.today().strftime('%Y-%m-%d'))

        if spider.name == 'google_serp_spider':
            file = open(csv_file, 'w')
            self.files[spider] = file
            # note this outputs as a tab separated file, rather than comma separated.
            self.exporter = CsvItemExporter(file, delimiter='\t')
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.export_item(item)
        return item
Code example #45
File: pipelines.py Project: snamper/code-2
class JDDetailCsvPipeline(object):
    """保存为CSV格式文件的管道类"""
    def open_spider(self,spider):
        # 保存csv数据库文件对象
        self.f = open("jddetail.csv", "wb")
        # 创建csv文件读写对象
        self.csv_exporter = CsvItemExporter(self.f)
        # 开始进行csv文件的读写
        self.csv_exporter.start_exporting()

    def process_item(self,item,spider):
        # 每次写入一个item数据
        print(type(item))
        print(item)
        #print(chardet.detect(list(dict(item).values())[0]))
        print("--" * 50)
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # 结束csv文件读写
        self.csv_exporter.finish_exporting()
        # 关闭文件
        self.f.close()
Code example #46
class EnrolldataPipeline(object):
    """
                 company_name             会社名
                 job_name                ポジション 
                 link_url                募集詳細link   https://type.jp
                 nearest_station         住所
                 longitude                 経度
                 latitude                  緯度
                 source                    出所
                 occupation                職種
                 annual_income_min         年収min
                 annual_income_max         年収max
                 published_time            サイト内での掲載時間
                 create_data              クロリングした時間 

    """
    def open_spider(self, spider):
        self.file = open("test.csv", "wb")
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            "company_name", "job_name",
                                            "link_url", "nearest_station",
                                            "longitude", "latitude", "source",
                                            "occupation", "annual_income_min",
                                            "annual_income_max",
                                            "published_time", "create_data"
                                        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Code example #47
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = f'silver_sneakers_{time.time()}.csv'
        self.export_fields = [
            'address1', 'amenityIDs', 'city', 'corpID', 'counter',
            'flexClasses', 'genderSpecific', 'hasBoomClass', 'hasFlex',
            'hasSilverSneakersClass', 'locID', 'locationType', 'mileDistance',
            'name', 'phone', 'state', 'upmcPersonalTrainer', 'zipCode'
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output_file = open(
            self.file_name,
            'w+b',
        )
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #48
class CityAvgPricePipeline(object):
    def __init__(self):
        self.files = {}
        self.file_path = './data/avg.csv'
        self.num = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open(self.file_path, 'a+b')
        self.files[spider] = file
        kwargs = {
            'fields_to_export': ['city_name', 'avg_price', 'last_price']}
        self.exporter = CsvItemExporter(file, include_headers_line=False, **kwargs)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        send_mail("%s is closed! time is %s" % (spider.name, time.ctime()))

        print("spider closed!")

    def process_item(self, item, spider):
        if isinstance(item, CityAvgItem):
            self.exporter.export_item(item)
            self.num += 1
        if self.num % 100 == 0:
            print("save avg_data %s times" % self.num)
        return item
Code example #49
class CSVPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = None
        self.exporter1 = None

    def open_spider(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        file1 = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.files['covid'] = file1
        self.exporter = CsvItemExporter(file)
        self.exporter1 = CsvItemExporter(file1)
        self.exporter.fields_to_export = [
            'countries', 'total_cases', 'new_cases', 'total_recovered',
            'active_cases', 'total_cases_per_million', 'death_per_million',
            'total_deaths', 'new_deaths'
        ]
        self.exporter1.fields_to_export = [
            'new_cases', 'total_cases', 'total_deaths', 'new_deaths'
        ]
        self.exporter.start_exporting()
        self.exporter1.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.exporter1.finish_exporting()
        file = self.files.pop(spider)
        file1 = self.files.pop('covid')
        file.close()
        file1.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        self.exporter1.export_item(item)
        return item
Code example #50
class CSVPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        if spider.name == 'realestate':
            self.file = open('current_listing.csv', 'w+b')
        else:
            self.file = open('past_listing.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)

        return item
Code example #51
File: pipelines.py Project: yangxulin/douban250
class Douban250Pipeline(object):
    def __init__(self):
        self.fp = open("douban.csv", "wb")
        self.exporter = CsvItemExporter(self.fp,
                                        fields_to_export=[
                                            'movie_name',
                                            'movie_director_actors',
                                            'movie_time_country',
                                            'movie_grade',
                                            'comment_number',
                                            'movie_introduce',
                                        ])
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print("Spider started")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
Code example #52
File: pipelines.py Project: ccs1910/WebScraper_PY
class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = "output_rmob_" + time.strftime("%Y%m%d-%H%M%S")
        self.file = open(filename + '.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            "id", "brand", "full_title", "year", "transmission", "price"
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #53
class ToCsvPipeline(object):
    """Outputs data to a csv file."""
    def open_spider(self, spider):
        scrapedate = datetime.now().strftime('%Y%m%d_%H%M%S')
        if not path.exists(settings.CSV_STORE):
            mkdir(settings.CSV_STORE)
        assert path.isdir(settings.CSV_STORE), \
        '{} is not a directory'.format(settings.CSV_STORE)
        pth_csv = path.join(settings.CSV_STORE,
                            'data_{}.csv'.format(scrapedate))
        self.file = open(pth_csv, 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'title', 'address', 'cuisines', 'opening', 'phone', 'website'
        ]
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #54
class CSVExportPipelines(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,
                                        include_headers_line=True,
                                        join_multivalued=',',
                                        lineterminator='\n')
        self.exporter.fields_to_export = [
            'date', 'episode', 'artist', 'song', 'link', 'image'
        ]

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        required_fields = ['episode']  # your list of required fields
        if all(field in item for field in required_fields):
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem("Item null")
Code example #55
class HomeWorkMarketCsv(object):
    def __init__(self):
        self.file = open("jobs.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # use Gmail to send the results by mail
        mailer = MailSender(smtphost="smtp.gmail.com",
                            mailfrom='',
                            smtpuser="",
                            smtppass="",
                            smtpport=587)
        myFile = open("jobs.csv", "r")
        mailer.send(to=["*****@*****.**"],
                    subject="Scrapy mail",
                    body="Did you receive this, oh!",
                    attachs=(("twors", "text/plain", myFile), ))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code example #56
class MiPipeline(object):
    def open_spider(self, spider):
        self.file = open('LorealProductInfo.csv', 'wb')
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            'title',
                                            'subtitle',
                                            'image_urls',
                                            'image_paths',
                                            'attr',
                                            'price',
                                        ])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if item['title'] is not None:
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem('Drop item without title')
Code example #57
class CsvExportPipeline(object):

    fields_to_export = [
        'Title',
        'Author',
        'AuthorLifetime',
        'TotalLength',
        'Language',
        'Genre',
        'Readers',
        'NumberOfReaders',
        'WikipediaLink',
        'AuthorWikipediaLink',
        'CatalogedOnDate',
        'DescriptionText',
        'LibrivoxUrlOfTitle',
        'LinksToAll128kMp3Files',
        'HasCoverArt',
        'HasCdInsertArt'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        FILES_STORE = settings.FILES_STORE
        self.file = open(FILES_STORE + 'Librivox-Book-List.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self.fields_to_export
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        FILES_STORE = settings.FILES_STORE
        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
        title_dir = item['Title']
        for each_char in invalid_chars:
            title_dir = title_dir.replace(each_char, '-')

        if not os.path.exists(FILES_STORE + title_dir):
            os.makedirs(FILES_STORE + title_dir)
        # write txt files
        for each_file in self.fields_to_export:
            txt_file = FILES_STORE + title_dir + '/' + each_file + '.txt'
            with open(txt_file, 'w') as outfile:
                outfile.write(item[each_file])
        return item

    def convert_csv_to_excel(self, csv_file, excel_file):
        workbook = Workbook(excel_file)
        worksheet = workbook.add_worksheet()
        with open(csv_file, 'r', newline='') as f:
            reader = csv.reader(f)
            for r, row in enumerate(reader):
                for c, col in enumerate(row):
                    worksheet.write(r, c, col)
        workbook.close()
Code example #58
File: pipelines.py Project: mkogrady/hoop_io
class CsvExportPipeline(object):

    spiders_to_processors = None

    def __init__(self):
        self.files = {}
        self.exporter = None

        self.spiders_to_processors = {
            teams.TeamsSpider.__name__: TeamProcessor,
            team_season.TeamSeasonSpider.__name__: TeamSeasonProcessor,
            players.PlayersSpider.__name__: PlayerProcessor,
            player_season.PlayerSeasonSpider.__name__: PlayerSeasonProcessor,
        }

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()

        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    def spider_opened(self, spider):
        """
        called when the spider is started
        """

        try:
            processor = self.spiders_to_processors[type(spider).__name__]()
        except KeyError:
            self.exporter = None
            return

        file = open(processor.get_storage_filepath(spider), "w+b")

        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """
        called when the spider is finished crawling
        """

        if self.exporter:

            self.exporter.finish_exporting()

            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        """
        called every time an item is yielded from a spider
        """

        if self.exporter:
            self.exporter.export_item(item)
        return item