Example 1
class CsvExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_societies.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['name', 'president', 'email', 'url', 'facebook', 'membership', 'about',
                                          'date_established']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
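A note on context: these snippets all assume the usual Scrapy imports and that the pipeline is registered in the project settings, neither of which is shown. A minimal sketch, assuming the class above lives in a hypothetical myproject/pipelines.py:

# imports the snippets in this section rely on
from scrapy import signals
from scrapy.exporters import CsvItemExporter  # scrapy.contrib.exporter in very old versions

# myproject/settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.CsvExportPipeline': 300,
}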
Example 2
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        import os
        output_dir = os.path.join(os.path.dirname(__file__), 'output')
        file = open(os.path.join(output_dir, '%s_data.csv' % spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 3
class BitcoinTalkCrawlerPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = \
            ['timestamp', 'category_id', 'topic_id', 'topic_title',
             'message_number', 'message_author', 'message_text']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 4
class catalogscraperPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open("%s_items.csv" % spider.name, "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ["title"]
        #'subject', 'description', 'creator', 'source', 'published', 'rights', 'citation', 'url']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 5
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, delimiter='\t')
        self.exporter.fields_to_export = ['userId', 'bookId', 'name', 'rating', 'relativeRating', 'booklistNum']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 6
class CsvExportPipeline(object):
    """
    app.pipelines.exporter_csv.CsvExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_csv = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file_csv
        self.exporter = CsvItemExporter(file_csv)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_csv = self.files.pop(spider)
        file_csv.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 7
class CSVWriterPipeline(object):
    
    def __init__(self,filename):
        self.filename = filename
        
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        filename = settings.get('OUTPUT_FILE')
        pipeline = cls(filename)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open(self.filename, 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
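OUTPUT_FILE here is not a built-in Scrapy setting but a custom key this pipeline reads from crawler.settings; it has to be supplied somewhere, for example in settings.py or on the command line via scrapy crawl myspider -s OUTPUT_FILE=items.csv (myspider and the file name are placeholders):

# settings.py (hypothetical value for the custom setting used above)
OUTPUT_FILE = 'items.csv'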
Example 8
    def assertExportResult(self, item, expected, **kwargs):
        fp = BytesIO()
        ie = CsvItemExporter(fp, **kwargs)
        ie.start_exporting()
        ie.export_item(item)
        ie.finish_exporting()
        self.assertCsvEqual(fp.getvalue(), expected)
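The same exporter can be exercised outside a test case; a self-contained sketch (the exact bytes depend on the Scrapy version and on field order, so treat the expected output as illustrative):

from io import BytesIO
from scrapy.exporters import CsvItemExporter

buf = BytesIO()
exporter = CsvItemExporter(buf)
exporter.start_exporting()
exporter.export_item({'name': 'John', 'age': '22'})  # plain dicts are accepted as items
exporter.finish_exporting()
print(buf.getvalue())  # e.g. b'name,age\r\nJohn,22\r\n'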
Example 9
class DumpToFile(object):
    """
    Dump harvested data into flat file, no other logic is implemented here
    (it's "Dump" :-)
    """
    def __init__(self):
        self.files = {}
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # TODO: verify if still needed for registration of spider_closed/opened event?
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = spider.get_dump_filepath()
        f = open(filename, 'wb')
        self.files[spider.name] = f
        # by default csv module uses Windows-style line terminators (\r\n)
        self.exporter = CsvItemExporter(f, include_headers_line=True, delimiter='|', lineterminator='\n')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider.name)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # for counter, could set att in spider at closing
        self.counter += 1
        return item
Example 10
class CSVPipeline(object):

  def __init__(self):
    self.files = {}

  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def spider_opened(self, spider):
    file = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = file
    self.exporter = CsvItemExporter(file)
    self.exporter.fields_to_export = ["filename", "titel", "publicatie", "dossiernummer", "organisatie", "publicatiedatum", "publicatietype", "file_urls"]
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
Example 11
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        self.exporter.fields_to_export = ['originalString', 'translatedString']

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 12
class CsvExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('vagas.csv', 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 13
class AmazonCsvPipeline(object):
    def open_spider(self, spider):
        # file object for the CSV output
        self.f = open("Amazon_goods_crawl.csv", "wb")
        # create the CSV exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # start the CSV export
        self.csv_exporter.start_exporting()
        # deduplicate items by product title
        self.add_title = set()

    def process_item(self, item, spider):
        if item['title'] in self.add_title:
            print(u'[ERROR] already saved, skipping duplicate %s' % item['title'])
        else:
            self.add_title.add(item['title'])
            # write one item at a time
            # print(u'[INFO] writing %s to the CSV file' % item['title'])
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the CSV export
        # print(u'[INFO] CSV file written')
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Example 14
class WebcrawlerPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open("%s_urls.txt" % spider.name, "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, include_headers_line=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 15
    def test_header_export_two_items(self):
        for item in [self.i, dict(self.i)]:
            output = BytesIO()
            ie = CsvItemExporter(output)
            ie.start_exporting()
            ie.export_item(item)
            ie.export_item(item)
            ie.finish_exporting()
            self.assertCsvEqual(output.getvalue(), b'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
Example 16
class FashionnovaPipeline(object):
    def __init__(self):
        self.filename = 'fashionnova.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 17
class TsvPipeline(object):
    def __init__(self):
        self.files = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # open with a trailing '*' marker; the file is renamed to its final
        # .tsv name in spider_closed once the export is complete
        file = open(spider.name + '-' + datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + '.tsv*', 'wb')
        self.files[spider] = file

        self.exporter = CsvItemExporter(file, include_headers_line=True, join_multivalued=';', encoding="utf-8", delimiter='\t')
        if spider.name == 'user':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'joindate', 'activedate']
        elif spider.name == 'subject':
            self.exporter.fields_to_export = ['subjectid', 'order', 'subjectname', 'subjecttype', 'rank', 'date', 'votenum', 'favnum', 'staff', 'relations']
        elif spider.name == 'record':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'iid', 'typ', 'state', 'adddate', 'rate', 'tags', 'comment']
        elif spider.name == 'index':
            self.exporter.fields_to_export = ['indexid', 'creator', 'favourite', 'date', 'items']
        elif spider.name == 'friends':
            self.exporter.fields_to_export = ['user', 'friend']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        filename = file.name
        newname = filename[:-5]+'-'+datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")+'.tsv'
        file.close()
        os.rename(filename, newname)
        if UPLOAD_TO_AZURE_STORAGE:
            block_blob_service = BlockBlobService(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)
            block_blob_service.create_blob_from_path(
                AZURE_CONTAINER, newname, newname,
                content_settings=ContentSettings(content_type='text/tab-separated-values'))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 18
class MacystopPipeline(object):
    def __init__(self):
        self.filename = 'topallproduct_all.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 19
class RegistryScraperPipeline(object):

	def __init__(self):
		self.filename = 'registry_scraper/output/employment_site.csv'

	def open_spider(self, spider):
		self.csvfile = open(self.filename, 'wb')
		self.exporter = CsvItemExporter(self.csvfile)
		self.exporter.start_exporting()

	def close_spider(self, spider):
		self.exporter.finish_exporting()
		self.csvfile.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Example 20
class AqiCsvPipeline(object):
    def open_spider(self, spider):
        # file object for the CSV output
        self.f = open('aqi.csv', 'wb')
        # create the CSV exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # start the CSV export
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # write one item at a time
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish the CSV export
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Example 21
class CsvExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_jobs.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 22
class CrawlerPipeline(object):
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        path = CrawlerPipeline.EXPORT_PATH + "/" + spider.spider_id + "_export.csv"
        export_file = open(path, "ab" if os.path.isfile(path) else "wb")

        self.files[spider.spider_id] = export_file
        self.exporter = CsvItemExporter(export_file)
        self.exporter.fields_to_export = [
            "item_id",
            "url",
            "num_links",
            "num_images",
            "num_scripts",
            "num_styles",
            "headers",
            "text",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        export_file = self.files.pop(spider.spider_id)
        export_file.close()

    def process_item(self, item, spider):
        # This is a common path among ALL crawlers
        self.exporter.export_item(item)
        return item
Example 23
class AnnonceGumtreePipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,
                                        delimiter=';',
                                        quotechar='"',
                                        quoting=QUOTE_ALL)
        self.exporter.fields_to_export = [
            "ANNONCE_LINK", "ANNONCE_DATE", "ID_CLIENT", "GARAGE_ID", "TYPE",
            "SITE", "MARQUE", "MODELE", "ANNEE", "MOIS", "NOM", "CARROSSERIE",
            "OPTIONS", "CARBURANT", "CYLINDRE", "PUISSANCE", "PORTE", "BOITE",
            "NB_VITESSE", "PRIX", "KM", "PLACE", "COULEUR", "PHOTO", "LITRE",
            "IMMAT", "NO_CHASSIS", "VN_IND", "CONTACT", "CONTACT_PRENOM",
            "CONTACT_NOM", "GARAGE_NAME", "ADRESSE", "VILLE", "CP",
            "DEPARTEMENT", "PROVINCE", "COUNTRY", "TELEPHONE", "TELEPHONE_2",
            "TELEPHONE_3", "TELEPHONE_4", "TELEFAX", "EMAIL", "MINI_SITE"
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 24
class CsvPipeline(object):

    stats_name = 'csvpipeline'

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.stats = crawler.stats
        self.stats.set_value('done', 0)
        self.settings = crawler.settings

    def open_spider(self, spider):
        if not os.path.exists(spider.scraped_key):
            os.makedirs(spider.scraped_key)

        self.file = open(f'{spider.scraped_key}_result.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        print('\n\n')

        logger.info(f'===> RESULT AT:{spider.scraped_key}_result.csv')

    # @defer.inlineCallbacks
    def process_item(self, item, spider):
        self.stats.inc_value('done')
        self.exporter.export_item(item)
        if self.stats.get_value('done') % 100 == 0:
            logger.info("--> CSV pineline: Done %s/ %s",
                        self.stats.get_value('done'),
                        self.stats.get_value('total'))

        return item
Example 25
class ExpertsExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = 'UMass Amherst.csv'
        self.export_fields = [
            "name", "title_1", "title_2", "title_3", "title_4", "title_5",
            "phone", "email", "biography", "headshot", "faculty_page",
            "areas_of_expertise_1", "areas_of_expertise_2",
            "areas_of_expertise_3", "areas_of_expertise_4",
            "areas_of_expertise_5", "areas_of_expertise_6",
            "areas_of_expertise_7", "areas_of_expertise_8",
            "areas_of_expertise_9", "areas_of_expertise_10",
            "areas_of_expertise_11"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        output_file = open(self.file_name, 'w+b')
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 26
class TedEuropaEuPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('search_result_{}.csv'.format(spider.name), 'wb')
        self.file_details = open('details_result_{}.csv'.format(spider.name),
                                 'wb')

        self.exporter = CsvItemExporter(self.file)
        self.details_exporter = CsvItemExporter(self.file_details)

        self.exporter.fields_to_export = [
            'document_id', 'description', 'country', 'publication_date',
            'deadline'
        ]
        self.details_exporter.fields_to_export = [
            'url', 'document_id', 'name', 'value'
        ]

        self.exporter.start_exporting()
        self.details_exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

        self.details_exporter.finish_exporting()
        self.file_details.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        self.details_exporter.export_item(item)
        return item
Example 27
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = 'cnhcregister_data.csv'
        self.export_fields = [
            'name',
            'registration',
            'telephone',
            'address',
            'discipline',
            'website',
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output_file = open(
            self.file_name,
            'w+b',
        )
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 28
    def get_exporter(self, item):
        """
        Finds / creates an exporter for a given item.
        Takes an item as an argument and returns its exporter.
        :param item: Pass an item to get its exporter
        """
        # Extract the routing key from the item using ItemAdapter
        adapter = ItemAdapter(item=item)
        key = adapter[self.key_field]

        # Create an exporter for the key if one does not exist yet
        if key not in self.exporters:
            # Open a CSV file (append mode) for the new exporter
            exporter = CsvItemExporter(
                open(f'output/{self.out_dir}/{key}.csv', mode='ab'),
                include_headers_line=False)

            # Construct the header row from self.fields
            header = {field: field for field in self.fields}

            # Configure the fields to export
            exporter.fields_to_export = self.fields

            # Start exporting and write the header row manually
            exporter.start_exporting()
            exporter.export_item(header)

            # Cache the exporter
            self.exporters[key] = exporter

        # Return the corresponding exporter
        return self.exporters[key]
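get_exporter only works inside a pipeline that defines self.exporters, self.key_field, self.fields and self.out_dir, and that flushes every per-key exporter at the end; the original class is not shown here, so this wrapper is a sketch with assumed names and values:

class PerKeyCsvPipeline:
    # hypothetical host class for the get_exporter() method above
    key_field = 'category'                   # assumed item field used as the routing key
    fields = ['category', 'title', 'price']  # assumed export columns
    out_dir = 'split'                        # files land in output/split/<key>.csv

    def open_spider(self, spider):
        self.exporters = {}

    def process_item(self, item, spider):
        self.get_exporter(item).export_item(item)
        return item

    def close_spider(self, spider):
        # get_exporter() does not keep the raw file handles around, so all we
        # can do here is flush each exporter; tracking the handles in a second
        # dict would allow closing them explicitly as well
        for exporter in self.exporters.values():
            exporter.finish_exporting()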
Example 29
class UserPipeline:
    def __init__(self):
        self.file = open('user_comment.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.fields_to_export = ['userID', 'time', 'comment', 'url']
        # self.exporter.fields_to_export = ['userID', 'url', 'comment', 'articleID', 'categoryID']
        self.exporter.start_exporting()

    def process_item(self, item, spider):

        print('process from pipeline ', len(item))
        try:
            for idx in range(5000):
                it = UserItem()
                it['userID'] = item['userID']
                it['time'] = item['time'][idx]
                it['comment'] = item['comment'][idx]
                it['url'] = item['url'][idx]
                self.exporter.export_item(it)
        except IndexError:
            return item

        return item
Example 30
class CsvPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):

        # drop whitespace-only strings from the content list
        item['content'] = [s for s in item['content'] if not s.isspace()]
        self.exporter.export_item(item)
        return item
Example 31
class CryptocurrencyPipeline:
    def __init__(self, file_name):
        self.file = open("%s.csv" % file_name, "wb")
        self.exporter = CsvItemExporter(self.file)

    @classmethod
    def from_crawler(cls, crawler):
        file_name = getattr(crawler.spider, "name")
        return cls(file_name)

    def open_spider(self, spider):

        self.exporter.start_exporting()

    def process_item(self, item, spider):

        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):

        self.exporter.finish_exporting()
        self.file.close()
Example 32
class CSVWriterPipeline(object):
    def open_spider(self, spider):
        file = open(spider.output_filename, 'wb')
        self.file_handle = file
        self.exporter = CsvItemExporter(file, delimiter='\t')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file_handle.close()
        full_path = os.getcwd() + os.sep + spider.output_filename
        sys.stdout.write(full_path)
        sys.stdout.flush()

    def process_item(self, item, spider):
        item.setdefault('uuid', str(uuid.uuid1()))
        item.setdefault('date', datetime.datetime.now().strftime("%Y%m%d%H%M"))
        self.exporter.fields_to_export = spider.fields_to_export
        for field in item.keys():
            if field not in self.exporter.fields_to_export:
                self.exporter.fields_to_export.append(field)
        self.exporter.export_item(item)
        return item
Example 33
class WriteItemsPipeline(object):
    def open_spider(self, spider):
        self.csvfile1 = open('beer_info.csv', 'wb')
        self.exporter1 = CsvItemExporter(self.csvfile1)
        self.exporter1.start_exporting()

        self.csvfile2 = open('beer_reviews.csv', 'wb')
        self.exporter2 = CsvItemExporter(self.csvfile2)
        self.exporter2.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BeerItem):
            self.exporter1.export_item(item)
        elif isinstance(item, ReviewItem):
            self.exporter2.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter1.finish_exporting()
        self.csvfile1.close()

        self.exporter2.finish_exporting()
        self.csvfile2.close()
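The isinstance chain above scales poorly as item types multiply; a dict keyed by item class keeps the routing in one place. A sketch under the same assumptions (BeerItem and ReviewItem are the example's item classes, imported from the project's items module, with the same two file names):

class WriteItemsPipeline(object):
    EXPORT_FILES = {BeerItem: 'beer_info.csv', ReviewItem: 'beer_reviews.csv'}

    def open_spider(self, spider):
        self.files = {}
        self.exporters = {}
        for item_cls, filename in self.EXPORT_FILES.items():
            f = open(filename, 'wb')
            self.files[item_cls] = f
            exporter = CsvItemExporter(f)
            exporter.start_exporting()
            self.exporters[item_cls] = exporter

    def process_item(self, item, spider):
        exporter = self.exporters.get(type(item))
        if exporter is not None:
            exporter.export_item(item)
        return item

    def close_spider(self, spider):
        for item_cls, exporter in self.exporters.items():
            exporter.finish_exporting()
            self.files[item_cls].close()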
Example 34
class DouJobsPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('dou_jobs.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'title', 'city', 'salary', 'description', 'company', 'date', 'url'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 35
class MyPipeline(ImagesPipeline):
    def __init__(self, store_uri, download_func=None, settings=None):
        super(MyPipeline, self).__init__(store_uri,
                                         settings=settings,
                                         download_func=download_func)
        self.file = open("data/raw_labels.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def file_path(self, request, response=None, info=None):
        #item=request.meta['item'] # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return image_guid

    def close_spider(self, spider):
        #super(ImagesPipeline, self).close_spider(spider)
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # MediaPipeline.process_item returns a Deferred that drives the
        # image downloads, so export the row and hand the item back to
        # the parent machinery (not super(ImagesPipeline, ...))
        self.exporter.export_item(item)
        return super(MyPipeline, self).process_item(item, spider)
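Overriding process_item on an ImagesPipeline subclass is fragile even with the fix above, because the Deferred machinery is easy to break. A safer variant, assuming the same self.exporter attribute, hooks ImagesPipeline's item_completed method instead:

    def item_completed(self, results, item, info):
        # let ImagesPipeline record the downloaded files on the item first
        item = super(MyPipeline, self).item_completed(results, item, info)
        self.exporter.export_item(item)
        return item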
Example 36
class DzenPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        #self.bot = Bot(token=TOKEN)
        self.file = open('four_results.csv', 'a+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()
        #self.bot.send_message(chat_id=ID, text="Starting")

    def spider_closed(self, spider):
        #self.bot.send_message(chat_id=ID, text="It's broken!!!!!!!!!!!!!!")
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 37
class TortuPipeline:
    def __init__(self):
        self.file = open("./path/data.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, unicode)
        self.exporter.fields_to_export = [
            'Account Owner', 'Account Owner ID', 'Account Name', 'Phone',
            'Account Site', 'Fax', 'Parent Account', 'Parent Account ID',
            'Account Number', 'Account Type', 'Industry', 'Annual Revenue',
            'Created By', 'Created by ID', 'Modified By', 'Modified by ID',
            'Created Time', 'Modified Time', 'Billing Street', 'Billing City',
            'Billing State', 'Billing Code', 'Billing Country', 'Description',
            'Last Activity Time', 'Layout', 'Layout ID', 'Tag',
            'Water System No', 'Website URL', 'Principal Country Served'
        ]
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 38
class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("CsvWriterPipeline spider has been open")
        self.file = open('output.csv', 'a+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        print("CsvWriterPipeline spider has been close")
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        #print(item)
        self.exporter.export_item(item)
        return item
Example 39
class CSVPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'date',
            'url',
            'category',
            'keywords',
            'title',
            'author',
            'text',
            'title_latin',
            'author_latin',
            'text_latin',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 40
class VnexpressPipeline:
    def __init__(self):
        self.file = open('items2.csv', 'ab+')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()
        self.seen = set()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_duplicate(self, item):
        link = item['link']

        # if link in self.seen:
        #     raise DropItem('Duplicate item %s' % link)

        self.seen.add(link)

    def pre_process(self, item):
        item['category'] = str(item['category'])
        item['date'] = str(item['date'])
        item['title'] = str(item['title'])
        item['body'] = str(item['body'])
        item['comment'] = str(item['comment'])
        item['link'] = str(item['link'])

        # get id user comment
        user = str(item['user'])
        user = str(re.findall(r'\d+', user))
        print(type(user))
        item['user'] = user

    def process_item(self, item, spider):
        self.pre_process(item)
        self.exporter.export_item(item)

        return item
Example 41
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        # print( str(spider) )
        # pdb.set_trace()

        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        # each spider has different items
        if spider.name == 'fleetintel_list':
            self.exporter.fields_to_export = ['Company', 'Model', 'MSN', 'YoM', 'Reg', 'Comments']
        elif spider.name == 'Available_assets':
            self.exporter.fields_to_export = ['Category', 'Company', 'Contact_webPage', 'Contact_email', 'Contact_phone', 'Model', 'YoM', 'MSN', 'TFHs_TFCs', 'Engines', 'F_B_E', 'OL_A_S', 'LU', 'AD', 'ESN', 'L_E_S']

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 42
class VivanunciosPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'idPropiedad', 'category_id', 'agent_id', 'user_id',
            'type_popiedad', 'title', 'slug', 'body', 'image_name',
            'image_ext', 'meta_keywords', 'meta_desc', 'status', 'create_date',
            'updated_at', 'address', 'city', 'state', 'zip_propiedad',
            'country', 'latitude', 'longitude', 'price', 'beds', 'services',
            'characteristics', 'bath', 'year', 'features', 'is_delete',
            'featured', 'size', 'related', 'disponible', 'tipoLetra',
            'tipoPublicado', 'url_pagina', 'url_vendedor', 'nombre_vendedor',
            'id_anuncio', 'leyenda', 'sitio'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # build your row to export, then export the row
        self.exporter.export_item(item)
        return item
Example 43
class BigMLPipeline(BigMLAPIMixIn):

    AUTH_ERRMSG = (
        "{errtype:s} BigML credentials. Please supply BIGML_USERNAME"
        " and BIGML_API_KEY as either Scrapy settings or environment"
        " variables."
    )

    def __init__(self, username=None, api_key=None, source_name=None, dev_mode=None):
        self.source_name = source_name
        self.get_bigml_api(username, api_key, dev_mode=dev_mode)

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(
            username=crawler.settings["BIGML_USERNAME"],
            api_key=crawler.settings["BIGML_API_KEY"],
            source_name=crawler.settings.get("BIGML_SOURCE_NAME", "Scrapy"),
            dev_mode=crawler.settings.getbool("BIGML_DEVMODE", False),
        )
        o.crawler = crawler
        o.settings = crawler.settings
        return o

    def open_spider(self, spider):
        self.tempfile = TemporaryFile(prefix="bigml-feed-")
        self.exporter = CsvItemExporter(self.tempfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.tempfile.seek(0)
        self.export_to_bigml(self.tempfile, self.source_name)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 44
class HarvestmanPipeline(object):
    """Pipepline definition for spiders in the harvestman_spider project"""

    def __init__(self):
        """__init__, innit."""
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        csv_file = settings.CSV_FILE_OUTPUT_DIR.format(
            spider.base_url.split('/')[2],
            datetime.date.today().strftime('%Y-%m-%d'))

        if spider.name == 'google_serp_spider':
            file = open(csv_file, 'wb')
            self.files[spider] = file
            # note this outputs as a tab separated csv, rather than comma.
            self.exporter = CsvItemExporter(file, delimiter='\t')
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.export_item(item)
        return item
Example 45
class JDDetailCsvPipeline(object):
    """保存为CSV格式文件的管道类"""
    def open_spider(self,spider):
        # 保存csv数据库文件对象
        self.f = open("jddetail.csv", "wb")
        # 创建csv文件读写对象
        self.csv_exporter = CsvItemExporter(self.f)
        # 开始进行csv文件的读写
        self.csv_exporter.start_exporting()

    def process_item(self,item,spider):
        # 每次写入一个item数据
        print(type(item))
        print(item)
        #print(chardet.detect(list(dict(item).values())[0]))
        print("--" * 50)
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # 结束csv文件读写
        self.csv_exporter.finish_exporting()
        # 关闭文件
        self.f.close()
Example 46
class EnrolldataPipeline(object):
    """
                 company_name             会社名
                 job_name                ポジション 
                 link_url                募集詳細link   https://type.jp
                 nearest_station         住所
                 longitude                 経度
                 latitude                  緯度
                 source                    出所
                 occupation                職種
                 annual_income_min         年収min
                 annual_income_max         年収max
                 published_time            サイト内での掲載時間
                 create_data              クロリングした時間 

    """
    def open_spider(self, spider):
        self.file = open("test.csv", "wb")
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            "company_name", "job_name",
                                            "link_url", "nearest_station",
                                            "longitude", "latitude", "source",
                                            "occupation", "annual_income_min",
                                            "annual_income_max",
                                            "published_time", "create_data"
                                        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example 47
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = f'silver_sneakers_{time.time()}.csv'
        self.export_fields = [
            'address1', 'amenityIDs', 'city', 'corpID', 'counter',
            'flexClasses', 'genderSpecific', 'hasBoomClass', 'hasFlex',
            'hasSilverSneakersClass', 'locID', 'locationType', 'mileDistance',
            'name', 'phone', 'state', 'upmcPersonalTrainer', 'zipCode'
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output_file = open(
            self.file_name,
            'w+b',
        )
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 48
class CityAvgPricePipeline(object):
    def __init__(self):
        self.files = {}
        self.file_path = './data/avg.csv'
        self.num = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open(self.file_path, 'a+b')
        self.files[spider] = file
        kwargs = {
            'fields_to_export': ['city_name', 'avg_price', 'last_price']}
        self.exporter = CsvItemExporter(file, include_headers_line=False, **kwargs)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        send_mail("%s is closed!,time is  %s" % (spider.name, time.ctime()))

        print("spider closed!")

    def process_item(self, item, spider):
        if isinstance(item, CityAvgItem):
            self.exporter.export_item(item)
            self.num += 1
        if self.num % 100 == 0:
            print("save avg_data %s times" % self.num)
        return item
Example 49
class CSVPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = None
        self.exporter1 = None

    def open_spider(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        file1 = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.files['covid'] = file1
        self.exporter = CsvItemExporter(file)
        self.exporter1 = CsvItemExporter(file1)
        self.exporter.fields_to_export = [
            'countries', 'total_cases', 'new_cases', 'total_recovered',
            'active_cases', 'total_cases_per_million', 'death_per_million',
            'total_deaths', 'new_deaths'
        ]
        self.exporter1.fields_to_export = [
            'new_cases', 'total_cases', 'total_deaths', 'new_deaths'
        ]
        self.exporter.start_exporting()
        self.exporter1.start_exporting()

    def close_spider(self, spider):
        # must be close_spider (not spider_closed): no signals are connected here
        self.exporter.finish_exporting()
        self.exporter1.finish_exporting()
        file = self.files.pop(spider)
        file1 = self.files.pop('covid')
        file.close()
        file1.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        self.exporter1.export_item(item)
        return item
Example 50
class CSVPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        if spider.name == 'realestate':
            self.file = open('current_listing.csv', 'w+b')
        else:
            self.file = open('past_listing.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)

        return item
Example 51
class Douban250Pipeline(object):
    def __init__(self):
        self.fp = open("douban.csv", "wb")
        self.exporter = CsvItemExporter(self.fp,
                                        fields_to_export=[
                                            'movie_name',
                                            'movie_director_actors',
                                            'movie_time_country',
                                            'movie_grade',
                                            'comment_number',
                                            'movie_introduce',
                                        ])
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print("spider started")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.fp.close()
Example 52
class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = "output_rmob_" + time.strftime("%Y%m%d-%H%M%S")
        self.file = open(filename + '.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            "id", "brand", "full_title", "year", "transmission", "price"
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 53
class ToCsvPipeline(object):
    """Outputs data to a csv file."""
    def open_spider(self, spider):
        scrapedate = datetime.now().strftime('%Y%m%d_%H%M%S')
        if not path.exists(settings.CSV_STORE):
            mkdir(settings.CSV_STORE)
        assert path.isdir(settings.CSV_STORE), \
            '{} is not a directory'.format(settings.CSV_STORE)
        pth_csv = path.join(settings.CSV_STORE,
                            'data_{}.csv'.format(scrapedate))
        self.file = open(pth_csv, 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'title', 'address', 'cuisines', 'opening', 'phone', 'website'
        ]
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 54
class CSVExportPipelines(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,
                                        include_headers_line=True,
                                        join_multivalued=',',
                                        lineterminator='\n')
        self.exporter.fields_to_export = [
            'date', 'episode', 'artist', 'song', 'link', 'image'
        ]

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        required_fields = ['episode']  # your list of required fields
        if all(field in item for field in required_fields):
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem("Item null")
Example 55
class HomeWorkMarketCsv(object):
    def __init__(self):
        self.file = open("jobs.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # using gmail to send the finished file by mail
        mailer = MailSender(smtphost="smtp.gmail.com",
                            mailfrom='',
                            smtpuser="",
                            smtppass="",
                            smtpport=587)
        myFile = open("jobs.csv", "r")
        mailer.send(to=["*****@*****.**"],
                    subject="Scrapy mail",
                    body="Did you receive this, oh!",
                    attachs=(("twors", "text/plain", myFile), ))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 56
class MiPipeline(object):
    def open_spider(self, spider):
        self.file = open('LorealProductInfo.csv', 'wb')
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            'title',
                                            'subtitle',
                                            'image_urls',
                                            'image_paths',
                                            'attr',
                                            'price',
                                        ])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if item['title'] is not None:
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem('Drop item without title')
Example 57
class CsvExportPipeline(object):

    fields_to_export = [
        'Title',
        'Author',
        'AuthorLifetime',
        'TotalLength',
        'Language',
        'Genre',
        'Readers',
        'NumberOfReaders',
        'WikipediaLink',
        'AuthorWikipediaLink',
        'CatalogedOnDate',
        'DescriptionText',
        'LibrivoxUrlOfTitle',
        'LinksToAll128kMp3Files',
        'HasCoverArt',
        'HasCdInsertArt'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        FILES_STORE = settings.FILES_STORE
        self.file = open(FILES_STORE + 'Librivox-Book-List.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self.fields_to_export
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        FILES_STORE = settings.FILES_STORE
        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
        title_dir = item['Title']
        for each_char in invalid_chars:
            title_dir = title_dir.replace(each_char, '-')

        if not os.path.exists(FILES_STORE + title_dir):
            os.makedirs(FILES_STORE + title_dir)
        # write txt files
        for each_file in self.fields_to_export:
            txt_file = FILES_STORE + title_dir + '/' + each_file + '.txt'
            with open(txt_file, 'w') as outfile:
                outfile.write(item[each_file])
        return item

    def convert_csv_to_excel(self, csv_file, excel_file):
        workbook = Workbook(excel_file)
        worksheet = workbook.add_worksheet()
        with open(csv_file, 'r', newline='') as f:
            reader = csv.reader(f)
            for r, row in enumerate(reader):
                for c, col in enumerate(row):
                    worksheet.write(r, c, col)
        workbook.close()
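Note that convert_csv_to_excel is never invoked anywhere in this pipeline; a plausible hook, assuming the same file layout, would be the end of spider_closed:

        # hypothetical extension of spider_closed above
        self.convert_csv_to_excel(settings.FILES_STORE + 'Librivox-Book-List.csv',
                                  settings.FILES_STORE + 'Librivox-Book-List.xlsx')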
Example 58
class CsvExportPipeline(object):

    spiders_to_processors = None

    def __init__(self):
        self.files = {}
        self.exporter = None

        self.spiders_to_processors = {
            teams.TeamsSpider.__name__: TeamProcessor,
            team_season.TeamSeasonSpider.__name__: TeamSeasonProcessor,
            players.PlayersSpider.__name__: PlayerProcessor,
            player_season.PlayerSeasonSpider.__name__: PlayerSeasonProcessor,
        }

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()

        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    def spider_opened(self, spider):
        """
        called when the spider is started
        """

        try:
            processor = self.spiders_to_processors[type(spider).__name__]()
        except KeyError:
            self.exporter = None
            return

        file = open(processor.get_storage_filepath(spider), "w+b")

        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """
        called when the spider is finished crawling
        """

        if self.exporter:

            self.exporter.finish_exporting()

            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        """
        called every time an item is yielded from a spider
        """

        if self.exporter:
            self.exporter.export_item(item)

        return item