Example 1
class CsvExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_societies.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['name', 'president', 'email', 'url', 'facebook', 'membership', 'about',
                                          'date_established']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
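For any of these pipelines to take effect, Scrapy must be told to load them. A minimal sketch of the settings.py entry, assuming the class above lives in myproject/pipelines.py (the module path and the priority value 300 are assumptions, not from the original example):

# settings.py (sketch) -- register the pipeline so Scrapy invokes it.
# 'myproject.pipelines' and the priority 300 are placeholder assumptions.
ITEM_PIPELINES = {
    'myproject.pipelines.CsvExportPipeline': 300,
}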
Example 2
class FacupPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline


    # create the files and instantiate the exporter classes, then call
    # start_exporting(), which the item exporter API requires
    def spider_opened(self, spider):
        self.results_csv = open('results_3.csv', 'wb')
        self.missing_csv = open('results_miss_2.csv', 'wb')
        self.results_exporter = CsvItemExporter(self.results_csv)
        self.missing_exporter = CsvItemExporter(self.missing_csv)
        self.results_exporter.start_exporting()
        self.missing_exporter.start_exporting()

    def process_item(self, item, spider):
        # export each item to the results file; items destined for the
        # missing-results file would go through self.missing_exporter instead
        self.results_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.results_exporter.finish_exporting()
        self.missing_exporter.finish_exporting()
        self.results_csv.close()
        self.missing_csv.close()
Example 3
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
#         file = open('%s_data.xml' % spider.name, 'w+b')
        import os
        filePath = os.path.dirname(__file__)
        outputDir = filePath + '/output/'
        file = open(outputDir + '%s_data.csv' % spider.name, 'w+b')
        self.files[spider] = file
#         self.exporter = JsonItemExporter(file)
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 4
def assertExportResult(self, item, expected, **kwargs):
    fp = BytesIO()
    ie = CsvItemExporter(fp, **kwargs)
    ie.start_exporting()
    ie.export_item(item)
    ie.finish_exporting()
    self.assertCsvEqual(fp.getvalue(), expected)
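A possible call site for this helper, modelled on the test in Example 16 further down (TestItem and its name/age fields are assumptions for illustration):

# sketch of a test using the helper above; TestItem is assumed to declare
# 'name' and 'age' fields, matching the expected header order below
def test_simple_export(self):
    self.assertExportResult(
        item=TestItem(name='John', age='22'),
        expected=b'age,name\r\n22,John\r\n',
    )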
Example 5
class DumpToFile(object):
    """
    Dump harvested data into flat file, no other logic is implemented here
    (it's "Dump" :-)
    """
    def __init__(self):
        self.files = {}
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # TODO: verify if still needed for registration of spider_closed/opened event?
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = spider.get_dump_filepath()
        f = open(filename, 'wb')
        self.files[spider.name] = f
        # by default csv module uses Windows-style line terminators (\r\n)
        self.exporter = CsvItemExporter(f, include_headers_line=True, delimiter='|', lineterminator='\n')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider.name)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # for counter, could set att in spider at closing
        self.counter += 1
        return item
Example 6
class CSVPipeline(object):

  def __init__(self):
    self.files = {}

  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def spider_opened(self, spider):
    file = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = file
    self.exporter = CsvItemExporter(file)
    self.exporter.fields_to_export = ["filename", "titel", "publicatie", "dossiernummer", "organisatie", "publicatiedatum", "publicatietype", "file_urls"]
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
Example 7
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,delimiter='\t')
        self.exporter.fields_to_export = ['userId','bookId','name','rating','relativeRating','booklistNum']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 8
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        self.exporter.fields_to_export = ['originalString', 'translatedString']

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 9
class CsvExportPipeline(object):
    """
    app.pipelines.exporter_csv.CsvExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_csv = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file_csv
        self.exporter = CsvItemExporter(file_csv)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_csv = self.files.pop(spider)
        file_csv.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 10
class CsvExportPipeline(object):

    def __init__(self):

        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):

        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    def spider_opened(self, spider):

        file = open('vagas.csv', 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):

        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):

        self.exporter.export_item(item)

        return item
Example 11
class catalogscraperPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open("%s_items.csv" % spider.name, "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ["title"]
        #'subject', 'description', 'creator', 'source', 'published', 'rights', 'citation', 'url']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 12
class WebcrawlerPipeline(object):
    def __init__(self):
        self.files = {}
    
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
    
    def spider_opened(self, spider):
        file = open("%s_urls.txt" % (spider.name), "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, include_headers_line=False)
        self.exporter.start_exporting()
    
    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
    
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 13
class CSVWriterPipeline(object):
    
    def __init__(self, filename):
        self.filename = filename
        
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        filename = settings.get('OUTPUT_FILE')
        pipeline = cls(filename)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open(self.filename, 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 14
class BitcoinTalkCrawlerPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = \
            ['timestamp', 'category_id', 'topic_id', 'topic_title',
             'message_number', 'message_author', 'message_text']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 15
class AmazonCsvPipeline(object):
    def open_spider(self, spider):
        # file object for the CSV output
        self.f = open("Amazon_goods_crawl.csv", "wb")
        # create the CSV exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # start writing the CSV file
        self.csv_exporter.start_exporting()
        # track exported titles so duplicates can be skipped
        self.add_title = set()

    def process_item(self, item, spider):
        if item['title'] in self.add_title:
            print(u'[ERROR] already saved, skipping duplicate: %s' % item['title'])
        else:
            self.add_title.add(item['title'])
            # write one item at a time
            # print(u'[INFO] writing %s to the CSV file' % item['title'])
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish writing the CSV file
        # print(u'[INFO] finished writing the CSV file')
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
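Example 15 only logs when it meets a duplicate title. A variant that drops the duplicate outright, using Scrapy's DropItem exception, might look like this (a sketch, not the original author's code):

from scrapy.exceptions import DropItem

def process_item(self, item, spider):
    # drop any item whose title has already been exported
    if item['title'] in self.add_title:
        raise DropItem('duplicate title: %s' % item['title'])
    self.add_title.add(item['title'])
    self.csv_exporter.export_item(item)
    return item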
Example 16
def test_header_export_two_items(self):
    for item in [self.i, dict(self.i)]:
        output = BytesIO()
        ie = CsvItemExporter(output)
        ie.start_exporting()
        ie.export_item(item)
        ie.export_item(item)
        ie.finish_exporting()
        self.assertCsvEqual(output.getvalue(), b'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
Example 17
class FashionnovaPipeline(object):
    def __init__(self):
        self.filename = 'fashionnova.csv'
    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 18
class TsvPipeline(object):
    def __init__(self):
        self.files = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open(spider.name + '-' + datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + '.tsv*', 'wb')
        self.files[spider] = file

        self.exporter = CsvItemExporter(file, include_headers_line=True, join_multivalued=';', encoding="utf-8", delimiter='\t')
        if spider.name=='user':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'joindate', 'activedate']
        elif spider.name=='subject':
            self.exporter.fields_to_export = ['subjectid', 'order', 'subjectname', 'subjecttype', 'rank', 'date', 'votenum', 'favnum', 'staff', 'relations']
        elif spider.name=='record':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'iid', 'typ', 'state', 'adddate', 'rate', 'tags', 'comment']
        elif spider.name=='index':
            self.exporter.fields_to_export = ['indexid', 'creator', 'favourite', 'date', 'items']
        elif spider.name=='friends':
            self.exporter.fields_to_export = ['user', 'friend']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        filename = file.name
        newname = filename[:-5]+'-'+datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")+'.tsv'
        file.close()
        os.rename(filename, newname)
        if UPLOAD_TO_AZURE_STORAGE:
            block_blob_service = BlockBlobService(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)
            block_blob_service.create_blob_from_path(
                AZURE_CONTAINER,
                newname,
                newname,
                content_settings=ContentSettings(content_type='text/tab-separated-values'),
            )

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 19
class MacystopPipeline(object):
    def __init__(self):
        self.filename = 'topallproduct_all.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 20
class RegistryScraperPipeline(object):

	def __init__(self):
		self.filename = 'registry_scraper/output/employment_site.csv'

	def open_spider(self, spider):
		self.csvfile = open(self.filename, 'wb')
		self.exporter = CsvItemExporter(self.csvfile)
		self.exporter.start_exporting()

	def close_spider(self, spider):
		self.exporter.finish_exporting()
		self.csvfile.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Example 21
class AqiCsvPipeline(object):
    def open_spider(self, spider):
        # file object for the CSV output
        self.f = open('aqi.csv', 'wb')
        # create the CSV exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # start writing the CSV file
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # write one item at a time
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish writing the CSV file
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Example 22
class CsvExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_jobs.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 23
class CrawlerPipeline(object):
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        path = CrawlerPipeline.EXPORT_PATH + "/" + spider.spider_id + "_export.csv"
        export_file = open(path, "ab" if os.path.isfile(path) else "wb")

        self.files[spider.spider_id] = export_file
        self.exporter = CsvItemExporter(export_file)
        self.exporter.fields_to_export = [
            "item_id",
            "url",
            "num_links",
            "num_images",
            "num_scripts",
            "num_styles",
            "headers",
            "text",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        export_file = self.files.pop(spider.spider_id)
        export_file.close()

    def process_item(self, item, spider):
        # This is a common path among ALL crawlers
        self.exporter.export_item(item)
        return item
Example 24
class CsvPipeline(object):

    stats_name = 'csvpipeline'

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.stats = crawler.stats
        self.stats.set_value('done', 0)
        self.settings = crawler.settings

    def open_spider(self, spider):
        if not os.path.exists(spider.scraped_key):
            os.makedirs(spider.scraped_key)

        self.file = open(f'{spider.scraped_key}_result.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        print('\n\n')

        logger.info(f'===> RESULT AT:{spider.scraped_key}_result.csv')

    # @defer.inlineCallbacks
    def process_item(self, item, spider):
        self.stats.inc_value('done')
        self.exporter.export_item(item)
        if self.stats.get_value('done') % 100 == 0:
            logger.info("--> CSV pineline: Done %s/ %s",
                        self.stats.get_value('done'),
                        self.stats.get_value('total'))

        return item
Example 25
class DouJobsPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('dou_jobs.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'title', 'city', 'salary', 'description', 'company', 'date', 'url'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 26
class CSVWriterPipeline(object):
    def open_spider(self, spider):
        file = open(spider.output_filename, 'wb')
        self.file_handle = file
        self.exporter = CsvItemExporter(file, delimiter='\t')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file_handle.close()
        full_path = os.getcwd() + os.sep + spider.output_filename
        sys.stdout.write(full_path)
        sys.stdout.flush()

    def process_item(self, item, spider):
        item.setdefault('uuid', str(uuid.uuid1()))
        item.setdefault('date', datetime.datetime.now().strftime("%Y%m%d%H%M"))
        self.exporter.fields_to_export = spider.fields_to_export
        for field in item.keys():
            if field not in self.exporter.fields_to_export:
                self.exporter.fields_to_export.append(field)
        self.exporter.export_item(item)
        return item
Example 27
class WriteItemsPipeline(object):
    def open_spider(self, spider):
        self.csvfile1 = open('beer_info.csv', 'wb')
        self.exporter1 = CsvItemExporter(self.csvfile1)
        self.exporter1.start_exporting()

        self.csvfile2 = open('beer_reviews.csv', 'wb')
        self.exporter2 = CsvItemExporter(self.csvfile2)
        self.exporter2.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, BeerItem):
            self.exporter1.export_item(item)
        elif isinstance(item, ReviewItem):
            self.exporter2.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter1.finish_exporting()
        self.csvfile1.close()

        self.exporter2.finish_exporting()
        self.csvfile2.close()
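Example 27 routes each item to a separate exporter by type. The item classes it checks against could be declared roughly like this (only the class names come from the example; the field names are assumptions):

import scrapy

class BeerItem(scrapy.Item):
    # field names below are illustrative assumptions
    name = scrapy.Field()
    style = scrapy.Field()

class ReviewItem(scrapy.Item):
    beer_name = scrapy.Field()
    rating = scrapy.Field()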
Example 28
class TortuPipeline:
    def __init__(self):
        self.file = open("./path/data.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.fields_to_export = [
            'Account Owner', 'Account Owner ID', 'Account Name', 'Phone',
            'Account Site', 'Fax', 'Parent Account', 'Parent Account ID',
            'Account Number', 'Account Type', 'Industry', 'Annual Revenue',
            'Created By', 'Created by ID', 'Modified By', 'Modified by ID',
            'Created Time', 'Modified Time', 'Billing Street', 'Billing City',
            'Billing State', 'Billing Code', 'Billing Country', 'Description',
            'Last Activity Time', 'Layout', 'Layout ID', 'Tag',
            'Water System No', 'Website URL', 'Principal Country Served'
        ]
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 29
class MyPipeline(ImagesPipeline):
    def __init__(self, store_uri, download_func=None, settings=None):
        super(MyPipeline, self).__init__(store_uri,
                                         settings=settings,
                                         download_func=download_func)
        self.file = open("data/raw_labels.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def file_path(self, request, response=None, info=None):
        #item=request.meta['item'] # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return image_guid

    def close_spider(self, spider):
        #super(ImagesPipeline, self).close_spider(spider)
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        super(MyPipeline, self).process_item(item, spider)
        self.exporter.export_item(item)
        return item
Example 30
class CsvPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):

        # drop whitespace-only strings from the content list
        item['content'] = [s for s in item['content'] if not s.isspace()]
        self.exporter.export_item(item)
        return item
Example 31
class DzenPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        #self.bot = Bot(token=TOKEN)
        self.file = open('four_results.csv', 'a+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()
        #self.bot.send_message(chat_id=ID, text="Starting")

    def spider_closed(self, spider):
        #self.bot.send_message(chat_id=ID, text="It's broken!!!!!!!!!!!!!!")
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 32
class CryptocurrencyPipeline:
    def __init__(self, file_name):
        self.file = open("bitcoin.csv", "wb")
        self.exporter = CsvItemExporter(self.file)

    @classmethod
    def from_crawler(cls, crawler):
        file_name = getattr(crawler.spider, "name")
        return cls(file_name)

    def open_spider(self, spider):

        self.exporter.start_exporting()

    def process_item(self, item, spider):

        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):

        self.exporter.finish_exporting()
        self.file.close()
Example 33
class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("CsvWriterPipeline spider has been open")
        self.file = open('output.csv', 'a+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        print("CsvWriterPipeline spider has been close")
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        #print(item)
        self.exporter.export_item(item)
        return item
Example 34
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        # print( str(spider) )
        # pdb.set_trace()

        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        # each spider exports a different set of fields
        if spider.name == 'fleetintel_list':
            self.exporter.fields_to_export = ['Company', 'Model', 'MSN', 'YoM', 'Reg', 'Comments']
        elif spider.name == 'Available_assets':
            self.exporter.fields_to_export = [
                'Category', 'Company', 'Contact_webPage', 'Contact_email', 'Contact_phone',
                'Model', 'YoM', 'MSN', 'TFHs_TFCs', 'Engines', 'F_B_E', 'OL_A_S',
                'LU', 'AD', 'ESN', 'L_E_S'
            ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 35
class CSVPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'date',
            'url',
            'category',
            'keywords',
            'title',
            'author',
            'text',
            'title_latin',
            'author_latin',
            'text_latin',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 36
class VivanunciosPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'idPropiedad', 'category_id', 'agent_id', 'user_id',
            'type_popiedad', 'title', 'slug', 'body', 'image_name',
            'image_ext', 'meta_keywords', 'meta_desc', 'status', 'create_date',
            'updated_at', 'address', 'city', 'state', 'zip_propiedad',
            'country', 'latitude', 'longitude', 'price', 'beds', 'services',
            'characteristics', 'bath', 'year', 'features', 'is_delete',
            'featured', 'size', 'related', 'disponible', 'tipoLetra',
            'tipoPublicado', 'url_pagina', 'url_vendedor', 'nombre_vendedor',
            'id_anuncio', 'leyenda', 'sitio'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # build your row to export, then export the row
        self.exporter.export_item(item)
        return item
Example 37
class VnexpressPipeline:
    def __init__(self):
        self.file = open('items2.csv', 'ab+')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()
        self.seen = set()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_duplicate(self, item):
        link = item['link']

        # if link in self.seen:
        #     raise DropItem('Duplicate item %s' % link)

        self.seen.add(link)

    def pre_process(self, item):
        item['category'] = str(item['category'])
        item['date'] = str(item['date'])
        item['title'] = str(item['title'])
        item['body'] = str(item['body'])
        item['comment'] = str(item['comment'])
        item['link'] = str(item['link'])

        # get id user comment
        user = str(item['user'])
        user = str(re.findall(r'\d+', user))
        print(type(user))
        item['user'] = user

    def process_item(self, item, spider):
        self.pre_process(item)
        self.exporter.export_item(item)

        return item
Example 38
class BigMLPipeline(BigMLAPIMixIn):

    AUTH_ERRMSG = (
        "{errtype:s} BigML credentials. Please supply BIGML_USERNAME"
        " and BIGML_API_KEY as either Scrapy settings or environment"
        " variables."
    )

    def __init__(self, username=None, api_key=None, source_name=None, dev_mode=None):
        self.source_name = source_name
        self.get_bigml_api(username, api_key, dev_mode=dev_mode)

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(
            username=crawler.settings["BIGML_USERNAME"],
            api_key=crawler.settings["BIGML_API_KEY"],
            source_name=crawler.settings.get("BIGML_SOURCE_NAME", "Scrapy"),
            dev_mode=crawler.settings.getbool("BIGML_DEVMODE", False),
        )
        o.crawler = crawler
        o.settings = crawler.settings
        return o

    def open_spider(self, spider):
        self.tempfile = TemporaryFile(prefix="bigml-feed-")
        self.exporter = CsvItemExporter(self.tempfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.tempfile.seek(0)
        self.export_to_bigml(self.tempfile, self.source_name)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 39
class CSVPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        if spider.name == 'realestate':
            self.file = open('current_listing.csv', 'w+b')
        else:
            self.file = open('past_listing.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)

        return item
Example 40
class HomeWorkMarketCsv(object):
    def __init__(self):
        self.file = open("jobs.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # use Gmail SMTP to send the exported file by mail
        mailer = MailSender(smtphost="smtp.gmail.com",
                            mailfrom='',
                            smtpuser="",
                            smtppass="",
                            smtpport=587)
        myFile = open("jobs.csv", "r")
        mailer.send(to=["*****@*****.**"],
                    subject="Scrapy mail",
                    body="Did you receive this, oh!",
                    attachs=(("twors", "text/plain", myFile), ))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 41
class CsvWriterPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = "output_rmob_" + time.strftime("%Y%m%d-%H%M%S")
        self.file = open(filename + '.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            "id", "brand", "full_title", "year", "transmission", "price"
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 42
class ToCsvPipeline(object):
    """Outputs data to a csv file."""
    def open_spider(self, spider):
        scrapedate = datetime.now().strftime('%Y%m%d_%H%M%S')
        if not path.exists(settings.CSV_STORE):
            mkdir(settings.CSV_STORE)
        assert path.isdir(settings.CSV_STORE), \
            '{} is not a directory'.format(settings.CSV_STORE)
        pth_csv = path.join(settings.CSV_STORE,
                            'data_{}.csv'.format(scrapedate))
        self.file = open(pth_csv, 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = [
            'title', 'address', 'cuisines', 'opening', 'phone', 'website'
        ]
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
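Example 42 reads its output directory from the project settings module. A minimal sketch of the corresponding entry (the value shown is an assumption; any writable path works):

# settings.py (sketch): directory where ToCsvPipeline writes its dated CSV files
CSV_STORE = 'csv_store'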
Example 43
class CityAvgPricePipeline(object):
    def __init__(self):
        self.files = {}
        self.file_path = './data/avg.csv'
        self.num = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open(self.file_path, 'a+b')
        self.files[spider] = file
        kwargs = {
            'fields_to_export': ['city_name', 'avg_price', 'last_price']}
        self.exporter = CsvItemExporter(file, include_headers_line=False, **kwargs)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        send_mail("%s is closed!,time is  %s" % (spider.name, time.ctime()))

        print("spider closed!")

    def process_item(self, item, spider):
        if isinstance(item, CityAvgItem):
            self.exporter.export_item(item)
            self.num += 1
            if self.num % 100 == 0:
                print("save avg_data %s times" % self.num)
        return item
Example 44
class ExpertsExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = 'CUNY Law.csv'
        self.export_fields = [
            "name", "phone", "email", "biography", "headshot", "faculty_page",
            "areas_of_expertise_1", "areas_of_expertise_2",
            "areas_of_expertise_3", "areas_of_expertise_4",
            "areas_of_expertise_5", "areas_of_expertise_6",
            "areas_of_expertise_7", "areas_of_expertise_8",
            "areas_of_expertise_9"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        output_file = open(self.file_name, 'w+b')
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 45
class HarvestmanPipeline(object):
    """Pipepline definition for spiders in the harvestman_spider project"""

    def __init__(self):
        """__init__, innit."""
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        csv_file = settings.CSV_FILE_OUTPUT_DIR.format(
            spider.base_url.split('/')[2],
            datetime.date.today().strftime('%Y-%m-%d'))

        if spider.name == 'google_serp_spider':
            file = open(csv_file, 'wb')
            self.files[spider] = file
            # note this outputs as a tab separated csv, rather than comma
            self.exporter = CsvItemExporter(file, delimiter='\t')
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.export_item(item)
        return item
Example 46
class CSVExportPipelines(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,
                                        include_headers_line=True,
                                        join_multivalued=',',
                                        lineterminator='\n')
        self.exporter.fields_to_export = [
            'date', 'episode', 'artist', 'song', 'link', 'image'
        ]

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        required_fields = ['episode']  # your list of required fields
        if all(field in item for field in required_fields):
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem("Item null")
Example 47
class EnrolldataPipeline(object):
    """
                 company_name             会社名
                 job_name                ポジション 
                 link_url                募集詳細link   https://type.jp
                 nearest_station         住所
                 longitude                 経度
                 latitude                  緯度
                 source                    出所
                 occupation                職種
                 annual_income_min         年収min
                 annual_income_max         年収max
                 published_time            サイト内での掲載時間
                 create_data              クロリングした時間 

    """
    def open_spider(self, spider):
        self.file = open("test.csv", "wb")
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            "company_name", "job_name",
                                            "link_url", "nearest_station",
                                            "longitude", "latitude", "source",
                                            "occupation", "annual_income_min",
                                            "annual_income_max",
                                            "published_time", "create_data"
                                        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Example 48
class JDDetailCsvPipeline(object):
    """保存为CSV格式文件的管道类"""
    def open_spider(self,spider):
        # 保存csv数据库文件对象
        self.f = open("jddetail.csv", "wb")
        # 创建csv文件读写对象
        self.csv_exporter = CsvItemExporter(self.f)
        # 开始进行csv文件的读写
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # write one item at a time
        print(type(item))
        print(item)
        #print(chardet.detect(list(dict(item).values())[0]))
        print("--" * 50)
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish writing the CSV file
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Example 49
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = f'silver_sneakers_{time.time()}.csv'
        self.export_fields = [
            'address1', 'amenityIDs', 'city', 'corpID', 'counter',
            'flexClasses', 'genderSpecific', 'hasBoomClass', 'hasFlex',
            'hasSilverSneakersClass', 'locID', 'locationType', 'mileDistance',
            'name', 'phone', 'state', 'upmcPersonalTrainer', 'zipCode'
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output_file = open(
            self.file_name,
            'w+b',
        )
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example 50
class MiPipeline(object):
    def open_spider(self, spider):
        self.file = open('LorealProductInfo.csv', 'wb')
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            'title',
                                            'subtitle',
                                            'image_urls',
                                            'image_paths',
                                            'attr',
                                            'price',
                                        ])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if item['title'] is not None:
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem('Drop item without title')
Example 51
class CSVPipeline(object):
    def __init__(self):
        self.files = {}
        self.exporter = None
        self.exporter1 = None

    def open_spider(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        file1 = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.files['covid'] = file1
        self.exporter = CsvItemExporter(file)
        self.exporter1 = CsvItemExporter(file1)
        self.exporter.fields_to_export = [
            'countries', 'total_cases', 'new_cases', 'total_recovered',
            'active_cases', 'total_cases_per_million', 'death_per_million',
            'total_deaths', 'new_deaths'
        ]
        self.exporter1.fields_to_export = [
            'new_cases', 'total_cases', 'total_deaths', 'new_deaths'
        ]
        self.exporter.start_exporting()
        self.exporter1.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.exporter1.finish_exporting()
        file = self.files.pop(spider)
        file1 = self.files.pop('covid')
        file.close()
        file1.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        self.exporter1.export_item(item)
        return item
Example 52
class NicheItemPipeline(object):
    def open_spider(self, spider):
        self.entFile = open(FILENAME_ENTITIES, 'wb')
        self.entExporter = CsvItemExporter(self.entFile)
        self.entExporter.start_exporting()

        self.factFile = open(FILENAME_FACTS, 'wb')
        self.factExporter = CsvItemExporter(self.factFile)
        self.factExporter.start_exporting()

        self.grdFile = open(FILENAME_GRADES, 'wb')
        self.grdExporter = CsvItemExporter(self.grdFile)
        self.grdExporter.start_exporting()

        self.bdgFile = open(FILENAME_BADGES, 'wb')
        self.bdgExporter = CsvItemExporter(self.bdgFile)
        self.bdgExporter.start_exporting()

    def close_spider(self, spider):
        self.entExporter.finish_exporting()
        self.entFile.close()

        self.factExporter.finish_exporting()
        self.factFile.close()

        self.grdExporter.finish_exporting()
        self.grdFile.close()

        self.bdgExporter.finish_exporting()
        self.bdgFile.close()

    def process_item(self, item, spider):
        if isinstance(item, EntityItem):
            self.entExporter.export_item(item)
        elif isinstance(item, FactItem):
            self.factExporter.export_item(item)
        elif isinstance(item, GradeItem):
            self.grdExporter.export_item(item)
        elif isinstance(item, BadgeItem):
            self.bdgExporter.export_item(item)
        return item
Example 53
class profitItemPipline(object):
    def __init__(self):
        self.file_profit = open('profit.csv', 'wb')
        self.file_assets = open('asset.csv', 'wb')
        self.file_cash = open('cash.csv', 'wb')

    def open_spider(self, spider):
        self.exporter_profit = CsvItemExporter(self.file_profit, encoding='utf-8')
        self.exporter_assets = CsvItemExporter(self.file_assets, encoding='utf-8')
        self.exporter_cash = CsvItemExporter(self.file_cash, encoding='utf-8')
        self.exporter_profit.fields_to_export = [
            'STOCK_NUMBER', 'YEAREND_DATE_PROFIT', 'TURNOVER', 'PBT', 'OPER_PROFIT',
            'NET_PROF', 'INCOME_NETTRADING', 'INCOME_NETFEE', 'INCOME_INTEREST',
            'EPS', 'DPS'
        ]
        self.exporter_assets.fields_to_export = [
            'STOCK_NUMBER', 'YEAREND_DATE_ASSETS', 'TOTAL_LIAB', 'TOTAL_DEBT',
            'TOTAL_ASS', 'OTHER_ASS', 'LOAN_TO_BANK', 'INVENTORY', 'FIX_ASS',
            'FINANCIALASSET_SALE', 'EQUITY', 'DERIVATIVES_LIABILITIES',
            'DERIVATIVES_ASSET', 'DEPOSITS_FROM_CUSTOMER', 'CURR_LIAB',
            'CURR_ASS', 'CASH_SHORTTERMFUND', 'CASH'
        ]
        self.exporter_cash.fields_to_export = [
            'STOCK_NUMBER', 'YEAREND_DATE_CASH', 'CF_NCF_OPERACT', 'CF_INV',
            'CF_INT_REC', 'CF_INT_PAID', 'CF_FIN_ACT', 'CF_EXCH', 'CF_END',
            'CF_DIV_REC', 'CF_DIV_PAID', 'CF_CHANGE_CSH', 'CF_BEG'
        ]
        self.exporter_profit.start_exporting()
        self.exporter_assets.start_exporting()
        self.exporter_cash.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, profitItem):
            self.exporter_profit.export_item(item)
        elif isinstance(item, assetsItem):
            self.exporter_assets.export_item(item)
        else:
            self.exporter_cash.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter_profit.finish_exporting()
        self.exporter_assets.finish_exporting()
        self.exporter_cash.finish_exporting()
        self.file_profit.close()
        self.file_assets.close()
        self.file_cash.close()
Example 54
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}

        if not os.path.exists(OUTPUT_PATH):
            os.makedirs(OUTPUT_PATH)
        utc_time = datetime.datetime.utcnow()
        tz_info = pytz.timezone('Asia/Kolkata')
        utc = pytz.utc
        time_local = utc.localize(utc_time).astimezone(tz_info)
        self.start_formatted_time = time_local.strftime('%d%b%Y_%Hhr%Mmin')
        self.file_name = '{}/Amazon_Trade-In_Buy_Pricing_New{}.csv'.format(
            OUTPUT_PATH, self.start_formatted_time)

        self.export_fields = [
            'title', 'isbn10', 'isbn13', 'purchase_cost', 'shipping_cost',
            'processing_cost', 'net_buying_cost'
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.start_time = datetime.datetime.utcnow()
        tz_info = pytz.timezone('Asia/Kolkata')
        utc = pytz.utc
        time_local = utc.localize(self.start_time).astimezone(tz_info)
        log.info('[+++++] Starting Time (IST;Asia-Mumbai): {}'.format(
            time_local.strftime('%b %d, %Y @%Hhr %Mmin %Ssec')))
        output_file = open(self.file_name, 'w+b')
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

        utc_time = datetime.datetime.utcnow()
        tz_info = pytz.timezone('Asia/Kolkata')
        utc = pytz.utc
        started_time = utc.localize(self.start_time).astimezone(tz_info)
        finished_time = utc.localize(utc_time).astimezone(tz_info)

        self.finish_formatted_time = finished_time.strftime('%d%b%Y_%Hhr%Mmin')

        new_name = self.file_name.replace(self.start_formatted_time,
                                          self.finish_formatted_time)
        os.rename(self.file_name, new_name)

        log.info('[+++++] Output file path: {}'.format(OUTPUT_PATH))
        log.info('[+++++] Output file name: {}'.format(
            new_name.rsplit('/', 1)[-1]))

        time_taken = self.strfdelta(
            utc_time - self.start_time,
            "{hours} hours {minutes} minutes {seconds} seconds")
        log.info('[+++++] Starting Time (IST; Asia/Kolkata): {}'.format(
            started_time.strftime('%b %d, %Y @%Hhr %Mmin %Ssec')))
        log.info('[+++++] Finished Time (IST; Asia/Kolkata): {}'.format(
            finished_time.strftime('%b %d, %Y @%Hhr %Mmin %Ssec')))
        log.info('[+++++] Total Time Taken: {}'.format(time_taken))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def strfdelta(self, tdelta, fmt):
        # count whole days too, so runs longer than 24 hours report correctly
        # (timedelta.seconds alone only covers the within-day remainder)
        d = dict()
        d["hours"], rem = divmod(tdelta.days * 86400 + tdelta.seconds, 3600)
        d["minutes"], d["seconds"] = divmod(rem, 60)
        return fmt.format(**d)
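
strfdelta only fills the placeholders named in the format string. A quick standalone check of the day-aware version (not part of the source):

    from datetime import timedelta

    def strfdelta(tdelta, fmt):
        d = dict()
        d["hours"], rem = divmod(tdelta.days * 86400 + tdelta.seconds, 3600)
        d["minutes"], d["seconds"] = divmod(rem, 60)
        return fmt.format(**d)

    print(strfdelta(timedelta(hours=26, minutes=5, seconds=30),
                    "{hours} hours {minutes} minutes {seconds} seconds"))
    # -> 26 hours 5 minutes 30 seconds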
Example n. 55
class CsvExportPipeline(object):

    def __init__(self):
        self.files = {}
        self.exporter = None

        # map each spider class to the processor that knows where its CSV goes
        self.spiders_to_processors = {
            teams.TeamsSpider.__name__: TeamProcessor,
            team_season.TeamSeasonSpider.__name__: TeamSeasonProcessor,
            players.PlayersSpider.__name__: PlayerProcessor,
            player_season.PlayerSeasonSpider.__name__: PlayerSeasonProcessor,
        }

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()

        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    def spider_opened(self, spider):
        """
        called when the spider is started
        """

        try:
            processor = self.spiders_to_processors[type(spider).__name__]()
        except KeyError:
            self.exporter = None
            return

        file = open(processor.get_storage_filepath(spider), "w+b")

        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """
        called when the spider is finished crawling
        """

        if self.exporter:

            self.exporter.finish_exporting()

            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        """
        called every time an item is yielded from a spider
        """

        if self.exporter:
            self.exporter.export_item(item)

        # always pass the item along, even when this pipeline has no exporter
        return item
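
The *Processor classes referenced above are defined elsewhere; the only contract this pipeline relies on is a get_storage_filepath(spider) method. A minimal hypothetical sketch:

    # Hypothetical processor; the path below is illustrative, not from the source.
    class TeamProcessor(object):
        def get_storage_filepath(self, spider):
            return '/tmp/%s.csv' % spider.name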
Example n. 56
class CsvPipeline(object):
    """
    Pipeline uses a built in feed exporter from Scrapy currently
    Outputs csv files for local development to the /local_output/ folder
    """

    # repository root + /city_scrapers/local_outputs/
    path = subprocess.check_output(
        ['git', 'rev-parse', '--show-toplevel']
    ).decode('utf-8').rstrip() + '/city_scrapers/local_outputs/'

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        self.stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        self.fname = '{}{}_{}.csv'.format(self.path, spider.name, self.stamp)
        file = open(self.fname, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['agency_name', '_type',
                                          'id', 'name', 'description',
                                          'classification', 'start_time',
                                          'end_time', 'timezone', 'status',
                                          'all_day', 'location_name',
                                          'location_url',
                                          'location_address',
                                          'source_url', 'source_note',
                                          'scraped_time'
                                          ]
        self.exporter.start_exporting()

    def spider_closed(self, spider, deleteme=False):
        # deleteme is only honored when called manually; the signal never sets it
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        if deleteme:
            remove(self.fname)

    def process_item(self, item, spider):
        new_item = item.copy()

        # format times; end_time may be missing or None
        new_item['start_time'] = datetime.datetime.strftime(new_item['start_time'], '%Y-%m-%d %H:%M')
        try:
            new_item['end_time'] = datetime.datetime.strftime(new_item['end_time'], '%Y-%m-%d %H:%M')
        except (KeyError, TypeError):
            pass

        # flatten location and sources
        new_item['location_url'] = get_key(new_item, 'location.url')
        new_item['location_name'] = get_key(new_item, 'location.name')
        new_item['location_address'] = get_key(new_item, 'location.address')
        new_item['source_url'] = new_item.get('sources', [{'url': ''}])[0].get('url', '')
        new_item['source_note'] = new_item.get('sources', [{'note': ''}])[0].get('note', '')
        new_item['agency_name'] = spider.long_name
        new_item['scraped_time'] = datetime.datetime.strftime(datetime.datetime.strptime(self.stamp, '%Y%m%d_%H%M'), '%Y-%m-%d %H:%M')
        new_item = {k: self._format_values(k, v) for k, v in new_item.items() if k in self.exporter.fields_to_export}

        self.exporter.export_item(new_item)
        return new_item

    def _format_values(self, k, v):
        if ((v is None) or v == '') and (k not in ['start_time', 'end_time']):
            return 'N/A'
        if k == 'location_name':
            return ' '.join([w.capitalize() for w in v.split(' ')])
        if isinstance(v, bool):
            return int(v)
        return v
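
None of these pipelines runs unless it is registered in the project's settings; a typical entry for the pipeline above would look like this (module path and priority are assumptions):

    # settings.py -- illustrative registration
    ITEM_PIPELINES = {
        'city_scrapers.pipelines.CsvPipeline': 300,
    }

Example n. 57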
class InventtorySpidersPipeline(object):

    # Base output directory; option_1 (or the commented-out option_2 variant in
    # the original) selects the run variant and is assumed to be defined at
    # module level.
    BASE_PATH = '/Users/romerchris/Desktop/Desktop/Companies/inventtory/inventtory/PAMM/Complements/Scraping/Scrapy/inventtory_spiders_CHRIS_COPY/inventtory_data_scraping/inventtory_data_scraping/inventtory_spiders/'

    ESPACENET_FIELDS = [
        'document_url', 'key_identifier', 'patent_country', 'patent_country_code',
        'patent_application_number', 'patent_number', 'patent_name', 'page_bookmark',
        'inventors', 'applicants', 'classification_international',
        'classification_cooperative', 'application_number', 'priority_numbers',
        'abstract', 'patent_description', 'original_claims', 'claims_tree',
        'cited_documents', 'citing_documents', 'INPADOC_legal_status',
        'INPADOC_patent_family'
    ]

    USPTO_FIELDS = [
        'document_url', 'key_identifier', 'patent_country_code', 'patent_country',
        'patent_application_number', 'patent_number', 'patent_name',
        'patent_publish_date', 'patent_kind_code', 'abstract', 'inventors',
        'applicant_1', 'applicant_2', 'applicant_3', 'applicant_4', 'applicant_5',
        'applicant_6', 'applicant_7', 'applicant_8', 'applicant_9', 'applicant_10',
        'assignee', 'family_ID', 'application_number', 'filed_date', 'pct_filed',
        'pct_number', 'pct_pub_number', 'pct_pub_date',
        'related_US_patent_document_1', 'related_US_patent_document_2',
        'related_US_patent_document_3', 'related_US_patent_document_4',
        'related_US_patent_document_5', 'related_US_patent_document_6',
        'related_US_patent_document_7', 'related_US_patent_document_8',
        'related_US_patent_document_9', 'related_US_patent_document_10',
        'current_US_class', 'current_CPC_class', 'current_international_class',
        'class_at_publication', 'international_class', 'field_of_search',
        'prior_pub_data_document_identifier', 'prior_pub_data_publication_date',
        'references_cited', 'references_primary_examiner',
        'references_assistant_examiner', 'references_attorney_agent_or_firm',
        'ref_cited_US_patent_documents', 'ref_cited_foreign_patent_documents',
        'ref_cited_other_references', 'other_references',
        'other_references_primary_examiner', 'other_references_assistant_examiner',
        'other_references_attorney_agent_or_firm', 'patent_claims',
        'parent_case_text', 'patent_description'
    ]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        spider_name = str(spider.name)
        spider_path = self.BASE_PATH + spider_name + '_' + option_1
        csv_path = spider_path + '/' + 'CSV_' + spider_name
        txt_path = spider_path + '/' + 'TXT_' + spider_name
        log_path = spider_path + '/' + 'LOG_' + spider_name
        for path in (spider_path, csv_path, txt_path, log_path):
            if not os.path.exists(path):
                os.makedirs(path)

        # number the new CSV after the files already present in csv_path
        try:
            num = len([name for name in os.listdir(csv_path)
                       if os.path.isfile(os.path.join(csv_path, name))]) + 1
        except OSError:
            num = 1
        name_of_csv = '%s_%d.csv' % (spider_name, num)
        new_file = open(csv_path + '/' + name_of_csv, 'w+b')
        self.files[spider] = new_file
        # the original passed two stray positional arguments ('data', 'row'),
        # which CsvItemExporter would misread as include_headers_line and
        # join_multivalued; the file object is all it needs here
        self.exporter = CsvItemExporter(new_file)

        # set schema
        if 'espacenet' in spider_name:
            self.exporter.fields_to_export = self.ESPACENET_FIELDS
        elif 'uspto' in spider_name:
            self.exporter.fields_to_export = self.USPTO_FIELDS
        self.exporter.start_exporting()

        # generate the matching conversion script next to the scraped data
        if 'espacenet' in spider_name:
            convert_script = 'format_data_ESPACENET.py'
        elif 'uspto' in spider_name:
            convert_script = 'format_data_USPTO.py'
        else:
            return  # no conversion template for other spiders

        with open(convert_script) as f_old:
            new_rows = f_old.readlines()
        # line 13 of the template hard-codes the txt directory and line 16 the
        # csv file it reads; rewrite both for this run
        find1 = re.match(r"(.*?=\s*)?'/Users.*", new_rows[12]).group(1)
        find2 = re.match(r"with\s*open\('(.*?)'.*", new_rows[15]).group(1)
        new_rows[12] = find1 + "'" + txt_path + "'"
        new_rows[15] = new_rows[15].replace(find2, 'CSV_' + spider_name + '/' + name_of_csv)
        complete_name = os.path.join(spider_path, 'convert_' + spider_name + '.py')
        with open(complete_name, 'w') as f_new:
            f_new.writelines(new_rows)

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        new_file = self.files.pop(spider)
        new_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example n. 58
class CsvExportPipeline(object):

    fields_to_export = [
        'Title',
        'Author',
        'AuthorLifetime',
        'TotalLength',
        'Language',
        'Genre',
        'Readers',
        'NumberOfReaders',
        'WikipediaLink',
        'AuthorWikipediaLink',
        'CatalogedOnDate',
        'DescriptionText',
        'LibrivoxUrlOfTitle',
        'LinksToAll128kMp3Files',
        'HasCoverArt',
        'HasCdInsertArt'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        FILES_STORE = settings.FILES_STORE
        self.file = open(FILES_STORE + 'Librivox-Book-List.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self.fields_to_export
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        FILES_STORE = settings.FILES_STORE
        # strip characters that are invalid in directory names
        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
        title_dir = item['Title']
        for each_char in invalid_chars:
            title_dir = title_dir.replace(each_char, '-')

        if not os.path.exists(FILES_STORE + title_dir):
            os.makedirs(FILES_STORE + title_dir)
        # write one txt file per exported field; guard against missing fields
        for each_field in self.fields_to_export:
            txt_file = FILES_STORE + title_dir + '/' + each_field + '.txt'
            with open(txt_file, 'w') as outfile:
                outfile.write(str(item.get(each_field, '')))
        return item

    def convert_csv_to_excel(self, csv_file, excel_file):
        # Workbook presumably comes from xlsxwriter; note that csv.reader
        # needs a text-mode file in Python 3, not 'rb'
        workbook = Workbook(excel_file)
        worksheet = workbook.add_worksheet()
        with open(csv_file, newline='') as f:
            reader = csv.reader(f)
            for r, row in enumerate(reader):
                for c, col in enumerate(row):
                    worksheet.write(r, c, col)
        workbook.close()
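
A usage sketch for the conversion helper (file names illustrative, and assuming the CSV export has already been closed):

    # Illustrative: convert the exported CSV to an Excel workbook.
    pipeline = CsvExportPipeline()
    pipeline.convert_csv_to_excel(settings.FILES_STORE + 'Librivox-Book-List.csv',
                                  settings.FILES_STORE + 'Librivox-Book-List.xlsx')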
Example n. 59
class ArticlePipeline(object):
    def __init__(self):
        # set up comment file output
        self.comment_file = open("./data/comments.csv", 'wb')
        self.comment_exporter = CsvItemExporter(self.comment_file)
        self.comment_exporter.start_exporting()

        # set up article output fields here
        self.article_file = open("./data/article.csv", 'wb')
        fields_to_export = [
            'aid', 'board', 'page', 'author', 'reply', 'category', 'title',
            'time', 'ip', 'country', 'country_flag', 'content', 'signature',
            'comments', 'comment_users', 'score', 'plus', 'minus', 'edit'
        ]
        self.article_exporter = CsvItemExporter(
            self.article_file, fields_to_export=fields_to_export)
        self.article_exporter.start_exporting()

        # ip-country check
        self.country_dbcon = geoipdb.Reader('./db/GeoLite2-Country.mmdb')
        with open('./db/country_dict.pckl', 'rb') as infile:
            self.country_dict = pickle.load(infile)

    def close_spider(self, spider):
        self.comment_exporter.finish_exporting()
        self.comment_file.close()
        self.article_exporter.finish_exporting()
        self.article_file.close()
        self.country_dbcon.close()

    # item parsing

    def _check_country(self, item):
        flag = 0
        if item['country'] is None or item[
                'country'] not in self.country_dict.values():
            flag = 1
            try:
                lookup = self.country_dbcon.country(item['ip'])
                country = self.country_dict.get(lookup.country.iso_code,
                                                'country_na')
                return country, flag
            except geoip2.errors.AddressNotFoundError:
                return 'ip_na', flag
            except ValueError:
                return 'ip_wrong', flag
        else:
            return item['country'], flag

    def process_item(self, item, spider):
        # clean article data
        item['time'] = datetime.strptime(item['time'], '%a %b  %d %H:%M:%S %Y')
        item['country'] = re.sub(r'[()]', '', item['country']) if item['country'] else None
        country, country_flag = self._check_country(item)
        item['country'] = country
        item['country_flag'] = country_flag
        title_re = re.search(r'(Re: )?(\[.+?\])?(.+)', item['title'])
        item['reply'] = 1 if title_re.group(1) else 0
        item['category'] = re.sub(
            r'[\[\]]', '', title_re.group(2)) if title_re.group(2) else None
        item['title'] = title_re.group(3).strip()
        item['content'] = item['content'].strip() if item['content'] else None
        item['signature'] = re.sub(
            r'^[-\n]+', '', item['signature']) if item['signature'] else None

        # clean comment data and dump
        data = zip(item['comm_author'], item['comm_time'], item['comm_type'],
                   item['comm_content'])
        comms = pd.DataFrame(
            data,
            columns=['comm_author', 'comm_time', 'comm_type', 'comm_content'])

        def time_translate(x):
            try:
                return datetime.strptime(
                    '{}/{}'.format(item['time'].year, x.strip()),
                    '%Y/%m/%d %H:%M')
            except ValueError:
                return None

        comms['comm_time'] = comms['comm_time'].apply(time_translate)
        comms['comm_type'] = comms['comm_type'].str.strip().replace({
            u'→': 0,
            u'推': 1,
            u'噓': -1
        })
        comms['comm_content'] = comms['comm_content'].replace(
            '^:?', '', regex=True).str.strip()

        for idx, row in comms.iterrows():
            commItem = CommentItem()
            commItem['aid'] = item['aid']
            commItem['board'] = item['board']
            commItem['comm_author'] = row['comm_author']
            commItem['comm_time'] = row['comm_time']
            commItem['comm_type'] = row['comm_type']
            commItem['comm_content'] = row['comm_content']
            self.comment_exporter.export_item(commItem)

        # calculate comment columns for article
        item['comments'] = len(comms)
        item['comment_users'] = len(comms.comm_author.unique())
        if item['comments']:
            score_stat = comms.comm_type.value_counts()
            item['plus'] = score_stat[1] if 1 in score_stat.index else 0
            item['minus'] = score_stat[-1] if -1 in score_stat.index else 0
            item['score'] = item['plus'] - item['minus']
        else:
            item['plus'] = 0
            item['minus'] = 0
            item['score'] = 0

        # dump article data
        self.article_exporter.export_item(item)
        return item
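
CommentItem is imported from the project's items module, which is not shown; from the fields populated above, a minimal sketch of it would be:

    # Sketch reconstructed from the fields used in process_item above.
    import scrapy

    class CommentItem(scrapy.Item):
        aid = scrapy.Field()
        board = scrapy.Field()
        comm_author = scrapy.Field()
        comm_time = scrapy.Field()
        comm_type = scrapy.Field()
        comm_content = scrapy.Field()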