Code Example #1
File: pipelines.py Project: 2khc/Python-Projects
(The examples below assume the usual Scrapy imports, typically from scrapy.exporters import CsvItemExporter (scrapy.contrib.exporter in older releases) and from scrapy import signals, unless shown otherwise.)
class CsvExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_societies.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['name', 'president', 'email', 'url', 'facebook', 'membership', 'about',
                                          'date_established']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
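To have Scrapy actually run a pipeline like this, it must be registered in the project's settings.py. A minimal sketch, assuming the project module is named myproject (the path is illustrative):

# settings.py (the module path 'myproject.pipelines' is an assumption)
ITEM_PIPELINES = {
    'myproject.pipelines.CsvExportPipeline': 300,  # lower numbers run earlier (0-1000)
}

Note that fields_to_export, as set above, also fixes the column order; without it, CsvItemExporter takes the header from the fields of the first item it sees.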
Code Example #2
File: pipelines.py Project: uuhako/myScrapy
class JsonExportPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
#         file = open('%s_data.xml' % spider.name, 'w+b')
        import os
        filePath = os.path.dirname(__file__)
        outputDir = filePath + '/output/'
        os.makedirs(outputDir, exist_ok=True)  # make sure the output directory exists before opening the file
        file = open(outputDir + '%s_data.csv' % spider.name, 'w+b')
        self.files[spider] = file
#         self.exporter = JsonItemExporter(file)
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
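The commented-out lines show this pipeline previously wrote JSON; because all Scrapy item exporters share the start_exporting/export_item/finish_exporting interface, swapping formats is a one-line change. A sketch of the JSON variant of spider_opened, assuming the same surrounding class:

from scrapy.exporters import JsonItemExporter

    def spider_opened(self, spider):
        file = open('%s_data.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()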
Code Example #3
File: pipelines.py Project: david-macleod/facup
class FacupPipeline(object):

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline


    # create the files and instantiate the exporter class,
    # then run the start_exporting() method, which item exporter classes require
    def spider_opened(self, spider):
        self.results_csv = open('results_3.csv', 'wb')
        self.missing_csv = open('results_miss_2.csv', 'wb')
        self.results_exporter = CsvItemExporter(self.results_csv)
        self.missing_exporter = CsvItemExporter(self.missing_csv)
        self.results_exporter.start_exporting()
        self.missing_exporter.start_exporting()

    def process_item(self, item, spider):
        # write each scraped item to the results file
        self.results_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        self.results_exporter.finish_exporting()
        self.missing_exporter.finish_exporting()
        self.results_csv.close()
        self.missing_csv.close()
Code Example #4
class CSVWriterPipeline(object):
    
    def __init__(self,filename):
        self.filename = filename
        
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        filename = settings.get('OUTPUT_FILE')
        pipeline = cls(filename)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open(self.filename, 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.encoding = 'utf-8'
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
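Since this variant pulls its output path from the crawler settings, OUTPUT_FILE has to be supplied, either in settings.py or per run on the command line (the spider name below is hypothetical):

# settings.py
OUTPUT_FILE = 'items.csv'

# or per run:
#   scrapy crawl myspider -s OUTPUT_FILE=items.csv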
Code Example #5
File: test_exporters.py Project: Rokicto/scrapy
 def assertExportResult(self, item, expected, **kwargs):
     fp = BytesIO()
     ie = CsvItemExporter(fp, **kwargs)
     ie.start_exporting()
     ie.export_item(item)
     ie.finish_exporting()
     self.assertCsvEqual(fp.getvalue(), expected)
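This test from Scrapy's own suite shows the exporter only needs a binary file-like object, which makes it easy to exercise outside a crawl. A minimal standalone sketch (the item dict is made up):

from io import BytesIO
from scrapy.exporters import CsvItemExporter

fp = BytesIO()
exporter = CsvItemExporter(fp)
exporter.start_exporting()
exporter.export_item({'name': 'Ada', 'age': 36})
exporter.finish_exporting()
print(fp.getvalue().decode('utf-8'))  # header row followed by one data row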
Code Example #6
File: pipelines.py Project: mart2010/brd
class DumpToFile(object):
    """
    Dump harvested data into flat file, no other logic is implemented here
    (it's "Dump" :-)
    """
    def __init__(self):
        self.files = {}
        self.counter = 0

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        # TODO: verify if still needed for registration of spider_closed/opened event?
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        filename = spider.get_dump_filepath()
        f = open(filename, 'w')
        self.files[spider.name] = f
        # by default csv module uses Windows-style line terminators (\r\n)
        self.exporter = CsvItemExporter(f, include_headers_line=True, delimiter='|', lineterminator='\n')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider.name)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # for counter, could set att in spider at closing
        self.counter += 1
        return item
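The delimiter and lineterminator arguments work because CsvItemExporter forwards extra keyword arguments to the underlying csv.writer, so other csv dialect options should pass through the same way. A sketch (the file object f is assumed to be open in binary mode):

import csv
from scrapy.exporters import CsvItemExporter

exporter = CsvItemExporter(f, delimiter='|', lineterminator='\n', quoting=csv.QUOTE_ALL)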
Code Example #7
class CsvExportPipeline(object):
    """
    app.pipelines.exporter_csv.CsvExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_csv = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file_csv
        self.exporter = CsvItemExporter(file_csv)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_csv = self.files.pop(spider)
        file_csv.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #8
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        self.exporter.fields_to_export = ['originalString', 'translatedString']

        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #9
class CsvExportPipeline(object):

    def __init__(self):

        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):

        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    def spider_opened(self, spider):

        file = open('vagas.csv', 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):

        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):

        self.exporter.export_item(item)

        return item
Code Example #10
class WebcrawlerPipeline(object):
    def __init__(self):
        self.files = {}
    
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline
    
    def spider_opened(self, spider):
        file = open("%s_urls.txt" % (spider.name), "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, include_headers_line=False)
        self.exporter.start_exporting()
    
    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
    
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #11
File: pipelines.py Project: eadebruijn/Webscraping
class CSVPipeline(object):

  def __init__(self):
    self.files = {}

  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def spider_opened(self, spider):
    file = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = file
    self.exporter = CsvItemExporter(file)
    self.exporter.fields_to_export = ["filename", "titel", "publicatie", "dossiernummer", "organisatie", "publicatiedatum", "publicatietype", "file_urls"]
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
Code Example #12
File: pipelines.py Project: yujiaxinlong/Crawlers
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file,delimiter='\t')
        self.exporter.fields_to_export = ['userId','bookId','name','rating','relativeRating','booklistNum']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #13
File: pipelines.py Project: Joker-Cch/Amazon
class AmazonCsvPipeline(object):
    def open_spider(self, spider):
        # file object that the csv data is saved to
        self.f = open("Amazon_goods_crawl.csv", "w")
        # create the csv exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # begin writing the csv file
        self.csv_exporter.start_exporting()
        # set used to deduplicate items by product title
        self.add_title = set()

    def process_item(self, item, spider):
        if item['title'] in self.add_title:
            print u'[ERROR] already saved, skipping duplicate %s' % item['title']
        else:
            self.add_title.add(item['title'])
            # write one item at a time
            # print u'[INFO] writing %s to the csv file' % item['title']
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish writing the csv file
        # print u'[INFO] finished writing the csv file'
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
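Printing a warning but still returning the duplicate means later pipelines receive it anyway; the more idiomatic Scrapy pattern is to raise DropItem. A sketch of the same method with that change:

from scrapy.exceptions import DropItem

    def process_item(self, item, spider):
        if item['title'] in self.add_title:
            raise DropItem('duplicate item: %s' % item['title'])
        self.add_title.add(item['title'])
        self.csv_exporter.export_item(item)
        return item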
Code Example #14
class BitcoinTalkCrawlerPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = \
            ['timestamp', 'category_id', 'topic_id', 'topic_title',
             'message_number', 'message_author', 'message_text']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #15
class catalogscraperPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open("%s_items.csv" % spider.name, "w+b")
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ["title"]
        #'subject', 'description', 'creator', 'source', 'published', 'rights', 'citation', 'url']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #16
 def test_header_export_two_items(self):
     for item in [self.i, dict(self.i)]:
         output = BytesIO()
         ie = CsvItemExporter(output)
         ie.start_exporting()
         ie.export_item(item)
         ie.export_item(item)
         ie.finish_exporting()
         self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
Code Example #17
File: pipelines.py Project: kellyho15/capstone
class FashionnovaPipeline(object):
    def __init__(self):
        self.filename = 'fashionnova.csv'
    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()
    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #18
class TsvPipeline(object):
    def __init__(self):
        self.files = dict()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # the trailing '*' marks the file as in-progress; it is renamed in spider_closed
        file = open(spider.name + '-' + datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + '.tsv*',
                    'wb')
        self.files[spider] = file

        self.exporter = CsvItemExporter(file, include_headers_line=True, join_multivalued=';', encoding="utf-8", delimiter='\t')
        if spider.name=='user':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'joindate', 'activedate']
        elif spider.name=='subject':
            self.exporter.fields_to_export = ['subjectid', 'order', 'subjectname', 'subjecttype', 'rank', 'date', 'votenum', 'favnum', 'staff', 'relations']
        elif spider.name=='record':
            self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'iid', 'typ', 'state', 'adddate', 'rate', 'tags', 'comment']
        elif spider.name=='index':
            self.exporter.fields_to_export = ['indexid', 'creator', 'favourite', 'date', 'items']
        elif spider.name=='friends':
            self.exporter.fields_to_export = ['user', 'friend']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        filename = file.name
        # strip the '.tsv*' marker and rename to the final '.tsv' name
        newname = filename[:-5] + '-' + datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + '.tsv'
        file.close()
        os.rename(filename, newname)
        if UPLOAD_TO_AZURE_STORAGE:
            block_blob_service = BlockBlobService(account_name=AZURE_ACCOUNT_NAME, account_key=AZURE_ACCOUNT_KEY)
            block_blob_service.create_blob_from_path(
                AZURE_CONTAINER,
                newname,
                newname,
                content_settings=ContentSettings(content_type='text/tab-separated-values'))

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #19
File: pipelines.py Project: kellyho15/capstone
class MacystopPipeline(object):
    def __init__(self):
        self.filename = 'topallproduct_all.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #20
class WriteItemPipeline(object):
	def __init__(self):
		self.filename = 'Glassdoor.csv'

	def open_spider(self, spider):
		self.csvfile = open(self.filename, "wb")
		self.exporter = CsvItemExporter(self.csvfile)
		self.exporter.start_exporting()

	def close_spider(self, spider):
		self.exporter.finish_exporting()
		self.csvfile.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)

		return item
Code Example #21
class PyCsvPipeline:
    def open_spider(self, spider):
        self.file = open("/home/bladestone/lbb.csv", "wb")
        self.exporter = CsvItemExporter(self.file,
                                        fields_to_export=[
                                            "schoolName", "currentBatch",
                                            "totalNumberInPlan"
                                        ])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Code Example #22
class RegistryScraperPipeline(object):

	def __init__(self):
		self.filename = 'registry_scraper/output/employment_site.csv'

	def open_spider(self, spider):
		self.csvfile = open(self.filename, 'wb')
		self.exporter = CsvItemExporter(self.csvfile)
		self.exporter.start_exporting()

	def close_spider(self, spider):
		self.exporter.finish_exporting()
		self.csvfile.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
Code Example #23
class ReutersPipeline(object):

    def __init__(self):
        self.filename = 'reuters_news.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #24
class WriteItemPipeline(object):

    def __init__(self):
        self.filename = 'howlongtobeat_playtimes.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #25
class CsvPipeline(object):
    def __init__(self):
        self.file = open('kaist_pulse.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='utf-8')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        df = pd.read_csv('kaist_pulse.csv')
        excel = pd.ExcelWriter('kaist_pulse.xlsx')
        df.to_excel(excel, index=False)
        excel.save()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
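ExcelWriter.save() was deprecated in later pandas releases and removed in pandas 2.0; with recent pandas, the conversion step is safer written as a context manager:

import pandas as pd

df = pd.read_csv('kaist_pulse.csv')
with pd.ExcelWriter('kaist_pulse.xlsx') as writer:  # saves and closes automatically
    df.to_excel(writer, index=False)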
Code Example #26
File: pipelines.py Project: iamxuwenjin/op_gg
class PlayerPipeline(object):
    def open_spider(self, spider):
        self.filename = open("player_info.csv", "wb")
        # create a csv exporter; the argument is the file object the data is saved to
        self.csv_exporter = CsvItemExporter(self.filename)
        # start writing data
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        if isinstance(item, PlayerItem):
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish writing data
        self.csv_exporter.finish_exporting()
        self.filename.close()
Code Example #27
File: pipelines.py Project: j-rossi-nl/coliee-2019
class WriteItemPipeline(object):
    def __init__(self):
        self.filename = 'coliee.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        print('Process: {}'.format(item))
        self.exporter.export_item(item)
        return item
Code Example #28
File: pipelines.py Project: SunMark978/SinicaCrawler
class PdfCsvPipeline(object):
    '''Keeps a record of how many PDF entries there are in total'''

    def __init__(self):
        self.fname = os.path.join(SAVE_PATH, "pdflist.csv")
        self.file = open(self.fname, "wb")
        self.exporter = CsvItemExporter(file=self.file,
                                        fields_to_export=["files", "file_urls"])
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #29
File: pipelines.py Project: lukeasoiler/cfcrawler
class CSVPipeline(object):

    def __init__(self):
      self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
      pipeline = cls()
      crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
      crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
      return pipeline

    def spider_opened(self, spider):

      global name_website
      name_website = input("Enter project name: ")

      file = open('%s_%s.csv' % (spider.name, name_website), 'w+b')
      self.files[spider] = file
      self.exporter = CsvItemExporter(file)
      self.exporter.fields_to_export = ["url", "status_code", "cache_control", "title", "title_length", "title_count", "description", "description_length",  "description_count", "canonical", "canonical_self", "h1", "h1_count", "wordcount", "internal_links", "external_links", "amp_html", "amp_valid", "redirect_location", "referrer"]
      self.exporter.start_exporting()

    def spider_closed(self, spider):
      self.exporter.finish_exporting()
      file = self.files.pop(spider)
      file.close()

      # since this runs on Windows, blank lines need to be eliminated from the csv file
      print("Starting csv blank line cleaning")
      with open('%s_%s.csv' % (spider.name, name_website), 'r') as f:
        reader = csv.reader(f)
        original_list = list(reader)
        cleaned_list = list(filter(None,original_list))

      with open('%s_%s_cleaned.csv' % (spider.name, name_website), 'w', newline='') as output_file:
          wr = csv.writer(output_file, dialect='excel')
          for data in cleaned_list:
            wr.writerow(data)

    def process_item(self, item, spider):
      self.exporter.export_item(item)
      return item
Code Example #30
class ExpertsExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = 'CUNY Graduate Center.csv'
        self.export_fields = [
            "name", "title_1", "title_2", "title_3", "department_1",
            "department_2", "department_3", "phone", "email", "website",
            "biography", "headshot", "faculty_page", "areas_of_expertise_1",
            "areas_of_expertise_2", "areas_of_expertise_3",
            "areas_of_expertise_4", "areas_of_expertise_5",
            "areas_of_expertise_6", "areas_of_expertise_7",
            "areas_of_expertise_8", "areas_of_expertise_9",
            "areas_of_expertise_10", "areas_of_expertise_11",
            "areas_of_expertise_12", "areas_of_expertise_13",
            "areas_of_expertise_14", "areas_of_expertise_15",
            "areas_of_expertise_16", "areas_of_expertise_17",
            "areas_of_expertise_18", "areas_of_expertise_19",
            "areas_of_expertise_20", "areas_of_expertise_21",
            "areas_of_expertise_22", "areas_of_expertise_23",
            "areas_of_expertise_24", "areas_of_expertise_25"
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        output_file = open(self.file_name, 'w+b')
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #31
class MySQLPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('output2.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file,
                                        quoting=csv.QUOTE_ALL,
                                        lineterminator="\n")
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        conn = pymysql.connect(host=MYSQL_HOST,
                               db='scrapedb',
                               user=MYSQL_USER,
                               passwd=MYSQL_PWORD,
                               charset='utf8',
                               use_unicode=True,
                               local_infile=True)
        cursor = conn.cursor()
        for tableName in [
                'walmart_latest_crawl', 'walmart_products_unique',
                'walmart_products'
        ]:
            cursor.execute(
                """LOAD DATA LOCAL INFILE 'output2.csv' INTO TABLE %s
                                FIELDS TERMINATED BY ','
                                ENCLOSED BY '"'
                                LINES TERMINATED BY '\n'
                                IGNORE 1 LINES
                                (category,product_url,description,rating,img_url,brand,upc,seller,num_ratings,
                                department,quantity,external_id,price,name);"""
                % tableName)
        conn.commit()
        conn.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #32
class ExtractPipeline(object):
    def __init__(self):
        self.files = {}
        self.file_name = f'basspro_results_{time.time()}.csv'
        self.export_fields = [
            'ItemCode',
            'Name',
            'Reviews',
            'Rating',
            'Caliber',
            'BulletWeight',
            'BulletType',
            'Units',
            'Price',
            'IsOnSale',
            'RegularPrice',
            # 'Availability',
            'Link',
        ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        output_file = open(
            self.file_name,
            'w+b',
        )
        self.files[spider] = output_file
        self.exporter = CsvItemExporter(output_file,
                                        fields_to_export=self.export_fields)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        output_file = self.files.pop(spider)
        output_file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #33
class CompletenessPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/data/incomplete_%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # an item is complete only if every required field is present and non-empty
        required_fields = ['name', 'brand', 'description', 'url',
                           'original_price', 'price', 'image_urls']
        complete = all([item[field] is not None and item[field] != ''
                        for field in required_fields])

        if not complete:
            self.exporter.export_item(item)
            raise DropItem("Missing one or more element in %s" % item['url'])

        return item
Code Example #34
class CsvExportPipeline(object):

    def __init__(self):
        self.file = None
        self.exporter = None

    def open_spider(self, spider):
        self.file = open('output.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #35
File: pipelines.py Project: SangwooLee2/webmd01
class WebmdPipeline(object):
    # 418 lsw blocked class WriteItemPipeline(object):

    def __init__(self):
        self.filename = 'webmd_reviews.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #36
class WriteItemPipeline(object):
    def __init__(self):
        #t = datetime.datetime.now()
        #self.filename = 'nasdaq_headlines_' + t.strftime('%Y%m%d') + '.csv'
        self.filename = 'nasdaq_headlines.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #37
class TheBeatlesChordsPipeline(object):
    def __init__(self):
        self.filename = 'the_beatles_chords.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.fields_to_export = ['name', 'chords']
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
        
Code Example #38
class CsvPipeline(object):
    """
    写入有序的数据到CSV表格
    """
    def __init__(self):
        self.file = open('./result/crawl_result.csv', 'wb')
        self.exporter = CsvItemExporter(self.file, encoding='gbk')
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        item = OrderedDict(item)  # 转换为有序的字典
        item = json.dumps(item, ensure_ascii=False)  # 转换为JSON格式
        self.exporter.export_item(eval(item))  # 输出到CSV表格
        return item
Code Example #39
class FormScraperPipeline:
    def open_spider(self, spider):
        ext = tldextract.extract(spider.url)
        file_name = '.'.join(
            (ext.domain, ext.suffix, 'csv')
        )  # basically just removing the scheme so it doesn't mess with file paths
        print(file_name)
        self.file = open(file_name, 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #40
    def process_item(self, item, spider):
        print(
            "=================== process_item in pipeline ======================="
        )
        prd_name = item['prd_name']
        link = item['link']
        comments = item['comments']
        output = f'|{prd_name}|\t|{link}|\t|{comments}|\n\n'
        with open('./phone.txt', 'a+', encoding='utf-8') as article:
            article.write(output)

        with open("./phones.csv", "a+b") as f:
            exporter = CsvItemExporter(f, include_headers_line=False)
            exporter.start_exporting()
            exporter.export_item(item)
            exporter.finish_exporting()

        return item
Code Example #41
File: pipelines.py Project: Epizana/ebay_scrape
class WriteItemPipeline(object):  # can be copied into another script as-is; just change the filename
    def __init__(self):
        self.filename = 'ebay11.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')  # binary mode; newline='' matters only for text-mode csv on Windows
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #42
File: pipelines.py Project: mks212/NYCRentData
class RentPipeline(object):
    def __init__(self):
        self.filename = 'rent_info.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'ab')  # open in append mode so it adds instead of overwriting
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #43
class GooglenewsPipeline(object):
    def __init__(self):
        write_date = SearchInformation.str_from_date.replace(".", "")
        self.file = open("from" + write_date + "_" + "GoogleNews.csv", "wb")
        self.exporter = CsvItemExporter(self.file,
                                        encoding='utf-8',
                                        delimiter="-")
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #44
class CsvExporterPipeline(object):
    def __init__(self):
        self.files = {}

    def open_spider(self, spider):
        file = open('{}.csv'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file, include_headers_line=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #45
class WriteItemPipeline(object):

    def __init__(self):
        self.filename = 'weatherThirtyYearsKLAX1986.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        #print "processing \n"
        self.exporter.export_item(item)
        return item
Code Example #46
class Tc58CSVPipeline(object):
    def open_spider(self, spider):
        # open the csv file with write access
        self.csv = open("gav.csv", "w")
        # create a csv exporter; the argument is the csv file object
        self.csvexporter = CsvItemExporter(self.csv)
        # start exporting so data can be written
        self.csvexporter.start_exporting()

    def process_item(self, item, spider):
        # write the item data into the csv file
        self.csvexporter.export_item(item)
        return item

    def close_spider(self, spider):
        # signal that data writing has finished
        self.csvexporter.finish_exporting()
        self.csv.close()
Code Example #47
File: pipelines.py Project: akay126/hk_housing_web
class CentaPartPipeline(object):
    def __init__(self):
        self.filename = 'Centa_Part.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()
        f = read_csv(self.filename)
        f.to_csv(self.filename, index=False)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #48
class WriteItemPipeline(object):
    def __init__(self):
        #target : 1(single), 2(albums)
        self.filename = 'acharts_singles.csv'
        #self.filename = 'acharts_albums.csv'

    def open_spider(self, spider):
        self.csvfile = open(self.filename, 'wb')
        self.exporter = CsvItemExporter(self.csvfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.csvfile.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #49
class WeiboPipeline(object):

    def __init__(self):
        print('begin')
        self.file = open("./fans_data.csv", "wb")
        self.exporter = CsvItemExporter(
            self.file,
            fields_to_export=['fid', 'screen_name', 'profile_image_url', 'profile_url',
                              'followers_count', 'follow_count', 'desc1'])
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # @classmethod
    # def from_crawler(cls,crawler):
    #     # read the configuration from settings.py
    #     return cls(
    #         host=crawler.settings.get('MYSQL_HOST'),
    #         user=crawler.settings.get('MYSQL_USER'),
    #         password=crawler.settings.get('MYSQL_PASSWORD'),
    #         database=crawler.settings.get('MYSQL_DATABASE'),
    #         port=crawler.settings.get('MYSQL_PORT')
    #     )

    # def open_spider(self,spider):
    #     """
    #     Called when the Spider is opened
    #     :param spider: the Spider instance
    #     :return:
    #     """
    #     self.conn = pymysql.connect(
    #         host =self.host,
    #         user=self.user,
    #         password=self.password,
    #         database=self.database,
    #         port=self.port,
    #         charset='utf8'
    #     )
    #     self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        print('done')
        self.exporter.finish_exporting()
        self.file.close()
Code Example #50
class AmsemailbotPipeline(object):
    def __init__(self):
        if os.path.exists('crawled_emails.csv'):
            os.remove('crawled_emails.csv')
        self.file = open("crawled_emails.csv", 'wb')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    # @classmethod
    # def from_crawler(cls, crawler):
    #     pipeline = cls()
    #     crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    #     crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    #     return pipeline

    # def spider_opened(self, spider):
    #     if os.path.exists('crawled_emails.csv'):
    #         os.remove('crawled_emails')
    #     self.file = open('crawled_emails.csv', 'w+b')
    #     self.exporter = CsvItemExporter(self.file)
    #     self.exporter.start_exporting()

    # def spider_closed(self, spider):
    #     self.exporter.finish_exporting()
    #     self.file.close()

    def process_item(self, item, spider):
        if item.get('title'):
            if item.get('author'):
                if item.get('email'):
                    self.exporter.export_item(item)
                    return item
                else:
                    raise DropItem('Missing email')
            else:
                raise DropItem('Missing author')
        else:
            raise DropItem('Missing Title')
Code Example #51
File: pipelines.py Project: Joker-Cch/AQI
class AqiCsvPipeline(object):
    def open_spider(self, spider):
        # file object that the csv data is saved to
        self.f = open('aqi.csv', 'w')
        # create the csv exporter
        self.csv_exporter = CsvItemExporter(self.f)
        # begin writing the csv file
        self.csv_exporter.start_exporting()

    def process_item(self, item, spider):
        # write one item at a time
        self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish writing the csv file
        self.csv_exporter.finish_exporting()
        # close the file
        self.f.close()
Code Example #52
class CsvExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_jobs.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
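Connecting signals through pydispatch's dispatcher in __init__ is an older style that newer Scrapy versions deprecate; the equivalent using the crawler's signal manager, as most other examples here do, is roughly:

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline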
Code Example #53
class CrawlerPipeline(object):
    EXPORT_PATH = os.getenv("HOME")

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        path = CrawlerPipeline.EXPORT_PATH + "/" + spider.spider_id + "_export.csv"
        export_file = open(path, "ab" if os.path.isfile(path) else "wb")

        self.files[spider.spider_id] = export_file
        self.exporter = CsvItemExporter(export_file)
        self.exporter.fields_to_export = [
            "item_id",
            "url",
            "num_links",
            "num_images",
            "num_scripts",
            "num_styles",
            "headers",
            "text",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        export_file = self.files.pop(spider.spider_id)
        export_file.close()

    def process_item(self, item, spider):
        # This is a common path among ALL crawlers
        self.exporter.export_item(item)
        return item
Code Example #54
class CSVExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):

        # print( str(spider) )
        # pdb.set_trace()

        file = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)

        # each spider exports a different set of fields
        if spider.name == 'fleetintel_list':
            self.exporter.fields_to_export = ['Company', 'Model', 'MSN', 'YoM', 'Reg', 'Comments']
        elif spider.name == 'Available_assets':
            self.exporter.fields_to_export = ['Category', 'Company', 'Contact_webPage', 'Contact_email', 'Contact_phone', 'Model', 'YoM', 'MSN', 'TFHs_TFCs', 'Engines', 'F_B_E', 'OL_A_S', 'LU', 'AD', 'ESN', 'L_E_S']
        
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #55
class BigMLPipeline(BigMLAPIMixIn):

    AUTH_ERRMSG = (
        "{errtype:s} BigML credentials. Please supply BIGML_USERNAME"
        " and BIGML_API_KEY as either Scrapy settings or environment"
        " variables."
    )

    def __init__(self, username=None, api_key=None, source_name=None, dev_mode=None):
        self.source_name = source_name
        self.get_bigml_api(username, api_key, dev_mode=dev_mode)

    @classmethod
    def from_crawler(cls, crawler):
        o = cls(
            username=crawler.settings["BIGML_USERNAME"],
            api_key=crawler.settings["BIGML_API_KEY"],
            source_name=crawler.settings.get("BIGML_SOURCE_NAME", "Scrapy"),
            dev_mode=crawler.settings.getbool("BIGML_DEVMODE", False),
        )
        o.crawler = crawler
        o.settings = crawler.settings
        return o

    def open_spider(self, spider):
        self.tempfile = TemporaryFile(prefix="bigml-feed-")
        self.exporter = CsvItemExporter(self.tempfile)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.tempfile.seek(0)
        self.export_to_bigml(self.tempfile, self.source_name)

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #56
File: pipelines.py Project: alanrhannah/harvestman
class HarvestmanPipeline(object):
    """Pipepline definition for spiders in the harvestman_spider project"""

    def __init__(self):
        """__init__, innit."""
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        csv_file = settings.CSV_FILE_OUTPUT_DIR.format(
            spider.base_url.split('/')[2],
            datetime.date.today().strftime('%Y-%m-%d'))

        if spider.name == 'google_serp_spider':
            file = open(csv_file, 'w')
            self.files[spider] = file
            # note this outputs as a tab separated csv, rather than comma.
            self.exporter = CsvItemExporter(file, delimiter='\t')
            self.exporter.start_exporting()

    def spider_closed(self, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.finish_exporting()
            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        if spider.name == 'google_serp_spider':
            self.exporter.export_item(item)
            return item
Code Example #57
class CSVPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        if spider.name == 'realestate':
            self.file = open('current_listing.csv', 'w+b')
        else:
            self.file = open('past_listing.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)

        return item
Code Example #58
File: pipelines.py Project: mkogrady/hoop_io
class CsvExportPipeline(object):

    spiders_to_processors = None

    def __init__(self):
        self.files = {}
        self.exporter = None

        self.spiders_to_processors = {
            teams.TeamsSpider.__name__: TeamProcessor,
            team_season.TeamSeasonSpider.__name__: TeamSeasonProcessor,
            players.PlayersSpider.__name__: PlayerProcessor,
            player_season.PlayerSeasonSpider.__name__: PlayerSeasonProcessor,
        }

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()

        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)

        return pipeline

    def spider_opened(self, spider):
        """
        called when the spider is started
        """

        try:
            processor = self.spiders_to_processors[type(spider).__name__]()
        except KeyError:
            self.exporter = None
            return

        file = open(processor.get_storage_filepath(spider), "w+b")

        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """
        called when the spider is finished crawling
        """

        if self.exporter:

            self.exporter.finish_exporting()

            file = self.files.pop(spider)
            file.close()

    def process_item(self, item, spider):
        """
        called every time an item is yielded from a spider
        """

        if self.exporter:

            self.exporter.export_item(item)

        return item
Code Example #59
class CsvExportPipeline(object):

    fields_to_export = [
        'Title',
        'Author',
        'AuthorLifetime',
        'TotalLength',
        'Language',
        'Genre',
        'Readers',
        'NumberOfReaders',
        'WikipediaLink',
        'AuthorWikipediaLink',
        'CatalogedOnDate',
        'DescriptionText',
        'LibrivoxUrlOfTitle',
        'LinksToAll128kMp3Files',
        'HasCoverArt',
        'HasCdInsertArt'
    ]

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        FILES_STORE = settings.FILES_STORE
        self.file = open(FILES_STORE + 'Librivox-Book-List.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self.fields_to_export
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        FILES_STORE = settings.FILES_STORE
        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*']
        title_dir = item['Title']
        for each_char in invalid_chars:
            title_dir = title_dir.replace(each_char, '-')

        if not os.path.exists(FILES_STORE + title_dir):
            os.makedirs(FILES_STORE + title_dir)
        # write txt files
        for each_file in self.fields_to_export:
            txt_file = FILES_STORE + title_dir + '/' + each_file + '.txt'
            with open(txt_file, 'w') as outfile:
                outfile.write(item[each_file])
        return item

    def convert_csv_to_excel(self, csv_file, excel_file):
        workbook = Workbook(excel_file)
        worksheet = workbook.add_worksheet()
        with open(csv_file, 'r', newline='') as f:
            reader = csv.reader(f)
            for r, row in enumerate(reader):
                for c, col in enumerate(row):
                    worksheet.write(r, c, col)
        workbook.close()
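convert_csv_to_excel is defined but never called from the pipeline itself, so presumably it is invoked manually after the crawl, along these lines (paths hypothetical):

pipeline.convert_csv_to_excel(FILES_STORE + 'Librivox-Book-List.csv',
                              FILES_STORE + 'Librivox-Book-List.xlsx')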