class CsvExportPipeline(object):
    """Write each spider's items to '<name>_societies.csv' via CsvItemExporter."""

    def __init__(self):
        # One open file handle per running spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it to the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output CSV and start the exporter with a fixed column order."""
        out = open('%s_societies.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            'name', 'president', 'email', 'url',
            'facebook', 'membership', 'about', 'date_established',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close this spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export the item and pass it along the pipeline."""
        self.exporter.export_item(item)
        return item
def spider_opened(self, spider):
    """Open both output CSVs and start an exporter for each."""
    self.results_csv = open('results_3.csv', 'wb')
    self.missing_csv = open('results_miss_2.csv', 'wb')
    # One exporter per file: matched rows vs. rows with missing data.
    self.results_exporter = CsvItemExporter(self.results_csv)
    self.missing_exporter = CsvItemExporter(self.missing_csv)
    for exporter in (self.results_exporter, self.missing_exporter):
        exporter.start_exporting()
class FacupPipeline(object):
    """Export scraped items to two CSV files: found results and missing ones."""

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    # create files and instantiate exporter class, then run start_exporting();
    # this is required by the item exporter contract
    def spider_opened(self, spider):
        """Create the output files and start both item exporters."""
        self.results_csv = open('results_3.csv', 'wb')
        self.missing_csv = open('results_miss_2.csv', 'wb')
        self.results_exporter = CsvItemExporter(self.results_csv)
        self.missing_exporter = CsvItemExporter(self.missing_csv)
        self.results_exporter.start_exporting()
        self.missing_exporter.start_exporting()

    def process_item(self, item, spider):
        # BUG FIX: the original re-instantiated both CsvItemExporters here for
        # every item (discarding the exporters started in spider_opened) and
        # never exported the item, so the CSVs stayed empty.  Export instead.
        # NOTE(review): all items currently go to the results file; route to
        # missing_exporter here if a distinct "missing" item type exists.
        self.results_exporter.export_item(item)
        return item

    def spider_closed(self, spider):
        """Finish exporting and close both files."""
        self.results_exporter.finish_exporting()
        self.missing_exporter.finish_exporting()
        self.results_csv.close()
        self.missing_csv.close()
class JsonExportPipeline(object):
    """Export each spider's items to 'output/<name>_data.csv' next to this module.

    Despite the historical name, this pipeline writes CSV (it originally used
    JsonItemExporter); the class name is kept for backward compatibility.
    """

    def __init__(self):
        # spider -> open output file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the per-spider output file and start the CSV exporter."""
        import os
        output_dir = os.path.join(os.path.dirname(__file__), 'output')
        # Robustness fix: create the output directory when missing instead of
        # failing on open(); also use os.path.join over string concatenation.
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        file = open(os.path.join(output_dir, '%s_data.csv' % spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
def open_spider(self, spider):
    """Open the CSV output file and start the exporter when the spider starts."""
    # File object that the CSV rows are written to.
    self.f = open('aqi.csv', 'w')
    # Exporter that serialises items into the file.
    self.csv_exporter = CsvItemExporter(self.f)
    # Required before the first export_item() call.
    self.csv_exporter.start_exporting()
def spider_opened(self, spider):
    """Open '<name>_items.csv' and start exporting only the 'title' column."""
    out = open("%s_items.csv" % spider.name, "w+b")
    self.files[spider] = out
    self.exporter = CsvItemExporter(out)
    # Other fields ('subject', 'description', 'creator', 'source', 'published',
    # 'rights', 'citation', 'url') are intentionally left out.
    self.exporter.fields_to_export = ["title"]
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open the listings CSV for this spider and start the exporter.

    The 'realestate' spider writes current listings; any other spider writes
    to the past-listings file.
    """
    # BUG FIX: the original used `spider.name in 'realestate'` — a substring
    # test — so any spider whose name is a substring of "realestate" (e.g.
    # "real" or "estate") also matched.  Compare for equality instead.
    if spider.name == 'realestate':
        self.file = open('current_listing.csv', 'w+b')
    else:
        self.file = open('past_listing.csv', 'w+b')
    self.exporter = CsvItemExporter(self.file)
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open the spider's dump file and start a pipe-delimited CSV exporter."""
    dump_path = spider.get_dump_filepath()
    handle = open(dump_path, 'w')
    self.files[spider.name] = handle
    # The csv module defaults to Windows-style line terminators (\r\n);
    # force plain \n instead.
    self.exporter = CsvItemExporter(handle,
                                    include_headers_line=True,
                                    delimiter='|',
                                    lineterminator='\n')
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Create '<name>_societies.csv' and start exporting the society fields."""
    handle = open('%s_societies.csv' % spider.name, 'w+b')
    self.files[spider] = handle
    exporter = CsvItemExporter(handle)
    exporter.fields_to_export = [
        'name', 'president', 'email', 'url',
        'facebook', 'membership', 'about', 'date_established',
    ]
    exporter.start_exporting()
    self.exporter = exporter
def spider_opened(self, spider):
    """Open the file named by the spider and export the translation columns."""
    handle = open('%s' % spider.nameOfFile, 'w+b')
    self.files[spider] = handle
    self.exporter = CsvItemExporter(handle)
    # Two-column output: source string and its translation.
    self.exporter.fields_to_export = ['originalString', 'translatedString']
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open '<name>_items.csv' and start exporting the forum-message columns."""
    handle = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = handle
    self.exporter = CsvItemExporter(handle)
    self.exporter.fields_to_export = [
        'timestamp', 'category_id', 'topic_id', 'topic_title',
        'message_number', 'message_author', 'message_text',
    ]
    self.exporter.start_exporting()
def assertExportResult(self, item, expected, **kwargs):
    """Export a single item through CsvItemExporter and compare the raw CSV."""
    buffer = BytesIO()
    exporter = CsvItemExporter(buffer, **kwargs)
    exporter.start_exporting()
    exporter.export_item(item)
    exporter.finish_exporting()
    self.assertCsvEqual(buffer.getvalue(), expected)
def open_spider(self, spider):
    """Open the Amazon goods CSV, start the exporter, and reset the dedupe set."""
    # File object the CSV rows are written to.
    self.f = open("Amazon_goods_crawl.csv", "w")
    # Exporter that serialises items into the file.
    self.csv_exporter = CsvItemExporter(self.f)
    # Must be called before the first export_item().
    self.csv_exporter.start_exporting()
    # Product titles already written; used to drop duplicates.
    self.add_title = set()
def spider_opened(self, spider):
    """Open (or append to) the spider's export CSV and start the exporter."""
    path = CrawlerPipeline.EXPORT_PATH + "/" + spider.spider_id + '_export.csv'
    # Append when the file already exists so earlier runs are preserved.
    mode = 'ab' if os.path.isfile(path) else 'wb'
    export_file = open(path, mode)
    self.files[spider.spider_id] = export_file
    self.exporter = CsvItemExporter(export_file)
    self.exporter.fields_to_export = [
        "item_id", "url", "num_links", "num_images",
        "num_scripts", "num_styles", "headers", "text",
    ]
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Open 'output/<name>_data.csv' next to this module and start exporting.

    Historically this pipeline wrote XML/JSON (see the old commented code it
    replaces); it now writes CSV.
    """
    import os
    output_dir = os.path.join(os.path.dirname(__file__), 'output')
    # Robustness fix: create the output directory when missing instead of
    # failing on open(); also use os.path.join over string concatenation.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    file = open(os.path.join(output_dir, '%s_data.csv' % spider.name), 'w+b')
    self.files[spider] = file
    self.exporter = CsvItemExporter(file)
    self.exporter.start_exporting()
def test_header_export_two_items(self):
    """Exporting the same item twice writes the header once and two data rows."""
    for item in [self.i, dict(self.i)]:
        output = BytesIO()
        exporter = CsvItemExporter(output)
        exporter.start_exporting()
        for _ in range(2):
            exporter.export_item(item)
        exporter.finish_exporting()
        self.assertCsvEqual(output.getvalue(),
                            'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
def spider_opened(self, spider):
    """Open the dated SERP output CSV and start a tab-delimited exporter."""
    # Output path is built from the spider's host part and today's date.
    csv_file = settings.CSV_FILE_OUTPUT_DIR.format(
        spider.base_url.split('/')[2],
        datetime.date.today().strftime('%Y-%m-%d'))
    # NOTE(review): indentation reconstructed from collapsed source — only the
    # SERP spider appears to get an exporter here; confirm other spiders
    # export elsewhere.
    if spider.name == 'google_serp_spider':
        file = open(csv_file, 'w')
        self.files[spider] = file
        # note this outputs as a tab seperated csv, rather than comma.
        self.exporter = CsvItemExporter(file, delimiter='\t')
        self.exporter.start_exporting()
class CsvExportPipeline(object):
    """Dump every scraped item into a single 'vagas.csv' file."""

    def __init__(self):
        # Maps each running spider to its open output file.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and wire it to the spider lifecycle signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open vagas.csv and begin exporting."""
        handle = open('vagas.csv', 'wb')
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and release the file handle."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Write book-rating items to '<name>_items.csv' as tab-separated values."""

    def __init__(self):
        self.files = {}  # spider -> open file handle

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider output file and start the exporter."""
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        # Tab-delimited output rather than the default comma.
        self.exporter = CsvItemExporter(handle, delimiter='\t')
        self.exporter.fields_to_export = [
            'userId', 'bookId', 'name', 'rating', 'relativeRating', 'booklistNum',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class AmazonCsvPipeline(object):
    """Write Amazon product items to a CSV file, de-duplicating by title."""

    def open_spider(self, spider):
        """Open the output file, start the exporter, and reset the dedupe set."""
        self.f = open("Amazon_goods_crawl.csv", "w")
        self.csv_exporter = CsvItemExporter(self.f)
        # Must be called before the first export_item().
        self.csv_exporter.start_exporting()
        # Product titles already written; used to drop duplicates.
        self.add_title = set()

    def process_item(self, item, spider):
        """Export the item unless one with the same title was already written."""
        if item['title'] in self.add_title:
            # BUG FIX: the original used Python 2 `print u'...'` statements,
            # which are syntax errors on Python 3; also fixed the '[EEROR]'
            # typo in the message.
            print(u'[ERROR] 数据已保存,勿重复%s' % item['title'])
        else:
            self.add_title.add(item['title'])
            # Write one CSV row per new item.
            self.csv_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the CSV export and close the file."""
        self.csv_exporter.finish_exporting()
        self.f.close()
class WebcrawlerPipeline(object):
    """Export crawled URLs to '<name>_urls.txt' as header-less CSV rows.

    Cleanup: the original ended every method with a redundant `pass`
    statement (unreachable after `return` in process_item); removed.
    """

    def __init__(self):
        self.files = {}  # spider -> open output file

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the URL dump file and start a header-less CSV exporter."""
        handle = open("%s_urls.txt" % (spider.name), "w+b")
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle, include_headers_line=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one row."""
        self.exporter.export_item(item)
        return item
class CSVWriterPipeline(object):
    """Export items to the CSV file named by the OUTPUT_FILE setting."""

    def __init__(self, filename):
        # Output path, resolved from crawler settings in from_crawler().
        self.filename = filename

    @classmethod
    def from_crawler(cls, crawler):
        """Read OUTPUT_FILE from settings and wire up the lifecycle signals."""
        output_name = crawler.settings.get('OUTPUT_FILE')
        pipeline = cls(output_name)
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start a UTF-8 exporter with a header line."""
        self.file = open(self.filename, 'w+b')
        self.exporter = CsvItemExporter(self.file, include_headers_line=True)
        self.exporter.encoding = 'utf-8'
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class CSVExportPipeline(object):
    """Write translation pairs to the file named by the spider."""

    def __init__(self):
        self.files = {}  # spider -> open output file

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the spider-named output file and start exporting."""
        handle = open('%s' % spider.nameOfFile, 'w+b')
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        # Two-column output: original string and its translation.
        self.exporter.fields_to_export = ['originalString', 'translatedString']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class CsvExportPipeline(object):
    """app.pipelines.exporter_csv.CsvExportPipeline"""

    def __init__(self):
        self.files = {}       # spider -> open CSV file
        self.exporter = None  # created in spider_opened

    @classmethod
    def from_crawler(cls, crawler):
        """Hook the pipeline into the spider open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open '<name>_items.csv' and begin exporting."""
        csv_handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = csv_handle
        self.exporter = CsvItemExporter(csv_handle)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class catalogscraperPipeline(object):
    """Export catalog items (title column only) to '<name>_items.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open output file

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider output file and start the exporter."""
        handle = open("%s_items.csv" % spider.name, "w+b")
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        # Only 'title' is exported; 'subject', 'description', 'creator',
        # 'source', 'published', 'rights', 'citation' and 'url' are omitted.
        self.exporter.fields_to_export = ["title"]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class BitcoinTalkCrawlerPipeline(object):
    """Write forum messages scraped from BitcoinTalk to '<name>_items.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open output file

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider output file and start the exporter."""
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        self.exporter.fields_to_export = [
            'timestamp', 'category_id', 'topic_id', 'topic_title',
            'message_number', 'message_author', 'message_text',
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export publication metadata items to '<name>_items.csv'."""

    def __init__(self):
        self.files = {}  # spider -> open output file

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the per-spider output file and start the exporter."""
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        self.exporter.fields_to_export = [
            "filename", "titel", "publicatie", "dossiernummer", "organisatie",
            "publicatiedatum", "publicatietype", "file_urls",
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
class DumpToFile(object):
    """
    Dump harvested data into a flat pipe-delimited file; no other logic is
    implemented here (it's "Dump" :-).
    """

    def __init__(self):
        self.files = {}   # spider name -> open file handle
        self.counter = 0  # number of items written so far

    @classmethod
    def from_crawler(cls, crawler):
        instance = cls()
        # TODO: verify if still needed for registration of spider_closed/opened event?
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the spider's dump file and start the exporter."""
        dump_path = spider.get_dump_filepath()
        handle = open(dump_path, 'w')
        self.files[spider.name] = handle
        # By default the csv module uses Windows-style line terminators
        # (\r\n), so plain \n is forced explicitly.
        self.exporter = CsvItemExporter(handle, include_headers_line=True,
                                        delimiter='|', lineterminator='\n')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the dump file."""
        self.exporter.finish_exporting()
        self.files.pop(spider.name).close()

    def process_item(self, item, spider):
        """Write the item and count it."""
        self.exporter.export_item(item)
        # For the counter, could set an attribute on the spider at closing.
        self.counter += 1
        return item
def spider_opened(self, spider):
    """Open a timestamped TSV output file and configure per-spider columns."""
    # NOTE(review): the filename ends in '.tsv*' — the trailing '*' looks like
    # a typo, but it is kept here since changing it would alter the output path.
    file = open(spider.name+'-api-'+datetime.datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")+'.tsv*', 'wb')
    self.files[spider] = file
    # Tab-delimited export; multi-valued fields are joined with ';'.
    self.exporter = CsvItemExporter(file, include_headers_line=True, join_multivalued=';', encoding="utf-8", delimiter='\t')
    # Column order depends on which spider is running.
    if spider.name=='user':
        self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'joindate', 'activedate']
    elif spider.name=='subject':
        self.exporter.fields_to_export = ['subjectid', 'order', 'subjectname', 'subjecttype', 'rank', 'date', 'votenum', 'favnum', 'staff']
    elif spider.name=='record':
        self.exporter.fields_to_export = ['uid', 'name', 'nickname', 'iid', 'typ', 'state', 'adddate', 'rate', 'tags', 'comment']
    self.exporter.start_exporting()
def spider_opened(self, spider):
    """Create the exporter for this spider type, or disable exporting.

    called when the spider is started
    """
    spider_cls_name = type(spider).__name__
    try:
        processor = self.spiders_to_processors[spider_cls_name]()
    except KeyError:
        # No processor registered for this spider type: nothing is exported.
        self.exporter = None
        return
    out = open(processor.get_storage_filepath(spider), "w+b")
    self.files[spider] = out
    self.exporter = CsvItemExporter(out)
    self.exporter.start_exporting()
class ScrapyOfficialNewspapersPipeline:
    """Append scraped documents to a local CSV, trying Latin-1 then UTF-8."""

    def __init__(self):
        # Output file lives in the working directory and is opened in append
        # mode so repeated runs accumulate rows (hence no header line).
        dir = "./"
        file_name = "Scraped_Documents_local.csv"
        file = dir + file_name
        self.file = open(file, 'ab')
        # Two exporters share the same file handle: items are written in
        # Latin-1 when possible, falling back to UTF-8 otherwise.
        self.exporter_1 = CsvItemExporter(self.file, include_headers_line=False, encoding='Latin1')
        self.exporter_2 = CsvItemExporter(self.file, include_headers_line=False, encoding='utf-8')
        self.exporter_1.start_exporting()
        self.exporter_2.start_exporting()

    def close_spider(self, spider):
        """Finish both exporters and close the shared file."""
        self.exporter_1.finish_exporting()
        self.exporter_2.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Export in Latin-1; fall back to UTF-8 when the item can't encode."""
        # BUG FIX: the original used a bare `except:`, which also swallowed
        # unrelated errors (including KeyboardInterrupt).  Only encoding
        # failures should trigger the UTF-8 fallback.
        try:
            self.exporter_1.export_item(item)
        except UnicodeError:
            self.exporter_2.export_item(item)
        return item
class CSVExportPipeline(object):
    """Export job/candidate items to timestamped CSVs under /tmp and email
    the finished job exports via Gmail SMTP."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Job spiders write to /tmp/jobs-data, other spiders to
        # /tmp/candidate; the phone/companies spiders get no exporter at all.
        if spider.name in ['zaubacorp', 'sector', 'sector1', 'exhaustive']:
            # NOTE(review): strftime's lowercase '%s' is a platform-dependent
            # epoch extension — confirm '%S' (seconds) was not intended.
            filename = '%s-jobs-%s.csv' % (
                spider.name, datetime.utcnow().strftime('%d%m%Y%H%M%s'))
            path = os.path.expanduser("/tmp/jobs-data/%s" % filename)
        elif spider.name in ['phoneandemail', 'companiesinmumbai']:
            return
        else:
            filename = '%s_cand_%s.csv' % (spider.name,
                                           time.strftime("%d_%m_%Y"))
            path = os.path.expanduser("/tmp/candidate/%s" % filename)
        file = open(path, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        if spider.name in ['phoneandemail', 'companiesinmumbai']:
            return
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        filename = file.name
        file.close()
        # Job exports are mailed out as soon as the spider finishes.
        if spider.name in ['zaubacorp', 'sector', 'sector1', 'exhaustive']:
            self._send_email(filename)

    def process_item(self, item, spider):
        # Candidate items carry a list in item['source']; flatten it into a
        # single comma-separated string before export.
        if spider.name not in [
                'zaubacorp', 'sector', 'sector1', 'phoneandemail',
                'companiesinmumbai', 'exhaustive'
        ]:
            src = " "
            for i in item['source']:
                src = src + "," + i
            source = src.replace(" ,", "")
            item['source'] = source
        self.exporter.export_item(item)
        return item

    def _send_email(self, filename):
        # Mail the finished CSV as an attachment via Gmail SMTP.
        # NOTE(review): SMTP credentials are hard-coded below — move them to
        # configuration/secret storage.
        print('====sending email %s ====' % filename)
        msg = MIMEMultipart('alternative')
        msg['From'] = "*****@*****.**"
        msg['To'] = "*****@*****.**"
        msg['Subject'] = 'Portal scraping - %s' % datetime.utcnow().strftime(
            '%d-%b-%Y')
        # attach the csv file
        file = open(filename)
        attachment = MIMEText(file.read(), _subtype='csv')
        file.close()
        attachment.add_header('Content-Disposition',
                              'attachment',
                              filename=filename.split('/')[-1].strip())
        msg.attach(attachment)
        s = smtplib.SMTP_SSL("smtp.gmail.com", 465)
        s.login("*****@*****.**", "Nishit123")
        # s.sendmail("*****@*****.**", ["*****@*****.**",
        #            "*****@*****.**",
        #            "*****@*****.**"],
        #            msg.as_string())
        s.sendmail("*****@*****.**", ["*****@*****.**"], msg.as_string())

    def _send_candidate_email(self, filename):
        # Same as _send_email, but for candidate exports and with a wider
        # recipient list.
        print('====sending email %s ====' % filename)
        msg = MIMEMultipart('alternative')
        msg['From'] = "*****@*****.**"
        msg['To'] = "*****@*****.**"
        msg['Subject'] = 'Candidate scraping - %s' % datetime.utcnow(
        ).strftime('%d-%b-%Y')
        # attach the csv file
        file = open(filename)
        attachment = MIMEText(file.read(), _subtype='csv')
        file.close()
        attachment.add_header('Content-Disposition',
                              'attachment',
                              filename=filename.split('/')[-1].strip())
        msg.attach(attachment)
        s = smtplib.SMTP_SSL("smtp.gmail.com", 465)
        s.login("*****@*****.**", "Nishit123")
        s.sendmail("*****@*****.**", [
            "*****@*****.**", "*****@*****.**", "*****@*****.**"
        ], msg.as_string())
class TedEuropaEuPipeline(object):
    """Export TED (Tenders Electronic Daily) results into four dated CSVs:
    search results, tender details, CPV codes, and raw document data.

    NOTE(review): indentation reconstructed from collapsed source — the
    nesting inside process_item (details export outside the first dedupe
    branch, data export inside the second) should be confirmed.
    """

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the four date-stamped output files and start their exporters."""
        # document_ids already written to the search-result/CPV files.
        self.duplicates = []
        # NOTE: local `time` shadows any imported time module in this method.
        time = datetime.now().strftime('%Y%m%d')
        # document_ids already written to the raw-data file.
        self.document_ids = []
        self.file = open('{}_TED_Search_Result.csv'.format(time), 'wb')
        self.file_details = open('{}_TED_Details_Result.csv'.format(time), 'wb')
        self.file_cpv_codes = open('{}_TED_Details_CPV.csv'.format(time), 'wb')
        self.file_data = open('{}_TED_Data.csv'.format(time), 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.details_exporter = CsvItemExporter(self.file_details)
        self.cpv_exporter = CsvItemExporter(self.file_cpv_codes)
        self.data_exporter = CsvItemExporter(self.file_data)
        self.exporter.fields_to_export = [
            'document_id', 'description', 'short_description', 'country',
            'publication_date', 'deadline'
        ]
        self.details_exporter.fields_to_export = [
            'url', 'document_id', 'name', 'value', 'lot_no', 'total',
            'currency', 'contracting_country', 'award_date', 'product_type',
            'contracting_authority', 'contracting_authority_city',
            'NrTendersRecieved', 'NrTendersRecievedSME',
            'NrTendersRecievedoEU', 'NrTendersRecievednonEU',
            'NrTendersRecievedelectronic', 'Consortium'
        ]
        self.data_exporter.fields_to_export = [
            'url', 'document_id', 'TI', 'ND', 'PD', 'OJ', 'TW', 'AU', 'OL',
            'HD', 'CY', 'AA', 'HA', 'DS', 'NC', 'PR', 'TD', 'RP', 'TY', 'AC',
            'PC', 'RC', 'IA', 'DI'
        ]
        self.exporter.start_exporting()
        self.details_exporter.start_exporting()
        self.cpv_exporter.start_exporting()
        self.data_exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish all four exporters and close their files."""
        self.exporter.finish_exporting()
        self.file.close()
        self.details_exporter.finish_exporting()
        self.file_details.close()
        self.cpv_exporter.finish_exporting()
        self.file_cpv_codes.close()
        self.data_exporter.finish_exporting()
        self.file_data.close()

    def process_item(self, item, spider):
        """Fan each item out to the relevant exporters, de-duplicating by id."""
        # Search-result and CPV rows are written once per document_id.
        if item['document_id'] not in self.duplicates:
            self.exporter.export_item(item)
            self.duplicates.append(item['document_id'])
            if 'cpv_code' in item and item['cpv_code']:
                for code in item['cpv_code']:
                    self.cpv_exporter.export_item({
                        'document_id': item['document_id'],
                        'cpv_code': code
                    })
        # Details rows are written for every item seen.
        self.details_exporter.export_item(item)
        # Raw-data rows are de-duplicated through a separate id list.
        if item['document_id'] not in self.document_ids:
            self.document_ids.append(item['document_id'])
            self.data_exporter.export_item(item)
        return item
def open_spider(self, spider):
    """Open the site-data CSV and start a GB2312-encoded exporter."""
    self.file = open('website_data.csv', 'wb')
    # Output is encoded as gb2312 for downstream consumers.
    self.exporter = CsvItemExporter(self.file, encoding='gb2312')
    self.exporter.start_exporting()
def __init__(self):
    """Open chictr.csv and start the CSV exporter immediately."""
    out = open('chictr.csv', 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def __init__(self):
    """Open a timestamped CSV under data/ and create its exporter."""
    # Filename carries the local start time, e.g. data/2020-01-01 12-00-00.csv
    now = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
    filename = 'data/' + now + ".csv"
    self.file = open(filename, 'wb')
    # NOTE(review): start_exporting() is never called here — verify it happens
    # elsewhere before the first export_item(), as the exporter contract
    # requires.
    self.export = CsvItemExporter(self.file)
def open_spider(self, spider):
    """Open '<name>.csv' for this spider and begin exporting."""
    out = open('%s.csv' % spider.name, 'w+b')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def open_spider(self, spider):
    """Open the range-named CSV and export the module-level field list."""
    out = open("{0}.csv".format(spider.fileRangeName), 'wb')
    # `fieldnames` is the module-level column list.
    exporter = CsvItemExporter(out, encoding='utf-8',
                               fields_to_export=fieldnames)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def open_spider(self, spider):
    """Open the fixed desktop output path and start the exporter."""
    # Hard-coded Windows path; the run fails if the directory is missing.
    self.file = open(
        'C:\\Users\\Administrator\\Desktop\\数据分析\\qingguo.csv', 'wb')
    exporter = CsvItemExporter(self.file)
    exporter.start_exporting()
    self.exporter = exporter
def open_spider(self, spider):
    """Open dianpincity.csv and start the CSV exporter."""
    out = open('dianpincity.csv', 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def open_spider(self, spider):
    """Open honglingjing.csv and start the CSV exporter."""
    out = open('honglingjing.csv', 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def spider_opened(self, spider):
    """Open '<name>_listings.csv', remember the handle, and start exporting."""
    out = open('%s_listings.csv' % spider.name, 'w+b')
    self.files[spider] = out
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.exporter = exporter
def __init__(self):
    """Open the news-URL output file and start a UTF-8 CSV exporter."""
    out = open("newsUrlCrawl.csv", 'wb')
    exporter = CsvItemExporter(out, encoding='utf-8')
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def __init__(self):
    """Open ResultCSV.csv and start the exporter with a header line."""
    self.file = open("ResultCSV.csv", 'wb')
    # BUG FIX: the original passed the bare name `unicode` as the second
    # positional argument (include_headers_line).  `unicode` does not exist
    # on Python 3 (NameError); on Python 2 it merely acted as a truthy value.
    # Pass True explicitly to preserve the header-writing behavior.
    self.exporter = CsvItemExporter(self.file, include_headers_line=True)
    self.exporter.start_exporting()
def _get_exporter(self, **kwargs):
    """Build a CsvItemExporter writing to this object's output stream,
    forwarding any keyword arguments to the exporter constructor."""
    return CsvItemExporter(self.output, **kwargs)
class CsvExportPipeline(object):
    """Export Chinese administrative divisions into three CSVs under
    dist_path: provinces.csv, cities.csv and counties.csv.

    NOTE(review): indentation reconstructed from collapsed source — the exact
    nesting inside export_city (dedupe/exclude branches) should be confirmed.
    """

    def __init__(self):
        self.provDict = {}    # province code -> exported row dict
        self.cityDict = {}    # city code -> exported row dict
        self.city_names = []  # "parent_code:name" keys already emitted
        self.files = {}       # output path -> open file handle
        file_prov_path = os.path.join(dist_path, 'provinces.csv')
        file_city_path = os.path.join(dist_path, 'cities.csv')
        file_county_path = os.path.join(dist_path, 'counties.csv')
        file_prov = open(file_prov_path, 'w+b')
        file_city = open(file_city_path, 'w+b')
        file_county = open(file_county_path, 'w+b')
        self.files[file_prov_path] = file_prov
        self.files[file_city_path] = file_city
        self.files[file_county_path] = file_county
        self.prov_exporter = CsvItemExporter(file_prov)
        self.city_exporter = CsvItemExporter(file_city)
        self.county_exporter = CsvItemExporter(file_county)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Files are opened in __init__; only start the exporters here.
        self.prov_exporter.start_exporting()
        self.city_exporter.start_exporting()
        self.county_exporter.start_exporting()

    def spider_closed(self, spider):
        self.prov_exporter.finish_exporting()
        self.city_exporter.finish_exporting()
        self.county_exporter.finish_exporting()
        for file in self.files.values():
            file.close()

    def process_item(self, item, spider):
        """Copy the item into an AdmExportItem and route it by admin level."""
        code = item['code']
        export_item = AdmExportItem()
        export_item['code'] = code
        export_item['name'] = item['name']
        export_item['first_letter'] = item['first_letter']
        # Each helper checks the code's level itself; at most one writes.
        self.export_province(code, export_item)
        self.export_city(code, export_item)
        self.export_county(code, export_item)
        return item

    def export_province(self, code, item):
        """Write a level-1 row if `code` is a province code."""
        if Area.is_prov(code):
            self.provDict[code] = dict(item)
            item['level'] = 1
            item['parent_code'] = 0
            # Map the province onto one of the CHINA_REGION groupings.
            if code in consts.CHINA_REGION:
                kv = consts.CHINA_REGION.get(code)
                item['region'] = kv.value
            else:
                item['region'] = 0
            self.prov_exporter.export_item(item)

    def export_city(self, code, item):
        """Write a level-2 row if `code` is a city code (once per code)."""
        if Area.is_city(code):
            if code in self.cityDict:
                return
            # Parent province code: first two digits plus '0000'.
            prov_code = int(str(code)[0:2] + '0000')
            item['level'] = 2
            item['parent_code'] = prov_code
            item['region'] = 0
            province = self.provDict.get(prov_code)
            # Municipalities reuse the province name for their city row.
            if prov_code in consts.MUNICIPALITIES:
                if item['name'] in consts.EXCLUDE_NAMES:
                    name = province['name']
                    item['name'] = name
                    item['first_letter'] = _first_letter(name)
            # De-duplicate by parent_code:name before exporting.
            p_name = str(item['parent_code']) + ':' + item['name']
            if p_name not in self.city_names:
                self.city_names.append(p_name)
                if not item['name'] in consts.EXCLUDE_NAMES:
                    self.city_exporter.export_item(item)
            self.cityDict[code] = dict(item)

    def export_county(self, code, item):
        """Write a level-3 row (or a promoted level-2 row) for county codes."""
        if Area.is_county(code):
            prov_code = int(str(code)[0:2] + '0000')
            city_code = int(str(code)[0:4] + '00')
            # Chongqing quirk: the 5002xx counties belong under 500100.
            if prov_code == 500000 and city_code == 500200:
                city_code = 500100
            # province = self.provDict.get(prov_code)
            city = self.cityDict.get(city_code)
            item['region'] = 0
            if prov_code not in consts.MUNICIPALITIES:
                # Counties whose parent city is unknown or excluded are
                # promoted to level-2 rows directly under the province.
                if city is None or city['name'] in consts.EXCLUDE_NAMES:
                    item['level'] = 2
                    item['parent_code'] = prov_code
                    self.city_exporter.export_item(item)
                else:
                    if item['name'] in consts.EXCLUDE_NAMES:
                        pass
                    else:
                        item['level'] = 3
                        item['parent_code'] = city_code
                        self.county_exporter.export_item(item)
            else:
                item['level'] = 3
                item['parent_code'] = city_code
                self.county_exporter.export_item(item)
def open_spider(self, spider):
    """Open the bestseller CSV and begin exporting."""
    out = open('amazon_bestseller.csv', 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def __init__(self):
    """Open a unix-timestamped tinder CSV and start exporting."""
    stamp = int(time())
    self.file = open("tinder%s.csv" % stamp, 'wb')
    self.exporter = CsvItemExporter(self.file)
    self.exporter.start_exporting()
def open_spider(self, spider):
    """Open the configured output CSV under the fixed data directory."""
    base_dir = r'C:/Users/TiGa/Documents/bootcampPreWork/Week 2/alldata'
    self.csvfile = open(Path(base_dir, self.filename), mode='w+b')
    exporter = CsvItemExporter(self.csvfile)
    exporter.start_exporting()
    self.exporter = exporter
def spider_opened(self, spider):
    """Open one CSV per save type and start an exporter for each.

    Idiom cleanup: `dict([...])` calls became dict comprehensions, and the
    list comprehension used purely for its side effect became a plain loop.
    """
    self.files = {name: open("./" + name + '.csv', 'w+b')
                  for name in self.SaveTypes}
    self.exporters = {name: CsvItemExporter(self.files[name])
                      for name in self.SaveTypes}
    for exporter in self.exporters.values():
        exporter.start_exporting()
def __init__(self):
    """Open the fans-data CSV and start exporting the follower columns."""
    print('begin')
    self.file = open("./fans_data.csv", "wb")
    self.exporter = CsvItemExporter(
        self.file,
        fields_to_export=['fid', 'screen_name', 'profile_image_url',
                          'profile_url', 'followers_count', 'follow_count',
                          'desc1'])
    self.exporter.start_exporting()
class CSVPipeline(object):
    """Export company/officer items to '<name>_items_test6.csv'.

    The column list (5 company fields, then first-name/last-name pairs for up
    to 20 officers, then 20 occupation columns, 20 status columns, and two
    trailing fields) is generated programmatically instead of being written
    out as ~90 literals as in the original — same names, same order.
    """

    # Maximum number of officers represented in the flat CSV schema.
    MAX_OFFICERS = 20

    def __init__(self):
        self.files = {}  # spider -> open output file

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate the pipeline and register the lifecycle signal handlers."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    @classmethod
    def _export_fields(cls):
        """Return the full ordered column list for the CSV export."""
        fields = ['company_name', 'company_address', 'company_status',
                  'company_type', 'company_number']
        officer_range = range(1, cls.MAX_OFFICERS + 1)
        # Interleaved first/last name columns, officer1..officer20.
        for i in officer_range:
            fields.append('officer%d_first_names' % i)
            fields.append('officer%d_last_name' % i)
        # Then all occupations, then all statuses, in officer order.
        fields.extend('officer%d_occupation' % i for i in officer_range)
        fields.extend('officer%d_status' % i for i in officer_range)
        fields.extend(['nature_of_business', 'officers_url'])
        return fields

    def spider_opened(self, spider):
        """Open the per-spider output file and start the exporter."""
        file = open('%s_items_test6.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = self._export_fields()
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the spider's file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Write the item as one CSV row."""
        self.exporter.export_item(item)
        return item
def open_spider(self, spider):
    """Append-open the moneyfacts CSV and configure a header-less exporter."""
    # Append mode so successive runs accumulate rows; the header line is
    # suppressed to avoid repeating it on each run.
    self.file = open(moneyfacts_file, 'ab')
    self.exporter = CsvItemExporter(file=self.file, include_headers_line=False)
    # NOTE(review): start_exporting() is never called here — confirm it is
    # invoked elsewhere before the first export_item().
    self.exporter.fields_to_export = moneyfacts_fields
class NicheItemPipeline(object):
    """Route Niche items into four CSV files by item type
    (entities, facts, grades, badges)."""

    def open_spider(self, spider):
        """Open all four output files and start their exporters."""
        self.entFile = open(FILENAME_ENTITIES, 'wb')
        self.entExporter = CsvItemExporter(self.entFile)
        self.factFile = open(FILENAME_FACTS, 'wb')
        self.factExporter = CsvItemExporter(self.factFile)
        self.grdFile = open(FILENAME_GRADES, 'wb')
        self.grdExporter = CsvItemExporter(self.grdFile)
        self.bdgFile = open(FILENAME_BADGES, 'wb')
        self.bdgExporter = CsvItemExporter(self.bdgFile)
        for exporter in (self.entExporter, self.factExporter,
                         self.grdExporter, self.bdgExporter):
            exporter.start_exporting()

    def close_spider(self, spider):
        """Finish every exporter and close its file."""
        for exporter, handle in ((self.entExporter, self.entFile),
                                 (self.factExporter, self.factFile),
                                 (self.grdExporter, self.grdFile),
                                 (self.bdgExporter, self.bdgFile)):
            exporter.finish_exporting()
            handle.close()

    def process_item(self, item, spider):
        """Dispatch the item to the exporter matching its type."""
        if isinstance(item, EntityItem):
            self.entExporter.export_item(item)
        elif isinstance(item, FactItem):
            self.factExporter.export_item(item)
        elif isinstance(item, GradeItem):
            self.grdExporter.export_item(item)
        elif isinstance(item, BadgeItem):
            self.bdgExporter.export_item(item)
        return item
def __init__(self):
    """Open the category output file and start the CSV exporter."""
    # NOTE(review): `category` and `extension` must be module-level globals —
    # neither is defined in this snippet — and the "(unknown)" prefix looks
    # like a redacted/placeholder path segment.  Confirm before relying on
    # this filename.
    self.file = open(f"(unknown){category}{extension}", 'wb')
    self.exporter = CsvItemExporter(self.file)
    self.exporter.start_exporting()
def __init__(self):
    """Open test_scrape.csv and begin exporting."""
    out = open("test_scrape.csv", 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def open_spider(self, spider):
    """Open the output CSV in append mode so new rows extend previous runs."""
    # open in append mode so it adds instead of overwrites
    self.csvfile = open(self.filename, 'ab')
    exporter = CsvItemExporter(self.csvfile)
    exporter.start_exporting()
    self.exporter = exporter
def open_spider(self, spider):
    """Truncate-open the configured CSV file and start the exporter."""
    self.csvfile = open(self.filename, 'wb')
    exporter = CsvItemExporter(self.csvfile)
    exporter.start_exporting()
    self.exporter = exporter
def __init__(self):
    """Open category_links.csv and start exporting."""
    out = open("category_links.csv", 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter
def open_spider(self, spider):
    """Open the air-quality output CSV and start the exporter."""
    out = open('../File_data/ncity_air_quality_add.csv', 'wb')
    exporter = CsvItemExporter(out)
    exporter.start_exporting()
    self.file = out
    self.exporter = exporter