class GnewsPipeline(object):
    """Pipeline that exports Google News items to a per-spider CSV file."""

    def __init__(self):
        # One open CSV file handle per running spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and wire it into the spider lifecycle signals."""
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open '<spider>.csv' and start exporting with a fixed column order."""
        handle = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        # Same settings as the original positional call: header row on,
        # multi-valued fields joined with newlines.
        exporter = CsvItemExporter(handle, include_headers_line=True,
                                   join_multivalued='\n')
        exporter.fields_to_export = [
            'category', 'topstory', 'snippet', 'link', 'originallink',
            'sublinks', 'sublinktext', 'gpost', 'gpostsnip', 'extras',
            'extraslink', 'related',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        """Flush the exporter and close this spider's output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export *item* and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        return item
    '''def process_item(self, item, spider):
Example #2
0
class YangmaodangPipeline(object):
    """Collect scraped newsmth "freebie" posts, dump them to a dated CSV
    file sorted by reply count, and e-mail the file when the spider ends.
    """

    def __init__(self):
        # Dated output path, e.g. output/newsmth-20200101.csv
        self.filename = 'output/newsmth-%s.csv' % time.strftime('%Y%m%d')
        self.file = open(self.filename, 'wb')
        # Items are buffered here so they can be sorted before export.
        self.items = []

    def open_spider(self, spider):
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Sort articles by reply count, most-replied first, then export.
        by_replies = lambda entry: int(operator.itemgetter('reply_num')(entry))
        for entry in sorted(self.items, key=by_replies, reverse=True):
            self.exporter.export_item(entry)
        self.exporter.finish_exporting()
        self.file.close()
        send_email(self.filename)

    def process_item(self, item, spider):
        # Buffer only; the actual export happens in close_spider after sorting.
        self.items.append(item)
        return item
Example #3
0
class DataPipeline(object):
    """Write contact records to '<spider>_<YYYYmmdd>.csv'."""

    def __init__(self):
        # spider -> open CSV file handle
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open a date-stamped CSV file and start the exporter."""
        stamp = datetime.datetime.now().strftime('%Y%m%d')
        handle = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'id', 'first_name', 'last_name', 'company', 'email', 'title',
            'city', 'state', 'zip_code', 'country', 'address', 'address2',
            'headquarter_phone', 'contact_phone', 'updated',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #4
0
class CSVPipeline(object):
    """Export crawled product items to 'crawler products.csv'."""

    def __init__(self):
        # One open file handle per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect spider open/close signals.

        BUG FIX: the pipeline instance was never returned, so Scrapy
        received None and the pipeline was never installed.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the CSV file and start the exporter with a fixed column set."""
        result_file = open('crawler products.csv', 'w+b')
        self.files[spider] = result_file
        self.exporter = CsvItemExporter(result_file)
        self.exporter.fields_to_export = [
            'name', 'image', 'link', 'model', 'upc', 'ean', 'currencycode',
            'locale', 'price', 'saleprice', 'sku', 'retailer_key', 'instore',
            'shiptostore', 'shippingphrase', 'productstockstatus',
            'categories', 'gallery', 'features', 'condition'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish exporting and close the output file."""
        self.exporter.finish_exporting()
        result_file = self.files.pop(spider)
        result_file.close()

    def process_item(self, item, spider):
        """Export *item* and pass it downstream."""
        self.exporter.export_item(item)
        return item
Example #5
0
class CSVWriterPipeline(object):
    """Export BillionPricesIndia items to mobiles.csv."""

    filename = ""

    def __init__(self):
        self.files = {}
        # Keep a handle to the output file so it can be registered per
        # spider in spider_opened and closed in spider_closed.
        self.file1 = open("mobiles.csv", 'wb')
        self.exporter1 = CsvItemExporter(
            fields_to_export=BillionPricesIndiaItem.fields.keys(),
            file=self.file1)

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and connect it to the lifecycle signals.

        BUG FIX: this classmethod was accidentally nested inside __init__,
        so it was invisible to Scrapy and the signals were never wired.
        """
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened,
                                signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed,
                                signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # BUG FIX: register the file so spider_closed's pop() does not
        # raise KeyError (self.files was never populated before).
        self.files[spider] = self.file1
        self.exporter1.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter1.export_item(item)
        return item
Example #6
0
class CSVPipeline(object):
    """CSV export pipeline whose output path comes from spider.output_file
    and whose columns come from the EXPORT_FIELDS setting."""

    def __init__(self):
        self.files = {}
        # Stays None if the output file could not be opened.
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open spider.output_file; on failure, shut the spider down."""
        try:
            fo = open(spider.output_file, 'w+b')
        except IOError as e:
            spider.crawler.engine.close_spider(
                spider, "ERROR: Can't create CSV file: " + str(e))
            return
        self.files[spider] = fo
        self.exporter = CsvItemExporter(fo)
        self.exporter.fields_to_export = settings.getlist("EXPORT_FIELDS")
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Close the exporter and its file, if they were ever created."""
        if self.exporter is None:
            return
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #7
0
class CSVPipeline(object):
    """Export calendar-style event items to '<spider>_items.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'Subject', 'Start_Date', 'Start_Time', 'End_Date', 'End_Time',
            'Location', 'All_Day_Event',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #8
0
class GameListingPipeline(object):
    """Export listing items to CSV and mirror them into the database."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Insert *item* into the 'updatedListings' collection and export
        it to CSV.

        BUG FIX: the item is now returned so later pipeline stages still
        receive it — a pipeline stage that returns None silently drops
        the item for everything downstream.
        """
        keys = ['name', 'address', 'zipCode', 'jobCostMin', 'jobCostMax',
                'contactName', 'contactPhone', 'website', 'licenseNumber',
                'averageRating', 'profileUrl', 'followers', 'following',
                'badgeCount', 'projectCount', 'reviewCount', 'commentCount']
        dictionary = item_to_dictionary(item, keys)
        client.insert('updatedListings', dictionary, callback=insert_callback)

        self.exporter.export_item(item)
        return item
Example #9
0
class CSVPipeline(object):
    """Export university-ranking rows to '<spider>_items.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'name', 'rank', 'overallScore', 'teachingScore',
            'internationalOutlook', 'industryIncome', 'research',
            'citations', 'textBelow',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #10
0
class FinancePipeline(object):
    """Export OHLCV stock rows to '<spider.code>_stock.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # The output file is keyed by the spider's stock code, not its name.
        handle = open('%s_stock.csv' % spider.code, 'w+b')
        self.files[spider] = handle
        columns = ['date', 'Open', 'High', 'Low', 'Close', 'Volume',
                   'AdjClose']
        self.exporter = CsvItemExporter(handle, fields_to_export=columns)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #11
0
class EaCOpenListBotPipeline(object):
    """Export vendor/product rows to '<spider>_<category>_items.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One file per (spider, category) combination.
        handle = open('%s_%s_items.csv' % (spider.name, spider.category),
                      'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = ['vendor', 'product', 'default']
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class CSVPipeline(object):
    """Export team/track/region rows to '<spider>_items.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = ['team_year', 'track', 'region']
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #13
0
class CsvExportPipeline(object):
    """Append each item to a per-feed CSV file under the configured
    output directory, grouped by brand category."""

    def process_item(self, item, spider):
        """Write *item* to '<output>/<category>/<feed>.csv' and return it."""
        outputdir = '%s%s/%s' % (settings['ADAPTFM_OUTPUT_PATH'],
                                 spider.folder, item['brandCategory'][0])
        name = item['brandFeed'][0].replace('http://', '') \
                                   .replace('/', '_').replace('.xml', '')
        filename = '%s/%s.csv' % (outputdir, name)

        # BUG FIX: os.mkdir fails when intermediate directories are
        # missing; makedirs creates the whole path.
        if not os.path.isdir(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))

        # BUG FIX: a fresh exporter was created for every item against a
        # file opened in append mode, so a header row was written before
        # EVERY item. Only emit headers when the file is still empty.
        write_headers = (not os.path.exists(filename)
                         or os.path.getsize(filename) == 0)

        file = open(filename, 'a+b')
        try:
            self.exporter = CsvItemExporter(
                file, include_headers_line=write_headers)
            self.exporter.start_exporting()
            self.exporter.export_item(item)
            self.exporter.finish_exporting()
        finally:
            # Always release the handle, even if the exporter raises.
            file.close()

        return item
Example #14
0
class CsvExportPipeline(object):
    """Export Yahoo option-chain / price items to a timestamped CSV file.

    The column set is chosen from the spider's name; spiders matching
    neither pattern get a default exporter with all item fields.
    """

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_%s.csv' % (spider.name, int(time.time())), 'w+b')
        self.files[spider] = file
        if 'yopt' in spider.name:
            self.exporter = CsvItemExporter(file, fields_to_export=['date', 'instrument', 'option_symbol', 'symbol', 'expiration', 'type', 'strike', 'last', 'change', 'bid', 'ask', 'volume', 'open_int'], dialect='excel')
        elif 'prices' in spider.name:
            self.exporter = CsvItemExporter(file, fields_to_export=['date', 'open', 'high', 'low', 'close', 'volume', 'adj_close'], dialect='excel')
        else:
            # BUG FIX: previously no exporter was created for spiders
            # matching neither pattern, so start_exporting() below raised
            # AttributeError. Fall back to exporting all item fields.
            self.exporter = CsvItemExporter(file, dialect='excel')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Refuse None items; export everything else unchanged."""
        if item is None:
            raise DropItem("None")
        self.exporter.export_item(item)
        return item
Example #15
0
class CSVPipeline(object):
    """Export product detail rows to '<spider>_result.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_result.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'title', 'brand', 'description', 'price', 'main_image_url',
            'additional_image_urls', 'sku', 'category',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #16
0
class OpossumPipeline(object):
    """Convert scraped image items into ExportImageItem rows in a CSV file."""

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # NOTE(review): hard-coded absolute output path.
        self.file = open('/home/moorcock/work/mrs_opossum/items.csv', 'w+b')
        exporter = CsvItemExporter(self.file)
        exporter.fields_to_export = ['id', 'title', 'image', 'keywords']
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        # Derive the bare image name from the first stored image path:
        # ".../abc123.jpg" -> "abc123".
        image_name = item['images'][0]['path'].split('/')[-1].split('.')[0]
        item_exp = ExportImageItem(
            id=item['id'],
            title=item['title'].strip(' \t\n'),
            image=image_name,
            keywords=item['keywords']
        )
        self.exporter.export_item(item_exp)
        return item_exp
class TutorialPipeline(object):
    """Export movie rows to '<spider>_items.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        # Renamed from 'list', which shadowed the builtin.
        columns = ['id', 'title', 'time', 'director', 'year', 'star', 'cost']
        exporter.fields_to_export = columns
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #18
0
class BuildingsPipeline(object):
    """Export building records to buildings.csv using the module-level
    fields_to_export column list."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open("buildings.csv", "w+b")
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        # Column order is defined once at module level.
        exporter.fields_to_export = fields_to_export
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #19
0
class TutorialPipeline(object):
    """Export business-directory rows to mediabase.csv."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open("mediabase.csv", 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            "Type", "Area", "PlaceName", "Web", "Tel", "Address", "Zip",
            "Town", "Hours", "CompanyName", "OrganizationNo", "Turnover",
            "Employed", "LastName", "FirstName", "Telephone",
            "AllabolagUrl", "EniroUrl",
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ChainxyPipeline(object):
    """Export movie/series metadata to '<spider>_<YYYYmmdd>.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.now().strftime('%Y%m%d')
        handle = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'title', 'desc', 'image', 'stars', 'quality', 'imdb_code',
            'keywords', 'genres', 'year', 'first_air_date', 'eps', 'type',
            'server_f1', 'server_f2', 'vidnode', 'rapidvideo', 'streamango',
            'openload1', 'openload2',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class ProjetodsPipeline(object):
    """Clean job-posting items and export them to booksdata.csv."""

    def __init__(self):
        # Open the output CSV and start exporting immediately.
        self.file = open("booksdata.csv", 'wb')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    # Process each job detail
    def process_item(self, item, spider):
        """Normalize text fields, strip HTML, fill defaults, then export."""
        # Strip punctuation from the free-text fields.
        translator = str.maketrans('', '', string.punctuation)
        item['title'] = item['title'].translate(translator)
        item['local'] = item['local'].translate(translator).replace(" – ", " ")
        item['company_name'] = item['company_name'].translate(translator)
        # Remove HTML markup from the description.
        # BUG FIX: pin the parser explicitly; BeautifulSoup without one
        # emits GuessedAtParserWarning and picks whatever library happens
        # to be installed, so the output could vary between hosts.
        soup = BeautifulSoup(item['description'], "html.parser")
        item['description'] = soup.get_text(" ",
                                            strip=True).translate(translator)
        # Missing salary becomes "NA"; otherwise use '.' as the decimal
        # separator instead of ','.
        if item['salary'] is None:
            item['salary'] = "NA"
        else:
            item['salary'] = item['salary'].replace(",", ".")

        self.exporter.export_item(item)

        return item
Example #22
0
class ChainxyPipeline(object):
    """Export title/description/image rows to '<spider>_<YYYYmmdd>.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.now().strftime('%Y%m%d')
        handle = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = ['title', 'save', 'desc', 'long_desc',
                                     'image']
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #23
0
class ParkingCrawlerPipeline(object):
    """Log parking-lot occupancy items to /tmp/<spider>_log.csv
    (no header row)."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/tmp/%s_log.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['_id', 'name', 'count', 'free', 'timestamp', 'lat', 'lon', 'url']
        # BUG FIX: this was the string 'false', which is truthy, so the
        # header row was still written despite the clear intent here.
        self.exporter.include_headers_line = False
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    # NOTE: the original class defined process_item twice; the first,
    # pass-through definition was dead code and has been removed.
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #24
0
 def assertExportResult(self, item, expected, **kwargs):
     """Round-trip *item* through CsvItemExporter into an in-memory
     buffer and assert the produced CSV bytes equal *expected*.

     Extra **kwargs are forwarded to the CsvItemExporter constructor
     (e.g. fields_to_export, include_headers_line).
     """
     fp = BytesIO()
     ie = CsvItemExporter(fp, **kwargs)
     ie.start_exporting()
     ie.export_item(item)
     ie.finish_exporting()
     self.assertCsvEqual(fp.getvalue(), expected)
Example #25
0
class CSVPipeline(object):
    """Export per-round player scores to '<spider>.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        # 'player' followed by round_1 .. round_23, in order.
        exporter.fields_to_export = (
            ['player'] + ['round_%d' % n for n in range(1, 24)])
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #26
0
class MakkanPipeline(object):
    """Export real-estate listing rows to '<spider>_items.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'data_id', 'Building_name', 'config_type', 'Selling_price',
            'Monthly_Rent', 'lat', 'longt', 'platform', 'city',
            'listing_date', 'txn_type', 'property_type', 'locality', 'sqft',
            'Status', 'listing_by', 'name_lister', 'Details', 'address',
            'price_on_req', 'sublocality', 'age', 'google_place_id',
            'immediate_possession', 'mobile_lister', 'areacode',
            'management_by_landlord', 'carpet_area', 'updated_date',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #27
0
class GanjiPipeline(object):
    """Export Ganji listings to a timestamped CSV file, dropping items
    that lack a price or summary."""

    def __init__(self):
        # spider -> open output file handle
        self.target_files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def gen_filename(self, spider):
        """Return '<name>.<YYYYmmddHHMMSS>.csv' for *spider*."""
        stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        return '.'.join([spider.name, stamp, 'csv'])

    def spider_opened(self, spider):
        target_file = open(self.gen_filename(spider), 'wb')
        self.target_files[spider] = target_file
        exporter = CsvItemExporter(target_file)
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.target_files.pop(spider).close()

    def process_item(self, item, spider):
        """Drop items lacking a price or summary; export the rest."""
        if not (item.get('price') and item.get('summary')):
            raise DropItem('not price or summary')
        self.exporter.export_item(item)
        return item
Example #28
0
class CVSExport(object):
    """Export crawled posts to 'postUGR_items.csv'.

    BUG FIX: process_item was indented with tabs while the rest of the
    class used spaces, which is a TabError under Python 3; the whole
    class now uses uniform 4-space indentation.
    """

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('postUGR_items.csv', 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #29
0
class LandpinPipeline(object):
    """Export land-parcel rows to '<spider>_<YYYYmmdd>.csv'."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        stamp = datetime.datetime.now().strftime('%Y%m%d')
        handle = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.fields_to_export = [
            'url', 'state', 'county', 'apn', 'gps', 'size', 'price',
            'zoning', 'legal_description',
        ]
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #30
0
class CSVPipeline(object):
    """Export items to '<spider>_items.csv' with the exporter's default
    column order (all item fields)."""

    def __init__(self):
        # spider -> open output file
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        connect = crawler.signals.connect
        connect(pipeline.spider_opened, signals.spider_opened)
        connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        handle = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = handle
        exporter = CsvItemExporter(handle)
        exporter.start_exporting()
        self.exporter = exporter

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #31
0
class CsvExportPipeline(object):
    """Export (description, phone) pairs to a timestamped CSV file."""

    def __init__(self):
        # Connect lifecycle hooks through the dispatcher API.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        handle = open("%s_%s.csv" % (spider.name, int(time.time())), "w+b")
        self.files[spider] = handle
        self.exporter = CsvItemExporter(
            handle, fields_to_export=["description", "phone"])
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Refuse None items; export everything else unchanged."""
        if item is None:
            raise DropItem("None")
        self.exporter.export_item(item)
        return item
Example #32
0
class CsvExportPipeline(object):
    """Export product records to ``<spider>_<epoch>.csv`` in Excel dialect."""

    #: Column order for the exported CSV.
    COLUMNS = ['product_id', 'price', 'price_usd', 'currency', 'when_created',
               'source', 'title', 'heading', 'url', 'in_stock',
               'image']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        """Open the per-run output file and start the exporter."""
        out = open('%s_%s.csv' % (spider.name, int(time.time())), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, fields_to_export=self.COLUMNS,
                                        dialect='excel')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Refuse ``None`` items; export everything else unchanged."""
        if item is None:
            raise DropItem("None")
        self.exporter.export_item(item)
        return item
Example #33
0
class ChainxyPipeline(object):
    """Export store-location records to a date-stamped CSV file."""

    #: Column order for the exported CSV.
    COLUMNS = [
        'store_name', 'store_number', 'address', 'address2', 'city',
        'state', 'zip_code', 'country', 'phone_number', 'latitude',
        'longitude', 'store_hours', 'store_type', 'other_fields',
        'coming_soon',
    ]

    def __init__(self):
        # One open file handle per running spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``<spider>_<YYYYMMDD>.csv`` and start the CSV exporter."""
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self.COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the spider's output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #34
0
class CSVPipeline(object):
    """Export items to ``<spider>.csv`` with a fixed column order."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        output = open('%s.csv' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = CsvItemExporter(output)
        self.exporter.fields_to_export = [
            'brand', 'name', 'division', 'category', 'price', 'image_link']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #35
0
class YangmaodangPipeline(object):
    """Collect newsmth freebie posts, save them to a dated CSV file and
    send that file by e-mail when the spider finishes.

    (Docstring translated from the original Chinese.)
    """

    def __init__(self):
        self.filename = 'output/newsmth-' + time.strftime('%Y%m%d') + '.csv'
        self.file = open(self.filename, 'wb')
        # Items are buffered here so they can be sorted before export.
        self.items = []

    def open_spider(self, spider):
        """Attach the CSV exporter to the already-open output file."""
        self.exporter = CsvItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Sort buffered articles by reply count (descending), export them,
        close the file and mail it out."""
        ranked = sorted(self.items,
                        key=lambda art: int(art['reply_num']),
                        reverse=True)
        for article in ranked:
            self.exporter.export_item(article)

        self.exporter.finish_exporting()
        self.file.close()

        send_email(self.filename)

    def process_item(self, item, spider):
        """Buffer only; the actual export happens in ``close_spider``."""
        self.items.append(item)
        return item
Example #36
0
class TutorialPipeline(object):
    """Export id/title/time/director/year/star/cost records to
    ``<spider>_items.csv``."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        # `columns` replaces the original local named `list`, which
        # shadowed the builtin.
        columns = ['id', 'title', 'time', 'director', 'year', 'star', 'cost']
        self.exporter.fields_to_export = columns
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #37
0
class OregonPipeline(object):
    """Export bill/committee/session records to a date-stamped CSV."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``<spider>_<YYYYMMDD>.csv`` and start the exporter."""
        today = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, today), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            'bill', 'committee', 'session', 'text', 'url', 'filename']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #38
0
class CrawlerDataPipeline(object):
    """Export product records to a single ``data.csv`` file."""

    #: Column order for the exported CSV.
    COLUMNS = [
        'product_asin', 'product_name', 'product_is_have_patten',
        'product_description', 'image_link', 'original_image', 'color',
        'patten', 'price', 'imported_code'
    ]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``data.csv`` and start the exporter."""
        self.file = open('data.csv', 'w+b')
        self.exporter = CsvItemExporter(self.file)
        self.exporter.fields_to_export = self.COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #39
0
class CSVPipeline(object):
    """Export strain rating/effect/medical records to ``<spider>_items.csv``."""

    #: Column order for the exported CSV (unchanged from the original).
    COLUMNS = [
        'scrape_date', 'scrape_time', 'strain_name', 'strain_type', 'website',
        'strain_highlights', 'num_ratings', 'avg_rating', 'num_of_review',
        'flavor_one', 'flavor_two', 'flavor_three',
        'effect_one', 'effect_one_score', 'effect_two', 'effect_two_score',
        'effect_three', 'effect_three_score', 'effect_four', 'effect_four_score',
        'effect_five', 'effect_five_score',
        'medical_one', 'medical_one_score', 'medical_two', 'medical_two_score',
        'medical_three', 'medical_three_score', 'medical_four', 'medical_four_score',
        'medical_five', 'medical_five_score',
        'ailment_one', 'ailment_two', 'ailment_three', 'ailment_four', 'ailment_five',
        'negative_one', 'negative_one_score', 'negative_two', 'negative_two_score',
        'negative_three', 'negative_three_score', 'negative_four', 'negative_four_score',
        'negative_five', 'negative_five_score',
        'most_popular_one', 'most_popular_two', 'most_popular_three',
        'most_popular_four', 'most_popular_five', 'most_popular_six',
        'most_popular_seven', 'most_popular_eight', 'most_popular_nine',
        'most_popular_ten',
    ]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self.COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
class MultiCSVItemPipeline(object):
    """Export rating and summary items to two separate CSV files.

    Fixes over the original: the original mixed tabs and 4-space indents
    (an IndentationError), passed anonymous ``open()`` handles that could
    never be closed, and ``spider_closed`` popped ``self.files[spider]``
    even though ``self.files`` was never populated (guaranteed KeyError).
    Both file handles are now kept and closed explicitly.
    """

    def __init__(self):
        # Keep the handles so spider_closed can flush and close them.
        self.rating_file = open("profRating.csv", 'wb')
        self.summary_file = open("profSummary.csv", 'wb')
        self.exporter1 = CsvItemExporter(
            fields_to_export=ProfRatingItem.fields.keys(),
            file=self.rating_file)
        self.exporter2 = CsvItemExporter(
            fields_to_export=ProfSummaryItem.fields.keys(),
            file=self.summary_file)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Start both exporters when the spider opens."""
        self.exporter1.start_exporting()
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        """Finish both exports and close both output files."""
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        self.rating_file.close()
        self.summary_file.close()

    def process_item(self, item, spider):
        """Write the item to both CSV files and pass it along unchanged."""
        self.exporter1.export_item(item)
        self.exporter2.export_item(item)
        return item
Example #41
0
 def assertExportResult(self, item, expected, **kwargs):
     # Export a single item through CsvItemExporter into an in-memory
     # buffer and compare the raw CSV bytes against *expected*.
     # **kwargs are forwarded to the exporter constructor so callers can
     # vary options such as include_headers_line.
     fp = BytesIO()
     ie = CsvItemExporter(fp, **kwargs)
     ie.start_exporting()
     ie.export_item(item)
     ie.finish_exporting()
     self.assertCsvEqual(fp.getvalue(), expected)
Example #42
0
class ChainxyPipeline(object):
    """Export property-listing records to a date-stamped CSV file."""

    #: Column order for the exported CSV.
    COLUMNS = ["name", "number", "item_type", "location", "building",
               "bedroom", "bathroom", "size", "title_deep_number",
               "description", "date", "link", "photo"]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``<spider>_<YYYYMMDD>.csv`` and start the exporter."""
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self.COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #43
0
class AirtspidersPipeline(object):
    """Export per-company stock statistics to ``<spider>_items.csv``.

    Fixes over the original: ``__init__`` set ``self.files`` to
    ``{SIXCOMPDATA.csv}`` (a NameError at construction time), and
    ``from_crawler`` was missing its ``@classmethod`` decorator, so it
    could never work as a crawler factory hook.
    """

    def __init__(self):
        # Must start as an empty spider -> file-handle mapping.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = [
            'Company Name', 'Current Price', 'Previous Close', 'Day\'s Range',
            'Historical Volatility', 'Market Cap', 'Shares Outstanding', 'EPS',
            'P/E Ratio', 'Beta (Volatility)', 'Percent Held by Institutions'
        ]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #44
0
class PwdhoundsPipeline(object):
    """Export id/name/link records to ``<spider>_items.csv``."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = ['id', 'name', 'link', 'index', 'parent_id']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #45
0
class CSVPipeline(object):
    """Export real-estate listing records to ``<spider>_items.csv``
    without a header row."""

    #: Column order for the exported CSV.
    COLUMNS = ["url", "status", "date", "mls", "address", "price", "beds",
               "baths", "homesize", "lotsize", "description", "images"]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start a headerless CSV exporter."""
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, include_headers_line=False)
        self.exporter.fields_to_export = self.COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #46
0
class CsvExportPipeline(object):
    """Export items to the path given by the spider's ``output_file``."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Resolve the spider's output path, open it and start exporting."""
        self.filepath = os.path.abspath(spider.output_file)
        out = open(self.filepath, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter, close the file and log where it went."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()
        log.msg('CSV output file location: "%s"' % self.filepath)

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #47
0
class YoutubespiderPipeline(object):
    """Export YouTube video statistics to ``data-master.csv``."""

    def __init__(self):
        """
        Define the CSVItemExporter for the YouTubeDataModel.

        Item Exportation, file encoding and the sequence of fields defined.
        """
        # Keep the handle so spider_closed can actually close the file —
        # the original passed an anonymous open() and leaked the descriptor.
        self.csv_file = open('data-master.csv', 'wb')
        self.csv_exporter = CsvItemExporter(self.csv_file)
        self.csv_exporter.encoding = 'utf-8'
        self.csv_exporter.fields_to_export = [
            'url', 'title', 'views', 'likes', 'dislikes', 'channel_name',
            'publish_date', 'channel_subscriber_count'
        ]

        self.csv_exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file.

        NOTE(review): nothing visible here connects this method to the
        ``spider_closed`` signal; it relies on external wiring — confirm.
        """
        self.csv_exporter.finish_exporting()
        self.csv_file.close()

    def process_item(self, item, spider):
        """
        Exports item through Item Exporter

        :param item: containing the data
        :param spider: spider that extracted and saved inside item
        :return: the item itself
        """
        self.csv_exporter.export_item(item)
        return item
Example #48
0
class MroPipeline(object):
    """Optional CSV exporter, enabled per-spider via ``_custom_csv``."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Return the pipeline, or None for spiders without ``_custom_csv``."""
        if not getattr(crawler.spider, '_custom_csv', False):
            return None
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the spider's chosen output file and start the exporter.

        The filename defaults to ``result_<spider>.csv`` unless the spider
        defines ``output_filename``; ``output_fields`` (if any) fixes the
        column order.
        """
        filename = getattr(spider, 'output_filename',
                           'result_{}.format'.replace('format', '{}').format())  # placeholder never used
        filename = getattr(spider, 'output_filename',
                           'result_{}.csv'.format(spider.name))
        handle = open(filename, 'w+b')
        self.files[spider] = handle
        self.exporter = CsvItemExporter(handle)
        self.exporter.fields_to_export = getattr(spider, 'output_fields', None)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
class FtcompanydataPipeline(object):
    """Export company-data items to ``<spider>_.csv``."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        out = open('%s_.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #50
0
class ScarpylinkPipeline(object):
    """Export scraped links to ``<spider>_products.csv``, echoing each one
    as an HTML anchor on stdout."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        out = open('%s_products.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Print the item as an anchor tag, export it and pass it along."""
        anchor = "<a href='%s' title='%s'>%s</a>" % (
            item['url'], item['title'], item['title'])
        print(anchor)
        self.exporter.export_item(item)
        return item
Example #51
0
class CSVPipeline(object):
    """Append name/address records to a date-range-named, headerless,
    semicolon-delimited, fully-quoted CSV file."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open (append mode) the date-range output file and start exporting."""
        out = open("{}_{}_{}__{}_{}_{}.csv".format(
            spider.pdda, spider.pddm, spider.pddj,
            spider.pdfa, spider.pdfm, spider.pdfj), 'a+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out, include_headers_line=False,
                                        delimiter=';', quoting=csv.QUOTE_ALL)
        self.exporter.fields_to_export = [
            "name", "address", "zipcode", "city", "number", "date"]
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #52
0
class AsiaOddsPipeline(object):
    """Write Asian-handicap odds items to a per-spider CSV file.

    Fix over the original: the output files were opened in text mode
    ('w'), but CsvItemExporter writes bytes to its file object, which
    fails on Python 3 — both opens now use binary mode ('wb').
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open a match-dated or generic output file and start exporting."""
        if isinstance(spider, MatchSpider):
            file = open('%s_asia_%s.csv' % (spider.name, spider.match_date), 'wb')
        else:
            file = open('%s_output.csv' % spider.name, 'wb')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        """Export only AsiaOddsItem instances; pass everything through."""
        if isinstance(item, AsiaOddsItem):
            self.exporter.export_item(item)
            return item
        else:
            return item

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Example #53
0
class DoubantvPipeline(object):
    """Pipeline that writes douban TV items to ``douban_tv_hanju.csv``.

    Fix over the original: it subclassed ``CsvItemExporter``, which is an
    exporter, not a pipeline base class — the inheritance was never used
    (no ``super().__init__`` call, no inherited method relied upon) and
    tied the class to the exporter's constructor contract. A plain
    ``object`` base is correct for a Scrapy pipeline.
    """

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        print('==========pipeline==========from_crawler==========')
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        filename = 'douban_tv_hanju.csv'
        savefile = open(filename, 'wb+')
        self.files[spider] = savefile
        print('==========pipeline==========spider_opened==========')
        self.exporter = CsvItemExporter(savefile)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        print('==========pipeline==========spider_closed==========')
        self.exporter.finish_exporting()
        savefile = self.files.pop(spider)
        savefile.close()

    def process_item(self, item, spider):
        """Log the item type, export it and pass it along unchanged."""
        print('==========pipeline==========process_item==========')
        print(type(item))
        self.exporter.export_item(item)
        return item
Example #54
0
class ChainxyPipeline(object):
    """Export sports-odds records to a date-stamped CSV file."""

    #: Column order for the exported CSV.
    COLUMNS = [
        'Sport_name', 'Date', 'Time', 'Team1_name', 'Team1_points',
        'Team1_spread', 'Team1_win', 'Team1_total', 'Team2_name',
        'Team2_points', 'Team2_spread', 'Team2_win', 'Team2_total', 'Draw',
        'last_update'
    ]

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``<spider>_<YYYYMMDD>.csv`` and start the exporter."""
        stamp = datetime.datetime.strftime(datetime.datetime.now(), '%Y%m%d')
        out = open('%s_%s.csv' % (spider.name, stamp), 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = self.COLUMNS
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #55
0
class WikipediaEventCSVPipeline(object):
    """Export dated news-event records to ``<spider>_items.csv``."""

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the exporter."""
        out = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = CsvItemExporter(out)
        self.exporter.fields_to_export = [
            'date', 'day_of_week', 'category', 'sub_category',
            'news_header', 'source_names', 'source_list']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
Example #56
0
class ExportCSV(object):
    """
    Exporting to export/csv/spider-name.csv file
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``exports/csv/<spider>.csv`` and start the exporter."""
        target = open('exports/csv/%s.csv' % spider.name, 'w+b')
        self.files[spider] = target
        self.exporter = CsvItemExporter(target)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Write the item to CSV and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
class IcaneMetadataPipeline(object):
    """Export ICANE metadata records whose required fields are incomplete.

    Only records with at least one empty metadata field are written to the
    CSV, so the file collects the entries that still need curation.
    """

    # Fields checked, in order, for "empty" (falsy list or falsy first
    # element). The order only determines which check fires first; the
    # action is the same for all of them.
    _REQUIRED = ('sourceLabel', 'sourceLink', 'initialPeriod', 'lastPeriod',
                 'periodicity', 'referenceArea', 'dataUpdated')

    def __init__(self):
        # Signal wiring both via pydispatch (legacy) and from_crawler,
        # as in the original.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook it into the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open ``metadata_historical.csv`` and start the exporter."""
        file = open('metadata_historical.csv', 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['uri', 'title', 'dataUpdated', 'units', 'sourceLink', 'initialPeriod', 'lastPeriod', 'sourceLabel', 'periodicity', 'referenceArea']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Flush the exporter and close the output file."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def _export_incomplete(self, item):
        # Helper: write the (incomplete) item and pass it downstream.
        self.exporter.export_item(item)
        return item

    def process_item(self, item, spider):
        """Export *item* iff any required metadata field is empty.

        Replaces the original seven-way copy-pasted ``elif`` chain with a
        loop; the checks run in the same order and with the same semantics.
        """
        for field in self._REQUIRED:
            if not item[field] or not item[field][0]:
                return self._export_incomplete(item)
        if item['units'][0]:
            units = re.sub('[.]', '', item['units'][0])  # remove any dot
            units = ''.join(units.split())  # remove whitespace
            if not units:
                return self._export_incomplete(item)
            # NOTE(review): a fully-populated item falls through here and
            # returns None (silently dropping it) — behavior kept from the
            # original; confirm whether a DropItem was intended instead.
        else:
            raise DropItem("Discarded item: metadata fields filled.")
Example #58
0
 def test_header_export_two_items(self):
     # Exporting the same item twice must emit the header line once,
     # followed by one CSV row per item; the exporter must accept both
     # Item instances and plain dicts.
     for item in [self.i, dict(self.i)]:
         output = BytesIO()
         ie = CsvItemExporter(output)
         ie.start_exporting()
         ie.export_item(item)
         ie.export_item(item)
         ie.finish_exporting()
         self.assertCsvEqual(output.getvalue(), 'age,name\r\n22,John\xc2\xa3\r\n22,John\xc2\xa3\r\n')
    def test_join_multivalue(self):
        """A list-valued field is comma-joined and CSV-quoted on export."""
        class JoinedItem(Item):
            name = Field()
            friends = Field()

        item = JoinedItem(name='John', friends=['Mary', 'Paul'])
        buf = StringIO()
        exporter = CsvItemExporter(buf, include_headers_line=False)
        exporter.start_exporting()
        exporter.export_item(item)
        exporter.finish_exporting()
        self.assertEqual(buf.getvalue(), '"Mary,Paul",John\r\n')