Code Example #1
File: pipelines.py Project: taichao/newsspider
class BaseFilePipeline(object):
    def __init__(self,saved_path):
        self.files = {}
        self.saved_path = saved_path
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls(crawler.settings.get('SAVED_PATH'))
        return pipeline


    def open_spider(self, spider):
        tp = self.gettype()['name']
        filename = '%s_%s.json' % (spider.name,tp)
        filename = os.path.join(self.saved_path,filename)

        file_ = open(filename,'w+b')
        self.files[spider] = file_
        self.exporter = JsonItemExporter(file_,ensure_ascii=False,encoding='utf-8')
        self.exporter.start_exporting()

    def gettype(self):
        # Subclasses override this to return export-type metadata, e.g. {'name': ...}.
        raise NotImplementedError

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file_ = self.files.pop(spider)
        file_.close()
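
Note on Example #1: gettype() is deliberately left abstract, so BaseFilePipeline is meant to be subclassed. A minimal sketch of a concrete subclass, using hypothetical names not taken from the project:

class NewsFilePipeline(BaseFilePipeline):
    def gettype(self):
        # Export-type metadata used by open_spider to build the output filename.
        return {'name': 'news'}

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item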
Code Example #2
 def spider_opened(self, spider):
     if spider.__class__ == MayorsSpider:
         mayor_file = open("data/mayor_candidates.json", "w+b")
         council_file = open("data/city_counsils.json", "w+b")
         self.files.append(mayor_file)
         self.files.append(council_file)
         self.mayor_exporter = JsonItemExporter(mayor_file)
         self.council_exporter = JsonItemExporter(council_file)
         self.mayor_exporter.start_exporting()
         self.council_exporter.start_exporting()
     elif spider.__class__ == RegionCountiesSpider:
         counties_file = open("data/region_counties.json", "w+b")
         self.counties_exporter = JsonItemExporter(counties_file)
         self.files.append(counties_file)
         self.counties_exporter.start_exporting()
Code Example #3
File: pipelines.py Project: spsp01/spider_ceneo
 def open_spider(self, spider):
     # Creates 4 files for storage scraped items
     self.category_file = open('spider/scraped/category.json', 'wb')
     self.category_exporter = JsonItemExporter(self.category_file, encoding="utf-8")
     self.category_exporter.start_exporting()
     self.product_file = open('spider/scraped/product.json', 'wb')
     self.product_exporter = JsonItemExporter(self.product_file, encoding="utf-8")
     self.product_exporter.start_exporting()
     self.shop_file = open('spider/scraped/shop.json', 'wb')
     self.shop_exporter = JsonItemExporter(self.shop_file, encoding="utf-8")
     self.shop_exporter.start_exporting()
     self.product_price_file = open('spider/scraped/productprice.json', 'wb')
     self.product_price_exporter = JsonItemExporter(self.product_price_file, encoding="utf-8")
     self.product_price_exporter.start_exporting()
Code Example #4
File: pipelines.py Project: Chouvic/food2vec
 def __init__(self, spider_name):
   self.file = open("output/{}_recipes.json".format(spider_name), 'wb')
   self.file.write(
       ('{"date_scraped": "%s", "recipes": ' % datetime.datetime.now()).encode('utf-8')
   )
   self.exporter = JsonItemExporter(self.file, encoding='utf-8',
                                    ensure_ascii=False)
   self.exporter.start_exporting()
Code Example #5
File: pipelines.py Project: taichao/newsspider
    def open_spider(self, spider):
        tp = self.gettype()['name']
        filename = '%s_%s.json' % (spider.name,tp)
        filename = os.path.join(self.saved_path,filename)

        file_ = open(filename,'w+b')
        self.files[spider] = file_
        self.exporter = JsonItemExporter(file_,ensure_ascii=False,encoding='utf-8')
        self.exporter.start_exporting()
Code Example #6
File: pipelines.py Project: bgcolors/ztcrawl
class JsonExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_data.json' % spider.name, 'w+b')  # JsonItemExporter expects a binary file, not a codecs wrapper
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #7
class JsonExportPipeline(object):
    """
    app.pipelines.exporter_json.JsonExportPipeline
    """
    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file_json = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file_json
        self.exporter = JsonItemExporter(file_json)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file_json = self.files.pop(spider)
        file_json.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #8
File: pipelines.py Project: AnnaYe/NjuptSpider
class JsonPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('/home/gaoliang/Desktop/result.json', 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, ensure_ascii=False)  # ensure_ascii=False keeps Chinese text readable in the saved JSON
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #9
File: pipelines.py Project: eatskolnikov/DEVCA2016
class SaveItemToJson(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file=file)
        print(self.exporter)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #10
File: pipelines.py Project: Chouvic/food2vec
class JsonPipeline(object):
  """Save Pipeline output to JSON."""
  def __init__(self, spider_name):
    self.file = open("output/{}_recipes.json".format(spider_name), 'wb')
    self.file.write(
        ('{"date_scraped": "%s", "recipes": ' % datetime.datetime.now()).encode('utf-8')
    )
    self.exporter = JsonItemExporter(self.file, encoding='utf-8',
                                     ensure_ascii=False)
    self.exporter.start_exporting()

  @classmethod
  def from_crawler(cls, crawler):
    return cls(
        spider_name=crawler.spider.name
    )

  def close_spider(self, spider):
    self.exporter.finish_exporting()
    self.file.write(b"}")
    self.file.close()

  def process_item(self, item, spider):
    self.exporter.export_item(item)
    return item
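
Examples #4 and #10 wrap the exporter's output in a hand-written JSON envelope: the prefix written in __init__, the array emitted by JsonItemExporter between start_exporting() and finish_exporting(), and the closing "}" written in close_spider together form one parseable document. A minimal sketch of reading such a file back, assuming a hypothetical spider named 'example':

import json

with open('output/example_recipes.json', encoding='utf-8') as f:
    data = json.load(f)
print(data['date_scraped'], len(data['recipes']))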
Code Example #11
File: pipelines.py Project: 65kg/core-scrapy
class JsonExportPipeline(object):
    def __init__(self):
        _log.info('JsonExportPipeline.init....')
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        _log.info('JsonExportPipeline.from_crawler....')
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        _log.info('JsonExportPipeline.spider_opened....')
        file = open('%s.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        _log.info('JsonExportPipeline.spider_closed....')
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        _log.info('JsonExportPipeline.process_item....')
        self.exporter.export_item(item)
        return item
Code Example #12
File: mixins.py Project: gilbertoalexsantos/uris
class JsonPipelineExporterMixin:
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # The host class is expected to provide self.files (e.g. files = {} or set in __init__).
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        item = self.pre_process_item(item)
        self.exporter.export_item(item)
        return item

    def pre_process_item(self, item):
        return item
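
The mixin never initializes self.files itself, so a host class has to supply it. A minimal sketch of one way to use the mixin, with hypothetical names:

class ArticleJsonPipeline(JsonPipelineExporterMixin):
    files = {}  # the mixin's spider_opened/spider_closed expect this mapping

    def pre_process_item(self, item):
        # Hypothetical hook: normalize a field before export.
        item['title'] = item['title'].strip()
        return item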
Code Example #13
File: pipelines.py Project: 201585052/-
class JsonExporterPipeline(object):
    # Use Scrapy's JsonItemExporter to export items to a JSON file
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #14
File: pipelines.py Project: sunnykaka/skwander
    def process_item(self, item, spider):

        designer_dir_name = skutils.escape_filename(item['name'])
        designer_dir_path = os.path.join(GlobalState.data_dir, designer_dir_name)
        file_path = os.path.join(designer_dir_path, designer_dir_name)

        # write json file
        with open('%s.json' % file_path, 'w+b') as f:
            exporter = JsonItemExporter(f)
            exporter.start_exporting()
            exporter.export_item(item)
            exporter.finish_exporting()

        # write excel file
        excelutils.write_designer_excel(item, file_path, designer_dir_name)

        return item
Code Example #15
File: pipelines.py Project: anamarce/Visions-crawler
class VisionsJsonPipeline(object):
    def __init__(self):
        self.exporter = None

    def open_spider(self, spider):
        self.file = open('%s.json' % spider.name, 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
Code Example #16
File: pipelines.py Project: kunghunglu/lmcrawler
class JsonWriterPipeline(BaseItemExporter):

  def __init__(self, **kwargs):
    self._configure(kwargs)
    self.files = {} 
    self.encoder = json.JSONEncoder(ensure_ascii=False, **kwargs)
 
  @classmethod
  def from_crawler(cls, crawler):
    pipeline = cls()
    crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
    crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
    return pipeline

  def spider_opened(self, spider):
    file = open('item.json', 'wb')  # JsonItemExporter expects a binary file
    self.files[spider] = file
    self.exporter = JsonItemExporter(file)
    self.exporter.start_exporting()

  def spider_closed(self, spider):
    self.exporter.finish_exporting()
    file = self.files.pop(spider)
    file.close() 

  def process_item(self, item, spider):

    if item['title']: # and item['image_url'] :
      item['description'] = re.sub("\r|\n","", item['description'])
      item['general_impression'] = re.sub("\r|\n","", item['general_impression'])
      item['subject_of_photo'] = re.sub("\r|\n","", item['subject_of_photo'])
      item['composition'] = re.sub("\r|\n","", item['composition'])
      item['use_of_camera'] = re.sub("\r|\n","", item['use_of_camera'])
      item['depth_of_field'] = re.sub("\r|\n","", item['depth_of_field'])
      item['color_lighting'] = re.sub("\r|\n","", item['color_lighting'])
      item['focus'] = re.sub("\r|\n","", item['focus'])

      ##line = json.dumps(dict(item)) + '\n'
      ##self.file.write(line)
      self.exporter.export_item(item)
    return item   
Code Example #17
File: pipelines.py Project: marcmilan/urban-fiesta
class JsonExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Code Example #18
class WikicrawlerPipeline(object):

    def __init__(self):
        self.item_file = open('items.json', 'wb')
        self.exporter = JsonItemExporter(self.item_file)

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def spider_closed(self):
        self.exporter.finish_exporting()
        self.item_file.close()
Code Example #19
File: pipelines.py Project: lores/bi_spider
class SiteMapJsonExportPipeline(object):
	'''Process the SiteMap spider output Items, and write them as JSON to an output file. The output file is taken from the Spider's config (spider.config)'''

	@classmethod
	def from_crawler(cls, crawler):
		''' Boilerplate '''
		pipeline = cls()
		crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
		crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
		return pipeline

	def spider_opened(self, spider):
		self.file = open(spider.config['map_file'], 'wb')
		self.exporter = JsonItemExporter(self.file)
		self.exporter.start_exporting()

	def spider_closed(self, spider):
		self.exporter.finish_exporting()
		self.file.close()

	def process_item(self, item, spider):
		self.exporter.export_item(item)
		return item
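
The output path in Example #19 comes from spider.config rather than from project settings. A minimal sketch of a spider carrying that attribute (names are assumptions, not from the project):

import scrapy

class SiteMapSpider(scrapy.Spider):
    name = 'sitemap'
    config = {'map_file': 'sitemap_items.json'}  # read by SiteMapJsonExportPipeline.spider_opened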
Code Example #20
 def __init__(self):
     self.file = open('../data/search_results.json', 'wb')
     self.exporter = JsonItemExporter(self.file,encoding='utf-8',ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #21
File: pipelines.py Project: anamarce/Visions-crawler
 def open_spider(self, spider):
     self.file = open('%s.json' % spider.name, 'wb')
     self.exporter = JsonItemExporter(self.file)
     self.exporter.start_exporting()
Code Example #22
 def open_spider(self, spider):
     f = open('items.json', 'wb')
     self.exporter = JsonItemExporter(f)
     self.exporter.start_exporting()
Code Example #23
 def __init__(self):
     # self.file = open('/output/article_exporter.json', 'wb')
     self.file = open('/output/company_exporter.json', 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
Code Example #24
File: pipelines.py Project: LouiseMcMahon/UKCaveGIS
 def open_spider(self, spider):
     self.files[spider.registry] = open('data/' + spider.registry + '.json',
                                        'wb')
     self.exporters[spider.registry] = JsonItemExporter(
         self.files[spider.registry], encoding='utf-8', ensure_ascii=False)
     self.exporters[spider.registry].start_exporting()
Code Example #25
 def _get_exporter(self, **kwargs):
     return JsonItemExporter(self.output, **kwargs)
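
Example #25 is a factory hook: the surrounding class (not shown) owns self.output and calls _get_exporter() wherever it needs an exporter, so a subclass can change the output format by overriding one method. A hedged sketch of such an override, assuming a hypothetical base class:

class JsonLinesVariant(BaseExportPipeline):  # hypothetical base class owning self.output
    def _get_exporter(self, **kwargs):
        # Swap in a JSON-lines exporter without touching the rest of the pipeline.
        return JsonLinesItemExporter(self.output, **kwargs)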
Code Example #26
File: pipelines.py Project: easherma/scrape-n-geo
 def __init__(self):
     self.file = open(generate_file_name('json', 'output'), 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #27
File: pipelines.py Project: kunghunglu/lmcrawler
 def spider_opened(self, spider):
   file = open('item.json', 'wb')  # JsonItemExporter expects a binary file
   self.files[spider] = file
   self.exporter = JsonItemExporter(file)
   self.exporter.start_exporting()
Code Example #28
File: pipelines.py Project: 65kg/core-scrapy
 def spider_opened(self, spider):
     _log.info('JsonExportPipeline.spider_opened....')
     file = open('%s.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonItemExporter(file)
     self.exporter.start_exporting()
Code Example #29
 def __init__(self):
     self.file = open('article2.json', 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
Code Example #30
File: pipelines.py Project: Tmacshamgod/WikiCrawler
 def __init__(self):
     self.item_file = open('items.json', 'wb')
     self.exporter = JsonItemExporter(self.item_file)
Code Example #31
 def open_spider(self, spider):
     time_now = datetime.datetime.now().strftime('%m-%d-%Y')
     file_name = f"House-{time_now}.json"
     self.file = open(file_name, 'wb')
     self.exporter = JsonItemExporter(self.file)
     self.exporter.start_exporting()
Code Example #32
 def spider_opened(self, spider):
     file = open(self.out_file, 'w+b')
     self.files[spider] = file
     self.exporter = JsonItemExporter(file)
     self.exporter.start_exporting()
Code Example #33
 def __init__(self):
     self.item_file = open('items.json', 'wb')
     self.exporter = JsonItemExporter(self.item_file)
Code Example #34
 def __init__(self):
     self.file = open("trans.json", 'wb')
     self.exporter = JsonItemExporter(
         self.file
     )  # JSON objects are unordered by definition, so reorder the columns later in Python
     self.exporter.start_exporting()
Code Example #35
File: pipelines.py Project: rajesh67/shopstats
	def spider_opened(self, spider):
		file = open('%s_%s.json' % (spider.name,spider.categoryId), 'w+b')
		self.files[spider] = file
		self.exporter = JsonItemExporter(file)
		self.exporter.start_exporting()
Code Example #36
 def __init__(self):
     self.fp = open("tv.json", "wb")
     self.exporter = JsonItemExporter(self.fp, encoding='utf-8', ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #37
 def __init__(self):
     self.file = open("news_Crawl_from20060101_200809011.json", 'wb')
     self.exporter = JsonItemExporter(self.file, encoding='utf-8')
     self.exporter.start_exporting()
Code Example #38
File: json.py Project: louis-xy/spider-service
 def __init__(self, file_path):
     self.file = open("{}".format(file_path), 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #39
File: pipelines.py Project: gzgdouru/scrapy_test
 def __init__(self):
     self.file = open("articleexport.json", "wb")
     self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #40
File: pipelines.py Project: ThePornDatabase/scrapy
class TpdbApiScenePipeline:
    def __init__(self, crawler):
        if crawler.settings['ENABLE_MONGODB']:
            db = MongoClient(crawler.settings['MONGODB_URL'])
            self.db = db['scrapy']

        self.crawler = crawler

        if crawler.settings.get('path'):
            path = crawler.settings.get('path')
        else:
            path = crawler.settings.get('DEFAULT_EXPORT_PATH')

        if crawler.settings.get('file'):
            filename = crawler.settings.get('file')
            if '\\' not in filename and '/' not in filename:
                filename = Path(path, filename)
        else:
            filename = Path(path, '%s_%s.json' % (crawler.spidercls.name, time.strftime('%Y%m%d-%H%M')))

        if crawler.settings.getbool('export'):
            print(f'*** Exporting to file: {filename}')
            self.fp = open(filename, 'wb')
            self.fp.write('{"scenes":['.encode())

            if crawler.settings.getbool('oneline'):
                self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
            else:
                self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8', sort_keys=True, indent=2)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    async def process_item(self, item, spider):
        if spider.debug is True:
            return item

        # So we don't re-send scenes that have already been scraped
        if self.crawler.settings['ENABLE_MONGODB']:
            if spider.force is not True:
                result = self.db.scenes.find_one({'url': item['url']})
                if result is not None:
                    return

        payload = {
            'title': item['title'],
            'description': item['description'],
            'date': item['date'],
            'image': item['image'],
            'image_blob': item['image_blob'],
            'url': item['url'],
            'performers': item['performers'],
            'tags': item['tags'],
            'external_id': str(item['id']),
            'site': item['site'],
            'trailer': item['trailer'],
            'parent': item['parent'],
            'network': item['network'],
            'force_update': self.crawler.settings.getbool('FORCE_UPDATE'),
        }

        # Post the scene to the API - requires auth with permissions
        if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get('local'):
            headers = {
                'Authorization': 'Bearer %s' % self.crawler.settings['TPDB_API_KEY'],
                'Accept': 'application/json',
                'Content-Type': 'application/json',
                'User-Agent': 'tpdb-scraper/1.0.0'
            }

            response = Http.post('https://api.metadataapi.net/scenes', json=payload, headers=headers)
            if response:
                if response.ok:
                    disp_result = 'Submitted OK'
                else:
                    disp_result = 'Submission Error: Code #%d' % response.status_code
            else:
                disp_result = 'Submission Error: No Response Code'

            url_hash = hashlib.sha1(str(item['url']).encode('utf-8')).hexdigest()

            if self.crawler.settings['ENABLE_MONGODB']:
                if not response.ok:
                    self.db.errors.replace_one({'_id': url_hash}, {
                        'url': item['url'],
                        'error': 1,
                        'when': datetime.now().isoformat(),
                        'response': response.json()
                    }, upsert=True)
                else:
                    self.db.scenes.replace_one(
                        {'_id': url_hash}, dict(item), upsert=True)
        else:
            disp_result = 'Local Run, Not Submitted'

        if spider.settings.getbool('display') and spider.settings.get('LOG_LEVEL') == 'INFO':
            if len(item['title']) >= 50:
                title_length = 5
            else:
                title_length = 55 - len(item['title'])

            if len(item['site']) >= 15:
                site_length = 5
            else:
                site_length = 20 - len(item['site'])

            if "T" in item['date']:
                disp_date = re.search(r'(.*)T\d', item['date']).group(1)
            else:
                disp_date = item['date']

            print(f"Item: {item['title'][0:50]}" + " " * title_length + f"{item['site'][0:15]}" + " " * site_length + f"\t{str(item['id'])[0:15]}\t{disp_date}\t{item['url']}\t{disp_result}")

        if spider.settings.getbool('export'):
            item2 = item.copy()
            if not spider.settings.get('showblob'):
                if 'image_blob' in item2:
                    item2.pop('image_blob', None)
            self.exporter.export_item(item2)

        return item

    def close_spider(self, spider):
        if spider.settings.getbool('export'):
            self.fp.write(']}'.encode())
            self.fp.close()
Code Example #41
 def open_spider(self, spider):
     self.file = open(result_json_path, "wb")
     self.exporter = JsonItemExporter(self.file,
                                      encoding="utf-8",
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #42
File: pipelines.py Project: ThePornDatabase/scrapy
class TpdbApiPerformerPipeline:
    def __init__(self, crawler):
        if crawler.settings['ENABLE_MONGODB']:
            db = MongoClient(crawler.settings['MONGODB_URL'])
            self.db = db['scrapy']

        self.crawler = crawler

        if crawler.settings.get('path'):
            path = crawler.settings.get('path')
        else:
            path = crawler.settings.get('DEFAULT_EXPORT_PATH')

        if crawler.settings.get('file'):
            filename = crawler.settings.get('file')
            if '\\' not in filename and '/' not in filename:
                filename = Path(path, filename)
        else:
            filename = Path(path, '%s_%s-performers.json' % (crawler.spidercls.name, time.strftime('%Y%m%d-%H%M')))

        if crawler.settings.getbool('export'):
            print(f"*** Exporting to file: {filename}")
            self.fp = open(filename, 'wb')
            self.fp.write('{"scenes":['.encode())

            if crawler.settings.getbool('oneline'):
                self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
            else:
                self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8', sort_keys=True, indent=2)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    async def process_item(self, item, spider):
        if self.crawler.settings['ENABLE_MONGODB']:
            if spider.force is not True:
                result = self.db.performers.find_one({'url': item['url']})
                if result is not None:
                    return

        payload = {
            'name': item['name'],
            'site': item['network'],
            'url': item['url'],
            'image': item['image'],
            'image_blob': item['image_blob'],
            'bio': item['bio'],
            'gender': item['gender'],
            'birthday': item['birthday'],
            'astrology': item['astrology'],
            'birthplace': item['birthplace'],
            'ethnicity': item['ethnicity'],
            'nationality': item['nationality'],
            'eyecolor': item['eyecolor'],
            'haircolor': item['haircolor'],
            'weight': item['weight'],
            'height': item['height'],
            'measurements': item['measurements'],
            'tattoos': item['tattoos'],
            'piercings': item['piercings'],
            'cupsize': item['cupsize'],
            'fakeboobs': item['fakeboobs'],
        }

        # Post the scene to the API - requires auth with permissions
        if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get('local'):
            headers = {
                'Authorization': 'Bearer %s' % self.crawler.settings['TPDB_API_KEY'],
                'Accept': 'application/json',
                'Content-Type': 'application/json',
                'User-Agent': 'tpdb-scraper/1.0.0'
            }

            response = Http.post('https://api.metadataapi.net/performer_sites', json=payload, headers=headers, verify=False)
            if response:
                if response.ok:
                    disp_result = 'Submitted OK'
                else:
                    disp_result = 'Submission Error: Code #' + str(response.status_code)
            else:
                disp_result = 'Submission Error: No Response Code'

            if self.crawler.settings['ENABLE_MONGODB']:
                url_hash = hashlib.sha1(str(item['url']).encode('utf-8')).hexdigest()
                if not response.ok:
                    self.db.errors.replace_one({'_id': url_hash}, {
                        'url': item['url'],
                        'error': 1,
                        'when': datetime.now().isoformat(),
                        'response': response.json()
                    }, upsert=True)
                else:
                    self.db.performers.replace_one({'_id': url_hash}, dict(item), upsert=True)
        else:
            disp_result = 'Local Run, Not Submitted'

        if spider.settings.getbool('display') and spider.settings.get('LOG_LEVEL') == 'INFO':
            name_length = 50 - len(item['name'])
            if name_length < 1:
                name_length = 1

            print(f"Performer: {item['name']}" + " " * name_length + f"{item['network']}\t{item['url']}\t{disp_result}")

        if spider.settings.getbool('export'):
            item2 = item.copy()
            if not spider.settings.get('showblob'):
                if "image_blob" in item2:
                    item2.pop('image_blob', None)
            self.exporter.export_item(item2)

        return item

    def close_spider(self, spider):
        if spider.settings.getbool('export'):
            self.fp.write(']}'.encode())
            self.fp.close()
Code Example #43
 def spider_opened(self, spider):
     file = open('%s.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonItemExporter(file)
     self.exporter.start_exporting()
Code Example #44
 def open_spider(self, spider):
     self.file = open('results.jl', 'wb')
     self.exp = JsonItemExporter(self.file, indent=4)
     self.exp.start_exporting()
Code Example #45
File: pipelines.py Project: thomasxutm/shipwiki
 def __init__(self):
     self.file = open('baike.json', 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding="utf-8",
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #46
 def spider_opened(self, spider):
     _log.info('JsonExportPipeline.spider_opened....')
     file = open('%s.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonItemExporter(file)
     self.exporter.start_exporting()
Code Example #47
File: pipelines.py Project: AnnaYe/NjuptSpider
 def spider_opened(self, spider):
     file = open('/home/gaoliang/Desktop/result.json', 'w+b')
     self.files[spider] = file
     self.exporter = JsonItemExporter(file, ensure_ascii=False)  # ensure_ascii=False keeps Chinese text readable in the saved JSON
     self.exporter.start_exporting()
Code Example #48
File: pipelines.py Project: weningTT/loom
    def spider_opened(self, spider):
        file = open('%s_products.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()
Code Example #49
File: pipelines.py Project: xzmeng/autohome
 def __init__(self):
     self.file = open('questions_exporter.json', 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)  # emit Chinese text as-is
     self.exporter.start_exporting()
Code Example #50
 def __init__(self):
     self.file = open('book2.json', 'wb')
     self.exporter = JsonItemExporter(file=self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #51
File: pipelines.py Project: GitForShadow/pythonjob
 def __init__(self):
     self.file = open('articleexport.json', 'ab')
     self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #52
File: pipelines.py Project: zhudms/Python_Crawler
 def __init__(self):
     self.fp = open('duanzi.json', 'wb')
     self.exporter = JsonItemExporter(self.fp,
                                      ensure_ascii=False,
                                      encoding='utf-8')
     self.exporter.start_exporting()  # start exporting
Code Example #53
File: itemExportor.py Project: Andy-wangke/Front_end
 def __init__(self, file, **kwargs):
     JsonItemExporter.__init__(self, file, ensure_ascii=False, **kwargs)
Code Example #54
 def __init__(self):
     self.file = open("data_export.json", 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #55
File: pipelines.py Project: onurg/channel-hound
 def __init__(self):
     self.file = open('channels.json', 'wb')
     self.exporter = JsonItemExporter(self.file)
     self.exporter.start_exporting()
Code Example #56
File: pipelines.py Project: nicogig/SoFIFA_Scraper
 def __init__(self):
     self.file = open("../data/json/players_urls.json", 'wb')
     self.exporter = JsonItemExporter(self.file,
                                      encoding='utf-8',
                                      ensure_ascii=False)
     self.exporter.start_exporting()
Code Example #57
File: pipelines.py Project: marcmilan/urban-fiesta
 def spider_opened(self, spider):
     file = open('%s_items.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonItemExporter(file)
     self.exporter.start_exporting()
Code Example #58
File: pipelines.py Project: asdfx2x3/crawlingProject
 def open_spider(self, spider):
     self.file = open('Guoke.json', 'wb')
     self.exporter = JsonItemExporter(self.file, ensure_ascii=False, encoding='utf-8')
     self.exporter.start_exporting()
Code Example #59
 def open_spider(self, spider):
     self.file = open(self.filename, 'wb')
     self.exporter = JsonItemExporter(self.file)
     self.exporter.start_exporting()
Code Example #60
 def create_exporter(self, filename):
     file = open(filename, "w+b")
     exporter = JsonItemExporter(file)
     exporter.start_exporting()
     self.files.append(file)
     return exporter
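
A possible way to use Example #60's helper, assuming self.files is initialized first; the filenames are hypothetical:

def open_spider(self, spider):
    self.files = []
    self.mayor_exporter = self.create_exporter('mayor_candidates.json')
    self.council_exporter = self.create_exporter('city_councils.json')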