Example #1
0
    def process_item(self, item, spider):
        """Persist a scraped item under the spider's ``DATA_STORE`` directory.

        Two item shapes are handled:

        * items carrying ``book``, ``text``, ``mp3`` and ``url``: the lines of
          ``item['text']`` are joined with newlines and written to a file
          inside ``<DATA_STORE>/<book name>/``, unless that file already
          exists;
        * items carrying ``name`` and ``urls``: the item is appended as one
          JSON line to ``CONTENT_FILE``, unless a line of that file already
          contains ``item['name']``.

        :param item: the scraped item (mapping-like).
        :param spider: the running spider; its settings supply ``DATA_STORE``
            and ``CONTENT_FILE``.
        :return: ``item`` unchanged, so later pipelines can keep processing it.
        """
        DATA_STORE = spider.settings.get('DATA_STORE')

        if item and all(key in item for key in ('book', 'text', 'mp3', 'url')):
            book_name = get_book_name(item)

            ensure_dir(os.path.join(DATA_STORE, book_name))

            filename = os.path.join(DATA_STORE, book_name,
                                    get_filename(item, 'txt'))
            # A chapter that is already on disk is never rewritten.
            if not os.path.exists(filename):
                chapter_file = open(filename, 'w')
                self.files[spider] = chapter_file
                try:
                    self.exporter = FileExporter(chapter_file)
                    self.exporter.start_exporting()
                    self.exporter.export_item("\n".join(item['text']))
                    self.exporter.finish_exporting()
                finally:
                    # Release the handle even when exporting fails.
                    self.files.pop(spider, None)
                    chapter_file.close()

        if item and 'name' in item and 'urls' in item:
            found_in_bible_file = False
            CONTENT_FILE = os.path.join(DATA_STORE,
                                        spider.settings.get('CONTENT_FILE'))
            if os.path.exists(CONTENT_FILE):
                with open(CONTENT_FILE, 'r') as bible:
                    # Substring match against each stored line; stops at the
                    # first hit.
                    found_in_bible_file = any(
                        item['name'] in books for books in bible)
            else:
                ensure_dir('%s' % DATA_STORE)

            if not found_in_bible_file:
                # Binary append mode: Scrapy item exporters write encoded
                # bytes, which a text-mode handle rejects on Python 3.
                bible_file = open(CONTENT_FILE, 'ab')
                self.files[spider] = bible_file
                try:
                    self.exporter = JsonLinesItemExporter(bible_file)
                    self.exporter.start_exporting()
                    self.exporter.export_item(item)
                    self.exporter.finish_exporting()
                finally:
                    # Release the handle even when exporting fails.
                    self.files.pop(spider, None)
                    bible_file.close()
        return item
Example #2
0
    def open_spider(self, spider):
        """Open the article store and prepare the spider's HTML directory.

        Opens ``articles.json`` in binary append/read mode, attaches a
        JSON-lines exporter to it, collects the ``url`` of every record
        already in the file into ``self.articles_seen`` and makes sure the
        ``articles/<first allowed domain>/html`` directory exists.
        """
        self.file = open('articles.json', 'a+b')

        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

        # Rewind before reading back the records that are already stored.
        self.file.seek(0)
        stored_lines = self.file.read().splitlines()
        self.articles_seen = {json.loads(raw)['url'] for raw in stored_lines}

        domain_dir = os.path.join("articles", spider.allowed_domains[0])
        self.dirname = domain_dir + "/html"
        if not os.path.exists(self.dirname):
            os.makedirs(self.dirname)
Example #3
0
 def exporter_for_format(feed_format, f):
     """Build the item exporter that matches ``feed_format``.

     :param feed_format: one of ``csv``, ``xml``, ``json``, ``jsonlines``,
         ``pickle`` or ``marshal``.
     :param f: the file object handed to the exporter's constructor.
     :return: a new exporter instance wrapping ``f``.
     :raises ValueError: if ``feed_format`` is none of the names above.
     """
     # Guard clauses: each supported name returns immediately.
     if feed_format == 'csv':
         return CsvItemExporter(f)
     if feed_format == 'xml':
         return XmlItemExporter(f)
     if feed_format == 'json':
         return JsonItemExporter(f)
     if feed_format == 'jsonlines':
         return JsonLinesItemExporter(f)
     if feed_format == 'pickle':
         return PickleItemExporter(f)
     if feed_format == 'marshal':
         return MarshalItemExporter(f)

     message = 'Export format {} is not supported'.format(feed_format)
     raise ValueError(message)
Example #4
0
 def __init__(self):
     """Open today's JSON-lines and CSV export files under ``PROJECT_PATH``.

     Both files are named ``Bssale<YYYY-MM-DD>`` with a ``.json`` / ``.csv``
     suffix and opened in binary append mode. ``self.kafka_producer``
     starts out as ``None``.
     """
     self.path = PROJECT_PATH

     # Take the date once so both file names always carry the same day,
     # even when construction straddles midnight.
     today = datetime.now().strftime('%Y-%m-%d')

     self.json = open(
         os.path.join(self.path, 'Bssale{}.json'.format(today)), 'ab')
     self.json_exporter = JsonLinesItemExporter(self.json,
                                                ensure_ascii=False,
                                                encoding='utf-8')
     self.csv = open(
         os.path.join(self.path, 'Bssale{}.csv'.format(today)), 'ab')
     self.csv_exporter = CsvItemExporter(self.csv, encoding='utf-8')
     self.kafka_producer = None
Example #5
0
    def __init__(self, user_data_dir):
        '''Open the files that receive the exported items.

        :param user_data_dir: directory for ``boards.json`` and
            ``pins.json``; it is created when it does not exist yet.
        '''
        self.user_data_dir = user_data_dir

        if not os.path.isdir(self.user_data_dir):
            os.makedirs(self.user_data_dir)

        # os.path.join supplies the separator, so a directory passed without
        # a trailing slash no longer yields "<dir>boards.json" next to it.
        # Save info of BoardItem.
        self.board_info = open(
            os.path.join(self.user_data_dir, 'boards.json'), 'w+b')
        self.board_exporter = JsonItemExporter(self.board_info,
                                               encoding='utf-8',
                                               indent=4)

        # Save info of PinItem.
        self.pin_info = open(
            os.path.join(self.user_data_dir, 'pins.json'), 'w+b')
        self.pin_exporter = JsonLinesItemExporter(self.pin_info,
                                                  encoding='utf-8',
                                                  indent=4)
Example #6
0
    def spider_opened(self, spider):
        """Create the pretty-print and JSON-lines outputs for ``spider``.

        Both files are named after ``spider.name`` and opened with mode
        ``w+b``. They are remembered in ``self.files[spider]`` (pretty-print
        file first) and both exporters are started before returning.
        """
        pretty_file = open('%s_pprint-items0' % spider.name, 'w+b')
        jsonlines_file = open('%s_json-items0' % spider.name, 'w+b')

        self.jsl_exporter = JsonLinesItemExporter(jsonlines_file)
        self.pprnt_exporter = PprintItemExporter(pretty_file)

        self.files[spider] = [pretty_file, jsonlines_file]

        # The indent is set on the exporter after construction.
        self.pprnt_exporter.indent = 2
        self.pprnt_exporter.start_exporting()
        self.jsl_exporter.start_exporting()
Example #7
0
    def process_item(self, item, spider):
        '''Save item info to a local file.

        For a ``VmgirlsItem``, ``vmgirls.json`` inside ``self.user_data_dir``
        is opened with mode ``w+b`` (so earlier content is replaced) and one
        record with ``theme_url`` and ``title`` is exported per pair taken
        from ``item['theme_urls']`` and ``item['theme_titles']``. Other item
        types pass through untouched.

        :return: ``item`` unchanged.
        '''
        if isinstance(item, VmgirlsItem):
            self.girls_info = open(
                os.path.join(self.user_data_dir, 'vmgirls.json'), 'w+b')
            try:
                self.girls_exporter = JsonLinesItemExporter(self.girls_info,
                                                            encoding='utf-8',
                                                            indent=4)

                self.girls_exporter.start_exporting()

                # zip stops at the shorter of the two lists.
                for url, title in zip(item['theme_urls'],
                                      item['theme_titles']):
                    single_item = {'theme_url': url, 'title': title}
                    self.girls_exporter.export_item(single_item)

                self.girls_exporter.finish_exporting()
            finally:
                # Close the handle even when exporting raises.
                self.girls_info.close()
        return item
Example #8
0
    def _exporter_for_item(self, item):
        """Return the exporter for the item's topic/location pair.

        Exporters are cached in ``self.unit_to_exporter`` under the key
        ``"<search_topic>-<search_location>"``. On first use a file named
        ``<topic>_<date>_<location lowercased>.json`` is opened for binary
        writing under ``./data/scraper/<date>/`` and a started JSON-lines
        exporter is stored for it.

        :param item: mapping with ``search_topic`` and ``search_location``.
        :return: the cached or newly created exporter.
        """
        search_topic = item["search_topic"]
        search_location = item["search_location"]
        unit = "-".join([search_topic, search_location])

        if unit not in self.unit_to_exporter:
            # Local import keeps this method self-contained.
            import os

            full_search_date = time.strftime("%Y-%m-%d")
            export_name = f'{search_topic}_{full_search_date}_{search_location.lower()}.json'
            export_dir = f"./data/scraper/{full_search_date}"

            # The dated directory may not exist yet on the first item of
            # the day; open() would otherwise fail.
            os.makedirs(export_dir, exist_ok=True)

            f = open(f"{export_dir}/{export_name}", 'wb')
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            self.unit_to_exporter[unit] = exporter

        return self.unit_to_exporter[unit]
Example #9
0
    def process_item(self, item, spider):
        """Append ``item`` as one JSON line to the file for its date.

        The target is ``crawled/naver_news_<item['date']>.json``; the
        ``crawled`` directory is created when missing.

        :return: ``item`` unchanged.
        """
        dirName = "crawled"
        # exist_ok avoids the check-then-create race of a separate
        # os.path.exists() test.
        os.makedirs(dirName, exist_ok=True)

        filename = "naver_news_{}.json".format(item["date"])
        filpath = os.path.join(dirName, filename)

        # "ab" creates the file when it is missing and appends otherwise, so
        # no separate existence check is needed to pick the mode.
        with open(filpath, "ab") as f:
            exporter = JsonLinesItemExporter(f, encoding="utf-8")
            exporter.export_item(item)
        return item
Example #10
0
    def process_item(self, item, spider):
        """Append a dictionary word to its per-language, per-letter file.

        Only items carrying ``letter``, ``strongs_number``, ``word_original``
        and ``word_translated`` are handled. The file name comes from the
        ``DICTIONARY_FILE`` setting, formatted with the language and
        ``item['letter']``, inside ``DATA_STORE``. The item is appended as a
        JSON line unless a stored record already has the same
        ``word_translated`` and ``strongs_number``.

        :return: ``item`` unchanged.
        """
        DATA_STORE = spider.settings.get('DATA_STORE')
        required = ('letter', 'strongs_number',
                    'word_original', 'word_translated')
        if item and all(key in item for key in required):
            found_in_words_file = False

            # The first character of ``strongs_number`` picks the language;
            # anything other than H or G falls back to 'all'.
            language = 'all'
            if item['strongs_number'][0] == 'H':
                language = 'hebrew'
            elif item['strongs_number'][0] == 'G':
                language = 'greek'

            WORDS_FILE = os.path.join(
                DATA_STORE,
                spider.settings.get('DICTIONARY_FILE') %
                (language, item['letter']))
            if os.path.exists(WORDS_FILE):
                with open(WORDS_FILE, 'r') as words:
                    for word in words:
                        data = json.loads(word)
                        if item['word_translated'] == data['word_translated'] and \
                                item['strongs_number'] == data['strongs_number']:
                            found_in_words_file = True
                            break
            else:
                ensure_dir('%s' % os.path.dirname(WORDS_FILE))

            if not found_in_words_file:
                # Binary append mode: Scrapy item exporters write encoded
                # bytes, which a text-mode handle rejects on Python 3.
                words_file = open(WORDS_FILE, 'ab')
                self.files[spider] = words_file
                try:
                    self.exporter = JsonLinesItemExporter(words_file)
                    self.exporter.start_exporting()
                    self.exporter.export_item(item)
                    self.exporter.finish_exporting()
                finally:
                    # Release the handle even when exporting fails.
                    self.files.pop(spider, None)
                    words_file.close()
        return item
Example #11
0
    def _make_fileobj(self):
        """
        Serialize ``self.items`` as JSON lines into an in-memory buffer.

        When ``self.use_gzip`` is true the exporter writes through a gzip
        wrapper around the buffer. The returned ``BytesIO`` is rewound to
        offset 0 so it can be read right away.
        """
        buffer = BytesIO()
        if self.use_gzip:
            sink = gzip.GzipFile(mode='wb', fileobj=buffer)
        else:
            sink = buffer

        writer = JsonLinesItemExporter(sink, encoding='utf-8')
        writer.start_exporting()
        for entry in self.items:
            writer.export_item(entry)
        writer.finish_exporting()

        # Only the gzip wrapper is closed; the underlying buffer stays open
        # because it is what the caller receives.
        if sink is not buffer:
            sink.close()

        buffer.seek(0)
        return buffer
Example #12
0
    def __init__(self, crawler):
        """Set up optional MongoDB access and optional file export.

        * With ``ENABLE_MONGODB`` set, ``self.db`` becomes the ``scrapy``
          database of the client built from ``MONGODB_URL``.
        * The output file is the ``file`` setting (placed under the export
          path when it contains no path separator) or, without that setting,
          ``<spider name>_<timestamp>-performers.json`` under the export
          path. The export path is the ``path`` setting, falling back to
          ``DEFAULT_EXPORT_PATH``.
        * With the ``export`` flag set, the file is opened for binary
          writing, a ``{"scenes":[`` prefix is written, and a JSON-lines
          (``oneline``) or indented JSON exporter is attached.
        """
        settings = crawler.settings

        if settings['ENABLE_MONGODB']:
            client = MongoClient(settings['MONGODB_URL'])
            self.db = client['scrapy']

        self.crawler = crawler

        path = settings.get('path') or settings.get('DEFAULT_EXPORT_PATH')

        filename = settings.get('file')
        if not filename:
            default_name = '%s_%s-performers.json' % (
                crawler.spidercls.name, time.strftime('%Y%m%d-%H%M'))
            filename = Path(path, default_name)
        elif not ('\\' in filename or '/' in filename):
            # A bare file name is placed under the export path.
            filename = Path(path, filename)

        if settings.getbool('export'):
            print(f"*** Exporting to file: {filename}")
            self.fp = open(filename, 'wb')
            self.fp.write('{"scenes":['.encode())

            shared_options = {'ensure_ascii': False, 'encoding': 'utf-8'}
            if settings.getbool('oneline'):
                self.exporter = JsonLinesItemExporter(self.fp,
                                                      **shared_options)
            else:
                self.exporter = JsonItemExporter(self.fp,
                                                 sort_keys=True,
                                                 indent=2,
                                                 **shared_options)
Example #13
0
 def __init__(self):
     """Open ``comments.json`` and attach a JSON-lines exporter to it."""
     # "wb" opens the file in binary mode.
     output = open("comments.json", 'wb')
     self.fp = output
     self.exporter = JsonLinesItemExporter(output,
                                           encoding='utf-8',
                                           ensure_ascii=False)
Example #14
0
 def __init__(self):
     """Open ``zufang1.json`` for binary writing and start the exporter."""
     exporter_options = {'ensure_ascii': False, 'encoding': 'utf-8'}
     self.fp = open("zufang1.json", 'wb')
     self.exporter = JsonLinesItemExporter(self.fp, **exporter_options)
     self.exporter.start_exporting()
Example #15
0
 def open_spider(self, spider):
     """Announce the crawl start and build the exporter on ``self.fp``.

     ``self.fp`` is not created here; it has to be set before this hook
     runs.
     """
     print('爬虫开始啦')
     exporter_options = {'ensure_ascii': False, 'encoding': "utf-8"}
     self.export = JsonLinesItemExporter(self.fp, **exporter_options)
Example #16
0
 def __init__(self):
     """Open ``data/jobbole.json`` and attach a JSON-lines exporter.

     The path is built with ``os.path.join`` so the platform's separator is
     used instead of a hard-coded backslash, and the ``data`` directory is
     created when missing.
     """
     # Local import keeps this method self-contained.
     import os

     os.makedirs("data", exist_ok=True)
     self.fp = open(os.path.join("data", "jobbole.json"), "wb")
     self.exporter = JsonLinesItemExporter(self.fp,
                                           ensure_ascii=False,
                                           encoding="utf-8")
Example #17
0
 def __init__(self):
     """Open ``duanzi.json`` and attach a JSON-lines exporter to it."""
     # The exporter opens and writes in binary form, hence the "b" in the
     # file mode.
     output = open("duanzi.json", 'wb')
     self.fp = output
     self.exporter = JsonLinesItemExporter(output,
                                           encoding='utf-8',
                                           ensure_ascii=False)
Example #18
0
 def open_spider(self, spider):
     """Open ``gov.json`` for binary writing and start a JSON-lines export."""
     output = open("gov.json", "wb")
     self.file = output
     self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)
     self.exporter.start_exporting()
Example #19
0
 def __init__(self):
     """Open ``wxjc_dev.json`` and attach a JSON-lines exporter to it."""
     exporter_options = {'ensure_ascii': False, 'encoding': 'utf-8'}
     self.fp = open('wxjc_dev.json', 'wb')
     self.exporter = JsonLinesItemExporter(self.fp, **exporter_options)
Example #20
0
 def __init__(self):
     """Open ``doutula.json`` and attach a JSON-lines exporter to it."""
     output = open("doutula.json", 'wb')
     self.fp = output
     self.exporter = JsonLinesItemExporter(output,
                                           encoding='utf-8',
                                           ensure_ascii=False)
Example #21
0
 def __init__(self):
     """Open ``tallxiu.json`` and attach a JSON-lines exporter to it."""
     output = open("tallxiu.json", "wb")
     self.fp = output
     self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)
Example #22
0
 def export_item(self, item):
     """Export ``item``, creating the JSON-lines exporter on first use.

     The exporter is built lazily on ``self.file`` with UTF-8 encoding the
     first time an item arrives, then reused for later calls.
     """
     # Identity check: ``== None`` would defer to the exporter's own
     # ``__eq__`` and could replace an exporter that already exists.
     if self.exporter is None:
         self.exporter = JsonLinesItemExporter(file=self.file,
                                               encoding='utf-8')
     self.exporter.export_item(item)
Example #23
0
 def open_spider(self, spider):
     """Open ``output.jsonl`` and start a JSON-lines export into it."""
     # Binary mode: Scrapy item exporters write encoded bytes, which a
     # text-mode handle ('w') rejects on Python 3.
     self.file = open("output.jsonl", 'wb')
     self.exporter = JsonLinesItemExporter(self.file,
                                           encoding='utf8',
                                           ensure_ascii=False)
     self.exporter.start_exporting()
 def open_spider(self, spider):
     """Open a timestamped ``.jl`` file under ``data/`` with an exporter.

     The file name embeds the current local time formatted as
     ``YYYY-MM-DD_HH-MM-SS``.
     """
     started_at = datetime.now().strftime(r'%Y-%m-%d_%H-%M-%S')
     target = 'data/timeline_data_' + started_at + '.jl'
     output = open(target, 'wb')
     self.file = output
     self.exporter = JsonLinesItemExporter(output, encoding='utf-8')
Example #25
0
 def __init__(self):
     """Open ``jobs.json`` and attach a JSON-lines exporter to it."""
     output = open('jobs.json', 'wb')
     self.fp = output
     self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)
Example #26
0
 def spider_opened(self, spider):
     """Open the numbered info file for ``spider`` and start exporting.

     The file is ``info_<self.file_count>.json``, opened in binary
     append/read mode and remembered in ``self.files[spider]``.
     """
     handle = open('info_{}.json'.format(self.file_count), 'a+b')
     self.files[spider] = handle
     self.exporter = JsonLinesItemExporter(handle)
     self.exporter.start_exporting()
Example #27
0
 def __init__(self):
     """Open ``dates.jsonl`` for binary writing and start the exporter."""
     exporter_options = {'encoding': 'utf-8', 'ensure_ascii': False}
     self.file = open("dates.jsonl", 'wb')
     self.exporter = JsonLinesItemExporter(self.file, **exporter_options)
     self.exporter.start_exporting()
Example #28
0
 def __init__(self):
     """Open ``moocsql.json`` and attach a JSON-lines exporter to it."""
     # Items are stored in a local file.
     output = open("moocsql.json", 'wb')
     self.fp = output
     self.exporter = JsonLinesItemExporter(output,
                                           encoding='utf-8',
                                           ensure_ascii=False)
Example #29
0
 def __init__(self):
     """Open ``duanzhi.json`` and attach a JSON-lines exporter to it."""
     exporter_options = {'ensure_ascii': False, 'encoding': 'utf-8'}
     self.fp = open("duanzhi.json", "wb")
     # NOTE(review): the attribute is spelled ``expoter``; it is kept as-is
     # because other methods presumably look it up under this name.
     self.expoter = JsonLinesItemExporter(self.fp, **exporter_options)
Example #30
0
 def __init__(self):
     """Open a JSON file named after the current local time.

     The name is the local time formatted as ``YYYY-MM-DD HH-MM-SS`` plus
     ``.json``; a JSON-lines exporter is attached to the file.
     """
     stamp = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
     output = open(stamp + ".json", 'wb')
     self.fp = output
     self.exporter = JsonLinesItemExporter(output, ensure_ascii=False)