Example #1
0
class CardPipeline(object):
    """Pipeline that cleans Weibo card items and writes them as JSON lines.

    Items missing a nickname, location or images are dropped.
    """

    def __init__(self):
        # Maps spider -> open output file handle.
        self.files = {}

    def process_item(self, item, spider):
        # Drop incomplete items before doing any normalisation work.
        if not item['wb_nick']\
                or not item['wb_location']\
                or not item['wb_images']:
            raise DropItem
        # BUG FIX: was a Python-2 print statement; print() works on 2 and 3.
        print(item['wb_nick'][0])
        item['wb_content'] = ''.join(item['wb_content'])
        item['wb_date'] = item['wb_date'][0]
        item['wb_location'] = item['wb_location'][0]
        # Rewrite thumbnail/square image URLs to their full-size variants.
        images_urls = item.pop('wb_images')
        item['wb_images'] = []
        for image_url in images_urls:
            image_url = image_url.replace('thumbnail', 'large')
            image_url = image_url.replace('square', 'large')
            item['wb_images'].append(image_url)
        self.exporter.export_item(item)
        return item

    def open_spider(self, spider):
        file = open('json/{}_products.json'.format(spider.name), 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
Example #2
0
 def spider_opened(self, spider):
     # Open one JSON-lines output file per configured writer.
     for i in self.JSONWriters.values():
         file = open('%s_out.json' % i, 'w+b')
         # NOTE(review): keyed by the spider, so every loop iteration
         # overwrites the previous handle -- only the last file can be
         # closed later and the others leak.  Probably should be keyed
         # by i; confirm against the matching close handler.
         self.files[spider] = file
         exporter = JsonLinesItemExporter(file)
         self.exporters[i] = exporter
         exporter.start_exporting()
     print(self.exporters)
 def spider_opened(self, spider):
     # Output file is named after today's date, e.g. "12-20-2015.json".
     datestr = date.today().strftime("%m-%d-%Y")
     outfile = open('scraped_data/%s.json' % datestr, 'w+b')
     self.files[spider] = outfile
     self.exporter = JsonLinesItemExporter(outfile, ensure_ascii=False)
     self.exporter.start_exporting()
Example #4
0
    def __init__(self):
        """Configure start URLs and a JSON-lines exporter for scraped pages."""
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'

        # BUG FIX: range() works on Python 2 and 3 (xrange is Py2-only)
        # and produces the same start_urls list here.
        self.start_urls = [baseurl + str(i) for i in range(1, 30)]
        self.domain = 'domaintyper.com'
        # NOTE(review): this file handle is never closed explicitly.
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))
Example #5
0
def load_table(table, source="default/test-0.jsonlines",
               modifier="", dblogin="******",
               as_name=None):
    """Insert the rows of *table* into MongoDB, logging duplicate failures.

    The db/collection names are parsed from *source* (or *as_name* when
    given).  On a bulk-write error the offending ops are appended, one
    JSON object per line, to "<filename>.errors", deduplicated by _id.
    Returns the insert_many result, or the BulkWriteError details dict.
    """
    filename = source if as_name is None else as_name
    dbname, collectioname = parse_path(filename, modifier)
    connection = pymongo.MongoClient(dblogin)
    db = getattr(connection, dbname)
    collection = getattr(db, collectioname)
    try:
        result = collection.insert_many(
            (set_id(obj) for obj in odicts(table)), ordered=False)
    except BulkWriteError as e:
        result = e.details
        errs = set()
        with open("%s.%s" % (filename, "errors"), "a") as f:
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            # Guard against a missing "writeErrors" key (was unguarded
            # and would raise TypeError iterating None).
            for err in result.get("writeErrors") or ():
                if not err.get("op").get("_id") in errs:
                    obj = dict(item=err.get("op"),
                               error=err.get("errmsg"))
                    errs.add(err.get("op").get("_id"))
                    exporter.export_item(obj)
            exporter.finish_exporting()
            # (removed redundant f.close(); the with-block closes the file)
    return result
Example #6
0
 def export_item(self, item):
     """Write *item* to its own JSON-lines file at its storage path."""
     # The with-block guarantees the file is closed even if exporting
     # raises (the original leaked the handle on error).
     with open(self.item_storage_path(item["id"]), "w") as storage_file:
         item_exporter = JsonLinesItemExporter(storage_file)
         item_exporter.start_exporting()
         item_exporter.export_item(item)
         item_exporter.finish_exporting()
Example #7
0
class JsonExport(object):
    """Pipeline exporting every item as one JSON line per spider run."""

    def open_spider(self, spider):
        # Prefer a per-spider path from settings; fall back to the defaults.
        if spider and spider.settings.get('EXPORTER_PATH'):
            path = spider.settings.get('EXPORTER_PATH')
            file_name = spider.name + '.json'
        else:
            path = EXPORTER_PATH
            file_name = 'questions.json'

        self._file = open(path + file_name, 'w+b')
        self._exporter = JsonLinesItemExporter(self._file)
        # Was missing: let the exporter write any required preamble.
        self._exporter.start_exporting()

    def process_item(self, item, spider):
        self._exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Was missing: flush the exporter before closing the file.
        self._exporter.finish_exporting()
        self._file.close()
Example #8
0
class DuanziPipleline:
    """Pipeline writing joke items to duanzi.json as UTF-8 JSON lines."""

    def __init__(self):
        # Exporters write bytes, so the file must be opened in binary mode.
        self.fp = open("duanzi.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')
        self.exporter.start_exporting()

    def open_spider(self, spider):
        print("start....")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Was missing: pair start_exporting() with finish_exporting()
        # before the underlying file is closed.
        self.exporter.finish_exporting()
        self.fp.close()
        print("over....")
Example #9
0
    def process_item(self, item, spider):
        """Append *item* as one JSON line to the file named by item['filename']."""
        filename = item['filename']
        del item['filename']

        # "ab": if the file exists the new line is appended.  The with-block
        # closes the handle (the original leaked one descriptor per item).
        with open(filename, "ab") as fp:
            JsonLinesItemExporter(fp).export_item(item)

        return item
Example #10
0
class Scrapy01Pipeline(object):
    """Write every scraped item to qsbk.json as UTF-8 JSON lines."""

    def __init__(self):
        print("__init__")
        # The exporter writes bytes, so open the file in binary mode.
        self.ft = open("./Scrapy01/files/qsbk.json", "wb")
        self.exporter = JsonLinesItemExporter(
            self.ft, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("open_spider")

    def process_item(self, item, spider):
        """Serialize one item and pass it on unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.ft.close()
        print("close_spider")
Example #11
0
class WechatAppPipeline(object):
    """Export WeChat items to wechat.json as UTF-8 JSON lines."""

    def __init__(self):
        self.json = open('wechat.json', 'wb')
        # JSON-lines writer over the binary file handle.
        self.json_export = JsonLinesItemExporter(self.json,
                                                 ensure_ascii=False,
                                                 encoding='utf-8')

    def run(self):
        print("爬虫开始")

    def process_item(self, item, spider):
        self.json_export.export_item(item)
        return item

    def close_spider(self, spider=None):
        # BUG FIX: Scrapy invokes close_spider(spider); the original
        # signature took no spider argument and raised TypeError.  The
        # default keeps any existing zero-argument callers working.
        self.json.close()
        print("爬虫结束")
Example #12
0
class NewHousePipeline(object):
    """Route house items into newhouse.json or esf.json by their style tag."""

    def __init__(self):
        self.newhouse_fp = open("newhouse.json", 'wb')
        self.esf_fp = open("esf.json", 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False)
        self.esf_exporter = JsonLinesItemExporter(
            self.esf_fp, ensure_ascii=False)

    def process_item(self, item, spider):
        """Dispatch the item to the exporter matching its house_style."""
        style = item.get("house_style")
        if style == "newhouse":
            self.newhouse_exporter.export_item(item)
        elif style == "esf":
            self.esf_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esf_fp.close()
Example #13
0
    def process_item(self, item, spider):
        '''Save theme URL/title pairs from a VmgirlsItem to a local JSON file.'''
        if isinstance(item, VmgirlsItem):
            # Re-opened (and truncated, 'w+b') on every matching item, so the
            # file only ever holds the output of the last item processed.
            self.girls_info = open(
                os.path.join(self.user_data_dir, 'vmgirls.json'), 'w+b')
            # NOTE(review): indent=4 on a JSON-*lines* exporter produces
            # multi-line records -- confirm the downstream reader copes.
            self.girls_exporter = JsonLinesItemExporter(self.girls_info,
                                                        encoding='utf-8',
                                                        indent=4)

            self.girls_exporter.start_exporting()

            # One output record per (url, title) pair.
            for url, title in zip(item['theme_urls'], item['theme_titles']):
                single_item = {'theme_url': url, 'title': title}
                self.girls_exporter.export_item(single_item)

            self.girls_exporter.finish_exporting()
            self.girls_info.close()
        return item
Example #14
0
class BaiducrawlerPipeline:
    """Persist crawled Baidu items to data.json, one JSON object per line."""

    def __init__(self):
        # Binary mode: the exporter encodes to UTF-8 bytes itself.
        self.fp = open("data.json", "wb")
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print("爬虫开始了")

    def process_item(self, item, spider):
        """Export the item, log, and hand it to the next pipeline stage."""
        self.exporter.export_item(item)
        print("存了")
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了")
Example #15
0
class HexunPipeline(object):
    """Write article-URL items to a timestamped JSON-lines file."""

    def __init__(self):
        # File name carries the start time, e.g. "文章网址2020-01-01 10-00-00.json".
        ssstime = time.strftime("%Y-%m-%d %H-%M-%S", time.localtime())
        self.fp = open("文章网址" + ssstime + ".json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)

    def open_spider(self, spider):
        print("=====爬虫开始力=====")

    def process_item(self, item, spider):
        """Serialize one item and forward it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Flush the exporter before releasing the file handle.
        self.exporter.finish_exporting()
        self.fp.close()
        print("=====爬虫结束力=====")
Example #16
0
class WxappPipeline:
    """Export WeChat-app articles to article.json as UTF-8 JSON lines."""

    # Called when the spider is opened.
    def open_spider(self, spider):
        print("爬虫开始执行。。。")
        fileName = "article.json"
        # The exporter writes bytes, so the file must be opened in binary mode.
        self.fp = open(fileName, "wb")
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding="utf-8")

    # Called for every item the spider yields.
    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    # Called when the spider is closed.
    def close_spider(self, spider):
        # BUG FIX: the original never closed self.fp, so buffered output
        # could be lost when the process exited.
        self.fp.close()
        print("爬虫执行结束")
Example #17
0
    def __init__(self, user_data_dir):
        '''Open files to save the exported board and pin items.'''
        self.user_data_dir = user_data_dir

        if not os.path.isdir(self.user_data_dir):
            os.makedirs(self.user_data_dir)

        # save info of BoardItem
        # BUG FIX: os.path.join (instead of string "+") inserts the path
        # separator when user_data_dir lacks a trailing slash; this also
        # matches the sibling pipeline that already uses os.path.join.
        self.board_info = open(os.path.join(self.user_data_dir, 'boards.json'),
                               'w+b')
        self.board_exporter = JsonItemExporter(self.board_info,
                                               encoding='utf-8',
                                               indent=4)

        # save info of PinItem
        self.pin_info = open(os.path.join(self.user_data_dir, 'pins.json'),
                             'w+b')
        self.pin_exporter = JsonLinesItemExporter(self.pin_info,
                                                  encoding='utf-8',
                                                  indent=4)
Example #18
0
class JrsjPipeline:
    """Export news items to xinwenli.json as UTF-8 JSON lines."""

    def __init__(self):
        # wb: the exporter writes encoded bytes, so binary mode is required.
        self.fp = open("xinwenli.json", 'wb')
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('爬虫开始了...')

    def process_item(self, item, spider):
        """Serialize one item and forward it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('爬虫结束了')
Example #19
0
 def spider_opened(self, spider):
     """Open a fresh JSON-lines exporter for this spider's output file."""
     # exist_ok avoids the check-then-create race of the original
     # isdir + mkdir pair and also creates missing parents.
     os.makedirs(self.outputs, exist_ok=True)
     path = os.path.join(self.outputs, f'{spider.name}.json')
     # 'wb' truncates any previous file, replacing the unlink + 'a+b' dance.
     # NOTE(review): the handle is not stored anywhere, so it can never be
     # closed explicitly -- confirm against the close handler.
     file = open(path, 'wb')
     self.exporters[spider.name] = JsonLinesItemExporter(file)
     self.exporters[spider.name].start_exporting()
Example #20
0
class SohousePipeline(object):
    """Route ESF / new-house items into their respective JSON-lines files."""

    def __init__(self):
        # BUG FIX: the new-house file was named "newhouse.jsno" (typo).
        self.file1 = open('D:/newhouse.json', 'wb')
        self.file2 = open("D:/esfhouse.json", 'wb')
        self.expoter1 = JsonLinesItemExporter(self.file1, ensure_ascii=False)
        self.expoter2 = JsonLinesItemExporter(self.file2, ensure_ascii=False)

    def process_item(self, item, spider):
        # Dispatch on the concrete item class (the classes are distinct,
        # so elif avoids the second redundant isinstance check).
        if isinstance(item, ESFitem):
            self.expoter2.export_item(item)
        elif isinstance(item, NewHouseItem):
            self.expoter1.export_item(item)
        return item

    def close_spider(self, spider):
        self.file1.close()
        self.file2.close()
Example #21
0
class AmazoncleaningPipeline(object):
    """Write cleaned Amazon items to results.jl as UTF-8 JSON lines."""

    def __init__(self):
        self.fp = open("results.jl", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        # (removed a no-op self.fp.write(b'') -- it wrote zero bytes)
        return item

    def close_spider(self, spider):
        self.fp.close()
Example #22
0
    def spider_opened(self, spider):
        """Open the pprint and JSON-lines outputs for this spider."""
        pprint_file = open('%s_pprint-items0' % spider.name, 'w+b')
        jsonl_file = open('%s_json-items0' % spider.name, 'w+b')

        self.jsl_exporter = JsonLinesItemExporter(jsonl_file)
        self.pprnt_exporter = PprintItemExporter(pprint_file)
        self.pprnt_exporter.indent = 2

        # Keep both handles so the close handler can close them later.
        self.files[spider] = [pprint_file, jsonl_file]

        self.pprnt_exporter.start_exporting()
        self.jsl_exporter.start_exporting()
class LianjiaPipeline(object):
    """Split Lianjia items between esfhouse.json and zfhouse.json."""

    def __init__(self):
        self.esfhouse_fp = open("esfhouse.json", "wb")
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)
        self.zfhouse_fp = open("zfhouse.json", "wb")
        self.zfhouse_exporter = JsonLinesItemExporter(self.zfhouse_fp,
                                                      ensure_ascii=False)

    def process_item(self, item, spider):
        """Second-hand (EsfItem) listings go to one file, rentals to the other."""
        exporter = (self.esfhouse_exporter if isinstance(item, EsfItem)
                    else self.zfhouse_exporter)
        exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.esfhouse_fp.close()
        self.zfhouse_fp.close()
Example #24
0
class CapterraPipeline(object):
    """Write CapterraItem instances to capterra.json as UTF-8 JSON lines."""

    def __init__(self):
        self.fp = open("capterra.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print("CapterraItem爬虫开始了!")

    def process_item(self, item, spider):
        # Only CapterraItem instances are exported; everything passes through.
        if isinstance(item, CapterraItem):
            self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("CapterraItem爬虫结束了!")
Example #25
0
class QsbkPipeline(object):
    """Persist joke items to duanzi.json, one UTF-8 JSON object per line."""

    def __init__(self):
        self.fp = open('duanzi.json', 'wb')
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('爬虫开始')

    def close_spider(self, spider):
        self.fp.close()
        print('爬虫结束')

    def process_item(self, item, spider):
        """Serialize one item and pass it along unchanged."""
        self.exporter.export_item(item)
        return item
 def process_item(self, item, spider):
     # One output file per listing, named "<listing id>.jl".
     filename = str(item['listing'][0]['id']) + '.jl'
     with open(filename, 'wb') as file:
         exporter = JsonLinesItemExporter(
             file, fields_to_export=['listing', 'trovokasa'])
         exporter.start_exporting()
         exporter.export_item(item)
         exporter.finish_exporting()
         # Move the finished file into the feed directory.
         # NOTE(review): the rename runs while the file is still open,
         # which fails on Windows -- consider moving it after the with-block.
         pathlib.Path(__file__).parents[1].joinpath(filename).rename(
             self.feed_path.joinpath(filename))
     return item
Example #27
0
class FangTianXiaScrapyPipeline(object):
    """Append new-house and second-hand items to their own JSON-lines files."""

    def __init__(self):
        # 'ab' appends across runs instead of truncating.
        self.newhouse_fp = open('newhouse.json', 'ab')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_fp = open('esfhouse.json', 'ab')
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        """Route the item to an exporter by its concrete class."""
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, EsfHouseItem):
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
Example #28
0
    def _exporter_for_item(self, item):
        """Return (creating on first use) the exporter for this item's topic/location."""
        search_topic = item["search_topic"]
        search_location = item["search_location"]
        # (removed the dead `unit = ""` initializer and the unused `teacher` local)
        unit = "-".join([search_topic, search_location])

        full_search_date = time.strftime("%Y-%m-%d")

        export_name = f'{search_topic}_{full_search_date}_{search_location.lower()}.json'

        if unit not in self.unit_to_exporter:
            # NOTE(review): the file handle is never stored, so it cannot be
            # closed explicitly; the dated directory must already exist --
            # confirm both against the surrounding pipeline.
            f = open(f"./data/scraper/{full_search_date}/{export_name}", 'wb')
            exporter = JsonLinesItemExporter(f)
            exporter.start_exporting()
            self.unit_to_exporter[unit] = exporter

        return self.unit_to_exporter[unit]
Example #29
0
class FangtianxiaPipeline(object):
    """Split Fangtianxia items between fang1.json and fang2.json."""

    def __init__(self):
        # Keep the file handles on self so close_spider can close them
        # (originally they were locals and were never closed).
        self.fp1 = open('fang1.json', 'wb')
        self.fp2 = open('fang2.json', 'wb')
        self.exporter1 = JsonLinesItemExporter(self.fp1,
                                               ensure_ascii=False,
                                               encoding='utf-8')
        self.exporter2 = JsonLinesItemExporter(self.fp2,
                                               ensure_ascii=False,
                                               encoding='utf-8')

    def process_item(self, item, spider):
        # isinstance decides which of the two files the item belongs to.
        if isinstance(item, FangtianxiaFirstItem):
            self.exporter1.export_item(item)
        if isinstance(item, FangtianxiaSecondItem):
            self.exporter2.export_item(item)
        return item

    def close_spider(self, spider):
        """Close both output files when the spider finishes."""
        self.fp1.close()
        self.fp2.close()
Example #30
0
class FangPipeline:
    """Write new-house and second-hand items to separate JSON-lines files."""

    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.ershouhouse_fp = open('ershouhouse.json', 'wb')
        self.ershouhouse_exporter = JsonLinesItemExporter(self.ershouhouse_fp,
                                                          ensure_ascii=False)

    def process_item(self, item, spider):
        """Pick the exporter that matches the item's class."""
        if isinstance(item, NewhouseItem):
            self.newhouse_exporter.export_item(item)
        elif isinstance(item, ErshouItem):
            self.ershouhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.ershouhouse_fp.close()
Example #31
0
class QsbkPipeline(object):
    """Serialize joke items to duanzi.json as UTF-8 JSON lines."""

    def __init__(self):
        # Open the output file once, in binary mode, for the whole run.
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding="utf-8")

    def open_spider(self, spider):
        print("爬虫开始了....")

    def process_item(self, item, spider):
        # Write the yielded item as one JSON line (non-ASCII kept as-is).
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了.....")
Example #32
0
class QsbkScrapyPipeline(object):
    """Save yielded items into duanzi_2.json, one JSON object per line."""

    def __init__(self):
        self.fp = open('duanzi_2.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False,
                                              encoding='utf-8')

    def open_spider(self, spider):
        print("爬虫开始了。。。")

    def process_item(self, item, spider):
        """Write one yielded item to the JSON file and forward it."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("爬虫结束了。。。")
Example #33
0
class QsbkPipeline:
    """Write joke items to duanzi.json as UTF-8 JSON lines."""

    def __init__(self):
        # wb: binary mode, because the exporter writes encoded bytes.
        self.fp=open("duanzi.json","wb")
        self.exporter=JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding="utf-8")


    def open_spider(self,spider):
        # Called once when the spider starts.
        print("这是爬虫开始了")


    def process_item(self, item, spider):
        # Serialize one item and pass it to the next pipeline stage.
        self.exporter.export_item(item)
        return item


    def close_spider(self,spider):
        # Called once when the spider finishes; release the file handle.
        self.fp.close()
        print("爬虫结束了")
Example #34
0
class BsbdjPipeline(object):
    """Export budejie items to budejie.json as UTF-8 JSON lines."""

    def __init__(self):
        self.fp = open('budejie.json', 'wb')
        self.exporter = JsonLinesItemExporter(
            self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('爬虫开始了...')

    def process_item(self, item, spider):
        """Write one item as a JSON line and forward it unchanged."""
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('爬虫结束了...')
Example #35
0
class BjdyPipeline(object):
    """Export items wrapped in a JSON array: "[" item "," item ... "]".

    BUG FIXES: the original wrote a comma *after* every item, leaving a
    trailing comma that made the file invalid JSON, and it never closed
    the file.  A first-item flag now writes commas *between* items, and
    close_spider closes the handle.
    """

    def __init__(self):
        self.fp = open("bjdy.json", 'wb')
        self.exporter = JsonLinesItemExporter(self.fp,
                                              ensure_ascii=False,
                                              encoding='utf-8')
        self.fp.write(b"[")
        # True until the first item has been written.
        self._first_item = True

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        # Separator goes before every item except the first.
        if self._first_item:
            self._first_item = False
        else:
            self.fp.write(b',')
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.write(b"]")
        self.fp.close()
class SoufangPipeline(object):
    """Split items into newhouse.json / esfhouse.json by their genre field."""

    def __init__(self):
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,
                                                       ensure_ascii=False)
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.esfhouse_exporter = JsonLinesItemExporter(self.esfhouse_fp,
                                                       ensure_ascii=False)

    def process_item(self, item, spider):
        """New-build listings ("新房") go to one file, the rest to the other."""
        is_new = item['genre'] == '新房'
        exporter = self.newhouse_exporter if is_new else self.esfhouse_exporter
        exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
Example #37
0
class QiushiPipeline(object):
    """Per-spider JSON-lines export, wired up via crawler signals."""

    def __init__(self):
        # Maps spider -> open output file handle.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and hook the spider open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # The file created on Dec 20 2015 will be named "12-20-2015.json".
        datestr = date.today().strftime("%m-%d-%Y")
        file = open('scraped_data/%s.json' % datestr, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # (removed commented-out Python-2 debug prints of item fields)
        self.exporter.export_item(item)
        return item
Example #38
0
class KinoPipeline(object):
    """Per-spider JSON-lines export to output/<name>.jsonl."""

    def __init__(self):
        # Maps spider -> open output file handle.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # BUG FIX: open() raises ValueError when an encoding argument is
        # combined with a binary mode ('w+b').  The exporter does its own
        # encoding, so options like encoding / ensure_ascii belong on
        # JsonLinesItemExporter, not on open().
        file = open('output/' + spider.name + '.jsonl', 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, encoding='utf-8')
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #39
0
class JsonExportPipeline(object):
    """Export all items of a spider to <name>_all.json as JSON lines."""

    def __init__(self):
        # One output file per spider, keyed by the spider instance.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Connect the open/close handlers through the crawler's signal bus."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        out = open('%s_all.json' % spider.name, 'w+b')
        self.files[spider] = out
        self.exporter = JsonLinesItemExporter(out, encoding='utf-8',
                                              ensure_ascii=False)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
class JsonLinesExportPipeline(object):
    """
    app.pipelines.exporter_json_lines.JsonLinesExportPipeline
    """

    def __init__(self):
        self.files = {}
        self.exporter = None

    @classmethod
    def from_crawler(cls, crawler):
        """Instantiate and register the spider open/close callbacks."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # One "<name>_item_lines.json" file per spider run.
        out_file = open('%s_item_lines.json' % spider.name, 'w+b')
        self.files[spider] = out_file
        self.exporter = JsonLinesItemExporter(out_file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Example #41
0
class MedPipeline(object):
    """Filter medical posts (no replies / duplicates) and export the rest."""

    def __init__(self):
        # post_ids already exported, used for de-duplication.
        self.ids_seen = set()

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('medData.json', 'wb')
        # (renamed from the original's misspelled "expoter")
        self.exporter = JsonLinesItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Drop reply-less or duplicate posts; export everything else."""
        # BUG FIX: the original mixed tabs and spaces in this branch,
        # which is a TabError on Python 3.
        if int(item['reply_num'][0]) == 0:
            raise DropItem("no reply in %s" % item)
        elif item['post_id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        else:
            self.ids_seen.add(item['post_id'])
            self.exporter.export_item(item)
            return item
Example #42
0
class EduSpider(scrapy.Spider):
    """ Used to scrape .edu websites for web technology statistics """
    name = 'edu'

    def __init__(self):
        # NOTE(review): xrange below marks this as Python-2-only code.
        scrapy.Spider.__init__(self)
        baseurl = 'https://domaintyper.com/top-websites/most-popular-websites-with-edu-domain/page/'
        logpath = '/home/www/flasksite/static/scenes/unipagestats/schools.jl'

        # Pages 1-29 of the most-popular-.edu-domains listing.
        self.start_urls = [baseurl + str(i) for i in xrange(1, 30)]
        self.domain = 'domaintyper.com'
        # NOTE(review): this log-file handle is opened here and never closed.
        self.exporter = JsonLinesItemExporter(open(logpath, 'wb+'))

    def parse(self, response):
        # start_exporting is re-run for every listing page -- harmless for a
        # JSON-lines exporter, but it belongs in __init__ conceptually.
        self.exporter.start_exporting()
        # The second column of the listing table holds the site URL.
        urls = [url.encode('utf-8') for url in response.css('.wsTR > td:nth-child(2)').xpath('text()').extract()]
        for url in urls:
            fullurl = 'http://www.' + url + '/'
            yield scrapy.Request(fullurl, callback=self.parse_edu_site)

    def parse_edu_site(self, response):
        """Collect CSS/JS/tag statistics for one .edu front page."""
        data = SiteData()
        tc = TagCounter()

        # Fill summary fields
        data['url'] = response.url
        data['domain'] = '.'.join(response.url.split('/')[2].split('.')[-2:])
        data['name'] = data['domain'].split('.')[0]
        data['title'] = response.xpath('//title/text()').extract()[0].encode('utf-8')

        # Fill CSS fields
        data['css_paths'] = [stylesheet.encode('utf-8') for stylesheet in response.xpath('//link[@rel="stylesheet"]/@href').extract()]
        data['css_files'] = [stylesheet.split('/')[-1] for stylesheet in data['css_paths']]

        # Fill JS fields
        data['js_paths'] = [script.encode('utf-8') for script in response.xpath('//script/@src').extract()]
        data['js_files'] = [script.split('/')[-1] for script in data['js_paths']]

        # Fill tag fields
        tc.feed(response.body)
        data['tagcount'] = tc.tagcount
        data['nonvoidcount'] = tc.nonvoid_tagcount
        data['topnest'] = tc.topnest

        self.exporter.export_item(data)
        yield data

    def __del__(self):
        # NOTE(review): scrapy.Spider does not visibly define __del__, so
        # this call likely raises AttributeError at teardown; closing the
        # exporter in a spider_closed handler would be safer -- confirm.
        scrapy.Spider.__del__(self)
        self.exporter.finish_exporting()
class OKCupidJsonPipeline(object):
    """Write OKCupid items to <spider name>_items.json as ASCII JSON lines."""

    def __init__(self):
        # Old-style signal registration via the global dispatcher.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_items.json' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # (removed commented-out Python-2 debug/encoding experiments)
        self.exporter.export_item(item)
        return item
Example #44
0
 def spider_opened(self, spider):
     # One "<spider name>_all.json" file per run; UTF-8 JSON lines.
     file = open('%s_all.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file, encoding='utf-8', ensure_ascii=False)
     self.exporter.start_exporting()
Example #45
0
 def spider_opened(self, spider):
     """Open output/<spider name>.jsonl and start the JSON-lines exporter."""
     # BUG FIX: open() raises ValueError when an encoding is combined with a
     # binary mode ('w+b').  Options such as indent=4 / ensure_ascii=False
     # belong on JsonLinesItemExporter, which does its own encoding.
     file = open('output/' + spider.name + '.jsonl', 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file)
     self.exporter.start_exporting()
 def spider_opened(self, spider):
     # Per-spider output file; ensure_ascii=True keeps the output 7-bit safe.
     file = open('%s_items.json' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file, ensure_ascii=True)
     self.exporter.start_exporting()
Example #47
0
 def open_spider(self, spider):
     """Initialize export JSON lines file."""
     # Binary mode: the exporter writes encoded bytes itself.
     self.file = open("gov.json", "wb")
     self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
     self.exporter.start_exporting()
Example #48
0
class ResolutionPipeline(object):
    """Pipeline used for ResolutionSpider."""

    def __init__(self):
        self.file = None
        self.exporter = None

        # compile regular expressions:

        # input looks like 'dec14R.aspx'
        # we need the resolution number (14R)
        self.resolution_number_pattern = re.compile(r"^\D+(?P<number>.+?)\..*$")

        # input looks like 'ממשלה/הממשלה ה - 34 בנימין נתניהו;'
        # we need the government number (34) and prime minister name (בנימין נתניהו)
        self.gov_pattern = re.compile(r'^.+\s??\-\s?(?P<gov_number>.+?)\s+?(?P<pm_name>.+?);?$')

    def open_spider(self, spider):
        """Initialize export JSON lines file."""
        self.file = open("gov.json", "wb")
        self.exporter = JsonLinesItemExporter(self.file, ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Close export file."""
        # BUG FIX: finish_exporting() must run *before* the file is closed,
        # otherwise the exporter may try to write to a closed file.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Sanitize text for each field, and export to file."""
        try:
            data = {
                'url': item["url"],
                # NOTE(review): arrow's .timestamp became a method in newer
                # releases; confirm against the pinned arrow version.
                'date': self.get_date(item).timestamp,
                'resolution_number': self.get_resolution_number(item),
                'gov_number': self.get_gov_number(item),
                'pm_name': self.get_pm_name(item),
                'title': self.get_title(item),
                'subject': self.get_subject(item),
                'body': self.get_body(item),
            }
        except ResolutionError as ex:
            # if one of the fields fails sanitation,
            # export the url leading to the specific resolution
            # for later (human) review
            self.exporter.export_item({'error': repr(ex),
                                       'url': item["url"],
                                      })
        else:
            self.exporter.export_item(data)

        return item

    # the following are specific field handling functions
    # e.g. cleaning, stripping, etc.
    # these should be called before dumping the data

    def get_date(self, item):
        if len(item["date"]) != 1:
            raise ResolutionError("Date field length is not 1 for item %s", item)
        return arrow.get(item["date"][0], "YYYYMMDD")

    def get_resolution_number(self, item):
        if len(item["resolution_number"]) != 1:
            raise ResolutionError("Resolution number field length is not 1 for item %s", item)
        return self.resolution_number_pattern.search(item["resolution_number"][0]).group('number')

    def get_gov_number(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("gov_number")

    def get_pm_name(self, item):
        if len(item["gov"]) != 1:
            raise ResolutionError("Government field length is not 1 for item %s", item)
        gov_match = self.gov_pattern.search(item["gov"][0])
        return gov_match.group("pm_name")

    def get_title(self, item):
        if len(item["title"]) == 0:
            raise ResolutionError("Title fields is empty for item %s", item)
        return '\n'.join(item["title"]).strip()

    def get_subject(self, item):
        if len(item["subject"]) == 0:
            raise ResolutionError("Subject field is empty for item %s", item)
        return '\n'.join(item["subject"]).strip()

    def get_body(self, item):
        if len(item["body"]) == 0:
            raise ResolutionError("Body field is empty for item %s", item)
        # body is originally a list of lines
        # it is intentionally not stripped
        # some resolutions have custom css, tables,
        # and other crap which i'd rather not process here,
        # but in a later stage, unrelated to the scraper
        return item["body"]
 def spider_opened(self, spider):
     # Open "<spider name>_item_lines.json" and start the JSON-lines export.
     file_json_lines = open('%s_item_lines.json' % spider.name, 'w+b')
     self.files[spider] = file_json_lines
     self.exporter = JsonLinesItemExporter(file_json_lines)
     self.exporter.start_exporting()
Example #50
0
 def open_spider(self, spider):
     # Per-spider products file under json/, written as JSON lines.
     file = open('json/{}_products.json'.format(spider.name), 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file)
     self.exporter.start_exporting()
Example #51
0
 def spider_opened(self, spider):
     # Single shared output file for all med data.
     self.file = open('medData.json', 'wb')
     # NOTE(review): "expoter" is misspelled, but presumably used
     # consistently by the sibling methods -- verify before renaming.
     self.expoter = JsonLinesItemExporter(self.file)
     self.expoter.start_exporting()
Example #52
0
 def spider_opened(self, spider):
     # One "<spider name>_hp_stories.jl" JSON-lines file per spider.
     file = open('%s_hp_stories.jl' % spider.name, 'w+b')
     self.files[spider] = file
     self.exporter = JsonLinesItemExporter(file)
     self.exporter.start_exporting()