class ImagePipeline(object): def __init__(self): self.files = {} self.times = 1 self.count = 1 self.distant = 20 @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open('%s_20.xml' % self.times, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): if scrapy.Request(item['imgeurl'][1], callback=self.check): DropItem('No Image for this item') elif self.count <= self.distant: self.count += 1 self.exporter.export_item(item) return item else: self.exporter.finish_exporting() file = self.files.pop(spider) file.close() self.times += 1 file = open('%s_20.xml' % self.times, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting() self.count = 1 self.processs_item(item, spider) def check(self, response): if response.body is None: return false else: return true
def spider_opened(self, spider): xml_name = str(spider.allowed_domains) xml_name = xml_name[2:-2] file = open('../../output/%s_crawled.xml' % xml_name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file, root_element = 'root', item_element = 'item') self.exporter.start_exporting()
def assertExportResult(self, item, expected_value): fp = BytesIO() ie = XmlItemExporter(fp) ie.start_exporting() ie.export_item(item) ie.finish_exporting() self.assertXmlEquivalent(fp.getvalue(), expected_value)
def spider_opened(self, spider): # todo: json 변경에 대해 검토하자.(현재는 인코딩 깨짐) # file = open('%s.civilAppeal.json' % spider.allowed_domains[0], 'w+b') print 'path %s/%s.%s.civilAppeal.xml' % ( EXPORT_PATH, spider.allowed_domains[0], datetime.date.today().isoformat()) file = open( '%s/%s.%s.civilAppeal.xml' % (EXPORT_PATH, spider.allowed_domains[0], datetime.date.today().isoformat()), 'w+b') self.files[spider] = file # self.exporter = JsonLinesItemExporter(file ) self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
class XmlExportPipeline(object): def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): xml_name = str(spider.allowed_domains) xml_name = xml_name[2:-2] file = open('../../output/%s_crawled.xml' % xml_name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file, root_element = 'root', item_element = 'item') self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class XmlExportPipeline(object): """ app.pipelines.exporter_xml.XmlExportPipeline """ def __init__(self): self.files = {} self.exporter = None @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file_xml = open('%s_items.xml' % spider.name, 'w+b') self.files[spider] = file_xml self.exporter = XmlItemExporter(file_xml) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file_xml = self.files.pop(spider) file_xml.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class Exporter(object): def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): # todo: json 변경에 대해 검토하자.(현재는 인코딩 깨짐) # file = open('%s.civilAppeal.json' % spider.allowed_domains[0], 'w+b') print 'path %s/%s.%s.civilAppeal.xml' % ( EXPORT_PATH, spider.allowed_domains[0], datetime.date.today().isoformat()) file = open( '%s/%s.%s.civilAppeal.xml' % (EXPORT_PATH, spider.allowed_domains[0], datetime.date.today().isoformat()), 'w+b') self.files[spider] = file # self.exporter = JsonLinesItemExporter(file ) self.exporter = XmlItemExporter(file) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class PyDataXmlExport(object): def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open('pydata_items.xml', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class ScrapyMangafoxPipeline(object): def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open('%s_all.xml' % spider.name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = ['title','genres','rating','description','authors','published','link'] self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
def process_item(self, item, spider): if scrapy.Request(item['imgeurl'][1], callback=self.check): DropItem('No Image for this item') elif self.count <= self.distant: self.count += 1 self.exporter.export_item(item) return item else: self.exporter.finish_exporting() file = self.files.pop(spider) file.close() self.times += 1 file = open('%s_20.xml' % self.times, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting() self.count = 1 self.processs_item(item, spider)
class XmlExportPipeline(object): def __init__(self): self.files = {} def open_spider(self, spider): file = open('%s_products.xml' % spider.name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting() def close_spider(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class Tech163Pipeline(object): def __init__(self): pass @classmethod def from_crawler(cls,crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened,signals.spider_opened) crawler.signals.connect(pipeline.spider_closed,signals.spider_closed) return pipeline def spider_opened(self,spider): self.file = open('play1.xml','wb') self.expoter = XmlItemExporter(self.file) self.expoter.start_exporting() def spider_closed(self,spider): self.expoter.finish_exporting() self.file.close() def process_item(self,item,spider): self.expoter.export_item(item) return item
class XmlPipeline(object): @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): self.file = open(filename, "wb") self.exporter = XmlItemExporter(self.file) self.exporter.start_exporting() def spider_closed(self, spider): self.exporter.finish_exporting() self.file.close() def process_item(self, item, spider): self.exporter.export_item(item) return item
class MobygamesPipeline(object): global adult_ratings adult_ratings = ['18', 'Adults Only'] global licensed licensed = [u'Licensed\xa0Title'] def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open(spider.settings['FILES_STORE'] + '/%s.xml' % 'export', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file,item_element='game', root_element='games') self.exporter.start_exporting() return def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() return def process_item(self, item, spider): logging.info("Exporting item:" + item['title']) # do not export utility fields # pop them from item dict item.pop("search_region", None) item.pop("search_title", None) item.pop("search_platform", None) item.pop("screenshot_urls", None) item.pop("cover_urls", None) if 'date' in item.keys(): item['date'] = str(dateutil.parser.parse(item['date'])) if 'rating' in item.keys(): if [i for i in adult_ratings if i in item['rating']]: item['mature'] = '' item['rating'] = '18' else: item['rating'] = item['rating'][0] if 'misc' in item.keys(): if [i for i in licensed if i in item['misc']]: item['licensed'] = '' try: value = item['platforms'] except KeyError: # Key is not present item['exclusive'] ='' pass # clean up description d = item['description'].encode('utf-8') start = 0 end = d.find('[ edit description') item['description'] = d[start:end].strip().replace(" "," ") self.exporter.export_item(item) return item
class MobygamesPipeline(object): # global adult_ratings # adult_ratings = ['18', 'Adults Only'] # global licensed # licensed = [u'Licensed\xa0Title'] global group_addon group_addon = "extension" global group_anatomy group_anatomy = "Anatomie" global group_astronomy group_astronomy = "Astronomie" global group_comic group_comic = "Bande Dessinee" global group_bowling group_bowling = "bowling" global group_boxing group_boxing = "boxe" global group_cluedo group_cluedo = "cluedo" global group_coinop group_coinop = "coin-op" global group_chess group_chess = "echec" global group_flipper group_flipper = "flipper" global group_football group_football = "football" global group_snooker group_snooker = "snooker" global group_isometric group_isometric = "3D iso" global group_mastermind group_mastermind = "STYLE : mastermind" global group_othello group_othello = "STYLE : othello" global group_qbert group_qbert = "STYLE : qbert" global groups_map groups_map = { "female protagonist": "Protagonist: Female", "Hardware double buffer" : "Hardware: double buffering", "extended RAM" : "Hardware: Extended RAM", "Scroll Hard horizontal" : "Hardware: Horizontal scrolling", "Scroll Hard vertical" : "Hardware: Vertical scrolling", "Scroll Hard multidirectional" : "Hardware: Multidirectional scrolling", "Software Scroll" : "Software scrolling", "Parallax Scroll" : "Parallax scrolling", "Multi-Mode" : "Display: Multi-Mode", "Dual Playfield" : "Display: Dual Playfield", "MODE0 titlescreen" : "Display: MODE0 Title Screen", "MODE1 titlescreen" : "Display: MODE1 Title Screen", "MODE2 titlescreen" : "Display: MODE2 Title Screen", "MODE0 inside" : "Display: MODE0 In Game", "MODE1 inside" : "Display: MODE1 In Game", "MODE2 inside" : "Display: MODE2 In Game", "MODE1 special" : "Display: MODE1 Raster", "Overscan full" : "Display: Full Overscan", "Overscan horizontal" : "Display: Horizontal Overscan", "Overscan vertical" : "Display: Vertical Overscan", "Biggerscreen" : "Display: Big Screen", "Smallscreen" : "Display: Small Screen", "Normalscreen Smalldisplay" : "Display: Normal Screen", "Bande Dessinee" : "Inspiration: Comics", "blitz" : "Blitz variants", "Dessin Animee" : "Inspiration: TV cartoons", "film" : "Inspiration: Movies", "livre" : "Inspiration: Literature", "memory" : "Concentration variants", "serie tv" : "Inspiration: TV series", "Speccy Port" : "Speccy Port", "STYLE : boulder dash" : "Boulder Dash variants", "STYLE : check man" : "Check Man variants", "STYLE : marble madness" : "Genre: Rolling ball", "STYLE : pac-man" : "Pac-Man variants", "STYLE : pingo" : "Pengo variants", "STYLE: puzznic" : "Genre: Tile matching puzzle (creation)", "simon" : 'Gameplay feature: "Simon says"', "STYLE : tetris" : "Tetris variants", "STYLE : tron" : "Genre: Light Cycle", "STYLE : yam" : "Yahtzee variants", "extension" : group_addon, "Anatomie" : group_anatomy, "Astronomie" : group_astronomy, "bowling" : group_bowling, "cluedo" : group_cluedo, "coin-op" : group_coinop, "echec" : group_chess, "flipper" : group_flipper, "football" : group_football, "snooker" : group_snooker, "3D iso" : group_isometric, "STYLE : mastermind" : group_mastermind, "STYLE : othello" : group_othello, "STYLE : qbert" : group_qbert } global game_breakout game_breakout = "GAME -> Breakout"; global game_fight game_fight = "GAME -> Fight"; global game_management game_management = "GAME -> Management"; global game_arcade game_arcade = "GAME -> Arcade"; global game_platform game_platform = "GAME -> Platformer"; global game_quiz game_quiz = "GAME -> Quiz"; global game_shmup game_shmup = "GAME -> Shoot'Em Up"; global game_targetshooting game_targetshooting = "GAME -> Target shooting"; global game_edu_history_geography game_edu_history_geography = "EDUCATIONAL -> History, Geography"; global game_edu_math game_edu_math = "EDUCATIONAL -> Maths, Geometry"; global game_edu_grammar game_edu_grammar = "EDUCATIONAL -> spelling, Grammar"; global genres_map genres_map = { "GAME -> Reflexion": "Puzzle", "GAME -> Action": "Action", "GAME -> Adventure": "Adventure", "GAME -> Race" : "Racing / Driving", "GAME -> Management" : "Strategy/Tactics", "GAME -> Role-playing" : "Role-Playing (RPG)", "GAME -> Maze" : "Puzzle", "GAME -> Run & Gun" : "Action", "GAME -> Simulation" : "Simulation", "GAME -> Sport" : "Sports", "GAME -> Strategy" : "Strategy/Tactics", "EDUCATIONAL -> Course, Tutorial" : "Educational", "EDUCATIONAL -> Other" : "Educational", game_arcade : "Action", game_breakout : "Action", game_fight : "Action", game_management : "Strategy/Tactics", game_platform : "Action", game_quiz : "Puzzle", game_shmup : "Action", game_targetshooting : "Action", game_edu_history_geography : "Educational", game_edu_math : "Educational", game_edu_grammar : "Educational" } global players_map players_map = { "1 player": "Single Player", "2 alternating players": "2-Player Alternating", "3 alternating players": "3-Player Alternating", "4 alternating players": "4-Player Alternating", "5 alternating players": "5-Player Alternating", "6 alternating players": "6-Player Alternating", "7 alternating players": "7-Player Alternating", "8 alternating players": "8-Player Alternating", "9 alternating players": "9-Player Alternating", "10 alternating players": "10-Player Alternating", "2 simultaneous cooperating players" : "2-Player Simultaneous;Cooperative", "3 simultaneous cooperating players" : "3-Player Simultaneous;Cooperative", "4 simultaneous cooperating players" : "4-Player Simultaneous;Cooperative", "5 simultaneous cooperating players" : "5-Player Simultaneous;Cooperative", "6 simultaneous cooperating players" : "6-Player Simultaneous;Cooperative", "7 simultaneous cooperating players" : "7-Player Simultaneous;Cooperative", "8 simultaneous cooperating players" : "8-Player Simultaneous;Cooperative", "9 simultaneous cooperating players" : "9-Player Simultaneous;Cooperative", "10 simultaneous cooperating players" : "10-Player Simultaneous;Cooperative", "2 simultaneous opposing players" : "2-Player Simultaneous;Versus", "3 simultaneous opposing players" : "3-Player Simultaneous;Versus", "4 simultaneous opposing players" : "4-Player Simultaneous;Versus", "5 simultaneous opposing players" : "5-Player Simultaneous;Versus", "6 simultaneous opposing players" : "6-Player Simultaneous;Versus", "7 simultaneous opposing players" : "7-Player Simultaneous;Versus", "8 simultaneous opposing players" : "8-Player Simultaneous;Versus", "9 simultaneous opposing players" : "9-Player Simultaneous;Versus", "10 simultaneous opposing players" : "10-Player Simultaneous;Versus", } def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open(spider.settings['FILES_STORE'] + '/%s.xml' % 'export', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file,item_element='game', root_element='games') self.exporter.start_exporting() return def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() return def process_item(self, item, spider): logging.info("Exporting item:" + item['title']) # do not export utility fields # pop them from item dict #~ item.pop("search_num", None) #~ item.pop("search_title", None) item.pop("screenshot_urls", None) item.pop("image_urls", None) # initialize arrays if needed self.initKey(item, 'gameplay') self.initKey(item, 'group') self.initKey(item, 'educational') self.initKey(item, 'addon') self.initKey(item, 'sport') self.initKey(item, 'narrative') self.initKey(item, 'visual') self.initKey(item, 'pacing') ######################## ###### MAP GROUPS ###### if 'group' in item.keys() and item['group']: groupslist = item['group'].split(";") newgroupslist = [] for g in groupslist: mappedgroup = self.map_group( g ); if mappedgroup: if mappedgroup == group_addon: item['addon'].append('Map / Level') elif mappedgroup == group_anatomy: item['educational'].append('Science') elif mappedgroup == group_astronomy: item['educational'].append('Science') elif mappedgroup == group_bowling: item['sport'].append('Bowling') elif mappedgroup == group_boxing: item['sport'].append('Boxing') elif mappedgroup == group_cluedo: item['narrative'].append('Detective / Mystery') item['gameplay'].append('Board Game') newgroupslist.append( "Genre: Board game - Clue" ) newgroupslist.append( "Board game translations" ) elif mappedgroup == group_coinop: item['gameplay'].append('Arcade') elif mappedgroup == group_chess: item['gameplay'].append('Chess') elif mappedgroup == group_flipper: item['gameplay'].append('Pinball') elif mappedgroup == group_football: item['sport'].append('Football (European) / Soccer') elif mappedgroup == group_snooker: item['sport'].append('Pool / Snooker') elif mappedgroup == group_isometric: item['visual'].append('Isometric') elif mappedgroup == group_mastermind: item['gameplay'].append('Puzzle-solving') newgroupslist.append( 'Mastermind variants' ) elif mappedgroup == group_othello: item['gameplay'].append('Board Game') item['pacing'].append('Turn-based') newgroupslist.append( "Board game translations" ) newgroupslist.append( "Genre: Board game - Reversi / Othello" ) elif mappedgroup == group_qbert: item['gameplay'].append('Puzzle-solving') newgroupslist.append( "Q*Bert variants" ) else: newgroupslist.append( mappedgroup ) item['group'] = ";".join( newgroupslist ) ######################## ######################## ######################### ###### MAP GENRES ###### # check if game is breakout # if yes then # 1. add to groups:Breakout variants # 2. add custom field gameplay:Paddle / Pong # 3. add custom field gameplay:Arcade if item['genre'] == game_breakout: item['group'] += ';Breakout variants' item['gameplay'].append('Paddle / Pong') item['gameplay'].append('Arcade') # if game is of type fight # add custom field gameplay:Fighting if item['genre'] == game_fight: item['gameplay'].append('Fighting') # if type of game is management, add custom field # gameplay:Managerial / Business Simulation if item['genre'] == game_management: item['gameplay'].append('Managerial / Business Simulation') # if game is of type arcade, add custom field # gameplay:Arcade if item['genre'] == game_arcade: item['gameplay'].append('Arcade') # if type is platform, add custom field # gameplay:Platform if item['genre'] == game_platform: item['gameplay'].append('Platform') # if type is of quiz, add custom field # gameplay:Game Show / Trivia / Quiz if item['genre'] == game_quiz: item['gameplay'].append('Game Show / Trivia / Quiz') # if has type shmup add custom field # gameplay:Shooter if item['genre'] == game_shmup: item['gameplay'].append('Shooter') # if has type of target shooting add custom field # gameplay:Shooter if item['genre'] == game_targetshooting: item['gameplay'].append('Shooter') # if game is of educational type history / geography # add custom fields # 1. Educational:Geography # 3. Educational:History if item['genre'] == game_edu_history_geography: item['educational'].append('Geography') item['educational'].append('History') # if type is of Educational Math, add custom field # Educational:Math / Logic if item['genre'] == game_edu_math: item['educational'].append('Math / Logic') # if has type Educational grammar / spelling # add custom field educational:Reading / Writing if item['genre'] == game_edu_grammar: item['educational'].append('Reading / Writing') # # map genre to mobygames genres # remove it if there is no corresponding genre # if item['genre']: item['genre'] = self.map_genre( item['genre'] ) ######################## ######################## ######################### ###### MAP PLAYERS ###### if 'players' in item.keys() and item['players']: playerslist = item['players'].split(";") newplayerslist = [] for p in playerslist: mappedplayer = self.map_player( p ); if mappedplayer: newplayerslist.append( mappedplayer ) setplayerslist = set( newplayerslist ) item['players'] = ";".join( setplayerslist ) ######################## ######################## # convert year to date 1/1/year if 'year' in item.keys() and item['year']: item['date'] = str(dateutil.parser.parse('1/1/' + item['year'][0])) item.pop("year", None) if 'year' in item.keys(): item.pop("year", None) # scale score to 1 to 5 if 'criticScore' in item.keys() and item['criticScore']: if " / " in item['criticScore'][0]: scores = item['criticScore'][0].split(' / ') item['criticScore'][0] = str( int( float(5) * float(scores[0]) / float(scores[1]) ) ) else: item.pop("criticScore", None) # if 'misc' in item.keys(): # if [i for i in licensed if i in item['misc']]: # item['licensed'] = '' # delete empty keys self.delKey( item, 'genre') self.delKey( item, 'gameplay') self.delKey( item, 'group') self.delKey( item, 'educational') self.delKey(item, 'addon') self.delKey(item, 'sport') self.delKey(item, 'narrative') self.delKey(item, 'visual') self.delKey(item, 'pacing') self.exporter.export_item(item) return item # map genre to mobygames compatible genres def map_genre(self,x): if x in genres_map: return genres_map[x] else: return None # map players to mobygames compatible players def map_player(self,x): for playerkey in players_map.keys(): if playerkey.lower() in x.lower() or x.lower() in playerkey.lower(): return players_map[playerkey] return None # map groups to readable / presentable Amstrad custom groups # or map them to related mobygames game groups def map_group(self,x): for groupkey in groups_map.keys(): if groupkey.lower() in x.lower() or x.lower() in groupkey.lower(): return groups_map[groupkey] return None def initKey(self,item,key): if key not in item.keys(): item[key] = [] def delKey(self,item,key): if not item[key]: del item[key]
def spider_opened(self, spider): file = open('%s_20.xml' % self.times, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
def spider_opened(self, spider): file = open('myfile/%s.xml' % spider.name, 'w+b') self.files[spider] = file # 实例化一个XmlItemExporter对象 self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
def open_spider(self, spider): self.file = open('honglingjing.xml', 'wb') self.exporter = XmlItemExporter(self.file) self.exporter.start_exporting()
def __init__(self): self.file = open( '/home/CORPUSERS/xp017845/zxmcrawl/caipiao/cp_products.xml', 'w+b') self.exporter = XmlItemExporter(self.file, item_element='item', root_element='root')
def spider_opened(self, spider): file = open('europython_items.xml', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
class XmlExportPipeline(object): def __init__(self): self.file=open('items.xml','w') self.file2=open('items_sin_etiq.xml','w') @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): self.exporter=XmlItemExporter(self.file) self.exporter.start_exporting() self.exporter2=XmlItemExporter(self.file2) self.exporter2.start_exporting() def spider_closed(self, spider): self.exporter2.finish_exporting() self.file2.close() self.exporter.finish_exporting() self.file.close() def process_item(self, item, spider): if (len(item['etiquetas'])==0): self.exporter2.export_item(item) else: self.exporter.export_item(item) return item
def open_spider(self, spider): file = open('%s_products.xml' % spider.name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
def spider_opened(self, spider): file = open('%s_dump.xml' % spider.name, 'wb') self.files[spider] = file self.exporter = XmlItemExporter(file, encoding='utf-8') self.exporter.start_exporting()
def __init__(self): self.fp = open('qidian_dev.xls', 'wb') # self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8') self.exporter = XmlItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
class MobygamesPipeline(object): global adult_ratings adult_ratings = ['18', 'Adults Only'] global licensed licensed = [u'Licensed\xa0Title'] def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open(spider.settings['FILES_STORE'] + '/%s.xml' % 'export', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file, item_element='game', root_element='games') self.exporter.start_exporting() return def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() return def process_item(self, item, spider): logging.info("Exporting item:" + item['title']) # do not export utility fields # pop them from item dict item.pop("search_region", None) item.pop("search_title", None) item.pop("search_platform", None) item.pop("screenshot_urls", None) item.pop("cover_urls", None) if 'date' in item.keys(): item['date'] = str(dateutil.parser.parse(item['date'])) if 'rating' in item.keys(): if [i for i in adult_ratings if i in item['rating']]: item['mature'] = '' item['rating'] = '18' else: item['rating'] = item['rating'][0] if 'misc' in item.keys(): if [i for i in licensed if i in item['misc']]: item['licensed'] = '' try: value = item['platforms'] except KeyError: # Key is not present item['exclusive'] = '' pass # clean up description d = item['description'].encode('utf-8') start = 0 end = d.find('[ edit description') item['description'] = d[start:end].strip().replace(" ", " ") self.exporter.export_item(item) return item
def __init__(self): self.file = open('book2.xml', 'wb') self.exporter = XmlItemExporter(file=self.file, encoding='utf-8') self.exporter.start_exporting()
def __init__(self, file, **kwargs): XmlItemExporter.__init__(self, file, **kwargs)
def _get_exporter(self, **kwargs): return XmlItemExporter(self.output, **kwargs)
def __init__(self): self.file = open("assets/movies.xml", 'wb') self.exporter = XmlItemExporter(self.file, encoding='utf-8') self.exporter.start_exporting()
def spider_opened(self, spider): file_xml = open('%s_items.xml' % spider.name, 'w+b') self.files[spider] = file_xml self.exporter = XmlItemExporter(file_xml) self.exporter.start_exporting()
def spider_opened(self, spider): self.exporter=XmlItemExporter(self.file) self.exporter.start_exporting() self.exporter2=XmlItemExporter(self.file2) self.exporter2.start_exporting()
def spider_opened(self, spider): '''Open XML file for writing''' outfile = open('%s.xml' % spider.name, 'w+b') self.files[spider] = outfile self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
def spider_opened(self,spider): self.file = open('play1.xml','wb') self.expoter = XmlItemExporter(self.file) self.expoter.start_exporting()
def __init__(self): file_name = str(datetime.datetime.now().date()) + '.xml' self.file = open(file_name, 'wb') self.exporter = XmlItemExporter(file=self.file) self.exporter.start_exporting()
class XmlExportPipeline(object): def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): file = open('%s.xml' % spider.name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting() pass def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): if spider.name is 'match': league = item['league'].strip(' \t\n\r') if league == "": league = "Unknown" filename = 'matches/' + item[ 'country'] + '/' + league + '/' + item['season'] + '/' + str( item['stage']) + '/%s.xml' % item['matchId'] if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise with open(filename, 'w+b') as f: self.files[item['matchId']] = f self.exporter = XmlItemExporter(f) self.exporter.fields_to_export = [ 'country', 'league', 'season', 'stage', 'matchId', 'date', 'homeTeamId', 'awayTeamId', 'homeTeamFullName', 'awayTeamFullName', 'homeTeamAcronym', 'awayTeamAcronym', 'homeTeamGoal', 'awayTeamGoal', 'homePlayers', 'awayPlayers', 'homePlayersId', 'awayPlayersId', 'homePlayersX', 'awayPlayersX', 'homePlayersY', 'awayPlayersY', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession' ] self.exporter.export_item(item) return item elif spider.name is 'player': filename = 'players/' + item['name'] + '_' + item[ 'matchId'] + '_' + item['fifaId'] + '.xml' if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise with open(filename, 'w+b') as f: self.files[item['name']] = f self.exporter = XmlItemExporter(f) self.exporter.fields_to_export = [ 'name', 'matchId', 'fifaId', 'birthday', 'height', 'weight', 'stats' ] self.exporter.export_item(item) return item
def process_item(self, item, spider): if spider.name is 'match': filename = 'matches/' \ + item['country'] \ + '/' + item['league'] \ + '/' + item['season'] \ + '/' + str(item['stage']) \ +'/%s.xml' % item['matchId'] if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise file = open(filename, 'w+b') self.files[item['matchId']] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = [ 'country', 'league', 'season', 'stage', 'matchId', 'date', 'homeTeamId', 'awayTeamId', 'homeTeamFullName', 'awayTeamFullName', 'homeTeamAcronym', 'awayTeamAcronym', 'homeTeamGoal', 'awayTeamGoal', 'homePlayers', 'awayPlayers', 'homePlayersId', 'awayPlayersId', 'homePlayersX', 'awayPlayersX', 'homePlayersY', 'awayPlayersY', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession'] self.exporter.export_item(item) return item elif spider.name is 'player': filename = 'players/' \ + item['name']+'_'+item['matchId']+'_'+item['fifaId']+'.xml' if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise file = open(filename, 'w+b') self.files[item['name']] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = [ 'name', 'matchId', 'fifaId', 'birthday', 'height', 'weight', 'stats'] self.exporter.export_item(item) return item
def spider_opened(self, spider): file = open('pydata_items.xml', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
def spider_opened(self, spider): file = open(spider.settings['FILES_STORE'] + '/%s.xml' % 'export', 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file,item_element='game', root_element='games') self.exporter.start_exporting() return
class XmlExportPipeline(object): def __init__(self): self.files = {} self.exporter = {} @classmethod def from_crawler(cls, crawler): '''Receives data from the crawler engine, creates the output pipelines''' pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): '''Open XML file for writing''' outfile = open('%s.xml' % spider.name, 'w+b') self.files[spider] = outfile self.exporter = XmlItemExporter(file) self.exporter.start_exporting() def spider_closed(self, spider): '''Close the spider''' self.exporter.finish_exporting() outfile = self.files.pop(spider) outfile.close() def process_item(self, item, spider): '''Actually processes the xml file content''' if spider.name is 'match': filename = 'matches/' \ + item['country'] \ + '/' + item['league'] \ + '/' + item['season'] \ + '/' + str(item['stage']) \ +'/%s.xml' % item['matchId'] if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise with open(filename, 'w+b') as outfile: self.files[item['matchId']] = outfile self.exporter = XmlItemExporter(outfile) self.exporter.fields_to_export = [ 'country', 'league', 'season', 'stage', 'matchId', 'date', 'homeTeamId', 'awayTeamId', 'homeTeamFullName', 'awayTeamFullName', 'homeTeamAcronym', 'awayTeamAcronym', 'homeTeamGoal', 'awayTeamGoal', 'homePlayers', 'awayPlayers', 'homePlayersId', 'awayPlayersId', 'homePlayersX', 'awayPlayersX', 'homePlayersY', 'awayPlayersY', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession' ] self.exporter.export_item(item) return item elif spider.name is 'player': filename = 'players/' \ + item['name']+'_'+item['matchId']+'_'+item['fifaId']+'.xml' if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise with open(filename, 'w+b') as outfile: self.files[item['name']] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = [ 'name', 'matchId', 'fifaId', 'birthday', 'height', 'weight', 'stats' ] self.exporter.export_item(item) return item
def spider_opened(self, spider): file = open('%s.xml' % spider.name, 'wb') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
class XmlExportPipeline(object): def __init__(self): self.files = {} @classmethod def from_crawler(cls, crawler): pipeline = cls() crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) return pipeline def spider_opened(self, spider): #file = open('%s.xml' % spider.name, 'w+b') #self.files[spider] = file #self.exporter = XmlItemExporter(file) #self.exporter.start_exporting() pass def spider_closed(self, spider): self.exporter.finish_exporting() file = self.files.pop(spider) file.close() def process_item(self, item, spider): if spider.name is 'match': filename = 'matches/' \ + item['country'] \ + '/' + item['league'] \ + '/' + item['season'] \ + '/' + str(item['stage']) \ +'/%s.xml' % item['matchId'] if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise file = open(filename, 'w+b') self.files[item['matchId']] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = [ 'country', 'league', 'season', 'stage', 'matchId', 'date', 'homeTeamId', 'awayTeamId', 'homeTeamFullName', 'awayTeamFullName', 'homeTeamAcronym', 'awayTeamAcronym', 'homeTeamGoal', 'awayTeamGoal', 'homePlayers', 'awayPlayers', 'homePlayersId', 'awayPlayersId', 'homePlayersX', 'awayPlayersX', 'homePlayersY', 'awayPlayersY', 'goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross', 'corner', 'possession'] self.exporter.export_item(item) return item elif spider.name is 'player': filename = 'players/' \ + item['name']+'_'+item['matchId']+'_'+item['fifaId']+'.xml' if not os.path.exists(os.path.dirname(filename)): try: os.makedirs(os.path.dirname(filename)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise file = open(filename, 'w+b') self.files[item['name']] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = [ 'name', 'matchId', 'fifaId', 'birthday', 'height', 'weight', 'stats'] self.exporter.export_item(item) return item
def open_spider(self, spider): self.file = open('dianpincity.xml', 'wb') self.exporter = XmlItemExporter(self.file) self.exporter.start_exporting()
def spider_opened(self, spider): self.file = open(filename, "wb") self.exporter = XmlItemExporter(self.file) self.exporter.start_exporting()
def spider_opened(self, spider): file = open('%s_urls.xml' % spider.name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.start_exporting()
def open_spider(self, spider): self.file = open('amazon_bestseller.xml', 'wb') self.exporter = XmlItemExporter(self.file) self.exporter.start_exporting()
def spider_opened(self, spider): file = open('%s_all.xml' % spider.name, 'w+b') self.files[spider] = file self.exporter = XmlItemExporter(file) self.exporter.fields_to_export = ['title','genres','rating','description','authors','published','link'] self.exporter.start_exporting()