from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlWritePipeline(object):
    """Exports every crawled item to bbsData.xml."""

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.file = open('bbsData.xml', 'wb')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()
        # process the crawled data, define and call dataProcess function
        # dataProcess('bbsData.xml', 'text.txt')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class FicheroXmlPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}
        file_tags = open('posts_con_tags.xml', 'w+b')
        file_notags = open('posts_sin_tags.xml', 'w+b')
        self.files['tags'] = file_tags
        self.files['notags'] = file_notags
        self.exporter_tags = XmlItemExporter(file_tags)
        self.exporter_notags = XmlItemExporter(file_notags)

    def spider_opened(self, spider):
        self.exporter_tags.start_exporting()
        self.exporter_notags.start_exporting()

    def spider_closed(self, spider):
        self.exporter_tags.finish_exporting()
        self.exporter_notags.finish_exporting()
        file = self.files.pop('tags')
        file.close()
        file = self.files.pop('notags')
        file.close()

    def process_item(self, item, spider):
        # items with at least one tag go to one file, the rest to the other
        if item['tags']:
            self.exporter_tags.export_item(item)
        else:
            self.exporter_notags.export_item(item)
        return item

from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class RueventsPipeline(object):

    def __init__(self):
        self.duplicates = {}
        self.files = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        self.duplicates[spider] = set()
        file = open('%s_items.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        del self.duplicates[spider]
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # drop items whose event_id has already been seen for this spider
        if item['event_id'] in self.duplicates[spider]:
            raise DropItem("Duplicate item found!")
        else:
            self.duplicates[spider].add(item['event_id'])
            self.exporter.export_item(item)
            return item

from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class guardadoXMLPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        # output file
        self.file = open('datos.xml', 'w+b')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Only processes images and discards references to videos."""
        if item['ruta_imagen']:
            self.exporter.export_item(item)
            return item
        else:
            raise DropItem("No image for this day: %s" % item)

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class DamePostPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file1 = open('%s.xml' % spider.name, 'w+b')
        file2 = open('%s_without_tags.xml' % spider.name, 'w+b')
        self.files[spider] = [file1, file2]
        self.exporter1 = XmlItemExporter(file1)
        self.exporter2 = XmlItemExporter(file2)
        self.exporter1.start_exporting()
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        files = self.files.pop(spider)
        files[0].close()
        files[1].close()

    def process_item(self, item, spider):
        # untagged posts go to the second file, tagged posts to the first
        if not item['tag_list']:
            self.exporter2.export_item(item)
        else:
            self.exporter1.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class TagPipeline(object):
    """Only exports posts with defined tags."""

    def __init__(self):
        # connect the spider open and close signals
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        # create the export file
        self.file = open('posts_con_tags.xml', 'w+b')
        # initialize the exporter and start exporting
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        # finish exporting and close the file
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        if item['etiquetas']:
            # at least one tag defined, export the item
            self.exporter.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlExportWithOutLabels(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('postUGR_withOutLabel.xml', 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # only export items that have no labels
        if not item['etiquetas']:
            self.exporter.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        f = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = f
        self.exporter = XmlItemExporter(f)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        f = self.files.pop(spider)
        f.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

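# Usage sketch (not part of the snippets above): a pipeline like XmlExportPipeline
# must be enabled in the project's settings.py before Scrapy will call it. The
# module path 'myproject.pipelines' is an assumed placeholder.
ITEM_PIPELINES = {
    'myproject.pipelines.XmlExportPipeline': 300,
}
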
def assertExportResult(self, item, expected_value):
    fp = BytesIO()
    ie = XmlItemExporter(fp)
    ie.start_exporting()
    ie.export_item(item)
    ie.finish_exporting()
    self.assertXmlEquivalent(fp.getvalue(), expected_value)

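# Hypothetical use of the assertExportResult helper above, assuming a TestItem
# class with a 'name' field (the helper's host TestCase is not shown here):
def test_export_single_item(self):
    item = TestItem(name=u'John')
    self.assertExportResult(
        item,
        '<?xml version="1.0" encoding="utf-8"?>\n'
        '<items><item><name>John</name></item></items>',
    )
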
from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.close_spider, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlExportPipelineWithoutTags(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_post_without_tags.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # only export posts whose first tag is empty (i.e. untagged posts)
        if item['tag'][0] == "":
            self.exporter.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class conEtiquetaPipeline(object):
    """Entries that have a tag are written to entradas_etiquetadas.xml."""

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        # output file
        self.file = open('entradas_etiquetadas.xml', 'w+b')
        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """If the item has a tag, export it to the preset file."""
        if item['etiquetas']:
            self.exporter.export_item(item)
        return item

from datetime import datetime

from scrapy import signals
from scrapy.conf import settings
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class OfficielebekendmakingenPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        logtime = datetime.today()
        file = open(
            "%s/itemlog_%s_%s.xml" % (
                settings.get("LOG_DIR"),
                logtime.strftime("%Y-%m-%d_%H_%M"),
                spider.domain_name,
            ),
            "w+b",
        )
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # unwrap single-element field lists before exporting
        for field in item:
            if item[field]:
                item[field] = item[field][0]
        self.exporter.export_item(item)
        return item

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class EuropythonXmlExport(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('europython_items.xml', 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

def test_multivalued_fields(self):
    output = StringIO()
    item = TestItem(name=[u'John\xa3', u'Doe'])
    ie = XmlItemExporter(output)
    ie.start_exporting()
    ie.export_item(item)
    ie.finish_exporting()
    expected_value = '<?xml version="1.0" encoding="utf-8"?>\n' \
        '<items><item><name><value>John\xc2\xa3</value>' \
        '<value>Doe</value></name></item></items>'
    self.assertEqual(output.getvalue(), expected_value)

def get_new_fileexporter(self, item, spider):
    # get the owner
    try:
        owner = item['ownerKey'][0]
    except (KeyError, IndexError):
        owner = 'other'
    log.msg('creating fileExporter for %s' % (owner,), level=log.INFO)
    # close any existing exporter and file for this owner
    if owner in self.fileExporters[spider]:
        fileExporter = self.fileExporters[spider][owner]
        file = fileExporter['file']
        exporter = fileExporter['exporter']
        exporter.finish_exporting()
        file.close()
    dir = '/'.join([spider.folder, owner])
    # one batch per time that the spider has been resumed
    #batch = self.resumeCount[spider]  # spider.state['resume_count']
    if owner in self.seq[spider]:
        seq = self.seq[spider][owner]
        seq += 1
        log.msg('owner %s exists, incrementing count %d' % (owner, seq),
                level=log.DEBUG)
        self.seq[spider][owner] = seq
    else:
        seq = self.seq[spider][owner] = 1
        log.msg('owner %s does not exist, initializing count %d' % (owner, seq),
                level=log.DEBUG)
    # ignore seq and use a timestamp so the job can resume without having to
    # track the sequence number
    filename = '%s%s/%s_%d.xml' % (settings['ADAPTFM_OUTPUT_PATH'], dir,
                                   spider.name, time.time())
    if not os.path.isdir(os.path.dirname(filename)):
        os.makedirs(os.path.dirname(filename))  # create intermediate dirs too
    # spider.currentFilename = filename
    file = open(filename, 'w+b')
    # start exporting
    exporter = XmlItemExporter(file)
    exporter.start_exporting()
    fileExporter = {'exporter': exporter, 'file': file}
    # register under spider/owner
    self.fileExporters[spider][owner] = fileExporter
    log.msg('get_new_fileexporter %s' % (filename,), level=log.DEBUG)
    return fileExporter

def run(self, args, opts):
    if len(args) != 1:
        return False
    if opts.output:
        file = open(opts.output, 'w+b')
        exporter = XmlItemExporter(file)
        # export every item as it passes through the pipeline
        dispatcher.connect(exporter.export_item, signal=signals.item_passed)
        exporter.start_exporting()
    module = _import_file(args[0])
    scrapymanager.runonce(module.SPIDER)
    if opts.output:
        exporter.finish_exporting()
        file.close()

from scrapy import signals
from scrapy.conf import settings
from scrapy.exporters import XmlItemExporter


class VitrinBotXMLPipeline(object):

    def __init__(self):
        self.product_count = 0
        self.page_count = 0
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.create_xml(spider)

    def spider_closed(self, spider):
        self.close_xml(spider)

    def process_item(self, item, spider):
        # @todo check the product count in the XML for pagination
        if self.product_count >= settings["MAX_PRODUCT_PER_XML"]:
            # rotate to a new file once the current one is full
            self.page_count += 1
            self.product_count = 0
            self.close_xml(spider)
            self.create_xml(spider)
        self.product_count += 1
        self.exporter.export_item(item)
        return item

    def get_xml_path(self, spider):
        # e.g. markafoni-%d.xml
        xml_filename = spider.xml_filename % self.page_count
        return '%s/%s' % (settings["XML_DUMP_DIR"], xml_filename)

    def close_xml(self, spider):
        self.exporter.finish_exporting()
        dump_file = self.files.pop(spider)
        dump_file.close()

    def create_xml(self, spider):
        dump_file = open(self.get_xml_path(spider), 'w+b')
        self.files[spider] = dump_file
        self.exporter = XmlItemExporter(dump_file, root_element="products",
                                        item_element="product")
        self.exporter.start_exporting()

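# Minimal, self-contained sketch (not from the project above) showing what the
# root_element and item_element keyword arguments change in the output:
from io import BytesIO

from scrapy.exporters import XmlItemExporter

buf = BytesIO()
exporter = XmlItemExporter(buf, root_element='products', item_element='product')
exporter.start_exporting()
exporter.export_item({'name': 'demo'})  # dict items are accepted in Scrapy >= 1.0
exporter.finish_exporting()
# buf.getvalue() is now roughly:
# <?xml version="1.0" encoding="utf-8"?>
# <products><product><name>demo</name></product></products>
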
from scrapy import signals
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class XmlExportPipeline(object):

    def __init__(self):
        self.files_si = open("entradas_con_tags.xml", "w+b")
        self.files_no = open("entradas_sin_tags.xml", "w+b")
        self.exporter_si = XmlItemExporter(self.files_si)
        self.exporter_no = XmlItemExporter(self.files_no)
        self.exporter_si.start_exporting()
        self.exporter_no.start_exporting()
        # close the exporters when the spider finishes; otherwise the closing
        # root tag is never written
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.exporter_si.finish_exporting()
        self.exporter_no.finish_exporting()
        self.files_si.close()
        self.files_no.close()

    def process_item(self, item, spider):
        if len(item["tags"]) == 0:
            self.exporter_no.export_item(item)
        else:
            self.exporter_si.export_item(item)
        return item

import csv

from scrapy import signals
from scrapy.exporters import XmlItemExporter


class XmlExportWithLabels(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('postUGR_withLabel.xml', 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()
        # build the HTML table from the CSV export
        ficheroHTML = open('postUGR_items.html', "w")
        ficheroHTML.write("<!DOCTYPE html>\n<html>\n<header>\n"
                          "<title>CSV file to HTML</title>\n</header>\n"
                          "<body>\n<table border=\"1\">\n")
        hdlFicheroCSV = open('postUGR_items.csv', "rb")
        objFilaCSV = csv.reader(hdlFicheroCSV)
        for row in objFilaCSV:
            ficheroHTML.write("<tr>")
            for col in row:
                ficheroHTML.write("<td>")
                ficheroHTML.write(col)
                ficheroHTML.write("</td>")
            ficheroHTML.write("</tr>")
        hdlFicheroCSV.close()
        ficheroHTML.write("</table>\n</body>\n</html>")
        ficheroHTML.close()

    def process_item(self, item, spider):
        if item['etiquetas']:
            self.exporter.export_item(item)
        return item

def run(self, args, opts):
    if len(args) != 1:
        return False
    if opts.output:
        file = open(opts.output, 'w+b')
        exporter = XmlItemExporter(file)
        # export every item as it passes through the pipeline
        dispatcher.connect(exporter.export_item, signal=signals.item_passed)
        exporter.start_exporting()
    module = _import_file(args[0])
    # schedule spider and start engine
    scrapymanager.queue.append_spider(module.SPIDER)
    scrapymanager.start()
    if opts.output:
        exporter.finish_exporting()
        file.close()

import re

from scrapy import signals
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class OslpostPipeline(object):

    def __init__(self):
        # connect the spider signals
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # initialize the file dict (in case several spiders run)
        self.files = {}

    def spider_opened(self, spider):
        # create the .xml file (binary mode, as the exporter writes bytes)
        file = open('items.xml', 'w+b')
        # register it in the dict
        self.files[spider] = file
        # set up the XML exporter and start exporting
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        # process the images collected in the item; the contents are left
        # intact so no information is lost from links and complex objects
        # (videos, slideshows, etc.)
        imagenes = item['images']
        # compile the regular expression that filters the images
        expresion = r'<img src="([^"]+)'
        regexp = re.compile(expresion, re.I | re.MULTILINE | re.DOTALL)
        # find all matches and extract the links
        enlaces = []
        for i in imagenes:
            for l in regexp.findall(i[0]):
                enlaces.append(l)
        # store the extracted links in the proper field
        item['images'] = enlaces
        # export the item
        self.exporter.export_item(item)
        return item

def test_nested_list_item(self):
    output = StringIO()
    i1 = TestItem(name=u"foo")
    i2 = TestItem(name=u"bar")
    i3 = TestItem(name=u"buz", age=[i1, i2])
    ie = XmlItemExporter(output)
    ie.start_exporting()
    ie.export_item(i3)
    ie.finish_exporting()
    expected_value = (
        '<?xml version="1.0" encoding="utf-8"?>\n'
        "<items><item>"
        "<age>"
        "<value><name>foo</name></value>"
        "<value><name>bar</name></value>"
        "</age>"
        "<name>buz</name>"
        "</item></items>"
    )
    self.assertXmlEquivalent(output.getvalue(), expected_value)

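# The assertXmlEquivalent helper used by the tests above is not shown in these
# snippets; a plausible implementation (an assumption, not the original) compares
# parsed XML structures so formatting and sibling order do not matter:
import xml.etree.ElementTree as ET

def assertXmlEquivalent(self, first, second, msg=None):
    def xmltuple(elem):
        # children are sorted so sibling order does not affect the comparison
        children = sorted(xmltuple(child) for child in elem)
        return (elem.tag, sorted(elem.attrib.items()),
                (elem.text or '').strip(), children)
    self.assertEqual(xmltuple(ET.fromstring(first)),
                     xmltuple(ET.fromstring(second)), msg)
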
from scrapy import signals
from scrapy.exporters import XmlItemExporter
from scrapy.xlib.pydispatch import dispatcher


class XmlExportPipeline(object):

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.files = {}

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):  # Scrapy passes (item, spider)
        self.exporter.export_item(item)
        return item

def test_nested_item(self):
    output = BytesIO()
    i1 = TestItem(name=u'foo\xa3hoo', age='22')
    i2 = TestItem(name=u'bar', age=i1)
    i3 = TestItem(name=u'buz', age=i2)
    ie = XmlItemExporter(output)
    ie.start_exporting()
    ie.export_item(i3)
    ie.finish_exporting()
    expected_value = '<?xml version="1.0" encoding="utf-8"?>\n'\
        '<items><item>'\
        '<age>'\
        '<age>'\
        '<age>22</age>'\
        '<name>foo\xc2\xa3hoo</name>'\
        '</age>'\
        '<name>bar</name>'\
        '</age>'\
        '<name>buz</name>'\
        '</item></items>'
    self.assertXmlEquivalent(output.getvalue(), expected_value)

import os

from scrapy.conf import settings
from scrapy.exporters import XmlItemExporter


class XmlExportPipeline2(object):

    def process_item(self, item, spider):
        outputdir = '%s%s/%s' % (settings['ADAPTFM_OUTPUT_PATH'], spider.folder,
                                 item['brandCategory'][0])
        name = item['brandFeed'][0].replace('http://', '').replace('/', '_') \
                                   .replace('.xml', '')
        episode = item['episodeTitle']
        filename = '%s/%s-%s.xml' % (outputdir, name, episode)
        if not os.path.isdir(os.path.dirname(filename)):
            os.makedirs(os.path.dirname(filename))  # create intermediate dirs too
        # one complete XML document is appended per item
        file = open(filename, 'a+b')
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()
        self.exporter.export_item(item)
        self.exporter.finish_exporting()
        file.close()
        return item

from scrapy.exporters import XmlItemExporter


class XmlExportPipeline(object):
    count = 0

    def __init__(self):
        self.exporter = None
        self.outdir = None
        self.file = None

    def process_item(self, item, spider):
        # write each item to its own numbered file under reuters/<date>/
        XmlExportPipeline.count += 1
        self.outdir = spider.date.strftime('%Y%m%d')
        self.file = open(
            'reuters/%s/%s_item.xml' % (self.outdir, XmlExportPipeline.count),
            'w+b')
        self.exporter = XmlItemExporter(self.file, root_element='items',
                                        item_element='story')
        self.exporter.start_exporting()
        self.exporter.export_item(item)
        self.exporter.finish_exporting()
        self.file.close()
        return item
