Example #1
0
 def __init__(self):
     """Open both XML output files and start an exporter on each.

     ``entradas_con_tags.xml`` and ``entradas_sin_tags.xml`` are opened in
     binary read/write mode (truncating existing content) and kept on the
     instance together with their exporters.
     """
     self.files_si = open("entradas_con_tags.xml", "w+b")
     self.files_no = open("entradas_sin_tags.xml", "w+b")
     self.exporter_si = XmlItemExporter(self.files_si)
     self.exporter_no = XmlItemExporter(self.files_no)
     # Start both exports, in the same order the exporters were created.
     for exporter in (self.exporter_si, self.exporter_no):
         exporter.start_exporting()
Example #2
0
    def spider_opened(self, spider):
        """Open two XML outputs named after the spider and start exporting.

        The two file handles are stored as a list in ``self.files[spider]``;
        the exporters are kept as ``self.exporter1`` and ``self.exporter2``.
        """
        main_output = open('%s.xml' % spider.name, 'w+b')
        without_tags_output = open('%s_without_tags.xml' % spider.name, 'w+b')
        self.files[spider] = [main_output, without_tags_output]

        self.exporter1 = XmlItemExporter(main_output)
        self.exporter2 = XmlItemExporter(without_tags_output)
        for exporter in (self.exporter1, self.exporter2):
            exporter.start_exporting()
Example #3
0
 def __init__(self):
     """Connect the spider signals and create the two XML exporters.

     The open files are kept in ``self.files`` under the keys ``'tags'``
     and ``'notags'``.  The exporters are only created here; this method
     does not call ``start_exporting`` on them.
     """
     dispatcher.connect(self.spider_opened, signals.spider_opened)
     dispatcher.connect(self.spider_closed, signals.spider_closed)
     self.files = {}

     tags_output = open('posts_con_tags.xml', 'w+b')
     notags_output = open('posts_sin_tags.xml', 'w+b')
     self.files.update(tags=tags_output, notags=notags_output)

     self.exporter_tags = XmlItemExporter(tags_output)
     self.exporter_notags = XmlItemExporter(notags_output)
Example #4
0
 def run(self, args, opts):
     """Run the spider defined in the file named by the single argument.

     When ``opts.output`` is set, every item announced through the
     ``item_passed`` signal is exported as XML to that path.

     Returns ``False`` when ``args`` does not contain exactly one entry;
     otherwise returns ``None`` once the run has ended.
     """
     if len(args) != 1:
         return False

     output = None
     exporter = None
     if opts.output:
         output = open(opts.output, 'w+b')
         exporter = XmlItemExporter(output)
         dispatcher.connect(exporter.export_item, signal=signals.item_passed)
         exporter.start_exporting()

     try:
         module = _import_file(args[0])
         scrapymanager.runonce(module.SPIDER)
     finally:
         # Always terminate the XML document and release the file handle,
         # even when importing or running the spider raised.
         if exporter is not None:
             try:
                 exporter.finish_exporting()
             finally:
                 output.close()
Example #5
0
	def get_new_fileexporter (self, item, spider):
		"""Start a fresh XML file and exporter for the owner of ``item``.

		The owner is ``item['ownerKey'][0]``, or ``'other'`` when that value
		is not available.  An exporter already registered for the same
		spider/owner is finished and its file closed before the new one is
		created.

		Returns the ``{'exporter': ..., 'file': ...}`` dict that is also
		stored in ``self.fileExporters[spider][owner]``.
		"""
		# Only "field missing / empty / not indexable" means "no owner";
		# any other exception is a real error and must not be swallowed.
		try:
			owner = item['ownerKey'][0]
		except (KeyError, IndexError, TypeError):
			owner = 'other'

		log.msg('creating fileExporter for %s' % (owner), level=log.INFO)

		# Close any existing exporter and file for this owner.
		if owner in self.fileExporters[spider]:
			previous = self.fileExporters[spider][owner]
			previous_file = previous['file']
			previous_exporter = previous['exporter']
			previous_exporter.finish_exporting()
			previous_file.close()

		owner_dir = '/'.join([spider.folder, owner])

		# The per-owner sequence counter is still maintained, although the
		# file name below is built from the timestamp instead.
		if owner in self.seq[spider]:
			seq = self.seq[spider][owner] + 1
			log.msg('owner %s exists, incrementing count %d' % (owner, seq), level=log.DEBUG)
			self.seq[spider][owner] = seq
		else:
			seq = self.seq[spider][owner] = 1
			log.msg('owner NOT %s exist, incrementing count %d' % (owner, seq), level=log.DEBUG)

		# Ignore seq and use a timestamp so a job can resume without having
		# to track the sequence.
		filename = '%s%s/%s_%d.xml' % (settings['ADAPTFM_OUTPUT_PATH'], owner_dir, spider.name, time.time())

		# makedirs also creates missing parent directories, which a plain
		# mkdir would fail on.
		target_dir = os.path.dirname(filename)
		if not os.path.isdir(target_dir):
			os.makedirs(target_dir)

		output = open(filename, 'w+b')

		# Start exporting.
		exporter = XmlItemExporter(output)
		exporter.start_exporting()

		# Register under spider/owner.
		fileExporter = {'exporter': exporter, 'file': output}
		self.fileExporters[spider][owner] = fileExporter

		log.msg('get_new_fileexporter %s' % (filename), level=log.DEBUG)

		return fileExporter
Example #6
0
 def assertExportResult(self, item, expected_value):
     """Export ``item`` as XML in memory and compare it with ``expected_value``.

     The comparison is delegated to ``self.assertXmlEquivalent``.
     """
     buffer = BytesIO()
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(item)
     exporter.finish_exporting()
     exported = buffer.getvalue()
     self.assertXmlEquivalent(exported, expected_value)
Example #7
0
class VitrinBotXMLPipeline(object):
    """Item pipeline that writes items into a series of XML dump files.

    A new file is started whenever the current one already holds
    ``settings["MAX_PRODUCT_PER_XML"]`` items.
    """

    def __init__(self):
        self.files = {}
        # Index substituted into ``spider.xml_filename`` for the current file.
        self.page_count = 0
        # Number of items exported to the current file.
        self.product_count = 0

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the first XML file for ``spider``."""
        self.create_xml(spider)

    def spider_closed(self, spider):
        """Finish and close the XML file currently open for ``spider``."""
        self.close_xml(spider)

    def process_item(self, item, spider):
        """Export ``item``, rolling over to a new file first when needed."""
        # @todo for pagination, the number of products in the XML is to be
        # checked.
        file_is_full = self.product_count >= settings["MAX_PRODUCT_PER_XML"]
        if file_is_full:
            # page_count must be bumped before create_xml, which derives
            # the next file name from it.
            self.page_count += 1
            self.product_count = 0
            self.close_xml(spider)
            self.create_xml(spider)

        self.product_count += 1
        self.exporter.export_item(item)
        return item

    def get_xml_path(self, spider):
        """Return the dump path for the current page of ``spider``."""
        # e.g. markafoni-%d.xml
        page_filename = spider.xml_filename % self.page_count
        dump_dir = settings["XML_DUMP_DIR"]
        return '%s/%s' % (dump_dir, page_filename)

    def close_xml(self, spider):
        """Finish the current export and close the file of ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def create_xml(self, spider):
        """Open the file for the current page and start exporting into it."""
        output = open(self.get_xml_path(spider), 'w+b')
        self.files[spider] = output

        self.exporter = XmlItemExporter(
            output,
            root_element="products",
            item_element="product",
        )
        self.exporter.start_exporting()
Example #8
0
    def process_item(self, item, spider):
        """Write ``item`` to its own XML file and return it unchanged.

        The file is ``reuters/<YYYYMMDD>/<n>_item.xml``, where the date
        comes from ``spider.date`` and ``n`` is the class-wide
        ``XmlExportPipeline.count`` after incrementing it.
        """
        XmlExportPipeline.count += 1
        self.outdir = spider.date.strftime('%Y%m%d')
        self.file = open(
            'reuters/%s/%s_item.xml' % (self.outdir, XmlExportPipeline.count),
            'w+b')
        try:
            self.exporter = XmlItemExporter(self.file,
                                            root_element='items',
                                            item_element='story')

            self.exporter.start_exporting()
            self.exporter.export_item(item)
            self.exporter.finish_exporting()
        finally:
            # One file is opened per item, so close it here; otherwise a
            # handle is leaked for every item processed.
            self.file.close()
        return item
Example #9
0
    def spider_opened(self, spider):
        """Create ``datos.xml`` and start an XML export into it."""
        # Output file
        output = open('datos.xml', 'w+b')
        self.file = output

        exporter = XmlItemExporter(output)
        self.exporter = exporter
        exporter.start_exporting()
Example #10
0
	def spider_opened(self, spider):
		"""Create ``entradas_no_etiquetadas.xml`` and start an XML export into it."""
		# Output file
		output = open('entradas_no_etiquetadas.xml', 'w+b')
		self.file = output

		exporter = XmlItemExporter(output)
		self.exporter = exporter
		exporter.start_exporting()
Example #11
0
    def run(self, args, opts):
        """Schedule and run the spider defined in the file named by ``args[0]``.

        When ``opts.output`` is set, every item announced through the
        ``item_passed`` signal is exported as XML to that path.

        Returns ``False`` when ``args`` does not contain exactly one entry;
        otherwise returns ``None`` once the engine has stopped.
        """
        if len(args) != 1:
            return False

        output = None
        exporter = None
        if opts.output:
            output = open(opts.output, 'w+b')
            exporter = XmlItemExporter(output)
            dispatcher.connect(exporter.export_item, signal=signals.item_passed)
            exporter.start_exporting()

        try:
            module = _import_file(args[0])

            # schedule spider and start engine
            scrapymanager.queue.append_spider(module.SPIDER)
            scrapymanager.start()
        finally:
            # Always terminate the XML document and release the file
            # handle, even when importing or running the spider raised.
            if exporter is not None:
                try:
                    exporter.finish_exporting()
                finally:
                    output.close()
 def test_multivalued_fields(self):
     """A list-valued field is exported as one ``<value>`` element per entry."""
     buffer = StringIO()
     multivalued = TestItem(name=[u'John\xa3', u'Doe'])
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(multivalued)
     exporter.finish_exporting()
     expected_value = (
         '<?xml version="1.0" encoding="utf-8"?>\n'
         '<items><item><name>'
         '<value>John\xc2\xa3</value><value>Doe</value>'
         '</name></item></items>'
     )
     self.assertEqual(buffer.getvalue(), expected_value)
Example #13
0
    def spider_opened(self, spider):
        """Create ``posts_sin_tags.xml`` and start an XML export into it."""
        # Create the export file
        output = open('posts_sin_tags.xml', 'w+b')
        self.file = output

        # Set up the exporter and begin the export
        exporter = XmlItemExporter(output)
        self.exporter = exporter
        exporter.start_exporting()
Example #14
0
 def assertExportResult(self, item, expected_value):
     """Export ``item`` as XML in memory and compare it with ``expected_value``.

     The comparison is delegated to ``self.assertXmlEquivalent``.
     """
     buffer = BytesIO()
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(item)
     exporter.finish_exporting()
     exported = buffer.getvalue()
     self.assertXmlEquivalent(exported, expected_value)
Example #15
0
	def spider_opened(self, spider):
		"""Create ``items.xml`` for ``spider`` and start an XML export into it.

		The file handle is stored in ``self.files[spider]``.
		"""
		# Create the .xml file.  It is opened in binary mode because the
		# exporter writes encoded bytes, not text.
		output = open('items.xml', 'wb')
		# Register it in the dictionary
		self.files[spider] = output
		# Set up the XML exporter
		self.exporter = XmlItemExporter(output)
		# Begin exporting
		self.exporter.start_exporting()
 def spider_opened(self, spider):
     """Open a timestamped item-log XML file for ``spider`` and start exporting.

     The file is ``<LOG_DIR>/itemlog_<YYYY-MM-DD_HH_MM>_<domain_name>.xml``
     and its handle is stored in ``self.files[spider]``.
     """
     opened_at = datetime.today()
     log_path = "%s/itemlog_%s_%s.xml" % (
         settings.get("LOG_DIR"),
         opened_at.strftime("%Y-%m-%d_%H_%M"),
         spider.domain_name,
     )
     item_log = open(log_path, "w+b")
     self.files[spider] = item_log

     exporter = XmlItemExporter(item_log)
     self.exporter = exporter
     exporter.start_exporting()
 def test_multivalued_fields(self):
     """A list-valued field is exported as one ``<value>`` element per entry."""
     buffer = StringIO()
     multivalued = TestItem(name=[u'John\xa3', u'Doe'])
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(multivalued)
     exporter.finish_exporting()
     expected_value = (
         '<?xml version="1.0" encoding="utf-8"?>\n'
         '<items><item><name>'
         '<value>John\xc2\xa3</value><value>Doe</value>'
         '</name></item></items>'
     )
     self.assertEqual(buffer.getvalue(), expected_value)
Example #18
0
class EuropythonXmlExport(object):
    """Item pipeline that exports every item to ``europython_items.xml``."""

    def __init__(self):
        # Open output file per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the output file and start the XML export."""
        output = open('europython_items.xml', 'w+b')
        self.files[spider] = output
        self.exporter = XmlItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file registered for ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` and hand it on unchanged."""
        self.exporter.export_item(item)
        return item
Example #19
0
class TagPipeline(object):
    """Export only the posts that have tags (``etiquetas``) defined."""

    def __init__(self):
        # Hook up the spider open/close signals.
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        """Create ``posts_con_tags.xml`` and start the XML export."""
        # Create the export file
        output = open('posts_con_tags.xml', 'w+b')
        self.file = output

        # Set up the exporter and begin the export
        exporter = XmlItemExporter(output)
        self.exporter = exporter
        exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Export ``item`` when ``item['etiquetas']`` is truthy; always return it."""
        has_tags = item['etiquetas']
        if has_tags:
            # At least one tag is defined: export the item.
            self.exporter.export_item(item)
        return item
Example #20
0
class XmlWritePipeline(object):
    """Item pipeline that exports every item to ``bbsData.xml``."""

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open ``bbsData.xml`` for writing and start the XML export."""
        output = open('bbsData.xml', 'wb')
        self.file = output
        # The attribute name ``expoter`` (sic) is kept as-is: other code may
        # read it.
        self.expoter = XmlItemExporter(output)
        self.expoter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.expoter.finish_exporting()
        self.file.close()

        # process the crawled data, define and call dataProcess function
        # dataProcess('bbsData.xml', 'text.txt')

    def process_item(self, item, spider):
        """Export ``item`` and hand it on unchanged."""
        self.expoter.export_item(item)
        return item
Example #21
0
class XmlExportPipeline(object):
    """Item pipeline that exports every item to ``<spider name>_products.xml``."""

    def __init__(self):
        # Open output file per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.close_spider, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file for ``spider`` and start the XML export."""
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        """Finish the export and close the file registered for ``spider``.

        ``close_spider`` is also the name of the hook Scrapy calls on item
        pipelines by itself, and ``from_crawler`` additionally connects this
        method to the ``spider_closed`` signal, so it can run twice for the
        same spider.  The second call is a no-op instead of writing to an
        already closed file.
        """
        file = self.files.pop(spider, None)
        if file is None:
            return
        self.exporter.finish_exporting()
        file.close()

    def process_item(self, item, spider):
        """Export ``item`` and hand it on unchanged."""
        self.exporter.export_item(item)
        return item
Example #22
0
class XmlExportPipeline(object):
    """Item pipeline that exports every item to ``<spider name>_products.xml``."""

    def __init__(self):
        # Open output file per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the output file for ``spider`` and start the XML export."""
        output = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = XmlItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file registered for ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` and hand it on unchanged."""
        self.exporter.export_item(item)
        return item
Example #23
0
class RueventsPipeline(object):
    """Export items to ``<spider name>_items.xml``, dropping repeated ``event_id``s."""

    def __init__(self):
        # Per-spider set of the event ids seen so far.
        self.duplicates = {}
        # Per-spider open output file.
        self.files = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        """Reset duplicate tracking, open the output file and start exporting."""
        self.duplicates[spider] = set()
        output = open('%s_items.xml' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = XmlItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Forget the spider's seen ids, finish the export and close its file."""
        del self.duplicates[spider]
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` unless its ``event_id`` was already seen.

        Raises ``DropItem`` for a repeated ``event_id``.
        """
        event_id = item['event_id']
        seen = self.duplicates[spider]
        if event_id in seen:
            raise DropItem("Duplicate item found!")

        seen.add(event_id)
        self.exporter.export_item(item)
        return item
Example #24
0
class RueventsPipeline(object):
    """Export items to ``<spider name>_items.xml``, dropping repeated ``event_id``s."""

    def __init__(self):
        # Per-spider set of the event ids seen so far.
        self.duplicates = {}
        # Per-spider open output file.
        self.files = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        """Reset duplicate tracking, open the output file and start exporting."""
        self.duplicates[spider] = set()
        output = open('%s_items.xml' % spider.name, 'w+b')
        self.files[spider] = output
        self.exporter = XmlItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Forget the spider's seen ids, finish the export and close its file."""
        del self.duplicates[spider]
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` unless its ``event_id`` was already seen.

        Raises ``DropItem`` for a repeated ``event_id``.
        """
        event_id = item['event_id']
        seen = self.duplicates[spider]
        if event_id in seen:
            raise DropItem("Duplicate item found!")

        seen.add(event_id)
        self.exporter.export_item(item)
        return item
Example #25
0
class XmlWritePipeline(object):
    """Item pipeline that exports every item to ``bbsData.xml``."""

    def __init__(self):
        pass

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open ``bbsData.xml`` for writing and start the XML export."""
        output = open('bbsData.xml', 'wb')
        self.file = output
        # The attribute name ``expoter`` (sic) is kept as-is: other code may
        # read it.
        self.expoter = XmlItemExporter(output)
        self.expoter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.expoter.finish_exporting()
        self.file.close()
        # process the crawled data, define and call dataProcess function
        # dataProcess('bbsData.xml', 'text.txt')

    def process_item(self, item, spider):
        """Export ``item`` and hand it on unchanged."""
        self.expoter.export_item(item)
        return item
Example #26
0
class XmlExportWithOutLabels(object):
    """Export to ``postUGR_withOutLabel.xml`` the items whose ``etiquetas`` is empty."""

    def __init__(self):
        # Open output file per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        instance = cls()
        crawler.signals.connect(instance.spider_opened, signals.spider_opened)
        crawler.signals.connect(instance.spider_closed, signals.spider_closed)
        return instance

    def spider_opened(self, spider):
        """Open the output file and start the XML export."""
        output = open('postUGR_withOutLabel.xml', 'w+b')
        self.files[spider] = output
        self.exporter = XmlItemExporter(output)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file registered for ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, item, spider):
        """Export ``item`` only when ``item['etiquetas']`` is falsy; always return it."""
        if item['etiquetas']:
            return item
        self.exporter.export_item(item)
        return item
Example #27
0
class XmlExportPipelineWithoutTags(object):
    """Export to ``<spider name>_post_without_tags.xml`` the items that carry no tag."""

    def __init__(self):
        # Open output file per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        pipeline = cls()

        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file for ``spider`` and start the XML export."""
        file = open('%s_post_without_tags.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file registered for ``spider``."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        """Export ``item`` when it has no tag; always return it.

        "No tag" means ``item['tag']`` is empty or its first entry is the
        empty string.
        """
        tags = item['tag']
        # An empty tag list is treated as "no tag" too, instead of raising
        # IndexError on tags[0].
        if not tags or tags[0] == "":
            self.exporter.export_item(item)
        return item
Example #28
0
class guardadoXMLPipeline(object):
    """Export to ``datos.xml`` the items that have an image path."""

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        """Create ``datos.xml`` and start the XML export."""
        # Output file
        output = open('datos.xml', 'w+b')
        self.file = output

        exporter = XmlItemExporter(output)
        self.exporter = exporter
        exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Only process images and discard the references to videos.

        Raises ``DropItem`` when ``item['ruta_imagen']`` is falsy.
        """
        if not item['ruta_imagen']:
            raise DropItem("Este dia no hay imagen  %s" % item)

        self.exporter.export_item(item)
        return item
class OfficielebekendmakingenPipeline(object):
    """Write every item to a timestamped item-log XML file under ``LOG_DIR``."""

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Open output file per spider.
        self.files = {}

    def spider_opened(self, spider):
        """Open the item log for ``spider`` and start the XML export.

        The file is ``<LOG_DIR>/itemlog_<YYYY-MM-DD_HH_MM>_<domain_name>.xml``.
        """
        opened_at = datetime.today()
        log_path = "%s/itemlog_%s_%s.xml" % (
            settings.get("LOG_DIR"),
            opened_at.strftime("%Y-%m-%d_%H_%M"),
            spider.domain_name,
        )
        item_log = open(log_path, "w+b")
        self.files[spider] = item_log
        self.exporter = XmlItemExporter(item_log)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the file registered for ``spider``."""
        self.exporter.finish_exporting()
        self.files.pop(spider).close()

    def process_item(self, spider, item):
        """Collapse each truthy field to its first element, export and return ``item``.

        NOTE(review): the parameters are ordered ``(spider, item)``, unlike
        the ``(item, spider)`` order used by the other pipelines here;
        presumably this targets an older Scrapy API -- confirm against the
        Scrapy version in use.
        """
        for field in item:
            value = item[field]
            if value:
                item[field] = value[0]
        self.exporter.export_item(item)
        return item
Example #30
0
class conEtiquetaPipeline(object):
    """Send the entries that have a tag to ``entradas_etiquetadas.xml``."""

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        """Create ``entradas_etiquetadas.xml`` and start the XML export."""
        # Output file
        self.file = open('entradas_etiquetadas.xml', 'w+b')

        self.exporter = XmlItemExporter(self.file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the export and close the output file."""
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        """Export ``item`` when ``item['etiquetas']`` is truthy; always return it."""
        # Body re-indented with spaces: the original mixed tabs and spaces
        # here, which Python 3 rejects with a TabError.
        if item['etiquetas']:
            self.exporter.export_item(item)
        return item
Example #31
0
class XmlExportPipeline(object):
    """Split items into two XML files depending on whether they have tags.

    Items with a non-empty ``tags`` field go to ``entradas_con_tags.xml``,
    the others to ``entradas_sin_tags.xml``.
    """

    def __init__(self):
        self.files_si = open("entradas_con_tags.xml", "w+b")
        self.files_no = open("entradas_sin_tags.xml", "w+b")
        self.exporter_si = XmlItemExporter(self.files_si)
        self.exporter_no = XmlItemExporter(self.files_no)
        self.exporter_si.start_exporting()
        self.exporter_no.start_exporting()

    def close_spider(self, spider):
        """Finish both exports and close both files.

        Scrapy calls this pipeline hook when the spider closes.  Without it
        the exporters are never finished, so the XML documents are left
        without their closing root element and the files stay open.
        """
        outputs = (
            (self.exporter_si, self.files_si),
            (self.exporter_no, self.files_no),
        )
        for exporter, output in outputs:
            try:
                exporter.finish_exporting()
            finally:
                output.close()

    def process_item(self, item, spider):
        """Route ``item`` to the matching exporter and return it unchanged."""
        if len(item["tags"]) == 0:
            self.exporter_no.export_item(item)
        else:
            self.exporter_si.export_item(item)
        return item
 def test_nested_list_item(self):
     """A field holding a list of items is exported as nested ``<value>`` elements."""
     buffer = StringIO()
     first = TestItem(name=u'foo')
     second = TestItem(name=u'bar')
     parent = TestItem(name=u'buz', age=[first, second])
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(parent)
     exporter.finish_exporting()
     expected_value = (
         '<?xml version="1.0" encoding="utf-8"?>\n'
         '<items><item>'
         '<age>'
         '<value><name>foo</name></value>'
         '<value><name>bar</name></value>'
         '</age>'
         '<name>buz</name>'
         '</item></items>'
     )
     self.assertEqual(buffer.getvalue(), expected_value)
Example #33
0
 def test_nested_item(self):
     """Items nested as field values are exported as nested elements."""
     buffer = BytesIO()
     innermost = TestItem(name=u'foo\xa3hoo', age='22')
     middle = TestItem(name=u'bar', age=innermost)
     outer = TestItem(name=u'buz', age=middle)
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(outer)
     exporter.finish_exporting()
     expected_value = (
         '<?xml version="1.0" encoding="utf-8"?>\n'
         '<items><item>'
         '<age>'
         '<age>'
         '<age>22</age>'
         '<name>foo\xc2\xa3hoo</name>'
         '</age>'
         '<name>bar</name>'
         '</age>'
         '<name>buz</name>'
         '</item></items>'
     )
     self.assertXmlEquivalent(buffer.getvalue(), expected_value)
 def test_nested_list_item(self):
     """A field holding a list of items is exported as nested ``<value>`` elements."""
     buffer = StringIO()
     first = TestItem(name=u'foo')
     second = TestItem(name=u'bar')
     parent = TestItem(name=u'buz', age=[first, second])
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(parent)
     exporter.finish_exporting()
     expected_value = (
         '<?xml version="1.0" encoding="utf-8"?>\n'
         '<items><item>'
         '<age>'
         '<value><name>foo</name></value>'
         '<value><name>bar</name></value>'
         '</age>'
         '<name>buz</name>'
         '</item></items>'
     )
     self.assertEqual(buffer.getvalue(), expected_value)
Example #35
0
class XmlExportWithLabels(object):
    """Export to ``postUGR_withLabel.xml`` the items that have ``etiquetas``.

    When the spider closes, ``postUGR_items.csv`` is additionally rendered
    as an HTML table in ``postUGR_items.html``.
    """

    def __init__(self):
        # Open output file per spider.
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        """Create the pipeline and subscribe it to the open/close signals."""
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        """Open the output file and start the XML export."""
        file = open('postUGR_withLabel.xml', 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        """Finish the XML export, then convert the CSV file to an HTML table."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

        # Build the HTML table.  Both files are managed by ``with`` so they
        # are closed even if reading the CSV fails.
        with open('postUGR_items.html', "w") as html_out:
            html_out.write("<!DOCTYPE html>\n<html>\n<header>\n<title>Fichero CSV a HTML</title>\n</header>\n<body>\n<table border=\"1\">\n")

            # NOTE(review): 'rb' suits the Python 2 csv module; on Python 3
            # csv.reader needs a text-mode file -- confirm the target version.
            with open('postUGR_items.csv', "rb") as csv_in:
                for row in csv.reader(csv_in):
                    html_out.write("<tr>")
                    for col in row:
                        # NOTE(review): cell text is written unescaped;
                        # consider HTML-escaping it if the CSV holds
                        # scraped content.
                        html_out.write("<td>")
                        html_out.write(col)
                        html_out.write("</td>")
                    # Close every row; previously a single </tr> was written
                    # after the loop, leaving all other rows unterminated.
                    html_out.write("</tr>")

            html_out.write("</table>\n</body>\n</html>")

    def process_item(self, item, spider):
        """Export ``item`` when ``item['etiquetas']`` is truthy; always return it."""
        if item['etiquetas']:
            self.exporter.export_item(item)
        return item
Example #36
0
class OslpostPipeline(object):
    """Export items to ``items.xml`` after reducing ``images`` to image URLs."""

    # Matches the URL inside ``<img src="...``; compiled once for the class
    # instead of once per item.
    _IMG_SRC_RE = re.compile(r'<img src="([^"]+)', re.I | re.MULTILINE | re.DOTALL)

    # Constructor
    def __init__(self):
        # Connect the spider signals
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # File dictionary (in case there are several spiders)
        self.files = {}

    # Handler for the spider_opened signal
    def spider_opened(self, spider):
        """Create ``items.xml`` for ``spider`` and start the XML export."""
        # Create the .xml file.  It is opened in binary mode because the
        # exporter writes encoded bytes, not text.
        file = open('items.xml', 'wb')
        # Register it in the dictionary
        self.files[spider] = file
        # Set up the XML exporter
        self.exporter = XmlItemExporter(file)
        # Begin exporting
        self.exporter.start_exporting()

    # Handler for the spider_closed signal
    def spider_closed(self, spider):
        """Finish the export and close the file registered for ``spider``."""
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    # Item processor
    def process_item(self, item, spider):
        """Replace ``item['images']`` with the extracted URLs, export and return it.

        Only the first element of each ``images`` entry is searched.  The
        other fields are left untouched so that links and complex objects
        (videos, presentations, etc.) are not lost.
        """
        pattern = self._IMG_SRC_RE
        # Collect every image URL found, in order.
        links = [
            link
            for entry in item['images']
            for link in pattern.findall(entry[0])
        ]
        # Store the cleaned-up field
        item['images'] = links
        # Export the item
        self.exporter.export_item(item)
        return item
 def test_nested_list_item(self):
     """A field holding a list of items is exported as nested ``<value>`` elements."""
     buffer = StringIO()
     first = TestItem(name=u"foo")
     second = TestItem(name=u"bar")
     parent = TestItem(name=u"buz", age=[first, second])
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(parent)
     exporter.finish_exporting()
     expected_parts = [
         '<?xml version="1.0" encoding="utf-8"?>\n',
         "<items><item>",
         "<age>",
         "<value><name>foo</name></value>",
         "<value><name>bar</name></value>",
         "</age>",
         "<name>buz</name>",
         "</item></items>",
     ]
     expected_value = "".join(expected_parts)
     self.assertXmlEquivalent(buffer.getvalue(), expected_value)
Example #38
0
	def process_item(self, item, spider):
		"""Append ``item`` as an XML document to its per-episode file and return it.

		The file lives under
		``<ADAPTFM_OUTPUT_PATH><spider.folder>/<brandCategory[0]>`` and is
		named after the brand feed URL and the episode title.
		"""
		outputdir = '%s%s/%s' % (settings['ADAPTFM_OUTPUT_PATH'], spider.folder, item['brandCategory'][0])
		name = item['brandFeed'][0].replace('http://','').replace('/','_').replace('.xml','')
		# NOTE(review): unlike the other fields, episodeTitle is used without
		# [0] -- confirm that it is a plain string and not a list.
		episode = item['episodeTitle']
		filename = '%s/%s-%s.xml' % (outputdir, name, episode)

		# makedirs also creates missing parent directories, which a plain
		# mkdir would fail on.
		target_dir = os.path.dirname(filename)
		if not os.path.isdir(target_dir):
			os.makedirs(target_dir)

		output = open(filename, 'a+b')
		try:
			self.exporter = XmlItemExporter(output)
			self.exporter.start_exporting()
			self.exporter.export_item(item)
			self.exporter.finish_exporting()
		finally:
			# Release the handle even when the export raises.
			output.close()

		return item
Example #39
0
 def test_nested_item(self):
     """Items nested as field values are exported as nested elements."""
     buffer = StringIO()
     innermost = TestItem(name=u'foo\xa3hoo', age='22')
     middle = TestItem(name=u'bar', age=innermost)
     outer = TestItem(name=u'buz', age=middle)
     exporter = XmlItemExporter(buffer)
     exporter.start_exporting()
     exporter.export_item(outer)
     exporter.finish_exporting()
     expected_value = (
         '<?xml version="1.0" encoding="utf-8"?>\n'
         '<items><item>'
         '<age>'
         '<age>'
         '<age>22</age>'
         '<name>foo\xc2\xa3hoo</name>'
         '</age>'
         '<name>bar</name>'
         '</age>'
         '<name>buz</name>'
         '</item></items>'
     )
     self.assertXmlEquivalent(buffer.getvalue(), expected_value)
Example #40
0
 def spider_opened(self, spider):
     """Open ``<spider name>_products.xml`` and start an XML export into it.

     The file handle is stored in ``self.files[spider]``.
     """
     output = open('%s_products.xml' % spider.name, 'w+b')
     self.files[spider] = output

     exporter = XmlItemExporter(output)
     self.exporter = exporter
     exporter.start_exporting()
Example #41
0
 def spider_opened(self, spider):
     """Open ``bbsData.xml`` for writing and start an XML export into it."""
     output = open('bbsData.xml', 'wb')
     self.file = output

     # The attribute name ``expoter`` (sic) is kept as-is: other code may
     # read it.
     exporter = XmlItemExporter(output)
     self.expoter = exporter
     exporter.start_exporting()
Example #42
0
 def spider_opened(self, spider):
     """Reset duplicate tracking for ``spider`` and start its XML export.

     The output file is ``<spider name>_items.xml``; its handle is stored
     in ``self.files[spider]``.
     """
     self.duplicates[spider] = set()

     output = open('%s_items.xml' % spider.name, 'w+b')
     self.files[spider] = output

     exporter = XmlItemExporter(output)
     self.exporter = exporter
     exporter.start_exporting()
 def _get_exporter(self, **kwargs):
     """Return an ``XmlItemExporter`` on ``self.output``, forwarding ``kwargs``."""
     exporter = XmlItemExporter(self.output, **kwargs)
     return exporter
Example #44
0
 def spider_opened(self, spider):
     """Open ``postUGR_withLabel.xml`` and start an XML export into it.

     The file handle is stored in ``self.files[spider]``.
     """
     output = open('postUGR_withLabel.xml', 'w+b')
     self.files[spider] = output

     exporter = XmlItemExporter(output)
     self.exporter = exporter
     exporter.start_exporting()
Example #45
0
    def create_xml(self, spider):
        """Open the XML dump file for ``spider`` and start exporting into it.

        The path comes from ``self.get_xml_path(spider)``; the handle is
        stored in ``self.files[spider]``.
        """
        output = open(self.get_xml_path(spider), 'w+b')
        self.files[spider] = output

        self.exporter = XmlItemExporter(
            output,
            root_element="products",
            item_element="product",
        )
        self.exporter.start_exporting()