Ejemplo n.º 1
0
def csviter(obj, delimiter=None, headers=None, encoding=None):
    """Return an iterator of dictionaries built from the rows of a csv object.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields on the given obj;
    when it is falsy the csv module's default delimiter is used.

    headers is an iterable that, when provided, offers the keys for the
    returned dictionaries; if it is not provided the first row is used.

    encoding is used to decode every field. It is ignored when obj is a
    TextResponse (obj.encoding is used instead) and defaults to 'utf-8'.

    Rows whose number of fields differs from the number of headers are
    skipped and reported with a warning.
    """
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def _decode_row(raw_row):
        return [str_to_unicode(field, encoding) for field in raw_row]

    lines = StringIO(body_or_str(obj, unicode=False))
    if delimiter:
        csv_r = csv.reader(lines, delimiter=delimiter)
    else:
        csv_r = csv.reader(lines)

    if not headers:
        # An empty document has no header row. End the generator explicitly
        # instead of letting StopIteration escape from its body, which
        # PEP 479 turns into a RuntimeError on newer interpreters.
        try:
            headers = _decode_row(next(csv_r))
        except StopIteration:
            return

    # Iterating the reader ends cleanly at end of input.
    for raw_row in csv_r:
        row = _decode_row(raw_row)
        if len(row) != len(headers):
            log.msg(format="ignoring row %(csvlnum)d (length: %(csvrow)d, should be: %(csvheader)d)",
                    level=log.WARNING, csvlnum=csv_r.line_num, csvrow=len(row), csvheader=len(headers))
            continue
        yield dict(zip(headers, row))
Ejemplo n.º 2
0
def csviter(obj, delimiter=None, headers=None, encoding=None):
    """Return an iterator of dictionaries built from the rows of a csv object.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    delimiter is the character used to separate fields on the given obj;
    when it is falsy the csv module's default delimiter is used.

    headers is an iterable that, when provided, offers the keys for the
    returned dictionaries; if it is not provided the first row is used.

    encoding is used to decode every field. It is ignored when obj is a
    TextResponse (obj.encoding is used instead) and defaults to 'utf-8'.

    Rows whose number of fields differs from the number of headers are
    skipped and reported with a warning.
    """
    encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'

    def _decode_row(raw_row):
        return [str_to_unicode(field, encoding) for field in raw_row]

    lines = StringIO(body_or_str(obj, unicode=False))
    if delimiter:
        csv_r = csv.reader(lines, delimiter=delimiter)
    else:
        csv_r = csv.reader(lines)

    if not headers:
        # An empty document has no header row. End the generator explicitly
        # instead of letting StopIteration escape from its body, which
        # PEP 479 turns into a RuntimeError on newer interpreters.
        try:
            headers = _decode_row(next(csv_r))
        except StopIteration:
            return

    # Iterating the reader ends cleanly at end of input.
    for raw_row in csv_r:
        row = _decode_row(raw_row)
        if len(row) != len(headers):
            log.msg("ignoring row %d (length: %d, should be: %d)" % (csv_r.line_num, len(row), len(headers)), log.WARNING)
            continue
        yield dict(zip(headers, row))
Ejemplo n.º 3
0
 def parseList(self, response):
     """Yield one request per <loc> element found in the response text.

     Every request is created with self.parse_items as its callback.
     """
     tag = 'loc'
     body = body_or_str(response)
     loc_re = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (tag, tag), re.DOTALL)
     for found in loc_re.finditer(body):
         # Group 2 is whatever sits between the opening and closing tag.
         yield scrapy.Request(found.group(2), self.parse_items)
Ejemplo n.º 4
0
	def parse(self, response):
		"""Yield one request per non-empty <loc> element in the response text.

		Every request is created with self.parse_page as its callback.
		"""
		tag = 'loc'
		body = body_or_str(response)
		loc_re = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (tag, tag), re.DOTALL)
		for found in loc_re.finditer(body):
			# Group 2 is whatever sits between the opening and closing tag.
			link = found.group(2)
			if link == '':
				# Nothing to request for an empty element.
				continue
			yield Request(link, callback=self.parse_page)
Ejemplo n.º 5
0
    def test_body_or_str_encoding(self):
        """Check the result type of body_or_str for both unicode flags.

        For a response, a plain string and a unicode string alike,
        unicode=False must give a str and unicode=True a unicode object.
        """
        for source in (self.dummy_response, 'text', u'text'):
            as_bytes = body_or_str(source, unicode=False)
            self.assertTrue(isinstance(as_bytes, str))
            as_text = body_or_str(source, unicode=True)
            self.assertTrue(isinstance(as_text, unicode))
Ejemplo n.º 6
0
    def test_body_or_str_encoding(self):
        """Check the result type of body_or_str for both unicode flags.

        For a response, a plain string and a unicode string alike,
        unicode=False must give a str and unicode=True a unicode object.
        """
        expected_types = ((False, str), (True, unicode))
        for source in (self.dummy_response, 'text', u'text'):
            for want_unicode, expected in expected_types:
                result = body_or_str(source, unicode=want_unicode)
                self.assertTrue(isinstance(result, expected))
Ejemplo n.º 7
0
 def parse(self, response):
     """Yield a request for every <loc> entry whose text contains 'breed'.

     The response text is scanned for <loc>...</loc> elements; entries
     that do not contain 'breed' are ignored. Every request is created
     with self.parse_detail as its callback.
     """
     nodename = 'loc'
     text = body_or_str(response)
     r = re.compile(r"(<%s[\s>])(.*?)(</%s>)" % (nodename, nodename), re.DOTALL)
     for match in r.finditer(text):
         # Group 2 is whatever sits between the opening and closing tag.
         url = match.group(2)
         if 'breed' not in url:
             continue
         # dont_filter=True asks Scrapy not to drop this request as a
         # duplicate of one it has already seen.
         yield Request(url=url, callback=self.parse_detail, dont_filter=True)
Ejemplo n.º 8
0
def xmliter(obj, nodename):
    """Return an iterator of XPathSelector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    Each matched node is parsed on its own, wrapped between the text that
    precedes the first opening tag of the document and the trailing text
    found after the position reported by re_rsearch for the closing tag.
    """
    # nodename is literal text, not a pattern: escape it so characters such
    # as '.' in a node name cannot act as regex metacharacters.
    nodename_patt = re.escape(nodename)
    header_start_re = re.compile(r"^(.*?)<\s*%s(?:\s|>)" % nodename_patt, re.S)
    header_end_re = re.compile(r"<\s*/%s\s*>" % nodename_patt, re.S)
    node_re = re.compile(r"<%s[\s>].*?</%s>" % (nodename_patt, nodename_patt), re.DOTALL)
    text = body_or_str(obj)

    header_start = header_start_re.search(text)
    header_start = header_start.group(1).strip() if header_start else ""
    # NOTE(review): re_rsearch presumably returns (start, end) offsets of the
    # last closing tag, or a falsy value when there is none -- confirm.
    header_end = re_rsearch(header_end_re, text)
    header_end = text[header_end[1]:].strip() if header_end else ""

    for match in node_re.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield XmlXPathSelector(text=nodetext).select("//" + nodename)[0]
Ejemplo n.º 9
0
def xmliter(obj, nodename):
    """Return an iterator of Selector's over all nodes of a XML document,
       given the name of the node to iterate. Useful for parsing XML feeds.

    obj can be:
    - a Response object
    - a unicode string
    - a string encoded as utf-8

    Each matched node is parsed on its own, wrapped between the text that
    precedes the first opening tag of the document and the trailing text
    found after the position reported by re_rsearch for the closing tag.
    """
    # nodename is literal text, not a pattern: escape it so characters such
    # as '.' in a node name cannot act as regex metacharacters.
    nodename_patt = re.escape(nodename)
    header_start_re = re.compile(r'^(.*?)<\s*%s(?:\s|>)' % nodename_patt, re.S)
    header_end_re = re.compile(r'<\s*/%s\s*>' % nodename_patt, re.S)
    node_re = re.compile(r"<%s[\s>].*?</%s>" % (nodename_patt, nodename_patt), re.DOTALL)
    text = body_or_str(obj)

    header_start = header_start_re.search(text)
    header_start = header_start.group(1).strip() if header_start else ''
    # NOTE(review): re_rsearch presumably returns (start, end) offsets of the
    # last closing tag, or a falsy value when there is none -- confirm.
    header_end = re_rsearch(header_end_re, text)
    header_end = text[header_end[1]:].strip() if header_end else ''

    for match in node_re.finditer(text):
        nodetext = header_start + match.group() + header_end
        yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]
Ejemplo n.º 10
0
 def test_body_or_str_extraction(self):
     """Check the value body_or_str gives for a response and for a string."""
     cases = (
         (self.dummy_response, 'dummy_response'),
         ('text', 'text'),
     )
     for given, expected in cases:
         self.assertEqual(body_or_str(given), expected)
Ejemplo n.º 11
0
 def test_body_or_str_input(self):
     """Check which inputs body_or_str accepts and which it rejects.

     A response and a plain string must both give a basestring; an
     integer must make the call raise.
     """
     for accepted in (self.dummy_response, 'text'):
         self.assertTrue(isinstance(body_or_str(accepted), basestring))
     with self.assertRaises(Exception):
         body_or_str(2)
Ejemplo n.º 12
0
 def test_body_or_str_extraction(self):
     """Check the value body_or_str gives for a response and for a string."""
     from_response = body_or_str(self.dummy_response)
     self.assertEqual(from_response, 'dummy_response')

     from_string = body_or_str('text')
     self.assertEqual(from_string, 'text')
Ejemplo n.º 13
0
 def test_body_or_str_input(self):
     """Check which inputs body_or_str accepts and which it rejects.

     A response and a plain string must both give a basestring; an
     integer must make the call raise.
     """
     from_response = body_or_str(self.dummy_response)
     self.assertTrue(isinstance(from_response, basestring))

     from_string = body_or_str('text')
     self.assertTrue(isinstance(from_string, basestring))

     # Anything that is neither a response nor a string is rejected.
     self.assertRaises(Exception, body_or_str, 2)
Ejemplo n.º 14
0
 def test_body_or_str_extraction(self):
     """Check the value body_or_str gives for a response and for a string."""
     expectations = [
         (self.dummy_response, "dummy_response"),
         ("text", "text"),
     ]
     for source, wanted in expectations:
         extracted = body_or_str(source)
         self.assertEqual(extracted, wanted)