Example 1
0
 def parse(self, response):
     """Parse a group wall page.

     Yields loaded VkItem objects for posts (and visible replies) that
     contain spam words, schedules requests for hidden replies, and
     finally requests the next page of messages.
     """
     sel = response.xpath('.//*[@class="post_info"]')
     if not sel:
         # Nothing to parse on this page; stop without scheduling more work.
         self.log('posts are not found')
         return
     # Group id comes from the followers link href; raw string avoids the
     # invalid "\d" escape warning of a plain string literal.
     self.group_id = response.xpath(
         './/div[@id="group_followers"]/a/@href').re(r'group.=(\d+?)$')[0]
     for post in sel:
         wall_text = post.xpath('div[@class="wall_text"]')
         text = wall_text.xpath('div/div[@class="wall_post_text"]').extract()
         spam_words = get_spam_words_from_msg(text, self.spam_words_from_file)
         if spam_words:
             loader = ItemLoader(item=VkItem(), selector=post,
                                 response=response)
             date = post.xpath(
                 'div[@class="replies"]/div/small/a[1]/span/text()').extract()
             date = loader.get_value(date, MapCompose(normalize_date),
                                     TakeFirst())
             # Assumes posts are ordered newest-first, so the first post
             # older than the configured window ends the crawl -- TODO
             # confirm the wall ordering.
             if is_date_less_last_date(date, self.days_count_to_parse):
                 return
             loader.add_value('id',
                              wall_text.xpath('div/a/@data-from-id').extract())
             loader.add_value('name', wall_text.xpath('div/a/text()').extract())
             loader.add_value('text', text)
             loader.add_value('date', date)
             loader.add_value('words', spam_words)
             yield loader.load_item()
             # TODO: ban the author => yield Request(...)
         replies_hidden = post.xpath('.//a[@class="wr_header"]/@onclick')
         if replies_hidden:
             # Collapsed replies require a separate request to fetch.
             url = get_url_hided_replies(replies_hidden[0].extract(),
                                         self.main_page)
             yield Request(url=url, callback=self.get_hided_items)
         else:
             replies = post.xpath('.//div[@class="reply_table"]').extract()
             for reply in replies:
                 # Re-wrap each reply snippet as its own HtmlResponse so the
                 # reply extractor can process it in isolation.
                 raw_html = ''.join(reply.splitlines()).encode('utf-8')
                 html_response = HtmlResponse(url=response.url, body=raw_html)
                 for reply_loader in self.get_replies_items(html_response):
                     yield reply_loader.load_item()
     # Paginate: POST for the next batch of messages and parse it the same way.
     yield Request(url=self.get_next_msgs_url(), method='POST',
                   callback=self.parse,
                   body=self.get_post_body_for_next_msgs())
Example 2
0
def testing():
    il = ItemLoader(item=CarPart())
    il.add_value('price', [u'1800 р.'])#, re='(\d+)')

    url = il.get_value(['/qqq/www'], TakeFirst())
    base_url = 'http://host.ru'
    fin_url = '/'.join([base_url, url])
    print 'fin: ', fin_url
    il.add_value('shop_url', fin_url)
    il.add_value('info', 'some_info')

    item = il.load_item()
    print type(item), dir(item)
    print item.has_key('brand')
Example 3
0
 def get_replies_items(self, response):
     """Extract reply items containing spam words from a replies fragment.

     Scans the raw (possibly JS-escaped) reply markup with a regex and
     returns a list of populated ItemLoader objects (not yet loaded).
     """
     reply_re = re.compile(
         r'id.{2,3}reply_delete-([\d_]+?)\\?".+?'
         r'data-from-id.{2,3}?([\d\-]+?)\\?".*?\>(.+?)\<\\?/a\>.+?'
         r'wall_reply_text.{1,2}\>?(.+?)\<\\?/div\>.+?'
         r'rel_date.*?"\>(.+?)\<\\?/span\>')
     loaders = []
     for match in reply_re.finditer(response.body_as_unicode()):
         reply_text = match.group(4)
         words = get_spam_words_from_msg([reply_text],
                                         self.spam_words_from_file)
         if not words:
             continue
         loader = ItemLoader(item=VkItem(), response=response)
         loader.add_value('id', match.group(2))
         loader.add_value('name', match.group(3))
         loader.add_value('text', reply_text)
         loader.add_value('date',
                          loader.get_value(match.group(5),
                                           MapCompose(normalize_date),
                                           TakeFirst()))
         loader.add_value('words', words)
         loaders.append(loader)
     return loaders
Example 4
0
 def parse(self, response):
     """Parse a group wall page.

     Yields loaded VkItem objects for posts (and visible replies) that
     contain spam words, schedules requests for hidden replies, and
     finally requests the next page of messages.
     """
     sel = response.xpath('.//*[@class="post_info"]')
     if not sel:
         # Nothing to parse on this page; stop without scheduling more work.
         self.log('posts are not found')
         return
     # Group id comes from the followers link href; raw string avoids the
     # invalid "\d" escape warning of a plain string literal.
     self.group_id = response.xpath(
         './/div[@id="group_followers"]/a/@href').re(r'group.=(\d+?)$')[0]
     for post in sel:
         wall_text = post.xpath('div[@class="wall_text"]')
         text = wall_text.xpath('div/div[@class="wall_post_text"]').extract()
         spam_words = get_spam_words_from_msg(text, self.spam_words_from_file)
         if spam_words:
             loader = ItemLoader(item=VkItem(), selector=post,
                                 response=response)
             date = post.xpath(
                 'div[@class="replies"]/div/small/a[1]/span/text()').extract()
             date = loader.get_value(date, MapCompose(normalize_date),
                                     TakeFirst())
             # Assumes posts are ordered newest-first, so the first post
             # older than the configured window ends the crawl -- TODO
             # confirm the wall ordering.
             if is_date_less_last_date(date, self.days_count_to_parse):
                 return
             loader.add_value('id',
                              wall_text.xpath('div/a/@data-from-id').extract())
             loader.add_value('name', wall_text.xpath('div/a/text()').extract())
             loader.add_value('text', text)
             loader.add_value('date', date)
             loader.add_value('words', spam_words)
             yield loader.load_item()
             # TODO: ban the author => yield Request(...)
         replies_hidden = post.xpath('.//a[@class="wr_header"]/@onclick')
         if replies_hidden:
             # Collapsed replies require a separate request to fetch.
             url = get_url_hided_replies(replies_hidden[0].extract(),
                                         self.main_page)
             yield Request(url=url, callback=self.get_hided_items)
         else:
             replies = post.xpath('.//div[@class="reply_table"]').extract()
             for reply in replies:
                 # Re-wrap each reply snippet as its own HtmlResponse so the
                 # reply extractor can process it in isolation.
                 raw_html = ''.join(reply.splitlines()).encode('utf-8')
                 html_response = HtmlResponse(url=response.url, body=raw_html)
                 for reply_loader in self.get_replies_items(html_response):
                     yield reply_loader.load_item()
     # Paginate: POST for the next batch of messages and parse it the same way.
     yield Request(url=self.get_next_msgs_url(), method='POST',
                   callback=self.parse,
                   body=self.get_post_body_for_next_msgs())
Example 5
0
 def get_replies_items(self, response):
     """Extract reply items containing spam words from a replies fragment.

     Scans the raw (possibly JS-escaped) reply markup with a regex and
     returns a list of populated ItemLoader objects (not yet loaded).
     """
     reply_re = re.compile(
         r'id.{2,3}reply_delete-([\d_]+?)\\?".+?'
         r'data-from-id.{2,3}?([\d\-]+?)\\?".*?\>(.+?)\<\\?/a\>.+?'
         r'wall_reply_text.{1,2}\>?(.+?)\<\\?/div\>.+?'
         r'rel_date.*?"\>(.+?)\<\\?/span\>')
     loaders = []
     for match in reply_re.finditer(response.body_as_unicode()):
         reply_text = match.group(4)
         words = get_spam_words_from_msg([reply_text],
                                         self.spam_words_from_file)
         if not words:
             continue
         loader = ItemLoader(item=VkItem(), response=response)
         loader.add_value('id', match.group(2))
         loader.add_value('name', match.group(3))
         loader.add_value('text', reply_text)
         loader.add_value('date',
                          loader.get_value(match.group(5),
                                           MapCompose(normalize_date),
                                           TakeFirst()))
         loader.add_value('words', words)
         loaders.append(loader)
     return loaders