コード例 #1
0
ファイル: olx_spider.py プロジェクト: rahulverma/scrap_olx
    def get_time(self, date_string):
        """Convert the date text of an ad detail page into a date.

        A ``date_string`` containing ``'Today'`` or ``'Yesterday'`` is
        resolved relative to ``date_today()``.  Any other value must be a
        ``dd/mm/YYYY`` date, optionally surrounded by whitespace.

        Raises:
            ValueError: if the text is neither a relative marker nor a
                ``dd/mm/YYYY`` date.
        """
        if 'Today' in date_string:
            return date_today()
        if 'Yesterday' in date_string:
            return date_today() - timedelta(1)

        # The caller passes a raw text node; strptime rejects any leading or
        # trailing whitespace, so trim it before parsing.
        return datetime.strptime(date_string.strip(), '%d/%m/%Y').date()
コード例 #2
0
ファイル: olx_spider.py プロジェクト: rahulverma/scrap_olx
 def get_list_page_date(self, date_string_list):
     """Extract the posting date from the text fragments of a list entry.

     The fragments are joined and searched for, in order: the word
     ``'Today'``, the word ``'Yesterday'``, or the first ``<day> <month>``
     pair (e.g. ``'12 Mar'``, month as parsed by ``%b``).  A day/month
     pair carries no year, so the year is taken from ``self.get_year``.

     Raises:
         ValueError: if no date can be found in the text, or the
             day/month pair is not a valid date.
     """
     date_string = ' \n '.join(date_string_list)
     if 'Today' in date_string:
         return date_today()
     if 'Yesterday' in date_string:
         return date_today() - timedelta(1)

     match = search(r"[^0-9]*([0-9]+\s[a-zA-Z]+)", date_string, flags=DOTALL)
     if not match:
         raise ValueError("Did not find the date")

     # strptime defaults a missing year to 1900, which is not a leap year,
     # so '29 Feb' would be rejected.  Parse against a fixed leap year
     # instead; the placeholder year is replaced just below.
     day_month = datetime.strptime(match.group(1) + ' 2000', '%d %b %Y').date()
     return day_month.replace(year=self.get_year(day_month))
コード例 #3
0
ファイル: olx_spider.py プロジェクト: rahulverma/scrap_olx
    def extract_item_detail(self, selector, path):
        """Build an ``olxItem`` from the selector of an ad detail page.

        Every field falls back to an empty/zero default when the matching
        element is missing:

        * ``url``   - ``path`` as given.
        * ``title`` - first title text, stripped (default ``''``).
        * ``name``  - first ``<strong>`` text of the contact box (``''``).
        * ``phone`` - first 8-12 character run of digits/'+' in the contact
          box, as an int with '+' removed (``0``).
        * ``price`` - first digit/comma run in the price box, as an int
          with commas removed (``0``).
        * ``time``  - ``self.get_time`` of the first time text
          (``date_today()``).
        * ``types`` - optional-attribute texts not starting with 'Type'
          (``[]``).
        * ``image`` - stripped hrefs of the image viewer links (``[]``).
        * ``desc``  - cleaned description text (``''``).
        """
        item = olxItem()
        item['url'] = path

        title = selector.css('div#item-title-top span:first-child ::text').extract()
        item['title'] = title[0].strip() if title else ''

        name_tel = selector.css('#item-data .name-tel:first-child')
        name = name_tel.css('strong ::text').extract()
        item['name'] = name[0] if name else ''

        phone = name_tel.css('::text').re(r'[0-9\+]{8,12}')
        item['phone'] = int(phone[0].replace('+', '')) if phone else 0

        price = selector.css('div.price ::text').re(r'[0-9][0-9,]+')
        item['price'] = int(price[0].replace(',', '')) if price else 0

        time_texts = selector.css('.time-info ::text').extract()
        item['time'] = self.get_time(time_texts[0]) if time_texts else date_today()

        types = selector.css('#description-text .optionals ::text').re(r'^(?!Type).+$')
        item['types'] = types or []

        # A list comprehension instead of ``map(unicode.strip, ...)``: the
        # latter only works on Python 2 (``unicode`` is undefined on Python 3,
        # where ``map`` also returns an always-truthy iterator, not a list).
        hrefs = selector.css('#big-viewer a').xpath("@href").extract()
        item['image'] = [href.strip() for href in hrefs]

        desc = ''.join(selector.css('#description-text ::text').extract())
        # Remove the text running from "When you call" to "OLX.in".
        desc = sub(r"When you call.*OLX.in", '', desc)
        # Remove a "Type:" line together with the line that follows it.
        desc = sub(r"Type:$\n.*$", "", desc, flags=MULTILINE)
        # Collapse runs of spaces and blank lines, then trim edge newlines.
        desc = sub(r' +', ' ', desc)
        desc = sub(r'\n+', '\n', desc)
        desc = sub(r"(^\n+|\n+$)", "", desc)
        item['desc'] = desc or ''

        return item
コード例 #4
0
ファイル: olx_spider.py プロジェクト: rahulverma/scrap_olx
 def get_year(self, date):
     """Return the most recent year in which ``date``'s day/month is not in the future.

     Only the month and day of ``date`` are used; its own year is ignored.
     For ordinary dates the result is the current year, or the previous
     year when that day/month has not yet occurred this year.  For 29 Feb
     the search continues back to the latest leap year, so the returned
     year is always one the caller can pass to ``date.replace(year=...)``.

     Raises:
         ValueError: if no suitable year exists (only possible for years
             near ``datetime.MINYEAR``).
     """
     today = date_today()
     # Consecutive leap years are at most 8 years apart (e.g. 2096 -> 2104),
     # so nine candidate years always include a valid one for 29 Feb.
     for year in range(today.year, today.year - 9, -1):
         try:
             candidate = date.replace(year=year)
         except ValueError:
             # 29 Feb does not exist in this non-leap year; look further back.
             continue
         if candidate <= today:
             return year
     raise ValueError("No valid year found for %s" % (date,))