def get_time(self, date_string):
    """Convert a date label from an item page into a date.

    A string containing 'Today' maps to ``date_today()`` and one containing
    'Yesterday' maps to the day before that. Anything else is parsed as
    ``dd/mm/YYYY`` after surrounding whitespace has been removed.

    Raises:
        ValueError: if the string is not 'Today'/'Yesterday' and does not
            match ``%d/%m/%Y``.
    """
    if 'Today' in date_string:
        return date_today()
    if 'Yesterday' in date_string:
        return date_today() - timedelta(1)
    # strptime rejects leading/trailing whitespace, and text nodes taken
    # straight from a page may carry it, so strip before parsing.
    return datetime.strptime(date_string.strip(), '%d/%m/%Y').date()
def get_list_page_date(self, date_string_list):
    """Convert the date text fragments of a list-page entry into a date.

    The fragments are joined with ' \\n ' and inspected as one string:
    'Today' maps to ``date_today()``, 'Yesterday' to the day before, and
    otherwise the first "<digits> <letters>" pair (e.g. "12 Mar") is parsed
    as day and abbreviated month. The year is supplied by
    ``self.get_year()``.

    Raises:
        ValueError: if no "<digits> <letters>" pair is found, if the pair
            is not a valid ``%d %b`` date, or if the date does not exist in
            the year returned by ``self.get_year()``.
    """
    date_string = ' \n '.join(date_string_list)
    if 'Today' in date_string:
        return date_today()
    if 'Yesterday' in date_string:
        return date_today() - timedelta(1)

    match = search(r"[^0-9]*([0-9]+\s[a-zA-Z]+)", date_string, flags=DOTALL)
    if not match:
        raise ValueError("Did not find the date")

    # Parse against an explicit leap year: without a year strptime assumes
    # 1900, which is not a leap year, so "29 Feb" could never be parsed.
    # Only the month and day of this placeholder date are meaningful.
    date_without_year = datetime.strptime(
        match.group(1) + ' 2000', '%d %b %Y').date()
    return date_without_year.replace(year=self.get_year(date_without_year))
def extract_item_detail(self, selector, path):
    """Build an ``olxItem`` from the selector of an item detail page.

    ``path`` is stored unchanged as the item's ``url``. Every other field
    is looked up in ``selector``; when nothing matches, the field falls
    back to a default: '' for title, name and desc, 0 for phone and price,
    ``date_today()`` for time, and [] for types and image.

    Returns:
        The populated ``olxItem``.
    """
    item = olxItem()
    item['url'] = path

    title = selector.css('div#item-title-top span:first-child ::text').extract()
    item['title'] = title[0].strip() if title else ''

    # Seller name and phone number are both read from the same node.
    name_tel = selector.css('#item-data .name-tel:first-child')
    name = name_tel.css('strong ::text').extract()
    item['name'] = name[0] if name else ''

    # First run of 8-12 digits/plus signs; stored as an int without '+'.
    phone = name_tel.css('::text').re(r'[0-9\+]{8,12}')
    item['phone'] = int(phone[0].replace('+', '')) if phone else 0

    # Price is stored as an int with the thousands separators removed.
    price = selector.css('div.price ::text').re(r'[0-9][0-9,]+')
    item['price'] = int(price[0].replace(',', '')) if price else 0

    posted = selector.css('.time-info ::text').extract()
    item['time'] = self.get_time(posted[0]) if posted else date_today()

    # Keep the optional attributes whose text does not start with "Type".
    types = selector.css('#description-text .optionals ::text').re('^(?!Type).+$')
    item['types'] = types if types else []

    # A list comprehension instead of map(unicode.strip, ...): it does not
    # depend on the Python 2-only ``unicode`` builtin and always yields a
    # real list (an empty one when there are no images).
    hrefs = selector.css('#big-viewer a').xpath("@href").extract()
    item['image'] = [href.strip() for href in hrefs]

    desc = ''.join(selector.css('#description-text ::text').extract())
    # Drop the "When you call ... OLX.in" notice (matched within one line).
    desc = sub(r"When you call.*OLX.in", '', desc)
    # Drop a line-ending "Type:" label together with the line after it.
    desc = sub(r"Type:$\n.*$", "", desc, flags=MULTILINE)
    # Collapse runs of spaces and of newlines, then trim outer newlines.
    desc = sub(r' +', ' ', desc)
    desc = sub(r'\n+', '\n', desc)
    desc = sub(r"(^\n+|\n+$)", "", desc)
    item['desc'] = desc if desc else ''

    return item
def get_year(self, date):
    """Return the year to attach to a date whose year is not known.

    Only ``date.month`` and ``date.day`` are used. The result is the
    current year (per ``date_today()``) unless that month/day would lie
    after today, in which case it is the previous year. For 29 February
    the result is additionally moved back to the nearest leap year, so the
    returned year is always one in which the date exists.
    """
    from calendar import isleap

    today = date_today()
    # Compare (month, day) pairs instead of calling
    # date.replace(year=today.year), which raises ValueError for 29 Feb
    # whenever the current year is not a leap year.
    if (date.month, date.day) > (today.month, today.day):
        year = today.year - 1
    else:
        year = today.year

    if (date.month, date.day) == (2, 29):
        # 29 Feb only exists in leap years; fall back to the latest one.
        while not isleap(year):
            year -= 1
    return year