# Module-level imports assumed by this function (datastore and task queue come
# from the GAE SDK; adsparser and the Ad model are local to this project, so
# the exact import path of Ad is an assumption):
import datetime
import logging
import time

from google.appengine.api import taskqueue
from google.appengine.ext import db

import adsparser
from models import Ad  # assumption: actual location of the Ad model


def get_list(url, region, page=1):
    links = adsparser.ads_list(url, page).parse()
    parse_next_page = True
    ads = []
    for link in links:
        if Ad.get_by_key_name(link) is None:
            try:
                parser = adsparser.parse(link, region)
            except Exception:
                continue
            if parser.date:
                # Keep the parsed date, stamped with the current time.
                # (The original also assigned parser.date directly first,
                # but that value was immediately overwritten.)
                created_at = datetime.datetime.combine(
                    parser.date.date(), datetime.datetime.now().time())
            else:
                created_at = datetime.datetime.now()
            ad = Ad(key_name=link,
                    title=parser.title,
                    source=parser.get_name(),
                    md5=parser.md5,
                    contact=parser.contact,
                    phone=parser.phone,
                    price=parser.price,
                    parent_url=url,
                    created_at=created_at,
                    region=parser.region)
            if parser.address_id:
                ad.address_id = parser.address_id[0]
            if parser.agent:
                ad.rating = 0
            print ad.created_at
            ads.append(ad)
            time.sleep(1)
        else:
            print "ad already found"
            parse_next_page = False
    print "saving ads: %d" % len(ads)
    db.put(ads)
    # Queue a follow-up check for every ad that was just stored.
    for ad in ads:
        taskqueue.add(queue_name='quick', url='/ad/check',
                      params={'key': ad.key().name()})
    # Recurse into the next page unless a known ad was seen, or whenever a
    # full page (more than 36 ads) was collected.
    if parse_next_page or len(ads) > 36:
        print "parsing page %d" % (page + 1)
        get_list(url, region, page + 1)
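# Usage sketch (not in the original file): crawl one Slando listing for the
# Moscow region, starting at page 1. Assumes the datastore and task queue
# stubs are available, e.g. under dev_appserver or remote_api.
#
#   get_list("http://nedvizhimost.slando.ru/moskva/1376_T2_1.html", 'msk')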
# post() method of the /ads webapp.RequestHandler (the class statement is
# elided in this excerpt); MAX_PAGES is a module-level constant.
def post(self):
    url = self.request.get('url')
    force_next_page = self.request.get('force_next_page')
    logging.info("Getting ads list: %s" % url)
    try:
        page = int(self.request.get('page'))
    except ValueError:
        page = 1
    links = adsparser.ads_list(unicode(url).encode('utf-8'), page).parse()
    parse_next_page = True
    for link in links:
        if Ad.get_by_key_name(link) is None:
            # Hand each unseen ad off to a parser-specific queue.
            taskqueue.add(queue_name=adsparser.parser_name(url), url="/ad",
                          params={'url': link,
                                  'parent_url': url,
                                  'region': self.request.get('region')})
        else:
            if not force_next_page:
                parse_next_page = False
    if parse_next_page and page < MAX_PAGES:
        # Re-enqueue this handler for the next results page.
        taskqueue.add(url="/ads",
                      params={'url': url,
                              'page': page + 1,
                              'force_next_page': force_next_page,
                              'region': self.request.get('region')})
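# Seeding sketch (an assumption, not shown in this excerpt): a cron or admin
# handler would enqueue the first page; post() above then re-enqueues itself
# with page+1 until it meets a known ad or reaches MAX_PAGES.
#
#   taskqueue.add(url='/ads', params={
#       'url': 'http://nedvizhimost.slando.ru/moskva/1376_T2_1.html',
#       'region': 'msk',
#       'page': 1,
#   })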
arr.append(" ".join(phones)) print " ".join(arr) exit() """ #print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse() #exit(); #print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk') #exit() #urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse() #urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse() urls = adsparser.ads_list( "http://nedvizhimost.slando.ru/moskva/1376_T2_1.html").parse() for url in urls: print adsparser.parse(url, 'msk') exit() """ #ArendaOpen parser phones = [] for url in urls: page = BeautifulSoup(urllib2.urlopen(url)) bs = page.findAll('b') for b in bs: try: phone = adsparser.rPHONE.search(b.string) if phone:
print " ".join(arr) exit() """ #print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse() #exit(); #print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk') #exit() #urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse() #urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse() urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1376_T2_1.html").parse() for url in urls: print adsparser.parse(url, 'msk') exit() """ #ArendaOpen parser phones = [] for url in urls: page = BeautifulSoup(urllib2.urlopen(url)) bs = page.findAll('b') for b in bs: try: phone = adsparser.rPHONE.search(b.string)