def post(self):
    """Task handler: fetch and parse one ad URL, persist it as an Ad
    entity, and enqueue a follow-up '/ad/check' task.

    Request parameters: 'url', 'region', 'parent_url'.
    Returns True early (without writing) when an emls ad with the same
    key already exists.
    """
    url = self.request.get('url')
    logging.info("Processing ad: %s" % url)
    try:
        parser = adsparser.parse(url, self.request.get('region'))
    except urllib2.HTTPError:
        # Unreachable/removed ad pages are skipped silently.
        parser = None
    if parser:
        if parser.get_name() == 'emls':
            # emls ads are deduplicated by phone number when one is known.
            key_name = parser.phone or url
            if Ad.get_by_key_name(key_name):
                return True
        else:
            key_name = url
        if parser.date:
            # Keep the ad's own date but stamp it with the current
            # time-of-day so same-day ads sort in processing order.
            # (The original also assigned parser.date first and then
            # overwrote it immediately -- that dead store is removed.)
            created_at = datetime.datetime.combine(
                parser.date.date(), datetime.datetime.now().time())
        else:
            created_at = datetime.datetime.now()
        ad = Ad(key_name=key_name,
                title=parser.title,
                source=parser.get_name(),
                md5=parser.md5,
                contact=parser.contact,
                phone=parser.phone,
                price=parser.price,
                parent_url=self.request.get('parent_url'),
                created_at=created_at,
                offer_type=parser.offer_type,
                region=parser.region)
        if parser.image:
            ad.image = parser.image
            ad.has_image = True
        if parser.address_id:
            ad.address_id = parser.address_id[0]
            if len(parser.address_id) > 1:
                # Several candidate addresses: flag for manual moderation.
                ad.needs_moderation = True
                ad.moderation_type = 1
                ad.addresses = parser.address_id
        if parser.agent:
            ad.rating = 0
        if parser.phone is None:
            ad.moderation_type = 4
        ad.put()
        taskqueue.add(queue_name='quick', url='/ad/check',
                      params={'key': url})
def get_list(url, region, page = 1): links = adsparser.ads_list(url, page).parse() parse_next_page = True ads = [] for link in links: if Ad.get_by_key_name(link) is None: try: parser = adsparser.parse(link, region) except: continue if parser.date: created_at = parser.date created_at = datetime.datetime.combine(parser.date.date(), datetime.datetime.now().time()) else: created_at = datetime.datetime.now() ad = Ad(key_name = link, title = parser.title, source = parser.get_name(), md5 = parser.md5, contact = parser.contact, phone = parser.phone, price = parser.price, parent_url = url, created_at = created_at, region = parser.region ) if parser.address_id: ad.address_id = parser.address_id[0] if parser.agent: ad.rating = 0 print ad.created_at ads.append(ad) time.sleep(1) else: print "ad already found" parse_next_page = False print "saving ads: %d" % len(ads) db.put(ads) for ad in ads: taskqueue.add(queue_name = 'quick', url = '/ad/check', params = {'key': ad.key().name() }) if parse_next_page or len(ads) > 36: print "parsing page %d" % (page+1) get_list(url, region, page+1)
def get_ads(cursor = None, count = 0): print "getting ads %d" % count ads = Ad.all().filter("source =", "avito").order("-created_at") print ads[0].created_at if cursor: ads = ads.with_cursor(cursor) ads_for_put = [] ads_for_delete = [] for ad in ads.fetch(10): try: parser = adsparser.parse(ad.key().name(), 'spb') except StandardError as e: msg = e.__str__() if msg == 'HTTP Error 404: Not found': print "deleting" ad.deleted = True ads_for_put.append(ad) continue if parser.phone_key is None: continue phone_url = "%s?pkey=%s" % (ad.key().name().replace('items/', 'items/phone/'), parser.phone_key) phone_cmd = command.replace("__url__", phone_url) print ad.key().name() fin, fout = os.popen4(phone_cmd) phone = fout.read() time.sleep(2) f = open("result.txt", "r") phone = adsparser.format_phone(f.read()) f.close() if parser.is_real_agent: ad.rating = 0 else: if ad.phone is None or ad.phone == '': ad.rating = 100 if ad.phone is not None and ad.phone != '' and ad.phone != phone: new_ad = clone_entity(ad, key_name = "%s?v2" % ad.key().name(), parent = ad) new_ad.phone = phone new_ad.created_at = datetime.datetime.now() ads_for_put.append(new_ad) if ad.phone is None or ad.phone == '': ad.phone = phone ad.created_at = datetime.datetime.combine(ad.created_at.date(), datetime.datetime.now().time()) ads_for_put.append(ad) print "saving ads" db.put(ads_for_put) print "ads saved" for ad in ads_for_put: try: print "adding task" taskqueue.add(queue_name = 'quick', url = '/ad/check', params = {'key': ad.key().name() }) except: pass print "tasks added" get_ads(ads.cursor(), count + 10)
# --- script scratchpad ------------------------------------------------------
# exit() halts the module right here, so everything below this line is dead
# code retained for manual experimentation.
exit()
"""
#print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse()
#exit();
#print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk')
#exit()
#urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse()
#urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse()
urls = adsparser.ads_list(
    "http://nedvizhimost.slando.ru/moskva/1376_T2_1.html").parse()
for url in urls:
    print adsparser.parse(url, 'msk')
exit()
"""
#ArendaOpen parser
# Scrape phone numbers out of <b> tags on each listing page.
# NOTE(review): 'urls' is not defined in this scope (and this code sits after
# exit() anyway) -- presumably pasted from an older revision; confirm before
# re-enabling.
phones = []
for url in urls:
    page = BeautifulSoup(urllib2.urlopen(url))
    bs = page.findAll('b')
    for b in bs:
        try:
            phone = adsparser.rPHONE.search(b.string)
            if phone:
                phones.append(adsparser.format_phone(phone.group(0)))
        except:
            # b.string is None for tags with nested markup; ignore those.
            pass
def get(self):
    """Debug endpoint: parse the ad at request parameter 'url' (within
    'region') and write the resulting parser object to the response."""
    request = self.request
    ad_url = request.get('url')
    region = request.get('region')
    self.response.out.write(adsparser.parse(ad_url, region))
def get_ads(cursor=None, count=0):
    """Refresh phone numbers for 'avito' ads, 10 at a time, recursing
    with a datastore cursor until the query is exhausted.

    For each ad: re-parse its page; on a 404 mark it deleted; otherwise
    run the external phone-fetch command, read the OCR'd phone from
    result.txt, then fill in a missing phone or clone the ad (as a
    '?v2' child entity) when the phone changed.
    """
    print "getting ads %d" % count
    ads = Ad.all().filter("source =", "avito").order("-created_at")
    # NOTE(review): ads[0] raises IndexError on an empty result set --
    # presumably this job only runs while avito ads exist; confirm.
    print ads[0].created_at
    if cursor:
        ads = ads.with_cursor(cursor)
    ads_for_put = []
    # Never appended to or written below -- dead variable.
    ads_for_delete = []
    for ad in ads.fetch(10):
        try:
            parser = adsparser.parse(ad.key().name(), 'spb')
        except StandardError as e:
            # NOTE(review): matching the message text is fragile --
            # confirm the exact 404 string raised by the parser.
            msg = e.__str__()
            if msg == 'HTTP Error 404: Not found':
                print "deleting"
                ad.deleted = True
                ads_for_put.append(ad)
            continue
        if parser.phone_key is None:
            continue
        phone_url = "%s?pkey=%s" % (ad.key().name().replace(
            'items/', 'items/phone/'), parser.phone_key)
        phone_cmd = command.replace("__url__", phone_url)
        print ad.key().name()
        # Run the external phone-fetch command; reading its stdout blocks
        # until the command finishes. The pipe handles are never closed.
        fin, fout = os.popen4(phone_cmd)
        phone = fout.read()
        time.sleep(2)
        # The command is expected to have left the OCR'd phone in
        # result.txt; this overwrites the stdout capture above.
        f = open("result.txt", "r")
        phone = adsparser.format_phone(f.read())
        f.close()
        if parser.is_real_agent:
            ad.rating = 0
        else:
            if ad.phone is None or ad.phone == '':
                ad.rating = 100
        if ad.phone is not None and ad.phone != '' and ad.phone != phone:
            # Phone changed: keep history by cloning into a '?v2' child.
            new_ad = clone_entity(ad, key_name="%s?v2" % ad.key().name(),
                                  parent=ad)
            new_ad.phone = phone
            new_ad.created_at = datetime.datetime.now()
            ads_for_put.append(new_ad)
        if ad.phone is None or ad.phone == '':
            ad.phone = phone
            # Keep the ad's date, bump the time-of-day to now.
            ad.created_at = datetime.datetime.combine(
                ad.created_at.date(), datetime.datetime.now().time())
            ads_for_put.append(ad)
    print "saving ads"
    db.put(ads_for_put)
    print "ads saved"
    for ad in ads_for_put:
        try:
            print "adding task"
            taskqueue.add(queue_name='quick', url='/ad/check',
                          params={'key': ad.key().name()})
        except:
            # Best effort: a failed enqueue must not abort the batch.
            pass
    print "tasks added"
    # Recurse with the query cursor to process the next batch of 10.
    get_ads(ads.cursor(), count + 10)
""" #print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse() #exit(); #print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk') #exit() #urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse() #urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse() urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1376_T2_1.html").parse() for url in urls: print adsparser.parse(url, 'msk') exit() """ #ArendaOpen parser phones = [] for url in urls: page = BeautifulSoup(urllib2.urlopen(url)) bs = page.findAll('b') for b in bs: try: phone = adsparser.rPHONE.search(b.string) if phone: phones.append(adsparser.format_phone(phone.group(0))) except: