Code Example #1
File: tasks.py  Project: buger/rentfilter
    def post(self):
        url = self.request.get('url')

        logging.info("Processing ad: %s" % url)

        try:
            parser = adsparser.parse(url, self.request.get('region'))
        except urllib2.HTTPError:
            parser = None

        if parser:
            if parser.get_name() == 'emls':
                # emls ads are deduplicated by phone number when one is available
                key_name = parser.phone or url

                if Ad.get_by_key_name(key_name):
                    # Already stored; skip this ad
                    return True
            else:
                key_name = url

            if parser.date:
                # Keep the parsed date, but stamp it with the current time
                created_at = datetime.datetime.combine(parser.date.date(), datetime.datetime.now().time())
            else:
                created_at = datetime.datetime.now()

            ad = Ad(key_name = key_name,
                    title = parser.title,
                    source = parser.get_name(),
                    md5 = parser.md5,
                    contact = parser.contact,
                    phone = parser.phone,
                    price = parser.price,
                    parent_url = self.request.get('parent_url'),
                    created_at = created_at,
                    offer_type = parser.offer_type,
                    region = parser.region
                    )

            if parser.image:
                ad.image = parser.image
                ad.has_image = True

            if parser.address_id:
                ad.address_id = parser.address_id[0]

                if len(parser.address_id) > 1:
                    # Ambiguous address match: flag for manual moderation
                    ad.needs_moderation = True
                    ad.moderation_type = 1

                ad.addresses = parser.address_id

            if parser.agent:
                ad.rating = 0

            if parser.phone is None:
                ad.moderation_type = 4

            ad.put()

            taskqueue.add(queue_name = 'quick', url = '/ad/check', params = {'key': url})
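These handlers assume an Ad datastore model that is not shown in the excerpts. A minimal sketch of what it might look like on Google App Engine, with every property type guessed from how the fields are used above (the types and defaults are assumptions, not the project's actual definitions):

from google.appengine.ext import db

class Ad(db.Model):
    # All property types below are inferred from usage and may differ
    # from the real rentfilter model
    title = db.StringProperty()
    source = db.StringProperty()
    md5 = db.StringProperty()
    contact = db.StringProperty()
    phone = db.StringProperty()
    price = db.IntegerProperty()
    parent_url = db.StringProperty()
    created_at = db.DateTimeProperty()
    offer_type = db.StringProperty()
    region = db.StringProperty()
    image = db.BlobProperty()
    has_image = db.BooleanProperty(default=False)
    address_id = db.IntegerProperty()
    addresses = db.ListProperty(int)
    needs_moderation = db.BooleanProperty(default=False)
    moderation_type = db.IntegerProperty()
    rating = db.IntegerProperty()
    deleted = db.BooleanProperty(default=False)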
Code Example #2
File: vk_parser.py  Project: buger/rentfilter
def get_list(url, region, page = 1):
    links = adsparser.ads_list(url, page).parse()

    parse_next_page = True

    ads = []

    for link in links:
        if Ad.get_by_key_name(link) is None:
            try:
                parser = adsparser.parse(link, region)
            except StandardError:
                # Skip ads that fail to download or parse
                continue

            if parser.date:
                # Keep the parsed date, but stamp it with the current time
                created_at = datetime.datetime.combine(parser.date.date(), datetime.datetime.now().time())
            else:
                created_at = datetime.datetime.now()

            ad = Ad(key_name = link,
                    title = parser.title,
                    source = parser.get_name(),
                    md5 = parser.md5,
                    contact = parser.contact,
                    phone = parser.phone,
                    price = parser.price,
                    parent_url = url,
                    created_at = created_at,
                    region = parser.region
                    )

            if parser.address_id:
                ad.address_id = parser.address_id[0]

            if parser.agent:
                ad.rating = 0

            print ad.created_at

            ads.append(ad)

            time.sleep(1)
        else:
            print "ad already found"
            # A known ad means we've reached results from a previous run
            parse_next_page = False

    print "saving ads: %d" % len(ads)
    db.put(ads)

    for ad in ads:
        taskqueue.add(queue_name = 'quick', url = '/ad/check', params = {'key': ad.key().name() })

    if parse_next_page or len(ads) > 36:
        print "parsing page %d" % (page+1)
        get_list(url, region, page+1)
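get_list calls itself once per results page, so a deep listing can exhaust Python's default recursion limit. Because the recursive call is the last statement, the same control flow fits a plain loop. A sketch under that assumption, where process_links is a hypothetical helper holding the body of the for-loop above (build Ad entities for unseen links, report whether a known ad was hit):

def get_list(url, region, page = 1):
    while True:
        links = adsparser.ads_list(url, page).parse()

        # process_links is hypothetical: it would return the new Ad entities
        # and a flag saying whether every link was unseen
        ads, parse_next_page = process_links(links, url, region)

        db.put(ads)
        for ad in ads:
            taskqueue.add(queue_name = 'quick', url = '/ad/check', params = {'key': ad.key().name()})

        # Same continuation rule as the recursive version
        if not (parse_next_page or len(ads) > 36):
            break
        page += 1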
Code Example #3
def get_ads(cursor = None, count = 0):
    print "getting ads %d" % count

    ads = Ad.all().filter("source =", "avito").order("-created_at")

    print ads[0].created_at

    if cursor:
        ads = ads.with_cursor(cursor)

    ads_for_put = []

    batch = ads.fetch(10)
    for ad in batch:
        try:
            parser = adsparser.parse(ad.key().name(), 'spb')
        except StandardError as e:
            msg = str(e)
            if msg == 'HTTP Error 404: Not found':
                print "deleting"
                # Mark the ad deleted; it is saved with the batch below
                ad.deleted = True

                ads_for_put.append(ad)

            continue

        if parser.phone_key is None:
            continue

        phone_url = "%s?pkey=%s" % (ad.key().name().replace('items/', 'items/phone/'), parser.phone_key)
        phone_cmd = command.replace("__url__", phone_url)

        print ad.key().name()

        # Reading the command's output blocks until it has finished
        fin, fout = os.popen4(phone_cmd)
        fout.read()

        time.sleep(2)

        # The command writes its result to result.txt
        f = open("result.txt", "r")
        phone = adsparser.format_phone(f.read())
        f.close()

        if parser.is_real_agent:
            ad.rating = 0
        else:
            if ad.phone is None or ad.phone == '':
                ad.rating = 100

        # If the stored phone number changed, keep a versioned copy as a child entity
        if ad.phone is not None and ad.phone != '' and ad.phone != phone:
            new_ad = clone_entity(ad, key_name = "%s?v2" % ad.key().name(), parent = ad)
            new_ad.phone = phone
            new_ad.created_at = datetime.datetime.now()

            ads_for_put.append(new_ad)

        if ad.phone is None or ad.phone == '':
            ad.phone = phone
            ad.created_at = datetime.datetime.combine(ad.created_at.date(), datetime.datetime.now().time())

        ads_for_put.append(ad)

    print "saving ads"
    db.put(ads_for_put)
    print "ads saved"

    for ad in ads_for_put:
        try:
            print "adding task"
            taskqueue.add(queue_name = 'quick', url = '/ad/check', params = {'key': ad.key().name() })
        except:
            pass

    print "tasks added"

    get_ads(ads.cursor(), count + 10)
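clone_entity is called above but not defined in these excerpts. A common App Engine recipe copies every datastore property into a fresh entity and lets keyword arguments such as key_name and parent override the constructor; whether rentfilter's helper matches this exactly is an assumption:

def clone_entity(e, **extra_args):
    # Read every datastore property off the source entity, then apply
    # overrides (e.g. key_name, parent) before constructing the clone
    klass = e.__class__
    props = dict((k, v.__get__(e, klass))
                 for k, v in klass.properties().iteritems())
    props.update(extra_args)
    return klass(**props)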
Code Example #4
File: parser.py  Project: buger/rentfilter
exit()
"""

#print avtoru_parser.AvtoruAdParser("http://cars.auto.ru/cars/used/sale/11458490-f214.html").parse()
#exit();

#print adsparser.parse("http://nedvizhimost.slando.ru/moskva/srochno-snimu-2-h-komnatnuyu-kvartiru_P_38854056.html", 'msk')
#exit()

#urls = adsparser.ads_list("http://nedvizhimost.slando.ru/moskva/1377_1.html").parse()
#urls = adsparser.ads_list("http://www.novoebenevo.ru/sdam/area_3").parse()
urls = adsparser.ads_list(
    "http://nedvizhimost.slando.ru/moskva/1376_T2_1.html").parse()

for url in urls:
    print adsparser.parse(url, 'msk')

exit()
"""
#ArendaOpen parser
phones = []
for url in urls:
    page = BeautifulSoup(urllib2.urlopen(url))
    bs = page.findAll('b')
    for b in bs:
        try:
            phone = adsparser.rPHONE.search(b.string)
            if phone:
                phones.append(adsparser.format_phone(phone.group(0)))
        except StandardError:
            # b.string is None for tags containing nested markup; skip those
            pass
Code Example #5
File: tasks.py  Project: buger/rentfilter
    def get(self):
        # Debug endpoint: parse a single ad and write the parser output back
        url = self.request.get('url')
        parser = adsparser.parse(url, self.request.get('region'))

        self.response.out.write(parser)