Ejemplo n.º 1
0
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    page.encoding = 'utf8'
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)

    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(index_url.replace('-g' + g_num, '-g{0}-oa{1}'.format(g_num, page_num)),
                                          city_id, part)
    except:
        pass
Ejemplo n.º 2
0
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)

    data = []
    worker = u'daodao_poi_base_data'

    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps(
                {u'target_url': unicode(href), u'city_id': unicode(city_id), u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args, unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
Ejemplo n.º 3
0
def qyer_country_spider(self, country_id, country_link, debug=False, **kwargs):
    """
    Crawl a country page on qyer.com, store its max page count and mark the
    task done.

    country_id:
        int, index country info
    country_link:
        str, country page URL
    debug:
        forwarded to init_qyer_session. Bug fix: it used to be ignored
        (init_qyer_session was hard-coded debug=True).
    """
    http_tools = init_qyer_session(debug=debug)
    x = time.time()  # request start time, reported to update_proxy
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)

    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        # 404 is tolerated (page legitimately gone); anything else non-200
        # is treated as a proxy/site failure and retried.
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))

        page_html = etree.HTML(spider_ret[0])
        country_max_page = find_max_page(page_html)
        save_data = [country_max_page, country_id]
        qyer_db.update_country_page(save_data)
        update_task(kwargs['task_id'])
    except Exception as exc:
        # NOTE(review): spider_proxy still carries its 'socks5://' prefix
        # here, unlike most sibling tasks which report the bare host:port —
        # confirm update_proxy tolerates the prefix.
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 4
0
def get_cities(self, gid, country_id, offset):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        target_url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo={0}&offset={1}&desktop=true'.format(
            gid, offset)
        page = requests.get(target_url, headers=headers, proxies=proxies)
        page.encoding = 'utf8'
        content = page.text

        res = re.findall(
            'ta.store\(\'tourism.popularCitiesMaxPage\', \'(\d+)\'\);',
            content)

        has_next = False
        if res is not None and res != []:
            if offset < int(res[0]):
                has_next = True

        result = []
        for line in _parse_city(content=content, target_url=target_url):
            per_city = list(line)
            per_city.append(country_id)
            result.append(per_city)

        print insert_db(result)

        if has_next:
            get_cities.delay(gid, country_id, offset + 1)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 5
0
def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 6
0
def detail_page(self, pid, page_num, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent(),
    }

    try:
        data = {
            u'page': unicode(page_num),
            u'type': u'city',
            u'pid': unicode(pid),
            u'sort': u'32',
            u'subsort': u'all',
            u'isnominate': u'-1',
            u'haslastm': u'false',
            u'rank': u'6'
        }
        json_page = requests.post(u'http://place.qyer.com/poi.php?action=list_json', data=data, proxies=proxies,
                                  headers=headers)
        json_page.encoding = u'utf8'
        content = json_page.text
        j_data = json.loads(content)
        task_data = []
        url_result = []
        for attr in j_data[u'data'][u'list']:
            worker = u'qyer_poi_task'
            args = json.dumps(
                {u'target_url': unicode(u'http:' + attr[u'url']), u'city_id': unicode(city_id)})
            task_id = get_task_id(worker=worker, args=args)
            task_data.append((task_id, worker, args, unicode(part.replace('list', 'detail'))))
            url_result.append(u'http' + attr[u'url'])
        result = insert_task(data=task_data)
        print result
        print url_result
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 7
0
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
Ejemplo n.º 8
0
def get_long_comment(self, target_url, language, miaoji_id, special_str):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=120)
        page.encoding = 'utf8'
        data = long_comment_parse(page.content, target_url, language, miaoji_id)
        update_proxy('Platform', PROXY, x, '0')
        print "Success with " + PROXY + ' CODE 0'
        return insert_db((data,), 'tp_comment_' + special_str)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 9
0
def venere_comment(self, target_url, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        page.encoding = 'utf8'
        result = venere_comment_parser(page.text, target_url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')

        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 10
0
def yelp_price_level(self, target_url, mid):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        price_level = get_yelp_price_level(page)
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            print yelp_price_level_update_db((price_level, mid))
        return price_level
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 11
0
def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text
        # agoda 特殊情况 start
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        about_content = page_about.text
        other_info['about_content'] = about_content

        # agoda end
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 12
0
def get_lost_rest_new(self, target_url, city_id, **kwargs):
    """Re-crawl a missed restaurant page; on success mark the mongo task
    done and report the proxy healthy."""
    PROXY = get_proxy(source="Platform")
    x = time.time()  # request start time, reported to update_proxy
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=15)
        page.encoding = 'utf8'
        result = rest_parser(page.content, target_url, city_id)
        if result == 'Error':
            self.retry()
        else:
            update_task(task_id=kwargs['mongo_task_id'])
            # Bug fix: the success path reported failure code '23'; every
            # sibling task reports '0' on success.
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        # Consistency fix: flag the proxy as bad before retrying, like the
        # other tasks in this module do.
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 13
0
def shutter_spider(self, vid, search_kw, debug=False, **kwargs):
    """
    shutterstock image-search crawl: fetch pictures for search_kw, store
    them via PicModel and complete the task.
    """
    if search_kw is None or search_kw == "null":
        # todo logging null key words
        return None
    x = time.time()
    spider_proxy = 'socks5://' + get_proxy(source="Platform")
    try:
        spider = ShutterShockPicSpider(search_kw, spider_proxy, debug)
        pic_ret = spider.pic_search()
        pic_save_data = shutter_pic_data_assembly(vid, search_kw, pic_ret)
        spider_db = PicModel(**save_db_config)
        for save_data_map in pic_save_data.values():
            spider_db.insert_pic_many(save_data_map["table"],
                                      save_data_map["fields"],
                                      save_data_map["values"])
        update_task(kwargs['task_id'])
    except Exception as exc:
        # NOTE(review): spider_proxy keeps its 'socks5://' prefix here while
        # most tasks report the bare proxy — confirm update_proxy accepts it.
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 14
0
def get_pid_total_page(self, target_url, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        html_page = requests.get(target_url, proxies=proxies, headers=headers)
        html_page.encoding = u'utf8'
        content = html_page.text
        pid = re.findall(u'PID :\'(\d+)\'', content)[0]
        total_attr = re.findall(u'景点\((\d+)\)', content)[0]
        # return pid, (int(total_attr) // 15) + 1
        print pid, total_attr
        for page_num in range(1, (int(total_attr) // 15) + 2):
            detail_page.delay(pid, page_num, city_id, part)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 15
0
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 16
0
def qyer_city_spider(self,
                     country_id,
                     country_en,
                     country_link,
                     debug=False,
                     **kwargs):
    """
    Crawl city data for one country on qyer.com and bulk-insert it.

    country_id:
        int, index country info
    country_en:
        str, English country name; membership in city_state selects the
        parsing mode (city-state vs. city list).
    country_link:
        str, country page URL
    debug:
        forwarded to init_qyer_session. Bug fix: it used to be ignored
        (init_qyer_session was hard-coded debug=True).
    """
    if country_en in city_state:
        country_type = "city_state"
    else:
        country_type = "city_list"
    http_tools = init_qyer_session(debug=debug)
    x = time.time()  # request start time, reported to update_proxy
    country_args = {"country_en": country_en, "country_id": country_id}
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)

    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        # 404 is tolerated (page legitimately gone); anything else non-200
        # is treated as a proxy/site failure and retried.
        if status_code != 200 and status_code != 404:
            raise Exception(str(status_code))

        save_data = platform_page_parse(country_type, spider_ret[0],
                                        **country_args)
        qyer_db.insert_many_data(*save_data)
    except Exception as exc:
        update_proxy('Platform', spider_proxy, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 17
0
def get_site_url(self, target_url, source_id, table_name):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    try:
        res = _get_site_url(target_url)
        if res == 'Error':
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            update_site_url(res, source_id, table_name=table_name)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 18
0
def booking_detail_task(self, url, task):
    try:
        result = booking_detail_crawl()
        if not result:
            x = time.time()
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            x = time.time()
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        x = time.time()
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))
Ejemplo n.º 19
0
def get_comment(self, target_url, language, miaoji_id, special_str, **kwargs):
    if language == 'en':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'en'
        }
    elif language == 'zhCN':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'zh_CN'
        }
    else:
        return "Error, no such language"

    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    if data != '':
        try:
            page = requests.post(target_url, data, headers=headers, proxies=proxies, timeout=120)
            page.encoding = 'utf8'
            res = parse(page.text, target_url, language, miaoji_id, special_str)
            if res == 0:
                update_proxy('Platform', PROXY, x, '23')
                self.retry(countdown=120)
            else:
                # update_task(kwargs['mongo_task_id'])
                update_proxy('Platform', PROXY, x, '0')
                print "Success with " + PROXY + ' CODE 0'
        except Exception as exc:
            update_proxy('Platform', PROXY, x, '23')
            self.retry(exc=traceback.format_exc(exc), countdown=120)
Ejemplo n.º 20
0
def get_daodao_image_url(self, source_url, mid, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        detail_id = re.findall('-d(\d+)', source_url)[0]
        target_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail=' + detail_id
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            root = PyQuery(page.text)
            images_list = []
            for div in root('.photos.inHeroList div').items():
                images_list.append(div.attr['data-bigurl'])
            img_list = '|'.join(images_list)
            if img_list == '':
                self.retry()
            data = (mid, source_url, img_list)
            print insert_daodao_image_list(data)
            update_proxy('Platform', PROXY, x, '0')
            update_task(kwargs['mongo_task_id'])
        return data
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=traceback.format_exc(exc))