Example #1
    def gather_profiles(title):
        while True:
            # logger.info('[worker {}] running'.format(title))
            uid = que.get()
            if uid is None:
                logger.info('[worker {}] shutting down'.format(title))
                break
            logger.info('[user {}]'.format(uid))
            url = TA_ROOT + 'MemberOverlay?uid=' + uid
            simple_soup = common.load_soup_online(url).find(
                'div', class_='memberOverlay')
            profile_url = re.search('(?<=")/members/.+(?=")', str(simple_soup))
            if profile_url is None:
                profile_url = re.search(
                    '(?<=")/MemberProfile-a_uid.[A-Z0-9]+(?=")',
                    str(simple_soup))

            result = []
            if profile_url is not None:
                profile_url = TA_ROOT + profile_url.group(0).strip()
                result.append(simple_soup.prettify())
                detail_soup = common.load_soup_online(profile_url)
                member_soup = detail_soup.find('div',
                                               id='MODULES_MEMBER_CENTER')
                if member_soup is not None:
                    result.append(member_soup.prettify())
                    record = [uid, '\r\n'.join(result)]
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.insert_a_user(record)
                else:
                    title_tag = detail_soup.find('title')
                    if title_tag is not None and '404' in (title_tag.string or ''):
                        with lock:
                            with taDB(common.TA_DB) as db:
                                db.remove_user_id_in_review(uid)
                                logger.info('\tuser id removed')
                    else:
                        logger.info('\tfailed to fetch full profile')
                        que.put(uid)
            else:
                logger.info('\tno profile url')
                que.put(uid)

            time.sleep(common.SLEEP_TIME)
            que.task_done()
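Example #1 leans on names defined outside the snippet: a shared queue.Queue called que, a threading.Lock called lock, plus logger, TA_ROOT, common, and taDB. Below is a minimal sketch of how such sentinel-terminated workers are typically wired up, assuming those names; the setup itself is not part of the source.

import queue
import threading

que = queue.Queue()      # shared work queue assumed by the snippet
lock = threading.Lock()  # guards the database writes

workers = [threading.Thread(target=gather_profiles, args=(str(i + 1),))
           for i in range(4)]
for w in workers:
    w.start()

for uid in ['12345', '67890']:  # hypothetical user ids
    que.put(uid)
que.join()                      # blocks until every task_done() call

for _ in workers:
    que.put(None)               # one None sentinel per worker
for w in workers:
    w.join()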
Example #2
def start(init_url):
    urls = []
    # init_url = input('specify an area: ')
    url_pattern = re.search(r'-g\d+-', init_url)
    url_pos = url_pattern.start() + len(url_pattern.group(0))
    page_number = find_max_page(common.load_soup_online(init_url))
    print('locations in {} pages.'.format(page_number))
    for idx_page in range(page_number):
        page_url = init_url if idx_page == 0 else ''.join(
                [init_url[:url_pos], 'oa', str(idx_page * 20),
                 '-', init_url[url_pos:]])
        print('[page {}] {}'.format(idx_page + 1, page_url))
        soup = common.load_soup_online(page_url)
        for hotel in soup.find_all('div', class_='geo_name'):
            href = hotel.find('a')['href']
            urls.append(common.TA_ROOT + href[1:])
    urls = list(set(urls))
    print('{} locations found.'.format(len(urls)))
    common.write_file('locations', ';'.join(urls))
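The paging trick in Example #2 splits the URL just past the -g<id>- geo segment and splices in an oa<offset>- chunk, 20 listings per page. A quick worked example with a made-up TripAdvisor-style URL (the URL is an assumption; the slicing mirrors the code):

import re

init_url = 'https://www.tripadvisor.com/Hotels-g294197-Seoul-Hotels.html'  # hypothetical
match = re.search(r'-g\d+-', init_url)
pos = match.start() + len(match.group(0))  # index just past '-g294197-'

for idx_page in range(3):
    page_url = init_url if idx_page == 0 else ''.join(
        [init_url[:pos], 'oa', str(idx_page * 20), '-', init_url[pos:]])
    print(page_url)
# pages 2 and 3 become ...-g294197-oa20-Seoul-Hotels.html and ...-oa40-...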
Example #3
        def find_hotel_ids(url_str):
            soup_container = common.load_soup_online(url_str)
            hdr = soup_container.find('div', class_='hdrTxt')
            if num_page == 1 and hdr is not None:
                divs_soup = hdr.find_all_previous(
                    'div', id=re.compile(r'^HOTELDEAL\d+'))
            else:
                divs_soup = soup_container.find_all(
                    'div', id=re.compile(r'^HOTELDEAL\d+'))

            page_pairs = []
            for link in divs_soup:
                # strip the 'HOTELDEAL' prefix: len('HOTELDEAL') == 9
                pair_hid = link['id'][9:]
                pair_url = link.find('div',
                                     class_='listing_title').find('a')['href']
                page_pairs.append({pair_hid: pair_url[1:]})
            return page_pairs
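The [9:] slice drops the HOTELDEAL prefix from the element id, leaving only the numeric hotel id. A tiny sanity check of that assumption (the id value is made up):

import re

element_id = 'HOTELDEAL123456'  # hypothetical id scraped from the page
assert re.match(r'^HOTELDEAL\d+', element_id)
print(element_id[9:])           # -> '123456', since len('HOTELDEAL') == 9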
Example #4
def start(gid, init_url):
    def gather_hotels(title):
        def calc_max_page(soup_container):
            return math.ceil(find_num_hotels(soup_container) / HOTEL_PER_PAGE)

        def find_hotel_ids(url_str):
            soup_container = common.load_soup_online(url_str)
            hdr = soup_container.find('div', class_='hdrTxt')
            if num_page == 1 and hdr is not None:
                divs_soup = hdr.find_all_previous(
                    'div', id=re.compile(r'^HOTELDEAL\d+'))
            else:
                divs_soup = soup_container.find_all(
                    'div', id=re.compile(r'^HOTELDEAL\d+'))

            page_pairs = []
            for link in divs_soup:
                # strip the 'HOTELDEAL' prefix: len('HOTELDEAL') == 9
                pair_hid = link['id'][9:]
                pair_url = link.find('div',
                                     class_='listing_title').find('a')['href']
                page_pairs.append({pair_hid: pair_url[1:]})
            return page_pairs

        def update_hotel_ids(new_pairs, pair_list):
            for new_pair in new_pairs:
                pair_key, pair_value = next(iter(new_pair.items()))
                # if hotel id not duplicate
                if pair_key not in pair_list:
                    pair_list[pair_key] = pair_value

        while True:
            # logger.info('[worker {}] running'.format(title))
            pid = que.get()
            if pid is None:
                logger.info('[worker {}] shutting down'.format(title))
                break
            paras = '&'.join([
                'seen=0', 'sequence=1', 'geo=' + gid,
                'requestingServlet=Hotels', 'refineForm=true', 'hs=',
                'adults=2', 'rooms=1', 'o=a' + str(pid * HOTEL_PER_PAGE),
                'pageSize=', 'rad=0', 'dateBumped=NONE',
                'displayedSortOrder=popularity'
            ])
            page_url = ''.join([init_url, '?', paras])
            logger.info('[page {}] {}'.format(pid + 1, page_url))
            hotels = find_hotel_ids(page_url)
            if hotels is None:
                que.put(pid)
            elif len(hotels) < HOTEL_PER_PAGE and pid < num_page - 1:
                que.put(pid)
            elif pid == num_page - 1 \
                    and len(hotels) < num_hotel % HOTEL_PER_PAGE:
                que.put(pid)
            else:
                with lock:
                    update_hotel_ids(hotels, hid_pairs)
                    logger.info('\t#{}, totaling {}'.format(
                        pid, len(hid_pairs)))
                    with taDB(common.TA_DB) as db:
                        record = [gid, str(hid_pairs)]
                        db.insert_a_location(record)

            time.sleep(common.SLEEP_TIME)
            que.task_done()

    loc_name = init_url[init_url.index(gid) + len(gid) +
                        1:init_url.rindex('-')]
    logger.info('[location {}] {}'.format(gid, loc_name.replace('_', ' ')))
    soup = common.load_soup_online(init_url)
    num_page = find_max_page(soup)
    num_hotel = find_num_hotels(soup)
    logger.info('{} hotels in {} pages'.format(num_hotel, num_page))

    with taDB(common.TA_DB) as iodb:
        hid_pairs = iodb.get_hotel_url_pairs(gid)
    logger.info('{} hotels in local cache'.format(len(hid_pairs)))

    # collecting hotel ids might take multiple iterations
    while len(hid_pairs) < num_hotel:
        que = queue.Queue()

        threads = []
        thread_size = common.SNIPPET_THREAD_NUM
        for j in range(thread_size):
            t = threading.Thread(target=gather_hotels, args=(str(j + 1),))
            t.start()
            threads.append(t)

        # set start value to math.ceil(len(hid_pairs) / HOTEL_PER_PAGE)
        # rather than 0 if the hotels are ordered in the list
        for x in range(num_page):
            que.put(x)

        que.join()

        for k in range(thread_size):
            que.put(None)
        for t in threads:
            t.join()

    logger.info('all {} hotel ids are ready'.format(len(hid_pairs)))
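Assuming the surrounding module supplies logger, HOTEL_PER_PAGE, find_max_page, find_num_hotels, common, and taDB, the entry point would be driven with a geo id and its listing URL; both values below are hypothetical:

gid = '294197'  # hypothetical TripAdvisor geo id
init_url = 'https://www.tripadvisor.com/Hotels-g294197-Seoul-Hotels.html'
start(gid, init_url)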