def review_result_is_valid(hotel_id):
    """Return True when every review indexed for *hotel_id* has a sane,
    non-corrupted HTML record cached locally.

    A hotel with zero reviews passes trivially.  A missing hotel record,
    a stale index (fewer reviews reported than ids collected), a missing
    review row, absent HTML, or HTML lacking its own ``review_<rid>``
    container div all fail the check.
    """
    with taDB(common.TA_DB) as db:
        record = db.read_a_hotel(hotel_id)
    if record is None:
        return False
    rno = record[3]
    if int(rno) == 0:
        logger.info('[hotel {}] PASSED: no reviews'.format(hotel_id))
        return True
    # record[4] holds the review-id list serialized with str(); parse it back.
    rids = ast.literal_eval(record[4])
    if rno < len(rids):
        return False
    with taDB(common.TA_DB) as db:
        for rid in rids:
            rrecord = db.read_a_review(rid)
            if rrecord is None:
                return False
            html = rrecord[1]
            if html is None:
                logger.info('[hotel {}] FAILED: HTML is absent'.format(hotel_id))
                return False
            # The cached snippet must still contain its 'review_<rid>' div.
            # (Removed a leftover debug print(html) that dumped the whole
            # page to stdout on every corrupted record.)
            rec_soup = common.load_soup_string(html)
            if rec_soup.find('div', id=''.join(['review_', rid])) is None:
                logger.info('[hotel {}] FAILED: corrupted HTML'.format(hotel_id))
                return False
    logger.info('[hotel {}] PASSED: verified'.format(hotel_id))
    return True
def gather_reviews(title):
    """Queue worker: download the full review HTML for each hotel id taken
    from the shared ``que`` and persist it via :func:`save_reviews`.

    Reviews are fetched in chunks of ``common.REVIEW_CHUNK_SIZE`` ids per
    AJAX request.  After a pass, any indexed review id that did not come
    back is probed individually; if every missing one is genuinely empty,
    the hotel's index is rewritten to the ids actually obtained, otherwise
    the hotel id is re-queued for a later retry.

    Uses module-level ``que``, ``lock``, ``logger`` and ``TA_ROOT``.
    """
    def gen_review_url(rid):
        # EXPANDED_HOTEL_REVIEWS overlay endpoint; *rid* may be a single id
        # or a comma-joined batch of ids.
        return TA_ROOT + 'OverlayWidgetAjax?' + '&'.join(
            ['Mode=EXPANDED_HOTEL_REVIEWS',
             'metaReferer=Hotel_Review',
             'reviews=' + rid])

    while True:
        # logger.info('[worker {}] running'.format(title))
        hotel_id = que.get()
        if hotel_id is None:
            # poison pill: orderly shutdown
            logger.info('[worker {}] shutting down'.format(title))
            break
        with taDB(common.TA_DB) as db:
            record = db.read_a_hotel(hotel_id)
        if record is None:
            # NOTE(review): this skips que.task_done() for the consumed
            # item, which would leave que.join() waiting — confirm hotels
            # are always present when queued.
            continue
        rid_str = record[4]
        # review-id list is stored serialized with str(); parse it back
        rids = ast.literal_eval(rid_str)
        new_rids = []
        slice_num = math.ceil(
            len(rids) / common.REVIEW_CHUNK_SIZE)
        for slicePos in range(slice_num):
            time.sleep(common.SLEEP_TIME)
            spos = slicePos * common.REVIEW_CHUNK_SIZE
            # last slice runs to the end of the list
            epos = (slicePos + 1) * common.REVIEW_CHUNK_SIZE \
                if slicePos + 1 < slice_num else len(rids)
            id_string = ','.join(rids[spos: epos])
            logger.info('\t[hotel {}] from {} to {}'
                        .format(hotel_id, spos + 1, epos))
            url = gen_review_url(id_string)
            web_data = requests.get(url)
            web_text = web_data.text
            # save_reviews persists the chunk and returns the ids it found
            new_rids.extend(save_reviews(web_text))
        diff_flag = False
        # ids that were indexed but did not come back in any chunk
        diff_set = set(rids).difference(set(new_rids))
        for diff in diff_set:
            print('found diff')
            url = gen_review_url(diff)
            web_data = requests.get(url)
            # does the single-id fetch actually contain a review div?
            blank = re.findall(
                '(?<=id=\")review_\d+(?=\")',
                web_data.text, re.IGNORECASE)
            if len(blank) > 0:
                # at least one "missing" review really exists -> retry later
                diff_flag = True
                logger.info('{} is not empty'.format(diff))
                break
        if not diff_flag:
            if diff_set:
                # every missing id is empty: shrink the stored index to
                # the reviews we actually obtained
                with lock:
                    with taDB(common.TA_DB) as db:
                        db.update_review_list_in_hotel(
                            hotel_id, len(new_rids), str(new_rids))
                logger.info('\t[hotel {}] review indexes updated'
                            .format(hotel_id))
        else:
            logger.info('\ttry again later')
            logger.info('\t{}'.format(diff_set))
            que.put(hotel_id)
        que.task_done()
def gather_profiles(title):
    """Queue worker: fetch and cache the profile page for each user id
    taken from the shared ``que``.

    The MemberOverlay widget is fetched first to discover the full
    profile URL; the overlay HTML and the profile's member-center block
    are stored together.  Transient failures re-queue the uid; a 404
    profile removes the uid from the review table instead.

    Uses module-level ``que``, ``lock``, ``logger`` and ``TA_ROOT``.
    """
    while True:
        # logger.info('[worker {}] running'.format(title))
        uid = que.get()
        if uid is None:
            # poison pill: orderly shutdown
            logger.info('[worker {}] shutting down'.format(title))
            break
        logger.info('[user {}]'.format(uid))
        url = TA_ROOT + 'MemberOverlay?uid=' + uid
        simple_soup = common.load_soup_online(url).find(
            'div', class_='memberOverlay')
        # newer markup links to /members/<name>; fall back to the older
        # /MemberProfile-a_uid.<UID> form
        profile_url = re.search('(?<=")/members/.+(?=")', str(simple_soup))
        if profile_url is None:
            profile_url = re.search(
                '(?<=")/MemberProfile-a_uid.[A-Z0-9]+(?=")',
                str(simple_soup))
        result = []
        if profile_url is not None:
            profile_url = TA_ROOT + profile_url.group(0).strip()
            result.append(simple_soup.prettify())
            detail_soup = common.load_soup_online(profile_url)
            member_soup = detail_soup.find('div', id='MODULES_MEMBER_CENTER')
            if member_soup is not None:
                result.append(member_soup.prettify())
                # overlay + member-center stored as one CRLF-joined blob
                record = [uid, '\r\n'.join(result)]
                with lock:
                    with taDB(common.TA_DB) as db:
                        db.insert_a_user(record)
            else:
                if '404' in detail_soup.find('title').string:
                    # profile page is gone: purge the uid from reviews
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.remove_user_id_in_review(uid)
                    logger.info('\tuser id removed')
                else:
                    # transient fetch failure: retry later
                    logger.info('\tfailed to fetch full profile')
                    que.put(uid)
        else:
            # overlay had no profile link: retry later
            logger.info('\tno profile url')
            que.put(uid)
        time.sleep(common.SLEEP_TIME)
        que.task_done()
def user_is_valid(uid):
    """Check that a cached profile exists for *uid* and still contains
    the expected member-center markup."""
    with taDB(common.TA_DB) as db:
        row = db.read_a_user(uid)
    # missing record or missing HTML both fail silently
    if row is None or row[0] is None:
        return False
    page = common.load_soup_string(row[0])
    if page.find('div', id='MODULES_MEMBER_CENTER') is not None:
        logger.info('[user {}] PASSED: verified'.format(uid))
        return True
    logger.info('[user {}] FAILED: corrupted'.format(uid))
    return False
def start(gid):
    """Collect review ids for every hotel in location *gid* with a pool
    of worker threads fed from a shared queue.

    Only hotels whose stored review index fails
    :func:`review_index_is_valid` are queued for (re-)collection.
    """
    def gather_review_ids(title):
        # Worker loop: pull {hotel_id: url} pairs, scrape review ids, and
        # store the hotel record; failed pairs go back on the queue.
        while True:
            logger.info('[worker {}] running'.format(title))
            cur_pair = que.get()
            if cur_pair is None:
                # poison pill: orderly shutdown
                logger.info('[worker {}] shutting down'.format(title))
                break
            hid, hurl = next(iter(cur_pair.items()))
            hurl = TA_ROOT + hurl
            logger.info('[hotel {}] {}'.format(hid, hurl))
            html, rno, rid_list = find_review_ids(hid, hurl)
            if rid_list is not None:
                record = [hid, html, gid, rno, str(rid_list)]
                with lock:
                    with taDB(common.TA_DB) as idb:
                        idb.insert_a_hotel(record)
            else:
                logger.info('\ttry again later')
                que.put(cur_pair)
            que.task_done()

    que = queue.Queue()
    with taDB(common.TA_DB) as iodb:
        hid_pairs = iodb.get_hotel_url_pairs(gid)
    threads = []
    thread_size = common.DETAIL_THREAD_NUM
    for j in range(thread_size):
        # BUG FIX: args must be a tuple.  (str(j + 1)) is just a string,
        # which Thread unpacks character-by-character — for j >= 9 the
        # two-character title would be passed as two positional args.
        t = threading.Thread(target=gather_review_ids, args=(str(j + 1),))
        t.start()
        threads.append(t)
    logger.info('hid_pairs -- {}'.format(len(hid_pairs)))
    # plain loop instead of a side-effect list comprehension
    for key in hid_pairs:
        if not review_index_is_valid(key):
            que.put({key: hid_pairs[key]})
    que.join()
    for _ in range(thread_size):
        que.put(None)  # one poison pill per worker
    for t in threads:
        t.join()
    logger.info('all review ids are ready')
def save_reviews(web_data):
    """Parse the review divs out of an AJAX response, persist them, and
    return the list of review ids found in the payload."""
    soup = common.load_soup_string(web_data)
    uid_pattern = re.compile('[A-Z0-9]{32}')
    rows = []
    found_ids = []
    for div in soup.find_all('div', id=re.compile('review_\d+')):
        # div id is 'review_<rid>'; strip the fixed prefix
        rid = div['id'][len('review_'):]
        html = div.prettify()
        match = uid_pattern.search(html)
        uid = match.group(0) if match is not None else None
        found_ids.append(rid)
        rows.append((rid, html, uid))
    with lock:
        with taDB(common.TA_DB) as db:
            db.insert_many_reviews(rows)
    return found_ids
def gather_review_ids(title):
    """Queue worker: pull {hotel_id: url} pairs from ``que``, scrape their
    review ids, and store each hotel record; failed pairs are re-queued."""
    while True:
        # logger.info('[worker {}] running'.format(title))
        pair = que.get()
        if pair is None:
            # poison pill: orderly shutdown
            logger.info('[worker {}] shutting down'.format(title))
            break
        hid, path = next(iter(pair.items()))
        full_url = TA_ROOT + path
        logger.info('[hotel {}] {}'.format(hid, full_url))
        html, rno, rid_list = find_review_ids(hid, full_url)
        if rid_list is None:
            # scrape failed — push the pair back for a later retry
            logger.info('\ttry again later')
            que.put(pair)
        else:
            with lock:
                with taDB(common.TA_DB) as idb:
                    idb.insert_a_hotel([hid, html, gid, rno, str(rid_list)])
        que.task_done()
def review_index_is_valid(hid):
    """Validate the stored review index for hotel *hid*.

    Returns True when the hotel record exists, the stored review-id list
    has no duplicates, and it contains at least as many ids as the
    recorded review count (extra ids are tolerated).
    """
    with taDB(common.TA_DB) as db:
        record = db.read_a_hotel(hid)
    if record is None:
        return False
    rno = record[3]                      # review count from the listing page
    rids = ast.literal_eval(record[4])   # serialized list of review ids
    is_having = len(rids)
    # Fewer ids than reported, or duplicated ids -> corrupted index.
    # (Folded the former bare print() diagnostic into the log message.)
    if rno > is_having or is_having != len(set(rids)):
        logger.info('[hotel {}] FAILED: corrupted '
                    '(should_have {}, is_having {})'
                    .format(hid, rno, is_having))
        return False
    if rno < is_having:
        logger.info('[hotel {}] PASSED: extra reviews'.format(hid))
    else:
        logger.info('[hotel {}] PASSED: verified'.format(hid))
    return True
_.SLEEP_TIME, _.SNIPPET_THREAD_NUM, _.DETAIL_THREAD_NUM, _.REVIEW_THREAD_NUM, _.USER_THREAD_NUM)) # location list url_list = config['LOCATION']['List'].split(';') logging.info('{} locations found'.format(len(url_list))) return url_list if __name__ == "__main__": init_logger() fn = common.TA_DB if not isfile(fn): with taDB(common.TA_DB) as db: db.create_tables() logging.info('database {} created.'.format(fn)) urls = load_config() # for url in urls: # gid = re.sub('\D', '', url) # crawlSnippets.start(gid, url.strip()) # crawlHotels.start(gid) # crawlReviews.start(gid) # crawlUsers.start() # with taDB(common.TA_DB) as db: # db.extract_hotel_info() # db.extract_review_info() # db.extract_user_info()
def gather_hotels(title):
    """Queue worker: fetch one hotel-listing page per queued page index and
    merge its {hotel_id: url} pairs into the shared ``hid_pairs`` dict.

    NOTE(review): relies on enclosing/module scope for ``que``, ``lock``,
    ``gid``, ``init_url``, ``num_page``, ``num_hotel``, ``hid_pairs`` and
    ``HOTEL_PER_PAGE`` — confirm where this definition actually lives.
    """
    def calc_max_page(soup_container):
        # total pages = ceil(hotel count / hotels per page)
        return math.ceil(find_num_hotels(soup_container) / HOTEL_PER_PAGE)

    def find_hotel_ids(url_str):
        soup_container = common.load_soup_online(url_str)
        hdr = soup_container.find('div', class_='hdrTxt')
        if num_page == 1 and hdr is not None:
            # single-page listing: only take deals above the header block
            divs_soup = hdr.find_all_previous(
                'div', id=re.compile('^HOTELDEAL\d+'))
        else:
            divs_soup = soup_container.find_all(
                'div', id=re.compile('^HOTELDEAL\d+'))
        page_pairs = []
        for link in divs_soup:
            # len('HOTELDEAL') = 9, so [9:] strips the prefix
            pair_hid = link['id'][9:]
            pair_url = link.find('div', class_='listing_title').find('a')['href']
            # drop the leading '/' so TA_ROOT can be prepended later
            page_pairs.append({pair_hid: pair_url[1:]})
        return page_pairs

    def update_hotel_ids(new_pairs, pair_list):
        for new_pair in new_pairs:
            pair_key, pair_value = next(iter(new_pair.items()))
            # if hotel id not duplicate
            if pair_key not in pair_list:
                pair_list[pair_key] = pair_value

    while True:
        # logger.info('[worker {}] running'.format(title))
        pid = que.get()
        if pid is None:
            # poison pill: orderly shutdown
            logger.info('[worker {}] shutting down'.format(title))
            break
        # listing query string; 'o=a<offset>' selects the page
        paras = '&'.join([
            'seen=0', 'sequence=1', 'geo=' + gid,
            'requestingServlet=Hotels', 'refineForm=true',
            'hs=', 'adults=2', 'rooms=1',
            'o=a' + str(pid * HOTEL_PER_PAGE),
            'pageSize=&rad=0', 'dateBumped=NONE',
            'displayedSortOrder=popularity'
        ])
        page_url = ''.join([init_url, '?', paras])
        logger.info('[page {}] {}'.format(pid + 1, page_url))
        # print('aa')
        hotels = find_hotel_ids(page_url)
        # print('bb')
        if hotels is None:
            que.put(pid)
        elif len(hotels) < HOTEL_PER_PAGE and pid < num_page - 1:
            # a non-final page came back short: retry it
            que.put(pid)
        elif pid == num_page - 1 \
                and len(hotels) < num_hotel % HOTEL_PER_PAGE:
            # final page shorter than the expected remainder: retry
            # NOTE(review): when num_hotel is an exact multiple of
            # HOTEL_PER_PAGE the remainder is 0 and this never fires —
            # confirm that is intended.
            que.put(pid)
        else:
            with lock:
                update_hotel_ids(hotels, hid_pairs)
                logger.info('\t#{}, totaling {}'.format(
                    pid, len(hid_pairs)))
                # checkpoint the accumulated pairs after every page
                with taDB(common.TA_DB) as db:
                    record = [gid, str(hid_pairs)]
                    db.insert_a_location(record)
        time.sleep(common.SLEEP_TIME)
        que.task_done()
def start(gid, init_url):
    """Crawl the hotel listing pages for location *gid* starting at
    *init_url* and persist {hotel_id: url} pairs until every hotel in the
    location has been collected.
    """
    def gather_hotels(title):
        # Worker: fetch one listing page per queued page index and merge
        # its hotel id/url pairs into the shared hid_pairs dict.

        def find_hotel_ids(url_str):
            soup_container = common.load_soup_online(url_str)
            hdr = soup_container.find('div', class_='hdrTxt')
            if num_page == 1 and hdr is not None:
                # single-page listing: only take deals above the header
                divs_soup = hdr.find_all_previous(
                    'div', id=re.compile('^HOTELDEAL\d+'))
            else:
                divs_soup = soup_container.find_all(
                    'div', id=re.compile('^HOTELDEAL\d+'))
            page_pairs = []
            for link in divs_soup:
                # len('HOTELDEAL') = 9, so [9:] strips the prefix
                # (the original comment claimed 6, which was wrong)
                pair_hid = link['id'][9:]
                pair_url = link.find(
                    'div', class_='listing_title').find('a')['href']
                # drop the leading '/' so TA_ROOT can be prepended later
                page_pairs.append({pair_hid: pair_url[1:]})
            return page_pairs

        def update_hotel_ids(new_pairs, pair_list):
            # merge, keeping the first url seen for each hotel id
            for new_pair in new_pairs:
                pair_key, pair_value = next(iter(new_pair.items()))
                if pair_key not in pair_list:
                    pair_list[pair_key] = pair_value

        while True:
            # logger.info('[worker {}] running'.format(title))
            pid = que.get()
            if pid is None:
                # poison pill: orderly shutdown
                logger.info('[worker {}] shutting down'.format(title))
                break
            # listing query string; 'o=a<offset>' selects the page
            paras = '&'.join([
                'seen=0', 'sequence=1', 'geo=' + gid,
                'requestingServlet=Hotels', 'refineForm=true',
                'hs=', 'adults=2', 'rooms=1',
                'o=a' + str(pid * HOTEL_PER_PAGE),
                'pageSize=&rad=0', 'dateBumped=NONE',
                'displayedSortOrder=popularity'
            ])
            page_url = ''.join([init_url, '?', paras])
            logger.info('[page {}] {}'.format(pid + 1, page_url))
            hotels = find_hotel_ids(page_url)
            if hotels is None:
                que.put(pid)
            elif len(hotels) < HOTEL_PER_PAGE and pid < num_page - 1:
                # a non-final page came back short: retry it
                que.put(pid)
            elif pid == num_page - 1 \
                    and len(hotels) < num_hotel % HOTEL_PER_PAGE:
                # final page shorter than the expected remainder: retry
                que.put(pid)
            else:
                with lock:
                    update_hotel_ids(hotels, hid_pairs)
                    logger.info('\t#{}, totaling {}'.format(
                        pid, len(hid_pairs)))
                    # checkpoint the accumulated pairs after every page
                    with taDB(common.TA_DB) as db:
                        record = [gid, str(hid_pairs)]
                        db.insert_a_location(record)
            time.sleep(common.SLEEP_TIME)
            que.task_done()

    # location name sits between the geo id and the trailing '-' segment
    loc_name = init_url[
        init_url.index(gid) + len(gid) + 1:init_url.rindex('-')]
    logger.info('[location {}] {}'.format(gid, loc_name.replace('_', ' ')))
    soup = common.load_soup_online(init_url)
    num_page = find_max_page(soup)
    logger.info('hotels Pages {} '.format(num_page))
    num_hotel = find_num_hotels(soup)
    logger.info('{} hotels in {} pages'.format(num_hotel, num_page))
    with taDB(common.TA_DB) as iodb:
        hid_pairs = iodb.get_hotel_url_pairs(gid)
    logger.info('{} hotels in local cache'.format(len(hid_pairs)))
    # collecting hotel ids might take multiple iterations
    while len(hid_pairs) < num_hotel:
        que = queue.Queue()
        threads = []
        thread_size = common.SNIPPET_THREAD_NUM
        for j in range(thread_size):
            # BUG FIX: args must be a tuple.  (str(j + 1)) is just a
            # string, which Thread unpacks character-by-character — for
            # j >= 9 the two-character title becomes two positional args.
            t = threading.Thread(target=gather_hotels, args=(str(j + 1),))
            t.start()
            threads.append(t)
        # set start value to math.ceil(len(hid_pairs) / HOTEL_PER_PAGE)
        # rather than 0 if the hotels are ordered in the list
        for x in range(0, num_page):
            que.put(x)
        que.join()
        for _ in range(thread_size):
            que.put(None)  # one poison pill per worker
        for t in threads:
            t.join()
    # (dropped the unused local 's1 = gid' and a no-op .format() call on
    # this placeholder-less message)
    logger.info('all hotel ids are ready')
def start():
    """Fetch and cache a profile page for every unique reviewer id found
    in the stored reviews, using a pool of worker threads."""

    def user_is_valid(uid):
        # True when a cached profile exists and still contains the
        # member-center markup; used to skip already-collected users.
        with taDB(common.TA_DB) as db:
            user_record = db.read_a_user(uid)
        if user_record is None:
            return False
        html = user_record[0]
        if html is None:
            return False
        soup = common.load_soup_string(html)
        if soup.find('div', id='MODULES_MEMBER_CENTER') is None:
            logger.info('[user {}] FAILED: corrupted'.format(uid))
            return False
        logger.info('[user {}] PASSED: verified'.format(uid))
        return True

    def gather_profiles(title):
        # Worker: resolve each uid's overlay to a full profile URL, fetch
        # the profile, and store it; transient failures re-queue the uid.
        while True:
            uid = que.get()
            if uid is None:
                # poison pill: orderly shutdown
                logger.info('[worker {}] shutting down'.format(title))
                break
            logger.info('[user {}]'.format(uid))
            url = TA_ROOT + 'MemberOverlay?uid=' + uid
            simple_soup = common.load_soup_online(url).find(
                'div', class_='memberOverlay')
            # newer markup links to /members/<name>; fall back to the
            # older /MemberProfile-a_uid.<UID> form
            profile_url = re.search('(?<=")/members/.+(?=")',
                                    str(simple_soup))
            if profile_url is None:
                profile_url = re.search(
                    '(?<=")/MemberProfile-a_uid.[A-Z0-9]+(?=")',
                    str(simple_soup))
            result = []
            if profile_url is not None:
                profile_url = TA_ROOT + profile_url.group(0).strip()
                result.append(simple_soup.prettify())
                detail_soup = common.load_soup_online(profile_url)
                member_soup = detail_soup.find(
                    'div', id='MODULES_MEMBER_CENTER')
                if member_soup is not None:
                    result.append(member_soup.prettify())
                    # overlay + member-center stored as one CRLF blob
                    record = [uid, '\r\n'.join(result)]
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.insert_a_user(record)
                elif '404' in detail_soup.find('title').string:
                    # profile page is gone: purge the uid from reviews
                    with lock:
                        with taDB(common.TA_DB) as db:
                            db.remove_user_id_in_review(uid)
                    logger.info('\tuser id removed')
                else:
                    logger.info('\tfailed to fetch full profile')
                    que.put(uid)
            else:
                logger.info('\tno profile url')
                que.put(uid)
            time.sleep(common.SLEEP_TIME)
            que.task_done()

    # extract unique user ids from reviews
    logger.info('retrieving users...')
    with taDB(common.TA_DB) as iodb:
        iodb.generate_unique_users()
        uids = iodb.read_all_user_ids()
    logger.info('{} users found'.format(len(uids)))
    que = queue.Queue()
    threads = []
    thread_size = common.USER_THREAD_NUM
    for j in range(thread_size):
        # BUG FIX: args must be a tuple.  (str(j + 1)) is just a string,
        # which Thread unpacks character-by-character — for j >= 9 the
        # two-character title would be passed as two positional args.
        t = threading.Thread(target=gather_profiles, args=(str(j + 1),))
        t.start()
        threads.append(t)
    # plain loop instead of a side-effect list comprehension
    for x in uids:
        if not user_is_valid(x):
            que.put(x)
    que.join()
    for _ in range(thread_size):
        que.put(None)  # one poison pill per worker
    for t in threads:
        t.join()
    logger.info('all user ids are ready')