def review_result_is_valid(hotel_id):
    """Check that the stored reviews of a hotel are present and well-formed.

    The hotel record is read from the database; ``record[3]`` is used as
    the review count and ``record[4]`` as the string form of a Python list
    of review ids. Every listed review must have a stored HTML snippet
    containing a ``<div id="review_<rid>">`` element.

    Args:
        hotel_id: Identifier passed to ``db.read_a_hotel``.

    Returns:
        True when the hotel has no reviews or every listed review passes
        the checks; False when the hotel or a review record is missing,
        more ids are stored than the review count, or any review HTML is
        absent or lacks the expected element.
    """
    with taDB(common.TA_DB) as db:
        record = db.read_a_hotel(hotel_id)
    if record is None:
        return False

    # Normalise once: the count may not be stored as an int, and comparing
    # a str with len(...) below would raise TypeError on Python 3.
    rno = int(record[3])
    if rno == 0:
        logger.info('[hotel {}] PASSED: no reviews'.format(hotel_id))
        return True

    # NOTE(review): literal_eval only accepts Python literals, so this is
    # safe against code execution but still raises on malformed data.
    rids = ast.literal_eval(record[4])
    if rno < len(rids):
        return False

    with taDB(common.TA_DB) as db:
        for rid in rids:
            rrecord = db.read_a_review(rid)
            if rrecord is None:
                return False
            html = rrecord[1]
            if html is None:
                logger.info(
                    '[hotel {}] FAILED: HTML is absent'.format(hotel_id))
                return False
            rec_soup = common.load_soup_string(html)
            # format() rather than str.join so that both string and
            # integer review ids produce the expected element id.
            if rec_soup.find('div', id='review_{}'.format(rid)) is None:
                # Keep the offending document available for diagnosis
                # without flooding stdout.
                logger.debug(html)
                logger.info(
                    '[hotel {}] FAILED: corrupted HTML'.format(hotel_id))
                return False

    logger.info('[hotel {}] PASSED: verified'.format(hotel_id))
    return True
def __init__(self, html):
    """Parse *html* and decode its embedded JSON-LD payload.

    Args:
        html: Markup handed to ``common.load_soup_string``.

    The parsed tree is kept in ``self._soup``. The text of the first
    ``<script type="application/ld+json">`` element is decoded with
    ``json.loads`` and kept in ``self._json``.
    """
    self._soup = common.load_soup_string(html)

    # JSON: pull the raw text out of the ld+json script element, then
    # decode it.
    ld_script = self._soup.find('script', type='application/ld+json')
    raw_json = str(ld_script.getText())
    self._json = json.loads(raw_json)
def __init__(self, html):
    """Parse one review's HTML and cache the sub-trees read from it.

    Args:
        html: Markup handed to ``common.load_soup_string``; the first
            ``div`` with class ``reviewSelector`` becomes the root.

    Sets ``rid``, ``uid`` (``None`` when no ``UID_`` element is found
    inside the ``member_info`` div) and the private ``_review_soup``,
    ``_bubble``, ``_inline`` and ``_rec_bar`` nodes.
    """
    root = common.load_soup_string(html).find(
        'div', class_='reviewSelector')
    self._review_soup = root

    # Review id: the element id minus its 'review_' prefix (7 chars).
    self.rid = root['id'].strip()[len('review_'):]

    # User id (optional): the element id looks like 'UID_' followed by
    # the id itself, so skip 4 characters and keep the next 32.
    member_info = root.find('div', class_='member_info')
    uid_node = member_info.find('div', id=re.compile('^UID_'))
    if uid_node is None:
        self.uid = None
    else:
        self.uid = uid_node['id'].strip()[4:36]

    # BUBBLE: the 'wrap' div nested inside 'innerBubble'.
    inner_bubble = root.find('div', class_='innerBubble')
    self._bubble = inner_bubble.find('div', class_='wrap')

    # RATING
    self._inline = self._bubble.find('div', class_='reviewItemInline')

    # RECOMMEND
    self._rec_bar = self._bubble.find('div', class_='rating-list')
def __init__(self, html):
    """Parse *html* and locate the three top-level profile sections.

    Args:
        html: Markup handed to ``common.load_soup_string``.

    Each section attribute is the first matching ``div`` or ``None``
    when the document has no such element.
    """
    tree = common.load_soup_string(html)
    self.soup = tree

    # OVERLAY
    self.overlay = tree.find('div', class_='memberOverlay')

    # LEFT PROFILE
    self.left_profile = tree.find('div', class_='leftProfile')

    # RIGHT CONTRIBUTIONS
    self.right_con = tree.find('div', class_='rightContributions')
def user_is_valid(uid):
    """Tell whether the stored page of user *uid* looks intact.

    Args:
        uid: Identifier passed to ``db.read_a_user``.

    Returns:
        False when there is no record, the record's first field (the
        HTML) is ``None``, or the HTML has no
        ``<div id="MODULES_MEMBER_CENTER">``; True otherwise.
    """
    with taDB(common.TA_DB) as db:
        row = db.read_a_user(uid)

    # Missing record and missing HTML are both silent failures.
    page_html = None if row is None else row[0]
    if page_html is None:
        return False

    page = common.load_soup_string(page_html)
    verified = page.find('div', id='MODULES_MEMBER_CENTER') is not None
    if verified:
        logger.info('[user {}] PASSED: verified'.format(uid))
    else:
        logger.info('[user {}] FAILED: corrupted'.format(uid))
    return verified
def save_reviews(web_data):
    """Extract every review block from a page and store it in the database.

    Each ``div`` whose id matches ``review_<digits>`` yields one record
    ``(review_id, prettified_html, user_id)``. ``user_id`` is the first
    run of 32 upper-case letters/digits found in the block's HTML, or
    ``None`` when there is none.

    Args:
        web_data: Markup handed to ``common.load_soup_string``.

    Returns:
        The list of review ids found, in document order.
    """
    web_soup = common.load_soup_string(web_data)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on Python 3.12+).
    review_soups = web_soup.find_all('div', id=re.compile(r'review_\d+'))

    # Compiled once rather than looked up on every iteration.
    uid_pattern = re.compile('[A-Z0-9]{32}')

    records = []
    for review in review_soups:
        # len('review_') = 7
        rid = review['id'][7:]
        html = review.prettify()
        uid_match = uid_pattern.search(html)
        uid = uid_match.group(0) if uid_match is not None else None
        records.append((rid, html, uid))

    # The insert runs while holding the module-level lock.
    with lock:
        with taDB(common.TA_DB) as db:
            db.insert_many_reviews(records)

    return [record[0] for record in records]