def process_reviews(self, rs, item_id, id_db, id_list):
    """Collect review sentences from an Amazon Reviews object.

    Args:
        rs: Amazon Reviews object exposing full_reviews().
        item_id: product identifier (currently unused by this method).
        id_db: review ids already stored in the db; when non-empty,
            reviews whose id appears in it are skipped.
        id_list: review ids already scraped this session; mutated in
            place as new reviews are accepted.

    Returns:
        (count, contents, review_ids, ratings, review_sentence_num):
        count of accepted reviews, a flat list of sentences across all
        accepted reviews, their ids, their star ratings, and the
        per-review sentence counts.
    """
    count = 0
    contents = []
    review_ids = []
    ratings = []
    review_sentence_num = []
    # only consult id_db when it actually has entries
    check_db = len(id_db) > 0
    for r in rs.full_reviews():
        try:
            if self.debug:
                logging.debug("{} | {} | {}".format(
                    r.id, r.date, self._encode_safe(r.text)))
            # guard clauses replace the previously duplicated branches
            if r.text == "None":
                continue
            if r.id in id_list:
                print("Review already in current scraped list, pass")
                continue
            if check_db and r.id in id_db:
                print('scraped review is passed as it is in db')
                continue
            count += 1
            id_list.append(r.id)
            sentences = getSentencesFromReview(self._encode_safe(r.text))
            print("First sentence: " + sentences[0])
            contents.extend(sentences)
            review_sentence_num.append(len(sentences))
            review_ids.append(r.id)
            # rating directly from API is normalized to 1; rescale to stars
            ratings.append(float(r.rating) * 5)
        except Exception:
            logging.warn('Encoding problem with review {}'.format(r.id))
    return count, contents, review_ids, ratings, review_sentence_num
def process_reviews(self, rs):
    """Collect review sentences from an Amazon Reviews object.

    Args:
        rs: Amazon Reviews object exposing full_reviews().

    Returns:
        (count, contents): count is the number of reviews iterated
        (including ones skipped for "None" text, matching the original
        accounting); contents is a flat list of sentences extracted
        from every non-empty review.
    """
    count = 0
    contents = []
    for r in rs.full_reviews():
        # every review seen is counted, even when its text is skipped below
        count += 1
        try:
            if self.debug:
                logging.debug("{} | {} | {}".format(
                    r.id, r.date, self._encode_safe(r.text)))
            if r.text != "None":
                sentences = getSentencesFromReview(self._encode_safe(r.text))
                print("First sentence: " + sentences[0])
                contents.extend(sentences)
        except Exception:
            logging.warn('Encoding problem with review {}'.format(r.id))
    return count, contents
def upsert_all_reviews_bulk(review_file_path, meta_dict):
    """Bulk-upsert all reviews, one db write per product.

    Relies on the dump listing each product's reviews consecutively:
    fields are accumulated until the product id changes, at which point
    the finished product is upserted in a single call.

    Args:
        review_file_path: path handed to parse(), yielding review dicts
            with 'asin', 'reviewText', 'reviewerID' and 'overall' keys.
        meta_dict: product metadata keyed by product id; entries may
            carry 'product_name' and 'category'.
    """
    reviewParser = parse(review_file_path)
    client, db = connect_to_db()
    db_product_collection = db.product_collection
    db_product_collection.create_index([("product_id", ASCENDING)])
    i = 0
    print("building product_collection in database")
    # sentinel that cannot collide with a real ASIN
    product_id = "a"
    for review in reviewParser:
        i += 1
        if i % 1000 == 0:
            print(i)
        product_id_new = review['asin']
        contents_new = getSentencesFromReview(review['reviewText'])
        num_sentence = len(contents_new)
        review_id_new = review['reviewerID']
        rating_new = review['overall']
        if product_id_new == product_id:
            # same product: extend the accumulated fields
            contents = contents + contents_new
            review_ids.append(review_id_new)
            ratings.append(rating_new)
            review_ending_sentence.append(num_sentence + review_ending_sentence[-1])
            num_reviews += 1
        else:
            # new product: flush the previous one, then reset accumulators
            if i > 1:
                upsert_new_product(db_product_collection, product_id,
                                   product_name, category, contents,
                                   review_ids, ratings,
                                   review_ending_sentence, num_reviews,
                                   ft_senIdx, ft_score)
            product_id = product_id_new
            product_name = []
            category = []
            if product_id in meta_dict:
                product = meta_dict[product_id]
                if 'product_name' in product:
                    product_name = product['product_name']
                if 'category' in product:
                    category = product['category']
            contents = contents_new
            review_ids = [review_id_new]
            ratings = [rating_new]
            review_ending_sentence = [num_sentence]
            num_reviews = 1
            ft_score = {}
            ft_senIdx = {}
    # BUGFIX: the last product was accumulated but never written, because
    # the flush above only fires when a *newer* product id is seen.
    if i > 0:
        upsert_new_product(db_product_collection, product_id, product_name,
                           category, contents, review_ids, ratings,
                           review_ending_sentence, num_reviews,
                           ft_senIdx, ft_score)
    client.close()
def upsert_review_for_product_id(review, db_product_collection, meta_dict):
    """Upsert a single review into the product_collection.

    If the review's product already exists in the collection, the
    review's sentences/id/rating are appended to the stored document;
    otherwise a fresh document is created.

    Args:
        review: dict with 'asin', 'reviewText', 'reviewerID', 'overall'.
        db_product_collection: pymongo collection holding products.
        meta_dict: product metadata keyed by product id.

    Returns:
        1 if the product has metadata with a non-empty category,
        otherwise 0.
    """
    product_id = review['asin']
    query_res = list(db_product_collection.find({"product_id": product_id}))
    contents_new = getSentencesFromReview(review['reviewText'])
    num_sentence = len(contents_new)
    review_id_new = review['reviewerID']
    rating_new = review['overall']
    isfound = 0
    # BUGFIX: default these so they are always defined — both are
    # referenced when building update_field, but were previously only
    # assigned when product_id appeared in meta_dict (NameError otherwise).
    product_name = []
    category = []
    if product_id in meta_dict:
        product = meta_dict[product_id]
        product_name = product['product_name']
        category = product['category']
        if len(category) > 0:
            isfound = 1
    if len(query_res) > 0:
        # product already stored: append to the existing fields
        doc = query_res[0]
        contents = doc["contents"] + contents_new
        review_ids = doc["review_ids"]
        ratings = doc["ratings"]
        review_ids.append(review_id_new)
        ratings.append(rating_new)
        review_ending_sentence_list = doc["review_ending_sentence"]
        review_ending_sentence_list.append(num_sentence + review_ending_sentence_list[-1])
        num_reviews = doc["num_reviews"] + 1
        update_field = {
            "contents": contents,
            "review_ids": review_ids,
            "ratings": ratings,
            "review_ending_sentence": review_ending_sentence_list,
            "num_reviews": num_reviews,
            "category": category
        }
    else:
        # first review for this product: build a fresh document
        update_field = {
            "contents": contents_new,
            "product_name": product_name,
            "review_ids": [review_id_new],
            "ratings": [rating_new],
            "review_ending_sentence": [num_sentence],
            "num_reviews": 1,
            "category": category,
            "ft_score": {},
            "ft_senIdx": {}
        }
    query = {"product_id": product_id}
    # third positional argument is upsert=True: create when missing
    db_product_collection.update(query, {"$set": update_field}, True)
    return isfound
def scrape_reviews_hard(productID, prod_review_ids_db, max_scrape_loop=1, current_loop=0):
    """Scrape the top "most helpful" reviews directly from the product page.

    Works without a userID or AmazonScrape object, but can only reach the
    five top-ranked reviews. On failure it sleeps briefly and retries
    (recursively) up to max_scrape_loop times.

    Args:
        productID: Amazon product id used to fetch the page.
        prod_review_ids_db: review ids already in the db; those are skipped.
        max_scrape_loop: maximum number of retry attempts.
        current_loop: retry counter carried through the recursion.

    Returns:
        (product_name, contents, review_ids, ratings,
         review_ending_sentence, scraped_pages_new); six empty lists when
        the retry budget is exhausted.
    """
    if current_loop > max_scrape_loop:
        return [], [], [], [], [], []
    try:
        current_loop += 1
        doc = getWebPage(productID)
        XPATH_NAME = '//h1[@id="title"]//text()'
        XPATH_RATINGS = '//div[contains(@id, "rev-dpReviewsMostHelpfulAUI")]/div/div/a/i/span//text()'
        XPATH_REVIEWS_IDS = '//div[contains(@id, "rev-dpReviewsMostHelpfulAUI")]/a[2]/@id'
        RAW_NAME = doc.xpath(XPATH_NAME)
        RAW_RATINGS = doc.xpath(XPATH_RATINGS)
        # rating text starts with the number (e.g. "4.0 ..."); keep the digits
        ratings = [int(float((x[:3]))) for x in RAW_RATINGS]
        RAW_REVIEWS_IDS = doc.xpath(XPATH_REVIEWS_IDS)
        product_name = ' '.join(''.join(RAW_NAME).split()) if RAW_NAME else None
        # id attribute looks like "RXXXX.suffix"; keep the part before the dot
        review_ids = [x[:x.index(".")] for x in RAW_REVIEWS_IDS]
        contents = []
        review_sentence_num = []
        ind_new_review = []
        for index in range(len(review_ids)):
            review_id = review_ids[index]
            if review_id in prod_review_ids_db:
                print("scraped review is passed by backup_scraper as it is in db")
                continue
            ind_new_review.append(index)
            XPATH_REVIEW_BODY = '//div[contains(@id, "revData-dpReviewsMostHelpfulAUI-%s")]/div//text()' % review_id
            RAW_REVIEW_BODY = doc.xpath(XPATH_REVIEW_BODY)
            review_content = ""
            for RAW_REVIEW in RAW_REVIEW_BODY:
                review = RAW_REVIEW.strip().encode('utf-8').decode('utf-8')
                review_content += (review + " ")
            review_sentences = getSentencesFromReview(review_content)
            print("First sentence: {0}".format(review_sentences[0]))
            review_sentence_num.append(len(review_sentences))
            contents.extend(review_sentences)
        if len(ind_new_review) > 0:
            print('new reviews available from scrape_reviews_hard')
            review_ids = [review_ids[j] for j in ind_new_review]
            ratings = [ratings[j] for j in ind_new_review]
        else:
            review_ids = []
            ratings = []
        # cumulative sentence index marking where each kept review ends
        if len(review_sentence_num) == 0:
            review_ending_sentence = []
        else:
            review_ending_sentence = [0]
            for num in review_sentence_num:
                review_ending_sentence.append(num + review_ending_sentence[-1])
            review_ending_sentence = review_ending_sentence[1:]
        scraped_pages_new = [1]
        return product_name, contents, review_ids, ratings, review_ending_sentence, scraped_pages_new
    except Exception:
        time.sleep(int(random.random() * 1.5 + 1) + random.random())
        print('scraper failed, reinitiate for the %d th time' % current_loop)
        # BUGFIX: the retry previously passed the undefined name `checker`
        # instead of prod_review_ids_db, raising NameError inside the
        # except handler instead of actually retrying.
        return scrape_reviews_hard(productID, prod_review_ids_db, max_scrape_loop, current_loop)