class Review_Minion(Minion):
    def __init__(self, max_sleep_time, host, port):
        super().__init__(max_sleep_time, host, port)
        self.api_url = "http://{}:{}/api_review".format(self.host, self.port)
        self.base_url = "https://www.goodreads.com/review/show/"
        self.parser = Review_Parser()
        self.data_type = "review"

    def parse(self):
        if not self.soup_tuple_queue.empty():
            soup_tuple = self.soup_tuple_queue.get()
            id, soup = soup_tuple
            try:
                is_review_valid = self.parser.review_soup_is_valid(soup)
                if is_review_valid:
                    date = self.parser.review_soup_to_date(soup)
                    book_title = self.parser.review_soup_to_book_title(soup)
                    book_id = self.parser.review_soup_to_book_id(soup)
                    rating = self.parser.review_soup_to_rating(soup)
                    reviewer_href = self.parser.review_soup_to_reviewer_href(
                        soup)
                    progress_dict = self.parser.review_soup_to_progress_dict(
                        soup)
                    start_date = self.parser.progress_dict_to_start_date(
                        progress_dict)
                    finished_date = self.parser.progress_dict_to_finish_date(
                        progress_dict)
                    shelved_date = self.parser.progress_dict_to_shelved_date(
                        progress_dict)
                else:
                    # Invalid review page: record None for every field.
                    date = None
                    book_title = None
                    book_id = None
                    rating = None
                    reviewer_href = None
                    start_date = None
                    finished_date = None
                    shelved_date = None
                data_string = "{},{},{},{},{},{},{},{},{},{}".format(
                    id, is_review_valid, date, book_title, book_id, rating,
                    reviewer_href, start_date, finished_date, shelved_date)
                self.data_strings_queue.put(data_string)
            except AttributeError:
                print("Unable to parse {} id = {}. Discarding soup.".format(
                    self.data_type, id))
            self.soup_tuple_queue.task_done()
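# A minimal usage sketch for Review_Minion (an assumption, not part of the
# repo): it presumes the Minion base class exposes soup_tuple_queue and
# data_strings_queue as queue.Queue instances and stores host/port from its
# constructor. The page HTML is read from a hypothetical saved file here;
# in the real pipeline the soup tuples are presumably produced by a fetch
# stage elsewhere.
import queue

from bs4 import BeautifulSoup

minion = Review_Minion(max_sleep_time=5, host="localhost", port=5000)
html = open("review_123456.html").read()  # hypothetical saved review page
minion.soup_tuple_queue.put((123456, BeautifulSoup(html, "html.parser")))
minion.parse()
try:
    # One CSV row: id, validity flag, and the parsed review fields.
    print(minion.data_strings_queue.get_nowait())
except queue.Empty:
    pass  # the page could not be parsed and was discarded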
reviews_list = [
    rev_url_med, rev_url_twilight, rev_url_gatsby, rev_url_error,
    rev_url_grounded
]
review_input_format = ["string_format", "bytes_format"]
review_parser_methods = [
    "review_soup_is_valid", "review_soup_to_date",
    "review_soup_to_book_title", "review_soup_to_book_id",
    "review_soup_to_rating", "review_soup_to_reviewer_href",
    "progress_dict_to_start_date", "progress_dict_to_finish_date",
    "progress_dict_to_shelved_date"
]

scraper = Scraper()
parser = Review_Parser()


def run_data_test():
    log_file = open("review_data_diagnosis_log.csv", "a")
    log_file.write(
        "rev_name,input_format,parser_method,parse_output,parse_output_type")
    for review_url in reviews_list:
        print("Analyzing: {}".format(review_url))
        for format in review_input_format:
            ## CONVERT URL TO HTML, DEPENDING ON THE FORMAT
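# The loop body above is truncated. Below is a hedged sketch of what the
# URL-to-soup conversion could look like; it uses requests directly rather
# than the repo's Scraper class (whose API is not shown here), and the
# url_to_soup helper is hypothetical. "string_format" feeds BeautifulSoup
# the decoded response text, while "bytes_format" feeds it the raw bytes,
# which is the distinction the two review_input_format entries suggest.
import requests
from bs4 import BeautifulSoup


def url_to_soup(url, input_format):  # hypothetical helper, not in the repo
    response = requests.get(url)
    if input_format == "string_format":
        return BeautifulSoup(response.text, "html.parser")
    return BeautifulSoup(response.content, "html.parser")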
class Review_Data_Collector(Data_Collector):
    def __init__(self, min_id, max_id, max_data_points, max_sleep_time,
                 file_name):
        super().__init__(max_sleep_time, file_name)
        # Specifics for review data collection
        self.id_list = range(min_id, max_id)
        self.base_url = "https://www.goodreads.com/review/show/"
        self.data_point_type = "Reviews"
        # Counter
        self.max_data_points = max_data_points
        # Review parser
        self.parser = Review_Parser()

    def generate_current_url(self):
        self.current_id = random.choice(self.id_list)
        self.current_url = self.base_url + str(self.current_id)

    def add_headers_to_log_file(self):
        # log_data() below writes the first ten columns; log_time is
        # presumably appended elsewhere in the logging path.
        self.datafile.write(
            "ID,is_URL_valid,review_publication_date,book_title,book_id,"
            "rating,reviewer_href,started_reading_date,"
            "finished_reading_date,shelved_date,log_time")

    def parse(self):
        self.is_current_valid = self.parser.review_soup_is_valid(
            self.current_soup)
        if self.is_current_valid:
            self.current_date = self.parser.review_soup_to_date(
                self.current_soup)
            self.current_book_title = self.parser.review_soup_to_book_title(
                self.current_soup)
            self.current_book_id = self.parser.review_soup_to_book_id(
                self.current_soup)
            self.current_rating = self.parser.review_soup_to_rating(
                self.current_soup)
            self.current_reviewer_href = self.parser.review_soup_to_reviewer_href(
                self.current_soup)
            self.current_progress_dict = self.parser.review_soup_to_progress_dict(
                self.current_soup)
            self.current_start_date = self.parser.progress_dict_to_start_date(
                self.current_progress_dict)
            self.current_finished_date = self.parser.progress_dict_to_finish_date(
                self.current_progress_dict)
            self.current_shelved_date = self.parser.progress_dict_to_shelved_date(
                self.current_progress_dict)
        else:
            # Invalid review page: record None for every field.
            self.current_date = None
            self.current_book_title = None
            self.current_book_id = None
            self.current_rating = None
            self.current_reviewer_href = None
            self.current_start_date = None
            self.current_finished_date = None
            self.current_shelved_date = None

    def log_data(self):
        self.datafile.write("\n{},{},{},{},{},{},{},{},{},{}".format(
            str(self.current_id), self.is_current_valid, self.current_date,
            self.current_book_title, self.current_book_id,
            self.current_rating, self.current_reviewer_href,
            self.current_start_date, self.current_finished_date,
            self.current_shelved_date))
        self.data_points_counter += 1
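# A minimal collection-loop sketch (an assumption: the Data_Collector base
# class is presumed to provide datafile, data_points_counter, and a fetch
# step that turns current_url into current_soup; none of that is shown in
# this fragment, so the fetch is left as a placeholder comment).
collector = Review_Data_Collector(min_id=1, max_id=3_000_000,
                                  max_data_points=100, max_sleep_time=5,
                                  file_name="reviews.csv")
collector.add_headers_to_log_file()
while collector.data_points_counter < collector.max_data_points:
    collector.generate_current_url()  # pick a random review id, build its URL
    # ... fetch collector.current_url into collector.current_soup here ...
    collector.parse()
    collector.log_data()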