def __init__(self, max_sleep_time, host, port):
    """Configure a review minion: endpoint URLs, parser, and data label."""
    super().__init__(max_sleep_time, host, port)

    # Static pieces first, then the derived API endpoint.
    self.base_url = "https://www.goodreads.com/review/show/"
    self.data_type = "review"
    self.parser = Review_Parser()
    # Coordinator endpoint this minion talks to for review work items.
    self.api_url = f"http://{self.host}:{self.port}/api_review"
class Review_Minion(Minion):
    """Worker that consumes Goodreads review soups off a queue and emits
    one CSV-formatted data string per review onto the output queue."""

    def __init__(self, max_sleep_time, host, port):
        """Configure endpoints, the review parser, and the data-type label."""
        super().__init__(max_sleep_time, host, port)

        # Coordinator endpoint this minion reports review work to.
        self.api_url = f"http://{self.host}:{self.port}/api_review"
        self.base_url = "https://www.goodreads.com/review/show/"
        self.parser = Review_Parser()
        self.data_type = "review"

    def parse(self):
        """Take one (id, soup) pair from the soup queue, extract review
        fields, and push a CSV line onto the data-strings queue.

        Soups that blow up with AttributeError during parsing are logged
        and discarded; the queue item is marked done either way.
        """
        # Guard clause: nothing queued, nothing to do.
        if self.soup_tuple_queue.empty():
            return

        entry = self.soup_tuple_queue.get()
        review_id, soup = entry[0], entry[1]

        try:
            is_review_valid = self.parser.review_soup_is_valid(soup)

            if is_review_valid:
                # Extraction order mirrors the CSV column order.
                fields = [
                    self.parser.review_soup_to_date(soup),
                    self.parser.review_soup_to_book_title(soup),
                    self.parser.review_soup_to_book_id(soup),
                    self.parser.review_soup_to_rating(soup),
                    self.parser.review_soup_to_reviewer_href(soup),
                ]

                # Reading-progress dates all come from one progress dict.
                progress_dict = self.parser.review_soup_to_progress_dict(soup)
                fields += [
                    self.parser.progress_dict_to_start_date(progress_dict),
                    self.parser.progress_dict_to_finish_date(progress_dict),
                    self.parser.progress_dict_to_shelved_date(progress_dict),
                ]
            else:
                # Invalid review: every data column is None.
                fields = [None] * 8

            data_string = "{},{},{},{},{},{},{},{},{},{}".format(
                review_id, is_review_valid, *fields)
            self.data_strings_queue.put(data_string)

        except AttributeError:
            # Malformed page markup: drop this soup rather than crash.
            print("Unable to parse {} id = {}. discarding soup".format(
                self.data_type, review_id))

        self.soup_tuple_queue.task_done()
# Example #3
# 0
    def __init__(self, min_id, max_id, max_data_points, max_sleep_time,
                 file_name):
        """Layer review-collection specifics on top of the base collector."""
        super().__init__(max_sleep_time, file_name)

        # Where review pages live and what kind of data points we log.
        self.base_url = "https://www.goodreads.com/review/show/"
        self.data_point_type = "Reviews"

        # Review ids are sampled from [min_id, max_id).
        self.id_list = range(min_id, max_id)

        # Stop collecting once this many data points have been logged.
        self.max_data_points = max_data_points

        # Parser used to pull fields out of each fetched review page.
        self.parser = Review_Parser()
# Sample review URLs used as fixtures for the parser diagnosis run.
# NOTE(review): the rev_url_* names are defined elsewhere in the file/chunk.
reviews_list = [
    rev_url_med, rev_url_twilight, rev_url_gatsby, rev_url_error,
    rev_url_grounded
]

# The two HTML input encodings exercised for each sample review.
review_input_format = ["string_format", "bytes_format"]
# Names of the Review_Parser methods probed by the data test.
review_parser_methods = [
    "review_soup_is_valid", "review_soup_to_date", "review_soup_to_book_title",
    "review_soup_to_book_id", "review_soup_to_rating",
    "review_soup_to_reviewer_href", "progress_dict_to_start_date",
    "progress_dict_to_finish_date", "progress_dict_to_shelved_date"
]

# Shared scraper/parser instances used by the module-level test helpers.
scraper = Scraper()
parser = Review_Parser()

def run_data_test():
    """Probe every Review_Parser method against each sample review URL and
    append the results to a CSV diagnosis log.

    NOTE(review): this definition is truncated in this chunk -- the inner
    loop body (URL-to-HTML conversion and the parser-method calls) is cut
    off, so the function is incomplete as seen here.
    """

    # NOTE(review): opened in append mode and never closed in the visible
    # code; presumably closed after the loops -- confirm in the full file.
    log_file = open("review_data_diagnosis_log.csv", "a")
    log_file.write(
        "rev_name,input_format,parser_method,parse_output,parse_output_type")

    for review_url in reviews_list:

        print("Analyzing: {}".format(review_url))

        # 'format' shadows the builtin; kept byte-identical per review scope.
        for format in review_input_format:

            ##CONVERT URL TO HTML, DEPENDING ON THE FORMAT
# Example #5
# 0
class Review_Data_Collector(Data_Collector):
    """Collects data for randomly chosen Goodreads review ids and appends
    one CSV row per review to the collector's log file."""

    def __init__(self, min_id, max_id, max_data_points, max_sleep_time,
                 file_name):
        """Set up review-collection specifics on top of the base collector.

        min_id/max_id bound the review-id sample space [min_id, max_id);
        max_data_points caps how many rows are collected.
        """
        super().__init__(max_sleep_time, file_name)

        # Specifics for Review Data Collection
        self.id_list = range(min_id, max_id)
        self.base_url = "https://www.goodreads.com/review/show/"
        self.data_point_type = "Reviews"

        # counter: stop once this many data points have been logged
        self.max_data_points = max_data_points

        # Review Parser
        self.parser = Review_Parser()

    def generate_current_url(self):
        """Pick a random review id and build the review-page URL for it."""
        self.current_id = random.choice(self.id_list)
        self.current_url = self.base_url + str(self.current_id)

    def add_headers_to_log_file(self):
        """Write the CSV header row (11 columns, ending in log_time)."""
        self.datafile.write(
            "ID,is_URL_valid,review_publication_date,book_title,book_id,rating,reviewer_href,started_reading_date,finished_reading_date,shelved_date,log_time"
        )

    def parse(self):
        """Extract all review fields from the current soup into attributes.

        Invalid review pages set every field to None so log_data still
        writes a complete, column-aligned row.
        """
        self.is_current_valid = self.parser.review_soup_is_valid(
            self.current_soup)

        if self.is_current_valid:
            self.current_date = self.parser.review_soup_to_date(
                self.current_soup)
            self.current_book_title = self.parser.review_soup_to_book_title(
                self.current_soup)
            self.current_book_id = self.parser.review_soup_to_book_id(
                self.current_soup)
            self.current_rating = self.parser.review_soup_to_rating(
                self.current_soup)
            self.current_reviewer_href = self.parser.review_soup_to_reviewer_href(
                self.current_soup)

            # Reading-progress dates are all derived from one progress dict.
            self.current_progress_dict = self.parser.review_soup_to_progress_dict(
                self.current_soup)
            self.current_start_date = self.parser.progress_dict_to_start_date(
                self.current_progress_dict)
            self.current_finished_date = self.parser.progress_dict_to_finish_date(
                self.current_progress_dict)
            self.current_shelved_date = self.parser.progress_dict_to_shelved_date(
                self.current_progress_dict)

        else:
            self.current_date = None
            self.current_book_title = None
            self.current_book_id = None
            self.current_rating = None
            self.current_reviewer_href = None
            self.current_start_date = None
            self.current_finished_date = None
            self.current_shelved_date = None

    def log_data(self):
        """Append one CSV row for the current review and bump the counter."""
        # Local import: no top-of-file import block is visible in this chunk.
        from datetime import datetime

        # BUG FIX: the header declares 11 columns (the last being log_time)
        # but the original row wrote only 10 values, misaligning every row.
        # Append the log timestamp as the 11th column.
        self.datafile.write("\n{},{},{},{},{},{},{},{},{},{},{}".format(
            str(self.current_id), self.is_current_valid, self.current_date,
            self.current_book_title, self.current_book_id, self.current_rating,
            self.current_reviewer_href, self.current_start_date,
            self.current_finished_date, self.current_shelved_date,
            datetime.now()))

        self.data_points_counter += 1