Example No. 1
import json


def main():
    # GoogleBooksRetrieval.py writes one output file per search term, and the
    # same record can appear in more than one of them. This function unifies
    # the data and makes sure no record is duplicated.
    # Before running, make sure all source files are in the "input" directory
    # and that a directory called "input_sanitized" exists.

    input_file_list = get_files_in_input_dir()

    item_ids, total_items, total_duplicates = set(), 0, 0
    complete_json = []
    for fl in input_file_list:
        if fl.startswith('.'):      # skip hidden files
            continue
        out_data = []
        with open("input/" + fl, mode='r', encoding='utf-8') as a_file:
            for line in a_file:
                temp_json = json.loads(line)
                for data in temp_json:
                    if data['id'] not in item_ids:
                        total_items += 1
                        item_ids.add(data['id'])
                        out_data.append(data)
                    else:
                        total_duplicates += 1
        if len(out_data) > 0:
            # To keep later processing simple, each record is written as JSON on its own line (JSON Lines).
            with open("input_sanitized/data", mode='a', encoding='utf-8') as a_file:
                for data in out_data:
                    a_file.write(json.dumps(data) + "\n")
                    complete_json.append(data)
    # data.json is also created; it holds all of the records in a single JSON file.
    with open("data.json", mode='w', encoding='utf-8') as a_fl:
        a_fl.write(json.dumps(complete_json))

    print("Total duplicates found were %d, and total items were %d" % (total_duplicates, total_items))
Example No. 2

import json
import time

import httplib2
from urllib.parse import urlencode

from category_list import category_list


def main():
    # Google blocks our IP after too many requests, so the data has to be
    # retrieved sequentially, resuming where the last run left off. This
    # function writes one file into the "input" directory per search term.
    # The search terms must be provided as a list in category_list.py.

    maxResults = 40     # 40 is the maximum page size the Google Books API allows per request.

    google_api = "https://www.googleapis.com/books/v1/volumes"
    get_params = {'langRestrict': 'en', 'maxResults': maxResults, 'printType': 'books', 'orderBy': 'newest'}

    completed_items_set = set()
    output_list = []

    retry_counter, retry_limit = 0, 100
    retry_time_seconds = int(input("What should the retry interval be in seconds?\n"))

    try:
        # completed_items_checklist holds the ids of books that have already
        # been downloaded, one id per line.
        with open("completed_items_checklist", mode='r', encoding='utf-8') as a_file:
            for line in a_file:
                # Strip the newline so the ids match the item['id'] lookups below.
                completed_items_set.add(line.rstrip("\n"))
    except FileNotFoundError:
        pass
    print("Total books retrieved so far %d" % (len(completed_items_set)))

    def write_to_completed_items_checklist(data):
        with open("completed_items_checklist", mode='a', encoding='utf-8') as a_file:
            a_file.write(data + "\n")

    completed_categories = get_files_in_input_dir()

    h = httplib2.Http(".cache")

    # Ordinal suffixes keyed by the last digit; 11-13 are special-cased below.
    num_char_dict = {0: 'th', 1: 'st', 2: 'nd', 3: 'rd', 4: 'th', 5: 'th',
                     6: 'th', 7: 'th', 8: 'th', 9: 'th'}

    def request_data(startIndex):
        nonlocal retry_counter
        get_params['startIndex'] = startIndex
        response, content = h.request(google_api + "?" + urlencode(get_params))
        if response.status != 200:
            # When Google blocks our IP, the response body contains the
            # string "userRateLimitExceededUnreg".
            if "userRateLimitExceededUnreg" in str(content) and retry_counter < retry_limit:
                retry_counter += 1
                print("Waiting for Google to lift the hold")
                time.sleep(retry_time_seconds)

                # 11th, 12th and 13th take "th" despite their last digit.
                suffix = 'th' if 10 <= retry_counter % 100 <= 13 else num_char_dict[retry_counter % 10]
                print("Retrying for the %d%s time" % (retry_counter, suffix))
                # Return the retry's result; without this the failed response
                # body would be parsed below.
                return request_data(startIndex)
            print("Response from the Google API was %s.\nContent is %s" % (response, content))
            return 1
        obj = json.loads(content.decode("utf-8"))
        if 'items' not in obj:
            # No more results for this query.
            return None
        return obj

    required_data = ['title', 'subtitle', 'authors', 'categories', 'description', 'averageRating', 'imageLinks',
                     'pageCount', 'publisher', 'infoLink', 'maturityRating', 'industryIdentifiers']

    for category in category_list:
        if category in completed_categories:
            continue

        print("Querying for ", category)

        get_params['q'] = category
        index_counter = 0
        data = request_data(index_counter)

        while data is not None:
            if data == 1:   # unrecoverable API error; stop entirely
                return
            for item in data['items']:
                temp_item_dict = {'id': item['id']}
                if temp_item_dict['id'] in completed_items_set:
                    continue
                for param in required_data:
                    try:
                        temp_item_dict[param] = item['volumeInfo'][param]
                    except KeyError:
                        pass

                # Flatten the identifier list into top-level keys such as
                # "ISBN_13"; this runs once per item, after all fields above
                # have been copied.
                if temp_item_dict.get('industryIdentifiers') is not None:
                    for identifier in temp_item_dict['industryIdentifiers']:
                        try:
                            temp_item_dict[identifier['type']] = identifier['identifier']
                        except KeyError:
                            pass
                    temp_item_dict.pop('industryIdentifiers')
                output_list.append(temp_item_dict)
                completed_items_set.add(temp_item_dict['id'])
                write_to_completed_items_checklist(temp_item_dict['id'])
            index_counter += len(data['items'])
            print("Number of retrieved items %d" % len(output_list), end='\r')
            data = request_data(index_counter)
        # output_list accumulates across categories, so the per-category files
        # overlap; the unifier in Example No. 1 deduplicates them afterwards.
        out_data = json.dumps(output_list)
        print("", end="\r")
        with open("input/" + category, mode='w', encoding='utf-8') as a_file:
            print("Writing data to input/" + category)
            a_file.write(out_data)
        completed_categories.append(category)
        print("Finished %.2f%%" % (100*len(completed_categories)/len(category_list)))