for fi in files:

        if os.path.getsize(root + fi) >= min_size:

            book = BookItem()
            SQL_INSERT_QUERY = ""

            book['author'] = fi.split('___')[0]
            book['title'] = fi.split('___')[1][:-4]

            with open(root + fi, 'r') as doc_file:
                book['content'] = doc_file.read()

            author = Author(book['author'])
            author_queried_id = connect_to_database.test_if_author_exists(author)

            """
                The test_if_author_exists function returns -1
                if the name of the author is not found on the database.

                Visit connect_to_database.py for more details.
            """
            if author_queried_id is -1:
                SQL_INSERT_QUERY += author.get_author_insert_query()

            """
                There is a checking in the Document class to
                see if author_queried_id is 0, which indicates
                the author was not found in the database
    for fi in files:

        if os.path.getsize(root + fi) >= min_size:

            book = BookItem()
            SQL_INSERT_QUERY = ""

            book['author'] = fi.split('___')[0]
            book['title'] = fi.split('___')[1][:-4]

            with open(root + fi, 'r') as doc_file:
                book['content'] = doc_file.read()

            author = Author(book['author'])
            author_queried_id = connect_to_database.test_if_author_exists(
                author)
            """
                The test_if_author_exists function returns -1
                if the name of the author is not found on the database.

                Visit connect_to_database.py for more details.
            """
            if author_queried_id is -1:
                SQL_INSERT_QUERY += author.get_author_insert_query()
            """
                There is a checking in the Document class to
                see if author_queried_id is 0, which indicates
                the author was not found in the database

                In this case, the script will first insert the
                info of that author into the database. Then,
Esempio n. 3
0
def process_book_item(book):
    """
        Store one crawled book item (meta-data and text) in the database.

        The function builds a single SQL string: an author insert (only
        when the author lookup reports "not found") followed by the
        document insert, and passes it to
        connect_to_database.execute_insert_query.

        The file at dir_path + book['host_path'][0] is extracted into
        path_to_store_txt when it is a zip archive, and the first archive
        member is read through read_file_get_content. Otherwise the file
        itself is read, because some downloads are plain txt files.

        book must provide the keys 'host_path', 'author', 'title',
        'lang', 'loc_class', 'rdate' and 'gutenberg_url'.

        Returns False (after printing a message) when the archive is
        broken/unsupported or the file cannot be read. On success there
        is no explicit return value.
    """

    SQL_INSERT_QUERY = ''
    zip_path = dir_path + book['host_path'][0]

    author = Author(book['author'])
    author_queried_id = connect_to_database.test_if_author_exists(author)

    # test_if_author_exists returns -1 if the name of the author is not
    # found in the database (visit connect_to_database.py for details).
    # Compare by value: "is" tests object identity, which only happens to
    # work for small integers as an interpreter implementation detail.
    if author_queried_id == -1:
        SQL_INSERT_QUERY += author.get_author_insert_query()

    # The file type has to be checked because Gutenberg sometimes
    # provides a plain txt file instead of a zip archive.
    try:

        if zipfile.is_zipfile(zip_path):

            with zipfile.ZipFile(zip_path, 'r') as z:
                z.extractall(path_to_store_txt)
                # NOTE(review): only the first archive member is read, and
                # an archive without members raises IndexError here --
                # confirm that this is the intended behaviour.
                content = read_file_get_content(z.namelist()[0])

        else:
            content = read_file_get_content(zip_path)

    except (NotImplementedError, zipfile.BadZipfile):
        # NotImplementedError: archive uses a feature zipfile does not
        # support; BadZipfile: archive is corrupt. Both mean the book
        # cannot be read, so they are reported the same way.
        print("Broken zip file")
        return False
    except IOError:
        print("File not found")
        return False

    # Per the original notes, the Document class checks whether the author
    # id it receives marks an author that was not found; in that case the
    # author insert queued above runs first and the document uses the
    # newly generated author_id. Otherwise the id returned by
    # test_if_author_exists is used as-is.
    # NOTE(review): the original notes name 0 as the "not found" marker,
    # while the lookup above is documented to return -1 -- confirm which
    # value Document actually tests for.
    SQL_INSERT_QUERY += Document(-1, author_queried_id, book['title'],
                                 book['lang'], book['loc_class'],
                                 book['rdate'], content,
                                 book['gutenberg_url']).get_doc_insert_query()

    connect_to_database.execute_insert_query(SQL_INSERT_QUERY)