コード例 #1
0
def read_paragraphs_and_split(doc):
    """Queue the chapter and per-paragraph insert statements for *doc* and run them.

    One Chapter (constructed with -1) contributes the first statement; every
    paragraph returned by ``doc.get_doc_paragraphs()`` then contributes one
    fact insert statement. The statements are concatenated in that order and
    handed to ``connect_to_database.execute_insert_query`` in a single call.
    """
    # Historical problem
    placeholder_chapter = Chapter(doc.get_doc_id(), -1)
    statements = [placeholder_chapter.get_chapter_insert_query()]

    for raw_paragraph in doc.get_doc_paragraphs():
        paragraph = Paragraph(doc.get_doc_id(), raw_paragraph)
        fact_query = feature_queries_preprocessing.get_fact_insert_query(
            doc.get_doc_id(), paragraph)
        statements.append(fact_query)

    # Same text as repeated `+=` would produce, built in one pass.
    connect_to_database.execute_insert_query(''.join(statements))
コード例 #2
0
def read_paragraphs_and_split(doc):
    """Build every insert statement for *doc* and execute them together.

    The statement of a Chapter created with -1 comes first, followed by one
    fact insert statement per paragraph of the document, in the order the
    paragraphs are returned by ``doc.get_doc_paragraphs()``.
    """
    query_parts = []

    # Historical problem
    chapter = Chapter(doc.get_doc_id(), -1)
    query_parts.append(chapter.get_chapter_insert_query())

    for paragraph_text in doc.get_doc_paragraphs():
        paragraph = Paragraph(doc.get_doc_id(), paragraph_text)
        query_parts.append(
            feature_queries_preprocessing.get_fact_insert_query(
                doc.get_doc_id(), paragraph))

    # A single database call receives the concatenation of all statements.
    combined_query = ''.join(query_parts)
    connect_to_database.execute_insert_query(combined_query)
コード例 #3
0
# previous = ''
# for word in word_list:
#
#     if previous == word:
#         continue
#     previous = word
#
#     for idx in range(0, len(word) - 1):
#         print word[idx], ' - ', word[idx + 1]
#
#
# print '---------------------------------------------------------------------------------'

# For every selected row, split its 'bigram' value on '-' into words and store
# each pair of adjacent characters of a word as one char_bigram_feature row.
previous = ''
for item in connect_to_database.execute_select_query(SQL_SELECT_QUERY):

    for word in item['bigram'].split('-'):
        # Skip a word identical to the one processed immediately before it
        # (the comparison deliberately carries over from one row to the next).
        if previous == word:
            continue
        previous = word

        # One INSERT per adjacent character pair. Each statement ends with
        # ';' so that several of them concatenated still form valid SQL;
        # without the terminator a word of three or more characters produced
        # "... VALUES (...)INSERT INTO ...".
        statements = []
        for left_char, right_char in zip(word, word[1:]):
            statements.append(
                'INSERT INTO char_bigram_feature(bigram_id, doc_id, para_id, char_bigram) '
                'VALUES ({}, {}, {}, {});'.format(
                    item['bigram_id'], item['doc_id'], item['para_id'],
                    QuotedString(left_char + ' - ' + right_char).getquoted()))

        # Words shorter than two characters yield no pairs; do not send an
        # empty query to the database for them.
        if statements:
            connect_to_database.execute_insert_query(''.join(statements))
コード例 #4
0
def process_book_item(book):
    """
        Store one book item coming from the Gutenberg crawler
        pipeline in the database.

        An author insert is queued first when the author is not
        yet in the database. The book file is then read (a zip
        archive is extracted beforehand) and the document insert
        is queued. All queued statements are executed in a single
        database call.

        :param book: mapping providing 'host_path' (a sequence whose
            first entry is appended to dir_path to locate the file),
            'author', 'title', 'lang', 'loc_class', 'rdate' and
            'gutenberg_url'.
        :return: False when the file is a broken/unsupported zip or
            cannot be read (nothing is written in that case);
            None otherwise.
    """

    SQL_INSERT_QUERY = ''
    zip_path = dir_path + book['host_path'][0]

    author = Author(book['author'])
    author_queried_id = connect_to_database.test_if_author_exists(author)

    # test_if_author_exists returns -1 if the name of the author is not
    # found in the database (see connect_to_database.py for details).
    # Compare by value: `is` tests object identity and only matches equal
    # integers as a side effect of interpreter-level caching.
    if author_queried_id == -1:
        SQL_INSERT_QUERY += author.get_author_insert_query()

    # The file type has to be checked because Gutenberg sometimes provides
    # a plain txt file instead of a zip archive.
    try:
        if zipfile.is_zipfile(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as z:
                z.extractall(path_to_store_txt)
                # Only the first archive member is read.
                content = read_file_get_content(z.namelist()[0])
        else:
            content = read_file_get_content(zip_path)

    # BadZipfile is what zipfile raises for a corrupt archive;
    # NotImplementedError covers unsupported compression methods.
    except (NotImplementedError, zipfile.BadZipfile):
        print("Broken zip file")
        return False
    except IOError:
        print("File not found")
        return False

    # The queried author id is passed through unchanged, including the
    # not-found marker. NOTE(review): the original notes say the Document
    # class detects the not-found case and uses the id of the author
    # inserted by the statement queued above -- confirm against Document.
    SQL_INSERT_QUERY += Document(-1, author_queried_id, book['title'],
                                 book['lang'], book['loc_class'],
                                 book['rdate'], content,
                                 book['gutenberg_url']).get_doc_insert_query()

    connect_to_database.execute_insert_query(SQL_INSERT_QUERY)