def read_paragraphs_and_split(doc):
    """Insert one chapter row and one fact row per paragraph of *doc*.

    The whole document is stored under a single placeholder chapter
    (chapter id -1) -- "Historical problem" per the original author.
    All generated INSERT statements are accumulated and submitted in a
    single execute_insert_query call.

    :param doc: project Document-like object exposing get_doc_id() and
        get_doc_paragraphs().
    """
    # Historical problem: no real chapter segmentation, everything is
    # attached to chapter -1.
    queries = [Chapter(doc.get_doc_id(), -1).get_chapter_insert_query()]
    for para in doc.get_doc_paragraphs():
        p = Paragraph(doc.get_doc_id(), para)
        queries.append(
            feature_queries_preprocessing.get_fact_insert_query(
                doc.get_doc_id(), p))
    # join() instead of repeated += avoids quadratic string building.
    # NOTE(review): assumes each *_insert_query string is self-terminated
    # (ends with ';') since they are concatenated directly -- confirm.
    connect_to_database.execute_insert_query(''.join(queries))
def read_paragraphs_and_split(doc):
    """Queue chapter and per-paragraph fact INSERTs for *doc* and run them.

    NOTE(review): this is a second, identical definition of
    read_paragraphs_and_split -- it shadows the one defined just above,
    so this later definition is the one in effect. Consider deleting one.
    """
    sql = ''  # Historical problem
    chapter = Chapter(doc.get_doc_id(), -1)
    sql += chapter.get_chapter_insert_query()
    for raw_paragraph in doc.get_doc_paragraphs():
        paragraph = Paragraph(doc.get_doc_id(), raw_paragraph)
        sql += feature_queries_preprocessing.get_fact_insert_query(
            doc.get_doc_id(), paragraph)
    connect_to_database.execute_insert_query(sql)
# Expand stored word bigrams into character-bigram feature rows.
# For every row returned by SQL_SELECT_QUERY: split the 'bigram' column
# on '-', skip a word identical to the immediately preceding one, and
# insert one char_bigram_feature row per adjacent character pair.
# (The commented-out prototype that printed the pairs was removed.)
previous = ''
for item in connect_to_database.execute_select_query(SQL_SELECT_QUERY):
    for word in item['bigram'].split('-'):
        if previous == word:
            # De-duplicate consecutive identical words.
            continue
        previous = word
        statements = []
        for idx in range(len(word) - 1):
            # QuotedString escapes the value, guarding against SQL
            # injection from the word content.
            statements.append(
                'INSERT INTO char_bigram_feature'
                '(bigram_id, doc_id, para_id, char_bigram) '
                'VALUES ({}, {}, {}, {})'.format(
                    item['bigram_id'], item['doc_id'], item['para_id'],
                    QuotedString(word[idx] + ' - ' + word[idx + 1])
                    .getquoted()))
        # BUG FIX: statements were previously concatenated with no
        # separator, yielding invalid SQL ("...)INSERT INTO...") for any
        # word longer than two characters; join them with ';'.
        SQL_INSERT_QUERY = ';'.join(statements)
        # NOTE(review): original indentation was lost; the execute call
        # is assumed to run once per word, matching the per-word reset
        # of SQL_INSERT_QUERY -- confirm against version history.
        connect_to_database.execute_insert_query(SQL_INSERT_QUERY)
def process_book_item(book):
    """Persist one book item from the Gutenberg crawler into the database.

    Stores the book's metadata, extracts its text content (from a zip
    archive, or directly when Gutenberg served a plain txt file), and
    queues the author INSERT when the author is not yet known.

    :param book: crawler item mapping with keys 'host_path', 'author',
        'title', 'lang', 'loc_class', 'rdate', 'gutenberg_url'.
    :returns: False when the file is broken or missing; otherwise None
        after the INSERT batch has been submitted.
    """
    SQL_INSERT_QUERY = ''
    zip_path = dir_path + book['host_path'][0]
    author = Author(book['author'])
    # test_if_author_exists returns -1 when the author name is not found
    # in the database (see connect_to_database.py).
    author_queried_id = connect_to_database.test_if_author_exists(author)
    # Empty content variable for the storage of content.
    content = ''
    # BUG FIX: the original used "is -1", which compares object identity
    # rather than value and is unreliable for ints; use equality.
    if author_queried_id == -1:
        SQL_INSERT_QUERY += author.get_author_insert_query()
    # We do need to check the file type because somehow Gutenberg
    # provides txt files as well as zip archives.
    try:
        if zipfile.is_zipfile(zip_path):
            with zipfile.ZipFile(zip_path, 'r') as z:
                z.extractall(path_to_store_txt)
                # Document handles a not-found author id internally: it
                # relies on the author INSERT queued above so the newly
                # generated author_id can be used; otherwise it uses the
                # id returned by test_if_author_exists.
                content = read_file_get_content(z.namelist()[0])
        else:
            content = read_file_get_content(zip_path)
    except NotImplementedError:
        # Raised by zipfile for unsupported compression methods.
        print("Broken zip file")
        return False
    except IOError:
        print("File not found")
        return False
    SQL_INSERT_QUERY += Document(-1, author_queried_id, book['title'],
                                 book['lang'], book['loc_class'],
                                 book['rdate'], content,
                                 book['gutenberg_url']).get_doc_insert_query()
    connect_to_database.execute_insert_query(SQL_INSERT_QUERY)