from data_analysis import data_warehouse
from data_etl import plaintext_data_etl
from data_etl.db_schema_classes.document import Document

"""
A list of rows is returned by the warehouse query functions.
Use the database column name 'doc_content' to reference the content.
Visit connect_to_database.py for more details.
"""

# doc_ids that already have entries in the fact table.
docs_in_fact = [row['doc_id'] for row in data_warehouse.get_doc_ids_from_database_fact()]

docs = data_warehouse.get_docs_from_database_document_by_doc_id(doc_id=74)

for doc in docs:
    print("Dumping novel with doc_id", doc['doc_id'])
    document = Document(doc['doc_id'], doc['author_id'], doc['doc_title'],
                        'lang', 'loc', '1882-02-25', doc['doc_content'], 'url')
    print(len(document.get_doc_paragraphs()))

# for author_id in range(8940, 8950):
#     # Using this method is more memory-friendly, as the documents are
#     # retrieved sequentially.
#     print("do author id", author_id)
#     # docs = data_warehouse.get_docs_from_database_document_by_author_id(author_id)
#     docs = data_warehouse.get_docs_from_database_document_by_doc_id(doc_id=74)
#
#     for doc in docs:
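# For context, a minimal sketch of what the Document schema class might look
# like. This is an assumption for illustration only: the real class lives in
# data_etl/db_schema_classes/document.py, and the paragraph rule used here
# (splitting on blank lines) is hypothetical.

class DocumentSketch(object):
    """Hypothetical stand-in for data_etl.db_schema_classes.document.Document."""

    def __init__(self, doc_id, author_id, doc_title, lang, loc,
                 pub_date, doc_content, url):
        # Mirror the positional arguments used in the script above.
        self.doc_id = doc_id
        self.author_id = author_id
        self.doc_title = doc_title
        self.lang = lang
        self.loc = loc
        self.pub_date = pub_date
        self.doc_content = doc_content
        self.url = url

    def get_doc_paragraphs(self):
        # Assumed rule: paragraphs are separated by blank lines.
        return [p for p in self.doc_content.split('\n\n') if p.strip()]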
from csv_exportation import data_to_csv
from data_analysis import data_warehouse

header_row = (['author id', 'document id', 'paragraph id'] +
              ['feature ' + str(i) for i in range(1, 57)])

documents = [item['doc_id'] for item in data_warehouse.get_doc_ids_from_database_fact()]

data_list = []
for doc_id in documents:
    data_list.extend(data_warehouse.get_cross_tab_features_from_database_by_doc_id(doc_id))

data_to_csv.write_csvfile_output('stylo_features.csv', header_row, data_list)
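# A minimal sketch of what data_to_csv.write_csvfile_output might do, assuming
# it writes a header row followed by the data rows with the standard csv
# module. The real implementation lives in csv_exportation/data_to_csv.py;
# this stand-in is for illustration only.

import csv

def write_csvfile_output_sketch(filename, header_row, data_list):
    # newline='' prevents the csv module from inserting blank lines on Windows.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header_row)   # one header row
        writer.writerows(data_list)   # then one row per (paragraph, features) record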
from data_analysis import data_warehouse
from data_etl import plaintext_data_etl
from data_etl.db_schema_classes.document import Document

"""
A list of rows is returned by the warehouse query functions.
Use the database column name 'doc_content' to reference the content.
Visit connect_to_database.py for more details.
"""

# doc_ids that already have entries in the fact table.
docs_in_fact = [row['doc_id'] for row in data_warehouse.get_doc_ids_from_database_fact()]

for author_id in range(8940, 8950):
    # Fetching by author is more memory-friendly, as the documents are
    # retrieved sequentially.
    print("do author id", author_id)
    docs = data_warehouse.get_docs_from_database_document_by_author_id(author_id)

    for doc in docs:
        if doc['doc_id'] in docs_in_fact:
            # Skip documents already processed. Note: do not remove items from
            # `docs` while iterating over it; that silently skips the next
            # element. Continuing is sufficient.
            print(doc['doc_id'], "has already been done")
            continue
        print("Dumping novel with doc_id", doc['doc_id'])
        plaintext_data_etl.read_paragraphs_and_split(
            Document(doc['doc_id'], doc['author_id'], doc['doc_title'],
                     'lang', 'loc', '1882-02-25', doc['doc_content'], 'url'))
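# The comment above claims per-author fetching is more memory-friendly. A
# sketch of how a query function such as
# get_docs_from_database_document_by_author_id could stream rows one at a time
# instead of materialising the whole result set. The connection object, table
# name, and column list here are assumptions for illustration, not the actual
# data_analysis.data_warehouse implementation.

def iter_docs_by_author(connection, author_id):
    """Yield document rows one at a time instead of building a full list."""
    cursor = connection.cursor()
    cursor.execute(
        "SELECT doc_id, author_id, doc_title, doc_content "
        "FROM document WHERE author_id = %s", (author_id,))
    row = cursor.fetchone()
    while row is not None:
        yield row  # the caller processes one document before the next is fetched
        row = cursor.fetchone()
    cursor.close()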