Example 1
from data_etl import plaintext_data_etl
from data_etl.db_schema_classes.document import Document
# Assumption: the data_warehouse helper is exposed by the data_etl package;
# see connect_to_database.py for the actual connection details.
from data_etl import data_warehouse


def formulate_set_paragraphs(author_id, set_size):
    # Fetch every document row for the author, then the cross-tab feature
    # rows for each document, and yield them in batches of set_size.
    document_list = data_warehouse.get_docs_from_database_document_by_author_id(author_id)
    paragraph_list = [data_warehouse.get_cross_tab_features_from_database_by_doc_id(doc['doc_id'])
                      for doc in document_list]

    for idx in range(0, len(paragraph_list), set_size):
        yield paragraph_list[idx:idx + set_size]
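
The chunking above is the standard slice-by-stride pattern. As a
self-contained illustration of that pattern (no database required; the
helper name _chunks is made up for this demo):

def _chunks(items, set_size):
    # Step through the list set_size elements at a time; the final chunk
    # may be shorter than set_size.
    for idx in range(0, len(items), set_size):
        yield items[idx:idx + set_size]

assert list(_chunks(list(range(7)), 3)) == [[0, 1, 2], [3, 4, 5], [6]]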

"""
    A list of lists is returned and stored in the variable 'results'.
    Use the database column-name 'doc_content' to reference the content.

    Visit connect_to_database.py for more details.
"""
docs_in_fact = {row['doc_id'] for row in data_warehouse.get_doc_ids_from_database_fact()}

for author_id in range(8940, 8950):
    """
        Using this method is more memory-friendly as the documents is
        retrieved sequentially
    """
    print "do author id", author_id 
    docs = data_warehouse.get_docs_from_database_document_by_author_id(author_id)

    for doc in docs:
        # Skip documents that are already in the fact table; removing items
        # from a list while iterating over it would skip elements.
        if doc['doc_id'] in docs_in_fact:
            print(doc['doc_id'], "has already been done")
            continue

        print "Dumping novel with doc_id ", str(doc['doc_id'])
        plaintext_data_etl.read_paragraphs_and_split(Document(doc['doc_id'], doc['author_id'],
                                                              doc['doc_title'], 'lang', 'loc',
                                                              '1882-02-25', doc['doc_content'],
                                                              'url'))
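
Once the documents have been dumped, the generator defined at the top could
feed downstream processing in fixed-size batches. A minimal sketch, assuming
a batch size of 10 (the size and the print consumer are illustrative only):

for author_id in range(8940, 8950):
    for batch in formulate_set_paragraphs(author_id, 10):
        # Each batch is a list of at most 10 cross-tab feature rows.
        print("author", author_id, "-> batch of", len(batch), "rows")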