import io
from os.path import join

from django.db import transaction
from django.db.models import Max

# NOTE: the model classes (Document, DocumentMetadataValue, Topic, WordToken,
# WordTokenTopic) and the project helpers/constants used below (VerboseTimer,
# create_metadata, create_metadata_types, MAX_TOKEN_LENGTH,
# MAX_TOKENS_IN_MEMORY, MAX_DOCUMENTS_TO_COMMIT, RELATIVE_DOCUMENT_DIRECTORY)
# are assumed to be imported from this project's models and sibling modules.


def create_tokens(database_id, analysis_db, word_types_db, tokens,
                  verbose=False):
    """Add token sequences to the database.
    database_id -- the dict key specifying the database in django
    analysis_db -- the Analysis django database object
    word_types_db -- what is returned by get_all_word_types
    tokens -- an iterator over tokens and token-topic assignments; each item
              is of the form (document_index, start_index, token,
              token_abstraction, topic_number_list)
    verbose -- if True, notifications of progress will be output to the
               console
    """
    def commit_tokens(t):
        WordToken.objects.using(database_id).bulk_create(t)
        del t[:]

    def commit_topic_assignments(t):
        WordTokenTopic.objects.using(database_id).bulk_create(t)
        del t[:]

    with transaction.atomic(using=database_id):
        if verbose:
            num_docs = analysis_db.dataset.documents.count()
            timer = VerboseTimer(num_docs)

        # Ids are assigned explicitly so bulk_create can be used; resume
        # numbering after the highest existing id to avoid collisions.
        if WordToken.objects.using(database_id).exists():
            token_id = WordToken.objects.using(database_id).aggregate(
                Max('id'))['id__max'] + 1
        else:
            token_id = 0

        tokens_to_commit = []
        topic_tokens_to_commit = []
        topics_db = {t.number: t for t in analysis_db.topics.all()}
        documents_db = {d.index: d
                        for d in analysis_db.dataset.documents.all()}

        total_tokens = 0
        total_topic_tokens = 0
        token_index = 0
        prev_document_index = None
        for (document_index, start_index, token, token_abstraction,
                topic_number_list) in tokens:
            # Skip tokens too long for the database columns.
            if (len(token) > MAX_TOKEN_LENGTH or
                    len(token_abstraction) > MAX_TOKEN_LENGTH):
                continue
            if prev_document_index != document_index:
                prev_document_index = document_index
                token_index = 0
                if verbose:
                    timer.tick()

            word_token = WordToken(
                id=token_id,
                document_id=documents_db[document_index].id,
                analysis=analysis_db,
                word_type_id=word_types_db[token].id,
                word_type_abstraction=word_types_db[token_abstraction],
                token_index=token_index,
                start_index=start_index)
            tokens_to_commit.append(word_token)

            for topic_number in topic_number_list:
                try:
                    topic_db = topics_db[topic_number]
                except KeyError:
                    # Lazily create topics this analysis hasn't seen yet.
                    topic_db = Topic.objects.using(database_id).create(
                        analysis=analysis_db, number=topic_number)
                    topics_db[topic_number] = topic_db
                word_token_topic = WordTokenTopic(token=word_token,
                                                  topic=topic_db)
                topic_tokens_to_commit.append(word_token_topic)
                total_topic_tokens += 1

            # Flush periodically to bound memory usage.
            if len(tokens_to_commit) > MAX_TOKENS_IN_MEMORY:
                commit_tokens(tokens_to_commit)
                commit_topic_assignments(topic_tokens_to_commit)

            token_index += 1
            token_id += 1
            total_tokens += 1
        commit_tokens(tokens_to_commit)
        commit_topic_assignments(topic_tokens_to_commit)

    if verbose:
        print('Number of tokens created:', total_tokens)
        print('Number of topic token relationships created:',
              total_topic_tokens)
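# Illustrative sketch (not part of the pipeline): driving create_tokens by
# hand. The 'default' alias, the analysis lookup, and the exact signature of
# get_all_word_types are assumptions; the token-stream layout follows the
# docstring above.
#
#     analysis_db = Analysis.objects.using('default').get(id=1)
#     word_types_db = get_all_word_types('default')  # token -> WordType row
#     token_stream = iter([
#         # (document_index, start_index, token, token_abstraction, topics)
#         (0, 0, 'Topics', 'topic', [3]),
#         (0, 7, 'modeled', 'model', [3, 7]),
#     ])
#     create_tokens('default', analysis_db, word_types_db, token_stream,
#                   verbose=True)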
def create_documents(database_id, dataset_db, dataset, meta_types_db,
                     verbose=False):
    """Create entries for documents and their associated metadata.
    database_id -- the dict key specifying the database in django
    dataset_db -- the Dataset django database object
    dataset -- the AbstractDataset object
    meta_types_db -- what is returned by the get_all_metadata_types method
                     in the metadata.utilities module
    verbose -- if True, print progress to the console; do nothing otherwise
    """
    document_dir = join(dataset_db.dataset_dir, RELATIVE_DOCUMENT_DIRECTORY)
    document_metadata_types = dataset.document_metadata_types
    create_metadata_types(database_id, document_metadata_types, meta_types_db)

    # Helper function: flush the accumulated documents and their metadata to
    # the database in one transaction.
    def bulk_create_documents(documents, metadata):
        if len(documents) == 0:
            return
        with transaction.atomic(using=database_id):
            low_high = (documents[0].index, documents[-1].index)
            # Create document entries.
            Document.objects.using(database_id).bulk_create(documents)
            # Re-query the documents since bulk_create doesn't return
            # primary keys.
            documents_db = Document.objects.using(database_id).filter(
                dataset=dataset_db,
                index__range=low_high).order_by('index')
            # Create metadata entries.
            create_metadata(database_id, documents_db, DocumentMetadataValue,
                            'document', document_metadata_types,
                            meta_types_db, metadata)
        del documents[:]
        del metadata[:]

    documents_to_commit = []
    documents_metadata_to_commit = []
    already_created_documents = {
        d.filename: d
        for d in Document.objects.using(database_id).filter(
            dataset=dataset_db)
    }

    if verbose:
        timer = VerboseTimer(len(dataset))
    doc_index = -1  # keeps the count below correct for an empty dataset
    for doc_index, doc in enumerate(dataset):
        if verbose:
            timer.tick()
        filename = doc.name
        # Write the document to disk and queue its database entry.
        if filename not in already_created_documents:
            full_path = join(document_dir, filename)
            metadata = doc.metadata
            content = doc.content
            with io.open(full_path, 'w', encoding='utf-8') as f:
                f.write(content)
            doc_db = Document(dataset=dataset_db, filename=filename,
                              index=doc_index, length=len(content))
            documents_to_commit.append(doc_db)
            documents_metadata_to_commit.append(metadata)
        # Bulk create periodically to keep memory usage minimized.
        if len(documents_to_commit) > MAX_DOCUMENTS_TO_COMMIT:
            bulk_create_documents(documents_to_commit,
                                  documents_metadata_to_commit)
    bulk_create_documents(documents_to_commit, documents_metadata_to_commit)

    if verbose:
        print('Document count:', doc_index + 1)
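# Illustrative sketch (not part of the pipeline): a create_documents call,
# assuming dataset_db has already been created for this dataset and that
# get_all_metadata_types takes the database alias (its real signature lives
# in the metadata.utilities module). Each document in the AbstractDataset
# must expose .name, .metadata, and .content, as the loop above requires.
#
#     meta_types_db = get_all_metadata_types('default')
#     create_documents('default', dataset_db, dataset, meta_types_db,
#                      verbose=True)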