Code example #1
File: utilities.py Project: robbymeals/topicalguide
from django.db import transaction
from django.db.models import Max

# WordToken, WordTokenTopic, Topic, VerboseTimer, MAX_TOKEN_LENGTH, and
# MAX_TOKENS_IN_MEMORY are defined elsewhere in the topicalguide project.

def create_tokens(database_id,
                  analysis_db,
                  word_types_db,
                  tokens,
                  verbose=False):
    """Add token sequences to the database.
    database_id -- the dict key specifying the database in django
    analysis_db -- the Analysis django database object
    word_types_db -- what is returned by get_all_word_types
    tokens -- An iterator over tokens and token-topic assignments; each item is of the form
              (document_index, start_index, token, token_abstraction, topic_number_list)
    verbose -- if True, progress notifications are printed to the console
    """
    def commit_tokens(t):
        # Write the accumulated WordToken rows in one query, then clear the
        # list in place so the caller's reference stays valid.
        WordToken.objects.using(database_id).bulk_create(t)
        del t[:]

    def commit_topic_assignments(t):
        # Same for the token-topic relationship rows.
        WordTokenTopic.objects.using(database_id).bulk_create(t)
        del t[:]

    with transaction.atomic(using=database_id):
        if verbose:
            num_docs = analysis_db.dataset.documents.count()
            timer = VerboseTimer(num_docs)

        # Ids are assigned manually so bulk_create can be used; continue
        # numbering after the highest existing id.
        if WordToken.objects.using(database_id).all().exists():
            token_id = WordToken.objects.using(database_id).all().aggregate(
                Max('id'))['id__max'] + 1
        else:
            token_id = 0

        tokens_to_commit = []
        topic_tokens_to_commit = []
        topics_db = {t.number: t for t in analysis_db.topics.all()}
        documents_db = {
            d.index: d
            for d in analysis_db.dataset.documents.all()
        }

        total_tokens = 0
        total_topic_tokens = 0
        token_index = 0
        prev_document_index = None
        for document_index, start_index, token, token_abstraction, topic_number_list in tokens:
            # Skip tokens that would overflow the database columns.
            if len(token) > MAX_TOKEN_LENGTH or len(
                    token_abstraction) > MAX_TOKEN_LENGTH:
                continue

            # Token indices restart at zero whenever a new document begins.
            if prev_document_index != document_index:
                prev_document_index = document_index
                token_index = 0
                if verbose:
                    timer.tick()

            word_token = WordToken(
                id=token_id,
                document_id=documents_db[document_index].id,
                analysis=analysis_db,
                word_type_id=word_types_db[token].id,
                word_type_abstraction=word_types_db[token_abstraction],
                token_index=token_index,
                start_index=start_index)
            tokens_to_commit.append(word_token)

            for topic_number in topic_number_list:
                try:
                    topic_db = topics_db[topic_number]
                except KeyError:
                    # First time this topic number appears; create and cache it.
                    topic_db = Topic.objects.using(database_id).create(
                        analysis=analysis_db, number=topic_number)
                    topics_db[topic_number] = topic_db
                word_token_topic = WordTokenTopic(token=word_token,
                                                  topic=topic_db)
                topic_tokens_to_commit.append(word_token_topic)
                total_topic_tokens += 1

            # Flush periodically so memory usage stays bounded.
            if len(tokens_to_commit) > MAX_TOKENS_IN_MEMORY:
                commit_tokens(tokens_to_commit)
                commit_topic_assignments(topic_tokens_to_commit)
            token_index += 1
            token_id += 1
            total_tokens += 1

        # Final flush for whatever remains after the loop.
        commit_tokens(tokens_to_commit)
        commit_topic_assignments(topic_tokens_to_commit)

        if verbose:
            print('Number of tokens created:', total_tokens)
            print('Number of topic token relationships created:',
                  total_topic_tokens)
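The pattern worth extracting from this example is accumulate-and-flush: the
function streams an arbitrarily large token iterator while holding at most
MAX_TOKENS_IN_MEMORY unsaved rows, flushing with bulk_create whenever the cap
is exceeded and once more after the loop. Below is a minimal, framework-free
sketch of that pattern; stream_in_batches, save_batch, and batch_cap are
illustrative names, not part of the topicalguide project.

def stream_in_batches(items, save_batch, batch_cap=1000):
    """Feed items to save_batch in groups of at most batch_cap."""
    pending = []
    for item in items:
        pending.append(item)
        if len(pending) >= batch_cap:
            save_batch(pending)  # one bulk write instead of many small ones
            del pending[:]       # clear in place, as the helpers above do
    if pending:
        save_batch(pending)      # final flush for the partial last batch

# Usage: 2500 items arrive as batches of 1000, 1000, and 500.
sizes = []
stream_in_batches(range(2500), lambda batch: sizes.append(len(batch)))
print(sizes)  # [1000, 1000, 500]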
Code example #2
File: utilities.py Project: robbymeals/topicalguide
import io
import os
from os.path import join

from django.db import transaction

# Document, DocumentMetadataValue, VerboseTimer, RELATIVE_DOCUMENT_DIRECTORY,
# MAX_DOCUMENTS_TO_COMMIT, create_metadata, and create_metadata_types are
# defined elsewhere in the topicalguide project.

def create_documents(database_id, dataset_db, dataset,
                     meta_types_db, verbose=False):
    """Create entries for documents and their associated metadata.
    database_id -- the dict key specifying the database in django
    dataset_db -- the Dataset django database object
    dataset -- the AbstractDataset object
    meta_types_db -- what is returned by the get_all_metadata_types method in the
                     metadata.utilities module
    verbose -- if True, print progress to the console; otherwise do nothing
    """
    document_dir = join(dataset_db.dataset_dir, RELATIVE_DOCUMENT_DIRECTORY)
    document_metadata_types = dataset.document_metadata_types
    create_metadata_types(database_id, document_metadata_types,
                          meta_types_db)
    # Helper function: flush one batch of documents and their metadata.
    def bulk_create_documents(documents, metadata):
        if not documents:
            return
        with transaction.atomic(using=database_id):
            low_high = (documents[0].index, documents[-1].index)
            # create document entries
            Document.objects.using(database_id).bulk_create(documents)
            names = []
            for doc in documents:
                names.append(doc.filename)
            # retrieve the documents from the database, since bulk_create
            # doesn't return primary keys
            documents_db = Document.objects.using(database_id).filter(
                dataset=dataset_db,
                index__range=low_high).order_by('index')
            # create metadata entries
            create_metadata(database_id, 
                            documents_db, 
                            DocumentMetadataValue, 'document',
                            document_metadata_types,
                            meta_types_db,
                            metadata)
        del documents[:]
        del metadata[:]
    
    documents_to_commit = []
    documents_metadata_to_commit = []
    already_created_documents = {
        d.filename: d
        for d in Document.objects.using(database_id).filter(dataset=dataset_db)
    }
    if verbose:
        timer = VerboseTimer(len(dataset))
    for doc_index, doc in enumerate(dataset):
        if verbose:
            timer.tick()
        filename = doc.name
        # Create document and get metadata
        if filename not in already_created_documents:
            full_path = os.path.join(document_dir, filename)
            metadata = doc.metadata
            content = doc.content
            with io.open(full_path, 'w', encoding='utf-8') as f:
                f.write(content)
            doc_db = Document(dataset=dataset_db, filename=filename, index=doc_index, length=len(content))
            documents_to_commit.append(doc_db)
            documents_metadata_to_commit.append(metadata)
        # Bulk create periodically to keep memory usage minimized
        if len(documents_to_commit) > MAX_DOCUMENTS_TO_COMMIT:
            bulk_create_documents(documents_to_commit,
                                  documents_metadata_to_commit)
    bulk_create_documents(documents_to_commit, documents_metadata_to_commit)
    if verbose:
        print("Document count:", len(dataset))
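A detail that makes this example rerunnable: instead of querying the database
once per document, create_documents loads every already-imported filename for
the dataset into a dict up front and skips matches, so an interrupted import
can simply be restarted. Here is a minimal sketch of that idempotent-import
idea, with illustrative names (import_new, known_names) that are not part of
the project.

def import_new(records, known_names):
    """Keep only records whose name has not been imported yet."""
    created = []
    for name, content in records:
        if name in known_names:          # O(1) membership test, no per-record query
            continue
        created.append((name, content))  # stand-in for writing the row
        known_names.add(name)            # keep the cache current mid-run
    return created

# Usage: rerunning with the same cache creates nothing the second time.
known = {'a.txt'}
docs = [('a.txt', 'x'), ('b.txt', 'y')]
print(len(import_new(docs, known)))  # 1
print(len(import_new(docs, known)))  # 0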