Example #1
def run_analysis(database_id,
                 dataset_name,
                 analysis,
                 directories,
                 topic_namers=None,
                 verbose=False):
    """Give the analysis the text for the documents allowing the bulk of the
    work to be done by the analysis. Import the tokens, topic token 
    relationships, topics, etc., into the database.
    Positional Arguments:
    database_id -- the dict key specifying the database in django
    dataset_name -- the name that uniquely identifies which dataset this
                    analysis will be run on
    analysis -- an AbstractAnalysis object
    directories -- dict returned from get_common_working_directories
    
    Keyword Arguments:
    topic_namers -- a list of AbstractTopicNamers that take an Analysis Django database
                    object and create topic names according to a naming scheme
    verbose -- if True notifications of progress will be output to the console
    
    Return the unique analysis name for the given dataset.
    """
    if verbose: print('Running analysis:', analysis.name)
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # prefetch_related returns a new queryset; reassign so the prefetch
    # actually takes effect
    document_iterator = document_iterator.prefetch_related('dataset', 'metadata')
    analysis.run_analysis(document_iterator)

    dataset_db = Dataset.objects.using(database_id).get(name=dataset_name)
    # word types should be relatively sparse, so we load all of them into memory
    word_types_db = get_all_word_types(database_id)
    meta_types_db = get_all_metadata_types(database_id)

    if verbose: print('Creating analysis entry.')
    analysis_db = create_analysis(database_id, dataset_db, analysis,
                                  meta_types_db)

    if verbose: print('Creating word type entries.')
    create_word_type_entries(database_id, analysis.get_vocab_iterator(),
                             word_types_db)

    if not analysis_db.tokens.exists():
        if verbose: print('Creating token entries.')
        create_tokens(database_id,
                      analysis_db,
                      word_types_db,
                      analysis.get_token_iterator(),
                      verbose=verbose)

    if verbose: print('Adjusting topic hierarchy.')
    create_topic_heirarchy(database_id, analysis_db,
                           analysis.get_hierarchy_iterator())

    if not analysis_db.stopwords.exists():
        if verbose: print('Creating stopword entries.')
        create_stopwords(database_id, analysis_db, word_types_db,
                         analysis.stopwords)

    if not analysis_db.excluded_words.exists():
        if verbose: print('Creating excluded word entries.')
        create_excluded_words(database_id, analysis_db, word_types_db,
                              analysis.excluded_words)

    if verbose: print('Naming topics.')

    if DATABASE_OPTIMIZE_DEBUG:
        # Snapshot the query log so we can report how many queries the
        # topic namers issue.
        con = connections[database_id]
        query_count = len(con.queries)

    if topic_namers is None:
        topic_namers = DEFAULT_TOPIC_NAMERS
    create_topic_names(database_id, analysis_db, topic_namers, verbose=verbose)

    if DATABASE_OPTIMIZE_DEBUG:
        total_queries = len(con.queries) - query_count
        print("Namers used %d queries." % (total_queries, ))
        if total_queries > 10:
            for query in con.queries[query_count:]:
                print(query['time'])
                print(query['sql'])

    if verbose: print('Running metrics.')
    run_metrics(database_id, dataset_db.name, analysis_db.name,
                BASIC_ANALYSIS_METRICS)

    return analysis.name
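
A minimal usage sketch of run_analysis. Everything here is hypothetical: LdaAnalysis stands in for a concrete AbstractAnalysis subclass, the database alias and dataset name are placeholders, and get_common_working_directories (named in the docstring above) is assumed to take the dataset name; none of these specifics are confirmed by the listing.

# Hypothetical driver script (assumed names throughout).
directories = get_common_working_directories('state_of_the_union')
analysis = LdaAnalysis(directories)  # assumed constructor signature

analysis_name = run_analysis('default',             # Django database alias
                             'state_of_the_union',  # unique dataset name
                             analysis,
                             directories,
                             verbose=True)          # topic_namers=None falls
                                                    # back to DEFAULT_TOPIC_NAMERS
print('Created analysis:', analysis_name)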
Example #3
def check_analysis(database_id,
                   dataset_name,
                   analysis,
                   directories,
                   topic_namers=None,
                   verbose=False):
    """The purpose of this is to test an analysis for basic errors
    such as incorrect token indices and start indices, inappropriate topic
    heirarchies, tokens that are too long, whether or not a token is a
    stopword, and whether or not a token is part of the vocabulary.  Any errors
    are posted to the console.
    """
    def dict_to_string(d):
        # items()/str() instead of the Python 2-only iteritems()/unicode(),
        # so this runs under Python 3 as well
        result = ''
        for k, v in d.items():
            result += str(k) + ': ' + str(v) + '\n'
        return result

    print('Analysis Name:', analysis.name)
    print('Analysis Metadata:', dict_to_string(analysis.metadata))
    print('Analysis Metadata Types:', dict_to_string(analysis.metadata_types))

    print('Running Analysis...')
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # prefetch_related returns a new queryset; reassign so the prefetch
    # actually takes effect
    document_iterator = document_iterator.prefetch_related(
        'dataset', 'metadata', 'metadata__metadata_type')

    word_types_db = get_all_word_types(database_id)

    new_word_types = {}
    new_word_type_count = 0
    existing_word_type_count = 0
    for word_type in analysis.get_vocab_iterator():
        if word_type in word_types_db:
            existing_word_type_count += 1
        else:
            new_word_type_count += 1
            new_word_types[word_type.lower()] = True
    print('New Word Types:', new_word_type_count)
    print('Existing Word Types:', existing_word_type_count)

    topics = {}
    curr_doc = -1
    curr_text = ''
    for (document_index, start_index, token, token_abstraction,
         topic_number_list) in analysis.get_token_iterator():
        try:
            if document_index != curr_doc:
                curr_doc = document_index
                curr_text = document_iterator[curr_doc].get_content()
            assert 0 <= start_index < len(curr_text)
            assert token.lower() == (
                curr_text[start_index:start_index + len(token)].lower())
            assert token_abstraction is not None
            assert token in word_types_db or token in new_word_types
            assert (token_abstraction in word_types_db
                    or token_abstraction in new_word_types)
            assert len(token) < MAX_TOKEN_LENGTH, (
                'Max length of token strings is %d.' % (MAX_TOKEN_LENGTH,))
            assert len(token_abstraction) < MAX_TOKEN_LENGTH, (
                'Max length of token abstraction strings is %d.' %
                (MAX_TOKEN_LENGTH,))
            for topic_num in topic_number_list:
                if topic_num not in topics:
                    topics[topic_num] = True
                assert isinstance(topic_num, int)
                assert topic_num >= 0
        except AssertionError:
            print(document_index, start_index, token, token_abstraction,
                  topic_number_list,
                  curr_text[start_index:start_index + len(token)])
            print(curr_text)
            raise
    print('Basic token check passes.')
    print('Number of Topics:', len(topics))

    # Build a parent -> children map while verifying that every edge
    # references a known topic and no direct parent <-> child cycle exists.
    parent_children = {}
    for parent_num, child_num in analysis.get_hierarchy_iterator():
        assert parent_num in topics
        assert child_num in topics
        if parent_num not in parent_children:
            parent_children[parent_num] = {child_num: True}
        else:
            parent_children[parent_num][child_num] = True
        if child_num in parent_children:
            assert parent_num not in parent_children[child_num]
    print('Topic hierarchy checks out.')
    print('Topic hierarchy:', parent_children)

    print('Stopword Count:', len(analysis.stopwords))
    print('Excluded Count:', len(analysis.excluded_words))
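
For reference, a toy stand-in sketching the iterator contract check_analysis exercises: get_token_iterator() must yield (document_index, start_index, token, token_abstraction, topic_number_list) tuples aligned with the document text, and get_hierarchy_iterator() must yield (parent_num, child_num) pairs over known topic numbers. The class and all names below are illustrative, not from the source.

class ToyAnalysis(object):
    """Illustrative stand-in for an AbstractAnalysis subclass."""
    name = 'toy'
    metadata = {}
    metadata_types = {}
    stopwords = {}
    excluded_words = {}

    def run_analysis(self, document_iterator):
        # check_analysis indexes documents by position, so keep them in order.
        self.documents = [doc.get_content() for doc in document_iterator]

    def get_vocab_iterator(self):
        for text in self.documents:
            for token in text.split():
                yield token.lower()

    def get_token_iterator(self):
        # Yield (document_index, start_index, token, token_abstraction,
        # topic_number_list) tuples, the shape check_analysis asserts on.
        for doc_index, text in enumerate(self.documents):
            start = 0
            for token in text.split():
                start = text.index(token, start)
                yield doc_index, start, token.lower(), token.lower(), [0]
                start += len(token)

    def get_hierarchy_iterator(self):
        # Flat topic structure: no (parent_num, child_num) pairs.
        return iter([])

Lowercasing both the token and its abstraction keeps the vocabulary checks consistent, since check_analysis records new word types under their lowercased form.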