def run_analysis(database_id, dataset_name, analysis, directories, topic_namers=None, verbose=False):
    """Give the analysis the text for the documents allowing the bulk of the
    work to be done by the analysis. Import the tokens, topic token
    relationships, topics, etc., into the database.

    Positional Arguments:
    database_id -- the dict key specifying the database in django
    dataset_name -- the name that uniquely identifies which dataset this
                    analysis will be run on
    analysis -- an AbstractAnalysis object
    directories -- dict returned from get_common_working_directories

    Keyword Arguments:
    topic_namers -- a list of AbstractTopicNamers that take an Analysis
                    Django database object and create topic names according
                    to a naming scheme
    verbose -- if True notifications of progress will be output to the console

    Return the unique analysis name for the given dataset.
    """
    if verbose:
        print('Running analysis:', analysis.name)
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # BUG FIX: QuerySets are immutable -- prefetch_related returns a NEW
    # queryset. The original discarded the return value, so the prefetch
    # silently never happened.
    document_iterator = document_iterator.prefetch_related('dataset', 'metadata')
    analysis.run_analysis(document_iterator)

    dataset_db = Dataset.objects.using(database_id).get(name=dataset_name)
    # Word types should be relatively sparse, so we load all of them into memory.
    word_types_db = get_all_word_types(database_id)
    meta_types_db = get_all_metadata_types(database_id)

    if verbose:
        print('Creating analysis entry.')
    analysis_db = create_analysis(database_id, dataset_db, analysis, meta_types_db)

    if verbose:
        print('Creating word type entries.')
    create_word_type_entries(database_id, analysis.get_vocab_iterator(), word_types_db)

    # Token creation is skipped on re-runs where tokens already exist.
    if not analysis_db.tokens.exists():
        if verbose:
            print('Creating token entries.')
        create_tokens(database_id, analysis_db, word_types_db,
                      analysis.get_token_iterator(), verbose=verbose)

    if verbose:
        print('Adjusting topic heirarchy.')
    create_topic_heirarchy(database_id, analysis_db, analysis.get_hierarchy_iterator())

    if not analysis_db.stopwords.exists():
        if verbose:
            print('Creating stopword entries.')
        create_stopwords(database_id, analysis_db, word_types_db, analysis.stopwords)

    if not analysis_db.excluded_words.exists():
        if verbose:
            print('Creating excluded word entries.')
        create_excluded_words(database_id, analysis_db, word_types_db,
                              analysis.excluded_words)

    if verbose:
        print('Naming topics.')
    if DATABASE_OPTIMIZE_DEBUG:
        # Snapshot the query count so we can report how many queries the
        # namers issued.
        con = connections[database_id]
        query_count = len(con.queries)
    # FIX: compare to None with 'is', not '==' (PEP 8).
    if topic_namers is None:
        topic_namers = DEFAULT_TOPIC_NAMERS
    create_topic_names(database_id, analysis_db, topic_namers, verbose=verbose)
    if DATABASE_OPTIMIZE_DEBUG:
        total_queries = len(con.queries) - query_count
        print("Namers used %d queries." % (total_queries, ))
        if total_queries > 10:
            for query in con.queries[query_count:]:
                print(query['time'])
                print(query['sql'])

    if verbose:
        print('Running metrics.')
    run_metrics(database_id, dataset_db.name, analysis_db.name, BASIC_ANALYSIS_METRICS)

    return analysis.name
# NOTE(review): this is a duplicate definition of run_analysis -- an earlier
# copy exists in this file; this later definition is the one that takes effect.
def run_analysis(database_id, dataset_name, analysis, directories, topic_namers=None, verbose=False):
    """Give the analysis the text for the documents allowing the bulk of the
    work to be done by the analysis. Import the tokens, topic token
    relationships, topics, etc., into the database.

    Positional Arguments:
    database_id -- the dict key specifying the database in django
    dataset_name -- the name that uniquely identifies which dataset this
                    analysis will be run on
    analysis -- an AbstractAnalysis object
    directories -- dict returned from get_common_working_directories

    Keyword Arguments:
    topic_namers -- a list of AbstractTopicNamers that take an Analysis
                    Django database object and create topic names according
                    to a naming scheme
    verbose -- if True notifications of progress will be output to the console

    Return the unique analysis name for the given dataset.
    """
    if verbose:
        print('Running analysis:', analysis.name)
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # BUG FIX: QuerySets are immutable -- prefetch_related returns a NEW
    # queryset. The original discarded the return value, so the prefetch
    # silently never happened.
    document_iterator = document_iterator.prefetch_related('dataset', 'metadata')
    analysis.run_analysis(document_iterator)

    dataset_db = Dataset.objects.using(database_id).get(name=dataset_name)
    # Word types should be relatively sparse, so we load all of them into memory.
    word_types_db = get_all_word_types(database_id)
    meta_types_db = get_all_metadata_types(database_id)

    if verbose:
        print('Creating analysis entry.')
    analysis_db = create_analysis(database_id, dataset_db, analysis, meta_types_db)

    if verbose:
        print('Creating word type entries.')
    create_word_type_entries(database_id, analysis.get_vocab_iterator(), word_types_db)

    # Token creation is skipped on re-runs where tokens already exist.
    if not analysis_db.tokens.exists():
        if verbose:
            print('Creating token entries.')
        create_tokens(database_id, analysis_db, word_types_db,
                      analysis.get_token_iterator(), verbose=verbose)

    if verbose:
        print('Adjusting topic heirarchy.')
    create_topic_heirarchy(database_id, analysis_db, analysis.get_hierarchy_iterator())

    if not analysis_db.stopwords.exists():
        if verbose:
            print('Creating stopword entries.')
        create_stopwords(database_id, analysis_db, word_types_db, analysis.stopwords)

    if not analysis_db.excluded_words.exists():
        if verbose:
            print('Creating excluded word entries.')
        create_excluded_words(database_id, analysis_db, word_types_db,
                              analysis.excluded_words)

    if verbose:
        print('Naming topics.')
    if DATABASE_OPTIMIZE_DEBUG:
        # Snapshot the query count so we can report how many queries the
        # namers issued.
        con = connections[database_id]
        query_count = len(con.queries)
    # FIX: compare to None with 'is', not '==' (PEP 8).
    if topic_namers is None:
        topic_namers = DEFAULT_TOPIC_NAMERS
    create_topic_names(database_id, analysis_db, topic_namers, verbose=verbose)
    if DATABASE_OPTIMIZE_DEBUG:
        total_queries = len(con.queries) - query_count
        print("Namers used %d queries." % (total_queries,))
        if total_queries > 10:
            for query in con.queries[query_count:]:
                print(query['time'])
                print(query['sql'])

    if verbose:
        print('Running metrics.')
    run_metrics(database_id, dataset_db.name, analysis_db.name, BASIC_ANALYSIS_METRICS)

    return analysis.name
def check_analysis(database_id, dataset_name, analysis, directories, topic_namers=None, verbose=False):
    """Test an analysis for basic errors without importing anything.

    Checks for incorrect token indices and start indices, inappropriate
    topic heirarchies, tokens that are too long, whether or not a token is
    a stopword, and whether or not a token is part of the vocabulary.
    Any errors are posted to the console.

    Arguments mirror run_analysis; topic_namers and verbose are accepted for
    signature compatibility but unused here.
    """
    def dict_to_string(d):
        """Render a dict as one 'key: value' line per entry."""
        # BUG FIX: dict.iteritems() and unicode() are Python 2 only and raise
        # AttributeError/NameError under Python 3 (the multi-argument print()
        # calls below imply the print function is in use). items()/str()
        # produce the same text; join avoids quadratic concatenation.
        return ''.join(str(k) + ': ' + str(v) + '\n' for k, v in d.items())

    print('Analysis Name:', analysis.name)
    print('Analysis Metadata:', dict_to_string(analysis.metadata))
    print('Analysis Metadata Types:', dict_to_string(analysis.metadata_types))

    print('Running Analysis...')
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # BUG FIX: prefetch_related returns a NEW queryset; the original discarded
    # the return value, making the prefetch a no-op.
    document_iterator = document_iterator.prefetch_related(
        'dataset', 'metadata', 'metadata__metadata_type')
    analysis.run_analysis(document_iterator)

    # Tally which word types already exist in the database vs. are new.
    word_types_db = get_all_word_types(database_id)
    new_word_types = {}
    new_word_type_count = 0
    existing_word_type_count = 0
    for word_type in analysis.get_vocab_iterator():
        if word_type in word_types_db:
            existing_word_type_count += 1
        else:
            new_word_type_count += 1
            new_word_types[word_type.lower()] = True
    print('New Word Types:', new_word_type_count)
    print('Existing Word Types:', existing_word_type_count)

    topics = {}
    curr_doc = -1
    curr_text = ''
    for document_index, start_index, token, token_abstraction, topic_number_list in analysis.get_token_iterator():
        try:
            if document_index != curr_doc:
                # Only fetch document content when the document changes.
                curr_doc = document_index
                curr_text = document_iterator[curr_doc].get_content()
            assert 0 <= start_index < len(curr_text)
            # The token text must match the document text at start_index.
            assert token.lower() == curr_text[start_index:start_index + len(token)].lower()
            # FIX: 'is not None', not '!= None' (PEP 8).
            assert token_abstraction is not None
            assert token in word_types_db or token in new_word_types
            assert token_abstraction in word_types_db or token_abstraction in new_word_types
            assert len(token) < MAX_TOKEN_LENGTH, \
                'Max length of token strings is %d.' % (MAX_TOKEN_LENGTH)
            assert len(token_abstraction) < MAX_TOKEN_LENGTH, \
                'Max length of token abstraction strings is %d.' % (MAX_TOKEN_LENGTH)
            for topic_num in topic_number_list:
                if topic_num not in topics:
                    topics[topic_num] = True
                assert topic_num >= 0
                # Exact type check kept intentionally (rejects e.g. bool/float).
                assert type(topic_num) is int
        except AssertionError:
            # Dump enough context to locate the offending token, then re-raise.
            print(document_index, start_index, token, token_abstraction,
                  topic_number_list,
                  curr_text[start_index:start_index + len(token)])
            print(curr_text)
            raise
    print('Basic token check passes.')
    print('Number of Topics:', len(topics))

    # Verify every hierarchy edge references known topics and contains no
    # direct two-node cycle.
    parent_children = {}
    for parent_num, child_num in analysis.get_hierarchy_iterator():
        assert parent_num in topics
        assert child_num in topics
        if parent_num not in parent_children:
            parent_children[parent_num] = {child_num: True}
        else:
            parent_children[parent_num][child_num] = True
        if child_num in parent_children:
            assert parent_num not in parent_children[child_num]
    print('Topic hierarchy checks out.')
    print('Topic hierarchy:', parent_children)
    print('Stopword Count:', len(analysis.stopwords))
    print('Excluded Count:', len(analysis.excluded_words))
# NOTE(review): this is a duplicate definition of check_analysis -- an earlier
# copy exists in this file; this later definition is the one that takes effect.
def check_analysis(database_id, dataset_name, analysis, directories, topic_namers=None, verbose=False):
    """Test an analysis for basic errors without importing anything.

    Checks for incorrect token indices and start indices, inappropriate
    topic heirarchies, tokens that are too long, whether or not a token is
    a stopword, and whether or not a token is part of the vocabulary.
    Any errors are posted to the console.

    Arguments mirror run_analysis; topic_namers and verbose are accepted for
    signature compatibility but unused here.
    """
    def dict_to_string(d):
        """Render a dict as one 'key: value' line per entry."""
        # BUG FIX: dict.iteritems() and unicode() are Python 2 only and raise
        # AttributeError/NameError under Python 3 (the multi-argument print()
        # calls below imply the print function is in use). items()/str()
        # produce the same text; join avoids quadratic concatenation.
        return ''.join(str(k) + ': ' + str(v) + '\n' for k, v in d.items())

    print('Analysis Name:', analysis.name)
    print('Analysis Metadata:', dict_to_string(analysis.metadata))
    print('Analysis Metadata Types:', dict_to_string(analysis.metadata_types))

    print('Running Analysis...')
    document_iterator = Document.objects.using(database_id).filter(
        dataset__name=dataset_name).order_by('index')
    # BUG FIX: prefetch_related returns a NEW queryset; the original discarded
    # the return value, making the prefetch a no-op.
    document_iterator = document_iterator.prefetch_related(
        'dataset', 'metadata', 'metadata__metadata_type')
    analysis.run_analysis(document_iterator)

    # Tally which word types already exist in the database vs. are new.
    word_types_db = get_all_word_types(database_id)
    new_word_types = {}
    new_word_type_count = 0
    existing_word_type_count = 0
    for word_type in analysis.get_vocab_iterator():
        if word_type in word_types_db:
            existing_word_type_count += 1
        else:
            new_word_type_count += 1
            new_word_types[word_type.lower()] = True
    print('New Word Types:', new_word_type_count)
    print('Existing Word Types:', existing_word_type_count)

    topics = {}
    curr_doc = -1
    curr_text = ''
    for document_index, start_index, token, token_abstraction, topic_number_list in analysis.get_token_iterator():
        try:
            if document_index != curr_doc:
                # Only fetch document content when the document changes.
                curr_doc = document_index
                curr_text = document_iterator[curr_doc].get_content()
            assert 0 <= start_index < len(curr_text)
            # The token text must match the document text at start_index.
            assert token.lower() == curr_text[start_index:start_index + len(token)].lower()
            # FIX: 'is not None', not '!= None' (PEP 8).
            assert token_abstraction is not None
            assert token in word_types_db or token in new_word_types
            assert token_abstraction in word_types_db or token_abstraction in new_word_types
            assert len(token) < MAX_TOKEN_LENGTH, \
                'Max length of token strings is %d.' % (MAX_TOKEN_LENGTH)
            assert len(token_abstraction) < MAX_TOKEN_LENGTH, \
                'Max length of token abstraction strings is %d.' % (MAX_TOKEN_LENGTH)
            for topic_num in topic_number_list:
                if topic_num not in topics:
                    topics[topic_num] = True
                assert topic_num >= 0
                # Exact type check kept intentionally (rejects e.g. bool/float).
                assert type(topic_num) is int
        except AssertionError:
            # Dump enough context to locate the offending token, then re-raise.
            print(document_index, start_index, token, token_abstraction,
                  topic_number_list,
                  curr_text[start_index:start_index + len(token)])
            print(curr_text)
            raise
    print('Basic token check passes.')
    print('Number of Topics:', len(topics))

    # Verify every hierarchy edge references known topics and contains no
    # direct two-node cycle.
    parent_children = {}
    for parent_num, child_num in analysis.get_hierarchy_iterator():
        assert parent_num in topics
        assert child_num in topics
        if parent_num not in parent_children:
            parent_children[parent_num] = {child_num: True}
        else:
            parent_children[parent_num][child_num] = True
        if child_num in parent_children:
            assert parent_num not in parent_children[child_num]
    print('Topic hierarchy checks out.')
    print('Topic hierarchy:', parent_children)
    print('Stopword Count:', len(analysis.stopwords))
    print('Excluded Count:', len(analysis.excluded_words))