Code example #1
def import_dataset(database_id, dataset, directories, **kwargs):
    """Transfer documents and import content into database.
    Positional arguments:
    database_id -- the dict key specifying the database in django
    dataset -- an AbstractDataset
    directories -- dict returned from get_common_working_directories
    
    Keyword arguments:
    public -- make the dataset public (default False)
    public_documents -- make the document text available (default False)
    verbose -- print output about progress (default False)
    
    Return the dataset's name/identifier.
    """
    verbose = kwargs.setdefault('verbose', False)

    if verbose: print('Importing dataset: ' + dataset.name)

    dataset_dir = directories['dataset']

    if DATABASE_OPTIMIZE_DEBUG:
        con = connections[database_id]
        query_count = len(con.queries)

    meta_types_db = get_all_metadata_types(database_id)

    if verbose: print('Creating dataset entry.')
    dataset_db = create_dataset(database_id, dataset, dataset_dir,
                                meta_types_db, **kwargs)

    if DATABASE_OPTIMIZE_DEBUG:
        print('Dataset and metadata query count:',
              len(con.queries) - query_count)
        query_count = len(con.queries)

    if verbose: print('Copying documents and creating document entries.')
    if not dataset_db.documents.exists():
        create_documents(database_id, dataset_db, dataset, meta_types_db,
                         verbose)

    if DATABASE_OPTIMIZE_DEBUG:
        print('Documents and metadata query count:',
              len(con.queries) - query_count)

    dataset_db.visible = True
    dataset_db.save()

    if verbose: print('Running dataset metrics.')
    run_metrics(database_id, dataset_db.name, None, BASIC_DATASET_METRICS)

    if verbose: print('Done importing ' + dataset.name + '.')

    return dataset.name
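
A hypothetical call site for import_dataset might look like the sketch below. Every name here other than import_dataset itself is an illustrative assumption: 'default' stands in for a key in Django's DATABASES setting, MyDataset for some AbstractDataset subclass, and get_common_working_directories is the helper cited in the docstring.

# Hypothetical usage sketch; all names except import_dataset are assumptions.
directories = get_common_working_directories('my_dataset')
dataset = MyDataset()            # some AbstractDataset subclass (assumed)

name = import_dataset(
    'default',                   # database_id: a key in Django's DATABASES dict
    dataset,
    directories,
    public=True,                 # keyword options documented in the docstring
    public_documents=False,
    verbose=True,
)
print('Imported dataset:', name)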
Code example #2
def run_analysis(database_id,
                 dataset_name,
                 analysis,
                 directories,
                 topic_namers=None,
                 verbose=False):
    """Give the analysis the text for the documents allowing the bulk of the
    work to be done by the analysis. Import the tokens, topic token 
    relationships, topics, etc., into the database.
    Positional Arguments:
    database_id -- the dict key specifying the database in Django
    dataset_name -- the name that uniquely identifies which dataset this
                    analysis will be run on
    analysis -- an AbstractAnalysis object
    directories -- dict returned from get_common_working_directories
    
    Keyword Arguments:
    topic_namers -- a list of AbstractTopicNamers that take an Analysis Django database
                    object and create topic names according to a naming scheme
    verbose -- if True, progress notifications are printed to the console
    
    Return the unique analysis name for the given dataset.
    """
    if verbose: print('Running analysis:', analysis.name)
    # prefetch_related returns a new queryset rather than mutating in place,
    # so chain it here instead of discarding its result on a separate line.
    document_iterator = (Document.objects.using(database_id)
                         .filter(dataset__name=dataset_name)
                         .order_by('index')
                         .prefetch_related('dataset', 'metadata'))
    analysis.run_analysis(document_iterator)

    dataset_db = Dataset.objects.using(database_id).get(name=dataset_name)
    # word types should be relatively sparse, so we load all of them into memory
    word_types_db = get_all_word_types(database_id)
    meta_types_db = get_all_metadata_types(database_id)

    if verbose: print('Creating analysis entry.')
    analysis_db = create_analysis(database_id, dataset_db, analysis,
                                  meta_types_db)

    if verbose: print('Creating word type entries.')
    create_word_type_entries(database_id, analysis.get_vocab_iterator(),
                             word_types_db)

    if not analysis_db.tokens.exists():
        if verbose: print('Creating token entries.')
        create_tokens(database_id,
                      analysis_db,
                      word_types_db,
                      analysis.get_token_iterator(),
                      verbose=verbose)

    if verbose: print('Adjusting topic hierarchy.')
    create_topic_heirarchy(database_id, analysis_db,
                           analysis.get_hierarchy_iterator())

    if not analysis_db.stopwords.exists():
        if verbose: print('Creating stopword entries.')
        create_stopwords(database_id, analysis_db, word_types_db,
                         analysis.stopwords)

    if not analysis_db.excluded_words.exists():
        if verbose: print('Creating excluded word entries.')
        create_excluded_words(database_id, analysis_db, word_types_db,
                              analysis.excluded_words)

    if verbose: print('Naming topics.')

    if DATABASE_OPTIMIZE_DEBUG:
        con = connections[database_id]
        query_count = len(con.queries)

    if topic_namers is None:
        topic_namers = DEFAULT_TOPIC_NAMERS
    create_topic_names(database_id, analysis_db, topic_namers, verbose=verbose)

    if DATABASE_OPTIMIZE_DEBUG:
        total_queries = len(con.queries) - query_count
        print("Namers used %d queries." % (total_queries, ))
        if total_queries > 10:
            for query in con.queries[query_count:]:
                print(query['time'])
                print(query['sql'])

    if verbose: print('Running metrics.')
    run_metrics(database_id, dataset_db.name, analysis_db.name,
                BASIC_ANALYSIS_METRICS)

    return analysis.name
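
Both functions above bracket a block of work with len(con.queries) snapshots when DATABASE_OPTIMIZE_DEBUG is set. A minimal, self-contained sketch of that pattern as a reusable helper follows; the helper name is an assumption, and note that Django only records connection.queries while settings.DEBUG is True.

from django.db import connections

def report_query_count(database_id, label, func, *args, **kwargs):
    """Run func and print how many SQL queries it issued on database_id.

    Hypothetical helper; only meaningful with settings.DEBUG = True,
    since Django records connection.queries exclusively in debug mode.
    """
    con = connections[database_id]
    before = len(con.queries)        # snapshot the query log length
    result = func(*args, **kwargs)
    print(label, 'query count:', len(con.queries) - before)
    return result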