def create_analysis(database_id, dataset_db, analysis, meta_types_db):
    """Create the dataset entry and metadata entries.
    database_id -- the dict key specifying the database in django
    dataset_db -- the Dataset django database object
    analysis -- an AbstractAnalysis object
    meta_types_db -- what is returned by the get_all_metadata_types method in the
                     metadata.utilities module
    Return the Analysis django database object created.
    """
    with transaction.atomic(using=database_id):
        analysis_db, created = Analysis.objects.using(database_id).get_or_create(dataset=dataset_db, name=analysis.name)

        if created:
            metadata_types = analysis.metadata_types
            create_metadata_types(database_id, metadata_types, meta_types_db)
            create_metadata(
                database_id,
                [analysis_db],
                AnalysisMetadataValue,
                "analysis",
                metadata_types,
                meta_types_db,
                [analysis.metadata],
            )
    return analysis_db
Exemple #2
0
def create_dataset(database_id, dataset, dataset_dir, meta_types_db, **kwargs):
    """Create the dataset entry and metadata entries.
    database_id -- the dict key specifying the database in django
    dataset -- an AbstractDataset type
    dataset_dir -- the directory that the dataset can use to store documents
                   and analysis directories to store intermediate results
    meta_types_db -- what is returned by the get_all_metadata_types method in the
                     metadata.utilities module
    
    Keyword Arguments:
    public -- make this dataset public (anybody can explore it)
    public_documents -- make the document text publicly available
    
    Return the Dataset django database object after creation.
    """
    with transaction.atomic(using=database_id):
        dataset_db, created = Dataset.objects.using(database_id).\
                get_or_create(name=dataset.name, 
                              dataset_dir=dataset_dir)
        
        if created:
            dataset_db.public = kwargs.setdefault('public', False)
            dataset_db.public_documents = kwargs.setdefault('public_documents', False)
            dataset_db.visible = False
            dataset_db.save()
            metadata_types = dataset.metadata_types
            create_metadata_types(database_id, metadata_types, meta_types_db)
            create_metadata(database_id, [dataset_db], 
                            DatasetMetadataValue, 'dataset',
                            metadata_types, 
                            meta_types_db, 
                            [dataset.metadata])
    return dataset_db
Exemple #3
0
def create_analysis(database_id, dataset_db, analysis, meta_types_db):
    """Create the dataset entry and metadata entries.
    database_id -- the dict key specifying the database in django
    dataset_db -- the Dataset django database object
    analysis -- an AbstractAnalysis object
    meta_types_db -- what is returned by the get_all_metadata_types method in the
                     metadata.utilities module
    Return the Analysis django database object created.
    """
    with transaction.atomic(using=database_id):
        analysis_db, created = Analysis.objects.using(database_id).\
                get_or_create(dataset=dataset_db, name=analysis.name)

        if created:
            metadata_types = analysis.metadata_types
            create_metadata_types(database_id, metadata_types, meta_types_db)
            create_metadata(database_id, [analysis_db], AnalysisMetadataValue,
                            'analysis', metadata_types, meta_types_db,
                            [analysis.metadata])
    return analysis_db
Exemple #4
0
def create_documents(database_id, dataset_db, dataset, 
                     meta_types_db, verbose=False):
    """Create entries for documents and their associated metadata.
    database_id -- the dict key specifying the database in django
    dataset_db -- the Dataset django database object
    dataset -- the AbstractDataset object
    meta_types_db -- what is returned by the get_all_metadata_types method in the
                     metadata.utilities module
    verbose -- if True print out progress to the console; do nothing otherwise
    """
    document_dir = join(dataset_db.dataset_dir, RELATIVE_DOCUMENT_DIRECTORY)
    document_metadata_types = dataset.document_metadata_types
    create_metadata_types(database_id, document_metadata_types,
                          meta_types_db)
    # Helper function
    def bulk_create_documents(documents, metadata):
        if len(documents) == 0: return
        with transaction.atomic(using=database_id):
            low_high = (documents[0].index, documents[-1].index)
            # create document entries
            Document.objects.using(database_id).bulk_create(documents)
            names = []
            for doc in documents:
                names.append(doc.filename)
            # retrieve documents from database since bulk_create doesn't return
            # a primary key
            documents_db = \
                Document.objects.using(database_id).filter(dataset=dataset_db,
                index__range=low_high).order_by('index')
            # create metadata entries
            create_metadata(database_id, 
                            documents_db, 
                            DocumentMetadataValue, 'document',
                            document_metadata_types,
                            meta_types_db,
                            metadata)
        del documents[:]
        del metadata[:]
    
    documents_to_commit = []
    documents_metadata_to_commit = []
    already_created_documents = {d.filename: d for d in Document.objects.using(database_id).filter(dataset=dataset_db.id)}
    if verbose: timer = VerboseTimer(len(dataset))
    for doc_index, doc in enumerate(dataset):
        if verbose: timer.tick()
        filename = doc.name
        # Create document and get metadata
        if filename not in already_created_documents:
            full_path = os.path.join(document_dir, filename)
            metadata = doc.metadata
            content = doc.content
            with io.open(full_path, 'w', encoding='utf-8') as f:
                f.write(content)
            doc_db = Document(dataset=dataset_db, filename=filename, index=doc_index, length=len(content))
            documents_to_commit.append(doc_db)
            documents_metadata_to_commit.append(metadata)
        # Bulk create periodically to keep memory usage minimized
        if len(documents_to_commit) > MAX_DOCUMENTS_TO_COMMIT:
            bulk_create_documents(documents_to_commit, 
                documents_metadata_to_commit)
    bulk_create_documents(documents_to_commit, documents_metadata_to_commit)
    if verbose: print("Document count:", doc_index + 1)