Example #1
def merge(table1: biom.Table, table2: biom.Table) -> biom.Table:
    # biom's merge sums values for overlapping sample ids, so require the
    # sample sets to be disjoint and fail loudly otherwise.
    table1_sids = set(table1.ids(axis='sample'))
    table2_sids = set(table2.ids(axis='sample'))
    if table1_sids & table2_sids:
        raise ValueError('Some samples are present in both tables: %s' %
                         ', '.join(table1_sids & table2_sids))
    return table1.merge(table2)
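A minimal usage sketch for this helper, with hypothetical toy tables (the numpy and biom imports are assumptions, not part of the original example):

import numpy as np
import biom

t1 = biom.Table(np.array([[1, 2], [3, 4]]), ['O1', 'O2'], ['S1', 'S2'])
t2 = biom.Table(np.array([[5, 0], [0, 6]]), ['O1', 'O3'], ['S3', 'S4'])

merged = merge(t1, t2)
print(merged.shape)  # (3, 4): union of observations, all four samples
# merge(t1, t1) would raise ValueError: the sample sets must be disjoint.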
Example #2
def pad_features_in_test_data(train_table: biom.Table, 
                                test_table: biom.Table) -> biom.Table:
    '''
    Align the features of the train and test tables by zero-padding the
    test table with the features that exist only in the train table.

    Parameters
    ----------
    train_table: biom.Table
        A biom table with the train data
    test_table: biom.Table
        A biom table with the test data

    Returns
    -------
    new_test_table: biom.Table
        A biom table with the test data updated to contain the identical
        set of features as the train table.
    '''

    train_feature_ids = train_table.ids(axis='observation')
    test_feature_ids = test_table.ids(axis='observation')

    sample_ids = test_table.ids(axis='sample')
    n_samples = len(sample_ids)
    # features unique to the train table and features shared by both tables
    train_uniq_f = list(set(train_feature_ids) - set(test_feature_ids))
    shared_f = set(train_feature_ids) & set(test_feature_ids)
    # create an all-zero table for the features that exist only in the train table
    padding_table = biom.Table(np.zeros((len(train_uniq_f), n_samples)),
                               train_uniq_f, sample_ids)
    # drop the test features that do not exist in the train table
    test_table.filter(shared_f, axis='observation')

    # biom.Table.shape is (n_observations, n_samples), so index 0 counts features
    n_filtered_features = test_table.shape[0]
    if n_filtered_features == 0:
        raise ValueError('No feature overlap between train and test table! '
                         'Check the feature-format consistency between tables!')
    # merge the two tables
    new_test_table = test_table.merge(padding_table)

    return new_test_table
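A hedged usage sketch (the toy feature and sample ids are hypothetical; note the function filters test_table in place):

import numpy as np
import biom

train = biom.Table(np.arange(6).reshape(3, 2), ['O1', 'O2', 'O3'], ['A', 'B'])
test = biom.Table(np.array([[7, 8], [9, 10]]), ['O1', 'O2'], ['C', 'D'])

padded = pad_features_in_test_data(train, test)
print(sorted(padded.ids(axis='observation')))  # ['O1', 'O2', 'O3']
print(padded.data('O3', axis='observation'))   # [0. 0.]: zero-padded feature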
Example #3
def cluster_features(query_table: biom.Table, closed_reference_table: biom.Table,
                     query_sequences: DNAFASTAFormat,
                     reference_sequences: pd.Series, thr: float = 0.97,
                     threads: int = 1, output_log_file: str = None
                     ) -> (biom.Table, DNAFASTAFormat, DNAFASTAFormat):
    reference_sequences_fasta = get_reference_seqs_from_ids(closed_reference_table, reference_sequences)
    results = cluster_features_closed_reference(sequences=query_sequences, table=query_table,
                                                reference_sequences=reference_sequences_fasta,
                                                perc_identity=thr, threads=threads)

    clustered_table_biom = results[0]

    clustered_sequences_pd = Artifact.load(str(results[1])).view(pd.Series)
    unmatched_sequences_pd = Artifact.load(str(results[2])).view(pd.Series)

    # tempfile.mktemp() returns a path string, not a context manager; use
    # NamedTemporaryFile so the log file exists for the copy below.
    with tempfile.NamedTemporaryFile() as tmp_file:
        tmp_fp = tmp_file.name
        logger_ins = LOG(tmp_fp).get_logger('clustering_features')
        logger_ins.info("The number of OTUs in the reference database is", _15(reference_sequences_fasta).size)
        logger_ins.info("The number of unmatched sequence to the reference alignment is", unmatched_sequences_pd.size)
        logger_ins.info("The number of matched sequences to the reference alignment is", clustered_sequences_pd.size)
        logger_ins.info("Before applying clustering, the total number of counts "
                        "in the original feature table was", np.sum(query_table.sum()))
        logger_ins.info("Before applying clustering, the number of non-zero elements"
                        " of the underlying feature table is", query_table.nnz)
        logger_ins.info("After applying clustering, the total number of counts "
                        "in the original feature table was", np.sum(clustered_table_biom.sum()))
        logger_ins.info("After applying clustering, the number of non-zero elements"
                        " of the underlying feature table is", clustered_table_biom.nnz)
        logger_ins.info("The percent of total counts retained is",
                        np.sum(query_table.sum()) / np.sum(clustered_table_biom.sum()) * 100, "%s")

        query_samples = clustered_table_biom.ids('sample')
        closed_reference_features = closed_reference_table.ids('observation')
        clustered_table_biom = closed_reference_table.merge(clustered_table_biom)
        clustered_table_biom.filter(ids_to_keep=query_samples, axis='sample', inplace=True)
        if len(set(closed_reference_features) - set(clustered_table_biom.ids('observation'))) != 0:
            raise ValueError(
                "Merging the two tables failed! There are fewer features in the final table than expected!"
            )
        if output_log_file:
            shutil.copy(tmp_fp, output_log_file)
    return clustered_table_biom, results[1], results[2]
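The merge-then-filter sanity pattern above can be exercised with plain biom tables; a minimal sketch under that assumption (the toy ids are hypothetical, standing in for the QIIME 2 artifacts the function actually uses):

import numpy as np
import biom

query = biom.Table(np.array([[1], [2]]), ['O1', 'O2'], ['S1'])
reference = biom.Table(np.zeros((3, 2)), ['O1', 'O2', 'O3'], ['R1', 'R2'])

# Merging with the reference guarantees every reference feature is present;
# filtering then restores the query's sample set.
merged = reference.merge(query)
merged.filter(ids_to_keep=['S1'], axis='sample', inplace=True)
assert set(reference.ids('observation')) <= set(merged.ids('observation'))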
Example #4
def reorder_feature_table(query_table: biom.Table, reference_table: biom.Table) -> biom.Table:
    query_samples = query_table.ids()
    ref_samples = set(reference_table.ids())
    for sample in query_samples:
        if sample in ref_samples:
            raise ValueError(
                "The sample %s from your reference data was found in your "
                "query data, but the two tables should be disjoint." % sample
            )
    merged_table = reference_table.merge(query_table)
    # filter() defaults to axis='sample'; the feature filter must be explicit
    merged_table.filter(ids_to_keep=reference_table.ids('observation'),
                        axis='observation')
    merged_table.filter(ids_to_keep=query_samples)
    for sample in merged_table.ids():
        if sample in ref_samples:
            raise ValueError(
                "The sample %s from your reference data is still present in "
                "the merged table, but the two tables should be disjoint." % sample
            )

    return merged_table.sort_order(reference_table.ids('observation'),
                                   axis='observation')
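A quick sanity sketch for this helper (hypothetical toy ids): the query's features should come back ordered exactly as in the reference table.

import numpy as np
import biom

ref = biom.Table(np.zeros((3, 1)), ['O3', 'O1', 'O2'], ['R1'])
qry = biom.Table(np.array([[1], [2], [3]]), ['O1', 'O2', 'O3'], ['S1'])

reordered = reorder_feature_table(qry, ref)
print(list(reordered.ids('observation')))  # ['O3', 'O1', 'O2']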
Example #5
File: 54.py Project: tkosciol/qiita
def create_non_rarefied_biom_artifact(analysis, biom_data, rarefied_table):
    """Creates the initial non-rarefied BIOM artifact of the analysis

    Parameters
    ----------
    analysis : dict
        Dictionary with the analysis information
    biom_data : dict
        Dictionary with the biom file information
    rarefied_table : biom.Table
        The rarefied BIOM table

    Returns
    -------
    int
        The id of the new artifact
    """
    # The non-rarefied biom artifact is the initial biom table of the
    # analysis. This table does not currently exist anywhere, so we need to
    # actually create the BIOM file. To create this BIOM file we need: (1)
    # the samples and the artifacts they come from and (2) whether the
    # samples were renamed or not. (1) is in the database, but we need to
    # infer (2) from the existing rarefied BIOM table. Fun, fun...

    with TRN:
        # Get the samples included in the BIOM table grouped by artifact id
        # Note that the analysis contains a BIOM table per data type included
        # in it, and the table analysis_sample does not differentiate between
        # datatypes, so we need to check the data type in the artifact table
        sql = """SELECT artifact_id, array_agg(sample_id)
                 FROM qiita.analysis_sample
                    JOIN qiita.artifact USING (artifact_id)
                 WHERE analysis_id = %s AND data_type_id = %s
                 GROUP BY artifact_id"""
        TRN.add(sql, [analysis['analysis_id'], biom_data['data_type_id']])
        samples_by_artifact = TRN.execute_fetchindex()

        # Create an empty BIOM table to be the new master table
        new_table = Table([], [], [])
        ids_map = {}
        for a_id, samples in samples_by_artifact:
            # Get the filepath of the BIOM table from the artifact
            artifact = Artifact(a_id)
            biom_fp = None
            for _, fp, fp_type in artifact.filepaths:
                if fp_type == 'biom':
                    biom_fp = fp
            # We know the biom table exists, so there is no need to check
            # whether biom_fp is still None
            biom_table = load_table(biom_fp)
            samples = set(samples).intersection(biom_table.ids())
            biom_table.filter(samples, axis='sample', inplace=True)
            # we need to check if the table has samples left before merging
            if biom_table.shape[0] != 0 and biom_table.shape[1] != 0:
                new_table = new_table.merge(biom_table)
                ids_map.update({sid: "%d.%s" % (a_id, sid)
                                for sid in biom_table.ids()})

        # Check if we need to rename the sample ids in the biom table
        new_table_ids = set(new_table.ids())
        if not new_table_ids.issuperset(rarefied_table.ids()):
            # We need to rename the sample ids
            new_table.update_ids(ids_map, 'sample', True, True)

        sql = """INSERT INTO qiita.artifact
                    (generated_timestamp, data_type_id, visibility_id,
                     artifact_type_id, submitted_to_vamps)
            VALUES (%s, %s, %s, %s, %s)
            RETURNING artifact_id"""
        # Magic number 4 -> visibility sandbox
        # Magic number 7 -> biom artifact type
        TRN.add(sql, [analysis['timestamp'], biom_data['data_type_id'],
                      4, 7, False])
        artifact_id = TRN.execute_fetchlast()

        # Associate the artifact with the analysis
        sql = """INSERT INTO qiita.analysis_artifact
                    (analysis_id, artifact_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [analysis['analysis_id'], artifact_id])
        # Link the artifact with its file
        dd_id, mp = get_mountpoint('BIOM')[0]
        dir_fp = join(get_db_files_base_dir(), mp, str(artifact_id))
        if not exists(dir_fp):
            makedirs(dir_fp)
        new_table_fp = join(dir_fp, "biom_table.biom")
        with biom_open(new_table_fp, 'w') as f:
            new_table.to_hdf5(f, "Generated by Qiita")

        sql = """INSERT INTO qiita.filepath
                    (filepath, filepath_type_id, checksum,
                     checksum_algorithm_id, data_directory_id)
                 VALUES (%s, %s, %s, %s, %s)
                 RETURNING filepath_id"""
        # Magic number 7 -> filepath_type_id = 'biom'
        # Magic number 1 -> the checksum algorithm id
        TRN.add(sql, [basename(new_table_fp), 7,
                      compute_checksum(new_table_fp), 1, dd_id])
        fp_id = TRN.execute_fetchlast()
        sql = """INSERT INTO qiita.artifact_filepath
                    (artifact_id, filepath_id)
                 VALUES (%s, %s)"""
        TRN.add(sql, [artifact_id, fp_id])
        TRN.execute()

    return artifact_id
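The sample-renaming step relies on biom's update_ids; a minimal standalone sketch of the "<artifact_id>.<sample_id>" prefixing convention, with hypothetical ids (artifact id 4 is assumed):

import numpy as np
import biom

table = biom.Table(np.array([[1, 2]]), ['O1'], ['sample1', 'sample2'])
ids_map = {'sample1': '4.sample1', 'sample2': '4.sample2'}
renamed = table.update_ids(ids_map, axis='sample', strict=True, inplace=False)
print(list(renamed.ids()))  # ['4.sample1', '4.sample2']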