예제 #1
0
def condense_ids(df,
                 tax_dict,
                 ranks,
                 max_group_size,
                 threshold_assignments=False):
    """
    Add a condensed id and an assignment hash to the rows of ``df``.

    The unique ASSIGNMENT_TAX_ID values found in ``df`` are passed to
    ``sequtils.condense_ids`` which returns a mapping from tax_id to
    its condensed id.  That mapping is joined back onto ``df`` together
    with an ``assignment_hash`` column.

    The hash is taken over a frozenset of ids, so every row of ``df``
    receives the same value and two frames holding the same set of ids
    receive equal values.  Later, the hash is used to assign an
    assignment name based on either the set of condensed_ids or the set
    of assignment_tax_ids.  The motivation for using a hash rather than
    the actual assignment text for grouping is that the assignment text
    can contain extra annotations that are independent of which
    assignment group a qseqid belongs to, such as a 100% id star.

    NOTE(review): presumably called once per qseqid group so that each
    qseqid gets its own hash -- confirm against the caller.

    Args:
        df: DataFrame with an ASSIGNMENT_TAX_ID column.
        tax_dict: taxonomy lookup forwarded to ``sequtils.condense_ids``.
        ranks: forwarded to ``sequtils.condense_ids`` as ``ranks``.
        max_group_size: forwarded to ``sequtils.condense_ids`` as
            ``max_size``.
        threshold_assignments: when true the hash is built from the
            assignment tax_ids, otherwise from the condensed ids (per
            the original notes this is driven by the
            --split-condensed-assignments switch).

    Returns:
        ``df`` joined on ASSIGNMENT_TAX_ID with the additional columns
        ``condensed_id`` and ``assignment_hash``.
    """

    tax_ids = df[ASSIGNMENT_TAX_ID].unique()
    condensed = sequtils.condense_ids(tax_ids,
                                      tax_dict,
                                      ranks=ranks,
                                      max_size=max_group_size)

    # Hand pandas a concrete list of (tax_id, condensed_id) pairs.  On
    # Python 3 ``items()`` is a view object which older pandas releases
    # refuse to build a DataFrame from; on Python 2 this is a no-op copy.
    condensed = pd.DataFrame(list(condensed.items()),
                             columns=[ASSIGNMENT_TAX_ID, 'condensed_id'])
    condensed = condensed.set_index(ASSIGNMENT_TAX_ID)

    # Pick which set of ids identifies the assignment group.
    if threshold_assignments:
        hashed_ids = condensed.index.unique()
    else:
        hashed_ids = condensed['condensed_id'].unique()

    # frozenset makes the hash independent of id order and duplicates.
    condensed['assignment_hash'] = hash(frozenset(hashed_ids))
    return df.join(condensed, on=ASSIGNMENT_TAX_ID)
예제 #2
0
파일: classifier.py 프로젝트: crosenth/bioy
def condense_ids(
        df, tax_dict, ranks, max_group_size, threshold_assignments=False):
    """
    Add a condensed id and an assignment hash to the rows of ``df``.

    The unique ASSIGNMENT_TAX_ID values found in ``df`` are passed to
    ``sequtils.condense_ids`` which returns a mapping from tax_id to
    its condensed id.  That mapping is joined back onto ``df`` together
    with an ``assignment_hash`` column.

    The hash is taken over a frozenset of ids, so every row of ``df``
    receives the same value and two frames holding the same set of ids
    receive equal values.  Later, the hash is used to assign an
    assignment name based on either the set of condensed_ids or the set
    of assignment_tax_ids.  The motivation for using a hash rather than
    the actual assignment text for grouping is that the assignment text
    can contain extra annotations that are independent of which
    assignment group a qseqid belongs to, such as a 100% id star.

    NOTE(review): presumably called once per qseqid group so that each
    qseqid gets its own hash -- confirm against the caller.

    Args:
        df: DataFrame with an ASSIGNMENT_TAX_ID column.
        tax_dict: taxonomy lookup forwarded to ``sequtils.condense_ids``.
        ranks: forwarded to ``sequtils.condense_ids`` as ``ranks``.
        max_group_size: forwarded to ``sequtils.condense_ids`` as
            ``max_size``.
        threshold_assignments: when true the hash is built from the
            assignment tax_ids, otherwise from the condensed ids (per
            the original notes this is driven by the
            --split-condensed-assignments switch).

    Returns:
        ``df`` joined on ASSIGNMENT_TAX_ID with the additional columns
        ``condensed_id`` and ``assignment_hash``.
    """

    tax_ids = df[ASSIGNMENT_TAX_ID].unique()
    condensed = sequtils.condense_ids(
        tax_ids,
        tax_dict,
        ranks=ranks,
        max_size=max_group_size)

    # Hand pandas a concrete list of (tax_id, condensed_id) pairs.  On
    # Python 3 ``items()`` is a view object which older pandas releases
    # refuse to build a DataFrame from; on Python 2 this is a no-op copy.
    condensed = pd.DataFrame(
        list(condensed.items()),
        columns=[ASSIGNMENT_TAX_ID, 'condensed_id'])
    condensed = condensed.set_index(ASSIGNMENT_TAX_ID)

    # Pick which set of ids identifies the assignment group.
    if threshold_assignments:
        hashed_ids = condensed.index.unique()
    else:
        hashed_ids = condensed['condensed_id'].unique()

    # frozenset makes the hash independent of id order and duplicates.
    condensed['assignment_hash'] = hash(frozenset(hashed_ids))
    return df.join(condensed, on=ASSIGNMENT_TAX_ID)
예제 #3
0
    def test03(self):
        """
        Check ``sequtils.condense_ids`` with ``max_size=0``.

        Each entry of ``self.assignments`` is condensed against
        ``self.taxonomy`` and the list of results is compared with the
        pickled reference stored in
        ``<thisdatadir>/test03/assignments.pkl.bz2``.
        """

        taxonomy = self.taxonomy

        # The reference data lives in a directory named after the
        # currently running test method.
        this_test = sys._getframe().f_code.co_name
        ref_path = path.join(
            self.thisdatadir, this_test, 'assignments.pkl.bz2')

        # Close the compressed file as soon as the reference is loaded
        # instead of leaving the handle open until garbage collection.
        # NOTE(review): unpickling is acceptable only because the
        # fixture is expected to be trusted, repository-owned data.
        with BZ2File(ref_path) as ref_file:
            condensed_assignments_ref = cPickle.load(ref_file)

        # Build a real list: a lazy ``map`` object (Python 3) would
        # never compare equal to the unpickled reference.
        condensed_assignments = [
            sequtils.condense_ids(assignment, taxonomy, max_size=0)
            for assignment in self.assignments]

        self.assertEqual(condensed_assignments, condensed_assignments_ref)