Esempio n. 1
0
def generate_alignment(seqrecords, sto_filename, ref_id_func, opts, load=True):
    """Write a Stockholm alignment of `seqrecords` to `sto_filename`.

    When `opts.SIM` exists and equals `Simulation.DUMB`, the records are
    written out as-is. Otherwise an HMM is produced with `generate_hmm_`,
    the records are aligned with `generate_alignment_`, the result is copied
    to `sto_filename`, and the raw bytes of the HMM file are kept.

    Args:
        seqrecords: sequence records to write or align.
        sto_filename: path of the Stockholm file to create.
        ref_id_func: passed to `reference_index` together with the loaded
            alignment to obtain the reference index (only used if `load`).
        opts: options object; `opts.SIM` (optional) and `opts.REFSEQ` are
            read here, and the object is forwarded to the helper functions.
        load: when true, read the alignment back and return it trimmed.

    Returns:
        A `(msa, hmm)` tuple. `msa` is the output of `trim_msa_to_ranges`
        when `load` is true, otherwise `None`. `hmm` is the bytes read from
        the HMM file, or `None` in the simulation branch.
    """
    from ..simulation import Simulation

    log = getLogger(IDEPI_LOGGER)
    hmm = None

    if hasattr(opts, 'SIM') and opts.SIM == Simulation.DUMB:
        # we're assuming pre-aligned because they're all generated from the same refseq
        with open(sto_filename, 'w') as fh:
            SeqIO.write(seqrecords, fh, 'stockholm')
    else:
        # Bind both paths before entering the try block: if a helper raises,
        # the cleanup must not trip over an unbound name and hide the real
        # exception behind an UnboundLocalError.
        tmphmm = None
        tmpaln = None
        try:
            tmphmm = generate_hmm_(opts)
            tmpaln = generate_alignment_(seqrecords, tmphmm, opts, refseq=opts.REFSEQ)
            copyfile(tmpaln, sto_filename)
            log.debug('finished alignment, output moved to {0:s}'.format(sto_filename))
            with open(tmphmm, 'rb') as hmm_fh:
                hmm = hmm_fh.read()
        finally:
            # Remove whichever intermediate files were actually produced.
            for tmpfile in (tmphmm, tmpaln):
                if tmpfile is not None and exists(tmpfile):
                    remove(tmpfile)

    if load:
        with open(sto_filename) as fh:
            msa = AlignIO.read(fh, 'stockholm')
        refidx = reference_index(msa, ref_id_func)
        msa = LabeledMSA.from_msa_with_ref(msa, refidx)
        ranges = stockholm_rf_ranges(sto_filename)
        return trim_msa_to_ranges(msa, ranges), hmm

    return None, hmm
Esempio n. 2
0
def generate_alignment(seqrecords, sto_filename, ref_id_func, opts, load=True):
    """Produce a Stockholm alignment file and optionally load it.

    If `opts` has a `SIM` attribute equal to `Simulation.DUMB`, `seqrecords`
    are written directly to `sto_filename`. Otherwise `generate_hmm_` and
    `generate_alignment_` are run, the alignment they produce is copied to
    `sto_filename`, and the contents of the HMM file are read as bytes.

    Args:
        seqrecords: sequence records to write or align.
        sto_filename: destination path of the Stockholm file.
        ref_id_func: handed to `reference_index` along with the loaded
            alignment to find the reference index (only used if `load`).
        opts: options object; `opts.SIM` (optional) and `opts.REFSEQ` are
            read here, and the object is forwarded to the helper functions.
        load: when true, re-read the alignment and return it trimmed.

    Returns:
        `(msa, hmm)`, where `msa` is the result of `trim_msa_to_ranges` if
        `load` is true and `None` otherwise, and `hmm` is the HMM file's
        bytes or `None` when the simulation branch was taken.
    """
    from ..simulation import Simulation

    log = getLogger(IDEPI_LOGGER)
    hmm = None

    if hasattr(opts, 'SIM') and opts.SIM == Simulation.DUMB:
        # we're assuming pre-aligned because they're all generated from the same refseq
        with open(sto_filename, 'w') as fh:
            SeqIO.write(seqrecords, fh, 'stockholm')
    else:
        # Pre-bind the intermediate paths so the `finally` clause is safe
        # even when a helper fails before its result is assigned; otherwise
        # an UnboundLocalError would replace the original exception.
        tmphmm = None
        tmpaln = None
        try:
            tmphmm = generate_hmm_(opts)
            tmpaln = generate_alignment_(seqrecords,
                                         tmphmm,
                                         opts,
                                         refseq=opts.REFSEQ)
            copyfile(tmpaln, sto_filename)
            log.debug('finished alignment, output moved to {0:s}'.format(
                sto_filename))
            with open(tmphmm, 'rb') as hmm_fh:
                hmm = hmm_fh.read()
        finally:
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    if load:
        with open(sto_filename) as fh:
            msa = AlignIO.read(fh, 'stockholm')
        refidx = reference_index(msa, ref_id_func)
        msa = LabeledMSA.from_msa_with_ref(msa, refidx)
        ranges = stockholm_rf_ranges(sto_filename)
        return trim_msa_to_ranges(msa, ranges), hmm

    return None, hmm
Esempio n. 3
0
def test_discrete(ARGS):
    """Check feature extraction and mRMR fitting on the bundled test alignment.

    `TEST_AMINO_STO` is written to a scratch file, read back as a Stockholm
    alignment, and then vectorized once per encoder (`AminoEncoder`, then
    `StanfelEncoder`). The feature names, feature matrix and labels are
    compared with the matching `TEST_*` constants, and an `MRMR` selector is
    fitted on the result. The scratch file is always removed.

    Args:
        ARGS: options object. Several attributes are overwritten here
            (`NUM_FEATURES`, `MRMR_METHOD`, `MAX_CONSERVATION`,
            `MAX_GAP_RATIO`, `MIN_CONSERVATION`, `CUTOFF`, `ENCODER`);
            `REFSEQ_IDS`, `MRMR_NORMALIZE` and `SIMILAR` are read.

    Raises:
        AssertionError: if any comparison against the expected data fails.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # the util params have to be set first; the original author warns that
    # things go badly wrong otherwise
    set_util_params(ARGS.REFSEQ_IDS)

    # only the path is needed; the file is reopened by name below
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # the context manager closes the handle even if the write fails
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # the encoder is stored on ARGS itself, so it remains set afterwards
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here and again below, so
            # the second encoder pass labels the output of the first pass
            # rather than the freshly parsed file -- confirm this is intended.
            alignment, y, _ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), \
                'gen:   %s\ntruth: %s' % (colnames, TEST_NAMES)

            for name in TEST_NAMES:
                assert name in colnames, \
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))

            assert np.all(TEST_X == x)

            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
                )

            mrmr.fit(x, y)

    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
Esempio n. 4
0
def test_discrete(ARGS):
    """Exercise alignment parsing, vectorization and mRMR on the test data.

    Writes `TEST_AMINO_STO` to a scratch file, parses it as a Stockholm
    alignment, and for each of `AminoEncoder` and `StanfelEncoder` builds
    features with `MSAVectorizer`, compares names, matrix and labels with
    the corresponding `TEST_*` constants, and fits an `MRMR` selector. The
    scratch file is removed whether or not the checks succeed.

    Args:
        ARGS: options object. This function overwrites `NUM_FEATURES`,
            `MRMR_METHOD`, `MAX_CONSERVATION`, `MAX_GAP_RATIO`,
            `MIN_CONSERVATION`, `CUTOFF` and `ENCODER`, and reads
            `REFSEQ_IDS`, `MRMR_NORMALIZE` and `SIMILAR`.

    Raises:
        AssertionError: if generated features or labels differ from the
            expected values.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    ARGS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # the util params have to be set first; the original author warns that
    # things go badly wrong otherwise
    set_util_params(ARGS.REFSEQ_IDS)

    # mkstemp returns an open descriptor; close it and keep only the path
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # `with` guarantees the handle is released even if the write raises
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # the loop target is an attribute of ARGS, so the last encoder stays set
        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False,  # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            # NOTE(review): `alignment` is rebound here and again below, so
            # the second encoder pass labels the output of the first pass
            # rather than the freshly parsed file -- confirm this is intended.
            alignment, y, _ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = MSAVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            assert len(colnames) == len(TEST_NAMES), \
                'gen:   %s\ntruth: %s' % (colnames, TEST_NAMES)

            for name in TEST_NAMES:
                assert name in colnames, \
                    'ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames))

            assert np.all(TEST_X == x)

            assert np.all(TEST_Y == y)

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
                )

            mrmr.fit(x, y)

    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)