Example 1
def by_mrmr(X, y, n_features_to_select=None, only_get_index=True):
    from sklmrmr import MRMR
    mrmr = MRMR(n_features_to_select=n_features_to_select)
    mrmr.fit(X, y)
    # indices of the selected features, read off the fitted selector
    index = mrmr.get_support(indices=True)
    if only_get_index:
        return index
    return mrmr.transform(X)
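A hypothetical call of the helper, assuming sklmrmr is installed and X holds discrete, integer-coded features (the toy data below is made up purely for illustration):

import numpy as np

rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(100, 8))   # 100 samples, 8 discrete features
y = rng.integers(0, 2, size=100)

idx = by_mrmr(X, y, n_features_to_select=3)                          # column indices
X_sel = by_mrmr(X, y, n_features_to_select=3, only_get_index=False)  # reduced matrix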
Example 2
def test_mrmr(self):
    # feature 0 is the only informative column: it equals y exactly,
    # so mRMR asked for a single feature must pick index 0
    X = np.zeros((10, 10))
    X[5:, 0] = 1
    y = np.zeros(10)
    y[5:] = 1
    model = MRMR(k=1)
    model.fit(X, y)
    assert model.selected_[0] == 0
    assert model.n_features_ == 1
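Why index 0 must win: every other column is constant and carries zero mutual information with y, while column 0 matches y exactly. A quick standalone check with scikit-learn (an illustration, not part of the test suite above):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

X = np.zeros((10, 10))
X[5:, 0] = 1
y = np.zeros(10, dtype=int)
y[5:] = 1

# exact discrete MI: positive for column 0, zero for the constant columns
mi = mutual_info_classif(X, y, discrete_features=True)
print(mi.argmax())  # 0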
Example 3
def test2():
    from sklearn.svm import SVC
    from sklearn.datasets import load_digits
    from sklmrmr import MRMR

    digits = load_digits()
    X = digits.images.reshape((len(digits.images), -1)).astype(int)
    y = digits.target

    svc = SVC(kernel='linear', C=1)
    mrmr = MRMR(estimator=svc, n_features_to_select=5)
    mrmr.fit(X, y)
    ranking = mrmr.ranking_

    print(ranking)

    return 0
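Note that MRMR is driven here through an RFE-like interface (estimator, n_features_to_select, ranking_). For comparison, the same experiment with scikit-learn's own RFE looks almost identical (a sketch using stock scikit-learn, not sklmrmr):

from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.feature_selection import RFE

digits = load_digits()
X = digits.images.reshape((len(digits.images), -1)).astype(int)
y = digits.target

rfe = RFE(estimator=SVC(kernel='linear', C=1), n_features_to_select=5)
rfe.fit(X, y)
print(rfe.ranking_)  # rank 1 marks the five selected features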
Example 4
def test_discrete(ARGS):
    # relax these so nothing gets excluded (we are only testing file generation and parsing)
    ARGS.NUM_FEATURES = 15 # should be enough; the number is known to be 13
    ARGS.MRMR_METHOD = 'MID'
    ARGS.MAX_CONSERVATION = 1.0
    ARGS.MAX_GAP_RATIO    = 1.0
    ARGS.MIN_CONSERVATION = 1.0
    ARGS.CUTOFF = 20.

    # if we don't do this, DOOMBUNNIES
    set_util_params(ARGS.REFSEQ_IDS)

    fd, sto_filename = mkstemp()
    close(fd)

    try:
        with open(sto_filename, 'w') as fh:
            print(TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for ARGS.ENCODER in (AminoEncoder, StanfelEncoder):

            if ARGS.ENCODER == StanfelEncoder:
                TEST_NAMES = TEST_STANFEL_NAMES
                TEST_X = TEST_STANFEL_X
            else:
                TEST_NAMES = TEST_AMINO_NAMES
                TEST_X = TEST_AMINO_X

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda row: is_refseq(row) or False, # TODO: again filtration function
                lambda x: x > ARGS.CUTOFF,
                False
            )
            alignment, y, ic50 = ylabeler(alignment)

            refidx = reference_index(alignment, is_refseq)
            alignment = LabeledMSA.from_msa_with_ref(alignment, refidx)
            extractor = SiteVectorizer(ARGS.ENCODER)
            x = extractor.fit_transform(alignment)
            colnames = extractor.get_feature_names()

            # test the feature names portion
            try:
                assert(len(colnames) == len(TEST_NAMES))
            except AssertionError:
                raise AssertionError('gen:   %s\ntruth: %s' % (colnames, TEST_NAMES))

            for name in TEST_NAMES:
                try:
                    assert(name in colnames)
                except AssertionError:
                    raise AssertionError('ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            assert(np.all(TEST_X == x))

            assert(np.all(TEST_Y == y))

            # generate and test the mRMR portion
            mrmr = MRMR(
                estimator=SVC(kernel='linear'),
                n_features_to_select=ARGS.NUM_FEATURES,
                method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR
                )

            mrmr.fit(x, y)

    finally:
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
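The 'MID' string above selects mRMR's mutual-information-difference criterion: at each step, add the candidate that maximizes its relevance I(x_j; y) minus its mean mutual information with the already-selected features. A self-contained greedy sketch of that rule for integer-coded features and class labels, using only scikit-learn (an illustration, not the MRMR class exercised in this test):

import numpy as np
from sklearn.feature_selection import mutual_info_classif

def mrmr_mid(X, y, k):
    """Greedy mRMR with the MID criterion on discrete, integer-coded data."""
    relevance = mutual_info_classif(X, y, discrete_features=True)
    selected = []
    remaining = list(range(X.shape[1]))
    pair_mi = {}  # cache of symmetric feature-feature MI values

    def redundancy(j):
        total = 0.0
        for s in selected:
            key = (min(j, s), max(j, s))
            if key not in pair_mi:
                pair_mi[key] = mutual_info_classif(
                    X[:, [j]], X[:, s], discrete_features=True)[0]
            total += pair_mi[key]
        return total / len(selected)

    while remaining and len(selected) < k:
        # MID: relevance minus mean redundancy against the selected set
        scores = {j: relevance[j] - (redundancy(j) if selected else 0.0)
                  for j in remaining}
        best = max(scores, key=scores.get)
        selected.append(best)
        remaining.remove(best)
    return selected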
Example 5
    parser = ArgumentParser(description='minimum redundancy'
                            ' maximum relevance feature selection')
    parser.add_argument('--method', type=str, default="mid")
    parser.add_argument('--normalize', action='store_true')
    parser.add_argument('--n_features', type=int, default=10)
    parser.add_argument('--file', type=FileType('r'),
                        default=open(DEFAULT_FILE))
    parser.add_argument('--digits', action='store_true')
    ns = parser.parse_args(args)

    if ns.digits:
        X, y = get_digits()
        data_name = 'digits'
    else:
        X, y = read_csv(ns.file)
        data_name = ns.file.name

    model = MRMR(k=ns.n_features, method=ns.method, normalize=ns.normalize)
    names = list("feature_{}".format(i) for i in range(X.shape[1]))

    print('running on {}'.format(data_name))
    print('model: {}'.format(model))

    t = time()
    model.fit(X, y)
    t = time() - t
    print("time: {:.3f} seconds".format(t))

    selected_names = list(names[i]
                          for i in np.argsort(model.ranking_)[:model.k])
    print("selected features:\n{}".format(", ".join(selected_names)))
Example 6
def main(args=None):
    init_log()

    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    # do some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.",
                                 args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    parser.add_argument('ANTIBODY',
                        type=AntibodyTypeFactory(ns.DATA),
                        nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)

    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq,
                                   ARGS)[0]

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                          min_conservation=ARGS.MIN_CONSERVATION,
                          max_gap_ratio=ARGS.MAX_GAP_RATIO)

    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('pair_ident',
                           MSAVectorizerPairwise(ARGS.ENCODER, filter,
                                                 ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4,
                                                      name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS')))

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
                       scoring=scorer,
                       n_jobs=int(getenv('NCPU', -1)),
                       pre_dispatch='3 * n_jobs',
                       cv=ARGS.CV_FOLDS - 1)

    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):

            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                y_true = y[test_idxs]
                results_.add(y_true, y_true, {}, {})
                continue

            X_train = X[train_idxs]
            y_train = y[train_idxs]

            if ARGS.RFE:
                clf = RFE(estimator=svm,
                          n_features_to_select=n_features,
                          step=ARGS.RFE_STEP)
            else:
                mrmr = MRMR(k=n_features,
                            method=ARGS.MRMR_METHOD,
                            normalize=ARGS.MRMR_NORMALIZE,
                            similar=ARGS.SIMILAR)
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            X_test = X[test_idxs]
            y_true = y[test_idxs]

            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_,
                                       svm_.coef_)

            results_.add(y_true, y_pred, coefs, ranks)

        if results is None or results_ > results:
            results = results_

    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
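The comment before the GridSearchCV call states the key design choice: the grid search is nested inside the pipeline (selector first, tuned SVM second), so feature selection runs once per outer fold instead of once per grid point. A minimal sketch of that pattern with stock scikit-learn pieces, with SelectKBest standing in for MRMR:

from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)

# the inner grid search only tunes C; the selector outside it fits once
svm = GridSearchCV(SVC(kernel='linear'),
                   param_grid={'C': [0.01, 0.1, 1, 10]}, cv=3)
pipe = Pipeline([('select', SelectKBest(mutual_info_classif, k=10)),
                 ('svm', svm)])
pipe.fit(X, y)
print(pipe.named_steps['svm'].best_params_)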
Example 7
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(
        description='learn model for labeled sequences', args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        try:
            return gzip_open(string, 'wb')
        except OSError:
            # raise (not return) so argparse reports the failure properly
            raise ArgumentTypeError(
                "cannot open '{0:s}' for writing".format(string))

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY',
                        type=AntibodyTypeFactory(ns.DATA),
                        nargs='+')
    parser.add_argument('MODEL', type=GzipType)

    ARGS = parse_args(parser, args, namespace=ns)

    antibodies = tuple(ARGS.ANTIBODY)

    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)

    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq,
                                        ARGS)

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # compute features
    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                          min_conservation=ARGS.MIN_CONSERVATION,
                          max_gap_ratio=ARGS.MAX_GAP_RATIO)

    extractors = [('site', MSAVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('site_pairs',
                           MSAVectorizerPairwise(ARGS.ENCODER, filter,
                                                 ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4,
                                                      name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', MSAVectorizerRegexPairwise(re_pngs, 4,
                                                      name='PNGS')))

    extractor = FeatureUnion(extractors,
                             n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)

    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)

    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should

    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1

    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs

    mrmr = MRMR(method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR)
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=Cs),
                       scoring=scorer,
                       n_jobs=C_jobs,
                       pre_dispatch='3 * n_jobs')
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])

    if len(ARGS.FEATURE_GRID) == 1:
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(estimator=pipe,
                           param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
                           scoring=scorer,
                           n_jobs=k_jobs,
                           pre_dispatch='3 * n_jobs',
                           cv=ARGS.CV_FOLDS).fit(X, y).best_estimator_

    pickle_dump((4, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()

    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_

    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return ARGS.MODEL
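The MODEL argument is a gzip handle that receives a pickled bundle. Reading such a model back is the mirror image; a sketch, with the file name hypothetical and the tuple layout taken from the pickle_dump call above:

import pickle
from gzip import open as gzip_open

with gzip_open('antibody.model.gz', 'rb') as fh:  # hypothetical file name
    version, encoder, label, hmm, extractor, clf = pickle.load(fh)

# the fitted extractor/classifier pair can then score a new alignment:
# X_new = extractor.transform(new_alignment)
# y_pred = clf.predict(X_new)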