Esempio n. 1
0
def main(args=None):
    init_log()

    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    # so some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.", args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'

    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
        )
    alignment, y, threshold = ylabeler(alignment)

    filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )

    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('pair_ident', MSAVectorizerPairwise(ARGS.ENCODER, filter, ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS'))
            )

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
        scoring=scorer,
        n_jobs=int(getenv('NCPU', -1)),
        pre_dispatch='3 * n_jobs',
        cv=ARGS.CV_FOLDS - 1
        )

    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):

            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                y_true = y[test_idxs]
                results_.add(y_true, y_true, {})
                continue

            X_train = X[train_idxs]
            y_train = y[train_idxs]

            if ARGS.RFE:
                clf = RFE(
                    estimator=svm,
                    n_features_to_select=n_features,
                    step=ARGS.RFE_STEP
                    )
            else:
                mrmr = MRMR(
                    k=n_features,
                    method=ARGS.MRMR_METHOD,
                    normalize=ARGS.MRMR_NORMALIZE,
                    similar=ARGS.SIMILAR
                    )
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            X_test = X[test_idxs]
            y_true = y[test_idxs]

            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_, svm_.coef_)

            results_.add(y_true, y_pred, coefs, ranks)

        if results is None or results_ > results:
            results = results_

    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
Esempio n. 2
0
def main(args=None):
    init_log()

    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    # so some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.",
                                 args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    parser.add_argument('ANTIBODY',
                        type=AntibodyTypeFactory(ns.DATA),
                        nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)

    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq,
                                   ARGS)[0]

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                          min_conservation=ARGS.MIN_CONSERVATION,
                          max_gap_ratio=ARGS.MAX_GAP_RATIO)

    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('pair_ident',
                           MSAVectorizerPairwise(ARGS.ENCODER, filter,
                                                 ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4,
                                                      name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS')))

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of classes doesn't match the data: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
                       scoring=scorer,
                       n_jobs=int(getenv('NCPU', -1)),
                       pre_dispatch='3 * n_jobs',
                       cv=ARGS.CV_FOLDS - 1)

    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):

            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                y_true = y[test_idxs]
                results_.add(y_true, y_true, {})
                continue

            X_train = X[train_idxs]
            y_train = y[train_idxs]

            if ARGS.RFE:
                clf = RFE(estimator=svm,
                          n_features_to_select=n_features,
                          step=ARGS.RFE_STEP)
            else:
                mrmr = MRMR(k=n_features,
                            method=ARGS.MRMR_METHOD,
                            normalize=ARGS.MRMR_NORMALIZE,
                            similar=ARGS.SIMILAR)
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            X_test = X[test_idxs]
            y_true = y[test_idxs]

            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_,
                                       svm_.coef_)

            results_.add(y_true, y_pred, coefs, ranks)

        if results is None or results_ > results:
            results = results_

    # the alignment reflects the number of sequences either naturally
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
Esempio n. 3
0
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description='learn model for labeled sequences', args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        try:
            return gzip_open(string, 'wb')
        except:
            return ArgumentTypeError("cannot open '{0:s}' for writing".format(string))

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    parser.add_argument('MODEL', type=GzipType)

    ARGS = parse_args(parser, args, namespace=ns)

    antibodies = tuple(ARGS.ANTIBODY)

    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'

    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # compute features
    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
    )
    alignment, y, threshold = ylabeler(alignment)

    filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )

    extractors = [('site', SiteVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('site_pairs', PairwiseSiteVectorizer(ARGS.ENCODER, filter, ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MotifVectorizer(re_pngs, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', PairwiseMotifVectorizer(re_pngs, 4, name='PNGS'))
            )

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)

    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)

    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should

    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1

    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs

    mrmr = MRMR(
        method=ARGS.MRMR_METHOD,
        normalize=ARGS.MRMR_NORMALIZE,
        similar=ARGS.SIMILAR
        )
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=Cs),
        scoring=scorer,
        n_jobs=C_jobs,
        pre_dispatch='3 * n_jobs'
        )
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])

    if len(ARGS.FEATURE_GRID) == 1:
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(
            estimator=pipe,
            param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
            scoring=scorer,
            n_jobs=k_jobs,
            pre_dispatch='3 * n_jobs',
            cv=ARGS.CV_FOLDS
            ).fit(X, y).best_estimator_

    pickle_dump((MODEL_VERSION, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()

    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_

    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return ARGS.MODEL
Esempio n. 4
0
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(
        description='learn model for labeled sequences', args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        try:
            return gzip_open(string, 'wb')
        except:
            return ArgumentTypeError(
                "cannot open '{0:s}' for writing".format(string))

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY',
                        type=AntibodyTypeFactory(ns.DATA),
                        nargs='+')
    parser.add_argument('MODEL', type=GzipType)

    ARGS = parse_args(parser, args, namespace=ns)

    antibodies = tuple(ARGS.ANTIBODY)

    # do some argument parsing
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)

    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq,
                                        ARGS)

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # compute features
    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                          min_conservation=ARGS.MIN_CONSERVATION,
                          max_gap_ratio=ARGS.MAX_GAP_RATIO)

    extractors = [('site', MSAVectorizer(ARGS.ENCODER, filter))]

    if ARGS.RADIUS:
        extractors.append(('site_pairs',
                           MSAVectorizerPairwise(ARGS.ENCODER, filter,
                                                 ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4,
                                                      name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', MSAVectorizerRegexPairwise(re_pngs, 4,
                                                      name='PNGS')))

    extractor = FeatureUnion(extractors,
                             n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)

    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)

    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should

    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1

    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs

    mrmr = MRMR(method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR)
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=Cs),
                       scoring=scorer,
                       n_jobs=C_jobs,
                       pre_dispatch='3 * n_jobs')
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])

    if len(ARGS.FEATURE_GRID) == 1:
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(estimator=pipe,
                           param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
                           scoring=scorer,
                           n_jobs=k_jobs,
                           pre_dispatch='3 * n_jobs',
                           cv=ARGS.CV_FOLDS).fit(X, y).best_estimator_

    pickle_dump((4, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()

    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_

    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return ARGS.MODEL