Example 1
def main(args=None):
    init_log()

    if args is None:
        args = sys.argv[1:]

    # raise numpy floating-point errors as exceptions instead of warnings
    np.seterr(all='raise')

    # do some option parsing
    parser, ns, args = init_args(description="Predict epitope sites.", args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')

    ARGS = parse_args(parser, args, namespace=ns)

    # if we're only testing, run the tests and exit
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    # maxrel doesn't support similar
    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    antibodies = tuple(ARGS.ANTIBODY)

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    # if we're doing LOOCV, make sure we set CV_FOLDS appropriately
    if ARGS.LOOCV:
        ARGS.CV_FOLDS = len(seqrecords)

    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'

    # don't capture the second variable, let it be gc'd
    alignment = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)[0]

    # PNGS = potential N-linked glycosylation site: the sequon N-X-[ST]-X,
    # where X is any residue except proline (matched case-insensitively)
    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
        )
    alignment, y, threshold = ylabeler(alignment)

    site_filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )

    extractors = [('site_ident', MSAVectorizer(ARGS.ENCODER, site_filter))]

    if ARGS.RADIUS:
        extractors.append(('pair_ident', MSAVectorizerPairwise(ARGS.ENCODER, site_filter, ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pair', MSAVectorizerRegexPairwise(re_pngs, 4, name='PNGS'))
            )

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be 1 for now
    X = extractor.fit_transform(alignment)

    assert y.shape[0] == X.shape[0], \
        "number of labels doesn't match the number of samples: %d vs %d" % (y.shape[0], X.shape[0])

    scorer = Scorer(ARGS.OPTSTAT)

    # do grid-search as part of the svm to avoid
    # performing feature selection on every iteration
    # of the grid search, which naturally takes forever
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=list(C_range(*ARGS.LOG2C))),
        scoring=scorer,
        n_jobs=int(getenv('NCPU', -1)),
        pre_dispatch='3 * n_jobs',
        cv=ARGS.CV_FOLDS - 1
        )

    results = None
    for n_features in ARGS.FEATURE_GRID:
        results_ = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

        for train_idxs, test_idxs in StratifiedKFold(y, ARGS.CV_FOLDS):

            if train_idxs.sum() < 1 or test_idxs.sum() < 1:
                y_true = y[test_idxs]
                # degenerate fold: score y_true against itself, with empty
                # coefficient and rank dicts
                results_.add(y_true, y_true, {}, {})
                continue

            X_train = X[train_idxs]
            y_train = y[train_idxs]

            if ARGS.RFE:
                clf = RFE(
                    estimator=svm,
                    n_features_to_select=n_features,
                    step=ARGS.RFE_STEP
                    )
            else:
                mrmr = MRMR(
                    k=n_features,
                    method=ARGS.MRMR_METHOD,
                    normalize=ARGS.MRMR_NORMALIZE,
                    similar=ARGS.SIMILAR
                    )
                clf = Pipeline([('mrmr', mrmr), ('svm', svm)])

            clf.fit(X_train, y_train)

            X_test = X[test_idxs]
            y_true = y[test_idxs]

            if ARGS.RFE:
                selector_ = clf
                svm_ = clf.estimator_.best_estimator_
            else:
                selector_ = clf.named_steps['mrmr']
                svm_ = clf.named_steps['svm'].best_estimator_

            y_pred = clf.predict(X_test)

            coefs, ranks = coefs_ranks(selector_.ranking_, selector_.support_, svm_.coef_)

            results_.add(y_true, y_pred, coefs, ranks)

        if results is None or results_ > results:
            results = results_

    # record the metadata (antibodies, label) for the best-scoring results
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return results
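A note on the fold loop above: the feature selector (MRMR or RFE) is refit inside every training split, so selection never sees the held-out sequences. Below is a minimal, self-contained sketch of the same discipline using current scikit-learn names (SelectKBest standing in for MRMR; everything here is illustrative, not part of this codebase):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_samples=80, n_features=200, n_informative=10,
                           random_state=0)

# the selector lives inside the pipeline, so each CV fold refits it on the
# training split only; selecting features on all of X up front would leak
# test-fold information into the model
pipe = Pipeline([
    ('select', SelectKBest(f_classif, k=10)),
    ('svm', SVC(kernel='linear')),
])
scores = cross_val_score(pipe, X, y, cv=StratifiedKFold(n_splits=5))
print(scores.mean())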
Example 2
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description='learn model for labeled sequences', args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        try:
            return gzip_open(string, 'wb')
        except (IOError, OSError):
            # argparse expects the type callable to raise this, not return it
            raise ArgumentTypeError("cannot open '{0:s}' for writing".format(string))

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY', type=AntibodyTypeFactory(ns.DATA), nargs='+')
    parser.add_argument('MODEL', type=GzipType)

    ARGS = parse_args(parser, args, namespace=ns)

    antibodies = tuple(ARGS.ANTIBODY)

    # if we're only testing, run the tests and exit
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(antibodies, ARGS.CLONAL)

    ab_basename = ''.join((
        '+'.join(antibodies),
        '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
        '_clonal' if clonal else ''
        ))
    alignment_basename = '_'.join((
        ab_basename,
        ARGS.DATA.basename_root,
        __version__
        ))
    sto_filename = alignment_basename + '.sto'

    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq, ARGS)

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # compute features
    ylabeler = Labeler(
        partial(expression, ARGS.LABEL),
        partial(skipper, is_refseq, ARGS.SUBTYPES)
    )
    alignment, y, threshold = ylabeler(alignment)

    site_filter = naive_filter(
        max_conservation=ARGS.MAX_CONSERVATION,
        min_conservation=ARGS.MIN_CONSERVATION,
        max_gap_ratio=ARGS.MAX_GAP_RATIO
        )

    extractors = [('site', SiteVectorizer(ARGS.ENCODER, site_filter))]

    if ARGS.RADIUS:
        extractors.append(('site_pairs', PairwiseSiteVectorizer(ARGS.ENCODER, site_filter, ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MotifVectorizer(re_pngs, 4, name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', PairwiseMotifVectorizer(re_pngs, 4, name='PNGS'))
            )

    extractor = FeatureUnion(extractors, n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)

    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)

    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should

    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1

    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs

    mrmr = MRMR(
        method=ARGS.MRMR_METHOD,
        normalize=ARGS.MRMR_NORMALIZE,
        similar=ARGS.SIMILAR
        )
    svm = GridSearchCV(
        estimator=SVC(kernel='linear', class_weight='auto'),
        param_grid=dict(C=Cs),
        scoring=scorer,
        n_jobs=C_jobs,
        pre_dispatch='3 * n_jobs'
        )
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])

    if len(ARGS.FEATURE_GRID) == 1:
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(
            estimator=pipe,
            param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
            scoring=scorer,
            n_jobs=k_jobs,
            pre_dispatch='3 * n_jobs',
            cv=ARGS.CV_FOLDS
            ).fit(X, y).best_estimator_

    pickle_dump((MODEL_VERSION, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()

    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_

    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return ARGS.MODEL
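Both pipe.set_params(mrmr__k=...) and the outer param_grid=dict(mrmr__k=ARGS.FEATURE_GRID) rely on scikit-learn's '<step>__<parameter>' addressing for pipeline steps. A small sketch of that convention with stock components (the step names and grids below are illustrative):

from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

X, y = make_classification(n_samples=60, n_features=40, random_state=0)

pipe = Pipeline([('sel', SelectKBest(f_classif)), ('svm', SVC(kernel='linear'))])

# '<step name>__<parameter>' reaches into a pipeline step, both for
# set_params and for a GridSearchCV parameter grid
pipe.set_params(sel__k=5)
search = GridSearchCV(pipe,
                      param_grid={'sel__k': [5, 10], 'svm__C': [0.1, 1.0]},
                      cv=3)
search.fit(X, y)
print(search.best_params_)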
Example 3
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(description='Predict label for unlabeled sequences', args=args)

    parser = hmmer_args(parser)

    parser.add_argument('MODEL', type=PathType)
    parser.add_argument('SEQUENCES', type=PathType)

    ARGS = parse_args(parser, args, namespace=ns)

    with gzip_open(ARGS.MODEL, 'rb') as fh:
        try:
            model = pickle_load(fh)
            if model[0] != MODEL_VERSION:
                raise ImportError('incompatible model version')
            ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf = model[1:]
        except ImportError:
            # catches both the explicit version check above and unpickling
            # failures from models built against older code
            msg = 'your model is not of the appropriate version, please re-learn your model'
            raise RuntimeError(msg)

    # stream the input sequences, translating DNA to protein when the
    # model expects an amino-acid encoding
    with open(ARGS.SEQUENCES) as seq_fh:

        def seqrecords():
            is_dna = ARGS.ENCODER == DNAEncoder
            seq_fmt = seqfile_format(ARGS.SEQUENCES)
            source = Verifier(SeqIO.parse(seq_fh, seq_fmt), DNAAlphabet)
            try:
                for record in source:
                    yield record if is_dna else translate(record)
            except VerifyError:
                if is_dna:
                    msg = (
                        "your model specifies a DNA encoding "
                        "which is incompatible with protein sequences"
                        )
                    raise RuntimeError(msg)
                source.set_alphabet(AminoAlphabet)
                for record in source:
                    yield record

        tmphmm = tmpaln = None
        try:
            fd, tmphmm = mkstemp()
            close(fd)
            with open(tmphmm, 'wb') as hmm_fh:
                hmm_fh.write(hmm)
            # drop our reference so the HMM bytes can be gc'd
            hmm = None
            tmpaln = generate_alignment_(seqrecords(), tmphmm, ARGS)
            alignment = load_stockholm(tmpaln, trim=True)
        finally:
            # guard against NameError when mkstemp or alignment generation fails
            if tmphmm is not None and exists(tmphmm):
                remove(tmphmm)
            if tmpaln is not None and exists(tmpaln):
                remove(tmpaln)

    X = extractor.transform(alignment)
    y = clf.predict(X)

    feature_names = extractor.get_feature_names()
    support = clf.named_steps['mrmr'].support_
    labels = ['"{0:s}"'.format(feature_names[i]) for i, s in enumerate(support) if s]
    emptys = [' ' * (len(label) + 2) for label in labels]
    idlen = max(len(r.id) for r in alignment) + 3

    print('{{\n  "label": "{0:s}",\n  "predictions": ['.format(ARGS.LABEL), file=ARGS.OUTPUT)
    for i, r in enumerate(alignment):
        if i > 0:
            print(',', file=ARGS.OUTPUT)
        features = ['[ ']
        for j, x in enumerate(X[i, support]):
            if x:
                features.append(labels[j])
                features.append(', ')
            else:
                features.append(emptys[j])
        features.append(' ]')
        # blank out the last comma separator (or tighten the brackets if
        # no feature matched)
        idx = None
        for k, f in enumerate(features):
            if f == ', ':
                idx = k
        if idx is None:
            features[0] = features[0].rstrip()
            features[-1] = features[-1].lstrip()
        else:
            features[idx] = ''
        features_ = ''.join(features)
        print(
            '    {{{{ "id": {{0:<{0:d}s}} "value": {{1: d}}, "features": {{2:s}} }}}}'.format(
                idlen).format('"{0:s}",'.format(r.id), y[i], features_),
            file=ARGS.OUTPUT, end='')
    print('\n  ]\n}', file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return 0
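The chained .format() calls above exploit the fact that each pass of str.format halves a run of braces: '{{' emits a literal '{', so '{{{{' survives the first pass (which fills in the column width) as '{{' and becomes a literal '{' only on the second pass (which fills in the values). A quick standalone demonstration:

# pass 1 substitutes the field width and reduces '{{{{' to '{{'
template = '{{{{ "id": {{0:<{0:d}s}} "value": {{1: d}} }}}}'.format(12)
print(template)
# {{ "id": {0:<12s} "value": {1: d} }}

# pass 2 substitutes the values and reduces '{{' to a literal '{'
print(template.format('"seq1",', 1))
# { "id": "seq1",      "value":  1 }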
Example 4
def main(args=None):
    if args is None:
        args = sys.argv[1:]

    np.seterr(all='raise')

    parser, ns, args = init_args(
        description='learn model for labeled sequences', args=args)

    parser = hmmer_args(parser)
    parser = featsel_args(parser)
    parser = feature_args(parser)
    parser = mrmr_args(parser)
    parser = rfe_args(parser)
    parser = optstat_args(parser)
    parser = filter_args(parser)
    parser = svm_args(parser)
    parser = cv_args(parser)

    def GzipType(string):
        try:
            return gzip_open(string, 'wb')
        except (IOError, OSError):
            # argparse expects the type callable to raise this, not return it
            raise ArgumentTypeError(
                "cannot open '{0:s}' for writing".format(string))

    parser.add_argument('--tree', dest='TREE')
    parser.add_argument('ANTIBODY',
                        type=AntibodyTypeFactory(ns.DATA),
                        nargs='+')
    parser.add_argument('MODEL', type=GzipType)

    ARGS = parse_args(parser, args, namespace=ns)

    antibodies = tuple(ARGS.ANTIBODY)

    # if we're only testing, run the tests and exit
    if ARGS.TEST:
        test_discrete(ARGS)
        finalize_args(ARGS)
        return {}

    if ARGS.MRMR_METHOD == 'MAXREL':
        ARGS.SIMILAR = 0.0

    # set the util params
    set_util_params(ARGS.REFSEQ.id)

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal, antibodies = ARGS.DATA.seqrecords(
        antibodies, ARGS.CLONAL)

    ab_basename = ''.join(('+'.join(antibodies),
                           '_dna' if ARGS.ENCODER == DNAEncoder else '_amino',
                           '_clonal' if clonal else ''))
    alignment_basename = '_'.join(
        (ab_basename, ARGS.DATA.basename_root, __version__))
    sto_filename = alignment_basename + '.sto'

    alignment, hmm = generate_alignment(seqrecords, sto_filename, is_refseq,
                                        ARGS)

    re_pngs = re_compile(r'N[^P][TS][^P]', re_I)

    # compute features
    ylabeler = Labeler(partial(expression, ARGS.LABEL),
                       partial(skipper, is_refseq, ARGS.SUBTYPES))
    alignment, y, threshold = ylabeler(alignment)

    site_filter = naive_filter(max_conservation=ARGS.MAX_CONSERVATION,
                               min_conservation=ARGS.MIN_CONSERVATION,
                               max_gap_ratio=ARGS.MAX_GAP_RATIO)

    extractors = [('site', MSAVectorizer(ARGS.ENCODER, site_filter))]

    if ARGS.RADIUS:
        extractors.append(('site_pairs',
                           MSAVectorizerPairwise(ARGS.ENCODER, site_filter,
                                                 ARGS.RADIUS)))

    if ARGS.PNGS:
        extractors.append(('pngs', MSAVectorizerRegex(re_pngs, 4,
                                                      name='PNGS')))

    if ARGS.PNGS_PAIRS:
        extractors.append(
            ('pngs_pairs', MSAVectorizerRegexPairwise(re_pngs, 4,
                                                      name='PNGS')))

    extractor = FeatureUnion(extractors,
                             n_jobs=1)  # n_jobs must be one for now
    X = extractor.fit_transform(alignment)

    Cs = list(C_range(*ARGS.LOG2C))
    scorer = Scorer(ARGS.OPTSTAT)

    # we don't let GridSearchCV do its parallelization over all combinations
    # of grid points, because when the length of FEATURE_GRID is short,
    # it takes way longer than it should

    # usually the # of Cs is larger than the # of ks
    C_jobs = int(getenv('NCPU', -1))
    k_jobs = 1

    # if not, swap the parallelization strategy
    if len(ARGS.FEATURE_GRID) > len(Cs):
        C_jobs, k_jobs = k_jobs, C_jobs

    mrmr = MRMR(method=ARGS.MRMR_METHOD,
                normalize=ARGS.MRMR_NORMALIZE,
                similar=ARGS.SIMILAR)
    svm = GridSearchCV(estimator=SVC(kernel='linear', class_weight='auto'),
                       param_grid=dict(C=Cs),
                       scoring=scorer,
                       n_jobs=C_jobs,
                       pre_dispatch='3 * n_jobs')
    pipe = Pipeline([('mrmr', mrmr), ('svm', svm)])

    if len(ARGS.FEATURE_GRID) == 1:
        pipe.set_params(mrmr__k=ARGS.FEATURE_GRID[0], svm__cv=ARGS.CV_FOLDS)
        clf = pipe.fit(X, y)
    else:
        pipe.set_params(svm__cv=ARGS.CV_FOLDS - 1)
        clf = GridSearchCV(estimator=pipe,
                           param_grid=dict(mrmr__k=ARGS.FEATURE_GRID),
                           scoring=scorer,
                           n_jobs=k_jobs,
                           pre_dispatch='3 * n_jobs',
                           cv=ARGS.CV_FOLDS).fit(X, y).best_estimator_

    # 4 is the pickled model format version (MODEL_VERSION in other variants
    # of this script); the predict script checks it before unpacking
    pickle_dump((4, ARGS.ENCODER, ARGS.LABEL, hmm, extractor, clf), ARGS.MODEL)
    ARGS.MODEL.close()

    mrmr_ = clf.named_steps['mrmr']
    svm_ = clf.named_steps['svm'].best_estimator_

    coefs, ranks = coefs_ranks(mrmr_.ranking_, mrmr_.support_, svm_.coef_)
    results = Results(extractor.get_feature_names(), scorer, ARGS.SIMILAR)

    results.add(y, clf.predict(X), coefs, ranks)
    results.metadata(antibodies, ARGS.LABEL)

    print(results.dumps(), file=ARGS.OUTPUT)

    finalize_args(ARGS)

    return ARGS.MODEL
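The tuple pickled above, (4, ENCODER, LABEL, hmm, extractor, clf), is exactly what the predict script unpacks after checking the leading version field. A minimal sketch of that versioned gzip-pickle round trip with stand-in payload values (the file name and placeholders are illustrative):

import gzip
import pickle

MODEL_VERSION = 4  # must agree between the learn and predict scripts

# write: the version goes first so the loader can check it cheaply
with gzip.open('model.pkl.gz', 'wb') as fh:
    pickle.dump((MODEL_VERSION, 'amino', 'label-expr', b'<hmm bytes>',
                 'extractor', 'classifier'), fh)

# read: refuse anything written under a different format version
with gzip.open('model.pkl.gz', 'rb') as fh:
    model = pickle.load(fh)
if model[0] != MODEL_VERSION:
    raise RuntimeError('incompatible model version, please re-learn your model')
encoder, label, hmm, extractor, clf = model[1:]
print(encoder, label)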