def main(argv=None):
    """Parse command-line options, build an antibody-specific alignment,
    and run a cross-validated regression, printing formatted results.

    :param argv: full argument vector including the program name at index 0
                 (defaults to ``sys.argv``, bound at call time).
    :returns: 0 on success; argument errors exit via ``option_parser.error``.
    """
    global OPTIONS

    # Bind sys.argv at call time, not definition time: the old default of
    # ``argv=sys.argv`` froze whatever sys.argv held when the module was
    # first imported.
    if argv is None:
        argv = sys.argv

    # do some option parsing
    option_parser = setup_option_parser()
    (OPTIONS, args) = option_parser.parse_args(argv)

    if OPTIONS.TEST:
        run_tests()
        return 0

    if OPTIONS.RAND_SEED is not None:
        seed(OPTIONS.RAND_SEED)

    # argv includes the program name, so args[0] is the script itself and
    # args[1] must be the antibody
    if len(args) != 2:
        option_parser.error('ANTIBODY is a required argument')

    # the alphabet modes are mutually exclusive; default to AMINO if none is
    # set (count computed once instead of twice as before)
    num_modes = sum(1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v)
    if num_modes > 1:
        option_parser.error('options --amino, --dna, and --stanfel are mutually exclusive')
    elif num_modes == 0:
        OPTIONS.AMINO = True

    # validate the regression method
    cvopts = {}
    if OPTIONS.REGRESSOR_METHOD in regressor_classes:
        cvopts['regressorcls'] = regressor_classes[OPTIONS.REGRESSOR_METHOD]
    else:
        option_parser.error('%s not in the list of available regression methods: \n  %s' % (OPTIONS.REGRESSOR_METHOD,
            '\n  '.join(regressor_classes.keys())))

    # lar/lasso-style methods take a feature-count parameter; for any other
    # method --numfeats is meaningless
    if search(r'(?:lar|lasso)$', OPTIONS.REGRESSOR_METHOD):
        if OPTIONS.NUM_FEATURES < 0:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES
        cvopts['m'] = OPTIONS.NUM_FEATURES
    elif OPTIONS.NUM_FEATURES > 0:
        option_parser.error('--numfeats is a useless parameter for regression method `%s\'' % OPTIONS.REGRESSOR_METHOD)

    cvopts['logspace'] = OPTIONS.LOGSPACE

    # validate the antibody argument, currently a hack exists to make PG9/PG16 work
    # TODO: Fix pg9/16 hax
    antibody = args[1].strip()
    valid_antibodies = sorted(OPTIONS.DATA.antibodies, key=lambda x: x.strip())
    if antibody not in valid_antibodies:
        # some antibody names are stored with a leading space -- try that
        # before rejecting
        if ' ' + antibody not in valid_antibodies:
            option_parser.error('%s not in the list of permitted antibodies: \n  %s' % (antibody, '\n  '.join([ab.strip() for ab in valid_antibodies])))
        else:
            antibody = ' ' + antibody

    # validate the subtype option
    valid_subtypes = sorted(OPTIONS.DATA.subtypes, key=lambda x: x.strip().upper())
    for subtype in OPTIONS.SUBTYPES:
        if subtype not in valid_subtypes:
            option_parser.error('%s not in the list of permitted subtypes: \n  %s' % (subtype, '\n  '.join([st.strip() for st in valid_subtypes])))

    # --filter fixes the feature count itself, so it conflicts with an
    # explicit --numfeats; otherwise fall back to the default
    if len(OPTIONS.FILTER) != 0:
        if OPTIONS.NUM_FEATURES != -1:
            option_parser.error('--filter and --numfeats are incompatible options')
        else:
            OPTIONS.NUM_FEATURES = len(OPTIONS.FILTER)
    else:  # len(OPTIONS.FILTER) == 0
        if OPTIONS.NUM_FEATURES == -1:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES

    # destroy the parser because optparse docs recommend it
    option_parser.destroy()

    # use the default DNA HXB2 Reference seq if we define --dna but don't give a new default HXB2 Reference seq
    fix_hxb2_fasta()

    # set the util params
    set_util_params(OPTIONS.HXB2_IDS)

    # fetch the alphabet, we'll probably need it later
    alph = Alphabet(mode=Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.DNA if OPTIONS.DNA else Alphabet.AMINO)

    ab_basename = ''.join((
        antibody,
        '_dna' if OPTIONS.DNA else '_amino',
        '_clonal' if OPTIONS.CLONAL else ''
    ))
    alignment_basename = '_'.join((
        ab_basename,
        OPTIONS.DATA.basename_root,
        __VERSION__
    ))

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal = OPTIONS.DATA.seqrecords(antibody, OPTIONS.CLONAL, OPTIONS.DNA)

    # if clonal isn't supported, fallback to default: strip the '_clonal'
    # suffix from both basenames
    if clonal != OPTIONS.CLONAL:
        ab_basename = ''.join(ab_basename.rsplit('_clonal', 1))
        alignment_basename = ''.join(alignment_basename.rsplit('_clonal', 1))

    sto_filename = alignment_basename + '.sto'

    alignment = generate_alignment(seqrecords, sto_filename, is_refidx, OPTIONS)[0]

    # label sequences with their regression targets
    ylabeler = Labeler(
        seqrecord_get_values,
        lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
    )
    alignment, y, ic50gt = ylabeler(alignment)

    # renamed from ``filter`` to avoid shadowing the builtin
    site_filter = naivefilter(
        OPTIONS.MAX_CONSERVATION,
        OPTIONS.MIN_CONSERVATION,
        OPTIONS.MAX_GAP_RATIO,
    )
    refidx = alignment_identify_ref(alignment, is_HXB2)
    builder = DataBuilder(
        alignment,
        alph,
        refidx,
        site_filter
    )
    x = builder(alignment, refidx)
    colnames = builder.labels

    crossvalidator = CrossValidator(
        classifier_cls=Regressor,
        folds=OPTIONS.CV_FOLDS,
        classifier_kwargs=cvopts,
        scorer_cls=ContinuousPerfStats,
        scorer_kwargs={}
    )

    results = crossvalidator.crossvalidate(x, y, classifier_kwargs={}, extra=extract_feature_weights)

    ret = cv_results_to_output(results, colnames)

    print(pretty_fmt_results(ret))

    return 0
Example #2
0
def main(argv=sys.argv):
    """Entry point: parse options, validate arguments, build the antibody
    alignment, and run a cross-validated regression, printing the results.

    NOTE(review): ``argv=sys.argv`` binds sys.argv at import time, and the
    full vector (including the program name) is handed to optparse -- which
    is why the antibody is read from args[1] and two positional args are
    required below.
    """
    global OPTIONS

    # so some option parsing
    option_parser = setup_option_parser()
    (OPTIONS, args) = option_parser.parse_args(argv)

    # do some argument parsing
    if OPTIONS.TEST:
        run_tests()
        return 0

    if OPTIONS.RAND_SEED is not None:
        seed(OPTIONS.RAND_SEED)

    # args[0] is the program name, so exactly one positional (the antibody)
    # is expected
    if len(args) != 2:
        option_parser.error('ANTIBODY is a required argument')

    # check to make sure our mode is exclusive, and set the default (AMINO) if none is set
    if sum([1
            for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v]) > 1:
        option_parser.error(
            'options --amino, --dna, and --stanfel are mutually exclusive')
    elif sum([1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL)
              if v]) == 0:
        OPTIONS.AMINO = True

    # validate the regression method
    cvopts = {}
    if OPTIONS.REGRESSOR_METHOD in regressor_classes:
        cvopts['regressorcls'] = regressor_classes[OPTIONS.REGRESSOR_METHOD]
    else:
        option_parser.error(
            '%s not in the list of available regression methods: \n  %s' %
            (OPTIONS.REGRESSOR_METHOD, '\n  '.join(regressor_classes.keys())))

    # lar/lasso-style methods take a feature count ('m'); --numfeats is
    # rejected for every other method
    if search(r'(?:lar|lasso)$', OPTIONS.REGRESSOR_METHOD):
        if OPTIONS.NUM_FEATURES < 0:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES
        cvopts['m'] = OPTIONS.NUM_FEATURES
    elif OPTIONS.NUM_FEATURES > 0:
        option_parser.error(
            '--numfeats is a useless parameter for regression method `%s\'' %
            OPTIONS.REGRESSOR_METHOD)

    cvopts['logspace'] = OPTIONS.LOGSPACE

    # validate the antibody argument, currently a hack exists to make PG9/PG16 work
    # TODO: Fix pg9/16 hax
    antibody = args[1].strip()
    valid_antibodies = sorted(OPTIONS.DATA.antibodies, key=lambda x: x.strip())
    if antibody not in valid_antibodies:
        # some antibody names are stored with a leading space -- retry with
        # one before rejecting
        if ' ' + antibody not in valid_antibodies:
            option_parser.error(
                '%s not in the list of permitted antibodies: \n  %s' %
                (antibody, '\n  '.join([ab.strip()
                                        for ab in valid_antibodies])))
        else:
            antibody = ' ' + antibody

    # validate the subtype option
    valid_subtypes = sorted(OPTIONS.DATA.subtypes,
                            key=lambda x: x.strip().upper())
    for subtype in OPTIONS.SUBTYPES:
        if subtype not in valid_subtypes:
            option_parser.error(
                '%s not in the list of permitted subtypes: \n  %s' %
                (subtype, '\n  '.join([st.strip() for st in valid_subtypes])))

    # --filter fixes the feature count itself, so an explicit --numfeats
    # conflicts with it
    if len(OPTIONS.FILTER) != 0:
        if OPTIONS.NUM_FEATURES != -1:
            option_parser.error(
                '--filter and --numfeats are incompatible options')
        else:
            OPTIONS.NUM_FEATURES = len(OPTIONS.FILTER)
    else:  # len(OPTIONS.FILTER) == 0
        if OPTIONS.NUM_FEATURES == -1:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES

    # destroy the parser because optparse docs recommend it
    option_parser.destroy()

    # use the default DNA HXB2 Reference seq if we define --dna but don't give a new default HXB2 Reference seq
    fix_hxb2_fasta()

    # set the util params
    set_util_params(OPTIONS.HXB2_IDS)

    # fetch the alphabet, we'll probably need it later
    alph = Alphabet(mode=Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.
                    DNA if OPTIONS.DNA else Alphabet.AMINO)

    ab_basename = ''.join((antibody, '_dna' if OPTIONS.DNA else '_amino',
                           '_clonal' if OPTIONS.CLONAL else ''))
    alignment_basename = '_'.join(
        (ab_basename, OPTIONS.DATA.basename_root, __VERSION__))

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal = OPTIONS.DATA.seqrecords(antibody, OPTIONS.CLONAL,
                                                 OPTIONS.DNA)

    # if clonal isn't supported, fallback to default: the join/rsplit pair
    # strips the last '_clonal' from each basename
    if clonal != OPTIONS.CLONAL:
        ab_basename = ''.join(ab_basename.rsplit('_clonal', 1))
        alignment_basename = ''.join(alignment_basename.rsplit('_clonal', 1))

    sto_filename = alignment_basename + '.sto'

    alignment = generate_alignment(seqrecords, sto_filename, is_refidx,
                                   OPTIONS)[0]

    # label sequences with their regression targets
    ylabeler = Labeler(
        seqrecord_get_values,
        lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
    )
    alignment, y, ic50gt = ylabeler(alignment)

    # NOTE(review): 'filter' shadows the builtin; harmless here but worth
    # renaming eventually
    filter = naivefilter(
        OPTIONS.MAX_CONSERVATION,
        OPTIONS.MIN_CONSERVATION,
        OPTIONS.MAX_GAP_RATIO,
    )
    refidx = alignment_identify_ref(alignment, is_HXB2)
    builder = DataBuilder(alignment, alph, refidx, filter)
    x = builder(alignment, refidx)
    colnames = builder.labels

    crossvalidator = CrossValidator(classifier_cls=Regressor,
                                    folds=OPTIONS.CV_FOLDS,
                                    classifier_kwargs=cvopts,
                                    scorer_cls=ContinuousPerfStats,
                                    scorer_kwargs={})

    results = crossvalidator.crossvalidate(x,
                                           y,
                                           classifier_kwargs={},
                                           extra=extract_feature_weights)

    ret = cv_results_to_output(results, colnames)

    print(pretty_fmt_results(ret))

    #     mean_len = max([len('%.3f' % v.mu) for v in avg_stats.values()])
    #     std_len = max([len('%.3f' % v.sigma) for v in avg_stats.values()])
    #     std_len = int(log10(max([1.] + [v.sigma for v in avg_stats.values()]))) + 5
    #     for k, v in sorted(avg_stats.items(), key = lambda x: x[0][0]):
    #         v_str = u'= %*.3f \xb1 %*.3f' % (mean_len, v.mu, std_len, v.sigma)
    #         print(u'  %s%s' % (k, v_str))
    #
    #     for k, v in avg_weights.items():
    #         if abs(v.mu) < 0.0001 and v.sigma == 0.:
    #             del avg_weights[k]
    #
    #     print('\nSignificant positions (top %d):' % (len(avg_weights)))
    #
    #     if len(avg_weights) > 0:
    #         name_len = max(len(k) for k in avg_weights.keys())
    #         mean_len = max(len('% .1f' % v.mu) for v in avg_weights.values())
    #         std_len = max(len('%.1f' % v.sigma) for v in avg_weights.values())
    #         N_len = max(len('%d' % len(v.values)) for v in avg_weights.values())
    #         for k, v in sorted(avg_weights.items(), key=lambda x: int(sub(r'[a-zA-Z\[\]]+', '', x[0]))):
    #             print(u'  %-*s  % *.1f \xb1 %*.1f (N = %*d)' % (name_len, k, mean_len, v.mu, std_len, v.sigma, N_len, len(v.values)))
    #
    #     print('\n')

    return 0
def run_tests():
    """Self-test of feature-name and data-matrix generation.

    Writes the embedded Stockholm fixture to a temp file, then for both the
    Stanfel and amino alphabets checks that the generated column names and
    feature matrix match the known test constants. Mutates the global
    OPTIONS in place; raises AssertionError on any mismatch.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    OPTIONS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    OPTIONS.MAXREL = False
    OPTIONS.DNA = False
    OPTIONS.MAX_CONSERVATION = 1.0
    OPTIONS.MAX_GAP_RATIO = 1.0
    OPTIONS.MIN_CONSERVATION = 1.0

    # if we don't do this, DOOMBUNNIES
    set_util_params(OPTIONS.HXB2_IDS)

    # mkstemp returns an already-open fd we don't need; close it right away
    # and reuse only the path
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # 'with' guarantees the handle is closed even if the write raises;
        # the original bare open/close leaked it on error
        with open(sto_filename, 'w') as fh:
            print(_TEST_AMINO_STO, file=fh)

        alignment = AlignIO.read(sto_filename, 'stockholm')

        for OPTIONS.STANFEL in (True, False):

            if OPTIONS.STANFEL:
                OPTIONS.AMINO = False
                _TEST_NAMES = _TEST_STANFEL_NAMES
                _TEST_X = _TEST_STANFEL_X
            else:
                OPTIONS.AMINO = True
                _TEST_NAMES = _TEST_AMINO_NAMES
                _TEST_X = _TEST_AMINO_X

            alph = Alphabet(Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.DNA if OPTIONS.DNA else Alphabet.AMINO)

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
            )
            alignment, y, ic50gt = ylabeler(alignment)

            # renamed from ``filter`` to avoid shadowing the builtin
            site_filter = naivefilter(
                OPTIONS.MAX_CONSERVATION,
                OPTIONS.MIN_CONSERVATION,
                OPTIONS.MAX_GAP_RATIO
            )
            refidx = alignment_identify_ref(alignment, is_HXB2)
            builder = DataBuilder(
                alignment,
                alph,
                refidx,
                site_filter
            )
            x = builder(alignment, refidx)
            colnames = builder.labels

            # test the feature names portion; explicit raise instead of
            # try/assert/except so the check survives python -O
            if len(colnames) != len(_TEST_NAMES):
                raise AssertionError('gen:   %s\ntruth: %s' % (colnames, _TEST_NAMES))

            for name in _TEST_NAMES:
                if name not in colnames:
                    raise AssertionError('ERROR: \'%s\' not found in %s' % (name, ', '.join(colnames)))

            # _TEST_Y is presumably a module-level constant shared by both
            # alphabet modes -- TODO confirm
            assert(np.all(_TEST_X == x))

            assert(np.all(_TEST_Y == y))

            # TODO: generate and test the regressor data generation
            # print y, "\n", x

    finally:
        # always clean up the temp alignment file
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)
Example #4
0
def run_tests():
    """Self-test of feature-name and data-matrix generation.

    Writes the embedded Stockholm fixture to a temp file, then for both the
    Stanfel and amino alphabets checks that the generated column names and
    feature matrix match the known test constants. Mutates the global
    OPTIONS in place; raises AssertionError on any mismatch.
    """
    # set these to this so we don't exclude anything (just testing file generation and parsing)
    OPTIONS.NUM_FEATURES = 15  # should be enough, the number is known to be 13
    OPTIONS.MAXREL = False
    OPTIONS.DNA = False
    OPTIONS.MAX_CONSERVATION = 1.0
    OPTIONS.MAX_GAP_RATIO = 1.0
    OPTIONS.MIN_CONSERVATION = 1.0

    # if we don't do this, DOOMBUNNIES
    set_util_params(OPTIONS.HXB2_IDS)

    # mkstemp returns an already-open fd; close it and reuse only the path
    fd, sto_filename = mkstemp()
    close(fd)

    try:
        # NOTE(review): if this print raises, fh is never closed -- a
        # 'with open(...)' block would be safer
        fh = open(sto_filename, 'w')
        print(_TEST_AMINO_STO, file=fh)
        fh.close()

        alignment = AlignIO.read(sto_filename, 'stockholm')

        # loop variable is an attribute of the global OPTIONS: each pass
        # reconfigures the global mode before regenerating features
        for OPTIONS.STANFEL in (True, False):

            if OPTIONS.STANFEL:
                OPTIONS.AMINO = False
                _TEST_NAMES = _TEST_STANFEL_NAMES
                _TEST_X = _TEST_STANFEL_X
            else:
                OPTIONS.AMINO = True
                _TEST_NAMES = _TEST_AMINO_NAMES
                _TEST_X = _TEST_AMINO_X

            alph = Alphabet(Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.
                            DNA if OPTIONS.DNA else Alphabet.AMINO)

            # test mRMR and LSVM file generation
            ylabeler = Labeler(
                seqrecord_get_values,
                lambda seq: is_HXB2(seq) or
                False,  # TODO: again filtration function
            )
            alignment, y, ic50gt = ylabeler(alignment)

            # NOTE(review): 'filter' shadows the builtin
            filter = naivefilter(OPTIONS.MAX_CONSERVATION,
                                 OPTIONS.MIN_CONSERVATION,
                                 OPTIONS.MAX_GAP_RATIO)
            refidx = alignment_identify_ref(alignment, is_HXB2)
            builder = DataBuilder(alignment, alph, refidx, filter)
            x = builder(alignment, refidx)
            colnames = builder.labels

            # test the feature names portion
            # NOTE(review): assert-based checks vanish under python -O
            try:
                assert (len(colnames) == len(_TEST_NAMES))
            except AssertionError:
                raise AssertionError('gen:   %s\ntruth: %s' %
                                     (colnames, _TEST_NAMES))

            for name in _TEST_NAMES:
                try:
                    assert (name in colnames)
                except AssertionError:
                    raise AssertionError('ERROR: \'%s\' not found in %s' %
                                         (name, ', '.join(colnames)))

            assert (np.all(_TEST_X == x))

            # _TEST_Y is presumably a module-level constant shared by both
            # alphabet modes -- TODO confirm
            assert (np.all(_TEST_Y == y))

            # TODO: generate and test the regressor data generation
            # print y, "\n", x

    finally:
        # always clean up the temp alignment file
        remove(sto_filename)

    print('ALL TESTS PASS', file=sys.stderr)