def test_crossvalidator_ndarrays_and_lists(self):
    """Check that validation accepts any mix of ndarray and list inputs.

    Exercises both the ``crossvalidate`` and ``validate`` entry points
    with every combination of ``list``/``ndarray`` containers for x and
    y, asserting the measured accuracy is unchanged by container type.
    """
    xvalor = CrossValidator(Optimist, self.folds, learn_func=Optimist.train)

    # ndarray for both x and y
    rv = xvalor.crossvalidate(self.x, np.array(self.y))
    self.assertEqual(rv.stats.get(DiscretePerfStats.ACCURACY).mu, self.accuracy)

    # lists for both x and y (via the validate entry point)
    rv = xvalor.validate(self.x.tolist(), self.y)
    self.assertEqual(rv.stats.get(DiscretePerfStats.ACCURACY).mu, self.accuracy)

    # list for x and ndarray for y
    rv = xvalor.crossvalidate(self.x.tolist(), np.array(self.y))
    self.assertEqual(rv.stats.get(DiscretePerfStats.ACCURACY).mu, self.accuracy)

    # ndarray for x and list for y (via the validate entry point)
    # FIX: was the deprecated alias assertEquals (removed in Python 3.12);
    # use assertEqual, consistent with the other assertions above.
    rv = xvalor.validate(self.x, self.y)
    self.assertEqual(rv.stats.get(DiscretePerfStats.ACCURACY).mu, self.accuracy)
def main(argv=None):
    """Train and cross-validate a regressor for the antibody named on the CLI.

    Parses options, validates the regression method / antibody / subtype
    arguments, builds an alignment-derived feature matrix, runs
    cross-validation, and prints the formatted results.

    :param argv: argument vector including the program name; defaults to
                 ``sys.argv`` *at call time*.  (FIX: the original default
                 ``argv=sys.argv`` was evaluated once at import time, so
                 later changes to ``sys.argv`` were silently ignored.)
    :returns: process exit code, 0 on success (``option_parser.error``
              exits the process on invalid input).
    """
    global OPTIONS

    if argv is None:
        argv = sys.argv

    # do some option parsing
    option_parser = setup_option_parser()
    (OPTIONS, args) = option_parser.parse_args(argv)

    if OPTIONS.TEST:
        run_tests()
        return 0

    if OPTIONS.RAND_SEED is not None:
        seed(OPTIONS.RAND_SEED)

    # args[0] is the program name, args[1] the antibody
    if len(args) != 2:
        option_parser.error('ANTIBODY is a required argument')

    # check to make sure our mode is exclusive, and set the default (AMINO) if none is set
    # FIX: compute the count once with a generator instead of building the
    # same throwaway list twice.
    num_modes = sum(1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v)
    if num_modes > 1:
        option_parser.error('options --amino, --dna, and --stanfel are mutually exclusive')
    elif num_modes == 0:
        OPTIONS.AMINO = True

    # validate the regression method
    cvopts = {}
    if OPTIONS.REGRESSOR_METHOD in regressor_classes:
        cvopts['regressorcls'] = regressor_classes[OPTIONS.REGRESSOR_METHOD]
    else:
        option_parser.error('%s not in the list of available regression methods: \n %s' % (
            OPTIONS.REGRESSOR_METHOD, '\n '.join(regressor_classes.keys())))

    # only LAR/LASSO-style methods accept a feature count (--numfeats)
    if search(r'(?:lar|lasso)$', OPTIONS.REGRESSOR_METHOD):
        if OPTIONS.NUM_FEATURES < 0:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES
        cvopts['m'] = OPTIONS.NUM_FEATURES
    elif OPTIONS.NUM_FEATURES > 0:
        option_parser.error('--numfeats is a useless parameter for regression method `%s\'' % OPTIONS.REGRESSOR_METHOD)

    cvopts['logspace'] = OPTIONS.LOGSPACE

    # validate the antibody argument, currently a hack exists to make PG9/PG16 work
    # TODO: Fix pg9/16 hax
    antibody = args[1].strip()
    valid_antibodies = sorted(OPTIONS.DATA.antibodies, key=lambda ab: ab.strip())
    if antibody not in valid_antibodies:
        # some antibodies appear in the data with a leading space
        if ' ' + antibody not in valid_antibodies:
            option_parser.error('%s not in the list of permitted antibodies: \n %s' % (
                antibody, '\n '.join(ab.strip() for ab in valid_antibodies)))
        else:
            antibody = ' ' + antibody

    # validate the subtype option
    valid_subtypes = sorted(OPTIONS.DATA.subtypes, key=lambda st: st.strip().upper())
    for subtype in OPTIONS.SUBTYPES:
        if subtype not in valid_subtypes:
            option_parser.error('%s not in the list of permitted subtypes: \n %s' % (
                subtype, '\n '.join(st.strip() for st in valid_subtypes)))

    # --filter pins the feature count; otherwise fall back to the default
    if len(OPTIONS.FILTER) != 0:
        if OPTIONS.NUM_FEATURES != -1:
            option_parser.error('--filter and --numfeats are incompatible options')
        else:
            OPTIONS.NUM_FEATURES = len(OPTIONS.FILTER)
    else:  # len(OPTIONS.FILTER) == 0
        if OPTIONS.NUM_FEATURES == -1:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES

    # destroy the parser because optparse docs recommend it
    option_parser.destroy()

    # use the default DNA HXB2 Reference seq if we define --dna but don't give a new default HXB2 Reference seq
    fix_hxb2_fasta()

    # set the util params
    set_util_params(OPTIONS.HXB2_IDS)

    # fetch the alphabet, we'll probably need it later
    alph = Alphabet(mode=Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.DNA if OPTIONS.DNA else Alphabet.AMINO)

    ab_basename = ''.join((
        antibody,
        '_dna' if OPTIONS.DNA else '_amino',
        '_clonal' if OPTIONS.CLONAL else ''
    ))
    alignment_basename = '_'.join((
        ab_basename,
        OPTIONS.DATA.basename_root,
        __VERSION__
    ))

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal = OPTIONS.DATA.seqrecords(antibody, OPTIONS.CLONAL, OPTIONS.DNA)

    # if clonal isn't supported, fallback to default
    if clonal != OPTIONS.CLONAL:
        ab_basename = ''.join(ab_basename.rsplit('_clonal', 1))
        alignment_basename = ''.join(alignment_basename.rsplit('_clonal', 1))

    sto_filename = alignment_basename + '.sto'
    alignment = generate_alignment(seqrecords, sto_filename, is_refidx, OPTIONS)[0]

    ylabeler = Labeler(
        seqrecord_get_values,
        lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
    )
    alignment, y, ic50gt = ylabeler(alignment)

    # FIX: renamed from `filter`, which shadowed the builtin
    site_filter = naivefilter(
        OPTIONS.MAX_CONSERVATION,
        OPTIONS.MIN_CONSERVATION,
        OPTIONS.MAX_GAP_RATIO,
    )
    refidx = alignment_identify_ref(alignment, is_HXB2)
    builder = DataBuilder(
        alignment,
        alph,
        refidx,
        site_filter
    )
    x = builder(alignment, refidx)
    colnames = builder.labels

    crossvalidator = CrossValidator(
        classifier_cls=Regressor,
        folds=OPTIONS.CV_FOLDS,
        classifier_kwargs=cvopts,
        scorer_cls=ContinuousPerfStats,
        scorer_kwargs={}
    )
    results = crossvalidator.crossvalidate(x, y, classifier_kwargs={}, extra=extract_feature_weights)

    ret = cv_results_to_output(results, colnames)
    print(pretty_fmt_results(ret))

    return 0
def main(argv=sys.argv):
    """Train and cross-validate a regressor for the antibody named on the CLI.

    Parses options, validates the regression method / antibody / subtype
    arguments, builds an alignment-derived feature matrix, runs
    cross-validation, and prints the formatted results.

    NOTE(review): the default ``argv=sys.argv`` is evaluated once at import
    time; callers that mutate ``sys.argv`` afterwards will not see the
    change — confirm whether ``argv=None`` with a runtime fallback was
    intended.
    """
    global OPTIONS

    # so some option parsing
    option_parser = setup_option_parser()
    (OPTIONS, args) = option_parser.parse_args(argv)

    # do some argument parsing
    if OPTIONS.TEST:
        run_tests()
        return 0

    if OPTIONS.RAND_SEED is not None:
        seed(OPTIONS.RAND_SEED)

    # args[0] is the program name, args[1] the antibody argument
    if len(args) != 2:
        option_parser.error('ANTIBODY is a required argument')

    # check to make sure our mode is exclusive, and set the default (AMINO) if none is set
    if sum([1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v]) > 1:
        option_parser.error(
            'options --amino, --dna, and --stanfel are mutually exclusive')
    elif sum([1 for v in (OPTIONS.AMINO, OPTIONS.DNA, OPTIONS.STANFEL) if v]) == 0:
        OPTIONS.AMINO = True

    # validate the regression method
    cvopts = {}
    if OPTIONS.REGRESSOR_METHOD in regressor_classes:
        cvopts['regressorcls'] = regressor_classes[OPTIONS.REGRESSOR_METHOD]
    else:
        option_parser.error(
            '%s not in the list of available regression methods: \n %s' % (OPTIONS.REGRESSOR_METHOD, '\n '.join(regressor_classes.keys())))

    # only LAR/LASSO-style methods take a feature count (--numfeats)
    if search(r'(?:lar|lasso)$', OPTIONS.REGRESSOR_METHOD):
        if OPTIONS.NUM_FEATURES < 0:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES
        cvopts['m'] = OPTIONS.NUM_FEATURES
    elif OPTIONS.NUM_FEATURES > 0:
        option_parser.error(
            '--numfeats is a useless parameter for regression method `%s\'' % OPTIONS.REGRESSOR_METHOD)

    cvopts['logspace'] = OPTIONS.LOGSPACE

    # validate the antibody argument, currently a hack exists to make PG9/PG16 work
    # TODO: Fix pg9/16 hax
    antibody = args[1].strip()
    valid_antibodies = sorted(OPTIONS.DATA.antibodies, key=lambda x: x.strip())
    if antibody not in valid_antibodies:
        # retry with a leading space: some antibodies are stored that way
        if ' ' + antibody not in valid_antibodies:
            option_parser.error(
                '%s not in the list of permitted antibodies: \n %s' % (antibody, '\n '.join([ab.strip() for ab in valid_antibodies])))
        else:
            antibody = ' ' + antibody

    # validate the subtype option
    valid_subtypes = sorted(OPTIONS.DATA.subtypes, key=lambda x: x.strip().upper())
    for subtype in OPTIONS.SUBTYPES:
        if subtype not in valid_subtypes:
            option_parser.error(
                '%s not in the list of permitted subtypes: \n %s' % (subtype, '\n '.join([st.strip() for st in valid_subtypes])))

    # --filter pins the feature count; otherwise fall back to the default
    if len(OPTIONS.FILTER) != 0:
        if OPTIONS.NUM_FEATURES != -1:
            option_parser.error(
                '--filter and --numfeats are incompatible options')
        else:
            OPTIONS.NUM_FEATURES = len(OPTIONS.FILTER)
    else:  # len(OPTIONS.FILTER) == 0
        if OPTIONS.NUM_FEATURES == -1:
            OPTIONS.NUM_FEATURES = _DEFAULT_NUM_FEATURES

    # destroy the parser because optparse docs recommend it
    option_parser.destroy()

    # use the default DNA HXB2 Reference seq if we define --dna but don't give a new default HXB2 Reference seq
    fix_hxb2_fasta()

    # set the util params
    set_util_params(OPTIONS.HXB2_IDS)

    # fetch the alphabet, we'll probably need it later
    alph = Alphabet(mode=Alphabet.STANFEL if OPTIONS.STANFEL else Alphabet.DNA if OPTIONS.DNA else Alphabet.AMINO)

    ab_basename = ''.join((antibody, '_dna' if OPTIONS.DNA else '_amino', '_clonal' if OPTIONS.CLONAL else ''))
    alignment_basename = '_'.join(
        (ab_basename, OPTIONS.DATA.basename_root, __VERSION__))

    # grab the relevant antibody from the SQLITE3 data
    # format as SeqRecord so we can output as FASTA
    # and generate an alignment using HMMER if it doesn't already exist
    seqrecords, clonal = OPTIONS.DATA.seqrecords(antibody, OPTIONS.CLONAL, OPTIONS.DNA)

    # if clonal isn't supported, fallback to default
    if clonal != OPTIONS.CLONAL:
        ab_basename = ''.join(ab_basename.rsplit('_clonal', 1))
        alignment_basename = ''.join(alignment_basename.rsplit('_clonal', 1))

    sto_filename = alignment_basename + '.sto'
    alignment = generate_alignment(seqrecords, sto_filename, is_refidx, OPTIONS)[0]

    ylabeler = Labeler(
        seqrecord_get_values,
        lambda seq: is_HXB2(seq) or False,  # TODO: again filtration function
    )
    alignment, y, ic50gt = ylabeler(alignment)

    # NOTE(review): `filter` shadows the builtin — consider renaming
    filter = naivefilter(
        OPTIONS.MAX_CONSERVATION,
        OPTIONS.MIN_CONSERVATION,
        OPTIONS.MAX_GAP_RATIO,
    )
    refidx = alignment_identify_ref(alignment, is_HXB2)
    builder = DataBuilder(alignment, alph, refidx, filter)
    x = builder(alignment, refidx)
    colnames = builder.labels

    crossvalidator = CrossValidator(classifier_cls=Regressor, folds=OPTIONS.CV_FOLDS, classifier_kwargs=cvopts, scorer_cls=ContinuousPerfStats, scorer_kwargs={})
    results = crossvalidator.crossvalidate(x, y, classifier_kwargs={}, extra=extract_feature_weights)

    ret = cv_results_to_output(results, colnames)
    print(pretty_fmt_results(ret))

    # legacy reporting code retained below (commented out)
    # mean_len = max([len('%.3f' % v.mu) for v in avg_stats.values()])
    # std_len = max([len('%.3f' % v.sigma) for v in avg_stats.values()])
    # std_len = int(log10(max([1.] + [v.sigma for v in avg_stats.values()]))) + 5
    # for k, v in sorted(avg_stats.items(), key = lambda x: x[0][0]):
    #     v_str = u'= %*.3f \xb1 %*.3f' % (mean_len, v.mu, std_len, v.sigma)
    #     print(u'  %s%s' % (k, v_str))
    #
    # for k, v in avg_weights.items():
    #     if abs(v.mu) < 0.0001 and v.sigma == 0.:
    #         del avg_weights[k]
    #
    # print('\nSignificant positions (top %d):' % (len(avg_weights)))
    #
    # if len(avg_weights) > 0:
    #     name_len = max(len(k) for k in avg_weights.keys())
    #     mean_len = max(len('% .1f' % v.mu) for v in avg_weights.values())
    #     std_len = max(len('%.1f' % v.sigma) for v in avg_weights.values())
    #     N_len = max(len('%d' % len(v.values)) for v in avg_weights.values())
    #     for k, v in sorted(avg_weights.items(), key=lambda x: int(sub(r'[a-zA-Z\[\]]+', '', x[0]))):
    #         print(u'  %-*s  % *.1f \xb1 %*.1f (N = %*d)' % (name_len, k, mean_len, v.mu, std_len, v.sigma, N_len, len(v.values)))
    #
    # print('\n')

    return 0