def fit_sample_noabstr(sequences, argz, random_state=random.random()):
    '''
    graphs -> more graphs
    the graphs are essentially (NAME, SEQUENCE) tuples
    '''
    # fit a sampler
    sequences = list(sequences)
    estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=1)  # with nu=.5 it also works for the smaller sets
    sampler = rna.AbstractSampler(
        radius_list=argz['radius_list'],        # e.g. [0, 1, 2]; was 0, 1
        thickness_list=argz['thickness_list'],  # e.g. [1]; was 2
        min_cip_count=argz['mincip_count'],
        min_interface_count=argz['min_interfacecount'],
        preprocessor=rnana.PreProcessor(base_thickness_list=[1], ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=estimator
        # feasibility_checker=feasibility
    )
    sampler.fit(sequences, grammar_n_jobs=1, grammar_batch_size=1)
    # logger.info('graph grammar stats:')
    dataset_size, interface_counts, core_counts, cip_counts = sampler.grammar().size()
    # logger.info('#instances:%d #interfaces: %d #cores: %d #core-interface-pairs: %d' %
    #             (dataset_size, interface_counts, core_counts, cip_counts))
    sequences = [b for a, b in sequences]
    sequences = sampler.sample(sequences,
                               n_samples=5,
                               batch_size=1,
                               n_steps=55,
                               n_jobs=1,
                               quick_skip_orig_cip=True,
                               probabilistic_core_choice=False,
                               burnin=6,
                               improving_threshold=0.5,
                               improving_linear_start=0.15,
                               max_size_diff=6,
                               accept_min_similarity=0.55,
                               select_cip_max_tries=30,
                               keep_duplicates=False,
                               include_seed=False,
                               backtrack=2,
                               monitor=False)
    # sampler.sample yields a list of result lists; flatten and keep the sequences
    result = []
    for li in sequences:
        result += li
    return [r[1] for r in result]
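# Illustrative usage sketch (not part of the original script): the keys below are
# exactly those fit_sample_noabstr reads from `argz`; the values are assumed
# placeholders, mirroring the hard-coded settings elsewhere in this file.
#
#   argz = {'radius_list': [0, 1, 2],
#           'thickness_list': [1],
#           'mincip_count': 1,
#           'min_interfacecount': 2}
#   sampled = fit_sample_noabstr(get_sequences_with_names(size=50), argz)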
def oneclasstest_fraction(fraction=0.1, repeats=2):
    # choose a random fraction of the graphs and collect good/bad scores for each repeat
    for i in range(repeats):
        badscores = []
        goodscores = []
        graphs = get_sequences_with_names(size=923)
        graphs, not_used = random_bipartition_iter(
            graphs, fraction, random_state=random.random() * i * 1000)
        estimator = Wrapper(nu=.27, cv=3, n_jobs=-1)
        sampler = rna.AbstractSampler(radius_list=[0, 1],
                                      thickness_list=[2],
                                      min_cip_count=1,
                                      min_interface_count=2,
                                      preprocessor=rna.PreProcessor(
                                          base_thickness_list=[1], ignore_inserts=True),
                                      postprocessor=rna.PostProcessor(),
                                      estimator=estimator)
        sampler.preprocessor.set_param(sampler.vectorizer)
        graphmanagers = sampler.preprocessor.fit_transform(graphs)
        sampler.estimatorobject.fit(graphmanagers,
                                    vectorizer=sampler.vectorizer,
                                    random_state=sampler.random_state)
        # test: split scores by whether the abstract shape matches the target shape
        for graphman in graphmanagers:
            struct = evaltools.dotbracket_to_shape(graphman.structure,
                                                   shapesversion=SHAPEVERSION)
            score = sampler.estimatorobject.score(graphman)
            if struct == "[[][][]]":
                goodscores.append(score)
            else:
                badscores.append(score)

        print "fraction=%f, instances=%f, good=%d, bad=%d" % (
            fraction, fraction * 923, len(goodscores), len(badscores))
        a = numpy.array(badscores)
        print 'bad: mean/std ', numpy.mean(a, axis=0), ' ', numpy.std(a, axis=0)
        a = numpy.array(goodscores)
        print 'good: mean/std ', numpy.mean(a, axis=0), ' ', numpy.std(a, axis=0)
        a = numpy.array(goodscores + badscores)
        print 'bad+good: mean/std ', numpy.mean(a, axis=0), ' ', numpy.std(a, axis=0)
        print ''
def fit_sample(graphs, random_state=random.random()):
    '''
    graphs -> more graphs
    '''
    graphs = list(graphs)
    estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=-1)
    sampler = rna.AbstractSampler(radius_list=[0, 1],
                                  thickness_list=[2],
                                  min_cip_count=1,
                                  min_interface_count=2,
                                  preprocessor=rna.PreProcessor(
                                      base_thickness_list=[1], ignore_inserts=True),
                                  postprocessor=rna.PostProcessor(),
                                  estimator=estimator
                                  # feasibility_checker=feasibility
                                  )
    sampler.fit(graphs, grammar_n_jobs=4, grammar_batch_size=1)
    logger.info('graph grammar stats:')
    dataset_size, interface_counts, core_counts, cip_counts = sampler.grammar().size()
    logger.info('#instances:%d #interfaces: %d #cores: %d #core-interface-pairs: %d' %
                (dataset_size, interface_counts, core_counts, cip_counts))
    graphs = [b for a, b in graphs]
    graphs = sampler.sample(graphs,
                            n_samples=3,
                            batch_size=1,
                            n_steps=50,
                            n_jobs=4,
                            quick_skip_orig_cip=True,
                            probabilistic_core_choice=True,
                            burnin=10,
                            improving_threshold=0.9,
                            improving_linear_start=0.3,
                            max_size_diff=20,
                            accept_min_similarity=0.65,
                            select_cip_max_tries=30,
                            keep_duplicates=False,
                            include_seed=False,
                            backtrack=10,
                            monitor=False)
    result = []
    for graphlist in graphs:
        result += graphlist
    # note that this is a list [('', sequ), ...]
    return result
def fit_sample(graphs, random_state=random.random()):
    '''
    graphs -> more graphs
    sampler settings are taken from the global `arguments` dict generated above
    '''
    global arguments
    graphs = list(graphs)
    estimator = estimatorwrapper(nu=.5, cv=2, n_jobs=NJOBS)
    sampler = rna.AbstractSampler(
        radius_list=[0, 1],
        thickness_list=[2],
        min_cip_count=arguments['mincipcount'],
        min_interface_count=arguments['mininterfacecount'],
        preprocessor=rna.PreProcessor(base_thickness_list=[1], ignore_inserts=True),
        postprocessor=rna.PostProcessor(),
        estimator=estimator
        # feasibility_checker=feasibility
    )
    sampler.fit(graphs, grammar_n_jobs=NJOBS, grammar_batch_size=1)
    graphs = [b for a, b in graphs]
    graphs = sampler.sample(graphs,
                            n_samples=arguments['n_samples'],
                            batch_size=1,
                            n_steps=arguments['n_steps'],
                            n_jobs=NJOBS,
                            quick_skip_orig_cip=arguments['quick_skip'],
                            probabilistic_core_choice=arguments['core_choice'],
                            burnin=arguments['burnin'],
                            improving_threshold=arguments['imp_thresh'],
                            improving_linear_start=arguments['imp_lin_start'],
                            max_size_diff=arguments['maxsizediff'],
                            accept_min_similarity=arguments['acc_min_sim'],
                            select_cip_max_tries=30,
                            keep_duplicates=False,
                            include_seed=False,
                            backtrack=2,
                            monitor=False)
    result = []
    for graphlist in graphs:
        result += graphlist
    # note that this is a list [('', sequ), ...]
    return result
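# Illustrative sketch (assumption, not from the original script): a minimal
# `arguments` dict covering exactly the keys this fit_sample variant reads,
# with placeholder values copied from the hard-coded fit_sample above.
#
#   NJOBS = 1
#   arguments = {'mincipcount': 1, 'mininterfacecount': 2,
#                'n_samples': 3, 'n_steps': 50,
#                'quick_skip': True, 'core_choice': True,
#                'burnin': 10, 'imp_thresh': 0.9, 'imp_lin_start': 0.3,
#                'maxsizediff': 20, 'acc_min_sim': 0.65}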
def fit_sample_infernal(seques, dummy):
    """
    pipeline: write fasta -> muscle alignment -> RNAalifold consensus structure
    -> stockholm conversion (biopython) -> cmbuild covariance model -> cmemit new sequences
    """
    # print seques
    sequences = [b for a, b in seques]
    rna.write_fasta(sequences, "tmp.fa")
    shell_exec('muscle -in tmp.fa -out museld.fa')
    a, b, out = shell_exec('cat museld.fa | RNAalifold -f F --noPS')
    ss = getstr(out)
    to_stockholm('museld.fa', ss, 'sto.sto')
    shell_exec("cmbuild -F mod3l sto.sto")
    shell_exec("cmemit -N %d --exp 3.92 mod3l > out.fa" % (len(sequences) * 2))
    return fasta_to_list('out.fa')
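# Note (inferred from the shell commands above, not stated in the source): this
# pipeline expects muscle, RNAalifold (ViennaRNA) and cmbuild/cmemit (Infernal)
# to be available on the PATH. Illustrative call:
#
#   generated = fit_sample_infernal(get_sequences_with_names(size=50), None)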
def eval(repeats, size):
    result = []
    for i in range(repeats):
        graphs = get_sequences_with_names(size=size, rand=(i + 3) * 10)
        zz = fit_sample(graphs)
        z = [b for a, b in zz]
        cmpath = '../%s.cm' % RFAM
        result += rna.infernal_checker(z, cmfile=cmpath,
                                       cmsearchbinarypath='../toolsdata/cmsearch')
    a = numpy.array(result)
    mean = numpy.mean(a, axis=0)
    std = numpy.std(a, axis=0)
    print 'size:%d mean:%f std:%f' % (size, mean, std)
    return mean, std
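# Illustrative call (the repeat/size values are assumptions): run a few sampling
# rounds on small seed sets and report the infernal_checker scores.
#
#   mean, std = eval(repeats=2, size=50)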
def evaluate(repeats, size, fitsample, RFAM, inputdict, debug):
    means = []
    stds = []
    for i in range(repeats):
        if debug:
            print 'start rep'
        sequences, void = utils.get_seq_tups(RFAM + '.fa', size, 1)
        zz = fitsample(sequences, inputdict)
        # print zz[:3]
        zz = [b for a, b in zz]
        result = rna.infernal_checker(zz,
                                      cmfile='../toolsdata/%s.cm' % RFAM,
                                      cmsearchbinarypath='../toolsdata/cmsearch')
        a = np.array(result)
        means.append(np.mean(a, axis=0))
        stds.append(np.std(a, axis=0))
    means.sort()
    stds.sort()
    # print (size, means, stds)
    # return the median mean/std over all repeats, as percentages
    return means[repeats / 2] * 100, stds[repeats / 2] * 100
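# Illustrative call (RFAM id and settings are assumptions): any fit_sample_*
# function above that accepts (sequences, inputdict) can be plugged in as
# `fitsample`, e.g. fit_sample_noabstr with the argz dict sketched earlier.
#
#   m, s = evaluate(repeats=3, size=50, fitsample=fit_sample_noabstr,
#                   RFAM='RF00005', inputdict=argz, debug=False)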
def evaluate(repeats, size, fitsample):
    print 'eval:',
    means = []
    stds = []
    for i in range(repeats):
        sequences = get_sequences_with_names(size=size, rand=10)
        zz = fitsample(sequences)
        # print zz[:3]
        # z = [b for a, b in zz]
        result = rna.infernal_checker(zz,
                                      cmfile='../toolsdata/%s.cm' % RFAM,
                                      cmsearchbinarypath='../toolsdata/cmsearch')
        a = np.array(result)
        means.append(np.mean(a, axis=0))
        stds.append(np.std(a, axis=0))
    means.sort()
    stds.sort()
    print(size, means, stds)
    # return the median mean/std over all repeats, as percentages
    return [means[repeats / 2] * 100, stds[repeats / 2] * 100]
    sequences = itertools.islice(fasta_to_sequence("../example/RF00005.fa"), size)
    return [b for (a, b) in sequences]


def get_sequences_with_names(size=9999):
    sequences = itertools.islice(fasta_to_sequence("../example/RF00005.fa"), size)
    return sequences


'''
learning a grammar
'''
import graphlearn.abstract_graphs.learned_RNA as learned
import graphlearn.abstract_graphs.RNA as rna
from graphlearn import feasibility

feas = feasibility.FeasibilityChecker(
    checklist=[feasibility.default_check, rna.is_rna])
graphs = get_sequences_with_names(150)
pp = learned.RnaPreProcessor(base_thickness_list=[2],
                             kmeans_clusters=3,
                             structure_mod=False)
sampler = rna.AbstractSampler(radius_list=[0, 1],
                              thickness_list=[1],
                              min_cip_count=2,
                              min_interface_count=2,
                              feasibility_checker=feas,
                              preprocessor=pp)
sampler.fit(graphs, grammar_n_jobs=1, grammar_batch_size=1)
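# A possible continuation (assumption, mirroring the sample() calls earlier in
# this file): once the grammar is fitted, new sequences would be drawn with
# sampler.sample() on the seed sequences.
#
#   samples = sampler.sample([b for a, b in get_sequences_with_names(150)],
#                            n_samples=3, batch_size=1, n_steps=50, n_jobs=1)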