import sys

import leverage_efficiency.base


def main():
    # Get the name of the config file
    config_file = leverage_efficiency.base.get_config_filename(sys.argv)

    # Extract the data from the source data folder into a common format
    import extract
    extract.main(config_file)

    # Update data with most recent values (optional)
    #import update   # This doesn't connect to the rest of the pipeline yet
    #update.main(config_file)

    # Calculate derived quantities, such as returns, for input into the calculations
    import transform
    transform.main(config_file)

    # Perform leverage efficiency calculations
    import analysis
    analysis.main(config_file)

    # Create figures
    import plots
    plots.main(config_file)

    # Create the exact figures used in the paper
    import paper_plots
    paper_plots.main(config_file)

    # Create the figures used in the EE lecture notes
    import lecture_plots
    lecture_plots.main(config_file)
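# A minimal entry-point sketch (an addition, not part of the original snippet)
# so the pipeline above can be run as a script:
if __name__ == '__main__':
    main()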
import os
from pathlib import Path

import analysis


def test_data_is_unchanged():
    # Snapshot the raw data file's metadata, run the analysis, then verify
    # that every stat field (including size and timestamps) is unchanged.
    fp1 = Path("data/raw_data.csv")
    d1 = os.stat(fp1)
    analysis.main()
    fp2 = Path("data/raw_data.csv")
    d2 = os.stat(fp2)
    assert (d1.st_mode == d2.st_mode and d1.st_ino == d2.st_ino
            and d1.st_dev == d2.st_dev and d1.st_nlink == d2.st_nlink
            and d1.st_uid == d2.st_uid and d1.st_gid == d2.st_gid
            and d1.st_size == d2.st_size and d1.st_mtime == d2.st_mtime
            and d1.st_ctime == d2.st_ctime)
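# A complementary check, offered as a sketch rather than as part of the
# original test: comparing a content hash catches an in-place rewrite that
# happens to restore the same size and timestamps. The path mirrors the
# test above.
import hashlib


def _sha256(path):
    # Stream the file in chunks so large inputs aren't loaded at once
    h = hashlib.sha256()
    with open(path, 'rb') as fh:
        for chunk in iter(lambda: fh.read(8192), b''):
            h.update(chunk)
    return h.hexdigest()


def test_data_content_is_unchanged():
    before = _sha256("data/raw_data.csv")
    analysis.main()
    after = _sha256("data/raw_data.csv")
    assert before == after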
def main():
    print(config.SCORE_TYPE)
    for experiment in config.EXPERIMENTS:
        now = time.time()
        print("\nEXPERIMENT: {}\n".format(experiment))
        generate_scores.main(config.SHARDS[experiment], experiment)
        rank_answers.main(experiment)
        evaluate.main(experiment)
        analysis.main(experiment)
        lstring = 'experiment: {}\ntime = {} sec'
        print(lstring.format(str(experiment), str(time.time() - now)))
    print("Done!")
def map_and_analyze(self, eqfil=None):
    if self.mapped is None:
        logger.debug('Mapping disabled.')
    elif self.mapped is True:
        logger.debug('Is already mapped (skipping)!')
        return True
    elif self.mapped is False:
        with tools.cd(self.path):
            if eqfil is None:
                self.mapped = analysis.main(self.map_settings, eqfil)
            else:
                analysis.main(self.map_settings, eqfil)
    else:
        raise ValueError('self.mapped must be None, True or False, '
                         'got {!r}'.format(self.mapped))
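# Observation (added here, not from the original code): self.mapped acts as a
# tri-state flag: None means mapping is disabled, True means the mapping has
# already been done, and False means it still needs to run. Any other value is
# treated as a programming error, hence the ValueError above.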
def test_analysis_main(tmpdir, analysis, module, symptoms, hce, cause_list,
                       resample_test, subset):
    kwargs = {
        'clf': 'random',
        'analysis': analysis,
        'module': module,
        'symptoms': symptoms,
        'hce': hce,
        'cause_list': cause_list,
        'resample_test': resample_test,
        'resample_size': 1,
        'subset': subset,
        'n_splits': 2,
        'test_size': 0.25,
        'holdout_n': 1,
        'outdir': tmpdir.strpath
    }
    main(**kwargs)
def main():
    path = '/Users/tinghai/Learning/GuanggaoData'
    os.chdir(path + '/source')
    import analysis as an
    an.main()
    os.chdir(path)

    # Load the four sets of predictions produced upstream
    lightGBM = pd.read_csv('./result/submit_lightGBM.csv', header=0, sep=' ')
    xgboost = pd.read_csv('./result/submit_XGBoost.csv', header=0, sep=' ')
    lightGBM_xgboost = pd.read_csv('./result_fusion/submit_construct_lightGBM_predict_XGBoost.csv',
                                   header=0, sep=' ')
    xgboost_lightGBM = pd.read_csv('./result_fusion/submit_construct_XGBoost_predict_lightGBM.csv',
                                   header=0, sep=' ')

    # Blend the predicted scores with fixed weights
    result = (0.25 * xgboost.iloc[:, 1] + 0.25 * lightGBM.iloc[:, 1]
              + 0.35 * lightGBM_xgboost.iloc[:, 1] + 0.15 * xgboost_lightGBM.iloc[:, 1])
    result2 = pd.concat([lightGBM.iloc[:, 0], pd.DataFrame(result)], axis=1)
    result2.to_csv('./result/submit_integrate_' + datetime.datetime.now().strftime('%Y%m%d_%H%M%S') + '.txt',
                   index=False, index_label=None,
                   header=['instance_id', 'predicted_score'], sep=' ')
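# Sanity-check sketch (an addition, not in the original script): the four
# ensemble weights form a convex combination, 0.25 + 0.25 + 0.35 + 0.15 = 1.0,
# so the blended scores stay on the same scale as the individual models.
weights = [0.25, 0.25, 0.35, 0.15]
assert abs(sum(weights) - 1.0) < 1e-9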
def respond(self, strInput):
    if strInput.endswith('.pdf'):
        extract.extract_file(strInput, 1)
        analysis.main(strInput)
        answer = 'Your analysis is ready and saved in the parent directory!!'
        return answer
    elif strInput.lower() in ['exit', 'goodbye']:
        answer = self.findmatch(strInput)
        return answer
    else:
        answer = self.findmatch(strInput)
        if answer == '':
            self.saveUnknownInput(strInput)
            return self.listen()
        else:
            return answer
def run(directory, source, analysis_types):
    print('Running WIICA')
    inst_results = {}
    mem_results = {}
    compile.main(directory, source)
    inst_results = analysis.main(directory, source, analysis_types)
    if 'memory' in analysis_types:
        mem_results = mem_analysis.main(directory, source)
    # Merge instruction- and memory-analysis results into one dict
    all_results = dict(list(inst_results.items()) + list(mem_results.items()))
    if 'register' in analysis_types:
        reg_analysis.main(directory, source)
    return all_results
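# On the result merge above (a note, not original code): in Python 3 the same
# merge can be written without building intermediate lists; if a key appeared
# in both dicts, the right-hand value would win in either spelling.
# all_results = {**inst_results, **mem_results}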
def run(directory, kernel, source, arguments, analysis_types):
    print('Running WIICA')
    inst_results = {}
    mem_results = {}
    # 'non' is the sentinel for "no arguments"
    if arguments[0] == 'non':
        arguments = []
    compile.main(directory, kernel, source, arguments)
    process_trace.main(directory, kernel)
    inst_results = analysis.main(directory, kernel, analysis_types)
    if 'memory' in analysis_types:
        mem_results = mem_analysis.main(directory, kernel)
    all_results = dict(list(inst_results.items()) + list(mem_results.items()))
    if 'register' in analysis_types:
        reg_analysis.main(directory, kernel)
    return all_results
def test_smoke_cmd_synthesis(cls):
    fname = filenames[filename_totest]  # Just with one file for smoke test
    import analysis
    import synthesis
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '500',
                   '--f0', 'test/' + fname.replace('.wav', '.f0'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec'),
                   '--nm', 'test/' + fname.replace('.wav', '.nm')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--f0', 'test/' + fname.replace('.wav', '.f0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--f0', 'test/' + fname.replace('.wav', '.f0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec'),
                    '--nm', 'test/' + fname.replace('.wav', '.nm')])
    # synthesis.main(['test/'+fname.replace('.wav','.resynth.wav'), '--fs', '16000', '--f0', 'test/'+fname.replace('.wav','.f0'), '--spec', 'test/'+fname.replace('.wav','.spec'), '--pdd', 'test/'+fname.replace('.wav','.pdd')])
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '200', '--f0_log',
                   '--f0', 'test/' + fname.replace('.wav', '.lf0'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--logf0', 'test/' + fname.replace('.wav', '.lf0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec')])
    analysis.main(['test/' + fname, '--f0_log',
                   '--f0', 'test/' + fname.replace('.wav', '.lf0'),
                   '--spec_nbfwbnds', '65',
                   '--spec', 'test/' + fname.replace('.wav', '.fwlspec'),
                   '--nm_nbfwbnds', '33',
                   '--nm', 'test/' + fname.replace('.wav', '.fwnm')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--logf0', 'test/' + fname.replace('.wav', '.lf0'),
                    '--fwlspec', 'test/' + fname.replace('.wav', '.fwlspec'),
                    '--fwnm', 'test/' + fname.replace('.wav', '.fwnm')])
def Clicked(self, event):
    text = self.content.get()
    if text:
        # Echo the user's input into the conversation window
        self.content.set(text)
        self.conversation = "you: " + text + "\n"
        self.textconversation.insert(tk.END, self.conversation)
        self.textconversation.see(tk.END)
        self.textentry.delete(0, tk.END)
        if text.endswith(".pdf"):
            self.file = text
        if 'outlier' in text:
            if self.file == '':
                self.conversation = "bot: Please specify the file name\n"
            else:
                outliers = analysis.main(self.file, True)
                self.conversation = "bot: The outliers found in the data are:\n"
                for i, k in enumerate(outliers):
                    self.conversation += '\t{}. Year: {}\n'.format(i + 1, k[1])
                    for j in k[0].items():
                        self.conversation += '\t   {} : {}\n'.format(j[0], j[1])
            self.content.set(text)
            self.textconversation.insert(tk.END, self.conversation)
            self.textconversation.see(tk.END)
            self.textentry.delete(0, tk.END)
        else:
            respond = self.bot.respond(text)
            self.conversation = "bot: " + respond + "\n"
            self.content.set(text)
            self.textconversation.insert(tk.END, self.conversation)
            self.textconversation.see(tk.END)
            self.textentry.delete(0, tk.END)
def main(n=150000, quiet=False):
    """main(n = 150000, quiet = False)

    This script produces a grid of expected numbers of stars according to the
    selection criteria of Yusef-Zadeh et al. 2009, ApJ, 702, 178-225.
    The grid is in av for visual extinction, apera for aperture size and
    age for the maximum age of the star formation.

    Parameters
    ----------
    n       integer:
        number of stars to be sampled per parameter set
    quiet   boolean:
        if true, suppresses all standard output

    Returns
    ----------
    A number of FITS files with the sampled stars for the different parameters
    specified in this file. Standard output is used to report progress: it
    prints the parameter set to be processed next and the completeness of the
    script as

        AV aperturesize maxage completeness ETA

    where ETA is the estimated time to completion in seconds, based on the
    duration of the single last operation.
    """
    t0 = time()  # timing possibility
    if quiet:
        output_stream = StringIO()
    else:
        output_stream = sys.stdout
    print(t0, file=output_stream)

    sfr = .01

    # star mass function
    kroupa = np.vectorize(functions.kroupa)
    mf = dist.Distribution(kroupa, .1, 50.)

    # star formation history
    constant_sfr = np.vectorize(functions.constant_sfr)
    ages = np.logspace(5, 7, 7)
    sf = [dist.Distribution(constant_sfr, 1000., ages[i]) for i in range(len(ages))]
    #sfr = [150000*mf.mean()/(ages[i]-1000.) for i in range(len(ages))]

    t1 = time()  # finished reading the distributions
    print(t1, file=output_stream)

    # setting up model data
    aperas = np.logspace(2, 5, 4)
    avs = np.linspace(10.0, 50.0, 5)
    l = 1
    tmpold, tmpnew = 0., time()
    parameters = []
    for i in range(len(avs)):
        for j in range(len(aperas)):
            for k in range(len(ages)):
                tmpold, tmpnew = tmpnew, time()
                starformation.main(massfunction=mf, starformationhistory=sf[k],
                                   A_v=avs[i], sfr=n, apera=aperas[j], maxage=ages[k],
                                   appendix="%s_%03d_%06d_%09d" % ('sim', avs[i], aperas[j], ages[k]),
                                   quiet=True, precise=False)
                print(avs[i], aperas[j], ages[k],
                      l / len(avs) / len(aperas) / len(ages),
                      (len(avs) * len(aperas) * len(ages) - l) * (tmpnew - tmpold),
                      file=output_stream)
                l = l + 1
                parameters.append([avs[i], aperas[j], ages[k]])

    t2 = time()  # end of simulation
    print(t2, t1, t2 - t1)
    print('number of simulations run: %s' % l, file=output_stream)
    head = ['#', 'AV', 'Aperature_size', 'Age']
    f = open('out/__head', 'w')
    f.write(','.join(head) + '\n')
    np.savetxt(f, parameters)
    f.close()

    t3 = time()  # end of saving data

    analysis.main('out')
    print('analysis complete', file=output_stream)

    t4 = time()  # end of analysing data

    print('starting script at %f' % t0, file=output_stream)
    print('initializing %f' % (t1 - t0), file=output_stream)
    print("running simulation %f" % (t2 - t1), file=output_stream)
    print("writing data %f" % (t3 - t2), file=output_stream)
    print("analysing data %f" % (t4 - t3), file=output_stream)
    print("________________________", file=output_stream)
    print("total runtime %f" % (t4 - t0), file=output_stream)
    print("finishing script %f" % t4, file=output_stream)
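# For reference (an added note, derived from the defaults above): the grid has
# len(avs) * len(aperas) * len(ages) = 5 * 4 * 7 = 140 parameter sets, so
# starformation.main() is invoked 140 times before analysis.main('out') runs.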
import analysis

path_to_data = "/global/cscratch1/sd/zarija/4096/z05.h5"
path_to_catalog = '/global/cscratch1/sd/zarija/4096/catalog_z05_iso138.txt'
output_mass_frac = "./4096z05/mass_fraction.txt"
output_WHIM_data = "./4096z05/WHIM_data.txt"

analysis.main(path_to_data, path_to_catalog, output_mass_frac, output_WHIM_data)
import pandas as pd
import json
import urllib.request
import numpy as np
import pymongo
import districtlist
import analysis

pd.set_option('mode.chained_assignment', None)

conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.analysis
collection = db.districtdata

dict_data = {}
dict_value = []

print(f'data retrieval in progress ...')
for data in districtlist.distlist:
    url = f'https://www12.statcan.gc.ca/rest/census-recensement/CPR2016.json?lang=E&dguid={data}&topic=10&theme=5&notes=0'
    analysis.main(url)
    df_copy = analysis.languages_df.copy()
    df_copy.drop(['comment'], axis=1, inplace=True)
    df_copy.set_index(['rows'], inplace=True)
    records = json.loads(df_copy.to_json()).values()
    collection.insert_many(records)
print(f'data retrieval done.')
language.append(lan)
count.append(0)
total += 1

# Normalise the counts into per-language frequencies
for l in language:
    i = language.index(l)
    count[i] = count[i] / float(total)

# for i in language:
#     ind = language.index(i)
#     if ind > 0:
#         count[ind] += count[ind-1]

# Separate the range [0, 1] into cumulative bins, one per language
ranges = [count[0]]
for i in range(1, len(count)):
    ranges.append(ranges[i - 1] + count[i])
ranges[len(ranges) - 1] = 1

# Baseline: label each dev line with a language drawn at random,
# weighted by the training frequencies
with open("dev.txt") as f2:
    with open("baseline.txt", "w") as f:
        for l in f2.readlines():
            number = random.random()
            i = 0
            while number > ranges[i]:
                i += 1
            f.write(language[i] + '\n')

analysis.main(resultsFile="baseline.txt")
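# The while-loop above is a linear scan over the cumulative distribution in
# `ranges`. A sketch of the same draw using the standard library's bisect
# (an alternative, not part of the original script):
import bisect
import random


def draw(language, ranges):
    # ranges[i] is the cumulative probability up to and including language[i]
    return language[bisect.bisect_left(ranges, random.random())]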
import analysis

if __name__ == '__main__':
    analysis.main()
"Other", "Chinese", "Tibeto-Burman", "Tai-Kadai", "Turkic"] comments = ["Electoral Districts", "Link to Stascan information", "Single Responses: Mothertongue", "None", "None", "None",\ "None", "mainly Arabic, Hebrew, Somali", "includes Khmer, Vietnamese", "includes Tagalog (Philipino)",\ "includes Tamil", "English and French excluded", "includes Hindi, Urdu...", "mainly Farsi and Kurdish",\ "Japanese, Korean, Mongolic", "Niger-Congo, Nilo-Saharan and Creole", "Sign languages and other languages",\ "mainly Mandarin and Cantonese", "None", "mainly Thai and Lao", "None"] language_dict = {} language_dict["rows"] = row_names language_dict["comment"] = comments print(f'data retrieval in progress ...') for data in gd.provlist: url =f'https://www12.statcan.gc.ca/rest/census-recensement/CPR2016.json?lang=E&dguid={data}&topic=10&theme=5¬es=0' district, languages = analysis.main(url) language_dict[district] = languages #df_copy = analysis.languages_df.copy() #df_copy.drop (['comment'], 1, inplace=True) #df_copy.set_index(['rows'], inplace=True) #records = json.loads(df_copy.to_json()).values() #collection.insert_many(records) ###print(len(languages), len(comments), len(row_names)) # When the dictionary is complete, a new dataframe can be put together: languages_df = pd.DataFrame.from_dict(language_dict) #languages_df
def test_smoke_cmd_analysis(cls):
    fname = filenames[filename_totest]  # Just with one file for smoke test
    import analysis
    analysis.main(['test/' + fname])
    analysis.main(['test/' + fname, '--f0', 'test/' + fname.replace('.wav', '.f0')])
    analysis.main(['test/' + fname, '--f0', 'test/' + fname.replace('.wav', '.f0'),
                   '--preproc_fs', '8000'])
    analysis.main(['test/' + fname, '--f0_min', '75',
                   '--f0', 'test/' + fname.replace('.wav', '.f0')])
    analysis.main(['test/' + fname, '--f0_max', '200',
                   '--f0', 'test/' + fname.replace('.wav', '.f0')])
    analysis.main(['test/' + fname, '--f0_min', '81', '--f0_max', '220',
                   '--f0', 'test/' + fname.replace('.wav', '.f0')])
    f0s = np.fromfile('test/' + fname.replace('.wav', '.f0'), dtype=np.float32)
    f0s = f0s.reshape((-1, 1))
    np.savetxt('test/' + fname.replace('.wav', '.f0txt'), f0s)
    analysis.main(['test/' + fname,
                   '--inf0txt', 'test/' + fname.replace('.wav', '.f0txt'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec')])
    analysis.main(['test/' + fname,
                   '--inf0bin', 'test/' + fname.replace('.wav', '.f0'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec')])
    analysis.main(['test/' + fname, '--f0_log',
                   '--f0', 'test/' + fname.replace('.wav', '.lf0')])
    analysis.main(['test/' + fname, '--spec', 'test/' + fname.replace('.wav', '.spec')])
    # analysis.main(['test/'+fname, '--spec_mceporder', '59', '--spec', 'test/'+fname.replace('.wav','.mcep')])  # Need SPTK for this one
    analysis.main(['test/' + fname, '--spec_nbfwbnds', '65',
                   '--spec', 'test/' + fname.replace('.wav', '.fwlspec')])
    analysis.main(['test/' + fname, '--pdd', 'test/' + fname.replace('.wav', '.pdd')])
    # analysis.main(['test/'+fname, '--pdd_mceporder', '60', '--pdd', 'test/'+fname.replace('.wav','.pdd')])  # Need SPTK for this one
    analysis.main(['test/' + fname, '--nm', 'test/' + fname.replace('.wav', '.nm')])
    analysis.main(['test/' + fname, '--nm_nbfwbnds', '33',
                   '--nm', 'test/' + fname.replace('.wav', '.fwnm')])
    # Test pre-processing
    analysis.main(['test/' + fname,
                   '--inf0txt', 'test/' + fname.replace('.wav', '.f0txt'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec_resample16kHz'),
                   '--preproc_fs', '16000'])
    analysis.main(['test/' + fname,
                   '--inf0txt', 'test/' + fname.replace('.wav', '.f0txt'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec_preproc_hp'),
                   '--preproc_hp', '100.0'])
def main():
    options, args = loadOptions()

    # Train & Develop Model
    s1models = langMap(lambda l: {})
    totalCount = langMap()
    prob = totalCount
    train(s1models, totalCount)
    if options.stage == 2:
        s2models = trainFreqWords(options.N)

    # Run Model on Training Set
    predictions = []
    testFile = "training.txt"
    with open(testFile) as f:
        for line in f:
            line = line.split("\t", 1)[1]
            if options.stage == 2:
                prediction = predict2(line, s1models, s2models,
                                      includetl=not options.notag)
            else:
                prediction = predict(line, s1models, prob)
            predictions.append(prediction[0][0])
    with open(testFile + ".out", "w") as f:
        f.write("\n".join(predictions))
    analysis.main(testFile, ignoretl=options.notag or not options.test)

    # Run Model on Development Set
    predictions = []
    testFile = "test.txt" if options.test else "dev.txt"
    with open(testFile) as f:
        for line in f.readlines():
            key, line = line.split("\t", 1)
            if options.stage == 2:
                prediction = predict2(line, s1models, s2models,
                                      includetl=not options.notag)
            else:
                prediction = predict(line, s1models, prob)
            if options.verbose:
                print("PREDICTION:", prediction)
                print("LINE: " + line)
            predictions.append(prediction[0][0])
    with open(testFile + ".out", "w") as f:
        f.write("\n".join(predictions))
    print("Check " + testFile + ".out for the prediction results.")

    # Calculate the Precision and Recall
    analysis.main(testFile, ignoretl=options.notag or not options.test)

    if options.interactive:
        while True:
            try:
                line = input("Line to parse (or Ctrl-D to shut down): ")
            except EOFError:
                print("\nShutting Down...")
                break
            if options.stage == 2:
                prediction = predict2(line, s1models, s2models,
                                      includetl=not options.notag)
            else:
                prediction = predict(line, s1models, prob)
            sum_prob = sum([p[1] for p in prediction])
            for l, p in prediction:
                print(' %s : %.2f%%' % (l, p * 100 / sum_prob))
def test_smoke_cmd_synthesis(cls):
    fname = filenames[filename_totest]  # Just with one file for smoke test
    import analysis
    import synthesis
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '500',
                   '--f0', 'test/' + fname.replace('.wav', '.f0'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec'),
                   '--pdd', 'test/' + fname.replace('.wav', '.pdd')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--f0', 'test/' + fname.replace('.wav', '.f0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec'),
                    '--pdd', 'test/' + fname.replace('.wav', '.pdd')])
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '500',
                   '--f0', 'test/' + fname.replace('.wav', '.f0'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec'),
                   '--nm', 'test/' + fname.replace('.wav', '.nm')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--f0', 'test/' + fname.replace('.wav', '.f0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--f0', 'test/' + fname.replace('.wav', '.f0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec'),
                    '--nm', 'test/' + fname.replace('.wav', '.nm')])
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '200', '--f0_log',
                   '--f0', 'test/' + fname.replace('.wav', '.lf0'),
                   '--spec', 'test/' + fname.replace('.wav', '.spec')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--logf0', 'test/' + fname.replace('.wav', '.lf0'),
                    '--spec', 'test/' + fname.replace('.wav', '.spec')])
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '500',
                   '--f0', 'test/' + fname.replace('.wav', '.f0'),
                   '--spec_nblinlogbnds', '129',
                   '--spec', 'test/' + fname.replace('.wav', '.lspec')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--logf0', 'test/' + fname.replace('.wav', '.lf0'),
                    '--lspec', 'test/' + fname.replace('.wav', '.lspec')])
    analysis.main(['test/' + fname, '--f0_min', '75', '--f0_max', '500',
                   '--f0', 'test/' + fname.replace('.wav', '.f0'),
                   '--spec_fwceporder', '59',
                   '--spec', 'test/' + fname.replace('.wav', '.fwcep'),
                   '--nm_nbfwbnds', '33',
                   '--nm', 'test/' + fname.replace('.wav', '.fwnm')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--logf0', 'test/' + fname.replace('.wav', '.lf0'),
                    '--fwcep', 'test/' + fname.replace('.wav', '.fwcep'),
                    '--fwnm', 'test/' + fname.replace('.wav', '.fwnm')])
    # This one is the most used and thus should be the last one
    analysis.main(['test/' + fname, '--f0_log',
                   '--f0', 'test/' + fname.replace('.wav', '.lf0'),
                   '--spec_nbfwbnds', '65',
                   '--spec', 'test/' + fname.replace('.wav', '.fwlspec'),
                   '--nm_nbfwbnds', '33',
                   '--nm', 'test/' + fname.replace('.wav', '.fwnm')])
    synthesis.main(['test/' + fname.replace('.wav', '.resynth.wav'), '--fs', '16000',
                    '--logf0', 'test/' + fname.replace('.wav', '.lf0'),
                    '--fwlspec', 'test/' + fname.replace('.wav', '.fwlspec'),
                    '--fwnm', 'test/' + fname.replace('.wav', '.fwnm')])
def main(path, orf_name, yeast_fname, is_annotated, is_aligned, align_pairwise,
         **kwargs):
    #algorithm = kwargs.pop('algorithm', 'mafft')
    print(orf_name)
    # start = 2754
    # end = 2918
    # path = 'data/pgs/YLL059C_2/'
    # orf_name = 'YLL059C'
    # yeast_fname = 'data/orf_genomic_all.fasta'
    # is_annotated = True
    if is_aligned:
        filename = [s for s in os.listdir(path) if 'muscle.fa' in s][0]
    else:
        filename = [s for s in os.listdir(path) if '_alignment.fa' in s][0]
    # mcl = MuscleCommandline(input='data/ybr_deneme/YBR196C-A_alignment.fa', out='data/ybr_deneme/YBR196C-A_alignment_muscle.fa')
    # find_best_overlap_id('data/ybr_deneme/Spar')

    # Find the length of the longest record in the alignment
    aln = SeqIO.parse(path + '/' + filename, 'fasta')
    maxlen = 0
    for rec in aln:
        l = len(rec.seq)
        if l > maxlen:
            maxlen = l
    # if maxlen > 100000:
    #     return 0

    # Look up the ORF's sequence in the reference file
    orf_seq = None
    if is_annotated:
        yeast = SeqIO.parse(yeast_fname, 'fasta')
        for record in yeast:
            if record.id == orf_name:
                orf_seq = record.seq
        if orf_seq is None:
            print(orf_name + ' is not found in ' + yeast_fname)
            return 0
    else:
        yeast = SeqIO.parse(yeast_fname, 'fasta')
        for record in yeast:
            orf_seq = record.seq

    if align_pairwise:
        msa_file = list(SeqIO.parse(path + '/' + filename, 'fasta'))
        ref_seq_record = [rec for rec in msa_file if rec.id == 'Scer'][0]
        for record in msa_file:
            if record.id == 'Scer' or len(record.seq) == 0:
                continue
            start, end = get_subalignment([ref_seq_record, record], str(orf_seq),
                                          path, orf_name, is_aligned=is_aligned,
                                          **kwargs)
            aln_file_name = [s for s in os.listdir(path)
                             if '_subalignment_extended_' + record.id in s][0]
            align = AlignIO.read(path + '/' + aln_file_name, 'fasta')
            try:
                ref_seq_id = [i for i, rec in enumerate(align)
                              if rec.id == 'Scer'][0]
            except IndexError:
                print('Reference sequence name is not in the alignment')
                continue  # ref_seq_id is undefined; skip this record
            find_homologs(align=align, ref_seq_id=ref_seq_id,
                          ref_range=[start, end], orf_name=orf_name,
                          out_path=path, **kwargs)
    else:
        start, end = get_subalignment(path + '/' + filename, str(orf_seq), path,
                                      orf_name, is_aligned=is_aligned, **kwargs)
        aln_file_name = [s for s in os.listdir(path)
                         if '_alignment_muscle' in s][0]
        align = AlignIO.read(path + '/' + aln_file_name, 'fasta')
        try:
            ref_seq_id = [i for i, rec in enumerate(align)
                          if rec.id == 'Scer'][0]
        except IndexError:
            print('Reference sequence name is not in the alignment')
            return 0  # ref_seq_id is undefined; cannot continue
        find_homologs(align=align, ref_seq_id=ref_seq_id,
                      ref_range=[start, end], orf_name=orf_name,
                      out_path=path, **kwargs)

    # ss = []
    analysis.main(path, orf_name, yeast_fname, is_annotated, align_pairwise)
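# A possible invocation, using the example values from the commented-out block
# near the top of main() (the is_aligned / align_pairwise flags here are
# assumptions, not taken from the original):
# main('data/pgs/YLL059C_2/', 'YLL059C', 'data/orf_genomic_all.fasta',
#      is_annotated=True, is_aligned=False, align_pairwise=False)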
for gprotein in gprotein_list:
    #print(gprotein + '... ')
    for files in os.listdir('.'):
        if gprotein in files and 'fweight' in files:
            features = extract_features(files)
            #print(features)

    hmm_pos, hmm_pos_positions = load_functions.read_hmm(path + '/data/hmm_models/' + gprotein + '_pos.hmm')
    hmm_neg, hmm_neg_positions = load_functions.read_hmm(path + '/data/hmm_models/' + gprotein + '_neg.hmm')
    pos = read_gprotein_hmm_out(path + '/temp/' + gprotein + '_pos.out', hmm_pos_positions)
    neg = read_gprotein_hmm_out(path + '/temp/' + gprotein + '_neg.out', hmm_neg_positions)

    if hack_directory is not None:
        #if gprotein == 'GNA12':
        l = analysis.main(path, pos, neg, hmm_pos, hmm_neg, features, gprotein,
                          obj.keys(), obj)
        open(hack_directory + '/' + str(gprotein) + '.txt', 'w').write(l)
        #sys.exit()

    # Build the tab-separated header line for the feature table
    l = 'GPCR\t'
    for f in features:
        l += f + '\t'
    l += '\n'
    data = read_aln(pos, neg, hmm_pos, hmm_neg, l, features, gprotein)
    #if gprotein == 'GNAI3':
    #    print(data)
    feature_matrix = data[:, 2:]
    model = extract_model(gprotein)
    min_max = k_fold(path + '/data/feature_files/' + str(gprotein) + '_train.txt')