# NOTE(review): this `if` is the tail of a loop whose header lies outside the
# visible chunk; re-indent it to match the enclosing loop when splicing.
# Keep only sequences containing fewer than 10 'X' characters.  The sequence
# and its name are appended together so `prots` and `names` stay aligned for
# the Series built below.
if sum(1 for letter in aa if letter == 'X') < 10:
    prots.append(aa)
    names.append(name)

# <codecell>

# Single-column frame ('Tat') of the retained sequences, indexed by name.
seq_df = DataFrame({'Tat': Series(prots, index=names)})

# <codecell>

from SeqProcessTools import align_seq_data_frame

out_align = align_seq_data_frame(
    seq_df, '/home/will/HIVReportGen/Data/BlastDB/ConBseqs.txt')

# <codecell>

cohort_data = read_csv('Texas_Cohort_Data.txt', sep='\t')

# Move the row labels into a regular column (read back below as 'index', which
# assumes the index of `out_align` is unnamed), then use the text before the
# first '-' as the join key.
nout_align = out_align.reset_index()
nout_align['short_ind'] = nout_align['index'].map(lambda x: x.split('-')[0])

# Fix: `left_index` / `right_index` are boolean flags, not column names.
# Joining on named columns is done with `left_on` / `right_on`.
nout = merge(nout_align, cohort_data,
             left_on='short_ind', right_on='Patient ID')

# Fix: `DataFrame.drop` expects row labels, not a boolean mask.  Filter with
# the mask instead; rows whose 'NeuroCog' is missing (NaN) are kept.
nout = nout[nout['NeuroCog'] != 'Not Tested']

# Parenthesised single-argument form prints identically on Python 2 and 3.
print(nout)

# <codecell>
# NOTE(review): the line below is a whitespace-collapsed run of notebook cells
# (the `# <codecell>` markers were originally cell separators); the statements
# need to be put back on separate lines before this can execute.
#
# What the statements do, in order:
#   1. Change the working directory and add the AnalysisCode directory to
#      `sys.path`, then import the helpers from `SeqProcessTools`.
#   2. Glob the patient FASTA files and pass them, together with the
#      ConBseqs.txt path, to `read_pat_seq_data`.
#   3. Glob the training FASTA files, load them with `load_training_seq_data`
#      and align the result with `align_seq_data_frame` against the same
#      ConBseqs.txt path.
#   4. Concatenate the patient and training results into `all_seqs`.
#   5. Begin `get_pairwise_distances(seq_series, tree_file=None, seq_file=None)`.
#      Only its opening is visible: an `NTF()` handle is created when
#      `seq_file` / `tree_file` is None (NTF is presumably an alias for a
#      temporary-file factory -- TODO confirm against the file's imports).
#      The definition is cut off at a dangling `else:`, and the nesting of the
#      two `if` statements cannot be determined from this view.
os.chdir('/home/will/Dropbox/HIVseqs/') sys.path.append('/home/will/HIVReportGen/AnalysisCode/') from SeqProcessTools import read_pat_seq_data, load_training_seq_data, align_seq_data_frame # <codecell> import glob pat_files = glob.glob('/home/will/HIVReportGen/Data/PatientFasta/*.fasta') pat_seq = read_pat_seq_data(pat_files, '/home/will/HIVReportGen/Data/BlastDB/ConBseqs.txt') training_files = glob.glob('/home/will/HIVReportGen/Data/TrainingSequences/*.fasta') training_data = load_training_seq_data(training_files) align_lanl = align_seq_data_frame(training_data, '/home/will/HIVReportGen/Data/BlastDB/ConBseqs.txt') all_seqs = concat([pat_seq, align_lanl]) # <codecell> def get_pairwise_distances(seq_series, tree_file = None, seq_file = None): if seq_file is None: fasta_handle = NTF() if tree_file is None: tree_handle = NTF() else:
# NOTE(review): the line below is a whitespace-collapsed run of notebook cells
# (the `# <codecell>` markers were originally cell separators).  As written,
# everything after the first `#` is parsed as a comment, so only the leading
# `for` loop would actually run; the statements need to be put back on
# separate lines.
#
# What the statements do, in order:
#   1. Fill `trop_dict`, keyed by the index labels of `trops`, with the
#      corresponding values of its "Tropism" column.
#   2. Write the "score" column of the rows of `grouped_seq_df` that have a
#      non-missing "gp120" value to NewPSSMScores.xlsx.
#   3. Keep those same non-missing-"gp120" rows as `wanted_seq_data`.
#   4. Print "aligning" and align `wanted_seq_data` with
#      `align_seq_data_frame` against the ConBseqs.txt path.
#   5. Derive a "Tropism" column by mapping `decide_tropism` over "score",
#      then drop rows whose "Tropism" is missing into `wanted_data`.
#   6. Import `itertools.product` and begin `yield_regions(trop_dict)`, whose
#      body lies outside the visible chunk.
for ind, trop in zip(trops.index, trops["Tropism"].values): trop_dict[ind] = trop # <codecell> grouped_seq_df.dropna(subset=["gp120"])[["score"]].to_excel("NewPSSMScores.xlsx") # <codecell> wanted_seq_data = grouped_seq_df.dropna(subset=["gp120"]) # <codecell> print "aligning" align_data = align_seq_data_frame(wanted_seq_data, "/home/will/HIVReportGen/Data/BlastDB/ConBseqs.txt") # <codecell> align_data["Tropism"] = align_data["score"].map(decide_tropism) # <codecell> wanted_data = align_data.dropna(subset=["Tropism"]) # <codecell> from itertools import product def yield_regions(trop_dict):