def predict_position(params, recalculate, mutation, use_neighbor):
    """Return the weighted fraction of MSA rows whose residue at the mutated
    position equals the mutant residue.

    Arguments:
        params: parameter object; its 'uniprot_id' entry is overwritten with
            mutation[0].
        recalculate: forwarded as the third argument of every wc.get_stuff call.
        mutation: indexable; element 0 is the protein name, 1 the position and
            3 the mutant residue.
        use_neighbor: when true, the MSA is first filtered with
            filter_msa_based_on_pos_neighbors_and_query using the neighbors
            stored for the position.

    Returns:
        sum of the weights of the matching rows divided by the sum of all
        weights (raises ZeroDivisionError if the weights sum to zero).
    """
    import sys
    import wc
    import objects
    #use_neighbor = False
    protein_name = mutation[0]
    pos = mutation[1]
    res = mutation[3]
    params.set_param('uniprot_id', protein_name)
    seq = wc.get_stuff(objects.dW, params, recalculate, False, False)
    msa = wc.get_stuff(objects.agW, params, recalculate, False, False)
    if use_neighbor:
        neighbors = wc.get_stuff(objects.neighbors_w, params, recalculate, False, False)
        try:
            msa = filter_msa_based_on_pos_neighbors_and_query(seq, pos, msa, neighbors[pos])
        except Exception as err:
            # The original used a bare except and dropped into pdb, which
            # blocks unattended runs and also traps KeyboardInterrupt.  The
            # fallback itself (continue with the unfiltered MSA) is kept, but
            # it is now reported instead of being silent.
            sys.stderr.write(
                'predict_position: neighbor filtering failed for %s at position %s (%s); '
                'using unfiltered msa\n' % (protein_name, pos, err))
    weights = get_weight_of_msa_seqs(msa)
    col = msa.get_column(pos)
    # Accumulate the weight of every row that carries the mutant residue.
    score = sum(weights[i] for i in range(len(msa)) if col[i] == res)
    return score / sum(weights)
def predict_position_energy(params, recalculate, mutation, use_neighbor, ignore_pos):
    """Score a mutation from column counts of the MSA, optionally adding a
    neighbor-column term, and return the negated score.

    mutation is indexed as: 0 protein name, 1 position, 2 wild-type residue,
    3 mutant residue.  params gets its 'uniprot_id' set to the protein name.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source; the nesting shown below is the most plausible reading and should
    be confirmed against the original file.
    """
    import wc
    import objects
    #use_neighbor = False
    protein_name = mutation[0]
    pos = mutation[1]
    wild_res = mutation[2]
    mut_res = mutation[3]
    params.set_param('uniprot_id', protein_name)
    # seq is fetched but not used further in this function.
    seq = wc.get_stuff(objects.dW, params, recalculate, False, False)
    msa = wc.get_stuff(objects.agW, params, recalculate, False, False)
    score = 0
    col = msa.get_column(pos)
    if not ignore_pos:
        try:
            # Negative log of the add-one-smoothed ratio mutant count / wild count.
            score += -1.0 * math.log( float(col.count(mut_res)+1) / (col.count(wild_res)+1) )
            #score = -1.0 * ( float(col.count(mut_res)+1) / (col.count(wild_res)+1) )
        except:
            # NOTE(review): bare except that opens the debugger; execution
            # continues with the score unchanged once the debugger is left.
            pdb.set_trace()
            x=2
        print >> sys.stderr, score
    # When the mutant residue never occurs in the column, return before the
    # neighbor term is considered.
    if col.count(mut_res) == 0:
        return -1.0 * score
    if use_neighbor:
        # Split the MSA into rows with the wild-type residue at pos (a) and
        # rows with the mutant residue at pos (b).
        constraints_a = [(pos,wild_res)]
        filter_a_msa = filter_msa_based_on_pos_constraint(msa, constraints_a)
        constraints_b = [(pos,mut_res)]
        filter_b_msa = filter_msa_based_on_pos_constraint(msa, constraints_b)
        all_neighbors = wc.get_stuff(objects.neighbors_w, params, recalculate, False, False)
        neighbors = all_neighbors[pos]
        # Only the first listed neighbor is used.
        for neighbor in neighbors[0:1]:
            try:
                na_col = filter_a_msa.get_column(neighbor)
                nb_col = filter_b_msa.get_column(neighbor)
            except:
                pdb.set_trace()
                x=2
            # Drop gap characters before comparing the two columns.
            na_col_no_skip = [x for x in na_col if x != '-']
            nb_col_no_skip = [x for x in nb_col if x != '-']
            score += get_KL_real(nb_col_no_skip, na_col_no_skip)
    # NOTE(review): a near-zero score opens the debugger; presumably a
    # leftover development aid.
    if abs(score) < .001:
        pdb.set_trace()
        x=2
    return -1.0 * score
def get(obj, p, gotten_stuff, used_ps, check = True):
    """Record the request for (obj, p) and fetch it through wc unless it is
    already available.

    A copy of p is added to used_ps and [obj, copy of p] is appended to
    gotten_stuff.  The fetch only happens when the module-level
    whether_to_get_anything flag is true and the wrapper instance for obj
    reports that it does not have p; in that case the fetched value is
    returned and the module-level timestamp `past` is refreshed.  Otherwise
    an 'already have' line is logged and None is returned.
    """
    global past
    global whether_to_get_anything
    used_ps.add(p.get_copy())
    print >> sys.stderr, 'starting: ', p.get_param('uniprot_id'), obj, which_job, total_jobs
    gotten_stuff.append([obj, p.get_copy()])
    must_fetch = whether_to_get_anything and not wc.get_wrapper_instance(obj).has(p, False, whether_to_check_remote)
    if not must_fetch:
        print >> sys.stderr, 'already have: ', p.get_param('uniprot_id'), obj
        return None
    fetched = wc.get_stuff(obj, p, False, False, False)
    # Report the time elapsed since the previous fetch, then reset the clock.
    print >> sys.stderr, 'took: ', datetime.datetime.now() - past
    past = datetime.datetime.now()
    return fetched
def human_classify(self, record):
    """Map the stored human answer for the urinary-incontinence question of
    `record` to a binary label.

    The stored answers are looked up via wc.get_stuff using the record's
    pid and idx.

    Returns:
        1 when the stored answer is 1 or 2, 0 when it is 3 or 4.

    Raises:
        my_exceptions.NoFxnValueException: no answer is stored for the
            question, or the stored answer is 0.
        ValueError: the stored answer is outside 0..4.
    """
    import wc
    import param
    # The original imported the misspelled module name 'quesions' while
    # using 'questions' below.
    import questions
    p = param.param({'pid':record.pid, 'rec_idx':record.idx})
    stored_qa = wc.get_stuff(side_effect_human_input_report_labels, p)
    the_q = questions.urinary_incontinence
    try:
        ans = stored_qa[the_q]
    except KeyError:
        raise my_exceptions.NoFxnValueException
    if ans == 0:
        raise my_exceptions.NoFxnValueException
    if ans in [1,2]:
        return 1
    if ans in [3,4]:
        return 0
    # The original opened the debugger and then executed a bare `raise` with
    # no active exception; fail with an explicit, descriptive error instead.
    raise ValueError('unexpected stored answer %r for the urinary incontinence question' % (ans,))
def data_set_from_pid_list(cls, pid_list, params): import wc import objects from global_stuff import get_tumor_cls, get_tumor_w the_data = [] i = 0 for pid in pid_list: print i, pid i += 1 params.set_param('pid', pid) try: a_tumor = wc.get_stuff(get_tumor_w(), params) #assert len(a_tumor.attributes) == get_tumor_cls().num_attributes except my_exceptions.WCFailException: print 'failed to get ', pid except AssertionError: print 'failed to get ', pid, ' number of attributes was incorrect' except Exception: print 'failed to get ', pid, ' not sure of error' else: the_data.append(a_tumor) return cls(the_data)
edge_list = self.get_var_or_file(objects.iW, params, False, False, False) helper.write_mat(edge_list, the_folder + 'edge_list.csv') num_nodes = len(node_features) adj_mat = [ [0 for i in range(num_nodes)] for j in range(num_nodes)] for i in range(len(edge_list)): n1 = edge_list[i][0] n2 = edge_list[i][1] adj_mat[n1][n2] = 1 adj_mat[n2][n1] = 1 helper.write_mat(adj_mat, the_folder + 'adj_mat.csv') info = [str(len(node_features)), str(len(edge_features)), str(2), str(len(node_features[0])), str(len(edge_features[0]))] helper.write_vect(info, the_folder + 'info.txt', the_sep = ' ') return None #pdb_names = ['12as','2jcw','13pk','1a4i','1a4s','1ab8'] pdb_names = ['2jcw'] #chain_letters = ['A','A','A','A','A','A'] chain_letters = ['A'] from parameters import the_params the_params.set_param('pdb_names', pdb_names) the_params.set_param('chain_letters', chain_letters) wc.get_stuff(generate_old_input_files,the_params, False, False, False, False)
import pdb import f as features import new_new_objects as objects import param import wc import global_stuff # hardcode parameters for the experiment here for now. #the_dict = {'pdb_name':'1asy', 'chain_letter':'A', 'edge_feature_list':[features.xW], 'node_feature_list':[features.vW, features.uW, features.wW], 'dist_cut_off':5} #the_params = param.param(the_dict) import helper file_location = 'mf_nodewise_0' folder_name, the_params = helper.read_param(file_location) the_params.set_param('p', '1p3d') the_params.set_param('c', 'A') the_params.set_param('st', 322) the_params.set_param('en', 473) ans = wc.get_stuff(objects.ciW, the_params, False, False, False) print ans
# For every protein named in f: copy its 'easy' file to 'msa' and its
# 'pairwise' file to 'dists', and record the protein as completed when both
# exist and its MSA has more than 50 rows.
for line in f:
    name = line.strip()
    folder = global_stuff.base_folder + name + '/'
    files = os.listdir(folder)
    has_easy = False
    has_dist = False
    enough_rows = False
    for a_file in files:
        # copy to better file_name
        if 'easy' in a_file:
            has_easy = True
            subprocess.call(['cp', folder+a_file, folder+'msa'])
        if 'pairwise' in a_file:
            has_dist = True
            subprocess.call(['cp', folder+a_file, folder+'dists'])
    msa = wc.get_stuff(objects.agW, param.param({'uniprot_id':name, 'ev':evalue}), False, False, False)
    if len(msa) > 50:
        enough_rows = True
    if has_easy and has_dist and enough_rows:
        completed.append(name)
# The original wrote `f.close` without parentheses, which never closed the file.
f.close()
# Write the names of the completed proteins, one per line; the with-block
# closes the file even if a write fails.
with open(global_stuff.completed_list_file, 'w') as g:
    for name in completed:
        g.write(name + '\n')
# Script: read parameters from the file named by the first command-line
# argument, set 'which_wrapperq' to objects.fW and request objects.abW.
import pdb
# NOTE(review): breakpoint at startup; the script stops in the debugger
# before doing anything else. Presumably a development leftover.
pdb.set_trace()
import f
import new_new_objects as objects
import wrapper
from wrapper_decorator import dec
import wc
import global_stuff
import sys
info_file = sys.argv[1]
import helper
# The first element returned by read_param is not used afterwards.
asdf, the_params = helper.read_param(info_file)
# NOTE(review): the key is spelled 'which_wrapperq'; confirm the trailing
# 'q' is intended.
the_params.set_param('which_wrapperq', objects.fW)
# The result of the request is discarded.
wc.get_stuff(objects.abW, the_params, True, False, False)
# Script: compute the output object for the filtered mutation list of a
# protein list file and write it out as a matrix.
#
# Positional arguments:
#   1 input protein list file   2 output file
#   3 use_neighbor ('T' = true) 4 ignore_pos ('T' = true)
#   5 max_neighbor (int)        6 num_trials (int)   7 pseudo_total (float)
import sys
import wc
import objects

argv = sys.argv
input_file = argv[1]
output_file = argv[2]
use_neighbor = (argv[3] == 'T')
ignore_pos = (argv[4] == 'T')
max_neighbor = int(argv[5])
num_trials = int(argv[6])
pseudo_total = float(argv[7])

import global_stuff
params = global_stuff.get_param()
params.set_param('protein_list_file', input_file)
l = wc.get_stuff(objects.filtered_mutation_list_given_protein_list, params)

import helper
my_output = objects.get_output_obj(
    params, l, use_neighbor, ignore_pos, max_neighbor, num_trials, pseudo_total,
    helper.vanilla_similarity, helper.normalize_nothing, helper.mutation_to_class)
helper.write_mat(my_output, output_file)
import my_data_types
import pickle
import get_info
import global_stuff
import plotters
import numpy
import my_exceptions
import aggregate_features as af
from global_stuff import get_tumor_cls
import matplotlib.pyplot as plt

p = global_stuff.get_param()

# Patients to use: those in the SS-info list that are in neither the
# shared-MRN list nor the multiple-tumors list.
A = set(wc.get_stuff(objects.PID_with_SS_info, p))
B = set(wc.get_stuff(objects.PID_with_shared_MRN, p))
C = set(wc.get_stuff(objects.PID_with_multiple_tumors, p))
PID_to_use = list(A - B - C)
test_PID_to_use = PID_to_use
the_data_set = helper.data_set.data_set_from_pid_list(test_PID_to_use, p)


def _has_treatment_code_1_or_2(x):
    # Keep entries whose generated treatment code is 1 or 2.
    return f.treatment_code_f().generate(x) in [1,2]


treated_data_set = the_data_set.filter(_has_treatment_code_1_or_2)

# Consecutive boundary pairs, each multiplied by 365, become the ordered intervals.
interval_boundaries = [0,0.5,1,2,5]
intervals = []
for lower, upper in zip(interval_boundaries[:-1], interval_boundaries[1:]):
    intervals.append(my_data_types.ordered_interval(helper.my_timedelta(lower*365), helper.my_timedelta(upper*365)))
side_effect_name = 'incontinence'
if i % total_jobs == which_job: gotten_stuff = [] protein_name = line.strip() p.set_param('uniprot_id',protein_name) import wc import pdb if uniprot_or_pdb_chain == 'U': seq = wc.get_stuff(objects.dW,p) elif uniprot_or_pdb_chain == 'P': seq = wc.get_stuff(objects.pdb_chain_seq,p) print >> sys.stderr, "currently getting: ", protein_name, len(seq) if len(seq) < 1000000: if whether_to_temp: global_stuff.home = global_stuff.temp_home assert global_stuff.base_folder == global_stuff.real_base_folder try:
import wc import objects import param import pdb p = param.param() A = set(wc.get_stuff(objects.PID_with_SS_info, p)) B = set(wc.get_stuff(objects.PID_with_shared_MRN, p)) C = set(wc.get_stuff(objects.PID_with_several_tumors, p)) PID_to_use = A - B - C PID_to_MRN = wc.get_stuff(objects.PID_to_MRN_dict,p) i = 0 lengths = [] for PID in PID_to_use: p.set_param('pid',PID) texts = wc.get_stuff(objects.raw_medical_text,p) lengths.append(len(texts)) print i, PID, len(texts) i += 1 pdb.set_trace()
# Script: compute objects.pairwise_dist for this job's share of the protein
# list.  Line i of the list belongs to job `which_job` when
# i % total_jobs == which_job.
#
# Positional arguments: 1 which_job (int), 2 total_jobs (int).
import global_stuff
import wc
import param
import objects
import sys

which_job = int(sys.argv[1])
total_jobs = int(sys.argv[2])
which_object = objects.pairwise_dist

# Fixes relative to the original: the line counter was never incremented
# (so the modulo test saw i == 0 for every line), the wrapper was referenced
# through the undefined name `which_obj`, and the file was never closed.
with open(global_stuff.protein_list_file, "r") as f:
    for i, line in enumerate(f):
        if i % total_jobs == which_job:
            protein_name = line.strip()
            wc.get_stuff(which_object, param.param({"uniprot_id": protein_name}), True, True, False)
# Script: request the MSA object for one protein three times: as
# configured, then with 'to_rascal' set to 1, then additionally with
# 'to_normd' set to 1.
#
# Positional arguments: 1 name (used as 'uniprot_id'), 2 which_msa (int),
# 3 itera (int, used for both 'hhblits_iter' and 'psiblast_iter').
import wc
import param
import objects
import global_stuff
import helper
import wrapper
import sys

name = sys.argv[1]
which_msa = int(sys.argv[2])
try:
    itera = int(sys.argv[3])
except (IndexError, ValueError):
    # The original swallowed every exception here and then failed with a
    # NameError when building the parameter dict below; report the actual
    # problem instead.
    sys.exit('usage: %s <name> <which_msa> <itera>  (itera must be an integer)' % sys.argv[0])

p = param.param({'pdb':'1JOS', 'chain':'A', 'which_dataset':'CBS', 'uniprot_id':name, 'co':7.0, 'which_blast':0, 'which_msa':which_msa, 'ev':.05, 'blmax':999999,'hhblits_iter':itera, 'which_neighbors':1, 'protein_list_file':'rascalled_completed', 'to_leon':0, 'to_cluster':1, 'to_rascal':0, 'to_normd':0, 'norm_co':9.0, 'psiblast_iter':itera})
wc.get_stuff(wrapper.my_msa_obj_wrapper, p)
p.set_param('to_rascal', 1)
wc.get_stuff(wrapper.my_msa_obj_wrapper, p)
p.set_param('to_normd', 1)
wc.get_stuff(wrapper.my_msa_obj_wrapper, p)
# Script: request objects.pairwise_dist for one hard-coded protein.
import wc
import param
import objects

settings = {'ev':1e-10, 'protein_list_file':'hum_var_msa_dist_completed', 'uniprot_id':'P80075', 'avg_deg':20, 'n_cutoff':0, 'f_cutoff':15}
p = param.param(settings)
m = wc.get_stuff(objects.pairwise_dist, p, False, False, False)
from mpi4py import MPI import pdb import helper import sys file_location = sys.argv[1] wrappers = [objects.bhW, objects.cfW] folder_name, the_params = helper.read_param(file_location) the_params.set_param('tj',1) the_params.set_param('wj',0) hp_stash = wc.get_stuff(objects.caW, the_params, False, False, False) comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() f = open('bin/get_features_parallel_'+str(rank), 'w', 0) data_list = wc.get_stuff(objects.ciW, the_params, global_stuff.recalculate, True, True) works = [] for i in range(len(data_list)): if i % size == rank: pdb_name = data_list[i].pdb_name chain_letter = data_list[i].chain_letter start = data_list[i].start end = data_list[i].end
new_folders = [] for folder in pdb_folders: s = folder.strip().split('_') pdb_name = s[0] params = param.param({'p':pdb_name}) while 1: try: g = wc.get_stuff(objects.fW, params, False, False, False) except Exception, err: print err import time time.sleep(20) else: break structure = Bio.PDB.PDBParser().get_structure(params.get_param('p'), g) if s[1] == '': letter = structure[0].child_dict.keys()[0] else: letter = s[1] params.set_param('c', letter) if s[2] == '':
#info_file = constants.INFO_FOLDER + info_file relative_folder, the_params = helper.read_param(info_file) #num_outer_fold = 3 #which_outer_fold = 2 #num_pieces = 330 #which_piece = 96 the_params.set_param('tj', num_pieces) the_params.set_param('wj', which_piece) the_data = wc.get_stuff(objects.brW, the_params, False, True, False) the_params.set_param('s', the_data) the_params.set_param('m', num_outer_fold) the_params.set_param('k', which_outer_fold) the_fold = wc.get_stuff(objects.buW, the_params, False, False, False) import cross_validation_pseudo as cv the_params.set_param('f', the_fold) asdf = wc.get_stuff(objects.cbW, the_params, False, False, False)
def PID_to_MRN(pid):
    """Return the entry for `pid` in the PID-to-MRN mapping fetched through
    wc with default parameters."""
    import wc, objects
    return wc.get_stuff(objects.PID_to_MRN_dict, param.param())[pid]
def predict_position_energy_weighted(params, mutation, use_neighbor, ignore_pos, max_neighbor, num_trials, pseudo_total, sim_f, to_neighbor_p_value):
    """Score a mutation from a position term and an optional neighbor term.

    mutation is indexed as: 0 protein name, 1 position, 2 wild-type residue,
    3 mutant residue.  params gets its 'uniprot_id' set to the protein name.

    Position term (skipped when ignore_pos is true):
        (mut_similarity + 1) / wild_similarity, where both similarities come
        from compute_similarity_score_to_residue on the MSA column at the
        position, using sim_f.

    Neighbor term (only when use_neighbor is true): computed by the nested
    get_neighbor_score over at most max_neighbor neighbors of the position,
    taken in descending order of the second element of each neighbor entry.
    When to_neighbor_p_value is true the term is normalized against
    num_trials scores obtained with random weights; otherwise it is used
    as is.

    Returns:
        (score - neighbor_score) * -1.0

    Raises:
        AssertionError: the residue of the fetched sequence at the position
            differs from the wild-type residue.

    NOTE(review): this block was reconstructed from whitespace-collapsed
    source; the nesting shown below is the most plausible reading and should
    be confirmed against the original file.
    """
    import wc
    import objects
    protein_name = mutation[0]
    pos = mutation[1]
    wild_res = mutation[2]
    mut_res = mutation[3]
    params.set_param('uniprot_id', protein_name)
    seq = wc.get_stuff(objects.dW, params)
    assert seq[pos] == wild_res
    import wrapper
    msa = wc.get_stuff(wrapper.my_msa_obj_wrapper, params)
    score = 0
    #seq_weights = [1.0 for i in range(len(msa))]
    #seq_weights =
    #params.set_param('which_msa', 0)
    # With the 'which_msa' switches commented out, the node MSA/weights and
    # the neighbor MSA/weights below are requested with the same params.
    node_msa = wc.get_stuff(wrapper.my_msa_obj_wrapper, params)
    column = node_msa.get_column(pos)
    node_seq_weights = wc.get_stuff(objects.general_seq_weights, params)
    #params.set_param('which_msa', 2)
    msa = wc.get_stuff(wrapper.my_msa_obj_wrapper, params)
    neighbor_seq_weights = wc.get_stuff(objects.general_seq_weights, params)
    if not ignore_pos:
        #mut_weight = sum([seq_weights[i] for i in range(len(msa)) if msa[pos,i] == mut_res])
        #wild_weight = sum([seq_weights[i] for i in range(len(msa)) if msa[pos,i] == wild_res])
        #mut_count = column.count(mut_res)
        #wild_count= column.count(wild_res)
        #if wild_similarity < 1:
        #    print wild_res, mut_res, column
        #    pdb.set_trace()
        mut_similarity = compute_similarity_score_to_residue(column, node_seq_weights, mut_res, sim_f)
        wild_similarity = compute_similarity_score_to_residue(column, node_seq_weights, wild_res, sim_f)
        #score += math.log((mut_similarity + 1) / wild_similarity)
        # NOTE(review): only the numerator carries the +1, so a zero
        # wild_similarity would divide by zero here.
        score += (mut_similarity + 1) / wild_similarity
        #assert abs(score - second_score) < .001
        #score += math.log((mut_weight + 1) / (wild_weight))
    #score = -1.0 * mutation[-3]
    neighbor_score = 0
    if use_neighbor:
        # get neighbors/weights
        all_neighbors = wc.get_stuff(objects.general_neighbors_w_weight_w, params)
        pos_neighbors = all_neighbors[pos]
        # Sort by the second element of each entry, largest first, and keep
        # the first elements of at most max_neighbor entries.
        sorted_pos_neighbors = sorted(pos_neighbors, key = lambda elt: elt[1], reverse = True)
        neighbors = [x[0] for x in sorted_pos_neighbors[0:min(max_neighbor,len(sorted_pos_neighbors))]]
        #neighbor_weights = [x[1] for x in sorted_pos_neighbors[0:min(max_neighbor,len(sorted_pos_neighbors))]]
        # Every selected neighbor gets the same weight.
        neighbor_weights = [1.0 for i in range(len(neighbors))]
        # get pseudo_counts
        # pseudo_total is spread evenly over the global_stuff.q keys.
        pseudo_count_dict = {}
        for key in range(global_stuff.q):
            pseudo_count_dict[key] = pseudo_total / global_stuff.q
        # none of weights have to be normalized
        def get_neighbor_score(msa, weight_a, weight_b, neighbors, neighbor_weights, pseudo_count_dict):
            # Weighted sum, over the neighbors, of p_value_z of the observed
            # weighted KL value against num_trials random KL values.
            # seq, neighbor_seq_weights and num_trials are read from the
            # enclosing function.
            num_neighbors = len(neighbors)
            assert(len(neighbors) == len(neighbor_weights))
            score = 0
            neighbor_weights = normalize(neighbor_weights)
            #neighbors = range(num_neighbors)
            for i in range(num_neighbors):
                # All probability mass goes on the residue the query
                # sequence has at this neighbor position.
                choose_neighbor_probs = {}
                for j in range(global_stuff.q):
                    choose_neighbor_probs[j] = 0
                choose_neighbor_probs[global_stuff.aa_to_num[seq[neighbors[i]]]] = 1.0
                #score += neighbor_weights[i] * get_KL_weighted(msa.get_column(neighbors[i]), weight_a, weight_b, pseudo_count_dict, choose_neighbor_probs)
                #score += neighbor_weights[i] * get_KL_weighted(msa.get_column(neighbors[i]), weight_a, weight_b, pseudo_count_dict, choose_neighbor_probs) / get_entropy_weighted(msa.get_column(neighbors[i]), neighbor_seq_weights, pseudo_count_dict)
                asdf = get_KL_weighted(msa.get_column(neighbors[i]), weight_a, weight_b, pseudo_count_dict, choose_neighbor_probs)
                random_kls = get_random_KLs(msa.get_column(neighbors[i]), neighbor_seq_weights, sum(weight_a), sum(weight_b), pseudo_count_dict, num_trials)
                #score += asdf * neighbor_weights[i]
                score += p_value_z(random_kls, asdf) * neighbor_weights[i]
                #score += neighbor_weights[i] * (asdf / mean(random_kls))
                #score += rank(random_kls, asdf) * neighbor_weights[i]
            return score
        # Row weights restricted to rows with the wild-type residue (a) or
        # the mutant residue (b) at pos; every other row gets weight 0.0.
        actual_weight_a = [neighbor_seq_weights[i] if msa[i,pos] == wild_res else 0.0 for i in range(len(msa))]
        actual_weight_b = [neighbor_seq_weights[i] if msa[i,pos] == mut_res else 0.0 for i in range(len(msa))]
        # neighbor_cols is built but not used afterwards.
        neighbor_cols = [ [msa[j,i] for j in range(len(msa))] for i in neighbors]
        actual_neighbor_score = get_neighbor_score(msa, actual_weight_a, actual_weight_b, neighbors, neighbor_weights, pseudo_count_dict)
        if actual_neighbor_score < 0:
            print actual_neighbor_score
        if to_neighbor_p_value:
            # Compare the actual score with scores obtained from random
            # weights generated from the same total wild/mutant weight.
            mut_weight = sum([neighbor_seq_weights[i] for i in range(len(msa)) if msa[i,pos] == mut_res])
            wild_weight = sum([neighbor_seq_weights[i] for i in range(len(msa)) if msa[i,pos] == wild_res])
            random_scores = []
            for i in range(num_trials):
                random_weight_a = get_random_weight(neighbor_seq_weights, wild_weight)
                random_weight_b = get_random_weight(neighbor_seq_weights, mut_weight)
                a_random_score = get_neighbor_score(msa, random_weight_a, random_weight_b, neighbors, neighbor_weights, pseudo_count_dict)
                random_scores.append(a_random_score)
            # Hard-coded switch: the rank branch below is currently unreachable.
            normalize_neighbor_by_z = True
            if normalize_neighbor_by_z:
                random_mean = mean(random_scores)
                random_sd = sd(random_scores)
                try:
                    neighbor_score = normalize_to_unit(actual_neighbor_score, random_mean, random_sd)
                except:
                    # NOTE(review): any failure of the normalization is
                    # silently turned into a neighbor score of 0.
                    asdf=2
                    neighbor_score = 0
            else:
                neighbor_score = rank(random_scores, actual_neighbor_score)
        else:
            neighbor_score = actual_neighbor_score
    print >> sys.stderr, score, neighbor_score, len(msa)
    return (score - neighbor_score) * -1.0
num_trials = int(sys.argv[5]) pseudo_total = float(sys.argv[6]) to_neighbor_p_value = sys.argv[7] == 'T' import global_stuff params = global_stuff.get_param() import helper helper.parse_p_input(params, sys.argv[8:]) #l = wc.get_stuff(objects.filtered_mutation_list_given_protein_list, params) l = wc.get_stuff(objects.filtered_mutation_list, params) i = 0 my_l = [] for m in l: if i % size == rank: my_l.append(m) i += 1 import objects print rank, len(my_l) which_dataset = params.get_param('which_dataset') if which_dataset == 'cosmic' or which_dataset == 'their_cosmic':
import sys from param import param file_location = sys.argv[1] wrappers = [objects.bhW, objects.cfW] to_pickle = [True, True] folder_name, the_params = helper.read_param(file_location) the_params.set_param('tj',1) the_params.set_param('wj',0) hp_stash = wc.get_stuff(objects.caW, the_params, False, False, False) f = open('bin/get_features_serial', 'w', 0) data_list = wc.get_stuff(objects.ciW, the_params, False, True, True) works = [] all_keys = [ ['nvjd'], ['wjd', 'wpw']] for i in range(len(data_list)): pdb_name = data_list[i].pdb_name
# Script: initialise via _test.init_crf(), read parameters from the file
# named by the first command-line argument, request objects.ceW and stop in
# the debugger with the result available as `results`.
import _test
import wc
import new_new_objects as objects
import pdb
# Runs before the remaining imports below.
_test.init_crf()
from param import param
#import run_small_search
import helper
import sys
info_file = sys.argv[1]
relative_folder, the_params = helper.read_param(info_file)
the_params.set_param('tj',1)
results = wc.get_stuff(objects.ceW, the_params, False, True, True)
# NOTE(review): unconditional breakpoint; the script does not finish
# without interactive input.
pdb.set_trace()
print 3
#pdb.set_trace() #print m.get_fragment(a, 10) #print m.get_match(a, ['asdf']) #pdb.set_trace() sosv = bf.single_ordinal_single_value_wrapper_feature p = global_stuff.get_param() #A = set(wc.get_stuff(objects.PID_with_SS_info, p)) A = set(wc.get_stuff(objects.prostate_PID,p)) B = set(wc.get_stuff(objects.PID_with_shared_MRN, p)) C = set(wc.get_stuff(objects.PID_with_multiple_tumors, p)) PID_to_use = list(A - B - C)[:3000] #test_PID_to_use = PID_to_use[2100:2120] #the_data_set = helper.data_set.data_set_from_pid_list(test_PID_to_use, p) for pid in PID_to_use: