def group_go_by_ontology(data, dag, fill_na, accession_to_feature_file):
    """Split GO term ids into the three Gene Ontology namespaces.

    Args:
        data: Either a single accession string, whose 'go' feature is looked
            up in the feature file, or an iterable of annotation strings.
            For an iterable, only entries containing 'go' (case-insensitive)
            are kept, and they are upper-cased before the namespace lookup.
        dag: GO DAG object passed through to ``ontology.id_to_node``.
        fill_na: Forwarded to ``load_data_frame`` (only used when ``data``
            is a string).
        accession_to_feature_file: Path handed to ``load_data_frame`` (only
            used when ``data`` is a string).

    Returns:
        dict with keys 'cc', 'bp' and 'mf', each mapping to a list of the
        term ids whose node namespace is 'cellular_component',
        'biological_process' or 'molecular_function' respectively. Input
        order is preserved inside each list; terms in any other namespace
        are dropped.
    """
    if isinstance(data, str):
        feature_df = load_data_frame(accession_to_feature_file, fill_na=fill_na)
        # NOTE(review): assumed to return an iterable of GO id strings.
        p_go = get_feature_for_accession(feature_df, data, 'uniprot', 'go')
    else:
        p_go = [t.upper() for t in data if 'go' in t.lower()]

    namespace_to_key = {
        'cellular_component': 'cc',
        'biological_process': 'bp',
        'molecular_function': 'mf',
    }
    # Real lists are built here on purpose: on Python 3 ``filter`` yields a
    # one-shot iterator, and the previous sanity assert consumed all three
    # iterators before they were returned, so callers always saw them empty.
    grouped = {'cc': [], 'bp': [], 'mf': []}
    for term in p_go:
        # One DAG lookup per term; each term lands in at most one bucket, so
        # the buckets are disjoint by construction.
        key = namespace_to_key.get(ontology.id_to_node(term, dag).namespace)
        if key is not None:
            grouped[key].append(term)
    return grouped
def _split_go_by_namespace(go_terms, dag):
    """Return (cc, bp, mf) sets of the GO ids in *go_terms*, by namespace."""
    buckets = {
        'cellular_component': set(),
        'biological_process': set(),
        'molecular_function': set(),
    }
    for term in go_terms:
        # Single DAG lookup per term; terms in other namespaces are ignored.
        namespace = ontology.id_to_node(term, dag).namespace
        if namespace in buckets:
            buckets[namespace].add(term)
    return (
        buckets['cellular_component'],
        buckets['biological_process'],
        buckets['molecular_function'],
    )


def compute_ss(ppi_tuples):
    """Compute per-namespace GO semantic similarity for protein pairs via R.

    For every ``(p1, p2)`` pair the GO annotations of both proteins are split
    by namespace and written to a temporary TSV file, ``semantic_sim.r`` is
    run on that file with ``Rscript``, and the script's TSV output is parsed.

    Args:
        ppi_tuples: Iterable of ``(p1, p2)`` accession pairs.

    Returns:
        list of ``(p1, p2, cc_ss, bp_ss, mf_ss)`` tuples, one per line of the
        R output. All fields are strings exactly as read from the file.

    Raises:
        subprocess.CalledProcessError: If ``Rscript`` exits with a non-zero
            status.
        ValueError: If an output line does not have exactly five
            tab-separated fields.
    """
    dag = ontology.load_go_dag(OBO_FILE)
    # ``np.nan`` is the spelling that exists in every NumPy release; the
    # ``np.NaN`` alias was removed in NumPy 2.0.
    feature_df = load_data_frame(ACCESSION_FEATURES_FILE, fill_na=np.nan)

    tmp_paths = []
    try:
        # mkstemp creates the files atomically, unlike the deprecated and
        # race-prone mktemp. The 'tmp' directory must already exist.
        for prefix in ('r_in_', 'r_out_'):
            fd, path = tempfile.mkstemp(suffix='.tsv', prefix=prefix, dir='tmp')
            os.close(fd)
            tmp_paths.append(path)
        r_file_in, r_file_out = tmp_paths

        # Write the three separate GO columns for each protein to the R input.
        with open(r_file_in, 'w') as fp:
            fp.write(
                "p1\tp2\tp1_go_cc\tp2_go_cc\tp1_go_bp\tp2_go_bp"
                "\tp1_go_mf\tp2_go_mf\n"
            )
            for p1, p2 in ppi_tuples:
                p1_go = get_feature_for_accession(feature_df, p1, 'uniprot', 'go')
                p2_go = get_feature_for_accession(feature_df, p2, 'uniprot', 'go')
                p1_cc, p1_bp, p1_mf = _split_go_by_namespace(p1_go, dag)
                p2_cc, p2_bp, p2_mf = _split_go_by_namespace(p2_go, dag)
                # Sorted so the generated file is deterministic across runs.
                fp.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}\n".format(
                    p1, p2,
                    ','.join(sorted(p1_cc)), ','.join(sorted(p2_cc)),
                    ','.join(sorted(p1_bp)), ','.join(sorted(p2_bp)),
                    ','.join(sorted(p1_mf)), ','.join(sorted(p2_mf)),
                ))

        # Run the R script; fail loudly instead of parsing a stale or empty
        # output file when the script itself failed.
        args = [
            'Rscript', 'semantic_sim.r',
            '--file={}'.format(r_file_in),
            '--out={}'.format(r_file_out),
        ]
        subprocess.check_call(args)

        # Parse the R output into a list of tuples.
        sims_tuple = []
        with open(r_file_out, 'r') as fp:
            for line in fp:
                p1, p2, cc_ss, bp_ss, mf_ss = line.strip().split('\t')
                sims_tuple.append((p1, p2, cc_ss, bp_ss, mf_ss))
        return sims_tuple
    finally:
        # Always clean up the temporary files, including on failure.
        for path in tmp_paths:
            if os.path.exists(path):
                os.remove(path)