def get_related_proteins(tfc_id):
    data, subject = get_node(tfc_id)
    tmp = rq.get(subject).text
    label = parseturtle(tmp, ["rdfs:label"], 0)
    res = parseturtle(tmp, ["sybig:contains"], 0)
    res.sort()
    return label, res

def collect_dbd_seqs(mode):
    # Runs for a while (20+ minutes)
    # Mode 0: just return the lists. Mode 1: write to file. Mode 2: save to DB (recommended)
    big_seq_list = []
    big_dbd_list = []
    if mode == 2:
        conn = sqlite3.connect('Sequences.db')
        cu = conn.cursor()
    p_list = collect_tf_proteins()
    dbd_path = SEQ_PATH + "Fedora_DBDs/"
    fseq_path = SEQ_PATH + "Fedora_Seqs/"
    for nodes in p_list:
        # nodes = nodes.replace("localhost", "coyote")
        related_tfc = get_related_tfc(nodes)
        if not related_tfc:
            related_tfc = ["-", "None found", "-"]
        if "http" in related_tfc[1]:
            # The related class label could not be resolved to a plain name; log it
            print(related_tfc)
        tmp = rq.get(nodes).text
        dbds = parseturtle(tmp, ["sybig:dbd"], 0)
        seqs = parseturtle(tmp, ["sybig:sequence"], 0)
        nodesplitter = nodes.split('/')[-1]
        # print(nodesplitter)
        if mode < 2:
            for x in dbds:
                big_dbd_list.append([nodesplitter, x])
            for y in seqs:
                big_seq_list.append([nodesplitter, y])
        else:
            if "http" in related_tfc[1]:
                related_tfc[1] = "NONE"
            if dbds or seqs:
                # Parameterised insert; missing DBDs or sequences are stored as "None"
                cu.execute("INSERT INTO SEQUENCES VALUES(?, ?, ?, ?)",
                           (str(nodesplitter), str(related_tfc[1]),
                            str(dbds[0]) if dbds else "None",
                            str(seqs[0]) if seqs else "None"))
    if mode == 1:
        with open("DBDs_current", "w") as dbd:
            for d in big_dbd_list:
                dbd.write(d[0] + "\t" + d[1] + "\n")
        os.rename("DBDs_current", dbd_path + "DBDs_current")
        with open("SEQs_current", "w") as seq:
            for s in big_seq_list:
                seq.write(s[0] + "\t" + s[1] + "\n")
        os.rename("SEQs_current", fseq_path + "SEQs_current")
    if mode == 2:
        conn.commit()
        conn.close()
    return big_dbd_list, big_seq_list

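# The inserts in collect_dbd_seqs assume a four-column SEQUENCES table.  Its
# definition is not part of this section; the helper below is a hypothetical
# sketch of a compatible schema (the column names are assumptions, not the
# original table definition).
def create_sequences_table(db_path='Sequences.db'):
    conn = sqlite3.connect(db_path)
    conn.execute("CREATE TABLE IF NOT EXISTS SEQUENCES "
                 "(Protein TEXT, TfcID TEXT, DBD TEXT, Sequence TEXT)")
    conn.commit()
    conn.close()
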
def get_alignment(tfc_id):
    # Resolve the TFClass node to its repository URI unless a URI was passed in directly
    if "http://" not in tfc_id:
        data, subject = get_node(tfc_id)
        # subject = subject.replace("localhost", "coyote")
    else:
        subject = tfc_id
        # subject = subject.replace("localhost", "coyote")
    tmp = rq.get(subject).text
    # Since ldp:contains is not passed to Fuseki, parse the turtle data for it
    res = parseturtle(tmp, ["ldp:contains"], 0)
    for arguments in res:
        if "fasta.txt" in arguments:
            arguments = arguments.replace(" .", '')
            # Download the alignment quietly and rename it after the TFClass node
            with open(os.devnull, 'w') as fnull:
                sp.call(['wget', '-P', ALIGN_PATH, arguments],
                        stdout=fnull, stderr=sp.STDOUT, close_fds=True)
            arg_name = arguments.split('/')[-1]
            id_name = tfc_id.split('/')[-1]
            os.rename(ALIGN_PATH + arg_name, ALIGN_PATH + id_name)
            return 1  # No need to investigate further
    return 1

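# Alternative sketch, not the method used above: the alignment file could also be
# fetched with the `rq` client already used throughout this module (assumed to be
# the `requests` library) instead of shelling out to wget.  `fetch_alignment_file`
# is a hypothetical helper shown only to illustrate the download step.
def fetch_alignment_file(url, dest_path):
    response = rq.get(url)
    response.raise_for_status()  # Fail loudly on HTTP errors instead of silencing them
    with open(dest_path, 'wb') as fh:
        fh.write(response.content)
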
def create_genera_alignments_repo():
    all_tfs = collect_tfc_ids(0, "Genus")
    for tfc in all_tfs:
        current_tfc = rq.get(tfc).text
        child_list = parseturtle(current_tfc, ["sybig:contains"], 0)
        alignlist = []
        for child in child_list:
            current_child = rq.get(child).text
            protname = parseturtle(current_child, ["sybig:name"], 0)[0]
            dbd = parseturtle(current_child, ["sybig:dbd"], 0)
            if dbd:
                alignlist.append('>' + str(protname) + '\n')
                alignlist.append(dbd[0] + '\n')
        if alignlist:
            with open(ALIGN_PATH + "Unaligned_" + tfc_uri_to_label(tfc), 'w') as current_align:
                for line in alignlist:
                    current_align.write(line)

def get_related_tfc(protein_uri):
    # Return [class URI, label, tfclass_id] for the class a protein belongs to,
    # falling back to shorter lists when parts of it cannot be resolved
    tmp = rq.get(protein_uri).text
    res1 = parseturtle(tmp, ["sybig:belongs_to"], 0)
    if res1:
        res1 = res1[0]
        tmp = rq.get(res1).text
        if tmp:
            res2 = parseturtle(tmp, ["rdfs:label", "sybig:tfclass_id"], 0)
            if len(res2) >= 2:
                res2.sort()
                res = [res1, res2[0], res2[1]]
            elif len(res2) == 1:
                res = [res1, res2[0]]
            else:
                # Neither label nor ID found; fall back to the last tab-separated field
                res = [res1, res1.split('\t')[-1]]
            return res
        else:
            return [res1, "None", "None"]
    else:
        return []

def repo_gather_human():
    # Collect [node URI, UNIPROT ID, gene symbol/label] for every human TF protein
    out_list = []
    ch_list = get_children_nodes(FCREPO_URL + 'TF_protein/tf_9606')
    for child in ch_list:
        add1, add2 = "-", "-"
        out = rq.get(child.rstrip(".")).text
        parsed = parseturtle(out, ['rdfs:label', 'sybig:xref'], 0)
        if not parsed:
            print("No turtlefile results found")
        else:
            for element in parsed:
                if "UNIPROT" in element:
                    add1 = element.split(':')[-1]
            add2 = parsed[-1]
        out_list.append([child, add1, add2])
    # print(out_list)
    return out_list

def get_triples(uri):
    tmp = rq.get(uri).text
    res = parseturtle(tmp, [], 1)
    return res

def investigate_tfclass():
    # Compare the HMMer-annotated proteome (test.csv) against the human TFs in
    # the TFClass repository and write the merged result to test2ort.csv.
    profile_list = []
    conn = sqlite3.connect("Sequences.db")
    cu = conn.cursor()
    tfclass_list = []
    found_list_uniprot, found_list_gs = [], []
    ctr = 0
    tfclass_checklist = repo_gather_human()
    print(str(len(tfclass_checklist)) + " human TFs found in the TFClass repository")
    # Labels of all genus alignments for which profiles exist
    al_ch_list_full, al_ch_list = alignment_list(5), []
    for al in al_ch_list_full:
        al_ch_list.append(al.split('/')[-1])
    with open('/sybig/home/ttu/BSc/test.csv', "r") as proteome_out, \
            open('/sybig/home/ttu/BSc/testlog.csv', 'w') as log:
        for strng in tfclass_checklist:
            log.write(str(strng[0]) + " " + str(strng[1]) + " " + str(strng[2]) + "\n")
        reader = csv.reader(proteome_out, delimiter="\t")
        for row in reader:
            conn_profile, best_hit, conn_dbd, conn_gs, conn_tfc, conn_p = \
                "No Profile", "None", "No DBD", "-", "-", "-"
            last_gs, ent = "-", "-"
            query_uniprot = row[0]
            query_gs = row[1]
            for ch in tfclass_checklist:
                # Match on UNIPROT ID or gene symbol (including truncated ZNF variants)
                if ch[1] == query_uniprot or ch[2] == query_gs or \
                        ch[2].replace('ZNF', 'ZN') == query_gs or \
                        ch[2].replace('ZNF', 'Z') == query_gs:
                    # Specifically look at proteins that are in TFClass
                    ctr += 1
                    out = rq.get(ch[0]).text
                    conn_p = tfc_uri_to_label(ch[0])
                    conn_tfc = parseturtle(out, ['sybig:belongs_to'], 0)
                    if conn_tfc:
                        conn_tfc = tfc_uri_to_label(conn_tfc[0])
                    dbdl = parseturtle(out, ['sybig:dbd'], 0)
                    if dbdl:
                        conn_dbd = "Has DBD"
                    if conn_gs:
                        conn_gs = parseturtle(out, ['rdfs:label'], 0)
                        if conn_gs:
                            conn_gs = conn_gs[0]
                    if conn_tfc in al_ch_list:
                        profile_list.append(conn_tfc)
                        conn_profile = "Has Profile"
                        for res in cu.execute(
                                "SELECT Entropy FROM ALIGNMENTS WHERE TfcID = ?",
                                (str(conn_tfc),)):
                            ent = res[0]
                    log.write("Adding " + query_uniprot + " with query GS " +
                              query_gs + " and category " + row[2] + ".\n")
                    log.write("Best hit HMMer descriptor is " + row[3] + ".\n")
                    log.write("Connected TFClass protein is " + conn_p + ".\n")
                    log.write(conn_dbd + ", " + conn_profile + ".\n")
                    last_gs = conn_gs
                    tfclass_list.append([
                        query_uniprot, query_gs, row[2], row[3], conn_p, conn_tfc,
                        conn_gs, conn_dbd, conn_profile, ent
                    ])
                    if ch[1] == query_uniprot:
                        found_list_uniprot.append(query_uniprot)
                    if ch[2] == query_gs:
                        found_list_gs.append(query_gs)
            if last_gs == "-":
                # No TFClass match for this query; keep a row with the default values
                tfclass_list.append([
                    query_uniprot, query_gs, row[2], row[3], conn_p, conn_tfc,
                    conn_gs, conn_dbd, conn_profile, ent
                ])
    with open('/sybig/home/ttu/BSc/test2ort.csv', "w") as csv_in:
        test_writer = csv.writer(csv_in, delimiter="\t", quotechar='"',
                                 quoting=csv.QUOTE_MINIMAL)
        test_writer.writerow([
            "Query", "Gene Symbol", "Category", "HMMer HitDescriptor",
            "TFClass Node", "TFClass Node ID", "TFClass Node GS",
            "TFClass node has DBD?", "In Profiles", "Entropy"
        ])
        for e in tfclass_list:
            test_writer.writerow(e[:10])
    print(str(ctr) + " human transcription factors with either UNIPROT ID or fitting gene symbol detected.")
    print(str(len(profile_list)) + " profile hits")
    print("Transcription factors that have not been found, but should be according to profiles:")
    for tcl in tfclass_checklist:
        if tcl[1] not in found_list_uniprot and tcl[2] not in found_list_gs:
            cand_out = rq.get(tcl[0].rstrip(".")).text
            ctl = parseturtle(cand_out, ['sybig:belongs_to'], 0)
            if ctl and tfc_uri_to_label(ctl[0]) in al_ch_list:
                # print(tfc_uri_to_label(ctl[0]))
                ent = "-"  # Reset so a missing ALIGNMENTS row does not reuse a stale value
                for res in cu.execute(
                        "SELECT Entropy FROM ALIGNMENTS WHERE TfcID = ?",
                        (str(tfc_uri_to_label(ctl[0])),)):
                    ent = res[0]
                print(tcl[0] + ", " + tcl[1] + ", " + tcl[2] + " " + str(ent))
    conn.close()
    return tfclass_list

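# Minimal usage sketch (assumptions: the module is run directly, the constants
# SEQ_PATH / ALIGN_PATH / FCREPO_URL and the Sequences.db tables are already set
# up, and the Fedora repository is reachable).  The order mirrors the data flow
# of the functions above; it is an illustration, not part of the original module.
if __name__ == "__main__":
    collect_dbd_seqs(2)              # mode 2: persist DBDs and sequences to Sequences.db
    create_genera_alignments_repo()  # write per-genus unaligned FASTA files
    investigate_tfclass()            # compare the HMMer proteome scan against TFClass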