# Shared imports for the snippets below; rq is the requests module and sp is
# subprocess, matching the aliases used throughout. Helpers such as get_node,
# parseturtle, collect_tf_proteins and the *_PATH/FCREPO_URL constants are
# defined elsewhere in the module.
import csv
import os
import sqlite3
import subprocess as sp

import requests as rq


def get_related_proteins(tfc_id):
    # Resolve the TFClass ID to a repository node, then read the node's
    # label and the proteins it contains from its turtle representation.
    data, subject = get_node(tfc_id)
    tmp = rq.get(subject).text
    label = parseturtle(tmp, ["rdfs:label"], 0)
    res = parseturtle(tmp, ["sybig:contains"], 0)
    res.sort()
    return label, res
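# Usage sketch for get_related_proteins. The TFClass ID below is purely
# illustrative (an assumption, not taken from the source); any ID accepted
# by get_node would work the same way.
def demo_get_related_proteins():
    label, proteins = get_related_proteins("2.1.1.1")
    print("Node label:", label)
    for protein_uri in proteins:
        print("  contains:", protein_uri)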
def collect_dbd_seqs(mode):
    # Runs for a while (20+ minutes)
    # Mode 0: Just return. Mode 1: Write to file. Mode 2: Save to DB (recommended)
    big_seq_list = []
    big_dbd_list = []
    if mode == 2:
        conn = sqlite3.connect('Sequences.db')
        cu = conn.cursor()
    p_list = collect_tf_proteins()
    dbd_path = SEQ_PATH + "Fedora_DBDs/"
    fseq_path = SEQ_PATH + "Fedora_Seqs/"
    for nodes in p_list:
        # nodes = nodes.replace("localhost", "coyote")
        related_tfc = get_related_tfc(nodes)
        if not related_tfc:
            related_tfc = ["-", "None found", "-"]
        if "http" in related_tfc:
            print(related_tfc)
        tmp = rq.get(nodes).text
        dbds = parseturtle(tmp, ["sybig:dbd"], 0)
        seqs = parseturtle(tmp, ["sybig:sequence"], 0)
        nodesplitter = nodes.split('/')[-1]
        # print(nodesplitter)
        if mode < 2:
            for x in dbds:
                big_dbd_list.append([nodesplitter, x])
            for y in seqs:
                big_seq_list.append([nodesplitter, y])
        else:
            if "http" in related_tfc[1]:
                related_tfc[1] = "NONE"
            if dbds or seqs:
                # A parameterised insert avoids hand-quoting the values
                cu.execute(
                    "INSERT INTO SEQUENCES VALUES(?, ?, ?, ?)",
                    (str(nodesplitter), str(related_tfc[1]),
                     str(dbds[0]) if dbds else "None",
                     str(seqs[0]) if seqs else "None"))

    if mode == 1:
        # Write the tab-separated lists straight into their target folders
        with open(dbd_path + "DBDs_current", "w") as dbd:
            for d in big_dbd_list:
                dbd.write(d[0] + "\t" + d[1] + "\n")
        with open(fseq_path + "SEQs_current", "w") as seq:
            for s in big_seq_list:
                seq.write(s[0] + "\t" + s[1] + "\n")
    if mode == 2:
        conn.commit()
        conn.close()
    return big_dbd_list, big_seq_list
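# collect_dbd_seqs in mode 2 assumes a four-column SEQUENCES table already
# exists in Sequences.db. The source does not show the DDL, so the column
# names below are assumptions; only the arity and order match the INSERT.
def create_sequences_table(db_path='Sequences.db'):
    conn = sqlite3.connect(db_path)
    conn.execute("CREATE TABLE IF NOT EXISTS SEQUENCES "
                 "(NodeID TEXT, TfcLabel TEXT, DBD TEXT, Sequence TEXT)")
    conn.commit()
    conn.close()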
def get_alignment(tfc_id):
    if "http://" not in tfc_id:
        data, subject = get_node(tfc_id)
        # subject = subject.replace("localhost", "coyote")
    else:
        subject = tfc_id
        # subject = subject.replace("localhost", "coyote")
    tmp = rq.get(subject).text
    # Since ldp:contains is not passed to Fuseki, parse turtle data for it
    res = parseturtle(tmp, ["ldp:contains"], 0)
    for arguments in res:
        if "fasta.txt" in arguments:
            arguments = arguments.replace(" .", '')
            # Silence wget; the context manager closes the devnull handle
            with open(os.devnull, 'w') as fnull:
                sp.call(['wget', '-P', ALIGN_PATH, arguments],
                        stdout=fnull,
                        stderr=sp.STDOUT,
                        close_fds=True)
            arg_splitter = arguments.split('/')[-1]
            id_splitter = tfc_id.split('/')[-1]
            os.rename(ALIGN_PATH + arg_splitter, ALIGN_PATH + id_splitter)
            return 1  # No need to investigate further
    return 1
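# get_alignment shells out to wget; a minimal pure-requests alternative for
# fetching a single fasta file (a sketch, not the author's method; it assumes
# the URL is readable without authentication).
def fetch_fasta(url, dest_dir=ALIGN_PATH):
    resp = rq.get(url)
    resp.raise_for_status()
    target = os.path.join(dest_dir, url.rstrip('/').split('/')[-1])
    with open(target, 'w') as fasta_file:
        fasta_file.write(resp.text)
    return target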
def create_genera_alignments_repo():
    all_tfs = collect_tfc_ids(0, "Genus")
    for tfc in all_tfs:
        current_tfc = rq.get(tfc).text
        child_list = parseturtle(current_tfc, ["sybig:contains"], 0)
        alignlist = []
        for child in child_list:
            current_child = rq.get(child).text
            protname = parseturtle(current_child, ["sybig:name"], 0)[0]
            dbd = parseturtle(current_child, ["sybig:dbd"], 0)
            if dbd:
                alignlist.append('>' + str(protname) + '\n')
                alignlist.append(dbd[0] + '\n')
        if alignlist:
            with open(ALIGN_PATH + "Unaligned_" + tfc_uri_to_label(tfc),
                      'w') as current_align:
                for line in alignlist:
                    current_align.write(line)
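# create_genera_alignments_repo only writes the unaligned fasta files; a
# follow-up sketch that runs Clustal Omega over one of them (assumptions:
# the clustalo binary is on PATH, and the output naming is illustrative).
def align_genus_file(label):
    infile = ALIGN_PATH + "Unaligned_" + label
    outfile = ALIGN_PATH + "Aligned_" + label
    sp.call(['clustalo', '-i', infile, '-o', outfile, '--force'])
    return outfile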
def get_related_tfc(protein_uri):
    tmp = rq.get(protein_uri).text
    res1 = parseturtle(tmp, ["sybig:belongs_to"], 0)
    if res1:
        res1 = res1[0]
        tmp = rq.get(res1).text
        if tmp:
            res2 = parseturtle(tmp, ["rdfs:label", "sybig:tfclass_id"], 0)
            if len(res2) >= 2:
                res2.sort()
                res = [res1, res2[0], res2[1]]
            elif len(res2) == 1:
                res = [res1, res2[0]]
            else:
                # Neither label nor tfclass_id found; fall back to the last
                # segment of the node URI as an identifier
                id_splitter = res1.split('/')[-1]
                res = [res1, id_splitter]
            return res
        else:
            return [res1, "None", "None"]
    else:
        return []
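# Usage sketch for get_related_tfc: an empty list means the protein node has
# no sybig:belongs_to triple; otherwise the first element is the parent
# TFClass URI. The protein path below is illustrative, not from the source.
def demo_get_related_tfc():
    related = get_related_tfc(FCREPO_URL + 'TF_protein/tf_9606/example')
    if related:
        print("Parent node:", related[0], "labelled", related[1:])
    else:
        print("No parent TFClass node found")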
def repo_gather_human():
    out_list = []
    ch_list = get_children_nodes(FCREPO_URL + 'TF_protein/tf_9606')
    for child in ch_list:
        add1, add2 = "-", "-"
        out = rq.get(child.rstrip(".")).text
        parsed = parseturtle(out, ['rdfs:label', 'sybig:xref'], 0)
        if not parsed:
            print("No turtlefile results found")
        else:
            for element in parsed:
                if "UNIPROT" in element:
                    add1 = element.split(':')[-1]
            add2 = parsed[-1]
            out_list.append([child, add1, add2])
    # print(out_list)
    return out_list
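# Each entry returned by repo_gather_human is [child_uri, uniprot_id, label],
# with "-" standing in for a missing UNIPROT xref. A small sketch that keeps
# only the entries with a resolved UNIPROT ID:
def human_tfs_with_uniprot():
    return [entry for entry in repo_gather_human() if entry[1] != "-"]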
def get_triples(uri):
    tmp = rq.get(uri).text
    res = parseturtle(tmp, [], 1)
    return res
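# get_triples calls parseturtle with an empty predicate list and flag 1,
# which (judging by the name) yields whole triples rather than objects for
# given predicates. Usage sketch with an illustrative URI:
def demo_get_triples():
    for triple in get_triples(FCREPO_URL + 'TF_protein/tf_9606'):
        print(triple)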
def investigate_tfclass():
    profile_list = []
    conn = sqlite3.connect("Sequences.db")
    cu = conn.cursor()
    tfclass_list = []
    found_list_uniprot, found_list_gs = [], []
    ctr = 0
    tfclass_checklist = repo_gather_human()
    print(
        str(len(tfclass_checklist)) +
        " human TFs found in the TFClass repository")
    al_ch_list_full, al_ch_list = alignment_list(5), []
    for al in al_ch_list_full:
        al_ch_list.append(al.split('/')[-1])
    with open('/sybig/home/ttu/BSc/test.csv', "r") as proteome_out:
        log = open('/sybig/home/ttu/BSc/testlog.csv', 'w')
        for strng in tfclass_checklist:
            log.write(
                str(strng[0]) + " " + str(strng[1]) + " " + str(strng[2]) +
                "\n")
        reader = csv.reader(proteome_out, delimiter="\t")
        for row in reader:
            conn_profile, best_hit = "No Profile", "None"
            conn_dbd, conn_gs = "No DBD", "-"
            conn_tfc, conn_p = "-", "-"
            last_gs, ent = "-", "-"
            query_uniprot = row[0]
            query_gs = row[1]
            for ch in tfclass_checklist:
                if ch[1] == query_uniprot or ch[2] == query_gs or ch[2].replace('ZNF', 'ZN') == query_gs or\
                        ch[2].replace('ZNF', 'Z') == query_gs:  # Specifically look at proteins that are in TFClass
                    ctr += 1
                    out = rq.get(ch[0]).text
                    conn_p = tfc_uri_to_label(ch[0])
                    conn_tfc = parseturtle(out, ['sybig:belongs_to'], 0)
                    if conn_tfc:
                        conn_tfc = tfc_uri_to_label(conn_tfc[0])
                    dbdl = parseturtle(out, ['sybig:dbd'], 0)
                    if dbdl:
                        conn_dbd = "Has DBD"
                    # rdfs:label carries the gene symbol; keep the "-"
                    # default when the parse comes back empty
                    labels = parseturtle(out, ['rdfs:label'], 0)
                    if labels:
                        conn_gs = labels[0]
                    if conn_tfc in al_ch_list:
                        profile_list.append(conn_tfc)
                        conn_profile = "Has Profile"
                        for res in cu.execute(
                                "SELECT Entropy FROM ALIGNMENTS "
                                "WHERE TfcID = ?", (str(conn_tfc),)):
                            ent = res[0]
                    log.write("Adding " + query_uniprot + " with query GS " +
                              query_gs + " and category " + row[2] + ".\n")
                    log.write("Best hit HMMer descriptor is " + row[3] + ".\n")
                    log.write("Connected TFClass protein is " + conn_p + ".\n")
                    log.write(conn_dbd + ", " + conn_profile + ".\n")
                    last_gs = conn_gs
                    tfclass_list.append([
                        query_uniprot, query_gs, row[2], row[3], conn_p,
                        conn_tfc, conn_gs, conn_dbd, conn_profile, ent
                    ])
                    if ch[1] == query_uniprot:
                        found_list_uniprot.append(query_uniprot)
                    if ch[2] == query_gs:
                        found_list_gs.append(query_gs)
            if last_gs == "-":
                tfclass_list.append([
                    query_uniprot, query_gs, row[2], row[3], conn_p, conn_tfc,
                    conn_gs, conn_dbd, conn_profile, ent
                ])

    with open('/sybig/home/ttu/BSc/test2ort.csv', "w") as csv_in:
        test_writer = csv.writer(csv_in,
                                 delimiter="\t",
                                 quotechar='"',
                                 quoting=csv.QUOTE_MINIMAL)
        test_writer.writerow([
            "Query", "Gene Symbol", "Category", "HMMer HitDescriptor",
            "TFClass Node", "TFClass Node ID", "TFClass Node GS",
            "TFClass node has DBD?", "In Profiles", "Entropy"
        ])
        for e in tfclass_list:
            test_writer.writerow(e)
    print(
        str(ctr) +
        " human transcription factors with a matching UNIPROT ID or gene symbol detected."
    )
    print(str(len(profile_list)) + " profile hits")
    print("Transcription factors that were not found, but should have been "
          "according to the profiles:")
    for tcl in tfclass_checklist:
        if tcl[1] not in found_list_uniprot and tcl[2] not in found_list_gs:
            cand_out = rq.get(tcl[0].rstrip(".")).text
            ctl = parseturtle(cand_out, ['sybig:belongs_to'], 0)
            if ctl:
                if tfc_uri_to_label(ctl[0]) in al_ch_list:
                    # print(tfc_uri_to_label(ctl[0]))
                    ent = "-"  # reset so a query miss is not reported stale
                    for res in cu.execute(
                            "SELECT Entropy FROM ALIGNMENTS WHERE TfcID = ?",
                            (tfc_uri_to_label(ctl[0]),)):
                        ent = res[0]
                    print(tcl[0] + ", " + tcl[1] + ", " + tcl[2] + " " +
                          str(ent))
    conn.close()
    return tfclass_list