def get_transmir_gold_ann_set(goldpath, entitytype):
    """Load gold entities and miRNA=>gene relations from a TransmiR-style TSV.

    Only rows whose last column equals "human" (case-insensitive) are used.
    For such rows column 8 is split on ";" into ids that get the "PMID"
    prefix, column 3 is normalized with ``mirna_graph.map_label`` and
    column 0 with ``get_uniprot_name``.

    :param goldpath: path of the tab-separated gold standard file
    :param entitytype: "mirna" or "protein"; selects which normalized name is
        added to the entity set (other values add no entities)
    :return: ``(gold_entities, gold_relations)``. ``gold_entities`` is a set
        of ``(doc id, "0", "0", lower-cased name)`` tuples.
        ``gold_relations`` maps ``(doc id, miRNA, gene, "miRNA=>gene")`` to a
        one-element list holding the un-normalized "miRNA=>gene" text.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    entities = set()
    relations = {}
    with open(goldpath, 'r') as gold_handle:
        for row in gold_handle:
            columns = row.strip().split("\t")
            # Guard clause: everything below applies to human rows only.
            if columns[-1].lower() != "human":
                continue
            doc_ids = ["PMID" + pmid for pmid in columns[8].split(";")]
            raw_mirna = columns[3]
            norm_mirna = mirna_graph.map_label(raw_mirna)
            # NOTE(review): this compares the whole map_label() result with
            # 99 rather than one element of it -- presumably a match-score
            # threshold was intended; confirm against map_label().
            if norm_mirna < 99:
                norm_mirna[0] = raw_mirna
            raw_gene = columns[0]
            norm_gene = get_uniprot_name(raw_gene)
            raw_relation = raw_mirna + "=>" + raw_gene
            for doc_id in doc_ids:
                if entitytype == "mirna":
                    entities.add((doc_id, "0", "0", norm_mirna[0].lower()))
                elif entitytype == "protein":
                    entities.add((doc_id, "0", "0", norm_gene[0].lower()))
                mirna_name = norm_mirna[0]
                gene_name = norm_gene[0]
                relation_key = (doc_id, mirna_name, gene_name,
                                mirna_name + "=>" + gene_name)
                relations[relation_key] = [raw_relation]
    return entities, relations
def get_ddi_mirna_gold_ann_set(goldpath, entitytype, pairtype):
    """Load gold entity offsets and positive pairs from a corpus XML file.

    The XML is walked as ``document`` > ``sentence`` > ``entity`` / ``pair``.
    Sentence-relative ``charOffset`` values are shifted by the length of the
    document text accumulated so far, and the end offset is incremented by
    one. Entities whose ``charOffset`` contains ";" are skipped.

    As a side effect the normalized miRNA and gene name of every accepted
    pair is written, tab-separated, to
    ``corpora/miRNACorpus/miRNAcorpus_relations.txt`` (the file is
    overwritten on each call).

    :param goldpath: path of the XML gold standard file
    :param entitytype: entity type (after ``type_match`` mapping) to collect
    :param pairtype: pair type (after ``type_match`` mapping) to collect
    :return: ``(gold_offsets, gold_pairs)``. ``gold_offsets`` is a set of
        ``(doc id, start, end, text)`` tuples. ``gold_pairs`` is a set of
        ``(doc id, (start1, end1), (start2, end2), "text1=type>text2")``.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_offsets = set()
    gold_pairs = set()
    original_id_to_offset = {}
    original_id_to_text = {}
    # Parse before opening the output so a bad input does not truncate it.
    root = ET.parse(goldpath).getroot()
    # The context manager guarantees the relations file is closed even if
    # parsing or normalization raises half-way through.
    with open("corpora/miRNACorpus/miRNAcorpus_relations.txt", 'w') as rfile:
        for doc in root.findall("document"):
            did = doc.get('id')
            doctext = ""
            for sentence in doc.findall('sentence'):
                sentence_text = sentence.get('text')
                shift = len(doctext)
                for entity in sentence.findall('entity'):
                    entity_offset = entity.get('charOffset')
                    if ";" in entity_offset:
                        # Entities with ";" in the offset are not recorded.
                        continue
                    offsets = entity_offset.split("-")
                    start = int(offsets[0]) + shift
                    end = int(offsets[1]) + shift + 1
                    etype = type_match.get(entity.get("type"))
                    original_id_to_offset[entity.get("id")] = (start, end)
                    original_id_to_text[entity.get("id")] = entity.get("text")
                    if etype == entitytype:
                        gold_offsets.add((did, start, end, entity.get("text")))
                for pair in sentence.findall('pair'):
                    try:
                        p_type = type_match[pair.get("type")]
                    except KeyError:
                        continue
                    if p_type != pairtype or pair.get("interaction") != "True":
                        continue
                    e1_id, e2_id = pair.get("e1"), pair.get("e2")
                    if (e1_id not in original_id_to_offset
                            or e2_id not in original_id_to_offset):
                        # The pair refers to an entity that was skipped above
                        # (or never seen); drop it rather than abort loading
                        # with a KeyError.
                        logging.warning(
                            "skipping pair {}: entity {} or {} not loaded".format(
                                pair.get("id"), e1_id, e2_id))
                        continue
                    e1_text = original_id_to_text[e1_id]
                    e2_text = original_id_to_text[e2_id]
                    gold_pairs.add((did,
                                    original_id_to_offset[e1_id],
                                    original_id_to_offset[e2_id],
                                    "{}={}>{}".format(e1_text, p_type, e2_text)))
                    norm_mirna = mirna_graph.map_label(e1_text)
                    # NOTE(review): compares the whole map_label() result
                    # with 99 -- presumably a score threshold was intended;
                    # confirm against map_label().
                    if norm_mirna < 99:
                        norm_mirna[0] = e1_text
                    norm_gene = get_uniprot_name(e2_text)
                    rfile.write("{}\t{}\n".format(norm_mirna[0], norm_gene[0]))
                # Grow the document text so later sentences get shifted offsets.
                doctext += " " + sentence_text
    return gold_offsets, gold_pairs
def get_ddi_mirna_gold_ann_set(goldpath, entitytype, pairtype):
    """Load gold entity offsets and positive pairs from a corpus XML file.

    The XML is walked as ``document`` > ``sentence`` > ``entity`` / ``pair``.
    Sentence-relative ``charOffset`` values are shifted by the length of the
    document text accumulated so far, and the end offset is incremented by
    one. Entities whose ``charOffset`` contains ";" are skipped.

    As a side effect the normalized miRNA and gene name of every accepted
    pair is written, tab-separated, to
    ``corpora/miRNACorpus/miRNAcorpus_relations.txt`` (the file is
    overwritten on each call).

    :param goldpath: path of the XML gold standard file
    :param entitytype: entity type (after ``type_match`` mapping) to collect
    :param pairtype: pair type (after ``type_match`` mapping) to collect
    :return: ``(gold_offsets, gold_pairs)``. ``gold_offsets`` is a set of
        ``(doc id, start, end, text)`` tuples. ``gold_pairs`` is a set of
        ``(doc id, (start1, end1), (start2, end2), "text1=type>text2")``.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    gold_offsets = set()
    gold_pairs = set()
    original_id_to_offset = {}
    original_id_to_text = {}
    # Parse before opening the output so a bad input does not truncate it.
    root = ET.parse(goldpath).getroot()
    # The context manager guarantees the relations file is closed even if
    # parsing or normalization raises half-way through.
    with open("corpora/miRNACorpus/miRNAcorpus_relations.txt", 'w') as rfile:
        for doc in root.findall("document"):
            did = doc.get('id')
            doctext = ""
            for sentence in doc.findall('sentence'):
                sentence_text = sentence.get('text')
                shift = len(doctext)
                for entity in sentence.findall('entity'):
                    entity_offset = entity.get('charOffset')
                    if ";" in entity_offset:
                        # Entities with ";" in the offset are not recorded.
                        continue
                    offsets = entity_offset.split("-")
                    start = int(offsets[0]) + shift
                    end = int(offsets[1]) + shift + 1
                    etype = type_match.get(entity.get("type"))
                    original_id_to_offset[entity.get("id")] = (start, end)
                    original_id_to_text[entity.get("id")] = entity.get("text")
                    if etype == entitytype:
                        gold_offsets.add((did, start, end, entity.get("text")))
                for pair in sentence.findall('pair'):
                    try:
                        p_type = type_match[pair.get("type")]
                    except KeyError:
                        continue
                    if p_type != pairtype or pair.get("interaction") != "True":
                        continue
                    e1_id, e2_id = pair.get("e1"), pair.get("e2")
                    if (e1_id not in original_id_to_offset
                            or e2_id not in original_id_to_offset):
                        # The pair refers to an entity that was skipped above
                        # (or never seen); drop it rather than abort loading
                        # with a KeyError.
                        logging.warning(
                            "skipping pair {}: entity {} or {} not loaded".format(
                                pair.get("id"), e1_id, e2_id))
                        continue
                    e1_text = original_id_to_text[e1_id]
                    e2_text = original_id_to_text[e2_id]
                    gold_pairs.add((did,
                                    original_id_to_offset[e1_id],
                                    original_id_to_offset[e2_id],
                                    "{}={}>{}".format(e1_text, p_type, e2_text)))
                    norm_mirna = mirna_graph.map_label(e1_text)
                    # NOTE(review): compares the whole map_label() result
                    # with 99 -- presumably a score threshold was intended;
                    # confirm against map_label().
                    if norm_mirna < 99:
                        norm_mirna[0] = e1_text
                    norm_gene = get_uniprot_name(e2_text)
                    rfile.write("{}\t{}\n".format(norm_mirna[0], norm_gene[0]))
                # Grow the document text so later sentences get shifted offsets.
                doctext += " " + sentence_text
    return gold_offsets, gold_pairs
def get_mirtex_gold_ann_set(goldpath, entitytype, pairtype):
    """Load gold entity offsets and document-level relations from a directory.

    Entities are read from every ``*.ann`` file in ``goldpath``: lines
    starting with "T" are split into id, "type start end" and text.
    Relations are read from ``goldpath/annotations.tsv``; rows with fewer
    than three tab-separated columns are ignored, and columns 1 and 2 are
    ";"-separated miRNA and gene lists (double quotes are stripped).

    :param goldpath: directory holding the ``.ann`` files and
        ``annotations.tsv``
    :param entitytype: entity type (after ``type_match`` mapping) to collect
    :param pairtype: relation type (after ``type_match`` mapping of the last
        two columns joined by a space) to collect, or "all"
    :return: ``(gold_offsets, gold_relations)``. ``gold_offsets`` is a set of
        ``(doc id, start, end, text)`` tuples. ``gold_relations`` maps
        ``(doc id, miRNA, gene, "miRNA=>gene")`` to an empty list.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    ann_names = [name for name in os.listdir(goldpath) if name.endswith('.ann')]
    gold_offsets = set()
    for ann_name in ann_names:
        # Cut the extension from the file name only. Splitting the full path
        # on "." would truncate the id at any dot inside goldpath and stop it
        # matching the relation ids built below as goldpath + '/' + name.
        did = goldpath + '/' + ann_name.split(".")[0]
        with open(goldpath + '/' + ann_name, 'r') as txt:
            for line in txt:
                if not line.startswith("T"):
                    continue
                tid, ann, etext = line.strip().split("\t")
                etype, dstart, dend = ann.split(" ")
                if entitytype == type_match[etype]:
                    gold_offsets.add((did, int(dstart), int(dend), etext))
    gold_relations = {}
    with open(goldpath + "/" + "annotations.tsv") as afile:
        for l in afile:
            v = l.strip().split("\t")
            if len(v) < 3:
                continue
            did = goldpath + '/' + v[0]
            if pairtype != "all" and type_match.get(" ".join(v[-2:])) != pairtype:
                continue
            # The gene list is the same for every miRNA of the row: clean once.
            genes = [gene.replace('"', '') for gene in v[2].split(";")]
            for mirna in v[1].split(";"):
                mirna = mirna.replace('"', '')
                norm_mirna = mirna_graph.map_label(mirna)
                # NOTE(review): compares the whole map_label() result with 99
                # -- presumably a score threshold was intended; confirm
                # against map_label().
                if norm_mirna < 99:
                    norm_mirna[0] = mirna
                for gene in genes:
                    norm_gene = get_uniprot_name(gene)
                    gold_relations[(did, norm_mirna[0], norm_gene[0],
                                    norm_mirna[0] + "=>" + norm_gene[0])] = []
    return gold_offsets, gold_relations
def get_mirtex_gold_ann_set(goldpath, entitytype, pairtype):
    """Load gold entity offsets and document-level relations from a directory.

    Entities are read from every ``*.ann`` file in ``goldpath``: lines
    starting with "T" are split into id, "type start end" and text.
    Relations are read from ``goldpath/annotations.tsv``; rows with fewer
    than three tab-separated columns are ignored, and columns 1 and 2 are
    ";"-separated miRNA and gene lists (double quotes are stripped).

    :param goldpath: directory holding the ``.ann`` files and
        ``annotations.tsv``
    :param entitytype: entity type (after ``type_match`` mapping) to collect
    :param pairtype: relation type (after ``type_match`` mapping of the last
        two columns joined by a space) to collect, or "all"
    :return: ``(gold_offsets, gold_relations)``. ``gold_offsets`` is a set of
        ``(doc id, start, end, text)`` tuples. ``gold_relations`` maps
        ``(doc id, miRNA, gene, "miRNA=>gene")`` to an empty list.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    ann_names = [name for name in os.listdir(goldpath) if name.endswith('.ann')]
    gold_offsets = set()
    for ann_name in ann_names:
        # Cut the extension from the file name only. Splitting the full path
        # on "." would truncate the id at any dot inside goldpath and stop it
        # matching the relation ids built below as goldpath + '/' + name.
        did = goldpath + '/' + ann_name.split(".")[0]
        with open(goldpath + '/' + ann_name, 'r') as txt:
            for line in txt:
                if not line.startswith("T"):
                    continue
                tid, ann, etext = line.strip().split("\t")
                etype, dstart, dend = ann.split(" ")
                if entitytype == type_match[etype]:
                    gold_offsets.add((did, int(dstart), int(dend), etext))
    gold_relations = {}
    with open(goldpath + "/" + "annotations.tsv") as afile:
        for l in afile:
            v = l.strip().split("\t")
            if len(v) < 3:
                continue
            did = goldpath + '/' + v[0]
            if pairtype != "all" and type_match.get(" ".join(v[-2:])) != pairtype:
                continue
            # The gene list is the same for every miRNA of the row: clean once.
            genes = [gene.replace('"', '') for gene in v[2].split(";")]
            for mirna in v[1].split(";"):
                mirna = mirna.replace('"', '')
                norm_mirna = mirna_graph.map_label(mirna)
                # NOTE(review): compares the whole map_label() result with 99
                # -- presumably a score threshold was intended; confirm
                # against map_label().
                if norm_mirna < 99:
                    norm_mirna[0] = mirna
                for gene in genes:
                    norm_gene = get_uniprot_name(gene)
                    gold_relations[(did, norm_mirna[0], norm_gene[0],
                                    norm_mirna[0] + "=>" + norm_gene[0])] = []
    return gold_offsets, gold_relations
def get_transmir_gold_ann_set(goldpath, entitytype):
    """Load gold entities and miRNA=>gene relations from a TransmiR-style TSV.

    Only rows whose last column equals "human" (case-insensitive) are used.
    For such rows column 8 is split on ";" into ids that get the "PMID"
    prefix, column 3 is normalized with ``mirna_graph.map_label`` and
    column 0 with ``get_uniprot_name``.

    :param goldpath: path of the tab-separated gold standard file
    :param entitytype: "mirna" or "protein"; selects which normalized name is
        added to the entity set (other values add no entities)
    :return: ``(gold_entities, gold_relations)``. ``gold_entities`` is a set
        of ``(doc id, "0", "0", lower-cased name)`` tuples.
        ``gold_relations`` maps ``(doc id, miRNA, gene, "miRNA=>gene")`` to a
        one-element list holding the un-normalized "miRNA=>gene" text.
    """
    logging.info("loading gold standard... {}".format(goldpath))
    entities = set()
    relations = {}
    with open(goldpath, 'r') as gold_handle:
        for row in gold_handle:
            columns = row.strip().split("\t")
            # Guard clause: everything below applies to human rows only.
            if columns[-1].lower() != "human":
                continue
            doc_ids = ["PMID" + pmid for pmid in columns[8].split(";")]
            raw_mirna = columns[3]
            norm_mirna = mirna_graph.map_label(raw_mirna)
            # NOTE(review): this compares the whole map_label() result with
            # 99 rather than one element of it -- presumably a match-score
            # threshold was intended; confirm against map_label().
            if norm_mirna < 99:
                norm_mirna[0] = raw_mirna
            raw_gene = columns[0]
            norm_gene = get_uniprot_name(raw_gene)
            raw_relation = raw_mirna + "=>" + raw_gene
            for doc_id in doc_ids:
                if entitytype == "mirna":
                    entities.add((doc_id, "0", "0", norm_mirna[0].lower()))
                elif entitytype == "protein":
                    entities.add((doc_id, "0", "0", norm_gene[0].lower()))
                mirna_name = norm_mirna[0]
                gene_name = norm_gene[0]
                relation_key = (doc_id, mirna_name, gene_name,
                                mirna_name + "=>" + gene_name)
                relations[relation_key] = [raw_relation]
    return entities, relations
# read transmir database and generate corpus from text.mirna_entity import mirna_graph from text.protein_entity import get_uniprot_name db_name = "data/transmir_v1.2.tsv" tfs = set() diseases = set() funcs = set() pmids = set() mirnas = {} # mirname: (function, disease) entries = {} # (tfname, mirname): active with open(db_name, 'r') as dbfile: for line in dbfile: tsv = line.strip().split("\t") tfname = get_uniprot_name(tsv[0]) mirname = mirna_graph.map_label(tsv[3]) tfname = tfname[0] mirname = mirname[0] func = tsv[5].split(";") disease = tsv[6].split(";") active = tsv[7] pmid = tsv[8].split(";") if tsv[-1].lower() == "human": tfs.add(tfname.replace("-", "")) # uniform TF names for f in func: funcs.add(f.strip()) for d in disease: if d != "see HMDD (http://cmbi.bjmu.edu.cn/hmdd)": diseases.add(d.strip()) for p in pmid: pmids.add(p.strip())