Esempio n. 1
0
class TransmirCorpus(PubmedCorpus):
    """
        Corpus generated from the TransmiR database, using distant supervision
    """
    def __init__(self, corpusdir, **kwargs):
        """Read the TransmiR file and initialise the underlying corpus.

        :param corpusdir: path of the tab-separated TransmiR database file;
            it is also forwarded unchanged to the base-class constructor.
        :param kwargs: extra keyword arguments forwarded to the base class.
        """
        self.mirbase = MirbaseDB(config.mirbase_path)
        self.mirbase.load_graph()
        self.mirnas = set()
        self.tfs = set()
        self.pairs = set()
        self.pmids = set()
        self.normalized_mirnas = set()  # normalized to miRBase
        self.normalized_tfs = set()  # normalized to maybe UniProt
        self.normalized_pairs = set()
        self.db_path = corpusdir
        # The PMIDs must be known before the base class is initialised,
        # because they are passed to its constructor.
        self.load_database()
        super(TransmirCorpus, self).__init__(corpusdir, self.pmids, **kwargs)
        # TODO: use negatome

    def load_database(self):
        """Collect TF names, miRNA names and PMIDs from the human rows.

        Each line of ``self.db_path`` is split on tabs. Only rows whose last
        column equals "human" (case-insensitive) are used: column 0 is taken
        as the TF name, column 3 as the miRNA name (prefixed with "hsa-" when
        missing) and column 8 as a ";"-separated list of PMIDs.

        Rows with fewer than nine columns are skipped with a warning instead
        of raising ``IndexError``, and empty PMID tokens are ignored.
        """
        logging.info("Loading TransmiR database...")
        with open(self.db_path, 'r') as dbfile:
            for line in dbfile:
                tsv = line.strip().split("\t")
                if tsv[-1].lower() != "human":
                    continue
                if len(tsv) < 9:
                    logging.warning("Skipping malformed TransmiR row: %r", line)
                    continue
                tfname = tsv[0]
                mirname = tsv[3]
                # Columns 5-7 (function, disease, active) are not used here.
                self.tfs.add(tfname)  # uniform TF names
                if not mirname.startswith("hsa-"):
                    mirname = "hsa-" + mirname
                self.mirnas.add(mirname)
                for p in tsv[8].split(";"):
                    logging.info(p)
                    p = p.strip()
                    if p:  # a stray ";" would otherwise yield an empty PMID
                        self.pmids.add(p)

    def normalize_entities(self):
        """Map every collected miRNA name through ``self.mirbase.map_label``.

        The first element of each returned match is stored in
        ``self.normalized_mirnas``; no score threshold is applied.
        """
        logging.info("Normalizing entities...")
        for mir in self.mirnas:
            match = self.mirbase.map_label(mir)
            self.normalized_mirnas.add(match[0])

    def load_annotations(self, db_path, etype):
        """Normalize the miRNA names gathered by ``load_database``.

        :param db_path: not used by this implementation.
        :param etype: not used by this implementation.
        """
        self.normalize_entities()
Esempio n. 2
0
 def __init__(self, corpusdir, **kwargs):
     """Set up the instance state, read the database and init the base class.

     :param corpusdir: stored as ``self.db_path`` and also forwarded to the
         base-class constructor.
     :param kwargs: extra keyword arguments forwarded to the base class.
     """
     # Three independent dictionaries (deliberately not aliased).
     self.mirnas, self.tfs, self.pairs = {}, {}, {}
     self.pmids = set()
     self.normalized_mirnas = set()  # miRBase-normalized
     self.normalized_tfs = set()  # maybe UniProt-normalized
     self.normalized_pairs = set()
     self.db_path = corpusdir

     mirbase = MirbaseDB(config.mirbase_path)
     mirbase.load_graph()
     self.mirbase = mirbase

     # load_database() runs first so that self.pmids is what gets passed on.
     self.load_database()
     super(TransmirCorpus, self).__init__(corpusdir, self.pmids, **kwargs)
Esempio n. 3
0
 def __init__(self, corpusdir, **kwargs):
     """Create the containers, load the database, then init the base class.

     :param corpusdir: kept as ``self.db_path`` and passed on to the base
         class as its first argument.
     :param kwargs: forwarded untouched to the base-class constructor.
     """
     graph = MirbaseDB(config.mirbase_path)
     graph.load_graph()
     self.mirbase = graph

     self.db_path = corpusdir
     self.pmids = set()
     # Each mapping gets its own fresh dict.
     self.mirnas, self.tfs, self.pairs = {}, {}, {}
     # Normalized views: miRNAs to miRBase, TFs maybe to UniProt.
     self.normalized_mirnas = set()
     self.normalized_tfs = set()
     self.normalized_pairs = set()

     self.load_database()
     # self.pmids is handed to the base class after load_database() has run.
     super(TransmirCorpus, self).__init__(corpusdir, self.pmids, **kwargs)
Esempio n. 4
0
# Built-in stopword list; extended below with the entries of config.stoplist.
mirna_stopwords = {
    "mediated",
    "expressing",
    "deficient",
    "transfected",
    "dependent",
    "family",
    "specific",
    "null",
    "independent",
    "dependant",
    "overexpressing",
    "binding",
    "targets",
    "induced",
}
# Currently disabled entries: "mirna", "mirnas", "mir", "hsa-mir"

mirna_nextstopwords = {"inhibitor"}

# Add every stoplist line (stripped, lower-cased) longer than one character.
with open(config.stoplist, 'r') as stopfile:
    mirna_stopwords.update(
        word
        for word in (raw.strip().lower() for raw in stopfile)
        if len(word) > 1)

# "let" is removed even if the stoplist contained it
# (presumably because of miRNA names such as let-7 -- TODO confirm).
mirna_stopwords.discard("let")

# Module-wide miRBase graph, loaded once at import time.
mirna_graph = MirbaseDB(config.mirbase_path)
mirna_graph.load_graph()


class MirnaEntity(Entity):
    """Entity whose ``type`` is fixed to "mirna"."""

    def __init__(self, tokens, sid, *args, **kwargs):
        """Initialise the base entity, then set the miRNA-specific fields.

        :param tokens: forwarded to ``Entity.__init__``.
        :param sid: stored as ``self.sid``.
        :param args: accepted but not used.
        :param kwargs: forwarded to ``Entity.__init__``; the optional keys
            "subtype" and "nextword" are also copied onto the instance
            (``None`` when absent).
        """
        super(MirnaEntity, self).__init__(tokens, **kwargs)
        self.sid = sid
        self.type = "mirna"
        for option in ("subtype", "nextword"):
            setattr(self, option, kwargs.get(option))
        # Fields with fixed initial values.
        self.mirna_acc = None
        self.mirna_name = 0
        self.go_ids = []
Esempio n. 5
0
if config.use_go:
    from config.config import go_conn as db
__author__ = 'Andre'

# Seed stopwords; the contents of config.stoplist are merged in below.
mirna_stopwords = {
    "mediated", "expressing", "deficient", "transfected", "dependent",
    "family", "specific", "null", "independent", "dependant",
    "overexpressing", "binding", "targets", "induced",
}
# Currently disabled entries: "mirna", "mirnas", "mir", "hsa-mir"

mirna_nextstopwords = {"inhibitor"}

with open(config.stoplist, 'r') as stopfile:
    # Normalise each line, then keep only words longer than one character.
    normalised = (entry.strip().lower() for entry in stopfile)
    mirna_stopwords.update(word for word in normalised if len(word) > 1)

# "let" must not stay a stopword, whatever the stoplist says
# (presumably because of miRNA names such as let-7 -- TODO confirm).
mirna_stopwords.discard("let")

# Shared miRBase graph, built once when the module is imported.
mirna_graph = MirbaseDB(config.mirbase_path)
mirna_graph.load_graph()

class MirnaEntity(Entity):
    def __init__(self, tokens, sid, *args, **kwargs):
        """Initialise the base entity, then set the miRNA-specific fields.

        :param tokens: forwarded to ``Entity.__init__``.
        :param sid: stored as ``self.sid``.
        :param args: accepted but not used.
        :param kwargs: forwarded to ``Entity.__init__``; "subtype" and
            "nextword" are additionally copied onto the instance
            (``None`` when absent).
        """
        super(MirnaEntity, self).__init__(tokens, **kwargs)
        self.sid = sid
        self.type = "mirna"
        for option in ("subtype", "nextword"):
            setattr(self, option, kwargs.get(option))
        # Fields with fixed initial values.
        self.mirna_acc = None
        self.mirna_name = 0
        self.go_ids = []

    def validate(self, ths, rules, *args, **kwargs):
Esempio n. 6
0
class TransmirCorpus(PubmedCorpus):
    """
        Corpus generated from the TransmiR database, using distant supervision
    """
    def __init__(self, corpusdir, **kwargs):
        """Read the TransmiR file and initialise the underlying corpus.

        :param corpusdir: path of the tab-separated TransmiR database file;
            it is also forwarded unchanged to the base-class constructor.
        :param kwargs: extra keyword arguments forwarded to the base class.
        """
        self.mirbase = MirbaseDB(config.mirbase_path)
        self.mirbase.load_graph()
        self.mirnas = {}  # pmid -> set of MirnaEntity (see load_annotations)
        self.tfs = {}  # pmid -> set of ProteinEntity (see load_annotations)
        self.pairs = {}
        self.pmids = set()
        self.normalized_mirnas = set()  # normalized to miRBase
        self.normalized_tfs = set()  # normalized to maybe UniProt
        self.normalized_pairs = set()
        self.db_path = corpusdir
        # The PMIDs must be known before the base class is initialised,
        # because they are passed to its constructor.
        self.load_database()
        super(TransmirCorpus, self).__init__(corpusdir, self.pmids, **kwargs)
        # TODO: use negatome

    @staticmethod
    def _transmir_human_rows(path):
        """Yield the column list of every well-formed human row of ``path``.

        A row is "human" when its last tab-separated column equals "human"
        (case-insensitive). Rows with fewer than nine columns are skipped
        with a warning, since columns 0, 3 and 8 are read by the callers.
        """
        with open(path, 'r') as dbfile:
            for line in dbfile:
                tsv = line.strip().split("\t")
                if tsv[-1].lower() != "human":
                    continue
                if len(tsv) < 9:
                    logging.warning("Skipping malformed TransmiR row: %r", line)
                    continue
                yield tsv

    @staticmethod
    def _transmir_pmids(field):
        """Split a ";"-separated PMID field, dropping blanks and whitespace."""
        return [p.strip() for p in field.split(";") if p.strip()]

    def load_database(self):
        """Fill ``self.pmids`` with the PMIDs of all human rows."""
        logging.info("Loading TransmiR database...")
        for tsv in self._transmir_human_rows(self.db_path):
            for p in self._transmir_pmids(tsv[8]):
                logging.info(p)
                self.pmids.add(p)

    def normalize_entities(self):
        """Map every annotated miRNA name through ``self.mirbase.map_label``.

        ``self.mirnas`` maps each PMID to a set of miRNA entities, so the
        label that is normalized is each entity's ``text`` (not the PMID
        key). The first element of each match is stored in
        ``self.normalized_mirnas``; no score threshold is applied.
        """
        logging.info("Normalizing entities...")
        for entities in self.mirnas.values():
            for mirna in entities:
                match = self.mirbase.map_label(mirna.text)
                self.normalized_mirnas.add(match[0])

    def load_annotations(self, db_path, etype, ptype):
        """Build per-PMID TF and miRNA entities from the human rows.

        For every PMID of every human row, one ``ProteinEntity`` per distinct
        TF name and one ``MirnaEntity`` per distinct miRNA name are created,
        and ``(mirna.eid, "miRNA-gene")`` is appended to the TF's targets.

        :param db_path: path of the tab-separated TransmiR database file.
        :param etype: not used by this implementation.
        :param ptype: not used by this implementation.
        """
        self.mirnas = {}
        self.tfs = {}
        self.pairs = {}
        # NOTE(review): pmids is cleared here and not refilled by this
        # method -- confirm that this is intended.
        self.pmids = set()
        self.normalized_mirnas = set()  # normalized to miRBase
        self.normalized_tfs = set()  # normalized to maybe UniProt
        self.normalized_pairs = set()
        for tsv in self._transmir_human_rows(db_path):
            tfname = tsv[0]
            mirname = tsv[3]
            if not mirname.startswith("hsa-"):
                mirname = "hsa-" + mirname
            # PMIDs are stripped so the keys agree with load_database().
            for pmid in self._transmir_pmids(tsv[8]):
                pmid_tfs = self.tfs.setdefault(pmid, set())
                pmid_mirnas = self.mirnas.setdefault(pmid, set())

                tf = next((e for e in pmid_tfs if e.text == tfname), None)
                if tf is None:
                    # Entity ids count all entities already known for the PMID.
                    eid = len(pmid_tfs) + len(pmid_mirnas)
                    tf = ProteinEntity([], pmid, text=tfname, did=pmid,
                                       eid="{}.e{}".format(pmid, eid))
                    pmid_tfs.add(tf)

                mirna = next((e for e in pmid_mirnas if e.text == mirname),
                             None)
                if mirna is None:
                    eid = len(pmid_tfs) + len(pmid_mirnas)
                    mirna = MirnaEntity([], pmid, text=mirname, did=pmid,
                                        eid="{}.e{}".format(pmid, eid))
                    pmid_mirnas.add(mirna)
                tf.targets.append((mirna.eid, "miRNA-gene"))

    def run_analysis(self):
        """Print, per PMID, scores of random annotated vs. other pairs.

        For every PMID with more than one TF and more than one targeted
        miRNA, ten random (TF, miRNA) pairs are drawn. A pair scores
        ``ssm.simui_go`` of the two ``best_go`` values when both look like GO
        identifiers, otherwise 1. The score is added to the "correct" total
        when the miRNA is an annotated target of the TF and to the
        "incorrect" total otherwise; both totals are then printed.
        """
        for pmid in self.tfs:
            all_tfs = []
            all_mirnas = []
            correct_count = 0  # summed score of real miRNA-gene pairs
            incorrect_count = 0  # summed score of random miRNA-gene pairs
            mirnas_by_eid = {m.eid: m for m in self.mirnas.get(pmid, ())}
            for tf in self.tfs[pmid]:
                all_tfs.append(tf)
                for target in tf.targets:
                    mirna = mirnas_by_eid.get(target[0])
                    # Unresolved targets are skipped: appending None (or the
                    # previously matched miRNA) would corrupt the sample.
                    if mirna is not None:
                        all_mirnas.append(mirna)
            if len(all_tfs) > 1 and len(all_mirnas) > 1:
                for _ in range(10):
                    random_tf = random.choice(all_tfs)
                    random_mirna = random.choice(all_mirnas)
                    is_real = ((random_mirna.eid, "miRNA-gene")
                               in random_tf.targets)
                    # NOTE(review): the two prefixes differ ("GO:" vs "GO");
                    # kept as found -- confirm whether that is intended.
                    if (random_mirna.best_go.startswith("GO:")
                            and random_tf.best_go.startswith("GO")):
                        score = ssm.simui_go(random_mirna.best_go,
                                             random_tf.best_go)
                        if not is_real:
                            print("incorrect: {}".format(score))
                    else:
                        score = 1
                    if is_real:
                        correct_count += score
                    else:
                        incorrect_count += score
                print("{}-{} ({} mirnas, {} tfs".format(
                    correct_count, incorrect_count, len(all_mirnas),
                    len(all_tfs)))
Esempio n. 7
0
class TransmirCorpus(PubmedCorpus):
    """
        Corpus generated from the TransmiR database, using distant supervision
    """
    def __init__(self, corpusdir, **kwargs):
        """Read the TransmiR file and initialise the underlying corpus.

        :param corpusdir: path of the tab-separated TransmiR database file;
            it is also forwarded unchanged to the base-class constructor.
        :param kwargs: extra keyword arguments forwarded to the base class.
        """
        self.mirbase = MirbaseDB(config.mirbase_path)
        self.mirbase.load_graph()
        self.mirnas = {}  # pmid -> set of MirnaEntity (see load_annotations)
        self.tfs = {}  # pmid -> set of ProteinEntity (see load_annotations)
        self.pairs = {}
        self.pmids = set()
        self.normalized_mirnas = set()  # normalized to miRBase
        self.normalized_tfs = set()  # normalized to maybe UniProt
        self.normalized_pairs = set()
        self.db_path = corpusdir
        # The PMIDs must be known before the base class is initialised,
        # because they are passed to its constructor.
        self.load_database()
        super(TransmirCorpus, self).__init__(corpusdir, self.pmids, **kwargs)
        # TODO: use negatome

    @staticmethod
    def _transmir_human_rows(path):
        """Yield the column list of every well-formed human row of ``path``.

        A row is "human" when its last tab-separated column equals "human"
        (case-insensitive). Rows with fewer than nine columns are skipped
        with a warning, since columns 0, 3 and 8 are read by the callers.
        """
        with open(path, 'r') as dbfile:
            for line in dbfile:
                tsv = line.strip().split("\t")
                if tsv[-1].lower() != "human":
                    continue
                if len(tsv) < 9:
                    logging.warning("Skipping malformed TransmiR row: %r", line)
                    continue
                yield tsv

    @staticmethod
    def _transmir_pmids(field):
        """Split a ";"-separated PMID field, dropping blanks and whitespace."""
        return [p.strip() for p in field.split(";") if p.strip()]

    def load_database(self):
        """Fill ``self.pmids`` with the PMIDs of all human rows."""
        logging.info("Loading TransmiR database...")
        for tsv in self._transmir_human_rows(self.db_path):
            for p in self._transmir_pmids(tsv[8]):
                logging.info(p)
                self.pmids.add(p)

    def normalize_entities(self):
        """Map every annotated miRNA name through ``self.mirbase.map_label``.

        ``self.mirnas`` maps each PMID to a set of miRNA entities, so the
        label that is normalized is each entity's ``text`` (not the PMID
        key). The first element of each match is stored in
        ``self.normalized_mirnas``; no score threshold is applied.
        """
        logging.info("Normalizing entities...")
        for entities in self.mirnas.values():
            for mirna in entities:
                match = self.mirbase.map_label(mirna.text)
                self.normalized_mirnas.add(match[0])

    def load_annotations(self, db_path, etype, ptype):
        """Build per-PMID TF and miRNA entities from the human rows.

        For every PMID of every human row, one ``ProteinEntity`` per distinct
        TF name and one ``MirnaEntity`` per distinct miRNA name are created,
        and ``(mirna.eid, "miRNA-gene")`` is appended to the TF's targets.

        :param db_path: path of the tab-separated TransmiR database file.
        :param etype: not used by this implementation.
        :param ptype: not used by this implementation.
        """
        self.mirnas = {}
        self.tfs = {}
        self.pairs = {}
        # NOTE(review): pmids is cleared here and not refilled by this
        # method -- confirm that this is intended.
        self.pmids = set()
        self.normalized_mirnas = set()  # normalized to miRBase
        self.normalized_tfs = set()  # normalized to maybe UniProt
        self.normalized_pairs = set()
        for tsv in self._transmir_human_rows(db_path):
            tfname = tsv[0]
            mirname = tsv[3]
            if not mirname.startswith("hsa-"):
                mirname = "hsa-" + mirname
            # PMIDs are stripped so the keys agree with load_database().
            for pmid in self._transmir_pmids(tsv[8]):
                pmid_tfs = self.tfs.setdefault(pmid, set())
                pmid_mirnas = self.mirnas.setdefault(pmid, set())

                tf = next((e for e in pmid_tfs if e.text == tfname), None)
                if tf is None:
                    # Entity ids count all entities already known for the PMID.
                    eid = len(pmid_tfs) + len(pmid_mirnas)
                    tf = ProteinEntity([],
                                       pmid,
                                       text=tfname,
                                       did=pmid,
                                       eid="{}.e{}".format(pmid, eid))
                    pmid_tfs.add(tf)

                mirna = next((e for e in pmid_mirnas if e.text == mirname),
                             None)
                if mirna is None:
                    eid = len(pmid_tfs) + len(pmid_mirnas)
                    mirna = MirnaEntity([],
                                        pmid,
                                        text=mirname,
                                        did=pmid,
                                        eid="{}.e{}".format(pmid, eid))
                    pmid_mirnas.add(mirna)
                tf.targets.append((mirna.eid, "miRNA-gene"))

    def run_analysis(self):
        """Print, per PMID, scores of random annotated vs. other pairs.

        For every PMID with more than one TF and more than one targeted
        miRNA, ten random (TF, miRNA) pairs are drawn. A pair scores
        ``ssm.simui_go`` of the two ``best_go`` values when both look like GO
        identifiers, otherwise 1. The score is added to the "correct" total
        when the miRNA is an annotated target of the TF and to the
        "incorrect" total otherwise; both totals are then printed.
        """
        for pmid in self.tfs:
            all_tfs = []
            all_mirnas = []
            correct_count = 0  # summed score of real miRNA-gene pairs
            incorrect_count = 0  # summed score of random miRNA-gene pairs
            mirnas_by_eid = {m.eid: m for m in self.mirnas.get(pmid, ())}
            for tf in self.tfs[pmid]:
                all_tfs.append(tf)
                for target in tf.targets:
                    mirna = mirnas_by_eid.get(target[0])
                    # Unresolved targets are skipped: appending None (or the
                    # previously matched miRNA) would corrupt the sample.
                    if mirna is not None:
                        all_mirnas.append(mirna)
            if len(all_tfs) > 1 and len(all_mirnas) > 1:
                for _ in range(10):
                    random_tf = random.choice(all_tfs)
                    random_mirna = random.choice(all_mirnas)
                    is_real = ((random_mirna.eid, "miRNA-gene")
                               in random_tf.targets)
                    # NOTE(review): the two prefixes differ ("GO:" vs "GO");
                    # kept as found -- confirm whether that is intended.
                    if (random_mirna.best_go.startswith("GO:")
                            and random_tf.best_go.startswith("GO")):
                        score = ssm.simui_go(random_mirna.best_go,
                                             random_tf.best_go)
                        if not is_real:
                            print("incorrect: {}".format(score))
                    else:
                        score = 1
                    if is_real:
                        correct_count += score
                    else:
                        incorrect_count += score
                print("{}-{} ({} mirnas, {} tfs".format(
                    correct_count, incorrect_count, len(all_mirnas),
                    len(all_tfs)))