Python SentenceDBの例、textmining.SentenceDB.SentenceDB Pythonの例

コード例 #1

0

ファイルを表示

def analyseFile(splitFileIDs, env):
    """Collect entity co-occurrences for a batch of split files.

    For every ID in ``splitFileIDs`` the two entity hit indexes and the
    relation hit index are loaded. For each document that has hits in both
    entity indexes, ``findCooccurrences`` is called and its results are
    accumulated. The accumulated list is finally handed to ``printStuff``.

    Args:
        splitFileIDs: iterable of file-ID strings; each one is concatenated
            into the index and sentence file paths.
        env: not used by this function.

    Returns:
        None. Progress messages are written to stderr.
    """
    fileCoocs = []

    for splitFileID in splitFileIDs:
        ent1File = resultBase + "/" + args.folder1 + "/" + splitFileID + ".index"
        ent2File = resultBase + "/" + args.folder2 + "/" + splitFileID + ".index"
        relFile = resultBase + "/relations/" + splitFileID + ".index"

        sentFile = args.sentdir + "/" + splitFileID + ".sent"

        # Skip the file as soon as either entity index turns out to be empty.
        ent1Hits = SyngrepHitFile(ent1File, ent1Syns)
        if len(ent1Hits) == 0:
            continue

        ent2Hits = SyngrepHitFile(ent2File, ent2Syns)
        if len(ent2Hits) == 0:
            continue

        relHits = SyngrepHitFile(relFile, relSyns)

        # Sentences are only loaded once a document with hits in both
        # entity indexes is found.
        sentDB = None

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in ent1Hits:
            if docID not in ent2Hits:
                continue

            # Identity test: an equality test against None would go through
            # whatever __eq__ the SentenceDB class may define.
            if sentDB is None:
                sentDB = SentenceDB(sentFile)

            ent1SynHits = ent1Hits.getHitsForDocument(docID)
            ent2SynHits = ent2Hits.getHitsForDocument(docID)

            fileCoocs += findCooccurrences(str(docID), ent1SynHits, ent2SynHits, sentDB, relHits)

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

    printed = printStuff(None, fileCoocs, None)

    thisProcID = str(os.getpid())
    sys.stderr.write("{procID}: Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)),
        ids=str(splitFileIDs),
        printed=printed,
        procID=thisProcID))

    return None

コード例 #2

0

ファイルを表示

ファイル: createDiseaseAssoc.PMC.py プロジェクト: mjoppich/miRExplore

def analyseFile(splitFileIDs, env):
    """Collect (document, term id, term name) tuples from the disease index.

    For each split file with disease hits, the synonym IDs found per document
    are gathered (first ``_`` replaced by ``:``). IDs that appear among the
    children (up to ``maxLevel=2``) of another found term are dropped. One
    tuple is recorded for every remaining ID. The collected tuples are then
    handed to ``printStuff``.

    Args:
        splitFileIDs: iterable of file-ID strings used to build file paths.
        env: not used by this function.

    Returns:
        None. Progress messages are written to stderr.
    """
    collected = []

    for splitFileID in splitFileIDs:
        indexPath = resultBase + "/disease/" + splitFileID + ".index"
        diseaseHits = SyngrepHitFile(indexPath, diseaseSyns)

        if len(diseaseHits) == 0:
            continue

        # The sentence DB is constructed but not read anywhere below.
        sentDB = SentenceDB("/mnt/c/dev/data/pmc/allsent/" + splitFileID + ".sent")

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in diseaseHits:
            foundIDs = {
                hit.synonym.id.replace('_', ':', 1)
                for hit in diseaseHits.getHitsForDocument(docID)
            }

            # Everything reachable as a child of a found term is excluded.
            excluded = set()
            for synID in foundIDs:
                excluded.update(celloObo.getID(synID).getAllChildren(maxLevel=2))

            for synID in foundIDs:
                if synID in excluded:
                    continue

                term = celloObo.getID(synID)
                collected.append((docID, term.id, term.name))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=len(collected), ids=splitFileIDs))

    printed = printStuff(None, collected, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=len(collected), ids=splitFileIDs, printed=printed))

    return None

コード例 #3

0

ファイルを表示

ファイル: overviewMentioningPubMeds.py プロジェクト: mjoppich/miRExplore

def analyseFile(splitFileIDs, env):
    """Map subject categories to the document IDs that mention them.

    For each split file the anatomy (FMA), disease (DOID) and GO hit indexes
    are loaded. A document is recorded only if its FMA hits pass
    ``testMentioned`` against ``neutrophilSynIDs``. Such documents are always
    added under ``'NEUTROPHIL'`` and additionally under ``'TISSUES'``,
    ``'DOID'`` and ``'INFLAMM'`` when the respective ``testMentioned`` check
    succeeds.

    Args:
        splitFileIDs: iterable of file numbers/IDs; each is left-padded to
            four characters with zeros to build the ``pubmed18n`` file names.
        env: not used by this function.

    Returns:
        A ``defaultdict(set)`` mapping category name to a set of ``int``
        document IDs. Empty when nothing matched.
    """
    subject2pmids = defaultdict(set)

    for splitFileID in splitFileIDs:

        # Right-align in a width of four and turn the padding into zeros.
        fileID = "{:>4}".format(splitFileID).replace(" ", "0")

        fmaFile = resultBase + "/model_anatomy/pubmed18n" + fileID + ".index"
        doidFile = resultBase + "/disease/pubmed18n" + fileID + ".index"
        goFile = resultBase + "/neutrophils/pubmed18n" + fileID + ".index"

        sentFile = "/mnt/c/dev/data/pubmed/pubmed18n" + fileID + ".sent"

        fmaHits = SyngrepHitFile(fmaFile, fmaSyns)
        if len(fmaHits) == 0:
            # Skip only this file. A bare ``return`` here would abort the
            # whole batch and throw away everything gathered so far.
            continue

        print("Processing file: ", fileID)

        doidHits = SyngrepHitFile(doidFile, doidSyns)
        goHits = SyngrepHitFile(goFile, goSyns)

        # NOTE(review): sentDB is never read below; the construction is kept
        # in case it has side effects callers rely on -- confirm and remove.
        sentDB = SentenceDB(sentFile)

        for docID in fmaHits:

            docFMAHits = fmaHits.getHitsForDocument(docID)
            docDoidHits = doidHits.getHitsForDocument(docID)
            docGOHits = goHits.getHitsForDocument(docID)

            # No tissue hits at all.
            if len(docFMAHits) == 0:
                continue

            # Only documents that mention neutrophils are of interest.
            if not testMentioned(docFMAHits, neutrophilSynIDs):
                continue

            pmid = int(docID)
            subject2pmids['NEUTROPHIL'].add(pmid)

            if testMentioned(docFMAHits, tissueIDs):
                subject2pmids['TISSUES'].add(pmid)

            if testMentioned(docDoidHits, doidSynIDs):
                subject2pmids['DOID'].add(pmid)

            if testMentioned(docGOHits, goSynIDs):
                subject2pmids['INFLAMM'].add(pmid)

    return subject2pmids

コード例 #4

0

ファイルを表示

ファイル: createMIRGeneRels.PMC.py プロジェクト: mjoppich/miRExplore

def analyseFile(splitFileIDs, env):
    """Find gene/miRNA co-occurrences in a batch of PMC split files.

    A split file is processed only when both its miRNA index and its HGNC
    index contain hits. For each document present in both, the co-occurrences
    returned by ``findCooccurrences`` are accumulated. The accumulated list
    is then handed to ``printStuff``.

    Args:
        splitFileIDs: iterable of file-ID strings used to build file paths.
        env: not used by this function.

    Returns:
        None. Progress messages are written to stderr.
    """
    collected = []

    for splitFileID in splitFileIDs:
        indexName = splitFileID + ".index"

        mirnaHits = SyngrepHitFile(resultBase + "/mirna/" + indexName, mirnaSyns)
        if len(mirnaHits) == 0:
            continue

        hgncHits = SyngrepHitFile(resultBase + "/hgnc/" + indexName, hgncSyns)
        if len(hgncHits) == 0:
            continue

        relHits = SyngrepHitFile(resultBase + "/relations/" + indexName, relSyns)

        sentDB = SentenceDB("/mnt/c/dev/data/pmc/allsent/" + splitFileID + ".sent")

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in mirnaHits:
            # Only documents with hits in both indexes are of interest.
            if docID not in hgncHits:
                continue

            docMirnaHits = mirnaHits.getHitsForDocument(docID)
            docGeneHits = hgncHits.getHitsForDocument(docID)

            collected.extend(
                findCooccurrences(str(docID), docGeneHits, docMirnaHits,
                                  sentDB, relHits))

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=len(collected), ids=splitFileIDs))

    printed = printStuff(None, collected, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=len(collected), ids=splitFileIDs, printed=printed))

    return None

コード例 #5

0

ファイルを表示

ファイル: createFMAAssoc.py プロジェクト: mjoppich/miRExplore

    def analyseFile(splitFileIDs, env):
        """Collect anatomy-term mentions with their locations per document.

        For each split file with hits in the ``model_anatomy`` index, the
        synonym IDs found per document are gathered (hits whose matched text
        contains ``"and "`` are ignored). IDs that cannot be resolved by
        ``getTerm`` and IDs that are children (up to ``maxLevel=2``) of
        another found term are dropped. For each remaining ID a tuple
        ``(docID, term id, term name, locations)`` is recorded, where each
        location is ``(document id, position[0], position[1])``. The tuples
        are finally handed to ``printStuff``.

        Args:
            splitFileIDs: iterable of file-ID strings used to build paths.
            env: not used by this function.

        Returns:
            None. Progress messages are written to stderr.
        """
        fileCoocs = []

        for splitFileID in splitFileIDs:

            diseaseFile = resultBase + "/model_anatomy/" + splitFileID + ".index"

            diseaseHits = SyngrepHitFile(diseaseFile, diseaseSyns)
            if len(diseaseHits) == 0:
                continue

            sentFile = args.sentdir + "/" + splitFileID + ".sent"
            # NOTE(review): sentDB is never read below; the construction is
            # kept in case it has side effects -- confirm and remove.
            sentDB = SentenceDB(sentFile)

            sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

            for docID in diseaseHits:

                docHits = diseaseHits.getHitsForDocument(docID)

                synid2loc = defaultdict(list)
                allSynIDs = set()

                for hit in docHits:

                    if "and " in hit.foundSyn:
                        continue

                    allSynIDs.add(hit.synonym.id)
                    synid2loc[hit.synonym.id].append(
                        (str(hit.documentID), hit.position[0],
                         hit.position[1]))

                removeIDs = set()
                # Resolved terms are remembered so each ID is looked up once.
                termBySynID = {}

                for synID in allSynIDs:

                    gterm = getTerm(synID, fmaObo)
                    if gterm is None:
                        # Newline-terminated so consecutive warnings do not
                        # run together on stderr.
                        sys.stderr.write("Invalid synID: " + str(synID) + "\n")
                        removeIDs.add(synID)
                        continue

                    termBySynID[synID] = gterm
                    removeIDs.update(gterm.getAllChildren(maxLevel=2))

                for synID in allSynIDs:

                    if synID in removeIDs:
                        continue

                    # Unresolvable IDs were put into removeIDs above, so every
                    # ID reaching this point has a cached term.
                    gterm = termBySynID[synID]

                    fileCoocs.append(
                        (docID, gterm.id, gterm.name, synid2loc[synID]))

        sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
            cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

        printed = printStuff(None, fileCoocs, None)

        sys.stderr.write(
            "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
                cnt=str(len(fileCoocs)),
                ids=str(splitFileIDs),
                printed=printed))

        return None

コード例 #6

0

ファイルを表示

ファイル: createTextmining.py プロジェクト: mjoppich/miRExplore

def analyseFile(splitFileIDs, env):
    """Find gene/miRNA co-occurrences in PubMed split files and store them.

    For each split file ID the HGNC and miRNA hit indexes are loaded. For
    every document with hits in both, ``findCooccurrences`` is called; the
    results are accumulated and also grouped per gene and per miRNA id type.
    When the module-level ``db`` is truthy, an evidence node for the document
    and ``ST_MENTION`` relationships between genes/miRNAs and the document
    are created. The accumulated co-occurrences are finally handed to
    ``printStuff``.

    Args:
        splitFileIDs: iterable of file numbers/IDs; each is left-padded to
            four characters with zeros to build the ``pubmed18n`` file names.
        env: not used by this function.

    Returns:
        None. Progress messages are written to stderr and stdout.
    """

    fileCoocs = []

    for splitFileID in splitFileIDs:

        # Right-align in a width of four and turn the padding into zeros.
        fileID = "{:>4}".format(splitFileID).replace(" ", "0")

        hgncFile = resultBase + "/hgnc/pubmed18n" + fileID + ".index"
        mirnaFile = resultBase + "/mirna/pubmed18n" + fileID + ".index"
        sentFile = "/mnt/c/dev/data/pubmed/pubmed18n" + fileID + ".sent"

        # Skip the file as soon as either index turns out to be empty.
        mirnaHits = SyngrepHitFile(mirnaFile, mirnaSyns)
        if len(mirnaHits) == 0:
            continue

        hgncHits = SyngrepHitFile(hgncFile, hgncSyns)
        if len(hgncHits) == 0:
            continue

        sentDB = SentenceDB(sentFile)

        sys.stderr.write("Found something in: " + str(fileID) + "\n")

        for docID in mirnaHits:

            # Only documents with hits in both indexes are processed.
            if docID in hgncHits:

                mirnaSynHits = mirnaHits.getHitsForDocument(docID)
                hgncSynHits = hgncHits.getHitsForDocument(docID)

                # Debug aid (disabled): dump the synonyms for one document.
                #if docID == 'a27229723':
                #    [print(x.synonyme) for x in hgncSynHits]
                #    [print(x.synonyme) for x in mirnaSynHits]

                foundCoocs = findCooccurrences(str(docID), hgncSynHits,
                                               mirnaSynHits, sentDB)

                fileCoocs += foundCoocs

                # Group (gene, 'GENE', mirna, idtype) tuples by gene id; the
                # set removes duplicate associations within the document.
                assocByGene = defaultdict(set)
                for x in foundCoocs:

                    geneID = x.gene
                    geneLabel = 'GENE'
                    mirnaID = x.mirna
                    mirnaLabel = x.idtype

                    assoc = (geneID, geneLabel, mirnaID, mirnaLabel)
                    assocByGene[assoc[0]].add(assoc)

                # NOTE(review): addDocAsEvidence is assigned but not read
                # anywhere in this function.
                addDocAsEvidence = False
                assocByTypeForGene = {}
                for gene in assocByGene:
                    assocs = assocByGene[gene]

                    # One bucket per known miRNA id type (assoc[3]).
                    mimatSet = set()
                    miSet = set()
                    orgmirSet = set()
                    familySet = set()

                    for assoc in assocs:
                        if assoc[3] == 'MIRNA':
                            mimatSet.add(assoc)
                        elif assoc[3] == 'MIRNA_PRE':
                            miSet.add(assoc)
                        elif assoc[3] == 'MIRNA_ORGMIR':
                            orgmirSet.add(assoc)
                        elif assoc[3] == 'MIRNA_FAMILY':
                            familySet.add(assoc)
                        else:
                            # NOTE(review): the concatenation below requires
                            # docID to be a str; other call sites wrap it in
                            # str(docID) -- confirm.
                            print("Unknown relation in doc: " + docID)
                            print(assoc)

                    # Genes whose associations all had an unknown id type
                    # are left out.
                    if len(mimatSet) > 0 or len(miSet) > 0 or len(
                            orgmirSet) > 0 or len(familySet) > 0:

                        assocByTypeForGene[gene] = (mimatSet, miSet, orgmirSet,
                                                    familySet)

                    # TODO: filter assocs here so that the general version is not added when a taxid-specific version was found
                if len(assocByTypeForGene) > 0:

                    # db is a module-level handle; nothing is stored when it
                    # is falsy.
                    if db:
                        db.createNodeIfNotExists(['EVIDENCE', 'PUBMED'],
                                                 {'id': docID})
                        print("Adding: " + str(docID) + ": " +
                              str(assocByTypeForGene))
                    #
                    # TODO first create all unique edges
                    # TODO add genes to mirna edges and mirnas to gene edges to keep track from where an edge originates
                    # TODO edges should get weights = how many relations have been found
                    #

                    # geneEdges:  (gene id, gene label)   -> {(mirna id, mirna label)}
                    # mirnaEdges: (mirna id, mirna label) -> {(gene id, gene label)}
                    mirnaEdges = defaultdict(set)
                    geneEdges = defaultdict(set)

                    for gene in assocByTypeForGene:
                        assocsForGene = assocByTypeForGene[
                            gene]  # (mimatSet, miSet, orgmirSet, familySet)

                        for subSet in assocsForGene:
                            for cooc in subSet:

                                geneEdges[(cooc[0], cooc[1])].add(
                                    (cooc[2], cooc[3]))
                                mirnaEdges[(cooc[2], cooc[3])].add(
                                    (cooc[0], cooc[1]))

                    # One gene -> document relationship per distinct gene,
                    # carrying the ids of the miRNAs it co-occurred with.
                    for edge in geneEdges:

                        edgeMirnas = [x[0] for x in geneEdges[edge]]

                        if db:
                            db.createRelationship('gene', [edge[1]],
                                                  {'id': edge[0]}, 'pmid',
                                                  ['PUBMED'], {'id': docID},
                                                  ['ST_MENTION'], {
                                                      'type': 'GENE_MENTION',
                                                      'mirnas': edgeMirnas
                                                  })

                    # One document -> miRNA relationship per distinct miRNA,
                    # carrying the ids of the genes it co-occurred with.
                    for edge in mirnaEdges:

                        edgeGenes = [x[0] for x in mirnaEdges[edge]]
                        if db:
                            db.createRelationship('pmid', ['PUBMED'],
                                                  {'id': docID}, 'mi',
                                                  [edge[1]], {'id': edge[0]},
                                                  ['ST_MENTION'], {
                                                      'type': 'MIRNA_MENTION',
                                                      'genes': edgeGenes
                                                  })

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

    printed = printStuff(None, fileCoocs, None)

    sys.stderr.write(
        "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
            cnt=str(len(fileCoocs)), ids=str(splitFileIDs), printed=printed))

    return None

コード例 #7

0

ファイルを表示

def analyseFile(splitFileIDs, env):
    """Find entity co-occurrences in split files and print the accepted ones.

    For every ID in ``splitFileIDs`` the two entity hit indexes and the
    relation hit index are loaded. For each document that has hits in both
    entity indexes (and, when ``accept_pmids`` is set, is contained in it),
    ``findCooccurrences`` is called. Every result whose ``accepted()`` is
    true is printed to stdout immediately; all results are counted.

    Args:
        splitFileIDs: iterable of file-ID strings; each one is concatenated
            into the index and sentence file paths.
        env: not used by this function.

    Returns:
        None. Progress messages and the final counts are written to stderr.
    """
    fileCoocs = []
    # Number of relations actually written to stdout, for the final report.
    printedCount = 0

    for splitFileID in splitFileIDs:

        print(splitFileID, file=sys.stderr)

        ent1File = resultBase + "/" + args.folder1 + "/" + splitFileID + ".index"
        ent2File = resultBase + "/" + args.folder2 + "/" + splitFileID + ".index"
        relFile = resultBase + "/relations/" + splitFileID + ".index"

        sentFile = args.sentdir + "/" + splitFileID + ".sent"

        # Skip the file as soon as either entity index turns out to be empty.
        ent1Hits = SyngrepHitFile(ent1File,
                                  ent1Syns,
                                  sentIDNoText=args.sentid_no_text)
        if len(ent1Hits) == 0:
            continue

        ent2Hits = SyngrepHitFile(ent2File,
                                  ent2Syns,
                                  sentIDNoText=args.sentid_no_text)
        if len(ent2Hits) == 0:
            continue

        relHits = SyngrepHitFile(relFile,
                                 relSyns,
                                 sentIDNoText=args.sentid_no_text)

        # Sentences are only loaded once a document with hits in both
        # entity indexes is found.
        sentDB = None

        sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

        for docID in ent1Hits:

            # Optional allow-list of documents.
            if accept_pmids is not None and docID not in accept_pmids:
                continue

            if docID not in ent2Hits:
                continue

            if sentDB is None:
                sentDB = SentenceDB(sentFile,
                                    sent_no_byte=args.sent_no_byte)

            ent1SynHits = ent1Hits.getHitsForDocument(docID)
            ent2SynHits = ent2Hits.getHitsForDocument(docID)

            foundCoocs = findCooccurrences(str(docID), ent1SynHits,
                                           ent2SynHits, sentDB, relHits)

            for relEntEnt in foundCoocs:
                if relEntEnt.accepted():
                    print(str(relEntEnt), flush=True)
                    printedCount += 1

            fileCoocs += foundCoocs

    sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
        cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

    thisProcID = str(os.getpid())
    # "printed" is the number of accepted relations written above, which can
    # be smaller than the total number of relations found.
    sys.stderr.write(
        "{procID}: Found {cnt} (printed: {printed}) elems in files {ids}\n".
        format(cnt=str(len(fileCoocs)),
               ids=str(splitFileIDs),
               printed=printedCount,
               procID=thisProcID))

    return None

コード例 #8

0

ファイルを表示

    def analyseFile(splitFileIDs, env):
        """Collect organism mentions with their locations per document.

        For each split file with hits in the ``org`` index, the hit locations
        are grouped by synonym ID per document (hits whose matched text
        contains ``"and "`` are ignored). Synonym IDs that have a non-None
        entry in ``orgID2TLC`` are translated through it. One tuple
        ``(docID, comma-joined translated IDs, locations)`` is recorded per
        document, even when no ID could be translated. The tuples are finally
        handed to ``printStuff``.

        Args:
            splitFileIDs: iterable of file-ID strings used to build paths.
            env: not used by this function.

        Returns:
            None. Progress messages are written to stderr.
        """
        fileCoocs = []

        for splitFileID in splitFileIDs:

            diseaseFile = resultBase + "/org/" + splitFileID + ".index"

            diseaseHits = SyngrepHitFile(diseaseFile, diseaseSyns)
            if len(diseaseHits) == 0:
                continue

            sentFile = args.sentdir + "/" + splitFileID + ".sent"
            # NOTE(review): sentDB is never read below; the construction is
            # kept in case it has side effects -- confirm and remove.
            sentDB = SentenceDB(sentFile)

            sys.stderr.write("Found something in: " + str(splitFileID) + "\n")

            for docID in diseaseHits:

                docHits = diseaseHits.getHitsForDocument(docID)

                # synonym id -> [(document id, position[0], position[1]), ...]
                synid2loc = defaultdict(list)

                for hit in docHits:

                    if "and " in hit.foundSyn:
                        continue

                    synid2loc[hit.synonym.id].append(
                        (str(hit.documentID), hit.position[0],
                         hit.position[1]))

                orgCodes = []
                evs = []
                for synID, locations in synid2loc.items():

                    # Single lookup; IDs without a usable mapping are skipped.
                    orgCode = orgID2TLC.get(synID)
                    if orgCode is None:
                        continue

                    orgCodes.append(orgCode)
                    evs += locations

                fileCoocs.append((docID, ",".join(orgCodes), evs))

        sys.stderr.write("Found {cnt} elems in files {ids}\n".format(
            cnt=str(len(fileCoocs)), ids=str(splitFileIDs)))

        printed = printStuff(None, fileCoocs, None)

        sys.stderr.write(
            "Found {cnt} (printed: {printed}) elems in files {ids}\n".format(
                cnt=str(len(fileCoocs)),
                ids=str(splitFileIDs),
                printed=printed))

        return None