Esempio n. 1
0
    def senteniceFile(filenames, env):
        """Write publication dates and publication types of PubMed files.

        For each input file two tab-separated files are created below the
        module-level ``storagePath``:

        * ``<base>.date``    -- PMID followed by the elements of ``pub_date``
        * ``<base>.pubtype`` -- one ``PMID<TAB>type`` line per publication type

        Articles that fail to parse are reported on stderr and skipped.

        Args:
            filenames: iterable of paths to ``.xml.gz`` PubMed files.
            env: not used by this function; kept for the caller's signature.
        """
        for filename in filenames:
            print(filename)

            basefile = os.path.basename(filename)
            datefile = basefile.replace(".xml.gz", ".date")
            typefile = basefile.replace(".xml.gz", ".pubtype")

            pmid2date = {}
            pmid2types = defaultdict(set)

            with open(storagePath + datefile, 'w') as outdate, \
                    open(storagePath + typefile, "w") as outtype:

                pubmedParser = PubmedXMLParser()
                pubmedParser.parseXML(filename)

                for elem in PubmedArticleIterator(pubmedParser):

                    try:
                        entry = PubmedEntry.fromXMLNode(elem)

                        if entry is None:
                            continue

                        pmid2date[entry.pmid] = entry.pub_date

                        for dtype in entry.pub_types:
                            pmid2types[entry.pmid].add(dtype)

                    except Exception:
                        # Catch Exception rather than everything so that
                        # Ctrl-C / SystemExit still stop the run.
                        traceback.print_exc()
                        eprint("Exception", datefile)

                        # Best effort: name the offending article if its PMID
                        # can still be read from the raw node.
                        try:
                            eprint(elem.find('MedlineCitation/PMID').text)
                        except Exception:
                            pass

                        continue

                for pmid, pubDate in pmid2date.items():
                    print(pmid,
                          "\t".join([str(part) for part in pubDate]),
                          sep="\t",
                          file=outdate)

                for pmid, doctypes in pmid2types.items():
                    for doctype in doctypes:
                        print(pmid, doctype, sep="\t", file=outtype)
Esempio n. 2
0
    def createNodeKeyConstraint(self, label, properties, nodeName='n'):
        """Create a NODE KEY constraint -- currently disabled.

        The method only prints a notice to stderr and returns ``None``; the
        statement-building code below the early return is kept for reference
        and never runs.

        Args:
            label: label(s) handed to ``self.makeLabels``.
            properties: iterable of property names forming the node key.
            nodeName: variable name used for the node in the statement.

        Returns:
            Always ``None`` while the early return is in place.
        """
        eprint("WILL ONLY WORK WITH ENTERPRISE EDITION!")
        return None

        # --- intentionally unreachable (see docstring) ---------------------
        labelStr = self.makeLabels(label)
        propStr = ", ".join([nodeName + "." + x for x in properties])

        # The pattern variable has to be the same name the property list uses;
        # it was hard-coded to "n" before, which only matched the default.
        createConstraint = "CREATE CONSTRAINT ON ({node}{label}) ASSERT ({propstr}) IS NODE KEY".format(
            node=nodeName, label=labelStr, propstr=propStr)

        return self.runInDatabase(createConstraint)
Esempio n. 3
0
    def parseXML(self, path):
        """Load XML from a file or, failing that, from literal XML text.

        ``etree.parse(path)`` is tried first. If it raises, ``path`` is handed
        to ``etree.fromstring`` instead. If the argument's text contains
        ``.nxml``, ``self.remove_namespace`` is applied to the result.

        Args:
            path: location of an XML file, or a string holding XML.

        Returns:
            The parsed object, which is also stored in ``self.tree``.
            NOTE(review): the fallback yields whatever ``etree.fromstring``
            returns, which may be a different type than ``etree.parse``.

        Raises:
            Exception: re-raises the ``etree.fromstring`` error when both
                attempts fail (after logging it to stderr).
        """
        self.tree = None

        try:
            self.tree = etree.parse(path)
        except Exception:
            # Not loadable as a file -- interpret the argument as XML text.
            # Catching Exception (not everything) keeps Ctrl-C working.
            try:
                self.tree = etree.fromstring(path)
            except Exception as e:
                eprint("Unable to load graph:", str(e))
                raise

        # str() so that non-string paths (e.g. pathlib.Path) do not raise a
        # TypeError on the membership test.
        if '.nxml' in str(path):
            self.remove_namespace(self.tree)

        return self.tree
Esempio n. 4
0
    def runInDatabase(self, query):
        """Run ``query`` through ``self.session`` and return its result.

        Every call increments ``self.dbQueries``; the counter is printed at
        each multiple of 10000. The query text is printed when
        ``self.printQueries`` is set. With ``self.simulateDB`` set nothing is
        executed and ``None`` is returned.

        A ``neo4je.ClientError`` is written to stderr and the process exits
        with status -1.
        """
        self.dbQueries += 1

        # Coarse progress output for long imports.
        if not self.dbQueries % 10000:
            print(self.dbQueries)

        if self.printQueries:
            print(query)

        # Dry run: count and optionally print, but never touch the database.
        if self.simulateDB:
            return None

        try:
            return self.session.run(query)
        except neo4je.ClientError as e:
            eprint(e)
            exit(-1)
    def senteniceFile(filenames, env):
        """Split PubMed files into sentence, title, author and citation files.

        For each input file four files are written below the module-level
        ``storagePath``:

        * ``<base>.sent``     -- one sentence per line (``entry.to_sentences``)
        * ``<base>.title``    -- ``PMID<TAB>title``
        * ``<base>.author``   -- ``PMID<TAB>first<TAB>initials<TAB>last``
        * ``<base>.citation`` -- ``PMID<TAB>cited id`` (integer ids only)

        Articles that fail to parse are reported on stderr and skipped.

        Args:
            filenames: iterable of paths to ``.xml.gz`` PubMed files.
            env: not used by this function; kept for the caller's signature.
        """
        for filename in filenames:
            print(filename)

            basefile = os.path.basename(filename)
            sentfile = basefile.replace(".xml.gz", ".sent")
            titlefile = basefile.replace(".xml.gz", ".title")
            authorfile = basefile.replace(".xml.gz", ".author")
            citationfile = basefile.replace(".xml.gz", ".citation")

            pmid2title = {}
            pmid2authors = defaultdict(set)
            pmid2citations = defaultdict(set)

            with open(storagePath + sentfile, 'w') as outfile:

                pubmedParser = PubmedXMLParser()
                pubmedParser.parseXML(filename)

                for elem in PubmedArticleIterator(pubmedParser):

                    try:
                        entry = PubmedEntry.fromXMLNode(elem)

                        if entry is None:
                            continue

                        for sent in entry.to_sentences(tokenizer):
                            outfile.write(sent + "\n")

                        pmidID = entry.getID()

                        if entry.title is not None:
                            pmid2title[pmidID] = entry.title

                        if entry.authors:
                            # Stored as (first, initials, last); the entry
                            # keeps the last name at index 0.
                            for author in entry.authors:
                                pmid2authors[pmidID].add(
                                    (author[1], author[2], author[0]))

                        if entry.cites:
                            for cite in entry.cites:
                                # Only integer ids are kept.
                                try:
                                    val = int(cite)
                                except (TypeError, ValueError):
                                    continue
                                pmid2citations[pmidID].add(val)

                    except Exception:
                        # Catch Exception rather than everything so that
                        # Ctrl-C / SystemExit still stop the run.
                        eprint("Exception", sentfile)

                        # Best effort: name the offending article if its PMID
                        # can still be read from the raw node.
                        try:
                            eprint(elem.find('MedlineCitation/PMID').text)
                        except Exception:
                            pass

                        continue

            with open(storagePath + titlefile, 'w') as outfile:

                print(titlefile)

                for pmid, title in pmid2title.items():
                    if not title:
                        continue

                    print(pmid, title, sep="\t", file=outfile)

            with open(storagePath + authorfile, 'w') as outfile:

                print(authorfile)

                for pmid, authors in pmid2authors.items():
                    for author in authors:
                        # Missing name parts become empty columns.
                        first, initials, last = [
                            part if part is not None else ''
                            for part in author
                        ]

                        outfile.write(
                            str(pmid) + "\t" +
                            "\t".join([first, initials, last]) + "\n")

            with open(storagePath + citationfile, 'w') as outfile:

                print(citationfile)

                for pmid, citations in pmid2citations.items():
                    for quote in citations:
                        print(pmid, quote, sep="\t", file=outfile)
Esempio n. 6
0
]
# Handle the file IDs in descending order.
allfileIDs = sorted(allfileIDs, reverse=True)

# NOTE(review): not read anywhere in the visible part of the script.
addUnknownPubmeds = False

# Collect the 'id' property of every record returned for the PUBMED label;
# each record exposes its node under the key 'n' (the nodename passed in).
retVal = db.matchNodes(['PUBMED'], None, nodename='n')
relevantPMIDs = set()

for x in retVal:

    nodeData = x['n']
    if 'id' in nodeData.properties:
        pmid = nodeData.properties['id']
        relevantPMIDs.add(pmid)
    else:
        # Node without an 'id' property: report it and move on.
        eprint("No data in: ", str(nodeData))

# Only a warning -- the script carries on with an empty set.
if len(relevantPMIDs) == 0:
    eprint("No RELEVANT PUBMED entries found")


def analyseFile(splitFileID, relPMIDs):
    """Open the disease hit index belonging to one MEDLINE split file.

    ``splitFileID`` is right-aligned to four characters with blanks turned
    into zeros to form the index file name below ``resultBase``. The function
    returns ``None``; ``relPMIDs`` is not used in the visible body.
    """
    paddedID = format(splitFileID, ">4").replace(" ", "0")
    indexPath = resultBase + "/disease/medline17n{}.index".format(paddedID)

    diseaseHits = SyngrepHitFile(indexPath, diseaseMap)

    # Nothing to analyse for an empty hit file.
    if not len(diseaseHits):
        return None
Esempio n. 7
0
citedByFile = dataDir + "/miRExplore/pubmed_citedby.tsv"

if createCitationLists:
    retVal = db.matchNodes(['PUBMED'], None, nodename='n')

    print("Query finished")

    pmids = set()
    for x in retVal:
        nodeData = x['n']
        if 'id' in nodeData.properties:
            pmid = nodeData.properties['id']
            pmids.add(pmid)

        else:
            eprint("No data in: ", str(nodeData))

    print(len(pmids))

    store = CoCitationStore()
    foundCitations = store.getCites(pmids)
    foundCitedBy = store.getCitedBy(pmids)

    with open(citationFile, 'w') as outfile:
        for pmid in foundCitations:
            outfile.write(
                str(pmid) + "\t" +
                str(",".join([str(x) for x in foundCitations[pmid]])) + "\n")

    with open(citedByFile, 'w') as outfile:
        for pmid in foundCitedBy:
Esempio n. 8
0
# Create one CELLLINE node per entry and connect it to each of its species.
for id, node in id2node.items():
    db.createNodeIfNotExists(['CELLLINE'], node)

    allSpecies = id2species.get(id, set())
    # Flag stored on the relationship: true when exactly one species is known.
    cellLineUnique = len(allSpecies) == 1

    for species in allSpecies:
        try:
            taxID = int(species)
            db.createRelationship('tax', ['TAX'], {'id': taxID}, 'cell',
                                  ['CELLLINE'], node, ['HAS_CELLLINE'],
                                  {'unique': cellLineUnique})
        except Exception:
            # Deliberately best-effort: a non-numeric id or a failing
            # relationship insert is logged and the species skipped.
            # Exception (not a bare except) so Ctrl-C still aborts the import.
            eprint(str(species) + " is not a valid tax id in database")

# Connect each cell line to the cell lines it was derived from.
for id, allDerivatives in id2derived_from.items():

    for deriv in allDerivatives:

        if deriv not in id2node:
            eprint("Not in id2node: " + str(deriv))
            continue

        # NOTE: the relationship type spelling is kept exactly as-is, since
        # it is the name written to the database.
        db.createRelationship('id', ['CELLLINE'], {'id': id}, 'other',
                              ['CELLLINE'], {'id': deriv},
                              ['CELLINE_DERIVED_FROM'], None)
Esempio n. 9
0
    def senteniceFile(filenames, env):
        """Extract sentences and metadata from single-article XML files.

        Each input file is parsed and its root node is turned into one
        ``PubmedEntry``. Next to the input file (same directory) these files
        are written, with ``.xml`` in the file name replaced by the suffix:

        * ``.sent``     -- one sentence per line (``entry.to_sentences``)
        * ``.date``     -- ID followed by the elements of ``entry.created``
        * ``.pubtype``  -- ``ID<TAB>publication type``
        * ``.pmid``     -- ``ID<TAB>entry.pmc``
        * ``.title``    -- ``ID<TAB>title``
        * ``.author``   -- ``ID<TAB>first<TAB>initials<TAB>last``
        * ``.citation`` -- ``ID<TAB>cited id`` (integer ids only)

        Any error while processing an entry is reported on stderr and ends the
        process with status -1.

        Args:
            filenames: iterable of paths to ``.xml`` files.
            env: not used by this function; kept for the caller's signature.
        """
        for filename in filenames:
            print(filename)

            # Joined with os.path.join below: for a bare file name the
            # directory is "" and the output stays in the working directory
            # (dirname + "/" would have pointed at the filesystem root).
            storagePath = os.path.dirname(filename)

            basefile = os.path.basename(filename)
            sentfile = basefile.replace(".xml", ".sent")
            titlefile = basefile.replace(".xml", ".title")
            authorfile = basefile.replace(".xml", ".author")
            citationfile = basefile.replace(".xml", ".citation")
            datefile = basefile.replace(".xml", ".date")
            typefile = basefile.replace(".xml", ".pubtype")
            pmidfile = basefile.replace(".xml", ".pmid")

            pmid2title = {}
            pmid2authors = defaultdict(set)
            pmid2citations = defaultdict(set)

            with open(os.path.join(storagePath, sentfile), 'w') as outfile, \
                    open(os.path.join(storagePath, datefile), 'w') as outdate, \
                    open(os.path.join(storagePath, typefile), "w") as outtype, \
                    open(os.path.join(storagePath, pmidfile), "w") as outpmid:

                pubmedParser = PubmedXMLParser()
                pubmedParser.parseXML(filename)

                # The parsed tree itself is the only node handed to
                # PubmedEntry here.
                for elem in [pubmedParser.tree]:

                    try:
                        entry = PubmedEntry.fromXMLNode(elem)

                        if entry is None:
                            continue

                        for sent in entry.to_sentences(tokenizer):
                            outfile.write(sent + "\n")

                        pmidID = entry.getID()

                        if entry.created is not None:
                            print(pmidID,
                                  "\t".join([str(x) for x in entry.created]),
                                  sep="\t",
                                  file=outdate)

                        if entry.pub_types is not None:
                            for ept in entry.pub_types:
                                print(pmidID, ept, sep="\t", file=outtype)

                        if entry.pmc is not None:
                            print(pmidID, entry.pmc, sep="\t", file=outpmid)

                        if entry.title is not None:
                            pmid2title[pmidID] = entry.title

                        if entry.authors:
                            # Stored as (first, initials, last); the entry
                            # keeps the last name at index 0.
                            for author in entry.authors:
                                pmid2authors[pmidID].add(
                                    (author[1], author[2], author[0]))

                        if entry.cites:
                            for cite in entry.cites:
                                # Only integer ids are kept.
                                try:
                                    val = int(cite)
                                except (TypeError, ValueError):
                                    continue
                                pmid2citations[pmidID].add(val)

                    except Exception:
                        # Fail fast: the former recovery code after exit()
                        # could never run and has been dropped.
                        eprint("Exception", sentfile)
                        exit(-1)

            with open(os.path.join(storagePath, titlefile), 'w') as outfile:

                print(titlefile)

                for pmid, title in pmid2title.items():
                    if not title:
                        continue

                    print(pmid, title, sep="\t", file=outfile)

            with open(os.path.join(storagePath, authorfile), 'w') as outfile:

                print(authorfile)

                for pmid, authors in pmid2authors.items():
                    for author in authors:
                        # Missing name parts become empty columns.
                        first, initials, last = [
                            part if part is not None else ''
                            for part in author
                        ]

                        outfile.write(
                            str(pmid) + "\t" +
                            "\t".join([first, initials, last]) + "\n")

            with open(os.path.join(storagePath, citationfile), 'w') as outfile:

                print(citationfile)

                for pmid, citations in pmid2citations.items():
                    for quote in citations:
                        print(pmid, quote, sep="\t", file=outfile)