def runIndexerCLITool(args):
    "Wrapper that runs the indexer using an argparse args object"
    #
    # Initialize two connections: one for read/write operations and a separate
    # one for high-volume push (indexing) operations that does not support reads.
    # As ZMQ contexts are quite heavyweight, use a shared context for both.
    # The extra network connection does not incur significant overhead,
    # especially when used over unix domain socket connections.
    #
    import zmq
    context = zmq.Context()
    rwDB = DocumentDB.YakDBDocumentDatabase(mode="REQ", context=context)
    pushDB = DocumentDB.YakDBDocumentDatabase(mode="PUSH", context=context)
    # Initialize indexer
    indexer = TranslatronDocumentIndexer(rwDB, pushDB)
    # Run the requested indexing steps
    didAnything = False
    if not args.no_documents:
        didAnything = True
        indexer.indexAllDocuments()
    if not args.no_entities:
        didAnything = True
        indexer.indexAllEntities()
    if args.statistics:
        didAnything = True
        indexer.printTokenFrequency()
    if not didAnything:
        print("No indexer action specified, use --help to show available actions")
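# --- Illustrative sketch, not part of the original source ---
# runIndexerCLITool() only reads args.no_documents, args.no_entities and
# args.statistics, so a minimal argparse setup driving it could look like
# this; the function name, option names and help texts are assumptions.
import argparse

def buildIndexerArgumentParser():
    parser = argparse.ArgumentParser(description="Run the Translatron indexer")
    parser.add_argument("--no-documents", action="store_true",
                        help="Skip indexing documents")
    parser.add_argument("--no-entities", action="store_true",
                        help="Skip indexing entities")
    parser.add_argument("--statistics", action="store_true",
                        help="Print token frequency statistics")
    return parser

# Example usage: runIndexerCLITool(buildIndexerArgumentParser().parse_args())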
def runPMCImporterCLITool(args):
    # Open tables with REQ/REP connection
    DocumentDB.YakDBDocumentDatabase(mode="REQ")
    # Worker threads will have individual DB connections
    parser = PMCTARParser(numWorkers=args.workers)
    for infile in args.infile:
        if infile.endswith(".tar.gz"):
            parser.processPMCTarGZ(
                infile,
                filterStr=args.filter,
                contentFilterStr=args.content_filter.lower().encode("utf-8"))
        elif infile.endswith(".nxml") or infile.endswith(".xml"):
            parser.processPMCXML(infile)
def run(self):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    for data in iter(self.queue.get, None):
        # Convert XML string to document object
        doc = processPMCFileContent(data)
        if doc is None:
            continue  # Parse error
        self.writeQueue.append(doc)
        # Write if write queue size has been reached
        if len(self.writeQueue) >= 128:
            db.writeDocuments(self.writeQueue)
            self.writeQueue.clear()
    # Flush remaining documents
    if self.writeQueue:
        db.writeDocuments(self.writeQueue)
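# --- Illustrative sketch, not part of the original source ---
# run() consumes XML strings from self.queue until it receives the None
# sentinel (iter(self.queue.get, None)), so the producer side must enqueue
# one sentinel per worker thread. Function and variable names are assumptions.
import queue

def feedDocumentWorkers(xmlContents, workQueue, numWorkers):
    # Enqueue all raw XML file contents for the worker threads
    for xml in xmlContents:
        workQueue.put(xml)
    # One termination sentinel per worker so every run() loop exits
    for _ in range(numWorkers):
        workQueue.put(None)

# Example usage:
#   workQueue = queue.Queue()
#   feedDocumentWorkers(xmlContents, workQueue, numWorkers=4)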
def importMeSH(args, infile):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    # NOTE: MeSH 2015 contains only 27k entities
    batch = db.entityIdx.newWriteBatch(chunkSize=40000)
    print(green("Starting to import entities from %s" % infile))
    # Read file (use a separate name to avoid shadowing the infile parameter)
    with open(infile, "r") as meshFile:
        writeStartTime = time.time()
        for mesh in readMeSH(meshFile):
            # Write entity to database
            batch.writeEntity(meshEntryToEntity(mesh))
            # Statistics
            if batch.numWrites % 5000 == 0:
                deltaT = time.time() - writeStartTime
                entityWriteRate = batch.numWrites / deltaT
                print("Wrote %d entities at %.1f e/s"
                      % (batch.numWrites, entityWriteRate))
    print("Wrote overall %d entities" % batch.numWrites)
def importUniprot(args, infile):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    batch = db.entityIdx.newWriteBatch(chunkSize=25000)
    print(green("Starting to import entities from %s" % infile))
    # Read the UniProt file; zcat is about 5-10 times faster and
    # distributes load over multiple cores.
    p = subprocess.Popen(["zcat", infile], stdout=subprocess.PIPE)
    writeStartTime = time.time()
    for uniprot in readUniprot(p.stdout):
        # Write entity to database
        batch.writeEntity(uniprotEntryToEntity(uniprot))
        # Statistics
        if batch.numWrites % 10000 == 0:
            deltaT = time.time() - writeStartTime
            entityWriteRate = batch.numWrites / deltaT
            print("Wrote %d entities at %.1f e/s"
                  % (batch.numWrites, entityWriteRate))
    # Wait for subprocess to exit
    p.communicate()
    print("Wrote overall %d entities" % batch.numWrites)
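# --- Illustrative sketch, not part of the original source ---
# importUniprot() shells out to zcat for speed; where zcat is not available,
# a pure-Python fallback could decompress in-process with gzip.open, at the
# cost of keeping decompression on a single core. Function name is an assumption.
import gzip

def openUniprotDump(infile):
    # Returns a binary file-like object, comparable to p.stdout above
    return gzip.open(infile, "rb")

# Example usage: for uniprot in readUniprot(openUniprotDump(infile)): ...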
def importWikimediaPagelist(args, infile):
    db = DocumentDB.YakDBDocumentDatabase(mode="PUSH")
    batch = db.entityIdx.newWriteBatch(chunkSize=100000)
    print(green("Starting to import entities from %s" % infile))
    writeStartTime = time.time()
    for (pageId, pageTitle) in readWikimediaFile(infile):
        # Write entity to database
        pageIdStr = pageId.decode("utf-8")
        batch.writeEntity({
            "id": "Wikipedia:" + pageIdStr,
            "name": pageTitle,
            "source": "Wikipedia",
            "type": "Encyclopedia entry",
            "ref": {"Wikipedia": [pageIdStr]},
        })
        # Statistics
        if batch.numWrites % 10000 == 0:
            deltaT = time.time() - writeStartTime
            entityWriteRate = batch.numWrites / deltaT
            print("Wrote %d entities at %.1f e/s"
                  % (batch.numWrites, entityWriteRate))
    print("Wrote overall %d entities" % batch.numWrites)
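# --- Illustrative example with hypothetical data, not from the original source ---
# For a page list row with id b"12345" and title "Aspirin",
# importWikimediaPagelist() would write the following entity record:
exampleWikipediaEntity = {
    "id": "Wikipedia:12345",
    "name": "Aspirin",
    "source": "Wikipedia",
    "type": "Encyclopedia entry",
    "ref": {"Wikipedia": ["12345"]},
}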
#!/usr/bin/env python3
"""
Utility to traverse all Translatron entities and print every database they
reference, together with how many entities reference it.

This is rarely useful and therefore not integrated into the main Translatron CLI
"""
from Translatron import DocumentDB
from collections import Counter

if __name__ == "__main__":
    db = DocumentDB.YakDBDocumentDatabase(mode="REQ")
    databases = Counter()
    for _, entity in db.iterateEntities():
        if b"ref" in entity:
            # Count one occurrence per referenced database
            # (separate name to avoid shadowing the DB connection)
            for refDatabase in entity[b"ref"].keys():
                databases[refDatabase] += 1
    for database, cnt in databases.items():
        print(database.decode("utf-8") + "," + str(cnt))