def run(medline_path, clean, start, end, PROCESSES): con = 'postgresql://*****:*****@localhost/'+db if end != None: end = int(end) if clean: PubMedDB.create_tables(db) PubMedDB.init(db) paths = [] for root, dirs, files in os.walk(medline_path): for filename in files: if os.path.splitext(filename)[-1] in [".xml", ".gz"]: paths.append(os.path.join(root,filename)) paths.sort() pool = Pool(processes=PROCESSES) # start with processors print "Initialized with ", PROCESSES, "processes" #result.get() needs global variable db now - that is why a line "db = options.database" is added in "__main__" - the variable db cannot be given to __start_parser in map_async() result = pool.map_async(_start_parser, paths[start:end]) res = result.get() #without multiprocessing: #for path in paths: # _start_parser(path) print "######################" print "###### Finished ######" print "######################"
def run(medline_path, clean, start, end, PROCESSES): con = 'postgresql://*****:*****@localhost/' + db if end != None: end = int(end) if clean: PubMedDB.create_tables(db) PubMedDB.init(db) paths = [] for root, dirs, files in os.walk(medline_path): for filename in files: if os.path.splitext(filename)[-1] in [".xml", ".gz"]: paths.append(os.path.join(root, filename)) paths.sort() pool = Pool(processes=PROCESSES) # start with processors print "Initialized with ", PROCESSES, "processes" #result.get() needs global variable db now - that is why a line "db = options.database" is added in "__main__" - the variable db cannot be given to __start_parser in map_async() result = pool.map_async(_start_parser, paths[start:end]) res = result.get() #without multiprocessing: #for path in paths: # _start_parser(path) print "######################" print "###### Finished ######" print "######################"
def run(self, medline_path, clean, start, end, PROCESSES): if end is not None: end = int(end) if clean: PubMedDB.create_tables(self.db_engine) paths = [] for root, dirs, files in os.walk(medline_path): for filename in files: if os.path.splitext(filename)[-1] in [".xml", ".gz"]: paths.append(os.path.join(root, filename)) # Don't reload what we've already got with FilePreloadScreener(paths, self.db_engine) as screener: paths = screener.exclude_loaded_files(paths) paths.sort() print "Running for %d files" % (len(paths), ) # result.get() needs global variable `db` now - that is why a line "db = options.database" is added in "__main__" - # the variable db cannot be given to __start_parser in map_async() if PROCESSES > 1 and len(paths) > 1: from contextlib import closing with closing(Pool(processes=PROCESSES)) as pool: print "Running multi-process with %d processes" % (PROCESSES, ) result = pool.map_async(_start_parser, paths[start:end]) res = result.get() # without multiprocessing: else: print "Running single process" for path in paths: _start_parser(path) print "######################" print "###### Finished ######" print "######################"