def downloadHTTP(url, out, fileSuffixFilter=None):
    """Download ``url`` to ``out``, keeping the old mtime if nothing changed.

    Nothing is downloaded when ``checkFileSuffixFilter(url, fileSuffixFilter)``
    returns a false value. If ``out`` already exists, its modification time
    and its ``pubrunner.calcSHA256`` digest are recorded and the file is
    removed before the download. When the freshly downloaded file has the
    same digest, the recorded modification time is put back.

    Args:
        url: Address passed to ``wget.download``.
        out: Destination path of the downloaded file.
        fileSuffixFilter: Forwarded unchanged to ``checkFileSuffixFilter``.
    """
    if not checkFileSuffixFilter(url, fileSuffixFilter):
        return

    hadPreviousCopy = os.path.isfile(out)
    if hadPreviousCopy:
        previousMtime = os.path.getmtime(out)
        previousDigest = pubrunner.calcSHA256(out)
        # The old copy is removed before wget writes the new one.
        # NOTE(review): if the download fails, the old copy is already gone.
        os.unlink(out)

    wget.download(url, out, bar=None)

    if not hadPreviousCopy:
        return

    currentDigest = pubrunner.calcSHA256(out)
    if previousDigest == currentDigest:
        # Same content as before, so restore the original modified date
        os.utime(out, (previousMtime, previousMtime))
def gatherPMIDs(inHashDir, outPMIDDir, whichHashes=None, pmidExclusions=None):
    """Assign every PMID to one PubMed file and write per-file PMID lists.

    Every JSON file directly inside ``inHashDir`` is loaded and merged into
    one mapping of ``{pubmedFilename: {pmid: hashValue}}``. The PubMed
    filenames are then visited in sorted order and a PMID is (re)assigned to
    a file each time its hash differs from the last hash recorded for it, so
    a PMID ends up attached to the file in which its hash last changed.
    For each PubMed filename a ``<basename>.pmids`` file (one PMID per line,
    ascending) is written into ``outPMIDDir``. An output file whose content
    did not change gets its previous modification time restored.

    NOTE(review): this file contains a later definition of ``gatherPMIDs``
    with the same signature; at import time the later one replaces this one.

    Args:
        inHashDir: Directory containing the JSON hash files.
        outPMIDDir: Directory for the ``.pmids`` files (created if missing).
        whichHashes: Optional list of keys; when given, only these entries of
            each PMID's hash record are compared.
        pmidExclusions: Optional container of integer PMIDs to leave out of
            the output.

    Raises:
        RuntimeError: A key in ``whichHashes`` is missing from a hash record.
    """
    # Check the age of inHashDir files and outPMIDDir files and skip the work
    # when the newest input is older than the newest output.
    if os.path.isdir(outPMIDDir):
        inHashDir_modifieds = [
            os.path.getmtime(os.path.join(root, name))
            for root, _, names in os.walk(inHashDir)
            for name in names
        ]
        # Walk the OUTPUT directory here; walking inHashDir again would
        # compare the inputs with themselves and never skip anything.
        outPMIDDir_modifieds = [
            os.path.getmtime(os.path.join(root, name))
            for root, _, names in os.walk(outPMIDDir)
            for name in names
        ]
        # Empty directories have no timestamps to compare (max() would raise),
        # so in that case fall through and do the work.
        if (inHashDir_modifieds and outPMIDDir_modifieds
                and max(inHashDir_modifieds) < max(outPMIDDir_modifieds)):
            print("No PMID update necessary")
            return

    files = sorted(os.path.join(inHashDir, f) for f in os.listdir(inHashDir))

    hashes = {}
    for filename in files:
        with open(filename) as f:
            hashes.update(json.load(f))

    pmidHashes = defaultdict(list)
    pmidToFilename = {}
    for filename in sorted(hashes):
        for pmid, fullHash in hashes[filename].items():
            if whichHashes is None:
                hashVal = fullHash
            else:
                try:
                    hashVal = [fullHash[h] for h in whichHashes]
                except KeyError as e:
                    raise RuntimeError(
                        "The selected hash (%s) from the 'usePubmedHashes' option has not been found in the hash files."
                        % (str(e)))

            pmidInt = int(pmid)
            # Only move the PMID to this file when its hash actually changed
            if pmidHashes[pmidInt] != hashVal:
                pmidHashes[pmidInt] = hashVal
                pmidToFilename[pmidInt] = filename

    filenameToPMIDs = defaultdict(list)
    for pmid, filename in pmidToFilename.items():
        filenameToPMIDs[filename].append(pmid)

    if not os.path.isdir(outPMIDDir):
        os.makedirs(outPMIDDir)

    for filename in sorted(hashes):
        basename = os.path.basename(filename)
        outName = os.path.join(outPMIDDir, basename + '.pmids')

        pmids = sorted(filenameToPMIDs[filename])
        if pmidExclusions is not None:
            pmids = [pmid for pmid in pmids if pmid not in pmidExclusions]

        fileAlreadyExists = os.path.isfile(outName)
        if fileAlreadyExists:
            timestamp = os.path.getmtime(outName)
            beforeHash = pubrunner.calcSHA256(outName)

        with open(outName, 'w') as f:
            for pmid in pmids:
                f.write("%d\n" % pmid)

        if fileAlreadyExists:
            afterHash = pubrunner.calcSHA256(outName)
            if beforeHash == afterHash:
                # File hasn't changed so move the modified date back
                os.utime(outName, (timestamp, timestamp))
def _loadPubmedHashFile(hashFilename):
    """Load one JSON hash file and return ``(pubmedXMLFile, {pmid: hash})``."""
    with open(hashFilename) as f:
        hashes = json.load(f)
    keys = list(hashes.keys())
    # Each hash file is expected to describe exactly one PubMed XML file
    assert len(keys) == 1
    return keys[0], hashes[keys[0]]


def _selectPubmedHash(fullHash, whichHashes):
    """Return the part of a PMID's hash record that is used for comparison."""
    if whichHashes is None:
        return fullHash
    try:
        return [fullHash[h] for h in whichHashes]
    except KeyError as e:
        raise RuntimeError(
            "The selected hash (%s) from the 'usePubmedHashes' option has not been found in the hash files."
            % (str(e)))


def gatherPMIDs(inHashDir, outPMIDDir, whichHashes=None, pmidExclusions=None):
    """Assign every PMID to one PubMed file and write per-file PMID lists.

    Each JSON file directly inside ``inHashDir`` must hold a single key (a
    PubMed XML filename) mapping to ``{pmid: hashValue}``. The hash files are
    read one at a time, in three passes, and never all held at once
    (presumably to limit memory use; per-PMID state lives in flat lists
    indexed by the integer PMID):

    1. find the largest PMID,
    2. record the first file each PMID appears in and how many files carry it,
    3. walk the files newest-to-oldest and, for PMIDs present in more than
       one file, move the PMID back to older files for as long as the hash
       stays identical to the newer version.

    For each PubMed XML filename a ``<basename>.pmids`` file (one PMID per
    line, ascending) is written into ``outPMIDDir``. An output file whose
    content did not change gets its previous modification time restored.

    Args:
        inHashDir: Directory containing the JSON hash files.
        outPMIDDir: Directory for the ``.pmids`` files (created if missing).
        whichHashes: Optional list of keys; when given, only these entries of
            each PMID's hash record are compared.
        pmidExclusions: Optional container of integer PMIDs to leave out of
            the output.

    Raises:
        RuntimeError: A key in ``whichHashes`` is missing from a hash record.
        AssertionError: A hash file does not contain exactly one top-level key.
    """
    # Check the age of inHashDir files and outPMIDDir files and skip the work
    # when the newest input is older than the newest output.
    if os.path.isdir(outPMIDDir):
        inHashDir_modifieds = [
            os.path.getmtime(os.path.join(root, name))
            for root, _, names in os.walk(inHashDir)
            for name in names
        ]
        # Walk the OUTPUT directory here; walking inHashDir again would
        # compare the inputs with themselves and never skip anything.
        outPMIDDir_modifieds = [
            os.path.getmtime(os.path.join(root, name))
            for root, _, names in os.walk(outPMIDDir)
            for name in names
        ]
        # Empty directories have no timestamps to compare (max() would raise),
        # so in that case fall through and do the work.
        if (inHashDir_modifieds and outPMIDDir_modifieds
                and max(inHashDir_modifieds) < max(outPMIDDir_modifieds)):
            print("No PMID update necessary")
            return

    files = sorted(os.path.join(inHashDir, f) for f in os.listdir(inHashDir))

    # Pass 1: largest PMID, so the per-PMID lists can be allocated up front
    maxPmidInt = -1
    for filename in files:
        _, pmidRecords = _loadPubmedHashFile(filename)
        if pmidRecords:  # a file without PMIDs contributes nothing
            maxPmidInt = max(maxPmidInt, max(map(int, pmidRecords)))

    # Pass 2: first file per PMID and number of files that contain it
    pubmedXMLFiles = []
    firstFile = [None] * (maxPmidInt + 1)
    versionCounts = [0] * (maxPmidInt + 1)
    for filename in files:
        pubmedXMLFile, pmidRecords = _loadPubmedHashFile(filename)
        pubmedXMLFiles.append(pubmedXMLFile)
        for pmid in pmidRecords:
            pmidInt = int(pmid)
            if firstFile[pmidInt] is None:
                firstFile[pmidInt] = pubmedXMLFile
            versionCounts[pmidInt] += 1

    # PMIDs that occur once stay with their only file; the rest are refined below
    pmidToFilename = list(firstFile)

    # Pass 3: newest file first, compare each version with the next newer one
    runningHashes = {}
    for filename in reversed(files):
        pubmedXMLFile, pmidRecords = _loadPubmedHashFile(filename)
        for pmid in pmidRecords:
            pmidInt = int(pmid)

            # Only one version of this PMID (or already settled) so there is
            # nothing to track
            if versionCounts[pmidInt] == 1:
                continue

            hashVal = _selectPubmedHash(pmidRecords[pmid], whichHashes)

            if pmidInt in runningHashes and runningHashes[pmidInt] != hashVal:
                # This older version differs: keep pmidToFilename at the newer
                # version and stop looking at this PMID
                versionCounts[pmidInt] = 1
                del runningHashes[pmidInt]
            else:
                # No newer version, or an identical one: this file becomes the
                # PMID's file and its hash the one to compare against
                runningHashes[pmidInt] = hashVal
                pmidToFilename[pmidInt] = pubmedXMLFile

            # The oldest occurrence has been reached, so the hash is no longer needed
            if firstFile[pmidInt] == pubmedXMLFile and pmidInt in runningHashes:
                del runningHashes[pmidInt]

    filenameToPMIDs = defaultdict(list)
    for pmid, filename in enumerate(pmidToFilename):
        if filename is not None:
            filenameToPMIDs[filename].append(pmid)

    if not os.path.isdir(outPMIDDir):
        os.makedirs(outPMIDDir)

    for filename in pubmedXMLFiles:
        basename = os.path.basename(filename)
        outName = os.path.join(outPMIDDir, basename + '.pmids')

        pmids = filenameToPMIDs[filename]
        if pmidExclusions is not None:
            pmids = [pmid for pmid in pmids if pmid not in pmidExclusions]

        fileAlreadyExists = os.path.isfile(outName)
        if fileAlreadyExists:
            timestamp = os.path.getmtime(outName)
            beforeHash = pubrunner.calcSHA256(outName)

        with open(outName, 'w') as f:
            for pmid in pmids:
                f.write("%d\n" % pmid)

        if fileAlreadyExists:
            afterHash = pubrunner.calcSHA256(outName)
            if beforeHash == afterHash:
                # File hasn't changed so move the modified date back
                os.utime(outName, (timestamp, timestamp))