def doFetch(options, outputFolder):
    '''The main fetch function. Returns the number of failures.

    Fetches (or reuses) the parsed index for the requested data type, builds
    the list of files and URLs in the requested frame range, downloads them
    into outputFolder, then validates what arrived.

    Arguments:
      options      - Parsed options object. Fields read here: site, type,
                     stopAfterIndexFetch, allFrames, startFrame, stopFrame,
                     ignoreMissingLidar, frameSkip, maxNumLidarToFetch,
                     dryRun, skipValidate. Fields that may be modified in
                     place: refetchIndex, startFrame, stopFrame,
                     maxNumLidarToFetch.
      outputFolder - Folder in which the fetched files are stored.

    Returns:
      -1 if an authentication file is missing, 0 if options.stopAfterIndexFetch
      is set, the result of fetchNavData() for type 'nav', and otherwise the
      number of entries recorded as missing or invalid.
    '''

    # Verify that required files exist
    home = os.path.expanduser("~")
    if not (os.path.exists(home + '/.netrc') and os.path.exists(home + '/.urs_cookies')):
        logger.error('Missing a required authentication file! See instructions here:\n' +
                     ' https://nsidc.org/support/faq/what-options-are-available-bulk-' +
                     'downloading-data-https-earthdata-login-enabled')
        return -1

    curlPath = asp_system_utils.which("curl")
    curlOpts = ' -n -L '
    cookiePaths = ' -b ~/.urs_cookies -c ~/.urs_cookies '
    baseCurlCmd = curlPath + curlOpts + cookiePaths

    logger.info('Creating output folder: ' + outputFolder)
    os.system('mkdir -p ' + outputFolder)

    isSouth = (options.site == 'AN')

    if options.type == 'nav':  # Nav fetching is much less complicated
        return fetchNavData(options, outputFolder)

    parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
    if not icebridge_common.fileNonEmpty(parsedIndexPath):
        # Some dirs are weird, both images, fireball dems, and ortho.
        # Just accept whatever there is, but with a warning.
        logger.info('Warning: Missing index file: ' + parsedIndexPath)

    # Store file information in a dictionary
    logger.info('Reading file list from ' + parsedIndexPath)
    try:
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)
    except Exception:
        # We probably ran into old format index file. Must refetch.
        # (Exception, not a bare except, so Ctrl-C is not mistaken for a
        # bad index file.)
        logger.info('Could not read index file. Try again.')
        options.refetchIndex = True
        parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)

    if options.stopAfterIndexFetch:
        return 0

    isLidar = (options.type in LIDAR_TYPES)

    allFrames = sorted(frameDict.keys())

    if not isLidar:
        # The lidar frames use a totally different numbering than the
        # image/ortho/dem frames, so the range is computed only here.
        firstFrame = icebridge_common.getLargestFrame()   # start big
        lastFrame = icebridge_common.getSmallestFrame()   # start small
        if len(allFrames) > 0:
            # allFrames is sorted, so its ends are the extreme frames.
            firstFrame = min(firstFrame, allFrames[0])
            lastFrame = max(lastFrame, allFrames[-1])

        if options.allFrames:
            options.startFrame = firstFrame
            options.stopFrame = lastFrame

    if isLidar:
        # Based on image frames, determine which lidar frames to fetch.
        if options.ignoreMissingLidar and len(frameDict.keys()) == 0:
            # Nothing we can do if this run has no lidar and we are told to continue
            logger.info("Warning: missing lidar, but continuing.")
            lidarsToFetch = set()
        else:
            lidarsToFetch = lidarFilesInRange(frameDict, outputFolder,
                                              options.startFrame, options.stopFrame)

    # There is always a chance that not all requested frames are available.
    # That is particularly true for Fireball DEMs. Instead of failing,
    # just download what is present and give a warning.
    if options.startFrame not in frameDict and not isLidar:
        logger.info("Warning: Frame " + str(options.startFrame) +
                    " is not found in this flight.")

    if options.stopFrame and (options.stopFrame not in frameDict) and not isLidar:
        logger.info("Warning: Frame " + str(options.stopFrame) +
                    " is not found in this flight.")

    allFilesToFetch = []  # Files that we will fetch, relative to the current dir.
    allUrlsToFetch = []   # Full url of each file.

    hasTfw = (options.type == 'fireball')
    hasXml = (isLidar or (options.type == 'ortho') or hasTfw)

    # Loop through all found frames within the provided range
    skipCount = 0
    for frame in allFrames:

        # Skip frame outside of range
        if isLidar:
            if frameDict[frame] not in lidarsToFetch:
                continue
        else:
            if (frame < options.startFrame) or (frame > options.stopFrame):
                continue

        # Handle the frame skip option: skip frameSkip frames, keep one, repeat.
        if options.frameSkip > 0:
            if skipCount < options.frameSkip:
                skipCount += 1
                continue
            skipCount = 0

        mainFile = frameDict[frame]

        # Some files have an associated xml file. Fireball DEMs also have a tfw file.
        currFilesToFetch = [mainFile]
        if hasXml:
            currFilesToFetch.append(icebridge_common.xmlFile(mainFile))
        if hasTfw:
            currFilesToFetch.append(icebridge_common.tfwFile(mainFile))

        for filename in currFilesToFetch:
            allFilesToFetch.append(os.path.join(outputFolder, filename))
            allUrlsToFetch.append(os.path.join(urlDict[frame], filename))

    # Restrict lidar fetch amount according to the parameter
    if (isLidar and options.maxNumLidarToFetch > 0 and
            len(allFilesToFetch) > options.maxNumLidarToFetch):

        # Ensure an even number, to fetch both the lidar file and its xml
        if options.maxNumLidarToFetch % 2 == 1:
            options.maxNumLidarToFetch += 1

        allFilesToFetch = allFilesToFetch[0:options.maxNumLidarToFetch]
        allUrlsToFetch = allUrlsToFetch[0:options.maxNumLidarToFetch]

    icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL, options.dryRun,
                                         outputFolder, allFilesToFetch, allUrlsToFetch,
                                         logger)

    # Fetch from disk the set of already validated files, if any
    validFilesList = icebridge_common.validFilesList(os.path.dirname(outputFolder),
                                                     options.startFrame, options.stopFrame)
    validFilesSet = set()
    validFilesSet = icebridge_common.updateValidFilesListFromDisk(validFilesList,
                                                                  validFilesSet)
    numInitialValidFiles = len(validFilesSet)

    # Verify that all files were fetched and are in good shape
    failedFiles = []
    filesToValidate = allFilesToFetch
    if options.skipValidate:
        filesToValidate = []

    for outputPath in filesToValidate:

        if not icebridge_common.fileNonEmpty(outputPath):
            logger.info('Missing file: ' + outputPath)
            failedFiles.append(outputPath)
            continue

        # Image validation (icebridge_common.isValidImage) is intentionally not
        # done here: that check is very slow. This affects only jpegs, as the
        # other files are validated via the checksum. Jpegs get validated when
        # they are converted to 1 band images.

        # Sanity check: XML files must be parseable for their latitude.
        if icebridge_common.fileExtension(outputPath) == '.xml':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                continue  # previously validated
            if os.path.exists(outputPath):
                try:
                    icebridge_common.parseLatitude(outputPath)
                    logger.info('Valid file: ' + outputPath)
                    validFilesSet.add(outputPath)  # mark it as validated
                except Exception:
                    # Corrupted file
                    logger.info("Failed to parse latitude, will wipe: " + outputPath)
                    if os.path.exists(outputPath):
                        os.remove(outputPath)
                    failedFiles.append(outputPath)
                # Files with a wrong (but parseable) latitude are deliberately
                # not wiped, as otherwise the next fetch would have to
                # download them again.

        # Verify the checksum of data files that come with an xml file
        if hasXml and len(outputPath) >= 4 and outputPath[-4:] != '.xml' \
               and outputPath[-4:] != '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                continue  # previously validated
            if not icebridge_common.hasValidChkSum(outputPath, logger):
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid data. Will wipe: ' + outputPath + ' ' + xmlFile)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            logger.info('Valid file: ' + outputPath)
            validFilesSet.add(outputPath)

        if hasTfw and icebridge_common.fileExtension(outputPath) == '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                continue  # previously validated
            if not icebridge_common.isValidTfw(outputPath, logger):
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid tfw. Will wipe: ' + outputPath + ' ' + xmlFile)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            logger.info('Valid tfw file: ' + outputPath)
            validFilesSet.add(outputPath)

    # Write to disk the list of validated files, but only if new
    # validations happened.  First re-read that list, in case a
    # different process modified it in the meantime, such as if two
    # managers are running at the same time.
    numFinalValidFiles = len(validFilesSet)
    if numInitialValidFiles != numFinalValidFiles:
        validFilesSet = \
            icebridge_common.updateValidFilesListFromDisk(validFilesList, validFilesSet)
        icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    numFailed = len(failedFiles)
    if numFailed > 0:
        logger.info("Number of files that could not be processed: " + str(numFailed))

    return numFailed
def fetchAndParseIndexFileAux(isSouth, separateByLat, dayVal,
                              baseCurlCmd, folderUrl, path, fileType):
    '''Retrieve the index file for a folder of data and create
    a parsed version of it that contains frame number / filename pairs.

    Arguments:
      isSouth       - Passed to hasGoodLat() when separating files by latitude.
      separateByLat - If true, fetch the xml file of every listed file and drop
                      the entries whose latitude fails hasGoodLat().
      dayVal        - 0 or 1. Used only for 'ortho' and 'fireball' when a big
                      timestamp gap is found: 0 keeps the files after the gap,
                      anything else keeps the files before it.
      baseCurlCmd   - curl command prefix to which the URL is appended.
      folderUrl     - URL of the remote folder whose listing is fetched.
      path          - Local path where the html listing is temporarily stored;
                      the file is deleted once it has been read.
      fileType      - One of 'jpeg', 'ortho', 'fireball', 'lvis', 'atm1',
                      'atm2'. Any other value yields empty dictionaries.

    Returns:
      (frameDict, urlDict): frame number -> file name, and frame number ->
      folderUrl.

    Raises:
      Exception if two files of a non-lidar type share a frame number.
    '''

    # Download the html file. The shell is needed for the output redirect.
    curlCmd = baseCurlCmd + ' ' + folderUrl + ' > ' + path
    logger.info(curlCmd)
    p = subprocess.Popen(curlCmd, shell=True)
    p.wait()

    # Find all the file names in the index file and
    #  dump them to a new index file
    logger.info('Extracting file name list from index.html file...')
    with open(path, 'r') as f:
        indexText = f.read()

    # Must wipe this html file. We fetch it too often in different
    # contexts.  If not wiped, the code fails to work in some
    # very rare but real situations.
    if os.path.exists(path):
        os.remove(path)

    # Pattern for the file names of each type. Raw strings keep '\w' from
    # being treated as a (invalid) string escape; the regex text is unchanged.
    # Example names for the ATM types:
    #   atm1: >ILATM1B_20111018_145455.ATM4BT4.qi
    #      or >ILATM1B_20091016_165112.atm4cT3.qi
    #   atm2: >ILATM1B_20160713_195419.ATM5BT5.h5
    namePatterns = {
        'jpeg':     r">[0-9_]*.JPG",
        'ortho':    r">DMS\w*.tif<",
        'fireball': r">IODMS\w*DEM.tif",   # Fireball DEMs
        'lvis':     r">ILVIS\w+.TXT",
        'atm1':     r">ILATM1B[0-9_]*.ATM4\w+.qi",
        'atm2':     r">ILATM1B[0-9_]*.ATM\w+.h5",
    }

    # Extract just the file names
    fileList = []  # ensure initialization
    if fileType in namePatterns:
        fileList = re.findall(namePatterns[fileType], indexText, re.IGNORECASE)

    # Get rid of '>' and '<'
    fileList = [name.replace(">", "").replace("<", "") for name in fileList]

    # Some runs, eg, https://n5eil01u.ecs.nsidc.org/ICEBRIDGE/IODMS1B.001/2015.09.24
    # have files for both GR and AN, with same frame number. Those need to be separated
    # by latitude. This is a problem only with orthoimages.
    badXmls = set()
    outputFolder = os.path.dirname(path)
    if separateByLat:
        allFilesToFetch = []
        allUrlsToFetch = []
        for filename in fileList:
            xmlFile = icebridge_common.xmlFile(filename)
            allFilesToFetch.append(os.path.join(outputFolder, xmlFile))
            allUrlsToFetch.append(os.path.join(folderUrl, xmlFile))

        dryRun = False
        icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                             dryRun, outputFolder,
                                             allFilesToFetch, allUrlsToFetch,
                                             logger)

        # Mark the bad ones
        for xmlFile in allFilesToFetch:
            latitude = icebridge_common.parseLatitude(xmlFile)
            if not hasGoodLat(latitude, isSouth):
                badXmls.add(xmlFile)

    elif (fileType == 'ortho' or fileType == 'fireball'):
        # Sometimes there is a large gap in the timestamp. That means orthoimages
        # from previous day are spilling over. If dayVal is 0, we must ignore
        # the spillover images. If dayVal is 1, we must keep the spillover images
        # and ignore the others.
        list1 = []  # files before the gap
        list2 = []  # files from the gap onwards
        isBigGap = False
        prevStamp = -1
        for filename in fileList:
            imageTimeString = icebridge_common.parseTimeStamps(filename)[1]
            currStamp = float(imageTimeString) / 1000000.0  # hours
            if prevStamp < 0:
                # First file: nothing to compare against yet.
                list1.append(filename)
                prevStamp = currStamp
                continue

            # Note that once isBigGap becomes true, it stays true
            # even when the gap gets small again
            if currStamp - prevStamp >= 6:  # six hour gap is a lot
                isBigGap = True
            if not isBigGap:
                list1.append(filename)
            else:
                list2.append(filename)

            prevStamp = currStamp  # for next iteration

        if isBigGap:
            if dayVal == 0:
                fileList = list2[:]  # current day
            else:
                fileList = list1[:]  # spillover from prev day

    # For each entry that matched the regex, record: the frame number and the file name.
    frameDict = {}
    urlDict = {}
    badFiles = []
    for filename in fileList:

        if len(badXmls) > 0:
            xmlFile = os.path.join(outputFolder, icebridge_common.xmlFile(filename))
            if xmlFile in badXmls:
                continue

        frame = icebridge_common.getFrameNumberFromFilename(filename)
        if frame in frameDict:
            # The same frame must not occur twice (lidar types are exempt).
            if fileType not in LIDAR_TYPES:
                logger.error("Error: Found two file names with same frame number: " +
                             frameDict[frame] + " and " + filename)
                badFiles.append(filename)
                badFiles.append(frameDict[frame])

        # note that folderUrl can vary among orthoimages, as sometimes
        # some of them are in a folder for the next day.
        frameDict[frame] = filename
        urlDict[frame] = folderUrl

    # Wipe them all, to be sorted later
    # NOTE(review): the names in badFiles come straight from the html listing,
    # so these existence checks are relative to the current working directory,
    # not to outputFolder -- confirm that this is intended.
    for badFile in badFiles:
        if os.path.exists(badFile):
            logger.info("Deleting: " + badFile)
            os.remove(badFile)
        xmlFile = icebridge_common.xmlFile(badFile)
        if os.path.exists(xmlFile):
            logger.info("Deleting: " + xmlFile)
            os.remove(xmlFile)

    if len(badFiles) > 0:
        raise Exception("Found files with same frame number")

    return (frameDict, urlDict)
def doFetch(options, outputFolder):
    '''The main fetch function. Returns the number of failures.

    Fetches (or reuses) the parsed index for the requested data type, builds
    the list of files and URLs in the requested frame range, downloads them
    into outputFolder, then validates what arrived.

    Arguments:
      options      - Parsed options object. Fields read here: site, type,
                     stopAfterIndexFetch, allFrames, startFrame, stopFrame,
                     ignoreMissingLidar, frameSkip, maxNumLidarToFetch,
                     dryRun, skipValidate. Fields that may be modified in
                     place: refetchIndex, startFrame, stopFrame,
                     maxNumLidarToFetch.
      outputFolder - Folder in which the fetched files are stored.

    Returns:
      -1 if an authentication file is missing, 0 if options.stopAfterIndexFetch
      is set, the result of fetchNavData() for type 'nav', and otherwise the
      number of entries recorded as missing or invalid.
    '''

    # Verify that required files exist
    home = os.path.expanduser("~")
    if not (os.path.exists(home+'/.netrc') and os.path.exists(home+'/.urs_cookies')):
        logger.error('Missing a required authentication file!  See instructions here:\n' +
                     ' https://nsidc.org/support/faq/what-options-are-available-bulk-' +
                     'downloading-data-https-earthdata-login-enabled')
        return -1

    curlPath = asp_system_utils.which("curl")
    curlOpts = ' -n -L '
    cookiePaths = ' -b ~/.urs_cookies -c ~/.urs_cookies '
    baseCurlCmd = curlPath + curlOpts + cookiePaths

    logger.info('Creating output folder: ' + outputFolder)
    os.system('mkdir -p ' + outputFolder)

    isSouth = (options.site == 'AN')

    if options.type == 'nav':  # Nav fetching is much less complicated
        return fetchNavData(options, outputFolder)

    parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
    if not icebridge_common.fileNonEmpty(parsedIndexPath):
        # Some dirs are weird, both images, fireball dems, and ortho.
        # Just accept whatever there is, but with a warning.
        logger.info('Warning: Missing index file: ' + parsedIndexPath)

    # Store file information in a dictionary
    logger.info('Reading file list from ' + parsedIndexPath)
    try:
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)
    except Exception:
        # We probably ran into old format index file. Must refetch.
        # (Exception, not a bare except, so Ctrl-C is not mistaken for a
        # bad index file.)
        logger.info('Could not read index file. Try again.')
        options.refetchIndex = True
        parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
        (frameDict, urlDict) = icebridge_common.readIndexFile(parsedIndexPath)

    if options.stopAfterIndexFetch:
        return 0

    isLidar = (options.type in LIDAR_TYPES)

    allFrames = sorted(frameDict.keys())

    if not isLidar:
        # The lidar frames use a totally different numbering than the
        # image/ortho/dem frames, so the range is computed only here.
        firstFrame = icebridge_common.getLargestFrame()   # start big
        lastFrame = icebridge_common.getSmallestFrame()   # start small
        if len(allFrames) > 0:
            # allFrames is sorted, so its ends are the extreme frames.
            firstFrame = min(firstFrame, allFrames[0])
            lastFrame = max(lastFrame, allFrames[-1])

        if options.allFrames:
            options.startFrame = firstFrame
            options.stopFrame = lastFrame

    if isLidar:
        # Based on image frames, determine which lidar frames to fetch.
        if options.ignoreMissingLidar and len(frameDict.keys()) == 0:
            # Nothing we can do if this run has no lidar and we are told to continue
            logger.info("Warning: missing lidar, but continuing.")
            lidarsToFetch = set()
        else:
            lidarsToFetch = lidarFilesInRange(frameDict, outputFolder,
                                              options.startFrame, options.stopFrame)

    # There is always a chance that not all requested frames are available.
    # That is particularly true for Fireball DEMs.  Instead of failing,
    # just download what is present and give a warning.
    if options.startFrame not in frameDict and not isLidar:
        logger.info("Warning: Frame " + str(options.startFrame) +
                    " is not found in this flight.")

    if options.stopFrame and (options.stopFrame not in frameDict) and not isLidar:
        logger.info("Warning: Frame " + str(options.stopFrame) +
                    " is not found in this flight.")

    allFilesToFetch = []  # Files that we will fetch, relative to the current dir.
    allUrlsToFetch = []   # Full url of each file.

    hasTfw = (options.type == 'fireball')
    hasXml = (isLidar or (options.type == 'ortho') or hasTfw)

    # Loop through all found frames within the provided range
    skipCount = 0
    for frame in allFrames:

        # Skip frame outside of range
        if isLidar:
            if frameDict[frame] not in lidarsToFetch:
                continue
        else:
            if (frame < options.startFrame) or (frame > options.stopFrame):
                continue

        # Handle the frame skip option: skip frameSkip frames, keep one, repeat.
        if options.frameSkip > 0:
            if skipCount < options.frameSkip:
                skipCount += 1
                continue
            skipCount = 0

        mainFile = frameDict[frame]

        # Some files have an associated xml file. Fireball DEMs also have a tfw file.
        currFilesToFetch = [mainFile]
        if hasXml:
            currFilesToFetch.append(icebridge_common.xmlFile(mainFile))
        if hasTfw:
            currFilesToFetch.append(icebridge_common.tfwFile(mainFile))

        for filename in currFilesToFetch:
            allFilesToFetch.append(os.path.join(outputFolder, filename))
            allUrlsToFetch.append(os.path.join(urlDict[frame], filename))

    # Restrict lidar fetch amount according to the parameter
    if (isLidar and options.maxNumLidarToFetch > 0 and
            len(allFilesToFetch) > options.maxNumLidarToFetch):

        # Ensure an even number, to fetch both the lidar file and its xml
        if options.maxNumLidarToFetch % 2 == 1:
            options.maxNumLidarToFetch += 1

        allFilesToFetch = allFilesToFetch[0:options.maxNumLidarToFetch]
        allUrlsToFetch = allUrlsToFetch[0:options.maxNumLidarToFetch]

    icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL, options.dryRun,
                                         outputFolder, allFilesToFetch, allUrlsToFetch,
                                         logger)

    # Fetch from disk the set of already validated files, if any
    validFilesList = icebridge_common.validFilesList(os.path.dirname(outputFolder),
                                                     options.startFrame, options.stopFrame)
    validFilesSet = set()
    validFilesSet = icebridge_common.updateValidFilesListFromDisk(validFilesList,
                                                                  validFilesSet)
    numInitialValidFiles = len(validFilesSet)

    # Verify that all files were fetched and are in good shape
    failedFiles = []
    filesToValidate = allFilesToFetch
    if options.skipValidate:
        filesToValidate = []

    for outputPath in filesToValidate:

        if not icebridge_common.fileNonEmpty(outputPath):
            logger.info('Missing file: ' + outputPath)
            failedFiles.append(outputPath)
            continue

        # Image validation (icebridge_common.isValidImage) is intentionally not
        # done here: that check is very slow. This affects only jpegs, as the
        # other files are validated via the checksum. Jpegs get validated when
        # they are converted to 1 band images.

        # Sanity check: XML files must be parseable for their latitude.
        if icebridge_common.fileExtension(outputPath) == '.xml':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                continue  # previously validated
            if os.path.exists(outputPath):
                try:
                    icebridge_common.parseLatitude(outputPath)
                    logger.info('Valid file: ' + outputPath)
                    validFilesSet.add(outputPath)  # mark it as validated
                except Exception:
                    # Corrupted file
                    logger.info("Failed to parse latitude, will wipe: " + outputPath)
                    if os.path.exists(outputPath):
                        os.remove(outputPath)
                    failedFiles.append(outputPath)
                # Files with a wrong (but parseable) latitude are deliberately
                # not wiped, as otherwise the next fetch would have to
                # download them again.

        # Verify the checksum of data files that come with an xml file
        if hasXml and len(outputPath) >= 4 and outputPath[-4:] != '.xml' \
               and outputPath[-4:] != '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                continue  # previously validated
            if not icebridge_common.hasValidChkSum(outputPath, logger):
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid data. Will wipe: ' + outputPath + ' ' + xmlFile)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            logger.info('Valid file: ' + outputPath)
            validFilesSet.add(outputPath)

        if hasTfw and icebridge_common.fileExtension(outputPath) == '.tfw':
            if outputPath in validFilesSet and os.path.exists(outputPath):
                continue  # previously validated
            if not icebridge_common.isValidTfw(outputPath, logger):
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid tfw. Will wipe: ' + outputPath + ' ' + xmlFile)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            logger.info('Valid tfw file: ' + outputPath)
            validFilesSet.add(outputPath)

    # Write to disk the list of validated files, but only if new
    # validations happened.  First re-read that list, in case a
    # different process modified it in the meantime, such as if two
    # managers are running at the same time.
    numFinalValidFiles = len(validFilesSet)
    if numInitialValidFiles != numFinalValidFiles:
        validFilesSet = \
            icebridge_common.updateValidFilesListFromDisk(validFilesList, validFilesSet)
        icebridge_common.writeValidFilesList(validFilesList, validFilesSet)

    numFailed = len(failedFiles)
    if numFailed > 0:
        logger.info("Number of files that could not be processed: " + str(numFailed))

    return numFailed
def fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder):
    '''Create a list of all files that must be fetched unless done already.

    If a non-empty parsed (csv) index already exists for outputFolder and
    options.refetchIndex is not set, it is kept as is. Otherwise the html
    listing(s) of the remote folder(s) are fetched and parsed, and a single
    combined index file is written.

    Arguments:
      options      - Parsed options object. Fields read here: type, yyyymmdd,
                     year, month, day, site, refetchIndex, ignoreMissingLidar.
                     For lidar types, options.allFrames is set to True and
                     options.type may be replaced by the lidar type that was
                     actually found.
      isSouth      - Passed on to hasGoodLat() and fetchAndParseIndexFileAux().
      baseCurlCmd  - curl command prefix to which a URL is appended.
      outputFolder - Folder holding the index files for this data type.

    Returns:
      The path to the parsed index file.

    Raises:
      Exception if several lidar folders exist, none of them has a good
      latitude, and options.ignoreMissingLidar is not set.
    '''

    # For AN 20091112, etc, some of the ortho images are stored at the
    # beginning of the next day's flight. Need to sort this out, and
    # it is tricky. More comments within the code.
    separateByLat = (options.type == 'ortho' and isInSeparateByLatTable(options.yyyymmdd))
    orthoOrFireball = ((options.type == 'ortho') or (options.type == 'fireball'))

    if separateByLat:
        # Here we won't fetch the next day, we will just separate by latitude
        # within a given day
        fetchNextDay = False
    else:
        # Normally we fetch for next day only for ortho or fireball. However,
        # for one single special flight, we do it for jpeg too, as then
        # the jpegs are also split.
        fetchNextDay = bool(orthoOrFireball or
                            ((options.type == 'jpeg') and
                             twoFlightsInOneDay(options.site, options.yyyymmdd)))

    # If we need to parse the next flight day as well, as expected in some runs,
    # we will fetch two html files, but create a single index out of them.
    dayVals = [0]
    if fetchNextDay:
        dayVals.append(1)

    indexPath = icebridge_common.htmlIndexFile(outputFolder)
    currIndexPath = indexPath
    parsedIndexPath = icebridge_common.csvIndexFile(outputFolder)

    if options.refetchIndex:
        os.system('rm -f ' + indexPath)
        os.system('rm -f ' + parsedIndexPath)

    if icebridge_common.fileNonEmpty(parsedIndexPath):
        logger.info('Already have the index file ' + parsedIndexPath + ', keeping it.')
        return parsedIndexPath

    frameDict = {}
    urlDict = {}

    # We need the list of jpeg frames. Sometimes when fetching ortho images,
    # and we have to fetch from the next day, don't fetch unless
    # in the jpeg index.
    # options.type can change only in the lidar branch below, for which there
    # is a single day value, so this flag stays valid for the whole function.
    useJpegIndex = (len(dayVals) > 1 and options.type != 'jpeg')
    if useJpegIndex:
        jpegFolder = icebridge_common.getJpegFolder(os.path.dirname(outputFolder))
        jpegIndexPath = icebridge_common.csvIndexFile(jpegFolder)
        (jpegFrameDict, jpegUrlDict) = icebridge_common.readIndexFile(jpegIndexPath)

    orthoStamp = {}
    if options.type == 'fireball':
        # This is a bugfix. Ensure that the fireball DEM has not just
        # the same frame number, but also same timestamp as the ortho.
        orthoFolder = icebridge_common.getOrthoFolder(os.path.dirname(outputFolder))
        orthoIndexPath = icebridge_common.csvIndexFile(orthoFolder)
        (orthoFrameDict, orthoUrlDict) = icebridge_common.readIndexFile(orthoIndexPath)
        for frame in sorted(orthoFrameDict.keys()):
            orthoStamp[frame] = icebridge_common.parseTimeStamps(orthoFrameDict[frame])[1]

    for dayVal in dayVals:

        if len(dayVals) > 1:
            # One html listing per day
            currIndexPath = indexPath + '.day' + str(dayVal)
            if options.refetchIndex:
                os.system('rm -f ' + currIndexPath)

        # Find folderUrl which contains all of the files
        if options.type in LIDAR_TYPES:
            options.allFrames = True  # For lidar, always get all the frames!

            # For lidar, the data can come from one of three sources.
            # Unfortunately sometimes there is more than one source, and then
            # we need to pick by latitude.
            folderUrls = []
            lidar_types = []
            for lidar in LIDAR_TYPES:
                folderUrl = getFolderUrl(options.yyyymmdd, options.year, options.month,
                                         options.day, dayVal,  # note here the dayVal
                                         options.site, lidar)
                logger.info('Checking lidar URL: ' + folderUrl)
                # NOTE(review): another definition of this function in the same
                # file calls checkIfUrlExists(folderUrl, baseCurlCmd) -- confirm
                # which signature checkIfUrlExists currently has.
                if checkIfUrlExists(folderUrl):
                    logger.info('Found match with lidar type: ' + lidar)
                    folderUrls.append(folderUrl)
                    lidar_types.append(lidar)

            if len(folderUrls) == 0:
                logger.info('WARNING: Could not find any lidar data for the given date!')

            elif len(folderUrls) == 1:
                # Unique solution
                folderUrl = folderUrls[0]
                options.type = lidar_types[0]

            elif len(folderUrls) >= 2:
                # Multiple solutions. Pick the good one by latitude.
                logger.info("Multiples URLs to search: " + " ".join(folderUrls))
                isGood = False
                for (folderUrl, lidarType) in zip(folderUrls, lidar_types):
                    (localFrameDict, localUrlDict) = \
                        fetchAndParseIndexFileAux(isSouth, separateByLat, dayVal,
                                                  baseCurlCmd, folderUrl,
                                                  currIndexPath, lidarType)

                    # Only the first file of each folder is examined.
                    localFrames = sorted(localFrameDict.keys())
                    if len(localFrames) > 0:
                        frame = localFrames[0]
                        filename = localFrameDict[frame]
                        xmlFile = icebridge_common.xmlFile(filename)
                        url = os.path.join(folderUrl, xmlFile)

                        # Download the file (into the current directory)
                        curlCmd = baseCurlCmd + ' ' + url + ' > ' + xmlFile
                        logger.info(curlCmd)
                        p = subprocess.Popen(curlCmd, shell=True)
                        p.wait()

                        # Remove the temporary xml file even if parsing fails.
                        try:
                            latitude = icebridge_common.parseLatitude(xmlFile)
                        finally:
                            if os.path.exists(xmlFile):
                                os.remove(xmlFile)

                        if hasGoodLat(latitude, isSouth):
                            isGood = True
                            options.type = lidarType
                            logger.info("Good latitude " + str(latitude) + ", will use " +
                                        folderUrl + " of type " + lidarType)
                        else:
                            logger.info("Bad latitude " + str(latitude) +
                                        ", will not use " +
                                        folderUrl + " of type " + lidarType)

                    if isGood:
                        break

                if not isGood:
                    if options.type in LIDAR_TYPES and options.ignoreMissingLidar:
                        logger.info("No lidar. None of these URLs are good: " +
                                    " ".join(folderUrls))
                    else:
                        raise Exception("None of these URLs are good: " +
                                        " ".join(folderUrls))

        else:  # Other cases are simpler
            folderUrl = getFolderUrl(options.yyyymmdd, options.year, options.month,
                                     options.day, dayVal,  # note here the dayVal
                                     options.site, options.type)

        logger.info('Fetching from URL: ' + folderUrl)
        (localFrameDict, localUrlDict) = \
            fetchAndParseIndexFileAux(isSouth, separateByLat, dayVal,
                                      baseCurlCmd, folderUrl,
                                      currIndexPath, options.type)

        # Append to the main index
        for frame in sorted(localFrameDict.keys()):

            if options.type == 'fireball':
                # This is a bugfix. Ensure that the fireball DEM has not just
                # the same frame number, but also same timestamp as the ortho.
                # Otherwise we may accidentally be getting one from next day.
                imageTimeString = \
                    icebridge_common.parseTimeStamps(localFrameDict[frame])[1]
                if frame not in orthoStamp:
                    # Missing ortho for this fireball DEM
                    continue
                if abs(int(imageTimeString) - int(orthoStamp[frame])) > 1000:
                    # Apparently a tolerance is needed. Use 10 seconds, so the number 1000.
                    continue

            # Fetch from next day, unless already have a value. And don't fetch
            # frames not in the jpeg index.
            # NOTE(review): both checks are taken to be governed by this
            # condition, as the comment above suggests -- confirm against the
            # properly indented source.
            if useJpegIndex:
                if frame not in jpegFrameDict:
                    continue
                if frame in frameDict:
                    continue

            frameDict[frame] = localFrameDict[frame]
            urlDict[frame] = localUrlDict[frame]

    # Write the combined index file
    icebridge_common.writeIndexFile(parsedIndexPath, frameDict, urlDict)

    return parsedIndexPath
def fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder):
    '''Create a list of all files that must be fetched unless done already.

    Builds the combined parsed index for options.type and returns its path.
    A non-empty parsed index that already exists is kept, unless
    options.refetchIndex is set, in which case it is wiped and rebuilt.

    Arguments:
      options      - Run options. Read here: type, yyyymmdd, year, month, day,
                     site, refetchIndex, ignoreMissingLidar. For lidar types,
                     options.allFrames is set to True and options.type may be
                     replaced with the lidar type that was actually found.
      isSouth      - Forwarded to hasGoodLat() and fetchAndParseIndexFileAux().
      baseCurlCmd  - Curl command prefix (a shell string) used for downloads.
      outputFolder - Folder from which the html and csv index paths are derived.

    Returns the path to the parsed index file.

    Raises Exception if several lidar URLs exist, none passes the latitude
    check, and options.ignoreMissingLidar is not set.
    '''

    # For AN 20091112, etc, some of the ortho images are stored at the
    # beginning of the next day's flight. Need to sort this out, and
    # it is tricky. More comments within the code.
    separateByLat = (options.type == 'ortho' and isInSeparateByLatTable(options.yyyymmdd))

    orthoOrFireball = ((options.type == 'ortho') or (options.type == 'fireball'))

    if separateByLat:
        # Here we won't fetch the next day, we will just separate by latitude
        # within a given day
        fetchNextDay = False
    elif orthoOrFireball or \
         ((options.type == 'jpeg') and twoFlightsInOneDay(options.site, options.yyyymmdd)):
        # Normally we fetch for next day only for ortho or fireball. However,
        # for one single special flight, we do it for jpeg too, as then
        # the jpegs are also split.
        fetchNextDay = True
    else:
        fetchNextDay = False

    # If we need to parse the next flight day as well, as expected in some runs,
    # we will fetch two html files, but create a single index out of them.
    dayVals = [0]
    if fetchNextDay:
        dayVals.append(1)

    indexPath = icebridge_common.htmlIndexFile(outputFolder)
    currIndexPath = indexPath
    parsedIndexPath = icebridge_common.csvIndexFile(outputFolder)

    if options.refetchIndex:
        os.system('rm -f ' + indexPath)
        os.system('rm -f ' + parsedIndexPath)

    if icebridge_common.fileNonEmpty(parsedIndexPath):
        logger.info('Already have the index file ' + parsedIndexPath + ', keeping it.')
        return parsedIndexPath

    frameDict = {}
    urlDict = {}

    # We need the list of jpeg frames. Sometimes when fetching ortho images,
    # and we have to fetch from the next day, don't fetch unless
    # in the jpeg index.
    # The decision is taken once, here, so that the filtering further down
    # always agrees with whether the jpeg index was actually loaded.
    filterByJpegIndex = (len(dayVals) > 1 and options.type != 'jpeg')
    jpegFrameDict = {}
    if filterByJpegIndex:
        jpegFolder = icebridge_common.getJpegFolder(os.path.dirname(outputFolder))
        jpegIndexPath = icebridge_common.csvIndexFile(jpegFolder)
        (jpegFrameDict, jpegUrlDict) = icebridge_common.readIndexFile(jpegIndexPath)

    orthoStamp = {}
    if options.type == 'fireball':
        # This is a bugfix. Ensure that the fireball DEM has not just
        # the same frame number, but also same timestamp as the ortho.
        orthoFolder = icebridge_common.getOrthoFolder(os.path.dirname(outputFolder))
        orthoIndexPath = icebridge_common.csvIndexFile(orthoFolder)
        (orthoFrameDict, orthoUrlDict) = icebridge_common.readIndexFile(orthoIndexPath)
        for frame in sorted(orthoFrameDict.keys()):
            filename = orthoFrameDict[frame]
            [imageDateString, imageTimeString] = icebridge_common.parseTimeStamps(filename)
            orthoStamp[frame] = imageTimeString

    for dayVal in dayVals:

        if len(dayVals) > 1:
            currIndexPath = indexPath + '.day' + str(dayVal)
            if options.refetchIndex:
                os.system('rm -f ' + currIndexPath)

        # Find folderUrl which contains all of the files
        if options.type in LIDAR_TYPES:
            options.allFrames = True  # For lidar, always get all the frames!

            # For lidar, the data can come from one of three sources.
            # Unfortunately sometimes there is more than one source, and then
            # we need to pick by latitude.
            folderUrls = []
            lidar_types = []
            for lidar in LIDAR_TYPES:
                folderUrl = getFolderUrl(options.yyyymmdd, options.year, options.month,
                                         options.day, dayVal,  # note here the dayVal
                                         options.site, lidar)
                logger.info('Checking lidar URL: ' + folderUrl)
                if checkIfUrlExists(folderUrl, baseCurlCmd):
                    logger.info('Found match with lidar type: ' + lidar)
                    folderUrls.append(folderUrl)
                    lidar_types.append(lidar)

            if len(folderUrls) == 0:
                # folderUrl keeps the last URL that was probed; the fetch
                # below is still attempted with it.
                logger.info('WARNING: Could not find any lidar data for the given date!')

            elif len(folderUrls) == 1:
                # Unique solution
                folderUrl = folderUrls[0]
                options.type = lidar_types[0]

            else:
                # Multiple solutions. Pick the good one by latitude.
                logger.info("Multiples URLs to search: " + " ".join(folderUrls))
                isGood = False
                for (folderUrl, lidarType) in zip(folderUrls, lidar_types):
                    (localFrameDict, localUrlDict) = \
                        fetchAndParseIndexFileAux(isSouth,
                                                  separateByLat, dayVal,
                                                  baseCurlCmd, folderUrl,
                                                  currIndexPath,
                                                  lidarType)
                    for frame in sorted(localFrameDict.keys()):
                        filename = localFrameDict[frame]
                        xmlFile = icebridge_common.xmlFile(filename)
                        url = os.path.join(folderUrl, xmlFile)

                        # Download the file
                        curlCmd = baseCurlCmd + ' ' + url + ' > ' + xmlFile
                        logger.info(curlCmd)
                        p = subprocess.Popen(curlCmd, shell=True, universal_newlines=True)
                        # Wait through the Popen object so that it records the
                        # exit status of the child it owns.
                        p.wait()

                        latitude = icebridge_common.parseLatitude(xmlFile)
                        if os.path.exists(xmlFile):
                            os.remove(xmlFile)

                        if hasGoodLat(latitude, isSouth):
                            isGood = True
                            options.type = lidarType
                            logger.info("Good latitude " + str(latitude) + ", will use " +
                                        folderUrl + " of type " + lidarType)
                        else:
                            logger.info("Bad latitude " + str(latitude) + ", will not use " +
                                        folderUrl + " of type " + lidarType)

                        # Stop at first file no matter what
                        break

                    if isGood:
                        break

                if not isGood:
                    if options.type in LIDAR_TYPES and options.ignoreMissingLidar:
                        logger.info("No lidar. None of these URLs are good: " +
                                    " ".join(folderUrls))
                    else:
                        raise Exception("None of these URLs are good: " +
                                        " ".join(folderUrls))

        else:  # Other cases are simpler
            folderUrl = getFolderUrl(options.yyyymmdd, options.year, options.month,
                                     options.day, dayVal,  # note here the dayVal
                                     options.site, options.type)

        logger.info('Fetching from URL: ' + folderUrl)
        (localFrameDict, localUrlDict) = \
            fetchAndParseIndexFileAux(isSouth,
                                      separateByLat, dayVal,
                                      baseCurlCmd, folderUrl,
                                      currIndexPath, options.type)

        # Append to the main index
        for frame in sorted(localFrameDict.keys()):

            if options.type == 'fireball':
                # This is a bugfix. Ensure that the fireball DEM has not just
                # the same frame number, but also same timestamp as the ortho.
                # Otherwise we may accidentally getting one from next day.
                [imageDateString, imageTimeString] = \
                    icebridge_common.parseTimeStamps(localFrameDict[frame])
                if frame not in orthoStamp:
                    # No ortho for this fireball DEM
                    continue
                if abs(int(imageTimeString) - int(orthoStamp[frame])) > 1000:
                    # Apparently a tolerance is needed. Use 10 seconds, so the number 1000.
                    continue

            # Fetch from next day, unless already have a value. And don't fetch
            # frames not in the jpeg index.
            if filterByJpegIndex:
                if frame not in jpegFrameDict:
                    continue
                if frame in frameDict:
                    continue

            frameDict[frame] = localFrameDict[frame]
            urlDict[frame] = localUrlDict[frame]

    # Write the combined index file
    icebridge_common.writeIndexFile(parsedIndexPath, frameDict, urlDict)

    return parsedIndexPath
def fetchAndParseIndexFileAux(isSouth, separateByLat, dayVal, baseCurlCmd, folderUrl, path,
                              fileType):
    '''Retrieve the index file for a folder of data and create
    a parsed version of it that contains frame number / filename pairs.

    Arguments:
      isSouth       - Forwarded to hasGoodLat() when separating by latitude.
      separateByLat - If true, fetch the xml file of every listed file and
                      drop the entries whose latitude is not good.
      dayVal        - 0 or otherwise; selects which side of a big timestamp
                      gap is kept for 'ortho' and 'fireball'.
      baseCurlCmd   - Curl command prefix (a shell string).
      folderUrl     - URL of the remote folder whose listing is fetched.
      path          - Where the html listing is temporarily saved. It is
                      removed again after being read.
      fileType      - One of 'jpeg', 'ortho', 'fireball', 'lvis', 'atm1',
                      'atm2'. Any other value yields an empty result.

    Returns (frameDict, urlDict), mapping frame number to file name and
    frame number to folderUrl respectively.

    Raises Exception if two non-lidar files share a frame number.
    '''

    # Download the html file
    curlCmd = baseCurlCmd + ' ' + folderUrl + ' > ' + path
    logger.info(curlCmd)
    p = subprocess.Popen(curlCmd, shell=True, universal_newlines=True)
    # Wait through the Popen object so that it records the exit status
    # of the child it owns.
    p.wait()

    # Find all the file names in the index file and
    # dump them to a new index file
    logger.info('Extracting file name list from index.html file...')
    with open(path, 'r') as f:
        indexText = f.read()

    # Must wipe this html file. We fetch it too often in different
    # contexts. If not wiped, the code fails to work in some
    # very rare but real situations.
    if os.path.exists(path):
        os.remove(path)

    # Extract just the file names. The patterns are raw strings so that
    # '\w' reaches the regex engine without relying on Python passing
    # unknown string escapes through.
    patterns = {
        'jpeg':     r">[0-9_]*.JPG",
        'ortho':    r">DMS\w*.tif<",
        'fireball': r">IODMS\w*DEM.tif",  # Fireball DEMs
        'lvis':     r">ILVIS\w+.TXT",
        # >ILATM1B_20111018_145455.ATM4BT4.qi
        # or >ILATM1B_20091016_165112.atm4cT3.qi
        'atm1':     r">ILATM1B[0-9_]*.ATM4\w+.qi",
        # Match ILATM1B_20160713_195419.ATM5BT5.h5
        'atm2':     r">ILATM1B[0-9_]*.ATM\w+.h5",
    }
    fileList = []  # ensure initialization
    if fileType in patterns:
        fileList = re.findall(patterns[fileType], indexText, re.IGNORECASE)

    # Get rid of '>' and '<'
    fileList = [name.replace(">", "").replace("<", "") for name in fileList]

    # Some runs, eg, https://n5eil01u.ecs.nsidc.org/ICEBRIDGE/IODMS1B.001/2015.09.24
    # have files for both GR and AN, with same frame number. Those need to be separated
    # by latitude. This is a problem only with orthoimages.
    badXmls = set()
    outputFolder = os.path.dirname(path)
    if separateByLat:
        allFilesToFetch = []
        allUrlsToFetch = []
        for filename in fileList:
            xmlFile = icebridge_common.xmlFile(filename)
            allFilesToFetch.append(os.path.join(outputFolder, xmlFile))
            allUrlsToFetch.append(os.path.join(folderUrl, xmlFile))

        dryRun = False
        icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                             dryRun, outputFolder,
                                             allFilesToFetch, allUrlsToFetch,
                                             logger)

        # Mark the bad ones
        for xmlFile in allFilesToFetch:
            latitude = icebridge_common.parseLatitude(xmlFile)
            if not hasGoodLat(latitude, isSouth):
                badXmls.add(xmlFile)

    elif fileType == 'ortho' or fileType == 'fireball':
        # Sometimes there is a large gap in the timestamp. That means orthoimages
        # from previous day are spilling over. If dayVal is 0, we must ignore
        # the spillover images. If dayVal is 1, we must keep the spillover images
        # and ignore the others.
        list1 = []  # files before the first big gap
        list2 = []  # files from the first big gap onwards
        isBigGap = False
        prevStamp = -1
        for filename in fileList:
            [imageDateString, imageTimeString] = icebridge_common.parseTimeStamps(filename)
            currStamp = float(imageTimeString)/1000000.0  # hours
            if prevStamp < 0:
                # First file, nothing to compare with yet
                list1.append(filename)
                prevStamp = currStamp
                continue

            # Note that once isBigGap becomes true, it stays true
            # even when the gap gets small again
            if currStamp - prevStamp >= 6:  # six hour gap is a lot
                isBigGap = True

            if not isBigGap:
                list1.append(filename)
            else:
                list2.append(filename)

            prevStamp = currStamp  # for next iteration

        if isBigGap:
            if dayVal == 0:
                fileList = list2[:]  # current day
            else:
                fileList = list1[:]  # spillover from prev day

    # For each entry that matched the regex, record: the frame number and the file name.
    frameDict = {}
    urlDict = {}
    badFiles = []
    for filename in fileList:

        if len(badXmls) > 0:
            xmlFile = os.path.join(outputFolder, icebridge_common.xmlFile(filename))
            if xmlFile in badXmls:
                continue

        frame = icebridge_common.getFrameNumberFromFilename(filename)
        if frame in frameDict:
            # The same frame must not occur twice.
            if fileType not in LIDAR_TYPES:
                logger.error("Error: Found two file names with same frame number: " +
                             frameDict[frame] + " and " + filename)
                badFiles.append(filename)
                badFiles.append(frameDict[frame])

        # note that folderUrl can vary among orthoimages, as sometimes
        # some of them are in a folder for the next day.
        frameDict[frame] = filename
        urlDict[frame] = folderUrl

    # Wipe them all, to be sorted later
    # NOTE(review): badFiles holds bare file names from the listing, so these
    # checks are relative to the current directory rather than outputFolder.
    # Confirm whether that is intended.
    for badFile in badFiles:
        if os.path.exists(badFile):
            logger.info("Deleting: " + badFile)
            os.remove(badFile)
        xmlFile = icebridge_common.xmlFile(badFile)
        if os.path.exists(xmlFile):
            logger.info("Deleting: " + xmlFile)
            os.remove(xmlFile)

    if len(badFiles) > 0:
        raise Exception("Found files with same frame number")

    return (frameDict, urlDict)
def doFetch(options, outputFolder):
    '''Fetch the files for the requested frame range and validate them.

    Arguments:
      options      - Run options. Read here: site, type, allFrames,
                     startFrame, stopFrame, maxNumToFetch, dryRun,
                     skipValidate. options.startFrame and options.stopFrame
                     are overwritten when options.allFrames is set, and
                     options.refetchIndex is set if the index must be refetched.
      outputFolder - Folder the files are downloaded to; created if missing.

    Returns the number of files that failed validation, or -1 if one of the
    authentication files ~/.netrc or ~/.urs_cookies is missing.
    '''

    # Verify that required files exist
    home = os.path.expanduser("~")
    if not (os.path.exists(home + '/.netrc') and os.path.exists(home + '/.urs_cookies')):
        logger.error('Missing a required authentication file! See instructions here:\n' +
                     ' https://nsidc.org/support/faq/what-options-are-available-bulk-downloading-data-https-earthdata-login-enabled')
        return -1

    curlPath = asp_system_utils.which("curl")
    curlOpts = ' -n -L '
    cookiePaths = ' -b ~/.urs_cookies -c ~/.urs_cookies '
    baseCurlCmd = curlPath + curlOpts + cookiePaths

    logger.info('Creating output folder: ' + outputFolder)
    os.system('mkdir -p ' + outputFolder)

    isSouth = (options.site == 'AN')

    parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
    if not icebridge_common.fileNonEmpty(parsedIndexPath):
        # Some dirs are weird, both images, dems, and ortho.
        # Just accept whatever there is, but with a warning.
        logger.info('Warning: Missing index file: ' + parsedIndexPath)

    # Store file information in a dictionary
    # - Keep track of the earliest and latest frame
    logger.info('Reading file list from ' + parsedIndexPath)
    try:
        (frameDict, urlDict) = readIndexFile(parsedIndexPath)
    except Exception:
        # We probably ran into old format index file. Must refetch.
        # Only ordinary errors are handled here, so that an interrupt or an
        # exit request is not mistaken for a bad index.
        logger.info('Could not read index file. Try again.')
        options.refetchIndex = True
        parsedIndexPath = fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder)
        (frameDict, urlDict) = readIndexFile(parsedIndexPath)

    allFrames = sorted(frameDict.keys())
    firstFrame = icebridge_common.getLargestFrame()   # start big
    lastFrame = icebridge_common.getSmallestFrame()   # start small
    for frameNumber in allFrames:
        if frameNumber < firstFrame:
            firstFrame = frameNumber
        if frameNumber > lastFrame:
            lastFrame = frameNumber

    if options.allFrames:
        options.startFrame = firstFrame
        options.stopFrame = lastFrame

    # There is always a chance that not all requested frames are available.
    # That is particularly true for Fireball DEMs. Instead of failing,
    # just download what is present and give a warning.
    if options.startFrame not in frameDict:
        logger.info("Warning: Frame " + str(options.startFrame) +
                    " is not found in this flight.")

    if options.stopFrame and (options.stopFrame not in frameDict):
        logger.info("Warning: Frame " + str(options.stopFrame) +
                    " is not found in this flight.")

    allFilesToFetch = []  # Files that we will fetch, relative to the current dir.
    allUrlsToFetch = []   # Full url of each file.

    hasTfw = (options.type == 'dem')
    hasXml = ((options.type in LIDAR_TYPES) or (options.type == 'ortho') or hasTfw)

    # Loop through all found frames within the provided range
    for frame in allFrames:
        if (frame < options.startFrame) or (frame > options.stopFrame):
            continue

        filename = frameDict[frame]

        # Some files have an associated xml file. DEMs also have a tfw file.
        currFilesToFetch = [filename]
        if hasXml:
            currFilesToFetch.append(icebridge_common.xmlFile(filename))
        if hasTfw:
            currFilesToFetch.append(icebridge_common.tfwFile(filename))

        for currFile in currFilesToFetch:
            allFilesToFetch.append(os.path.join(outputFolder, currFile))
            allUrlsToFetch.append(os.path.join(urlDict[frame], currFile))

    if options.maxNumToFetch > 0 and len(allFilesToFetch) > options.maxNumToFetch:
        allFilesToFetch = allFilesToFetch[0:options.maxNumToFetch]
        allUrlsToFetch = allUrlsToFetch[0:options.maxNumToFetch]

    icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL, options.dryRun,
                                         outputFolder,
                                         allFilesToFetch, allUrlsToFetch, logger)

    # Verify that all files were fetched and are in good shape
    filesToValidate = allFilesToFetch
    if options.skipValidate:
        filesToValidate = []

    failedFiles = []
    for outputPath in filesToValidate:

        if not icebridge_common.fileNonEmpty(outputPath):
            logger.info('Missing file: ' + outputPath)
            failedFiles.append(outputPath)
            continue

        if icebridge_common.hasImageExtension(outputPath):
            if not icebridge_common.isValidImage(outputPath):
                logger.info('Found an invalid image. Will wipe it: ' + outputPath)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                failedFiles.append(outputPath)
                continue
            else:
                logger.info('Valid image: ' + outputPath)

        # Sanity check: XML files must have the right latitude.
        # Files wiped here are not counted as failures.
        if icebridge_common.fileExtension(outputPath) == '.xml':
            if os.path.exists(outputPath):
                latitude = icebridge_common.parseLatitude(outputPath)
                isGood = hasGoodLat(latitude, isSouth)
                if not isGood:
                    logger.info("Wiping XML file " + outputPath + " with bad latitude " +
                                str(latitude))
                    os.remove(outputPath)
                    imageFile = icebridge_common.xmlToImage(outputPath)
                    if os.path.exists(imageFile):
                        logger.info("Wiping TIF file " + imageFile + " with bad latitude " +
                                    str(latitude))
                        os.remove(imageFile)

        # Verify the checksum of the data files (not of the xml/tfw side files)
        if hasXml and len(outputPath) >= 4 and outputPath[-4:] not in ('.xml', '.tfw'):
            isGood = icebridge_common.hasValidChkSum(outputPath)
            if not isGood:
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid data. Will wipe it: ' + outputPath + ' ' + xmlFile)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            else:
                logger.info('Valid chksum: ' + outputPath)

        if hasTfw and icebridge_common.fileExtension(outputPath) == '.tfw':
            isGood = icebridge_common.isValidTfw(outputPath)
            if not isGood:
                xmlFile = icebridge_common.xmlFile(outputPath)
                logger.info('Found invalid data. Will wipe it: ' + outputPath + ' ' + xmlFile)
                if os.path.exists(outputPath):
                    os.remove(outputPath)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)
                failedFiles.append(outputPath)
                failedFiles.append(xmlFile)
                continue
            else:
                logger.info('Valid tfw file: ' + outputPath)

    numFailed = len(failedFiles)
    if numFailed > 0:
        logger.info("Number of files that could not be processed: " + str(numFailed))

    return numFailed
def fetchAndParseIndexFile(options, isSouth, baseCurlCmd, outputFolder):
    '''Create the parsed index of files to fetch, unless one exists already.

    Arguments:
      options      - Run options. Read here: type, fetchNextDay, refetchIndex,
                     year, month, day, ext, site. options.refetchIndex is
                     forced to True when options.fetchNextDay is set. For
                     lidar types, options.allFrames is set to True and
                     options.type may be replaced with the lidar type found.
      isSouth      - Forwarded to hasGoodLat() and fetchAndParseIndexFileAux().
      baseCurlCmd  - Curl command prefix (a shell string) used for downloads.
      outputFolder - Folder in which the index files are kept.

    Returns the path to the parsed (csv) index file.

    Raises Exception if several lidar URLs exist and none passes the
    latitude check.
    '''

    # If we need to parse the next flight day as well, as expected in some runs,
    # we will fetch two html files, but create a single index out of them.
    dayVals = [0]
    if options.fetchNextDay:
        dayVals.append(1)
        options.refetchIndex = True  # Force refetch, to help with old archives

    # See if to wipe the index
    if options.type in LIDAR_TYPES:
        filename = 'lidar' + '_index.html'
    else:
        filename = options.type + '_index.html'

    indexPath = os.path.join(outputFolder, filename)
    parsedIndexPath = indexPath + '.csv'

    if options.refetchIndex:
        os.system('rm -f ' + indexPath)
        os.system('rm -f ' + parsedIndexPath)

    if icebridge_common.fileNonEmpty(parsedIndexPath):
        logger.info('Already have the index file ' + parsedIndexPath + ', keeping it.')
        return parsedIndexPath

    frameDict = {}
    urlDict = {}

    tryToSeparateByLat = (options.type == 'ortho')

    for dayVal in dayVals:

        # Derive the per-day html path from the base path each time, so the
        # '.dayN' suffixes do not pile up from one iteration to the next.
        currIndexPath = indexPath
        if len(dayVals) > 1:
            currIndexPath = indexPath + '.day' + str(dayVal)
            if options.refetchIndex:
                os.system('rm -f ' + currIndexPath)

        # Find folderUrl which contains all of the files
        if options.type in LIDAR_TYPES:
            options.allFrames = True  # For lidar, always get all the frames!

            # For lidar, the data can come from one of three sources.
            # Unfortunately sometimes there is more than one source, and then
            # we need to pick by latitude.
            folderUrls = []
            lidar_types = []
            for lidar in LIDAR_TYPES:
                folderUrl = getFolderUrl(options.year, options.month,
                                         options.day + dayVal,  # note here the dayVal
                                         options.ext, options.site, lidar)
                logger.info('Checking lidar URL: ' + folderUrl)
                if checkIfUrlExists(folderUrl):
                    logger.info('Found match with lidar type: ' + lidar)
                    folderUrls.append(folderUrl)
                    lidar_types.append(lidar)

            if len(folderUrls) == 0:
                # folderUrl keeps the last URL that was probed; the fetch
                # below is still attempted with it.
                logger.info('WARNING: Could not find any lidar data for the given date!')

            elif len(folderUrls) == 1:
                # Unique solution
                folderUrl = folderUrls[0]
                options.type = lidar_types[0]

            else:
                # Multiple solutions. Pick the good one by latitude.
                logger.info("Multiples URLs to search: " + " ".join(folderUrls))
                isGood = False
                for (folderUrl, lidarType) in zip(folderUrls, lidar_types):
                    (localFrameDict, localUrlDict) = \
                        fetchAndParseIndexFileAux(isSouth,
                                                  tryToSeparateByLat,
                                                  baseCurlCmd, folderUrl,
                                                  currIndexPath,
                                                  lidarType)
                    for frame in sorted(localFrameDict.keys()):
                        filename = localFrameDict[frame]
                        xmlFile = icebridge_common.xmlFile(filename)
                        url = os.path.join(folderUrl, xmlFile)

                        # Download the file
                        curlCmd = baseCurlCmd + ' ' + url + ' > ' + xmlFile
                        logger.info(curlCmd)
                        p = subprocess.Popen(curlCmd, shell=True)
                        # Wait through the Popen object so that it records the
                        # exit status of the child it owns.
                        p.wait()

                        latitude = icebridge_common.parseLatitude(xmlFile)
                        if os.path.exists(xmlFile):
                            os.remove(xmlFile)

                        if hasGoodLat(latitude, isSouth):
                            isGood = True
                            options.type = lidarType

                        # Stop at first file no matter what
                        break

                    if isGood:
                        break

                if not isGood:
                    raise Exception("None of these URLs are good: " +
                                    " ".join(folderUrls))

        else:  # Other cases are simpler
            folderUrl = getFolderUrl(options.year, options.month,
                                     options.day + dayVal,  # note here the dayVal
                                     options.ext, options.site, options.type)

        logger.info('Fetching from URL: ' + folderUrl)
        (localFrameDict, localUrlDict) = \
            fetchAndParseIndexFileAux(isSouth,
                                      tryToSeparateByLat,
                                      baseCurlCmd, folderUrl,
                                      currIndexPath, options.type)

        # Append to the main index
        for frame in sorted(localFrameDict.keys()):
            # If we already have this frame, don't read it from the current day
            # as that's a recipe for mix-up
            if frame not in frameDict:
                frameDict[frame] = localFrameDict[frame]
                urlDict[frame] = localUrlDict[frame]

    # Write the combined index file
    with open(parsedIndexPath, 'w') as f:
        for frame in sorted(frameDict.keys()):
            f.write(str(frame) + ', ' + frameDict[frame] + ', ' + urlDict[frame] + '\n')

    return parsedIndexPath
def fetchAndParseIndexFileAux(isSouth, tryToSeparateByLat, baseCurlCmd,
                              folderUrl, path, fileType):
    '''Retrieve the index file for a folder of data and create
    a parsed version of it that contains frame number / filename pairs.

    Arguments:
      isSouth            - Forwarded to hasGoodLat() when separating by latitude.
      tryToSeparateByLat - If true, look for repeated frame numbers and, when
                           one is found, filter the files by latitude.
      baseCurlCmd        - Curl command prefix (a shell string).
      folderUrl          - URL of the remote folder whose listing is fetched.
      path               - Where the html listing is temporarily saved. It is
                           removed again after being read.
      fileType           - One of 'image', 'ortho', 'dem', 'lvis', 'atm1',
                           'atm2'. Any other value yields an empty result.

    Returns (frameDict, urlDict), mapping frame number to file name and
    frame number to folderUrl respectively.

    Raises Exception if, after separating by latitude, two files still share
    a frame number.
    '''

    # Download the html file
    curlCmd = baseCurlCmd + ' ' + folderUrl + ' > ' + path
    logger.info(curlCmd)
    p = subprocess.Popen(curlCmd, shell=True)
    # Wait through the Popen object so that it records the exit status
    # of the child it owns.
    p.wait()

    # Find all the file names in the index file and
    # dump them to a new index file
    logger.info('Extracting file name list from index.html file...')
    with open(path, 'r') as f:
        indexText = f.read()

    # Must wipe this html file. We fetch it too often in different
    # contexts. If not wiped, the code fails to work in some
    # very rare but real situations.
    if os.path.exists(path):
        os.remove(path)

    # Extract just the file names. The patterns are raw strings so that
    # '\w' reaches the regex engine without relying on Python passing
    # unknown string escapes through.
    patterns = {
        'image': r">[0-9_]*.JPG",
        'ortho': r">DMS\w*.tif<",
        'dem':   r">IODMS\w*DEM.tif",
        'lvis':  r">ILVIS\w+.TXT",
        # >ILATM1B_20111018_145455.ATM4BT4.qi
        # or >ILATM1B_20091016_165112.atm4cT3.qi
        'atm1':  r">ILATM1B[0-9_]*.ATM4\w+.qi",
        # Match ILATM1B_20160713_195419.ATM5BT5.h5
        'atm2':  r">ILATM1B[0-9_]*.ATM\w+.h5",
    }
    fileList = []  # ensure initialization
    if fileType in patterns:
        fileList = re.findall(patterns[fileType], indexText, re.IGNORECASE)

    # Get rid of '>' and '<'
    fileList = [name.replace(">", "").replace("<", "") for name in fileList]

    # Some runs, eg, https://n5eil01u.ecs.nsidc.org/ICEBRIDGE/IODMS1B.001/2015.09.24
    # have files for both GR and AN, with same frame number. Those need to be separated
    # by latitude. This is a problem only with orthoimages.
    haveToSeparateByLat = False
    if tryToSeparateByLat:
        seenFrames = {}
        for filename in fileList:
            frame = icebridge_common.getFrameNumberFromFilename2(filename)
            if frame in seenFrames:
                haveToSeparateByLat = True
                logger.info("Found a run with files from both AN and GR.")
                logger.info("Files with same frame number: " +
                            seenFrames[frame] + " and " + filename)
                logger.info("Need to see which to keep.")
                break
            seenFrames[frame] = filename

    badXmls = set()
    outputFolder = os.path.dirname(path)
    if haveToSeparateByLat:
        allFilesToFetch = []
        allUrlsToFetch = []
        for filename in fileList:
            xmlFile = icebridge_common.xmlFile(filename)
            allFilesToFetch.append(os.path.join(outputFolder, xmlFile))
            allUrlsToFetch.append(os.path.join(folderUrl, xmlFile))

        dryRun = False
        icebridge_common.fetchFilesInBatches(baseCurlCmd, MAX_IN_ONE_CALL,
                                             dryRun, outputFolder,
                                             allFilesToFetch, allUrlsToFetch,
                                             logger)

        # Mark the bad ones and wipe them too
        for xmlFile in allFilesToFetch:
            latitude = icebridge_common.parseLatitude(xmlFile)
            if not hasGoodLat(latitude, isSouth):
                badXmls.add(xmlFile)
                if os.path.exists(xmlFile):
                    os.remove(xmlFile)

    # For each entry that matched the regex, record: the frame number and the file name.
    frameDict = {}
    urlDict = {}
    for filename in fileList:

        xmlFile = os.path.join(outputFolder, icebridge_common.xmlFile(filename))
        if xmlFile in badXmls:
            continue

        frame = icebridge_common.getFrameNumberFromFilename2(filename)
        if frame in frameDict and haveToSeparateByLat:
            # This time the same frame must not occur twice
            raise Exception("Found two file names with same frame number: " +
                            frameDict[frame] + " and " + filename)

        frameDict[frame] = filename
        # note that folderUrl can vary among orthoimages, as sometimes
        # some of them are in a folder for the next day.
        urlDict[frame] = folderUrl

    return (frameDict, urlDict)