import os
import warnings


def collect_year(year, data_source='NCDC'):
    """ Collect the GSOD data file for all locations for the specified year.
    Look locally for the tar file first. If it is not there, and its gzip
    version is not either, use the ftp connection to retrieve it from the
    data source.
    """
    filename = info2filepath(year)
    local_folderpath = os.path.join("Data", "GSOD", "gsod_" + str(year))
    local_filepath = os.path.join(local_folderpath, filename)
    if not os.path.isdir(local_folderpath):
        # Folder not already present: create it.
        os.mkdir(local_folderpath)
    if count_op_files(local_folderpath) < 10:
        # Probably not all the data files are present.
        if not os.path.exists(local_filepath):
            # Tar file not present either: download it!
            if data_source == 'NCDC':
                remote_location = str(year)
            print("Retrieving archive %s... This may take several minutes."
                  % local_filepath)
            # Join the remote path with '/' so it stays valid on Windows too.
            remote_target = remote_location + "/" + filename
            retrieve_file(data_source, remote_target, local_filepath)
        untar(local_filepath)
    try:
        panda = datafolder2pandas(local_folderpath)
    except MemoryError:
        # For years where there is a large amount of data, it is not
        # possible to load everything in memory.
        # FIXME: load the data into a memory-mapped/PyTables-backed pandas
        # object in this case? Clarify, because the MemoryError is thrown
        # by mmap. It may be doing this already, but running into mmap
        # limitations?
        warnings.warn("The year %s contains too much data to be loaded "
                      "into a single object in memory." % year)
        panda = None
    return panda
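# `untar` (used by `collect_year` above) is defined elsewhere in this module.
# A minimal sketch of what it is assumed to do -- extract the yearly archive
# into its containing folder. This is a hypothetical stand-in built on the
# standard library, not the module's actual helper:
def _untar_sketch(tar_filepath):
    """ Hypothetical stand-in for the module's `untar` helper. """
    import tarfile
    with tarfile.open(tar_filepath) as archive:
        archive.extractall(os.path.dirname(tar_filepath))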
def collect_year_at_loc(year, location_WMO, location_WBAN,
                        data_source='NCDC', internet_connected=True):
    """ Collect the GSOD data file for the specified location and year.
    Look locally for the file first. If it is not there, and its gzip
    version is not either, untar the yearly archive if it is present and has
    not been untarred, or use the ftp connection to retrieve the file from
    the data source.
    """
    filename = info2filepath(year, location_WMO, location_WBAN)
    folder_location = os.path.join("Data", "GSOD", "gsod_" + str(year))
    filepath = os.path.join(folder_location, filename)
    print("Attempting to collect %s..." % filepath)
    filepath_found = True
    if not os.path.exists(filepath):
        zipped_filepath = filepath + ".gz"
        if os.path.exists(zipped_filepath):
            unzip(zipped_filepath)
        elif os.path.exists(os.path.join(folder_location,
                                         "gsod_" + str(year) + ".tar")):
            # Possible not to rely on outside servers: untar the archive if
            # there are no .op or .op.gz files yet. If there are some and
            # the requested one is absent, it is missing from the dataset.
            # Note: os.path.splitext only returns the last extension, so
            # endswith is used to also match the double ".op.gz" suffix.
            there_are_op_files = False
            for existing_filename in os.listdir(folder_location):
                if existing_filename.endswith((".op", ".op.gz")):
                    there_are_op_files = True
                    break
            if not there_are_op_files:
                untar(os.path.join(folder_location,
                                   "gsod_" + str(year) + ".tar"))
            if os.path.isfile(zipped_filepath):
                unzip(zipped_filepath)
            else:
                warnings.warn("File %s is missing from the dataset: skipping "
                              "this location." % zipped_filepath)
                filepath_found = False
        elif internet_connected:
            target_folder = os.path.join("Data", "GSOD", "gsod_" + str(year))
            if not os.path.exists(target_folder):
                print("Creating locally the folder %s." % target_folder)
                os.mkdir(target_folder)
            # Download the file from NCDC.
            if data_source == 'NCDC':
                remote_location = str(year)
            # Join the remote path with '/' so it stays valid on Windows too.
            remote_target = remote_location + "/" + filename + ".gz"
            retrieve_file(data_source, remote_target, zipped_filepath)
            if os.path.isfile(zipped_filepath):
                unzip(zipped_filepath)
            else:
                filepath_found = False
        else:
            filepath_found = False
    if filepath_found:
        return datafile2pandas(filepath)
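# `unzip` (used by `collect_year_at_loc` above) is defined elsewhere in this
# module. A minimal sketch of what it is assumed to do -- decompress
# `<name>.op.gz` to `<name>.op` alongside it. This is a hypothetical
# stand-in built on the standard library, not the module's actual helper:
def _unzip_sketch(zipped_filepath):
    """ Hypothetical stand-in for the module's `unzip` helper. """
    import gzip
    import shutil
    target = zipped_filepath[:-len(".gz")]
    with gzip.open(zipped_filepath, "rb") as src:
        with open(target, "wb") as dst:
            shutil.copyfileobj(src, dst)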
def count_op_files(folder):
    """ Count the .op data files (possibly gzipped) present in a folder. """
    return len([filename for filename in os.listdir(folder)
                if os.path.splitext(filename)[1] in [".op", ".gz"]])
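# `datafile2pandas` (used by `collect_year_at_loc` above) is defined
# elsewhere in this module. A minimal sketch of what it is assumed to do --
# parse one .op data file, treated here as whitespace-delimited with a
# single header line, into a DataFrame. Hypothetical stand-in, not the
# module's actual parser:
def _datafile2pandas_sketch(filepath):
    """ Hypothetical stand-in for the module's `datafile2pandas` helper. """
    import pandas
    return pandas.read_csv(filepath, delim_whitespace=True)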
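# Example usage -- a sketch, assuming the "Data/GSOD" tree and the
# FTP-backed `retrieve_file` described above; the WMO/WBAN codes below are
# placeholders, not part of the original module:
if __name__ == "__main__":
    # Collect a single station-year as a DataFrame.
    frame = collect_year_at_loc(2010, "724940", "23234")
    if frame is not None:
        print(frame.head())
    # Or collect the whole year for all stations (slow, and may return
    # None if the data does not fit in memory).
    panda = collect_year(2010)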