def collect_year(year, data_source='NCDC'):
    """ Collect the GSOD data for all locations for the specified year.

    The local folder Data/GSOD/gsod_<year> is created if it is missing. If it
    holds fewer than 10 op files (as reported by count_op_files), the year's
    archive is untarred into place; the archive itself is first downloaded
    from the data source when it is not already in that folder.

    year: the year to collect (converted with str() to build the paths).
    data_source: name of the remote source. Only 'NCDC' is supported.

    Returns the result of datafolder2pandas on the year's folder, or None
    (after issuing a warning) if loading it raises a MemoryError.

    Raises ValueError if a download is needed and data_source is not
    supported.
    """
    filename = info2filepath(year)
    local_folderpath = os.path.join("Data", "GSOD", "gsod_"+str(year))
    local_filepath = os.path.join(local_folderpath, filename)
    if not os.path.isdir(local_folderpath):
        # Folder not already present. makedirs also creates the parent
        # Data/GSOD folders, so a fresh checkout does not fail here.
        os.makedirs(local_folderpath)
    if count_op_files(local_folderpath) < 10:
        # probably not all the data files are present
        if not os.path.exists(local_filepath):
            # tar file not present either: download it!
            if data_source == 'NCDC':
                remote_location = str(year)
            else:
                # Fail with a clear message rather than an unbound local
                # variable further down.
                raise ValueError("Unsupported data source: %r"
                                 % (data_source,))
            print("Retrieving archive %s... This may take several minutes."
                  % local_filepath)
            # NOTE(review): os.path.join uses the local OS separator; confirm
            # retrieve_file accepts that for the remote path on Windows.
            remote_target = os.path.join(remote_location, filename)
            retrieve_file(data_source, remote_target, local_filepath)
        untar(local_filepath)
    try:
        panda = datafolder2pandas(local_folderpath)
    except MemoryError:
        # For years where there is a large amount of data, it is not possible to
        # load everything in memory
        # FIXME: load the data in a memory mapped/pytable stored pandas in this
        # case? Clarify because the memory error is thrown by mmap. It may be
        # doing this already, but be running into mmap limitations?
        warnings.warn("The year %s contains too much data to be loaded into a "
                      "single object in memory" % year)
        panda = None
    return panda
def collect_year_at_loc(year, location_WMO, location_WBAN, data_source='NCDC',
                        internet_connected=True):
    """ Collect the GSOD data file for the specified location and year.

    Look locally for the file first. If it is not there, try in order:
    unzip its gzip version if present; otherwise, if the year's tar archive
    is present, untar it (only when the folder holds no op or op.gz files
    yet) and unzip the file; otherwise, if internet_connected is true,
    download the gzip version from the data source and unzip it.

    year: the year to collect (converted with str() to build the paths).
    location_WMO, location_WBAN: location identifiers passed to
        info2filepath to build the file name.
    data_source: name of the remote source. Only 'NCDC' is supported.
    internet_connected: whether downloading the file is allowed.

    Returns the result of datafile2pandas on the file, or None if the file
    could not be found or retrieved.

    Raises ValueError if a download is needed and data_source is not
    supported.
    """
    # NOTE(review): a second definition of collect_year_at_loc appears later
    # in this file; if both are kept, the later one replaces this one at
    # import time.
    filename = info2filepath(year, location_WMO, location_WBAN)
    folder_location = os.path.join("Data", "GSOD", "gsod_"+str(year))
    filepath = os.path.join(folder_location, filename)
    print("Attempting to collect %s..." % filepath)

    if not os.path.exists(filepath):
        zipped_filepath = filepath+".gz"
        tar_filepath = os.path.join(folder_location, "gsod_"+str(year)+".tar")
        if os.path.exists(zipped_filepath):
            unzip(zipped_filepath)
        elif os.path.exists(tar_filepath):
            # Possible not to rely on outside servers: untar the archive if
            # there are no op.gz or op files yet. If there are some, the
            # archive was already extracted and this file is simply missing.
            # endswith is used because os.path.splitext only returns the last
            # suffix (".gz"), so it can never report ".op.gz".
            there_are_op_files = any(
                entry.endswith((".op", ".op.gz"))
                for entry in os.listdir(folder_location))
            if not there_are_op_files:
                untar(tar_filepath)
            if os.path.isfile(zipped_filepath):
                unzip(zipped_filepath)
            else:
                warnings.warn("File %s is missing from the dataset: skipping "
                              "this location." % zipped_filepath)
                return None
        elif internet_connected:
            if data_source == 'NCDC':
                remote_location = str(year)
            else:
                # Fail with a clear message rather than an unbound local
                # variable further down.
                raise ValueError("Unsupported data source: %r"
                                 % (data_source,))
            if not os.path.exists(folder_location):
                print("Creating locally the folder %s." % folder_location)
                # makedirs also creates the parent Data/GSOD folders
                os.makedirs(folder_location)
            # Download the gzipped file from the data source
            remote_target = os.path.join(remote_location, filename+".gz")
            retrieve_file(data_source, remote_target, zipped_filepath)
            if os.path.isfile(zipped_filepath):
                unzip(zipped_filepath)
            else:
                # The download did not produce the file
                return None
        else:
            # Nothing available locally and not allowed to download
            return None

    return datafile2pandas(filepath)
def collect_year_at_loc(year, location_WMO, location_WBAN, data_source='NCDC',
                        internet_connected=True):
    """ Collect the data GSOD data file for specified location and specified
    year. Look locally for the file first. If it is not there, and its gzip
    version is not either, untar the file if it is present and has not been
    untared, or use the ftp connection to retrieve it from data source.
    """
    filename = info2filepath(year, location_WMO, location_WBAN)
    folder_location = os.path.join("Data", "GSOD") #, "gsod_"+str(year))
    filepath = os.path.join(folder_location, filename)
    log.info("Attempting to collect %s..." % filepath)
    filepath_found = False
    
    if not os.path.exists(filepath):
        zipped_filepath = filepath+".gz"
        tar_filepath = os.path.join(folder_location,"gsod_"+str(year)+".tar")
        if os.path.exists(zipped_filepath):
            # Read from the zip
            filepath = gzip.open(zipped_filepath)
            filepath_found = True
        elif os.path.exists(tar_filepath):
            # Possible not to rely on outside servers: load the file from the
            # tarfile
            archive = tarfile.TarFile(tar_filepath)
            try:
                gzf = archive.extractfile(filename+'.gz')
                filepath = gzip.GzipFile(fileobj = gzf)
                filepath_found = True
            except KeyError, e:
                # Some archives have a './' at the beginning
                try:
                    gzf = archive.extractfile(os.path.join('.', filename+'.gz'))
                    filepath = gzip.GzipFile(fileobj = gzf)
                    filepath_found = True
                except KeyError, e:
                    log.warn("File %s is missing from the dataset: skipping "
                              "this location." % zipped_filepath)