Example #1
def maybe_download_and_store_tar(url: str,
                                 root_key: str,
                                 description: str = None,
                                 use_subkeys=True,
                                 **kwargs) -> List[str]:
    # Validate the existing keys: traverse the key dictionary and check the
    # integrity of each of the stored files.
    old_keys: List[str] = []
    if DATA_STORE.is_valid(root_key) and validate_subkeys(root_key, old_keys):
        return old_keys

    # This is where the hard work happens
    # First, we have to download the file into the working directory
    data_path = maybe_download(url.split('/')[-1],
                               url,
                               DATA_STORE.working_directory,
                               postprocess=untar,
                               **kwargs)

    # The data path gives us the root key
    keys: List[str] = []
    if use_subkeys:
        keys = register_to_datastore(data_path, root_key, description)
    else:
        DATA_STORE.create_key(root_key, '', force=True)

    return [os.path.join(root_key, k) for k in keys] + [root_key]
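A minimal usage sketch of the function above; the URL, root key, and description are placeholders, not values from the original project:

keys = maybe_download_and_store_tar(
    url='https://example.com/archive.tar.gz',   # hypothetical URL
    root_key='datasets/example',                # hypothetical root key
    description='Example tarball',
    use_subkeys=True)
# Every returned key (including root_key itself) can be resolved to a path on disk.
for key in keys:
    print(DATA_STORE[key])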
Example #2
def maybe_download_and_store_single_file(url: str,
                                         key: str,
                                         description: str = None,
                                         postprocess=None,
                                         **kwargs) -> str:
    if not DATA_STORE.is_valid(key):
        # This is where the hard work happens
        # First, we have to download the file into the working directory
        if postprocess is None:
            data_path = maybe_download(
                url.split('/')[-1], url, DATA_STORE.working_directory)
        else:
            data_path = maybe_download(url.split('/')[-1],
                                       url,
                                       DATA_STORE.working_directory,
                                       postprocess=postprocess,
                                       **kwargs)
        DATA_STORE.add_file(key, data_path, description, force=True)
    return key
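A short, hedged sketch of calling the single-file variant; the URL and key are hypothetical:

key = maybe_download_and_store_single_file(
    url='https://example.com/vocab.txt',    # hypothetical URL
    key='vocab/example',                    # hypothetical key
    description='Example vocabulary file')
vocab_path = DATA_STORE[key]                # path of the stored file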
Example #3
def maybe_download_and_store_zip(url: str,
                                 root_key: str,
                                 description: str = None,
                                 use_subkeys=True,
                                 **kwargs) -> List[str]:
    old_keys: List[str] = []
    if DATA_STORE.is_valid(root_key) and validate_subkeys(root_key, old_keys):
        return old_keys

    # TODO (Karen): ensure a one-level directory structure inside the zip file?
    data_path = maybe_download(file_name=url.split("/")[-1],
                               source_url=url,
                               work_directory=DATA_STORE.working_directory,
                               postprocess=unzip,
                               **kwargs)
    keys: List[str] = []
    if use_subkeys:
        keys = register_to_datastore(data_path, root_key, description)
        # Note (Karen): the previous DATA_STORE.create_key(root_key, 'root.key', force=True)
        # call was removed here because it deleted the files already stored by
        # register_to_datastore above.
    else:
        DATA_STORE.add_folder(root_key, data_path, force=True)

    return [os.path.join(root_key, k) for k in keys]
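A hedged usage sketch. Note that, unlike the tar variant above, the return value does not include root_key itself, so with use_subkeys=False the returned list is empty and the data is reachable only through root_key:

keys = maybe_download_and_store_zip(
    url='https://example.com/images.zip',   # hypothetical URL
    root_key='datasets/images',             # hypothetical root key
    description='Example zip archive',
    use_subkeys=False)                      # store the extracted folder whole
images_path = DATA_STORE['datasets/images']  # assuming indexing by root_key resolves the stored folder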
Example #4
    def __init__(self, version: str = 'wikipedia', dimension: int = 300) -> None:

        self.version = version
        self.dimension = dimension
        self.embedding_matrix: Optional[np.ndarray] = None

        if self.version == 'wikipedia':
            # Make sure that the dimension is valid
            if self.dimension not in GloveEmbedding.wikipedia_dimensions:
                raise ValueError('Error: Invalid GloVe dimension ({}) for Wikipedia dataset. Must be one of {}'.format(self.dimension,
                                                                                                                       GloveEmbedding.wikipedia_dimensions))

            if not DATA_STORE.is_valid('glove/wikipedia/dim{}'.format(self.dimension)):
                # Download the file into the working directory
                maybe_download(file_name='glove.6B.zip', source_url='http://nlp.stanford.edu/data/glove.6B.zip',
                               work_directory=DATA_STORE.working_directory, postprocess=unzip)

                # Read the data keys from the file
                log_message('Loading vectors...')
                self.encoder: Dict[str, np.ndarray] = {}
                with open(os.path.join(DATA_STORE.working_directory, 'glove.6B/glove.6B.{}d.txt'.format(self.dimension)), 'r') as glove_file:
                    for line in glove_file:
                        tokens = line.split()
                        self.encoder[tokens[0]] = np.array(
                            [float(x) for x in tokens[1:]])

                # Save the encoder
                with open(DATA_STORE.create_key('glove/wikipedia/dim{}'.format(self.dimension), 'encoder.pkl', force=True), 'wb') as pkl_file:
                    pickle.dump(self.encoder, pkl_file)
                DATA_STORE.update_hash('glove/wikipedia/dim{}'.format(self.dimension))

            else:
                with open(DATA_STORE['glove/wikipedia/dim{}'.format(self.dimension)], 'rb') as pkl_file:
                    self.encoder = pickle.load(pkl_file)

        elif self.version == 'common-small':
            # Make sure that the dimension is valid
            if self.dimension not in GloveEmbedding.common_small_dimensions:
                raise ValueError('Error: Invalid GloVe dimension ({}) for Common-Crawl Small dataset. Must be one of {}'.format(self.dimension,
                                                                                                                                GloveEmbedding.common_small_dimensions))

            if not DATA_STORE.is_valid('glove/common-small/dim{}'.format(self.dimension)):
                # Download the file into the working directory
                maybe_download(file_name='glove.42B.300d.zip', source_url='http://nlp.stanford.edu/data/glove.42B.300d.zip',
                               work_directory=DATA_STORE.working_directory, postprocess=unzip)

                # Read the data keys from the file
                log_message('Loading vectors...')
                self.encoder: Dict[str, np.ndarray] = {}
                with open(os.path.join(DATA_STORE.working_directory, 'glove.42B.300d/glove.42B.{}d.txt'.format(self.dimension)), 'r') as glove_file:
                    for line in glove_file:
                        tokens = line.split()
                        self.encoder[tokens[0]] = np.array(
                            [float(x) for x in tokens[1:]])

                # Save the encoder
                with open(DATA_STORE.create_key('glove/common-small/dim{}'.format(self.dimension), 'encoder.pkl', force=True), 'wb') as pkl_file:
                    pickle.dump(self.encoder, pkl_file)
                DATA_STORE.update_hash('glove/common-small/dim{}'.format(self.dimension))

            else:
                with open(DATA_STORE['glove/common-small/dim{}'.format(self.dimension)], 'rb') as pkl_file:
                    self.encoder = pickle.load(pkl_file)

        elif self.version == 'common-large':
            # Make sure that the dimension is valid
            if self.dimension not in GloveEmbedding.common_large_dimensions:
                raise ValueError('Error: Invalid GloVe dimension ({}) for Common-Crawl Large dataset. Must be one of {}'.format(self.dimension,
                                                                                                                                GloveEmbedding.common_large_dimensions))

            if not DATA_STORE.is_valid('glove/common-large/dim{}'.format(self.dimension)):
                # Download the file into the working directory
                maybe_download(file_name='glove.840B.300d.zip', source_url='http://nlp.stanford.edu/data/glove.840B.300d.zip',
                               work_directory=DATA_STORE.working_directory, postprocess=unzip)

                # Read the data keys from the file
                log_message('Loading vectors...')
                self.encoder: Dict[str, np.ndarray] = {}
                with open(os.path.join(DATA_STORE.working_directory, 'glove.840B.300d/glove.840B.{}d.txt'.format(self.dimension)), 'r') as glove_file:
                    for line in glove_file:
                        tokens = line.split()
                        self.encoder[tokens[0]] = np.array(
                            [float(x) for x in tokens[1:]])

                # Save the encoder
                with open(DATA_STORE.create_key('glove/common-large/dim{}'.format(self.dimension), 'encoder.pkl', force=True), 'wb') as pkl_file:
                    pickle.dump(self.encoder, pkl_file)
                DATA_STORE.update_hash('glove/common-large/dim{}'.format(self.dimension))

            else:
                with open(DATA_STORE['glove/common-large/dim{}'.format(self.dimension)], 'rb') as pkl_file:
                    self.encoder = pickle.load(pkl_file)

        elif self.version == 'twitter':
            # Make sure that the dimension is valid
            if self.dimension not in GloveEmbedding.twitter_dimensions:
                raise ValueError('Error: Invalid GloVe dimension ({}) for Twitter dataset. Must be one of {}'.format(self.dimension,
                                                                                                                     GloveEmbedding.twitter_dimensions))

            if not DATA_STORE.is_valid('glove/twitter/dim{}'.format(self.dimension)):
                # Download the file into the working directory
                maybe_download(file_name='glove.twitter.27B.zip', source_url='http://nlp.stanford.edu/data/glove.twitter.27B.zip',
                               work_directory=DATA_STORE.working_directory, postprocess=unzip)

                # Read the data keys from the file
                log_message('Loading vectors...')
                self.encoder: Dict[str, np.ndarray] = {}
                with open(os.path.join(DATA_STORE.working_directory, 'glove.twitter.27B/glove.twitter.27B.{}d.txt'.format(self.dimension)), 'r') as glove_file:
                    for line in glove_file:
                        tokens = line.split()
                        self.encoder[tokens[0]] = np.array(
                            [float(x) for x in tokens[1:]])

                # Save the encoder
                with open(DATA_STORE.create_key('glove/twitter/dim{}'.format(self.dimension), 'encoder.pkl', force=True), 'wb') as pkl_file:
                    pickle.dump(self.encoder, pkl_file)
                DATA_STORE.update_hash('glove/twitter/dim{}'.format(self.dimension))

            else:
                with open(DATA_STORE['glove/twitter/dim{}'.format(self.dimension)], 'rb') as pkl_file:
                    self.encoder = pickle.load(pkl_file)
        else:
            raise ValueError('Error: Invalid GloVe version ({}). Must be one of {}'.format(
                version, GloveEmbedding.valid_versions))
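A hedged usage sketch of the class above. The class-level dimension lists (e.g. GloveEmbedding.wikipedia_dimensions) are defined elsewhere in the project; 300 is assumed to be a valid Wikipedia dimension:

glove = GloveEmbedding(version='wikipedia', dimension=300)
vector = glove.encoder.get('coffee')   # None if the word is out of vocabulary
if vector is not None:
    print(vector.shape)                # (300,)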
Example #5
    def test_maybe_download(self):
        file_path = maybe_download("sample.txt", self.sample_download_location,
                                   self.working_directory)
        if not self.passed:
            self.assertTrue(compare(file_path, self.sample_txt))
        return
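The test relies on a compare helper and on fixtures (sample_download_location, working_directory, sample_txt, passed) defined in the surrounding test class. A minimal sketch of such a helper, assuming both arguments are file paths:

import filecmp

def compare(path_a: str, path_b: str) -> bool:
    # Hypothetical helper: byte-for-byte comparison of two files on disk.
    return filecmp.cmp(path_a, path_b, shallow=False)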