Example #1
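The test below exercises torchtext's GloVe vector lookup. For orientation, here is a minimal standalone sketch of the API under test (assuming torchtext is installed; the twitter.27B vectors are a large download, cached under .vector_cache by default):

from torchtext.vocab import GloVe

vec = GloVe(name='twitter.27B', dim='25')
# Batch lookup returns a (len(tokens), dim) float tensor; tokens missing from
# the vocabulary map to the zero vector, unless lower_case_backup finds a
# lower-cased match.
batch = vec.get_vecs_by_tokens(['chip', 'baby'], lower_case_backup=True)
print(batch.shape)  # torch.Size([2, 25])

The test itself: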
    def test_vectors_get_vecs(self):
        vec = GloVe(name='twitter.27B', dim='25')
        self.assertEqual(vec.vectors.shape[0], len(vec))

        tokens = ['chip', 'baby', 'Beautiful']
        token_vecs = vec.get_vecs_by_tokens(tokens).numpy()
        self.assertEqual(token_vecs.shape[0], len(tokens))
        self.assertEqual(token_vecs.shape[1], vec.dim)
        assert_allclose(vec[tokens[0]].numpy(), token_vecs[0])
        assert_allclose(vec[tokens[1]].numpy(), token_vecs[1])
        # 'Beautiful' is out of vocabulary, so its looked-up vector is the
        # zero vector, which is also what '<unk>' maps to.
        assert_allclose(vec['<unk>'].numpy(), token_vecs[2])

        token_one_vec = vec.get_vecs_by_tokens(tokens[0],
                                               lower_case_backup=True).numpy()
        self.assertEqual(token_one_vec.shape[0], vec.dim)
        assert_allclose(vec[tokens[0].lower()].numpy(), token_one_vec)

        # Delete the vectors after we're done to save disk space on CI.
        # These paths match the twitter.27B vectors loaded above.
        if os.environ.get("TRAVIS") == "true":
            zip_file = os.path.join(self.project_root, ".vector_cache",
                                    "glove.twitter.27B.zip")
            conditional_remove(zip_file)
            for dim in ["25", "50", "100", "200"]:
                conditional_remove(
                    os.path.join(self.project_root, ".vector_cache",
                                 "glove.twitter.27B.{}d.txt".format(dim)))
Example #2
    def _process_movie_fea(self):
        """Generate the movie feature matrix.

        Uses ``self.movie_info`` (pd.DataFrame) and ``self._name`` (str);
        the method takes no parameters.

        Returns
        -------
        movie_features : np.ndarray
            Concatenation of the mean GloVe title embedding (300-d), the
            normalized release year, and the genre indicator columns.
        """
        if self._name == 'ml-100k':
            GENRES = GENRES_ML_100K
        elif self._name == 'ml-1m':
            GENRES = GENRES_ML_1M
        elif self._name == 'ml-10m':
            GENRES = GENRES_ML_10M
        else:
            raise NotImplementedError

        # The torchtext Field is used here only for its spaCy tokenizer.
        TEXT = data.Field(tokenize='spacy')
        embedding = GloVe(name='840B', dim=300)

        # Pre-allocate a 300-d title embedding and a scalar release year per movie.
        title_embedding = np.zeros(shape=(self.movie_info.shape[0], 300),
                                   dtype=np.float32)
        release_years = np.zeros(shape=(self.movie_info.shape[0], 1),
                                 dtype=np.float32)
        # Titles look like "Some Title (1995)"; capture the name and the year.
        p = re.compile(r'(.+)\s*\((\d+)\)')
        for i, title in enumerate(self.movie_info['title']):
            match_res = p.match(title)
            if match_res is None:
                print('{} cannot be matched, index={}, name={}'.format(
                    title, i, self._name))
                title_context, year = title, 1950  # fall back to a default year
            else:
                title_context, year = match_res.groups()
            # Use the mean of the title tokens' GloVe vectors as the title
            # embedding; out-of-vocabulary tokens contribute zero vectors.
            title_embedding[i, :] = embedding.get_vecs_by_tokens(
                TEXT.tokenize(title_context)).numpy().mean(axis=0)
            release_years[i] = float(year)
        movie_features = np.concatenate(
            (title_embedding,
             (release_years - 1950.0) / 100.0,  # roughly rescale years to [0, 1]
             self.movie_info[GENRES]),
            axis=1)
        return movie_features
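
As an aside, the mean-of-GloVe title embedding above can be written without the torchtext Field machinery; this is a simplified sketch where plain whitespace tokenization stands in for the spaCy tokenizer used in the example:

import numpy as np
from torchtext.vocab import GloVe

embedding = GloVe(name='840B', dim=300)

def title_to_vec(title):
    # Whitespace tokenization is a simplification of the spaCy tokenizer.
    tokens = title.split()
    vecs = embedding.get_vecs_by_tokens(tokens).numpy()  # (len(tokens), 300)
    # Out-of-vocabulary tokens are zero vectors and pull the mean toward zero.
    return vecs.mean(axis=0)

print(title_to_vec('Toy Story').shape)  # (300,)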
Example #3
class TweetsAsCharsAndWordsDataset(Dataset):
    def __init__(self,
                 data_path,
                 alphabet_path,
                 is_labeled=True,
                 l0=501,
                 l1=131,
                 max_samples=None,
                 word_emb_name="twitter.27B",
                 word_emb_dim=200,
                 vector_cache_path=None):
        """A dataset object whose samples consist of *both*
            - the (padded) sequence of word vectors of a tweet, and
            - the per-character one-hot encoding of the same tweet.

        Arguments:
            data_path: path to the CSV file containing the tweets (and labels).
            alphabet_path: path to the alphabet JSON file.
            is_labeled: whether the data_path file contains labels, or only the tweets.
            l0: max length of a sample, in number of characters.
            l1: max length of a sample, in number of words.
            max_samples: (for development) keep only max_samples samples, chosen after shuffling.

            word_emb_name: name of the word embedding to use, passed to torchtext's GloVe.
            word_emb_dim: dimension of the word embedding to use, passed to torchtext's GloVe.
            vector_cache_path: path to the cache directory, passed to torchtext's GloVe.
        """
        self.glove = GloVe(name=word_emb_name,
                           dim=word_emb_dim,
                           cache=vector_cache_path)
        print("loaded pretrained GloVe word-embeddings.")
        self.data_path = data_path
        self.alphabet_path = alphabet_path
        self.is_labeled = is_labeled
        self.l0 = l0
        self.l1 = l1
        with open(alphabet_path) as f:
            self.alphabet = ''.join(json.load(f))
        self.raw_nb_feats = len(self.alphabet)
        self.pro_nb_feats = word_emb_dim
        # TODO: selecting only the first max_samples rows via
        # pd.read_csv(data_path, nrows=max_samples) would only make sense if
        # the csv itself was shuffled.
        X_txt = pd.read_csv(data_path)
        if max_samples:
            assert is_labeled, ("`max_samples` must not be used on unlabeled "
                                "(test) data, since shuffling would change "
                                "the samples' ordering")
            # shuffle, then keep the first max_samples rows
            X_txt = X_txt.sample(frac=1).reset_index(drop=True).iloc[:max_samples]
        self.y = X_txt['label'].to_numpy().astype(
            np.int64, copy=False) if is_labeled else None
        self.X_pro = X_txt['preprocessed_segmented_tweet'].to_numpy()
        self.X_raw = X_txt['raw_tweet'].to_numpy()

    def __len__(self):
        return self.X_raw.shape[0]

    def __getitem__(self, idx):
        X_raw = self.get_item_raw(idx)
        X_pro = self.get_item_pro(idx)
        # Even though X consists of two distinct parts, return (X, y) so that
        # auxiliary functions work without modification.
        if self.is_labeled:
            return (X_raw, X_pro), self.y[idx]
        else:
            return (X_raw, X_pro)

    def get_item_pro(self, idx):
        words = self.X_pro[idx].lower().split()[:self.l1]  # truncate to l1 words
        # pad with empty tokens, which GloVe maps to zero vectors
        words += [""] * (self.l1 - len(words))
        assert len(words) == self.l1
        X = self.glove.get_vecs_by_tokens(words, lower_case_backup=True)
        # Uncomment to print out-of-vocabulary (all-zero) rows:
        # for i in np.where(~X.bool().any(axis=1))[0]:
        #     if words[i] != "":
        #         print("out-of-vocabulary:", i, words[i])
        assert X.shape == (self.l1, self.glove.dim)
        return X

    def get_item_raw(self, idx):
        seq = self.X_raw[idx]
        X = self.oneHotEncode(seq)
        # NOTE: this is the transpose of what Xiaochen did
        assert X.shape == (self.l0, self.raw_nb_feats)
        return X

    def char2idx(self, character):
        # str.find returns -1 when the character is not in the alphabet
        return self.alphabet.find(character)

    def oneHotEncode(self, seq):
        X = torch.zeros(self.l0, self.raw_nb_feats)
        # Encode characters in reverse order; drop anything beyond l0
        # characters so the indexing below cannot overflow.
        for i, char in enumerate(seq[::-1][:self.l0]):
            char_idx = self.char2idx(char)
            if char_idx != -1:  # skip chars not present in self.alphabet
                X[i, char_idx] = 1.0
        return X
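
A hypothetical usage sketch (the file paths are placeholders, and the CSV is assumed to contain the columns referenced above: 'label', 'preprocessed_segmented_tweet', 'raw_tweet'):

from torch.utils.data import DataLoader

# Placeholder paths; substitute real data and alphabet files.
dataset = TweetsAsCharsAndWordsDataset(data_path="tweets.csv",
                                       alphabet_path="alphabet.json")
loader = DataLoader(dataset, batch_size=32, shuffle=True)
(X_raw, X_pro), y = next(iter(loader))
print(X_raw.shape)  # (32, l0, len(alphabet)): one-hot characters
print(X_pro.shape)  # (32, l1, word_emb_dim): stacked word vectors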