Example 1
    def _fit_transform(self):
        # Requires: import pandas as pd; from keras.preprocessing import text
        X, Y = self.set_X_Y(self.X, self.Y)
        X_Train, Y_Train, X_Test, Y_Test = self._fit(X, Y)
        max_words = 1000
        tokenize = text.Tokenizer(num_words=max_words, char_level=False)
        # Fit the vocabulary on the local training split; the original read
        # self.X_Train, which this method only assigns further down.
        tokenize.fit_on_texts(pd.Series([y for x in X_Train for y in x]))
        # Flatten the nested token lists and encode each text as a
        # bag-of-words row of width max_words.
        self.X_Train = tokenize.texts_to_matrix(
            pd.Series([y for x in X_Train for y in x]))
        self.X_Test = tokenize.texts_to_matrix(
            pd.Series([y for x in X_Test for y in x]))
        # Store the labels from _fit so the returned attributes are defined.
        self.Y_Train, self.Y_Test = Y_Train, Y_Test

        return self.X_Train, self.X_Test, self.Y_Train, self.Y_Test
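
For reference, a minimal standalone sketch of what Tokenizer.texts_to_matrix produces in its default binary mode (the sample texts are made up):

from keras.preprocessing import text

tok = text.Tokenizer(num_words=5)
tok.fit_on_texts(["the cat sat", "the dog ran"])
# Index 0 is reserved, so column 0 is always zero; column j is 1 when the
# word with index j occurs in the text.
print(tok.texts_to_matrix(["the cat"]))  # e.g. [[0. 1. 1. 0. 0.]]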
Example 2
from keras.preprocessing import text
from sklearn.preprocessing import LabelEncoder

# body_file and label_file are assumed to be DataFrames holding the posts
# and their labels; the first half is used for training, the rest for test.
train_size = int(len(body_file) * .5)
print("Train size: %d" % train_size)
print("Test size: %d" % (len(body_file) - train_size))

train_posts = body_file['post'][:train_size]
train_tags = label_file['label'][:train_size]

test_posts = body_file['post'][train_size:]
test_tags = label_file['label'][train_size:]

print(len(train_posts), len(train_tags))
print(len(test_posts), len(test_tags))

max_words = 10  # deliberately tiny vocabulary for demonstration
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts)  # only fit on train

# Renamed from `text`, which shadowed the imported keras `text` module.
sample_text = "the"
print(tokenize.texts_to_matrix([sample_text]))

x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

print(tokenize.word_index)

# Map string labels to integer class ids, fitted on the training tags only.
encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)
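
If the downstream model ends in a softmax, these integer class ids are typically one-hot encoded next; a minimal sketch using keras.utils.to_categorical with made-up labels (this step is not part of the original snippet):

from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(["spam", "ham", "spam"])  # classes_ sorted: ham=0, spam=1
print(to_categorical(y, num_classes=len(encoder.classes_)))
# [[0. 1.]
#  [1. 0.]
#  [0. 1.]]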
Example 3
    def __init__(self, init_seed, maxlen, nb_words, skip_top, test_split):
        # Requires: import numpy as np
        #           from keras.preprocessing import text, sequence
        # Reserved indices, following the Keras IMDb convention:
        self.start_char = 1   # marks the start of every sequence
        self.oov_char = 2     # replaces out-of-vocabulary words
        self.index_from = 3   # real word indices begin here (0 is padding)

        files = [
            "Dennis+Schwartz", "James+Berardinelli", "Scott+Renshaw",
            "Steve+Rhodes"
        ]
        texts, ratings = [], []
        for file in files:
            with open("data/scaledata/" + file + "/subj." + file, "r") as f:
                texts += list(f)
            with open("data/scaledata/" + file + "/rating." + file, "r") as f:
                ratings += list(f)
        # filters='' strips nothing, so punctuation stays inside the tokens;
        # each text becomes a list of word indices.
        tokenizer = text.Tokenizer(filters='')
        tokenizer.fit_on_texts(texts)
        X = tokenizer.texts_to_sequences(texts)
        Y = [float(rating) for rating in ratings]

        # Shuffle data: reseeding with the same seed before each shuffle
        # applies the same permutation to X and Y, keeping them aligned.
        np.random.seed(init_seed)
        np.random.shuffle(X)
        np.random.seed(init_seed)
        np.random.shuffle(Y)

        # Prepend the start marker and offset every word index by index_from
        X = [[self.start_char] + [w + self.index_from for w in x] for x in X]

        # Split each review into non-overlapping chunks of at most maxlen
        # words; every chunk inherits the review's rating.
        new_X = []
        new_Y = []
        for x, y in zip(X, Y):
            for i in range(0, len(x), maxlen):
                new_X.append(x[i:i + maxlen])
                new_Y.append(y)
        # Keep X as a list of lists: the chunks are ragged, and np.array on
        # ragged sequences fails in recent NumPy; pad_sequences below accepts
        # plain lists. Y is numeric and rectangular, so an array is fine.
        X = new_X
        Y = np.array(new_Y)
        # By convention, 2 is the OOV index; the first index_from (=3 by
        # default) indices are reserved: 0 (padding), 1 (start), 2 (OOV).
        X = [[
            self.oov_char if (w >= nb_words or w < skip_top) else w for w in x
        ] for x in X]

        split_at = int(len(X) * (1 - test_split))
        self.X_train = X[:split_at]
        self.Y_train = Y[:split_at]
        self.mean_y_train = np.mean(self.Y_train)
        self.std_y_train = np.std(self.Y_train)
        # Standardize the targets using training statistics only.
        self.Y_train = (self.Y_train - self.mean_y_train) / self.std_y_train

        self.X_test = X[split_at:]
        self.Y_test = Y[split_at:]
        self.Y_test = (self.Y_test - self.mean_y_train) / self.std_y_train

        print(len(self.X_train), 'train sequences')
        print(len(self.X_test), 'test sequences')

        print("Pad sequences (samples x time)")
        self.X_train = sequence.pad_sequences(self.X_train, maxlen=maxlen)
        self.X_test = sequence.pad_sequences(self.X_test, maxlen=maxlen)
        print('X_train shape:', self.X_train.shape)
        print('X_test shape:', self.X_test.shape)
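
A hypothetical usage sketch for this loader; the class name MovieReviewData is an assumption (the original shows only __init__), and the data/scaledata files must exist on disk:

# MovieReviewData is a hypothetical name for the class owning the
# __init__ above.
data = MovieReviewData(init_seed=42, maxlen=200, nb_words=20000,
                       skip_top=0, test_split=0.2)
print(data.X_train.shape)  # (n_train_chunks, 200) after padding
# Targets were standardized with training statistics; to undo:
original_ratings = data.Y_test * data.std_y_train + data.mean_y_train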