def __init__(self, config):
    self.config = config
    # Load the raw examples from the positive and negative data files
    positive_data_file = self.config.positive_data_file
    negative_data_file = self.config.negative_data_file
    with open(positive_data_file, "r") as f:
        positive_examples = [s.strip() for s in f.readlines()]
    with open(negative_data_file, "r") as f:
        negative_examples = [s.strip() for s in f.readlines()]
    # Clean and split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    self.max_text_length = max([len(x.split(" ")) for x in x_text])
    # Transform words to ids
    vocab_processor = VocabularyProcessor(self.max_text_length)
    X = np.array(list(vocab_processor.fit_transform(x_text)))
    self.vocab_size = len(vocab_processor.vocabulary_)
    # Generate labels: [0, 1] for positive, [1, 0] for negative
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    self.train_input, self.valid_input, self.train_label, self.valid_label = train_test_split(
        X, y, random_state=2018, test_size=0.1)
    print('training size: {}'.format(len(self.train_input)))
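A minimal usage sketch for the loader above, assuming it lives in a class (called DataLoader here purely as a placeholder) and that the config only needs the two file paths read in __init__; the paths shown are made up:

from types import SimpleNamespace

# Hypothetical config; only positive_data_file and negative_data_file are required above.
config = SimpleNamespace(positive_data_file='data/pos.txt',
                         negative_data_file='data/neg.txt')
loader = DataLoader(config)  # DataLoader is a placeholder name for the enclosing class
print(loader.vocab_size, loader.max_text_length, len(loader.train_input))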
Example 2
def getData(dataframe):
    # Extract the required columns for inputs and outputs
    totalX = dataframe.sub_grade
    totalY = dataframe.short_emp
    print(dataframe.sub_grade.value_counts())
    totalX = totalX.values
    totalY = totalY.values
    # Convert each input string into the integer id of its dictionary position.
    # With max_document_length=1 every row maps to a single id, so no manual
    # padding (pad_sequences) is needed here.
    vocab_proc = VocabularyProcessor(1)
    totalX = np.array(list(vocab_proc.fit_transform(totalX)))

    # totalY must be an array of class indices, e.g. [1, 0, 1, 0, 1]
    totalY = to_categorical(totalY, nb_classes=2)
    # Split into training and test data
    trainX, testX, trainY, testY = train_test_split(totalX,
                                                    totalY,
                                                    test_size=0.1)
    return trainX, testX, trainY, testY
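A small, hedged usage sketch for getData, assuming the module-level imports used by the surrounding snippets (numpy, VocabularyProcessor, to_categorical, train_test_split) are in scope; the sample values are invented:

import pandas as pd

# Toy frame with the two columns getData() expects; values are illustrative only.
df = pd.DataFrame({'sub_grade': ['A1', 'B2', 'C3', 'A1'],
                   'short_emp': [0, 1, 0, 1]})
trainX, testX, trainY, testY = getData(df)
print(trainX.shape, trainY.shape)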
Example 3
def do_vocabulary_table():
    x, y = load_data()

    vp = VocabularyProcessor(max_document_length=max_document_length,
                             min_frequency=0)

    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))

    return x, y
class CNN(object):
    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self,
                              vocab_name,
                              filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename,
                        target_column=2,
                        columns_to_ignore=[0])
        """ Count max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])
        """ Get vocabulare size from longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)
        """ Change the list of sentences to a list of sequence of words """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0]
                                                             for x in X])))
        """ Split the datasets to training set and test test """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)
        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train,
                                     maxlen=self.max_words,
                                     value=0.)
        self.X_test = pad_sequences(self.X_test,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.Y_train = to_categorical(self.Y_train,
                                      nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test,
                                     nb_classes=self.number_of_classes)
        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self,
            model_name,
            outputDim=300,
            number_of_filters=60,
            filterSize=[3, 4],
            padding='same',
            activation_function_convLayer='relu',
            regularizer='L2',
            dropouts=0.5,
            activation_function_fc='softmax',
            optimizer='adam',
            learning_rate=0.001,
            loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]
        """ Define input shape and create word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)
        """ Add three/two convolutional layer. Set number of filters and filter sizes and then merge together """
        conv1 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[0],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[1],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        #conv3 = conv_1d(cnn_model, nb_filter = 128,  filter_size = 5, padding = 'same',
        #                 activation = 'relu', regularizer = 'L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)
        """ Expand one dimension to fit the max_pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)
        """ Instantiate dropout layer and specify dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)
        """ Instantiate fully connected layer and regression layer. """
        self.cnn_model = fully_connected(self.cnn_model,
                                         self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model,
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self,
                       model_name,
                       tensorboard_verbose=0,
                       tensorboard_dir='/logs/',
                       nb_epochs=5,
                       shuffle=True,
                       show_metric=True):
        """ Instantiate Deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train,
                       self.Y_train,
                       n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test),
                       shuffle=shuffle,
                       show_metric=show_metric,
                       batch_size=self.batch_size,
                       run_id=model_name)
        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self,
                   model_name,
                   outputDim=300,
                   number_of_filters=60,
                   filterSize=[3, 4],
                   padding='same',
                   activation_function_convLayer='relu',
                   regularizer='L2',
                   dropouts=0.5,
                   activation_function_fc='softmax',
                   optimizer='adam',
                   learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0,
                   tensorboard_dir='/logs/'):
        """
            Has to pass the same values that the models were trained with. If the
            model was trained on default values, the parameters will pass it automatically.
        """

        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)

        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self,
                          filename='testDatasetWithOutNeuTwo',
                          vocab_name='vocabProc'):
        """
            Something is wrong with this function. Does not get the same result
            as before when loading in the new data...
        """
        """ Load test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename,
                                            target_column=1)
        """ Get restored vocabulary processor """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        self.vocabProcessor.restore(vocab_name)
        """ Encode pos, neu and neg to numbers  """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)
        """ Change the list of sentences to a list of sequence of words """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))
        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.test_y = to_categorical(self.test_y,
                                     nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metrix_score = self.model.evaluate(self.test_x,
                                           self.test_y,
                                           batch_size=self.batch_size)
        return metrix_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Transorm sentence to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(
            sentence,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self,
                     list_of_sentences=[[''], ['']],
                     vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Transform sentences to a matrix of numbers """
        sentences = np.array(
            list(self.vocabProcessor.transform([x[0] for x in list_of_sentences])))
        sentences = pad_sequences(
            sentences,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict the list of sentences """
        pred_score = self.model.predict(sentences)
        return pred_score
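A hedged end-to-end sketch of how this class appears meant to be used; the dataset file, vocabulary name, model name and epoch count are assumptions, not values confirmed by the source:

# Assumed workflow; file and model names are placeholders.
cnn = CNN(batch_size=64)
cnn.load_dataset_training(vocab_name='vocabProc', filename='datasetWithoutNeutral')
cnn.create_cnn_architecture_two_layers(model_name='sentiment_cnn')
cnn.train_and_save(model_name='sentiment_cnn', nb_epochs=5)

# Later, e.g. in a separate prediction script:
cnn2 = CNN()
cnn2.max_words = cnn.max_words            # must match the value used at training time
cnn2.vocabProcessor = cnn.vocabProcessor  # or restore it from the saved 'vocabProc' file
cnn2.load_model('sentiment_cnn')
print(cnn2.predict_one_sentence([['what a great game']], vocab_name='vocabProc'))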
data.loc[data["score_phrase"] == "Unbearable", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Painful", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Bad", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Mediocre", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Awful", "score_phrase"] = 0

data.loc[data["score_phrase"] == "Amazing", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Great", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Okay", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Masterpiece", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Good", "score_phrase"] = 1

# tokenize title
# data["token_title"] = data["title"].apply(nltk.word_tokenize)
word_processor = VocabularyProcessor(100)
# fit the vocabulary on all titles; each title becomes a sequence of 100 word ids
tmp = np.array(list(word_processor.fit_transform(data["title"])))

# split into train and test data, reusing the already-fitted vocabulary (transform, not fit_transform)
train_data, test_data = model_selection.train_test_split(data, train_size=0.9)
trainX = np.array(list(word_processor.transform(train_data["title"])))
trainY = train_data.loc[:, ["score_phrase"]].to_numpy()
testX = np.array(list(word_processor.transform(test_data["title"])))
testY = test_data.loc[:, ["score_phrase"]].to_numpy()

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
net = tflearn.embedding(net, input_dim=16762, output_dim=128)
Example 6
# print(type(df.num.apply(str)))
# print(type(df.text))
text = df['text'].astype(str)
# num = df['num'].astype(str)
# print(type(text))
# print(text)

# Extract the required columns for inputs and outputs
totalX = text
totalY = df.num

# # Convert the strings in the input into integers corresponding to the dictionary positions
# # fit_transform pads/truncates each row to max_document_length=10; pad_sequences below stretches the rows to length 100
vocab_proc = VocabularyProcessor(10)
totalX = np.array(list(vocab_proc.fit_transform(totalX)))
# # Reuse the vocabulary fitted on totalX for the seed data (transform, not fit_transform)
seed = np.array(list(vocab_proc.transform(seed)))
# # We will have 10 classes in total for prediction, indices from 0 to 9
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
# # Convert the indices into 10-dimensional one-hot vectors
totalY = to_categorical(totalY, nb_classes=10)

# Split into training and testing data
# trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)
totalX = pad_sequences(totalX, maxlen=100, value=0.)
seed = pad_sequences(seed, maxlen=100, value=0.)
# testX = pad_sequences(testX, maxlen=100, value=0.)
# print(trainX)
# print(trainY)
# print(testX)
Example 7
batch_size = 64
""" Load the dataset """
#X, Y = load_csv('datasetFullList', target_column = 2, columns_to_ignore = [0])
X, Y = load_csv('datasetWithoutNeutral',
                target_column=2,
                columns_to_ignore=[0])
""" Count max words from the longest sentence """
max_words = max([len(x[0].split(" ")) for x in X])
""" Get vocabulare size from longest sentence """
vocab = VocabularyProcessor(max_words)
""" Encode pos, neu and neg to numbers """
labelEncoder = LabelEncoder()
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y)
""" Change the list of sentences to a list of sequence of words """
X = np.array(list(vocab.fit_transform([x[0] for x in X])))
""" Split the datasets to training set and test test """
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.10,
                                                    random_state=7)
""" Pad the sequences to fit the longest sentence """
X_train = pad_sequences(X_train, maxlen=max_words, value=0.)
X_test = pad_sequences(X_test, maxlen=max_words, value=0.)
""" Convert labels to binary vector """
Y_train = to_categorical(Y_train, nb_classes=2)
Y_test = to_categorical(Y_test, nb_classes=2)
vocab.save('vocabProc')
""" 
    Begin the creation of convolutional model
"""
def run_on_ign():

    df_dataset = pd.read_csv(ign_dataset_path)
    df_dataset.set_index(['index'], inplace=True)
    # fill null values with empty strings
    df_dataset.fillna(value='', inplace=True)
    # extract the required columns for inputs and outputs
    data_X = df_dataset.title
    # data_X[0]
    label_Y = df_dataset.score_phrase
    # label_Y[5]

    # convert the strings in the input into integers corresponding to the dictionary positions
    # maps documents to sequences of word ids
    # data is automatically padded so we need to pad_sequences manually
    vocab_proc = VocabularyProcessor(15)
    total_X = np.array(list(vocab_proc.fit_transform(data_X)))
    # total_X[0]

    # we will have 11 classes in total for prediction, indices from 0 to 10
    # vocabulary processor for single word
    vocab_proc2 = VocabularyProcessor(1)
    total_Y = np.array(list(vocab_proc2.fit_transform(label_Y))) - 1
    # total_Y[5]
    # len(total_Y)

    # as we have 11 unique score_phrase values,
    # convert the indices into 11-dimensional vectors
    # to_categorical is not used here because it generated the same array for different score_phrase values
    # total_Y = to_categorical(total_Y, nb_classes=11)
    array_list = []
    for array in total_Y:
        array_list.append(get_categorical(array, 11))
    total_Y = np.array(array_list)
    # total_Y[4]

    # split into training and testing data
    train_X, test_X, train_Y, test_Y = train_test_split(total_X,
                                                        total_Y,
                                                        test_size=0.1)

    # build the network for classification
    # each input has length of 15
    net = tflearn.input_data([None, 15])

    # each of the 15 input word integers is then cast into a 256-dimensional vector, creating a word embedding.
    # we assume the dictionary has 10000 words maximum
    net = tflearn.embedding(net, input_dim=10000, output_dim=256)
    # each input now has shape 15x256, and these 256-dimensional vectors are fed into the first GRU layer one at a time.
    # all intermediate outputs are collected and passed on to the second GRU layer.
    net = tflearn.gru(net, 256, dropout=0.9, return_seq=True)
    # the second GRU layer consumes those intermediate outputs, and this time we keep only the final output
    net = tflearn.gru(net, 256, dropout=0.9)
    # the output is then sent to a fully connected layer that would give us our final 11 classes
    net = tflearn.fully_connected(net, 11, activation='softmax')
    # we use the adam optimizer instead of standard SGD since it converges much faster
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=0)

    if check_file_exist(ign_model_path):
        model.load(ign_model_path)

    model.fit(train_X,
              train_Y,
              validation_set=(test_X, test_Y),
              show_metric=True,
              batch_size=32,
              n_epoch=20)

    if save_model:
        print("Saving model as './ign_model.tfl'")
        model.save(ign_model_path)

    return 0
Example 9
def run_analysis():

    # dataframe = pd.Dataframe(X.values)
    # dataframe.columns
    df = pd.read_csv('tensor/train.csv', sep='|', names=['number'])
    testdf = pd.read_csv('test.csv', sep='|', names=['content'])
    # df[label] = df.A.str.split(',', n=1, expand=True)
    seed = testdf.content.tolist()
    seeddict = {0: seed}
    # print(seed)
    # print('dataframe')
    # print(df.number)
    df['num'] = df['number'].map(lambda x: x.split(',')[0])
    df['text'] = df['number'].map(lambda x: x.split(',')[1:])
    # print(df.num)
    num = []
    text = []
    for n in df['num']:
        num.append(n)
    for t in df['text']:
        text.append(t)

    d = dict(zip(num, text))
    text_len = len(df['text'])
    # print (d)

    # Fill null values with empty strings

    print('\\' * 76)  # visual separator line
    # print(type(df.num.apply(str)))
    # print(type(df.text))
    text = df['text'].astype(str)
    # num = df['num'].astype(str)
    # print(type(text))
    # print(text)

    # Extract the required columns for inputs and outputs
    totalX = text
    totalY = df.num

    # # Convert the strings in the input into integers corresponding to the dictionary positions
    # # fit_transform pads/truncates each row to max_document_length=10; pad_sequences below stretches the rows to length 100
    vocab_proc = VocabularyProcessor(10)
    totalX = np.array(list(vocab_proc.fit_transform(totalX)))
    # # Reuse the vocabulary fitted on totalX for the seed data (transform, not fit_transform)
    seed = np.array(list(vocab_proc.transform(seed)))
    # # We will have 10 classes in total for prediction, indices from 0 to 9
    vocab_proc2 = VocabularyProcessor(1)
    totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
    # # Convert the indices into 10-dimensional one-hot vectors
    totalY = to_categorical(totalY, nb_classes=10)

    # Split into training and testing data
    # trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)
    totalX = pad_sequences(totalX, maxlen=100, value=0.)
    seed = pad_sequences(seed, maxlen=100, value=0.)
    # testX = pad_sequences(testX, maxlen=100, value=0.)
    # print(trainX)
    # print(trainY)
    # print(testX)
    # print(testY)
    print('finished tokenizing')

    net = tflearn.input_data([None, 100])

    net = tflearn.embedding(net, input_dim=12000, output_dim=256)

    net = tflearn.lstm(net, 256, dropout=0.9, return_seq=True)

    net = tflearn.lstm(net, 256, dropout=0.9)
    net = tflearn.dropout(net, 0.5)
    # # The output is then sent to a fully connected layer that gives us our final 10 classes
    net = tflearn.fully_connected(net, 10, activation='softmax')
    # # We use the adam optimizer instead of standard SGD since it converges much faster
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')
    model = tflearn.DNN(net, tensorboard_verbose=0)
    print('starting fit')
    model.fit(totalX,
              totalY,
              validation_set=0.2,
              show_metric=True,
              batch_size=None,
              n_epoch=20)

    # model = tflearn.helpers.evaluator.Evaluator(model)
    result = model.predict_label(seed)[0][0]
    print(result)
    return result
Example 10
class SentimentDataSet():
    """
    Master class. Not actually used for the DataLoader, but holds all
    information about the data set.
    """
    def __init__(self, args):
        self.use_gaze = args.use_gaze

        categories = ['NEGATIVE', 'POSITIVE']
        if args.num_sentiments == 3:
            categories.append('NEUTRAL')

        dataset = load_files(container_path=DATASET_DIR,
                             categories=categories,
                             load_content=True,
                             encoding='utf-8')
        self.sentences_, self.sentence_numbers, _, self.targets, _ = dataset.values(
        )
        self.sentence_numbers = [
            int(re.search(r'\d{1,3}', fname).group())
            for fname in dataset['filenames']
        ]

        # adopting Hollenstein's method of building the vocab
        self.sentences = [clean_str(s) for s in self.sentences_]
        self.num_sentences = len(self.sentences)
        self.max_sentence_length = max(
            [len(s.split()) for s in self.sentences])
        self.vocab_processor = VocabularyProcessor(self.max_sentence_length)
        self.indexed_sentences = torch.LongTensor(
            list(self.vocab_processor.fit_transform(self.sentences)))
        self.vocabulary = list(
            self.vocab_processor.vocabulary_._mapping.keys())

        print('\n> Data set loaded. Sentiment classes:', categories)
        print('> Max sentence length:', self.max_sentence_length, 'words.')

        if self.use_gaze:
            self.et_features = EyeTrackingFeatures(self.max_sentence_length)
            print('> Loaded eye-tracking features.\n')
        else:
            self.et_features = np.zeros(self.num_sentences)

    def split_cross_val(self, num_folds=10):
        cv = StratifiedKFold(num_folds, shuffle=True, random_state=111)
        splitter = cv.split(np.zeros(self.num_sentences), self.targets)
        for train_indices, test_indices in splitter:
            yield (self._get_dataloader(train_indices),
                   self._get_dataloader(test_indices))

    def _get_dataloader(self, indices):
        et_features = [self.et_features[i] for i in indices]
        # do this in order to match the sentences imported by load_files with
        # the ones given in the Matlab files (where we get et_features from)...
        # ONLY WORKS WHEN USING THE WHOLE DATA SET! (num_classes=3)
        indices_ = np.array([self.sentence_numbers.index(i) for i in indices])

        if self.use_gaze:
            _sent_len = [len(self.sentences[i].split()) for i in indices_]
            _et_len = [
                self.et_features.sentences_et[i].shape[0] for i in indices
            ]
            _matches = np.array(np.array(_et_len) == np.array(_sent_len))
            if not np.all(_matches):
                _sent_numbers = indices_[np.where(_matches != True)]
                print(
                    'Some sentences do not match with number of ET features!')
                print('Please check sentences', _sent_numbers)
                return []

        dataset = SplitDataset(self.indexed_sentences[indices_],
                               self.targets[indices_], et_features)
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
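A hedged sketch of how this data set class could drive cross-validation, assuming an args object with exactly the two attributes read in __init__ (num_sentiments=3 is used because the index matching in _get_dataloader is noted to work only with the full data set):

from types import SimpleNamespace

# Hypothetical argument object; only the attributes used by __init__ are assumed.
args = SimpleNamespace(use_gaze=False, num_sentiments=3)
dataset = SentimentDataSet(args)

for fold, (train_loader, test_loader) in enumerate(dataset.split_cross_val(num_folds=10)):
    # each DataLoader wraps a SplitDataset of indexed sentences, targets and ET features
    print('fold', fold, 'train batches:', len(train_loader))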
Example 11
load_model = 0
save_model = 1

# Load csv (Only columns required)
data = pd.read_csv('ign.csv').iloc[:, 1:3]
data.fillna(value='', inplace=True)

#print(data.score_phrase.value_counts())

value_x = data.title
value_y = data.score_phrase

# Convert the strings in the input into integers corresponding to the dictionary positions
# Data is automatically padded so we need to pad_sequences manually
vocab_proc = VocabularyProcessor(15)
value_x = np.array(list(vocab_proc.fit_transform(value_x)))

# 11 classes for predictions
vocab_proc2 = VocabularyProcessor(1)
value_y = np.array(list(vocab_proc2.fit_transform(value_y))) - 1  # Since 0-10
value_y = to_categorical(value_y, nb_classes=11)

# Split training data
trainX, testX, trainY, testY = train_test_split(value_x,
                                                value_y,
                                                test_size=0.1)

# Build network
# Each input has length 15
net = tflearn.input_data([None, 15])
net = tflearn.embedding(net, input_dim=10000, output_dim=256)
Example 12
# Select only the two columns we require. Game title and its corresponding emotion
dataframe = pd.read_csv('ign.csv').iloc[:, 1:3]
# Fill null values with empty strings
dataframe.fillna(value='', inplace=True)

# print(dataframe.score_phrase.value_counts())

# Extract the required columns for inputs and outputs
totalX = dataframe.title
totalY = dataframe.score_phrase

# Convert the strings in the input into integers corresponding to the dictionary positions
# Data is automatically padded so we need to pad_sequences manually
vocab_proc = VocabularyProcessor(15)
totalX = np.array(list(vocab_proc.fit_transform(totalX)))

# We will have 11 classes in total for prediction, indices from 0 to 10
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
# Convert the indices into 11 dimensional vectors
totalY = to_categorical(totalY, nb_classes=11)

# Split into training and testing data
trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)

# Build the network for classification
# Each input has length of 15
net = tflearn.input_data([None, 15])
# Each of the 15 input word integers is then cast into a 256-dimensional vector, creating a word embedding.
# We assume the dictionary has 10000 words maximum
Example 13
    'Painful', # 340
    'Bad', # 1269
    'Mediocre', # 1959
    'Okay', # 2945
    'Good', # 4741
    'Great', # 4773
    'Amazing', # 1804
    'Masterpiece', # 55
]

# select title and score label from the data and use them for training
x = data['title']
# game titles are transformed into lists of 15 numbers (one per word)
# nltk could be used for better tokenization and predictions
word_processor = VocabularyProcessor(15)
x = np.array(list(word_processor.fit_transform(x)))

y = []
for label in data['score_phrase']:
    y.append(ratings.index(label))

# save the trained word model to use it in the predict program
word_processor.save("wordprocessor")



# find number of items in each category
def statistics():
    numbersInCategory = [0] * len(ratings)
    for labelNo in y:
        numbersInCategory[labelNo] += 1