def load_test_dataset(self,
                       filename='testDatasetWithOutNeuTwo',
                       vocab_name='vocabProc'):
     """
         Something is wrong with this function. Does not get the same result
         as before when loading in the new data...
     """
     """ Load test dataset """
     self.test_x, self.test_y = load_csv('datasets/' + filename,
                                         target_column=1)
     """ Get restored vocabulary processor """
     self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
     """ Encode pos, neu and neg to numbers  """
     labelEncoder = LabelEncoder()
     labelEncoder.fit(self.test_y)
     self.test_y = labelEncoder.transform(self.test_y)
     """ Change the list of sentences to a list of sequence of words """
     self.test_x = np.array(
         list(self.vocabProcessor.transform([x[0] for x in self.test_x])))
     """ Pad the sequences to fit the longest sentence """
     self.test_x = pad_sequences(self.test_x,
                                 maxlen=self.max_words,
                                 value=0.)
     """ Convert labels to binary vector """
     self.test_y = to_categorical(self.test_y,
                                  nb_classes=self.number_of_classes)
 def load_dataset_training(self,
                           vocab_name,
                           filename='datasetWithoutNeutral'):
     """ Load the dataset """
     X, Y = load_csv('datasets/' + filename,
                     target_column=2,
                     columns_to_ignore=[0])
     """ Count max words from the longest sentence """
     self.max_words = max([len(x[0].split(" ")) for x in X])
     """ Get vocabulare size from longest sentence """
     self.vocabProcessor = VocabularyProcessor(self.max_words)
     """ Encode pos, neu and neg to numbers """
     labelEncoder = LabelEncoder()
     labelEncoder.fit(Y)
     Y = labelEncoder.transform(Y)
     """ Change the list of sentences to a list of sequence of words """
     X = np.array(list(self.vocabProcessor.fit_transform([x[0]
                                                          for x in X])))
     """ Split the datasets to training set and test test """
     self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
         X, Y, test_size=0.10, random_state=7)
     """ Pad the sequences to fit the longest sentence """
     self.X_train = pad_sequences(self.X_train,
                                  maxlen=self.max_words,
                                  value=0.)
     self.X_test = pad_sequences(self.X_test,
                                 maxlen=self.max_words,
                                 value=0.)
     """ Convert labels to binary vector """
     self.Y_train = to_categorical(self.Y_train,
                                   nb_classes=self.number_of_classes)
     self.Y_test = to_categorical(self.Y_test,
                                  nb_classes=self.number_of_classes)
     self.vocabProcessor.save(vocab_name)
Example 3
def getData(dataframe):
    # Extract the required columns for inputs and outputs
    totalX = dataframe.sub_grade
    totalY = dataframe.short_emp
    print(dataframe.sub_grade.value_counts())
    totalX = totalX.values
    totalY = totalY.values
    vocab_proc = VocabularyProcessor(1)
    # Convert the strings in the input into integers corresponding to the dictionary positions
    # VocabularyProcessor pads each sequence to its max document length automatically
    """
    totalX="I am happy today. I feel sad today."
    vocab_proc = VocabularyProcessor(1)

    totalX = np.array(list(vocab_proc.fit_transform(totalX)))


    #totalX=totalX.transpose()
    #totalX = pad_sequences(totalX, maxlen=100, value=0.)
    print(totalX)
    """
    totalX = np.array(list(vocab_proc.fit_transform(totalX)))

    totalY = to_categorical(totalY, nb_classes=2)  # totalY must be an array, e.g. [1, 0, 1, 0, 1]
    # Split into training and testing data
    trainX, testX, trainY, testY = train_test_split(totalX,
                                                    totalY,
                                                    test_size=0.1)
    return trainX, testX, trainY, testY
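For context, here is a minimal sketch of how the splits returned by getData might be fed to a small tflearn estimator; the CSV file name, vocabulary size, network shape and epoch count are assumptions, not part of the original snippet.

import pandas as pd
import tflearn

# Hypothetical input file with sub_grade and short_emp columns.
dataframe = pd.read_csv('loans.csv')
trainX, testX, trainY, testY = getData(dataframe)

# Input width 1 matches VocabularyProcessor(1) used in getData.
net = tflearn.input_data([None, 1])
net = tflearn.embedding(net, input_dim=10000, output_dim=32)   # assumed vocabulary size
net = tflearn.fully_connected(net, 2, activation='softmax')    # two classes, matching nb_classes=2
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')

model = tflearn.DNN(net)
model.fit(trainX, trainY, validation_set=(testX, testY), show_metric=True, n_epoch=5)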
 def __init__(self, config):
     self.config = config
     # load data here
     # Load data from files
     positive_data_file = self.config.positive_data_file
     negative_data_file = self.config.negative_data_file
     positive_examples = list(open(positive_data_file, "r").readlines())
     positive_examples = [s.strip() for s in positive_examples]
     negative_examples = list(open(negative_data_file, "r").readlines())
     negative_examples = [s.strip() for s in negative_examples]
     # Split by words
     x_text = positive_examples + negative_examples
     x_text = [clean_str(sent) for sent in x_text]
     self.max_text_length = max([len(x.split(" ")) for x in x_text])
     # transform words to id
     vocab_processor = VocabularyProcessor(self.max_text_length)
     X = np.array(list(vocab_processor.fit_transform(x_text)))
     self.vocab_size = len(vocab_processor.vocabulary_)
     # Generate labels
     positive_labels = [[0, 1] for _ in positive_examples]
     negative_labels = [[1, 0] for _ in negative_examples]
     y = np.concatenate([positive_labels, negative_labels], 0)
     self.train_input, self.valid_input, self.train_label, self.valid_label = train_test_split(
         X, y, random_state=2018, test_size=0.1)
     print('training size: {}'.format(len(self.train_input)))
 def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
     self.model_dir = model_dir
     os.makedirs(self.model_dir, exist_ok=True)
     if raw_sentence_pairs is None:
         self.restore()
     else:
         raw_sentence_pairs = raw_sentence_pairs.ravel()
         raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
         if char_embeddings:
             log('Chosen char embeddings.')
             self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
         else:
             log('Chosen word embeddings.')
             self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
         max_sentence_length = max(self.sentences_lengths)
         log('Maximum sentence length : {}'.format(max_sentence_length))
         
         if char_embeddings:
             log('Processing sentences with char embeddings...')
             self.vocabulary = VocabularyProcessor(
                 max_document_length=max_sentence_length,
                 tokenizer_fn=char_tokenizer,
             )
         else:
             log('Processing sentences with word embeddings...')
             self.vocabulary = VocabularyProcessor(
                 max_document_length=max_sentence_length,
             )
         log('Sentences have been successfully processed.')
         self.vocabulary.fit(raw_sentence_pairs)
         if save_vocab:
             self.vocabulary.save('{}/vocab'.format(self.model_dir))
def load_vocab_processor(name, max_length, min_frequency):
    '''
    Load a saved vocabulary processor from disk.
    '''
    print('Loading vocabulary model from {}'.format(name))
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.restore(name)
    return vp
Example 7
def do_vocabulary_table():
    x, y = load_data()

    vp = VocabularyProcessor(max_document_length=max_document_length,
                             min_frequency=0)

    x = vp.fit_transform(x, unused_y=None)
    x = np.array(list(x))

    return x, y
Example 8
    def __init__(self, raw_sentence, model_dir, save_vocab=True):
        os.makedirs(model_dir, exist_ok=True)
        raw_sentence = raw_sentence.ravel()
        raw_sentence = [str(x) for x in list(raw_sentence)]
        self.sentence_length = [
            len(str(x).split(' ')) for x in list(raw_sentence)
        ]
        max_sentence_length = max(self.sentence_length)
        self.vocabulary = VocabularyProcessor(max_sentence_length)

        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(model_dir))
def make_vocab_processor(name, text, max_length, min_frequency):
    '''
    Fit a vocabulary processor and optionally save it.
    '''
    print('Making vocabulary model...')
    vp = VocabularyProcessor(max_length, min_frequency=min_frequency)
    vp = vp.fit(text)
    if name is None:
        return vp
    else:
        print('Saving vocabulary model to {}'.format(name))
        vp.save(name)
        return vp
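A small round-trip sketch combining make_vocab_processor with load_vocab_processor from earlier; the file name and toy corpus are placeholders.

import numpy as np

corpus = ['the quick brown fox', 'the lazy dog']

# Fit and persist a vocabulary, then restore it and transform unseen text.
make_vocab_processor('vocab.model', corpus, max_length=10, min_frequency=0)
vp = load_vocab_processor('vocab.model', max_length=10, min_frequency=0)
ids = np.array(list(vp.transform(['the quick dog'])))
print(ids.shape)  # one row of 10 word ids, zero-padded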
 def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
     """ Load vocabulary processor """
     self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
     """ Transorm sentence to matrix of numbers """
     sentence = np.array(
         list(self.vocabProcessor.transform([x[0] for x in sentence])))
     sentence = pad_sequences(
         sentence,
         maxlen=self.vocabProcessor.max_document_length,
         value=0.)
     """ Predict sentence """
     pred_score = self.model.predict(sentence)
     return pred_score
Example 11
    def _create_vocab(min_count=3):
        """
        创建词汇表
        需要训练样本
        """
        def gen_documents():
            for path in (TRAIN_POS_PATH, TRAIN_NEG_PATH):
                with codecs.open(path, 'r', 'utf-8') as file:
                    for line in file:
                        yield line[:-1]

        vocab = VocabularyProcessor(SEQUENCE_LEN,
                                    min_count - 1,
                                    tokenizer_fn=chinese_tokenizer)
        vocab.fit(gen_documents())
        return vocab
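A brief sketch of how the fitted vocabulary might be used afterwards, written as if _create_vocab were callable directly; SEQUENCE_LEN, chinese_tokenizer and the training-file paths come from the surrounding module and are assumed here.

vocab = _create_vocab(min_count=3)

# Map raw sentences to fixed-length id sequences with the fitted vocabulary.
for row in vocab.transform(['这部电影很好看', '情节太无聊了']):
    print(row)  # each row holds SEQUENCE_LEN ids; unknown/padding words map to 0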
Example 12
def build_model(vocabFile, model_type='bilstm'):

    processor = VocabularyProcessor.restore(vocabFile)
    n_words = len(processor.vocabulary_)

    net = tflearn.input_data([None, 300])
    net = tflearn.embedding(net, input_dim=n_words, output_dim=200)

    if model_type == 'bilstm':
        net = tflearn.bidirectional_rnn(net, tflearn.BasicLSTMCell(200),
                                        tflearn.BasicLSTMCell(200))
        net = dropout(net, 0.5)
    elif model_type == 'lstm':
        net = tflearn.lstm(net, 200, dropout=0.5)
        net = dropout(net, 0.5)
    elif model_type == 'cnn':
        net = conv_model(net)

    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.05,
                             loss='categorical_crossentropy')

    return net
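A hedged usage sketch for build_model; the vocabulary file, example texts and labels are placeholders, and the input length 300 mirrors the input_data shape above.

import numpy as np
import tflearn
from tflearn.data_utils import VocabularyProcessor, pad_sequences

texts = ['great fun from start to finish', 'buggy and boring']
Y = [[0, 1], [1, 0]]   # one-hot labels for the two output classes

# Restore the same vocabulary the network's embedding layer was sized from.
processor = VocabularyProcessor.restore('vocab.model')
X = np.array(list(processor.transform(texts)))
X = pad_sequences(X, maxlen=300, value=0.)   # match the [None, 300] input layer

net = build_model('vocab.model', model_type='lstm')
model = tflearn.DNN(net, tensorboard_verbose=0)
model.fit(X, Y, validation_set=0.1, show_metric=True, batch_size=64)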
Example 13
 def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
     self.model_dir = model_dir
     os.makedirs(self.model_dir, exist_ok=True)
     if raw_sentence_pairs is None:
         self.restore()
     else:
         raw_sentence_pairs = raw_sentence_pairs.ravel()
         raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
         self.sentences_lengths = [
             len(str(x).split(' ')) for x in list(raw_sentence_pairs)
         ]
         max_sentence_length = max(self.sentences_lengths)
         self.vocabulary = VocabularyProcessor(max_sentence_length)
         self.vocabulary.fit(raw_sentence_pairs)
         if save_vocab:
             self.vocabulary.save('{}/vocab'.format(self.model_dir))
Example 14
class SentimentLookup:
    net = tflearn.input_data     ([None, 40])
    net = tflearn.embedding      (net, input_dim=12495, output_dim=128)
    net = tflearn.lstm           (net, 128, dropout=0.8)
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression     (net, optimizer='adam', learning_rate=0.001,
                                       loss='categorical_crossentropy')
    model = tflearn.DNN(net, tensorboard_verbose=0)
    model.load(MODEL)

    vp = VocabularyProcessor.restore(VOCAB)
    
    def _process_tweet(self, tweet = ""):
        cleaned = str(tweet).upper()
        cleaned = re.sub(r'&\w+;',   '',          cleaned)
        cleaned = re.sub(r"'",       '',          cleaned)
        cleaned = re.sub(r'@\w+ ',   'USERNAME ', cleaned)
        cleaned = re.sub(r'[^A-Z ]', '',          cleaned)
        cleaned = re.sub(r'[ ]+',    ' ',         cleaned)
        return cleaned.strip()

    def sentiment(self, data):
        if isinstance(data, str):
            query = [x for x in SentimentLookup.vp.transform([self._process_tweet(data)])]
            bad, good = SentimentLookup.model.predict(query).tolist()[0]
            return good

        data  = map(self._process_tweet, data)
        query = [x for x in SentimentLookup.vp.transform(data)]
        return SentimentLookup.model.predict(query)[:,1]
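A minimal usage sketch, assuming MODEL and VOCAB point to a previously trained tflearn model and its saved vocabulary.

lookup = SentimentLookup()

# A single tweet returns the probability of the positive class;
# a list of tweets returns one positive-class score per tweet.
print(lookup.sentiment("I really enjoyed this game!"))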
Example 15
class DatasetVectorizer:
    def __init__(self, raw_sentence, model_dir, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(model_dir, exist_ok=True)
        raw_sentence = raw_sentence.ravel()
        raw_sentence = [str(x) for x in list(raw_sentence)]
        self.sentence_length = [
            len(str(x).split(' ')) for x in list(raw_sentence)
        ]
        max_sentence_length = max(self.sentence_length)
        self.vocabulary = VocabularyProcessor(max_sentence_length)
        self.vocabulary.fit(raw_sentence)

        if save_vocab:
            self.vocabulary.save('{}/vocab'.format(model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence):
        #num_instances, num_classes = raw_sentence.shape
        num_instances = raw_sentence.shape[0]
        num_classes = 1
        raw_sentence = raw_sentence.ravel()

        for i, v in enumerate(raw_sentence):
            if v is np.nan:
                print(i, v)

        vectorized_sentence = np.array(
            list(self.vocabulary.transform(raw_sentence)))

        vectorized_sentence = vectorized_sentence.reshape(
            num_instances, num_classes, self.max_sentence_len)

        return vectorized_sentence[:, 0, :]
Example 16
    def __init__(self):
        if not exists(VOCABULARY_PATH):
            self._vocab = self._create_vocab()
            self._vocab.save(VOCABULARY_PATH)
        else:
            self._vocab = VocabularyProcessor.restore(VOCABULARY_PATH)

        self._model = self._create_model()
        if exists(MODEL_PATH + '.meta'):
            self._model.load(MODEL_PATH, True)
Example 17
    def __init__(self, args):
        self.use_gaze = args.use_gaze

        categories = ['NEGATIVE', 'POSITIVE']
        if args.num_sentiments == 3:
            categories.append('NEUTRAL')

        dataset = load_files(container_path=DATASET_DIR,
                             categories=categories,
                             load_content=True,
                             encoding='utf-8')
        self.sentences_, self.sentence_numbers, _, self.targets, _ = dataset.values(
        )
        self.sentence_numbers = [
            int(re.search(r'\d{1,3}', fname).group())
            for fname in dataset['filenames']
        ]

        # adopting Hollenstein's method of building the vocab
        self.sentences = [clean_str(s) for s in self.sentences_]
        self.num_sentences = len(self.sentences)
        self.max_sentence_length = max(
            [len(s.split()) for s in self.sentences])
        self.vocab_processor = VocabularyProcessor(self.max_sentence_length)
        self.indexed_sentences = torch.LongTensor(
            list(self.vocab_processor.fit_transform(self.sentences)))
        self.vocabulary = list(
            self.vocab_processor.vocabulary_._mapping.keys())

        print('\n> Data set loaded. Sentiment classes:', categories)
        print('> Max sentence length:', self.max_sentence_length, 'words.')

        if self.use_gaze:
            self.et_features = EyeTrackingFeatures(self.max_sentence_length)
            print('> Loaded eye-tracking features.\n')
        else:
            self.et_features = np.zeros(self.num_sentences)
Example 18
def build_vocabulary(inFile, dtype, vocabFile):
    # Maximum text length
    MAX_LENGTH = 300
    NB_CLASSES = 2

    # Read in the tokenized text
    doc = MySentences(inFile, dtype, 'get_content')
    # Map the raw text to word indices
    processor = VocabularyProcessor(MAX_LENGTH, min_frequency=5)
    processor.fit(doc)
    processor.save(vocabFile)
Example 19
class DatasetVectorizer:
    def __init__(self, model_dir, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            self.sentences_lengths = [
                len(str(x).split(' ')) for x in list(raw_sentence_pairs)
            ]
            max_sentence_length = max(self.sentences_lengths)
            self.vocabulary = VocabularyProcessor(max_sentence_length)
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))

    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length

    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)

    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(
            self.model_dir))

    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))

    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()

        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)

        vectorized_sentence_pairs = np.array(
            list(self.vocabulary.transform(raw_sentence_pairs)))

        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(
            num_instances, num_classes, self.max_sentence_len)

        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
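A short usage sketch for this DatasetVectorizer, assuming sentence pairs arrive as a two-column numpy array (e.g. from a paraphrase dataset); the model directory is a placeholder.

import numpy as np

pairs = np.array([
    ['a man is playing a guitar', 'a person plays guitar'],
    ['the kids are outside', 'children play indoors'],
])

vectorizer = DatasetVectorizer('model_dir', raw_sentence_pairs=pairs)

# Each sentence becomes a fixed-length row of word ids.
sent1_ids, sent2_ids = vectorizer.vectorize_2d(pairs)
print(sent1_ids.shape, sent2_ids.shape)   # (2, max_sentence_len) each

# A single new sentence can be vectorized the same way.
print(vectorizer.vectorize('a man plays music').shape)   # (1, max_sentence_len)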
Example 20
def text2npy(inFile, outFile, vocabFile, dtype):

    processor = VocabularyProcessor.restore(vocabFile)
    doc = MySentences(inFile, dtype, 'get_content')
    train_doc = list(processor.transform(doc))

    # to_categorical could be used here for one-hot encoding:
    # to_categorical(np.array(labels), NB_CLASSES)

    if dtype == 'train':
        # Transform the labels
        labels = []
        for y in MySentences(inFile, dtype, 'get_info'):
            labels.append(int(y))
        y = np.array(labels)
        # Save to file
        np.save(outFile, np.column_stack([train_doc, y]))
    elif dtype == 'test':
        np.save(outFile, train_doc)

        fw = open(outFile + "_doc.txt", 'w', encoding='utf-8')
        for y in MySentences(inFile, dtype, 'get_info'):
            fw.write(y + "\n")
        fw.close()
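A hedged sketch of how these two helpers might be chained; MySentences and the file paths are assumed to be provided by the surrounding project.

# First pass: fit and persist the vocabulary from the training corpus.
build_vocabulary('train.txt', 'train', 'vocab.model')

# Second pass: map both splits to id matrices using the saved vocabulary.
text2npy('train.txt', 'train_ids.npy', 'vocab.model', 'train')
text2npy('test.txt', 'test_ids.npy', 'vocab.model', 'test')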
Example 21
save_model = 1

dataframe = pd.read_csv('ign.csv').iloc[:, 1:3]
# Fill null values with empty strings
dataframe.fillna(value='', inplace=True)

# score phrase and title
print(dataframe.columns.values)

# Extract the required columns for inputs and outputs
totalX = dataframe.title
totalY = dataframe.score_phrase

# Convert the strings in the input into integers corresponding to the dictionary positions
# VocabularyProcessor pads each sequence to the max document length, so a separate pad_sequences call is not needed here
vocab_proc = VocabularyProcessor(15) # max document length
totalX = np.array(list(vocab_proc.fit_transform(totalX)))


# totalX contains a matrix with the word ids (indexes) of the sentences

# We will have 11 classes in total for prediction, indices from 0 to 10
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1

# here totalY is numbered dictionary entries (0, 1, ... to 10)
print(totalY[5])
# Convert the indices into 11 dimensional vectors
totalY = to_categorical(totalY, 11)
# here totalY is a binary matrix
print(totalY[5])
class DatasetVectorizer:
    
    def __init__(self, model_dir, char_embeddings, raw_sentence_pairs=None, save_vocab=True):
        self.model_dir = model_dir
        os.makedirs(self.model_dir, exist_ok=True)
        if raw_sentence_pairs is None:
            self.restore()
        else:
            raw_sentence_pairs = raw_sentence_pairs.ravel()
            raw_sentence_pairs = [str(x) for x in list(raw_sentence_pairs)]
            if char_embeddings:
                log('Chosen char embeddings.')
                self.sentences_lengths = [len(list(str(x))) for x in list(raw_sentence_pairs)]
            else:
                log('Chosen word embeddings.')
                self.sentences_lengths = [len(str(x).split(' ')) for x in list(raw_sentence_pairs)]
            max_sentence_length = max(self.sentences_lengths)
            log('Maximum sentence length : {}'.format(max_sentence_length))
            
            if char_embeddings:
                log('Processing sentences with char embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                    tokenizer_fn=char_tokenizer,
                )
            else:
                log('Processing sentences with word embeddings...')
                self.vocabulary = VocabularyProcessor(
                    max_document_length=max_sentence_length,
                )
            log('Sentences have been successfully processed.')
            self.vocabulary.fit(raw_sentence_pairs)
            if save_vocab:
                self.vocabulary.save('{}/vocab'.format(self.model_dir))
    
    @property
    def max_sentence_len(self):
        return self.vocabulary.max_document_length
    
    @property
    def vocabulary_size(self):
        return len(self.vocabulary.vocabulary_._mapping)
    
    def restore(self):
        self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
    
    def vectorize(self, sentence):
        return np.array(list(self.vocabulary.transform([sentence])))
    
    def vectorize_2d(self, raw_sentence_pairs):
        num_instances, num_classes = raw_sentence_pairs.shape
        raw_sentence_pairs = raw_sentence_pairs.ravel()
        
        for i, v in enumerate(raw_sentence_pairs):
            if v is np.nan:
                print(i, v)
        
        vectorized_sentence_pairs = np.array(list(self.vocabulary.transform(raw_sentence_pairs)))
        
        vectorized_sentence_pairs = vectorized_sentence_pairs.reshape(num_instances, num_classes,
                                                                      self.max_sentence_len)
        
        vectorized_sentence1 = vectorized_sentence_pairs[:, 0, :]
        vectorized_sentence2 = vectorized_sentence_pairs[:, 1, :]
        return vectorized_sentence1, vectorized_sentence2
 def restore(self):
     self.vocabulary = VocabularyProcessor.restore('{}/vocab'.format(self.model_dir))
Example 24
from tflearn.data_utils import VocabularyProcessor

vocab = {'hello':3, '.':5, 'world':20, '/' : 10}
sentences = ['hello world . / hello', 'hello']

vocab_processor = VocabularyProcessor(max_document_length=6, vocabulary=vocab)
encoded = list(vocab_processor.transform(sentences))
print(encoded)
def run_on_ign():

    df_dataset = pd.read_csv(ign_dataset_path)
    df_dataset.set_index(['index'], inplace=True)
    # fill null values with empty strings
    df_dataset.fillna(value='', inplace=True)
    # extract the required columns for inputs and outputs
    data_X = df_dataset.title
    # data_X[0]
    label_Y = df_dataset.score_phrase
    # label_Y[5]

    # convert the strings in the input into integers corresponding to the dictionary positions
    # maps documents to sequences of word ids
    # VocabularyProcessor pads each sequence to the max document length, so we do not need pad_sequences here
    vocab_proc = VocabularyProcessor(15)
    total_X = np.array(list(vocab_proc.fit_transform(data_X)))
    # total_X[0]

    # we will have 11 classes in total for prediction, indices from 0 to 10
    # vocabulary processor for single word
    vocab_proc2 = VocabularyProcessor(1)
    total_Y = np.array(list(vocab_proc2.fit_transform(label_Y))) - 1
    # total_Y[5]
    # len(total_Y)

    # as we have 11 unique score_phrase
    # convert the indices into 11 dimensional vectors
    # This is wrong, as it generates the same array for different score_phrase values
    # total_Y = to_categorical(total_Y, nb_classes=11)
    array_list = []
    for array in total_Y:
        array_list.append(get_categorical(array, 11))
    total_Y = np.array(array_list)
    # total_Y[4]

    # split into training and testing data
    train_X, test_X, train_Y, test_Y = train_test_split(total_X,
                                                        total_Y,
                                                        test_size=0.1)

    # build the network for classification
    # each input has length of 15
    net = tflearn.input_data([None, 15])

    # the 15 input word integers are then cast into 256 dimensions each, creating a word embedding.
    # we assume the dictionary has 10000 words maximum
    net = tflearn.embedding(net, input_dim=10000, output_dim=256)
    # each input has a size of 15x256 and each of these 256-dim vectors is fed into the GRU layer one at a time.
    # all the intermediate outputs are collected and then passed on to the second GRU layer.
    net = tflearn.gru(net, 256, dropout=0.9, return_seq=True)
    # the second GRU layer consumes those intermediate outputs, and this time we keep only the final output
    net = tflearn.gru(net, 256, dropout=0.9)
    # the output is then sent to a fully connected layer that would give us our final 11 classes
    net = tflearn.fully_connected(net, 11, activation='softmax')
    # we use the adam optimizer instead of standard SGD since it converges much faster
    net = tflearn.regression(net,
                             optimizer='adam',
                             learning_rate=0.001,
                             loss='categorical_crossentropy')

    model = tflearn.DNN(net, tensorboard_verbose=0)

    if check_file_exist(ign_model_path):
        model.load(ign_model_path)

    model.fit(train_X,
              train_Y,
              validation_set=(test_X, test_Y),
              show_metric=True,
              batch_size=32,
              n_epoch=20)

    if save_model:
        print("Saving model as './ign_model.tfl'")
        model.save(ign_model_path)

    return 0
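The helper get_categorical used above is not shown in this listing; a plausible minimal implementation (an assumption, not the original code) would one-hot encode a single class index.

import numpy as np

def get_categorical(index, nb_classes):
    # Hypothetical helper: one-hot encode one class index into an nb_classes-dim vector.
    vec = np.zeros(nb_classes, dtype=np.float32)
    vec[int(np.ravel(index)[0])] = 1.0
    return vec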
class CNN(object):
    def __init__(self, batch_size=64):
        self.batch_size = batch_size
        self.number_of_classes = 2
        self.X_train = []
        self.X_test = []
        self.Y_train = []
        self.Y_test = []
        self.max_words = None
        self.vocabProcessor = None
        self.cnn_model = None
        self.model = None
        self.test_x = []
        self.test_y = []

    def load_dataset_training(self,
                              vocab_name,
                              filename='datasetWithoutNeutral'):
        """ Load the dataset """
        X, Y = load_csv('datasets/' + filename,
                        target_column=2,
                        columns_to_ignore=[0])
        """ Count max words from the longest sentence """
        self.max_words = max([len(x[0].split(" ")) for x in X])
        """ Get vocabulare size from longest sentence """
        self.vocabProcessor = VocabularyProcessor(self.max_words)
        """ Encode pos, neu and neg to numbers """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(Y)
        Y = labelEncoder.transform(Y)
        """ Change the list of sentences to a list of sequence of words """
        X = np.array(list(self.vocabProcessor.fit_transform([x[0]
                                                             for x in X])))
        """ Split the datasets to training set and test test """
        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X, Y, test_size=0.10, random_state=7)
        """ Pad the sequences to fit the longest sentence """
        self.X_train = pad_sequences(self.X_train,
                                     maxlen=self.max_words,
                                     value=0.)
        self.X_test = pad_sequences(self.X_test,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.Y_train = to_categorical(self.Y_train,
                                      nb_classes=self.number_of_classes)
        self.Y_test = to_categorical(self.Y_test,
                                     nb_classes=self.number_of_classes)
        self.vocabProcessor.save(vocab_name)

    def create_cnn_architecture_two_layers(
            self,
            model_name,
            outputDim=300,
            number_of_filters=60,
            filterSize=[3, 4],
            padding='same',
            activation_function_convLayer='relu',
            regularizer='L2',
            dropouts=0.5,
            activation_function_fc='softmax',
            optimizer='adam',
            learning_rate=0.001,
            loss_function='categorical_crossentropy'):
        if len(filterSize) == 0:
            filterSize = [3, 4]
        """ Define input shape and create word embedding """
        self.cnn_model = input_data(shape=[None, self.max_words], name='input')
        self.cnn_model = tflearn.embedding(
            self.cnn_model,
            input_dim=len(self.vocabProcessor.vocabulary_),
            output_dim=outputDim)
        """ Add three/two convolutional layer. Set number of filters and filter sizes and then merge together """
        conv1 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[0],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        conv2 = conv_1d(self.cnn_model,
                        nb_filter=number_of_filters,
                        filter_size=filterSize[1],
                        padding=padding,
                        activation=activation_function_convLayer,
                        regularizer=regularizer)
        #conv3 = conv_1d(cnn_model, nb_filter = 128,  filter_size = 5, padding = 'same',
        #                 activation = 'relu', regularizer = 'L2')
        self.cnn_model = merge([conv1, conv2], mode='concat', axis=1)
        """ Expand one dimension to fit the max_pooling layer """
        self.cnn_model = tf.expand_dims(self.cnn_model, 1)
        self.cnn_model = global_max_pool(self.cnn_model)
        """ Instantiate dropout layer and specify dropout parameter """
        self.cnn_model = dropout(self.cnn_model, dropouts)
        """ Instantiate fully connected layer and regression layer. """
        self.cnn_model = fully_connected(self.cnn_model,
                                         self.number_of_classes,
                                         activation=activation_function_fc)
        self.cnn_model = regression(self.cnn_model,
                                    optimizer=optimizer,
                                    learning_rate=learning_rate,
                                    loss=loss_function,
                                    name='models/' + model_name)

    def train_and_save(self,
                       model_name,
                       tensorboard_verbose=0,
                       tensorboard_dir='/logs/',
                       nb_epochs=5,
                       shuffle=True,
                       show_metric=True):
        """ Instantiate Deep neural network model and start the training """
        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.fit(self.X_train,
                       self.Y_train,
                       n_epoch=nb_epochs,
                       validation_set=(self.X_test, self.Y_test),
                       shuffle=shuffle,
                       show_metric=show_metric,
                       batch_size=self.batch_size,
                       run_id=model_name)
        """ Save the model """
        self.model.save('models/' + model_name)

    def load_model(self,
                   model_name,
                   outputDim=300,
                   number_of_filters=60,
                   filterSize=[3, 4],
                   padding='same',
                   activation_function_convLayer='relu',
                   regularizer='L2',
                   dropouts=0.5,
                   activation_function_fc='softmax',
                   optimizer='adam',
                   learning_rate=0.001,
                   loss_function='categorical_crossentropy',
                   tensorboard_verbose=0,
                   tensorboard_dir='/logs/'):
        """
            Has to pass the same values that the models were trained with. If the
            model was trained on default values, the parameters will pass it automatically.
        """

        self.create_cnn_architecture_two_layers(
            model_name, outputDim, number_of_filters, filterSize, padding,
            activation_function_convLayer, regularizer, dropouts,
            activation_function_fc, optimizer, learning_rate, loss_function)

        self.model = tflearn.DNN(self.cnn_model,
                                 tensorboard_verbose=tensorboard_verbose,
                                 tensorboard_dir=tensorboard_dir)
        self.model.load('models/' + model_name)

    def load_test_dataset(self,
                          filename='testDatasetWithOutNeuTwo',
                          vocab_name='vocabProc'):
        """
            Something is wrong with this function. Does not get the same result
            as before when loading in the new data...
        """
        """ Load test dataset """
        self.test_x, self.test_y = load_csv('datasets/' + filename,
                                            target_column=1)
        """ Get restored vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Encode pos, neu and neg to numbers  """
        labelEncoder = LabelEncoder()
        labelEncoder.fit(self.test_y)
        self.test_y = labelEncoder.transform(self.test_y)
        """ Change the list of sentences to a list of sequence of words """
        self.test_x = np.array(
            list(self.vocabProcessor.transform([x[0] for x in self.test_x])))
        """ Pad the sequences to fit the longest sentence """
        self.test_x = pad_sequences(self.test_x,
                                    maxlen=self.max_words,
                                    value=0.)
        """ Convert labels to binary vector """
        self.test_y = to_categorical(self.test_y,
                                     nb_classes=self.number_of_classes)

    def evaluate_model_performance(self):
        metrix_score = self.model.evaluate(self.test_x,
                                           self.test_y,
                                           batch_size=self.batch_size)
        return metrix_score

    def predict_one_sentence(self, sentence=[['']], vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Transorm sentence to matrix of numbers """
        sentence = np.array(
            list(self.vocabProcessor.transform([x[0] for x in sentence])))
        sentence = pad_sequences(
            sentence,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict sentence """
        pred_score = self.model.predict(sentence)
        return pred_score

    def predict_list(self,
                     list_of_sentences=[[''], ['']],
                     vocab_name='vocabProc'):
        """ Load vocabulary processor """
        self.vocabProcessor = VocabularyProcessor.restore(vocab_name)
        """ Transform the sentences to a matrix of word ids """
        sentences = np.array(
            list(self.vocabProcessor.transform([x[0] for x in list_of_sentences])))
        sentences = pad_sequences(
            sentences,
            maxlen=self.vocabProcessor.max_document_length,
            value=0.)
        """ Predict the sentences """
        pred_score = self.model.predict(sentences)
        return pred_score
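An end-to-end usage sketch for the CNN class above; the file, vocabulary and model names are placeholders.

cnn = CNN(batch_size=64)

# Fit the vocabulary, split the data and train a fresh model.
cnn.load_dataset_training(vocab_name='vocabProc', filename='datasetWithoutNeutral')
cnn.create_cnn_architecture_two_layers(model_name='cnn_sentiment')
cnn.train_and_save(model_name='cnn_sentiment', nb_epochs=5)

# Evaluate on the held-out test file and score a new sentence.
cnn.load_test_dataset(filename='testDatasetWithOutNeuTwo', vocab_name='vocabProc')
print(cnn.evaluate_model_performance())
print(cnn.predict_one_sentence([['What a fantastic game!']]))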
Example 27
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tflearn.data_utils import pad_sequences, to_categorical, load_csv, VocabularyProcessor
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool, max_pool_1d
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression

batch_size = 64
""" Load the dataset """
#X, Y = load_csv('datasetFullList', target_column = 2, columns_to_ignore = [0])
X, Y = load_csv('datasetWithoutNeutral',
                target_column=2,
                columns_to_ignore=[0])
""" Count max words from the longest sentence """
max_words = max([len(x[0].split(" ")) for x in X])
""" Get vocabulare size from longest sentence """
vocab = VocabularyProcessor(max_words)
""" Encode pos, neu and neg to numbers """
labelEncoder = LabelEncoder()
labelEncoder.fit(Y)
Y = labelEncoder.transform(Y)
""" Change the list of sentences to a list of sequence of words """
X = np.array(list(vocab.fit_transform([x[0] for x in X])))
""" Split the datasets to training set and test test """
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.10,
                                                    random_state=7)
""" Pad the sequences to fit the longest sentence """
X_train = pad_sequences(X_train, maxlen=max_words, value=0.)
X_test = pad_sequences(X_test, maxlen=max_words, value=0.)
""" Convert labels to binary vector """
Example 28
data.loc[data["score_phrase"] == "Disaster", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Unbearable", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Painful", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Bad", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Mediocre", "score_phrase"] = 0
data.loc[data["score_phrase"] == "Awful", "score_phrase"] = 0

data.loc[data["score_phrase"] == "Amazing", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Great", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Okay", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Masterpiece", "score_phrase"] = 1
data.loc[data["score_phrase"] == "Good", "score_phrase"] = 1

# tokenize title
# data["token_title"] = data["title"].apply(nltk.word_tokenize)
word_processor = VocabularyProcessor(100)
tmp = np.array(list(word_processor.fit_transform(data["title"])))

# split into train and test data
train_data, test_data = model_selection.train_test_split(data, train_size=0.9)
trainX = np.array(list(word_processor.transform(train_data["title"])))
trainY = train_data.loc[:, ["score_phrase"]].values
testX = np.array(list(word_processor.transform(test_data["title"])))
testY = test_data.loc[:, ["score_phrase"]].values

# Converting labels to binary vectors
trainY = to_categorical(trainY, nb_classes=2)
testY = to_categorical(testY, nb_classes=2)

# Network building
net = tflearn.input_data([None, 100])
Example 29
    '\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
)
# print(type(df.num.apply(str)))
# print(type(df.text))
text = df['text'].astype(str)
# num = df['num'].astype(str)
# print(type(text))
# print(text)

# Extract the required columns for inputs and outputs
totalX = text
totalY = df.num

# # Convert the strings in the input into integers corresponding to the dictionary positions
# # VocabularyProcessor pads to its max document length; pad_sequences below re-pads to the network's expected length
vocab_proc = VocabularyProcessor(10)
totalX = np.array(list(vocab_proc.fit_transform(totalX)))
seed = np.array(list(vocab_proc.fit_transform(seed)))
# # We will have 11 classes in total for prediction, indices from 0 to 10
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
# # Convert the indices into 11 dimensional vectors
totalY = to_categorical(totalY, nb_classes=10)

# Split into training and testing data
# trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)
totalX = pad_sequences(totalX, maxlen=100, value=0.)
seed = pad_sequences(seed, maxlen=100, value=0.)
# testX = pad_sequences(testX, maxlen=100, value=0.)
# print(trainX)
# print(trainY)
Example 30
def chinese_tokenizer(documents):
    """
    把中文文本转为词序列
    """

    for document in documents:
        # 繁体转简体
        text = HanziConv.toSimplified(document)
        # 英文转小写
        text = text.lower()
        # 分词
        yield list(cut(text))


# Pad or truncate each sequence to length 100; drop words with frequency <= 2
vocab = VocabularyProcessor(100, 2, tokenizer_fn=chinese_tokenizer)

# Build the vocabulary; it cannot be changed after fitting
vocab.fit(DOCUMENTS)

# Save and reload the vocabulary
vocab.save('vocab.pickle')
vocab = VocabularyProcessor.restore('vocab.pickle')

# Convert text to sequences of word ids; unknown and padding words map to id 0
id_documents = list(vocab.transform(DOCUMENTS))
for id_document in id_documents:
    print(id_document)
# [2 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
#  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Example 31
save_model = 0

# Select only the two columns we require. Game title and its corresponding emotion
dataframe = pd.read_csv('ign.csv').iloc[:, 1:3]
# Fill null values with empty strings
dataframe.fillna(value='', inplace=True)

# print(dataframe.score_phrase.value_counts())

# Extract the required columns for inputs and outputs
totalX = dataframe.title
totalY = dataframe.score_phrase

# Convert the strings in the input into integers corresponding to the dictionary positions
# VocabularyProcessor pads each sequence to the max document length, so a separate pad_sequences call is not needed here
vocab_proc = VocabularyProcessor(15)
totalX = np.array(list(vocab_proc.fit_transform(totalX)))

# We will have 11 classes in total for prediction, indices from 0 to 10
vocab_proc2 = VocabularyProcessor(1)
totalY = np.array(list(vocab_proc2.fit_transform(totalY))) - 1
# Convert the indices into 11 dimensional vectors
totalY = to_categorical(totalY, nb_classes=11)

# Split into training and testing data
trainX, testX, trainY, testY = train_test_split(totalX, totalY, test_size=0.1)

# Build the network for classification
# Each input has length of 15
net = tflearn.input_data([None, 15])
# The 15 input word integers are then cast into 256 dimensions each, creating a word embedding.