Example #1
def main():
    G = GloveEmbedding("glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #print("locon: ", word_to_idx["locon"])
    s = "I love New York and music locon"
    s = s.lower()
    print("Sentence: ", s)
    S = SentenceToIndices(word_to_idx)
    sentence = S.map_sentence(s)
    print("Sentence to indices: ", sentence)
    print("Padded: ", PadSentences(10).pad(sentence))
    SE = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    matrix = SE.map_sentence(s, max_len=10)
    print("Matrix: ", matrix)
    print("Matrix.shape: ", matrix.shape)
    print("Embedding i: ", embedding[word_to_idx["i"]])

    sentences = []
    sentences.append("I esta malo".lower())
    sentences.append("Love la musica salsa.".lower())
    sentences.append("Uff, q mal te va nene".lower())
    mapped, mlen = S.map_sentence_list(sentences)
    print("mlen: ", mlen)
    for s in mapped:
        print(s)
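A minimal sketch of the padding step used above, assuming PadSentences simply appends the reserved index 0 until the target length is reached (the commented-out padding code in Example #14 suggests exactly this, but the real class may differ):

class PadSentencesSketch:
    def __init__(self, max_len):
        self.max_len = max_len

    def pad(self, sentence):
        # append the padding index 0 until the sentence reaches max_len
        return sentence + [0] * max(0, self.max_len - len(sentence))

# PadSentencesSketch(10).pad([5, 7, 9]) -> [5, 7, 9, 0, 0, 0, 0, 0, 0, 0]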
Example #2
def set_trained_data(data, NN):
    new_data = []
    for row in data:
        new_data.append(row[1])

    G = GloveEmbedding("data/glove.6B.50d.txt", dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)

    X_Predict_Idx, max_len = S.map_sentence_list(new_data)

    # the trained model expects a fixed input length, so override the computed max_len
    max_len = 72

    print("Max Len", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)

    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    X_Predict_Final = np.array(X_Predict_Final)
    X_Prediction = NN.predict(X_Predict_Final)
    final = np.argmax(X_Prediction, axis=1)
    return new_data, final
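For reference, a sketch of the trim step, assuming TrimSentences(max_len).trim_list cuts every index list down to max_len items (an assumption; the real implementation is not shown in these examples):

class TrimSentencesSketch:
    def __init__(self, max_len):
        self.max_len = max_len

    def trim_list(self, sentences):
        # keep only the first max_len indices of each sentence
        return [s[:self.max_len] for s in sentences]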
Example #3
def main():
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    M = TweetSimilaryBasic(72, G, 5, 3)
    M.build()
    M.summary()
    M.plot("data/model3")
Example #4
def main():
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    print("embedding shape: ", embedding.shape)
    print("idx hello: ", word_to_idx["hello"])
    print("word 20: ", idx_to_word[20])
    e = embedding[word_to_idx["hello"]]
    print("embedding hello: ", e)
    print("e.shape: ", e.shape)
    print("<UNK>: ", word_to_idx['<unk>'])
    print("embedding: <UNK>: ", embedding[word_to_idx['<unk>']])
Example #5
def main():
    G = GloveEmbedding("../test/data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #print("locon: ", word_to_idx["locon"])
    print("Length dictionary: ", len(word_to_idx))
    #s = "I love New York and music locon"
    s = "The flu is making me sad"
    s = s.lower()
    print("Sentence: ", s)
    S = SentenceToIndices(word_to_idx)
    sentence = S.map_sentence(s)
    print("Sentence to indices: ", sentence)
    print("Padded: ", PadSentences(10).pad(sentence))
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    matrix1 = SE.map_sentence(s, max_len=len(s))  # note: len(s) counts characters, not words

    s2 = "The flu is making me sad".lower()
    matrix2 = SE.map_sentence(s2, max_len=len(s2))

    print("Matrix 1: ", matrix1)
    print("Matrix.shape: ", matrix1.shape)
    print("\n Matrix 2: ", matrix2)
    print("Matrix.shape: ", matrix2.shape)

    print("\n Self Similarity: ", matrix_cosine_similary(matrix1, matrix1))

    M1 = np.array([-1, 40, 0.04]).reshape((3, 1))
    M2 = np.array([100, 2, 3]).reshape((3, 1))
    print("M1: \n ", M1)
    print("M2: \n", M2)
    SimM = matrix_cosine_similary(M1, M2)
    print("SimM: \n", SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)

    M3 = np.array([[1, 2, 3, 1], [4, 5, 6, 2], [7, 8, 9, 1]])
    M4 = np.array([[1, 2, 3.000001, 1], [4, 5, 6, 2], [7, 8, 9, 1]])

    SimM = matrix_cosine_similary(M3, M3)
    print("SimM: \n", SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)

    SimM = matrix_cosine_similary(M3, M4)
    print("\nSimM: \n", SimM)
    Up = np.triu(SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)
    print("Up: ", Up)
    print("sum Up: ", np.sum(Up))
    print("up I: ", np.triu(np.ones(Up.shape)))
    print("sum I: ", np.sum(np.triu(np.ones(Up.shape))))
Example #6
    def get_glove_embedding(self):
        g = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = g.read_embedding()
        s = SentenceToIndices(word_to_idx)
        x_train_indices, max_len = s.map_sentence_list(self.x_all)

        if max_len % 2 != 0:
            max_len = max_len + 1  # keep max_len even; downstream layers appear to assume an even length

        p = PadSentences(max_len)
        x_train_pad = p.pad_list(x_train_indices)

        # TRIM Tweets to remove noisy data
        trim_size = max_len
        trim = TrimSentences(trim_size)
        x_train_pad = trim.trim_list(x_train_pad)

        return x_train_pad, max_len, g
Example #7
def get_glove(glove_dims):  # pick a GloVe file by dimension and return a sentence mapper
    if glove_dims == 50:
        G = GloveEmbedding(filename="../test/data/glove.twitter.27B.50d.txt", dimensions=50)
    elif glove_dims == 200:
        G = GloveEmbedding(filename="../test/data/glove.twitter.27B.200d.txt", dimensions=200)
    elif glove_dims == 300:
        G = GloveEmbedding(filename="../test/data/glove.840B.300d.txt", dimensions=300)
    else:
        print("Wrong number of dimensions")
        exit(1)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #S = SentenceToIndices(word_to_idx)
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    return SE
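A hypothetical call site for get_glove, reusing map_sentence as in the other examples (the shape comment is an assumption):

SE = get_glove(50)  # loads glove.twitter.27B.50d.txt
m = SE.map_sentence("the flu is making me sad", max_len=10)
print(m.shape)  # presumably (10, 50): 10 token positions, 50 dimensions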
Example #8
with open(labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    i = 0
    csv_file = csv.reader(f, delimiter=',')
    for r in csv_file:
        if i != 0:
            tweet = r[0]
            label = int(r[1])  # cast: the CSV yields labels as strings
            X_all.append(tweet)
            Y_all.append(label)
        i = i + 1
print("Data Ingested")
num_data = len(X_all)
limit = math.ceil(num_data * 0.60)
X_train_sentences = X_all
Y_train = Y_all
G = GloveEmbedding(embedding_filename)
word_to_idx, idx_to_word, embedding = G.read_embedding()
S = SentenceToIndices(word_to_idx)
X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
print("Train data mappend to indices")
P = PadSentences(max_len)
X_train_pad = P.pad_list(X_train_indices)
print("Train data padded")
# convert to NumPy arrays
X_train = np.array(X_train_pad)
Y_train = np.array(Y_train)
Y_train = to_categorical(Y_train, num_classes=3)
print("Train data convert to numpy arrays")
model = KerasClassifier(build_fn=create_model(G, max_len))  # assumes create_model(G, max_len) returns a model-building callable
print("Model created")
# define the grid search parameters
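One possible continuation of the grid search; the original parameter grid is not shown, so the values below are placeholders:

from sklearn.model_selection import GridSearchCV

param_grid = {'batch_size': [16, 32, 64], 'epochs': [5, 10]}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))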
Example #9
def set_prediction(pretrain):
    data = pretrain[:, 0]
    #Load Model
    model2 = load_model('trained/model1_50d_stoplemma_10e_new_prod.h5',
                        custom_objects={'tf': tf})
    # summarize model.
    model2.summary()
    # Load data
    G = GloveEmbedding("data/glove.twitter.27B.50d.txt", dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    premise = "same busy and just over the flu so feeling great"
    premise = "when ebola struck the doctors stepped up to the plate and the rest of us sat and watched them do their stuff to all engineers and environmentalists this is our time to step up and find answers to these consequences of our failure to coexist with nature"
    premise = remove_stopwords(premise)
    premise = lemmatizer_spacy(premise)
    x_premise = remove_stopwords(premise)
    x_premise = np.full((len(data)), x_premise)

    x_hypothesis = []

    for row in data:
        #row = row.replace("’", "'")
        #row = fix_text_format(row)
        row = remove_stopwords(row)
        #row = lemmatizer_spacy(row)
        #row = remove_stopwords(row)
        x_hypothesis.append(row)
    x_hypothesis = np.array(x_hypothesis)

    X_one_indices, max_len1 = map_to_idx(S, x_premise)
    X_two_indices, max_len2 = map_to_idx(S, x_hypothesis)
    print("len: ", max_len1, max_len2)
    #max_len = max(max_len1, max_len2)
    max_len = 44  # fixed length expected by the trained model
    print("max_len_final: ", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)

    X_one_train = P.pad_list(X_one_indices)
    X_two_train = P.pad_list(X_two_indices)
    #X_one_train = Trim.trim_list(X_one_indices)
    X_two_train = Trim.trim_list(X_two_train)

    X_one_train = np.array(X_one_train)
    X_two_train = np.array(X_two_train)

    X_one_aux_disease = set_disease(x_premise)
    X_two_aux_disease = set_disease(x_hypothesis)

    new_dis = []
    for _ in range(len(data)):
        new_dis.append([0, 0, 1, 0, 0, 1])

    X_one_aux_train = new_dis

    X_two_aux_label = pretrain[:, 1]

    #X_one_aux_train = binarize_aux(s4, X_one_aux_label)
    X_two_aux_train = binarize_aux(X_two_aux_disease, X_two_aux_label)
    #new_two = []
    #for row in range(len(data)):
    #    new_two.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6]])

    X_two_aux_train = X_two_aux_train.tolist()
    for row in X_two_aux_train:
        del row[6]

    X_one_aux_train = np.array(X_one_aux_train)
    X_two_aux_train = np.array(X_two_aux_train)
    print("one_aux: ", np.array(X_one_aux_train).shape)
    print(X_one_aux_train[:5])
    print("two_aux: ", np.array(X_two_aux_train).shape)
    print(X_two_aux_train[:5])
    model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    #model2.compile(optimizer='rmsprop',loss={'R1': 'mean_squared_error'},metrics={'R1': 'mse'}, loss_weights={'R1': 0.25})
    #model2.compile(optimizer='rmsprop')
    #model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    X_Prediction = model2.predict(
        [X_one_train, X_two_train, X_one_aux_train, X_two_aux_train])

    return X_Prediction
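map_to_idx is not defined in this excerpt; a sketch consistent with how it is called above (S is a SentenceToIndices, whose map_sentence_list already returns the pair being unpacked):

def map_to_idx(mapper, sentences):
    # returns (list of index lists, length of the longest sentence)
    return mapper.map_sentence_list(list(sentences))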
Example #10
    print("Epsilon 2: ", ans2)

    #cluster 3
    ans3 = ((np.array(c3) - np.array(oldc3)) < EM).all()
    print("Epsilon 3: ", ans3)

    return ans1 and ans2 and ans3


if __name__ == "__main__":
    #Step 1: Set Centroids
    print("Step 1: Starting")
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    data = []
    dictionary1 = {}
    dictionary2 = {}
    try:
        with open("data/small_tweets.txt", "r", encoding='utf-8') as f:
            for line in f:
                newline = " ".join(line.split())
                data.append(newline)
    except Exception as e:
        print(e)
    max_len = get_max_len(data)
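get_max_len is not defined in this excerpt; a plausible sketch consistent with its use:

def get_max_len(sentences):
    # token count of the longest sentence (assumed implementation)
    return max(len(s.split()) for s in sentences)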
Example #11
def main():
    G = GloveEmbedding("glove.6B.50d.txt")

    T = TweetSentiment2LSTM(10, G)
Example #12
    def process(self, json_filename, h5_filename):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            for r in csv_file:
                if i != 0:
                    tweet = r[0]
                    label = int(r[1])  # cast: the CSV yields labels as strings
                    X_all.append(tweet)
                    Y_all.append(label)
                i = i + 1
        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        # divide the data into X_train, Y_train, X_test, Y_test
        X_train_sentences = X_all[0:limit]
        Y_train = Y_all[0:limit]
        X_test_sentences = X_all[limit:]
        Y_test = Y_all[limit:]
        print("Data Divided")
        # Get embedding
        G = GloveEmbedding(self.embedding_filename)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mappend to indices")
        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # Trim
        #trim_size = 40
        #Trim = TrimSentences(trim_size)
        #X_train_pad = Trim.trim_list(X_train_pad)
        # convert to NumPy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        print("Train data convert to numpy arrays")
        NN = TweetSentiment2LSTM2Dense(max_len, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)

        print("model created")
        NN.build(first_layer_units=128,
                 dense_layer_units=1,
                 first_layer_dropout=0,
                 second_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.3, momentum=0.001, decay=0.01, nesterov=False)
        adam = Adam(lr=0.03)
        #NN.compile(loss="binary_crossentropy", metrics=['binary_accuracy'], optimizer=adam)
        NN.compile(loss="binary_crossentropy",
                   metrics=['binary_accuracy'],
                   optimizer='rmsprop')

        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        NN.fit(X_train, Y_train, epochs=5, callbacks=[callback])
        print("Model trained")
        X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        print("Test data mapped")
        X_test_pad = P.pad_list(X_test_indices)
        print("Test data padded")
        X_test = np.array(X_test_pad)
        Y_test = np.array(Y_test)
        print("Test data converted to numpy arrays")
        loss, acc = NN.evaluate(X_test, Y_test)
        print("accuracy: ", acc, ", loss: ", loss)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        #X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", NN.predict(X_Predict_Final))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        print("Done!")
Example #13
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=100):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all

        G = GloveEmbedding(self.embedding_filename, dimensions=100)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        # print("hello", embedding[47])
        # print("hello", embedding[9876])

        # S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
        #
        # edata = []
        # padding_vect = [0] * 100
        #
        # # // exit(0)
        # n = 0
        # for i in X_train_sentences:
        #     # print("Buenoooooooo", n)
        #     m = S.map_sentence(i)
        #     # print("n:", n)
        #     if m.shape[0] < 75:
        #         m = np.vstack((m, np.zeros((75-m.shape[0],100))))
        #         # when coding "efficiently", this has to be used
        #         # while m.shape[0] < 75:
        #             # m = np.vstack((m,np.array(padding_vect)))
        #     else:
        #         if m.shape[0] == 100:
        #             m = np.array([m])
        #             m = np.vstack((m, np.zeros((75-m.shape[0],100))))
        #     p = np.array([m])
        #     # print("ghjkl", str(p.shape), "  ghjkluhnm ", n, i)
        #     if n > 0:
        #         edata = np.vstack((edata, p))
        #     else:
        #         edata = p
        #     # print("----------------------------------->" + str(edata.shape))
        #     n = n+1

        # np.save("array", edata)
        edata = np.load("data/array.npy")  # precomputed embedding tensor saved by the commented-out block above

        print("----------------------------------->" + str(edata.shape))

        # exit(0)
        X_train = edata
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        # plt.imshow(X_train[0])
        # plt.show()

        #Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print("----------------------------------->" + str(X_train.shape))

        print(
            "I understand this is the data used to do the training: ",
            len(X_train), len(X_train[0]), len(X_train[0][0]), " Y_train ",
            len(Y_train))
        print("Train data converted to NumPy arrays")
        NN = KerasInceptionCNN(0, G)

        print("model created")
        kernel_regularizer = l2(0.001)

        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        print("model built")
        NN.summary()

        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        sgd = SGD(lr=0.05)

        NN.compile(optimizer='adam',
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        # HERE

        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=3,
                         class_weight=class_weight_dictionary)

        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        S = SentenceToIndices(word_to_idx)  # not defined earlier in this excerpt
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        P = PadSentences(max_len2)  # P and Trim are likewise missing from this excerpt
        Trim = TrimSentences(max_len2)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #14
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=100):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all

        G = GloveEmbedding(self.embedding_filename, dimensions=100)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        print("hello", embedding[47])
        print("hello", embedding[9876])

        S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)

        #X_train_matrixes = S.map_sentence(c)

        edata = []
        padding_vect = [0] * 100
        #
        # s = "I love New York and music locon"
        #
        # e = "I love New York and music locon"
        #
        # mskdn = [e, s]

        for i in X_train_sentences:
            m = S.map_sentence(i)
            # zero-pad each embedding matrix to 75 rows
            while len(m) < 75:
                m = np.vstack((m, padding_vect))
            edata.append(m)
        print("hello", edata[1])
        print("len", len(edata[1]), " ", len(edata[1][1]), " ")

        # padding_len = self.max_len - len(sentence)
        # if (padding_len > 0):
        #     padding = []
        #     r = range(0, padding_len)
        #     for _ in r:
        #         padding.append(0)
        # return sentence + padding

        # print("Train data mappend to indices")
        # if max_len % 2 !=0:
        #     max_len = max_len + 1
        #
        # P = PadSentences(max_len)
        # X_train_pad = P.pad_list(X_train_indices)
        # print("Train data padded")
        # # TRIM
        # trim_size = max_len
        # #trim_size = 33
        # Trim = TrimSentences(trim_size)
        # X_train_pad = Trim.trim_list(X_train_pad)
        # print("X[0], ", X_train_pad[0])
        # #convert to numPY arrays
        X_train = np.array(edata)
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        #Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print(
            "I understand this is the data used to do the training: ",
            X_train)
        # print ("data divided on value: ", limit)
        # print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        # print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data convert to numpy arrays")
        NN = KerasInceptionCNN(0, G)

        print("model created")
        kernel_regularizer = l2(0.001)

        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        print("model built")
        NN.summary()

        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        sgd = SGD(lr=0.05)

        NN.compile(optimizer='adam',
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=32,
                         callbacks=[callback],
                         class_weight=class_weight_dictionary)
        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        S_idx = SentenceToIndices(word_to_idx)  # S above maps to embeddings, not indices
        X_Predict_Idx, max_len2 = S_idx.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        P = PadSentences(max_len2)  # P and Trim are not defined earlier in this excerpt
        Trim = TrimSentences(max_len2)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #15
def main():
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    print("embedding shape: ", embedding.shape)
    print("idx hello: ", word_to_idx["hello"])
    print("word 20: ", idx_to_word[20])
    e = embedding[word_to_idx["hello"]]
    print("embedding hello: ", e)
    print("e.shape: ", e.shape)
    print("<UNK>: ", word_to_idx['<unk>'])
    print("embedding: <UNK>: ", embedding[word_to_idx['<unk>']])

    you = embedding[word_to_idx['you']]
    he = embedding[word_to_idx['he']]
    ise = embedding[word_to_idx['is']]  # named 'ise' because 'is' is a Python keyword
    crazy = embedding[word_to_idx['crazy']]
    nuts = embedding[word_to_idx['nuts']]

    print("embedding of you: ", you)
    print("embedding of he: ", he)
    print("embedding of ise: ", ise)
    print("embedding of crazy: ", crazy)
    print("embedding of nuts: ", nuts)

    tweet1 = "You are crazy"
    tweet2 = "You are nuts"
    tweet3 = "He is crazy"
    tweet4 = "You are lazy"
    tweet5 = "You are crazy man"
    tweet6 = "Yes You are crazy"
    tweet7 = "The fast train"

    mapper = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    emb1 = mapper.map_sentence(tweet1.lower(), 4)
    emb2 = mapper.map_sentence(tweet2.lower(), 4)
    emb3 = mapper.map_sentence(tweet3.lower(), 4)
    emb4 = mapper.map_sentence(tweet4.lower(), 4)
    emb5 = mapper.map_sentence(tweet5.lower(), 4)
    emb6 = mapper.map_sentence(tweet6.lower(), 4)
    emb7 = mapper.map_sentence(tweet7.lower(), 4)


    print("Distance tweet1 vs tweet2: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb2))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb2))
    print("Distance tweet1 vs tweet3: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb3))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb3))

    print("Distance tweet2 vs tweet3: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb2, emb3))
    print("Cos Tri: ", sim.TriUL_sim(emb2, emb3))

    print("Distance tweet1 vs tweet4: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb4))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb4))

    print("Distance tweet1 vs tweet5: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb5))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb5))

    print("Distance tweet1 vs tweet6: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb6))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb6))

    print("Distance tweet1 vs tweet7: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb7))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb7))


    print("Embedding tweet1: ")
    print(emb1)
    print("Embedding tweet6: ")
    print(emb6)
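A sketch of what sim.Frobenius_Distance presumably computes, namely the Frobenius norm of the difference matrix (illustrative only):

import numpy as np

def frobenius_distance(a, b):
    # Frobenius norm of (a - b) for two 2-D embedding matrices
    return np.linalg.norm(a - b, ord='fro')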
Example #16
def getGlove():
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    return SE
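Hypothetical usage, combining getGlove with the distance helpers from Example #15:

SE = getGlove()
a = SE.map_sentence("you are crazy", 4)
b = SE.map_sentence("you are nuts", 4)
print(sim.Frobenius_Distance(a, b))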
Example #17
    def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []
        Zeros = []
        with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0
            Ones = []
            for r in csv_file:
                if i != 0:
                    label = int(r[1])
                    #if label == 0:
                    #  Zeros.append(r)
                    All.append(r)
                    # tweet = r[0]
                    # label = r[1]
                    # X_all.append(tweet)
                    # Y_all.append(label)
                i = i + 1

        print("len(All): ", len(All))
        np.random.shuffle(All)

        ones_count = 0
        for r in All:
            tweet = r[0].strip()
            label = int(r[1])
            if (label == 2):
                label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        X_train_sentences = X_all
        Y_train = Y_all
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")
        # Get embedding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        G = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mapped to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 45
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to NumPy arrays
        X_train_reverse = []
        for X in X_train_pad:
            t = X[::-1]
            X_train_reverse.append(t)
        X_train = np.array(X_train_pad)
        X_train_reverse = np.array(X_train_reverse)
        Y_train = np.array(Y_train)

        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        #Y_train = to_categorical(Y_train, num_classes=3)
        print("Train data convert to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        NN = TweetSentiment2DCNN2Channel(trim_size, G)
        #NN = TweetSentimentInception(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11, first_dropout=0, second_dropout=0.1, padding='valid', dense_units=32)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        #sgd = SGD(lr=0.05)
        NN.compile(optimizer=rmsprop, loss="binary_crossentropy", metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        class_weight = None
        history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs, batch_size=32, callbacks=[callback], validation_split=0.20, class_weight=class_weight)
        print("Model trained")
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola", "there is a diarrhea outbreak in the city"]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Reverse = []
        for r in X_Predict_Final:
            t = r[::-1]
            X_Predict_Reverse.append(t)

        X_Predict_Final = np.array(X_Predict_Final)
        X_Predict_Reverse = np.array(X_Predict_Reverse)
        Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
        Preds = ((Preds >= 0.5) * 1).flatten()
        print("Predict: ", Preds)
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #18
 def process(self, json_filename, h5_filename, plot=False, epochs=100):
     np.random.seed(11)
     # open the file with tweets
     X_all = []
     Y_all = []
     with open(self.labeled_tweets_filename, "r",
               encoding="ISO-8859-1") as f:
         i = 0
         csv_file = csv.reader(f, delimiter=',')
         for r in csv_file:
             if i != 0:
                 tweet = r[0]
                  label = int(r[1])  # cast: the CSV yields labels as strings
                 X_all.append(tweet)
                 Y_all.append(label)
             i = i + 1
     print("Data Ingested")
     # divide the data into training and test
     num_data = len(X_all)
     limit = math.ceil(num_data * 0.60)
     X_train_sentences = X_all
     Y_train = Y_all
     # divide the data into X_train, Y_train, X_test, Y_test
     #X_train_sentences = X_all[0: limit]
     #Y_train = Y_all[0: limit]
     #X_test_sentences = X_all[limit:]
     #Y_test = Y_all[limit:]
     #print("Data Divided")
      # Get embedding
     G = GloveEmbedding(self.embedding_filename)
     word_to_idx, idx_to_word, embedding = G.read_embedding()
     S = SentenceToIndices(word_to_idx)
     X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
     print("Train data mappend to indices")
     P = PadSentences(max_len)
     X_train_pad = P.pad_list(X_train_indices)
     print("Train data padded")
      # convert to NumPy arrays
     X_train = np.array(X_train_pad)
     Y_train = np.array(Y_train)
     Y_train = to_categorical(Y_train, num_classes=3)
     print("Train data convert to numpy arrays")
     NN = TweetSentiment2LSTM2DenseSM(max_len, G)
     print("model created")
     kernel_regularizer = l2(0.001)
     kernel_regularizer = None
     NN.build(first_layer_units=max_len,
              second_layer_units=max_len,
              relu_dense_layer=5,
              dense_layer_units=3,
              first_layer_dropout=0.3,
              second_layer_dropout=0.6,
              l2=kernel_regularizer)
     print("model built")
     NN.summary()
     sgd = SGD(lr=0.001, momentum=0.09, decay=0.001, nesterov=True)
     rmsprop = RMSprop(decay=0.003)
     adam = Adam(lr=0.1, decay=0.05)
     NN.compile(optimizer=rmsprop,
                loss="categorical_crossentropy",
                metrics=['accuracy', precision, recall, f1, fprate])
     print("model compiled")
     print("Begin training")
     callback = TensorBoard(log_dir="/tmp/logs")
     w_dict = {0: 0.31, 1: 0.63, 2: 0.06}
     history = NN.fit(X_train,
                      Y_train,
                      epochs=epochs,
                      callbacks=[callback],
                      validation_split=0.2,
                      class_weight=w_dict)
     print("Model trained")
     # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
     # print("Test data mapped")
     # X_test_pad = P.pad_list(X_test_indices)
     # print("Test data padded")
     # X_test = np.array(X_test_pad)
     # Y_test = np.array(Y_test)
     # print("Test data converted to numpy arrays")
     # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
     # print("accuracy: ", acc)
     T = "I have a bad case of vomit"
     X_Predict = [
         "my zika is bad", "i love colombia",
         "my has been tested for ebola",
         "there is a diarrhea outbreak in the city"
     ]
     X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
     i = 0
     for s in X_Predict_Idx:
         print(str(i) + ": ", s)
         i = i + 1
     print(X_Predict)
     X_Predict_Final = P.pad_list(X_Predict_Idx)
     #X_Predict = [X_Predict]
     X_Predict_Final = np.array(X_Predict_Final)
     print("Predict: ", NN.predict(X_Predict_Final))
     print("Storing model and weights")
     NN.save_model(json_filename, h5_filename)
     if plot:
         print("Ploting")
         self.plot(history)
     print("Done!")
Example #19
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=50):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            # if (label == 2):
            #     label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all
        # Divide after conversions
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")

        # Get embedding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)

        G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mappend to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 33
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to NumPy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        # Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print("data divided on value: ", limit)
        print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data convert to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense3Layer(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense4Layer(trim_size, G)
        #NN = TweetSentimentCNN(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)
        NN = TweetSentiment1D(trim_size, G)
        #NN = TweetSentiment1DRev(trim_size, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        #NN.build(first_layer_units = max_len, second_layer_units = max_len, relu_dense_layer=16, dense_layer_units = 3,
        #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.0003, decay=0.001)
        sgd = SGD(lr=0.05)
        NN.compile(optimizer=adam,
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])

        print("model compiled")
        print("Begin training")
        #callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        #history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback], class_weight=class_weight_dictionary)
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=64,
                         class_weight=class_weight_dictionary,
                         validation_split=0.2)

        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #20
def main(model_file, model_weights, labeled_tweets, embedding_filename):
    # load json and create model
    with open(model_file, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")
    # evaluate loaded model on test data
    loaded_model.compile(loss='binary_crossentropy',
                         optimizer='rmsprop',
                         metrics=['accuracy'])

    # open the file with tweets
    X_all = []
    Y_all = []
    All = []

    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0

        for r in csv_file:
            if i != 0:
                label = int(r[1])
                if (label == 1) or (label == 2):
                    if ones_count <= 13000:
                        All.append(r)
                        ones_count += 1
                else:
                    All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1

    ones_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if (label == 2):
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count +=1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)

    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    # X_train_sentences = X_all[0: limit]
    # Y_train = Y_all[0: limit]
    # X_test_sentences = X_all[limit:]
    # Y_test = Y_all[limit:]
    # print("Data Divided")
    # Get embedding
    # G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mappend to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1

    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to NumPy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    # Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data convert to numpy arrays")
    Preds = loaded_model.predict(X_train)
    Preds = ((Preds >= 0.5) * 1).flatten()
    with open("data/alltweetsanderrors.csv", "w") as f:
        csv_writer = csv.writer(f, delimiter=",")
        i = 0
        err_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if label == 2:
                label = 0
            if Preds[i] != label:
                err_count += 1
                condition = 0
            else:
                condition = 1

            error_pred = []
            error_pred.append(tweet)
            error_pred.append(label)
            error_pred.append(Preds[i])
            error_pred.append(condition)
            csv_writer.writerow(error_pred)
            i += 1
        print("All tweets: ", i)
        print("Error count: ", err_count)