Example #1
def main():
    G = GloveEmbedding("glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #print("locon: ", word_to_idx["locon"])
    s = "I love New York and music locon"
    s = s.lower()
    print("Sentence: ", s)
    S = SentenceToIndices(word_to_idx)
    sentence = S.map_sentence(s)
    print("Sentence to indices: ", sentence)
    print("Padded: ", PadSentences(10).pad(sentence))
    SE = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
    matrix = SE.map_sentence(s, max_len=10)
    print("Matrix: ", matrix)
    print("Matrix.shape: ", matrix.shape)
    print("Embedding i: ", embedding[word_to_idx["i"]])

    sentences = []
    sentences.append("I esta malo".lower())
    sentences.append("Love la musica salsa.".lower())
    sentences.append("Uff, q mal te va nene".lower())
    mapped, mlen = S.map_sentence_list(sentences)
    print("mlen: ", mlen)
    for s in mapped:
        print(s)
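A minimal sketch of the padding step used above, assuming PadSentences simply appends the reserved index 0 until the target length is reached (the commented-out padding code in Example #14 suggests exactly this, but the real class may differ):

class PadSentencesSketch:
    def __init__(self, max_len):
        self.max_len = max_len

    def pad(self, sentence):
        # append the padding index 0 until the sentence reaches max_len
        return sentence + [0] * max(0, self.max_len - len(sentence))

# PadSentencesSketch(10).pad([5, 7, 9]) -> [5, 7, 9, 0, 0, 0, 0, 0, 0, 0]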
Example #2
def set_trained_data(data, NN):
    new_data = []
    for row in data:
        new_data.append(row[1])

    G = GloveEmbedding("data/glove.6B.50d.txt", dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)

    X_Predict_Idx, max_len = S.map_sentence_list(new_data)

    # the trained model expects a fixed input length, so override the computed max_len
    max_len = 72

    print("Max Len", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)

    X_Predict_Final = P.pad_list(X_Predict_Idx)
    X_Predict_Final = Trim.trim_list(X_Predict_Final)
    X_Predict_Final = np.array(X_Predict_Final)
    X_Prediction = NN.predict(X_Predict_Final)
    final = np.argmax(X_Prediction, axis=1)
    return new_data, final
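For reference, a sketch of the trim step, assuming TrimSentences(max_len).trim_list cuts every index list down to max_len items (an assumption; the real implementation is not shown in these examples):

class TrimSentencesSketch:
    def __init__(self, max_len):
        self.max_len = max_len

    def trim_list(self, sentences):
        # keep only the first max_len indices of each sentence
        return [s[:self.max_len] for s in sentences]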
Example #3
def main():
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    M = TweetSimilaryBasic(72, G, 5, 3)
    M.build()
    M.summary()
    M.plot("data/model3")
Example #4
def main():
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    print("embedding shape: ", embedding.shape)
    print("idx hello: ", word_to_idx["hello"])
    print("word 20: ", idx_to_word[20])
    e = embedding[word_to_idx["hello"]]
    print("embedding hello: ", e)
    print("e.shape: ", e.shape)
    print("<UNK>: ", word_to_idx['<unk>'])
    print("embedding: <UNK>: ", embedding[word_to_idx['<unk>']])
Example #5
def main():
    G = GloveEmbedding("../test/data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #print("locon: ", word_to_idx["locon"])
    print("Length dictionary: ", len(word_to_idx))
    #s = "I love New York and music locon"
    s = "The flu is making me sad"
    s = s.lower()
    print("Sentence: ", s)
    S = SentenceToIndices(word_to_idx)
    sentence = S.map_sentence(s)
    print("Sentence to indices: ", sentence)
    print("Padded: ", PadSentences(10).pad(sentence))
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    matrix1 = SE.map_sentence(s, max_len=len(s))  # note: len(s) counts characters, not words

    s2 = "The flu is making me sad".lower()
    matrix2 = SE.map_sentence(s2, max_len=len(s2))

    print("Matrix 1: ", matrix1)
    print("Matrix.shape: ", matrix1.shape)
    print("\n Matrix 2: ", matrix2)
    print("Matrix.shape: ", matrix2.shape)

    print("\n Self Similarity: ", matrix_cosine_similary(matrix1, matrix1))

    M1 = np.array([-1, 40, 0.04]).reshape((3, 1))
    M2 = np.array([100, 2, 3]).reshape((3, 1))
    print("M1: \n ", M1)
    print("M2: \n", M2)
    SimM = matrix_cosine_similary(M1, M2)
    print("SimM: \n", SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)

    M3 = np.array([[1, 2, 3, 1], [4, 5, 6, 2], [7, 8, 9, 1]])
    M4 = np.array([[1, 2, 3.000001, 1], [4, 5, 6, 2], [7, 8, 9, 1]])

    SimM = matrix_cosine_similary(M3, M3)
    print("SimM: \n", SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)

    SimM = matrix_cosine_similary(M3, M4)
    print("\nSimM: \n", SimM)
    Up = np.triu(SimM)
    D = distance_similarity_matrix(SimM)
    print("D: ", D)
    print("Up: ", Up)
    print("sum Up: ", np.sum(Up))
    print("up I: ", np.triu(np.ones(Up.shape)))
    print("sum I: ", np.sum(np.triu(np.ones(Up.shape))))
Example #6
    def get_glove_embedding(self):
        g = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = g.read_embedding()
        s = SentenceToIndices(word_to_idx)
        x_train_indices, max_len = s.map_sentence_list(self.x_all)

        if max_len % 2 != 0:
            max_len = max_len + 1  # keep max_len even; downstream layers appear to assume an even length

        p = PadSentences(max_len)
        x_train_pad = p.pad_list(x_train_indices)

        # TRIM Tweets to remove noisy data
        trim_size = max_len
        trim = TrimSentences(trim_size)
        x_train_pad = trim.trim_list(x_train_pad)

        return x_train_pad, max_len, g
Example #7
def get_glove(glove_dims):  # pick a GloVe file by dimension and return a sentence mapper
    if glove_dims == 50:
        G = GloveEmbedding(filename="../test/data/glove.twitter.27B.50d.txt", dimensions=50)
    elif glove_dims == 200:
        G = GloveEmbedding(filename="../test/data/glove.twitter.27B.200d.txt", dimensions=200)
    elif glove_dims == 300:
        G = GloveEmbedding(filename="../test/data/glove.840B.300d.txt", dimensions=300)
    else:
        print("Wrong number of dimensions")
        exit(1)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    #S = SentenceToIndices(word_to_idx)
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    return SE
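A hypothetical call site for get_glove, reusing map_sentence as in the other examples (the shape comment is an assumption):

SE = get_glove(50)  # loads glove.twitter.27B.50d.txt
m = SE.map_sentence("the flu is making me sad", max_len=10)
print(m.shape)  # presumably (10, 50): 10 token positions, 50 dimensions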
Example #8
with open(labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
    i = 0
    csv_file = csv.reader(f, delimiter=',')
    for r in csv_file:
        if i != 0:
            tweet = r[0]
            label = int(r[1])  # cast: the CSV yields labels as strings
            X_all.append(tweet)
            Y_all.append(label)
        i = i + 1
print("Data Ingested")
num_data = len(X_all)
limit = math.ceil(num_data * 0.60)
X_train_sentences = X_all
Y_train = Y_all
G = GloveEmbedding(embedding_filename)
word_to_idx, idx_to_word, embedding = G.read_embedding()
S = SentenceToIndices(word_to_idx)
X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
print("Train data mappend to indices")
P = PadSentences(max_len)
X_train_pad = P.pad_list(X_train_indices)
print("Train data padded")
# convert to NumPy arrays
X_train = np.array(X_train_pad)
Y_train = np.array(Y_train)
Y_train = to_categorical(Y_train, num_classes=3)
print("Train data convert to numpy arrays")
model = KerasClassifier(build_fn=create_model(G, max_len))  # assumes create_model(G, max_len) returns a model-building callable
print("Model created")
# define the grid search parameters
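One possible continuation of the grid search; the original parameter grid is not shown, so the values below are placeholders:

from sklearn.model_selection import GridSearchCV

param_grid = {'batch_size': [16, 32, 64], 'epochs': [5, 10]}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, Y_train)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))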
Example #9
def set_prediction(pretrain):
    data = pretrain[:, 0]
    #Load Model
    model2 = load_model('trained/model1_50d_stoplemma_10e_new_prod.h5',
                        custom_objects={'tf': tf})
    # summarize model.
    model2.summary()
    # Load data
    G = GloveEmbedding("data/glove.twitter.27B.50d.txt", dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    premise = "same busy and just over the flu so feeling great"
    premise = "when ebola struck the doctors stepped up to the plate and the rest of us sat and watched them do their stuff to all engineers and environmentalists this is our time to step up and find answers to these consequences of our failure to coexist with nature"
    premise = remove_stopwords(premise)
    premise = lemmatizer_spacy(premise)
    x_premise = remove_stopwords(premise)
    x_premise = np.full((len(data)), x_premise)

    x_hypothesis = []

    for row in data:
        #row = row.replace("’", "'")
        #row = fix_text_format(row)
        row = remove_stopwords(row)
        #row = lemmatizer_spacy(row)
        #row = remove_stopwords(row)
        x_hypothesis.append(row)
    x_hypothesis = np.array(x_hypothesis)

    X_one_indices, max_len1 = map_to_idx(S, x_premise)
    X_two_indices, max_len2 = map_to_idx(S, x_hypothesis)
    print("len: ", max_len1, max_len2)
    #max_len = max(max_len1, max_len2)
    max_len = 44  # fixed length expected by the trained model
    print("max_len_final: ", max_len)

    P = PadSentences(max_len)
    Trim = TrimSentences(max_len)

    X_one_train = P.pad_list(X_one_indices)
    X_two_train = P.pad_list(X_two_indices)
    #X_one_train = Trim.trim_list(X_one_indices)
    X_two_train = Trim.trim_list(X_two_train)

    X_one_train = np.array(X_one_train)
    X_two_train = np.array(X_two_train)

    X_one_aux_disease = set_disease(x_premise)
    X_two_aux_disease = set_disease(x_hypothesis)

    new_dis = []
    for _ in range(len(data)):
        new_dis.append([0, 0, 1, 0, 0, 1])

    X_one_aux_train = new_dis

    X_two_aux_label = pretrain[:, 1]

    #X_one_aux_train = binarize_aux(s4, X_one_aux_label)
    X_two_aux_train = binarize_aux(X_two_aux_disease, X_two_aux_label)
    #new_two = []
    #for row in range(len(data)):
    #    new_two.append([row[0], row[1], row[2], row[3], row[4], row[5], row[6]])

    X_two_aux_train = X_two_aux_train.tolist()
    for row in X_two_aux_train:
        del row[6]

    X_one_aux_train = np.array(X_one_aux_train)
    X_two_aux_train = np.array(X_two_aux_train)
    print("one_aux: ", np.array(X_one_aux_train).shape)
    print(X_one_aux_train[:5])
    print("two_aux: ", np.array(X_two_aux_train).shape)
    print(X_two_aux_train[:5])
    model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    #model2.compile(optimizer='rmsprop',loss={'R1': 'mean_squared_error'},metrics={'R1': 'mse'}, loss_weights={'R1': 0.25})
    #model2.compile(optimizer='rmsprop')
    #model2.load_weights('trained/model1_50d_stoplemma_10e_prod.h5')
    X_Prediction = model2.predict(
        [X_one_train, X_two_train, X_one_aux_train, X_two_aux_train])

    return X_Prediction
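map_to_idx is not defined in this excerpt; a sketch consistent with how it is called above (S is a SentenceToIndices, whose map_sentence_list already returns the pair being unpacked):

def map_to_idx(mapper, sentences):
    # returns (list of index lists, length of the longest sentence)
    return mapper.map_sentence_list(list(sentences))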
Example #10
    print("Epsilon 2: ", ans2)

    #cluster 3
    ans3 = ((np.array(c3) - np.array(oldc3)) < EM).all()
    print("Epsilon 3: ", ans3)

    return ans1 and ans2 and ans3


if __name__ == "__main__":
    #Step 1: Set Centroids
    print("Step 1: Starting")
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    data = []
    dictionary1 = {}
    dictionary2 = {}
    try:
        with open("data/small_tweets.txt", "r", encoding='utf-8') as f:
            for line in f:
                newline = " ".join(line.split())
                data.append(newline)
    except Exception as e:
        print(e)
    max_len = get_max_len(data)
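get_max_len is not defined in this excerpt; a plausible sketch consistent with its use:

def get_max_len(sentences):
    # token count of the longest sentence (assumed implementation)
    return max(len(s.split()) for s in sentences)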
Example #11
def main():
    G = GloveEmbedding("glove.6B.50d.txt")

    T = TweetSentiment2LSTM(10, G)
Example #12
    def process(self, json_filename, h5_filename):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        with open(self.labeled_tweets_filename, "r",
                  encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            for r in csv_file:
                if i != 0:
                    tweet = r[0]
                    label = int(r[1])  # cast: the CSV yields labels as strings
                    X_all.append(tweet)
                    Y_all.append(label)
                i = i + 1
        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        # divide the data into X_train, Y_train, X_test, Y_test
        X_train_sentences = X_all[0:limit]
        Y_train = Y_all[0:limit]
        X_test_sentences = X_all[limit:]
        Y_test = Y_all[limit:]
        print("Data Divided")
        # Get embedding
        G = GloveEmbedding(self.embedding_filename)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mappend to indices")
        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # Trim
        #trim_size = 40
        #Trim = TrimSentences(trim_size)
        #X_train_pad = Trim.trim_list(X_train_pad)
        # convert to NumPy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        print("Train data convert to numpy arrays")
        NN = TweetSentiment2LSTM2Dense(max_len, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)

        print("model created")
        NN.build(first_layer_units=128,
                 dense_layer_units=1,
                 first_layer_dropout=0,
                 second_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.3, momentum=0.001, decay=0.01, nesterov=False)
        adam = Adam(lr=0.03)
        #NN.compile(loss="binary_crossentropy", metrics=['binary_accuracy'], optimizer=adam)
        NN.compile(loss="binary_crossentropy",
                   metrics=['binary_accuracy'],
                   optimizer='rmsprop')

        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        NN.fit(X_train, Y_train, epochs=5, callbacks=[callback])
        print("Model trained")
        X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        print("Test data mapped")
        X_test_pad = P.pad_list(X_test_indices)
        print("Test data padded")
        X_test = np.array(X_test_pad)
        Y_test = np.array(Y_test)
        print("Test data converted to numpy arrays")
        loss, acc = NN.evaluate(X_test, Y_test)
        print("accuracy: ", acc, ", loss: ", loss)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        #X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", NN.predict(X_Predict_Final))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        print("Done!")
Example #13
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=100):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all

        G = GloveEmbedding(self.embedding_filename, dimensions=100)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        # print("hello", embedding[47])
        # print("hello", embedding[9876])

        # S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)
        #
        # edata = []
        # padding_vect = [0] * 100
        #
        # # // exit(0)
        # n = 0
        # for i in X_train_sentences:
        #     # print("Buenoooooooo", n)
        #     m = S.map_sentence(i)
        #     # print("n:", n)
        #     if m.shape[0] < 75:
        #         m = np.vstack((m, np.zeros((75-m.shape[0],100))))
        #         # when coding "efficiently", this has to be used
        #         # while m.shape[0] < 75:
        #             # m = np.vstack((m,np.array(padding_vect)))
        #     else:
        #         if m.shape[0] == 100:
        #             m = np.array([m])
        #             m = np.vstack((m, np.zeros((75-m.shape[0],100))))
        #     p = np.array([m])
        #     # print("ghjkl", str(p.shape), "  ghjkluhnm ", n, i)
        #     if n > 0:
        #         edata = np.vstack((edata, p))
        #     else:
        #         edata = p
        #     # print("----------------------------------->" + str(edata.shape))
        #     n = n+1

        # np.save("array", edata)
        edata = np.load("data/array.npy")  # precomputed embedding tensor saved by the commented-out block above

        print("----------------------------------->" + str(edata.shape))

        # exit(0)
        X_train = edata
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        # plt.imshow(X_train[0])
        # plt.show()

        #Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print("----------------------------------->" + str(X_train.shape))

        print(
            "I understand this is the data used to do the training: ",
            len(X_train), len(X_train[0]), len(X_train[0][0]), " Y_train ",
            len(Y_train))
        print("Train data converted to NumPy arrays")
        NN = KerasInceptionCNN(0, G)

        print("model created")
        kernel_regularizer = l2(0.001)

        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        print("model built")
        NN.summary()

        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        sgd = SGD(lr=0.05)

        NN.compile(optimizer='adam',
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        # HERE

        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=3,
                         class_weight=class_weight_dictionary)

        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        S = SentenceToIndices(word_to_idx)  # not defined earlier in this excerpt
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        P = PadSentences(max_len2)  # P and Trim are likewise missing from this excerpt
        Trim = TrimSentences(max_len2)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #14
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=100):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r", encoding="utf-8") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all

        G = GloveEmbedding(self.embedding_filename, dimensions=100)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        print("hello", embedding[47])
        print("hello", embedding[9876])

        S = SentenceToEmbedding(word_to_idx, idx_to_word, embedding)

        #X_train_matrixes = S.map_sentence(c)

        edata = []
        padding_vect = [0] * 100
        #
        # s = "I love New York and music locon"
        #
        # e = "I love New York and music locon"
        #
        # mskdn = [e, s]

        for i in X_train_sentences:
            m = S.map_sentence(i)
            # zero-pad each embedding matrix to 75 rows
            while len(m) < 75:
                m = np.vstack((m, padding_vect))
            edata.append(m)
        print("hello", edata[1])
        print("len", len(edata[1]), " ", len(edata[1][1]), " ")

        # padding_len = self.max_len - len(sentence)
        # if (padding_len > 0):
        #     padding = []
        #     r = range(0, padding_len)
        #     for _ in r:
        #         padding.append(0)
        # return sentence + padding

        # print("Train data mappend to indices")
        # if max_len % 2 !=0:
        #     max_len = max_len + 1
        #
        # P = PadSentences(max_len)
        # X_train_pad = P.pad_list(X_train_indices)
        # print("Train data padded")
        # # TRIM
        # trim_size = max_len
        # #trim_size = 33
        # Trim = TrimSentences(trim_size)
        # X_train_pad = Trim.trim_list(X_train_pad)
        # print("X[0], ", X_train_pad[0])
        # #convert to numPY arrays
        X_train = np.array(edata)
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        #Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print(
            "I understand this is the data used to do the training: ",
            X_train)
        # print ("data divided on value: ", limit)
        # print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        # print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data convert to numpy arrays")
        NN = KerasInceptionCNN(0, G)

        print("model created")
        kernel_regularizer = l2(0.001)

        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        print("model built")
        NN.summary()

        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        sgd = SGD(lr=0.05)

        NN.compile(optimizer='adam',
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=32,
                         callbacks=[callback],
                         class_weight=class_weight_dictionary)
        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        S_idx = SentenceToIndices(word_to_idx)  # S above maps to embeddings, not indices
        X_Predict_Idx, max_len2 = S_idx.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        P = PadSentences(max_len2)  # P and Trim are not defined earlier in this excerpt
        Trim = TrimSentences(max_len2)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #15
def main():
    G = GloveEmbedding("data/glove.6B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    print("embedding shape: ", embedding.shape)
    print("idx hello: ", word_to_idx["hello"])
    print("word 20: ", idx_to_word[20])
    e = embedding[word_to_idx["hello"]]
    print("embedding hello: ", e)
    print("e.shape: ", e.shape)
    print("<UNK>: ", word_to_idx['<unk>'])
    print("embedding: <UNK>: ", embedding[word_to_idx['<unk>']])

    you = embedding[word_to_idx['you']]
    he = embedding[word_to_idx['he']]
    ise = embedding[word_to_idx['is']]  # named 'ise' because 'is' is a Python keyword
    crazy = embedding[word_to_idx['crazy']]
    nuts = embedding[word_to_idx['nuts']]

    print("embedding of you: ", you)
    print("embedding of he: ", he)
    print("embedding of ise: ", ise)
    print("embedding of crazy: ", crazy)
    print("embedding of nuts: ", nuts)

    tweet1 = "You are crazy"
    tweet2 = "You are nuts"
    tweet3 = "He is crazy"
    tweet4 = "You are lazy"
    tweet5 = "You are crazy man"
    tweet6 = "Yes You are crazy"
    tweet7 = "The fast train"

    mapper = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    emb1 = mapper.map_sentence(tweet1.lower(), 4)
    emb2 = mapper.map_sentence(tweet2.lower(), 4)
    emb3 = mapper.map_sentence(tweet3.lower(), 4)
    emb4 = mapper.map_sentence(tweet4.lower(), 4)
    emb5 = mapper.map_sentence(tweet5.lower(), 4)
    emb6 = mapper.map_sentence(tweet6.lower(), 4)
    emb7 = mapper.map_sentence(tweet7.lower(), 4)


    print("Distance tweet1 vs tweet2: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb2))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb2))
    print("Distance tweet1 vs tweet3: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb3))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb3))

    print("Distance tweet2 vs tweet3: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb2, emb3))
    print("Cos Tri: ", sim.TriUL_sim(emb2, emb3))

    print("Distance tweet1 vs tweet4: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb4))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb4))

    print("Distance tweet1 vs tweet5: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb5))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb5))

    print("Distance tweet1 vs tweet6: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb6))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb6))

    print("Distance tweet1 vs tweet7: ")
    print("Frobenious: ", sim.Frobenius_Distance(emb1, emb7))
    print("Cos Tri: ", sim.TriUL_sim(emb1, emb7))


    print("Embedding tweet1: ")
    print(emb1)
    print("Embedding tweet6: ")
    print(emb6)
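A sketch of what sim.Frobenius_Distance presumably computes, namely the Frobenius norm of the difference matrix (illustrative only):

import numpy as np

def frobenius_distance(a, b):
    # Frobenius norm of (a - b) for two 2-D embedding matrices
    return np.linalg.norm(a - b, ord='fro')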
Example #16
def getGlove():
    G = GloveEmbedding("../test/data/glove.twitter.27B.50d.txt")
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    SE = SentenceToEmbeddingWithEPSILON(word_to_idx, idx_to_word, embedding)
    return SE
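Hypothetical usage, combining getGlove with the distance helpers from Example #15:

SE = getGlove()
a = SE.map_sentence("you are crazy", 4)
b = SE.map_sentence("you are nuts", 4)
print(sim.Frobenius_Distance(a, b))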
Example #17
    def process(self, json_filename, h5_filename, plot=False, epochs=100, vect_dimensions=100):
        np.random.seed(11)
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []
        Zeros = []
        with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0
            Ones = []
            for r in csv_file:
                if i != 0:
                    label = int(r[1])
                    #if label == 0:
                    #  Zeros.append(r)
                    All.append(r)
                    # tweet = r[0]
                    # label = r[1]
                    # X_all.append(tweet)
                    # Y_all.append(label)
                i = i + 1

        print("len(All): ", len(All))
        np.random.shuffle(All)

        ones_count = 0
        for r in All:
            tweet = r[0].strip()
            label = int(r[1])
            if (label == 2):
                label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.60)
        X_train_sentences = X_all
        Y_train = Y_all
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")
        # Get embedding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        G = GloveEmbedding(self.embedding_filename, dimensions=50)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mapped to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 45
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to NumPy arrays
        X_train_reverse = []
        for X in X_train_pad:
            t = X[::-1]
            X_train_reverse.append(t)
        X_train = np.array(X_train_pad)
        X_train_reverse = np.array(X_train_reverse)
        Y_train = np.array(Y_train)

        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        #Y_train = to_categorical(Y_train, num_classes=3)
        print("Train data convert to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        NN = TweetSentiment2DCNN2Channel(trim_size, G)
        #NN = TweetSentimentInception(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11, first_dropout=0, second_dropout=0.1, padding='valid', dense_units=32)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.1, decay=0.05)
        #sgd = SGD(lr=0.05)
        NN.compile(optimizer=rmsprop, loss="binary_crossentropy", metrics=['accuracy', precision, recall, f1, fprate])
        print("model compiled")
        print("Begin training")
        callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        class_weight = None
        history = NN.fit([X_train, X_train_reverse], Y_train, epochs=epochs, batch_size=32, callbacks=[callback], validation_split=0.20, class_weight=class_weight)
        print("Model trained")
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = ["my zika is bad", "i love colombia", "my has been tested for ebola", "there is a diarrhea outbreak in the city"]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Reverse = []
        for r in X_Predict_Final:
            t = r[::-1]
            X_Predict_Reverse.append(t)

        X_Predict_Final = np.array(X_Predict_Final)
        X_Predict_Reverse = np.array(X_Predict_Reverse)
        Preds = NN.predict([X_Predict_Final, X_Predict_Reverse])
        Preds = ((Preds >= 0.5) * 1).flatten()
        print("Predict: ", Preds)
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #18
 def process(self, json_filename, h5_filename, plot=False, epochs=100):
     np.random.seed(11)
     # open the file with tweets
     X_all = []
     Y_all = []
     with open(self.labeled_tweets_filename, "r",
               encoding="ISO-8859-1") as f:
         i = 0
         csv_file = csv.reader(f, delimiter=',')
         for r in csv_file:
             if i != 0:
                 tweet = r[0]
                  label = int(r[1])  # cast: the CSV yields labels as strings
                 X_all.append(tweet)
                 Y_all.append(label)
             i = i + 1
     print("Data Ingested")
     # divide the data into training and test
     num_data = len(X_all)
     limit = math.ceil(num_data * 0.60)
     X_train_sentences = X_all
     Y_train = Y_all
     # divide the data into X_train, Y_train, X_test, Y_test
     #X_train_sentences = X_all[0: limit]
     #Y_train = Y_all[0: limit]
     #X_test_sentences = X_all[limit:]
     #Y_test = Y_all[limit:]
     #print("Data Divided")
      # Get embedding
     G = GloveEmbedding(self.embedding_filename)
     word_to_idx, idx_to_word, embedding = G.read_embedding()
     S = SentenceToIndices(word_to_idx)
     X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
     print("Train data mappend to indices")
     P = PadSentences(max_len)
     X_train_pad = P.pad_list(X_train_indices)
     print("Train data padded")
      # convert to NumPy arrays
     X_train = np.array(X_train_pad)
     Y_train = np.array(Y_train)
     Y_train = to_categorical(Y_train, num_classes=3)
     print("Train data convert to numpy arrays")
     NN = TweetSentiment2LSTM2DenseSM(max_len, G)
     print("model created")
     kernel_regularizer = l2(0.001)
     kernel_regularizer = None
     NN.build(first_layer_units=max_len,
              second_layer_units=max_len,
              relu_dense_layer=5,
              dense_layer_units=3,
              first_layer_dropout=0.3,
              second_layer_dropout=0.6,
              l2=kernel_regularizer)
     print("model built")
     NN.summary()
     sgd = SGD(lr=0.001, momentum=0.09, decay=0.001, nesterov=True)
     rmsprop = RMSprop(decay=0.003)
     adam = Adam(lr=0.1, decay=0.05)
     NN.compile(optimizer=rmsprop,
                loss="categorical_crossentropy",
                metrics=['accuracy', precision, recall, f1, fprate])
     print("model compiled")
     print("Begin training")
     callback = TensorBoard(log_dir="/tmp/logs")
     w_dict = {0: 0.31, 1: 0.63, 2: 0.06}
     history = NN.fit(X_train,
                      Y_train,
                      epochs=epochs,
                      callbacks=[callback],
                      validation_split=0.2,
                      class_weight=w_dict)
     print("Model trained")
     # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
     # print("Test data mapped")
     # X_test_pad = P.pad_list(X_test_indices)
     # print("Test data padded")
     # X_test = np.array(X_test_pad)
     # Y_test = np.array(Y_test)
     # print("Test data converted to numpy arrays")
     # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
     # print("accuracy: ", acc)
     T = "I have a bad case of vomit"
     X_Predict = [
         "my zika is bad", "i love colombia",
         "my has been tested for ebola",
         "there is a diarrhea outbreak in the city"
     ]
     X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
     i = 0
     for s in X_Predict_Idx:
         print(str(i) + ": ", s)
         i = i + 1
     print(X_Predict)
     X_Predict_Final = P.pad_list(X_Predict_Idx)
     #X_Predict = [X_Predict]
     X_Predict_Final = np.array(X_Predict_Final)
     print("Predict: ", NN.predict(X_Predict_Final))
     print("Storing model and weights")
     NN.save_model(json_filename, h5_filename)
     if plot:
         print("Ploting")
         self.plot(history)
     print("Done!")
Example #19
    def process(self,
                json_filename,
                h5_filename,
                plot=False,
                epochs=100,
                vect_dimensions=50):
        # open the file with tweets
        X_all = []
        Y_all = []
        All = []

        #with open(self.labeled_tweets_filename, "r", encoding="ISO-8859-1") as f:
        with open(self.labeled_tweets_filename, "r") as f:
            i = 0
            csv_file = csv.reader(f, delimiter=',')
            ones_count = 0

            for r in csv_file:
                if i != 0:
                    All.append(r)
                i = i + 1

        np.random.shuffle(All)

        ones_count = 0
        two_count = 0
        zero_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if (label == 0):
                zero_count += 1
            elif (label == 1):
                ones_count += 1
            else:
                two_count += 1
            # if (label == 2):
            #     label = 0
            # if (label == 1) and (ones_count <= 4611):
            #     X_all.append(tweet)
            #     Y_all.append(label)
            #     ones_count +=1
            # elif (label == 0):
            X_all.append(tweet)
            Y_all.append(label)

        print("len(Y_all): ", len(Y_all))
        class_weight_val = class_weight.compute_class_weight(
            'balanced', np.unique(Y_all), Y_all)
        print("classes: ", np.unique(Y_all))
        print("counts for 0, 1, 2: ", zero_count, ones_count, two_count)
        print("class weight_val: ", class_weight_val)
        class_weight_dictionary = {
            0: class_weight_val[0],
            1: class_weight_val[1],
            2: class_weight_val[2]
        }
        print("dict: ", class_weight_dictionary)

        print("Data Ingested")
        # divide the data into training and test
        num_data = len(X_all)
        limit = math.ceil(num_data * 0.80)
        X_train_sentences = X_all
        Y_train = Y_all
        # Divide after conversions
        # divide the data into X_train, Y_train, X_test, Y_test
        #X_train_sentences = X_all[0: limit]
        #Y_train = Y_all[0: limit]
        #X_test_sentences = X_all[limit:]
        #Y_test = Y_all[limit:]
        #print("Data Divided")

        # Get embedding
        #G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)

        G = GloveEmbedding(self.embedding_filename, dimensions=vect_dimensions)
        word_to_idx, idx_to_word, embedding = G.read_embedding()
        S = SentenceToIndices(word_to_idx)
        X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
        print("Train data mappend to indices")
        if max_len % 2 != 0:
            max_len = max_len + 1

        P = PadSentences(max_len)
        X_train_pad = P.pad_list(X_train_indices)
        print("Train data padded")
        # TRIM
        trim_size = max_len
        #trim_size = 33
        Trim = TrimSentences(trim_size)
        X_train_pad = Trim.trim_list(X_train_pad)
        print("X[0], ", X_train_pad[0])
        # convert to NumPy arrays
        X_train = np.array(X_train_pad)
        Y_train = np.array(Y_train)
        ones_count = np.count_nonzero(Y_train)
        zeros_count = len(Y_train) - ones_count
        print("ones count: ", ones_count)
        print("zeros count: ", zeros_count)
        print("two count: ", two_count)
        Y_train_old = Y_train
        Y_train = to_categorical(Y_train, num_classes=3)

        # Divide the data
        X_test_text = X_all[limit:]
        X_test = X_train[limit:]
        Y_test = Y_train[limit:]
        X_train = X_train[0:limit]
        Y_train = Y_train[0:limit]
        print("data divided on value: ", limit)
        print("lengths X_train, Y_train: ", len(X_train), len(Y_train))
        print("lengths X_test, Y_test: ", len(X_test), len(Y_test))

        print("Train data convert to numpy arrays")
        #NN = TweetSentiment2DCNN(trim_size, G)
        #NN = TweetSentiment2LSTM2Dense(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense3Layer(trim_size, G)
        #NN =TweetSentiment2LSTM2Dense4Layer(trim_size, G)
        #NN = TweetSentimentCNN(trim_size, G)
        #print("Build GRU")
        #NN = TweetSentimentGRUSM(max_len, G)
        NN = TweetSentiment1D(trim_size, G)
        #NN = TweetSentiment1DRev(trim_size, G)

        print("model created")
        kernel_regularizer = l2(0.001)
        #kernel_regularizer = None
        NN.build(filters=11,
                 first_dropout=0,
                 second_dropout=0.05,
                 padding='valid',
                 dense_units=16)

        #NN.build(first_layer_units = max_len, second_layer_units = max_len, relu_dense_layer=16, dense_layer_units = 3,
        #         first_layer_dropout=0, second_layer_dropout=0, third_layer_dropout=0)
        print("model built")
        NN.summary()
        sgd = SGD(lr=0.03, momentum=0.009, decay=0.001, nesterov=True)
        rmsprop = RMSprop(decay=0.003)
        adam = Adam(lr=0.0003, decay=0.001)
        sgd = SGD(lr=0.05)
        NN.compile(optimizer=adam,
                   loss="categorical_crossentropy",
                   metrics=['accuracy', precision, recall, f1, fprate])

        print("model compiled")
        print("Begin training")
        #callback = TensorBoard(log_dir="/tmp/logs")
        #class_weight = {0: 0.67, 1: 0.33}
        #class_weight = None
        #history = NN.fit(X_train, Y_train, epochs=epochs, batch_size=32, callbacks=[callback], class_weight=class_weight_dictionary)
        history = NN.fit(X_train,
                         Y_train,
                         epochs=epochs,
                         batch_size=64,
                         class_weight=class_weight_dictionary,
                         validation_split=0.2)

        print("Model trained")
        print("Predicting")
        print("len(X_test): ", X_test)
        preds = NN.predict(X_test)
        print("len(preds): ", len(preds))
        print("type preds: ", type(preds))
        print("preds before: ", preds)
        preds = np.argmax(preds, axis=1)
        print("preds: ", preds)
        print("len(preds): ", len(preds))
        Y_test = Y_train_old[limit:]
        print("Y test: ", Y_test)
        c_matrix = confusion_matrix(Y_test, preds)
        print("matrix: ", c_matrix)
        print("Storing Errors: ")
        ErrorAnalysis.store_errors(X_test_text, Y_test, preds, "errorcnn.csv")
        print("Errors stored")
        print("Confusion matrix: ")
        prec_1, recall_1, f1_1, spec_1, t = calculate_cm_metrics(c_matrix, '')
        print("C1-> presicion, recall, F1: ", prec_1, recall_1, f1_1)

        #
        # X_test_indices, max_len = S.map_sentence_list(X_test_sentences)
        # print("Test data mapped")
        # X_test_pad = P.pad_list(X_test_indices)
        # print("Test data padded")
        # X_test = np.array(X_test_pad)
        # Y_test = np.array(Y_test)
        # print("Test data converted to numpy arrays")
        # loss, acc = NN.evaluate(X_test, Y_test, callbacks=[callback])
        # print("accuracy: ", acc)
        T = "I have a bad case of vomit"
        X_Predict = [
            "my zika is bad", "i love colombia",
            "my has been tested for ebola",
            "there is a diarrhea outbreak in the city"
        ]
        X_Predict_Idx, max_len2 = S.map_sentence_list(X_Predict)
        i = 0
        for s in X_Predict_Idx:
            print(str(i) + ": ", s)
            i = i + 1
        print(X_Predict)
        X_Predict_Final = P.pad_list(X_Predict_Idx)
        X_Predict_Final = Trim.trim_list(X_Predict_Final)
        #X_Predict = [X_Predict]
        X_Predict_Final = np.array(X_Predict_Final)
        print("Predict: ", np.argmax(NN.predict(X_Predict_Final)))
        print("Storing model and weights")
        NN.save_model(json_filename, h5_filename)
        if plot:
            print("Ploting")
            self.plot(history)
        print("Done!")
Example #20
def main(model_file, model_weights, labeled_tweets, embedding_filename):
    # load json and create model
    with open(model_file, 'r') as json_file:
        loaded_model_json = json_file.read()
    loaded_model = model_from_json(loaded_model_json)
    # load weights into new model
    loaded_model.load_weights(model_weights)
    print("Loaded model from disk")
    # evaluate loaded model on test data
    loaded_model.compile(loss='binary_crossentropy',
                         optimizer='rmsprop',
                         metrics=['accuracy'])

    # open the file with tweets
    X_all = []
    Y_all = []
    All = []

    with open(labeled_tweets, "r", encoding="ISO-8859-1") as f:
        i = 0
        csv_file = csv.reader(f, delimiter=',')
        ones_count = 0

        for r in csv_file:
            if i != 0:
                label = int(r[1])
                if (label == 1) or (label == 2):
                    if ones_count <= 13000:
                        All.append(r)
                        ones_count += 1
                else:
                    All.append(r)
                # tweet = r[0]
                # label = r[1]
                # X_all.append(tweet)
                # Y_all.append(label)
            i = i + 1

    ones_count = 0
    for r in All:
        tweet = r[0]
        label = int(r[1])
        if (label == 2):
            label = 0
        # if (label == 1) and (ones_count <= 4611):
        #     X_all.append(tweet)
        #     Y_all.append(label)
        #     ones_count +=1
        # elif (label == 0):
        X_all.append(tweet)
        Y_all.append(label)

    print("Data Ingested")
    # divide the data into training and test
    num_data = len(X_all)
    limit = math.ceil(num_data * 0.60)
    X_train_sentences = X_all
    Y_train = Y_all
    # divide the data into X_train, Y_train, X_test, Y_test
    # X_train_sentences = X_all[0: limit]
    # Y_train = Y_all[0: limit]
    # X_test_sentences = X_all[limit:]
    # Y_test = Y_all[limit:]
    # print("Data Divided")
    # Get embedding
    # G = Word2VecEmbedding(self.embedding_filename, dimensions=vect_dimensions)
    G = GloveEmbedding(embedding_filename, dimensions=50)
    word_to_idx, idx_to_word, embedding = G.read_embedding()
    S = SentenceToIndices(word_to_idx)
    X_train_indices, max_len = S.map_sentence_list(X_train_sentences)
    print("Train data mappend to indices")
    if max_len % 2 != 0:
        max_len = max_len + 1

    P = PadSentences(max_len)
    X_train_pad = P.pad_list(X_train_indices)
    print("Train data padded")
    # TRIM
    trim_size = max_len
    Trim = TrimSentences(trim_size)
    X_train_pad = Trim.trim_list(X_train_pad)
    print("X[0], ", X_train_pad[0])
    # convert to NumPy arrays
    X_train = np.array(X_train_pad)
    Y_train = np.array(Y_train)
    ones_count = np.count_nonzero(Y_train)
    zeros_count = len(Y_train) - ones_count
    print("ones count: ", ones_count)
    print("zeros count: ", zeros_count)
    # Y_train = to_categorical(Y_train, num_classes=3)
    print("Train data convert to numpy arrays")
    Preds = loaded_model.predict(X_train)
    Preds = ((Preds >= 0.5) * 1).flatten()
    with open("data/alltweetsanderrors.csv", "w") as f:
        csv_writer = csv.writer(f, delimiter=",")
        i = 0
        err_count = 0
        for r in All:
            tweet = r[0]
            label = int(r[1])
            if label == 2:
                label = 0
            if Preds[i] != label:
                err_count += 1
                condition = 0
            else:
                condition = 1

            error_pred = []
            error_pred.append(tweet)
            error_pred.append(label)
            error_pred.append(Preds[i])
            error_pred.append(condition)
            csv_writer.writerow(error_pred)
            i += 1
        print("All tweets: ", i)
        print("Error count: ", err_count)