num_epochs = 50

    # Model hyperparameters. Only embed_dim is used in the visible code (as the
    # width of the embedding matrix); num_filters and weight_decay are
    # presumably consumed by the model built further down -- not visible here.
    num_filters = 64
    embed_dim = 100
    weight_decay = 1e-4

    # Build the embedding matrix: one row per vocabulary index, one column per
    # embedding dimension. Rows start as zeros and are overwritten for every
    # word that rep_reader returns a non-empty vector for.
    print('preparing embedding matrix...')
    # Collects the words for which no usable vector was returned.
    words_not_found = []
    # Cap the vocabulary at MAX_NB_WORDS rows.
    # NOTE(review): the "+ 1" presumably accounts for word_index being
    # 1-based with index 0 reserved (e.g. for padding) -- confirm against the
    # code that builds word_index.
    nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in tqdm(word_index.items()):
        # Indices at or beyond the cap have no row in the matrix; skip them.
        if i >= nb_words:
            continue
        embedding_vector = rep_reader.get_word_rep(index_name, word)
        # Treat both None and an empty result as "no embedding available".
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # Copy the vector into row i. The vector must be compatible with a
            # row of length embed_dim, otherwise the NumPy assignment raises.
            # Words that take the else branch keep their all-zero row.
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    # Report how many rows have components summing to exactly 0. This counts
    # every row that was never filled (including any index no word maps to);
    # note the test is on the row sum, not on each component being zero.
    print('number of null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))

    #print("words in document not found in the index : ", np.random.choice(words_not_found, 10))

    # Model 1 CNN

    model = Sequential()
    model.add(
        Embedding(nb_words,
Beispiel #2
0
    else:
        raise ValueError(
            "You must specify either kerasFile or esIndex. Neither specified.")

    # Load the dataset described by the command-line arguments.
    # NOTE(review): SpreadsheetData is defined elsewhere; judging by the
    # argument names it reads text/label columns from args.inFile and holds out
    # a test split -- confirm against its definition.
    sd = SpreadsheetData(args.inFile, args.textColumn, args.labelColumn,
                         args.testSize, args.randomizeTestSet)

    # Build the embedding matrix: one row per vocabulary index, one column per
    # embedding dimension. Rows start as zeros and are overwritten for every
    # word that rep_reader returns a non-empty vector for.
    print('preparing embedding matrix...')
    # Collects the words for which no usable vector was returned; it is not
    # read again in the visible part of this function.
    words_not_found = []
    # Cap the vocabulary at sd.MAX_NB_WORDS rows.
    # NOTE(review): the "+ 1" presumably accounts for sd.word_index being
    # 1-based with index 0 reserved (e.g. for padding) -- confirm.
    nb_words = min(sd.MAX_NB_WORDS, len(sd.word_index) + 1)
    # Take the embedding width from the reader instead of hard-coding it.
    embed_dim = rep_reader.rep_shape[0]
    embedding_matrix = np.zeros((nb_words, embed_dim))
    for word, i in tqdm(sd.word_index.items()):
        # Indices at or beyond the cap have no row in the matrix; skip them.
        if i >= nb_words:
            continue
        embedding_vector = rep_reader.get_word_rep(args.esIndex, word)
        # Treat both None and an empty result as "no embedding available".
        if (embedding_vector is not None) and len(embedding_vector) > 0:
            # Copy the vector into row i; words that take the else branch keep
            # their all-zero row.
            embedding_matrix[i] = embedding_vector
        else:
            words_not_found.append(word)
    # Report how many rows have components summing to exactly 0. This counts
    # every row that was never filled (including any index no word maps to);
    # note the test is on the row sum, not on each component being zero.
    print('number of null word embeddings: %d' %
          np.sum(np.sum(embedding_matrix, axis=1) == 0))

    # Hand the data and embedding matrix to the classification run.
    # NOTE(review): SpreadsheetClassificationExecution is defined elsewhere;
    # the string presumably selects the classifier and args.kerasFile a model
    # file -- confirm against its definition.
    run = SpreadsheetClassificationExecution(sd, embedding_matrix,
                                             "SuperSimpleLSTMClassifier",
                                             args.kerasFile)

    # The run object exposes its result as the `accuracy` attribute.
    print("Accuracy:%f" % run.accuracy)