    unlabeled_data = options.unlabeled_data

    MAX_SEQUENCE_LENGTH = 45
    MAX_NB_WORDS = 10000
    EMBEDDING_DIM = 300
    batch_size = 32
    nb_classes = 2
    nb_epoch = 100

    modelFile = options.w2v_model_file  #"../w2v_models/crisis_word_vector.txt"
    emb_model = KeyedVectors.load_word2vec_format(modelFile, binary=False)

    delim = "\t"
    train_xdata, train_y, le, labels, word_index, tokenizer = data_process.getTrData(
        train_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, delim)
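    # A hedged sketch of what data_process.getTrData presumably does: read a
    # delimited file, fit a Keras Tokenizer, pad the sequences, and one-hot
    # encode the labels. The body below is an assumption for illustration,
    # not the project's actual implementation.
    def _getTrData_sketch(path, max_nb_words, max_seq_len, delim):
        from keras.preprocessing.text import Tokenizer
        from keras.preprocessing.sequence import pad_sequences
        from keras.utils import np_utils
        from sklearn.preprocessing import LabelEncoder
        texts, labels = [], []
        with open(path) as fh:
            for line in fh:
                cols = line.rstrip("\n").split(delim)
                texts.append(cols[0])   # assumed column order: text, label
                labels.append(cols[-1])
        tok = Tokenizer(num_words=max_nb_words)
        tok.fit_on_texts(texts)
        x = pad_sequences(tok.texts_to_sequences(texts), maxlen=max_seq_len)
        le = LabelEncoder().fit(labels)
        y = np_utils.to_categorical(le.transform(labels))
        return x, y, le, list(le.classes_), tok.word_index, tok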
    dev_x, dev_y, dev_le, dev_labels, _ = data_process.getDevData2(
        dev_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
    test_x, test_y, test_le, test_labels, _ = data_process.getDevData2(
        test_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)

    unlabeled_x, _, _, _, _ = data_process.getDevData2(
        unlabeled_data, tokenizer, MAX_SEQUENCE_LENGTH, delim)

    print(train_xdata.shape)
    print(dev_x.shape)
    print(test_x.shape)
    ##### callbacks for early stopping and saving the model
    early_stopping = callbacks.EarlyStopping(monitor='val_acc',
                                             patience=10,
                                             verbose=0,
                                             mode='max')
    best_model_path = "models/weights.best.hdf5"
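    # Hedged sketch of how these callbacks are typically wired into training.
    # The checkpoint mirrors the pattern used later in this file; the fit()
    # call is an illustrative assumption, since `model` is built elsewhere in
    # the original function and is not shown in this snippet.
    checkpoint = ModelCheckpoint(best_model_path,
                                 monitor='val_acc',
                                 save_best_only=True,
                                 mode='max',
                                 verbose=1)
    # model.fit(train_xdata, train_y,
    #           batch_size=batch_size, epochs=nb_epoch,
    #           validation_data=(dev_x, dev_y),
    #           callbacks=[early_stopping, checkpoint])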
Example #2
base = os.path.basename(trainFile)
tr_file_name = os.path.splitext(base)[0]

# checkdir presumably creates the output directory if missing and returns it
evaluation = checkdir("results_baseline/")
resultsFile = open(evaluation + tr_file_name + "_result.txt", 'w')

MAX_SEQUENCE_LENGTH = 20
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 300
batch_size = 128
delim = "\t"
data, _, _ = data_process.getData(allxFile, delim)
word_index, tokenizer = data_process.getTokenizer(data, MAX_NB_WORDS,
                                                  MAX_SEQUENCE_LENGTH)
train_x, train_y, train_le, train_labels, _ = data_process.getDevData2(
    trainFile, tokenizer, MAX_SEQUENCE_LENGTH, delim)
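# Hedged sketch of the assumed data_process.getDevData2 tokenization step:
# it re-uses the tokenizer already fitted above rather than fitting a new
# vocabulary, so train/dev/test share one word index. Illustrative only.
def _apply_tokenizer_sketch(texts, tokenizer, max_seq_len):
    from keras.preprocessing.sequence import pad_sequences
    seqs = tokenizer.texts_to_sequences(texts)
    return pad_sequences(seqs, maxlen=max_seq_len)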
dev_x, dev_y, dev_le, dev_labels, _ = data_process.getDevData2(
    devFile, tokenizer, MAX_SEQUENCE_LENGTH, delim)
test_x, test_y, test_le, test_labels, _ = data_process.getDevData2(
    testFile, tokenizer, MAX_SEQUENCE_LENGTH, delim)
allx, _, _, _, _ = data_process.getDevData2(allxFile, tokenizer,
                                            MAX_SEQUENCE_LENGTH, delim)

graph = getGraph(graphFile)
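# Hedged sketch of one plausible getGraph implementation, assuming graphFile
# is a delimiter-separated edge list (node, neighbour per line); the actual
# format this project uses is not visible in the snippet.
def _getGraph_sketch(path, delim="\t"):
    graph = {}
    with open(path) as fh:
        for line in fh:
            src, dst = line.rstrip("\n").split(delim)[:2]
            graph.setdefault(src, []).append(dst)
    return graph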

#emb_file="/export/home/fialam/crisis_semi_supervised/crisis-tweets/model/crisis_word_vector.txt"
Example #3
    dev_file = options.val_data
    test_file = options.test_data

    MAX_SEQUENCE_LENGTH = 20
    MAX_NB_WORDS = 20000
    EMBEDDING_DIM = 300
    batch_size = 32

    nb_epoch = 500
    model_file = options.w2v_model_file  #"../w2v_models/crisis_word_vector.txt"
    emb_model = KeyedVectors.load_word2vec_format(model_file, binary=False)

    delim = "\t"
    train_x, train_y, train_le, labels, word_index, tokenizer = data_process.getTrData(
        train_file, MAX_NB_WORDS, MAX_SEQUENCE_LENGTH, delim)
    dev_x, dev_y, Dle, Dlabels, _ = data_process.getDevData2(
        dev_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
    test_x, test_y, Tle, Tlabels, _ = data_process.getDevData2(
        test_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
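    # Hedged sketch (not in the original snippet): build the Embedding-layer
    # weight matrix from the loaded word2vec vectors; rows for words missing
    # from the pre-trained model stay zero.
    import numpy as np
    nb_words = min(MAX_NB_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i < nb_words and word in emb_model:
            embedding_matrix[i] = emb_model[word]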

    print(train_x.shape)
    print(dev_x.shape)
    print(test_x.shape)

    ##### callbacks for early stopping and saving the model
    early_stopping = callbacks.EarlyStopping(monitor='val_acc',
                                             patience=25,
                                             verbose=0,
                                             mode='max')
    best_model_path = "models/weights.best.hdf5"
    checkpoint = ModelCheckpoint(best_model_path,
                                 monitor='val_acc',