def main(args):
    """
    Walks a directory tree and evaluates every saved Keras model (*.h5)
    found in sub-directories that contain a training log but no f1-scores
    file yet, appending the scores to a per-directory results file.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.f:   data path forwarded to loadData
        args.s:   split option forwarded to loadData
        args.m:   path to the pickled event map
        args.t:   if truthy, evaluate on the test split instead of dev
        args.emb: if truthy, transform the eval data via setupEmbeddings
        args.d:   root directory to walk for model directories
        args.b:   batch size used by evalModel
        args.l:   stop after this many model directories have been evaluated
    """
    LOG = "log.txt"
    F1 = "f1_{}_scores.txt"
    count = 0

    #read the data
    print("Reading the data")
    dataDict = loadData(args.f, args.s)
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #load the event map -- context manager so the handle is closed
    #(the original leaked the file object from load(open(...)))
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #pick the evaluation split (raw label names are what evalModel needs;
    #the unused matrix conversions from the original were dropped)
    if args.t:
        evalData = testData
        evalLabels = rawTestingLabels
        evalFile = F1.format("test")
    else:
        evalData = devData
        evalLabels = rawDevLabels
        evalFile = F1.format("dev")

    if args.emb:
        evalData = setupEmbeddings(evalData)

    #walk the directory for sub-directories without a scores file
    for path, dirs, names in walk(args.d):
        print("In dir {}".format(path))

        #if there is a log file and no f1 scores, eval all the models
        if LOG in names and evalFile not in names:
            count += 1
            print("Found Models in {}".format(path))

            #open the output file once and score every model in the dir
            with open(join(path, evalFile), "a") as out:
                for name in sorted(names):
                    if name.endswith(".h5"):
                        print("\nEvaluating {}\n".format(name))
                        evalModel(evalData, evalLabels, join(path, name),
                            eventMap, out, args.b)

            #honor the evaluation limit
            if count == args.l:
                break
def main(args):
    """
    Builds and evaluates an ensemble via majority voting.

    Either loads a pre-made ensemble (args.e) or picks the top args.n
    models under args.d ranked by the chosen score parser, then reports
    training/dev (and optionally test) scores.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.full: use both halves of the context for embeddings
        args.f:    data path forwarded to loadData
        args.s:    split option forwarded to loadData
        args.m:    path to the pickled event map
        args.emb:  if truthy, transform data via setupEmbeddings
        args.f1:   rank models by f1 scores instead of plain scores
        args.e:    path of a pre-made ensemble (optional)
        args.d:    directory to search for the best models
        args.n:    number of models to include in the ensemble
        args.b:    batch size for prediction
        args.t:    if truthy, also evaluate on the test split
    """
    useBothHalves = args.full

    #read the data
    print("Reading the data")
    dataDict = loadData(args.f, args.s)
    trainingData = dataDict["train_x"]
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]
    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #load the event map -- context manager closes the file
    #(the original leaked the file object from load(open(...)))
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #NOTE: the matrix label conversions from the original were removed;
    #evaluation below only uses the raw label names
    if args.emb:
        trainingData = setupEmbeddings(trainingData, useBothHalves)
        devData = setupEmbeddings(devData, useBothHalves)
        testData = setupEmbeddings(testData, useBothHalves)

    #pick the parser used to rank candidate models
    if args.f1:
        parser = parseF1Scores
    else:
        parser = parseScores

    #load the pre-made ensemble, or find the top models
    if args.e:
        models = loadEnsemble(args.e, eventMap)
    else:
        models = loadBest(args.d, eventMap, args.n, parser)

    #evaluate the ensemble: majority vote over the member models
    trainPred = majorityPredictions(models, trainingData, args.b, len(eventMap))
    devPred = majorityPredictions(models, devData, args.b, len(eventMap))

    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)

    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    if args.t:
        testPred = majorityPredictions(models, testData, args.b, len(eventMap))
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)
def main(args):
    """
    Loads a single saved model and evaluates it on the dev or test split.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.f:   data path forwarded to loadData
        args.s:   split option forwarded to loadData
        args.emb: if truthy, transform data via setupEmbeddings
        args.p:   path to the pickled event map
        args.m:   path of the model to load
        args.t:   if truthy, evaluate on the test split instead of dev
        args.b:   batch size for prediction
    """
    print("Reading the data")
    dataDict = loadData(args.f, args.s)
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]

    if args.emb:
        devData = setupEmbeddings(devData)
        testData = setupEmbeddings(testData)

    devLabels = dataDict["dev_y"]
    testingLabels = dataDict["test_y"]

    #load the event map -- context manager closes the file
    #(the original leaked the file object from load(open(...)))
    with open(args.p) as mapFile:
        eventMap = load(mapFile)

    #load the model
    print("Loading model")
    model = loadModel(args.m, eventMap)

    #pick the correct set to eval on
    if args.t:
        print("Using Testing")
        evalLabels = testingLabels
        evalData = testData
    else:
        print("Using Dev")
        evalLabels = devLabels
        evalData = devData

    print("Evaluating")

    #make predictions
    pred = predictClasses(model, evalData, args.b)

    #typo fixed in the banner ("Evalutation" -> "Evaluation")
    print("\nEvaluation")

    #evaluate the model
    print("-----Scores-----")
    evaluatePredictions(pred, evalLabels, eventMap)
def main(args):
    """
    Trains a CNN event model and evaluates it on train/dev (and optionally
    test).

    Builds one of three architectures depending on the flags: a multi-input
    embedding model (args.full or args.realis), an embedding CNN (args.emb),
    or a plain CNN over pre-computed vectors.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.f:      data path forwarded to loadData
        args.s:      split option forwarded to loadData
        args.full:   use both halves of the context for embeddings
        args.emb:    use the embedding-based model
        args.o:      output directory for logs/models (created if given)
        args.m:      path to the pickled event map
        args.realis: path to realis data (optional extra input)
        args.e:      number of training epochs
        args.b:      batch size
        args.t:      if truthy, also evaluate on the test split
    """
    #fixed seed for reproducibility (16 chosen empirically; 13/17/20 were
    #also tried during development)
    n.random.seed(16)

    print("Reading the data")
    dataDict = loadData(args.f, args.s)
    useBothHalves = args.full
    useEmb = args.emb or args.full

    if args.o:
        mkdir(args.o)

    #unpack the data
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #load the event map -- context manager closes the file
    #(the original leaked the file object from load(open(...)))
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)
    testingLabels = eventMap.namesToMatrix(rawTestingLabels)

    #embedding input is (samples, seqLen) token ids; raw input carries an
    #extra per-token vector dimension
    if useEmb:
        (samples, seqLen) = trainData[0].shape
    else:
        (samples, seqLen, dim) = trainData[0].shape

    print(trainData[0].shape)

    #figure out the context ("right side") dimensions; which element of
    #trainData holds it depends on the split/full flags
    if args.s:
        (rightSamples, contextDim) = trainData[2].shape
    else:
        if useBothHalves:
            rightSamples = trainData[0].shape[0]
            (_, contextDim) = trainData[-1].shape
        else:
            (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))
    print("labels shape {}".format(trainingLabels.shape))

    print("Building the model")

    #get the model
    if useEmb:
        w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"
        indexPath = "data/word_index.p"

        #load the realis data as an extra input channel
        if args.realis:
            realisData = loadRealisData(args.realis)
            trainData += [realisData[0]]
            devData += [realisData[1]]
            testData += [realisData[2]]
            (_, contextDim) = realisData[0].shape

        #load the pre-trained word vectors
        w2v = loadW2V(w2vPath)

        #load the word index -- context manager closes the file
        with open(indexPath) as indexFile:
            wordIndex = load(indexFile)

        #make the initial embedding weights
        initWeights = makeEmbeddingWeights(w2v, wordIndex)

        if args.full or args.realis:
            model = buildMultiEmbModel(len(eventMap), seqLen, contextDim,
                initWeights, eventMap)
        else:
            model = buildCNNEmbModel(len(eventMap), seqLen, contextDim,
                initWeights, eventMap)
    else:
        model = buildCNNModel(len(eventMap), seqLen, dim, contextDim, eventMap)

    #train the model
    print("Training the model")

    #hard coded class weights: non-nil classes are up-weighted 5.5x
    weights = defaultdict(lambda: 5.5)
    weights[eventMap.nilIndex()] = 1.0

    #make the logger (also tracks the best model per epoch)
    logger = makeLogger(args.o, eventMap)

    model.fit(trainData, trainingLabels, nb_epoch=args.e, batch_size=args.b,
        validation_data=(devData, devLabels), callbacks=[logger],
        class_weight=weights)

    #get the best model seen during training
    best = logger.best()

    print("Best Model round: {} val: {}".format(logger.bestModel,
        logger.bestScore))

    print("Make Predictions")

    #make predictions
    trainPred = predictClasses(best, trainData, args.b)
    devPred = predictClasses(best, devData, args.b)

    #typo fixed in the banner ("Evalutation" -> "Evaluation")
    print("\nEvaluation")

    #evaluate the model
    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)

    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    if args.t:
        testPred = predictClasses(best, testData, args.b)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)
def main(args):
    """
    Makes predictions with the best saved model and writes event embeddings
    (both the penultimate-layer output and the class probabilities) to CSV
    files for the training/dev/test splits.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.full:   use both halves of the context for embeddings
        args.emb:    use the embedding-based inputs
        args.f:      data path forwarded to loadData / loadEvents
        args.s:      split option forwarded to loadData
        args.realis: path to realis data (optional extra input)
        args.m:      path to the pickled event map
        args.a:      directory holding the trained model (and outputs)
    """
    useBothHalves = args.full
    useEmb = args.emb or args.full

    print("Reading Data")
    dataDict = loadData(args.f, args.s)

    #unpack the data
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    #embedding input is (samples, seqLen) token ids; raw input carries an
    #extra per-token vector dimension
    if useEmb:
        (samples, seqLen) = trainData[0].shape
    else:
        (samples, seqLen, dim) = trainData[0].shape

    print(trainData[0].shape)

    #figure out the context ("right side") dimensions; which element of
    #trainData holds it depends on the split/full flags
    if args.s:
        (rightSamples, contextDim) = trainData[2].shape
    else:
        if useBothHalves:
            rightSamples = trainData[0].shape[0]
            (_, contextDim) = trainData[-1].shape
        else:
            (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))

    #load the realis data as an extra input channel
    if args.realis:
        realisData = loadRealisData(args.realis)
        trainData += [realisData[0]]
        devData += [realisData[1]]
        testData += [realisData[2]]
        (_, contextDim) = realisData[0].shape

    #load the event map -- context manager closes the file
    #(the original leaked the file object from load(open(...)))
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #load the best model found under args.a
    model = loadBest([args.a], eventMap)[0]
    model.summary()

    eventOut = "eventOut"
    eventProbOut = "eventProbOut"
    mkdir(join(args.a, eventOut))
    mkdir(join(args.a, eventProbOut))

    #build the three per-split output paths under the given sub-directory
    #(def instead of lambda assignment, per PEP 8 E731)
    def makeNames(p):
        return [join(args.a, p, i) for i in
            ["training_pred.csv", "dev_pred.csv", "test_pred.csv"]]

    outModel = buildCNNEmbOutput(len(eventMap), seqLen, contextDim, eventMap,
        model, len(args.realis) > 0)

    eventEmb = predictEventEmb(outModel, [trainData, devData, testData])
    eventProbEmb = predictEventEmb(model, [trainData, devData, testData])

    #load event info and write both embedding types out
    eventInfo = loadEvents(args.f)
    writeEventEmb(eventEmb, makeNames(eventOut), eventInfo)
    writeEventEmb(eventProbEmb, makeNames(eventProbOut), eventInfo)
def main(args):
    """
    Prints out HTML indicating the model's errors on a chosen split.

    Predicts with either a single best model or an ensemble, compares the
    predictions against the gold events, and writes an error report HTML
    file into the model directory.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.f:      data path forwarded to loadData
        args.s:      split option forwarded to loadData
        args.tr:     if truthy, report errors on the training split
        args.t:      if truthy (and not args.tr), report on the test split
        args.m:      path to the pickled event map
        args.a:      directory holding the trained model/ensemble
        args.emb:    if truthy, transform data via setupEmbeddings
        args.full:   use both halves of the context for embeddings
        args.realis: path to realis data (optional extra input)
        args.e:      if truthy, use an ensemble with majority voting
        args.b:      batch size for prediction
        args.d:      document source forwarded to createErrorReports
    """
    w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"

    #load the word vector model
    print("Loading vectors")
    wordModel = v.loadW2V(w2vPath)

    print("Reading Data")
    dataDict = loadData(args.f, args.s)
    trainingData = dataDict["train_x"]
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]
    trainEvents = dataDict["train_events"]
    devEvents = dataDict["dev_events"]
    testEvents = dataDict["test_events"]
    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #pick the split to analyze; realisIndex selects the matching slice of
    #the realis data (0=train, 1=dev, 2=test)
    realisIndex = None
    if args.tr:
        evalData = trainingData
        evalLabels = rawTrainingLabels
        evalEvents = trainEvents
        ext = "train"
        realisIndex = 0
    elif args.t:
        evalData = testData
        evalLabels = rawTestingLabels
        evalEvents = testEvents
        ext = "test"
        realisIndex = 2
    else:
        evalData = devData
        evalLabels = rawDevLabels
        evalEvents = devEvents
        ext = "dev"
        realisIndex = 1

    print("gold events {}".format(len(evalEvents)))

    #load the event map -- context manager closes the file
    #(the original leaked the file object from load(open(...)))
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    print("Loading the model from {}".format(args.a))

    if args.emb:
        evalData = setupEmbeddings(evalData, args.full)

    #load the realis data for the chosen split
    if args.realis:
        realisData = loadRealisData(args.realis)
        evalData += [realisData[realisIndex]]
        (_, contextDim) = realisData[0].shape

    #predict with the ensemble or a single best model
    if args.e:
        ensemble = loadEnsemble(args.a, eventMap)
        pred = majorityPredictions(ensemble, evalData, args.b, len(eventMap))
    else:
        model = loadBest([args.a], eventMap)[0]
        pred = predictClasses(model, evalData, args.b)

    print("Creating HTML")

    #map predicted vs actual events, mark the errors and render the report
    annoMap = makeAnnoMap(evalEvents, eventMap.toNames(pred))
    markErrors(evalLabels, pred, evalEvents)
    createErrorReports(args.d, join(args.a, "{}_errors.html").format(ext),
        annoMap, wordModel)
def main(args):
    """
    Trains and evaluates a feed-forward model over flat feature vectors.

    Args (argparse namespace -- presumably; confirm against the CLI parser):
        args.f: data path forwarded to loadData
        args.m: path to the pickled event map
        args.o: output directory for the logger
        args.e: number of training epochs
        args.b: batch size
        args.t: if truthy, also evaluate on the test split
    """
    #to show gpu connection info:
    #sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))

    print("Reading the data")
    dataDict = loadData(args.f)
    trainingData = dataDict["train_x"]
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]
    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #load the event map -- context manager closes the file
    #(the original leaked the file object from load(open(...)))
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)

    (samples, length) = trainingData.shape

    print("#instances: {}, vector length: {}".format(samples, length))
    print("Building the model")

    #get the model, monitored with a micro-averaged F1 metric
    model = buildModel(length, len(eventMap), microF1(eventMap))
    print(model.summary())

    print("Training the model")
    #TODO include F1 metric
    #TODO try 1/cube root p for weights
    #TODO write out parameters to logger

    #hard coded class weights: non-nil classes are up-weighted 10x
    #(49.0, 25.0 and 1.0 were tried during development)
    weights = defaultdict(lambda: 10.0)
    weights[eventMap.nilIndex()] = 1.0

    #make the logger (tracks the best model per epoch)
    logger = makeLogger(args.o, eventMap)

    model.fit(trainingData, trainingLabels, nb_epoch=args.e,
        batch_size=args.b, validation_data=(devData, devLabels),
        class_weight=weights, callbacks=[logger])

    #get the best model seen during training
    best = logger.best()

    print("Best Model round: {} val: {}".format(logger.bestModel,
        logger.bestScore))

    print("Make Predictions")

    #make predictions
    trainPred = best.predict_classes(trainingData, batch_size=args.b)
    devPred = best.predict_classes(devData, batch_size=args.b)

    #typo fixed in the banner ("Evalutation" -> "Evaluation")
    print("\nEvaluation")

    #evaluate the model
    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)

    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    if args.t:
        testPred = best.predict_classes(testData, batch_size=args.b)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)

    print("STD eval {}".format(best.evaluate(devData, devLabels)))
def main(args): """ Runs and evaluates the model """ print("Reading the data") dataDict = loadData(args.f, args.s) useBothHalves = args.full useEmb = args.emb or args.full #unpack the data if useEmb: trainData = setupEmbeddings(dataDict["train_x"], useBothHalves) devData = setupEmbeddings(dataDict["dev_x"], useBothHalves) testData = setupEmbeddings(dataDict["test_x"], useBothHalves) else: trainData = dataDict["train_x"] devData = dataDict["dev_x"] testData = dataDict["test_x"] rawTrainingLabels = dataDict["train_y"] rawDevLabels = dataDict["dev_y"] rawTestingLabels = dataDict["test_y"] #make the event map eventMap = load(open(args.m)) params = Parameters(eventMap) trainingLabels = eventMap.namesToMatrix(rawTrainingLabels) devLabels = eventMap.namesToMatrix(rawDevLabels) #testingLabels = eventMap.namesToMatrix(rawTestingLabels) if args.dev: data, labels = joinDev(trainData, trainingLabels, devData, devLabels) else: data = trainData labels = trainingLabels params.emb = args.emb params.useBothHalves = useBothHalves params.samples = data[0].shape[0] params.windowSize = data[0].shape[1] params.batchSize = args.b params.epochs = args.e params.split = args.s params.limit = args.limit if useEmb: w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz" indexPath = "data/word_index.p" #load the weights w2v = loadW2V(w2vPath) #load the index wordIndex = load(open(indexPath)) #make the initial weights params.wordWeights = makeEmbeddingWeights(w2v, wordIndex) else: params.wordSize = data[0].shape[2] if args.s: params.contextSize = data[2].shape[1] else: if useBothHalves: params.contextSize = data[-1].shape[1] else: params.contextSize = data[1].shape[1] print("Training") if args.std: print("Standard Cross Validation") validator = StandardSplitter(args.c) elif args.strat: print("Stratified Cross Validation") validator = StratifiedSplitter(args.c) else: print("Random Cross Validation") validator = RandomSplitter(args.p, args.c) mkdir(args.o) models = 
crossTrain(data, labels, args.o, args.k, validator, params) print("Make Predictions") """