Beispiel #1
0
def main(args):
    """
    Evaluates every saved model found under a directory tree.

    Walks args.d and, for each directory that contains a "log.txt" file
    but no F1 score file for the selected split yet, calls evalModel on
    every ".h5" file of that directory (in sorted name order). The score
    file is opened in append mode and handed to evalModel as the output
    stream. The walk stops as soon as the number of evaluated directories
    equals args.l.

    Fields read from args:
        f, s -- forwarded to loadData
        m    -- path of the serialized event map
        t    -- truthy: evaluate on the test split, otherwise on dev
        emb  -- truthy: run setupEmbeddings on the evaluation data
        d    -- root directory to walk
        b    -- forwarded to evalModel (presumably the batch size - confirm)
        l    -- number of directories to evaluate before stopping
    """
    LOG = "log.txt"
    F1 = "f1_{}_scores.txt"

    #read the data
    print("Reading the data")
    dataDict = loadData(args.f, args.s)

    #make the event map; the with-block closes the file once it is loaded
    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #only the evaluated split is needed, the raw label names are scored
    split = "test" if args.t else "dev"
    evalData = dataDict[split + "_x"]
    evalLabels = dataDict[split + "_y"]
    evalFile = F1.format(split)

    if args.emb:
        evalData = setupEmbeddings(evalData)

    #number of directories evaluated so far
    count = 0

    #walk the directory for sub-directories without a scores file
    for path, dirs, names in walk(args.d):

        print("In dir {}".format(path))

        #a log file without f1 scores means the models are not evaluated yet
        if LOG in names and evalFile not in names:

            count += 1
            print("Found Models in {}".format(path))

            modelNames = [name for name in sorted(names)
                          if name.endswith(".h5")]

            #open the output file
            with open(join(path, evalFile), "a") as out:
                for name in modelNames:
                    print("\nEvaluating {}\n".format(name))
                    evalModel(evalData, evalLabels, join(path, name),
                              eventMap, out, args.b)

        if count == args.l:
            break
def main(args):
    """
    Builds and evaluates an ensemble.

    The ensemble is either loaded ready-made from args.e or assembled by
    loadBest from the models under args.d. Majority-vote predictions are
    then scored on the training and dev splits, and additionally on the
    test split when args.t is set.

    Fields read from args:
        f, s -- forwarded to loadData
        m    -- path of the serialized event map
        full -- forwarded to setupEmbeddings as the "use both halves" flag
        emb  -- truthy: run setupEmbeddings on all three splits
        f1   -- truthy: select models with parseF1Scores, else parseScores
        e    -- location of a pre-made ensemble (takes precedence over d)
        d, n -- forwarded to loadBest together with the score parser
        b    -- forwarded to majorityPredictions (presumably the batch
                size - confirm)
        t    -- truthy: also report test scores
    """
    useBothHalves = args.full

    #read the data
    print("Reading the data")
    dataDict = loadData(args.f, args.s)

    trainingData = dataDict["train_x"]
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]

    #evaluation works on the raw label names, no label matrices are needed
    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #make the event map; the with-block closes the file once it is loaded
    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    if args.emb:
        trainingData = setupEmbeddings(trainingData, useBothHalves)
        devData = setupEmbeddings(devData, useBothHalves)
        testData = setupEmbeddings(testData, useBothHalves)

    parser = parseF1Scores if args.f1 else parseScores

    if args.e:
        #load the pre-made ensemble
        models = loadEnsemble(args.e, eventMap)
    else:
        #find the top models
        models = loadBest(args.d, eventMap, args.n, parser)

    #evaluate the ensemble
    numClasses = len(eventMap)

    #make predictions
    trainPred = majorityPredictions(models, trainingData, args.b, numClasses)
    devPred = majorityPredictions(models, devData, args.b, numClasses)

    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)

    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    if args.t:
        testPred = majorityPredictions(models, testData, args.b, numClasses)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)
Beispiel #3
0
def main(args):
    """
    Loads and evaluates a model.

    The model stored at args.m is loaded with loadModel and scored on
    either the test split (args.t truthy) or the dev split.

    Fields read from args:
        f, s -- forwarded to loadData
        emb  -- truthy: run setupEmbeddings on the evaluated split
        p    -- path of the serialized event map
        m    -- location of the model, forwarded to loadModel
        t    -- truthy: evaluate on test, otherwise on dev
        b    -- forwarded to predictClasses (presumably the batch size -
                confirm)
    """
    print("Reading the data")
    dataDict = loadData(args.f, args.s)

    #make the event map; the with-block closes the file once it is loaded
    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.p) as mapFile:
        eventMap = load(mapFile)

    #load the model
    print("Loading model")
    model = loadModel(args.m, eventMap)

    #pick the correct set to eval on
    if args.t:
        print("Using Testing")
        split = "test"
    else:
        print("Using Dev")
        split = "dev"

    evalLabels = dataDict[split + "_y"]
    evalData = dataDict[split + "_x"]

    #only the split that is actually scored needs the embedding setup
    if args.emb:
        evalData = setupEmbeddings(evalData)

    print("Evaluating")
    #make predictions
    pred = predictClasses(model, evalData, args.b)

    print("\nEvalutation")
    #evaluate the model

    print("-----Scores-----")
    evaluatePredictions(pred, evalLabels, eventMap)
Beispiel #4
0
def main(args):
    """
    Runs and evaluates the model.

    Trains one of three networks (buildCNNModel, buildCNNEmbModel or
    buildMultiEmbModel, depending on args.emb / args.full / args.realis),
    keeps the best model reported by the logger callback and prints its
    scores on the training and dev splits, plus the test split when
    args.t is set.

    Fields read from args:
        f, s   -- forwarded to loadData; s also selects which input holds
                  the context features
        full   -- use both halves; implies the embedding setup
        emb    -- use the embedding setup
        o      -- output directory, created when given and handed to
                  makeLogger
        m      -- path of the serialized event map
        realis -- optional location of realis data (embedding setup only)
        e, b   -- number of epochs and batch size for model.fit
        t      -- truthy: also report test scores
    """
    #fixed seed for reproducible runs
    #other seeds tried: 13, 17 (better for realis14), 20
    n.random.seed(16)

    print("Reading the data")
    dataDict = loadData(args.f, args.s)

    useBothHalves = args.full
    useEmb = args.emb or args.full

    if args.o:
        mkdir(args.o)

    #unpack the data
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #make the event map; the with-block closes the file once it is loaded
    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #the test split is only scored on raw names, so it needs no matrix
    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)

    #the first input is 2-D in the embedding setup and 3-D otherwise
    if useEmb:
        (samples, seqLen) = trainData[0].shape
    else:
        (samples, seqLen, dim) = trainData[0].shape

    print(trainData[0].shape)

    #work out the size of the context input
    if args.s:
        (rightSamples, contextDim) = trainData[2].shape
    elif useBothHalves:
        rightSamples = trainData[0].shape[0]
        (_, contextDim) = trainData[-1].shape
    else:
        (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))
    print("labels shape {}".format(trainingLabels.shape))

    print("Building the model")

    #get the model
    if useEmb:

        w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"
        indexPath = "data/word_index.p"

        #load the realis data
        if args.realis:
            realisData = loadRealisData(args.realis)
            trainData += [realisData[0]]
            devData += [realisData[1]]
            testData += [realisData[2]]

            #the realis input replaces the context size given to the builder
            (_, contextDim) = realisData[0].shape

        #load the weights
        w2v = loadW2V(w2vPath)

        #load the index
        with open(indexPath) as indexFile:
            wordIndex = load(indexFile)

        #make the initial weights
        initWeights = makeEmbeddingWeights(w2v, wordIndex)

        if args.full or args.realis:
            builder = buildMultiEmbModel
        else:
            builder = buildCNNEmbModel

        model = builder(len(eventMap), seqLen, contextDim, initWeights,
                        eventMap)

    else:
        model = buildCNNModel(len(eventMap), seqLen, dim, contextDim, eventMap)

    #train the model
    print("Training the model")

    #hard coding class weights: 5.5 for every class except nil (7.0 was
    #also tried as the default)
    weights = defaultdict(lambda: 5.5)
    weights[eventMap.nilIndex()] = 1.0

    #make the logger
    logger = makeLogger(args.o, eventMap)

    model.fit(trainData, trainingLabels, nb_epoch=args.e, batch_size=args.b,
              validation_data=(devData, devLabels), callbacks=[logger],
              class_weight=weights)

    #get the best model
    best = logger.best()

    print("Best Model round: {} val: {}".format(logger.bestModel,
                                                logger.bestScore))

    print("Make Predictions")

    #make predictions
    trainPred = predictClasses(best, trainData, args.b)
    devPred = predictClasses(best, devData, args.b)

    print("\nEvalutation")
    #evaluate the model

    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)

    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    if args.t:
        testPred = predictClasses(best, testData, args.b)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)
def main(args):
    """
    Makes predictions using the loaded model.

    Loads the model found under args.a with loadBest, derives an output
    model from it with buildCNNEmbOutput, runs predictEventEmb with both
    models over the train/dev/test inputs and passes the results to
    writeEventEmb. The target files are "training_pred.csv",
    "dev_pred.csv" and "test_pred.csv" inside the "eventOut" (output
    model) and "eventProbOut" (loaded model) sub-directories of args.a.

    Fields read from args:
        f, s   -- forwarded to loadData; f is also passed to loadEvents
        full   -- use both halves; implies the embedding setup
        emb    -- use the embedding setup
        realis -- optional location of realis data; may be None or empty
        m      -- path of the serialized event map
        a      -- model directory, also the root of the output directories
    """
    useBothHalves = args.full
    useEmb = args.emb or args.full

    print("Reading Data")
    dataDict = loadData(args.f, args.s)

    #unpack the data
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    #the first input is 2-D in the embedding setup and 3-D otherwise
    if useEmb:
        (samples, seqLen) = trainData[0].shape
    else:
        (samples, seqLen, _) = trainData[0].shape

    print(trainData[0].shape)

    #work out the size of the context input
    if args.s:
        (rightSamples, contextDim) = trainData[2].shape
    elif useBothHalves:
        rightSamples = trainData[0].shape[0]
        (_, contextDim) = trainData[-1].shape
    else:
        (rightSamples, contextDim) = trainData[-1].shape

    print("#instances: {}, seq len: {}".format(samples, seqLen))
    print("right side {} {}".format(rightSamples, contextDim))

    #truthiness instead of len() so an unset (None) option does not crash
    useRealis = bool(args.realis)

    #load the realis data
    if useRealis:
        realisData = loadRealisData(args.realis)
        trainData += [realisData[0]]
        devData += [realisData[1]]
        testData += [realisData[2]]

        #the realis input replaces the context size given to the builder
        (_, contextDim) = realisData[0].shape

    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #load the model
    model = loadBest([args.a], eventMap)[0]

    model.summary()

    eventOut = "eventOut"
    eventProbOut = "eventProbOut"
    mkdir(join(args.a, eventOut))
    mkdir(join(args.a, eventProbOut))

    def makeNames(subDir):
        #paths of the three prediction files inside the given sub-directory
        fileNames = ["training_pred.csv", "dev_pred.csv", "test_pred.csv"]
        return [join(args.a, subDir, fileName) for fileName in fileNames]

    outModel = buildCNNEmbOutput(len(eventMap), seqLen, contextDim, eventMap,
                                 model, useRealis)

    splits = [trainData, devData, testData]
    eventEmb = predictEventEmb(outModel, splits)
    eventProbEmb = predictEventEmb(model, splits)

    #load event info
    eventInfo = loadEvents(args.f)

    writeEventEmb(eventEmb, makeNames(eventOut), eventInfo)
    writeEventEmb(eventProbEmb, makeNames(eventProbOut), eventInfo)
Beispiel #6
0
def main(args):
    """
    Prints out html indicating the errors.

    Predicts the labels of one split (train when args.tr is set, else
    test when args.t is set, else dev) with either an ensemble or a
    single model loaded from args.a, marks the errors on the gold events
    and writes the report to "<split>_errors.html" inside args.a via
    createErrorReports.

    Fields read from args:
        f, s   -- forwarded to loadData
        tr, t  -- split selection, tr takes precedence over t
        m      -- path of the serialized event map
        a      -- model / ensemble location, also the report directory
        emb    -- truthy: run setupEmbeddings on the evaluated data
        full   -- forwarded to setupEmbeddings
        realis -- optional realis data location (only used with emb)
        e      -- truthy: load an ensemble instead of a single model
        b      -- forwarded to the prediction helpers (presumably the
                  batch size - confirm)
        d      -- forwarded to createErrorReports
    """
    w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"

    #load the word vector model
    print("Loading vectors")
    wordModel = v.loadW2V(w2vPath)

    print("Reading Data")
    dataDict = loadData(args.f, args.s)

    #split name and the position of that split in the realis data
    if args.tr:
        ext, realisIndex = "train", 0
    elif args.t:
        ext, realisIndex = "test", 2
    else:
        ext, realisIndex = "dev", 1

    evalData = dataDict[ext + "_x"]
    evalLabels = dataDict[ext + "_y"]
    evalEvents = dataDict[ext + "_events"]

    print("gold events {}".format(len(evalEvents)))

    #load the event map; the with-block closes the file once it is loaded
    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    print("Loading the model from {}".format(args.a))

    if args.emb:
        evalData = setupEmbeddings(evalData, args.full)

        #load the realis data
        if args.realis:
            realisData = loadRealisData(args.realis)
            evalData += [realisData[realisIndex]]

    if args.e:
        #load the ensemble
        ensemble = loadEnsemble(args.a, eventMap)

        pred = majorityPredictions(ensemble, evalData, args.b, len(eventMap))

    else:
        #load the model
        model = loadBest([args.a], eventMap)[0]

        #make predictions
        pred = predictClasses(model, evalData, args.b)

    print("Creating HTML")

    #make a map predicted vs actual events
    annoMap = makeAnnoMap(evalEvents, eventMap.toNames(pred))

    markErrors(evalLabels, pred, evalEvents)

    #format only the file name so braces inside args.a are left alone
    reportPath = join(args.a, "{}_errors.html".format(ext))

    createErrorReports(args.d, reportPath, annoMap, wordModel)
def main(args):
    """
    Runs and evaluates the model.

    Trains the network returned by buildModel on the training split with
    the dev split as validation data, keeps the best model reported by
    the logger callback and prints its scores on the training and dev
    splits, plus the test split when args.t is set. Finally prints the
    result of the best model's own evaluate() on the dev split.

    Fields read from args:
        f    -- forwarded to loadData
        m    -- path of the serialized event map
        o    -- forwarded to makeLogger
        e, b -- number of epochs and batch size
        t    -- truthy: also report test scores
    """
    print("Reading the data")
    dataDict = loadData(args.f)

    trainingData = dataDict["train_x"]
    devData = dataDict["dev_x"]
    testData = dataDict["test_x"]

    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #make the event map; the with-block closes the file once it is loaded
    #NOTE(review): `load` is presumably a pickle loader - only feed it
    #trusted files
    with open(args.m) as mapFile:
        eventMap = load(mapFile)

    #the test split is only scored on raw names, so it needs no matrix
    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)

    (samples, length) = trainingData.shape

    print("#instances: {}, vector length: {}".format(samples, length))

    print("Building the model")

    #get the model
    model = buildModel(length, len(eventMap), microF1(eventMap))

    #NOTE(review): assumes a Keras model, whose summary() prints the table
    #itself and returns None - wrapping it in print() added a "None" line
    model.summary()

    print("Training the model")
    #train the model
    #TODO include F1 metric
    #TODO try 1/cube root p for weights
    #TODO write out parameters to logger

    #hard coding class weights: 10.0 for every class except nil (49.0,
    #25.0 and 1.0 were also tried as the default)
    weights = defaultdict(lambda: 10.0)
    weights[eventMap.nilIndex()] = 1.0

    #make the logger
    logger = makeLogger(args.o, eventMap)

    model.fit(trainingData,
              trainingLabels,
              nb_epoch=args.e,
              batch_size=args.b,
              validation_data=(devData, devLabels),
              class_weight=weights,
              callbacks=[logger])

    #get the best model
    best = logger.best()

    print("Best Model round: {} val: {}".format(logger.bestModel,
                                                logger.bestScore))

    print("Make Predictions")
    #make predictions
    trainPred = best.predict_classes(trainingData, batch_size=args.b)
    devPred = best.predict_classes(devData, batch_size=args.b)

    print("\nEvalutation")
    #evaluate the model

    print("-----Training Scores-----")
    evaluatePredictions(trainPred, rawTrainingLabels, eventMap)

    print("\n-----Dev Scores------")
    evaluatePredictions(devPred, rawDevLabels, eventMap)

    if args.t:
        testPred = best.predict_classes(testData, batch_size=args.b)
        print("\n\n-----Test Scores------")
        evaluatePredictions(testPred, rawTestingLabels, eventMap)

    print("STD eval {}".format(best.evaluate(devData, devLabels)))
Beispiel #8
0
def main(args):
    """
	Runs and evaluates the model
	"""
    print("Reading the data")
    dataDict = loadData(args.f, args.s)

    useBothHalves = args.full
    useEmb = args.emb or args.full

    #unpack the data
    if useEmb:
        trainData = setupEmbeddings(dataDict["train_x"], useBothHalves)
        devData = setupEmbeddings(dataDict["dev_x"], useBothHalves)
        testData = setupEmbeddings(dataDict["test_x"], useBothHalves)
    else:
        trainData = dataDict["train_x"]
        devData = dataDict["dev_x"]
        testData = dataDict["test_x"]

    rawTrainingLabels = dataDict["train_y"]
    rawDevLabels = dataDict["dev_y"]
    rawTestingLabels = dataDict["test_y"]

    #make the event map
    eventMap = load(open(args.m))

    params = Parameters(eventMap)

    trainingLabels = eventMap.namesToMatrix(rawTrainingLabels)
    devLabels = eventMap.namesToMatrix(rawDevLabels)
    #testingLabels = eventMap.namesToMatrix(rawTestingLabels)

    if args.dev:
        data, labels = joinDev(trainData, trainingLabels, devData, devLabels)

    else:
        data = trainData
        labels = trainingLabels

    params.emb = args.emb
    params.useBothHalves = useBothHalves
    params.samples = data[0].shape[0]
    params.windowSize = data[0].shape[1]
    params.batchSize = args.b
    params.epochs = args.e
    params.split = args.s
    params.limit = args.limit

    if useEmb:
        w2vPath = "data/vectors/word2vec/GoogleNews-vectors-negative300.bin.gz"
        indexPath = "data/word_index.p"

        #load the weights
        w2v = loadW2V(w2vPath)

        #load the index
        wordIndex = load(open(indexPath))

        #make the initial weights
        params.wordWeights = makeEmbeddingWeights(w2v, wordIndex)

    else:
        params.wordSize = data[0].shape[2]

    if args.s:
        params.contextSize = data[2].shape[1]
    else:

        if useBothHalves:
            params.contextSize = data[-1].shape[1]
        else:
            params.contextSize = data[1].shape[1]

    print("Training")

    if args.std:
        print("Standard Cross Validation")
        validator = StandardSplitter(args.c)

    elif args.strat:
        print("Stratified Cross Validation")
        validator = StratifiedSplitter(args.c)

    else:
        print("Random Cross Validation")
        validator = RandomSplitter(args.p, args.c)

    mkdir(args.o)

    models = crossTrain(data, labels, args.o, args.k, validator, params)

    print("Make Predictions")
    """