Beispiel #1
0
def extractFeatures(examples, table=False, prepend=False):
    """Build padded n-gram probability features and antecedent labels.

    Every candidate sentence of every example is POS-tagged and scored
    against its sluice with ``computeProbability``, once per n-gram order
    from the module-level ``leastNgram`` to ``highestNgram`` (inclusive).
    The scores of all sentences belonging to one example are concatenated
    into a single flat row, and the rows are padded by ``addPadding``.

    Args:
        examples: mapping from sluice id to an iterable of sentence dicts
            providing the keys "text", "sluiceGovVPText" and
            "isAntecedent".
        table: falsy to derive the probability table from ``examples``
            with ``tableFromData``; otherwise a reference that is handed
            to ``loadData`` to load a previously saved table.
        prepend: forwarded unchanged to ``addPadding``.

    Returns:
        ``(dataProbabilities, dataY)`` when ``table`` is given, otherwise
        ``(dataProbabilities, dataY, probabilities)`` so the caller can
        persist the freshly computed table. ``dataY`` receives the
        position (within its example) of every sentence flagged
        "isAntecedent"; nothing here enforces exactly one per example.

    Raises:
        SystemExit: with status 1 when a loaded table has no usable entry
            for one of the configured n-gram orders.
    """
    # the n-gram bounds are only read, so no ``global`` declaration is
    # needed; build the range once instead of once per sentence
    ngramOrders = range(leastNgram, highestNgram + 1)

    # load and format data correctly if given a table;
    # otherwise calculate probabilities
    if table:
        probabilities = loadData(table, "key")
        for order in ngramOrders:
            key = str(order)
            try:
                # each order is unwrapped from a one-element container
                # (presumably an artifact of how the table was saved)
                probabilities[key] = probabilities[key][0]
            except (KeyError, IndexError, TypeError):
                # only "order missing / malformed" is treated as a
                # configuration error; anything else should surface
                print("Ngrams are not set correctly")
                # non-zero status so calling scripts can detect failure
                sys.exit(1)
    else:
        probabilities = tableFromData(examples, leastNgram, highestNgram)

    # run over all examples, tracking the largest sentence
    # count so the rows can be padded afterwards
    dataProbabilities = []
    dataY = []
    maxLength = 0
    for sluiceId in examples:

        sentenceProbabilities = []
        length = 0
        for position, sentence in enumerate(examples[sluiceId]):

            # extract pos tags from the candidate and the sluice,
            # falling back to the raw text when getSluice finds nothing
            tags = [tagged[1] for tagged in pos_tag(sentence["text"])]
            governorText = sentence["sluiceGovVPText"]
            sluice = getSluice(governorText) or governorText

            # one probability per n-gram order
            sentenceProbabilities.extend(
                computeProbability(order, tags, sluice, probabilities)
                for order in ngramOrders)

            # remember where the antecedent sits in this example
            if sentence["isAntecedent"]:
                dataY.append(position)

            length = position + 1

        dataProbabilities.append(sentenceProbabilities)
        maxLength = max(maxLength, length)

    # pad rows of examples that have fewer sentences; coefNumber() is
    # presumably the number of features per sentence -- TODO confirm
    dataProbabilities = addPadding(dataProbabilities,
                                   coefNumber() * maxLength, prepend)

    if table:
        return dataProbabilities, dataY
    return dataProbabilities, dataY, probabilities
Beispiel #2
0
    # NOTE(review): this excerpt starts inside a block whose header and the
    # creation of `parser` are not visible here.
    # -s/--save: optional destination passed to saveData further down.
    parser.add_argument(
        '-s',
        '--save',
        metavar='save',
        type=str,
        help=
        'Save the table model to the given distination generated in this pass')
    # -m/--model: optional reference to a table file; defaults to False so
    # the value can double as the "table given?" flag below.
    parser.add_argument('-m',
                        '--model',
                        metavar='model',
                        type=str,
                        default=False,
                        help='Reference to the table file')
    args = parser.parse_args()

    # load data and set
    # batch size
    examples = loadData(args.dataref)
    kfold = 10

    # get the data in the right format, and
    # run a kfold validation
    # extractFeatures returns two values when a table reference is passed
    # and three (including the computed table) when it is not.
    if args.model:
        dataX, dataY = extractFeatures(examples, args.model)
    else:
        dataX, dataY, probabilities = extractFeatures(examples, args.model)
        # save data if required
        # NOTE(review): `not args.model` is always true inside this branch.
        if args.save and not args.model:
            saveData(args.save, probabilities, 1)

    kfoldValidation(kfold, dataX, dataY, True)
####### ---->
###			------------>
###		LET'S GO 		----------->
###			------------>
####### ---->
if __name__ == '__main__':
    import argparse

    # command line: one positional example file, plus optional
    # locations for saving (-s) or loading (-m) the probability table
    parser = argparse.ArgumentParser(
        description='Trains the parameters of the POS model for antecedent identificaton')
    parser.add_argument(
        'dataref',
        metavar='dataref',
        type=str,
        help='Reference to the example file')
    parser.add_argument(
        '-s', '--save',
        metavar='save',
        type=str,
        help='Save the table model to the given distination generated in this pass')
    parser.add_argument(
        '-m', '--model',
        metavar='model',
        type=str,
        default=False,
        help='Reference to the table file')
    args = parser.parse_args()

    # number of folds for the validation, and the examples to run it on
    examples = loadData(args.dataref)
    kfold = 10

    # the feature extraction call is the same either way; only the
    # shape of its result depends on whether a table file was given
    extracted = extractFeatures(examples, args.model)
    if args.model:
        dataX, dataY = extracted
    else:
        dataX, dataY, probabilities = extracted
        # no table was loaded here, so the computed one may be saved
        if args.save:
            saveData(args.save, probabilities, 1)

    kfoldValidation(kfold, dataX, dataY, True)
def extractFeatures(examples, table=False, prepend=False):
    """Build padded n-gram probability features and antecedent labels.

    Every candidate sentence of every example is POS-tagged and scored
    against its sluice with ``computeProbability``, once per n-gram order
    from the module-level ``leastNgram`` to ``highestNgram`` (inclusive).
    The scores of all sentences belonging to one example are concatenated
    into a single flat row, and the rows are padded by ``addPadding``.

    Args:
        examples: mapping from sluice id to an iterable of sentence dicts
            providing the keys "text", "sluiceGovVPText" and
            "isAntecedent".
        table: falsy to derive the probability table from ``examples``
            with ``tableFromData``; otherwise a reference that is handed
            to ``loadData`` to load a previously saved table.
        prepend: forwarded unchanged to ``addPadding``.

    Returns:
        ``(dataProbabilities, dataY)`` when ``table`` is given, otherwise
        ``(dataProbabilities, dataY, probabilities)`` so the caller can
        persist the freshly computed table. ``dataY`` receives the
        position (within its example) of every sentence flagged
        "isAntecedent"; nothing here enforces exactly one per example.

    Raises:
        SystemExit: with status 1 when a loaded table has no usable entry
            for one of the configured n-gram orders.
    """
    # the n-gram bounds are only read, so no ``global`` declaration is
    # needed; build the range once instead of once per sentence
    ngramOrders = range(leastNgram, highestNgram + 1)

    # load and format data correctly if given a table;
    # otherwise calculate probabilities
    if table:
        probabilities = loadData(table, "key")
        for order in ngramOrders:
            key = str(order)
            try:
                # each order is unwrapped from a one-element container
                # (presumably an artifact of how the table was saved)
                probabilities[key] = probabilities[key][0]
            except (KeyError, IndexError, TypeError):
                # only "order missing / malformed" is treated as a
                # configuration error; anything else should surface
                print("Ngrams are not set correctly")
                # non-zero status so calling scripts can detect failure
                sys.exit(1)
    else:
        probabilities = tableFromData(examples, leastNgram, highestNgram)

    # run over all examples, tracking the largest sentence
    # count so the rows can be padded afterwards
    dataProbabilities = []
    dataY = []
    maxLength = 0
    for sluiceId in examples:

        sentenceProbabilities = []
        length = 0
        for position, sentence in enumerate(examples[sluiceId]):

            # extract pos tags from the candidate and the sluice,
            # falling back to the raw text when getSluice finds nothing
            tags = [tagged[1] for tagged in pos_tag(sentence["text"])]
            governorText = sentence["sluiceGovVPText"]
            sluice = getSluice(governorText) or governorText

            # one probability per n-gram order
            sentenceProbabilities.extend(
                computeProbability(order, tags, sluice, probabilities)
                for order in ngramOrders)

            # remember where the antecedent sits in this example
            if sentence["isAntecedent"]:
                dataY.append(position)

            length = position + 1

        dataProbabilities.append(sentenceProbabilities)
        maxLength = max(maxLength, length)

    # pad rows of examples that have fewer sentences; coefNumber() is
    # presumably the number of features per sentence -- TODO confirm
    dataProbabilities = addPadding(dataProbabilities,
                                   coefNumber() * maxLength, prepend)

    if table:
        return dataProbabilities, dataY
    return dataProbabilities, dataY, probabilities
####### ---->
###         ------------>
###     LET'S GO        ----------->
###         ------------>
####### ---->
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Take file of features (as jsons) as compute antecedent.')
    parser.add_argument('-debug', metavar='debug', type=int, help='debug level')
    parser.add_argument('-optimize', metavar='optimize', type=int, help='optimize switch')
    parser.add_argument('-restarts', metavar='restarts', type=int, help='number of random restarts')
    parser.add_argument('-iterations', metavar='iterations', type=int, help='number of iterations in hill-climbing')
    parser.add_argument('featuref', metavar='featuref', type=str, help='the featurefile')
                       
    args = parser.parse_args()
    features = loadData(args.featuref)
       
    debugGlobal = 0
    if args.debug:
        debugGlobal = args.debug

    optimize_switch = 1
    if args.optimize:
        optimize_switch = 1

    restarts = 1
    if args.restarts:
        restarts = args.restarts
    
    iterations = 100
    if args.iterations: