# Beispiel #1 (example #1 — extraction artifact, kept as a comment)
def startexp(
		stages=(DEFAULTSTAGE, ),  # see above
		corpusfmt='export',  # choices: export, discbracket, bracket
		corpusdir='.',
		# filenames may include globbing characters '*' and '?'.
		traincorpus='alpinosample.export', trainencoding='utf-8',
		testcorpus='alpinosample.export', testencoding='utf-8',
		testmaxwords=40,
		trainmaxwords=40,
		trainnumsents=2,
		testnumsents=1,  # number of sentences to parse
		skiptrain=True,  # test set starts after training set
		# (useful when they are in the same file)
		skip=0,  # number of sentences to skip from test corpus
		punct=None,  # choices: None, 'move', 'remove', 'root'
		functions=None,  # choices None, 'add', 'remove', 'replace'
		morphology=None,  # choices: None, 'add', 'replace', 'between'
		transformations=None,  # apply treebank transformations
		# postagging: pass None to use tags from treebank.
		postagging=None,
		relationalrealizational=None,  # do not apply RR-transform
		headrules=None,  # rules for finding heads of constituents
		bintype='binarize',  # choices: binarize, optimal, optimalhead
		factor='right',
		revmarkov=True,
		v=1,
		h=2,
		pospa=False,  # when v > 1, add parent annotation to POS tags?
		markhead=False,  # prepend head to siblings
		leftmostunary=True,  # start binarization with unary node
		rightmostunary=True,  # end binarization with unary node
		tailmarker='',  # with headrules, head is last node and can be marked
		fanout_marks_before_bin=False,
		evalparam='proper.prm',  # EVALB-style parameter file
		quiet=False, reallyquiet=False,  # quiet=no per sentence results
		numproc=1,  # increase to use multiple CPUs; None: use all CPUs.
		resultdir='results',
		rerun=False):
	""" Execute an experiment.

	Reads the training & test corpora, induces grammars for the given
	``stages`` (or, when ``rerun`` is True, reads the grammars of a previous
	experiment from ``resultdir``), optionally applies an external or
	unknown-word POS tagging model, parses the test set, and evaluates the
	results with an EVALB-style parameter file.

	Returns the unique root label shared by the training and test trees. """
	assert bintype in ('optimal', 'optimalhead', 'binarize')
	if postagging is not None:
		assert set(postagging).issubset({'method', 'model',
				'unknownthreshold', 'openclassthreshold', 'simplelexsmooth'})
		if postagging['method'] == 'unknownword':
			assert postagging['model'] in ('4', '6', 'base')
			assert postagging['unknownthreshold'] >= 1
			assert postagging['openclassthreshold'] >= 0
		else:
			assert postagging['method'] in ('treetagger', 'stanford')

	if rerun:
		assert os.path.exists(resultdir), (
				'Directory %r does not exist.'
				'--rerun requires a directory '
				'with the grammar(s) of a previous experiment.'
				% resultdir)
	else:
		assert not os.path.exists(resultdir), (
			'Directory %r exists.\n'
			'Use --rerun to parse with existing grammar '
			'and overwrite previous results.' % resultdir)
		os.mkdir(resultdir)

	# Log everything, and send it to stderr, in a format with just the message.
	formatstr = '%(message)s'
	if reallyquiet:
		logging.basicConfig(level=logging.WARNING, format=formatstr)
	elif quiet:
		logging.basicConfig(level=logging.INFO, format=formatstr)
	else:
		logging.basicConfig(level=logging.DEBUG, format=formatstr)

	# also log to a file
	fileobj = logging.FileHandler(filename='%s/output.log' % resultdir)
	#fileobj.setLevel(logging.INFO)
	fileobj.setLevel(logging.DEBUG)
	fileobj.setFormatter(logging.Formatter(formatstr))
	logging.getLogger('').addHandler(fileobj)

	corpusreader = getreader(corpusfmt)
	if not rerun:
		corpus = corpusreader(corpusdir, traincorpus, encoding=trainencoding,
				headrules=headrules, headfinal=True, headreverse=False,
				punct=punct, functions=functions, morphology=morphology)
		logging.info('%d sentences in training corpus %s/%s',
				len(corpus.parsed_sents()), corpusdir, traincorpus)
		# a float trainnumsents is interpreted as a fraction of the corpus
		if isinstance(trainnumsents, float):
			trainnumsents = int(trainnumsents * len(corpus.sents()))
		trees = list(corpus.parsed_sents().values())[:trainnumsents]
		sents = list(corpus.sents().values())[:trainnumsents]
		if transformations:
			trees = [transform(tree, sent, transformations)
					for tree, sent in zip(trees, sents)]
		if relationalrealizational:
			trees = [rrtransform(tree, **relationalrealizational)[0]
					for tree in trees]
		# pair each word with the gold POS tag at the same position
		train_tagged_sents = [[(word, tag) for word, (_, tag)
				in zip(sent, sorted(tree.pos()))]
					for tree, sent in zip(trees, sents)]
		blocks = list(corpus.blocks().values())[:trainnumsents]
		assert trees, 'training corpus should be non-empty'
		logging.info('%d training sentences before length restriction',
				len(trees))
		trees, sents, blocks = zip(*[sent for sent in zip(trees, sents, blocks)
			if len(sent[1]) <= trainmaxwords])
		logging.info('%d training sentences after length restriction <= %d',
			len(trees), trainmaxwords)

	testset = corpusreader(corpusdir, testcorpus, encoding=testencoding,
			punct=punct, morphology=morphology, functions=functions)
	gold_sents = testset.tagged_sents()
	test_parsed_sents = testset.parsed_sents()
	if skiptrain:
		skip += trainnumsents
	logging.info('%d sentences in test corpus %s/%s',
			len(testset.parsed_sents()), corpusdir, testcorpus)
	logging.info('%d test sentences before length restriction',
			len(list(gold_sents)[skip:skip + testnumsents]))
	lexmodel = None
	# default; overridden below when the unknown-word model is trained.
	# initializing here avoids a NameError at the getgrammars() call when
	# an external tagger (treetagger/stanford) is selected.
	simplelexsmooth = False
	test_tagged_sents = gold_sents
	if postagging and postagging['method'] in ('treetagger', 'stanford'):
		if postagging['method'] == 'treetagger':
			# these two tags are never given by tree-tagger,
			# so collect words whose tag needs to be overriden
			overridetags = ('PTKANT', 'PIDAT')
		elif postagging['method'] == 'stanford':
			overridetags = ('PTKANT', )
		taglex = defaultdict(set)
		for sent in train_tagged_sents:
			for word, tag in sent:
				taglex[word].add(tag)
		# words that unambiguously received an override tag in training
		overridetagdict = {tag:
			{word for word, tags in taglex.items() if tags == {tag}}
			for tag in overridetags}
		tagmap = {'$(': '$[', 'PAV': 'PROAV'}
		# NB: no trailing comma here; it would wrap the dict in a tuple.
		sents_to_tag = OrderedDict((a, b) for a, b
				in islice(gold_sents.items(), skip, skip + testnumsents)
				if len(b) <= testmaxwords)
		test_tagged_sents = externaltagging(postagging['method'],
				postagging['model'], sents_to_tag, overridetagdict, tagmap)
		# give these tags to parser
		usetags = True
	elif postagging and postagging['method'] == 'unknownword' and not rerun:
		postagging['unknownwordfun'] = getunknownwordfun(postagging['model'])
		# get smoothed probalities for lexical productions
		lexresults, msg = getunknownwordmodel(
				train_tagged_sents, postagging['unknownwordfun'],
				postagging['unknownthreshold'],
				postagging['openclassthreshold'])
		logging.info(msg)
		simplelexsmooth = postagging['simplelexsmooth']
		if simplelexsmooth:
			lexmodel = lexresults[2:8]
		else:
			lexmodel, msg = getlexmodel(*lexresults)
			logging.info(msg)
		# NB: knownwords are all words in training set, lexicon is the subset
		# of words that are above the frequency threshold.
		# for training purposes we work with the subset, at test time we exploit
		# the full set of known words from the training set.
		sigs, knownwords, lexicon = lexresults[:3]
		postagging['sigs'], postagging['lexicon'] = sigs, knownwords
		# replace rare train words with signatures
		sents = replaceraretrainwords(train_tagged_sents,
				postagging['unknownwordfun'], lexicon)
		# make sure gold POS tags are not given to parser
		usetags = False
	elif postagging and postagging['method'] == 'unknownword' and rerun:
		usetags = False
	else:
		# give gold POS tags to parser
		usetags = True

	# 0: test sentences as they should be handed to the parser,
	# 1: gold trees for evaluation purposes
	# 2: gold sentence because test sentences may be mangled by unknown word
	#   model
	# 3: blocks from treebank file to reproduce the relevant part of the
	#   original treebank verbatim.
	testset = OrderedDict((a, (test_tagged_sents[a], test_parsed_sents[a],
			gold_sents[a], block)) for a, block
			in islice(testset.blocks().items(), skip, skip + testnumsents)
			if len(test_tagged_sents[a]) <= testmaxwords)
	assert test_tagged_sents, 'test corpus should be non-empty'
	logging.info('%d test sentences after length restriction <= %d',
			len(testset), testmaxwords)

	if rerun:
		trees = []
		sents = []
	# all trees (train & test) must share a single root label
	toplabels = {tree.label for tree in trees} | {
			test_parsed_sents[n].label for n in testset}
	assert len(toplabels) == 1, 'expected unique ROOT label: %r' % toplabels
	top = toplabels.pop()

	if rerun:
		readgrammars(resultdir, stages, postagging, top)
	else:
		logging.info('read training & test corpus')
		getgrammars(trees, sents, stages, bintype, h, v, factor, tailmarker,
				revmarkov, leftmostunary, rightmostunary, pospa, markhead,
				fanout_marks_before_bin, testmaxwords, resultdir, numproc,
				lexmodel, simplelexsmooth, top, relationalrealizational)
	evalparam = evalmod.readparam(evalparam)
	evalparam['DEBUG'] = -1
	evalparam['CUTOFF_LEN'] = 40
	deletelabel = evalparam.get('DELETE_LABEL', ())
	deleteword = evalparam.get('DELETE_WORD', ())

	# NOTE(review): time.clock() was removed in Python 3.8;
	# time.perf_counter() is the modern equivalent — confirm target version.
	begin = time.clock()
	parser = Parser(stages, transformations=transformations,
			tailmarker=tailmarker, postagging=postagging if postagging
			and postagging['method'] == 'unknownword' else None,
			relationalrealizational=relationalrealizational)
	results = doparsing(parser=parser, testset=testset, resultdir=resultdir,
			usetags=usetags, numproc=numproc, deletelabel=deletelabel,
			deleteword=deleteword, corpusfmt=corpusfmt, morphology=morphology)
	if numproc == 1:
		logging.info('time elapsed during parsing: %gs', time.clock() - begin)
	for result in results[0]:
		nsent = len(result.parsetrees)
		header = (' ' + result.name.upper() + ' ').center(35, '=')
		evalsummary = evalmod.doeval(OrderedDict((a, b.copy(True))
				for a, b in test_parsed_sents.items()), gold_sents,
				result.parsetrees, test_tagged_sents if usetags else gold_sents,
				evalparam)
		coverage = 'coverage: %s = %6.2f' % (
				('%d / %d' % (nsent - result.noparse, nsent)).rjust(
				25 if any(len(a) > evalparam['CUTOFF_LEN']
				for a in gold_sents.values()) else 14),
				100.0 * (nsent - result.noparse) / nsent)
		logging.info('\n'.join(('', header, evalsummary, coverage)))
	return top