Beispiel #1
0
def run(dev, processed):
    """Drive the full RTE pipeline on the preprocessed dev corpus.

    Runs part1a-part1d to produce per-metric result files, then searches
    for the best decision threshold of every metric via make_predictions.

    NOTE(review): the ``dev`` and ``processed`` parameters are currently
    unused; the input file name is hard-coded below — confirm with callers.
    """
    file_name = "RTE2_dev.preprocessed.xml"

    # part1a returns the extracted (texts, hypotheses) pair lists.
    data_set = part1a.run()
    texts = data_set[0]
    hypos = data_set[1]

    part1b.run(file_name)
    part1c.run(texts, hypos)
    part1d.predict(texts, hypos)

    # (threshold, result file) per metric, applied in the original order:
    # word matches, lemma matches, pos-tag matches, BLEU, idf.
    metric_runs = [
        (0.8190, "wordmatches.txt"),
        (0.9170, "lemma_matches.txt"),
        (0.5790, "pos-tag_matches.txt"),
        (0.2690, "bleuresults.txt"),
        (0.9670, "idfresults.txt"),
    ]
    for threshold, result_file in metric_runs:
        make_predictions.predict(threshold, result_file)
Beispiel #2
0
def run(dev, processed):
    """Parse the RTE xml file *dev*, build tokenized text/hypothesis pairs,
    and run the BLEU- and idf-based entailment predictors on them.

    NOTE(review): the ``processed`` parameter is unused here — confirm
    with callers before removing it.
    """

    #***********************************************************
    # SAX handler that collects the character data enclosed in a
    # given tag ('t' for texts, 'h' for hypotheses) and appends
    # each completed string to the supplied sink list.
    # Replaces the two near-identical TextHandler/HypothesisHandler
    # classes of the original.
    #***********************************************************
    class _TagHandler(ContentHandler):

        def __init__(self, tag, sink):
            ContentHandler.__init__(self)
            self.tag = tag        # element name to capture
            self.sink = sink      # receives one string per element
            self.inside = False   # instance flag (was a class attribute)
            self.data = []

        def startElement(self, name, attrs):
            if name == self.tag:
                self.inside = True

        def endElement(self, name):
            if name == self.tag:
                self.sink.append(''.join(self.data))
                self.data = []
                self.inside = False

        def characters(self, string):
            if self.inside:
                self.data.append(string)

    # Uses the content handler to extract texts and hypotheses from the xml file.
    text = []
    hypothesis = []
    parse(dev, _TagHandler('t', text))
    parse(dev, _TagHandler('h', hypothesis))

    # Converts everything to lowercase (list comprehensions instead of
    # map(lambda...) — identical result, works on Python 2 and 3).
    text = [x.lower() for x in text]
    hypothesis = [x.lower() for x in hypothesis]

    # Tokenize and strip surrounding punctuation, pairwise over the two lists.
    # TODO: extend the set of punctuation characters to be removed.
    texts = []
    hypos = []
    for t, h in zip(text, hypothesis):
        texts.append([w.strip('.,:;()"') for w in t.split()])
        hypos.append([w.strip('.,:;()"') for w in h.split()])

    bleu = "bleuresults.txt"
    idf = "idfresults.txt"
    step_size = 0.001
    part1d.predict(texts, hypos)
    # BUG FIX: the original passed `text` (untokenized full strings) here;
    # part1c.run is called with the tokenized `texts` everywhere else in
    # this pipeline, so pass the token lists.
    part1c.run(texts, hypos)
    predict.predict(step_size, bleu)
    predict.predict(step_size, idf)
Beispiel #3
0
# Calculates a normalized value for the number of words occurring in both
# text and hypothesis: no_words[i] / (number of words in hypothesis i).
word_match = []
for i in range(len(no_words)):
    h = hypothesis[i].split()
    h = [w.strip('.,:;"') for w in h]
    # (The original sorted h here; sorting has no effect on len(h), so it
    # was dropped.)
    word_match.append((no_words[i] * 1.0) / len(h))

# Prints the list of word matches to file.
# NOTE: open() raises IOError on failure rather than returning a falsy
# value, so the original `if file: ... else: print error` branch was dead
# code; `with` also guarantees the file is closed and avoids shadowing
# the builtin name `file`. Text mode 'w' replaces 'wb' since the output
# is text.
out = "wordmatches.txt"
with open(out, 'w') as out_file:
    for score in word_match:
        out_file.write("%s\n" % score)

# Call predict with step_size and the per-metric result files to find the
# best threshold for each metric (commented lines are workflow toggles
# kept from the original).
step_size = 0.001
name = "wordmatches.txt"
bleu = "bleuresults.txt"
idf = "idfresults.txt"
#predict.predict(step_size, name)
#part1c.run(texts, hypos)
#predict.predict(step_size, bleu)
part1d.predict(texts, hypos)
predict.predict(step_size, idf)