def run(dev, processed):
    """Run each scoring stage, then apply a fixed threshold to every result file.

    Neither ``dev`` nor ``processed`` is read in this function; the data set
    comes from ``part1a.run()`` and the preprocessed corpus file name is
    hard-coded below.
    """
    corpus = part1a.run()
    texts, hypos = corpus[0], corpus[1]

    part1b.run("RTE2_dev.preprocessed.xml")
    part1c.run(texts, hypos)
    part1d.predict(texts, hypos)

    # (threshold, results file) pairs, listed in the order they are evaluated.
    cutoffs = (
        (0.8190, "wordmatches.txt"),
        (0.9170, "lemma_matches.txt"),
        (0.5790, "pos-tag_matches.txt"),
        (0.2690, "bleuresults.txt"),
        (0.9670, "idfresults.txt"),
    )
    for threshold, results_file in cutoffs:
        make_predictions.predict(threshold, results_file)
def run(dev, processed):
    """Extract the <t>/<h> pairs from ``dev``, tokenise them and run the scorers.

    The XML source ``dev`` is parsed once per tag. Every ``<t>`` and ``<h>``
    string is lower-cased, split on whitespace, and each word has leading and
    trailing punctuation stripped. The resulting word lists are handed to
    ``part1d.predict`` and ``part1c.run``, after which ``predict.predict`` is
    called with the step size for the BLEU and IDF result files.

    Args:
        dev: XML source accepted by ``xml.sax.parse``. It is parsed twice, so
            presumably a file name rather than a one-shot stream -- confirm
            against the caller.
        processed: Not read by this function.

    Raises:
        IndexError: If the document contains fewer ``<h>`` than ``<t>``
            elements.
    """
    # TODO: Extend the list of characters to be removed
    punctuation = '.,:;()"'

    class ElementTextHandler(ContentHandler):
        """Append the character data of each ``<tag>`` element to ``sink``."""

        def __init__(self, tag, sink):
            ContentHandler.__init__(self)
            self.tag = tag
            self.sink = sink
            self.inside = False
            self.chunks = []

        def startElement(self, name, attrs):
            if name == self.tag:
                self.inside = True

        def endElement(self, name):
            if name == self.tag:
                # characters() may deliver one element's text in several
                # pieces, so join them only once the element is closed.
                self.sink.append(''.join(self.chunks))
                self.chunks = []
                self.inside = False

        def characters(self, string):
            if self.inside:
                self.chunks.append(string)

    def to_words(sentence):
        """Lower-case ``sentence`` and return its punctuation-stripped words."""
        return [word.strip(punctuation) for word in sentence.lower().split()]

    # Uses the content handler to extract texts and hypotheses from the xml-file
    text = []
    hypothesis = []
    parse(dev, ElementTextHandler('t', text))
    parse(dev, ElementTextHandler('h', hypothesis))

    # Real lists (not map objects) so the result is the same on Python 2 and 3.
    texts = []
    hypos = []
    for i, sentence in enumerate(text):
        texts.append(to_words(sentence))
        # Indexing rather than zip(): a missing <h> raises instead of
        # silently truncating the data set.
        hypos.append(to_words(hypothesis[i]))

    bleu = "bleuresults.txt"
    idf = "idfresults.txt"
    step_size = 0.001

    part1d.predict(texts, hypos)
    # Pass the tokenised texts so they match the tokenised hypotheses; the
    # earlier code passed the untokenised strings here.
    part1c.run(texts, hypos)
    predict.predict(step_size, bleu)
    predict.predict(step_size, idf)