def readTestAttributesFromFile(self):
    """Build the attribute tuples for the preprocessed test pairs.

    Parses ``data/RTE2_test.preprocessed.xml`` for the pair nodes and
    ``data/RTE2_test.annotated.xml`` for the annotated pair ids, then
    returns a list with one
    ``(text, hypothesis, id_number, entailment, task)`` tuple per
    preprocessed pair node:

    * ``text`` / ``hypothesis`` -- ``childNodes`` of the first ``<text>`` /
      ``<hypothesis>`` element of the pair,
    * ``id_number`` / ``task`` -- raw values of the ``id`` / ``task``
      attributes,
    * ``entailment`` -- the entry of an 801-slot lookup table indexed by
      ``int(id_number)``.
    """
    preprocessed_nodes = xml_util.get_pair_nodes(
        xml_util.get_dom_from_xml("data/RTE2_test.preprocessed.xml"))

    # Lookup table indexed by integer pair id (slots 0..800).
    entailment_by_id = [0] * (800 + 1)

    annotated_nodes = xml_util.get_pair_nodes(
        xml_util.get_dom_from_xml("data/RTE2_test.annotated.xml"))
    annotated_attributes = xml_util.get_attributes_from_pair_nodes(
        annotated_nodes)

    # NOTE(review): the entailment label unpacked from each annotated pair is
    # never stored -- every visited slot is set to 0, so the table stays all
    # zeros.  Confirm whether the label was meant to be stored here.
    for _text, _hypothesis, raw_id, _label in annotated_attributes:
        entailment_by_id[int(raw_id)] = 0

    attributes = []
    for node in preprocessed_nodes:
        text_children = node.getElementsByTagName("text")[0].childNodes
        hypothesis_children = node.getElementsByTagName(
            "hypothesis")[0].childNodes
        pair_id = node.getAttributeNode("id").value
        # The pair's own "entailment" attribute is not read; the value comes
        # from the lookup table instead.
        label = entailment_by_id[int(pair_id)]
        task_name = node.getAttributeNode("task").value
        attributes.append(
            (text_children, hypothesis_children, pair_id, label, task_name))
    return attributes
def readTestAttributesFromFile(self):
    """Return attribute tuples for every pair in the preprocessed test file.

    The pair nodes come from ``data/RTE2_test.preprocessed.xml``; the
    annotated ids come from ``data/RTE2_test.annotated.xml``.  Each result
    entry is ``(text, hypothesis, id_number, entailment, task)`` where
    ``text`` and ``hypothesis`` are the ``childNodes`` of the first matching
    element, ``id_number`` and ``task`` are raw attribute values, and
    ``entailment`` is read from a table indexed by ``int(id_number)``.
    """
    test_pair_nodes = xml_util.get_pair_nodes(
        xml_util.get_dom_from_xml("data/RTE2_test.preprocessed.xml"))

    # One slot per possible pair id, 0 through 800 inclusive.
    labels = [0] * (800 + 1)

    annotated_doc = xml_util.get_dom_from_xml("data/RTE2_test.annotated.xml")
    annotated = xml_util.get_attributes_from_pair_nodes(
        xml_util.get_pair_nodes(annotated_doc))

    # NOTE(review): the annotated entailment value is unpacked but discarded;
    # each slot is overwritten with 0.  Confirm this is intended.
    for _t, _h, annotated_id, _e in annotated:
        labels[int(annotated_id)] = 0

    def as_tuple(node):
        """Collect the five attributes of one preprocessed pair node."""
        text = node.getElementsByTagName("text")[0].childNodes
        hypothesis = node.getElementsByTagName("hypothesis")[0].childNodes
        id_number = node.getAttributeNode("id").value
        # Looked up in the table rather than read from the node itself.
        entailment = labels[int(id_number)]
        task = node.getAttributeNode("task").value
        return (text, hypothesis, id_number, entailment, task)

    return [as_tuple(node) for node in test_pair_nodes]
def readProcessedAttributesFromFile(self):
    """Return the pair attributes from ``data/RTE2_dev.preprocessed.xml``.

    Be aware that reading the preprocessed XML file takes quite some time.
    """
    return xml_util.get_attributes_from_preprocessed_pair_nodes(
        xml_util.get_pair_nodes(
            xml_util.get_dom_from_xml("data/RTE2_dev.preprocessed.xml")))
def threshold_iterator(threshold):
    """Run ``word_matching`` over the pairs in ``data/RTE2_dev.xml``.

    If ``threshold`` equals -1, ``word_matching`` is called 100 times with
    thresholds starting at 1.0 and decreasing in steps of 0.01; otherwise it
    is called once with the given ``threshold``.
    """
    pairs = xml_util.get_attributes_from_pair_nodes(
        xml_util.get_pair_nodes(
            xml_util.get_dom_from_xml("data/RTE2_dev.xml")))

    if threshold == -1:
        # -1 is the sentinel requesting a sweep over all candidate values.
        candidates = [1.0 - (0.01 * step) for step in range(100)]
    else:
        candidates = [threshold]

    for candidate in candidates:
        word_matching(candidate, pairs)
def threshold_iterator(threshold):
    """Evaluate ``word_matching`` on the pairs read from ``data/RTE2_dev.xml``.

    A ``threshold`` of -1 requests a sweep: ``word_matching`` is invoked for
    100 thresholds, 1.0 first and then 0.01 lower each time.  Any other
    value results in a single call with that threshold.
    """
    dev_document = xml_util.get_dom_from_xml("data/RTE2_dev.xml")
    dev_pairs = xml_util.get_attributes_from_pair_nodes(
        xml_util.get_pair_nodes(dev_document))

    if threshold == -1:
        for step in range(100):
            word_matching(1.0 - (0.01 * step), dev_pairs)
        return

    word_matching(threshold, dev_pairs)
def threshold_iterator(threshold):
    """Run ``idf_weighting`` over ``data/RTE2_dev.preprocessed.xml``.

    If ``threshold`` equals -1, ``idf_weighting`` is called 100 times with
    thresholds starting at 1.0 and decreasing in steps of 0.01; otherwise it
    is called once with the given ``threshold``.
    """
    pairs = xml_util.get_attributes_from_preprocessed_pair_nodes(
        xml_util.get_pair_nodes(
            xml_util.get_dom_from_xml("data/RTE2_dev.preprocessed.xml")))

    if threshold == -1:
        # -1 is the sentinel requesting a sweep over all candidate values.
        candidates = [1.0 - (0.01 * step) for step in range(100)]
    else:
        candidates = [threshold]

    for candidate in candidates:
        idf_weighting(candidate, pairs)
def threshold_iterator(threshold):
    """Evaluate ``idf_weighting`` on the preprocessed pairs.

    The pairs are read from ``data/RTE2_dev.preprocessed.xml``.  A
    ``threshold`` of -1 requests a sweep: ``idf_weighting`` is invoked for
    100 thresholds, 1.0 first and then 0.01 lower each time.  Any other
    value results in a single call with that threshold.
    """
    preprocessed_doc = xml_util.get_dom_from_xml(
        "data/RTE2_dev.preprocessed.xml")
    preprocessed_pairs = xml_util.get_attributes_from_preprocessed_pair_nodes(
        xml_util.get_pair_nodes(preprocessed_doc))

    if threshold == -1:
        for step in range(100):
            idf_weighting(1.0 - (0.01 * step), preprocessed_pairs)
        return

    idf_weighting(threshold, preprocessed_pairs)
def threshold_iterator(threshold):
    """Run ``bleu_matching`` over ``data/RTE2_dev.preprocessed.xml``.

    Be aware that reading the preprocessed XML file takes quite some time.

    If ``threshold`` equals -1, ``bleu_matching`` is called 100 times with
    thresholds starting at 1.0 and decreasing in steps of 0.01; otherwise it
    is called once with the given ``threshold``.
    """
    pairs = xml_util.get_attributes_from_preprocessed_pair_nodes(
        xml_util.get_pair_nodes(
            xml_util.get_dom_from_xml("data/RTE2_dev.preprocessed.xml")))

    if threshold == -1:
        # -1 is the sentinel requesting a sweep over all candidate values.
        candidates = [1.0 - (0.01 * step) for step in range(100)]
    else:
        candidates = [threshold]

    for candidate in candidates:
        bleu_matching(candidate, pairs)
def threshold_iterator(threshold):
    """Evaluate ``lemma_matching`` on the preprocessed pairs.

    The pairs are read from ``data/RTE2_dev.preprocessed.xml``; be aware
    that reading this file takes quite some time.  A ``threshold`` of -1
    requests a sweep: ``lemma_matching`` is invoked for 100 thresholds, 1.0
    first and then 0.01 lower each time.  Any other value results in a
    single call with that threshold.
    """
    preprocessed_doc = xml_util.get_dom_from_xml(
        "data/RTE2_dev.preprocessed.xml")
    preprocessed_pairs = xml_util.get_attributes_from_preprocessed_pair_nodes(
        xml_util.get_pair_nodes(preprocessed_doc))

    if threshold == -1:
        for step in range(100):
            lemma_matching(1.0 - (0.01 * step), preprocessed_pairs)
        return

    lemma_matching(threshold, preprocessed_pairs)
def threshold_iterator(threshold):
    """
    Extract data from the XML file and calculate distances.

    Reads ``data/RTE2_dev.preprocessed.xml``, rebuilds the module-level
    ``idf_dict`` from the pair attributes, and computes one normalized tree
    edit distance per pair.  Each distance is the text/hypothesis tree
    distance divided by the distance between a lone ``Node("root")`` and the
    hypothesis tree, both measured with ``idf_cost``.

    If ``threshold`` equals -1, ``syntax_matching`` is called for 200
    thresholds starting at 1.0 and decreasing in steps of 0.005, so the
    results can be compared to find the best threshold.  Otherwise
    ``syntax_matching`` is called once with the given ``threshold``.
    """
    global idf_dict

    dom_doc = xml_util.get_dom_from_xml("data/RTE2_dev.preprocessed.xml")
    # NOTE(review): other loaders call xml_util.get_pair_nodes; confirm that
    # get_pairs is the intended helper here.
    pairs = xml_util.get_pairs(dom_doc)
    pair_attributes = xml_util.get_attributes_from_preprocessed_pair_nodes(
        pairs)

    # idf_dict is a module-level global and must be assigned before any
    # distance is computed with idf_cost below.
    idf_dict = calculate_idf_dictionary(pair_attributes)
    # Parenthesised form prints the same value under Python 2 and Python 3.
    print(len(idf_dict.keys()))

    # Extract the minipar values of the text and hypothesis of every pair.
    tree_value_pairs = [
        (xml_util.get_minipar_values_from_text_node(text),
         xml_util.get_minipar_values_from_text_node(hypothesis))
        for text, hypothesis, _id_num, _entailment, _task in pair_attributes
    ]

    # Calculate the normalized distance between text and hypothesis.
    distances = []
    for value_pair in tree_value_pairs:
        t_tree, h_tree = build_tree(value_pair)
        dist = tree_edit_dist.distance(t_tree, h_tree, idf_cost)
        normalizer = tree_edit_dist.distance(
            tree_edit_dist.Node("root"), h_tree, idf_cost)
        # NOTE(review): a normalizer of 0 raises ZeroDivisionError -- confirm
        # that this cannot happen for the trees built here.
        distances.append(float(dist) / float(normalizer))

    if threshold == -1:
        # -1 is the sentinel requesting a sweep over all candidate values.
        for step in range(200):
            syntax_matching(pair_attributes, distances, 1.0 - (0.005 * step))
    else:
        syntax_matching(pair_attributes, distances, threshold)
def readAttributesFromFile(self):
    """Return the pair attributes parsed from ``data/RTE2_dev.xml``."""
    return xml_util.get_attributes_from_pair_nodes(
        xml_util.get_pair_nodes(
            xml_util.get_dom_from_xml("data/RTE2_dev.xml")))
def readProcessedAttributesFromFile(self):
    """Load the attributes of every pair in the preprocessed XML file.

    The file read is ``data/RTE2_dev.preprocessed.xml``; be aware that
    reading it takes quite some time.
    """
    preprocessed_path = "data/RTE2_dev.preprocessed.xml"
    nodes = xml_util.get_pair_nodes(xml_util.get_dom_from_xml(preprocessed_path))
    return xml_util.get_attributes_from_preprocessed_pair_nodes(nodes)