def get_auxiliaries(self, raw=False):
    """Return the auxiliaries found in this sentence.

    With raw=False, build full Auxiliary objects (word, lemma, POS tag);
    with raw=True, build lightweight RawAuxiliary objects that carry only
    the word and its position.
    """
    sent_auxs = []
    for i in range(len(self)):
        try:
            if not raw and wc.is_auxiliary(self, i, AUX_LEMMAS, ALL_AUXILIARIES):
                sent_auxs.append(Auxiliary(self.sentnum, self.words[i],
                                           self.lemmas[i], self.pos[i], i))
            elif raw and wc.is_auxiliary(self, i, AUX_LEMMAS, ALL_AUXILIARIES, raw=raw):
                sent_auxs.append(RawAuxiliary(self.words[i], i, self.sentnum))
        except AuxiliaryHasNoTypeException:
            # Skip words whose auxiliary type cannot be determined.
            continue
    return sent_auxs
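# A minimal usage sketch (an illustration, not code from this repo: `section`
# as an iterable of SentDicts, and the Auxiliary attribute names `word` and
# `pos` below, are assumptions; `sentnum` and `wordnum` follow the usage in
# get_gs_auxiliaries):
#
#     for sentdict in section:
#         for aux in sentdict.get_auxiliaries():
#             print('sentence %d, word %d: %s/%s'
#                   % (aux.sentnum, aux.wordnum, aux.word, aux.pos))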
def get_gs_auxiliaries(self, annotations, sentnum_modifier):
    """Extract the gold-standard auxiliaries for this file.

    First locate each annotated VPE trigger in the raw, tokenized XML by
    character offset, then map those RawAuxiliaries back onto the
    corresponding auxiliaries from the MRG/POS XML parse.
    """
    parser = ET.XMLParser()
    tree = ET.parse(Files.XML_RAW_TOKENIZED + self.get_subdir() + Files.SLASH_CHAR
                    + self.file_name[0:8] + '.xml', parser=parser)
    root = tree.getroot()

    ann_num = 0
    crt_annotation = annotations[ann_num]

    raw_matrix = []
    raw_gold_auxiliaries = []
    raw_all_auxiliaries = Auxiliaries()
    raw_gold_indexes = []

    for sentence in root.iter('sentence'):
        try:
            s = SentDict(sentence, raw=True)
            for i in range(len(s)):
                # A word whose character offset matches the annotation's
                # offset is a gold-standard VPE trigger.
                if s.offset_starts[i] == crt_annotation.vpe_offset_start:
                    raw_gold_indexes.append(len(raw_all_auxiliaries.auxs))
                    raw_gold_auxiliaries.append(RawAuxiliary(s.words[i], i, s.sentnum))
                    ann_num += 1
                    if ann_num < len(annotations):
                        crt_annotation = annotations[ann_num]
                if wc.is_auxiliary(s, i, AUX_LEMMAS, ALL_AUXILIARIES, raw=True):
                    raw_all_auxiliaries.add_aux(RawAuxiliary(s.words[i], i, s.sentnum))
            raw_matrix.append(s)
        except EmptySentDictException:
            continue

    if len(raw_gold_auxiliaries) != len(annotations):
        print('\nAnnotations:')
        for ann in annotations:
            print(ann)
        print('Number of auxs we got: %d, number of annotations for file %s: %d.'
              % (len(raw_gold_auxiliaries), crt_annotation.file, len(annotations)))
        raise Exception('Error! When extracting the annotations using the raw data, '
                        'we did not get the correct number of auxiliaries!')

    # Now that we have the auxiliaries located by their character offsets in
    # the raw text files (above), link each RawAuxiliary to its counterpart
    # from the MRG/POS XML parse.
    mrg_raw_type_auxiliaries = Auxiliaries()

    # Pass raw=True here so that the same "is_auxiliary" test is applied on
    # both sides of the comparison.
    for sentdict in self:
        mrg_raw_type_auxiliaries.add_auxs(sentdict.get_auxiliaries(raw=True))

    gold_standard = []
    if len(raw_all_auxiliaries.auxs) == len(mrg_raw_type_auxiliaries.auxs):
        # The two lists are parallel, so the gold indexes found in the raw
        # data select the matching MRG auxiliaries directly. If the counts
        # disagree, we fall through and return the empty list.
        for raw_aux_idx in raw_gold_indexes:
            raw_aux = mrg_raw_type_auxiliaries.auxs[raw_aux_idx]
            aux_sentdict = self.matrix[raw_aux.sentnum - 1]
            idx = raw_aux.wordnum
            mrg_aux = Auxiliary(aux_sentdict.sentnum + sentnum_modifier,
                                aux_sentdict.words[idx], aux_sentdict.lemmas[idx],
                                aux_sentdict.pos[idx], idx)
            gold_standard.append(mrg_aux)
    return gold_standard
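# Illustration of the offset match above (a sketch with made-up numbers, not
# data from the corpus): if a raw sentence is "He did too ." with
# offset_starts == [0, 3, 7, 11] and an annotation has
# vpe_offset_start == 3, then word index 1 ("did") is recorded as a
# gold-standard trigger, and its position among *all* raw auxiliaries is
# saved in raw_gold_indexes so that the parallel MRG-side list can be
# indexed the same way afterwards.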
def make_all_the_files(self, sentdicts, word_distance_from_aux=3):
    """Write the unique-word, lemma, POS-tag, and near-aux-word files.

    Collects every word, lemma, and POS tag that occurs at an auxiliary,
    plus the words within word_distance_from_aux positions of one.
    """
    words, lemmas, pos_tags, words_near_aux = [], [], [], []
    for sentdict in sentdicts:
        for i in range(len(sentdict)):
            if wc.is_auxiliary(sentdict, i, [], [], raw=False):
                # Collect the words in a window around the auxiliary,
                # excluding the auxiliary itself.
                for j in range(max(0, i - word_distance_from_aux),
                               min(len(sentdict), i + word_distance_from_aux + 1)):
                    if j != i:
                        words_near_aux.append(sentdict.words[j])
                words.append(sentdict.words[i])
                lemmas.append(sentdict.lemmas[i])
                pos_tags.append(sentdict.pos[i])

    words = set(words)
    lemmas = set(lemmas)
    pos_tags = set(pos_tags)

    # Count how often each word appears near an auxiliary, then keep the
    # most frequent ones.
    freq_dict = {}
    for w in words_near_aux:
        freq_dict[w] = freq_dict.get(w, 0) + 1

    sorted_by_freq = sorted(freq_dict.items(), key=operator.itemgetter(1))
    most_frequent_words_near_aux = [
        pair[0] for pair in sorted_by_freq[-number_of_words_surrounding_aux():]]

    self.make_file(self.EACH_UNIQUE_WORD_FILE, words)
    self.make_file(self.EACH_UNIQUE_LEMMA_FILE, lemmas)
    self.make_file(self.EACH_UNIQUE_POS_FILE, pos_tags)
    self.make_file(self.EACH_UNIQUE_WORD_NEAR_AUX, most_frequent_words_near_aux)
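# The frequency ranking above can also be expressed with the standard
# library; a sketch of the equivalent computation (note that
# Counter.most_common returns words in descending frequency order, whereas
# the slice above yields them ascending):
#
#     from collections import Counter
#     n = number_of_words_surrounding_aux()
#     most_frequent_words_near_aux = [w for w, _ in
#                                     Counter(words_near_aux).most_common(n)]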