Example #1
    def get_auxiliaries(self, raw=False):
        """Collect the auxiliaries in this sentence, skipping any whose type cannot be determined."""
        sent_auxs = []
        for i in range(len(self)):
            try:
                if not raw and wc.is_auxiliary(self, i, AUX_LEMMAS, ALL_AUXILIARIES):
                    sent_auxs.append(Auxiliary(self.sentnum, self.words[i], self.lemmas[i], self.pos[i], i))

                elif raw and wc.is_auxiliary(self, i, AUX_LEMMAS, ALL_AUXILIARIES, raw=raw):
                    sent_auxs.append(RawAuxiliary(self.words[i], i, self.sentnum))

            except AuxiliaryHasNoTypeException:
                continue
        return sent_auxs
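
The pattern above is a per-token scan: for each position, a lemma/POS-based predicate (`wc.is_auxiliary`) decides whether to record an auxiliary at that index. Below is a self-contained sketch of the same pattern with simplified stand-ins; the `Token` tuple, the toy `AUX_LEMMAS` set, and the POS check are hypothetical illustrations, not the project's actual definitions.

    from collections import namedtuple

    # Simplified stand-ins for the project's SentDict entries and Auxiliary class.
    Token = namedtuple('Token', ['word', 'lemma', 'pos'])
    Auxiliary = namedtuple('Auxiliary', ['sentnum', 'word', 'lemma', 'pos', 'wordnum'])

    AUX_LEMMAS = {'be', 'do', 'have'}  # hypothetical subset of the real constant

    def get_auxiliaries(sentence, sentnum):
        # Record every token whose lemma is an auxiliary lemma and whose tag is verbal.
        auxs = []
        for i, tok in enumerate(sentence):
            if tok.lemma in AUX_LEMMAS and tok.pos.startswith('VB'):
                auxs.append(Auxiliary(sentnum, tok.word, tok.lemma, tok.pos, i))
        return auxs

    sent = [Token('He', 'he', 'PRP'), Token('did', 'do', 'VBD'), Token('.', '.', '.')]
    print(get_auxiliaries(sent, sentnum=1))  # one Auxiliary, for 'did' at index 1
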
Example #2
    def get_gs_auxiliaries(self, annotations, sentnum_modifier):
        """Build the gold-standard list of auxiliaries by matching annotation offsets against the raw tokenized XML."""
        parser = ET.XMLParser()
        tree = ET.parse(Files.XML_RAW_TOKENIZED + self.get_subdir() + Files.SLASH_CHAR + self.file_name[0:8] + '.xml',
                        parser=parser)
        root = tree.getroot()

        ann_num = 0
        crt_annotation = annotations[ann_num]
        raw_matrix = []
        raw_gold_auxiliaries = []
        raw_all_auxiliaries = Auxiliaries()
        raw_gold_indexes = []

        for sentence in root.iter('sentence'):
            try:
                s = SentDict(sentence, raw=True)
                for i in range(len(s)):
                    if s.offset_starts[i] == crt_annotation.vpe_offset_start:
                        raw_gold_indexes.append(len(raw_all_auxiliaries.auxs))
                        raw_gold_auxiliaries.append(RawAuxiliary(s.words[i], i, s.sentnum))
                        ann_num += 1
                        if ann_num < len(annotations):
                            crt_annotation = annotations[ann_num]

                    if wc.is_auxiliary(s, i, AUX_LEMMAS, ALL_AUXILIARIES, raw=True):
                        raw_all_auxiliaries.add_aux(RawAuxiliary(s.words[i], i, s.sentnum))

                raw_matrix.append(s)
            except EmptySentDictException:
                continue

        if len(raw_gold_auxiliaries) != len(annotations):
            print('\nAnnotations:')
            for ann in annotations:
                print(ann)
            print('Number of auxs we got: %d, number of annotations for file %s: %d.' % (
                len(raw_gold_auxiliaries), crt_annotation.file, len(annotations)))
            raise Exception('Error! When extracting the annotations using the raw data, we didn\'t get the correct number of auxiliaries!')

        """ Now that we got the auxiliaries according to their location/offsets within the raw text files (above),
            we now have to link the RawAuxiliaries with their corresponding MRG/POS XML file auxiliaries. """

        mrg_raw_type_auxiliaries = Auxiliaries()

        # We pass "raw" as true here because we want to use the same "is_auxiliary" method for the comparison.
        for sentdict in self:
            mrg_raw_type_auxiliaries.add_auxs(sentdict.get_auxiliaries(raw=True))

        gold_standard = []
        if len(raw_all_auxiliaries.auxs) == len(mrg_raw_type_auxiliaries.auxs):
            for raw_aux_idx in raw_gold_indexes:
                raw_aux = mrg_raw_type_auxiliaries.auxs[raw_aux_idx]
                aux_sentdict = self.matrix[raw_aux.sentnum - 1]
                idx = raw_aux.wordnum

                mrg_aux = Auxiliary(aux_sentdict.sentnum + sentnum_modifier, aux_sentdict.words[idx],
                                    aux_sentdict.lemmas[idx], aux_sentdict.pos[idx], idx)
                gold_standard.append(mrg_aux)

        return gold_standard
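
The first half of this method aligns gold annotations with tokens by character offset: it walks the raw tokenized sentences and, whenever a token's start offset equals the current annotation's `vpe_offset_start`, records that token as gold and advances to the next annotation. A minimal, self-contained sketch of that alignment step follows; the `Annotation` and `Token` structures and the example offsets are hypothetical simplifications of the project's objects.

    from collections import namedtuple

    Annotation = namedtuple('Annotation', ['vpe_offset_start'])
    Token = namedtuple('Token', ['word', 'offset_start'])

    def match_gold_tokens(sentences, annotations):
        """Return (sentnum, wordnum) pairs for tokens whose start offset matches the next annotation."""
        matches = []
        ann_num = 0
        for sentnum, sent in enumerate(sentences, start=1):
            for i, tok in enumerate(sent):
                if ann_num < len(annotations) and tok.offset_start == annotations[ann_num].vpe_offset_start:
                    matches.append((sentnum, i))
                    ann_num += 1
        return matches

    # 'did' starts at character offset 3 in the raw text, matching the single annotation.
    sents = [[Token('He', 0), Token('did', 3), Token('.', 6)]]
    anns = [Annotation(vpe_offset_start=3)]
    print(match_gold_tokens(sents, anns))  # -> [(1, 1)]
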
Example #3
    def make_all_the_files(self, sentdicts, word_distance_from_aux=3):
        """Write the unique word/lemma/POS vocabularies and the most frequent words near auxiliaries to their files."""
        words, lemmas, pos_tags, words_near_aux = [], [], [], []
        for sentdict in sentdicts:
            for i in range(len(sentdict)):
                if wc.is_auxiliary(sentdict, i, [], [], raw=False):
                    for j in range(max(0, i - word_distance_from_aux), min(len(sentdict), i + word_distance_from_aux + 1)):
                        if j != i:
                            words_near_aux.append(sentdict.words[j])

                words.append(sentdict.words[i])
                lemmas.append(sentdict.lemmas[i])
                pos_tags.append(sentdict.pos[i])

        words = set(words)
        lemmas = set(lemmas)
        pos_tags = set(pos_tags)

        freq_dict = {}
        for w in words_near_aux:
            if w not in freq_dict:
                freq_dict[w] = 1
            else:
                freq_dict[w] += 1

        sorted_by_freq = sorted(freq_dict.items(), key=operator.itemgetter(1))
        most_frequent_words_near_aux = [pair[0] for pair in sorted_by_freq[-number_of_words_surrounding_aux():]]

        self.make_file(self.EACH_UNIQUE_WORD_FILE, words)
        self.make_file(self.EACH_UNIQUE_LEMMA_FILE, lemmas)
        self.make_file(self.EACH_UNIQUE_POS_FILE, pos_tags)
        self.make_file(self.EACH_UNIQUE_WORD_NEAR_AUX, most_frequent_words_near_aux)
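
The frequency-ranking step above (building `freq_dict`, sorting ascending by count, then slicing off the tail) can be written more directly with `collections.Counter`. This is an equivalent sketch, not the project's code; the window size `n` stands in for `number_of_words_surrounding_aux()`, and the order of the result is descending rather than ascending.

    from collections import Counter

    def most_frequent(words_near_aux, n):
        # Counter.most_common(n) returns the n highest-count (word, count) pairs.
        return [word for word, _ in Counter(words_near_aux).most_common(n)]

    print(most_frequent(['do', 'not', 'do', 'so', 'do', 'not'], 2))  # -> ['do', 'not']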