Python all_partitions Examples

Programming Language: Python

Namespace/Package Name: langtools.utils.useful

Method/Function: all_partitions

Examples at hotexamples.com: 2

Python all_partitions - 2 examples found. These are the top-rated real-world Python examples of langtools.utils.useful.all_partitions extracted from open source projects. You can rate examples to help us improve the quality of the examples.

Example #1

Show file

File: select_ner_training_hu.py Project: hlt-bme-hu/hunNERwiki

    def __add_tmp(self):
        """Adds the temporary chunk to the output sentence.

        Flushes the tokens buffered in ``self.tmp`` into ``self._sent``
        according to ``self._mode`` and then empties the buffer.  Does nothing
        if the buffer is empty.

        - ``NER_LINK``: trailing tokens whose lemma is in ``self._link_punct``
          are split off.  The remaining tokens are appended with B/I marks
          under the current NER type and registered in the trie as an anchor;
          the trailing punctuation is appended with the zero NER type.
        - ``NO_LINK``: the tokens are appended unchanged.
        - ``NNP_LINK``: for chunks of at most 8 tokens the partitions yielded
          by ``all_partitions`` are tried in order, and the first one whose
          parts all have a known trie category is emitted, one B/I-marked
          entity per part.  If no partition qualifies, or the chunk is longer,
          ``__unknown_nnp_link`` is called instead.
        """
        if not self.tmp:
            return

        if self._mode == NERTrainingCallback.NER_LINK:
            # The link may end with punctuation marks -- let's remove them!
            puncts = 0
            for token in reversed(self.tmp):
                if token[NERTrainingCallback.LEMMA] in self._link_punct:
                    puncts += 1
                else:
                    break
            link_end = len(self.tmp) - puncts
            link = self.tmp[:link_end]

            # Add the link stripped of punctuation marks
            for i, attributes in enumerate(link):
                self._sent.bi = 'B' if i == 0 else 'I'
                self._sent.append(attributes)
            self._trie.add_anchor(link, self._sent.ner_type)

            # And the rest as regular text
            self._sent.ner_type = zero_ner_type()
            for attributes in self.tmp[link_end:]:
                self._sent.append(attributes)

        elif self._mode == NERTrainingCallback.NO_LINK:
            for attributes in self.tmp:
                self._sent.append(attributes)

        elif self._mode == NERTrainingCallback.NNP_LINK:
            # Only short chunks are partitioned (presumably to bound the number
            # of partitions that have to be tried -- confirm against
            # all_partitions).
            if len(self.tmp) <= 8:
                for partition in all_partitions(self.tmp):
                    # Must be reset for every partition: a flag left over from
                    # a rejected partition would otherwise make the first part
                    # of a later, fully valid partition lose its category.
                    sentence_start_non_nnp = False
                    categories = [self._trie.get_category(part)[1] for part in partition]
                    for i, category in enumerate(categories):
                        # Compare by value; identity comparison with a string
                        # literal is not reliable.
                        if category is None or category == 'UNK':
                            # Invalid NNP, UNLESS the first word is sentence starter
                            if (len(self._sent.sentence) == 0 and i == 0 and
                                len(partition[i]) == 1 and
                                not self.__has_noun(partition[i])):
                                sentence_start_non_nnp = True
                            else:
                                break
                    else:
                        # Every part is acceptable: emit this partition.
                        for i, part in enumerate(partition):
                            if i == 0 and sentence_start_non_nnp:
                                # Sentence-starting non-noun: regular text
                                self._sent.ner_type = zero_ner_type()
                                for word in part:
                                    self._sent.append(word)
                            else:
                                self._sent.ner_type = categories[i]
                                for j, word in enumerate(part):
                                    self._sent.bi = 'B' if j == 0 else 'I'
                                    self._sent.append(word)
                        break
                else:  # no partition was acceptable
                    self.__unknown_nnp_link()
            else:  # chunk longer than 8 tokens
                self.__unknown_nnp_link()

        self.tmp = []

Example #2

Show file

File: select_ner_training_en.py Project: hlt-bme-hu/hunNERwiki

    def __add_tmp(self):
        """Adds the temporary chunk to the output sentence.

        Flushes the tokens buffered in ``self.tmp`` into ``self._sent``
        according to ``self._mode`` and then empties the buffer.  Does nothing
        if the buffer is empty.

        - ``NER_LINK``: ``__partition_candidate`` returns a ``(begin, last)``
          pair of offsets into the chunk.  Tokens before ``begin`` and from
          ``last`` onwards are appended with the zero NER type; the tokens in
          between are appended with B/I marks as the entity and registered in
          the trie as an anchor.  An entity whose last token has a POS tag
          starting with ``J`` and whose raw form does not occur in its link
          target is relabelled ``MISC``.  Links of type ``UNK`` increase
          ``self._sent.links_lost``.
        - ``NO_LINK``: the tokens are appended unchanged.
        - ``NNP_LINK``: for chunks of at most 8 tokens the partitions yielded
          by ``all_partitions`` are tried in order, and the first acceptable
          one is emitted part by part, each part being split into
          title / entity / rest like a link.  If no partition qualifies, or
          the chunk is longer, ``__unknown_nnp_link`` is called instead.
        """
        if not self.tmp:
            return

        if self._mode == NERTrainingCallback.NER_LINK:
            begin, last = self.__partition_candidate(self.tmp, self._sent.ner_type)

            if last != 0:
                # Add the title as regular text
                if begin != 0:
                    tmp_type = self._sent.ner_type
                    self._sent.ner_type = zero_ner_type()
                    for attributes in self.tmp[0 : begin]:
                        self._sent.append(attributes)
                    self._sent.ner_type = tmp_type

                # Add the real entity part
                if begin != last:
                    last_token = self.tmp[last - 1]
                    # If the anchor link is an adjective, and the last word does not
                    # occur in the link target, then it is a derivative form of the
                    # entity and must be a MISC according to ConLL guidelines
                    if (last_token[NERTrainingCallback.POS].startswith(u'J')
                        and last_token[NERTrainingCallback.RAW].lower()
                            not in last_token[NERTrainingCallback.LINK].lower()
                        and self._sent.ner_type != 'UNK'
                        and self._sent.ner_type != zero_ner_type()):
                        sys.stderr.write("Adj entity: {0}\n".format(self.tmp[0 : last]))
                        self._sent.ner_type = 'MISC'
                    # Add the entity tokens with B/I marks
                    entity = self.tmp[begin : last]
                    for i, attributes in enumerate(entity):
                        self._sent.bi = 'B' if i == 0 else 'I'
                        self._sent.append(attributes)
                    self._trie.add_anchor(entity, self._sent.ner_type)

                # Unknown link: we must throw the sentence away
                if self._sent.ner_type == 'UNK':
                    self._sent.links_lost += 1

            # And the rest as regular text
            self._sent.ner_type = zero_ner_type()
            for attributes in self.tmp[last:]:
                self._sent.append(attributes)

        elif self._mode == NERTrainingCallback.NO_LINK:
            for attributes in self.tmp:
                self._sent.append(attributes)

        elif self._mode == NERTrainingCallback.NNP_LINK:
            # Only short chunks are partitioned (presumably to bound the number
            # of partitions that have to be tried -- confirm against
            # all_partitions).
            if len(self.tmp) <= 8:
                for partition in all_partitions(self.tmp):
                    # Must be reset for every partition: a flag left over from
                    # a rejected partition would otherwise make the first part
                    # of a later, fully valid partition lose its category.
                    sentence_start_non_nnp = False
                    categories = [self._trie.get_category(part)[1] for part in partition]
                    for i, category in enumerate(categories):
                        if category == 'UNK':
                            # Invalid NNP, UNLESS the first word is sentence starter
                            if (len(self._sent.sentence) == 0 and i == 0 and
                                len(partition[i]) == 1 and
                                not self.__has_noun(partition[i])):
                                sentence_start_non_nnp = True
                            else:
                                # The part as a whole is unknown; see whether
                                # its candidate sub-span is known to the trie.
                                begin, last = self.__partition_candidate(partition[i], category)
                                if begin != last:
                                    candidate_category = self._trie.get_category(
                                        partition[i][begin : last])[1]
                                    if (candidate_category == 'UNK'
                                            or (begin > 0 and candidate_category != 'PER')):
                                        break
                    else:
                        # Every part is acceptable: emit this partition.
                        for i, part in enumerate(partition):
                            # At the beginning of a sentence, and the first word
                            # is not an NN(P)
                            if i == 0 and sentence_start_non_nnp:
                                self._sent.ner_type = zero_ner_type()
                                for word in part:
                                    self._sent.append(word)
                            # The rest of the partitions
                            else:
                                self._sent.ner_type = categories[i]
                                # Use this part's own category, not whatever a
                                # previous loop left behind in a local variable.
                                begin, last = self.__partition_candidate(part, categories[i])

                                # Add the title as regular text.  begin and last
                                # are offsets into part, so part (not self.tmp)
                                # is what has to be sliced here and below.
                                if begin != 0:
                                    self._sent.ner_type = zero_ner_type()
                                    for attributes in part[0 : begin]:
                                        self._sent.append(attributes)

                                # Add the real entity part
                                if begin != last:
                                    entity = part[begin : last]
                                    link, entity_category = self._trie.get_category(entity)
                                    link = u" ".join(link)
                                    # If the anchor link is an adjective, and the last word does not
                                    # occur in the link target, then it is a derivative form of the
                                    # entity and must be a MISC according to ConLL guidelines
                                    if (part[last - 1][NERTrainingCallback.POS].startswith(u'J')
                                        and part[last - 1][NERTrainingCallback.RAW].lower()
                                            not in link.lower()
                                        and entity_category != 'UNK'):
                                        sys.stderr.write("Adj entity: {0}\n".format(part[0 : last]))
                                        entity_category = 'MISC'

                                    self._sent.ner_type = entity_category
                                    # Add the entity tokens with B/I marks; a
                                    # separate index keeps the partition index
                                    # i intact.
                                    for j, attributes in enumerate(entity):
                                        self._sent.bi = 'B' if j == 0 else 'I'
                                        self._sent.append(attributes)

                                    if self._sent.ner_type == 'UNK':
                                        self._sent.links_lost += 1

                                # And the rest as regular text
                                self._sent.ner_type = zero_ner_type()
                                for attributes in part[last:]:
                                    self._sent.append(attributes)

                        # We are done, let's break
                        break
                else:  # no partition was acceptable
                    self.__unknown_nnp_link()
            else:  # chunk longer than 8 tokens
                self.__unknown_nnp_link()

        self.tmp = []