Example 1
0
 def from_token(token: Token) -> 'MarkovWord':
     """Build a MarkovWord from a token.

     The word is flagged as compound when the token's capitalization mode,
     evaluated under CAPITALIZATION_COMPOUND_RULES, is COMPOUND. The word
     starts with an empty neighbor map.
     """
     # Assign the boolean directly instead of an if/else that sets True/False.
     compound = CapitalizationMode.from_token(
         token,
         CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND
     return MarkovWord(token.text,
                       Pos.from_token(token),
                       compound=compound,
                       neighbors={})
Example 2
0
 def from_token(token: Token) -> 'MarkovNeighbor':
     """Create a MarkovNeighbor for *token* with zeroed statistics.

     The neighbor is keyed by the lowercased token text; `compound` is True
     when the token's capitalization mode under CAPITALIZATION_COMPOUND_RULES
     is COMPOUND.
     """
     key = token.text.lower()
     text = token.text
     # Direct boolean assignment replaces the verbose if/else True/False form.
     compound = CapitalizationMode.from_token(
         token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND
     pos = Pos.from_token(token)
     # Fresh counters: two accumulator values and a distance histogram wide
     # enough for MARKOV_WINDOW_SIZE positions on each side plus the center.
     values = [0, 0]
     dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
     return MarkovNeighbor(key, text, pos, compound, values, dist)
Example 3
0
 def from_token(token: Token) -> 'MarkovNeighbor':
     """Build a zero-initialized MarkovNeighbor from a token.

     Key is the lowercased surface text; statistics (values and the
     positional distance histogram) start at zero.
     """
     is_compound = CapitalizationMode.from_token(
         token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND
     # Histogram covers the full window on both sides plus the center slot.
     distance_histogram = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
     return MarkovNeighbor(token.text.lower(),
                           token.text,
                           Pos.from_token(token),
                           is_compound,
                           [0, 0],
                           distance_histogram)
Example 4
0
    def preprocess(self, doc: Doc) -> bool:
        """Convert *doc* into (sequence, label) training pairs.

        For each token — and for an EOS marker after every sentence — the
        current feature embedding becomes the label, while the input sequence
        is the sliding window of embeddings seen so far, offset by one step so
        each input row predicts the next item.

        Returns False as soon as the training set reaches
        STRUCTURE_MODEL_TRAINING_MAX_SIZE, True otherwise.
        """
        if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
            return False

        sequence = []
        previous_item = None
        for sentence in doc.sents:
            if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
                return False

            for token in sentence:
                item = StructureFeatureAnalyzer.analyze(
                    token,
                    CapitalizationMode.from_token(
                        token, CAPITALIZATION_COMPOUND_RULES))
                sequence, previous_item = self._record_training_step(
                    sequence, previous_item, item, pad_if_empty=True)

            # Emit an EOS marker after each sentence.
            eos_item = PoSCapitalizationMode(
                Pos.EOS, CapitalizationMode.NONE).to_embedding()
            sequence, previous_item = self._record_training_step(
                sequence, previous_item, eos_item, pad_if_empty=False)
        return True

    def _record_training_step(self, sequence, previous_item, item,
                              pad_if_empty):
        """Append one (sequence, label) pair; return the updated
        (sequence, previous_item).

        The window is offset by one: *previous_item* (or a NONE padding
        embedding when *pad_if_empty* is set and the sequence is still empty)
        is appended, the window is trimmed to the last SEQUENCE_LENGTH
        entries, and *item* becomes the label for that window.
        """
        if pad_if_empty and len(sequence) == 0:
            # Offset data by one, making label point to the next data item.
            sequence.append(
                PoSCapitalizationMode(
                    Pos.NONE, CapitalizationMode.NONE).to_embedding())
        else:
            sequence.append(previous_item)

        # We only want the latest SEQUENCE_LENGTH items.
        sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]

        self.data.append(sequence.copy())
        self.labels.append(item)
        return sequence, item
Example 5
0
    def preprocess(self, doc: Doc) -> bool:
        """Turn *doc* into training sequences and labels.

        Every token, plus one EOS marker per sentence, contributes a training
        pair: the label is the item's embedding and the input is the window
        of prior embeddings shifted by one position (each window predicts the
        next item).

        Returns False when the training set has reached
        STRUCTURE_MODEL_TRAINING_MAX_SIZE, True otherwise.
        """
        if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
            return False

        sequence = []
        previous_item = None
        for sentence in doc.sents:
            if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
                return False

            for token in sentence:
                item = StructureFeatureAnalyzer.analyze(
                    token, CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES))
                sequence, previous_item = self._append_training_pair(
                    sequence, previous_item, item, pad_start=True)

            # One EOS marker after every sentence.
            eos = PoSCapitalizationMode(Pos.EOS, CapitalizationMode.NONE).to_embedding()
            sequence, previous_item = self._append_training_pair(
                sequence, previous_item, eos, pad_start=False)
        return True

    def _append_training_pair(self, sequence, previous_item, item, pad_start):
        """Record one (window, label) pair and return the new
        (sequence, previous_item).

        Appends *previous_item* — or, when *pad_start* is set and the window
        is empty, a NONE padding embedding so the data is offset by one —
        trims the window to SEQUENCE_LENGTH, then stores a copy of the window
        with *item* as its label.
        """
        if pad_start and not sequence:
            # Offset data by one so each input window predicts the next item.
            sequence.append(PoSCapitalizationMode(Pos.NONE, CapitalizationMode.NONE).to_embedding())
        else:
            sequence.append(previous_item)

        # Keep only the most recent SEQUENCE_LENGTH entries.
        sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]

        self.data.append(sequence.copy())
        self.labels.append(item)
        return sequence, item
Example 6
0
 def from_token(token: Token) -> 'MarkovWord':
     """Construct a MarkovWord (empty neighbor map) from a token.

     `compound` is True exactly when the token's capitalization mode under
     CAPITALIZATION_COMPOUND_RULES is COMPOUND.
     """
     # Boolean comparison assigned directly — no if/else True/False needed.
     compound = CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES) == CapitalizationMode.COMPOUND
     return MarkovWord(token.text, Pos.from_token(token), compound=compound, neighbors={})