def from_token(token: Token) -> 'MarkovWord':
    """Build a fresh MarkovWord from a spaCy token.

    The word is flagged as compound when the capitalization analyzer
    classifies the token as COMPOUND; it starts with no neighbors.
    """
    # Assign the comparison result directly instead of an if/else that
    # sets True/False by hand.
    compound = (CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES)
                == CapitalizationMode.COMPOUND)
    return MarkovWord(token.text, Pos.from_token(token), compound=compound, neighbors={})
def from_token(token: Token) -> 'MarkovNeighbor':
    """Build a zero-initialized MarkovNeighbor from a spaCy token.

    The neighbor's key is the lowercased token text; its co-occurrence
    values and distance histogram start at zero. The distance histogram
    covers offsets -MARKOV_WINDOW_SIZE..+MARKOV_WINDOW_SIZE (hence the
    2 * window + 1 slots).
    """
    key = token.text.lower()
    # Assign the comparison result directly instead of an if/else that
    # sets True/False by hand.
    compound = (CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES)
                == CapitalizationMode.COMPOUND)
    pos = Pos.from_token(token)
    values = [0, 0]
    dist = [0] * (MARKOV_WINDOW_SIZE * 2 + 1)
    return MarkovNeighbor(key, token.text, pos, compound, values, dist)
def preprocess(self, doc: Doc) -> bool:
    """Convert a parsed document into training sequences and labels.

    Walks every token of every sentence, analyzing it into a structure
    feature item. Each recorded training example is the rolling window of
    the *previous* items (offset by one) with the current item as its
    label, so the model learns to predict the next item. After each
    sentence an EOS marker is recorded the same way.

    Returns False without processing when the training set has already
    reached STRUCTURE_MODEL_TRAINING_MAX_SIZE, True otherwise.
    """
    if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
        return False

    sequence = []
    previous_item = None
    for sentence in doc.sents:
        # Re-check the cap between sentences so a long document cannot
        # overshoot the training-set size by more than one sentence.
        if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
            return False

        for token in sentence:
            item = StructureFeatureAnalyzer.analyze(
                token,
                CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES))
            sequence = self._record_step(sequence, previous_item, item)
            previous_item = item

        # Handle EOS after each sentence.
        item = PoSCapitalizationMode(Pos.EOS, CapitalizationMode.NONE).to_embedding()
        sequence = self._record_step(sequence, previous_item, item)
        previous_item = item

    return True

def _record_step(self, sequence: list, previous_item, label) -> list:
    """Append one training example and return the trimmed window.

    Offsets data by one: the window holds previous_item (or a NONE
    padding embedding when the window is still empty, which also guards
    against recording a literal None before any token was seen), while
    `label` — the current item — is what the model should predict next.
    """
    if len(sequence) == 0:
        sequence.append(
            PoSCapitalizationMode(Pos.NONE, CapitalizationMode.NONE).to_embedding())
    else:
        sequence.append(previous_item)
    # We only want the latest SEQUENCE_LENGTH items.
    sequence = sequence[-StructureModel.SEQUENCE_LENGTH:]
    self.data.append(sequence.copy())
    self.labels.append(label)
    return sequence
def preprocess(self, doc: Doc) -> bool:
    """Turn a parsed document into next-item training pairs.

    For every token (and a per-sentence EOS marker) the current rolling
    window — containing the items *before* it — is stored in self.data
    and the item itself becomes the label in self.labels, so each label
    is the continuation of its window. Returns False when the training
    set is already at STRUCTURE_MODEL_TRAINING_MAX_SIZE, else True.
    """
    if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
        return False

    window = []
    prev = None
    for sentence in doc.sents:
        # Stop between sentences once the training set is full.
        if len(self.data) >= STRUCTURE_MODEL_TRAINING_MAX_SIZE:
            return False

        for token in sentence:
            mode = CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES)
            current = StructureFeatureAnalyzer.analyze(token, mode)
            # Offset by one: pad with a NONE embedding the very first
            # time, otherwise push the previous item into the window.
            window.append(
                prev if window
                else PoSCapitalizationMode(Pos.NONE, CapitalizationMode.NONE).to_embedding())
            # Keep only the newest SEQUENCE_LENGTH entries.
            window = window[-StructureModel.SEQUENCE_LENGTH:]
            self.data.append(window.copy())
            self.labels.append(current)
            prev = current

        # Record an end-of-sentence marker after each sentence.
        eos = PoSCapitalizationMode(Pos.EOS, CapitalizationMode.NONE).to_embedding()
        window.append(prev)
        window = window[-StructureModel.SEQUENCE_LENGTH:]
        self.data.append(window.copy())
        self.labels.append(eos)
        prev = eos

    return True
def from_token(token: Token) -> 'MarkovWord':
    """Construct a MarkovWord for a spaCy token with no neighbors yet."""
    mode = CapitalizationMode.from_token(token, CAPITALIZATION_COMPOUND_RULES)
    is_compound = mode == CapitalizationMode.COMPOUND
    part_of_speech = Pos.from_token(token)
    return MarkovWord(token.text, part_of_speech, compound=is_compound, neighbors={})