import re
from typing import Generator, Iterable, List, Tuple

# Project-local names used below (Sample, MacaAnalyzer, Preprocess, Paragraph,
# Sentence, Token, Form, uniq, is_separator_before, create_token_features) are
# assumed to be importable from the surrounding package.


def process_batch(documents: Iterable[str], maca_config: str,
                  toki_config_path: str) -> Generator[List[Sample], None, None]:
    maca_analyzer = MacaAnalyzer(maca_config, toki_config_path)
    for document_id, document in enumerate(documents):
        results = maca_analyzer._maca(document)
        for res in results:
            result = maca_analyzer._parse(res)
            sequence = []
            for form, space_before, interpretations, start, end in result:
                sample = Sample()
                sequence.append(sample)
                sample.features['token'] = form
                sample.features['tags'] = uniq([t for l, t in interpretations])
                # Strip sense identifiers from lemmas before deduplication.
                interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t)
                                   for l, t in interpretations]
                sample.features['maca_lemmas'] = [
                    (l.replace('_', ' '), t) for l, t in uniq(interpretations)
                ]
                # TODO: clean up space_before handling
                sample.features['space_before'] = (
                    ['space_before'] if space_before != 'none'
                    else ['no_space_before'])
                sample.features['space_before'].append(space_before)
                sample.features['start'] = start
                sample.features['end'] = end
                sample.features['document_id'] = document_id
            Preprocess.create_features(sequence)
            if sequence:
                yield sequence
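# Usage sketch (hedged: 'morfeusz2-nkjp' is an illustrative Maca config name,
# not verified here, and the empty toki_config_path assumes default tokenizer
# settings):
#
#   for sequence in process_batch(['Ala ma kota.'], 'morfeusz2-nkjp', ''):
#       tokens = [s.features['token'] for s in sequence]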
def process_batch_preana(
        batch: Iterable[Paragraph]) -> Generator[List[Sample], None, None]:
    # batch yields bare Paragraphs (per the annotation), so the document id
    # comes from enumerate, mirroring process_batch above.
    for document_id, paragraph in enumerate(batch):
        for sentence in paragraph:
            sequence = []
            for token in sentence:
                sample = Sample()
                sequence.append(sample)
                sample.features['token'] = token.form
                sample.features['tags'] = uniq(
                    [form.tags for form in token.interpretations])
                sample.features['maca_lemmas'] = uniq(
                    [(form.lemma, form.tags)
                     for form in token.interpretations])
                sample.features['space_before'] = (
                    ['space_before'] if token.space_before
                    else ['no_space_before'])
                sample.features['space_before'].append(token.space_before)
                sample.features['document_id'] = document_id
            Preprocess.create_features(sequence)
            if sequence:
                yield sequence
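# Usage sketch (hedged: assumes `paragraphs` is an iterable of Paragraph
# objects produced elsewhere, e.g. by a corpus reader):
#
#   for sequence in process_batch_preana(paragraphs):
#       print([s.features['token'] for s in sequence])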
def preprocess_paragraph_reanalyzed(
        paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence, sentence_gold in zip(paragraph, paragraph.concraft):
        # A sentence pair is usable for training only if the reanalysed and
        # gold sentences align token-for-token and every token has a gold form.
        valid_training_data = (
            len(sentence_gold.tokens) == len(sentence.tokens)
            and all(token.gold_form is not None for token in sentence.tokens))
        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(
                [form.tags for form in token.interpretations])
            if valid_training_data:
                sample.features['label'] = token.gold_form.tags
                sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = (
                ['space_before'] if is_separator_before(token.space_before)
                else ['no_space_before'])
            sample.features['tags4e3'] = create_token_features(
                sample.features['token'], sample.features['tags'],
                sample.features['space_before'])
        sequence_gold = []
        for token_gold in sentence_gold.tokens:
            sample = Sample()
            sequence_gold.append(sample)
            sample.features['token'] = token_gold.form
            if token_gold.gold_form is None:
                # Tokens without a gold interpretation are marked as unknown.
                sample.features['label'] = 'ign'
            else:
                sample.features['label'] = token_gold.gold_form.tags
                sample.features['lemma'] = token_gold.gold_form.lemma
            sample.features['space_before'] = (
                ['space_before']
                if is_separator_before(token_gold.space_before)
                else ['no_space_before'])
        paragraph_sequence.append((sequence, sequence_gold))
    return paragraph_sequence
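# Consumption sketch (hedged: train_on_pair is a hypothetical consumer, shown
# only to illustrate how the (analysed, gold) pairs are meant to be used):
#
#   for sequence, sequence_gold in preprocess_paragraph_reanalyzed(paragraph):
#       train_on_pair(sequence, sequence_gold)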
def analyze(self, text: str) -> Paragraph:
    results = self._maca(text)
    paragraph_reanalyzed = Paragraph()
    for res in results:
        result = self._parse(res)
        sentence_reanalyzed = Sentence()
        paragraph_reanalyzed.add_sentence(sentence_reanalyzed)
        for form, space_before, interpretations, start, end in result:
            token_reanalyzed = Token()
            sentence_reanalyzed.add_token(token_reanalyzed)
            token_reanalyzed.form = form
            # Kept as the raw Maca value, not reduced to a bool (!= 'none').
            token_reanalyzed.space_before = space_before
            # Strip sense identifiers from lemmas.
            interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t)
                               for l, t in interpretations]
            token_reanalyzed.interpretations = [
                Form(l.replace('_', ' '), t) for l, t in uniq(interpretations)
            ]
            token_reanalyzed.start = start
            token_reanalyzed.end = end
    return paragraph_reanalyzed
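# Usage sketch (hedged: assumes an already constructed MacaAnalyzer instance;
# Paragraph and Sentence are iterable, as in process_batch_preana above):
#
#   paragraph = maca_analyzer.analyze('Ala ma kota.')
#   forms = [token.form for sentence in paragraph for token in sentence]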
def preprocess_paragraph_preanalyzed(
        paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence in paragraph:
        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(
                [form.tags for form in token.interpretations])
            # Pre-analyzed data is assumed to carry a gold form on every token.
            sample.features['label'] = token.gold_form.tags
            sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = (
                ['space_before'] if is_separator_before(token.space_before)
                else ['no_space_before'])
            sample.features['tags4e3'] = create_token_features(
                sample.features['token'], sample.features['tags'],
                sample.features['space_before'])
        # No separate gold sequence here: the analysed and gold sequences
        # are one and the same.
        paragraph_sequence.append((sequence, sequence))
    return paragraph_sequence
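# Usage sketch (hedged: mirrors the reanalyzed variant; here both elements of
# each pair refer to the same sequence object):
#
#   for sequence, sequence_gold in preprocess_paragraph_preanalyzed(paragraph):
#       assert sequence is sequence_gold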