Example #1
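All five snippets appear to come from the same tagging pipeline module. None of them shows its imports, so the following header is inferred from the names the code uses; the library-internal classes listed in the comment are assumptions, not shown in the source:

    import re
    from typing import Generator, Iterable, List, Tuple

    # Assumed to be provided by the surrounding library:
    # MacaAnalyzer, Sample, Paragraph, Sentence, Token, Form, Preprocess,
    # uniq, is_separator_before, create_token_features
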
    def process_batch(documents: Iterable[str], maca_config: str, toki_config_path: str) -> Generator[
        List[Sample], None, None]:
        maca_analyzer = MacaAnalyzer(maca_config, toki_config_path)

        for document_id, document in enumerate(documents):
            results = maca_analyzer._maca(document)

            for res in results:
                result = maca_analyzer._parse(res)

                sequence = []
                for form, space_before, interpretations, start, end in result:
                    sample = Sample()
                    sequence.append(sample)
                    sample.features['token'] = form
                    sample.features['tags'] = uniq([t for l, t in interpretations])
                    # strip trailing sense markers such as ':s1' from lemmas
                    interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t)
                                       for l, t in interpretations]
                    sample.features['maca_lemmas'] = [(l.replace('_', ' '), t) for l, t in uniq(interpretations)]

                    # TODO: cleanup space before
                    sample.features['space_before'] = (['space_before'] if space_before != 'none'
                                                       else ['no_space_before'])
                    sample.features['space_before'].append(space_before)
                    sample.features['start'] = start
                    sample.features['end'] = end
                    sample.features['document_id'] = document_id
                Preprocess.create_features(sequence)

                if sequence:
                    yield sequence
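
A minimal usage sketch for process_batch. The Preprocess.create_features call above suggests the method lives on a Preprocess class, and the Maca config name below is a placeholder; both are assumptions, not confirmed by the snippet:

    documents = ['Ala ma kota.', 'Kot ma Ale.']
    # 'morfeusz-nkjp-official' is a placeholder; pass whatever Maca
    # configuration (and optional Toki config path) is installed locally.
    for sequence in Preprocess.process_batch(documents, 'morfeusz-nkjp-official', ''):
        for sample in sequence:
            print(sample.features['token'], sample.features['maca_lemmas'])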
Example #2
    def process_batch_preana(batch: Iterable[Paragraph]) -> Generator[List[Sample], None, None]:
        for document_id, paragraph in enumerate(batch):
            for sentence in paragraph:
                sequence = []
                for token in sentence:
                    sample = Sample()
                    sequence.append(sample)
                    sample.features['token'] = token.form
                    sample.features['tags'] = uniq([form.tags for form in token.interpretations])
                    sample.features['maca_lemmas'] = uniq([(form.lemma, form.tags) for form in token.interpretations])
                    sample.features['space_before'] = ['space_before'] if token.space_before else ['no_space_before']
                    sample.features['space_before'].append(token.space_before)
                    sample.features['document_id'] = document_id
                Preprocess.create_features(sequence)

                if sequence:
                    yield sequence
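
Every example relies on a uniq helper that is never shown. Its call sites (lists and map generators of hashable tuples) imply order-preserving deduplication; a minimal sketch of such a helper, which may differ from the library's own implementation:

    def uniq(items):
        # Deduplicate while preserving first-seen order; items must be hashable.
        seen = set()
        out = []
        for item in items:
            if item not in seen:
                seen.add(item)
                out.append(item)
        return out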
Example #3
def preprocess_paragraph_reanalyzed(
        paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence, sentence_gold in zip(paragraph, paragraph.concraft):
        # Gold data is usable only when the tokenizations agree and every
        # token carries a gold form.
        valid_training_data = (
            len(sentence_gold.tokens) == len(sentence.tokens)
            and all(token.gold_form is not None for token in sentence.tokens))

        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(
                [form.tags for form in token.interpretations])
            if valid_training_data:
                sample.features['label'] = token.gold_form.tags
                sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = (
                ['space_before'] if is_separator_before(token.space_before)
                else ['no_space_before'])
            sample.features['tags4e3'] = create_token_features(
                sample.features['token'], sample.features['tags'],
                sample.features['space_before'])

        sequence_gold = []
        for token_gold in sentence_gold.tokens:
            sample = Sample()
            sequence_gold.append(sample)
            sample.features['token'] = token_gold.form
            if token_gold.gold_form is None:
                sample.features['label'] = 'ign'
            else:
                sample.features['label'] = token_gold.gold_form.tags
                sample.features['lemma'] = token_gold.gold_form.lemma
            sample.features['space_before'] = (
                ['space_before'] if is_separator_before(token_gold.space_before)
                else ['no_space_before'])

        paragraph_sequence.append((sequence, sequence_gold))
    return paragraph_sequence
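
The snippets only ever touch Sample objects through a features dict, so a minimal stand-in consistent with that usage looks like this (the real class may carry more state):

    class Sample:
        # Bare per-token feature container; every snippet writes into .features.
        def __init__(self):
            self.features = {}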
Example #4
    def analyze(self, text: str) -> Paragraph:
        results = self._maca(text)

        paragraph_reanalyzed = Paragraph()
        for res in results:
            result = self._parse(res)
            sentence_reanalyzed = Sentence()
            paragraph_reanalyzed.add_sentence(sentence_reanalyzed)
            for form, space_before, interpretations, start, end in result:
                token_reanalyzed = Token()
                sentence_reanalyzed.add_token(token_reanalyzed)
                token_reanalyzed.form = form
                token_reanalyzed.space_before = space_before  # raw string from Maca, e.g. 'none'; not converted to bool here
                # remove sense identifiers, e.g. 'kot:s1' -> 'kot'
                interpretations = [(re.sub(r':[abcdijnopqsv]\d?$', '', l), t)
                                   for l, t in interpretations]
                token_reanalyzed.interpretations = [
                    Form(l.replace('_', ' '), t)
                    for l, t in uniq(interpretations)
                ]
                token_reanalyzed.start = start
                token_reanalyzed.end = end
        return paragraph_reanalyzed
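
The sense-stripping regex used here (and in Example #1) drops a trailing colon-delimited sense marker from a lemma while leaving plain lemmas untouched; a quick check of its behaviour:

    import re

    def strip_sense(lemma: str) -> str:
        # Same pattern as in analyze(): drop a trailing marker like ':s1' or ':v'.
        return re.sub(r':[abcdijnopqsv]\d?$', '', lemma)

    assert strip_sense('kot:s1') == 'kot'
    assert strip_sense('piec:v') == 'piec'
    assert strip_sense('Warszawa') == 'Warszawa'  # no marker, unchanged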
Example #5
def preprocess_paragraph_preanalyzed(
        paragraph: Paragraph) -> List[Tuple[List[Sample], List[Sample]]]:
    paragraph_sequence = []
    for sentence in paragraph:
        sequence = []
        for token in sentence.tokens:
            sample = Sample()
            sequence.append(sample)
            sample.features['token'] = token.form
            sample.features['tags'] = uniq(
                [form.tags for form in token.interpretations])
            sample.features['label'] = token.gold_form.tags
            sample.features['lemma'] = token.gold_form.lemma
            sample.features['space_before'] = (
                ['space_before'] if is_separator_before(token.space_before)
                else ['no_space_before'])
            sample.features['tags4e3'] = create_token_features(
                sample.features['token'], sample.features['tags'],
                sample.features['space_before'])

        # Preanalyzed input already carries gold tags, so the same sequence
        # serves as both the analyzed and the gold side of the pair.
        paragraph_sequence.append((sequence, sequence))
    return paragraph_sequence
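
Since preanalyzed input already carries gold interpretations, the function pairs each sequence with itself. A hypothetical driver showing how the (analyzed, gold) pairs from either preprocessing function might be consumed:

    for sequence, sequence_gold in preprocess_paragraph_preanalyzed(paragraph):
        for sample, gold in zip(sequence, sequence_gold):
            # 'label' may be absent in the reanalyzed variant when the gold
            # alignment is invalid, hence .get().
            print(sample.features['token'], gold.features.get('label'))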