Example #1
def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error(
            'failed to import janome. please install it by "pip install janome".'
        )
        exit(1)

    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = list(tokenizer.tokenize(sentence))
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
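
A minimal usage sketch for the Janome-based annotator. It assumes `Token` is a simple attribute container exposing its keyword arguments (e.g. a namedtuple) and `logger` is a configured `logging.Logger`; the sentences are illustrative.

# Hypothetical usage: raw (untokenized) Japanese input, one sentence per
# inner list, so ''.join(sentence) rebuilds the original string.
sentences = [['メロスは激怒した。'], ['猫が庭で寝ている。']]
annotated, raw_sentences = annotate_using_janome(sentences, tokenize=True)
for token in annotated[0]:
    print(token.surf, token.pos, token.base)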
Example #2
def annotate_using_spacy(sentences,
                         tokenize=False,
                         n_threads=2,
                         batch_size=10000):
    try:
        import spacy
    except ImportError:
        logger.error(
            'failed to import spacy. please install it by "pip install spacy".'
        )
        exit(1)

    nlp = spacy.load('en', disable=['parser'])
    logger.info('use spacy to annotate POS and NER infos.')

    if tokenize:
        docs = [nlp.tokenizer(' '.join(sentence)) for sentence in sentences]
        raw_sentences = [[str(token) for token in doc] for doc in docs]
    else:
        docs = [
            nlp.tokenizer.tokens_from_list(sentence) for sentence in sentences
        ]
    for name, proc in nlp.pipeline:
        docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)

    res = []
    for sentence in docs:
        tokens = []
        for token in sentence:
            if token.ent_iob_ == 'O':
                ner = token.ent_iob_
            else:
                ner = token.ent_iob_ + '-' + token.ent_type_

            # spaCy 2.x lemmatizes pronouns to the placeholder '-PRON-';
            # use the lowercased surface form instead
            if token.lemma_ == '-PRON-':
                lemma = str(token).lower()
            else:
                lemma = token.lemma_.lower()
            tokens.append(
                Token(word=str(token),
                      pos=token.tag_,
                      entity=ner,
                      lemma=lemma,
                      chunk='XX'))
        res.append(tokens)
    if tokenize:
        return res, raw_sentences
    else:
        return res
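
A hedged usage sketch for the spaCy-based annotator. Note that the `'en'` shortcut, `tokens_from_list`, and the `n_threads` argument target a pre-3.x spaCy; the sketch assumes such a version is installed and that `Token` exposes its keyword arguments as attributes.

# Hypothetical usage with pre-tokenized English input (tokenize=False),
# which returns only the annotated sentences.
sentences = [['John', 'lives', 'in', 'New', 'York', '.']]
annotated = annotate_using_spacy(sentences, tokenize=False)
for token in annotated[0]:
    print(token.word, token.pos, token.entity, token.lemma)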
Example #3
def annotate_using_jigg(sentences, tokenize=False, pipeline='ssplit,kuromoji'):
    assert tokenize, 'no support for using jigg with pre-tokenized inputs'
    logger.info('use Jigg to tokenize and annotate POS infos.')

    jigg_dir = os.environ.get('JIGG', None)
    if not jigg_dir:
        logger.error(
            'did not find Jigg at JIGG environmental variable. exiting..')
        exit(1)

    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)

    outfile = tempfile.mktemp()
    command = jigg_cmd.format(jigg_dir, pipeline, tmpfile, outfile)
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    proc.communicate()
    res = []
    raw_sentences = []
    for sentence in etree.parse(outfile).getroot().xpath('*//sentence'):
        tokens = []
        for token in sentence.xpath('*//token'):
            attrib = token.attrib
            token = Token(word=attrib['surf'],
                          surf=attrib['surf'],
                          pos=attrib['pos'],
                          pos1=attrib['pos1'],
                          pos2=attrib['pos2'],
                          pos3=attrib['pos3'],
                          inflectionForm=attrib['inflectionForm'],
                          inflectionType=attrib['inflectionType'],
                          reading=attrib['reading'],
                          base=attrib['base'])
            tokens.append(token)
        res.append(tokens)
        raw_sentence = [token.surf for token in tokens]
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
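
A usage sketch under this example's own assumptions: the JIGG environment variable points at a working Jigg installation, `jigg_cmd` is a module-level command template taking the Jigg directory, pipeline, input file, and output file, and `etree` is `lxml.etree`.

# Hypothetical usage: tokenize=True is mandatory; each sentence is a list of
# strings that ' '.join(...) writes to the temporary input file for Jigg.
sentences = [['メロスは激怒した。']]
annotated, raw_sentences = annotate_using_jigg(sentences, tokenize=True)
print([t.surf for t in annotated[0]])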
Example #4
def try_annotate_using_candc(sentences: List[List[str]],
                             tokenize=False) -> List[List[Token]]:
    if tokenize:
        raise NotImplementedError(
            'no tokenizer implemented in the C&C pipeline')

    candc_dir = os.environ.get('CANDC', None)
    candc_model_pos = None
    candc_model_ner = None
    fail = False
    if candc_dir:
        candc_dir = Path(candc_dir)
        candc_model_pos = Path(
            os.environ.get('CANDC_MODEL_POS',
                           str(candc_dir / 'models' / 'pos')))
        candc_model_ner = Path(
            os.environ.get('CANDC_MODEL_NER',
                           str(candc_dir / 'models' / 'ner')))
        if (candc_dir / 'bin' / 'pos').exists() and \
                (candc_dir / 'bin' / 'ner').exists() and \
                candc_model_pos.exists() and \
                candc_model_ner.exists():
            pass
        else:
            logger.info(
                'CANDC environmental variable may not be configured correctly.'
            )
            logger.info(
                '$CANDC/bin/{pos,ner} and $CANDC/models/{pos,ner} are expected to exist.'
            )
            fail = True
    else:
        fail = True

    if fail:
        logger.info('did not find C&C parser at CANDC environmental variable.')
        logger.info('fill POS tag etc. using XX tag.')
        return annotate_XX(sentences)

    logger.info('find C&C parser at CANDC environmental variable.')
    logger.info('use C&C pipeline to annotate POS and NER infos.')
    logger.info(f'C&C models: [{candc_model_pos}, {candc_model_ner}]')

    stemmer = MorphaStemmer(str(MODEL_DIRECTORY / 'verbstem.list'))

    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)

    command = candc_cmd.format(tmpfile, candc_dir, candc_model_pos,
                               candc_model_ner)
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    res, error = proc.communicate()
    try:
        tagged_sentences = res.decode('utf-8').strip().split('\n')
        tagged_sentences = [[
            tuple(token.split('|')) for token in sentence.strip().split(' ')
        ] for sentence in tagged_sentences]
    except Exception:
        raise RuntimeError(
            'failed to process C&C output. there might have been some problem '
            'during running C&C pipeline?\n'
            f'stderr:\n {error}')

    res = []
    for sentence in tagged_sentences:
        words, poss = zip(*[(word, pos) for word, pos, _ in sentence])
        lemmas = stemmer.analyze(list(words), list(poss))
        tokens = [
            Token(word=word,
                  pos=pos,
                  entity=ner,
                  lemma=lemma.lower(),
                  chunk='XX')
            for (word, pos, ner), lemma in zip(sentence, lemmas)
        ]
        res.append(tokens)
    return res
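
A usage sketch for the C&C-based annotator, assuming `candc_cmd` is a module-level command template and that `MorphaStemmer`, `MODEL_DIRECTORY`, and `annotate_XX` are defined elsewhere in the same module.

# Hypothetical usage with pre-tokenized English input. If the C&C parser is
# not found via the CANDC environment variable, the function falls back to
# annotate_XX, which fills POS/NER/lemma fields with placeholder values.
sentences = [['John', 'lives', 'in', 'New', 'York', '.']]
annotated = try_annotate_using_candc(sentences, tokenize=False)
for token in annotated[0]:
    print(token.word, token.pos, token.entity, token.lemma)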