Example #1
def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error(
            'failed to import janome. please install it with "pip install janome".'
        )
        exit(1)

    logger.info('using Janome to tokenize and annotate POS information.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = list(tokenizer.tokenize(sentence))
        tokens = []
        for token in tokenized:
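            # Janome packs the four POS fields into one comma-separated string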
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
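
Below is a minimal usage sketch (hypothetical, not part of the original source). It assumes `Token` is the attribute-style token record used elsewhere in this module, and that each input sentence is a list of strings which the function joins back into raw text before handing it to Janome.

# Hypothetical usage sketch: one raw Japanese sentence, wrapped in a list
# because the function calls ''.join(sentence) before tokenizing.
sentences = [['メロスは激怒した。']]
annotated, raw_sentences = annotate_using_janome(sentences, tokenize=True)
for token in annotated[0]:
    # each Token carries Janome's surface form, POS fields, and base form
    print(token.word, token.pos, token.base)
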
Example #2
def annotate_using_spacy(sentences,
                         tokenize=False,
                         n_threads=2,
                         batch_size=10000):
    try:
        import spacy
    except ImportError:
        logger.error(
            'failed to import spacy. please install it with "pip install spacy".'
        )
        exit(1)

    nlp = spacy.load('en', disable=['parser'])
    logger.info('using spaCy to annotate POS and NER information.')

    if tokenize:
        docs = [nlp.tokenizer(' '.join(sentence)) for sentence in sentences]
        raw_sentences = [[str(token) for token in doc] for doc in docs]
    else:
        docs = [
            nlp.tokenizer.tokens_from_list(sentence) for sentence in sentences
        ]
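    # run the remaining pipeline components (e.g. tagger, NER) over the docs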
    for name, proc in nlp.pipeline:
        docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)

    res = []
    for sentence in docs:
        tokens = []
        for token in sentence:
            if token.ent_iob_ == 'O':
                ner = token.ent_iob_
            else:
                ner = token.ent_iob_ + '-' + token.ent_type_

            # takes care of pronoun
            if token.lemma_ == '-PRON-':
                lemma = str(token).lower()
            else:
                lemma = token.lemma_.lower()
            tokens.append(
                Token(word=str(token),
                      pos=token.tag_,
                      entity=ner,
                      lemma=lemma,
                      chunk='XX'))
        res.append(tokens)
    if tokenize:
        return res, raw_sentences
    else:
        return res
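
A hedged usage sketch for the spaCy annotator (hypothetical, not part of the original source). The function appears to target a spaCy 2.x-era API: the 'en' shortcut model, `Tokenizer.tokens_from_list`, and the `n_threads` argument to `pipe` are not available in spaCy 3.

# Hypothetical usage sketch: pre-tokenized English input.
sentences = [['The', 'cat', 'sat', 'on', 'the', 'mat', '.']]

# tokenize=True re-tokenizes with spaCy and also returns the raw token strings.
annotated, raw_sentences = annotate_using_spacy(sentences, tokenize=True)

# tokenize=False keeps the given token boundaries and returns annotations only.
annotated_only = annotate_using_spacy(sentences, tokenize=False)

for token in annotated_only[0]:
    print(token.word, token.pos, token.entity, token.lemma)
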
Example #3
def annotate_using_jigg(sentences, tokenize=False, pipeline='ssplit,kuromoji'):
    assert tokenize, 'no support for using jigg with pre-tokenized inputs'
    logger.info('using Jigg to tokenize and annotate POS information.')

    jigg_dir = os.environ.get('JIGG', None)
    if not jigg_dir:
        logger.error(
            'could not find Jigg via the JIGG environment variable. exiting...')
        exit(1)

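    # write the input sentences to a temporary file, one sentence per line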
    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)

    outfile = tempfile.mktemp()
    command = jigg_cmd.format(jigg_dir, pipeline, tmpfile, outfile)
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)

    proc.communicate()
    res = []
    raw_sentences = []
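    # parse Jigg's XML output and rebuild one Token list per sentence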
    for sentence in etree.parse(outfile).getroot().xpath('*//sentence'):
        tokens = []
        for token in sentence.xpath('*//token'):
            attrib = token.attrib
            token = Token(word=attrib['surf'],
                          surf=attrib['surf'],
                          pos=attrib['pos'],
                          pos1=attrib['pos1'],
                          pos2=attrib['pos2'],
                          pos3=attrib['pos3'],
                          inflectionForm=attrib['inflectionForm'],
                          inflectionType=attrib['inflectionType'],
                          reading=attrib['reading'],
                          base=attrib['base'])
            tokens.append(token)
        res.append(tokens)
        raw_sentence = [token.surf for token in tokens]
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
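
A usage sketch for the Jigg annotator (hypothetical, not part of the original source). It assumes the JIGG environment variable points at a local Jigg installation and that `jigg_cmd` is the shell command template defined elsewhere in this module.

import os

# Hypothetical usage sketch: the annotator shells out to Jigg, so JIGG must
# point at the installation directory expected by jigg_cmd.
os.environ['JIGG'] = '/path/to/jigg'  # hypothetical install location
sentences = [['メロスは激怒した。']]
annotated, raw_sentences = annotate_using_jigg(
    sentences, tokenize=True, pipeline='ssplit,kuromoji')
for token in annotated[0]:
    print(token.surf, token.pos, token.base)
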