def annotate_using_janome(sentences, tokenize=False):
    assert tokenize, 'no support for using janome with pre-tokenized inputs'
    try:
        from janome.tokenizer import Tokenizer
    except ImportError:
        logger.error(
            'failed to import janome. please install it by "pip install janome".')
        exit(1)
    logger.info('use Janome to tokenize and annotate POS infos.')
    tokenizer = Tokenizer()
    res = []
    raw_sentences = []
    for sentence in sentences:
        sentence = ''.join(sentence)
        tokenized = list(tokenizer.tokenize(sentence))
        tokens = []
        for token in tokenized:
            pos, pos1, pos2, pos3 = token.part_of_speech.split(',')
            token = Token(word=token.surface,
                          surf=token.surface,
                          pos=pos,
                          pos1=pos1,
                          pos2=pos2,
                          pos3=pos3,
                          inflectionForm=token.infl_form,
                          inflectionType=token.infl_type,
                          reading=token.reading,
                          base=token.base_form)
            tokens.append(token)
        raw_sentence = [token.surface for token in tokenized]
        res.append(tokens)
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
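# Hedged usage sketch (not part of the library): annotate_using_janome expects an
# iterable of sentences and tokenize=True; each sentence is joined into one raw
# string before Janome segments it. The Token constructor and logger are assumed
# to be defined elsewhere in this module.
#
#   sentences = [['メロスは激怒した。']]
#   parsed, raw = annotate_using_janome(sentences, tokenize=True)
#   # parsed[0] -> list of Token objects carrying Japanese POS fields (pos, pos1, ...)
#   # raw[0]    -> surface forms produced by Janome's tokenizer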
def annotate_using_spacy(sentences, tokenize=False, n_threads=2, batch_size=10000):
    try:
        import spacy
    except ImportError:
        logger.error(
            'failed to import spacy. please install it by "pip install spacy".')
        exit(1)
    nlp = spacy.load('en', disable=['parser'])
    logger.info('use spacy to annotate POS and NER infos.')
    if tokenize:
        docs = [nlp.tokenizer(' '.join(sentence)) for sentence in sentences]
        raw_sentences = [[str(token) for token in doc] for doc in docs]
    else:
        # inputs are already tokenized; wrap them in spaCy Docs without re-tokenizing
        docs = [
            nlp.tokenizer.tokens_from_list(sentence) for sentence in sentences
        ]
    for name, proc in nlp.pipeline:
        docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
    res = []
    for sentence in docs:
        tokens = []
        for token in sentence:
            if token.ent_iob_ == 'O':
                ner = token.ent_iob_
            else:
                ner = token.ent_iob_ + '-' + token.ent_type_
            # takes care of pronouns, which spaCy lemmatizes to '-PRON-'
            if token.lemma_ == '-PRON-':
                lemma = str(token).lower()
            else:
                lemma = token.lemma_.lower()
            tokens.append(
                Token(word=str(token),
                      pos=token.tag_,
                      entity=ner,
                      lemma=lemma,
                      chunk='XX'))
        res.append(tokens)
    if tokenize:
        return res, raw_sentences
    else:
        return res
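# Hedged usage sketch (assumes the 'en' spaCy model is installed and that Token
# and logger are defined in this module). With tokenize=False the input must
# already be split into words, and only the annotated Token lists are returned.
#
#   parsed = annotate_using_spacy([['John', 'loves', 'Mary']], tokenize=False)
#   # parsed[0][0].pos    -> Penn Treebank tag such as 'NNP' (attribute names assumed
#   #                        to mirror the keyword arguments passed to Token)
#   # parsed[0][0].entity -> IOB-style tag such as 'B-PERSON', or 'O'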
def annotate_using_jigg(sentences, tokenize=False, pipeline='ssplit,kuromoji'):
    assert tokenize, 'no support for using jigg with pre-tokenized inputs'
    logger.info('use Jigg to tokenize and annotate POS infos.')
    jigg_dir = os.environ.get('JIGG', None)
    if not jigg_dir:
        logger.error(
            'did not find Jigg at JIGG environment variable. exiting..')
        exit(1)
    # write the raw sentences to a temporary file and run Jigg on it
    tmpfile = tempfile.mktemp()
    with open(tmpfile, 'w') as f:
        for sentence in sentences:
            print(' '.join(sentence), file=f)
    outfile = tempfile.mktemp()
    command = jigg_cmd.format(jigg_dir, pipeline, tmpfile, outfile)
    proc = subprocess.Popen(command,
                            shell=True,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    proc.communicate()
    # read back Jigg's XML output and convert each <token> element to a Token
    res = []
    raw_sentences = []
    for sentence in etree.parse(outfile).getroot().xpath('*//sentence'):
        tokens = []
        for token in sentence.xpath('*//token'):
            attrib = token.attrib
            token = Token(word=attrib['surf'],
                          surf=attrib['surf'],
                          pos=attrib['pos'],
                          pos1=attrib['pos1'],
                          pos2=attrib['pos2'],
                          pos3=attrib['pos3'],
                          inflectionForm=attrib['inflectionForm'],
                          inflectionType=attrib['inflectionType'],
                          reading=attrib['reading'],
                          base=attrib['base'])
            tokens.append(token)
        res.append(tokens)
        raw_sentence = [token.surf for token in tokens]
        raw_sentences.append(raw_sentence)
    return res, raw_sentences
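# Hedged usage sketch (assumes Jigg is installed, the JIGG environment variable
# points at its directory, and that jigg_cmd, Token, logger, os, tempfile,
# subprocess, and lxml's etree are available in this module).
#
#   os.environ['JIGG'] = '/path/to/jigg'   # hypothetical install location
#   parsed, raw = annotate_using_jigg([['走れメロス']], tokenize=True)
#   # parsed[0] -> Token objects built from the XML <token> attributes
#   # raw[0]    -> surface forms read back from Jigg's output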