Example #1
def tag(tagger, args):
  """Tag words with the given tagger."""
  for l in args.input:
    words = l.strip().split()
    line_annotations = [u"{:<16}{:<5}".format(w, p) for w, p in tagger.annotate(words)]
    _print(u"\n".join(line_annotations))
    _print(u"")
Example #2
def ner_chunk(args):
  """Chunk named entities."""
  chunker = NEChunker(lang=args.lang)
  for l in args.input:
    words = l.strip().split()
    line_annotations = [u"{}\t{}".format(w, p) for w, p in chunker.annotate(words)]
    _print(u"\n".join(line_annotations))
Example #3
def morphemes(args):
  """Segment words according to their morphemes."""
  morfessor = load_morfessor_model(lang=args.lang)
  for l in args.input:
    words = l.strip().split()
    morphemes = [(w, u"_".join(morfessor.viterbi_segment(w)[0])) for w in words]
    line_annotations = [u"{:<16}{:<5}".format(w, p) for w, p in morphemes]
    _print(u"\n".join(line_annotations))
    _print(u"")
Example #4
def transliterate(args):
  """Transliterate words according to the target language."""
  t = Transliterator(source_lang=args.lang,
                     target_lang=args.target)
  for l in args.input:
    words = l.strip().split()
    line_annotations = [u"{:<16}{:<16}".format(w, t.transliterate(w)) for w in words]
    _print(u"\n".join(line_annotations))
    _print(u"")
Example #5
def segment(args):
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)

    if args.only_sent:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u'\n'.join(s_tokenizer.transform(seq)))

    elif args.only_word:
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty(): _print(u' '.join(w_tokenizer.transform(seq)))

    else:
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
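In the final branch the sentence tokenizer only supplies boundaries: `words.split(sents)` regroups the word-level tokens into one token sequence per sentence, which is then printed space-joined. A hypothetical driver; the flag names simply mirror what the function reads off `args`:

import argparse
import io

args = argparse.Namespace(lang="en",
                          only_sent=False,
                          only_word=False,
                          input=io.StringIO(u"Hello world. How are you?\n"))
segment(args)  # expected to print one whitespace-tokenized sentence per line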
Example #6
def segment(args):
  lang  = args.lang
  w_tokenizer = WordTokenizer(locale=lang)
  s_tokenizer = SentenceTokenizer(locale=lang)

  if args.only_sent:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(u'\n'.join(s_tokenizer.transform(seq)))

  elif args.only_word:
    for l in args.input:
      seq = Sequence(l)
      if not seq.empty(): _print(u' '.join(w_tokenizer.transform(seq)))

  else:
    for l in args.input:
      seq = Sequence(l)
      sents = s_tokenizer.transform(seq)
      words = w_tokenizer.transform(seq)
      for tokenized_sent in words.split(sents):
        if not tokenized_sent.empty():
          _print(u' '.join(tokenized_sent.tokens()))
Example #7
def cat(args):
    """ Concatenate the content of the input file."""
    for l in args.input:
        _print(l.strip())
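Every subcommand here just iterates over `args.input`, so the CLI presumably wires that attribute up with argparse's FileType. The parser layout below is an assumption for illustration, not the project's actual code:

import argparse
import sys

parser = argparse.ArgumentParser(prog="polyglot")
parser.add_argument("--input", type=argparse.FileType("r"), default=sys.stdin)
subparsers = parser.add_subparsers()
cat_parser = subparsers.add_parser("cat")
cat_parser.set_defaults(func=cat)

args = parser.parse_args(["cat"])
args.func(args)  # echoes stripped lines from stdin until EOF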
Example #8
def detect(args):
    """ Detect the language of each line."""
    for l in args.input:
        if l.strip():
            _print("{:<20}{}".format(Detector(l).language.name, l.strip()))
Example #9
def cat(args):
  """ Concatenate the content of the input file."""
  for l in args.input:
    _print(l.strip())
Example #10
def detect(args):
  """ Detect the language of each line."""
  for l in args.input:
    if l.strip():
      _print("{:<20}{}".format(Detector(l).language.name, l.strip()))
Example #11
def detect(args):
  """ Detect the language of each line."""
  for l in args.input:
    if l.strip():
      _print(Detector(l).language.name)