Example 1
from nltk.tokenize import sent_tokenize
from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer

def sentSegment(par, lang):
    """Split a paragraph into sentences with NLTK, falling back to polyglot."""
    try:
        # NLTK expects a language name such as 'english'.
        sents = sent_tokenize(par, lang)
    except Exception:
        try:
            # lang_map (defined elsewhere) maps language names to polyglot locale codes.
            par_seq = Sequence(par)
            st = SentenceTokenizer(locale=lang_map[lang])
            sents = [sent for sent in st.transform(par_seq)]
        except Exception:
            return None
    return sents
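A hypothetical call to the helper above. The sample text, the 'english' language name, and the lang_map entry are illustrative assumptions, and both NLTK's punkt data and polyglot's models are assumed to be available.

# Illustrative stand-in: map NLTK language names to polyglot locale codes.
lang_map = {'english': 'en'}

paragraph = "Hello world. This is only a test."
print(sentSegment(paragraph, 'english'))
# e.g. ['Hello world.', 'This is only a test.']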
Example 2
def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the raw text.'''
    sentence_objects = []
    sent_tokenizer = SentenceTokenizer(locale=self.language.code)
    seq = Sequence(self.raw)
    seq = sent_tokenizer.transform(seq)
    for start_index, end_index in zip(seq.idx[:-1], seq.idx[1:]):
        sent = seq.text[start_index:end_index].strip()
        if not sent:
            continue
        # Sentences share the same models as their parent blob.
        s = Sentence(sent, start_index=start_index, end_index=end_index)
        s.detected_languages = self.detected_languages
        sentence_objects.append(s)
    return sentence_objects
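The method above is an internal helper of a polyglot text blob. A minimal sketch of reaching it through the public API, assuming polyglot and its language models are installed; the sample text is illustrative.

from polyglot.text import Text

# Accessing .sentences triggers sentence segmentation on the blob,
# building Sentence objects like the ones above.
blob = Text(u"Beautiful is better than ugly. Explicit is better than implicit.")
for sentence in blob.sentences:
    print(sentence)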
Example 3
from polyglot.base import Sequence
from polyglot.tokenize import SentenceTokenizer, WordTokenizer

def segment(args):
    """Segment each input line into sentences and/or words."""
    lang = args.lang
    w_tokenizer = WordTokenizer(locale=lang)
    s_tokenizer = SentenceTokenizer(locale=lang)

    if args.only_sent:
        # One sentence per output line.
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                # _print is a unicode-safe print helper defined elsewhere in the module.
                _print(u'\n'.join(s_tokenizer.transform(seq)))

    elif args.only_word:
        # Space-separated word tokens, one input line per output line.
        for l in args.input:
            seq = Sequence(l)
            if not seq.empty():
                _print(u' '.join(w_tokenizer.transform(seq)))

    else:
        # Both: split into sentences, then print each sentence as space-separated tokens.
        for l in args.input:
            seq = Sequence(l)
            sents = s_tokenizer.transform(seq)
            words = w_tokenizer.transform(seq)
            for tokenized_sent in words.split(sents):
                if not tokenized_sent.empty():
                    _print(u' '.join(tokenized_sent.tokens()))
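A rough sketch of command-line wiring that could drive segment(). The argument names mirror the attributes used above, and _print here is a stand-in for the module's own print helper; none of this is the library's actual CLI code.

import sys
from argparse import ArgumentParser

def _print(text):
    # Stand-in for the module's unicode-safe print helper (assumption).
    sys.stdout.write(text + u'\n')

parser = ArgumentParser(description="Tokenize text into sentences and words.")
parser.add_argument('--lang', default='en', help='locale code, e.g. en')
parser.add_argument('--only-sent', dest='only_sent', action='store_true')
parser.add_argument('--only-word', dest='only_word', action='store_true')
parser.add_argument('input', nargs='?', type=open, default=sys.stdin,
                    help='input file (defaults to stdin), one text per line')

if __name__ == '__main__':
    segment(parser.parse_args())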