def test_split_regular(self):
    result = split_contractions(["We'll", 'see', "her's", 'too', '!'])
    self.assertEqual(7, len(result), str(result))
    self.assertEqual(result[0], 'We', str(result))
    self.assertEqual(result[1], "'ll", str(result))
    self.assertEqual(result[3], 'her', str(result))
    self.assertEqual(result[4], "'s", str(result))
def __build_graph__(self):
    stopwords = get_stopwords(self.lan)
    stem = get_stem(self.lan).stem
    self.G = nx.Graph()
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(self.text)) if len(s.strip()) > 0
    ]
    for sentence in sentences_str:
        buffer = []
        for word in sentence:
            # skip words made only of excluded chars, stopwords, and numerals
            if len([c for c in word if c in EXCLUDE]) == len(word) \
                    or word.lower() in stopwords \
                    or word.replace('.', '').replace(',', '').replace('-', '').isnumeric():
                continue
            else:
                # stemmed_word = lemma(word).lower()
                stemmed_word = stem(word)
                if stemmed_word not in self.G:
                    self.G.add_node(stemmed_word, TF=0)
                # note: Graph.node was removed in networkx 2.4; newer versions use self.G.nodes
                self.G.node[stemmed_word]['TF'] += 1
                for (idx_cooccur, word_cooccur) in enumerate(buffer[-self.w:]):
                    self.__add_cooccur__(word_cooccur, stemmed_word, idx_cooccur + 1)
                buffer.append(stemmed_word)
    self.__build_linegraph__()
def add_document(self, text):
    text = self.pre_filter(text)
    sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences += len(sentences_str)
    self.number_of_documents += 1
    pos_text = 0
    document_candidates = {}
    term_in_doc = {}
    sentences_obj = []
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):  # If the word is based on exclude chars
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    cand = ComposedWord(block_of_word_obj)
                    cand = self.add_or_update_composed_word(cand)
                    if cand.unique_kw not in document_candidates:
                        document_candidates[cand.unique_kw] = cand
                    block_of_word_obj = []
            else:
                tag = self.get_tag(word, pos_sent)
                term_obj = self.get_term(word)
                term_in_doc[term_obj.unique_term] = term_obj
                term_obj.add_occurrence(tag, sentence_id, pos_sent, pos_text,
                                        self.number_of_documents)
                pos_text += 1
                # Create co-occurrence matrix
                if tag not in self.tagsToDiscard:
                    word_windows = list(
                        range(max(0, len(block_of_word_obj) - self.windowsSize),
                              len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.add_cooccurrence(block_of_word_obj[w][2], term_obj)
                # Add term to the block of words' buffer
                block_of_word_obj.append((tag, word, term_obj))
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            sentences_obj.append(sentence_obj_aux)
    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        sentences_obj.append(sentence_obj_aux)
    self.number_of_words += pos_text
    return document_candidates, term_in_doc
def __init__(self, text: str = None, use_tokenizer: bool = False, labels: List[str] = None):
    self.tokens: List[Token] = []
    self.labels: List[str] = labels
    self._embeddings: Dict = {}

    # optionally, directly instantiate with sentence tokens
    if text is not None:
        # tokenize the text first if option selected, otherwise assumes whitespace tokenized text
        if use_tokenizer:
            sentences = split_single(text)
            tokens = []
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            text = ' '.join(tokens)

        # add each word in tokenized string as Token object to Sentence
        for word in text.split(' '):
            self.add_token(Token(word))
def _build(self, text, windowsSize, n):
    text = self.pre_filter(text)
    self.sentences_str = [
        [w for w in split_contractions(web_tokenizer(s))
         if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
        for s in list(split_multi(text)) if len(s.strip()) > 0
    ]
    self.number_of_sentences = len(self.sentences_str)
    pos_text = 0
    block_of_word_obj = []
    sentence_obj_aux = []
    for (sentence_id, sentence) in enumerate(self.sentences_str):
        sentence_obj_aux = []
        block_of_word_obj = []
        for (pos_sent, word) in enumerate(sentence):
            if len([c for c in word if c in self.exclude]) == len(word):  # If the word is based on exclude chars
                if len(block_of_word_obj) > 0:
                    sentence_obj_aux.append(block_of_word_obj)
                    block_of_word_obj = []
            else:
                tag = self.getTag(word, pos_sent)
                term_obj = self.getTerm(word)
                term_obj.addOccur(tag, sentence_id, pos_sent, pos_text)
                pos_text += 1
                # Create co-occurrence matrix
                if tag not in self.tagsToDiscard:
                    word_windows = list(range(max(0, len(block_of_word_obj) - windowsSize),
                                              len(block_of_word_obj)))
                    for w in word_windows:
                        if block_of_word_obj[w][0] not in self.tagsToDiscard:
                            self.addCooccur(block_of_word_obj[w][2], term_obj)
                # Generate candidate keyphrase list
                candidate = [(tag, word, term_obj)]
                cand = composed_word(candidate)
                self.addOrUpdateComposedWord(cand)
                word_windows = list(range(max(0, len(block_of_word_obj) - (n - 1)),
                                          len(block_of_word_obj)))[::-1]
                for w in word_windows:
                    candidate.append(block_of_word_obj[w])
                    self.freq_ns[len(candidate)] += 1.
                    cand = composed_word(candidate[::-1])
                    self.addOrUpdateComposedWord(cand)
                # Add term to the block of words' buffer
                block_of_word_obj.append((tag, word, term_obj))
        if len(block_of_word_obj) > 0:
            sentence_obj_aux.append(block_of_word_obj)
        if len(sentence_obj_aux) > 0:
            self.sentences_obj.append(sentence_obj_aux)
    if len(block_of_word_obj) > 0:
        sentence_obj_aux.append(block_of_word_obj)
    if len(sentence_obj_aux) > 0:
        self.sentences_obj.append(sentence_obj_aux)
    self.number_of_words = pos_text
def tokenize(text):
    """
    Inputs: text
    Outputs: tokens tokenized by segtok.tokenizer
    """
    tokens = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        tokens.extend(contractions)
    return tokens
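# A minimal, self-contained usage sketch of the same segtok pipeline shown in
# tokenize() above; the imports are assumptions, since the snippet omits them.
from segtok.segmenter import split_single
from segtok.tokenizer import split_contractions, word_tokenizer

def tokenize_example(text):
    """Sentence-split, word-tokenize, then split contractions with segtok."""
    tokens = []
    for sentence in split_single(text):
        tokens.extend(split_contractions(word_tokenizer(sentence)))
    return tokens

print(tokenize_example("We don't tokenize by whitespace."))
# Based on the tests in this collection, "don't" comes back as two tokens: 'do' and "n't".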
def run_tokenize(text: str) -> List[str]:
    words: List[str] = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)
    words = list(filter(None, words))
    return words
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer:
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)

            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                token = Token(word)
                self.add_token(token)
                try:
                    word_offset = index(word, running_offset)
                except:
                    word_offset = last_word_offset + 1
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token

        # otherwise assumes whitespace tokenized text
        else:
            # add each word in tokenized string as Token object to Sentence
            for word in text.split(' '):
                if word:
                    token = Token(word)
                    self.add_token(token)
def __init__(self, text=None, use_tokenizer=False, labels=None):
    super(Sentence, self).__init__()
    self.tokens = []
    self.labels = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings = {}
    if text is not None:
        if use_tokenizer:
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        else:
            word = u''
            for (index, char) in enumerate(text):
                if char == u' ':
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = u''
                else:
                    word += char
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)
def word_tokenize(self, text):
    """Yield string tokens from an input string.

    Args:
        text: input string for tokenization
    Yields:
        token: str, non-whitespace tokens
    """
    for token in split_possessive_markers(split_contractions(_html_tokenize(text))):
        if self._max_characters_per_token is not None:
            # break overly long tokens into fixed-size chunks
            for token_chunk in funcy.chunks(self._max_characters_per_token, token):
                yield token_chunk
        else:
            yield token
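# A short sketch of the max-characters behaviour used in word_tokenize() above.
# funcy.chunks is the real funcy helper and also works on strings; the tokenizer
# class and _html_tokenize are project-specific and not reproduced here.
import funcy

long_token = "supercalifragilistic"
print(list(funcy.chunks(8, long_token)))
# expected: ['supercal', 'ifragili', 'stic']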
def build_candidate(self, candidate_string):
    sentences_str = [w for w in split_contractions(web_tokenizer(candidate_string.lower()))
                     if not (w.startswith("'") and len(w) > 1) and len(w) > 0]
    candidate_terms = []
    for (i, word) in enumerate(sentences_str):
        tag = self.getTag(word, i)
        term_obj = self.getTerm(word, save_non_seen=False)
        if term_obj.tf == 0:
            term_obj = None
        candidate_terms.append((tag, word, term_obj))
    if len([cand for cand in candidate_terms if cand[2] is not None]) == 0:
        invalid_virtual_cand = composed_word(None)
        return invalid_virtual_cand
    virtual_cand = composed_word(candidate_terms)
    return virtual_cand
def fr_tokenizer(text: str) -> list:
    """
    Tokenizes texts in French.

    Args:
        text (str): input text
    Returns:
        list of flair Token objects
    """
    tokens = []
    tokenizer = RegexpTokenizer(r"""\w'|\w’|\w`|\w\w+'\w+|[^\w\s]|\w+""")
    words = []
    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(tokenizer.tokenize(sentence))
        words.extend(contractions)

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except ValueError:
            word_offset = previous_word_offset + 1
            start_position = current_offset + 1 if current_offset > 0 else current_offset

        if word:
            token = Token(text=word, start_position=start_position, whitespace_after=True)
            tokens.append(token)

        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token
    return tokens
def segtok_tokenizer(text: str) -> List[Token]:
    """
    Tokenizer using segtok, a third party library dedicated to rules-based Indo-European languages.

    https://github.com/fnl/segtok
    """
    tokens: List[Token] = []
    words: List[str] = []

    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)

    words = list(filter(None, words))

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        # try:
        word_offset = index(word, current_offset)
        start_position = word_offset
        # except:
        #     word_offset = previous_word_offset + 1
        #     start_position = (
        #         current_offset + 1 if current_offset > 0 else current_offset
        #     )

        if word:
            token = Token(text=word, start_position=start_position, whitespace_after=True)
            tokens.append(token)

        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
def run_tokenize(text: str) -> List[Token]:
    tokens: List[Token] = []
    words: List[str] = []

    sentences = split_single(text)
    for sentence in sentences:
        contractions = split_contractions(word_tokenizer(sentence))
        words.extend(contractions)

    words = list(filter(None, words))

    # determine offsets for whitespace_after field
    index = text.index
    current_offset = 0
    previous_word_offset = -1
    previous_token = None
    for word in words:
        try:
            word_offset = index(word, current_offset)
            start_position = word_offset
        except:
            word_offset = previous_word_offset + 1
            start_position = current_offset + 1 if current_offset > 0 else current_offset

        if word:
            token = Token(text=word, start_position=start_position, whitespace_after=True)
            tokens.append(token)

        if (previous_token is not None) and word_offset - 1 == previous_word_offset:
            previous_token.whitespace_after = False

        current_offset = word_offset + len(word)
        previous_word_offset = current_offset - 1
        previous_token = token

    return tokens
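# A standalone sketch of the whitespace_after bookkeeping used by the two
# tokenizers above. SimpleToken and attach_offsets are hypothetical names, not
# flair's API: a token has whitespace_after=False when the next token starts
# immediately after it in the original string.
from dataclasses import dataclass
from typing import List

@dataclass
class SimpleToken:
    text: str
    start_position: int
    whitespace_after: bool = True

def attach_offsets(text: str, words: List[str]) -> List[SimpleToken]:
    tokens: List[SimpleToken] = []
    current_offset = 0
    previous_token = None
    for word in words:
        # locate the word in the original text, searching from the last offset
        word_offset = text.index(word, current_offset)
        token = SimpleToken(word, word_offset)
        if previous_token is not None and \
                word_offset == previous_token.start_position + len(previous_token.text):
            previous_token.whitespace_after = False
        tokens.append(token)
        current_offset = word_offset + len(word)
        previous_token = token
    return tokens

print(attach_offsets("We'll see.", ['We', "'ll", 'see', '.']))
# 'We' and 'see' get whitespace_after=False; "'ll" keeps whitespace_after=True.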
def test_split_not(self):
    stem, contraction = split_contractions(["don't"])
    self.assertEqual(stem, 'do')
    self.assertEqual(contraction, "n't")
def _process_internal(self, sentences: List[str]) -> List[List[str]]:
    return [split_contractions(word_tokenizer(sen)) for sen in sentences]
def word_tokenizer(self, text) -> List[Token]:
    tokenized = []
    if self.language_type == 'zh':
        if self.sp_op == 'char':
            for index, char in enumerate(text):
                token = Token(char, start_position=index)
                tokenized.append(token)
        elif self.sp_op == 'py':
            for index, char in enumerate(text):
                token = Token(char, start_position=index, sp='py')
                tokenized.append(token)
        else:
            seg_list = list(jieba.tokenize(text))
            for t in seg_list:
                token = Token(t[0], start_position=t[1])
                tokenized.append(token)
    elif self.language_type == 'ug':
        text = self.uy_preprocess(text)
        word = ''
        for index, char in enumerate(text):
            if char == ' ':
                if len(word) > 0:
                    token = Token(word, start_position=index - len(word), sp=self.sp_op)
                    tokenized.append(token)
                word = ''
            else:
                word += char
        index += 1
        if len(word) > 0:
            token = Token(word, start_position=index - len(word), sp=self.sp_op)
            tokenized.append(token)
    else:
        tokenized = []
        tokens = []
        sentences = split_single(text)
        for sentence in sentences:
            contractions = split_contractions(word_tokenizer(sentence))
            tokens.extend(contractions)
        index = text.index
        running_offset = 0
        last_word_offset = -1
        last_token = None
        for word in tokens:
            try:
                word_offset = index(word, running_offset)
                start_position = word_offset
            except:
                word_offset = last_word_offset + 1
                start_position = running_offset + 1 if running_offset > 0 else running_offset
            token = Token(word, start_position=start_position)
            tokenized.append(token)
            if word_offset - 1 == last_word_offset and last_token is not None:
                last_token.whitespace_after = False
            word_len = len(word)
            running_offset = word_offset + word_len
            last_word_offset = running_offset - 1
            last_token = token
    return tokenized
def question_to_tokenized_fields(question):
    b = ['¡ Description']
    a = question.replace('¶ ¶ Examples ¶ ', '¦¶ ¶ Examples ¶ ¶ ').replace(
        '¶ Examples ¶ ', '¦¶ ¶ Examples ¶ ¶ ').split('¦')
    # You replace Note with Explanation in Codeforces
    # codeforces
    if len(a) > 1:
        for idx, i in enumerate(a):
            if idx == 0:
                c = []
                c += [
                    i.encode('utf-8')
                    for i in segtok.segmenter.split_multi(a[idx].decode('utf-8'))
                ]
                for i in c:
                    b += i.replace('¶ ¶ Description ¶ ', '¡ Description¦') \
                          .replace('¶ ¶ Input ¶ ', '¦¡ Input¦') \
                          .replace('¶ ¶ Output ¶ ', '¦¡ Output¦') \
                          .replace('¶ Input ¶ ', '¦¡ Input¦') \
                          .replace('¶ Output ¶ ', '¦¡ Output¦') \
                          .replace(' . ', ' .¦') \
                          .replace('¶ ¶ ', '¦').split('¦')
            else:
                c = []
                c += [
                    i.encode('utf-8')
                    for i in segtok.segmenter.split_multi(a[idx].decode('utf-8'))
                ]
                for i in c:
                    b += i.replace('¶ ¶ Input ¶ ', '¦¶ ¶ Input ¶ ') \
                          .replace('¶ ¶ Examples ', '¡ Examples') \
                          .replace('¶ Examples ', '¡ Examples') \
                          .replace('¶ ¶ Output ¶ ', '¦¶ Output ¶ ') \
                          .replace('¶ ¶ Note ¶ ', '¦¡ Explanation¦') \
                          .replace('¶ ¶ Input : ¶', '¦¡ Input¦') \
                          .replace('¶ ¶ Output : ¶', '¦¡ Output¦') \
                          .replace(' . ', ' .¦') \
                          .replace('¶ ¶ ', '¦') \
                          .replace('¶ Output ¶', 'Output ¶').split('¦')
    # hackerearth
    else:
        c = []
        c += [
            i.encode('utf-8')
            for i in segtok.segmenter.split_multi(a[0].decode('utf-8'))
        ]
        for i in c:
            b += i.replace('Description: ¶ ', '') \
                  .replace('¶ ¶ Output', '¶ Output') \
                  .replace('¶ Output', '¶ ¶ Output') \
                  .replace('¶ ¶ Input : ¶ ', '¦¡ Input¦') \
                  .replace('¶ ¶ Output : ¶ ', '¦¡ Output¦') \
                  .replace('¶ ¶ Input: ¶ ', '¦¡ Input¦') \
                  .replace('¶ ¶ Output: ¶ ', '¦¡ Output¦') \
                  .replace('¶ ¶ Input ¶ ', '¦¡ Input¦') \
                  .replace('¶ ¶ Output ¶ ', '¦¡ Output¦') \
                  .replace('¶ ¶ Input ', '¦¡ Input¦') \
                  .replace('¶ ¶ Examples ', '¡ Examples') \
                  .replace('¶ ¶ Output ', '¦¡ Output¦') \
                  .replace('¶ ¶ Note ¶ ', '¦¡ Note¦') \
                  .replace('¶ ¶ SAMPLE INPUT ¶', '¦¡ Examples¦¶ ¶ Input ¶') \
                  .replace('¶ ¶ SAMPLE OUTPUT ¶', '¦¶ ¶ Output ¶') \
                  .replace('¶ ¶ Constraints : ¶ ', '¦¡ Constraints¦') \
                  .replace('¶ ¶ Constraint : ¶ ', '¦¡ Constraints¦') \
                  .replace('¶ ¶ Constraints: ¶ ', '¦¡ Constraints¦') \
                  .replace('¶ ¶ Constraint: ¶ ', '¦¡ Constraints¦') \
                  .replace('¶ ¶ Constraints ¶ ', '¦¡ Constraints¦') \
                  .replace('¶ ¶ Constraint ¶ ', '¦¡ Constraints¦') \
                  .replace('¶ ¶ Explanation ¶ ', '¦¡ Explanation¦') \
                  .replace('¶ ¶ ', '¦').split('¦')
    b = [
        split_nums(split_contractions(char_split_if_io_example(x)))
        for x in b if x.strip()
    ]
    return b
def test_split_unicode(self):
    stem, contraction = split_contractions(["a\u2032d"])
    self.assertEqual(stem, 'a')
    self.assertEqual(contraction, "\u2032d")
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None, language_code: str = None):
    super(Sentence, self).__init__()
    self.tokens = []
    self.labels = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings = {}
    self.language_code = language_code
    if text is not None:
        if use_tokenizer:
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token
        else:
            word = ''
            index = -1
            for (index, char) in enumerate(text):
                if char == ' ':
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = ''
                else:
                    word += char
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)
    if text == '':
        log.warn(
            'ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?'
        )
    self.tokenized = None
def __nltk_stem__(self, word):
    return ' '.join([self.stem.stem(w) for w in split_contractions(web_tokenizer(word))])
def __polish_stem__(self, word):
    return ' '.join(
        self.stem.stemmer_convert([w for w in split_contractions(web_tokenizer(word))]))
def setUp(self):
    self.tokenizer = test_tokenizer_with_spans(
        self, lambda t: split_contractions(space_tokenizer(t)))
def __init__(
    self,
    text: str = None,
    use_tokenizer: bool = False,
    labels: Union[List[Label], List[str]] = None,
    language_code: str = None,
):
    super(Sentence, self).__init__()

    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}
    self.language_code: str = language_code

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer:
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)

            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = (
                        running_offset + 1 if running_offset > 0 else running_offset
                    )

                token = Token(word, start_position=start_position)
                self.add_token(token)

                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False

                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token

        # otherwise assumes whitespace tokenized text
        else:
            # add each word in tokenized string as Token object to Sentence
            word = ""
            index = -1
            for index, char in enumerate(text):
                if char == " ":
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = ""
                else:
                    word += char

            # increment for last token in sentence if not followed by whitespace
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)

    # log a warning if the dataset is empty
    if text == "":
        log.warn(
            "ACHTUNG: An empty Sentence was created! Are there empty strings in your dataset?"
        )
def __init__(self, text: str = None, use_tokenizer: bool = False,
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer:
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)

            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset

                token = Token(word, start_position=start_position)
                self.add_token(token)

                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False

                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token

        # otherwise assumes whitespace tokenized text
        else:
            # catch the empty string case
            if not text:
                raise ValueError("Cannot convert empty string to a Sentence object.")

            # add each word in tokenized string as Token object to Sentence
            word = ''
            for index, char in enumerate(text):
                if char == ' ':
                    if len(word) > 0:
                        token = Token(word, start_position=index - len(word))
                        self.add_token(token)
                    word = ''
                else:
                    word += char

            # increment for last token in sentence if not followed by whitespace
            index += 1
            if len(word) > 0:
                token = Token(word, start_position=index - len(word))
                self.add_token(token)
def __init__(self, text: str = None, use_tokenizer: str = 'split',
             labels: Union[List[Label], List[str]] = None):
    super(Sentence, self).__init__()
    self.tokens: List[Token] = []
    self.labels: List[Label] = []
    if labels is not None:
        self.add_labels(labels)
    self._embeddings: Dict = {}

    # if text is passed, instantiate sentence with tokens (words)
    if text is not None:
        # tokenize the text first if option selected
        if use_tokenizer == 'segtok':
            # use segtok for tokenization
            tokens = []
            sentences = split_single(text)
            for sentence in sentences:
                contractions = split_contractions(word_tokenizer(sentence))
                tokens.extend(contractions)

            # determine offsets for whitespace_after field
            index = text.index
            running_offset = 0
            last_word_offset = -1
            last_token = None
            for word in tokens:
                try:
                    word_offset = index(word, running_offset)
                    start_position = word_offset
                except:
                    word_offset = last_word_offset + 1
                    start_position = running_offset + 1 if running_offset > 0 else running_offset
                token = Token(word, start_position=start_position)
                self.add_token(token)
                if word_offset - 1 == last_word_offset and last_token is not None:
                    last_token.whitespace_after = False
                word_len = len(word)
                running_offset = word_offset + word_len
                last_word_offset = running_offset - 1
                last_token = token

        # otherwise assumes whitespace tokenized text
        elif use_tokenizer == 'split':
            # add each word in tokenized string as Token object to Sentence
            offset = 0
            for word in text.split(' '):
                if word:
                    try:
                        word_offset = text.index(word, offset)
                    except:
                        word_offset = offset
                    token = Token(word, start_position=word_offset)
                    self.add_token(token)
                    offset += len(word) + 1

        elif use_tokenizer == 'toki':
            cmd = ['toki-app', '-q', '-n', '-c', 'nkjp']
            p = Popen(cmd, stdout=PIPE, stdin=PIPE, stderr=PIPE)
            stdout = p.communicate(input=text.encode('utf-8'))[0]
            offset = 0
            print(stdout.decode('utf-8').split('\n'))
            for t in stdout.decode('utf-8').split('\n')[:-2]:  # omit last two newlines
                print('XX', t)
                m = re.match(r'^(.*)/[tp]:(none|space|newline)', t)
                word = m.group(1)
                # before = m.group(2)
                # print(word, text)
                word_offset = text.index(word, offset)
                token = Token(word, start_position=word_offset)
                self.add_token(token)
                offset = word_offset + len(word)
def __simple_filter__(self, word):
    term = word.lower()
    for p in punctuation:
        term = term.replace(p, ' ')
    term = ' '.join([w for w in split_contractions(web_tokenizer(term))])
    return term.strip()
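# A minimal sketch of the same normalisation outside the class; the imports are
# assumptions (web_tokenizer comes from segtok.tokenizer, punctuation from the
# standard string module).
from string import punctuation
from segtok.tokenizer import split_contractions, web_tokenizer

def simple_filter(word: str) -> str:
    term = word.lower()
    for p in punctuation:
        term = term.replace(p, ' ')
    return ' '.join(split_contractions(web_tokenizer(term))).strip()

print(simple_filter("State-of-the-Art!"))  # -> 'state of the art'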