def transform_tokens_gen(token_iter):
    """Yield tokens from ``token_iter``, replacing each center token with the
    output of the first transformer in ``token_transformers`` that produces a
    non-None result for the current (left, center, right) context.

    Maintains a sliding context of up to ``max_num_left`` tokens on the left
    and ``max_num_right`` on the right (closure variables — assumed bound by
    the enclosing scope; TODO confirm).  When no transformer fires and the
    right context is not yet full, one more lookahead token is pulled and the
    same center is retried; a center with no transformation passes through
    unchanged.
    """
    left, right = [], []
    try:
        center = next(token_iter)
    except StopIteration:
        return  # empty input stream: nothing to yield
    while center is not None:
        new_tokens = None
        if len(left) == max_num_left and len(right) == max_num_right:
            # Full context: is_applicable is not consulted here —
            # presumably every transformer accepts a full-size context.
            for transformer in token_transformers:
                new_tokens = transformer[left, center, right]
                if new_tokens is not None:
                    break
        else:
            # Partial context (start/end of stream): only consult
            # transformers that accept this context size.
            for transformer in token_transformers:
                if transformer.is_applicable(len(left), len(right)):
                    new_tokens = transformer[left, center, right]
                    if new_tokens is not None:
                        break
        if new_tokens is None and len(right) < max_num_right:
            # No transformation yet and room remains on the right:
            # grow the lookahead and retry the same center token.
            try:
                right.append(next(token_iter))
                continue
            except StopIteration:
                pass
        if new_tokens is None:
            yield center
        else:
            yield from new_tokens
        center = ptoken.shift_context_center_tokens(
            (left, center, right), token_iter, max_num_left)
def get_iters_with_annotations(self, annotation_transformers):
    """Iterate over ``self``, yielding ``(token, annotation)`` pairs.

    The annotation for each center token is the first non-None value
    produced by ``annotation_transformers`` over a sliding
    (left, center, right) context; tokens with no applicable or
    successful transformer are yielded with ``annotation=None``.
    """
    max_num_left, max_num_right = ptoken.get_transformers_max_num_tokens(
        annotation_transformers)
    left_ctx, right_ctx = [], []
    token_iter = iter(self)
    try:
        center = next(token_iter)
    except StopIteration:
        return  # nothing to annotate
    while center is not None:
        context_full = (len(left_ctx) == max_num_left
                        and len(right_ctx) == max_num_right)
        annotation = None
        for transformer in annotation_transformers:
            # With a full context every transformer is consulted;
            # otherwise only those that accept this context size.
            if not context_full and not transformer.is_applicable(
                    len(left_ctx), len(right_ctx)):
                continue
            annotation = transformer[left_ctx, center, right_ctx]
            if annotation is not None:
                break
        if annotation is None and len(right_ctx) < max_num_right:
            # No annotation yet: extend the lookahead and retry.
            try:
                right_ctx.append(next(token_iter))
                continue
            except StopIteration:
                pass
        yield center, annotation
        center = ptoken.shift_context_center_tokens(
            (left_ctx, center, right_ctx), token_iter, max_num_left)
def skip_tokens_gen(token_iter):
    """Yield tokens from ``token_iter``, dropping every token for which
    some entry of ``bool_token_transformers`` returns a truthy flag over
    the sliding (left, center, right) context.

    Context sizes are bounded by the closure variables ``max_num_left``
    and ``max_num_right``; a token that is not flagged is yielded only
    after the right context has been grown as far as the stream allows.
    """
    left_ctx, right_ctx = [], []
    try:
        center = next(token_iter)
    except StopIteration:
        return  # empty stream
    while center is not None:
        full = (len(left_ctx) == max_num_left
                and len(right_ctx) == max_num_right)
        skip = False
        for checker in bool_token_transformers:
            # At full context every checker runs; otherwise only the
            # checkers that declare themselves applicable.
            if full or checker.is_applicable(len(left_ctx), len(right_ctx)):
                skip = checker[left_ctx, center, right_ctx]
                if skip:
                    break
        if not skip:
            if len(right_ctx) < max_num_right:
                # Not flagged yet but the lookahead is incomplete:
                # pull one more token and re-check the same center.
                try:
                    right_ctx.append(next(token_iter))
                    continue
                except StopIteration:
                    pass
            yield center
        center = ptoken.shift_context_center_tokens(
            (left_ctx, center, right_ctx), token_iter, max_num_left)
def transform_flags_gen(seq_flag_iter):
    """Yield ``(seq, flag)`` pairs from ``seq_flag_iter``, replacing each
    center flag with the first non-None result produced by
    ``flag_token_transformers`` over the sliding flag context.

    Sequences ride along untouched; pairs whose flag no transformer
    rewrites are yielded unchanged.  Context sizes are bounded by the
    closure variables ``max_num_left`` / ``max_num_right``.
    """
    left_seqs, right_seqs = [], []
    left_flags, right_flags = [], []
    try:
        seq, center = next(seq_flag_iter)
    except StopIteration:
        return  # empty stream
    while center is not None:
        full_context = (len(left_flags) == max_num_left
                        and len(right_flags) == max_num_right)
        new_center = None
        for transformer in flag_token_transformers:
            # Partial contexts only consult applicable transformers;
            # a full context consults them all.
            if not full_context and not transformer.is_applicable(
                    len(left_flags), len(right_flags)):
                continue
            new_center = transformer[left_flags, center, right_flags]
            # First successful transformation wins.
            if new_center is not None:
                break
        if new_center is None and len(right_flags) < max_num_right:
            # Nothing applied yet: grow the right context and retry
            # the same center pair.
            try:
                next_seq, next_flag = next(seq_flag_iter)
            except StopIteration:
                pass
            else:
                right_seqs.append(next_seq)
                right_flags.append(next_flag)
                continue
        yield seq, center if new_center is None else new_center
        seq, center = ptoken.shift_context_center_tokens(
            ([left_seqs, left_flags], [seq, center],
             [right_seqs, right_flags]),
            seq_flag_iter, max_num_left)
def _word2vec_gen(self, token_iter, unk_token):
    """Yield ``(center_word, context_word)`` pairs from ``token_iter``
    using a context window of ``self._window_size`` words on each side.

    Pairs in which either word equals ``unk_token`` are suppressed.
    """
    try:
        center = next(token_iter)
    except StopIteration:
        return  # no tokens at all
    left_window = []
    right_window = []
    # Prime the right-hand window with up to _window_size lookahead words.
    window_size = self._window_size
    while len(right_window) < window_size:
        try:
            right_window.append(next(token_iter))
        except StopIteration:
            break
    while center is not None:
        if center != unk_token:
            for word in left_window + right_window:
                if word != unk_token:
                    yield center, word
        center = ptoken.shift_context_center_tokens(
            (left_window, center, right_window),
            token_iter, window_size)