def build_word_dict(args, examples, fields, dict_size=None, no_special_token=False):
    """Build a Vocabulary from question and document words in the given examples.

    Args:
        args: run configuration forwarded to `load_words`.
        examples: the examples whose words populate the vocabulary.
        fields: which example fields to draw words from.
        dict_size: optional cap on the number of words kept.
        no_special_token: forwarded to `Vocabulary`; when true the special
            tokens are omitted.
    Returns:
        A populated `Vocabulary` instance.
    """
    vocab = Vocabulary(no_special_token)
    for token in load_words(args, examples, fields, dict_size):
        vocab.add(token)
    return vocab
def build_word_dict(args, examples, fields, dict_size=None, special_token="pad_unk", attrname="tokens"):
    """Build a Vocabulary from question and document words in the given examples.

    Args:
        args: run configuration forwarded to `load_words`.
        examples: the examples whose words populate the vocabulary.
        fields: which example fields to draw words from.
        dict_size: optional cap on the number of words kept.
        special_token: underscore-joined names of the special tokens to
            reserve (e.g. "pad_unk" reserves two slots).
        attrname: attribute of each example field to read tokens from.
    Returns:
        A populated `Vocabulary` instance.
    """
    # BUG FIX: the original referenced `no_special_token` and `special_tokens`,
    # neither of which exists in this signature (both raised NameError).
    # NOTE(review): a no-arg Vocabulary() matches the other call sites in this
    # file; confirm the intended constructor flag for this variant.
    word_dict = Vocabulary()
    for w in load_words(args, examples, fields, dict_size,
                        num_spec_tokens=len(special_token.split("_")),
                        attrname=attrname):
        word_dict.add(w)
    return word_dict
def _insert(iterable):
    """Normalize tokens and fold those passing the `valid_words` filter
    into the enclosing `word_count` counter.

    NOTE: relies on `valid_words`, `word_count`, and `Vocabulary` from the
    enclosing scope; when `valid_words` is empty/None, every token counts.
    """
    normalized = (Vocabulary.normalize(tok) for tok in iterable)
    if valid_words:
        word_count.update(tok for tok in normalized if tok in valid_words)
    else:
        word_count.update(normalized)
def top_summary_words(args, examples, word_dict):
    """Count and return the most common summary words in the provided examples.

    Only normalized words already present in `word_dict` are counted; the top
    `args.tune_partial` entries are returned as (word, count) pairs.
    """
    counts = Counter(
        norm
        for ex in examples
        for norm in map(Vocabulary.normalize, ex['summary'].tokens)
        if norm in word_dict
    )
    return counts.most_common(args.tune_partial)
def index_embedding_words(embedding_file):
    """Collect every word appearing in `embedding_file` into a set.

    Each line is assumed to start with the word followed by its vector; the
    special BOS/EOS/PAD/UNK tokens are always included in the result.
    """
    # Seed with the special tokens up front (set semantics make order moot).
    words = {BOS_WORD, EOS_WORD, PAD_WORD, UNK_WORD}
    with open(embedding_file) as f:
        for line in tqdm(f, total=count_file_lines(embedding_file)):
            token = line.rstrip().split(' ')[0]
            words.add(Vocabulary.normalize(token))
    return words
def form_src_vocab(self) -> None:
    """Build the per-example source vocabulary (used for copy attention)
    from `self.tokens`, with BOS/EOS removed.

    Raises:
        AssertionError: if removing BOS or EOS from the fresh vocabulary fails.
    """
    self.src_vocab = Vocabulary()
    # BUG FIX: the original performed the removals inside `assert` statements,
    # which are stripped under `python -O` — the BOS/EOS tokens would then
    # silently stay in the vocabulary. Call remove() unconditionally and
    # check its result explicitly (AssertionError kept for caller compat).
    if not (self.src_vocab.remove(BOS_WORD) and self.src_vocab.remove(EOS_WORD)):
        raise AssertionError("failed to remove BOS/EOS from source vocabulary")
    self.src_vocab.add_tokens(self.tokens)
class Code(object):
    """
    Code containing annotated text, original text, selection label and
    all the extractive spans that can be an answer for the associated
    question.
    """

    def __init__(self, _id=None):
        self._id = _id
        self._language = None
        self._text = None
        self._tokens = []
        self._type = []
        self._mask = []
        self.src_vocab = None  # required for Copy Attention

    @property
    def id(self) -> str:
        return self._id

    @property
    def language(self) -> str:
        return self._language

    @language.setter
    def language(self, param: str) -> None:
        self._language = param

    @property
    def text(self) -> str:
        return self._text

    @text.setter
    def text(self, param: str) -> None:
        self._text = param

    @property
    def type(self) -> list:
        return self._type

    @type.setter
    def type(self, param: list) -> None:
        # BUG FIX: validation via `assert isinstance(...)` is stripped under
        # `python -O`; raise explicitly instead (here and in the other setters).
        if not isinstance(param, list):
            raise TypeError("type must be a list")
        self._type = param

    @property
    def mask(self) -> list:
        return self._mask

    @mask.setter
    def mask(self, param: list) -> None:
        if not isinstance(param, list):
            raise TypeError("mask must be a list")
        self._mask = param

    @property
    def tokens(self) -> list:
        return self._tokens

    @tokens.setter
    def tokens(self, param: list) -> None:
        if not isinstance(param, list):
            raise TypeError("tokens must be a list")
        self._tokens = param
        # Keep the copy-attention vocabulary in sync with the tokens.
        self.form_src_vocab()

    def form_src_vocab(self) -> None:
        """Build the per-example source vocabulary (used for copy attention)
        from `self.tokens`, with BOS/EOS removed.

        Raises:
            AssertionError: if removing BOS or EOS from the fresh vocab fails.
        """
        self.src_vocab = Vocabulary()
        # BUG FIX: the original performed the removals inside `assert`
        # statements, which are stripped under `python -O` — BOS/EOS would
        # then silently stay in the vocabulary. Check the result explicitly.
        if not (self.src_vocab.remove(BOS_WORD) and self.src_vocab.remove(EOS_WORD)):
            raise AssertionError("failed to remove BOS/EOS from source vocabulary")
        self.src_vocab.add_tokens(self.tokens)

    def vectorize(self, word_dict, _type='word') -> list:
        """Map this code's tokens to ids via `word_dict`.

        Args:
            word_dict: mapping supporting `word_dict[w]` ('word' mode) and
                `word_to_char_ids` ('char' mode).
            _type: 'word' for token ids, 'char' for per-token char-id lists.
        Raises:
            ValueError: for an unknown `_type`.
        """
        if _type == 'word':
            return [word_dict[w] for w in self.tokens]
        elif _type == 'char':
            return [
                word_dict.word_to_char_ids(w).tolist() for w in self.tokens
            ]
        else:
            # BUG FIX: `assert False` is stripped under `python -O`, making
            # this branch silently return None; raise a real error instead.
            raise ValueError(f"unknown vectorize type: {_type!r}")
def _insert(iterable):
    """Normalize every token and add its count to the enclosing `word_count`.

    NOTE: relies on `word_count` and `Vocabulary` from the enclosing scope.
    """
    word_count.update(Vocabulary.normalize(tok) for tok in iterable)