def _create_vocab(self, min_occ, successful_only=True):
    """Build a word vocabulary from the training split's questions.

    Reads ``self.file`` (a gzipped file with one JSON game per line),
    tokenizes every question with a lowercasing ``TweetTokenizer``,
    counts word occurrences, and keeps words that occur at least
    ``min_occ`` times. Special tokens get the lowest, fixed indices.
    The resulting mappings are dumped to ``self.vocab_file`` as JSON
    and reloaded through ``self._load_vocab``.

    Args:
        min_occ: int. Minimum number of occurrences for a word to be
            included in the vocabulary.
        successful_only: bool. When True, only games whose ``status``
            equals ``'success'`` contribute questions. NOTE(review):
            the original code referenced an undefined free name
            ``successful_only`` (a latent NameError); it is now an
            explicit, backward-compatible keyword parameter.

    Returns:
        The value of ``self._load_vocab(self.vocab_file)``.
    """
    assert self.split == 'train', \
        "Vocabulary can only be created for training file."

    tokenizer = TweetTokenizer(preserve_case=False)
    w2c = utils.OrderedCounter()
    w2i = dict()
    i2w = dict()

    # Reserve fixed, low indices for the special tokens before any
    # corpus words are added.
    special_tokens = ['<pad>', '<unk>', '<eoq>', '<sos>', '<eos>',
                      '<yes>', '<no>', '<n/a>']
    for st in special_tokens:
        i2w[len(w2i)] = st
        w2i[st] = len(w2i)

    with gzip.open(self.file, 'r') as file:
        for json_game in file:
            game = json.loads(json_game.decode("utf-8"))
            # Optionally skip games that were not completed successfully.
            if successful_only and game['status'] != 'success':
                continue
            for qa in game['qas']:
                words = tokenizer.tokenize(qa['question'])
                w2c.update(words)

    for w, c in w2c.items():
        # Keep frequent words; drop tokens containing more than one
        # dot (e.g. ellipses), which are presumably tokenizer noise.
        if c >= min_occ and w.count('.') <= 1:
            i2w[len(w2i)] = w
            w2i[w] = len(w2i)

    assert len(w2i) == len(i2w)
    print("Vocabulary of %i keys created." % len(w2i))

    vocab = dict(w2i=w2i, i2w=i2w)
    with io.open(self.vocab_file, 'wb') as vocab_file:
        data = json.dumps(vocab, ensure_ascii=False)
        vocab_file.write(data.encode('utf8', 'replace'))

    return self._load_vocab(self.vocab_file)
def _get_top_answers_by_frequency(answers, limit=None):
    """Tally how often each answer occurs and return the most frequent.

    Wraps each answer in ``_HashableAnswer`` so arbitrary answer values
    can be counted, then collects the top ``limit`` entries. Runs from
    within the context of a MapReduce job.

    Args:
        answers: iterable(*). The collection of answers to be tallied.
        limit: int or None. The maximum number of answers to return.
            When None, all answers are returned.

    Returns:
        stats_domain.AnswerFrequencyList. A list of the top "limit"
        answers.
    """
    counts = utils.OrderedCounter(_HashableAnswer(ans) for ans in answers)
    occurrences = []
    for wrapped, count in counts.most_common(limit):
        occurrences.append(
            stats_domain.AnswerOccurrence(wrapped.answer, count))
    return stats_domain.AnswerFrequencyList(occurrences)