コード例 #1
0
    def _create_vocab(self, min_occ):
        """Build the question vocabulary from the training games and save it.

        Every question found in the gzip-compressed file at ``self.file``
        (one JSON-encoded game per line) is tokenized without preserving
        case, and its tokens are counted. The special tokens receive the
        first indices; afterwards each counted token that occurs at least
        ``min_occ`` times and contains at most one '.' is appended, in the
        counter's iteration order. Both mappings are written as JSON to
        ``self.vocab_file``.

        Args:
            min_occ: Minimum number of occurrences a token needs to be kept.

        Returns:
            Whatever ``self._load_vocab(self.vocab_file)`` returns.

        Raises:
            AssertionError: If ``self.split`` is not 'train', or if the two
                mappings end up with different sizes.
        """
        assert self.split == 'train', "Vocablurary can only be created for training file."

        question_tokenizer = TweetTokenizer(preserve_case=False)
        token_counts = utils.OrderedCounter()

        # Special tokens are registered first so they get the lowest indices.
        w2i = {}
        i2w = {}
        for token in ('<pad>', '<unk>', '<eoq>', '<sos>', '<eos>', '<yes>',
                      '<no>', '<n/a>'):
            index = len(w2i)
            i2w[index] = token
            w2i[token] = index

        # Count every token of every question in the (optionally filtered)
        # games.
        with gzip.open(self.file, 'r') as games_file:
            for raw_line in games_file:
                game = json.loads(raw_line.decode("utf-8"))

                # NOTE(review): `successful_only` is not bound inside this
                # method; it has to come from an enclosing/module scope --
                # confirm against the rest of the file.
                if successful_only and game['status'] != 'success':
                    continue

                for qa in game['qas']:
                    token_counts.update(
                        question_tokenizer.tokenize(qa['question']))

        # Keep only frequent tokens that contain at most one '.'.
        for token, count in token_counts.items():
            keep = count >= min_occ and token.count('.') <= 1
            if not keep:
                continue
            index = len(w2i)
            i2w[index] = token
            w2i[token] = index

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." % len(w2i))

        # JSON object keys are always strings, so the integer keys of i2w
        # are stored as strings in the file.
        serialized = json.dumps({'w2i': w2i, 'i2w': i2w}, ensure_ascii=False)
        with io.open(self.vocab_file, 'wb') as out:
            out.write(serialized.encode('utf8', 'replace'))

        return self._load_vocab(self.vocab_file)
コード例 #2
0
def _get_top_answers_by_frequency(answers, limit=None):
    """Tallies the given answers and returns the most frequent ones as an
    AnswerFrequencyList.

    This method is run from within the context of a MapReduce job.

    Args:
        answers: iterable(*). The answers to count. Each one is wrapped in a
            _HashableAnswer before being tallied.
        limit: int or None. Passed to the counter's most_common() call as
            the maximum number of answers to return. When None, all answers
            are returned.

    Returns:
        stats_domain.AnswerFrequencyList. One AnswerOccurrence per tallied
        answer, in the order produced by most_common(limit).
    """
    hashable_answers = (_HashableAnswer(answer) for answer in answers)
    tally = utils.OrderedCounter(hashable_answers)

    occurrences = []
    for entry, count in tally.most_common(limit):
        # Unwrap the hashable wrapper to report the original answer value.
        occurrences.append(stats_domain.AnswerOccurrence(entry.answer, count))

    return stats_domain.AnswerFrequencyList(occurrences)