Example #1
import json
from collections import Counter

import nltk  # nltk.word_tokenize may require the 'punkt' tokenizer data: nltk.download('punkt')

# `Graph` and `Vocabulary` are assumed to be defined elsewhere in the project;
# they are not part of this example.


def build_vocab(file_path, threshold):
    """Build a simple vocabulary wrapper."""
    with open(file_path, 'r') as f:
        dataset = json.load(f)

    counter = Counter()
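    # Each sentence dict in `dataset` is expected to expose a 'depends' list of
    # Stanford-CoreNLP-style dependency triples: governor/dependent token
    # indices, their glosses, and the relation label.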
    for i, sen in enumerate(dataset):
        dependencies = sen['depends']
        g = Graph()
        for dep in dependencies:
            gov_node = g.add_node(dep['governorGloss'], dep['governor'], "")
            dep_node = g.add_node(dep['dependentGloss'], dep['dependent'], "")
            g.add_edge(gov_node, dep_node, dep['dep'])
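        # Serialize the dependency graph to a flat caption string and count its tokens.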
        caption = str(g)
        # print("{} # {}".format(i, caption))
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (i + 1) % 1000 == 0:
            print("{} # {}".format(i + 1, caption))
            print("[{}/{}] Tokenized the captions.".format(i + 1, len(dataset)))

    # If the word frequency is less than 'threshold', then the word is
    # discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # Create a vocab wrapper and add some special tokens.
    #   <ROOT>: root of the dependency tree
    #   <EOB>: end of branch
    #   <UNK>: unknown / out-of-vocabulary word
    #   <PAD>: padding
    vocab = Vocabulary()
    vocab.add_word('<ROOT>')
    vocab.add_word('<EOB>')
    vocab.add_word('<UNK>')
    vocab.add_word('<PAD>')

    # Add the remaining words to the vocabulary.
    for word in words:
        vocab.add_word(word)
    return vocab
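
The `Graph` and `Vocabulary` classes come from the project's own modules and are not shown in this example. As a rough orientation, below is a minimal stand-in for the `Vocabulary` wrapper, reduced to the `add_word`/`__len__` interface that `build_vocab` relies on, followed by an illustrative call. The class body, file paths, and threshold value are assumptions for the sketch, not part of the original example.

import pickle


# Hedged stand-in: the project's actual Vocabulary class is not shown above;
# this version only reproduces the interface build_vocab uses.
class Vocabulary(object):
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        # Assign the next free index to unseen words; ignore duplicates.
        if word not in self.word2idx:
            self.word2idx[word] = self.idx
            self.idx2word[self.idx] = word
            self.idx += 1
        return self.word2idx[word]

    def __len__(self):
        return len(self.word2idx)


# Illustrative usage: the JSON path, output path, and threshold are assumptions.
if __name__ == '__main__':
    vocab = build_vocab('data/captions_deps.json', threshold=4)
    print("Total vocabulary size: {}".format(len(vocab)))
    with open('data/vocab.pkl', 'wb') as f:
        pickle.dump(vocab, f)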