def create_index_data_file(data_dir):
    """

    :param data_dir: path to the data directory
    :return: index lookup dataframe that holds the references to the feature files together with the
    corresponding recipe types. Furthermore, this dataframe stores the belonging of each video sequence to
    the train, validation or test dataset.
    """

    # read in the recipe labels of the YouCook2 dataset
    recipe_labels = pd.read_csv(data_dir + config.DATA["recipe_label_path"])
    # the label file appears to have no header row, so the .T.reset_index().T round trip pushes the row
    # that read_csv consumed as the header back into the data
    recipe_labels = recipe_labels.T.reset_index().T
    # map recipe_type id -> recipe label name
    recipe_labels_dic = {int(row[0]): row[1] for row in recipe_labels.to_dict(orient='records')}
    recipe_types = list(recipe_labels_dic.values())

    # load the annotations of the YouCook2 dataset
    annotations_path = data_dir + config.DATA["annotations_file"]
    data = load_json_data(annotations_path)
    # flatten the annotations so that each row corresponds to one segment;
    # the resulting index still matches data.index
    yc2_annotations = data.annotations.apply(pd.Series).stack().reset_index(
        level=1, drop=True).to_frame('annotations')
    yc2_annotations = yc2_annotations.annotations.apply(pd.Series)

    yc2_all = yc2_annotations.join(data, how='outer').reset_index()

    # create individual video_segment_id for each segment of a video
    yc2_all['video_seg_id'] = yc2_all['index'] + '_' + yc2_all['id'].apply(str)

    # match recipe_type with recipe_label
    yc2_all['recipe_label'] = yc2_all['recipe_type'].apply(int).map(recipe_labels_dic)
    yc2_all['recipe_index'] = yc2_all['recipe_label'].apply(lambda x: recipe_types.index(x))

    # split segments to single column
    yc2_all['seg_start'] = yc2_all['segment'].apply(lambda x: x[0])
    yc2_all['seg_end'] = yc2_all['segment'].apply(lambda x: x[1])

    yc2_all = yc2_all.drop(['level_0', 'segment'], axis=1)

    # re-split the dataset for the zero shot setting into a training, validation and test set
    # the splits are based on different recipe types
    yc2_all.recipe_type = yc2_all.recipe_type.apply(int)
    training = [205, 301, 309, 318, 121, 225, 207, 405, 314, 425, 324, 124, 422,
                223, 409, 323, 311, 214, 224, 226, 114, 310, 102, 103, 119, 112,
                222, 410, 111, 308, 127, 306, 419, 212, 208, 319, 108, 218, 206,
                423, 117, 115, 213, 110, 404, 304, 113, 325, 401, 317, 303, 203,
                302, 416, 230, 204, 406, 227, 221, 228, 421, 316, 116, 307, 109,
                219, 418, 413, 209, 106, 104, 321, 120, 105, 210]
    test = [201, 122, 313, 101, 229, 202, 412]
    validation = [215, 107, 216, 305, 403, 126, 211]


    yc2_all["subset_new"] = None
    yc2_all.loc[yc2_all['recipe_type'].isin(training), "subset_new"] = 'training'
    yc2_all.loc[yc2_all['recipe_type'].isin(validation), 'subset_new'] = 'validation'
    yc2_all.loc[yc2_all['recipe_type'].isin(test), 'subset_new'] = 'test'


    return yc2_all
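
A minimal usage sketch (not part of the original module): it assumes create_index_data_file is importable together with its config and load_json_data dependencies, and the data directory path below is a placeholder.

# Hypothetical usage of create_index_data_file; the data directory path is a placeholder.
index_df = create_index_data_file("data/youcook2/")

# each row is one annotated segment with its recipe label and zero-shot split assignment
train_df = index_df[index_df["subset_new"] == "training"]
val_df = index_df[index_df["subset_new"] == "validation"]
test_df = index_df[index_df["subset_new"] == "test"]

print(index_df[["video_seg_id", "recipe_label", "seg_start", "seg_end", "subset_new"]].head())
print(len(train_df), len(val_df), len(test_df))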
Example #2
def retokenize_file(fname):
    new_tokenizer_name = utils.TOKENIZER.__class__.__name__
    new_name = fname + ".retokenized." + new_tokenizer_name
    log.info("Processing file: %s", fname)
    record_iter = list(utils.load_json_data(fname))
    log.info("  saving to %s", new_name)
    with open(new_name, 'w') as fd:
        for record in tqdm(record_iter):
            new_record = retokenize_record(record)
            fd.write(json.dumps(new_record))
            fd.write("\n")
Example #3
def count_labels(fname: str) -> collections.Counter:
    """Count labels across all targets in a file of edge probing examples."""
    label_ctr = collections.Counter()
    record_iter = utils.load_json_data(fname)
    for record in tqdm(record_iter):
        for target in record['targets']:
            label = target['label']
            if isinstance(label, str):
                label = [label]
            label_ctr.update(label)
    return label_ctr
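
A short sketch of how the returned Counter could be inspected; the file path is a placeholder and each record is assumed to follow the edge-probing format with a 'targets' list.

# Hypothetical path; prints the ten most frequent target labels.
label_ctr = count_labels("probing/data/edges/train.json")
for label, count in label_ctr.most_common(10):
    print(label, count)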
Example #4
    def from_run(cls, run_dir: str, task_name: str, split_name: str):
        # Load vocabulary
        exp_dir = os.path.dirname(run_dir.rstrip("/"))
        vocab_path = os.path.join(exp_dir, "vocab")
        log.info("Loading vocabulary from %s" % vocab_path)
        vocab = Vocabulary.from_files(vocab_path)
        label_namespace = f"{task_name}_labels"

        # Load predictions
        preds_file = os.path.join(run_dir, f"{task_name}_{split_name}.json")
        log.info("Loading predictions from %s" % preds_file)
        return cls(vocab, utils.load_json_data(preds_file),
                   label_namespace=label_namespace)
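
Assuming this classmethod sits on a predictions-wrapper class (named Predictions here purely for illustration), loading a run might look like the sketch below; the run directory, task name and split name are placeholders.

# Hypothetical call; expects <exp_dir>/vocab and <run_dir>/<task_name>_<split_name>.json to exist.
preds = Predictions.from_run("experiments/exp1/run1/", "edges-srl", "val")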
Example #5
def build_vocab(threshold, recipe1M, data_dir):
    """
    Function for building the vocabulary. Either only on the basis of the plain annotations of the YouCook2 dataset or
    with all the words from the Recipe1M dataset.
    :param threshold:   minimum amount of word counts for a word to be incorporated into the vocabulary.
    :param recipe1M:    boolean variable that specifies whether the vocabulary should
                        be built from the plain annotations of the YouCook2 dataset or the Recipe1M dataset.
                        True for  using the recipe1M dataset
    :param data_dir:    path to the data directory of the project
    :return: vocabulary object
    """
    # initialize the vocabulary
    vocab = Vocabulary(data_dir, recipe1M)

    # add the words from Recipe1M to the vocabulary if recipe1M is True
    if recipe1M:
        print("Add vocab from recipe1M")
        vocab.add_vocab_from_Recipe1M()

    # add all the words from the annotations of the YouCook2 dataset to the vocabulary
    print("add all words from YouCook2 annotations")
    annotations_path = data_dir + config.DATA["annotations_file"]
    data = load_json_data(annotations_path)
    # flatten the annotations so that each row corresponds to one segment; the index still matches data.index
    yc2_annotations = data.annotations.apply(pd.Series).stack().reset_index(
        level=1, drop=True).to_frame('annotations')
    yc2_annotations = yc2_annotations.annotations.apply(pd.Series)

    counter = Counter()
    print("Tokenize...")
    for idx, caption in tqdm(enumerate(yc2_annotations['sentence'])):
        tokens = nltk.tokenize.word_tokenize(caption.lower())
        counter.update(tokens)

        if (idx + 1) % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(idx + 1, len(data)))

    # If the word frequency is less than 'threshold', then the word is discarded.
    words = [word for word, cnt in counter.items() if cnt >= threshold]

    # add some special tokens to the vocabulary
    vocab.add_word('<pad>')
    vocab.add_word('<start>')
    vocab.add_word('<end>')
    vocab.add_word('<unk>')

    # Add the words to the vocabulary.
    for word in tqdm(words):
        vocab.add_word(word)
    return vocab
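
A usage sketch under the assumption that the surrounding module (Vocabulary, config, load_json_data) is importable; the threshold value and data directory below are illustrative only.

# Build the vocabulary from the plain YouCook2 annotations only (recipe1M=False);
# a threshold of 4 is an arbitrary example cut-off for rare words.
vocab = build_vocab(threshold=4, recipe1M=False, data_dir="data/youcook2/")
# the returned object exposes add_word(...) as used above; further look-up behaviour
# depends on the Vocabulary class, which is not shown here.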
Example #6
def split_file(fname):
    pieces = fname.split(".json", 1)
    new_pos_name = pieces[0] + ".pos.json" + pieces[1]
    new_non_name = pieces[0] + ".nonterminal.json" + pieces[1]
    log.info("Processing file: %s", fname)
    record_iter = list(utils.load_json_data(fname))
    log.info("  saving to %s and %s", new_pos_name, new_non_name)
    with open(new_pos_name, 'w') as pos_fd, open(new_non_name, 'w') as non_fd:
        for record in tqdm(record_iter):
            pos_record, non_record = split_record(record)
            pos_fd.write(json.dumps(pos_record))
            pos_fd.write("\n")
            non_fd.write(json.dumps(non_record))
            non_fd.write("\n")