Example 1
0
    data_loader = DataLoader()
    data_loader.load(posts_path)

    # Build an index -> question-text lookup over every question.
    all_qs, followup_qs = data_loader.questions_in_folder("", index=True)
    question_text = dict(all_qs)

    # Piazza's duplicate predictions; each entry looks like
    # [question_idx, candidate_idx, candidate_idx, ...] — assumed from usage below.
    dupe_check = load_pickle(dupe_check_path)

    # Interactive labeling session, persisted at label_path.
    labeler = Labeler(label_path)

    # # randomly select 100
    # indices = random.sample([i for i in range(len(dupe_check))], 100)
    # dupe_check = [dupe_check[i] for i in indices]

    for entry in dupe_check:
        question_idx, *candidate_idxs = entry

        labeler.label(
            text=question_text[question_idx],
            text_idx=question_idx,
            choices=[question_text[c] for c in candidate_idxs],
            choices_idx=candidate_idxs
        )

    labeler.save()
Example 2
0
    # Pickled corpus / embedding caches and the saved labeling session.
    path_corpus = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus.pkl"
    path_corpus_embeddings = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus_embeddings.pkl"
    label_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\Labeler.pkl"

    data_loader = DataLoader()
    data_loader.load(posts_path)

    # All questions (corpus order) and the assignment-2 subset;
    # with index=True each entry is an (index, text) pair — assumed from the unpacking below.
    qs, followup_qs = data_loader.questions_in_folder("", index=True)
    as2, followup_as2 = data_loader.questions_in_folder("assignment2", index=True)

    bert_s_s = BertSemanticSearch().from_files(path_corpus, path_corpus_embeddings)

    # label dataset
    labeler = Labeler(label_path)

    # Iterate (index, text) pairs directly instead of indexing via range(len(...)).
    for idx, text in as2:
        # Top-10 nearest questions by embedding similarity; positions into qs.
        choices_idx = bert_s_s.single_semantic_search(text, 10)

        labeler.label(
            text=text,
            text_idx=idx,
            choices=[qs[int(choice_idx)][1] for choice_idx in choices_idx],
            choices_idx=[qs[int(choice_idx)][0] for choice_idx in choices_idx]
        )
        print(labeler.labels)

    labeler.save()


Example 3
0
from downloader import Downloader
from labeler import Labeler

if __name__ == "__main__":
    # Fetch 5 sample "apple" images into ./data, then hand-label each
    # as "apple" / "not apple", saving the labeled set to def_not_data.
    downloader = Downloader("apple", "data", img_count=5)
    downloader.download()

    labeler = Labeler(
        "./data",
        ["apple", "not apple"],
        dataset_dir="def_not_data",
    )
    labeler.label()