import random

from labeler import Labeler
# DataLoader and load_pickle are project-local; the module paths below are assumed.
from data_loader import DataLoader
from util import load_pickle

# posts_path, dupe_check_path, and label_path are assumed to be defined above.

data_loader = DataLoader()
data_loader.load(posts_path)

# Map question indices to their text.
qs, followup_qs = data_loader.questions_in_folder("", index=True)
qs = {q[0]: q[1] for q in qs}

# Load Piazza's duplicate predictions.
dupe_check = load_pickle(dupe_check_path)

# Label the dataset: for each predicted-duplicate group, present the question
# and its candidate duplicates to the human labeler.
labeler = Labeler(label_path)

# # Randomly select 100 groups for a quicker labeling pass.
# indices = random.sample(range(len(dupe_check)), 100)
# dupe_check = [dupe_check[i] for i in indices]

for curr in dupe_check:
    idx = curr[0]
    text = qs[idx]
    labeler.label(
        text=text,
        text_idx=idx,
        choices=[qs[qidx] for qidx in curr[1:]],
        choices_idx=curr[1:],
    )

labeler.save()
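# The scripts here assume a Labeler that records human judgments and persists
# them with pickle. Below is a minimal sketch of that interface: the method
# and attribute names are taken from the calls above, but the body (prompting
# via input(), storing labels in a dict keyed by question index, pickling to
# the save path) is an assumption, not the project's actual implementation.
import os
import pickle


class Labeler:
    def __init__(self, save_path):
        self.save_path = save_path
        # Resume from a previous session if a save file already exists.
        if os.path.exists(save_path):
            with open(save_path, "rb") as f:
                self.labels = pickle.load(f)
        else:
            self.labels = {}  # question index -> list of confirmed duplicate indices

    def label(self, text, text_idx, choices, choices_idx):
        # Skip questions already labeled in a previous session.
        if text_idx in self.labels:
            return
        print(f"\nQUESTION {text_idx}: {text}")
        for i, choice in enumerate(choices):
            print(f"  [{i}] {choice}")
        # Comma-separated positions of true duplicates; blank means none.
        raw = input("Duplicates (e.g. 0,3) or blank: ").strip()
        picked = [choices_idx[int(i)] for i in raw.split(",") if i.strip()]
        self.labels[text_idx] = picked

    def save(self):
        with open(self.save_path, "wb") as f:
            pickle.dump(self.labels, f)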
path_corpus = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus.pkl" path_corpus_embeddings = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\corpus_embeddings.pkl" label_path = r"C:\Users\karlc\Documents\uoft\CSC492\CSC108&148v2\csc148h5_spring2020_2020-05-03\Labeler.pkl" data_loader = DataLoader() data_loader.load(posts_path) qs, followup_qs = data_loader.questions_in_folder("", index=True) as2, followup_as2 = data_loader.questions_in_folder("assignment2", index=True) bert_s_s = BertSemanticSearch().from_files(path_corpus, path_corpus_embeddings) # label dataset labeler = Labeler(label_path) for i in range(len(as2)): idx, text = as2[i] choices_idx = bert_s_s.single_semantic_search(text, 10) labeler.label( text=text, text_idx=idx, choices=[qs[int(choice_idx)][1] for choice_idx in choices_idx], choices_idx=[qs[int(choice_idx)][0] for choice_idx in choices_idx] ) print(labeler.labels) labeler.save()
from downloader import Downloader
from labeler import Labeler

if __name__ == "__main__":
    # Download 5 images of apples into ./data.
    downloader = Downloader("apple", "data", img_count=5)
    downloader.download()

    # Hand-label the downloaded images as "apple" / "not apple"; labeled
    # images are written out under def_not_data.
    labeler = Labeler("./data", ["apple", "not apple"], dataset_dir="def_not_data")
    labeler.label()
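# Downloader is assumed to fetch img_count images for a search term into the
# target directory. A sketch under that assumption, delegating the scraping to
# the bing_image_downloader package; that package is a stand-in choice for
# illustration, not necessarily what the project's Downloader uses.
from bing_image_downloader import downloader as bing


class DownloaderSketch:
    def __init__(self, query, out_dir, img_count):
        self.query = query
        self.out_dir = out_dir
        self.img_count = img_count

    def download(self):
        # Writes images into <out_dir>/<query>/.
        bing.download(self.query, limit=self.img_count, output_dir=self.out_dir)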