# Requires the legacy torchtext API (torchtext <= 0.8), where
# torchtext.data.Field / Dataset / Iterator still exist.
import torchtext
from torchtext.data import Example


def _get_torchtext_data_iterator(include_lengths=False):
    text_field = torchtext.data.Field(
        sequential=True,
        pad_first=False,
        init_token="<s>",
        eos_token="</s>",
        include_lengths=include_lengths,
    )

    example1 = Example.fromdict({"text": "a b c a c"}, {"text": ("text", text_field)})
    example2 = Example.fromdict({"text": "b c a a"}, {"text": ("text", text_field)})
    example3 = Example.fromdict({"text": "c b a"}, {"text": ("text", text_field)})

    dataset = torchtext.data.Dataset(
        [example1, example2, example3],
        {"text": text_field},
    )
    text_field.build_vocab(dataset)

    iterator = torchtext.data.Iterator(
        dataset,
        batch_size=3,
        sort_key=None,
        device=None,
        batch_size_fn=None,
        train=True,
        repeat=False,
        shuffle=None,
        sort=None,
        sort_within_batch=None,
    )
    return iterator, text_field
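# A minimal usage sketch for the helper above, assuming the same legacy
# torchtext (<= 0.8) API.
iterator, text_field = _get_torchtext_data_iterator(include_lengths=True)
batch = next(iter(iterator))
tokens, lengths = batch.text  # with include_lengths=True, .text is a (tensor, lengths) pair
print(tokens.shape, lengths)  # tokens: (max_len, batch_size); lengths: one entry per example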
def __init__(self, path, fields, **kwargs): """Create a ConllXDataset given a path and field list. Arguments: path (str): Path to the data file. fields (dict[str: tuple(str, Field)]): The keys should be a subset of the columns, and the values should be tuples of (name, field). Keys not present in the input dictionary are ignored. """ self.n_tokens = 0 with io.open(os.path.expanduser(path), encoding="utf8") as f: examples = [] for d in conllx_reader(f): if len(d["form"]) >= 70 and "train" in path: continue else: self.n_tokens += len(Example.fromdict(d, fields).form) examples.append(Example.fromdict(d, fields)) # examples.append(Example.fromdict(d, fields)) if isinstance(fields, dict): fields, field_dict = [], fields for field in field_dict.values(): if isinstance(field, list): fields.extend(field) else: fields.append(field) super(ConllXDataset, self).__init__(examples, fields, **kwargs)
def __init__(self, paragraph_path: str, label_path: str, fields: dict,
             split_sentences: bool, train: bool, max_chars: int = 1000,
             level: str = "char", **kwargs):
    """Create a WiLIDataset given paths to the raw text and the labels,
    plus a field dict."""
    self.level = level
    with io.open(os.path.expanduser(paragraph_path), encoding="utf8") as f_par, \
            io.open(os.path.expanduser(label_path), encoding="utf8") as f_lab:
        examples = []
        for d in data_reader(f_par, f_lab, train, split_sentences, max_chars, level):
            for sentence in d:
                examples.append(Example.fromdict(sentence, fields))

    if isinstance(fields, dict):
        fields, field_dict = [], fields
        for field in field_dict.values():
            if isinstance(field, list):
                fields.extend(field)
            else:
                fields.append(field)

    super(WiLIDataset, self).__init__(examples, fields, **kwargs)
def __init__(self, paragraph_path: str, label_path: str, switch_path: str,
             fields: dict, level: str, **kwargs):
    """Create a WiLIDataset given paths to the raw text, the labels, and the
    switch annotations, plus a field dict."""
    self.level = level
    with io.open(os.path.expanduser(paragraph_path), encoding="utf8") as f_par, \
            io.open(os.path.expanduser(label_path), encoding="utf8") as f_lang, \
            io.open(os.path.expanduser(switch_path), encoding="utf8") as f_switch:
        examples = []
        for d in data_reader(f_par, f_lang, f_switch):
            if d is None:
                continue
            examples.append(Example.fromdict(d, fields))

    if isinstance(fields, dict):
        fields, field_dict = [], fields
        for field in field_dict.values():
            if isinstance(field, list):
                fields.extend(field)
            else:
                fields.append(field)

    super(WiLIDataset, self).__init__(examples, fields, **kwargs)
def __init__(self, fields, path, extension='.txt', **kwargs):
    examples = []
    num_sequences = len(fields)
    data_files = glob.glob(os.path.join(path, '*' + extension))
    for data_file in data_files:
        # Read the file line by line, and create examples from series of
        # num_sequences consecutive lines (a sliding window over sentences).
        with io.open(os.path.expanduser(data_file), encoding="utf8") as f:
            line_buffer = []
            for line in f:
                if len(line_buffer) == num_sequences:
                    # Make a new example from the current window
                    examples.append(Example.fromlist(line_buffer, fields))
                    # Slide the window forward by one sentence
                    line_buffer.pop(0)
                line_buffer.append(line)
    print('Found %d examples' % len(examples))
    super(StoryDataset, self).__init__(examples, fields, **kwargs)

    def sort_key_fn(example):
        # Sort by the lengths of all constituent sequences
        return [len(getattr(example, name)) for name, _ in fields]

    self.sort_key = sort_key_fn
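# A toy illustration of the sliding-window grouping used above, assuming
# num_sequences = 3. Note that, as written, the final full window at end of
# file is never emitted; this sketch mirrors that behavior.
lines = ["s1\n", "s2\n", "s3\n", "s4\n", "s5\n"]
num_sequences = 3

line_buffer, windows = [], []
for line in lines:
    if len(line_buffer) == num_sequences:
        windows.append(list(line_buffer))
        line_buffer.pop(0)
    line_buffer.append(line)

print(windows)  # [['s1\n', 's2\n', 's3\n'], ['s2\n', 's3\n', 's4\n']]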
def __init__(self, path, load_ext_feats=False):
    """Create a Castor dataset involving pairs of texts."""
    fields = [('id', self.ID_FIELD),
              ('sentence_1', self.TEXT_FIELD),
              ('sentence_2', self.TEXT_FIELD),
              ('ext_feats', self.EXT_FEATS_FIELD),
              ('label', self.LABEL_FIELD),
              ('aid', self.AID_FIELD),
              ('sentence_1_raw', self.RAW_TEXT_FIELD),
              ('sentence_2_raw', self.RAW_TEXT_FIELD)]

    examples = []
    with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2:
        sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1]
        sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2]

    word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2)
    self.word_to_doc_cnt = word_to_doc_cnt

    if not load_ext_feats:
        overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt)
    else:
        overlap_feats = np.loadtxt(os.path.join(path, 'overlap_feats.txt'))

    with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file:
        for i, (pair_id, l1, l2, ext_feats, label) in enumerate(
                zip(id_file, sent_list_1, sent_list_2, overlap_feats, label_file)):
            pair_id = pair_id.rstrip('.\n')
            label = label.rstrip('.\n')
            example_list = [pair_id, l1, l2, ext_feats, label, i + 1, ' '.join(l1), ' '.join(l2)]
            examples.append(Example.fromlist(example_list, fields))

    super(CastorPairDataset, self).__init__(examples, fields)
def __init__(self, path): """ Create a MSRP dataset instance """ fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)] examples = [] with open(os.path.join(path, 'a.toks'), 'r') as f1, open(os.path.join(path, 'b.toks'), 'r') as f2: sent_list_1 = [l.rstrip('.\n').split(' ') for l in f1] sent_list_2 = [l.rstrip('.\n').split(' ') for l in f2] word_to_doc_cnt = get_pairwise_word_to_doc_freq(sent_list_1, sent_list_2) self.word_to_doc_cnt = word_to_doc_cnt with open(os.path.join(path, 'id.txt'), 'r') as id_file, open(os.path.join(path, 'sim.txt'), 'r') as label_file: for pair_id, l1, l2, label in zip(id_file, sent_list_1, sent_list_2, label_file): pair_id = pair_id.rstrip('.\n') label = label.rstrip('.\n') ext_feats = [] # Number features sent1_nums, sent2_nums = [], [] match = self.NUMBER_PATTERN.search(' '.join(l1)) if match: for g in match.groups(): if g is not None: sent1_nums.append(g) match = self.NUMBER_PATTERN.search(' '.join(l2)) if match: for g in match.groups(): if g is not None: sent2_nums.append(g) sent1_nums = set(sent1_nums) sent2_nums = set(sent2_nums) exact = int(sent1_nums == sent2_nums) superset = int(sent1_nums.issuperset(sent2_nums) or sent2_nums.issuperset(sent1_nums)) ext_feats.append(1 if (exact or (len(sent1_nums) == 0 and len(sent2_nums) == 0)) else 0) ext_feats.append(exact) ext_feats.append(superset) # Length difference ext_feats.append(len(l2) - len(l1)) # Overlap overlap = len(set(l1) & set(l2)) ext_feats.append(overlap / len(l1)) ext_feats.append(overlap / len(l2)) example = Example.fromlist([pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2)], fields) examples.append(example) super(MSRP, self).__init__(examples, fields)
def __init__(self, path: str, text_field: Field, label_field: Field, **kwargs) -> None:
    fields = [('text', text_field), ('label', label_field)]
    examples = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            # Assumes each line ends with a single-character label, separated
            # from the text by one character.
            label = line[-1]
            text = line[:-2]
            examples.append(Example.fromlist([text, label], fields))
    super().__init__(examples, fields, **kwargs)
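# A minimal usage sketch for the constructor above (legacy torchtext API).
# The class name `MyLabeledDataset` and file name are hypothetical stand-ins
# for whatever Dataset subclass defines this __init__.
from torchtext.data import Field, LabelField

TEXT = Field(sequential=True, lower=True)
LABEL = LabelField()

ds = MyLabeledDataset('train.txt', TEXT, LABEL)  # hypothetical subclass/file
TEXT.build_vocab(ds)
LABEL.build_vocab(ds)
print(len(ds), vars(ds.examples[0]))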
def __init__(self, path, fields, **kwargs): """Create a ConllUDataset given a path and field list. Arguments: path (str): Path to the data file. fields (dict[str: tuple(str, Field)]): The keys should be a subset of the columns, and the values should be tuples of (name, field). Keys not present in the input dictionary are ignored. """ with io.open(os.path.expanduser(path), encoding="utf8") as f: # examples = [Example.fromdict(d, fields) for d in conllu_reader(f)] # count = 0 if "train" in path: examples = [] for d in conllu_reader(f): if len(Example.fromdict(d, fields).form) <= 70: examples.append(Example.fromdict(d, fields)) else: examples = [ Example.fromdict(d, fields) for d in conllu_reader(f) ] # if len(Example.fromdict(d, fields).form) > 60: # count += 1 # print(count) if isinstance(fields, dict): fields, field_dict = [], fields for field in field_dict.values(): if isinstance(field, list): fields.extend(field) else: fields.append(field) super(ConllUDataset, self).__init__(examples, fields, **kwargs)
def augment(data_source, aug_algo, encoder_model, sim_measure, labeled_examples,
            unlabeled_examples, train_ds, test_ds, text_field, label_field,
            num_classes, sigma=None):
    res = encode_data_with_pretrained(data_source, train_ds, test_ds, text_field,
                                      encoder_model, labeled_examples, unlabeled_examples)
    x_l, y_l, x_u, y_u, xs_u_unencoded = res

    if aug_algo.startswith("knn"):
        algo_version = aug_algo.split('_')[1]
        if algo_version == 'base':
            classifications, indices = knn_classify(x_l, y_l, x_u, n=1, weights='uniform')
            frac_used = 1
        elif algo_version == 'threshold':
            classifications, indices = knn_classify(x_l, y_l, x_u, n=2, threshold=0.99)
            y_u = y_u[indices]
            xs_u_unencoded = [xs_u_unencoded[idx] for idx in indices]
            frac_used = float(len(xs_u_unencoded) / len(x_u))
    elif aug_algo.startswith("kmeans"):
        algo_version = aug_algo.split('_')[1]
        if algo_version == "base":
            classifications = kmeans(x_l, x_u, y_l, n_clusters=num_classes)
        elif algo_version == "recursive":
            classifications = recursive_kmeans(x_l, x_u, y_l, n_clusters=num_classes)
        frac_used = 1
    elif aug_algo.startswith("lp"):
        algo_version = aug_algo.split('_')[1]
        lp = LabelProp(x_l, y_l, x_u, y_u, num_classes, data_source=data_source, sigma=sigma)
        if algo_version == 'base':
            lp.propagate()
            classifications, indices = lp.classify(threshold=False)
        elif algo_version == "threshold":
            lp.propagate()
            classifications, indices = lp.classify(threshold=True)
        elif algo_version == "recursive":
            classifications, indices = lp.recursive(x_l, y_l, x_u, y_u)
        # Keep only the unlabeled examples that were actually classified.
        y_u = y_u[indices]
        xs_u_unencoded = [xs_u_unencoded[idx] for idx in indices]
        frac_used = float(len(xs_u_unencoded) / len(x_u))

    num_correct = np.sum(classifications == y_u)
    aug_acc = 0 if len(classifications) == 0 else float(num_correct / len(classifications))

    new_labeled_data = [{'x': x, 'y': classifications[i]} for i, x in enumerate(xs_u_unencoded)]
    example_fields = {'x': ('x', text_field), 'y': ('y', label_field)}
    new_examples = [Example.fromdict(x, example_fields) for x in new_labeled_data]

    return labeled_examples + new_examples, aug_acc, frac_used
def __init__(self, src_path, tgt_path, src_field, tgt_field, **kwargs):
    fields = {"src": ("src", src_field), "tgt": ("tgt", tgt_field)}
    examples = []
    with open(src_path) as src_file, open(tgt_path) as tgt_file:
        for src, tgt in zip(src_file, tgt_file):
            examples.append(Example.fromdict({
                "src": src.strip(),
                "tgt": tgt.strip(),
            }, fields))

    if isinstance(fields, dict):
        fields, field_dict = [], fields
        for field in field_dict.values():
            if isinstance(field, list):
                fields.extend(field)
            else:
                fields.append(field)

    super(BilingualDataset, self).__init__(examples, fields, **kwargs)
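# A minimal usage sketch for BilingualDataset, assuming parallel files with
# one sentence per line; the file names are illustrative.
from torchtext.data import Field

SRC = Field(tokenize=str.split, init_token="<s>", eos_token="</s>")
TGT = Field(tokenize=str.split, init_token="<s>", eos_token="</s>")

ds = BilingualDataset("train.de", "train.en", SRC, TGT)
SRC.build_vocab(ds, min_freq=2)
TGT.build_vocab(ds, min_freq=2)
print(vars(ds.examples[0]))  # {'src': [...], 'tgt': [...]}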
def seg(self, sentences):
    examples = []
    fields = [('unigram', self.unigram_field),
              ('fwd_bigram', self.bigram_field),
              ('back_bigram', self.bigram_field)]
    for sent in sentences:
        columns = [[], [], []]
        chars = ['<BOS>'] + list(sent) + ['<EOS>']
        # For each character, collect its forward and backward bigrams.
        for c, f_bi, b_bi in zip(chars[1:-1], zip(chars, chars[1:]), zip(chars[1:], chars[2:])):
            columns[0].append(c)
            columns[1].append(''.join(f_bi))
            columns[2].append(''.join(b_bi))
        examples.append(Example.fromlist(columns, fields))

    dataset = data.Dataset(examples, fields)
    data_iter = data.BucketIterator(dataset, batch_size=64, train=False,
                                    shuffle=False, sort=False, device=device)
    decoded = self.model.decode(data_iter)
    segmented_sentence = self.BMSE2seg(sentences, decoded)
    return segmented_sentence
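# A toy illustration of the unigram/bigram columns built above, for the
# input "abc".
sent = 'abc'
chars = ['<BOS>'] + list(sent) + ['<EOS>']
for c, f_bi, b_bi in zip(chars[1:-1], zip(chars, chars[1:]), zip(chars[1:], chars[2:])):
    print(c, ''.join(f_bi), ''.join(b_bi))
# a <BOS>a ab
# b ab bc
# c bc c<EOS>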
def __init__(self, path): """ Create a Castor dataset involving pairs of texts """ fields = [('id', self.ID_FIELD), ('sentence_1', self.TEXT_FIELD), ('sentence_2', self.TEXT_FIELD), ('ext_feats', self.EXT_FEATS_FIELD), ('label', self.LABEL_FIELD), ('sentence_1_raw', self.RAW_TEXT_FIELD), ('sentence_2_raw', self.RAW_TEXT_FIELD)] examples = [] ids, labels, sent_list_1, sent_list_2 = [], [], [], [] with open(path) as f: for line in f: content = json.loads(line) sent_list_1.append(content['question']) sent_list_2.append(content['qaquestion']) word_to_doc_cnt = get_pairwise_word_to_doc_freq( sent_list_1, sent_list_2) overlap_feats = get_pairwise_overlap_features(sent_list_1, sent_list_2, word_to_doc_cnt) self.word_to_doc_cnt = word_to_doc_cnt with open(path) as f: for line in f: content = json.loads(line) ids.append(content['qid']) labels.append(content['qarel']) for pair_id, l1, l2, ext_feats, label in zip(ids, sent_list_1, sent_list_2, overlap_feats, labels): example = Example.fromlist([ pair_id, l1, l2, ext_feats, label, ' '.join(l1), ' '.join(l2) ], fields) examples.append(example) super(SemevalDataset, self).__init__(examples, fields)
def create_example_objs(self, hard_training_instances: list) -> list:
    """Create `Example` objects from the list of hard training instances.

    This method returns a list of `Example` objects that will be used to
    extend the data iterator.

    Arguments:
        hard_training_instances: List of hard training instances across all batches

    Returns:
        A list of torchtext `Example` objects
    """
    example_objs = []
    for instance in hard_training_instances:
        example = Example()
        setattr(example, "src", list(instance[0][0]))
        setattr(example, "trg", list(instance[0][1]))
        example_objs.append(example)
    return example_objs
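# A minimal sketch of how such bare Example objects might be attached to an
# existing dataset so an iterator picks them up on the next epoch. The names
# `model_trainer`, `hard_instances`, and `train_ds` are hypothetical, and the
# bare Example attributes are assumed to already be preprocessed token lists.
from torchtext.data import BucketIterator

new_examples = model_trainer.create_example_objs(hard_instances)  # hypothetical caller
train_ds.examples.extend(new_examples)

train_iter = BucketIterator(train_ds, batch_size=32,
                            sort_key=lambda ex: len(ex.src))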
def __init__(self, path, qnum_field, sent_field, page_field, confidence_field,
             text_field, unigram_field, bigram_field, trigram_field,
             example_mode='sentence', use_wiki=False, n_wiki_sentences=3,
             replace_title_mentions='', **kwargs):
    from unidecode import unidecode

    if use_wiki and 'train' in path:
        base_path = os.path.dirname(path)
        filename = os.path.basename(s3_wiki)
        output_file = os.path.join(base_path, filename)
        if not os.path.exists(output_file):
            download_from_url(s3_wiki, output_file)
        with open(output_file) as f:
            self.wiki_lookup = json.load(f)
    else:
        self.wiki_lookup = {}
    self.path = path
    self.example_mode = example_mode

    text_dependent_fields = []
    if text_field is not None:
        text_dependent_fields.append(('text', text_field))
    if unigram_field is not None:
        text_dependent_fields.append(('unigram', unigram_field))
    if bigram_field is not None:
        text_dependent_fields.append(('bigram', bigram_field))
    if trigram_field is not None:
        text_dependent_fields.append(('trigram', trigram_field))

    example_fields = {
        'qnum': [('qnum', qnum_field)],
        'sent': [('sent', sent_field)],
        'page': [('page', page_field)],
        'confidence': [('confidence', confidence_field)],
        'text': text_dependent_fields
    }

    examples = []
    answer_set = set()
    with open(path) as f:
        for ex in json.load(f)['questions']:
            if example_mode == 'sentence':
                sentences = ex['sentences']
                confidences = ex['confidences']
                for i, s in enumerate(sentences):
                    if len(confidences[i]) != len(s):
                        raise ValueError(str(len(confidences[i])), str(len(s)), ex['qnum'])
                    examples.append(Example.fromdict({
                        'qnum': ex['qnum'],
                        'sent': i,
                        'text': s,
                        'page': ex['page'],
                        'confidence': confidences[i]
                    }, example_fields))
                answer_set.add(ex['page'])
            elif example_mode == 'question':
                raise NotImplementedError(
                    'Question tokenization is not implemented yet, submit a PR!')
            elif example_mode == 'runs':
                raise NotImplementedError(
                    'Run tokenization is not implemented yet, submit a PR!')
            else:
                raise ValueError(
                    f"Valid modes are 'sentence', 'question', and 'runs', but '{example_mode}' was given")

    if use_wiki and n_wiki_sentences > 0 and 'train' in path:
        for page in answer_set:
            if page in self.wiki_lookup:
                sentences = extract_wiki_sentences(
                    page, self.wiki_lookup[page]['text'], n_wiki_sentences,
                    replace_title_mentions=replace_title_mentions)
                for i, s in enumerate(sentences):
                    examples.append(Example.fromdict({
                        'qnum': -1,
                        'sent': i,
                        'text': s,
                        'page': page
                    }, example_fields))

    dataset_fields = {
        'qnum': qnum_field,
        'sent': sent_field,
        'page': page_field,
        'confidence': confidence_field,
    }
    if text_field is not None:
        dataset_fields['text'] = text_field
    if unigram_field is not None:
        dataset_fields['unigram'] = unigram_field
    if bigram_field is not None:
        dataset_fields['bigram'] = bigram_field
    if trigram_field is not None:
        dataset_fields['trigram'] = trigram_field

    super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
def __init__(
    self,
    path,
    qanta_id_field,
    sent_field,
    page_field,
    text_field,
    unigram_field,
    bigram_field,
    trigram_field,
    example_mode="sentence",
    use_wiki=False,
    n_wiki_sentences=3,
    replace_title_mentions="",
    **kwargs,
):
    from unidecode import unidecode

    if use_wiki and "train" in path:
        base_path = os.path.dirname(path)
        filename = os.path.basename(s3_wiki)
        output_file = os.path.join(base_path, filename)
        if not os.path.exists(output_file):
            download_from_url(s3_wiki, output_file)
        with open(output_file) as f:
            self.wiki_lookup = json.load(f)
    else:
        self.wiki_lookup = {}
    self.path = path
    self.example_mode = example_mode

    text_dependent_fields = []
    if text_field is not None:
        text_dependent_fields.append(("text", text_field))
    if unigram_field is not None:
        text_dependent_fields.append(("unigram", unigram_field))
    if bigram_field is not None:
        text_dependent_fields.append(("bigram", bigram_field))
    if trigram_field is not None:
        text_dependent_fields.append(("trigram", trigram_field))

    example_fields = {
        "qanta_id": [("qanta_id", qanta_id_field)],
        "sent": [("sent", sent_field)],
        "page": [("page", page_field)],
        "text": text_dependent_fields,
    }

    examples = []
    answer_set = set()
    with open(path) as f:
        for ex in json.load(f)["questions"]:
            if example_mode == "sentence":
                sentences = [ex["text"][start:end] for start, end in ex["tokenizations"]]
                for i, s in enumerate(sentences):
                    examples.append(Example.fromdict({
                        "qanta_id": ex["qanta_id"],
                        "sent": i,
                        "text": unidecode(s),
                        "page": ex["page"],
                    }, example_fields))
                answer_set.add(ex["page"])
            elif example_mode == "question":
                examples.append(Example.fromdict({
                    "qanta_id": ex["qanta_id"],
                    "sent": -1,
                    "text": unidecode(ex["text"]),
                    "page": ex["page"],
                }, example_fields))
                answer_set.add(ex["page"])
            else:
                raise ValueError(
                    f"Valid modes are 'sentence' and 'question', but '{example_mode}' was given")

    if use_wiki and n_wiki_sentences > 0 and "train" in path:
        print("Loading wikipedia")
        pages = [(p, self.wiki_lookup[p]["text"]) for p in answer_set if p in self.wiki_lookup]

        def extract(args):
            title, text = args
            sentences = extract_wiki_sentences(
                title, text, n_wiki_sentences,
                replace_title_mentions=replace_title_mentions,
            )
            return title, sentences

        for page, sentences in pseq(pages).map(extract).list():
            for i, s in enumerate(sentences):
                examples.append(Example.fromdict({
                    "qanta_id": -1,
                    "sent": i,
                    "text": s,
                    "page": page,
                }, example_fields))

    dataset_fields = {
        "qanta_id": qanta_id_field,
        "sent": sent_field,
        "page": page_field,
    }
    if text_field is not None:
        dataset_fields["text"] = text_field
    if unigram_field is not None:
        dataset_fields["unigram"] = unigram_field
    if bigram_field is not None:
        dataset_fields["bigram"] = bigram_field
    if trigram_field is not None:
        dataset_fields["trigram"] = trigram_field

    super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
def repeat_augment_and_train(dir_to_save, iter_func, model_wrapper, data_source,
                             aug_algo, encoder_model, sim_measure, datasets,
                             text_field, label_field, frac, num_classes,
                             classifier_params, k, learning_type):
    """
    Runs k trials of augmentation & repeat-classification for a given fraction
    of labeled training data.

    Args:
        dir_to_save (str): directory to save models created/loaded during this process
        aug_algo (str): which augmentation algorithm to use
        encoder_model (str): encoder model to use for augmentation (with a
            similarity measure between these encodings)
        sim_measure (str): which similarity measure to use
        datasets (list(Dataset)): train/val/test torchtext datasets
        text_field (Field): torchtext field for sentences
        label_field (LabelField): torchtext LabelField for class labels
        frac (float): fraction of labeled training data to use
        classifier_params (dict): params for the intent classifier to use on augmented data
        k (int): number of times to repeat the augmentation-classifier training process
        learning_type (str): inductive|transductive

    Returns:
        Statistical measures of the results of these trials.
    """
    train_ds, val_ds, test_ds = datasets
    class_accs, aug_accs, aug_fracs = [], [], []
    ps, rs, fs = [], [], []

    for i in tqdm(range(k), total=k):
        examples = train_ds.examples
        np.random.shuffle(examples)
        cutoff = int(frac * len(examples))

        if learning_type == "transductive":
            labeled_examples = train_ds.examples
            unlabeled_examples = test_ds.examples
        elif frac == 0:
            # One labeled example from each class
            classes_seen = {i: 0 for i in range(num_classes)}
            labeled_examples, unlabeled_examples = [], []
            for eg in examples:
                if classes_seen[eg.y] == 0:
                    labeled_examples.append(eg)
                    classes_seen[eg.y] += 1
                else:
                    unlabeled_examples.append(eg)
        else:
            # At least one labeled example from each class
            while True:
                labeled_examples = examples[:cutoff]
                unlabeled_examples = examples[cutoff:]
                if len(set([eg.y for eg in labeled_examples])) == num_classes:
                    break
                np.random.shuffle(examples)

        # (Large commented-out scaffolding elided here: a t-SNE/PCA
        # propagation-process visualisation for the demo and an entropy
        # heuristic based on sigma_fit.)

        if aug_algo == "eda":
            x_l = [eg.x for eg in labeled_examples]
            y_l = [eg.y for eg in labeled_examples]
            augmented_x_l, augmented_y_l = eda_corpus(x_l, y_l)
            new_labeled_data = [{'x': x, 'y': y} for x, y in zip(augmented_x_l, augmented_y_l)]
            augmented_train_examples = [
                Example.fromdict(x, {'x': ('x', text_field), 'y': ('y', label_field)})
                for x in new_labeled_data
            ]
            aug_acc, frac_used = 1, 0
        elif aug_algo == "none":
            augmented_train_examples = labeled_examples
            aug_acc, frac_used = 1, 0
        elif aug_algo == "self_feed":
            sf_thresh = 0.7
            augmented_train_examples, aug_acc, frac_used = self_feed(
                data_source, dir_to_save, iter_func, model_wrapper,
                labeled_examples, unlabeled_examples, val_ds, test_ds,
                text_field, label_field, classifier_params, thresh=sf_thresh)
        else:
            augmented_train_examples, aug_acc, frac_used = augment(
                data_source, aug_algo, encoder_model, sim_measure,
                labeled_examples, unlabeled_examples, train_ds, test_ds,
                text_field, label_field, num_classes, sigma=None)
        aug_accs.append(aug_acc)
        aug_fracs.append(frac_used)

        new_train_ds = data.Dataset(augmented_train_examples,
                                    {'x': text_field, 'y': label_field})
        new_datasets = (new_train_ds, val_ds, test_ds)

        if learning_type == "inductive":
            acc, p, r, f = do_basic_train_and_classify(new_train_ds, test_ds,
                                                       classifier_params, data_source)
        else:  # transductive
            predictions = [eg.y for eg in augmented_train_examples[len(train_ds.examples):]]
            test_Y = [eg.y for eg in test_ds.examples]
            acc = accuracy_score(predictions, test_Y)
            avg = "macro avg" if data_source == "chat" else "weighted avg"
            report = classification_report(predictions, test_Y, output_dict=True)[avg]
            p, r, f = report['precision'], report['recall'], report['f1-score']
        class_accs.append(acc)
        ps.append(p)
        rs.append(r)
        fs.append(f)

    # (Commented-out reporting for the entropy heuristic and a sigma
    # ablation study elided.)

    print(f"FRAC '{frac}' Results Below:")
    print(f'classification acc --> mean: {np.mean(class_accs)}; std: {np.std(class_accs)}')
    print(f'augmentation acc --> mean: {np.mean(aug_accs)}; std: {np.std(aug_accs)}\t'
          f'(average frac used: {np.mean(aug_fracs)})')
    print(f'p/r/f1 means --> precision mean: {np.mean(ps)}; recall mean: {np.mean(rs)}; f1 mean: {np.mean(fs)}')
    print(f'p/r/f1 stds --> precision std: {np.std(ps)}; recall std: {np.std(rs)}; f1 std: {np.std(fs)}')

    class_acc_mean, class_acc_std = np.mean(class_accs), np.std(class_accs)
    aug_acc_mean, aug_acc_std, aug_frac_mean = np.mean(aug_accs), np.std(aug_accs), np.mean(aug_fracs)
    p_mean, r_mean, f_mean = np.mean(ps), np.mean(rs), np.mean(fs)
    p_std, r_std, f_std = np.std(ps), np.std(rs), np.std(fs)

    return (class_acc_mean, class_acc_std, aug_acc_mean, aug_acc_std,
            aug_frac_mean, p_mean, p_std, r_mean, r_std, f_mean, f_std)
def __init__(self, path, qanta_id_field, sent_field, page_field, text_field,
             unigram_field, bigram_field, trigram_field, example_mode='sentence',
             use_wiki=False, n_wiki_sentences=3, replace_title_mentions='',
             **kwargs):
    from unidecode import unidecode

    if use_wiki and 'train' in path:
        base_path = os.path.dirname(path)
        filename = os.path.basename(s3_wiki)
        output_file = os.path.join(base_path, filename)
        if not os.path.exists(output_file):
            download_from_url(s3_wiki, output_file)
        with open(output_file) as f:
            self.wiki_lookup = json.load(f)
    else:
        self.wiki_lookup = {}
    self.path = path
    self.example_mode = example_mode

    text_dependent_fields = []
    if text_field is not None:
        text_dependent_fields.append(('text', text_field))
    if unigram_field is not None:
        text_dependent_fields.append(('unigram', unigram_field))
    if bigram_field is not None:
        text_dependent_fields.append(('bigram', bigram_field))
    if trigram_field is not None:
        text_dependent_fields.append(('trigram', trigram_field))

    example_fields = {
        'qanta_id': [('qanta_id', qanta_id_field)],
        'sent': [('sent', sent_field)],
        'page': [('page', page_field)],
        'text': text_dependent_fields
    }

    examples = []
    answer_set = set()
    with open(path) as f:
        for ex in json.load(f)['questions']:
            if example_mode == 'sentence':
                sentences = [ex['text'][start:end] for start, end in ex['tokenizations']]
                for i, s in enumerate(sentences):
                    examples.append(Example.fromdict({
                        'qanta_id': ex['qanta_id'],
                        'sent': i,
                        'text': unidecode(s),
                        'page': ex['page']
                    }, example_fields))
                answer_set.add(ex['page'])
            elif example_mode == 'question':
                examples.append(Example.fromdict({
                    'qanta_id': ex['qanta_id'],
                    'sent': -1,
                    'text': unidecode(ex['text']),
                    'page': ex['page']
                }, example_fields))
                answer_set.add(ex['page'])
            else:
                raise ValueError(
                    f"Valid modes are 'sentence' and 'question', but '{example_mode}' was given")

    if use_wiki and n_wiki_sentences > 0 and 'train' in path:
        print('Loading wikipedia')
        pages = [(p, self.wiki_lookup[p]['text']) for p in answer_set if p in self.wiki_lookup]

        def extract(args):
            title, text = args
            sentences = extract_wiki_sentences(
                title, text, n_wiki_sentences,
                replace_title_mentions=replace_title_mentions)
            return title, sentences

        for page, sentences in pseq(pages).map(extract).list():
            for i, s in enumerate(sentences):
                examples.append(Example.fromdict({
                    'qanta_id': -1,
                    'sent': i,
                    'text': s,
                    'page': page
                }, example_fields))

    dataset_fields = {
        'qanta_id': qanta_id_field,
        'sent': sent_field,
        'page': page_field,
    }
    if text_field is not None:
        dataset_fields['text'] = text_field
    if unigram_field is not None:
        dataset_fields['unigram'] = unigram_field
    if bigram_field is not None:
        dataset_fields['bigram'] = bigram_field
    if trigram_field is not None:
        dataset_fields['trigram'] = trigram_field

    super(QuizBowl, self).__init__(examples, dataset_fields, **kwargs)
network_params = {
    'ENC_EMB_DIM': 256,
    'DEC_EMB_DIM': 256,
    'ENC_HID_DIM': 512,
    'DEC_HID_DIM': 512,
    'ENC_DROPOUT': 0.5,
    'DEC_DROPOUT': 0.5,
}
network = create_seq2seq(network_params, device)
network.load_state_dict(torch.load('weights/tut1-model.pt'))

# sentence = input('Enter sentence in german: ')
sentence = 'Ein Hund rennt im Schnee.'
while sentence != 'exit':  # 'is not' compares identity, not equality
    # Convert custom sentence to tensor
    example = Example.fromlist([sentence], [('de', src_field)])
    batch = [example.de]
    idx_input = src_field.process(batch).to(device)

    # Translate this tensor (teacher forcing off: ratio 0)
    output_probs = network(idx_input, None, 0)
    idx_output = output_probs.squeeze(1).argmax(axis=1)

    # Convert the output indices back to words
    output_sentence = ' '.join([trg_field.vocab.itos[idx] for idx in idx_output])
    print(output_sentence)

    sentence = input('Enter sentence in german: ')
def _make_example(self, src, tgt):
    return Example.fromdict({
        "src": src.strip(),
        "tgt": tgt.strip(),
    }, self._fields)
def read_one(self, data_file, dataset_type="train"):
    with Path(data_file).open('rb') as f:
        pkl_data = pickle.load(f)
    examples = [Example.fromdict(x, self.fields1) for x in pkl_data]
    return data.Dataset(examples, fields=self.fields2)
def fromTSV(data, fields):
    return Example.fromlist(data.split('\t'), fields)
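# A minimal usage sketch for fromTSV; the field names and input line are
# illustrative.
from torchtext.data import Field, Example

TEXT = Field(sequential=True)
LABEL = Field(sequential=False, use_vocab=False)

line = "the quick brown fox\t1"
ex = fromTSV(line, [('text', TEXT), ('label', LABEL)])
print(ex.text, ex.label)  # ['the', 'quick', 'brown', 'fox'] '1'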