def __init__(self, data_bundle, vocabulary=None):
    """Unpack a data bundle and encode its text with a vocabulary.

    Args:
        data_bundle: Seven-item sequence unpacked, in order, into
            ``contexts``, ``questions``, ``choices``, ``labels``,
            ``choices_map``, ``context_lens`` and ``qs_lens``.
        vocabulary: Vocabulary to reuse. When it is falsy a new one is
            built with ``rn.get_vocab`` from the questions and contexts
            using ``min_frequency=10``.

    Side effects:
        Prints the first ten contexts, questions and labels, and the
        transformed label indices, to stdout.
    """
    (
        self.contexts,
        self.questions,
        self.choices,
        self.labels,
        self.choices_map,
        self.context_lens,
        self.qs_lens,
    ) = data_bundle

    self.vocab = vocabulary if vocabulary else rn.get_vocab(
        self.questions, self.contexts, min_frequency=10)
    self.vocab_size = len(self.vocab.vocabulary_)

    # Collect every distinct choice across all examples, in sorted order.
    distinct_choices = set()
    for candidate_group in self.choices:
        distinct_choices.update(candidate_group)
    self.labels_idx = sorted(distinct_choices)

    # Diagnostic output of the raw (untransformed) data.
    for preview in (self.contexts, self.questions, self.labels):
        print(preview[:10])

    # Keep only the first element of each transformed choice.
    self.transformed_labels_idx = [
        encoded[0] for encoded in self.vocab.transform(self.labels_idx)
    ]
    print(self.transformed_labels_idx)

    self.contexts = rn.vocab_transform(self.contexts, self.vocab)
    self.questions = rn.vocab_transform(self.questions, self.vocab)
def __init__(self, data_bundle, vocabulary=None):
    """Unpack a data bundle, encode it, and locate each question's placeholder.

    Args:
        data_bundle: Seven-item sequence unpacked, in order, into
            ``contexts``, ``questions``, ``choices``, ``labels``,
            ``choices_map``, ``context_lens`` and ``qs_lens``.
        vocabulary: Vocabulary to reuse. When it is falsy a new one is
            built with ``rn.get_vocab`` from the questions and contexts
            using ``min_frequency=10``.

    Raises:
        ValueError: From ``list.index`` when a transformed question does
            not contain the placeholder id.

    Side effects:
        Prints several diagnostics to stdout (transformed label indices,
        raw questions lacking ``@placeholder``, the placeholder id, the
        number of transformed questions containing it, and the shape of
        ``placeholder_inds``).
    """
    (
        self.contexts,
        self.questions,
        self.choices,
        self.labels,
        self.choices_map,
        self.context_lens,
        self.qs_lens,
    ) = data_bundle

    self.vocab = vocabulary if vocabulary else rn.get_vocab(
        self.questions, self.contexts, min_frequency=10)
    self.vocab_size = len(self.vocab.vocabulary_)

    # Every distinct choice across all examples, in sorted order.
    distinct_choices = set()
    for candidate_group in self.choices:
        distinct_choices.update(candidate_group)
    self.labels_idx = sorted(distinct_choices)

    # Keep only the first element of each transformed choice.
    self.transformed_labels_idx = [
        encoded[0] for encoded in self.vocab.transform(self.labels_idx)
    ]
    print(self.transformed_labels_idx)

    # Diagnostic: raw questions that have no '@placeholder' token.
    without_placeholder = []
    for raw_question in self.questions:
        if '@placeholder' not in raw_question.split(" "):
            without_placeholder.append(raw_question)
    print(without_placeholder)

    self.contexts = rn.vocab_transform(self.contexts, self.vocab)
    self.questions = rn.vocab_transform(self.questions, self.vocab)

    # Id of the placeholder token: first element of the first transformed row.
    encoded_placeholder = rn.vocab_transform(['@placeholder'], self.vocab)
    placeholder_idx = encoded_placeholder[0][0]
    print(placeholder_idx)
    print(sum(1 for row in self.questions if placeholder_idx in row))

    # Position of the first placeholder occurrence in every question.
    positions = []
    for row in self.questions:
        positions.append(list(row).index(placeholder_idx))
    self.placeholder_inds = np.array(positions).astype(int)
    print(self.placeholder_inds.shape)
def __init__(self, config, data_path=None, vocabulary=None, name=None):
    """Load a dataset from ``data_path`` and encode it with a vocabulary.

    Args:
        config: Object providing ``batch_size`` and ``num_steps``.
        data_path: Passed unchanged to ``read.load_data``.
        vocabulary: Vocabulary to reuse. When it is falsy a new one is
            built with ``read.get_vocab`` from the raw questions and
            contexts using ``min_frequency=500``.
        name: Not used by this method; kept so existing callers that
            pass it keep working.
    """
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps = config.num_steps

    (
        raw_context,
        raw_questions,
        raw_choices,
        raw_labels,
        self.choices_map,
    ) = read.load_data(data_path)
    # build_choices receives the choices in their original, un-joined form.
    all_choices = read.build_choices(raw_choices)

    self.epoch_size = ((len(raw_context) // batch_size) - 1) // num_steps

    # Build the vocabulary from this data unless one was supplied.
    if not vocabulary:
        self.vocabulary = read.get_vocab(
            raw_questions, raw_context, min_frequency=500)
    else:
        self.vocabulary = vocabulary

    # Join each example's choices into one space-separated string exactly
    # once. Joining the already-joined strings again would iterate them
    # character by character and put a space between every character.
    joined_choices = [" ".join(x) for x in raw_choices]

    self.all_choices = read.vocab_transform(all_choices, self.vocabulary)
    self.questions = read.vocab_transform(raw_questions, self.vocabulary)
    self.context = read.vocab_transform(raw_context, self.vocabulary)
    self.labels = read.vocab_transform(raw_labels, self.vocabulary)
    self.choices = read.vocab_transform(joined_choices, self.vocabulary)
def __init__(self, data_bundle, vocabulary=None):
    """Unpack a data bundle, then encode and pad its contexts and questions.

    Args:
        data_bundle: Seven-item sequence unpacked, in order, into
            contexts, questions, ``choices``, ``labels``, ``choices_map``,
            ``context_lens`` and ``qs_lens``.
        vocabulary: Vocabulary to reuse. When it is falsy a new one is
            built with ``rn.get_vocab`` from the questions and contexts
            using ``min_frequency=FLAGS.min_freq``.
    """
    (
        raw_contexts,
        raw_questions,
        self.choices,
        self.labels,
        self.choices_map,
        self.context_lens,
        self.qs_lens,
    ) = data_bundle

    self.vocab = vocabulary if vocabulary else rn.get_vocab(
        raw_questions, raw_contexts, min_frequency=FLAGS.min_freq)
    self.vocab_size = len(self.vocab.vocabulary_)

    # Every distinct choice across all examples, in sorted order.
    self.labels_idx = sorted(
        {option for option_group in self.choices for option in option_group}
    )

    # Contexts: transform with the vocabulary, then pad to FLAGS.context_steps.
    encoded_contexts = rn.vocab_transform(raw_contexts, self.vocab)
    self.contexts = rn.pad_eval(encoded_contexts, FLAGS.context_steps)

    # Questions: transform with the vocabulary, then pad to FLAGS.question_steps.
    encoded_questions = rn.vocab_transform(raw_questions, self.vocab)
    self.questions = rn.pad_eval(encoded_questions, FLAGS.question_steps)
from reader import load_data, get_vocab, vocab_transform, batch_iter

# 1. Load the data found under "wdw/test".
(
    contexts,
    questions,
    choices,
    labels,
    choices_map,
    context_lens,
    qs_lens,
) = load_data(data_path="wdw/test")

# 2. Fit the vocabulary on the contexts and questions.
vocab = get_vocab(contexts, questions)

# 3. Transform the contexts and questions with that vocabulary.
contexts = vocab_transform(contexts, vocab)
questions = vocab_transform(questions, vocab)

# 4. Hand everything to batch_iter.
readers = batch_iter(
    contexts,
    questions,
    choices,
    labels,
    choices_map,
    context_lens,
    qs_lens,
)

# Example of consuming the batches (left disabled):
# for q, c, ch, lab, ch_map, c_lens, q_lens in readers:
#     print(c.shape)
#     break