def __init__(self, predicate):
    Dataset.__init__(self)
    filename = ''
    if predicate == 'bert':
        filename = './data/rte_transformed_dataset_bert.csv'
    elif predicate == 'cleaned':
        filename = './data/rte_transformed_dataset_cleaned_stem.csv'
    dt = pd.read_csv(filename)
    # Group the data points and merge worker responses for each data point.
    dt = dt.groupby(['taskContent', 'taskID', 'goldLabel']).agg(
        workerID=('workerID', lambda x: list(x)),
        response=('response', lambda x: list(x))).reset_index()
    y = dt['goldLabel'].values
    r = dt['response'].values.tolist()
    X = dt['taskContent'].values
    v = Vectorizer()
    v.fit(X.astype('U'))
    X = v.transform(X.astype('U'))
    self.poolData = X
    self.poolGoldLabels = y
    self.poolWorkerResponses = r
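# The groupby/agg call above is the heart of this loader: it collapses one
# row per (task, worker) pair into one row per task, merging worker IDs and
# responses into parallel lists. A toy run on a hypothetical three-row frame
# (only pandas is assumed) shows the merge:

import pandas as pd

dt = pd.DataFrame({
    'taskContent': ['premise A', 'premise A', 'premise B'],
    'taskID': [1, 1, 2],
    'goldLabel': [1, 1, 0],
    'workerID': ['w1', 'w2', 'w1'],
    'response': [1, 0, 0],
})
merged = dt.groupby(['taskContent', 'taskID', 'goldLabel']).agg(
    workerID=('workerID', list),
    response=('response', list)).reset_index()
print(merged)
#   taskContent  taskID  goldLabel  workerID response
# 0   premise A       1          1  [w1, w2]   [1, 0]
# 1   premise B       2          0      [w1]      [0]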
import sys

def load_data(data_file, word_tokens, pristine_input, pristine_output,
              batch_size, seq_length=50, seq_step=25):
    try:
        with open(data_file, encoding='utf-8') as input_file:
            all_text = input_file.read()
    except FileNotFoundError:
        print('No input file found at', data_file)
        sys.exit(1)

    # try:
    #     with open(os.path.join(data_dir, 'validate.txt'), encoding='utf-8') as validate_file:
    #         text_val = validate_file.read()
    #     skip_validate = False
    # except FileNotFoundError:
    #     pass  # Validation text optional

    # Find some good default seed string in our source text.
    # self.seeds = find_random_seeds(text)

    # Include our validation texts with our vectorizer
    vectorizer = Vectorizer.Vectorizer(all_text, word_tokens,
                                       pristine_input, pristine_output)
    data = vectorizer.vectorize(all_text)
    x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
    print('word_tokens:', word_tokens)
    print('x.shape:', x.shape)
    print('y.shape:', y.shape)
    return x, y, vectorizer
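# shape_for_stateful_rnn is defined elsewhere in this project; the sketch
# below is an assumption about the standard technique the name implies, not
# the project's actual code. The idea: cut the vectorized token stream into
# windows, then reorder them so that row i of batch k+1 continues row i of
# batch k, which is what Keras-style stateful RNNs require. The _sketch name
# and the 1-D integer `data` input are hypothetical.

import numpy as np

def shape_for_stateful_rnn_sketch(data, batch_size, seq_length, seq_step):
    # x windows start every seq_step tokens; y is x shifted right by one.
    # With seq_step < seq_length, consecutive windows deliberately overlap.
    starts = range(0, len(data) - seq_length, seq_step)
    x = np.array([data[s:s + seq_length] for s in starts])
    y = np.array([data[s + 1:s + seq_length + 1] for s in starts])
    # Drop the remainder so the window count divides evenly into batches.
    n = (len(x) // batch_size) * batch_size
    x, y = x[:n], y[:n]
    passes = n // batch_size
    # Treat each of the batch_size row-chunks as one contiguous stream, then
    # interleave: afterwards batch k is rows [k*batch_size:(k+1)*batch_size]
    # and row i of each batch continues row i of the previous batch.
    x = x.reshape(batch_size, passes, seq_length).swapaxes(0, 1).reshape(n, seq_length)
    y = y.reshape(batch_size, passes, seq_length).swapaxes(0, 1).reshape(n, seq_length)
    return x, y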
def load_datasets(self):
    config = self.config
    args = self.args
    cwd = os.getcwd()
    vectorizer = Vectorizer(min_frequency=config.min_freq)
    data_path = cwd + config.relative_data_path
    training_abstracts = headline2abstractdataset(
        data_path, vectorizer, args.cuda, max_len=1000,
        use_topics=config.use_topics,
        use_structure_info=config.use_labels)
    validation_data_path = cwd + config.relative_dev_path
    validation_abstracts = headline2abstractdataset(
        validation_data_path, vectorizer, args.cuda, max_len=1000,
        use_topics=config.use_topics,
        use_structure_info=config.use_labels)
    if args.local_rank == 0:
        print("number of training examples: %d" % len(training_abstracts),
              flush=True)
    return training_abstracts, validation_abstracts
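# Note that the same Vectorizer instance is handed to both constructors, so
# the validation split is encoded with the vocabulary built from the training
# split. A hypothetical call pattern (paths and min_frequency invented; the
# third positional argument is the cuda flag, as in the calls above):

vectorizer = Vectorizer(min_frequency=5)
train_set = headline2abstractdataset('./data/train.dat', vectorizer, False, max_len=1000)
dev_set = headline2abstractdataset('./data/dev.dat', vectorizer, False, max_len=1000)
assert dev_set.vectorizer is train_set.vectorizer  # one shared vocabulary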
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, "
              "so you should probably run with --cuda")
    else:
        torch.cuda.manual_seed(args.seed)

config = Config()
# config = ConfigTest()
cwd = os.getcwd()
data_path = cwd + config.relative_data_path
vectorizer = Vectorizer(min_frequency=config.min_freq)
abstracts = headline2abstractdataset(data_path, vectorizer, args.cuda,
                                     max_len=1000)
print("number of training examples: %d" % len(abstracts))
vocab_size = abstracts.vectorizer.vocabulary_size
embedding = nn.Embedding(vocab_size, config.emsize, padding_idx=0)
encoder_title = EncoderRNN(vocab_size, embedding, abstracts.head_len,
                           config.emsize,
                           input_dropout_p=config.dropout,
                           n_layers=config.nlayers,
                           bidirectional=config.bidirectional,
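# One detail worth flagging is padding_idx=0 on the embedding: PyTorch
# zero-initializes that row and masks its gradient, so padded positions never
# influence training. A self-contained check of this standard PyTorch
# behavior (independent of this project):

import torch
import torch.nn as nn

emb = nn.Embedding(10, 4, padding_idx=0)
print(emb.weight[0])             # the pad row starts as all zeros
out = emb(torch.tensor([0, 3]))  # embed a pad token and a real token
out.sum().backward()
print(emb.weight.grad[0])        # still zeros: the pad row gets no gradient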