Example 1
    def __init__(self, predicate):
        # Requires pandas as pd plus the project's Dataset base class and
        # Vectorizer; `predicate` selects which preprocessed RTE file to load.
        Dataset.__init__(self)

        if predicate == 'bert':
            filename = './data/rte_transformed_dataset_bert.csv'
        elif predicate == 'cleaned':
            filename = './data/rte_transformed_dataset_cleaned_stem.csv'
        else:
            raise ValueError("unknown predicate: %s" % predicate)

        dt = pd.read_csv(filename)
        # Group the data points and merge worker responses for each data point.
        dt = dt.groupby(['taskContent', 'taskID', 'goldLabel']).agg(
            workerID=('workerID', list),
            response=('response', list)).reset_index()

        y = dt['goldLabel'].values
        r = dt['response'].values.tolist()
        X = dt['taskContent'].values

        # Fit the vectorizer on the task texts and encode them as features.
        v = Vectorizer()
        v.fit(X.astype('U'))
        X = v.transform(X.astype('U'))

        self.poolData = X
        self.poolGoldLabels = y
        self.poolWorkerResponses = r
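
The groupby/agg step above is what folds multiple worker rows into a single row per task. A minimal self-contained sketch of that pattern with invented toy data (column names taken from the example):

import pandas as pd

# Toy crowdsourcing rows: two workers answered task 1, one answered task 2.
toy = pd.DataFrame({
    'taskID': [1, 1, 2],
    'taskContent': ['premise A', 'premise A', 'premise B'],
    'goldLabel': [1, 1, 0],
    'workerID': ['w1', 'w2', 'w3'],
    'response': [1, 0, 0],
})

merged = toy.groupby(['taskContent', 'taskID', 'goldLabel']).agg(
    workerID=('workerID', list),
    response=('response', list)).reset_index()

print(merged)
# Each task now carries a list of worker IDs and a parallel list of their
# responses, e.g. task 1 -> workerID ['w1', 'w2'], response [1, 0].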
Example 2
import sys

def load_data(data_file,
              word_tokens,
              pristine_input,
              pristine_output,
              batch_size,
              seq_length=50,
              seq_step=25):
    # Read the full training corpus up front; fail fast if it is missing.
    try:
        with open(data_file, encoding='utf-8') as input_file:
            all_text = input_file.read()
    except FileNotFoundError:
        print("No input file found at", data_file)
        sys.exit(1)

    # try:
    #     with open(os.path.join(data_dir, 'validate.txt'), encoding='utf-8') as validate_file:
    #         text_val = validate_file.read()
    #         skip_validate = False
    # except FileNotFoundError:
    #     pass  # Validation text optional

    # Find some good default seed string in our source text.
    # self.seeds = find_random_seeds(text)
    # Include our validation texts with our vectorizer
    vectorizer = Vectorizer.Vectorizer(all_text, word_tokens, pristine_input,
                                       pristine_output)
    data = vectorizer.vectorize(all_text)
    x, y = shape_for_stateful_rnn(data, batch_size, seq_length, seq_step)
    print("Word_tokens:", word_tokens)
    print('x.shape:', x.shape)
    print('y.shape:', y.shape)

    return x, y, vectorizer
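
shape_for_stateful_rnn is project-specific, but its seq_length/seq_step parameters point at overlapping next-token windows. A simplified stand-in illustrating just the windowing idea (the real function additionally arranges windows so consecutive stateful batches line up, which is omitted here):

import numpy as np

def overlapping_windows(data, seq_length=50, seq_step=25):
    # Cut an encoded corpus into overlapping windows of seq_length tokens,
    # advancing seq_step tokens each time; y is x shifted one token ahead.
    xs, ys = [], []
    for start in range(0, len(data) - seq_length - 1, seq_step):
        xs.append(data[start:start + seq_length])
        ys.append(data[start + 1:start + seq_length + 1])
    return np.array(xs), np.array(ys)

x, y = overlapping_windows(np.arange(200))
print(x.shape, y.shape)  # (6, 50) (6, 50)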
Example 3
    def load_datasets(self):
        config = self.config
        args = self.args
        cwd = os.getcwd()
        # One Vectorizer is shared so the training and validation sets are
        # encoded with the same vocabulary.
        vectorizer = Vectorizer(min_frequency=config.min_freq)

        data_path = cwd + config.relative_data_path
        training_abstracts = headline2abstractdataset(
            data_path,
            vectorizer,
            args.cuda,
            max_len=1000,
            use_topics=config.use_topics,
            use_structure_info=config.use_labels)

        validation_data_path = cwd + config.relative_dev_path
        validation_abstracts = headline2abstractdataset(
            validation_data_path,
            vectorizer,
            args.cuda,
            max_len=1000,
            use_topics=config.use_topics,
            use_structure_info=config.use_labels)
        # In distributed runs, only the rank-0 process logs.
        if args.local_rank == 0:
            print("number of training examples: %d" % len(training_abstracts),
                  flush=True)
        return training_abstracts, validation_abstracts
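
The min_frequency=config.min_freq argument suggests the Vectorizer drops rare tokens when building its vocabulary. A self-contained sketch of that idea, not this project's implementation (the <pad>/<unk> names are invented; reserving index 0 for padding matches the padding_idx=0 embedding in Example 4):

from collections import Counter

def build_vocab(texts, min_frequency=2):
    # Keep only tokens seen at least min_frequency times; reserve index 0
    # for padding and index 1 for out-of-vocabulary tokens.
    counts = Counter(tok for text in texts for tok in text.split())
    vocab = {'<pad>': 0, '<unk>': 1}
    for tok, n in counts.most_common():
        if n >= min_frequency:
            vocab[tok] = len(vocab)
    return vocab

print(build_vocab(['deep learning for text', 'deep text models']))
# {'<pad>': 0, '<unk>': 1, 'deep': 2, 'text': 3}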
Example 4
# Set the random seed manually for reproducibility.
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
    else:
        torch.cuda.manual_seed(args.seed)

config = Config()
#config = ConfigTest()

cwd = os.getcwd()
data_path = cwd + config.relative_data_path
vectorizer = Vectorizer(min_frequency=config.min_freq)
abstracts = headline2abstractdataset(data_path,
                                     vectorizer,
                                     args.cuda,
                                     max_len=1000)
print("number of training examples: %d" % len(abstracts))

# The dataset's fitted Vectorizer determines the vocabulary size, which in
# turn sizes the shared embedding table; index 0 is reserved for padding.
vocab_size = abstracts.vectorizer.vocabulary_size
embedding = nn.Embedding(vocab_size, config.emsize, padding_idx=0)
encoder_title = EncoderRNN(vocab_size,
                           embedding,
                           abstracts.head_len,
                           config.emsize,
                           input_dropout_p=config.dropout,
                           n_layers=config.nlayers,
                           bidirectional=config.bidirectional,
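
Example 4 sizes a shared nn.Embedding from the Vectorizer's vocabulary and passes padding_idx=0. A minimal sketch of what padding_idx does (dimensions invented for illustration):

import torch
import torch.nn as nn

# padding_idx=0 initializes the pad token's vector to zeros and keeps its
# gradient at zero, so padded positions contribute nothing to the encoder.
emb = nn.Embedding(num_embeddings=10, embedding_dim=4, padding_idx=0)
batch = torch.tensor([[2, 5, 0, 0]])  # one sequence, padded with index 0
out = emb(batch)
print(out[0, 2])  # all-zero vector at the padded position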