Example #1
import json
import pickle
from pathlib import Path

# load_bert, _process_examples and _convert_examples_to_features are helpers
# defined elsewhere in the project.


def compute_bert_formatted_inputs(data_folder, max_seq_length=128):

    tokenizer, _, _, _ = load_bert(lower_case=True, base_version=True)

    # Generate formatted input
    for dataset_type in ['train', 'test']:
        with open(Path(data_folder, f'splitted_{dataset_type}_sentences.json'),
                  'r') as f:
            dataset = json.load(f)

            # Just convert examples to InputExamples
            splitted_dataset = _process_examples(dataset)

            # Convert to BERT input.
            # examples_features is a list of lists: the outer list holds examples
            # (documents); the inner list holds one feature object per sentence.
            examples_features = _convert_examples_to_features(
                examples=splitted_dataset,
                seq_length=max_seq_length,
                tokenizer=tokenizer)

            print(f'Sanity check: parsed {len(examples_features)} examples')

            with open(
                    Path(data_folder,
                         f'formatted_{dataset_type}_dataset.pickle'),
                    'wb') as w:
                pickle.dump(examples_features, w)
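
A minimal usage sketch for the example above, assuming the project helpers (load_bert, _process_examples, _convert_examples_to_features) are importable and that data_folder already contains splitted_train_sentences.json and splitted_test_sentences.json; the folder path is illustrative, not taken from the source:

# Hypothetical call; the path is an assumption.
compute_bert_formatted_inputs('data/MovieReview', max_seq_length=128)
# Writes formatted_train_dataset.pickle and formatted_test_dataset.pickle into the
# folder; each file holds a list of examples, and each example is a list of
# per-sentence feature objects.
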
Example #2

def compute_bert_formatted_inputs(data_folder, max_seq_length=512):

    tokenizer, _, _, _ = load_bert(lower_case=True, base_version=True)

    # Generate formatted input
    for dataset_type in ['train', 'validation', 'test']:
        with open(
                Path(data_folder, f'spouse_{dataset_type}_set_finetune.json'),
                'r') as f:
            dataset = json.load(f)

            # Just convert examples to InputExamples
            splitted_dataset = _process_examples(dataset)

            # Convert to BERT input
            examples_features = _convert_examples_to_features(
                examples=splitted_dataset,
                seq_length=max_seq_length,
                tokenizer=tokenizer)

            print(f'Sanity check: parsed {len(examples_features)} examples')

            with open(
                    Path(
                        data_folder,
                        f'spouse_formatted_{dataset_type}_finetune_dataset.pickle'
                    ), 'wb') as w:
                pickle.dump(examples_features, w)
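
For completeness, a short sketch of how one of the pickles written above could be loaded back before fine-tuning; the data folder is an assumption, while the file name follows the pattern used by the code:

import pickle
from pathlib import Path

data_folder = 'data/Spouse'  # assumed location of the dumps
with open(Path(data_folder, 'spouse_formatted_train_finetune_dataset.pickle'), 'rb') as f:
    examples_features = pickle.load(f)
print(f'Loaded {len(examples_features)} examples')
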
                            # Append this sentence's tensor to the running per-example tensor
                            processed_example[key] = torch.cat((processed_example[key], val), dim=0)

                    # Advance the offset past the tokens of the sentence just processed
                    starting_token_idx += no_tokens

                    processed_example['tokens'].extend(sentence_tokens)

                # Store the per-example tensors to disk for later fine-tuning
                store_path = processed_path

                if not os.path.exists(store_path):
                    os.makedirs(store_path)

                torch.save(processed_example, Path(store_path, f'{example_filename[:-6]}_processed.torch'))

                if example_no % 1000 == 0:
                    print(f'Processed {example_no} examples')
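
The concatenation above grows one per-example tensor a sentence at a time along dim=0; a tiny standalone illustration of the pattern (the hidden size and sentence lengths below are assumptions):

import torch

embedding_dim = 768                      # assumed BERT-base hidden size
running = torch.zeros(0, embedding_dim)  # empty per-example tensor
for num_tokens in (5, 9, 7):             # hypothetical sentence lengths
    sentence_embeddings = torch.randn(num_tokens, embedding_dim)
    running = torch.cat((running, sentence_embeddings), dim=0)
print(running.shape)  # torch.Size([21, 768])
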


if __name__ == '__main__':
    formatted_data_folder = '../../../data/MovieReview'
    processed_data_folder = '../../../data/MovieReview_FineTune/'

    # Load BERT model
    _, bert_model, device, embedding_dim = load_bert(lower_case=True, base_version=True)

    bert_moviereview_parser = MovieReviewBuilder(formatted_data_folder, processed_data_folder)

    # Computes embeddings for each example
    bert_moviereview_parser.compute_moviereview_embeddings(bert_model, device)

    # Processes examples for subsequent training
    bert_moviereview_parser.process_embeddings(embedding_dim)
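
As a quick sanity check on the output of process_embeddings, one of the saved files can be loaded back; the concrete file name below is hypothetical (it only mirrors the f'{example_filename[:-6]}_processed.torch' pattern used above), and only the 'tokens' key is confirmed by the fragment shown:

import torch
from pathlib import Path

processed_data_folder = '../../../data/MovieReview_FineTune/'
# 'example_0_processed.torch' is a placeholder file name for illustration.
processed_example = torch.load(Path(processed_data_folder, 'example_0_processed.torch'))
print(list(processed_example.keys()))     # per-example tensors plus the 'tokens' list
print(len(processed_example['tokens']))   # number of BERT tokens in the example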