# Script section: seed RNGs, then load the pre-pickled English train/dev
# datasets and the French train/dev datasets from temp/, and assemble the
# labeled French pool. Defines: train_dataset, dev_dataset,
# train_dataset_fr, dev_dataset_fr, labeled_dataset_fr.

# set random seed
seed_everything(args.seed, USE_CUDA)
use_bert = args.use_bert

# do preprocessing
print('\t start loading data...')
start_t = time.time()

# Hoist the repeated os.path.dirname(__file__) lookup — the original
# recomputed it for every path; all inputs live under temp/ next to this file.
base_dir = os.path.dirname(__file__)

train_input_file = os.path.join(base_dir, 'temp/train.pickle.input')
dev_input_file = os.path.join(base_dir, 'temp/dev.pickle.input')
train_data = data_utils.load_dump_data(train_input_file)
dev_data = data_utils.load_dump_data(dev_input_file)

train_dataset = train_data['input_data']
# NOTE(review): dev set is truncated to the first 9000 examples — presumably
# to bound evaluation time; confirm the cutoff is intentional.
dev_dataset = dev_data['input_data'][:9000]

train_input_file_fr = os.path.join(base_dir, 'temp/train_fr.pickle.input')
dev_input_file_fr = os.path.join(base_dir, 'temp/dev_fr.pickle.input')
train_data_fr = data_utils.load_dump_data(train_input_file_fr)
dev_data_fr = data_utils.load_dump_data(dev_input_file_fr)

train_dataset_fr = train_data_fr['input_data']
dev_dataset_fr = dev_data_fr['input_data']
# French train + dev together form the labeled French pool.
labeled_dataset_fr = train_dataset_fr + dev_dataset_fr
# NOTE(review): this chunk begins mid-statement — the call that ends with the
# fragment below starts before the visible source; its opening is missing here.
tmp_path, 'dev.pickle.input'))

# Report how long the preprocessing phase above took, in whole seconds.
log('\t data preprocessing finished! consuming {} s'.format(
    int(time.time() - start_t)))

# Load the pre-pickled train / dev / unlabeled datasets from temp/.
log('\t start loading data...')
start_t = time.time()
train_input_file = os.path.join(os.path.dirname(__file__), 'temp/train.pickle.input')
dev_input_file = os.path.join(os.path.dirname(__file__), 'temp/dev.pickle.input')
unlabeled_input_file = os.path.join(os.path.dirname(__file__), 'temp/unlabeled.pickle.input')
train_data = data_utils.load_dump_data(train_input_file)
dev_data = data_utils.load_dump_data(dev_input_file)
unlabeled_data = data_utils.load_dump_data(unlabeled_input_file)
# Each dump is a dict; 'input_data' holds the actual example list
# (presumably — schema is defined by data_utils, verify there).
train_dataset = train_data['input_data']
dev_dataset = dev_data['input_data']
unlabeled_dataset = unlabeled_data['input_data']

# Load the English and French vocabulary mappings (word<->index tables).
word2idx = data_utils.load_dump_data(
    os.path.join(os.path.dirname(__file__), 'temp/word2idx.bin'))
idx2word = data_utils.load_dump_data(
    os.path.join(os.path.dirname(__file__), 'temp/idx2word.bin'))
fr_word2idx = data_utils.load_dump_data(
    os.path.join(os.path.dirname(__file__), 'temp/fr_word2idx.bin'))
# NOTE(review): chunk ends mid-statement — the argument to this final call
# (presumably 'temp/fr_idx2word.bin') lies beyond the visible source.
fr_idx2word = data_utils.load_dump_data(
# Script section: seed RNGs, then load the pre-pickled English CoNLL / UPB
# training data and the French dev set from temp/. Defines: train_dataset,
# dev_dataset, dev_dataset_fr, labeled_dataset_fr.

# set random seed
seed_everything(args.seed, USE_CUDA)
use_bert = args.use_bert

# do preprocessing
print('\t start loading data...')
start_t = time.time()

# Hoist the repeated os.path.dirname(__file__) lookup; all inputs live
# under temp/ next to this file.
base_dir = os.path.dirname(__file__)

En_train_file_CoNLL = os.path.join(base_dir, 'temp/En_train_conll.pickle.input')
Fr_dev_file = os.path.join(base_dir, 'temp/Fr_dev.pickle.input')
En_train_data_CoNLL = data_utils.load_dump_data(En_train_file_CoNLL)
En_train_file_UPB = os.path.join(base_dir, 'temp/En_train_UPB.pickle.input')
En_train_data_UPB = data_utils.load_dump_data(En_train_file_UPB)
Fr_dev_data = data_utils.load_dump_data(Fr_dev_file)

train_dataset = En_train_data_CoNLL['input_data']
# NOTE(review): dev_dataset aliases the English CoNLL *training* data —
# confirm this is intentional and not meant to be a held-out split.
dev_dataset = En_train_data_CoNLL['input_data']

# Bug fix: the original re-loaded 'temp/Fr_dev.pickle.input' from disk a
# second time here; reuse the Fr_dev_data already loaded above instead.
# The variable names are kept so downstream code is unaffected.
dev_input_file_fr = Fr_dev_file
dev_data_fr = Fr_dev_data
dev_dataset_fr = dev_data_fr['input_data']

# The French dev set is the only labeled French data in this configuration.
labeled_dataset_fr = dev_dataset_fr
print(len(labeled_dataset_fr))