# Try to load cached dictionaries; rebuild and cache them if that fails.
try:
    print("Loading dictionaries...")
    with open(".pickle/" + sys.argv[1] + ".pkl", "rb") as pickle_src:
        src_dict = pickle.load(pickle_src)
    with open(".pickle/" + sys.argv[2] + ".pkl", "rb") as pickle_tgt:
        tgt_dict = pickle.load(pickle_tgt)
except (OSError, pickle.PickleError, EOFError):
    print("Error loading cached dictionaries...")
    print("Creating source dictionary")
    src_dict = Dictionary(src_file_path, lambda text: word_tokenize(text))
    src_dict.build_dictionary()
    with open(".pickle/" + sys.argv[1] + ".pkl", "wb") as f:
        pickle.dump(src_dict, f)
    print("Creating target dictionary")
    tgt_dict = Dictionary(tgt_file_path, lambda text: word_tokenize(text, 'portuguese'))
    tgt_dict.build_dictionary()
    with open(".pickle/" + sys.argv[2] + ".pkl", "wb") as f:
        pickle.dump(tgt_dict, f)

batch_src_it = None
batch_tgt_it = None

# The first 80% of each file feeds training; the 80%-90% slice is used for validation.
src_it = ReadFileIterator(src_file_path, end=0.8)
tgt_it = ReadFileIterator(tgt_file_path, end=0.8)
valid_src_it = ReadFileIterator(src_file_path, start=0.8, end=0.9)
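# For reference, a minimal sketch of the ReadFileIterator behaviour assumed above: it
# yields the lines of a file that fall between the start and end fractions, which is how
# the 80%/10% train/validation split is expressed. This is an illustrative assumption,
# not the project's actual implementation.
class ReadFileIterator:
    def __init__(self, file_path, start=0.0, end=1.0):
        self._file_path = file_path
        self._start = start
        self._end = end

    def __iter__(self):
        # Read the file once and yield only the slice between the two fractions.
        with open(self._file_path, encoding="utf-8") as f:
            lines = f.readlines()
        lo = int(len(lines) * self._start)
        hi = int(len(lines) * self._end)
        for line in lines[lo:hi]:
            yield line.rstrip("\n")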
def create_dict(self):
    d = Dictionary(self._dict_set)
    d.build_dictionary()
    self._dictionary = d
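# Hypothetical calling context for create_dict(), assuming the owner object has already
# populated self._dict_set with the corpus vocabulary (the names below are illustrative,
# not taken from the project):
#
#   reader.load_corpus("train.txt")   # fills reader._dict_set
#   reader.create_dict()              # builds and stores reader._dictionary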
# Fix random seeds so repeated runs produce identical results.
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True

start_time = time.time()
logger.info("Loading data...")
logger.info('Building dictionary ...')

data = pd.read_csv(train_file, sep='\t')
if args.word:
    # Word-level input: keep the raw text of each row.
    data = data['text'].values.tolist()
else:
    # Character-level input: strip whitespace, then separate every character with a
    # space (see the illustration after this block).
    data = data['text'].apply(lambda x: " ".join("".join(x.split())))

if args.dictionary is None:
    # No vocabulary file supplied: build one from the training data and cache it.
    dictionary = Dictionary()
    dictionary.build_dictionary(data)
    del data
    joblib.dump(dictionary, root_path + '/model/vocab.bin')
else:
    dictionary = joblib.load(args.dictionary)

logger.info('Making dataset & dataloader...')
train_dataset = MyDataset(train_file, dictionary, args.max_length,
                          tokenizer=tokenizer, word=args.word)
train_dataloader = DataLoader(train_dataset,
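# Quick illustration of the word- vs character-level branch above, on a hypothetical string:
#
#   text = "deep learning"
#   " ".join("".join(text.split()))  ->  'd e e p l e a r n i n g'   # args.word is False
#   text is kept as-is                                               # args.word is True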