import pickle
import sys

from nltk.tokenize import word_tokenize

try:
    print("Loading dictionaries...")
    with open(".pickle/" + sys.argv[1] + ".pkl", "rb") as pickle_src:
        src_dict = pickle.load(pickle_src)

    with open(".pickle/" + sys.argv[2] + ".pkl", "rb") as pickle_tgt:
        tgt_dict = pickle.load(pickle_tgt)

except (OSError, pickle.PickleError):
    # Cache missing or unreadable: rebuild both dictionaries from the corpora.
    print("Error...")
    print("Creating source dictionary")
    src_dict = Dictionary(src_file_path, lambda text: word_tokenize(text))
    src_dict.build_dictionary()
    with open(".pickle/" + sys.argv[1] + ".pkl", "wb") as f:
        pickle.dump(src_dict, f)

    print("Creating target dictionary")
    tgt_dict = Dictionary(tgt_file_path,
                          lambda text: word_tokenize(text, 'portuguese'))
    tgt_dict.build_dictionary()
    with open(".pickle/" + sys.argv[2] + ".pkl", "wb") as f:
        pickle.dump(tgt_dict, f)

batch_src_it = None
batch_tgt_it = None

# Fractional split over the same files: first 80% for training, next 10% for validation.
src_it = ReadFileIterator(src_file_path, end=0.8)
tgt_it = ReadFileIterator(tgt_file_path, end=0.8)

valid_src_it = ReadFileIterator(src_file_path, start=0.8, end=0.9)
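The example above relies on the project-local `ReadFileIterator`, whose fractional `start`/`end` arguments carve one file into train and validation slices. A minimal sketch of an iterator with those semantics, assuming a line-based split; the names and behavior here are illustrative, not the project's actual implementation:

class ReadFileIterator:
    """Yield the lines of a text file between two fractional offsets.

    Illustrative sketch only: the real project class may differ.
    """

    def __init__(self, file_path, start=0.0, end=1.0):
        self.file_path = file_path
        self.start = start
        self.end = end

    def __iter__(self):
        with open(self.file_path, encoding="utf-8") as f:
            lines = f.readlines()
        # Convert the fractional bounds into line indices.
        lo = int(len(lines) * self.start)
        hi = int(len(lines) * self.end)
        for line in lines[lo:hi]:
            yield line.rstrip("\n")

With this sketch, `ReadFileIterator("corpus.txt", end=0.8)` would yield the first 80% of the lines, matching how the snippet builds its training and validation iterators.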
Example 2
def create_dict(self):
    """Build the Dictionary from the prepared token set and cache it on the instance."""
    d = Dictionary(self._dict_set)
    d.build_dictionary()
    self._dictionary = d
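Every example on this page assumes a project-local `Dictionary` class whose `build_dictionary` method populates a vocabulary; note that the constructor signatures differ from project to project. A minimal sketch of the general pattern, assuming a token-to-id mapping built from counted tokens (all names here are illustrative, not any specific project's API):

from collections import Counter

class Dictionary:
    """Map tokens to integer ids.

    Illustrative sketch only: each project's real Dictionary differs.
    """

    def __init__(self, token_source=None):
        self._token_source = token_source or []
        self.token2id = {}
        self.id2token = []

    def build_dictionary(self, tokens=None, min_count=1):
        # Count tokens, then assign ids in descending frequency order.
        counts = Counter(tokens if tokens is not None else self._token_source)
        for token, count in counts.most_common():
            if count >= min_count and token not in self.token2id:
                self.token2id[token] = len(self.id2token)
                self.id2token.append(token)

Real implementations usually also reserve special ids (padding, unknown) and expose lookup helpers, but the frequency-ordered mapping above is the core that `build_dictionary` produces.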
Example 3
import time

import joblib
import pandas as pd
import torch
from torch.utils.data import DataLoader

torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make results reproducible across runs

start_time = time.time()
logger.info("Loading data...")

logger.info('Building dictionary ...')

data = pd.read_csv(train_file, sep='\t')
if args.word:
    # Word-level: keep each text row as-is.
    data = data['text'].values.tolist()
else:
    # Character-level: strip whitespace, then separate every character with a space.
    data = data['text'].apply(lambda x: " ".join("".join(x.split()))).tolist()
if args.dictionary is None:
    # No cached vocabulary given: build one from the training data and save it.
    dictionary = Dictionary()
    dictionary.build_dictionary(data)
    del data
    joblib.dump(dictionary, root_path + '/model/vocab.bin')
else:
    dictionary = joblib.load(args.dictionary)

# `tokenizer` is assumed to be defined earlier in the surrounding code.

logger.info('Making dataset & dataloader...')
train_dataset = MyDataset(train_file,
                          dictionary,
                          args.max_length,
                          tokenizer=tokenizer,
                          word=args.word)
train_dataloader = DataLoader(train_dataset,