class Environment(object):
    """Training/evaluation environment for the dependency parser.

    From ``args`` this builds (or reloads from ``args.fields_path``) the
    data fields and their vocabularies, selects the compute device, and
    publishes derived sizes/indices back into ``args`` for model setup.

    Args:
        args: project config object; supports both attribute access
            (``args.seed``) and mapping-style access (``args[...]``,
            ``args.update``) — confirm against the config class.
    """

    def __init__(self, args):
        self.args = args
        # init log
        if args.log_path:
            utils.init_log(args.log_path, args.local_rank, args.log_level)
        # init seed: fix both the paddle program seed and numpy's RNG
        fluid.default_main_program().random_seed = args.seed
        np.random.seed(args.seed)
        # init place: a "gpu"/"cpu" device string (presumably consumed by a
        # paddle set_device-style call elsewhere — confirm at the call site)
        if args.use_cuda:
            self.place = "gpu"
        else:
            self.place = "cpu"
        os.environ["FLAGS_paddle_num_threads"] = str(args.threads)
        # exist_ok=True avoids the check-then-create race of the previous
        # `if not os.path.exists(...)` guard and matches the sibling class.
        os.makedirs(self.args.model_files, exist_ok=True)
        if not os.path.exists(args.fields_path) or args.preprocess:
            # No cached fields (or re-preprocessing was explicitly requested):
            # build the fields and vocabularies from the training corpus.
            logging.info("Preprocess the data")
            if args.encoding_model in [
                    "ernie-1.0", "ernie-tiny", "ernie-lstm"
            ]:
                # ERNIE path: the pretrained tokenizer supplies both the
                # special tokens and the word vocabulary.
                tokenizer = ErnieTokenizer.from_pretrained(args.encoding_model)
                args["ernie_vocabs_size"] = len(tokenizer.vocab)
                self.WORD = ErnieField(
                    "word",
                    pad=tokenizer.pad_token,
                    unk=tokenizer.unk_token,
                    bos=tokenizer.cls_token,
                    eos=tokenizer.sep_token,
                    fix_len=args.fix_len,
                    tokenizer=tokenizer,
                )
                self.WORD.vocab = tokenizer.vocab
                # ERNIE handles subwords itself, so no auxiliary feature field.
                args.feat = None
            else:
                self.WORD = Field(
                    "word",
                    pad=utils.pad,
                    unk=utils.unk,
                    bos=utils.bos,
                    eos=utils.eos,
                    lower=True,
                )
            # Optional auxiliary feature field: characters, POS tags, or none.
            if args.feat == "char":
                self.FEAT = SubwordField(
                    "chars",
                    pad=utils.pad,
                    unk=utils.unk,
                    bos=utils.bos,
                    eos=utils.eos,
                    fix_len=args.fix_len,
                    tokenize=list,
                )
            elif args.feat == "pos":
                self.FEAT = Field("postag", bos=utils.bos, eos=utils.eos)
            else:
                self.FEAT = None
            # Gold heads are numericalized directly instead of via a vocab.
            self.ARC = Field(
                "head",
                bos=utils.bos,
                eos=utils.eos,
                use_vocab=False,
                fn=utils.numericalize,
            )
            self.REL = Field("deprel", bos=utils.bos, eos=utils.eos)
            # With char features the FORM column is a (word, chars) pair.
            if args.feat == "char":
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD,
                                    CPOS=self.FEAT,
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)
            train = Corpus.load(args.train_data_path, self.fields)
            if not args.encoding_model.startswith("ernie"):
                self.WORD.build(train, args.min_freq)
                # FEAT is None when args.feat is neither "char" nor "pos";
                # guard so that configuration no longer crashes here.
                if self.FEAT is not None:
                    self.FEAT.build(train)
            self.REL.build(train)
            if args.local_rank == 0:
                # Only the master process persists the fields to disk.
                with open(args.fields_path, "wb") as f:
                    logging.info("dumping fields to disk.")
                    pickle.dump(self.fields, f, protocol=2)
        else:
            # Reuse the previously pickled fields.
            logging.info("loading the fields.")
            with open(args.fields_path, "rb") as f:
                self.fields = pickle.load(f)
            # A tuple FORM means the corpus was preprocessed with char feats.
            if isinstance(self.fields.FORM, tuple):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.ARC, self.REL = self.fields.HEAD, self.fields.DEPREL
        # ERNIE vocabs map token->id directly; Field vocabs expose .stoi.
        if args.encoding_model.startswith("ernie"):
            vocab_items = self.WORD.vocab.items()
        else:
            vocab_items = self.WORD.vocab.stoi.items()
        # Token ids of punctuation — presumably masked out during evaluation;
        # confirm at the metric call site.
        self.puncts = np.array([i for s, i in vocab_items if utils.ispunct(s)],
                               dtype=np.int64)
        # Publish derived sizes/indices; `x and f(x)` keeps None when FEAT
        # is absent (ERNIE path or feat disabled).
        self.args.update({
            "n_words": len(self.WORD.vocab),
            "n_feats": self.FEAT and len(self.FEAT.vocab),
            "n_rels": len(self.REL.vocab),
            "pad_index": self.WORD.pad_index,
            "unk_index": self.WORD.unk_index,
            "bos_index": self.WORD.bos_index,
            "eos_index": self.WORD.eos_index,
            "feat_pad_index": self.FEAT and self.FEAT.pad_index,
        })
# NOTE(review): this is a second `class Environment` in the same module and
# shadows the earlier definition above — likely two versions pasted into one
# file; confirm which one the project actually imports and drop the other.
class Environment(object):
    """Training/evaluation environment (paddle dygraph variant).

    From ``self.args`` this builds (or reloads from ``args.fields_path``)
    the data fields with optional pretrained word embeddings, picks the
    paddle execution place, and publishes derived sizes/indices back into
    ``args`` for model setup.

    Args:
        args: project config object; supports both attribute access
            (``args.seed``) and mapping-style access (``args[...]``,
            ``args.update``) — confirm against the config class.
    """

    def __init__(self, args):
        self.args = args
        # init log
        if self.args.log_path:
            utils.init_log(self.args.log_path, self.args.local_rank,
                           self.args.log_level)
        # init seed: fix both the paddle program seed and numpy's RNG
        fluid.default_main_program().random_seed = self.args.seed
        np.random.seed(self.args.seed)
        # init place: bind to this process's GPU under data-parallel
        # training, GPU 0 otherwise, or fall back to CPU.
        if self.args.use_cuda:
            if self.args.use_data_parallel:
                self.place = fluid.CUDAPlace(
                    fluid.dygraph.parallel.Env().dev_id)
            else:
                self.place = fluid.CUDAPlace(0)
        else:
            self.place = fluid.CPUPlace()
        os.environ['FLAGS_paddle_num_threads'] = str(self.args.threads)
        os.makedirs(self.args.model_files, exist_ok=True)
        if not os.path.exists(self.args.fields_path) or self.args.preprocess:
            # No cached fields (or re-preprocessing was explicitly
            # requested): build the fields from the training corpus.
            logging.info("Preprocess the data")
            self.WORD = Field('word',
                              pad=utils.pad,
                              unk=utils.unk,
                              bos=utils.bos,
                              lower=True)
            # Auxiliary feature field: characters or POS tags.
            if self.args.feat == 'char':
                self.FEAT = SubwordField('chars',
                                         pad=utils.pad,
                                         unk=utils.unk,
                                         bos=utils.bos,
                                         fix_len=self.args.fix_len,
                                         tokenize=list)
            else:
                self.FEAT = Field('postag', bos=utils.bos)
            # Gold heads are numericalized directly instead of via a vocab.
            self.ARC = Field('head',
                             bos=utils.bos,
                             use_vocab=False,
                             fn=utils.numericalize)
            self.REL = Field('deprel', bos=utils.bos)
            # With char features the FORM column is a (word, chars) pair.
            if self.args.feat == 'char':
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD,
                                    CPOS=self.FEAT,
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)
            train = Corpus.load(self.args.train_data_path, self.fields)
            # Optional pretrained word embeddings folded into WORD's vocab.
            if self.args.pretrained_embedding_dir:
                logging.info("loading pretrained embedding from file.")
                embed = Embedding.load(self.args.pretrained_embedding_dir,
                                       self.args.unk)
            else:
                embed = None
            self.WORD.build(train, self.args.min_freq, embed)
            self.FEAT.build(train)
            self.REL.build(train)
            if self.args.local_rank == 0:
                # Only the master process persists the fields to disk.
                with open(self.args.fields_path, "wb") as f:
                    logging.info("dumping fields to disk.")
                    pickle.dump(self.fields, f, protocol=2)
        else:
            # Reuse the previously pickled fields.
            logging.info("loading the fields.")
            with open(self.args.fields_path, "rb") as f:
                self.fields = pickle.load(f)
            # A tuple FORM means the corpus was preprocessed with char feats.
            if isinstance(self.fields.FORM, tuple):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.ARC, self.REL = self.fields.HEAD, self.fields.DEPREL
        # Token ids of punctuation — presumably masked out during
        # evaluation; confirm at the metric call site.
        self.puncts = np.array(
            [i for s, i in self.WORD.vocab.stoi.items() if utils.ispunct(s)],
            dtype=np.int64)
        # Record the pretrained embedding shape (or None) for model setup.
        if self.WORD.embed is not None:
            self.args["pretrained_embed_shape"] = self.WORD.embed.shape
        else:
            self.args["pretrained_embed_shape"] = None
        # Publish derived sizes/indices back into the shared config.
        self.args.update({
            'n_words': self.WORD.vocab.n_init,
            'n_feats': len(self.FEAT.vocab),
            'n_rels': len(self.REL.vocab),
            'pad_index': self.WORD.pad_index,
            'unk_index': self.WORD.unk_index,
            'bos_index': self.WORD.bos_index,
            'feat_pad_index': self.FEAT.pad_index
        })