def build_fields(ds: Dataset):
    # initialize the frequency counter
    counters = defaultdict(Counter)
    _src_vocab, _src_vocab_size = inputter._load_vocab(
        ds.vocab.source, 'src', counters)
    _tgt_vocab, _tgt_vocab_size = inputter._load_vocab(
        ds.vocab.target, 'tgt', counters)

    # initialize fields
    src_nfeats, tgt_nfeats = 0, 0  # do not support word features for now
    fields = inputter.get_fields(
        defaults.vocabulary["data_type"], src_nfeats, tgt_nfeats)

    return inputter._build_fields_vocab(fields, counters, **defaults.vocabulary)
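# A minimal sketch (not from the source) of the `defaults.vocabulary` mapping
# that build_fields() above unpacks with **defaults.vocabulary. The keys are an
# assumption: they mirror the positional arguments passed to _build_fields_vocab
# in build_dynamic_fields() below (data_type, share_vocab, vocab_size_multiple,
# and the src/tgt size and minimum-frequency limits); the values are placeholders.
_example_vocabulary_defaults = {
    "data_type": "text",
    "share_vocab": False,
    "vocab_size_multiple": 1,
    "src_vocab_size": 50000,
    "src_words_min_frequency": 0,
    "tgt_vocab_size": 50000,
    "tgt_words_min_frequency": 0,
}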
def build_dynamic_fields(opts, src_specials=None, tgt_specials=None):
    """Build fields for dynamic, including load & build vocab."""
    fields = _get_dynamic_fields(opts)

    counters = defaultdict(Counter)

    logger.info("Loading vocab from text file...")
    _src_vocab, _src_vocab_size = _load_vocab(
        opts.src_vocab, 'src', counters,
        min_freq=opts.src_words_min_frequency)

    if opts.src_feats_vocab:
        for feat_name, filepath in opts.src_feats_vocab.items():
            _, _ = _load_vocab(filepath, feat_name, counters, min_freq=0)

    if opts.tgt_vocab:
        _tgt_vocab, _tgt_vocab_size = _load_vocab(
            opts.tgt_vocab, 'tgt', counters,
            min_freq=opts.tgt_words_min_frequency)
    elif opts.share_vocab:
        logger.info("Sharing src vocab to tgt...")
        counters['tgt'] = counters['src']
    else:
        raise ValueError("-tgt_vocab should be specified if not share_vocab.")

    logger.info("Building fields with vocab in counters...")
    fields = _build_fields_vocab(
        fields, counters, 'text', opts.share_vocab,
        opts.vocab_size_multiple,
        opts.src_vocab_size, opts.src_words_min_frequency,
        opts.tgt_vocab_size, opts.tgt_words_min_frequency,
        src_specials=src_specials, tgt_specials=tgt_specials)

    return fields
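# Hedged usage sketch for build_dynamic_fields(): a hypothetical `opts` namespace
# carrying only the attributes the function body above reads directly. File paths
# and sizes are placeholders, and _get_dynamic_fields(opts) may read further
# options not shown here, so the actual call is left commented out.
from argparse import Namespace

example_opts = Namespace(
    src_vocab="data/vocab.src",        # plain-text vocab file, one token per line
    src_words_min_frequency=0,
    src_feats_vocab=None,              # or e.g. {"pos": "data/vocab.pos"}
    tgt_vocab="data/vocab.tgt",        # may be empty when share_vocab is set
    tgt_words_min_frequency=0,
    share_vocab=False,
    vocab_size_multiple=1,
    src_vocab_size=50000,
    tgt_vocab_size=50000,
)
# fields = build_dynamic_fields(example_opts)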
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader,
                       align_reader, opt):
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        counters = defaultdict(Counter)
        srcs = opt.train_src
        tgts = opt.train_tgt
        ids = opt.train_ids
        aligns = opt.train_align
    elif corpus_type == 'valid':
        counters = None
        srcs = [opt.valid_src]
        tgts = [opt.valid_tgt]
        ids = [None]
        aligns = [opt.valid_align]

    src_vocab, tgt_vocab, existing_fields = maybe_load_vocab(
        corpus_type, counters, opt)

    existing_shards = check_existing_pt_files(
        opt, corpus_type, ids, existing_fields)

    # every corpus has shards, no new one
    if existing_shards == ids and not opt.overwrite:
        return

    def shard_iterator(srcs, tgts, ids, aligns, existing_shards,
                       existing_fields, corpus_type, opt):
        """Builds a single iterator yielding every shard of every corpus."""
        for src, tgt, maybe_id, maybe_align in zip(srcs, tgts, ids, aligns):
            if maybe_id in existing_shards:
                if opt.overwrite:
                    logger.warning(
                        "Overwrite shards for corpus {}".format(maybe_id))
                else:
                    if corpus_type == "train":
                        assert existing_fields is not None,\
                            ("A 'vocab.pt' file should be passed to "
                             "`-src_vocab` when adding a corpus to "
                             "a set of already existing shards.")
                    logger.warning("Ignore corpus {} because "
                                   "shards already exist".format(maybe_id))
                    continue
            if ((corpus_type == "train" or opt.filter_valid)
                    and tgt is not None):
                filter_pred = partial(
                    inputters.filter_example,
                    use_src_len=opt.data_type == "text",
                    max_src_len=opt.src_seq_length,
                    max_tgt_len=opt.tgt_seq_length)
            else:
                filter_pred = None
            src_shards = split_corpus(src, opt.shard_size)
            tgt_shards = split_corpus(tgt, opt.shard_size)
            align_shards = split_corpus(maybe_align, opt.shard_size)
            for i, (ss, ts, a_s) in enumerate(
                    zip(src_shards, tgt_shards, align_shards)):
                yield (i, (ss, ts, a_s, maybe_id, filter_pred))

    shard_iter = shard_iterator(srcs, tgts, ids, aligns, existing_shards,
                                existing_fields, corpus_type, opt)

    # This variant processes shards sequentially with map(); the
    # multiprocessing.Pool path (see the next variant of build_save_dataset)
    # was disabled here while debugging.
    dataset_params = (corpus_type, fields, src_reader, tgt_reader,
                      align_reader, opt, existing_fields, src_vocab, tgt_vocab)
    func = partial(process_one_shard, dataset_params)
    for sub_counter in map(func, shard_iter):
        if sub_counter is not None:
            for key, value in sub_counter.items():
                counters[key].update(value)

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        if existing_fields is None:
            fields = _build_fields_vocab(
                fields, counters, opt.data_type,
                opt.share_vocab, opt.vocab_size_multiple,
                opt.src_vocab_size, opt.src_words_min_frequency,
                opt.tgt_vocab_size, opt.tgt_words_min_frequency)
        else:
            fields = existing_fields

        torch.save(fields, vocab_path)

        return True  # kept for debugging (from sehun)
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader, opt):
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        counters = defaultdict(Counter)
        srcs = opt.train_src
        tgts = opt.train_tgt
        ids = opt.train_ids
    else:
        srcs = [opt.valid_src]
        tgts = [opt.valid_tgt]
        ids = [None]

    logger.info(opt)

    for src, tgt, maybe_id in zip(srcs, tgts, ids):
        logger.info("Reading source and target files: %s %s." % (src, tgt))

        src_shards = split_corpus(src, opt.shard_size)
        tgt_shards = split_corpus(tgt, opt.shard_size)
        shard_pairs = zip(src_shards, tgt_shards)
        dataset_paths = []
        if (corpus_type == "train" or opt.filter_valid) and tgt is not None:
            filter_pred = partial(
                inputters.filter_example,
                use_src_len=opt.data_type == "text",
                max_src_len=opt.src_seq_length,
                max_tgt_len=opt.tgt_seq_length)
        else:
            filter_pred = None

        if corpus_type == "train":
            existing_fields = None
            if opt.src_vocab != "":
                try:
                    logger.info("Using existing vocabulary...")
                    existing_fields = torch.load(opt.src_vocab)
                except torch.serialization.pickle.UnpicklingError:
                    logger.info("Building vocab from text file...")
                    src_vocab, src_vocab_size = _load_vocab(
                        opt.src_vocab, "src", counters,
                        opt.src_words_min_frequency)
            else:
                src_vocab = None

            if opt.tgt_vocab != "":
                tgt_vocab, tgt_vocab_size = _load_vocab(
                    opt.tgt_vocab, "tgt", counters,
                    opt.tgt_words_min_frequency)
            else:
                tgt_vocab = None

        for i, (src_shard, tgt_shard) in enumerate(shard_pairs):
            assert len(src_shard) == len(tgt_shard)
            logger.info("Building shard %d." % i)
            # @memray: to be different from normal datasets
            dataset = inputters.str2dataset[opt.data_type](
                fields,
                readers=([src_reader, tgt_reader]
                         if tgt_reader else [src_reader]),
                data=([("src", src_shard), ("tgt", tgt_shard)]
                      if tgt_reader else [("src", src_shard)]),
                dirs=([opt.src_dir, None] if tgt_reader else [opt.src_dir]),
                sort_key=inputters.str2sortkey[opt.data_type],
                filter_pred=filter_pred)

            if corpus_type == "train" and existing_fields is None:
                for ex in dataset.examples:
                    for name, field in fields.items():
                        try:
                            f_iter = iter(field)
                        except TypeError:
                            f_iter = [(name, field)]
                            all_data = [getattr(ex, name, None)]
                        else:
                            all_data = getattr(ex, name)
                        for (sub_n, sub_f), fd in zip(f_iter, all_data):
                            has_vocab = \
                                (sub_n == 'src' and src_vocab is not None) or \
                                (sub_n == 'tgt' and tgt_vocab is not None)
                            if (hasattr(sub_f, 'sequential')
                                    and sub_f.sequential and not has_vocab):
                                val = fd
                                if opt.data_type == 'keyphrase' and sub_n == 'tgt':
                                    # in this case, val is a list of phrases
                                    # (list of strings (words))
                                    for v in val:
                                        counters[sub_n].update(v)
                                else:
                                    counters[sub_n].update(val)

            if maybe_id:
                shard_base = corpus_type + "_" + maybe_id
            else:
                shard_base = corpus_type
            data_path = "{:s}.{:s}.{:d}.pt".format(opt.save_data, shard_base, i)
            dataset_paths.append(data_path)

            logger.info(" * saving %sth %s data shard to %s. %d examples"
                        % (i, corpus_type, data_path, len(dataset.examples)))

            dataset.save(data_path)

            del dataset.examples
            gc.collect()
            del dataset
            gc.collect()

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        if existing_fields is None:
            fields = _build_fields_vocab(
                fields, counters, opt.data_type,
                opt.share_vocab, opt.vocab_size_multiple,
                opt.src_vocab_size, opt.src_words_min_frequency,
                opt.tgt_vocab_size, opt.tgt_words_min_frequency)
        else:
            fields = existing_fields

        torch.save(fields, vocab_path)
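# Small illustration (not data from the source) of the keyphrase branch above:
# for data_type == 'keyphrase' the target value is a list of phrases, each a
# list of word tokens, so the Counter is updated once per phrase; for plain
# text the value is already a flat token list.
from collections import Counter

_demo_counter = Counter()

keyphrase_tgt = [["neural", "networks"], ["machine", "translation"]]
for phrase in keyphrase_tgt:       # mirrors: for v in val: counters[sub_n].update(v)
    _demo_counter.update(phrase)

text_tgt = ["neural", "machine", "translation"]
_demo_counter.update(text_tgt)     # mirrors: counters[sub_n].update(val)

print(_demo_counter.most_common(3))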
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader,
                       align_reader, opt):
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        counters = defaultdict(Counter)
        srcs = opt.train_src
        tgts = opt.train_tgt
        ids = opt.train_ids
        aligns = opt.train_align
    elif corpus_type == 'valid':
        counters = None
        srcs = [opt.valid_src]
        tgts = [opt.valid_tgt]
        ids = [None]
        aligns = [opt.valid_align]

    src_vocab, tgt_vocab, existing_fields = maybe_load_vocab(
        corpus_type, counters, opt)

    existing_shards = check_existing_pt_files(
        opt, corpus_type, ids, existing_fields)

    # every corpus has shards, no new one
    if existing_shards == ids and not opt.overwrite:
        return

    def shard_iterator(srcs, tgts, ids, aligns, existing_shards,
                       existing_fields, corpus_type, opt):
        """Builds a single iterator yielding every shard of every corpus."""
        for src, tgt, maybe_id, maybe_align in zip(srcs, tgts, ids, aligns):
            if maybe_id in existing_shards:
                if opt.overwrite:
                    logger.warning(
                        "Overwrite shards for corpus {}".format(maybe_id))
                else:
                    if corpus_type == "train":
                        assert existing_fields is not None,\
                            ("A 'vocab.pt' file should be passed to "
                             "`-src_vocab` when adding a corpus to "
                             "a set of already existing shards.")
                    logger.warning("Ignore corpus {} because "
                                   "shards already exist".format(maybe_id))
                    continue
            if ((corpus_type == "train" or opt.filter_valid)
                    and tgt is not None):
                filter_pred = partial(
                    inputters.filter_example,
                    use_src_len=opt.data_type == "text",
                    max_src_len=opt.src_seq_length,
                    max_tgt_len=opt.tgt_seq_length)
            else:
                filter_pred = None
            src_shards = split_corpus(src, opt.shard_size)
            tgt_shards = split_corpus(tgt, opt.shard_size)
            align_shards = split_corpus(maybe_align, opt.shard_size)
            for i, (ss, ts, a_s) in enumerate(
                    zip(src_shards, tgt_shards, align_shards)):
                yield (i, (ss, ts, a_s, maybe_id, filter_pred))

    shard_iter = shard_iterator(srcs, tgts, ids, aligns, existing_shards,
                                existing_fields, corpus_type, opt)

    with Pool(opt.num_threads) as p:
        dataset_params = (corpus_type, fields, src_reader, tgt_reader,
                          align_reader, opt, existing_fields,
                          src_vocab, tgt_vocab)
        func = partial(process_one_shard, dataset_params)
        for sub_counter in p.imap(func, shard_iter):
            if sub_counter is not None:
                for key, value in sub_counter.items():
                    counters[key].update(value)

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        if existing_fields is None:
            fields = _build_fields_vocab(
                fields, counters, opt.data_type,
                opt.share_vocab, opt.vocab_size_multiple,
                opt.src_vocab_size, opt.src_words_min_frequency,
                opt.tgt_vocab_size, opt.tgt_words_min_frequency)
        else:
            fields = existing_fields

        torch.save(fields, vocab_path)
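# Sketch of the worker interface the Pool loop above assumes (this is NOT the
# real process_one_shard from the source, just a stand-in matching the calling
# convention): each worker receives (dataset_params, shard_item) where
# shard_item = (i, (src_shard, tgt_shard, align_shard, corpus_id, filter_pred)),
# and returns a {field_name: Counter} mapping, or None for validation shards,
# which the parent then merges into the global counters.
from collections import Counter, defaultdict
from functools import partial
from multiprocessing import Pool


def fake_process_one_shard(dataset_params, shard_item):
    i, (src_shard, tgt_shard, align_shard, corpus_id, filter_pred) = shard_item
    sub_counters = defaultdict(Counter)
    for line in src_shard:
        sub_counters["src"].update(line.split())
    for line in tgt_shard:
        sub_counters["tgt"].update(line.split())
    return sub_counters


if __name__ == "__main__":
    shards = [(0, (["a b a"], ["x y"], [None], None, None)),
              (1, (["b c"], ["y z"], [None], None, None))]
    merged = defaultdict(Counter)
    with Pool(2) as p:
        for sub_counter in p.imap(partial(fake_process_one_shard, None), shards):
            if sub_counter is not None:
                for key, value in sub_counter.items():
                    merged[key].update(value)
    print(dict(merged))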
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader,
                       tag_reader, align_reader, opt):
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        counters = defaultdict(Counter)
        srcs = opt.train_src
        tgts = opt.train_tgt
        ids = opt.train_ids
        aligns = opt.train_align
        # wei 20200721
        nfr_tags = opt.train_nfr_tag
        # end wei
        # wei 20200730
        flat_tags = opt.train_flat_tag
        # end wei
    elif corpus_type == 'valid':
        counters = None
        srcs = [opt.valid_src]
        tgts = [opt.valid_tgt]
        ids = [None]
        aligns = [opt.valid_align]
        # wei 20200723
        nfr_tags = [opt.valid_nfr_tag]
        # end wei
        # wei 20200730
        flat_tags = [opt.valid_flat_tag]
        # end wei

    src_vocab, tgt_vocab, existing_fields = maybe_load_vocab(
        corpus_type, counters, opt)

    existing_shards = check_existing_pt_files(
        opt, corpus_type, ids, existing_fields)

    # every corpus has shards, no new one
    if existing_shards == ids and not opt.overwrite:
        return

    # def shard_iterator(srcs, tgts, ids, aligns, existing_shards,  # wei 20200721
    # def shard_iterator(srcs, tgts, ids, aligns, tags, existing_shards,  # wei 20200730
    def shard_iterator(srcs, tgts, ids, aligns, nfr_tags, flat_tags,
                       existing_shards, existing_fields, corpus_type, opt):
        """Builds a single iterator yielding every shard of every corpus."""
        for src, tgt, maybe_id, maybe_align, nfr_tag, flat_tag in zip(
                srcs, tgts, ids, aligns, nfr_tags, flat_tags):
            if maybe_id in existing_shards:
                if opt.overwrite:
                    logger.warning(
                        "Overwrite shards for corpus {}".format(maybe_id))
                else:
                    if corpus_type == "train":
                        assert existing_fields is not None,\
                            ("A 'vocab.pt' file should be passed to "
                             "`-src_vocab` when adding a corpus to "
                             "a set of already existing shards.")
                    logger.warning("Ignore corpus {} because "
                                   "shards already exist".format(maybe_id))
                    continue
            if ((corpus_type == "train" or opt.filter_valid)
                    and tgt is not None):
                filter_pred = partial(
                    inputters.filter_example,
                    use_src_len=opt.data_type == "text",
                    max_src_len=opt.src_seq_length,
                    max_tgt_len=opt.tgt_seq_length)
            else:
                filter_pred = None
            src_shards = split_corpus(src, opt.shard_size)
            tgt_shards = split_corpus(tgt, opt.shard_size)
            align_shards = split_corpus(maybe_align, opt.shard_size)
            # wei 20200721
            nfr_tag_shards = split_corpus(nfr_tag, opt.shard_size)
            # end wei
            # wei 20200730
            flat_tag_shards = split_corpus(flat_tag, opt.shard_size)
            # end wei
            # for i, (ss, ts, a_s) in enumerate(  # wei 20200721
            for i, (ss, ts, a_s, n_t_s, f_t_s) in enumerate(
                    # zip(src_shards, tgt_shards, align_shards)):  # wei 20200721
                    # zip(src_shards, tgt_shards, align_shards, tag_shards)):  # wei 20200730
                    zip(src_shards, tgt_shards, align_shards,
                        nfr_tag_shards, flat_tag_shards)):
                # yield (i, (ss, ts, a_s, maybe_id, filter_pred))  # wei 20200721
                # yield (i, (ss, ts, a_s, t_s, maybe_id, filter_pred))  # wei 20200730
                yield (i, (ss, ts, a_s, n_t_s, f_t_s, maybe_id, filter_pred))

    # shard_iter = shard_iterator(srcs, tgts, ids, aligns, existing_shards,  # wei 20200721
    # shard_iter = shard_iterator(srcs, tgts, ids, aligns, tags, existing_shards,  # wei 20200730
    shard_iter = shard_iterator(srcs, tgts, ids, aligns, nfr_tags, flat_tags,
                                existing_shards, existing_fields,
                                corpus_type, opt)

    with Pool(opt.num_threads) as p:
        # dataset_params = (corpus_type, fields, src_reader, tgt_reader,  # wei 20200721
        dataset_params = (corpus_type, fields, src_reader, tgt_reader,
                          tag_reader, align_reader, opt, existing_fields,
                          src_vocab, tgt_vocab)
        func = partial(process_one_shard, dataset_params)
        for sub_counter in p.imap(func, shard_iter):
            if sub_counter is not None:
                for key, value in sub_counter.items():
                    counters[key].update(value)

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        new_fields = _build_fields_vocab(
            fields, counters, opt.data_type,
            opt.share_vocab, opt.vocab_size_multiple,
            opt.src_vocab_size, opt.src_words_min_frequency,
            opt.tgt_vocab_size, opt.tgt_words_min_frequency,
            subword_prefix=opt.subword_prefix,
            subword_prefix_is_joiner=opt.subword_prefix_is_joiner)
        if existing_fields is None:
            fields = new_fields
        else:
            fields = existing_fields

        if old_style_vocab(fields):
            fields = load_old_vocab(
                fields, opt.data_type, dynamic_dict=opt.dynamic_dict)

        # patch corpus_id
        if fields.get("corpus_id", False):
            fields["corpus_id"].vocab = new_fields["corpus_id"].vocab_cls(
                counters["corpus_id"])

        torch.save(fields, vocab_path)
def build_save_dataset(corpus_type, fields, src_reader, tgt_reader,
                       align_reader, opt):
    assert corpus_type in [config.train, config.valid]

    if corpus_type == config.train:
        counters = defaultdict(Counter)
        srcs = opt.train_src
        tgts = opt.train_tgt
        ids = opt.train_ids
        aligns = opt.train_align
    elif corpus_type == config.valid:
        counters = None
        srcs = [opt.valid_src]
        tgts = [opt.valid_tgt]
        ids = [None]
        aligns = [opt.valid_align]

    src_vocab, tgt_vocab, existing_fields = maybe_load_vocab(
        corpus_type, counters, opt
    )

    existing_shards = []

    def shard_iterator(srcs, tgts, ids, aligns, existing_shards,
                       existing_fields, corpus_type, opt):
        for src, tgt, maybe_id, maybe_align in zip(srcs, tgts, ids, aligns):
            if maybe_id in existing_shards:
                if opt.overwrite:
                    logger.warning("Overwrite shards for corpus {}"
                                   .format(maybe_id))
                else:
                    if corpus_type == config.train:
                        assert existing_fields is not None, \
                            ("A 'vocab.pt' file should be passed to "
                             "`-src_vocab` when adding a corpus to "
                             "a set of already existing shards.")
                    logger.warning("Ignore corpus {} because "
                                   "shards already exist"
                                   .format(maybe_id))
                    continue
            if ((corpus_type == "train" or opt.filter_valid)
                    and tgt is not None):
                filter_pred = partial(
                    inputters.filter_example,
                    use_src_len=opt.data_type == "text",
                    max_src_len=opt.src_seq_length,
                    max_tgt_len=opt.tgt_seq_length)
            else:
                filter_pred = None
            src_shards = split_corpus(src, opt.shard_size)
            tgt_shards = split_corpus(tgt, opt.shard_size)
            align_shards = split_corpus(maybe_align, opt.shard_size)
            for i, (ss, ts, a_s) in enumerate(
                    zip(src_shards, tgt_shards, align_shards)):
                yield (i, (ss, ts, a_s, maybe_id, filter_pred))

    shard_iter = shard_iterator(srcs, tgts, ids, aligns, existing_shards,
                                existing_fields, corpus_type, opt)

    with Pool(opt.num_threads) as p:
        dataset_params = (
            corpus_type, fields, src_reader, tgt_reader, align_reader,
            opt, existing_fields, src_vocab, tgt_vocab
        )
        func = partial(process_one_shard, dataset_params)
        for sub_counter in p.imap(func, shard_iter):
            if sub_counter is not None:
                for key, value in sub_counter.items():
                    counters[key].update(value)

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        new_fields = _build_fields_vocab(
            fields, counters, opt.data_type,
            opt.share_vocab, opt.vocab_size_multiple,
            opt.src_vocab_size, opt.src_words_min_frequency,
            opt.tgt_vocab_size, opt.tgt_words_min_frequency,
            subword_prefix=opt.subword_prefix,
            subword_prefix_is_joiner=opt.subword_prefix_is_joiner)
        if existing_fields is None:
            fields = new_fields
        else:
            fields = existing_fields

        if old_style_vocab(fields):
            fields = load_old_vocab(
                fields, opt.data_type, dynamic_dict=opt.dynamic_dict)

        # patch corpus_id
        if fields.get("corpus_id", False):
            fields["corpus_id"].vocab = new_fields["corpus_id"].vocab_cls(
                counters["corpus_id"])

        torch.save(fields, vocab_path)
def build_save_dataset(corpus_type, fields, src_reader, opt):
    assert corpus_type in ['train', 'valid']

    if corpus_type == 'train':
        counters = defaultdict(Counter)
        srcs = opt.train_src
        ids = opt.train_ids
    else:
        srcs = [opt.valid_src]
        ids = [None]

    for src, maybe_id in zip(srcs, ids):
        logger.info("Reading source files: %s." % src)

        # src_shards = split_corpus(src, opt.shard_size)
        src_shards = split_corpus(src, 0)
        dataset_paths = []
        # if (corpus_type == "train" or opt.filter_valid):
        #     filter_pred = partial(
        #         inputters.filter_example, use_src_len=opt.data_type == "text",
        #         max_src_len=opt.src_seq_length, max_tgt_len=opt.tgt_seq_length)
        # else:
        filter_pred = None

        if corpus_type == "train":
            existing_fields = None
            if opt.src_vocab != "":
                try:
                    logger.info("Using existing vocabulary...")
                    existing_fields = torch.load(opt.src_vocab)
                except torch.serialization.pickle.UnpicklingError:
                    logger.info("Building vocab from text file...")
                    src_vocab, src_vocab_size = _load_vocab(
                        opt.src_vocab, "src", counters,
                        opt.src_words_min_frequency)
            else:
                src_vocab = None

        for i, _src_shard in enumerate(src_shards):  # not considered shard
            logger.info("Building shard %d." % i)

            src_shard = []
            for j, line in enumerate(_src_shard):
                if len(line.strip().split("\t")) == 6:
                    src_shard.append(line)

            _id = [line.strip().split("\t")[0] for line in src_shard[1:]]
            # _id = [i for i, line in enumerate(src_shard[1:], 1)]
            sent1 = [line.strip().split("\t")[3] for line in src_shard[1:]]
            sent2 = [line.strip().split("\t")[4] for line in src_shard[1:]]
            prelogit1 = [0.0 for _ in src_shard[1:]]
            prelogit2 = [0.0 for _ in src_shard[1:]]
            label = []
            for line in src_shard[1:]:
                token = line.strip().split("\t")[5]
                if token in ["Good", "entailment", "1", 1]:
                    label.append(1)
                else:
                    label.append(0)

            dataset = inputters.Dataset(
                fields,
                readers=([src_reader, src_reader, src_reader,
                          src_reader, src_reader, src_reader]),
                data=([("id", _id), ("sent1", sent1), ("sent2", sent2),
                       ("label", label), ("prelogit1", prelogit1),
                       ("prelogit2", prelogit2)]),
                # data=([("src", src_shard), ("tgt", tgt_shard)]
                #       if tgt_reader else [("src", src_shard)]),
                dirs=([None, None, None, None, None, None]),
                sort_key=inputters.str2sortkey[opt.data_type],
                filter_pred=filter_pred)

            if corpus_type == "train" and existing_fields is None:
                for ex in dataset.examples:
                    for name, field in fields.items():
                        if name in ["label", "id", "prelogit1", "prelogit2"]:
                            continue
                        try:
                            f_iter = iter(field)
                        except TypeError:
                            f_iter = [(name, field)]
                            all_data = [getattr(ex, name, None)]
                        else:
                            all_data = getattr(ex, name)
                        for (sub_n, sub_f), fd in zip(f_iter, all_data):
                            has_vocab = (sub_n == 'src' and src_vocab)
                            if (hasattr(sub_f, 'sequential')
                                    and sub_f.sequential and not has_vocab):
                                val = fd
                                counters[sub_n].update(val)

            if maybe_id:
                shard_base = corpus_type + "_" + maybe_id
            else:
                shard_base = corpus_type
            data_path = "{:s}.{:s}.{:d}.pt".format(opt.save_data, shard_base, i)
            dataset_paths.append(data_path)

            logger.info(" * saving %sth %s data shard to %s."
                        % (i, shard_base, data_path))

            dataset.save(data_path)

            del dataset.examples
            gc.collect()
            del dataset
            gc.collect()

    if corpus_type == "train":
        vocab_path = opt.save_data + '.vocab.pt'
        if existing_fields is None:
            fields = _build_fields_vocab(
                fields, counters, opt.data_type,
                opt.share_vocab, opt.vocab_size_multiple,
                opt.src_vocab_size, opt.src_words_min_frequency)
        else:
            fields = existing_fields

        torch.save(fields, vocab_path)
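# Illustration (not data from the source) of the tab-separated layout the variant
# above expects: six columns per line, a header row skipped via src_shard[1:],
# the example id in column 0, the two sentences in columns 3 and 4, and the label
# in column 5 ("Good", "entailment", or "1" map to 1, anything else to 0).
# The header names and example rows here are placeholders.
sample_tsv = [
    "id\tcol1\tcol2\tsentence1\tsentence2\tlabel",            # header, skipped
    "q1\t-\t-\tHow old is the cat?\tThe cat is two.\tGood",
    "q2\t-\t-\tIs it raining?\tThe sky is clear.\t0",
]

rows = [line for line in sample_tsv if len(line.strip().split("\t")) == 6]
ids = [line.strip().split("\t")[0] for line in rows[1:]]
labels = [1 if line.strip().split("\t")[5] in ["Good", "entailment", "1"] else 0
          for line in rows[1:]]
print(ids, labels)   # ['q1', 'q2'] [1, 0]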