def pack_batch(self, test_file, batch_size=8):
    ftype = test_file.split(".")[-1]
    if ftype == "json":
        data = JSON.load(test_file)
        # random.shuffle(data)
        entries = []
        for entry in data:
            if len(entries) == batch_size:
                yield entries
                entries = []
            entries.append(entry)
        if len(entries) != 0:
            yield entries
    elif ftype == "csv":
        data = CSV.read(test_file)
        entries = []
        for row in data:
            if len(entries) == batch_size:
                yield entries
                entries = []
            entry = self.prepare_entry(row[0])
            entry["gold_output"] = CSV.process_target(row[-1])
            entries.append(entry)
        if len(entries) != 0:
            yield entries
    else:
        print("Not implemented yet")
        return
def merge_files(fileone, filetwo, mergedfile):
    dataone = CSV.read(fileone, firstline=False, slices=None)
    datatwo = CSV.read(filetwo, firstline=False, slices=None)
    data = dataone + datatwo
    # data = list(set(data))
    np.random.shuffle(data)
    CSV.write(data, mergedfile)
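# A minimal usage sketch (hypothetical file paths): merge_files simply concatenates
# the rows of two CSV files, shuffles them with numpy, and writes the merged file:
#   merge_files("data/train_part1.csv", "data/train_part2.csv", "data/train_merged.csv")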
def prepare_iter(filename, firstline=True, task=2):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data_iter = CSV.get_iterator(filename, firstline=firstline, task=task)
        num_lines = CSV._len(filename)
    elif filename.split(".")[-1] == "json":
        data_iter = JSON.get_iterator(filename, task=task)
        num_lines = JSON._len(filename)
    else:
        raise Exception("Not implemented yet")
    return data_iter, num_lines
def read_data(filename, firstline=True):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data = CSV.read(filename, firstline=firstline, slices=[0, 1])
    elif filename.split(".")[-1] == "txt":
        data = TXT.read(filename, firstline=firstline)
    elif filename.split(".")[-1] == "json":
        data = JSON.load(filename)
    else:
        raise Exception("Not implemented yet")
    return data
def load_file(files, firstline=True, task=2):
    datasets = []
    for fname in files:
        # Read input files
        if fname.split(".")[-1] == "csv":
            datasets.append(CSV(fname, limit=-1, firstline=firstline, task=task))
        elif fname.split(".")[-1] == "json":
            datasets.append(JSON(fname, limit=-1, task=task))
        else:
            raise Exception("Not implemented yet")
    return datasets
def build(self, files, limit=-1, firstline=True):
    """
    Read a list of file names and build the vocabulary
    :param files: list of file names
    :param limit: read number of lines
    """
    swcnt, swl = Counter(), 0
    twcnt, twl = Counter(), 0
    count = 0
    for fname in files:
        # Read input files
        if fname.split(".")[-1] == "csv":
            raw = CSV(fname, limit=limit, firstline=firstline)
        elif fname.split(".")[-1] == "json":
            raw = JSON(fname, source2idx=None, target2idx=None, limit=-1)
        else:
            raise Exception("Not implemented yet")
        for line in raw:
            count += 1
            (nl, target) = line
            nl = Vocab.process_nl(nl)
            target = Vocab.process_target(target)
            swcnt, swl = Vocab.update_sent(nl, swcnt, swl)
            twcnt, twl = Vocab.update_sent(target, twcnt, twl)

    swvocab = Vocab.update_vocab(swcnt, self.swcutoff, sys_tokens)
    twvocab = Vocab.update_vocab(twcnt, self.twcutoff, sys_tokens)

    self.sw2i = swvocab
    self.i2sw = Vocab.reversed_dict(swvocab)
    self.swl = swl if self.swl < 0 else min(swl, self.swl)

    self.tw2i = twvocab
    self.i2tw = Vocab.reversed_dict(twvocab)
    self.twl = twl if self.twl < 0 else min(twl, self.twl)

    print("\t- Extracting vocabulary: %d total samples" % count)
    print("\t\t- Natural Language Side: ")
    print("\t\t\t- %d total words" % (sum(swcnt.values())))
    print("\t\t\t- %d unique words" % (len(swcnt)))
    print("\t\t\t- %d unique words appearing at least %d times" % (len(swvocab) - 4, self.swcutoff))
    print("\t\t- Label Side: ")
    print("\t\t\t- %d total words" % (sum(twcnt.values())))
    print("\t\t\t- %d unique words" % (len(twcnt)))
    print("\t\t\t- %d unique words appearing at least %d times" % (len(twvocab) - 4, self.twcutoff))
def prepare_entry(rv, date=None, rvid=None, rating=None):
    entry = dict()
    # start = time.time()
    if date is not None:
        entry["date"] = date
    if rvid is not None:
        entry["review_id"] = rvid
    if rating is not None:
        entry["review_score"] = rating
    entry['mention'] = rv
    question_toks = CSV.process_nl(rv)
    entry['input_tokens'] = question_toks
    # print("- TIMING: %.4f seconds for NL tokenization" % (time.time() - start))
    return entry
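# A minimal usage sketch (hypothetical arguments): optional metadata is only added
# to the entry dict when provided, so calling with just a review text is also valid:
#   prepare_entry("it is so delicious", date="2020-07-13", rvid="rv_001", rating=5)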
def regression_test(self, pfile, test_file, limit=None, batch_size=8):
    data_iter = self.pack_batch(test_file, batch_size=batch_size)
    if not os.path.exists(os.path.dirname(pfile)):
        os.mkdir(os.path.dirname(pfile))
    data = []
    reference = []
    candidate = []
    header = ["review", "gold_output", "pred_output", "matching"]
    data.append(header)
    i = 0
    init = time.time()
    for entries in data_iter:
        entries = self.predict_batch(entries)
        for entry in entries:
            review = entry["input_tokens"]
            gold_output = entry["gold_output"]
            pred_output = " ".join(entry["pred_sequence"])
            # prob_output = str(entry["prob_output"])
            row = [review, gold_output, pred_output, pred_output == gold_output]
            candidate.append(pred_output.split())
            reference.append(gold_output.split())
            data.append(row)
            if i > 0 and i % 2 == 0:
                now = time.time()
                print("Processing %d queries in %.4f seconds; "
                      "Accumulated inference speed: %.4f (queries/second)"
                      % (i, now - init, i / (now - init)))
            i += 1
            if i == limit:
                return
    metrics = self.tagger.class_metrics(reference, candidate)
    data.append(metrics)
    CSV.write(data, pfile)
    return
                        sos=False, eos=False, add_special_tokens=False)
tg2ids = BPE.tokens2ids(tokenizer, sos=False, eos=False, add_special_tokens=False)
pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(BPAD) else 0
collate_fn = BPE.collate_fn(pad_id, True)

# load datasets to map into indexes
if filename.split(".")[-1] == "csv":
    train_data = CSV.get_iterator(filename, firstline=True, task=2)
    num_lines = CSV._len(filename)
elif filename.split(".")[-1] == "json":
    train_data = JSON.get_iterator(filename, task=2)
    num_lines = JSON._len(filename)
else:
    raise Exception("Not implemented yet")

train_iterdataset = IterDataset(train_data, source2idx=nl2ids, target2idx=lb2ids,
                                num_lines=num_lines, bpe=True)
train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=8,
# -*- coding: utf-8 -*-
"""
Created on 2020-07-13
@author duytinvo
"""
from mlmodels.utils.txtIO import TXT
from mlmodels.utils.csvIO import CSV

filename = "/media/data/paraphrase/paralex-evaluation/data/train/paraphrases.txt"
data = TXT.read(filename, False)
newdata = []
for d in data:
    newdata.append(d.split("\t")[:-1])
CSV.write(newdata, "/media/data/paraphrase/paralex.csv")
# -*- coding: utf-8 -*-
"""
Created on 25/03/2020
@author duytinvo
"""
from mlmodels.utils.csvIO import CSV
from mlmodels.utils.txtIO import TXT

csvfiles = ["/media/data/review_response/Train.csv",
            "/media/data/review_response/Dev.csv"]
txtfile = "/media/data/review_response/raw_vocab.txt"
data = []
for csvfile in csvfiles:
    rev = CSV.read(csvfile, True, [0])
    data += rev
    res = CSV.read(csvfile, True, [1])
    data += res
TXT.write(data, txtfile)
if args.task == "paraphrase":
    question = "what is the price of GOOG"
    generated_sequences = lm_api.inference(task=args.task, rv_text=question)
    responses = []
    for generated_sequence in generated_sequences:
        responses += [lm_api.post_process_string(generated_sequence[-1])]
        # print(lm_api.pretty_print(lm_api.post_process_string(generated_sequence[-1]), width=180))
    # print(lm_api.pretty_print(lm_api.post_process_string(generated_sequences[-1]), width=180))
elif args.task == "sentiment":
    review = "it is so delicious"
    generated_sequences = lm_api.inference(task=args.task, rv_text=review)
else:
    data = CSV.read(args.test_file, firstline=True, slices=[0, 1, 2, 3, 4, 5])
    reviews = [d[0] for d in data]
    # rv_hotel = "Woodbridge Ford"
    # rv_name = "guest"
    # rv_rate = ""
    # rv_title = ""
    # rv_text = "Great staff awesome inventory. See Brock Dennison and team Atlas for the best deals and over the top customer service. " \
    #           "Add in the fact that they care and support the community in so many ways. " \
    #           "You definitely need to go see the great team at Woodbridge Ford."
    rouge_score = "0.3500"
    responses = []
    for d in data:
        rv_text = d[0]
        rv_rate = d[1]
        rv_title = d[2]
from mlmodels.inference.perplexity_calculator import main
from mlmodels.utils.csvIO import CSV

if __name__ == '__main__':
    file = '../../../media/data/coco/paraphrase-coco.csv'
    data = CSV.read(file)
    flat_list = [item for sublist in data for item in sublist]
    avg_perplexity = main(flat_list)
    print(avg_perplexity, flush=True)
)
tokenizer = BPE.load(args.vocab_file)
source2idx = tokens2ids(tokenizer)

# data = CSV.read(args.train_data_file, firstline=True, slices=[0, 1])
# train_dataset = MapDataset(data, source2idx=source2idx, target2idx=source2idx)
#
# # train_sampler = RandomSampler(train_dataset)
# train_sampler = SequentialSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, pin_memory=True,
#                               batch_size=16, collate_fn=collate_fn)
#
# for i, batch in enumerate(train_dataloader):
#     inputs, outputs = batch[0], batch[1]
#     break

iterdata = CSV.get_iterator(args.train_data_file, firstline=True)
num_lines = CSV._len(args.train_data_file, firstline=True)
train_iterdataset = IterDataset(iterdata, source2idx=source2idx, target2idx=source2idx,
                                num_lines=num_lines)
train_dataloader = DataLoader(train_iterdataset, pin_memory=True, batch_size=16,
                              collate_fn=collate_fn)

for i, batch in enumerate(train_dataloader):
    inputs, outputs = batch[0], batch[1]
    break
def prepare_nls(nls):
    data = []
    for nl in nls:
        data.append((CSV.process_nl(nl), None))
    return data
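# A minimal usage sketch (hypothetical inputs): each NL string is tokenized with
# CSV.process_nl and paired with a None target placeholder for inference.
if __name__ == "__main__":
    samples = prepare_nls(["what is the price of GOOG", "it is so delicious"])
    print(samples)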