Example #1
    def pack_batch(self, test_file, batch_size=8):
        ftype = test_file.split(".")[-1]
        if ftype == "json":
            data = JSON.load(test_file)
            # random.shuffle(data)
            entries = []
            for entry in data:
                if len(entries) == batch_size:
                    yield entries
                    entries = []
                entries.append(entry)
            if len(entries) != 0:
                yield entries
        elif ftype == "csv":
            data = CSV.read(test_file)
            entries = []
            for row in data:
                if len(entries) == batch_size:
                    yield entries
                    entries = []
                entry = self.prepare_entry(row[0])
                entry["gold_output"] = CSV.process_target(row[-1])
                entries.append(entry)
            if len(entries) != 0:
                yield entries

        else:
            print("not implemented yet")
            return
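A minimal usage sketch for pack_batch, assuming a class instance (here called model) that exposes both pack_batch and the predict_batch method seen in Example #8; the instance name and file name are illustrative, not part of the original snippet:

# Hypothetical driver: consume the generator batch by batch.
for batch in model.pack_batch("test.json", batch_size=8):
    batch = model.predict_batch(batch)  # predict_batch usage is shown in Example #8
    print("processed", len(batch), "entries")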
Example #2
def merge_files(fileone, filetwo, mergedfile):
    dataone = CSV.read(fileone, firstline=False, slices=None)
    datatwo = CSV.read(filetwo, firstline=False, slices=None)
    data = dataone + datatwo
    # data = list(set(data))
    np.random.shuffle(data)
    CSV.write(data, mergedfile)
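A minimal usage sketch for merge_files; the file paths are placeholders, and numpy (np) plus the CSV helper from mlmodels.utils.csvIO are assumed to be imported:

# Hypothetical call: concatenate two CSV files, shuffle the rows, and write the result.
merge_files("train_part1.csv", "train_part2.csv", "train_merged.csv")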
Example #3
def prepare_iter(filename, firstline=True, task=2):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data_iter = CSV.get_iterator(filename,
                                     firstline=firstline,
                                     task=task)
        num_lines = CSV._len(filename)
    elif filename.split(".")[-1] == "json":
        data_iter = JSON.get_iterator(filename, task=task)
        num_lines = JSON._len(filename)
    else:
        raise Exception("Not implemented yet")
    return data_iter, num_lines
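A minimal usage sketch for prepare_iter, assuming a CSV file with a header row; the file name is a placeholder:

# Hypothetical call: obtain a row iterator plus the number of data lines.
data_iter, num_lines = prepare_iter("train.csv", firstline=True, task=2)
print("total lines:", num_lines)
for row in data_iter:
    pass  # each row comes from CSV.get_iterator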
Example #4
def read_data(filename, firstline=True):
    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        data = CSV.read(filename, firstline=firstline, slices=[0, 1])
    elif filename.split(".")[-1] == "txt":
        data = TXT.read(filename, firstline=firstline)
    elif filename.split(".")[-1] == "json":
        data = JSON.load(filename)
    else:
        raise Exception("Not implemented yet")
    return data
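A minimal usage sketch for read_data; the file name is a placeholder:

# Hypothetical call: load the first two CSV columns into memory.
rows = read_data("train.csv", firstline=True)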
Example #5
def load_file(files, firstline=True, task=2):
    datasets = []
    for fname in files:
        # Read input files
        if fname.split(".")[-1] == "csv":
            datasets.append(
                CSV(fname, limit=-1, firstline=firstline, task=task))
        elif fname.split(".")[-1] == "json":
            datasets.append(JSON(fname, limit=-1, task=task))
        else:
            raise Exception("Not implemented yet")
    return datasets
Example #6
    def build(self, files, limit=-1, firstline=True):
        """
        Read a list of file names, return vocabulary
        :param files: list of file names
        :param limit: read number of lines
        """
        swcnt, swl = Counter(), 0
        twcnt, twl = Counter(), 0
        count = 0

        for fname in files:
            # Read input files
            if fname.split(".")[-1] == "csv":
                raw = CSV(fname, limit=limit, firstline=firstline)
            elif fname.split(".")[-1] == "json":
                raw = JSON(fname, source2idx=None, target2idx=None, limit=-1)
            else:
                raise Exception("Not implemented yet")

            for line in raw:
                count += 1
                (nl, target) = line
                nl = Vocab.process_nl(nl)
                target = Vocab.process_target(target)
                swcnt, swl = Vocab.update_sent(nl, swcnt, swl)
                twcnt, twl = Vocab.update_sent(target, twcnt, twl)

        swvocab = Vocab.update_vocab(swcnt, self.swcutoff, sys_tokens)

        twvocab = Vocab.update_vocab(twcnt, self.twcutoff, sys_tokens)

        self.sw2i = swvocab
        self.i2sw = Vocab.reversed_dict(swvocab)
        self.swl = swl if self.swl < 0 else min(swl, self.swl)

        self.tw2i = twvocab
        self.i2tw = Vocab.reversed_dict(twvocab)
        self.twl = twl if self.twl < 0 else min(twl, self.twl)

        print("\t- Extracting vocabulary: %d total samples" % count)

        print("\t\t- Natural Language Side: ")
        print("\t\t\t- %d total words" % (sum(swcnt.values())))
        print("\t\t\t- %d unique words" % (len(swcnt)))
        print("\t\t\t- %d unique words appearing at least %d times" %
              (len(swvocab) - 4, self.swcutoff))
        print("\t\t- Label Side: ")
        print("\t\t\t- %d total words" % (sum(twcnt.values())))
        print("\t\t\t- %d unique words" % (len(twcnt)))
        print("\t\t\t- %d unique words appearing at least %d times" %
              (len(twvocab) - 4, self.twcutoff))
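A minimal usage sketch for build, assuming a Vocab instance has already been constructed with its cutoff and length attributes set; the constructor arguments are not shown in this snippet, so the call below is illustrative only:

# Hypothetical usage: build source/target vocabularies from a mix of CSV and JSON files.
vocab = Vocab()  # assumed default constructor; the real arguments are not shown here
vocab.build(["train.csv", "extra.json"], limit=-1, firstline=True)
print(len(vocab.sw2i), "source types;", len(vocab.tw2i), "target types")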
Example #7
def prepare_entry(rv, date=None, rvid=None, rating=None):
    entry = dict()
    # start = time.time()
    if date is not None:
        entry["date"] = date
    if rvid is not None:
        entry["review_id"] = rvid
    if rating is not None:
        entry["review_score"] = rating
    entry['mention'] = rv
    question_toks = CSV.process_nl(rv)
    entry['input_tokens'] = question_toks
    # print("- TIMING: %.4f seconds for NL tokenization" % (time.time() - start))
    return entry
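A minimal usage sketch for prepare_entry, shown here as a plain function call; the review text and rating are invented values:

# Hypothetical call: wrap a raw review string into an entry dict with tokenized input.
entry = prepare_entry("great staff and clean rooms", rating=5)
print(entry["input_tokens"])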
Example #8
    def regression_test(self, pfile, test_file, limit=None, batch_size=8):
        data_iter = self.pack_batch(test_file, batch_size=batch_size)
        if not os.path.exists(os.path.dirname(pfile)):
            os.mkdir(os.path.dirname(pfile))
        data = []
        reference = []
        candidate = []
        header = ["review", "gold_output", "pred_output", "matching"]
        data.append(header)
        i = 0
        init = time.time()
        for entries in data_iter:
            entries = self.predict_batch(entries)
            for entry in entries:
                review = entry["input_tokens"]
                gold_output = entry["gold_output"]
                pred_output = " ".join(entry["pred_sequence"])
                # prob_output = str(entry["prob_output"])
                row = [
                    review, gold_output, pred_output,
                    pred_output == gold_output
                ]
                candidate.append(pred_output.split())
                reference.append(gold_output.split())
                data.append(row)
                if i > 0 and i % 2 == 0:
                    now = time.time()
                    print(
                        "Processing %d queries in %.4f seconds; Accumulated inference speed: %.4f (queries/second)"
                        % (i, now - init, i / (now - init)))
                i += 1
                if i == limit:
                    # stop early once the limit is reached (no metrics are written in this case)
                    return
        metrics = self.tagger.class_metrics(reference, candidate)
        data.append(metrics)
        CSV.write(data, pfile)
        return
Example #9
                                sos=False,
                                eos=False,
                                add_special_tokens=False)
        tg2ids = BPE.tokens2ids(tokenizer,
                                sos=False,
                                eos=False,
                                add_special_tokens=False)

        pad_id = tokenizer.token_to_id(BPAD) if tokenizer.token_to_id(
            BPAD) else 0

    collate_fn = BPE.collate_fn(pad_id, True)

    # load datasets to map into indexes
    if filename.split(".")[-1] == "csv":
        train_data = CSV.get_iterator(filename, firstline=True, task=2)
        num_lines = CSV._len(filename)
    elif filename.split(".")[-1] == "json":
        train_data = JSON.get_iterator(filename, task=2)
        num_lines = JSON._len(filename)
    else:
        raise Exception("Not implemented yet")

    train_iterdataset = IterDataset(train_data,
                                    source2idx=nl2ids,
                                    target2idx=lb2ids,
                                    num_lines=num_lines,
                                    bpe=True)
    train_dataloader = DataLoader(train_iterdataset,
                                  pin_memory=True,
                                  batch_size=8,
Example #10
# -*- coding: utf-8 -*-
"""
Created on 2020-07-13
@author duytinvo
"""
from mlmodels.utils.txtIO import TXT
from mlmodels.utils.csvIO import CSV
filename = "/media/data/paraphrase/paralex-evaluation/data/train/paraphrases.txt"

data = TXT.read(filename, False)
newdata = []
for d in data:
    newdata.append(d.split("\t")[:-1])

CSV.write(newdata, "/media/data/paraphrase/paralex.csv")
Example #11
# -*- coding: utf-8 -*-
"""
Created on 25/03/2020
@author duytinvo
"""
from mlmodels.utils.csvIO import CSV
from mlmodels.utils.txtIO import TXT

csvfiles = [
    "/media/data/review_response/Train.csv",
    "/media/data/review_response/Dev.csv"
]
txtfile = "/media/data/review_response/raw_vocab.txt"

data = []
for csvfile in csvfiles:
    rev = CSV.read(csvfile, True, [0])
    data += rev
    res = CSV.read(csvfile, True, [1])
    data += res

TXT.write(data, txtfile)
Example #12
    if args.task == "paraphrase":
        question = "what is the price of GOOG"
        generated_sequences = lm_api.inference(task=args.task,
                                               rv_text=question)
        responses = []
        for generated_sequence in generated_sequences:
            responses += [lm_api.post_process_string(generated_sequence[-1])]
            # print(lm_api.pretty_print(lm_api.post_process_string(generated_sequence[-1]), width=180))
        # print(lm_api.pretty_print(lm_api.post_process_string(generated_sequences[-1]), width=180))
    elif args.task == "sentiment":
        review = "it is so delicious"
        generated_sequences = lm_api.inference(task=args.task,
                                               rv_text=review)
    else:
        data = CSV.read(args.test_file, firstline=True, slices=[0, 1, 2, 3, 4, 5])
        reviews = [d[0] for d in data]

        # rv_hotel = "Woodbridge Ford"
        # rv_name = "guest"
        # rv_rate = ""
        # rv_title = ""
        # rv_text = "Great staff awesome inventory. See Brock Dennison and team Atlas for the best deals and over the top customer service. " \
        #           "Add in the fact that they care and support the community in so many ways. " \
        #           "You definitely need to go see the great team at Woodbridge Ford."
        rouge_score = "0.3500"
        responses = []
        for d in data:
            rv_text = d[0]
            rv_rate = d[1]
            rv_title = d[2]
Example #13
from mlmodels.inference.perplexity_calculator import main
from mlmodels.utils.csvIO import CSV

if __name__ == '__main__':
    file = '../../../media/data/coco/paraphrase-coco.csv'
    data = CSV.read(file)
    flat_list = [item for sublist in data for item in sublist]
    avg_perplexity = main(flat_list)
    print(avg_perplexity, flush=True)
Example #14
    )
    tokenizer = BPE.load(args.vocab_file)
    source2idx = tokens2ids(tokenizer)

    # data = CSV.read(args.train_data_file, firstline=True, slices=[0, 1])
    # train_dataset = MapDataset(data, source2idx=source2idx, target2idx=source2idx)
    #
    # # train_sampler = RandomSampler(train_dataset)
    # train_sampler = SequentialSampler(train_dataset)
    # train_dataloader = DataLoader(train_dataset, sampler=train_sampler, pin_memory=True,
    #                               batch_size=16, collate_fn=collate_fn)
    #
    # for i, batch in enumerate(train_dataloader):
    #     inputs, outputs = batch[0], batch[1]
    #     break

    iterdata = CSV.get_iterator(args.train_data_file, firstline=True)
    num_lines = CSV._len(args.train_data_file, firstline=True)
    train_iterdataset = IterDataset(iterdata,
                                    source2idx=source2idx,
                                    target2idx=source2idx,
                                    num_lines=num_lines)
    train_dataloader = DataLoader(train_iterdataset,
                                  pin_memory=True,
                                  batch_size=16,
                                  collate_fn=collate_fn)

    for i, batch in enumerate(train_dataloader):
        inputs, outputs = batch[0], batch[1]
        break
Example #15
def prepare_nls(nls):
    data = []
    for nl in nls:
        data.append((CSV.process_nl(nl), None))
    return data
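A minimal usage sketch for prepare_nls; the sentences are placeholders:

# Hypothetical call: tokenize a list of natural-language strings into (tokens, None) pairs.
pairs = prepare_nls(["what is the price of GOOG", "book a table for two"])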