def __init__(self,
             index_fp,
             catalog,
             term_ids_map,
             doc_ids_map,
             doc_len_map,
             decompressor=None,
             exclude=None,
             stemmer=None):
    self._index_fp = index_fp
    self._catalog = catalog
    self._term_ids_map = term_ids_map
    self._doc_ids_map = doc_ids_map
    self._doc_len_map = doc_len_map
    self._sum_ttf = sum(self._doc_len_map.values())  # collection-wide total term frequency
    self._tokenizer = tokenizer.build_tokenizer(exclude=exclude,
                                                stemmer=stemmer)
    self._cached_inverted_list = dict()
    self._decoder = indexer.TextProcessor.decoder(decompressor)
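A minimal usage sketch, assuming the surrounding class is named IndexReader and that the maps come from an earlier indexing step (the class name and sample values below are hypothetical; only the parameter list above is from the source):

# Hypothetical usage; IndexReader stands in for the real class name.
catalog, term_ids_map, doc_ids_map = {}, {}, {}
doc_len_map = {0: 42}  # doc id -> total term frequency of that document
with open('index.bin', 'rb') as index_fp:
    reader = IndexReader(index_fp, catalog, term_ids_map,
                         doc_ids_map, doc_len_map)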
Example #2
def main():

    name = sys.argv.pop(0)
    usage = '''{}  -trn FILE [-tst FILE]* [-tok FILE]
       -tok FILE : options for tokenizer
       -trn FILE : train file
       -tst FILE : test file

The script needs pyonmttok installed (pip install pyonmttok)
'''.format(name)

    ftok = None
    ftrn = None
    ftsts = []
    while len(sys.argv):
        tok = sys.argv.pop(0)
        if (tok == "-tok" and len(sys.argv)): ftok = sys.argv.pop(0)
        elif (tok == "-trn" and len(sys.argv)): ftrn = sys.argv.pop(0)
        elif (tok == "-tst" and len(sys.argv)): ftsts.append(sys.argv.pop(0))
        elif (tok == "-h"):
            sys.stderr.write("{}".format(usage))
            sys.exit()
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}".format(usage))
            sys.exit()

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    if ftrn is not None:
        trn = File(ftrn, None, token)
        for ftst in ftsts:
            tst = File(ftst, trn, token)
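The -tok file is a YAML map of pyonmttok options that yaml.load() turns into a plain dict before it reaches build_tokenizer(). A minimal sketch of the equivalent in-code call (the option values are illustrative, not from the source):

# Equivalent of a YAML options file containing:
#   mode: conservative
#   joiner_annotate: true
opts = {'mode': 'conservative', 'joiner_annotate': True}
token = build_tokenizer(opts)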
Example #3
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("--train_dataset",
                        type=str,
                        default="data/corpus.small",
                        help="train dataset")
    parser.add_argument(
        "--test_dataset",
        type=str,
        default="data/corpus.small",
        help="test set for evaluation",
    )
    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument("--output_path",
                        default="output/",
                        type=str,
                        help="save path")
    parser.add_argument("--restore_file",
                        default=None,
                        type=str,
                        help="the path for pretrained model")

    parser.add_argument("--seq_len",
                        type=int,
                        default=128,
                        help="maximum sequence len")

    parser.add_argument("--batch_size",
                        type=int,
                        default=8,
                        help="number of batch_size")
    parser.add_argument("--epochs",
                        type=int,
                        default=5,
                        help="number of epochs")
    parser.add_argument("--num_workers",
                        type=int,
                        default=0,
                        help="dataloader worker size")

    parser.add_argument("--lr",
                        type=float,
                        default=3e-4,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.98,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")
    parser.add_argument("--warmup_steps",
                        type=int,
                        default=1000,
                        help="warmup steps")
    parser.add_argument(
        "--accumulate_gradient_steps",
        type=int,
        default=1,
        help="accumulate gradient steps",
    )

    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)

    print("building test dataset")
    test_dataset = GPTDataset(args.test_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)

    print("building test dataloader")
    test_data_loader = DataLoader(
        test_dataset,
        shuffle=False,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
    )

    print("building model")
    config = GPT2Config()
    model = GPT2LMHeadModel(config)

    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    # tie the LM head to the input token embedding (weight sharing)
    model.lm_head.weight = model.transformer.wte.weight

    trainer = Trainer(
        model,
        train_dataloader=train_data_loader,
        test_dataloader=test_data_loader,
        epoch=args.epochs,
        lr=args.lr,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        warmup_steps=args.warmup_steps,
        accumulate_gradient_steps=args.accumulate_gradient_steps,
        output_path=args.output_path,
    )

    print("begin training")
    trainer.train()
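Assuming the script is saved as train_gpt2.py (a hypothetical name) and the default gpt2-vocab.json / gpt2-merges.txt files are present, a run overriding a few of the defaults above looks like:

    python train_gpt2.py --train_dataset data/corpus.small --batch_size 8 --epochs 5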
Example #4
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--train_dataset",
        required=False,
        type=str,
        default="data/corpus.small",
        help="train dataset",
    )
    parser.add_argument(
        "--test_dataset",
        type=str,
        default="data/corpus.small",
        help="test set for evaluation",
    )
    parser.add_argument("--vocab_file",
                        required=False,
                        default="vocab.json",
                        type=str)
    parser.add_argument("--merges_file",
                        required=False,
                        default="merge.txt",
                        type=str)
    parser.add_argument(
        "--output_path",
        required=False,
        default="output/model",
        type=str,
        help="save path",
    )

    parser.add_argument("--seq_len",
                        type=int,
                        default=128,
                        help="maximum sequence len")

    parser.add_argument("--batch_size",
                        type=int,
                        default=4,
                        help="number of batch_size")
    parser.add_argument("--epochs",
                        type=int,
                        default=50,
                        help="number of epochs")
    parser.add_argument("--num_workers",
                        type=int,
                        default=0,
                        help="dataloader worker size")

    # NB: argparse's type=bool turns any non-empty string into True, so
    # "--with_cuda false" still parses as True; the flag is also unused below.
    parser.add_argument(
        "--with_cuda",
        type=bool,
        default=True,
        help="training with CUDA: true, or false",
    )

    parser.add_argument("--lr",
                        type=float,
                        default=1e-4,
                        help="learning rate of adam")
    parser.add_argument("--adam_weight_decay",
                        type=float,
                        default=0.01,
                        help="weight_decay of adam")
    parser.add_argument("--adam_beta1",
                        type=float,
                        default=0.9,
                        help="adam first beta value")
    parser.add_argument("--adam_beta2",
                        type=float,
                        default=0.999,
                        help="adam first beta value")

    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers)

    # grab the third batch and reuse it as a fixed input for both frameworks
    for i, b in enumerate(train_data_loader):
        if i == 2:
            batch = b
            break

    of_batch = batch.cuda()

    print("building model")
    config = GPT2Config()

    pt_batch = torch.from_numpy(batch.numpy()).long().cuda()

    model = pt_GPT2LMHeadModel(config)

    model.load_state_dict(torch.load("gpt2_model.pt"))
    model.lm_head.weight = model.transformer.wte.weight

    model.cuda()
    model.eval()  # disable dropout so the two framework runs stay comparable

    pt_optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    pt_loss = list()
    loss = None
    print("start pytorch training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(pt_batch, labels=pt_batch)[0]
        for_time += time.time() - s_t

        pt_loss.append(loss.item())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        pt_optimizer.step()
        pt_optimizer.zero_grad()
        update_time += time.time() - s_t

    end_t = time.time()

    print("pytorch traning loop avg time : {}".format(
        (end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    pt_parameters_names = []
    pt_parameters_value = []
    for name, param in model.named_parameters():
        pt_parameters_names.append(name)
        pt_parameters_value.append(param.cpu().detach().numpy())

    model = GPT2LMHeadModel(config)

    model.load_state_dict(flow.load("gpt2_oneflow_model"))
    model.lm_head.weight = model.transformer.wte.weight

    model.cuda()
    model.eval()

    optimizer = flow.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    of_loss = list()

    print("start oneflow training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(of_batch, labels=of_batch)[0]
        for_time += time.time() - s_t

        of_loss.append(loss.numpy())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        optimizer.step()
        optimizer.zero_grad()
        update_time += time.time() - s_t

    end_t = time.time()

    print("oneflow traning loop avg time : {}".format(
        (end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    for i in range(args.epochs):
        print(i, of_loss[i], pt_loss[i])

    import matplotlib.pyplot as plt

    plt.switch_backend("agg")
    epochs = np.arange(1, args.epochs + 1)

    plt.plot(epochs, of_loss, label="oneflow")
    plt.plot(epochs, pt_loss, label="pytorch")
    plt.legend()
    plt.savefig("./1.jpg")
    plt.show()
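The plot gives a visual check; the same lists also support a quick numeric one. A short follow-up sketch (np is the numpy module already used for the epochs axis above):

# Sketch: quantify how far the OneFlow and PyTorch loss curves diverge.
of = np.asarray(of_loss, dtype=np.float64).reshape(-1)
pt = np.asarray(pt_loss, dtype=np.float64)
print("max |oneflow - pytorch| loss gap: {:.6f}".format(np.abs(of - pt).max()))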
Example #5
    def __init__(self, filepath, voc_src, tok_src, voc_tgt, tok_tgt, seq_size,
                 max_sents, do_shuffle):
        if filepath is None:
            return
        self.voc_src = voc_src
        self.voc_tgt = voc_tgt
        self.files = filepath.split(",")
        self.seq_size = seq_size
        self.max_sents = max_sents
        self.do_shuffle = do_shuffle
        self.annotated = False
        self.data = []
        ### length of the data set to be used (not necessarily the whole set)
        self.length = 0

        src_tokenizer = None
        tgt_tokenizer = None
        if tok_src:
            src_tokenizer = build_tokenizer(tok_src)
        if tok_tgt:
            tgt_tokenizer = build_tokenizer(tok_tgt)

        # file handlers
        fhs = []
        for file in self.files:
            if file.endswith('.gz'):
                fhs.append(gzip.open(file, 'rt'))  # text mode so lines are str
            else:
                fhs.append(open(file, 'r'))

        firstline = True
        count_column = None
        idx = 0
        for line in fhs[0]:
            idx += 1
            if len(fhs) > 1:
                # read one aligned line from each of the parallel files
                lsplit = [line.strip()]
                for fh in fhs[1:]:
                    lsplit.append(fh.readline().strip())
            else:
                # or split the columns of a single tab-separated file
                lsplit = line.rstrip('\n').split('\t')
            if firstline:
                assert 2 <= len(lsplit) <= 4, \
                    "invalid column count in {}".format(filepath)
                count_column = len(lsplit)
                if len(lsplit) == 4:
                    self.annotated = True
                firstline = False
            else:
                assert len(lsplit) == count_column, \
                    "invalid column count in {}, line {}".format(filepath, idx)
            if src_tokenizer:
                tokens, _ = src_tokenizer.tokenize(str(lsplit[0]))
                lsplit[0] = " ".join(tokens)
            if tgt_tokenizer:
                tokens, _ = tgt_tokenizer.tokenize(str(lsplit[1]))
                lsplit[1] = " ".join(tokens)
            self.data.append("\t".join(lsplit))
            self.length += 1

        if self.max_sents > 0:
            self.length = min(self.length, self.max_sents)
        sys.stderr.write('({} contains {} examples)\n'.format(
            filepath, len(self.data)))
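The column assertions above define the single-file layout: two to four tab-separated columns per line, where column 1 is the source sentence, column 2 the target, and a fourth column flags the data as annotated. A made-up two-column line (tab shown as <TAB>):

    le chat dort<TAB>the cat sleeps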
Example #6
def main():

    name = sys.argv.pop(0)
    usage = '''{} [-tok FILE] [-data FILE]+ ( -save FILE | -load FILE )
       -tok  FILE : options for tokenizer
       -data FILE : file used to learn/inference
       -save FILE : save tfidf model after building it with data file      (LEARNING)
       -load FILE : load tfidf model and use it for inference on data file (INFERENCE)

The script needs pyonmttok installed (pip install pyonmttok)
'''.format(name)

    ftok = None
    fsave = None
    fload = None
    fdata = []
    while len(sys.argv):
        tok = sys.argv.pop(0)
        if (tok == "-tok" and len(sys.argv)): ftok = sys.argv.pop(0)
        elif (tok == "-save" and len(sys.argv)): fsave = sys.argv.pop(0)
        elif (tok == "-load" and len(sys.argv)): fload = sys.argv.pop(0)
        elif (tok == "-data" and len(sys.argv)): fdata.append(sys.argv.pop(0))
        elif (tok == "-h"):
            sys.stderr.write("{}".format(usage))
            sys.exit()
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}".format(usage))
            sys.exit()

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    ### learning ###
    if fsave is not None and len(fdata):
        sys.stderr.write('Learning mode\n')
        sentIdf = SentIdf()
        for f in fdata:
            sys.stderr.write('\treading {}\n'.format(f))
            sentIdf.add(f, token)
        sentIdf.save(fsave)
        sys.stderr.write('Model saved in {}\n'.format(fsave))

    ### inference ###
    if fload is not None and len(fdata):
        sys.stderr.write('Inference mode. Model in {}\n'.format(fload))
        sentIdf = SentIdf(fload)

        for file in fdata:
            with open(file) as f:
                for line in f:
                    line = line.strip('\n')
                    if token is not None:
                        toks, _ = token.tokenize(str(line))
                    else:
                        toks = line.split(' ')

                    tfidf = sentIdf.tfidf(toks, use_tf=False)
                    sys.stdout.write(" ".join(toks) + '\n')
                    for i in range(len(toks)):
                        sys.stdout.write("{:.8f}\t{}\n".format(
                            tfidf[i], toks[i]))
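Assuming the script is saved as sent_idf.py (a hypothetical name), a learn-then-infer session with the flags above looks like:

    python sent_idf.py -tok tok.yaml -data corpus.txt -save model.idf
    python sent_idf.py -tok tok.yaml -data test.txt -load model.idf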
Example #7
def main():

    name = sys.argv.pop(0)
    usage = '''{} [-tok FILE] -mod FILE ([-trn STRING]+ [-max N] | -tst FILE [-snt])
       -tok   FILE : options for tokenizer
       -mod   FILE : tfidf model file (to create/save)
       -tst   FILE : file used for inference
       -trn STRING : file:tag used for the given domain
       -max      N : max vocabulary size (default 0: use all)
       -snt        : compute tfidf values for each sentence rather the entire tst file

The script needs pyonmttok installed (pip install pyonmttok)
'''.format(name)

    ftok = None
    fmod = None
    vtrn = []
    ftst = None
    max_voc_size = 0
    snt = False  # default: compute tfidf over the whole test file; -snt switches to per-sentence
    while len(sys.argv):
        tok = sys.argv.pop(0)
        if   (tok=="-tok" and len(sys.argv)): ftok = sys.argv.pop(0)
        elif (tok=="-mod" and len(sys.argv)): fmod = sys.argv.pop(0)
        elif (tok=="-trn" and len(sys.argv)): vtrn.append(sys.argv.pop(0))
        elif (tok=="-tst" and len(sys.argv)): ftst = sys.argv.pop(0)
        elif (tok=="-max" and len(sys.argv)): max_voc_size = int(sys.argv.pop(0))
        elif (tok=="-snt"): snt = True
        elif (tok=="-h"):
            sys.stderr.write("{}".format(usage))
            sys.exit()
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}".format(usage))
            sys.exit()

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)


    tfidf = TfIdf()
    #############################
    ### create/read the model ###
    #############################
    if len(vtrn):
        if os.path.exists(fmod):
            sys.stderr.write('error: the path {} already exists\n'.format(fmod))
            sys.exit()
        tfidf.learn(vtrn,fmod,max_voc_size,token)
        with open(fmod, 'wb') as fout:  # pickle requires a binary-mode file
            pickle.dump(tfidf, fout)
        #tfidf.debug()
        sys.stderr.write('Wrote model (V, D) = {}\n'.format(tfidf.TfIdf.shape))
    else:
        with open(fmod, 'rb') as fin:
            tfidf = pickle.load(fin)
        sys.stderr.write('Read model (V, D) = {}\n'.format(tfidf.TfIdf.shape))

    #################
    ### inference ###
    #################
    if ftst is not None:
        tfidf.inference(ftst,snt,token)

    sys.stderr.write('Done\n')
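Assuming the script is saved as tf_idf.py (a hypothetical name), building a model over two file:tag domains and then scoring a test file per sentence looks like:

    python tf_idf.py -tok tok.yaml -mod model.pkl -trn news.txt:news -trn talks.txt:talks -max 50000
    python tf_idf.py -tok tok.yaml -mod model.pkl -tst input.txt -snt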
Example #8
        sys.stderr.write('error: -trn and/or -tst options must be set\n')
        sys.stderr.write("{}".format(usage))
        sys.exit()

    sys.stderr.write('Nbest       : {}\n'.format(Nbest))
    sys.stderr.write('minNgram    : {}\n'.format(minNgram))
    sys.stderr.write('maxNgram    : {}\n'.format(maxNgram))
    sys.stderr.write('testSet     : {}\n'.format(testSet))
    sys.stderr.write('sortByEDist : {}\n'.format(sortByEDist))

    sys.stderr.write('{} Start\n'.format(str_time()))
    token = None
    if ftok is not None:
        with open(ftok) as yamlfile: 
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    sa = None
    if ftrn is not None:
        sa = SuffixArray(ftrn, token)
        if fmod is not None:
            with open(fmod, 'wb') as f: pickle.dump(sa, f)

    if ftst is not None:
        if sa is None and fmod is not None:
            with open(fmod, 'rb') as f:
                sa = pickle.load(f)
            sys.stderr.write('{} Read model from: {}\n'.format(str_time(), fmod))
        # query the test file whether the suffix array was just built or loaded
        sa.queryfile(ftst, token, minNgram, maxNgram, Nbest, sortByEDist, testSet)

    sys.stderr.write('{} End\n'.format(str_time()))
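The option parsing is truncated above; inferring flag names from the variables (ftok, ftrn, ftst, fmod) and a hypothetical script name suffix_array.py, a build-then-query run would look like:

    python suffix_array.py -tok tok.yaml -trn train.txt -mod sa.pkl
    python suffix_array.py -mod sa.pkl -tst test.txt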
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument(
        "--restore_file",
        default="gpt2_oneflow_model",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=1)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")
    args = parser.parse_args()

    args.device = flow.device("cuda" if not args.no_cuda else "cpu")

    set_seed(args)

    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    model.lm_head.weight = model.transformer.wte.weight
    model.to(args.device)
    model.eval()

    if args.length < 0 and config.max_position_embeddings > 0:
        args.length = config.max_position_embeddings
    elif 0 < config.max_position_embeddings < args.length:
        args.length = config.max_position_embeddings  # no generation bigger than model size
    elif args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

    print(args)
    while True:
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        context_tokens = tokenizer.tokenize(raw_text)
        out = sample_sequence(
            model=model,
            context=context_tokens,
            length=args.length,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            device=args.device,
        )
        out = out[0, len(context_tokens):].tolist()
        text = tokenizer.detokenize(out)
        print(text)
        if args.prompt:
            break
    return text
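Because the loop ends with "if args.prompt: break", passing --prompt generates a single completion and exits; without it the script keeps prompting on stdin. Assuming the script is saved as generate.py (a hypothetical name):

    python generate.py --restore_file gpt2_oneflow_model --prompt "OneFlow is"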