# Fragment of the RoBERTa pretraining script. The opening of the script
# (the imports and the first parser.add_argument call) is truncated here,
# as is the RobertaConfig call at the end.
    default=90000)
parser.add_argument("--savesteps", help="saving steps", type=int, default=10000)
parser.add_argument("--weightdecay", help="weight decay", type=float, default=0.1)
parser.add_argument("--scheduler", help="scheduler type", default="linear")
args = parser.parse_args()
print(args.pretrained)

# Pull the corpus, the tokenizer archive, and any previous checkpoint from
# object storage; fall back to local copies if the server is unreachable.
try:
    client = connect_server(args.host, args.accesskey, args.secretkey)
    load_object(client, args.bucket, args.corpusdata)
    load_object(client, args.bucket, args.tokenizer)
    load_object(client, args.bucket, args.pretrained)
except Exception as e:
    print("minio connection fails", e)

try:
    uncompress_object(args.tokenizer, ".")
    uncompress_object(args.pretrained, ".")
except Exception as e:
    print("uncompress fails", e)

tokenizer = RobertaTokenizerFast.from_pretrained("./pretrained", max_len=512)
config = RobertaConfig(vocab_size=args.vocabsize,
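# --- Hypothetical continuation (not from the original script) ---------------
# The RobertaConfig call above is cut off. A minimal sketch of how an MLM
# pretraining setup like this one typically continues with the Hugging Face
# Trainer; the concrete model sizes and the dataset class are assumptions,
# and only the args.* values and `tokenizer` come from the fragment above.
from transformers import (
    RobertaConfig,
    RobertaForMaskedLM,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

config = RobertaConfig(
    vocab_size=args.vocabsize,
    max_position_embeddings=514,  # assumed: RoBERTa default (512 + 2 specials)
    num_attention_heads=12,       # assumed model size
    num_hidden_layers=6,
    type_vocab_size=1,
)
model = RobertaForMaskedLM(config=config)

# Read the downloaded corpus line by line and mask 15% of tokens for MLM.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer, file_path=args.corpusdata, block_size=128
)
collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

training_args = TrainingArguments(
    output_dir="./pretrained",
    save_steps=args.savesteps,
    weight_decay=args.weightdecay,
    lr_scheduler_type=args.scheduler,
)
Trainer(
    model=model,
    args=training_args,
    data_collator=collator,
    train_dataset=dataset,
).train()
# -----------------------------------------------------------------------------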
parser.add_argument("-A", "--accesskey", help="access key") parser.add_argument("-K", "--secretkey", help="secret key") parser.add_argument("--logdir", help="tensorboard logdir", default="./logs") parser.add_argument("--weightdecay", help="weight decay", type=float, default=0.01) parser.add_argument("--scheduler", help="scheduler type", default="linear") args = parser.parse_args() cluster_flag = True try: client = connect_server(args.host, args.accesskey, args.secretkey) load_object(client, args.bucket, args.traindata) load_object(client, args.bucket, args.testdata) load_object(client, args.bucket, args.pretrained) except: print("minio connection fails") cluster_flag = False pass if cluster_flag: uncompress_object(args.pretrained, ".") train_df = pd.read_csv(args.traindata) test_df = pd.read_csv(args.testdata) else: print("local file reading") train_df = pd.read_csv('notebooks/files/unlabel_train1.csv') test_df = pd.read_csv('notebooks/files/unlabel_test1.csv')
# Tokenizer-training script: download the corpus from object storage, then
# train a byte-level BPE tokenizer on it. Requires argparse, os, pathlib.Path,
# and tokenizers.ByteLevelBPETokenizer; the train(...) call is truncated at
# the end of this fragment.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--bucket", help="bucket name", default="petcharts")
    parser.add_argument("-c", "--corpusdata", help="corpus file", default="pet_wiki.txt")
    parser.add_argument(
        "-k", "--tokenizer", help="tokenizer zip file", default="tokenizer.zip"
    )
    parser.add_argument("-v", "--vocabsize", help="vocab size", type=int, default=40000)
    parser.add_argument("-H", "--host", help="object server")
    parser.add_argument("-A", "--accesskey", help="access key")
    parser.add_argument("-K", "--secretkey", help="secret key")
    args = parser.parse_args()

    try:
        client = connect_server(args.host, args.accesskey, args.secretkey)
        load_object(client, args.bucket, args.corpusdata)
    except Exception as e:
        print("error", e)

    os.makedirs("./pretrained", exist_ok=True)
    # Collect every copy of the corpus file found under the working directory.
    paths = [str(x) for x in Path(".").glob("**/{}".format(args.corpusdata))]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=paths,
        vocab_size=args.vocabsize,
        min_frequency=50,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
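# --- Hypothetical helpers (not shown in the original fragments) -------------
# All three scripts call connect_server, load_object, and uncompress_object,
# which are defined elsewhere in the repository. A minimal sketch of plausible
# implementations, assuming the MinIO Python client and zip archives; the real
# helpers may differ.
import zipfile
from minio import Minio

def connect_server(host, access_key, secret_key):
    # secure=False assumed for a plain-HTTP, in-cluster MinIO endpoint.
    return Minio(host, access_key=access_key, secret_key=secret_key, secure=False)

def load_object(client, bucket, name):
    # Download bucket/name into the working directory under the same name.
    client.fget_object(bucket, name, name)

def uncompress_object(archive, dest):
    # Extract a downloaded zip archive (e.g. tokenizer.zip) into dest.
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(dest)
# -----------------------------------------------------------------------------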