Example #1
                        default=90000)
    parser.add_argument("--savesteps",
                        help="saving steps",
                        type=int,
                        default=10000)
    parser.add_argument("--weightdecay",
                        help="weight decay",
                        type=float,
                        default=0.1)
    parser.add_argument("--scheduler", help="scheduler type", default="linear")
    args = parser.parse_args()
    print(args.pretrained)

    # fetch the corpus, tokenizer, and pretrained archives from the object store;
    # ignore failures so the script can still run against local copies
    try:
        client = connect_server(args.host, args.accesskey, args.secretkey)
        load_object(client, args.bucket, args.corpusdata)
        load_object(client, args.bucket, args.tokenizer)
        load_object(client, args.bucket, args.pretrained)
    except Exception:
        pass

    # unpack the downloaded archives; ignore failures if they were never fetched
    try:
        uncompress_object(args.tokenizer, ".")
        uncompress_object(args.pretrained, ".")
    except Exception:
        pass

    tokenizer = RobertaTokenizerFast.from_pretrained("./pretrained",
                                                     max_len=512)

    config = RobertaConfig(vocab_size=args.vocabsize,
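
The helper functions connect_server, load_object, and uncompress_object are not shown in any of these snippets. Below is a minimal sketch of what they might look like, assuming the MinIO Python client and ZIP archives; the bodies are inferred from the call sites above, not taken from the original source:

import zipfile

from minio import Minio


def connect_server(host, access_key, secret_key):
    # assumption: plain-HTTP MinIO endpoint; set secure=True for TLS
    return Minio(host, access_key=access_key, secret_key=secret_key, secure=False)


def load_object(client, bucket, name):
    # download the named object into the working directory under the same name
    client.fget_object(bucket, name, name)


def uncompress_object(archive, dest):
    # extract a downloaded .zip archive (e.g. tokenizer.zip) into dest
    with zipfile.ZipFile(archive) as zf:
        zf.extractall(dest)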
Example #2
    parser.add_argument("-A", "--accesskey", help="access key")
    parser.add_argument("-K", "--secretkey", help="secret key")
    parser.add_argument("--logdir",
                        help="tensorboard logdir",
                        default="./logs")
    parser.add_argument("--weightdecay",
                        help="weight decay",
                        type=float,
                        default=0.01)
    parser.add_argument("--scheduler", help="scheduler type", default="linear")
    args = parser.parse_args()
    cluster_flag = True

    # try the object store first; fall back to local files if it is unreachable
    try:
        client = connect_server(args.host, args.accesskey, args.secretkey)
        load_object(client, args.bucket, args.traindata)
        load_object(client, args.bucket, args.testdata)
        load_object(client, args.bucket, args.pretrained)
    except Exception:
        print("MinIO connection failed")
        cluster_flag = False

    if cluster_flag:
        uncompress_object(args.pretrained, ".")
        train_df = pd.read_csv(args.traindata)
        test_df = pd.read_csv(args.testdata)
    else:
        print("local file reading")
        train_df = pd.read_csv('notebooks/files/unlabel_train1.csv')
        test_df = pd.read_csv('notebooks/files/unlabel_test1.csv')
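
The training step that consumes args.weightdecay and args.scheduler is cut off in this snippet. A hedged sketch of how those flags would typically be wired into Hugging Face TrainingArguments follows; the parameter names come from the transformers API, while output_dir and the batch size are illustrative assumptions:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",            # assumption: not shown in the snippet
    logging_dir=args.logdir,           # --logdir (TensorBoard)
    weight_decay=args.weightdecay,     # --weightdecay
    lr_scheduler_type=args.scheduler,  # --scheduler, e.g. "linear"
    per_device_train_batch_size=16,    # illustrative value
)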
Example #3
# imports required by this snippet (not shown in the original excerpt)
import argparse
import os
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-u", "--bucket", help="bucket name", default="petcharts")
    parser.add_argument("-c", "--corpusdata", help="corpus file", default="pet_wiki.txt")
    parser.add_argument(
        "-k", "--tokenizer", help="tokenizer zip file", default="tokenizer.zip"
    )
    parser.add_argument("-v", "--vocabsize", help="vocabsize", type=int, default=40000)
    parser.add_argument("-H", "--host", help="object server")
    parser.add_argument("-A", "--accesskey", help="access key")
    parser.add_argument("-K", "--secretkey", help="secret key")
    args = parser.parse_args()

    # download the corpus; log the error and continue with local files if the
    # object store is unreachable (restores the handler commented out upstream)
    try:
        client = connect_server(args.host, args.accesskey, args.secretkey)
        load_object(client, args.bucket, args.corpusdata)
    except Exception as e:
        print("error", e)

    os.makedirs("./pretrained", exist_ok=True)

    paths = [str(x) for x in Path(".").glob("**/{}".format(args.corpusdata))]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=paths,
        vocab_size=args.vocabsize,
        min_frequency=50,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    # write vocab.json / merges.txt into the ./pretrained directory created above
    tokenizer.save_model("./pretrained")
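
Once the trained tokenizer files are saved, they can be loaded back exactly as Example #1 does. A short usage sketch (the max_len value is copied from Example #1):

from transformers import RobertaTokenizerFast

# load the freshly trained vocab.json / merges.txt from ./pretrained
tokenizer = RobertaTokenizerFast.from_pretrained("./pretrained", max_len=512)
print(tokenizer.tokenize("a quick sanity-check sentence"))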