help="vocabulary size")

    # 3. perturbation -> bpe-tokenize
    parser.add_argument("--min_cnt", type=int, default=4)
    parser.add_argument("--word_change_prob", type=float, default=.9)
    parser.add_argument("--type_change_prob", type=float, default=.1)
    parser.add_argument(
        "--n_epochs",
        type=int,
        nargs="+",
        default=[1, 12, 5],
        help="list of n_epochs of gutenberg, tatoeba, and wiki103")

    args = parser.parse_args()

    fp = filepath.FilePath()
    fp.make_dirs()

    logging.info("STEP 0. Download data")
    logging.info("STEP 0-1. Download Gutenberg Text")
    maybe_download(
        fp.gutenberg,
        f"gdown https://drive.google.com/uc?id=0B2Mzhc7popBga2RkcWZNcjlRTGM & "
        f"unzip Gutenberg.zip -d {fp.gutenberg} & "
        f"rm Gutenberg.zip")

    logging.info("STEP 0-2. Download Tatoeba")
    maybe_download(
        fp.tatoeba,
        f"wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 & "
        f"tar -C {fp.tatoeba} -xjf sentences.tar.bz2 &"
Example #2
def __init__(self, track_num):
    self.fp = filepath.FilePath()
    self.TRACK_NUM = track_num
    # Base directory for this track's files under the shared data root.
    self.TRACK_PATH = f"{self.fp.root}/track{track_num}"
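
Both examples rely on a `filepath.FilePath` helper that is not shown here; only `root`, `gutenberg`, `tatoeba`, and `make_dirs()` appear in the snippets. A minimal sketch under the assumption that the class simply centralizes data-directory paths (the `root="data"` default and the directory layout are guesses):

import os

class FilePath:
    # Hypothetical sketch of filepath.FilePath; only the attributes
    # referenced in the examples above are included.
    def __init__(self, root="data"):
        self.root = root
        self.gutenberg = os.path.join(root, "gutenberg")
        self.tatoeba = os.path.join(root, "tatoeba")

    def make_dirs(self):
        # Create every data directory so downloads have a target.
        for path in (self.root, self.gutenberg, self.tatoeba):
            os.makedirs(path, exist_ok=True)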