help="vocabulary size") # 3. perturbation -> bpe-tokenize parser.add_argument("--min_cnt", type=int, default=4) parser.add_argument("--word_change_prob", type=float, default=.9) parser.add_argument("--type_change_prob", type=float, default=.1) parser.add_argument( "--n_epochs", type=int, nargs="+", default=[1, 12, 5], help="list of n_epochs of gutenberg, tatoeba, and wiki103") args = parser.parse_args() fp = filepath.FilePath() fp.make_dirs() logging.info("STEP 0. Download data") logging.info("STEP 0-1. Download Gutenberg Text") maybe_download( fp.gutenberg, f"gdown https://drive.google.com/uc?id=0B2Mzhc7popBga2RkcWZNcjlRTGM & " f"unzip Gutenberg.zip -d {fp.gutenberg} & " f"rm Gutenberg.zip") logging.info("STEP 0-2. Download Tatoeba") maybe_download( fp.tatoeba, f"wget http://downloads.tatoeba.org/exports/sentences.tar.bz2 & " f"tar -C {fp.tatoeba} -xjf sentences.tar.bz2 &"
def __init__(self, track_num):
    """Bind this instance to one shared-task track.

    Args:
        track_num: track identifier; stored as-is and used to derive the
            track's working directory under the filepath root.
    """
    # Remember which track we operate on before deriving any paths.
    self.TRACK_NUM = track_num
    self.fp = filepath.FilePath()
    self.TRACK_PATH = f"{self.fp.root}/track{track_num}"