def obsproc_run(self):
    '''
    run obsproc.exe
    '''
    obsproc_dir = os.path.join(self.config['filesystem']['wrfda_dir'],
                               'var/obsproc')
    # TODO: check if output file is created and no errors have occurred
    j_id = None
    if len(self.config['options_slurm']['slurm_obsproc.exe']):
        # run using slurm
        if j_id:
            mid = "--dependency=afterok:%d" % j_id
            obsproc_command = ['sbatch', mid,
                               self.config['options_slurm']['slurm_obsproc.exe']]
        else:
            obsproc_command = ['sbatch',
                               self.config['options_slurm']['slurm_obsproc.exe']]
        utils.check_file_exists(obsproc_command[-1])
        try:
            res = subprocess.check_output(obsproc_command, cwd=obsproc_dir,
                                          stderr=utils.devnull())
            j_id = int(res.split()[-1])  # slurm job-id
        except subprocess.CalledProcessError:
            logger.error('Obsproc failed %s:' % obsproc_command)
            raise  # re-raise exception
        return j_id  # return slurm job-id
    else:
        # run locally
        subprocess.check_call(os.path.join(obsproc_dir, 'obsproc.exe'),
                              cwd=obsproc_dir, stdout=utils.devnull(),
                              stderr=utils.devnull())
        return None
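# The slurm branch above repeats a submit-and-parse pattern that the other
# run methods below also use. A minimal sketch of that pattern, assuming
# sbatch's usual "Submitted batch job <id>" output (submit_slurm is a
# hypothetical helper, not part of the original code):
import subprocess

def submit_slurm(script, cwd, after_ok=None):
    '''Submit a slurm batch script and return its job-id.

    If after_ok is given, the new job only starts once that job has
    finished successfully (sbatch --dependency=afterok:<id>).
    '''
    command = ['sbatch']
    if after_ok is not None:
        command.append('--dependency=afterok:%d' % after_ok)
    command.append(script)
    out = subprocess.check_output(command, cwd=cwd)
    # sbatch prints e.g. "Submitted batch job 123456"; the job-id is the
    # last whitespace-separated token
    return int(out.split()[-1])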
def __init__(self):
    self.batch_size = 200
    self.num_classes = 10
    self.epochs = 50
    # check if a saved model exists: if so -> load, else -> train
    (x_train, y_train), (x_test, y_test) = mnist.load_data(
        get_cwd() + "/.keras/datasets/mnist.npz")
    self.x_train = x_train.reshape(x_train.shape[0], 28, 28, 1).astype('float32')
    self.x_test = x_test.reshape(x_test.shape[0], 28, 28, 1).astype('float32')
    self.x_train = self.x_train / 255
    self.x_test = self.x_test / 255
    self.y_train = keras.utils.to_categorical(y_train)
    self.y_test = keras.utils.to_categorical(y_test)
    print('x_train shape:', x_train.shape)
    print(x_train.shape[0], 'train_samples')
    print(x_test.shape[0], 'test_samples')
    self.model = None
    if check_file_exists(get_cwd() + "/.keras/models/model.h5") and \
            check_file_exists(get_cwd() + "/.keras/models/model.yaml"):
        with open(get_cwd() + "/.keras/models/model.yaml", "r") as yaml_file:
            self.model: Sequential = keras.models.model_from_yaml(yaml_file.read())
        self.model.load_weights(get_cwd() + "/.keras/models/model.h5")
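# The constructor's comment promises an "if exists -> load, else -> train"
# branch, but only the load path is shown. A minimal sketch of the missing
# fallback, assuming a small CNN (the architecture and the helper name
# build_and_train are illustrative, not the original author's):
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def build_and_train(x_train, y_train, x_test, y_test,
                    num_classes, batch_size, epochs, model_dir):
    # small CNN for 28x28x1 MNIST input; any compatible model works here
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
              validation_data=(x_test, y_test))
    # persist in the same two-file layout the constructor checks for
    with open(model_dir + "/model.yaml", "w") as yaml_file:
        yaml_file.write(model.to_yaml())
    model.save_weights(model_dir + "/model.h5")
    return model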
def wrfvar_run(self, domain):
    '''
    run da_wrfvar.exe
    '''
    # set domain specific workdir
    wrfda_workdir = os.path.join(self.wrfda_workdir, "d0" + str(domain))
    logfile = os.path.join(wrfda_workdir, 'log.wrfda_d' + str(domain))
    j_id = None
    if len(self.config['options_slurm']['slurm_wrfvar.exe']):
        if j_id:
            mid = "--dependency=afterok:%d" % j_id
            wrfvar_command = ['sbatch', mid,
                              self.config['options_slurm']['slurm_wrfvar.exe']]
        else:
            wrfvar_command = ['sbatch',
                              self.config['options_slurm']['slurm_wrfvar.exe']]
        utils.check_file_exists(wrfvar_command[-1])
        try:
            res = subprocess.check_output(wrfvar_command, cwd=wrfda_workdir,
                                          stderr=utils.devnull())
            j_id = int(res.split()[-1])  # slurm job-id
        except subprocess.CalledProcessError:
            logger.error('Wrfvar failed %s:' % wrfvar_command)
            raise  # re-raise exception
        # wait until the slurm job has left the queue
        while True:
            time.sleep(1)
            if not utils.testjob(j_id):
                break
    else:
        # run locally; write output to the domain logfile (the original
        # passed the csh redirect '>&!' as an argument, which has no
        # effect under subprocess.check_call)
        with open(logfile, 'w') as logf:
            subprocess.check_call(os.path.join(wrfda_workdir, 'da_wrfvar.exe'),
                                  cwd=wrfda_workdir, stdout=logf, stderr=logf)
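# wrfvar_run blocks by polling utils.testjob once per second. The real
# implementation is not shown here; a plausible sketch, assuming squeue is
# available on the submit host:
import subprocess

def testjob(job_id):
    '''Return True while the given slurm job is still queued or running.'''
    try:
        out = subprocess.check_output(
            ['squeue', '-h', '-j', str(job_id), '-o', '%i'],
            stderr=subprocess.DEVNULL)
    except subprocess.CalledProcessError:
        # squeue errors out once the job-id is no longer known
        return False
    return bool(out.strip())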
def _run_metgrid(self, j_id=None):
    '''
    run metgrid.exe (locally or using slurm script defined in config.json)
    '''
    if len(self.config['options_slurm']['slurm_metgrid.exe']):
        if j_id:
            mid = "--dependency=afterok:%d" % j_id
            metgrid_command = ['sbatch', mid,
                               self.config['options_slurm']['slurm_metgrid.exe']]
        else:
            metgrid_command = ['sbatch',
                               self.config['options_slurm']['slurm_metgrid.exe']]
        utils.check_file_exists(metgrid_command[-1])
        utils.silentremove(os.path.join(self.wps_workdir, 'metgrid',
                                        'metgrid.exe'))
        os.symlink(os.path.join(self.config['filesystem']['wps_dir'],
                                'metgrid', 'metgrid.exe'),
                   os.path.join(self.wps_workdir, 'metgrid', 'metgrid.exe'))
        try:
            res = subprocess.check_output(metgrid_command, cwd=self.wps_workdir,
                                          stderr=utils.devnull())
            j_id = int(res.split()[-1])  # slurm job-id
        except subprocess.CalledProcessError:
            logger.error('Metgrid failed %s:' % metgrid_command)
            raise  # re-raise exception
        return j_id  # return slurm job-id
    else:
        metgrid_command = os.path.join(self.config['filesystem']['wps_dir'],
                                       'metgrid', 'metgrid.exe')
        utils.check_file_exists(metgrid_command)
        try:
            subprocess.check_call(metgrid_command, cwd=self.wps_workdir,
                                  stdout=utils.devnull(),
                                  stderr=utils.devnull())
        except subprocess.CalledProcessError:
            logger.error('Metgrid failed %s:' % metgrid_command)
            raise  # re-raise exception
def _check_wrf(self):
    '''
    check wrf options in json config file
    '''
    # verify that the config option is specified by the user
    assert (len(self.config['options_wrf']['namelist.input']) > 0), (
        'No WRF namelist.input specified in config file')
    # check if the specified namelist.input exists and is readable
    utils.check_file_exists(self.config['options_wrf']['namelist.input'])
    # check if namelist.input is in the required format and has all keys needed
    self._check_namelist_wrf()
def _connect_to_database(self):
    """
    check if database exists and try to connect to the database
    """
    utils.check_file_exists(self.database)  # check if database exists
    try:
        logger.debug("Connecting to database: %s" % self.database)
        self.connection = sqlite3.connect(
            self.database,
            detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES
        )
    except sqlite3.Error:
        message = "Database %s exists, but failed to connect" % self.database
        logger.error(message)
        raise
def _check_namelist_wps(self):
    '''
    check if namelist.wps is in the required format and has all keys needed
    '''
    # verify that example namelist.wps exists and is not removed by user
    basepath = utils.get_script_path()
    basepath = '/home/WUR/haren009/wrfpy'  # TODO: fix hardcoded path
    self.example_file = os.path.join(basepath, 'examples', 'namelist.wps')
    utils.check_file_exists(self.example_file)
    # load specified namelist
    self.user_nml = f90nml.read(self.config['options_wps']['namelist.wps'])
    # verify that all keys in self.user_nml are also in example namelist
    self._verify_namelist_wps_keys()
    # validate the key information specified
    self._validate_namelist_wps_keys()
def retrieve_csv(self, camera_url_file, duration, interval, result_path):
    """
    Reads camera urls from csv file and archives the images at the
    requested directory.
    """
    # verify file exists and can be read
    if not check_file_exists(camera_url_file):
        return -1
    with open(camera_url_file, 'r') as camera_file:
        camera_reader = csv.reader(camera_file)
        id = 1
        cams = []
        for camera_url in camera_reader:
            # These cameras do not come from the database and so have no ID.
            # Assign one so they can be placed in a result folder.
            camera_type = camera_url[0].split(".")[-1]
            if camera_type == "m3u8":
                camera = StreamCamera(id, duration, interval, camera_url[0])
            else:
                camera = NonIPCamera(id, duration, interval, camera_url[0])
            id += 1
            cams.append(camera)
    if len(cams):
        self.__archive_cameras(cams, result_path)
def retrieve_db(self, camera_id_file, duration, interval, result_path):
    """
    Reads camera IDs from csv file, retrieves the associated camera
    objects from the database, and archives the images at the
    requested directory.
    """
    if not check_file_exists(camera_id_file):
        return -1
    with open(camera_id_file, 'r') as id_file:
        id_reader = csv.reader(id_file)
        cams = []
        for line in id_reader:
            try:
                id = int(line[0])
            except ValueError:
                raise Exception(
                    "Error: No camera_id exists in line {} of input file \"{}\""
                    .format(line, camera_id_file))
            camera = self.__get_camera_from_db(id, duration, interval)
            if camera is not None:
                cams.append(camera)
    if len(cams):
        self.__archive_cameras(cams, result_path)
    return 0
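# Both retrieval methods expect one camera per CSV row. A hedged usage
# sketch, assuming an Archiver class owning these methods (the class name
# and the file contents are illustrative):
with open("camera_ids.csv", "w") as f:
    f.write("11\n42\n")  # one database camera-id per line

archiver = Archiver()
status = archiver.retrieve_db("camera_ids.csv", duration=60,
                              interval=5, result_path="results/")
if status == -1:
    print("camera_ids.csv not found or unreadable")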
def init(params):
    '''
    Project initialization

    package.yml base conf:
      - name
      - version
      - author
    '''
    target_file = (os.path.join(params.name, package_file)
                   if params.name else package_file)
    if not utils.check_file_exists(target_file):
        package_file_data = {}
        package_file_data['name'] = params.name if params.name else project_name
        package_file_data['author'] = getpass.getuser()
        # version format: [project version].[feature version].[bug version]
        package_file_data['version'] = '1.0.0'
        write_conf(target_file, package_file_data)
        if params.env:
            virtualenv = {
                'cmd': 'virtualenv',
                'args': [os.path.join(os.getcwd(),
                                      os.path.dirname(target_file), 'venv')]
            }
            if params.sys:
                virtualenv['args'].append('--system-site-packages')
            if params.nosys:
                virtualenv['args'].append('--no-site-packages')
            args = virtualenv['args']
            args.insert(0, virtualenv['cmd'])
            cmd_string = ' '.join(args)
            if not utils.cmd_with_check_os_value(cmd_string):
                cmd = 'source {0}'.format(
                    os.path.join(os.path.dirname(target_file),
                                 'venv', 'bin', 'activate'))
                print('Enter command \'{0}\' to start your project.'.format(cmd))
    else:
        print('package.yml already exists')
        exit(0)
def compare_folders(folder1, folder2, output_file):
    """
    Compares the contents of two folders and writes the differences to
    the output file.
    """
    return_md = ""
    for lang in settings.languages:
        expected_files = ""
        generated_output_rst = settings.generated_output_rst.format(language=lang)
        generated_output_csv = settings.generated_output_csv.format(language=lang)
        # check if files exist in both folder1 and folder2
        if not utils.check_file_exists(f"{folder1}/{generated_output_rst}"):
            expected_files += f"- {generated_output_rst} doesn't exist in folder {folder1}\n"
        if not utils.check_file_exists(f"{folder2}/{generated_output_rst}"):
            expected_files += f"- {generated_output_rst} doesn't exist in folder {folder2}\n"
        if not utils.check_file_exists(f"{folder1}/{generated_output_csv}"):
            expected_files += f"- {generated_output_csv} doesn't exist in folder {folder1}\n"
        if not utils.check_file_exists(f"{folder2}/{generated_output_csv}"):
            expected_files += f"- {generated_output_csv} doesn't exist in folder {folder2}\n"
        if expected_files != "":
            print("Expected files are missing", file=sys.stderr)
            return_md += (f"\n### {lang}\n\n#### Expected files are missing "
                          f"for {lang}\n{expected_files}\n")
            continue
        # compare contents of files
        cmp1 = compare_files(f"{folder1}/{generated_output_rst}",
                             f"{folder2}/{generated_output_rst}")
        cmp2 = compare_files(f"{folder1}/{generated_output_csv}",
                             f"{folder2}/{generated_output_csv}")
        if cmp1 != "" or cmp2 != "":
            print("Generated file contents are not matching", file=sys.stderr)
            return_md += f"\n### {lang}\n\n#### Generated file changes for {lang}\n\n"
            if cmp1 != "":
                return_md += (f"- Changes to {generated_output_rst}:\n"
                              f"```diff\n{cmp1}```\n\n")
            if cmp2 != "":
                return_md += (f"- Changes to {generated_output_csv}:\n"
                              f"```diff\n{cmp2}```\n\n")
    with open(output_file, 'w', newline='') as out:
        out.write(return_md)
def ask_input_image_uri(message):
    # keep prompting until the user supplies a URI that actually exists
    # (the original returned False on the first invalid input, so the
    # while loop never looped)
    while True:
        uri = input(message)
        if not utils.check_file_exists(uri):
            print('URI not valid')
        else:
            return uri
def _check_upp_dir(self):
    assert os.path.isdir(self.config['filesystem']['upp_dir']), (
        'upp directory %s not found' % self.config['filesystem']['upp_dir'])
    # create list of files to check
    files_to_check = [
        os.path.join(self.config['filesystem']['upp_dir'], filename)
        for filename in ['bin/unipost.exe', 'parm/wrf_cntrl.parm']]
    # check if all files in the list exist and are readable
    for filename in files_to_check:
        utils.check_file_exists(filename)
def _archive_output(self, current_time, thours, domain):
    '''
    rename unipost.exe output to wrfpost_d0${domain}_time.grb and archive
    '''
    import shutil
    # verify that domain is an int
    if not isinstance(domain, int):
        message = 'domain id should be an integer'
        logger.error(message)
        raise IOError(message)
    # define original and destination filename
    origname = 'WRFPRS%02d.tm00' % thours
    outname = 'wrfpost_d%02d_%s.grb' % (domain, current_time)
    # rename file and move to archive dir
    shutil.move(os.path.join(config['post_dir'], origname),
                os.path.join(config['upp_archive_dir'], outname))
    # check if file is indeed archived
    utils.check_file_exists(os.path.join(config['upp_archive_dir'], outname))
def check_cv5(self):
    '''
    return True if be.dat_d0{domain} is defined for each domain in
    config.json and all files exist, else return False
    '''
    return all([utils.check_file_exists(
        self.config['options_wrfda']['be.dat_d0' + str(domain)],
        boolean=True) for domain in range(1, self.max_dom + 1)])
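# check_cv5 expects one background-error file per domain under
# options_wrfda. An illustrative config fragment (shown as the parsed
# Python dict; the paths are made up) that would make it return True
# for max_dom == 2:
config = {
    'options_wrfda': {
        'be.dat_d01': '/home/user/wrfda/be.dat_d01',
        'be.dat_d02': '/home/user/wrfda/be.dat_d02',
    },
}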
def featurize_images_augmented(input_dir, model_path, output_dir, batch_size,
                               downsample_encoder=True):
    """
    Compresses a set of augmented whole-slide images using a trained
    encoder network.

    :param input_dir: directory containing the vectorized images.
    :param model_path: path to trained encoder network.
    :param output_dir: destination folder to store the compressed images.
    :param batch_size: number of images to process on the GPU in one go.
    :param downsample_encoder: if true, downsample images from 128 to 64.
    :return: nothing
    """
    # Output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Load encoder model
    encoder = keras.models.load_model(model_path, compile=False)
    # Downsample images to fit the encoder (needed for the bigan encoder)
    if downsample_encoder:
        encoder = downsample_encoder_128_to_64(encoder)
    image_list = get_file_list(input_dir, ext='_{item}.png')
    total_images = len(image_list)
    for index in range(total_images):
        filename = os.path.splitext(os.path.basename(image_list[index]))[0]
        # by convention on NIC it has to be an .npy
        filename_npy = input_dir + '/' + filename + '.npy'
        wsi_pattern = input_dir + '/' + filename.split('_')[0] + '_{item}.npy'
        if check_file_exists(wsi_pattern.format(item='im_shape')):
            print(f'Processing image {filename}')
            encode_augment_wsi(wsi_pattern=filename_npy, encoder=encoder,
                               output_dir=output_dir, batch_size=batch_size,
                               aug_modes=[('none', 0), ('none', 90),
                                          ('none', 180), ('none', 270),
                                          ('horizontal', 0), ('vertical', 0),
                                          ('vertical', 90), ('vertical', 270)],
                               overwrite=False)
            print(f'Successfully featurized {filename}: '
                  f'{total_images - index - 1} images left')
        else:
            print('Vectorized file not found: {f}'.format(
                f=wsi_pattern.format(item='im_shape')), flush=True)
    print('Finished processing all images!')
def vectorize_images(input_dir, mask_dir, output_dir, cache_dir, image_level,
                     patch_size):
    """
    Converts a set of whole-slide images into numpy arrays with valid
    tissue patches for fast processing.

    :param input_dir: folder containing the whole-slide images.
    :param mask_dir: folder containing the whole-slide masks.
    :param output_dir: destination folder to store the vectorized images.
    :param cache_dir: folder to store whole-slide images temporarily for
        fast access.
    :param image_level: image resolution to read the patches.
    :param patch_size: size of the read patches.
    :return: nothing
    """
    # Output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Read image file names (all the wsi .tif files)
    image_paths = get_file_list(input_dir, ext='tif')
    # Read mask file names
    mask_paths = get_file_list(mask_dir)
    total_images = len(image_paths)
    for index in range(total_images):
        image_id = (os.path.basename(image_paths[index])).split('.')[0]
        # by convention on NIC it has to be an .npy
        output_pattern = output_dir + '/' + image_id + '_{item}.npy'
        vectorized_png = output_dir + '/' + image_id + '_{item}.png'
        if not check_file_exists(vectorized_png):
            print(f'Processing image {image_id}')
            vectorize_wsi(image_path=cache_file(image_paths[index], cache_dir,
                                                overwrite=False),
                          mask_path=mask_paths[index],
                          output_pattern=output_pattern,
                          image_level=image_level,
                          mask_level=image_level,
                          patch_size=patch_size,
                          stride=patch_size,
                          downsample=1,
                          select_bounding_box=False)
            print(f'Successfully vectorized {image_id}: '
                  f'{total_images - index - 1} images left')
        else:
            print(f'Already existing file {image_id} - '
                  f'{total_images - index - 1} images left')
    print('Finished processing all images!')
def featurize_images(input_dir, model_path, output_dir, batch_size,
                     downsample_encoder=True):
    """
    Featurizes vectorized whole-slide images using a trained encoder network.

    :param input_dir: directory containing the vectorized images.
    :param model_path: path to trained encoder network.
    :param output_dir: destination folder to store the compressed images.
    :param batch_size: number of images to process on the GPU in one go.
    :param downsample_encoder: if true, downsample images from 128 to 64.
    :return: nothing
    """
    # Output dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Load encoder model
    encoder = keras.models.load_model(model_path, compile=False)
    # Downsample images to fit the encoder (needed for the bigan encoder)
    if downsample_encoder:
        encoder = downsample_encoder_128_to_64(encoder)
    image_list = get_file_list(input_dir, ext='_{item}.png')
    total_images = len(image_list)
    for index in range(total_images):
        filename = os.path.splitext(os.path.basename(image_list[index]))[0]
        # by convention on NIC it has to be an .npy
        filename_npy = input_dir + '/' + filename + '.npy'
        featurized_npy = output_dir + '/' + filename.split('_')[0] + '.npy'
        featurized_png = output_dir + '/' + filename.split('_')[0] + '.png'
        if not check_file_exists(featurized_npy):
            print(f'Processing image {filename}')
            encode_wsi_npy_simple(encoder, filename_npy, batch_size,
                                  featurized_npy, featurized_png,
                                  output_distance_map=True)
            print(f'Successfully featurized {filename}: '
                  f'{total_images - index - 1} images left')
        else:
            print(f'Already existing file {filename} - '
                  f'{total_images - index - 1} images left')
    print('Finished processing all images!')
def __init__(self, img_dir, xray_csv, bbox_csv, transform=None, masks=False):
    self.transform = transform
    self.path_to_images = img_dir
    self.df = pd.read_csv(xray_csv)
    self.masks = pd.read_csv(
        bbox_csv,
        names=["Image Index", "Finding Label", "x", "y", "w", "h",
               "_1", "_2", "_3"],
        skiprows=1)
    check_path_exists(self.path_to_images)
    check_file_exists(xray_csv)
    if masks:
        # verify the bbox csv path (the original passed the already-loaded
        # dataframe instead of the file path)
        check_file_exists(bbox_csv)
    self.df = self.df.set_index("Image Index")
    self.diseases = [
        'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
        'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
        'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia',
        'Enlarged_Cardiomediastinum', 'Lung_Lesion', 'Fracture',
        'Lung_Opacity']
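# Assuming this constructor belongs to a PyTorch-style Dataset, a usage
# sketch (the class name ChestXrayDataset is an assumption; the csv file
# names follow the NIH ChestX-ray14 release):
from torch.utils.data import DataLoader
from torchvision import transforms

dataset = ChestXrayDataset(img_dir="data/images",
                           xray_csv="data/Data_Entry_2017.csv",
                           bbox_csv="data/BBox_List_2017.csv",
                           transform=transforms.ToTensor(),
                           masks=True)
loader = DataLoader(dataset, batch_size=16, shuffle=True)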
def __init__(self, wrfpy_config=False):
    global logger
    wrfpy_dir = os.environ['HOME']
    logger = utils.start_logging(os.path.join(wrfpy_dir, 'wrfpy.log'))
    if not wrfpy_config:
        try:
            # get CYLC_SUITE_DEF_PATH environment variable
            wrfpy_dir = os.environ['CYLC_SUITE_DEF_PATH']
        except KeyError:
            # default back to user home dir in case CYLC is not used
            wrfpy_dir = os.environ['HOME']
        # config.json needs to be in base of wrfpy_dir
        self.configfile = os.path.join(wrfpy_dir, 'config.json')
    else:
        self.configfile = wrfpy_config
    try:
        logger.debug('Checking if configuration file exists: %s'
                     % self.configfile)
        utils.check_file_exists(self.configfile)
    except IOError:
        # create config file
        self._create_empty_config()
        # TODO: exit and notify user to manually edit config file
    # read json config file
    self._read_json()
def _check_wrda_dir(self):
    '''
    check if the wrfda directory exists;
    check if the obsproc.exe and da_wrfvar.exe executables exist in the
    wrfda directory
    '''
    # TODO: find out if we can verify that WRFDA dir is 3dvar or 4dvar compiled
    assert os.path.isdir(self.config['filesystem']['wrfda_dir']), (
        'wrfda directory %s not found' % self.config['filesystem']['wrfda_dir'])
    # create list of files to check
    files_to_check = [
        os.path.join(self.config['filesystem']['wrfda_dir'], filename)
        for filename in ['var/obsproc/obsproc.exe', 'var/da/da_wrfvar.exe']]
    # check if all files in the list exist and are readable
    for filename in files_to_check:
        utils.check_file_exists(filename)
def overwrite_files():
    for lang in settings.languages:
        repo_output_rst = settings.repo_output_rst.format(language=lang)
        repo_output_csv = settings.repo_output_csv.format(language=lang)
        generated_output_rst = settings.generated_output_rst.format(language=lang)
        generated_output_csv = settings.generated_output_csv.format(language=lang)
        exists = utils.check_file_exists(generated_output_rst)
        if not exists:
            print(f"Generated RST file {generated_output_rst} is missing",
                  file=sys.stderr)
            sys.exit(1)
        exists = utils.check_file_exists(generated_output_csv)
        if not exists:
            print(f"Generated CSV file {generated_output_csv} is missing",
                  file=sys.stderr)
            sys.exit(1)
        shutil.move(generated_output_rst, repo_output_rst)
        shutil.move(generated_output_csv, repo_output_csv)
def release_fastq(self, fq_type):
    print('> release {} ...'.format(fq_type))
    if fq_type == 'raw':
        data_dir = 'RawData'
    elif fq_type == 'clean':
        data_dir = 'CleanData'
    else:
        exit('error: unknown fq_type {}'.format(fq_type))
    for sample in self.qc_lists:
        md5_list = []
        dest = '{Data}/{data_dir}/{sample}/'.format(
            **dict(self.__dict__, **locals()))
        dest_md5 = '{Data}/{data_dir}/{sample}/MD5.txt'.format(
            **dict(self.__dict__, **locals()))
        for lane in self.qc_lists[sample]['lanes']:
            for read in (1, 2):
                fastq = ('{analydir}/QC/{sample}/{sample}_{novoid}_'
                         '{flowcell_lane}_{read}.clean.fq.gz').format(
                             sample=sample, read=read,
                             analydir=self.analydir, **lane)
                if fq_type == 'raw':
                    fastq = fastq.replace('clean.fq.gz', 'fq.gz').replace(
                        'QC', 'RawData')
                self.link_data(fastq, dest)
                fastq_md5 = fastq + '.MD5.txt'
                if utils.check_file_exists(fastq_md5):
                    md5_list.append(fastq_md5)
        if md5_list:
            self.cat_md5(md5_list, dest_md5)
def do_restore(color, progress, date, key, bucket, jobname, target):
    # Colors
    yellow = color_macro(color, colored.yellow)
    cyan = color_macro(color, colored.cyan)
    red = color_macro(color, colored.red)
    green = color_macro(color, colored.green)

    # First check if the given backup exists.
    # If no date is specified, use the most recent backup.
    puts(f"Trying to restore {cyan(jobname)} from AWS S3 bucket "
         f"{yellow(bucket)} to {yellow(target)}")
    if not check_folder_exists(target):
        raise RuntimeError(red(f"Folder {target} does not exist"))
    if not date:
        puts("No date supplied, trying to restore most recent backup")
        try:
            out = subprocess.check_output(
                ["aws", "s3", "ls",
                 s3_url(bucket, jobname + "/")]).decode("utf-8")
        except subprocess.CalledProcessError:
            raise RuntimeError(
                f"Could not list bucket {bucket}/{jobname}, "
                f"please double check the name and jobname")
        dates = [x.rsplit(" ", 1)[1].strip("/") for x in out.splitlines()]
        date = sorted(dates)[-1]
        puts(f"Most recent backup: {yellow(date)}")
    else:
        try:
            datetime.strptime(date, "%Y-%m-%d_%H-%M-%S")
        except ValueError:
            raise RuntimeError(f"date ({date}) has invalid date format, "
                               f"expected %Y-%m-%d_%H-%M-%S")
    try:
        puts(f"Checking if backup for {yellow(date)} exists...", newline=False)
        # Check if a backup with that date actually exists
        subprocess.check_call(
            ["aws", "s3", "ls", s3_url(bucket, os.path.join(jobname, date))],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        puts(green("OK"))
    except subprocess.CalledProcessError:
        print()
        raise click.BadOptionUsage("date",
                                   red(f"No backup found for date {date}"))

    # Next check the files and determine whether the backup is encrypted,
    # compressed, or both
    print(f"Checking files in {bucket}/{jobname}/{date}...", end="")
    try:
        backup_content_str = subprocess.check_output(
            ["aws", "s3", "ls",
             s3_url(bucket, os.path.join(jobname, date) + "/")]).decode("utf-8")
        backup_content = [x.rsplit(" ", 1)[1].strip("/")
                          for x in backup_content_str.splitlines()]
        puts(green("DONE"))
    except subprocess.CalledProcessError:
        raise RuntimeError(f"Could not list files in {bucket}/{jobname}/{date}")
    # a name containing ".meta.enc" means the backup is encrypted; the
    # original omitted the "in s" membership test, so any() was always True
    encrypted = any(".meta.enc" in s for s in backup_content)
    compressed = any(".tar.zstd" in s for s in backup_content)
    print(f"Backup is{' not' if not encrypted else ''} encrypted and"
          f"{' not' if not compressed else ''} compressed")

    if encrypted:
        if not key or not check_file_exists(key):
            raise click.BadOptionUsage("key",
                                       "Key is missing, backup is encrypted")
        print("Downloading metafile...", end="", flush=True)
        try:
            metafile_url = s3_url(
                bucket, os.path.join(jobname, date, f"{jobname}.meta.enc"))
            # decrypt the metafile on the fly:
            # aws s3 cp <url> - | openssl rsautl -decrypt -inkey <key>
            openssl = subprocess.Popen(
                ["openssl", "rsautl", "-decrypt", "-inkey", key],
                stdin=subprocess.PIPE, stdout=subprocess.PIPE)
            aws = subprocess.Popen(["aws", "s3", "cp", metafile_url, "-"],
                                   stdout=openssl.stdin)
            aws.wait()
            print(openssl.stdout.readline())
        except Exception:
            raise RuntimeError(f"Could not download/decrypt metafile")
def runtest():
    """check command line interface"""

    # setup
    shutil.copyfile(Disks.recsgen, Disks.work)

    # disk image operations
    with open(Files.output, "w") as f1, open(Files.reference, "w") as f2:
        xdm(Disks.work, "-i", stdout=f2)
        xdm(Disks.work, "-q", stdout=f1)
    check_files_eq("CLI", Files.output, Files.reference, "DIS/VAR255")
    xdm(Disks.work, "-e", "PROG00255", "DV064X010", "DF002X001")
    xdm(Disks.work, "-e", "PROG00255", "-o", Files.output)
    check_files_eq("CLI", Files.output, "prog00255", "PROGRAM")
    xdm(Disks.work, "-e", "DV064X010", "-o", Files.output)
    check_files_eq("CLI", Files.output, "dv064x010", "DIS/VAR64")
    xdm(Disks.work, "-e", "DF002X001", "-o", Files.output)
    check_files_eq("CLI", Files.output, "df002x001", "DIS/FIX 2")
    with open(Files.output, "w") as f1:
        xdm(Disks.work, "-p", "DV064X010", stdout=f1)
    check_files_eq("CLI", Files.output, "dv064x010", "DIS/VAR 64")
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "-e", "INVALID", stderr=ferr, rc=1)
    xdm(Disks.work, "-S", "0x01", "-o", Files.output)
    check_files_eq("CLI", Files.output, os.path.join(Dirs.refs, "sector1"),
                   "DIS/VAR255")

    # add, rename, remove files
    shutil.copyfile(Disks.blank, Disks.work)
    xdm(Disks.work, "-a", "prog00255", "dv064x010", "df002x001")
    xdm(Disks.work, "-e", "PROG00255", "-o", Files.output)
    check_files_eq("CLI", Files.output, "prog00255", "PROGRAM")
    xdm(Disks.work, "-e", "DV064X010", "-o", Files.output)
    check_files_eq("CLI", Files.output, "dv064x010", "PROGRAM")  #!
    shutil.copyfile(Disks.work, Disks.tifiles)
    xdm(Disks.work, "-e", "PROG00255", "-o", Files.reference)
    xdm(Disks.work, "-r", "PROG00255:OTHERNAME")
    xdm(Disks.work, "-e", "OTHERNAME", "-o", Files.output)
    check_files_eq("CLI", Files.output, Files.reference, "P")
    xdm(Disks.work, "-r", "OTHERNAME:PROG00255")
    check_files_eq("CLI", Disks.work, Disks.tifiles, "P")
    xdm(Disks.work, "-d", "PROG00255", "DV064X010", "DF002X001")
    with open(Files.output, "w") as f1, open(Files.reference, "w") as f2:
        xdm(Disks.work, "-i", stdout=f1)
        xdm(Disks.blank, "-i", stdout=f2)
    check_files_eq("CLI", Files.output, Files.reference, "DIS/VAR255")
    shutil.copyfile(Disks.recsgen, Disks.work)
    xdm(Disks.work, "-e", "DF127*", "PROG00001", "PROG00002")
    if (not os.path.isfile("df127x001") or not os.path.isfile("df127x010")
            or not os.path.isfile("df127x020p")):
        error("CLI", "DF127*: Missing files")
    xdm(Disks.work, "-d", "PROG*", "D?010X060")
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "-e", "PROG00255", stderr=ferr, rc=1)
        xdm(Disks.work, "-e", "DV010X060", stderr=ferr, rc=1)
        xdm(Disks.work, "-e", "DF010X060", stderr=ferr, rc=1)

    # multi-file naming
    xdm(Disks.work, "-n", "MULTI", "-a", "prog00001", "prog00255", "prog00002")
    xdm(Disks.work, "-e", "MULTI", "-o", Files.output)
    check_files_eq("CLI", "prog00001", Files.output, "P")
    xdm(Disks.work, "-e", "MULTJ", "-o", Files.output)
    check_files_eq("CLI", "prog00255", Files.output, "P")
    xdm(Disks.work, "-e", "MULTK", "-o", Files.output)
    check_files_eq("CLI", "prog00002", Files.output, "P")
    xdm("-T", "prog00001", "prog00255", "prog00002",
        "-n", "MULTFI")  # -n applies to internal names!
    xdm(Disks.work, "-t", "-a", "prog00001.tfi", "prog00255.tfi",
        "prog00002.tfi")
    xdm(Disks.work, "-e", "MULTFI", "-o", Files.output)
    check_files_eq("CLI", "prog00001", Files.output, "P")
    xdm(Disks.work, "-e", "MULTFJ", "-o", Files.output)
    check_files_eq("CLI", "prog00255", Files.output, "P")
    xdm(Disks.work, "-e", "MULTFK", "-o", Files.output)
    check_files_eq("CLI", "prog00002", Files.output, "P")
    xdm("-T", "prog00255", "prog00002", "-9", "-n", "MULV9T")
    xdm(Disks.work, "-9", "-a", "prog00255.v9t9", "prog00002.v9t9")
    xdm(Disks.work, "-e", "MULV9T", "-o", Files.output)
    check_files_eq("CLI", "prog00255", Files.output, "P")
    xdm(Disks.work, "-e", "MULV9U", "-o", Files.output)
    check_files_eq("CLI", "prog00002", Files.output, "P")
    ref = os.path.join(Dirs.refs, "glob")
    xdm(Disks.work, "-a", ref + "?", "-n", "GLOBA1", shell=True)
    xdm(Disks.work, "-e", "GLOBA1", "-o", Files.output)
    xdm(Disks.work, "-e", "GLOBA2", "-o", Files.output)
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "-e", "GLOBA3", "-o", Files.output, stderr=ferr, rc=1)
    xdm(Disks.work, "-d", "GLOB*", "-o", Files.output)
    xdm(Disks.work, "-a", ref + "*", "-n", "GLOBB1", shell=True)
    xdm(Disks.work, "-e", "GLOBB1", "-o", Files.output)
    xdm(Disks.work, "-e", "GLOBB2", "-o", Files.output)
    xdm(Disks.work, "-e", "GLOBB3", "-o", Files.output)

    # initialize disk
    xdm(Disks.work, "--initialize", "360", "-n", "SSSD")
    check_file_size(Disks.work, 360 * 256)
    check_files_eq("CLI", Disks.work, Disks.blank, "P")
    os.remove(Disks.work)
    xdm(Disks.work, "--initialize", "SSSD", "-n", "SSSD")
    check_file_size(Disks.work, 360 * 256)
    check_files_eq("CLI", Disks.work, Disks.blank, "P")
    xdm(Disks.work, "--initialize", "800", "-n", "INIT")
    with open(Files.output, "w") as f:
        xdm(Disks.work, "-i", stdout=f)
    check_file_matches(Files.output, [(0, r"\s2\s+used\s+798\s+free\s")])
    os.remove(Disks.work)
    xdm(Disks.work, "--initialize", "CF", "-n", "INIT")
    with open(Files.output, "w") as f:
        xdm(Disks.work, "-i", stdout=f)
    check_file_matches(Files.output, [(0, r"\s2\s+used\s+1598\s+free\s")])
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "--initialize", "1", stderr=ferr, rc=1)
        xdm(Disks.work, "--initialize", "1601", stderr=ferr, rc=1)
        xdm(Disks.work, "--initialize", "FOO", stderr=ferr, rc=1)
    f = os.path.join(Dirs.refs, "vardis")
    for n in ["AA", "BB"]:
        xdm(Disks.work, "--initialize", "SSSD", "-a", f, "-n", n)
        with open(Files.output, "w") as fout:
            xdm(Disks.work, "-i", stdout=fout)
        check_file_matches(Files.output, [(0, n + r"\s+"), (2, n + r"\s+")])

    # set geometry
    xdm(Disks.work, "--initialize", "1600", "-n", "GEO")
    for g, p in [("1S1D", r"1S/1D\s+40T"), ("99T8D7S", r"7S/8D\s+99T"),
                 ("22TDD", r"7S/2D\s+22T"), ("DSSD", r"2S/1D\s+22T"),
                 ("1T", r"2S/1D\s+1T"), ("3D10T9S", r"9S/3D\s+10T"),
                 ("SDDS", r"2S/1D\s+10T"), ("SS", r"1S/1D\s+10T")]:
        xdm(Disks.work, "--set-geometry", g)
        with open(Files.output, "w") as fout:
            xdm(Disks.work, "-i", "-q", stdout=fout)
        check_file_matches(Files.output, [(0, p)])

    # resize disk
    shutil.copyfile(Disks.recsgen, Disks.work)
    for s in ["800", "248", "1600"]:
        xdm(Disks.work, "-Z", s, "-q")
        for f in ["PROG02560", "DF129X010", "DV127X010", "DV255X015P"]:
            xdm(Disks.work, "-e", f, "-q", "-o", Files.output)
            xdm(Disks.recsgen, "-e", f, "-o", Files.reference)
            check_files_eq("CLI", Files.output, Files.reference, "PROGRAM")
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "-Z", "240", stderr=ferr, rc=1)
        xdm(Disks.work, "-Z", "1608", stderr=ferr, rc=1)

    # new geometry handling (v1.5.3)
    for c, g, p in [
            ("--initialize", "SSSD", r"358 free\s+90 KB\s+1S/1D\s+40T"),
            ("--resize", "DS1D", r"718 free\s+180 KB\s+2S/1D\s+40T"),
            ("--set-geometry", "80T",
             r"718 free\s+180 KB\s+2S/1D\s+80T"),  # geom mismatch
            ("--initialize", "408", r"406 free\s+102 KB\s+2S/1D\s+40T"),
            ("--resize", "DSSD80T", r"1438 free\s+360 KB\s+2S/1D\s+80T"),
            ("--resize", "2DSS", r"718 free\s+180 KB\s+1S/2D\s+40T"),
            ("-Z", "208", r"206 free\s+52 KB\s+1S/2D\s+40T"),
            ("--set-geometry", "SD80T", r"206 free\s+52 KB\s+1S/1D\s+80T"),
            ("-X", "DSSD80T", r"1438 free\s+360 KB\s+2S/1D\s+80T"),
            ("--set-geometry", "20T",
             r"1438 free\s+360 KB\s+2S/1D\s+20T")]:  # geom mismatch
        xdm(Disks.work, c, g)
        with open(Files.output, "w") as fout:
            xdm(Disks.work, "-i", "-q", stdout=fout)
        check_file_matches(Files.output, [(0, p)])
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "--initialize", "SS80T", stderr=ferr, rc=1)
        xdm(Disks.work, "--resize", "2S", stderr=ferr, rc=1)
        xdm(Disks.work, "--resize", "80T", stderr=ferr, rc=1)
        xdm(Disks.work, "--set-geometry", "123", stderr=ferr, rc=1)

    # xdm99 vs real images
    rfile = os.path.join(Dirs.refs, "ti-text")  # TEXT D/V80
    with open(Files.output, "w") as fout, open(Files.error, "w") as ferr:
        xdm(Disks.work, "-X", "sssd", "-n", "TI-DISK", stderr=ferr, rc=0)
        xdm(Disks.work, "-a", rfile, "-n", "TEXT", "-f", "dv80",
            stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=0)
        check_disks_eq(Disks.work, Disks.tisssd)
        xdm(Disks.work, "-X", "dsdd", "-n", "TI-DISK", stderr=ferr, rc=0)
        xdm(Disks.work, "-a", rfile, "-n", "TEXT", "-f", "dv80",
            stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=0)
        check_disks_eq(Disks.work, Disks.tidsdd)
        xdm(Disks.work, "-Z", "sssd", stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=0)
        check_disks_eq(Disks.work, Disks.tisssd)
        xdm(Disks.work, "--set-geometry", "ssdd", stderr=ferr, rc=0)  # warn
        check_file_len(Files.error, min_lines=1, max_lines=1)
        xdm(Disks.work, "-i", stdout=fout, stderr=ferr, rc=0)  # warn
        check_file_len(Files.error, min_lines=2, max_lines=2)
        xdm(Disks.work, "-Z", "dsdd", stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=2)
        check_disks_eq(Disks.work, Disks.tidsdd)
        xdm(Disks.work, "--set-geometry", "ssdd80t", stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=2)
        xdm(Disks.work, "-X", "dssd80t", "-n", "TI-DSSD80", stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=2)
        check_disks_eq(Disks.work, Disks.tidssd80)

    # repair disks
    shutil.copyfile(Disks.bad, Disks.work)
    with open(Files.output, "w") as f1, open(Files.reference, "w") as f2:
        xdm(Disks.work, "-C", stderr=f1, rc=1)
        xdm(Disks.work, "-R", stderr=f2)
    check_file_len(Files.output, min_lines=2)
    with open(Files.output, "w") as f1:
        xdm(Disks.work, "-C", stderr=f1)
    check_file_len(Files.output, max_lines=0)

    # FIAD operations
    shutil.copyfile(Disks.recsgen, Disks.work)
    xdm(Disks.work, "-e", "PROG00255", "DV064X010", "-t")
    xdm(Disks.work, "-e", "PROG00255", "-t", "-o", Files.output)
    check_files_eq("CLI", Files.output, "prog00255.tfi", "PROGRAM")
    xdm(Disks.work, "-e", "DV064X010", "-t", "-o", Files.output)
    check_files_eq("CLI", Files.output, "dv064x010.tfi", "PROGRAM")
    with open(Files.output, "w") as f:
        xdm("-I", "prog00255.tfi", "dv064x010.tfi", stdout=f)
    xdm(Disks.work, "-e", "PROG00255", "DV064X010", "-9")
    xdm(Disks.work, "-e", "PROG00255", "-9", "-o", Files.output)
    check_files_eq("CLI", Files.output, "prog00255.v9t9", "PROGRAM")
    xdm(Disks.work, "-e", "DV064X010", "-9", "-o", Files.output)
    check_files_eq("CLI", Files.output, "dv064x010.v9t9", "PROGRAM")
    with open(Files.output, "w") as f:
        xdm("-I", "prog00255.v9t9", "dv064x010.v9t9", stdout=f)
    xdm(Disks.work, "-e", "PROG00255")
    xdm("-T", "prog00255", "-o", Files.output)
    check_files_eq("CLI", Files.output, "prog00255.tfi", "PROGRAM",
                   Masks.TIFile)
    xdm("-T", "prog00255", "-9", "-o", Files.output)
    check_files_eq("CLI", Files.output, "prog00255.v9t9", "PROGRAM",
                   Masks.v9t9)
    xdm(Disks.work, "-e", "DV064X010", "-o", Files.reference)
    xdm("-F", "dv064x010.tfi")
    check_files_eq("CLI", "dv064x010", Files.reference, "DIS/VAR 64")
    xdm("-F", "dv064x010.tfi", "-o", Files.output)
    check_files_eq("CLI", Files.output, "dv064x010", "PROGRAM")
    xdm("-F", "dv064x010.v9t9", "-9")
    check_files_eq("CLI", "dv064x010", Files.reference, "DIS/VAR 64")
    xdm("-F", "dv064x010.v9t9", "-o", Files.output)
    check_files_eq("CLI", Files.output, "dv064x010", "PROGRAM")
    xdm("-T", "dv064x010", "-o", Files.output, "-n", "DV064X010",
        "-f", "DIS/VAR 64")
    check_files_eq("CLI", Files.output, "dv064x010.tfi", "PROGRAM",
                   Masks.TIFile)
    os.remove("dv064x010.tfi")
    xdm("-T", "dv064x010", "-n", "DV064X010", "-f", "DIS/VAR 64")
    check_files_eq("CLI", "dv064x010.tfi", Files.output, "PROGRAM",
                   Masks.TIFile)
    xdm("-T", "dv064x010", "-9", "-o", Files.output, "-n", "DV064X010",
        "-f", "DIS/VAR 64")
    check_files_eq("CLI", Files.output, "dv064x010.v9t9", "PROGRAM",
                   Masks.v9t9)
    os.remove("dv064x010.v9t9")
    xdm("-T", "dv064x010", "-9", "-n", "DV064X010", "-f", "DIS/VAR 64")
    check_files_eq("CLI", "dv064x010.v9t9", Files.output, "PROGRAM",
                   Masks.v9t9)

    # TI names
    shutil.copyfile(Disks.recsdis, Disks.work)
    xdm(Disks.work, "-t", "-e", "F16", "V16")
    xdm(Disks.work, "-t", "-e", "F16", "V16", "--ti-names")
    check_files_eq("TI names", "F16", "f16.tfi", "PROGRAM")
    check_files_eq("TI names", "V16", "v16.tfi", "PROGRAM")
    xdm(Disks.work, "-9", "-e", "F1")
    xdm(Disks.work, "-9", "-e", "F1", "--ti-names")
    check_files_eq("TI names", "F1", "f1.v9t9", "PROGRAM")
    xdm(Disks.work, "-e", "V1", "-o", Files.reference)
    xdm(Disks.work, "-e", "V1", "--ti-names")
    check_files_eq("TI names", "V1", Files.reference, "PROGRAM")

    # conversion between TI/PC names ('.' vs '/')
    file1 = os.path.join(Dirs.refs, "vardis")
    with open(os.path.join(Dirs.tmp, "file.y.z"), "wb") as f:
        f.write("\xff" * 100)
    xdm(Disks.work, "-X", "sssd", "-a", file1, "-n", "FILE.X")
    xdm(Disks.work, "-a", os.path.join(Dirs.tmp, "file.y.z"))
    with open(Files.output, "w") as fout:
        xdm(Disks.work, "-i", stdout=fout, rc=0)
    check_lines_start(Files.output, ("FILE/X", "FILE/Y"), skip=1)
    xdm(Disks.work, "-r", "FILE/X:NEW.FILE/X")
    with open(Files.output, "w") as fout:
        xdm(Disks.work, "-i", stdout=fout, rc=0)
    check_lines_start(Files.output, ("NEW/FILE/X", "FILE/Y"), skip=1)
    xdm(Disks.work, "-e", "*")
    check_file_exists("new.file.x")
    os.remove("new.file.x")
    check_file_exists("file.y")
    os.remove("file.y")
    xdm(Disks.work, "-e", "FILE/Y", "-t")
    check_file_exists("file.y.tfi")
    os.remove("file.y.tfi")

    # rename disk (-n)
    xdm(Disks.work, "-X", "sssd", "-n", "FIRST.NAME")
    with open(Files.output, "w") as fout:
        xdm(Disks.work, "-i", stdout=fout, rc=0)
    check_lines_start(Files.output, ("FIRST/NAME",))
    xdm(Disks.work, "-n", "SECND.NAME")
    with open(Files.output, "w") as fout:
        xdm(Disks.work, "-i", stdout=fout, rc=0)
    check_lines_start(Files.output, ("SECND/NAME",))

    # output directory -o <dir>
    ref1 = os.path.join(Dirs.refs, "glob1")
    ref2 = os.path.join(Dirs.refs, "glob12")
    xdm(Disks.work, "-X", "sssd", "-a", ref1, ref2)
    xdm(Disks.work, "-e", "GLOB*", "-o", Dirs.tmp)
    check_file_exists(os.path.join(Dirs.tmp, "glob1"))
    os.remove(os.path.join(Dirs.tmp, "glob1"))
    check_file_exists(os.path.join(Dirs.tmp, "glob12"))
    os.remove(os.path.join(Dirs.tmp, "glob12"))
    xdm(Disks.work, "-X", "sssd", "-a", ref1, ref2)
    with open(Files.error, "w") as ferr:
        xdm(Disks.work, "-e", "GLOB*", "-o", Files.output, stderr=ferr, rc=1)

    # stdin and stdout
    ref = os.path.join(Dirs.refs, "vardis")
    with open(ref, "r") as fin:
        xdm(Disks.work, "--initialize", "sssd", "-a", "-", "-f", "dv40",
            stdin=fin)
    with open(Files.output, "w") as fout:
        xdm(Disks.work, "-e", "STDIN", "-o", "-", stdout=fout)
    check_files_eq("stdin/stdout", Files.output, ref, "DV")
    ref = os.path.join(Dirs.refs, "sector1")
    with open(Files.reference, "wb") as fout:
        xdm(Disks.work, "--initialize", "sssd", "-a", ref, "-n", "T",
            "-o", "-", stdout=fout)
    with open(Files.reference, "rb") as fin:
        xdm("-", "-e", "T", "-o", Files.output, stdin=fin)
    check_files_eq("stdin/stdout", Files.output, ref, "P")

    # usage errors
    with open(Files.error, "w") as ferr:
        xdm("-a", Files.output, stderr=ferr, rc=1)
        xdm("-T", "prog00001", "prog00002", "-o", Files.output,
            stderr=ferr, rc=1)
        xdm("-T", "prog00001", "prog00002", "-9", "-o", Files.output,
            stderr=ferr, rc=1)
        xdm("-F", "-o", Files.output, stderr=ferr, rc=2)

    # cleanup
    os.remove(Files.output)
    os.remove(Files.reference)
    os.remove(Files.error)
    os.remove(Disks.work)
    os.remove(Disks.tifiles)
    for fn in ["prog00001", "prog00002", "prog00255", "dv064x010",
               "df002x001", "df127x001", "df127x010", "df127x020p",
               "prog00001.tfi", "prog00002.tfi", "prog00255.tfi",
               "dv064x010.tfi", "prog00002.v9t9", "prog00255.v9t9",
               "dv064x010.v9t9", "F16", "V16", "f16.tfi", "v16.tfi",
               "F1", "f1.v9t9", "V1"]:
        os.remove(fn)
import argparse

# assumed import path for the Keras MobileNet used below
from keras.applications.mobilenet import MobileNet

import utils as utils

IMAGES_FOLDER = "../images"

# Parsing
parser = argparse.ArgumentParser()
parser.add_argument("--term", type=str,
                    help="Pass the term of the image you are looking for")
parser.add_argument("--build-index", action='store_true', default=False,
                    help="Recreate file with image probabilities")
args = parser.parse_args()
term = args.term

model = MobileNet(weights='imagenet')
images = utils.get_imgs_paths(IMAGES_FOLDER)
_id = utils.term_to_id(term)

if utils.check_file_exists() and not args.build_index:
    probs = utils.open_probs()
else:
    probs = utils.get_imgs_probs(model, images)
    utils.save_probs(probs)

probs_id = utils.get_probs_id(probs, _id)
top_imgs = utils.get_top_probs(probs_id, 3)
utils.show_imgs(images, top_imgs)
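# A hedged invocation sketch (the script file name image_search.py is made
# up): build the probability index on the first run, then reuse it.
#
#   python image_search.py --term dog --build-index
#   python image_search.py --term cat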
def runtest():
    """check command line interface"""

    # setup
    shutil.copyfile(Disks.recsgen, Disks.work)

    # disk image operations
    with open(Files.output, 'w') as f1, open(Files.reference, 'w') as f2:
        xdm(Disks.work, '-i', stdout=f2)
        xdm(Disks.work, '-q', stdout=f1)
    check_files_eq('CLI', Files.output, Files.reference, 'DIS/VAR255')
    ref_prog = os.path.join(Dirs.refs, 'prog00255')
    xdm(Disks.work, '-e', 'PROG00255', '-o', Files.output)
    check_files_eq('CLI', Files.output, ref_prog, 'PROGRAM')
    ref_dv = os.path.join(Dirs.refs, 'dv064x010')
    xdm(Disks.work, '-e', 'DV064X010', '-o', Files.output)
    check_files_eq('CLI', Files.output, ref_dv, 'DIS/VAR64')
    ref_df = os.path.join(Dirs.refs, 'df002x001')
    xdm(Disks.work, '-e', 'DF002X001', '-o', Files.output)
    check_files_eq('CLI', Files.output, ref_df, 'DIS/FIX 2')
    with open(Files.output, 'w') as f1:
        xdm(Disks.work, '-p', 'DV064X010', stdout=f1)
    check_files_eq('CLI', Files.output, ref_dv, 'DIS/VAR 64')
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '-e', 'INVALID', stderr=ferr, rc=1)
    xdm(Disks.work, '-S', '0x01', '-o', Files.output)
    check_files_eq('CLI', Files.output, os.path.join(Dirs.refs, 'sector1'),
                   'DIS/VAR255')

    # add, rename, remove files
    shutil.copyfile(Disks.blank, Disks.work)
    xdm(Disks.work, '-a', ref_prog, ref_dv, ref_df)
    xdm(Disks.work, '-e', 'PROG00255', '-o', Files.output)
    check_files_eq('CLI', Files.output, ref_prog, 'PROGRAM')
    xdm(Disks.work, '-e', 'DV064X010', '-o', Files.output)
    check_files_eq('CLI', Files.output, ref_dv,
                   'PROGRAM')  # use PROGRAM here to compare!
    shutil.copyfile(Disks.work, Disks.tifiles)
    xdm(Disks.work, '-e', 'PROG00255', '-o', Files.reference)
    xdm(Disks.work, '-r', 'PROG00255:OTHERNAME')
    xdm(Disks.work, '-e', 'OTHERNAME', '-o', Files.output)
    check_files_eq('CLI', Files.output, Files.reference, 'P')
    xdm(Disks.work, '-r', 'OTHERNAME:PROG00255')
    check_files_eq('CLI', Disks.work, Disks.tifiles, 'P')
    xdm(Disks.work, '-d', 'PROG00255', 'DV064X010', 'DF002X001')
    with open(Files.output, 'w') as f1, open(Files.reference, 'w') as f2:
        xdm(Disks.work, '-i', stdout=f1)
        xdm(Disks.blank, '-i', stdout=f2)
    check_files_eq('CLI', Files.output, Files.reference, 'DIS/VAR255')
    shutil.copyfile(Disks.recsgen, Disks.work)
    xdm(Disks.work, '-e', 'DF127*', 'PROG00001', 'PROG00002')
    if (not os.path.isfile('df127x001') or not os.path.isfile('df127x010')
            or not os.path.isfile('df127x020p')):
        error('CLI', 'DF127*: Missing files')
    xdm(Disks.work, '-d', 'PROG*', 'D?010X060')
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '-e', 'PROG00255', stderr=ferr, rc=1)
        xdm(Disks.work, '-e', 'DV010X060', stderr=ferr, rc=1)
        xdm(Disks.work, '-e', 'DF010X060', stderr=ferr, rc=1)

    # multi-file naming
    xdm(Disks.work, '-n', 'MULTI', '-a', 'prog00001', ref_prog, 'prog00002')
    xdm(Disks.work, '-e', 'MULTI', '-o', Files.output)
    check_files_eq('CLI', 'prog00001', Files.output, 'P')
    xdm(Disks.work, '-e', 'MULTJ', '-o', Files.output)
    check_files_eq('CLI', ref_prog, Files.output, 'P')
    xdm(Disks.work, '-e', 'MULTK', '-o', Files.output)
    check_files_eq('CLI', 'prog00002', Files.output, 'P')
    xdm('-T', 'prog00001', ref_prog, 'prog00002',
        '-n', 'MULTFI')  # -n applies to internal names!
    xdm(Disks.work, '-t', '-a', 'prog00001.tfi', ref_prog + '.tfi',
        'prog00002.tfi')
    xdm(Disks.work, '-e', 'MULTFI', '-o', Files.output)
    check_files_eq('CLI', 'prog00001', Files.output, 'P')
    xdm(Disks.work, '-e', 'MULTFJ', '-o', Files.output)
    check_files_eq('CLI', ref_prog, Files.output, 'P')
    xdm(Disks.work, '-e', 'MULTFK', '-o', Files.output)
    check_files_eq('CLI', 'prog00002', Files.output, 'P')
    xdm('-T', ref_prog, 'prog00002', '-9', '-n', 'MULV9T')
    xdm(Disks.work, '-9', '-a', ref_prog + '.v9t9', 'prog00002.v9t9')
    xdm(Disks.work, '-e', 'MULV9T', '-o', Files.output)
    check_files_eq('CLI', ref_prog, Files.output, 'P')
    xdm(Disks.work, '-e', 'MULV9U', '-o', Files.output)
    check_files_eq('CLI', 'prog00002', Files.output, 'P')
    ref = os.path.join(Dirs.refs, 'glob')
    xdm(Disks.work, '-a', ref + '?', '-n', 'GLOBA1', shell=True)
    xdm(Disks.work, '-e', 'GLOBA1', '-o', Files.output)
    xdm(Disks.work, '-e', 'GLOBA2', '-o', Files.output)
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '-e', 'GLOBA3', '-o', Files.output, stderr=ferr, rc=1)
    xdm(Disks.work, '-d', 'GLOB*', '-o', Files.output)
    xdm(Disks.work, '-a', ref + '*', '-n', 'GLOBB1', shell=True)
    xdm(Disks.work, '-e', 'GLOBB1', '-o', Files.output)
    xdm(Disks.work, '-e', 'GLOBB2', '-o', Files.output)
    xdm(Disks.work, '-e', 'GLOBB3', '-o', Files.output)

    # initialize disk
    xdm(Disks.work, '--initialize', '360', '-n', 'SSSD')
    check_file_size(Disks.work, 360 * 256)
    check_files_eq('CLI', Disks.work, Disks.blank, 'P')
    os.remove(Disks.work)
    xdm(Disks.work, '--initialize', 'SSSD', '-n', 'SSSD')
    check_file_size(Disks.work, 360 * 256)
    check_files_eq('CLI', Disks.work, Disks.blank, 'P')
    xdm(Disks.work, '--initialize', '800', '-n', 'INIT')
    with open(Files.output, 'w') as f:
        xdm(Disks.work, '-i', '-q', stdout=f)
    check_file_matches(Files.output, [(0, r'\s2\s+used\s+798\s+free\s')])
    os.remove(Disks.work)
    xdm(Disks.work, '--initialize', 'CF', '-n', 'INIT', '-q')
    with open(Files.output, 'w') as f:
        xdm(Disks.work, '-i', '-q', stdout=f)
    check_file_matches(Files.output, [(0, r'\s2\s+used\s+1598\s+free\s')])
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '--initialize', '1', stderr=ferr, rc=1)
        xdm(Disks.work, '--initialize', '1601', stderr=ferr, rc=1)
        xdm(Disks.work, '--initialize', 'FOO', stderr=ferr, rc=1)
    f = os.path.join(Dirs.refs, 'vardis')
    for n in ['AA', 'BB']:
        xdm(Disks.work, '--initialize', 'SSSD', '-a', f, '-n', n)
        with open(Files.output, 'w') as fout:
            xdm(Disks.work, '-i', stdout=fout)
        check_file_matches(Files.output, [(0, n + r'\s+'), (2, n + r'\s+')])

    # set geometry
    xdm(Disks.work, '--initialize', '1600', '-n', 'GEO')
    for g, p in [('1S1D', r'1S/1D\s+40T'), ('99T8D7S', r'7S/8D\s+99T'),
                 ('22TDD', r'7S/2D\s+22T'), ('DSSD', r'2S/1D\s+22T'),
                 ('1T', r'2S/1D\s+1T'), ('3D10T9S', r'9S/3D\s+10T'),
                 ('SDDS', r'2S/1D\s+10T'), ('SS', r'1S/1D\s+10T')]:
        xdm(Disks.work, '--set-geometry', g, '-q')
        with open(Files.output, 'w') as fout:
            xdm(Disks.work, '-i', '-q', stdout=fout)
        check_file_matches(Files.output, [(0, p)])

    # resize disk
    shutil.copyfile(Disks.recsgen, Disks.work)
    for s in ['800', '248', '1600']:
        xdm(Disks.work, '-Z', s, '-q')
        for f in ['PROG02560', 'DF129X010', 'DV127X010', 'DV255X015P']:
            xdm(Disks.work, '-e', f, '-q', '-o', Files.output)
            xdm(Disks.recsgen, '-e', f, '-o', Files.reference)
            check_files_eq('CLI', Files.output, Files.reference, 'PROGRAM')
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '-Z', '240', stderr=ferr, rc=1)
        xdm(Disks.work, '-Z', '1608', stderr=ferr, rc=1)

    # new geometry handling (v1.5.3)
    for c, g, p in [
            ('--initialize', 'SSSD', r'358 free\s+90 KB\s+1S/1D\s+40T'),
            ('--resize', 'DS1D', r'718 free\s+180 KB\s+2S/1D\s+40T'),
            ('--set-geometry', '80T',
             r'718 free\s+180 KB\s+2S/1D\s+80T'),  # geom mismatch
            ('--initialize', '408', r'406 free\s+102 KB\s+2S/1D\s+40T'),
            ('--resize', 'DSSD80T', r'1438 free\s+360 KB\s+2S/1D\s+80T'),
            ('--resize', '2DSS', r'718 free\s+180 KB\s+1S/2D\s+40T'),
            ('-Z', '208', r'206 free\s+52 KB\s+1S/2D\s+40T'),
            ('--set-geometry', 'SD80T', r'206 free\s+52 KB\s+1S/1D\s+80T'),
            ('-X', 'DSSD80T', r'1438 free\s+360 KB\s+2S/1D\s+80T'),
            ('--set-geometry', '20T',
             r'1438 free\s+360 KB\s+2S/1D\s+20T')]:  # geom mismatch
        xdm(Disks.work, c, g, '-q')
        with open(Files.output, 'w') as fout:
            xdm(Disks.work, '-i', '-q', stdout=fout)
        check_file_matches(Files.output, [(0, p)])
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '--initialize', 'SS80T', stderr=ferr, rc=1)
        xdm(Disks.work, '--resize', '2S', stderr=ferr, rc=1)
        xdm(Disks.work, '--resize', '80T', stderr=ferr, rc=1)
        xdm(Disks.work, '--set-geometry', '123', stderr=ferr, rc=1)

    # xdm99 vs real images
    rfile = os.path.join(Dirs.refs, 'ti-text')  # TEXT D/V80
    with open(Files.output, 'w') as fout, open(Files.error, 'w') as ferr:
        xdm(Disks.work, '-X', 'sssd', '-n', 'TI-DISK', stderr=ferr, rc=0)
        xdm(Disks.work, '-a', rfile, '-n', 'TEXT', '-f', 'dv80',
            stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=0)
        check_disks_eq(Disks.work, Disks.tisssd)
        xdm(Disks.work, '-X', 'dsdd', '-n', 'TI-DISK', stderr=ferr, rc=0)
        xdm(Disks.work, '-a', rfile, '-n', 'TEXT', '-f', 'dv80',
            stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=0)
        check_disks_eq(Disks.work, Disks.tidsdd)
        xdm(Disks.work, '-Z', 'sssd', stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=0)
        check_disks_eq(Disks.work, Disks.tisssd)
        xdm(Disks.work, '--set-geometry', 'ssdd', stderr=ferr, rc=0)  # warn
        check_file_len(Files.error, min_lines=1, max_lines=1)
        xdm(Disks.work, '-i', stdout=fout, stderr=ferr, rc=0)  # warn
        check_file_len(Files.error, min_lines=2, max_lines=2)
        xdm(Disks.work, '-Z', 'dsdd', stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=2)
        check_disks_eq(Disks.work, Disks.tidsdd)
        xdm(Disks.work, '--set-geometry', 'ssdd80t', stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=2)
        xdm(Disks.work, '-X', 'dssd80t', '-n', 'TI-DSSD80', stderr=ferr, rc=0)
        check_file_len(Files.error, max_lines=2)
        check_disks_eq(Disks.work, Disks.tidssd80)

    # repair disks
    shutil.copyfile(Disks.bad, Disks.work)
    with open(Files.output, 'w') as f1, open(Files.reference, 'w') as f2:
        xdm(Disks.work, '-C', stderr=f1, rc=1)
        xdm(Disks.work, '-R', stderr=f2)
    check_file_len(Files.output, min_lines=2)
    with open(Files.output, 'w') as f1:
        xdm(Disks.work, '-C', stderr=f1)
    check_file_len(Files.output, max_lines=0)

    # FIAD operations
    shutil.copyfile(Disks.recsgen, Disks.work)
    xdm(Disks.work, '-e', 'PROG00255', 'DV064X010', '-t')
    xdm(Disks.work, '-e', 'PROG00255', '-t', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'prog00255.tfi', 'PROGRAM')
    xdm(Disks.work, '-e', 'DV064X010', '-t', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'dv064x010.tfi', 'PROGRAM')
    with open(Files.output, 'w') as f:
        xdm('-I', 'prog00255.tfi', 'dv064x010.tfi', stdout=f)
    xdm(Disks.work, '-e', 'PROG00255', 'DV064X010', '-9')
    xdm(Disks.work, '-e', 'PROG00255', '-9', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'prog00255.v9t9', 'PROGRAM')
    xdm(Disks.work, '-e', 'DV064X010', '-9', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'dv064x010.v9t9', 'PROGRAM')
    with open(Files.output, 'w') as f:
        xdm('-I', 'prog00255.v9t9', 'dv064x010.v9t9', stdout=f)
    xdm(Disks.work, '-e', 'PROG00255')
    xdm('-T', 'prog00255', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'prog00255.tfi', 'PROGRAM',
                   Masks.TIFile)
    xdm('-T', 'prog00255', '-9', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'prog00255.v9t9', 'PROGRAM',
                   Masks.v9t9)
    xdm(Disks.work, '-e', 'DV064X010', '-o', Files.reference)
    xdm('-F', 'dv064x010.tfi')
    check_files_eq('CLI', 'dv064x010', Files.reference, 'DIS/VAR 64')
    xdm('-F', 'dv064x010.tfi', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'dv064x010', 'PROGRAM')
    xdm('-F', 'dv064x010.v9t9', '-9')
    check_files_eq('CLI', 'dv064x010', Files.reference, 'DIS/VAR 64')
    xdm('-F', 'dv064x010.v9t9', '-o', Files.output)
    check_files_eq('CLI', Files.output, 'dv064x010', 'PROGRAM')
    xdm('-T', 'dv064x010', '-o', Files.output, '-n', 'DV064X010',
        '-f', 'DIS/VAR 64')
    check_files_eq('CLI', Files.output, 'dv064x010.tfi', 'PROGRAM',
                   Masks.TIFile)
    os.remove('dv064x010.tfi')
    xdm('-T', 'dv064x010', '-n', 'DV064X010', '-f', 'DIS/VAR 64')
    check_files_eq('CLI', 'dv064x010.tfi', Files.output, 'PROGRAM',
                   Masks.TIFile)
    xdm('-T', 'dv064x010', '-9', '-o', Files.output, '-n', 'DV064X010',
        '-f', 'DIS/VAR 64')
    check_files_eq('CLI', Files.output, 'dv064x010.v9t9', 'PROGRAM',
                   Masks.v9t9)
    os.remove('dv064x010.v9t9')
    xdm('-T', 'dv064x010', '-9', '-n', 'DV064X010', '-f', 'DIS/VAR 64')
    check_files_eq('CLI', 'dv064x010.v9t9', Files.output, 'PROGRAM',
                   Masks.v9t9)

    # TI names
    shutil.copyfile(Disks.recsdis, Disks.work)
    xdm(Disks.work, '-t', '-e', 'F16', 'V16')
    xdm(Disks.work, '-t', '-e', 'F16', 'V16', '--ti-names')
    check_files_eq('TI names', 'F16', 'f16.tfi', 'PROGRAM')
    check_files_eq('TI names', 'V16', 'v16.tfi', 'PROGRAM')
    xdm(Disks.work, '-9', '-e', 'F1')
    xdm(Disks.work, '-9', '-e', 'F1', '--ti-names')
    check_files_eq('TI names', 'F1', 'f1.v9t9', 'PROGRAM')
    xdm(Disks.work, '-e', 'V1', '-o', Files.reference)
    xdm(Disks.work, '-e', 'V1', '--ti-names')
    check_files_eq('TI names', 'V1', Files.reference, 'PROGRAM')

    # conversion between TI/PC names ('.' vs '/')
    file1 = os.path.join(Dirs.refs, 'vardis')
    with open(os.path.join(Dirs.tmp, 'file.y.z'), 'wb') as f:
        f.write(b'\xff' * 100)
    xdm(Disks.work, '-X', 'sssd', '-a', file1, '-n', 'FILE.X')
    xdm(Disks.work, '-a', os.path.join(Dirs.tmp, 'file.y.z'))
    with open(Files.output, 'w') as fout:
        xdm(Disks.work, '-i', stdout=fout, rc=0)
    check_lines_start(Files.output, ('FILE/X', 'FILE/Y'), skip=1)
    xdm(Disks.work, '-r', 'FILE/X:NEW.FILE/X')
    with open(Files.output, 'w') as fout:
        xdm(Disks.work, '-i', stdout=fout, rc=0)
    check_lines_start(Files.output, ('NEW/FILE/X', 'FILE/Y'), skip=1)
    xdm(Disks.work, '-e', '*')
    check_file_exists('new.file.x')
    os.remove('new.file.x')
    check_file_exists('file.y')
    os.remove('file.y')
    xdm(Disks.work, '-e', 'FILE/Y', '-t')
    check_file_exists('file.y.tfi')
    os.remove('file.y.tfi')

    # rename disk (-n)
    xdm(Disks.work, '-X', 'sssd', '-n', 'FIRST.NAME')
    with open(Files.output, 'w') as fout:
        xdm(Disks.work, '-i', stdout=fout, rc=0)
    check_lines_start(Files.output, ('FIRST/NAME',))
    xdm(Disks.work, '-n', 'SECND.NAME')
    with open(Files.output, 'w') as fout:
        xdm(Disks.work, '-i', stdout=fout, rc=0)
    check_lines_start(Files.output, ('SECND/NAME',))

    # output directory -o <dir>
    ref1 = os.path.join(Dirs.refs, 'glob1')
    ref2 = os.path.join(Dirs.refs, 'glob12')
    xdm(Disks.work, '-X', 'sssd', '-a', ref1, ref2)
    xdm(Disks.work, '-e', 'GLOB*', '-o', Dirs.tmp)
    check_file_exists(os.path.join(Dirs.tmp, 'glob1'))
    os.remove(os.path.join(Dirs.tmp, 'glob1'))
    check_file_exists(os.path.join(Dirs.tmp, 'glob12'))
    os.remove(os.path.join(Dirs.tmp, 'glob12'))
    xdm(Disks.work, '-X', 'sssd', '-a', ref1, ref2)
    with open(Files.error, 'w') as ferr:
        xdm(Disks.work, '-e', 'GLOB*', '-o', Files.output, stderr=ferr, rc=1)

    # stdin and stdout
    ref = os.path.join(Dirs.refs, 'vardis')
    with open(ref, 'r') as fin:
        xdm(Disks.work, '--initialize', 'sssd', '-a', '-', '-f', 'dv40',
            stdin=fin)
    with open(Files.output, 'w') as fout:
        xdm(Disks.work, '-e', 'STDIN', '-o', '-', stdout=fout)
    check_files_eq('stdin/stdout', Files.output, ref, 'DV')
    ref = os.path.join(Dirs.refs, 'sector1')
    with open(Files.reference, 'wb') as fout:
        xdm(Disks.work, '--initialize', 'sssd', '-a', ref, '-n', 'T',
            '-o', '-', stdout=fout)
    with open(Files.reference, 'rb') as fin:
        xdm('-', '-e', 'T', '-o', Files.output, stdin=fin)
    check_files_eq('stdin/stdout', Files.output, ref, 'P')

    # usage errors
    with open(Files.error, 'w') as ferr:
        xdm('-a', Files.output, stderr=ferr, rc=2)
        xdm('-T', 'prog00001', 'prog00002', '-o', Files.output,
            stderr=ferr, rc=1)
        xdm('-T', 'prog00001', 'prog00002', '-9', '-o', Files.output,
            stderr=ferr, rc=1)
        xdm('-F', '-o', Files.output, stderr=ferr, rc=2)

    # cleanup
    os.remove(Files.output)
    os.remove(Files.reference)
    os.remove(Files.error)
    os.remove(Disks.work)
    os.remove(Disks.tifiles)
    for fn in ['prog00001', 'prog00002', 'df127x001', 'df127x010',
               'df127x020p', 'prog00001.tfi', 'prog00002.tfi',
               'prog00255.tfi', 'dv064x010.tfi', 'prog00002.v9t9',
               'prog00255.v9t9', 'dv064x010.v9t9', 'F16', 'V16',
               'f16.tfi', 'v16.tfi', 'F1', 'f1.v9t9', 'V1']:
        os.remove(fn)
def wrapper(*args):
    if utils.check_file_exists(package_file):
        return func(*args)
    else:
        print('\'package.yml\' doesn\'t exist')
        exit(0)
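# wrapper is the inner function of a decorator; a minimal sketch of the
# enclosing decorator, assuming a module-level package_file name (the
# decorator name require_package_file is illustrative, not from the
# original code):
import functools

def require_package_file(func):
    '''Run func only when package.yml is present, else bail out.'''
    @functools.wraps(func)
    def wrapper(*args):
        if utils.check_file_exists(package_file):
            return func(*args)
        print('\'package.yml\' doesn\'t exist')
        exit(0)
    return wrapper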
def _prepare_post_dir(self):
    '''
    Create and prepare post_dir
    '''
    logger.debug('Preparing postprd directory: %s' % config['post_dir'])
    # create config['post_dir'] if it does not exist yet
    utils._create_directory(config['post_dir'])
    # link all the relevant files needed to compute various diagnostics
    relpath_to_link = [
        'EmisCoeff/Big_Endian/EmisCoeff.bin',
        'AerosolCoeff/Big_Endian/AerosolCoeff.bin',
        'CloudCoeff/Big_Endian/CloudCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_g11.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_g11.TauCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_g12.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_g12.TauCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_g13.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_g13.TauCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_g15.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_g15.TauCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_mt1r.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_mt1r.TauCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_mt2.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_mt2.TauCoeff.bin',
        'SpcCoeff/Big_Endian/imgr_insat3d.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/imgr_insat3d.TauCoeff.bin',
        'SpcCoeff/Big_Endian/amsre_aqua.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/amsre_aqua.TauCoeff.bin',
        'SpcCoeff/Big_Endian/tmi_trmm.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/tmi_trmm.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmi_f13.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmi_f13.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmi_f14.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmi_f14.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmi_f15.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmi_f15.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmis_f16.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmis_f16.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmis_f17.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmis_f17.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmis_f18.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmis_f18.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmis_f19.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmis_f19.TauCoeff.bin',
        'SpcCoeff/Big_Endian/ssmis_f20.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/ssmis_f20.TauCoeff.bin',
        'SpcCoeff/Big_Endian/seviri_m10.SpcCoeff.bin',
        'TauCoeff/ODPS/Big_Endian/seviri_m10.TauCoeff.bin',
        'SpcCoeff/Big_Endian/v.seviri_m10.SpcCoeff.bin']
    # abspath coefficients for crtm2 (simulated synthetic satellites)
    abspath_coeff = [os.path.join(config['crtm_dir'], relpath)
                     for relpath in relpath_to_link]
    # abspath wrf_cntrl param file
    abspath_pf = os.path.join(config['upp_domain_dir'], 'parm',
                              'wrf_cntrl.parm')
    # concatenate lists of paths
    abspath_to_link = abspath_coeff + [abspath_pf]
    # create a symlink for every file in abspath_to_link
    for fl in abspath_to_link:
        utils.check_file_exists(fl)  # check if file exists and is readable
        os.symlink(fl, os.path.join(config['post_dir'], os.path.basename(fl)))
    # symlink wrf_cntrl.parm to config['post_dir']/fort.14
    os.symlink(abspath_pf, os.path.join(config['post_dir'], 'fort.14'))
    # symlink microphysics tables - the code used is based on the
    # mp_physics option used in the wrfout file
    os.symlink(os.path.join(config['wrf_run_dir'], 'ETAMPNEW_DATA'),
               os.path.join(config['post_dir'], 'nam_micro_lookup.dat'))
    os.symlink(os.path.join(config['wrf_run_dir'],
                            'ETAMPNEW_DATA.expanded_rain'),
               os.path.join(config['post_dir'], 'hires_micro_lookup.dat'))
def generate_textid_corpus(args: argparse.Namespace) -> None:
    """
    Read the raw files (in the specified directory), parse and filter them,
    then output the Bert token-ids for all files to another directory.
    :param args: ArgumentParser-parsed arguments
    :return: None
    """
    if args.mode not in VALID_MODES:
        raise ValueError(f"The argument 'mode' needs to be one of "
                         f"{VALID_MODES}, got {args.mode}.")
    if platform.system() == "Darwin" and args.mode in MODES_NEEDING_BLINGFIRE:
        raise Exception(
            f"Got a mode requiring Blingfire (mode = {args.mode}), "
            "yet Blingfire doesn't support MacOS.")

    if not blingfire:
        # If we aren't using blingfire, then we must use spacy
        # for sentence segmentation.
        try:
            spacy_model = spacy.load("en_core_web_sm")
        except OSError:
            print()
            print("Exception:")
            print("Didn't find the model for spacy.")
            print("Run 'python -m spacy download en_core_web_sm'")
            exit(-1)

    # Get the list of input file paths
    in_list = sorted(glob.glob(os.path.join(args.input_dir, "*.txt")))
    if args.max_number_of_books:
        in_list = in_list[:args.max_number_of_books]
        logging.warning(
            f"{colorama.Fore.RED}>>> USING A MAX NUMBER OF BOOKS <<<"
            f"{colorama.Style.RESET_ALL}")

    # Load the blingfire textid model
    if args.mode == "blingfire" and platform.system() == "Darwin":
        raise Exception("BlingFire is not compatible with MacOS.")
    idtok_model = None
    if blingfire and args.mode in MODES_NEEDING_BLINGFIRE:
        model_path = os.path.join(args.textid_dir, args.base_tok_file)
        utils.check_file_exists(model_path)
        idtok_model = blingfire.load_model(model_path)

    utils.check_file_exists(args.vocab_path)
    bert_full_tokenizer = tokenization.FullTokenizer(
        vocab_file=str(args.vocab_path), do_lower_case=False)

    if args.mode == "check":
        with open(args.vocab_path) as fin:
            ids_to_words = fin.read().strip().split("\n")
        words_to_ids = {word: i for i, word in enumerate(ids_to_words)}

    if args.mode != "blingfire":
        print("WARNING: We aren't in a mode that exclusively uses Blingfire. "
              f"This will be slow.\nMode: {args.mode}")

    # Iterate over each raw file
    logging.info(f"Main Loop - {args.mode}")
    for file_idx, in_file_path in enumerate(tqdm.tqdm(in_list)):
        # Generate the output file path
        file_basename = os.path.splitext(os.path.basename(in_file_path))[0]
        out_file_path = os.path.join(args.output_dir, file_basename)

        # We read the whole file, then cut it into chunks of CHUNK_MAX_LEN
        # characters. This seems like a more resistant way to guarantee that
        # we correctly get full sentences. A chunk length of 100k is the
        # longest that doesn't break spacy's sentence tokenizer.
        logging.debug("Loading a file >")
        with open(in_file_path) as in_file:
            file_text = in_file.read().strip()
        if not file_text:
            continue
        logging.debug("< Done loading a file")

        for chunk_idx in range(len(file_text) // CHUNK_MAX_LEN):
            logging.debug("Chunking. >")
            chunk = file_text[chunk_idx * CHUNK_MAX_LEN:
                              (chunk_idx + 1) * CHUNK_MAX_LEN]
            sent_tok_start = time.time()
            logging.debug("< Done chunking.")

            # Get the segmented sentences from this chunk
            # (NOTE: maybe redundant; consider removing if slow)
            logging.debug("Segmentizing sentence. >")
            if blingfire:
                sentences = chunk_to_sentences(chunk)
            else:
                sentences = [str(x) for x in spacy_model(chunk).sents]
            # Ignore the first and last sentences, as they've likely been cut
            # oddly by the chunking process. We lose less than 1/1000th of all
            # sentences by doing this (with a CHUNK_MAX_LEN of 100k).
            logging.debug(f"Number of sentences: {len(sentences)}")
            sentences = sentences[1:-1]
            logging.debug(f"< Done segmentizing sentence. It took "
                          f"{time.time() - sent_tok_start} seconds.")

            # Additional filtering for plaintext sentences
            filter_time_start = time.time()
            logging.debug("Filtering sentences >")
            ft_sentences = filter_sentences(sentences)
            logging.debug(f"< Done filtering sentences. It took "
                          f"{time.time() - filter_time_start} seconds.")

            # Convert each sentence to its textids
            bpe_tok_time_start = time.time()
            logging.debug("Tokenizing sentences >")
            curr_ids = utils.TypedList(np.ndarray)
            for ft_sent in ft_sentences:
                ids = None
                if blingfire:
                    ids = blingfire.text_to_ids(idtok_model, ft_sent,
                                                args.id_seq_length,
                                                args.oov_id)
                if args.mode == "bert-native" or args.mode == "check":
                    bert_tokens = bert_full_tokenizer.tokenize(ft_sent)
                    bert_tok_ids = bert_full_tokenizer.convert_tokens_to_ids(
                        bert_tokens)
                    bert_tok_ids_ = utils.TypedList(int)
                    for x in bert_tok_ids:
                        bert_tok_ids_.append(x)
                    bert_tok_ids = bert_tok_ids_
                    # Pad to id_seq_length, then truncate
                    while len(bert_tok_ids) < args.id_seq_length:
                        bert_tok_ids.append(0)
                    bert_tok_ids = np.array(
                        list(bert_tok_ids),
                        dtype=np.int32)[:args.id_seq_length]
                    if args.mode == "bert-native":
                        ids = bert_tok_ids

                if args.mode == "check":
                    # In the "check" mode, we test that both the bert-native
                    # tokenizer and blingfire return the same thing.
                    utils.check_equal(ids.shape, bert_tok_ids.shape)
                    comp = ids == bert_tok_ids
                    if not np.all(comp):
                        def bert_decode(id_arr):
                            # Map non-padding ids back to vocab words
                            return " ".join(ids_to_words[wid]
                                            for wid in id_arr if wid != 0)

                        print("\n############################################")
                        print("Mismatch between decoders:")
                        print(f"\t Blingfire decoded: "
                              f"\"{bert_decode(ids)}\"")
                        print(f"\t- Bert-native decoded: "
                              f"\"{bert_decode(bert_tok_ids)}\"")
                        print("############################################\n")

                        num_errors = np.sum(np.logical_not(comp))
                        out_of = max(np.sum(ids != 0),
                                     np.sum(bert_tok_ids != 0))
                        if num_errors / out_of >= 1:
                            raise ValueError(f"{num_errors} "
                                             f"different out of {out_of} "
                                             f"non padding values")
                curr_ids.append(ids)
            logging.debug(f"< Done tokenizing sentences. It took "
                          f"{time.time() - bpe_tok_time_start} seconds.")

            concat_time_start = time.time()
            logging.debug("Concatenating the ids. >")
            if not curr_ids:
                logging.warning(">> Warning: empty curr_ids")
            id_mat = np.array(list(curr_ids), dtype=np.int32)
            logging.debug(f"< Done concatenating the ids. Took "
                          f"{time.time() - concat_time_start} seconds.")
            if len(id_mat) == 0:
                logging.warning(
                    f"We got an id_mat of size 0.\nFile index = {file_idx}."
                    f"\nBook file path = {in_file_path}.")

            # Save one id matrix per chunk, prefixed with the chunk index
            logging.debug("Saving >")
            path = pathlib.Path(out_file_path)
            np.save(path.parent / f"{chunk_idx}_{path.name}", id_mat)
            logging.debug("< Done saving.")

    # Free the model
    if blingfire:
        blingfire.free_model(idtok_model)