def expand_location(path):
    # if isinstance(path, file):
    if is_file(path):
        path = path.name
    if path.startswith('https://s3-external-1.amazonaws.com/') or path.startswith('https://s3.amazonaws.com/'):
        # it's being downloaded from a bucket, no worries
        return path

    # resolve any symlinks
    # the backfill uses symlinks to the article-xml dir
    path = os.path.abspath(os.path.realpath(path))

    if re.match(r".*article-xml/articles/.+\.xml$", path):
        # this article is coming from the local ./article-xml/ directory, which
        # is almost certainly a git checkout. we want a location that looks like:
        # https://raw.githubusercontent.com/elifesciences/elife-article-xml/5f1179c24c9b8a8b700c5f5bf3543d16a32fbe2f/articles/elife-00003-v1.xml
        rc, rawsha = utils.run_script(["cat", "elife-article-xml.sha1"])
        ensure(rc == 0, "failed to read the contents of './elife-article-xml.sha1'")
        sha = rawsha.strip()
        fname = os.path.basename(path)
        return "https://raw.githubusercontent.com/elifesciences/elife-article-xml/%s/articles/%s" % (sha, fname)

    # who knows what this path is ...
    LOG.warning(
        "scraping article content in a non-repeatable way. "
        "path %r not found in article-xml dir. please don't send the results to lax", path)
    return path
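# A minimal usage sketch (hypothetical paths; assumes the is_file, utils,
# ensure and LOG helpers that expand_location relies on are in scope):
#
#     expand_location("https://s3.amazonaws.com/example-bucket/elife-00003-v1.xml")
#     # -> returned unchanged: bucket URLs are already repeatable locations
#
#     expand_location("article-xml/articles/elife-00003-v1.xml")
#     # -> "https://raw.githubusercontent.com/elifesciences/elife-article-xml/<sha>/articles/elife-00003-v1.xml"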
def load_files(files, decoder):
    fileObjects = []
    for f in files:
        if exists(f) and is_file(f):
            obj = json.loads(stringfile(f), object_hook=decoder)
            fileObjects.append(obj)
    return fileObjects
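# A minimal sketch of a decoder usable as the object_hook above. The News
# class and its 'title' field are hypothetical stand-ins for the project's
# actual types:

class News(object):
    def __init__(self, title):
        self.title = title

    def set_id(self, news_id):
        self.id = news_id


def news_decoder(dct):
    """json object_hook: turn {'title': ...} dicts into News objects."""
    if 'title' in dct:
        return News(dct['title'])
    return dct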
def _rename_file(self):
    """Renames an image 'sequence' that consists of a single file.

    If self.input_dir is the same as self.output_dir the original file
    gets renamed, otherwise the file gets renamed and moved to the new
    location.
    """
    _, ext = os.path.splitext(self.input_fname)
    out_root = self.output_root
    if "%%" in out_root:
        out_root = out_root.replace("%%", "%")
    destination_path = os.path.join(self.output_dir, out_root + ext)
    if compare(self.input_dir, self.output_dir):
        # rename files only
        shutil.move(self.input_path, destination_path)
    else:
        # move and rename files
        shutil.copy(self.input_path, destination_path)
    rc, msg = is_file(destination_path)
    if not rc:
        e = "Unable to move and rename file from {}. "
        if compare(self.input_dir, self.output_dir):
            e = "Unable to rename file from {}. "
        e += msg
        raise RuntimeError(e.format(self.input_dir))
def restore(model_path=None, filename='model.ckpt'):
    if model_path is None:
        model_path = model_dirname
    if is_file(pathname(model_path, 'checkpoint')):
        saver.restore(sess, pathname(model_path, filename))
        print('Layer weights restored')
        return True
    else:
        print('Layer weights not restored')
        return False
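# Usage sketch (assumes the module-level sess, saver, model_dirname and
# pathname helpers that restore() closes over; TensorFlow 1.x style):
#
#     if not restore():
#         sess.run(tf.global_variables_initializer())  # no checkpoint: start fresh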
def get_start_number(self):
    """Gets the start number of the image sequence.

    If the sequence does not start at zero, a smaller start number is
    searched for.
    """
    other_num = -1
    if not self.starts_at_zero():
        for i in range(self.start_num - 1, -1, -1):
            fname = self.dfs_filename % i
            fpath = os.path.join(self.basedir, fname)
            if is_file(fpath)[0]:
                other_num = i
    return self.start_num if other_num < 0 else other_num
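# The dfs_filename is a digit-format-specifier filename such as
# "shot_%04d.png" (hypothetical name), so the lookup above is plain
# %-formatting:
#
#     "shot_%04d.png" % 7   # -> "shot_0007.png"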
def __init__(self, path_or_file, output_file_name, id_scan_results,
             file_sum_info, doc_type):
    self.is_file = utils.is_file(path_or_file)
    self.path_or_file = os.path.expanduser(path_or_file)
    self.file_to_scan = None
    if self.is_file:
        self.file_to_scan = path_or_file
        self.path_or_file = os.path.dirname(path_or_file)
    self.output_file_name = output_file_name + utils.FILE_SUFFIX
    self.id_scan_results = id_scan_results
    self.doc_type = doc_type
    self.output_file = None
    self.code_extra_params = utils.get_codebase_extra_params(self.path_or_file)
    self.full_file_path = None
    self.spdx_document = None
def __init__(self, path, supp_formats=None):
    """Inits this ImageSequence."""
    self.path = path
    test, error = is_file(self.path, supp_formats)
    if not test:
        raise IOError(error)
    self.basedir, self.filename = os.path.split(self.path)
    # Raise error for filenames with percent symbols
    if "%" in self.filename:
        e = "The specified filename '{}' is not FFmpeg compliant. " \
            .format(self.filename)
        e += "It includes one or more % characters, "
        e += "which are currently not allowed."
        raise ValueError(e)
    self.root, self.ext = os.path.splitext(self.filename)
    self.head, self.mid, self.tail = split_at_digit(self.root).values()
    # Gets all files from the base directory with the same extension as path
    self.files = fetch_dir(self.basedir, self.ext.strip(".").upper())
    self.fcount = len(self.files)
    self.sequence_files = None
    self.messages = []
    if self.fcount < 2:  # single image
        # Set filename as digit format specifier filename
        self.dfs_filename = self.filename
        # Set the sequence start number to -1 to indicate missing sequence
        self.start_num = -1
        self.messages.append(
            "Unable to find image sequence files, " +
            "other than '{}' at the specified path.".format(self.filename))
    else:  # image sequence
        # Get digit format specifier filename and initial image sequence number
        self.dfs_filename, self.start_num = self._parse_filename_pattern()
        # Verify the initial image number as real sequence start number
        self.start_num = self.get_start_number()
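# Usage sketch (hypothetical path; assumes a directory holding
# shot_0001.png, shot_0002.png, ...):
#
#     seq = ImageSequence("shots/shot_0001.png")
#     print(seq.get_start_number())   # -> 1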
def load_news(path, decoder):
    newsObjectsList = None
    newsFiles = []
    if not is_file(path):
        newsFiles = ls(fix_dirpath(path) + "*.json", pattern=True)
    else:
        newsFiles = [path]
    if len(newsFiles) <= 0:
        print("No files found")
        return None
    newsObjectsList = load_files(newsFiles, decoder)
    if len(newsObjectsList) <= 0:
        return None
    collectedReads = []
    globalId = 0
    for i in newsObjectsList:
        for a in i:
            globalId += 1
            a.set_id(globalId)
            collectedReads.append(a)
    return collectedReads
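# Usage sketch (hypothetical path; assumes a JSON object_hook such as the
# news_decoder sketched after load_files above):
#
#     articles = load_news("data/news/", news_decoder)
#     # loads every *.json under data/news/, assigns sequential ids via
#     # set_id(), and returns the flattened list of article objects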
def _predict_sequence(self, body, strict=False):
    """Predicts the part(s) of a filename root that is a/are number
    sequence(s), by comparing each of the components to the corresponding
    components of the filename roots inside the base directory.

    Args:
        body (list): Head, midsection, and tail of a filename root
        strict (bool): True, to check against all filename roots in the
            base directory, or by default False, to only compare to a
            minimum amount

    Raises:
        RuntimeError: Unable to predict number sequence for 'self.filename'

    Returns:
        A list of indices of the predicted body components.
    """
    # Get the max. number of files from the base directory to check
    max_count = int(
        "1" + "".join(["0" for _ in range(len(str(self.fcount)) - 1)]))
    max_count = max_count if max_count > 10 else self.fcount
    # Predict the sequence(s)
    si = 0  # string start index of body component
    predicted = []  # indices of predicted sequence body components
    for i in range(len(body)):
        cln = len(body[i])  # string length of body component
        bln = sum([len(c) for c in body])  # total body string length
        total_dist = 0  # total Levenshtein distance
        count = 0  # number of checked files
        for f in self.files:
            if not strict and count >= max_count:
                break  # skip the remaining files
            fpath = os.path.join(self.basedir, f)
            if is_file(fpath, self.ext.strip(".").upper())[0] \
                    and f != self.filename:
                froot, fext = os.path.splitext(f)
                # Get the string length difference between the corresponding
                # froot string length and the total body string length
                ld = len(froot) - bln
                # Get the froot segment to compare to the body component
                froot_seg = froot[si:si + cln]
                # Calculate the character offset between the body component and
                # the possibly longer froot segment (of a non-zero padded sequence)
                char_offset = (cln + ld) - cln
                if char_offset > 0:  # len(froot_seg) > len(body[i])
                    for j in range(char_offset):
                        idx = si + cln + j
                        if not froot[idx].isdigit():
                            break
                        froot_seg += froot[idx]  # add digits only
                # Compute the Levenshtein distance
                dist = levenshtein_distance(froot_seg, body[i])
                total_dist += dist
                count += 1
        # Distances greater than zero mean a high sequence probability
        # because this body component changes from root to root, and
        # the digit verification filters out indices of body components
        # that are not numbers, especially important for roots with
        # non-zero padded sequences, because the absence of padding
        # makes the root length bigger for higher ranging roots, and
        # thus a distance greater than 0 gets erroneously predicted
        if total_dist > 0 and body[i].isdigit():
            predicted.append(i)
        # Increment si to the start index of the next body component
        si += cln
    # if len(predicted) < 1 or len(predicted) >= len(body):
    if len(predicted) < 1 or len(predicted) > len(body):
        raise RuntimeError(
            "Unable to predict number sequence for '{}'".format(
                self.filename))
    return predicted
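# Standalone illustration of the idea behind _predict_sequence (the module's
# levenshtein_distance helper is assumed to compute plain edit distance, as
# sketched below). Across roots like "shot_0001", "shot_0002", ... only the
# digit component accumulates a non-zero distance, so it is the component
# predicted to be the sequence counter.

def _edit_distance(a, b):
    """Plain dynamic-programming Levenshtein distance."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]

body = ["shot_", "0001"]  # head and digit component of "shot_0001" (hypothetical)
other_root = "shot_0002"
print(_edit_distance(other_root[:5], body[0]))  # 0 -> head is stable across roots
print(_edit_distance(other_root[5:], body[1]))  # 1 -> digits change: the sequence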
def main():
    """
    Main function to parse the arguments and call the main process.

    TODO:
        - add support for output target selection (from names);
        - replace the default config by embedding it in the config module.
    """
    parser = argparse.ArgumentParser(
        description='Models for MER | Evaluation with nested cross-validation.')
    parser.add_argument('mode', choices=['baseline', 'emomucs'],
                        help='Either running the baselines or emomucs.')
    parser.add_argument(
        'model', choices=['deezeremo', 'vggemonet', 'vggexp', 'unified'],
        help='The name of the model to train and evaluate.')
    parser.add_argument(
        'audio_features', type=lambda x: is_file(parser, x),
        help='Path to the file containing the audio features of the dataset.')
    parser.add_argument(
        'va_annotations', type=lambda x: is_file(parser, x),
        help='Path to the csv file with the Valence-Arousal annotations.')
    parser.add_argument(
        'nestedcv_folds', type=lambda x: is_file(parser, x),
        help='Path to the file containing the dataset split for nested cv.')
    # The following arguments are specifically for using Emomucs
    parser.add_argument(
        '--sources', action='store', nargs='+',
        help=f'One or more source names from {SOURCE_NAMES} for Emomucs')
    parser.add_argument(
        '--fusion', action='store', type=str,
        choices=['early', 'mid', 'late'], default="early",
        help='Feature fusion technique to use in Emomucs.')
    parser.add_argument(
        '--finetuning', action='store_true', default=False,
        help='Whether fine-tuning each source model already trained separately.')
    parser.add_argument(
        '--training', action='store', choices=['joint', 'load'], default='load',
        help='Train the source-models from scratch in emomucs or load them.')
    parser.add_argument(
        '--dropouts', action='store', nargs='+',
        help='Dropout probs for the fully-connected layers of Emomucs.')
    parser.add_argument(
        '--units_per_source', type=int, action='store', default=32,
        help='Number of fully-connected units per source in Emomucs.')
    parser.add_argument(
        '--log_dir', action='store',
        help='Directory where log files will be generated.')
    parser.add_argument(
        '--result_dir', action='store',
        help='Where the evaluation results will be saved.')
    parser.add_argument(
        '--checkpoint_dir', action='store',
        help='Where the model checkpoints will be saved.')
    parser.add_argument(
        '--write', action='store_true', default=False,
        help='Whether to write the results to disk or not.')
    parser.add_argument(
        '--checkpointing', action='store_true', default=False,
        help='Whether the model state dict will be saved at every fold.')
    parser.add_argument(
        '--config', action='store', type=str,
        default=os.path.join(prj_abp, 'example_config.ini'),
        help='File containing the specification of the hyperparameters.')
    parser.add_argument(
        '--sel_output_features', action='store', nargs='+',
        help='Optional list of the output features names for regression.')
    parser.add_argument(
        '--folds', action='store', nargs='+',
        help='List of outer folds to process (for parallel execution mode).')
    parser.add_argument(
        '--num_workers', action='store', type=int, default=0,
        help='Number of workers for data loading.')
    parser.add_argument(
        '--dtype', action='store', choices=['d', 'f'], default='f',
        help='Data type of tensors to process. Default: f (float).')
    parser.add_argument(
        '--device', action='store',
        help='Device to use for training and validation. Default: cpu.')
    parser.add_argument(
        '--seed', action='store', type=int,
        help='Random seed for the reproducibility of the experiment.')
    parser.add_argument(
        '--debug', action='store_true', default=False,
        help='Whether to activate the debug mode for exp checking.')

    args = parser.parse_args()

    # TODO: check the consistency of the parameters ...
    assert args.sources is None or all(
        [name in SOURCE_NAMES for name in args.sources])
    args.dropouts = [.5, .5] if args.dropouts is None \
        else [float(drop) for drop in args.dropouts[:2]]
    args.folds = [int(fold_no) for fold_no in args.folds] \
        if args.folds is not None else None

    # Filling missing/optional values if not provided ...
    args.dtype = torch.double if args.dtype == 'd' else torch.float
    args.device = torch.device('cpu') if args.device is None \
        else torch.device(args.device)

    # setting the random seed for all modules
    if args.seed is None:
        args.seed = random.randint(1, 10000)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.device.type == 'cuda':
        torch.cuda.manual_seed_all(args.seed)
        # torch.cuda.set_device(args.device)

    if args.log_dir is None:
        args.log_dir = os.path.join(
            os.path.dirname(args.va_annotations), 'logdir')
    activate_logging(args.log_dir)

    args.result_dir = create_dir(os.path.join(args.log_dir, "results_ft")) \
        if args.result_dir is None else create_dir(args.result_dir)
    args.checkpoint_dir = create_dir(os.path.join(args.log_dir, "checkpoints")) \
        if args.checkpoint_dir is None else create_dir(args.checkpoint_dir)
    args.prediction_dir = create_dir(
        os.path.join(args.result_dir, "predictions"))

    print('Using {} (tensor type: {}) with random seed {} |'
          ' Running fold(s): {} and logging in: {}'.format(
              args.device, args.dtype, args.seed,
              'all' if args.folds is None else args.folds, args.log_dir))

    run_nested_cross_validation_exp(args)
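# The three positional arguments above pass type=lambda x: is_file(parser, x);
# that validator is not defined in this snippet. A minimal sketch of what it
# is assumed to do (the exact error message is a guess):

def is_file(parser, path):
    """argparse type-checker: return the path if it is an existing file,
    otherwise abort with a parser error."""
    if not os.path.isfile(path):
        parser.error("The file {} does not exist!".format(path))
    return path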
def rename_sequence_files(self, start_num=0):
    """Batch renames image sequence files.

    If self.input_dir is the same as self.output_dir the original
    sequence files get renamed, otherwise the files get renamed and
    moved to the new location.

    Args:
        start_num: Optional sequence start number, by default 0

    Raises:
        RuntimeError: Batch renaming is currently only supported for
            files that are part of an image sequence
        OSError: ...
        RuntimeError: Unable to move and rename any files from
            'self.input_dir...'
        RuntimeError: Unable to rename any files from 'self.input_dir...'
    """
    if self.im is None:
        raise RuntimeError(
            "Batch renaming is currently only supported \n" +
            "for files that are part of an image sequence")
    if self.im.get_start_number() < 0:  # single image
        return self._rename_file()
    # Create a temporary directory to rename the files in
    dt = datetime.now()
    tmp_dir = os.path.join(self.input_dir, "tmp" + dt.strftime("%y%m%d%H%M%S"))
    os.mkdir(tmp_dir)
    files = self.im.get_sequence_files()
    dfs_fname = self.im.get_dfs_filename()
    idx, _ = extract_digit_format_specifiers(dfs_fname)
    re_fname = self.im.get_regex_filename()
    width = len(str(len(files) + start_num))  # file count width for zero padding
    count = start_num  # number of renamed and moved files
    # Rename and move the files to the temporary directory
    tmp_filepaths = []
    for f in files:
        source_fpath = os.path.join(self.input_dir, f)
        if not os.path.isfile(source_fpath):
            continue
        match = re.match(re_fname, f)
        if match is None:
            continue
        root, ext = os.path.splitext(f)
        stripped_root = ""
        if self.im.get_start_number() >= 0:
            stripped_root = root.replace(match.group(1), "")
        out_root = self.output_root
        if "%%" in out_root:
            out_root = out_root.replace("%%", "%")
        root = root.replace(match.group(1), str(count).zfill(width))
        if stripped_root != out_root:
            if idx < int(len(dfs_fname) / 2):
                root = str(count).zfill(width) + out_root
            else:
                root = out_root + str(count).zfill(width)
        tmp_fpath = os.path.join(tmp_dir, root + ext)
        if compare(self.input_dir, self.output_dir):
            # rename files only
            shutil.move(source_fpath, tmp_fpath)
        else:
            # move and rename files
            shutil.copy(source_fpath, tmp_fpath)
        count += 1
        tmp_filepaths.append(tmp_fpath)
    if count - start_num == 0:
        e = "Unable to move and rename any files from {}. "
        if compare(self.input_dir, self.output_dir):
            e = "Unable to rename any files from {}. "
        rc, msg = is_file(self.input_path)
        if not rc:
            e += msg
        delete(tmp_dir)  # clean up the still empty temporary directory
        raise RuntimeError(e.format(self.input_dir))
    # Move the renamed files from the temporary to the output directory
    count = 0
    for fpath in tmp_filepaths:
        _, fname = os.path.split(fpath)
        destination_fpath = os.path.join(self.output_dir, fname)
        shutil.move(fpath, destination_fpath)
        count += 1
    if count == 0:
        e = "Unable to move renamed files from the temporary directory "
        e += "to {}. The renamed files can probably be recovered from {}"
        raise RuntimeError(e.format(self.output_dir, tmp_dir))
    # Delete the empty temporary directory
    delete(tmp_dir)
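# Hypothetical usage sketch (the owning class is not shown in this snippet;
# the constructor arguments below are assumptions based on the attributes
# the method reads):
#
#     renamer = SomeRenamer(input_path="shots/shot_0001.png",
#                           output_dir="renamed", output_root="take_")
#     renamer.rename_sequence_files(start_num=1)
#     # the sequence is renumbered from 1 (zero-padded to the file count
#     # width) and written to the output directory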