Example 1
def expand_location(path):
    # if isinstance(path, file):
    if is_file(path):
        path = path.name

    if path.startswith('https://s3-external-1.amazonaws.com/'
                       ) or path.startswith('https://s3.amazonaws.com/'):
        # it's being downloaded from a bucket, no worries
        return path

    # resolve any symlinks
    # the backfill uses symlinks to the article-xml dir
    path = os.path.abspath(os.path.realpath(path))

    if re.match(r".*article-xml/articles/.+\.xml$", path):
        # this article is coming from the local ./article-xml/ directory, which
        # is almost certainly a git checkout. we want a location that looks like:
        # https://raw.githubusercontent.com/elifesciences/elife-article-xml/5f1179c24c9b8a8b700c5f5bf3543d16a32fbe2f/articles/elife-00003-v1.xml
        rc, rawsha = utils.run_script(["cat", "elife-article-xml.sha1"])
        ensure(rc == 0,
               "failed to read the contents of './elife-article-xml.sha1'")
        sha = rawsha.strip()
        fname = os.path.basename(path)
        return "https://raw.githubusercontent.com/elifesciences/elife-article-xml/%s/articles/%s" % (
            sha, fname)

    # who knows what this path is ...
    LOG.warn(
        "scraping article content in a non-repeatable way. path %r not found in article-xml dir. please don't send the results to lax",
        path)
    return path
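The three branches above map any input to a stable URL where possible. A couple of hypothetical invocations (assuming the module's helpers such as utils.run_script, ensure, and LOG are importable):

# Bucket URLs pass through unchanged:
expand_location('https://s3.amazonaws.com/some-bucket/elife-00003-v1.xml')
# -> the same URL

# A path under article-xml/articles/ is pinned to the checkout's commit sha:
expand_location('article-xml/articles/elife-00003-v1.xml')
# -> 'https://raw.githubusercontent.com/elifesciences/elife-article-xml/<sha>/articles/elife-00003-v1.xml'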
Example 2
def load_files(files, decoder):
    fileObjects = []
    for f in files:
        if exists(f) and is_file(f):
            obj = json.loads(stringfile(f), object_hook=decoder)
            fileObjects.append(obj)
    return fileObjects
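load_files leans on three helpers that are not shown (exists, is_file, stringfile). A minimal sketch of plausible stand-ins, assuming a boolean path check and a read-whole-file helper:

import os

def exists(path):
    # Plausible stand-in: the path exists at all.
    return os.path.exists(path)

def is_file(path):
    # Plausible stand-in: the path is a regular file.
    return os.path.isfile(path)

def stringfile(path):
    # Plausible stand-in: return the file's contents as one string.
    with open(path) as fh:
        return fh.read()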
Example 3
    def _rename_file(self):
        """Batch renames an image sequence of a single file. If self.input_dir
            is the same as self.output_dir the original file gets renamed,
            otherwise the file gets renamed and moved to the new location."""
        _, ext = os.path.splitext(self.input_fname)
        out_root = self.output_root

        if "%%" in out_root:
            out_root = out_root.replace("%%", "%")

        destination_path = os.path.join(self.output_dir, out_root + ext)
        if compare(self.input_dir, self.output_dir):  # rename files only
            try:
                shutil.move(self.input_path, destination_path)
            except OSError as e:
                raise e
        else:  # move and rename files
            try:
                shutil.copy(self.input_path, destination_path)
            except OSError as e:
                raise e

        rc, msg = is_file(destination_path)
        if not rc:
            e = "Unable to move and rename file from {}. "
            if compare(self.input_dir, self.output_dir):
                e = "Unable to rename file from {}. "
            e += msg
            raise RuntimeError(e.format(self.input_dir))
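Here is_file returns an (ok, message) pair rather than a plain boolean, as the rc, msg unpacking shows. A hedged sketch of that variant; the optional supp_formats parameter matches the two-argument calls in the later examples:

import os

def is_file(path, supp_formats=None):
    # Sketch of the tuple-returning variant: (True, "") on success,
    # (False, reason) otherwise. supp_formats is an optional collection
    # of upper-case extensions such as ["PNG", "JPG"].
    if not os.path.isfile(path):
        return False, "'{}' does not exist or is not a file.".format(path)
    ext = os.path.splitext(path)[1].strip(".").upper()
    if supp_formats and ext not in supp_formats:
        return False, "'{}' is not a supported format.".format(ext)
    return True, ""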
Example 4
def restore(model_path=None, filename='model.ckpt'):
    if model_path is None:
        model_path = model_dirname
    if is_file(pathname(model_path, 'checkpoint')):
        saver.restore(sess, pathname(model_path, filename))
        print('Layers weights restored')
        return True
    else:
        print('Layers weights not restored')
        return False
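restore closes over saver, sess, model_dirname, and pathname, none of which appear in the snippet. A plausible surrounding context, assuming a TensorFlow 1.x session-style setup and pathname as an alias for os.path.join:

import os
import tensorflow as tf  # assumes the TensorFlow 1.x API

pathname = os.path.join        # plausible alias used by the snippet
model_dirname = 'checkpoints'  # hypothetical default checkpoint directory
# (a real graph with variables must be built before creating the Saver)
sess = tf.Session()
saver = tf.train.Saver()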
Example 5
    def get_start_number(self):
        """Gets the start number of the image sequence. If the sequence does
            not start at zero, a smaller start number is searched for."""
        other_num = -1
        if not self.starts_at_zero():
            for i in xrange(self.start_num - 1, -1, -1):
                fname = self.dfs_filename % i
                fpath = os.path.join(self.basedir, fname)
                if is_file(fpath)[0]:
                    other_num = i
        return self.start_num if other_num < 0 else other_num
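Note that the loop never breaks early, so other_num keeps being overwritten and ends at the smallest index for which a file exists. A hypothetical trace:

# With start_num = 5 and frame0000..frame0004 present on disk,
# i runs 4, 3, 2, 1, 0; each passes is_file(fpath)[0], so other_num
# ends at 0 and get_start_number() returns the true start, 0.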
Example 6
    def __init__(
        self, path_or_file, output_file_name, id_scan_results, file_sum_info, doc_type
    ):
        self.is_file = utils.is_file(path_or_file)
        self.path_or_file = os.path.expanduser(path_or_file)
        self.file_to_scan = None
        if self.is_file:
            self.file_to_scan = path_or_file
            self.path_or_file = os.path.dirname(path_or_file)
        self.output_file_name = output_file_name + utils.FILE_SUFFIX
        self.id_scan_results = id_scan_results
        self.doc_type = doc_type
        self.output_file = None
        self.code_extra_params = utils.get_codebase_extra_params(self.path_or_file)
        self.full_file_path = None
        self.spdx_document = None
Example 7
    def __init__(self, path, supp_formats=None):
        """Inits this ImageSequence."""
        self.path = path
        test, error = is_file(self.path, supp_formats)
        if not test:
            raise IOError(error)

        self.basedir, self.filename = os.path.split(self.path)

        # Raise error for filenames with percent symbols
        if "%" in self.filename:
            e = "The specified filename '{}' is not FFmpeg compliant. " \
              .format(self.filename)
            e += "It includes one or more % characters, "
            e += "which are currently not allowed."
            raise ValueError(e)

        self.root, self.ext = os.path.splitext(self.filename)
        self.head, self.mid, self.tail = split_at_digit(self.root).values()

        # Gets all files from the base directory with the same extension as path
        self.files = fetch_dir(self.basedir, self.ext.strip(".").upper())
        self.fcount = len(self.files)
        self.sequence_files = None
        self.messages = []
        if self.fcount < 2:  # single image
            # Set filename as digit format specifier filename
            self.dfs_filename = self.filename
            # Set the sequence start number to -1 to indicate missing sequence
            self.start_num = -1
            self.messages.append(
                "Unable to find image sequence files, " +
                "other than '{}' at the specified path.".format(self.filename))
        else:  # image sequence
            # Get digit format specifier filename and initial image sequence number
            self.dfs_filename, self.start_num = self._parse_filename_pattern()
            # Verify the initial image number as real sequence start number
            self.start_num = self.get_start_number()
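Hypothetical usage, assuming the class and its helpers are importable and /renders/shot01 holds frame0001.png through frame0120.png:

seq = ImageSequence("/renders/shot01/frame0001.png", supp_formats=["PNG"])
print(seq.dfs_filename)  # e.g. "frame%04d.png", the digit-format-specifier name
print(seq.start_num)     # e.g. 1, or a smaller verified start number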
Example 8
def load_news(path, decoder):
    newsObjectsList = None
    newsFiles = []
    if not is_file(path):
        newsFiles = ls(fix_dirpath(path) + "*.json", pattern=True)
    else:
        newsFiles = [path]

    if len(newsFiles) <= 0:
        print("No files found")
        return None

    newsObjectsList = load_files(newsFiles, decoder)
    if len(newsObjectsList) <= 0:
        return None

    collectedReads = []
    globalId = 0
    for i in newsObjectsList:
        for a in i:
            globalId += 1
            a.set_id(globalId)
            collectedReads.append(a)
    return collectedReads
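A sketch of a decoder that fits the set_id() call above, assuming load_news and its helpers (ls, fix_dirpath) are importable; the Article class is hypothetical and stands in for whatever the JSON files actually encode:

class Article(object):
    # Hypothetical news item with the set_id() method used above.
    def __init__(self, **fields):
        self.fields = fields
        self.id = None

    def set_id(self, new_id):
        self.id = new_id

# json.loads(..., object_hook=decode_article) promotes each JSON object
# to an Article, so each file yields a list of Article instances.
def decode_article(d):
    return Article(**d)

collected = load_news("news/", decode_article)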
Example 9
    def _predict_sequence(self, body, strict=False):
        """Predicts the part(s) of a filename root that is a/are number
            sequence(s), by comparing each of the components to the
            corresponding components of the filename roots inside the
            base directory.

        Args:
          body (list): Head, midsection, and tail of a filename root
          strict (bool): True, to check against all filename roots in
            the base directory, or by default False, to only compare
            to a minimum amount

        Raises:
          RuntimeError: Unable to predict number sequence for 'self.filename'

        Returns:
          A list of indices of the predicted body components.
        """
        # Get the max. number of files from the base directory to check
        max_count = int(
            "1" + "".join(["0" for _ in range(len(str(self.fcount)) - 1)]))
        max_count = max_count if max_count > 10 else self.fcount
        # Predict the sequence(s)
        si = 0  # string start index of body component
        predicted = []  # indices of predicted sequence body components

        for i in xrange(len(body)):
            cln = len(body[i])  # string length of body component
            bln = sum([len(c) for c in body])  # total body string length
            total_dist = 0  # total Levenshtein distance
            count = 0  # number of checked files

            for f in self.files:
                if not strict and count >= max_count:
                    break  # skip the remaining files
                fpath = os.path.join(self.basedir, f)
                if is_file(fpath, self.ext.strip(".").upper())[0] \
                        and f != self.filename:
                    froot, fext = os.path.splitext(f)
                    # Get the string length difference between the corresponding
                    # froot string length and the total body string length
                    ld = len(froot) - bln
                    # Get the froot segment to compare to the body component
                    froot_seg = froot[si:si + cln]
                    # Calculate the character offset between the body component and
                    # the possibly longer froot segment (of a non-zero padded sequence)
                    char_offset = (cln + ld) - cln
                    if char_offset > 0:  # len(froot_seg) > len(body[i])
                        for j in range(char_offset):
                            idx = si + cln + j
                            if not froot[idx].isdigit():
                                break
                            froot_seg += froot[idx]  # add digits only
                    # Compute the Levenshtein distance
                    dist = levenshtein_distance(froot_seg, body[i])
                    total_dist += dist
                count += 1

            # A distance greater than zero signals a likely sequence,
            # because that body component changes from root to root. The
            # isdigit() check below filters out components that are not
            # numbers; this matters especially for non-zero-padded
            # sequences, where missing padding makes higher-numbered roots
            # longer and would otherwise yield a spurious nonzero distance.
            if total_dist > 0 and body[i].isdigit():
                predicted.append(i)
            # Increment si to the start index of the next body component
            si += cln

        # if len(predicted) < 1 or len(predicted) >= len(body):
        if len(predicted) < 1 or len(predicted) > len(body):
            raise RuntimeError(
                "Unable to predict number sequence for '{}'".format(
                    self.filename))
        return predicted
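The max_count expression is just the file count rounded down to a power of ten; a worked illustration:

# self.fcount = 250 -> len(str(250)) - 1 == 2 zeros -> int("100") == 100
# self.fcount = 7   -> 0 zeros -> int("1") == 1, which is not > 10,
# so the guard on the next line falls back to checking all 7 files.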
Example 10
def main():
    """
    Main function to parse the arguments and call the main process.
    
    TODO:
        - add support for output target selection (from names);
        - replace the default config by embedding it in the config module.
    """

    parser = argparse.ArgumentParser(
        description='Models for MER | Evaluation with nested cross-validation.'
    )

    parser.add_argument('mode',
                        choices=['baseline', 'emomucs'],
                        help='Either running the baselines or emomucs.')
    parser.add_argument(
        'model',
        choices=['deezeremo', 'vggemonet', 'vggexp', 'unified'],
        help='The name of the model to train and evaluate.')
    parser.add_argument(
        'audio_features',
        type=lambda x: is_file(parser, x),
        help='Path to the file containing the audio features of the dataset.')
    parser.add_argument(
        'va_annotations',
        type=lambda x: is_file(parser, x),
        help='Path to the csv file with the Valence-Arousal annotations.')
    parser.add_argument(
        'nestedcv_folds',
        type=lambda x: is_file(parser, x),
        help='Path to the file containing the dataset split for nested cv.')

    # The following arguments are specifically for using Emomucs
    parser.add_argument(
        '--sources',
        action='store',
        nargs='+',
        help=f'One or more source names from {SOURCE_NAMES} for Emomucs')
    parser.add_argument('--fusion',
                        action='store',
                        type=str,
                        choices=['early', 'mid', 'late'],
                        default="early",
                        help='Feature fusion technique to use in Emomucs.')
    parser.add_argument(
        '--finetuning',
        action='store_true',
        default=False,
        help='Whether to fine-tune each source model already trained separately.'
    )
    parser.add_argument(
        '--training',
        action='store',
        choices=['joint', 'load'],
        default='load',
        help='Train the source-models from scratch in emomucs or load them.')
    parser.add_argument(
        '--dropouts',
        action='store',
        nargs='+',
        help='Dropout probs for the fully-connected layers of Emomucs.')
    parser.add_argument(
        '--units_per_source',
        type=int,
        action='store',
        default=32,
        help='Number of fully-connected units per source in Emomucs.')

    parser.add_argument('--log_dir',
                        action='store',
                        help='Directory where log files will be generated.')
    parser.add_argument('--result_dir',
                        action='store',
                        help='Where the evaluation results will be saved.')
    parser.add_argument('--checkpoint_dir',
                        action='store',
                        help='Where the model checkpoints will be saved.')
    parser.add_argument('--write',
                        action='store_true',
                        default=False,
                        help='Whether to write the results to disk or not.')
    parser.add_argument(
        '--checkpointing',
        action='store_true',
        default=False,
        help='Whether the model state dict will be saved at every fold.')
    parser.add_argument(
        '--config',
        action='store',
        type=str,
        default=os.path.join(prj_abp, 'example_config.ini'),
        help='File containing the specification of the hyperparameters.')

    parser.add_argument(
        '--sel_output_features',
        action='store',
        nargs='+',
        help='Optional list of the output features names for regression.')
    parser.add_argument(
        '--folds',
        action='store',
        nargs='+',
        help='List of outer folds to process (for parallel execution mode).')
    parser.add_argument('--num_workers',
                        action='store',
                        type=int,
                        default=0,
                        help='Number of workers for data loading.')
    parser.add_argument(
        '--dtype',
        action='store',
        choices=['d', 'f'],
        default='f',
        help='Data type of tensors to process. Default: f (float).')
    parser.add_argument(
        '--device',
        action='store',
        help='Device to use for training and validation. Default: cpu.')
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help='Random seed for the reproducibility of the experiment.')
    parser.add_argument(
        '--debug',
        action='store_true',
        default=False,
        help='Whether to activate the debug mode for exp checking.')

    args = parser.parse_args()

    # TODO: check the consistency of the parameters ...
    assert args.sources is None or all(
        [name in SOURCE_NAMES for name in args.sources])
    args.dropouts = [.5, .5] if args.dropouts is None \
        else [float(drop) for drop in args.dropouts[:2]]
    args.folds = [int(fold_no) for fold_no in args.folds
                  ] if args.folds is not None else None

    # Filling missing/optional values if not provided ...
    args.dtype = torch.double if args.dtype == 'd' else torch.float
    args.device = torch.device('cpu') if args.device is None else torch.device(
        args.device)

    # setting the random seed for all modules
    if args.seed is None:
        args.seed = random.randint(1, 10000)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    if args.device.type == 'cuda':
        torch.cuda.manual_seed_all(args.seed)
        # torch.cuda.set_device(args.device)

    if args.log_dir is None:
        args.log_dir = os.path.join(os.path.dirname(args.va_annotations),
                                    'logdir')
    activate_logging(args.log_dir)

    args.result_dir = create_dir(os.path.join(args.log_dir, "results_ft")) \
        if args.result_dir is None else create_dir(args.result_dir)
    args.checkpoint_dir = create_dir(os.path.join(args.log_dir, "checkpoints")) \
        if args.checkpoint_dir is None else create_dir(args.checkpoint_dir)
    args.prediction_dir = create_dir(
        os.path.join(args.result_dir, "predictions"))

    print('Using {} (tensor type: {}) with random seed {} |'
          ' Running fold(s): {} and logging in: {}'.format(
              args.device, args.dtype, args.seed,
              'all' if args.folds is None else args.folds, args.log_dir))

    run_nested_cross_validation_exp(args)
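Here is_file serves as an argparse type validator, called as is_file(parser, x). A hedged sketch of that variant:

import os

def is_file(parser, arg):
    # Sketch of the argparse-validator variant: abort argument parsing
    # with a readable error when the path is not an existing file.
    if not os.path.isfile(arg):
        parser.error("The file {} does not exist!".format(arg))
    return arg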
Example 11
    def rename_sequence_files(self, start_num=0):
        """Batch renames image sequence files. If self.input_dir is the same
            as self.output_dir the original sequence files get renamed,
            otherwise the files get renamed and moved to the new location.

        Args:
          start_num: Optional sequence start number, by default 0

        Raises:
          RuntimeError: Batch renaming is currently only supported
            for files that are part of an image sequence
          OSError: ...
          RuntimeError: Unable to move and rename any files from 'self.input_dir...'
          RuntimeError: Unable to rename any files from 'self.input_dir...'
        """
        if self.im is None:
            raise RuntimeError(
                "Batch renaming is currently only supported \n" +
                "for files that are part of an image sequence")

        if self.im.get_start_number() < 0:  # single image
            return self._rename_file()

        # Create a temporary directory to rename the files in
        dt = datetime.now()
        tmp_dir = os.path.join(self.input_dir,
                               "tmp" + dt.strftime("%y%m%d%H%M%S"))
        try:
            os.mkdir(tmp_dir)
        except OSError as e:
            raise e

        files = self.im.get_sequence_files()
        dfs_fname = self.im.get_dfs_filename()
        idx, _ = extract_digit_format_specifiers(dfs_fname)
        re_fname = self.im.get_regex_filename()
        width = len(str(len(files) +
                        start_num))  # file count width for zero padding
        count = start_num  # number of renamed and moved files

        # Rename and move the files to the temporary directory
        tmp_filepaths = []
        for f in files:
            source_fpath = os.path.join(self.input_dir, f)
            if not os.path.isfile(source_fpath):
                continue
            match = re.match(re_fname, f)
            if match is None:
                continue

            root, ext = os.path.splitext(f)
            stripped_root = ""
            if self.im.get_start_number() >= 0:
                stripped_root = root.replace(match.group(1), "")

            out_root = self.output_root
            if "%%" in out_root:
                out_root = out_root.replace("%%", "%")

            root = root.replace(match.group(1), str(count).zfill(width))
            if stripped_root != out_root:
                if idx < int(len(dfs_fname) / 2):
                    root = str(count).zfill(width) + out_root
                else:
                    root = out_root + str(count).zfill(width)

            tmp_fpath = os.path.join(tmp_dir, root + ext)
            if compare(self.input_dir, self.output_dir):  # rename files only
                try:
                    shutil.move(source_fpath, tmp_fpath)
                except OSError as e:
                    raise e
            else:  # move and rename files
                try:
                    shutil.copy(source_fpath, tmp_fpath)
                except OSError as e:
                    raise e
            count += 1
            tmp_filepaths.append(tmp_fpath)

        if count - start_num == 0:
            e = "Unable to move and rename any files from {}. "
            if compare(self.input_dir, self.output_dir):
                e = "Unable to rename any files from {}. "
            rc, msg = is_file(self.input_path)
            if not rc:
                e += msg
            delete(tmp_dir)  # cleanup the still empty temporary directory
            raise RuntimeError(e.format(self.input_dir))

        # Move the renamed files from the temporary to the output directory
        count = 0
        for fpath in tmp_filepaths:
            _, fname = os.path.split(fpath)
            destination_fpath = os.path.join(self.output_dir, fname)
            try:
                shutil.move(fpath, destination_fpath)
            except OSError as e:
                raise e
            count += 1

        if count == 0:
            e = "Unable to move renamed files from the temporary directory "
            e += "to {}. The renamed files can probably be recovered from {}"
            raise RuntimeError(e.format(self.output_dir, tmp_dir))

        # Delete the empty temporary directory
        delete(tmp_dir)
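A note on the two-phase move: renaming directly inside input_dir could clobber a file whose current name coincides with another file's new name (e.g. when shifting a sequence's start number up), so the code stages everything in a timestamped temporary directory first. The same pattern in isolation, with hypothetical names:

import os
import shutil
from datetime import datetime

def safe_batch_rename(src_dir, renames):
    # renames: list of (old_name, new_name) pairs. Staging in a temp dir
    # ensures a new name can never collide with a not-yet-renamed original.
    tmp_dir = os.path.join(src_dir, "tmp" + datetime.now().strftime("%y%m%d%H%M%S"))
    os.mkdir(tmp_dir)
    for old, new in renames:
        shutil.move(os.path.join(src_dir, old), os.path.join(tmp_dir, new))
    for _, new in renames:
        shutil.move(os.path.join(tmp_dir, new), os.path.join(src_dir, new))
    os.rmdir(tmp_dir)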