def _validate_data(counter):
    """Validate the per-media-type file counter and derive the task mode.

    Args:
        counter: dict mapping each media type in ``MEDIA_TYPES`` to the list
            of files found for it.

    Returns:
        A ``(counter, mode)`` tuple, where ``mode`` is the shared
        ``MEDIA_TYPES[...]['mode']`` of every non-empty media type.

    Raises:
        ValueError: if a "unique" media type is combined with other media,
            if more than one "unique" entry is present, or if no media data
            was found at all.
        Exception: if the non-empty media types disagree on task mode.
    """
    unique_entries = 0
    multiple_entries = 0
    for media_type, media_config in MEDIA_TYPES.items():
        if counter[media_type]:
            if media_config['unique']:
                unique_entries += len(counter[media_type])
            else:
                multiple_entries += len(counter[media_type])

    # 'and' binds tighter than 'or': (exactly one unique entry mixed with
    # non-unique media) or (more than one unique entry) is invalid.
    if unique_entries == 1 and multiple_entries > 0 or unique_entries > 1:
        unique_types = ', '.join(
            k for k, v in MEDIA_TYPES.items() if v['unique'])
        multiply_types = ', '.join(
            k for k, v in MEDIA_TYPES.items() if not v['unique'])
        count = ', '.join(
            '{} {}(s)'.format(len(v), k) for k, v in counter.items())
        # Implicit literal concatenation keeps the message on one clean line;
        # the previous backslash continuation inside the string literal leaked
        # raw source indentation into the user-facing text.
        raise ValueError(
            'Only one {} or many {} can be used simultaneously, '
            'but {} found.'.format(unique_types, multiply_types, count))

    if unique_entries == 0 and multiple_entries == 0:
        raise ValueError('No media data found')

    # Safe to index [0] below: the check above guarantees at least one
    # non-empty media type.
    task_modes = [
        MEDIA_TYPES[media_type]['mode']
        for media_type, media_files in counter.items() if media_files
    ]

    if not all(mode == task_modes[0] for mode in task_modes):
        raise Exception('Could not combine different task modes for data')

    return counter, task_modes[0]
def _count_files(data, manifest_files=None):
    """Sanitize ``data['server_files']`` and bucket all files by MIME type.

    Args:
        data: dict with ``server_files``, ``remote_files`` and
            ``client_files`` lists of file paths.
        manifest_files: optional list that collects ``.jsonl`` manifest
            paths found among the files. Defaults to a fresh private list so
            the function no longer crashes when the caller omits it (the old
            code called ``.append`` on ``None``).

    Returns:
        dict mapping each media type in ``MEDIA_TYPES`` to the list of
        relative paths of that type.

    Raises:
        ValueError: if a server path escapes the share root (``..`` segments
            or a resolved path outside ``settings.SHARE_ROOT``).
    """
    if manifest_files is None:
        # Bug fix: the previous version dereferenced None with .append()
        # as soon as a .jsonl file appeared and no list was supplied.
        manifest_files = []

    share_root = settings.SHARE_ROOT
    server_files = []
    for path in data["server_files"]:
        path = os.path.normpath(path).lstrip('/')
        if '..' in path.split(os.path.sep):
            raise ValueError("Don't use '..' inside file paths")
        full_path = os.path.abspath(os.path.join(share_root, path))
        if os.path.commonprefix([share_root, full_path]) != share_root:
            raise ValueError("Bad file path: " + path)
        server_files.append(path)

    sorted_server_files = sorted(server_files, reverse=True)
    # The idea of the code is trivial. After sort we will have files in the
    # following order: 'a/b/c/d/2.txt', 'a/b/c/d/1.txt', 'a/b/c/d', 'a/b/c'
    # Let's keep all items which aren't substrings of the previous item. In
    # the example above only 2.txt and 1.txt files will be in the final list.
    # Also need to correctly handle 'a/b/c0', 'a/b/c' case.
    without_extra_dirs = {
        v[1] for v in zip([""] + sorted_server_files, sorted_server_files)
        if not os.path.dirname(v[0]).startswith(v[1])
    }
    # we need to keep the original sequence of files; set membership keeps
    # this filter O(n) instead of the old O(n^2) list scan
    data['server_files'] = [f for f in server_files if f in without_extra_dirs]

    def count_files(file_mapping, counter):
        # Bucket each file by MIME type; siphon off .jsonl manifests and
        # warn about anything unsupported instead of failing.
        for rel_path, full_path in file_mapping.items():
            mime = get_mime(full_path)
            if mime in counter:
                counter[mime].append(rel_path)
            elif rel_path.endswith('.jsonl'):
                manifest_files.append(rel_path)
            else:
                slogger.glob.warn(
                    "Skip '{}' file (its mime type doesn't "
                    "correspond to supported MIME file type)".format(
                        full_path))

    counter = {media_type: [] for media_type in MEDIA_TYPES.keys()}

    count_files(
        file_mapping={
            f: f for f in data['remote_files'] or data['client_files']
        },
        counter=counter,
    )

    count_files(
        file_mapping={
            f: os.path.abspath(os.path.join(share_root, f))
            for f in data['server_files']
        },
        counter=counter,
    )

    return counter
def _validate_data(data):
    """Sanitize server paths, bucket files by MIME type and validate them.

    NOTE(review): this file contains several ``_validate_data`` definitions;
    in Python only the last one bound at import time is effective — confirm
    which version is intended to survive.

    Args:
        data: dict with ``server_files``, ``remote_files`` and
            ``client_files`` lists of file paths. ``data['server_files']``
            is rewritten in place with redundant parent directories removed.

    Returns:
        dict mapping each media type in ``MEDIA_TYPES`` to the list of
        relative paths of that type.

    Raises:
        ValueError: for paths escaping the share root, invalid media-type
            combinations, or when no media data is found.
    """
    share_root = settings.SHARE_ROOT
    server_files = []
    for path in data["server_files"]:
        path = os.path.normpath(path).lstrip('/')
        if '..' in path.split(os.path.sep):
            raise ValueError("Don't use '..' inside file paths")
        full_path = os.path.abspath(os.path.join(share_root, path))
        if os.path.commonprefix([share_root, full_path]) != share_root:
            raise ValueError("Bad file path: " + path)
        server_files.append(path)

    server_files.sort(reverse=True)
    # The idea of the code is trivial. After sort we will have files in the
    # following order: 'a/b/c/d/2.txt', 'a/b/c/d/1.txt', 'a/b/c/d', 'a/b/c'
    # Let's keep all items which aren't substrings of the previous item. In
    # the example above only 2.txt and 1.txt files will be in the final list.
    # Also need to correctly handle 'a/b/c0', 'a/b/c' case.
    data['server_files'] = [
        v[1] for v in zip([""] + server_files, server_files)
        if not os.path.dirname(v[0]).startswith(v[1])
    ]

    def count_files(file_mapping, counter):
        # Bucket files by MIME type, warning about anything unsupported.
        for rel_path, full_path in file_mapping.items():
            mime = get_mime(full_path)
            if mime in counter:
                counter[mime].append(rel_path)
            else:
                slogger.glob.warn(
                    "Skip '{}' file (its mime type doesn't "
                    "correspond to a video or an image file)".format(
                        full_path))

    counter = {media_type: [] for media_type in MEDIA_TYPES.keys()}

    count_files(
        file_mapping={
            f: f for f in data['remote_files'] or data['client_files']
        },
        counter=counter,
    )

    count_files(
        file_mapping={
            f: os.path.abspath(os.path.join(share_root, f))
            for f in data['server_files']
        },
        counter=counter,
    )

    unique_entries = 0
    multiple_entries = 0
    for media_type, media_config in MEDIA_TYPES.items():
        if counter[media_type]:
            if media_config['unique']:
                unique_entries += len(counter[media_type])
            else:
                multiple_entries += len(counter[media_type])

    if unique_entries == 1 and multiple_entries > 0 or unique_entries > 1:
        unique_types = ', '.join(
            k for k, v in MEDIA_TYPES.items() if v['unique'])
        multiply_types = ', '.join(
            k for k, v in MEDIA_TYPES.items() if not v['unique'])
        count = ', '.join(
            '{} {}(s)'.format(len(v), k) for k, v in counter.items())
        # Implicit literal concatenation replaces the backslash continuation
        # that leaked raw source indentation into the error message.
        raise ValueError(
            'Only one {} or many {} can be used simultaneously, '
            'but {} found.'.format(unique_types, multiply_types, count))

    if unique_entries == 0 and multiple_entries == 0:
        raise ValueError('No media data found')

    return counter
def _validate_data(data):
    """Sanitize server paths (splitting directories from plain files),
    bucket files by MIME type and validate the resulting combination.

    NOTE(review): this file contains several ``_validate_data`` definitions;
    only the last one bound at import time is effective — confirm which
    version is intended to survive.

    Args:
        data: dict with ``server_files``, ``remote_files`` and
            ``client_files`` lists of file paths. ``data['server_files']``
            is rewritten in place: directories already covered by listed
            files are dropped.

    Returns:
        dict mapping each media type in ``MEDIA_TYPES`` to the list of
        relative paths of that type.

    Raises:
        ValueError: for paths escaping the share root, invalid media-type
            combinations, or when no media data is found.
    """
    share_root = settings.SHARE_ROOT
    server_files = {
        'dirs': [],
        'files': [],
    }
    for path in data["server_files"]:
        path = os.path.normpath(path).lstrip('/')
        if '..' in path.split(os.path.sep):
            raise ValueError("Don't use '..' inside file paths")
        full_path = os.path.abspath(os.path.join(share_root, path))
        if 'directory' == get_mime(full_path):
            server_files['dirs'].append(path)
        else:
            server_files['files'].append(path)
        if os.path.commonprefix([share_root, full_path]) != share_root:
            raise ValueError("Bad file path: " + path)

    # Remove directories if other files from them exists in server files.
    # The prefix test is path-aware: the old plain startswith() wrongly let
    # a file 'a/bc.txt' suppress the unrelated directory 'a/b'.
    data['server_files'] = server_files['files'] + [
        dir_name for dir_name in server_files['dirs']
        if not any(
            f_name.startswith(dir_name + os.path.sep)
            for f_name in server_files['files']
        )
    ]

    def count_files(file_mapping, counter):
        # Bucket files by MIME type. Unsupported MIME types are skipped with
        # a warning instead of raising KeyError as the old code did.
        for rel_path, full_path in file_mapping.items():
            mime = get_mime(full_path)
            if mime in counter:
                counter[mime].append(rel_path)
            else:
                slogger.glob.warn(
                    "Skip '{}' file (its mime type doesn't "
                    "correspond to a video or an image file)".format(
                        full_path))

    counter = {media_type: [] for media_type in MEDIA_TYPES.keys()}

    count_files(
        file_mapping={
            f: f for f in data['remote_files'] or data['client_files']
        },
        counter=counter,
    )

    count_files(
        file_mapping={
            f: os.path.abspath(os.path.join(share_root, f))
            for f in data['server_files']
        },
        counter=counter,
    )

    unique_entries = 0
    multiple_entries = 0
    for media_type, media_config in MEDIA_TYPES.items():
        if counter[media_type]:
            if media_config['unique']:
                unique_entries += len(counter[media_type])
            else:
                multiple_entries += len(counter[media_type])

    if unique_entries == 1 and multiple_entries > 0 or unique_entries > 1:
        unique_types = ', '.join(
            k for k, v in MEDIA_TYPES.items() if v['unique'])
        multiply_types = ', '.join(
            k for k, v in MEDIA_TYPES.items() if not v['unique'])
        count = ', '.join(
            '{} {}(s)'.format(len(v), k) for k, v in counter.items())
        # Implicit literal concatenation replaces the backslash continuation
        # that leaked raw source indentation into the error message.
        raise ValueError(
            'Only one {} or many {} can be used simultaneously, '
            'but {} found.'.format(unique_types, multiply_types, count))

    if unique_entries == 0 and multiple_entries == 0:
        raise ValueError('No media data found')

    return counter