Example #1
def check_schemas(data_root, schemas_dir, verbose=False):
    schemas = ('category.json', 'video.json')
    all_file_paths = get_json_files(data_root)

    error_count = 0

    for schema, file_paths in zip(schemas, all_file_paths):
        schema_path = os.path.join(schemas_dir, schema)
        with open(schema_path, encoding='UTF-8') as fp:
            schema_blob = json.load(fp)

        for file_path in file_paths:
            with open(file_path, encoding='UTF-8') as fp:
                try:
                    blob = json.load(fp)
                except json.decoder.JSONDecodeError as e:
                    print('\nError JSON-decoding {}'.format(file_path),
                          flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1
                    continue
                try:
                    jsonschema.validate(blob, schema_blob)
                except jsonschema.exceptions.ValidationError as e:
                    print(file_path, flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1

    return error_count
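
Every example on this page assumes the usual module-level imports (argparse, collections, difflib, glob, json, logging, os, sys, textwrap, and the third-party jsonschema package) plus a shared get_json_files helper that is never shown. The sketch below is one plausible reading of that helper; the directory layout (one category.json per category directory, videos under a videos/ subdirectory) is inferred from Example #16, not taken from the source.

import glob
import os


def get_json_files(data_root):
    """Return (category_paths, video_paths) for JSON files under data_root."""
    category_paths = sorted(
        glob.glob(os.path.join(data_root, '*', 'category.json')))
    video_paths = sorted(
        glob.glob(os.path.join(data_root, '*', 'videos', '*.json')))
    return category_paths, video_paths
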
Example #2
def check_render_rest(data_root, verbose=False):
    _, video_paths = get_json_files(data_root)

    fields = ('description', 'summary')

    valid = True
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)

            for field in fields:
                # A description or summary may be None.
                # Ensure text is a string.
                text = blob.get(field) or ''
                error, level = validate_rest(text)
                if error and level >= INVALID_ERROR_LEVEL:
                    valid = False

                if error and verbose:
                    msg = 'ReST validation error:\n\tFile:{}\n\tKey:{}'
                    print(msg.format(file_path, field), flush=True)
                    print('\t', error, sep='', flush=True)

    if not valid:
        sys.exit(1)
Example #3
def check_render_rest(data_root, verbose=False):
    _, video_paths = get_json_files(data_root)

    fields = ('description', 'summary')

    valid = True
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)

            for field in fields:
                # A description or summary may be None.
                # Ensure text is a string.
                text = blob.get(field) or ''
                error, level = validate_rest(text)
                if error and level >= INVALID_ERROR_LEVEL:
                    valid = False

                if error:
                    msg = ('ReST validation error (level {level}):\n'
                           '\tFile: {fp}\n\tKey: {key}\n\tError:\n{error}')
                    print(msg.format(fp=file_path,
                                     key=field,
                                     level=level,
                                     error=textwrap.indent(error, '\t\t')),
                          flush=True)
                    if verbose:
                        print('\t', error, sep='', flush=True)

    if not valid:
        sys.exit(1)
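
Examples #2 and #3 depend on a validate_rest helper that returns an (error, level) pair and on an INVALID_ERROR_LEVEL constant, neither of which is shown. Below is a hedged docutils-based sketch; the threshold value and the use of publish_doctree are assumptions, not the project's actual code.

from docutils import nodes
from docutils.core import publish_doctree

# docutils reports problems at levels 1-4 (INFO through SEVERE); treating
# ERROR (3) and above as invalid is an assumption.
INVALID_ERROR_LEVEL = 3


def validate_rest(text):
    """Return (message, level) for the worst ReST problem, or ('', 0)."""
    doctree = publish_doctree(
        text,
        settings_overrides={'report_level': 5,  # suppress stderr output
                            'halt_level': 5})   # never raise mid-parse
    worst_message, worst_level = '', 0
    # findall requires docutils >= 0.18; older versions use .traverse().
    for message in doctree.findall(nodes.system_message):
        if message['level'] > worst_level:
            worst_message, worst_level = message.astext(), message['level']
    return worst_message, worst_level
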
Example #4
def main():
    """Fill id field in json video files"""

    logging.basicConfig(level=logging.WARNING)  # WARNING or DEBUG
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path to data repository")
    parser.add_argument('--db',
                        default='/tmp/db.json',
                        help="path to tinydb file")

    args = parser.parse_args()

    _, video_paths = get_json_files(args.path)

    # Retrieve data
    tb_video = [get_json_data(file_name) for file_name in sorted(video_paths)]

    # Query max id
    all_id = collections.Counter(video['id'] for video in tb_video
                                 if 'id' in video)
    if all_id:
        most_common, times_duplicate = all_id.most_common(1)[0]
        if times_duplicate > 1:
            raise ValueError('Duplicate id: {}'.format(most_common))
    max_id = max(all_id, default=0)
    logging.debug('Max id: %s', max_id)

    # Update files
    video_without_id = [video for video in tb_video
                        if 'id' not in video]
    for video_id, video in enumerate(video_without_id, max_id + 1):
        update_id(video, video_id)
Example #5
def main():
    """Fill id field in json video files"""

    logging.basicConfig(level=logging.WARNING)  # WARNING or DEBUG
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path to data repository")
    parser.add_argument('--db',
                        default='/tmp/db.json',
                        help="path to tinydb file")

    args = parser.parse_args()

    _, video_paths = get_json_files(args.path)

    # Retrieve data
    tb_video = [get_json_data(file_name) for file_name in sorted(video_paths)]

    # Query max id
    all_id = collections.Counter(video['id'] for video in tb_video
                                 if 'id' in video)
    if all_id:
        most_common, times_duplicate = all_id.most_common(1)[0]
        if times_duplicate > 1:
            raise ValueError('Duplicate id: {}'.format(most_common))
    max_id = max(all_id, default=0)
    logging.debug('Max id: %s', max_id)

    # Update files
    video_without_id = [
        video for video in tb_video if 'id' not in video
    ]
    for video_id, video in enumerate(video_without_id, max_id + 1):
        update_id(video, video_id)
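
Examples #4 and #5 also rely on get_json_data and update_id, which are not shown. For update_id to write a file back, each loaded record must remember its own path; the sketch below invents a _path bookkeeping key for that purpose, so treat it as one possible shape rather than the project's actual helpers.

import json


def get_json_data(file_name):
    """Load one video JSON file, remembering where it came from."""
    with open(file_name, encoding='UTF-8') as fp:
        blob = json.load(fp)
    blob['_path'] = file_name  # hypothetical bookkeeping key, removed on save
    return blob


def update_id(video, video_id):
    """Assign an id and rewrite the video's JSON file in place."""
    path = video.pop('_path')
    video['id'] = video_id
    with open(path, 'w', encoding='UTF-8') as fp:
        # The formatting here is a guess; the project's canonical settings
        # (JSON_FORMAT_KWARGS, referenced in Example #14) are not shown.
        json.dump(video, fp, indent=2, sort_keys=True)
        fp.write('\n')
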
Example #6
def check_schemas(data_root, schemas_dir, verbose=False):
    schemas = ('category.json', 'video.json')
    all_file_paths = get_json_files(data_root)

    error_count = 0

    for schema, file_paths in zip(schemas, all_file_paths):
        schema_path = os.path.join(schemas_dir, schema)
        with open(schema_path, encoding='UTF-8') as fp:
            schema_blob = json.load(fp)

        for file_path in file_paths:
            with open(file_path, encoding='UTF-8') as fp:
                try:
                    blob = json.load(fp)
                except json.decoder.JSONDecodeError as e:
                    print('\nError JSON-decoding {}'.format(file_path),
                          flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1
                    continue
                try:
                    jsonschema.validate(
                        blob,
                        schema_blob,
                        format_checker=jsonschema.FormatChecker())
                except jsonschema.exceptions.ValidationError as e:
                    print(file_path, flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1

    return error_count
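
Example #6 differs from Example #1 only in passing format_checker=jsonschema.FormatChecker(). Without a format checker, "format" keywords in a schema are annotations that validate nothing; with one, every format jsonschema knows how to check is enforced. A small demonstration (which formats are checked varies with the jsonschema version, and some, such as date-time and uri, need optional extra packages):

import jsonschema

schema = {'type': 'string', 'format': 'date'}

# Annotation only: no format checker, so this passes silently.
jsonschema.validate('not-a-date', schema)

# With a FormatChecker the same value is rejected.
try:
    jsonschema.validate('not-a-date', schema,
                        format_checker=jsonschema.FormatChecker())
except jsonschema.exceptions.ValidationError as e:
    print(e.message)
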
Example #7
def main():
    """Convert json file(s) to the project format standards"""
    logging.basicConfig(level=logging.WARNING)  # WARNING or DEBUG
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path to data repository")
    args = parser.parse_args()
    category_paths, video_paths = get_json_files(args.path)
    print('\n# Category statistics')
    print(markdown_statistics(category_paths))
    print('\n# Video statistics')
    print(markdown_statistics(video_paths))
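
The markdown_statistics helper used here is not shown; it evidently takes a list of paths and returns a markdown fragment. A minimal sketch that reports file and field counts might look like this (the per-field tally is an assumption):

import collections
import json


def markdown_statistics(file_paths):
    """Return a markdown bullet list summarizing the given JSON files."""
    field_counts = collections.Counter()
    for path in file_paths:
        with open(path, encoding='UTF-8') as fp:
            blob = json.load(fp)
        field_counts.update(key for key in blob if blob[key])
    lines = ['* Files: {}'.format(len(file_paths))]
    lines.extend('* Files with `{}`: {}'.format(field, count)
                 for field, count in field_counts.most_common())
    return '\n'.join(lines)
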
Example #8
def check_languages(data_root, verbose=False):
    """Flag videos whose 'language' is not a recognized language name."""
    _, video_paths = get_json_files(data_root)

    bad_lang_by_path = {}
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)
            lang = blob.get('language')
            if lang and lang not in VIDEO_LANGUAGE_NAMES:
                bad_lang_by_path[file_path] = lang

    if bad_lang_by_path:
        print('Incorrect languages found:')
        for path, lang in bad_lang_by_path.items():
            print('{} {}'.format(lang, path))
        sys.exit(1)
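
This check compares each video's language against a VIDEO_LANGUAGE_NAMES constant that is not shown. It is presumably a collection of human-readable language names; the values below are placeholders for illustration, not the project's actual list.

# Placeholder values only; the real list is not shown in these examples.
VIDEO_LANGUAGE_NAMES = frozenset({
    'English', 'French', 'German', 'Portuguese', 'Spanish',
})
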
Example #9
def main():
    """Convert json file(s) to the project format standards"""
    logging.basicConfig(level=logging.WARNING)
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path to file(s) to reserialize")
    parser.add_argument("-a",
                        "--all",
                        action="store_true",
                        help="reserialize all JSON files under path")
    args = parser.parse_args()

    if args.all:
        category_paths, video_paths = get_json_files(args.path)
        paths = category_paths + video_paths
        for path in paths:
            reserialize(path)
    else:
        reserialize(args.path)
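
Examples #9 and #11 call a reserialize helper, and Example #14 checks files against the same JSON_FORMAT_KWARGS that helper would use. Neither definition is shown; the sketch below assumes two-space indentation and sorted keys, which is a guess at the project's canonical format.

import json

# Assumed values; the real JSON_FORMAT_KWARGS are not shown.
JSON_FORMAT_KWARGS = {'indent': 2, 'sort_keys': True, 'ensure_ascii': False}


def reserialize(path):
    """Rewrite a JSON file using the project's canonical formatting."""
    with open(path, encoding='UTF-8') as fp:
        blob = json.load(fp)
    with open(path, 'w', encoding='UTF-8') as fp:
        fp.write(json.dumps(blob, **JSON_FORMAT_KWARGS))
        fp.write('\n')
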
Example #10
def main():
    """Pull related urls from summary and description of video JSON"""
    logging.basicConfig(level=logging.WARNING)
    parser = argparse.ArgumentParser()
    parser.add_argument("path", help="path to file(s) to reserialize")
    parser.add_argument("-a",
                        "--all",
                        action="store_true",
                        help="reserialize all JSON files under path")
    args = parser.parse_args()

    if args.all:
        _, video_paths = get_json_files(args.path)
        for path in video_paths:
            pull_links_from_file(path)
    else:
        pull_links_from_file(args.path)
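
Examples #10 and #12 delegate to pull_links_from_file, which is not shown. A rough sketch follows; the URL pattern, the related_urls destination field, and the output formatting are all assumptions.

import json
import re

# Deliberately simple; the real extraction rules are not shown.
URL_RE = re.compile(r'https?://[^\s)>\]]+')


def pull_links_from_file(path):
    """Collect URLs from a video's summary and description into related_urls."""
    with open(path, encoding='UTF-8') as fp:
        blob = json.load(fp)
    found = []
    for field in ('summary', 'description'):
        found.extend(URL_RE.findall(blob.get(field) or ''))
    if not found:
        return
    urls = blob.setdefault('related_urls', [])
    urls.extend(url for url in found if url not in urls)
    with open(path, 'w', encoding='UTF-8') as fp:
        json.dump(blob, fp, indent=2, sort_keys=True)
        fp.write('\n')
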
Example #11
def main():
    """Convert json file(s) to the project format standards"""
    logging.basicConfig(level=logging.WARNING)
    parser = argparse.ArgumentParser()
    parser.add_argument("path",
                        help="path to file(s) to reserialize")
    parser.add_argument("-a", "--all",
                        action="store_true",
                        help="reserialize all JSON files under path")
    args = parser.parse_args()

    if args.all:
        category_paths, video_paths = get_json_files(args.path)
        paths = category_paths + video_paths
        for path in paths:
            reserialize(path)
    else:
        reserialize(args.path)
Example #12
def main():
    """Pull related urls from summary and description of video JSON"""
    logging.basicConfig(level=logging.WARNING)
    parser = argparse.ArgumentParser()
    parser.add_argument("path",
                        help="path to file(s) to reserialize")
    parser.add_argument("-a", "--all",
                        action="store_true",
                        help="reserialize all JSON files under path")
    args = parser.parse_args()

    if args.all:
        _, video_paths = get_json_files(args.path)
        for path in video_paths:
            pull_links_from_file(path)
    else:
        pull_links_from_file(args.path)
Example #13
def check_ids_unique(data_root, verbose=False):
    _, video_paths = get_json_files(data_root)

    paths_by_id = defaultdict(list)
    for file_path in video_paths:
        with open(file_path, encoding='UTF-8') as fp:
            blob = json.load(fp)
            id_ = blob.get('id')
            if id_:
                paths_by_id[id_].append(file_path)

    keys = list(paths_by_id.keys())
    for key in keys:
        if len(paths_by_id[key]) <= 1:
            del paths_by_id[key]

    if paths_by_id:
        print('Duplicate IDs found:')
        for id_, paths in paths_by_id.items():
            print('ID {}'.format(id_))
            for path in paths:
                print('\t', path)
        sys.exit(1)
Example #14
def check_serialization(data_root, verbose=False):
    """Flag files whose serialization differs from the canonical form."""
    category_paths, video_paths = get_json_files(data_root)

    file_paths = category_paths + video_paths

    error_by_path = {}
    for file_path in file_paths:
        with open(file_path, encoding='UTF-8') as fp:
            serialized_blob = fp.read()
            re_serialized_blob = json.dumps(
                json.loads(serialized_blob),
                **JSON_FORMAT_KWARGS
            )
            if serialized_blob.strip() != re_serialized_blob.strip():
                error_by_path[file_path] = (serialized_blob, re_serialized_blob)

    if error_by_path:
        for path, blobs in error_by_path.items():
            print('Incorrect serialization order in {}'.format(path),
                  flush=True)
            blobs = tuple(blob.splitlines(keepends=True) for blob in blobs)
            if verbose:
                print(''.join(difflib.ndiff(*blobs)), end="")
        sys.exit(1)
Example #15
def check_schemas(data_root, schemas_dir, verbose=False):
    schemas = ('category.json', 'video.json')
    all_file_paths = get_json_files(data_root)

    error_count = 0

    for schema, file_paths in zip(schemas, all_file_paths):
        schema_path = os.path.join(schemas_dir, schema)
        with open(schema_path, encoding='UTF-8') as fp:
            schema_blob = json.load(fp)

        for file_path in file_paths:
            with open(file_path, encoding='UTF-8') as fp:
                blob = json.load(fp)
                try:
                    jsonschema.validate(blob, schema_blob)
                except jsonschema.exceptions.ValidationError as e:
                    print(file_path, flush=True)
                    if verbose:
                        print(e, flush=True)
                    error_count += 1

    return error_count
Example #16
def check_slugs_unique(data_root, verbose=False):
    category_paths, _ = get_json_files(data_root)

    paths_by_combo = defaultdict(list)

    for category_path in category_paths:
        with open(category_path, encoding='UTF-8') as fp:
            category_blob = json.load(fp)
            # slugs will be generated from titles, so titles can be used
            # as a stand-in for slugs when testing unique constraints.
            category_title = category_blob.get('title')

            head, _ = os.path.split(category_path)
            video_pattern = os.path.join(head, 'videos/*.json')
            for video_path in glob.iglob(video_pattern):
                with open(video_path, encoding='UTF-8') as video_fp:
                    video_blob = json.load(video_fp)
                    video_slug = video_blob.get('slug')
                    if not video_slug:
                        video_slug = slugify(video_blob.get('title'))

                    combo = (category_title, video_slug)
                    paths_by_combo[combo].append(video_path)

    keys = list(paths_by_combo.keys())
    for key in keys:
        if len(paths_by_combo[key]) <= 1:
            del paths_by_combo[key]

    if paths_by_combo:
        print('Duplicate slug combinations found:')
        for combo, paths in paths_by_combo.items():
            print('Combination {}'.format(combo))
            for path in paths:
                print('\t', path)
        sys.exit(1)
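
Example #16 falls back to a slugify helper when a video has no explicit slug. That helper is not shown; a common minimal form is sketched below.

import re


def slugify(text):
    """Lowercase, drop punctuation, and hyphenate whitespace runs."""
    text = re.sub(r'[^\w\s-]', '', (text or '').lower()).strip()
    return re.sub(r'[-\s]+', '-', text)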