Example #1
 def test_file_base_name_on_dir_path(self):
     test_dir = self.get_test_loc('fileutils/basename')
     test_file = 'a/b/'
     expected_name = 'b'
     result = fileutils.file_base_name(test_file)
     assert expected_name == result
     result = fileutils.file_base_name(join(test_dir, test_file))
     assert expected_name == result
Example #2
 def test_file_base_name_on_path_for_plain_dir_with_extension(self):
     test_dir = self.get_test_loc('fileutils/basename')
     test_file = 'f.a/'
     expected_name = 'f.a'
     result = fileutils.file_base_name(test_file)
     assert expected_name == result
     result = fileutils.file_base_name(join(test_dir, test_file))
     assert expected_name == result
Example #3
 def test_file_base_name_on_path_for_plain_file(self):
     test_dir = self.get_test_loc('fileutils/basename')
     test_file = 'tst'
     expected_name = 'tst'
     result = fileutils.file_base_name(test_file)
     assert expected_name == result
     result = fileutils.file_base_name(join(test_dir, test_file))
     assert expected_name == result
Example #4
 def test_file_base_name_on_file_path_for_file_with_known_composed_extension(self):
     test_dir = self.get_test_loc('fileutils/basename')
     test_file = 'a/b/a.tar.gz'
     expected_name = 'a'
     result = fileutils.file_base_name(test_file)
     assert expected_name == result
     result = fileutils.file_base_name(join(test_dir, test_file))
     assert expected_name == result
Example #5
 def test_file_base_name_on_path_and_location_1(self):
     test_dir = self.get_test_loc('fileutils/basename')
     test_file = 'a/.a/file'
     expected_name = 'file'
     result = fileutils.file_base_name(test_file)
     assert expected_name == result
     result = fileutils.file_base_name(join(test_dir, test_file))
     assert expected_name == result
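Taken together, these basename tests pin down the contract of fileutils.file_base_name: a trailing slash is ignored, a directory name keeps its dots, and only known composed extensions such as .tar.gz are stripped whole. Below is a minimal sketch of that contract, consistent with the tests above; it is an illustration only, not the real fileutils implementation, and the composed-extension table is an assumption:

import posixpath

# Hypothetical table; the real set of known composed extensions lives in
# fileutils and is larger.
COMPOSED_EXTENSIONS = ('.tar.gz', '.tar.bz2', '.tar.xz')

def sketch_file_base_name(path):
    # a trailing slash means a directory: 'a/b/' -> 'b' and 'f.a/' -> 'f.a'
    name = posixpath.basename(path.rstrip('/'))
    if path.endswith('/'):
        return name
    # known composed extensions are stripped whole: 'a.tar.gz' -> 'a'
    for ext in COMPOSED_EXTENSIONS:
        if name.endswith(ext):
            return name[:-len(ext)]
    # otherwise only the last extension is stripped: 'a.tag.gz' -> 'a.tag'
    base, _, _ = name.rpartition('.')
    return base or name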
Example #6
def is_special_legal_file(location):
    """
    Return an indication that a file may be a "special" legal-like file.
    """
    file_base_name = fileutils.file_base_name(location)
    file_base_name_lower = file_base_name.lower()
    file_extension = fileutils.file_extension(location)
    file_extension_lower = file_extension.lower()

    name_contains_special = (special_name in file_base_name
                             or special_name in file_extension
                             for special_name in special_names)

    name_lower_is_special = (special_name_lower
                             in (file_base_name_lower, file_extension_lower)
                             for special_name_lower in special_names_lower)

    name_lower_contains_special = (
        special_name_lower in file_base_name_lower
        or special_name_lower in file_extension_lower
        for special_name_lower in special_names_lower)

    if any(name_contains_special) or any(name_lower_is_special):
        return 'yes'

    elif any(name_lower_contains_special):
        return 'maybe'
    else:
        # return False for now?
        pass
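A hypothetical driver for the function above. The special_names and special_names_lower globals are not shown in this example, so the values below are assumptions for illustration only:

special_names = ('COPYING', 'COPYRIGHT', 'NOTICE', 'LICENSE', 'LEGAL')
special_names_lower = tuple(n.lower() for n in special_names)

# is_special_legal_file('/src/LICENSE.txt') -> 'yes'   (case-matching name)
# is_special_legal_file('/src/licenses.md') -> 'maybe' (lowercase substring only)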
Example #7
def uncompress_file(location, decompressor):
    """
    Uncompress a compressed file at location and return a temporary location of
    the uncompressed file and a list of warning messages. Raise Exceptions on
    errors. Use the `decompressor` object for decompression.
    """
    # FIXME: do not create a sub-directory and instead strip the "compression"
    # extension such as gz, etc., or introspect the archive header to get the file
    # name when present.
    assert location
    assert decompressor

    warnings = []
    base_name = fileutils.file_base_name(location)
    target_location = os.path.join(fileutils.get_temp_dir(base_dir='extract'), base_name)
    with decompressor(location, 'rb') as compressed:
        with open(target_location, 'wb') as uncompressed:
            buffer_size = 32 * 1024 * 1024
            while True:
                chunk = compressed.read(buffer_size)
                if not chunk:
                    break
                uncompressed.write(chunk)
        if getattr(decompressor, 'has_trailing_garbage', False):
            warnings.append(location + ': Trailing garbage found and ignored.')
    return target_location, warnings
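As a usage sketch, gzip.open happens to satisfy the decompressor contract assumed above: it can be called as decompressor(location, 'rb'), returns a readable context manager, and getattr(..., 'has_trailing_garbage', False) harmlessly falls back to False for it. The path is a placeholder:

import gzip

target, warnings = uncompress_file('sources.bin.gz', decompressor=gzip.open)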
Example #8
def load_rules(rule_dir=rules_data_dir):
    """
    Return a list of rules, loaded from rules files.
    FIXME: return an iterable instead
    """
    rules = []

    seen_files = set()
    processed_files = set()
    for top, _, files in os.walk(rule_dir):
        for yfile in files:
            if yfile.endswith('.yml'):
                data_file = join(top, yfile)
                base_name = fileutils.file_base_name(yfile)
                text_file = join(top, base_name + '.RULE')
                rule = Rule(data_file=data_file, text_file=text_file)
                rules.append(rule)
                processed_files.add(data_file)
                processed_files.add(text_file)

            seen_file = join(top, yfile)
            seen_files.add(seen_file)

    unknown_files = seen_files - processed_files
    if unknown_files:
        print(unknown_files)
        files = '\n'.join(sorted(unknown_files))
        msg = 'Unknown files in rule directory: %(rule_dir)r\n%(files)s'
        raise Exception(msg % locals())
    return rules
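The on-disk convention this walker relies on is one pair of files per rule sharing a base name; the names below are illustrative only:

    rules/gpl-2.0_1.yml     # data file
    rules/gpl-2.0_1.RULE    # rule text with the same base name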
Example #9
def load_license_tests(test_dir=TEST_DATA_DIR):
    """
    Yield an iterable of LicenseTest loaded from test data files in test_dir.
    """
    # first collect files with .yml extension and files with other extensions
    # in two maps keyed by file base_name
    data_files = {}
    test_files = {}
    for top, _, files in os.walk(test_dir):
        for yfile in files:
            if yfile.endswith('~'):
                continue
            base_name = fileutils.file_base_name(yfile)
            file_path = abspath(join(top, yfile))
            if yfile.endswith('.yml'):
                assert base_name not in data_files
                data_files[base_name] = file_path
            else:
                assert base_name not in test_files
                test_files[base_name] = file_path

    # ensure that each data file has a corresponding test file
    diff = set(data_files.keys()).symmetric_difference(set(test_files.keys()))
    assert not diff, ('Orphaned license test file(s) found: '
                      'test file without its YAML test descriptor '
                      'or YAML test descriptor without its test file.')

    # second, create pairs of corresponding (data_file, test file) for files
    # that have the same base_name
    for base_name, data_file in data_files.items():
        test_file = test_files[base_name]
        yield LicenseTest(data_file, test_file)
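The pairing convention is the same idea: each license test is a .yml descriptor plus one test file with any other extension and the same base name; the names below are illustrative only:

    licensetests/mit_1.yml      # test descriptor
    licensetests/mit_1.txt      # matching test file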
Example #12
def load_licenses(licenses_data_dir=licenses_data_dir, with_deprecated=False):
    """
    Return a mapping of key -> license objects, loaded from license files.
    Raise Exceptions if there are dangling orphaned files.
    """
    licenses = {}
    used_files = set()
    all_files = set(resource_iter(licenses_data_dir, ignored=ignore_editor_tmp_files, with_dirs=False))
    for data_file in sorted(all_files):
        if data_file.endswith('.yml'):
            key = file_base_name(data_file)
            lic = License(key, licenses_data_dir)
            used_files.add(data_file)
            if exists(lic.text_file):
                used_files.add(lic.text_file)
            if not with_deprecated and lic.is_deprecated:
                continue
            licenses[key] = lic

    dangling = all_files.difference(used_files)
    if dangling:
        msg = 'Some License data or text files are orphaned in "{}".\n'.format(licenses_data_dir)
        msg += '\n'.join('file://{}'.format(f) for f in sorted(dangling))
        raise Exception(msg)
    return licenses
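The layout assumed here is one YAML descriptor per license key next to an optional license text file. The .LICENSE extension below is an assumption suggested by the lic.text_file check, not confirmed by this snippet:

    licenses/mit.yml        # data file; the key is 'mit'
    licenses/mit.LICENSE    # text file, recorded as used when it exists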
Example #13
def load_license_tests(test_dir=TEST_DATA_DIR):
    """
    Yield an iterable of LicenseTest loaded from test data files in test_dir.
    """
    # first collect files with .yml extension and files with other extensions
    # in two maps keyed by file base_name
    data_files = {}
    test_files = {}
    for top, _, files in os.walk(test_dir):
        for yfile in files:
            base_name = fileutils.file_base_name(yfile)
            file_path = abspath(join(top, yfile))
            if yfile.endswith(".yml"):
                assert base_name not in data_files
                data_files[base_name] = file_path
            else:
                assert base_name not in test_files
                test_files[base_name] = file_path

    # ensure that each data file has a corresponding test file
    diff = set(data_files.keys()).symmetric_difference(set(test_files.keys()))
    assert not diff

    # second, create pairs of a data_file and the corresponding test file
    # that have the same base_name
    for base_name, data_file in data_files.items():
        test_file = test_files[base_name]
        yield LicenseTest(data_file, test_file)
Example #14
def create_html_app(output_file, results, version, scanned_path):  # NOQA
    """
    Given an html-app output_file, generate that file, create the data.js data
    file from the results and create the corresponding `_files` directory and
    copy the data and assets to this directory. The target directory is deleted
    if it exists.

    Raise HtmlAppAssetCopyWarning if the output_file is <stdout> or
    HtmlAppAssetCopyError if the copy was not possible.
    """
    try:
        if is_stdout(output_file):
            raise HtmlAppAssetCopyWarning()

        source_assets_dir = join(TEMPLATES_DIR, 'html-app', 'assets')

        # Build the `_files` assets directory name from the `output_location`
        # file base name (stripped from its extension) with a `_files` suffix.
        output_location = output_file.name
        tgt_root_path = dirname(output_location)
        tgt_assets_dir = file_base_name(output_location) + '_files'

        # delete old assets
        target_assets_dir = join(tgt_root_path, tgt_assets_dir)
        if exists(target_assets_dir):
            delete(target_assets_dir)

        # copy assets
        copytree(source_assets_dir, target_assets_dir)

        template = get_template(
            join(TEMPLATES_DIR, 'html-app', 'template.html'))
        rendered_html = template.render(assets_dir=target_assets_dir,
                                        scanned_path=scanned_path,
                                        version=version)
        output_file.write(rendered_html)

        # create help file
        help_template = get_template(
            join(TEMPLATES_DIR, 'html-app', 'help_template.html'))
        rendered_help = help_template.render(main_app=output_location)
        with io.open(join(target_assets_dir, 'help.html'),
                     'w',
                     encoding='utf-8') as f:
            f.write(rendered_help)

        # FIXME: this should be a regular JSON scan format
        with io.open(join(target_assets_dir, 'data.js'), 'w') as f:
            f.write('data=')
            json.dump(list(results), f)

    except HtmlAppAssetCopyWarning as w:
        raise w

    except Exception as e:  # NOQA
        import traceback
        msg = 'ERROR: cannot create HTML application.\n' + traceback.format_exc()
        raise HtmlAppAssetCopyError(msg)
Example #16
def get_dockerfile(location, echo=print):
    """
    Return a Dockerfile data dictionary if the location is a Dockerfile,
    otherwise return an empty mapping.
    """
    fn = fileutils.file_base_name(location)
    if 'Dockerfile' not in fn:
        return {}

    echo('Found Dockerfile at: %(location)r' % locals())

    try:
        # TODO: keep comments instead of ignoring them:
        # assign the comments before an instruction line to a line "comment" attribute
        # assign end of line comment to the line
        # assign top of file and end of file comments to a file-level comment attribute
        df = dockerfile_parse.DockerfileParser(location)

        df_data = OrderedDict()
        df_data['location'] = location
        df_data['base_image'] = df.baseimage
        df_data['instructions'] = []

        for entry in df.structure:
            entry = OrderedDict([(k, v) for k, v in sorted(entry.items())
                                 if k in (
                                     'instruction',
                                     'startline',
                                     'value',
                                 )])
            df_data['instructions'].append(entry)
        return {location: df_data}
    except Exception:
        echo('Error parsing Dockerfile at: %(location)r' % locals())
        return {}
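A hypothetical call, relying only on the return shape visible above (a mapping of location to parsed data); the path is a placeholder:

dockerfiles = get_dockerfile('/src/app/Dockerfile')
for entry in dockerfiles.get('/src/app/Dockerfile', {}).get('instructions', []):
    print(entry['instruction'], entry.get('value'))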
Example #17
def load_rules(rules_data_dir=rules_data_dir):
    """
    Return an iterable of rules loaded from rule files.
    """
    # TODO: OPTIMIZE: create a graph of rules to account for containment and
    # similarity clusters?
    seen_files = set()
    processed_files = set()
    lower_case_files = set()
    case_problems = set()
    model_errors = []
    for data_file in resource_iter(rules_data_dir, with_dirs=False):
        if data_file.endswith('.yml'):
            base_name = file_base_name(data_file)
            rule_file = join(rules_data_dir, base_name + '.RULE')
            try:
                rule = Rule(data_file=data_file, text_file=rule_file)
                yield rule
            except Exception as re:
                model_errors.append(str(re))
            # accumulate sets to ensure we do not have illegal names or extra
            # orphaned files
            data_lower = data_file.lower()
            if data_lower in lower_case_files:
                case_problems.add(data_lower)
            else:
                lower_case_files.add(data_lower)

            rule_lower = rule_file.lower()
            if rule_lower in lower_case_files:
                case_problems.add(rule_lower)
            else:
                lower_case_files.add(rule_lower)

            processed_files.update([data_file, rule_file])

        if not data_file.endswith('~'):
            seen_files.add(data_file)

    if model_errors:
        errors = '\n'.join(model_errors)
        msg = 'Invalid rules in rule directory: %(rules_data_dir)r\n%(errors)s'
        raise Exception(msg % locals())

    unknown_files = seen_files - processed_files
    if unknown_files or case_problems:
        # use distinct variable names so that both messages can be formatted
        # from locals() and msg is always defined
        msg = ''
        if unknown_files:
            unknown = '\n'.join(sorted('file://' + f for f in unknown_files))
            msg = 'Orphaned files in rule directory: %(rules_data_dir)r\n%(unknown)s'

        if case_problems:
            dupes = '\n'.join(sorted('file://' + f for f in case_problems))
            msg += '\nRule files with non-unique name ignoring case in rule directory: %(rules_data_dir)r\n%(dupes)s'

        raise Exception(msg % locals())
Example #18
 def test_load_image_config(self):
     test_dir = self.get_test_loc('images/config')
     expected_dir = self.get_test_loc('images/config_expected')
     for config_file in os.listdir(test_dir):
         base_name = fileutils.file_base_name(config_file)
         config_file = os.path.join(test_dir, config_file)
         image = Image.load_image_config(config_file)
         expected = os.path.join(expected_dir, base_name + '.expected.json')
         result = image.as_dict()
         check_expected(result, expected, regen=True)
Example #19
 def test_file_base_name_on_path_and_location(self):
     test_dir = self.get_test_loc("fileutils/basename", copy=True)
     tests = [
         ("a/.a/file", "file"),
         ("a/.a/", ".a"),
         ("a/b/.a.b", ".a"),
         ("a/b/a.tag.gz", "a.tag"),
         ("a/b/", "b"),
         ("a/f.a", "f"),
         ("a/", "a"),
         ("f.a/a.c", "a"),
         ("f.a/", "f.a"),
         ("tst", "tst"),
     ]
     for test_file, name in tests:
         result = fileutils.file_base_name(test_file)
         assert name == result
         # also test on location
         result = fileutils.file_base_name((os.path.join(test_dir, test_file)))
         assert name == result
Example #20
def get_html_app_files_dirs(output_file):
    """
    Return a tuple of (parent_dir, dir_name) where dir_name is named after the
    `output_file` file base name (stripped from its extension) with a `_files`
    suffix. Return None if output is to stdout.
    """
    file_name = output_file.name
    if file_name == "<stdout>":
        return
    parent_dir = dirname(file_name)
    dir_name = fileutils.file_base_name(file_name) + "_files"
    return parent_dir, dir_name
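A sketch of the resulting names, using a hypothetical object that has only the name attribute this function reads:

class FakeOutput:
    name = 'scans/report.html'

assert get_html_app_files_dirs(FakeOutput()) == ('scans', 'report_files')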
Example #21
def get_resource_info(location):
    """
    Return a mapping suitable for the creation of a new CodebaseResource.
    """
    file_info = {}

    location_path = Path(location)
    is_symlink = location_path.is_symlink()
    is_file = location_path.is_file()

    if is_symlink:
        resource_type = CodebaseResource.Type.SYMLINK
        file_info["status"] = "symlink"
    elif is_file:
        resource_type = CodebaseResource.Type.FILE
    else:
        resource_type = CodebaseResource.Type.DIRECTORY

    file_info.update(
        {
            "type": resource_type,
            "name": fileutils.file_base_name(location),
            "extension": fileutils.file_extension(location),
        }
    )

    if is_symlink:
        return file_info

    # Missing fields on CodebaseResource model returned by `get_file_info`.
    unsupported_fields = [
        "is_binary",
        "is_text",
        "is_archive",
        "is_media",
        "is_source",
        "is_script",
        "date",
    ]

    other_info = scancode_api.get_file_info(location)

    # Skip unsupported_fields
    # Skip empty values to avoid null vs. '' conflicts
    other_info = {
        field_name: value
        for field_name, value in other_info.items()
        if field_name not in unsupported_fields and value
    }

    file_info.update(other_info)

    return file_info
Example #24
def get_html_app_files_dirs(output_file):
    """
    Return a tuple of (parent_dir, dir_name) where dir_name is named after the
    `output_file` file base name (stripped from its extension) with a `_files`
    suffix. Return empty strings if output is to stdout.
    """
    if is_stdout(output_file):
        return '', ''

    file_name = output_file.name
    parent_dir = dirname(file_name)
    dir_name = fileutils.file_base_name(file_name) + '_files'
    return parent_dir, dir_name
Example #26
def load_licenses(licenses_data_dir=licenses_data_dir, with_deprecated=False):
    """
    Return a mapping of key -> license objects, loaded from license files.
    """
    licenses = {}
    for data_file in file_iter(licenses_data_dir):
        if not data_file.endswith('.yml'):
            continue
        key = file_base_name(data_file)
        lic = License(key, licenses_data_dir)
        if not with_deprecated and lic.is_deprecated:
            continue
        licenses[key] = lic
    return licenses
Example #28
    def parse(cls, location):
        with open(location, encoding='utf-8') as loc:
            readme_manifest = loc.read()

        package_data = build_package(readme_manifest)

        if not package_data.name:
            # If no name was detected for the Package, then we use the basename
            # of the parent directory as the Package name
            parent_dir = fileutils.parent_directory(location)
            parent_dir_basename = fileutils.file_base_name(parent_dir)
            package_data.name = parent_dir_basename

        yield package_data
Example #29
def parse(location):
    """
    Return a Package object from a package.json file or None.
    """
    if not is_package_json(location):
        return

    with codecs.open(location, encoding='utf-8') as loc:
        package_data = json.load(loc, object_pairs_hook=OrderedDict)

    # a package.json is at the root of an NPM package
    base_dir = fileutils.parent_directory(location)
    metafile_name = fileutils.file_base_name(location)
    return build_package(package_data, base_dir, metafile_name)
Example #30
def load_rules(rules_data_dir=rules_data_dir, load_notes=False):
    """
    Return an iterable of rules loaded from rule files.
    """
    # TODO: OPTIMIZE: create a graph of rules to account for containment and similarity clusters?
    # TODO: we should assign the rule id at that stage
    seen_files = set()
    processed_files = set()
    lower_case_files = set()
    case_problems = set()
    for data_file in file_iter(rules_data_dir):
        if data_file.endswith('.yml'):
            base_name = file_base_name(data_file)
            rule_file = join(rules_data_dir, base_name + '.RULE')
            yield Rule(data_file=data_file,
                       text_file=rule_file,
                       load_notes=load_notes)

            # accumulate sets to ensure we do not have illegal names or extra
            # orphaned files
            data_lower = data_file.lower()
            if data_lower in lower_case_files:
                case_problems.add(data_lower)
            else:
                lower_case_files.add(data_lower)

            rule_lower = rule_file.lower()
            if rule_lower in lower_case_files:
                case_problems.add(rule_lower)
            else:
                lower_case_files.add(rule_lower)

            processed_files.update([data_file, rule_file])

        if not data_file.endswith('~'):
            seen_files.add(data_file)

    unknown_files = seen_files - processed_files
    if unknown_files or case_problems:
        # use distinct variable names so that both messages can be formatted
        # from locals() and msg is always defined
        msg = ''
        if unknown_files:
            unknown = '\n'.join(sorted(unknown_files))
            msg = 'Orphaned files in rule directory: %(rules_data_dir)r\n%(unknown)s'

        if case_problems:
            dupes = '\n'.join(sorted(case_problems))
            msg += '\nRule files with non-unique name ignoring case in rule directory: %(rules_data_dir)r\n%(dupes)s'

        raise Exception(msg % locals())
Example #32
def is_special_legal_file(location):
    file_base_name = fileutils.file_base_name(location).lower()
    file_extension = fileutils.file_extension(location).lower()

    if (any(special_name == file_base_name or special_name == file_extension
            for special_name in special_names_lower)
            or any(special_name in file_base_name
                   or special_name in file_extension
                   for special_name in special_names)):
        return 'yes'

    elif any(special_name in file_base_name or special_name in file_extension
             for special_name in special_names_lower):
        return 'maybe'
    else:
        # return False for now?
        pass
Example #33
    def recognize(cls, location):
        """
        Yield one or more Package manifest objects given a file ``location`` pointing to a
        package archive, manifest or similar.
        """
        with open(location, encoding='utf-8') as loc:
            readme_manifest = loc.read()

        package = build_package(cls, readme_manifest)

        if not package.name:
            # If no name was detected for the Package, then we use the basename of
            # the parent directory as the Package name
            parent_dir = fileutils.parent_directory(location)
            parent_dir_basename = fileutils.file_base_name(parent_dir)
            package.name = parent_dir_basename

        yield package
Example #34
def new_name(location, is_dir=False):
    """
    Return a new non-existing location usable to write a file or create
    directory without overwriting existing files or directories in the same
    parent directory, ignoring the case of the name.
    The case of the name is ignored to ensure that similar results are returned
    across case sensitive (*nix) and case insensitive file systems.

    To find a new unique name:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and keep
       the extension unchanged.
    """
    assert location

    location = location.rstrip('\\/')
    name = fileutils.file_name(location).strip()
    if (not name or name == '.'
            # windows bare drive path as in c: or z:
            or (name and len(name) == 2 and name.endswith(':'))):
        name = 'file'

    parent = fileutils.parent_directory(location)
    # all existing files or directory as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))

    if name.lower() not in siblings_lower:
        return posixpath.join(parent, name)

    ext = fileutils.file_extension(name)
    base_name = fileutils.file_base_name(name)
    if is_dir:
        # directories have no extension
        ext = ''
        base_name = name

    counter = 1
    while True:
        new_name = base_name + '_' + str(counter) + ext
        if new_name.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, new_name)
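Expected behavior sketched on hypothetical paths, assuming 'report.txt' and 'build' already exist in /tmp/out (compared ignoring case):

# new_name('/tmp/out/report.txt')          -> '/tmp/out/report_1.txt'
# new_name('/tmp/out/build', is_dir=True)  -> '/tmp/out/build_1'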
Example #37
def parse_debian_files_list(location, datasource_id, package_type):
    """
    Yield PackageData from a list of file paths at a location, such as a
    Debian installed .list or .md5sums file.
    """
    qualifiers = {}
    filename = fileutils.file_base_name(location)
    if ':' in filename:
        name, _, arch = filename.partition(':')
        qualifiers['arch'] = arch
    else:
        name = filename

    file_references = []
    with open(location) as info_file:
        for line in info_file:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # for a plain file list, the md5sum will be empty
            md5sum, _, path = line.partition(' ')
            path = path.strip()
            md5sum = md5sum and md5sum.strip() or None

            # we ignore dirs in general, and in particular these known root
            # dirs that would otherwise be created as plain dirs
            if path in ignored_root_dirs:
                continue

            ref = models.FileReference(path=path, md5=md5sum)
            file_references.append(ref)

    if not file_references:
        return

    yield models.PackageData(
        datasource_id=datasource_id,
        type=package_type,
        name=name,
        qualifiers=qualifiers,
        file_references=file_references,
    )
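A hedged sample of the expected input: for a file named 'foo:amd64.md5sums' containing '<md5>  <path>' lines, this yields name='foo', qualifiers={'arch': 'amd64'} and one FileReference per non-ignored path; in a plain .list file the md5 part is absent and md5sum ends up None:

    d41d8cd98f00b204e9800998ecf8427e  usr/share/doc/foo/copyright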
Example #38
def is_special_legal_file(location):
    """
    Return an indication that a file may be a "special" legal-like file.
    """
    file_base_name = fileutils.file_base_name(location).lower()
    file_extension = fileutils.file_extension(location).lower()

    if (any(special_name == file_base_name or special_name == file_extension
            for special_name in special_names_lower)
            or any(special_name in file_base_name
                   or special_name in file_extension
                   for special_name in special_names)):
        return 'yes'

    elif any(special_name in file_base_name or special_name in file_extension
             for special_name in special_names_lower):
        return 'maybe'
    else:
        # return False for now?
        pass
Example #39
def parse(location):
    """
    Return a Package object from a README manifest file or None.
    """
    if not is_readme_manifest(location):
        return

    with open(location, encoding='utf-8') as loc:
        readme_manifest = loc.read()

    package = build_package(readme_manifest)

    if not package.name:
        # If no name was detected for the Package, then we use the basename of
        # the parent directory as the Package name
        parent_dir = fileutils.parent_directory(location)
        parent_dir_basename = fileutils.file_base_name(parent_dir)
        package.name = parent_dir_basename

    return package
Example #41
def rules(rule_dir=rules_data_dir):
    """
    Return an iterable of rules loaded from rules files.
    """
    # TODO: OPTIMIZE: break RULES with gaps in multiple sub-rules??
    # TODO: OPTIMIZE: create a graph of rules to account for containment and similarity clusters?
    seen_files = set()
    processed_files = set()
    for top, _, files in walk(rule_dir):
        for yfile in files:
            # track all files so that orphaned files can be detected below
            seen_files.add(join(top, yfile))
            if not yfile.endswith('.yml'):
                continue
            data_file = join(top, yfile)
            base_name = file_base_name(yfile)
            text_file = join(top, base_name + '.RULE')
            processed_files.add(data_file)
            processed_files.add(text_file)
            yield Rule(data_file=data_file, text_file=text_file)

    unknown_files = seen_files - processed_files
    if unknown_files:
        print(unknown_files)
        files = '\n'.join(sorted(unknown_files))
        msg = 'Unknown files in rule directory: %(rule_dir)r\n%(files)s'
        raise Exception(msg % locals())
Example #42
 def get_test_method_name(self):
     dfn = fileutils.file_base_name(self.data_file.lower())
     test_name = f'test_alpine_license_detection_{dfn}'
     return text.python_safe_name(test_name)
Example #43
    def load_manifest(self, repo_dir):
        """
        Load this repository from a "manifest.json" JSON file for format v1.1/1.2.

        This file is a mapping with this shape:

        - The `Config` field references another JSON file in the tar or repo which
          includes the image data for this image.

        - The `RepoTags` field lists references pointing to this image.

        - The `Layers` field points to the filesystem changeset tars, e.g. the path
         to the layer.tar files as a list ordered from bottom to top layer.

        - An optional `Parent` field references the imageID (as a sha256-prefixed
         digest?) of the parent image. This parent must be part of the same
         `manifest.json` file.

        For example:

        [
            {'Config': '7043867122e704683c9eaccd7e26abcd5bc9fea413ddfeae66166697bdcbde1f.json',
             'Layers': [
                 '768d4f50f65f00831244703e57f64134771289e3de919a576441c9140e037ea2/layer.tar',
                 '6a630e46a580e8b2327fc45d9d1f4734ccaeb0afaa094e0f45722a5f1c91e009/layer.tar',
                 ],
             'RepoTags': ['user/image:version'],
             "Parent": "sha256:5a00e6ccb81ef304e1bb9995ea9605f199aa96659a44237d58ca96982daf9af8"
             },

            {'Config': '7043867122e704683c9eaccd7e26abcd5bc9fea413ddfeae66166697bdcbde1f.json',
             'Layers': [
                 '768d4f50f65f00831244703e57f64134771289e3de919a576441c9140e037ea2/layer.tar',
                 '6a630e46a580e8b2327fc45d9d1f4734ccaeb0afaa094e0f45722a5f1c91e009/layer.tar',
                 ],
             'RepoTags': ['user/image:version']
             },
        ]
        """
        manifest_file = join(repo_dir, MANIFEST_JSON_FILE)
        manifest = load_json(manifest_file)

        for image_config in manifest:
            config_file = image_config.get('Config')

            config_file = join(repo_dir, config_file)
            if not exists(config_file):
                # FIXME: orphaned manifest entry
                image_id = file_base_name(config_file)
                image = Image(image_id=image_id)
                assert image.image_id not in self.images_by_id
                self.images_by_id[image.image_id] = image
                continue

            image = Image.load_image_config(config_file)
            assert image.image_id not in self.images_by_id
            self.images_by_id[image.image_id] = image

            image.parent_digest = image_config.get('Parent')

            image.tags = image_config.get('RepoTags') or []
            for tag in image.tags:
                self.image_id_by_tags[tag] = image.image_id

            layer_paths = image_config.get('Layers') or []
            layers = OrderedDict()
            for lp in layer_paths:
                layer_dir = fileutils.parent_directory(lp).strip('/')
                layer_id = layer_dir
                layer_dir = join(repo_dir, layer_dir)
                layer = Layer.load_layer(layer_dir)
                layer_digest = sha256_digest(join(repo_dir, lp))
                if layer.layer_digest:
                    assert layer.layer_digest == layer_digest
                layers[layer_id] = layer
                self.layers_by_id[layer_id] = layer

            # the last one is the top one
            image.top_layer_id = layer_id
            image.top_layer_digest = layer_digest
Example #44
    def load_image_config(cls, config_file, verbose=True):
        """
        Return an Image object built from the image_config JSON file at location.

        Each Config JSON file for each image has this shape:
        {
            'docker_version': '1.8.2',
            'os': 'linux',
            'architecture': 'amd64',
            'author': '<author name>',
            'created': '2016-09-30T10:16:27.109917034Z',
            'container': '1ee508bc7a35150c9e5924097a31dfb4b6b2ca1260deb6fd14cb03c53764e40b',
            # these two mappings are essentially similar: config is the
            # runtime config and container_config is the config as it existed
            # when the container was created.
            'config': { <some config k/v pairs> },
            'container_config': { <some config k/v pairs> },
            # array of objects describing the history of each layer.
            # The array is ordered from bottom-most layer to top-most layer.

            'history': [
                {'author': 'The CentOS Project <*****@*****.**> - ami_creator',
                 'created': '2015-04-22T05:12:47.171582029Z',
                 'created_by': '/bin/sh -c #(nop) MAINTAINER The CentOS Project <*****@*****.**> - ami_creator',
                 'comment': 'some comment (eg a commit message)',
                 'empty_layer': True or False (if not present, defaults to False);
                                True for empty, no-op layers with no rootfs content.
                },

                {'author': 'The CentOS Project <*****@*****.**> - ami_creator',
                 'created': '2015-04-22T05:13:47.072498418Z',
                 'created_by': '/bin/sh -c #(nop) ADD file:eab3c29917290b056db08167d3a9f769c4b4ce46403be2fad083bc2535fb4d03 in /'
                },
            ]
            # this is in order from bottom-most to top-most
            # each id is the sha256 of a layer.tar
            # NOTE: empty layers may NOT have their digest listed here, so this
            # list may not align exactly with the history list: it only has
            # entries for layers where "empty_layer" is not set to True.
            'rootfs': {
                'diff_ids': ['sha256:5f70bf18a086007016e948b04aed3b82103a36bea41755b6cddfaf10ace3c6ef',
                             'sha256:2436bc321ced91d2f3052a98ff886a2feed0788eb524b2afeb48099d084c33f5',
                             'sha256:cd141a5beb0ec83004893dfea6ea8508c6d09a0634593c3f35c0d433898c9322',],
                'type': u'layers'
            }
        }
        """

        image_id = fileutils.file_base_name(config_file)
        config_digest = sha256_digest(config_file)
        if image_id != as_bare_id(config_digest):
            print('WARNING: image config digest is not consistent.')
            config_digest = 'sha256:' + image_id

        image_config = load_json(config_file)

        # merge "configs"
        ccnf = image_config.pop('container_config', {})
        cnf = image_config.pop('config', {})
        config, warns = merge_configs(ccnf, cnf)

        if warns and verbose:
            print('Warning when loading: %(config_file)r' % locals())
            for w in warns:
                print(w)

        rootfs = image_config.pop('rootfs')
        # we only support this for now
        assert rootfs['type'] == 'layers'
        digests = rootfs['diff_ids']
        digests_it = iter(digests)

        # FIXME: this may not work if there is a diff for an empty layer with a
        # digest for some EMPTY content e.g. e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855

        # update layer data with digests (e.g. align history and diff digests,
        # skipping empty layers that have no digest)
        layers = image_config.pop('history')
        for lay in layers:
            if lay.get('empty_layer'):
                continue
            lay['layer_digest'] = next(digests_it)

        remaining = list(digests_it)
        assert not remaining

        layers = [Layer(**l) for l in layers]
        image_data = dict(
            image_id=image_id,
            layers=layers,
            config_digest=config_digest,
            top_layer_digest=layers[-1].layer_digest,
            top_layer_id=layers[-1].layer_id,
            config=config,
        )
        image_data.update(image_config)

        image = Image(**image_data)

        return image
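The naming convention assumed at the top of this method: each Config JSON file is named after the bare sha256 digest of its own content, so file_base_name only strips the .json extension to recover the image id:

# file_base_name('7043867122e704683c9eaccd7e26abcd5bc9fea413ddfeae66166697bdcbde1f.json')
# -> '7043867122e704683c9eaccd7e26abcd5bc9fea413ddfeae66166697bdcbde1f'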