def test_file_extension_on_path_and_location_10(self):
    """A bare file name with no dot has an empty extension, whether given
    as a plain path or as a full on-disk location."""
    test_dir = self.get_test_loc('fileutils/basename')
    test_file = 'tst'
    expected_name = ''
    # plain path string
    assert expected_name == fileutils.file_extension(test_file)
    # same name anchored at a real location
    location = os.path.join(test_dir, test_file)
    assert expected_name == fileutils.file_extension(location)
def get_file_infos(location):
    """
    Return a list of dictionaries of informations collected from the file
    or directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    ctype = contenttype.get_type(location)
    is_file = ctype.is_file
    is_dir = ctype.is_dir

    # NOTE: the `(value or default) if is_file else default` shape reproduces
    # the original `is_file and value or default` idiom: any falsy value
    # (including False) collapses to the default.
    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = (fileutils.file_extension(location) or '') if is_file else ''
    infos['date'] = (filetype.get_last_modified_date(location) or None) if is_file else None
    infos['size'] = ctype.size
    infos['sha1'] = (sha1(location) or None) if is_file else None
    infos['md5'] = (md5(location) or None) if is_file else None
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = (ctype.mimetype_file or None) if is_file else None
    infos['file_type'] = (ctype.filetype_file or None) if is_file else None
    infos['programming_language'] = (ctype.programming_language or None) if is_file else None
    infos['is_binary'] = (ctype.is_binary or None) if is_file else None
    infos['is_text'] = (ctype.is_text or None) if is_file else None
    infos['is_archive'] = (ctype.is_archive or None) if is_file else None
    infos['is_media'] = (ctype.is_media or None) if is_file else None
    infos['is_source'] = (ctype.is_source or None) if is_file else None
    infos['is_script'] = (ctype.is_script or None) if is_file else None
    return [infos]
def from_name(filename):
    """
    Return an (E, N, V, R, A) tuple given a file name, by splitting
    [e:]name-version-release.arch into the four possible subcomponents.
    Default epoch, version, release and arch to None if not specified.
    Accepts RPM names with and without extensions
    """
    nevra_matcher = re.compile(r"^(.*)-([^-]*)-([^-]*)\.([^.]*)$").match

    # strip a known RPM extension before splitting the name
    file_ext = fileutils.file_extension(filename) or None
    if file_ext in ('.rpm', '.srpm'):
        filename = filename[:-len(file_ext)]

    match = nevra_matcher(filename)
    if not match:
        return None

    name, version, release, arch = match.groups()
    if file_ext == '.srpm':
        # source RPMs conventionally use the 'src' arch
        arch = 'src'

    if ':' not in version:
        # no epoch prefix present
        return None, name, version, release, arch
    epoch, version = version.split(':', 1)
    return (int(epoch), name, version, release, arch)
def is_special_legal_file(location):
    """
    Return an indication that a file may be a "special" legal-like file.
    """
    base_name = fileutils.file_base_name(location)
    base_name_lower = base_name.lower()
    extension = fileutils.file_extension(location)
    extension_lower = extension.lower()

    # case-sensitive substring match against the special names
    if any(s in base_name or s in extension for s in special_names):
        return 'yes'
    # case-insensitive exact match on base name or extension
    if any(s == base_name_lower or s == extension_lower
           for s in special_names_lower):
        return 'yes'
    # only a case-insensitive substring match: weaker signal
    if any(s in base_name_lower or s in extension_lower
           for s in special_names_lower):
        return 'maybe'
    # no match: fall through and return None
def get_pygments_lexer(location):
    """
    Given an input file location, return a Pygments lexer appropriate for
    lexing this file content.

    Return None for binary files and for files whose type cannot be
    determined by filename or content.
    """
    try:
        # consult the cached type registry first; binary files get no lexer
        T = _registry[location]
        if T.is_binary:
            return
    except KeyError:
        # not cached: fall back to a direct binary check
        if is_binary(location):
            return
    # We first try to get a lexer using
    # - the filename
    # - then the lowercased filename
    # - and finally the begining of the file content.
    # We try with lowercase as detection is skewed otherwise (e.g. .java vs .JAVA)
    try:
        return get_lexer_for_filename(location)
    except LexerClassNotFound:
        try:
            return get_lexer_for_filename(location.lower())
        except LexerClassNotFound:
            # only try content-based detection if we do not have an extension
            ext = fileutils.file_extension(location)
            if not ext:
                try:
                    # if Pygments does not guess we should not carry forward
                    content = get_text_file_start(location)
                    return guess_lexer(content)
                except LexerClassNotFound:
                    return
def get_file_infos(location):
    """
    Return a list of dictionaries of informations collected from the file
    or directory at location.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import sha1, md5
    from typecode import contenttype

    type_info = contenttype.get_type(location)
    is_file = type_info.is_file
    is_dir = type_info.is_dir

    def for_file(compute, default=None):
        """Evaluate compute() only for files, collapsing falsy results to
        the default (same behavior as `is_file and value or default`)."""
        if not is_file:
            return default
        return compute() or default

    infos = OrderedDict()
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    infos['extension'] = for_file(lambda: fileutils.file_extension(location), '')
    infos['date'] = for_file(lambda: filetype.get_last_modified_date(location))
    infos['size'] = type_info.size
    infos['sha1'] = for_file(lambda: sha1(location))
    infos['md5'] = for_file(lambda: md5(location))
    infos['files_count'] = (filetype.get_file_count(location) or None) if is_dir else None
    infos['mime_type'] = for_file(lambda: type_info.mimetype_file)
    infos['file_type'] = for_file(lambda: type_info.filetype_file)
    infos['programming_language'] = for_file(lambda: type_info.programming_language)
    infos['is_binary'] = for_file(lambda: type_info.is_binary)
    infos['is_text'] = for_file(lambda: type_info.is_text)
    infos['is_archive'] = for_file(lambda: type_info.is_archive)
    infos['is_media'] = for_file(lambda: type_info.is_media)
    infos['is_source'] = for_file(lambda: type_info.is_source)
    infos['is_script'] = for_file(lambda: type_info.is_script)
    return [infos]
def is_c_source(self):
    """Return True if this is a text file whose extension is a known
    C-family source/header extension, False otherwise."""
    c_like_extensions = {
        '.c', '.cc', '.cp', '.cpp', '.cxx', '.c++',
        '.h', '.hh', '.hpp', '.hxx', '.h++',
        '.s', '.asm', '.i', '.ii', '.m',
    }
    extension = fileutils.file_extension(self.location)
    return self.is_text is True and extension.lower() in c_like_extensions
def test_file_extension_on_path_and_location(self):
    """file_extension() returns the trailing extension (or '') the same way
    for relative paths and for real on-disk locations."""
    test_dir = self.get_test_loc("fileutils/basename", copy=True)
    expectations = [
        ("a/.a/file", ""),
        ("a/.a/", ""),
        ("a/b/.a.b", ".b"),
        ("a/b/a.tag.gz", ".gz"),
        ("a/b/", ""),
        ("a/f.a", ".a"),
        ("a/", ""),
        ("f.a/a.c", ".c"),
        ("f.a/", ""),
        ("tst", ""),
    ]
    for path, expected in expectations:
        assert expected == fileutils.file_extension(path)
        # also test on location
        assert expected == fileutils.file_extension(os.path.join(test_dir, path))
def is_c_source(self):
    """
    Return True if this is a text file whose extension is a known C-family
    source/header extension, False otherwise.
    """
    C_EXTENSIONS = set(
        ['.c', '.cc', '.cp', '.cpp', '.cxx', '.c++', '.h', '.hh', '.s',
         '.asm', '.hpp', '.hxx', '.h++', '.i', '.ii', '.m'])
    if on_linux and py2:
        # NOTE(review): as_bytes is applied to the whole set here —
        # presumably it maps over the iterable so the byte extensions
        # compare against byte paths on Linux/Python 2; confirm against
        # the as_bytes implementation.
        C_EXTENSIONS = set(as_bytes(C_EXTENSIONS))
    ext = fileutils.file_extension(self.location)
    if self.is_text is True and ext.lower() in C_EXTENSIONS:
        return True
    else:
        return False
def get_resource_info(location):
    """
    Return a mapping suitable for the creation of a new CodebaseResource.
    """
    location_path = Path(location)
    is_symlink = location_path.is_symlink()
    is_file = location_path.is_file()

    file_info = {}
    if is_symlink:
        # "status" is recorded first so it precedes the common fields
        file_info["status"] = "symlink"
        resource_type = CodebaseResource.Type.SYMLINK
    elif is_file:
        resource_type = CodebaseResource.Type.FILE
    else:
        resource_type = CodebaseResource.Type.DIRECTORY

    file_info["type"] = resource_type
    file_info["name"] = fileutils.file_base_name(location)
    file_info["extension"] = fileutils.file_extension(location)

    if is_symlink:
        # no further info is collected for symlinks
        return file_info

    # Fields returned by `get_file_info` that have no counterpart on the
    # CodebaseResource model.
    unsupported_fields = {
        "is_binary",
        "is_text",
        "is_archive",
        "is_media",
        "is_source",
        "is_script",
        "date",
    }
    other_info = scancode_api.get_file_info(location)
    # Skip unsupported fields and empty values (avoids null vs. '' conflicts).
    for field_name, value in other_info.items():
        if field_name not in unsupported_fields and value:
            file_info[field_name] = value
    return file_info
def test_file_extension_on_path_and_location(self):
    """Verify file_extension() for a range of paths with and without
    extensions, dot-files and trailing slashes, on both plain paths and
    real locations."""
    test_dir = self.get_test_loc('fileutils/basename', copy=True)
    cases = (
        ('a/.a/file', ''),
        ('a/.a/', ''),
        ('a/b/.a.b', '.b'),
        ('a/b/a.tag.gz', '.gz'),
        ('a/b/', ''),
        ('a/f.a', '.a'),
        ('a/', ''),
        ('f.a/a.c', '.c'),
        ('f.a/', ''),
        ('tst', ''),
    )
    for test_path, expected_ext in cases:
        # plain path string
        assert expected_ext == fileutils.file_extension(test_path)
        # same check anchored at the test location on disk
        on_disk = os.path.join(test_dir, test_path)
        assert expected_ext == fileutils.file_extension(on_disk)
def is_special_legal_file(location):
    # Compare the lowercased base name and extension against the known
    # special legal names, both exactly and as substrings.
    base_name = fileutils.file_base_name(location).lower()
    extension = fileutils.file_extension(location).lower()

    exact_lower_match = any(
        s == base_name or s == extension for s in special_names_lower)
    cased_substring_match = any(
        s in base_name or s in extension for s in special_names)
    if exact_lower_match or cased_substring_match:
        return 'yes'

    if any(s in base_name or s in extension for s in special_names_lower):
        return 'maybe'
    # no match: fall through and return None
def check_get_extractors(self, test_file, expected, kinds=()):
    """Assert that archive.get_extractors returns the `expected` extractor
    functions for `test_file`, optionally restricted to the given `kinds`."""
    from extractcode import archive

    test_loc = self.get_test_loc(test_file)
    if kinds:
        extractors = archive.get_extractors(test_loc, kinds)
    else:
        extractors = archive.get_extractors(test_loc)

    # fe/em feed the failure message below via locals()
    fe = fileutils.file_extension(test_loc).lower()
    em = ', '.join(e.__module__ + '.' + e.__name__ for e in extractors)
    msg = ('%(expected)r == %(extractors)r for %(test_file)s\n'
           'with fe:%(fe)r, em:%(em)s' % locals())
    assert expected == extractors, msg
def new_name(location, is_dir=False):
    """
    Return a new non-existing location usable to write a file or create
    directory without overwriting existing files or directories in the same
    parent directory, ignoring the case of the name.
    The case of the name is ignored to ensure that similar results are
    returned across case sensitive (*nix) and case insensitive file systems.

    To find a new unique name:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and
       keep the extension unchanged.
    """
    assert location
    location = location.rstrip('\\/')
    name = fileutils.file_name(location).strip()
    if (not name or name == '.'
        # windows bare drive path as in c: or z:
        or (name and len(name) == 2 and name.endswith(':'))):
        name = 'file'

    parent = fileutils.parent_directory(location)
    # all existing files or directories, lowercased for case-insensitive checks
    siblings_lower = set(s.lower() for s in os.listdir(parent))

    if name.lower() not in siblings_lower:
        # FIX: use os.path.join (was posixpath.join) so the native path
        # separator is used, consistent with the return at the end of this
        # function; posixpath produced mixed separators on Windows.
        return os.path.join(parent, name)

    ext = fileutils.file_extension(name)
    base_name = fileutils.file_base_name(name)
    if is_dir:
        # directories have no extension
        ext = ''
        base_name = name

    # pad with an incrementing counter until a (case-insensitively) unique
    # name is found
    counter = 1
    while True:
        new_name = base_name + '_' + str(counter) + ext
        if new_name.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, new_name)
def new_name(location, is_dir=False):
    """
    Return a new non-existing location usable to write a file or create
    directory without overwriting existing files or directories in the same
    parent directory, ignoring the case of the name.
    The case of the name is ignored to ensure that similar results are returned
    across case sensitive (*nix) and case insensitive file systems.

    To find a new unique name:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and keep
       the extension unchanged.
    """
    assert location
    # drop any trailing slashes/backslashes so the last segment is the name
    location = location.rstrip('\\/')
    name = fileutils.file_name(location).strip()
    if (not name or name == '.'
        # windows bare drive path as in c: or z:
        or (name and len(name) == 2 and name.endswith(':'))):
        name = 'file'

    parent = fileutils.parent_directory(location)

    # all existing files or directory as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))

    if name.lower() not in siblings_lower:
        # NOTE(review): this branch joins with posixpath.join while the
        # return at the end uses os.path.join — on Windows this yields
        # mixed separators; confirm whether this is intentional.
        return posixpath.join(parent, name)

    ext = fileutils.file_extension(name)
    base_name = fileutils.file_base_name(name)
    if is_dir:
        # directories have no extension
        ext = ''
        base_name = name

    # pad with an incrementing counter until the padded name does not
    # collide (case-insensitively) with an existing sibling
    counter = 1
    while True:
        new_name = base_name + '_' + str(counter) + ext
        if new_name.lower() not in siblings_lower:
            break
        counter += 1
    return os.path.join(parent, new_name)
def get_file_infos(location, as_list=True):
    """
    Return a list of dictionaries of informations collected from the file
    or directory at location.

    If `as_list` is True (the default), wrap the mapping in a one-item
    list; otherwise return the mapping itself.
    """
    from commoncode import fileutils
    from commoncode import filetype
    from commoncode.hash import multi_checksums
    from typecode import contenttype
    # FIX: removed unused function-local import `from scancode import utils`

    infos = OrderedDict()
    is_file = filetype.is_file(location)
    is_dir = filetype.is_dir(location)

    T = contenttype.get_type(location)
    infos['type'] = filetype.get_type(location, short=False)
    infos['name'] = fileutils.file_name(location)
    # `is_file and value or default`: falsy values collapse to the default
    infos['extension'] = is_file and fileutils.file_extension(location) or ''
    infos['date'] = is_file and filetype.get_last_modified_date(location) or None
    infos['size'] = T.size
    infos.update(multi_checksums(location, ('sha1', 'md5',)))
    infos['files_count'] = is_dir and filetype.get_file_count(location) or None
    infos['mime_type'] = is_file and T.mimetype_file or None
    infos['file_type'] = is_file and T.filetype_file or None
    infos['programming_language'] = is_file and T.programming_language or None
    infos['is_binary'] = bool(is_file and T.is_binary)
    infos['is_text'] = bool(is_file and T.is_text)
    infos['is_archive'] = bool(is_file and T.is_archive)
    infos['is_media'] = bool(is_file and T.is_media)
    infos['is_source'] = bool(is_file and T.is_source)
    infos['is_script'] = bool(is_file and T.is_script)

    if as_list:
        return [infos]
    return infos
def is_special_legal_file(location):
    """
    Return an indication that a file may be a "special" legal-like file.
    """
    base_name = fileutils.file_base_name(location).lower()
    extension = fileutils.file_extension(location).lower()

    def any_match(names, exact):
        """True if any name matches base_name or extension, either exactly
        or as a substring."""
        if exact:
            return any(n == base_name or n == extension for n in names)
        return any(n in base_name or n in extension for n in names)

    if any_match(special_names_lower, exact=True) or any_match(special_names, exact=False):
        return 'yes'
    if any_match(special_names_lower, exact=False):
        return 'maybe'
    # no match: fall through and return None
def from_name(filename):
    """
    Return an (E, N, V, R, A) tuple given a file name, by splitting
    [e:]name-version-release.arch into the four possible subcomponents.
    Default epoch, version, release and arch to None if not specified.
    Accepts RPM names with and without extensions
    """
    # raw string avoids the deprecated "\." escape in a plain string literal
    _re = re.compile(r"^(.*)-([^-]*)-([^-]*)\.([^.]*)$")
    file_ext = fileutils.file_extension(filename) or None
    # FIX: was ['.rpm', 'srpm'] — the missing leading dot meant a '.srpm'
    # extension was never stripped before parsing (file_extension returns
    # the extension with its dot, as seen in the '.srpm' comparison below).
    if file_ext in ('.rpm', '.srpm'):
        filename = filename[:-len(file_ext)]
    m = _re.match(filename)
    if not m:
        return None
    n, v, r, a = m.groups()
    if file_ext == '.srpm':
        # source RPMs conventionally carry the 'src' arch
        a = 'src'
    if ':' not in v:
        # no epoch prefix in the version
        return None, n, v, r, a
    e, v = v.split(':', 1)
    e = int(e)
    return (e, n, v, r, a)
def is_markup(location):
    """Return True if the file at location has a known markup extension."""
    file_ext = fileutils.file_extension(location)
    return file_ext in extensions
def is_c_source(self):
    """Return True if this is a text file whose extension is in the
    C-family extension set, False otherwise."""
    extension = fileutils.file_extension(self.location).lower()
    return self.is_text is True and extension in C_EXTENSIONS