def find_corpus_fileids(root, regexp):
    """Return the sorted fileids under ``root`` whose path matches ``regexp``.

    Fileids are paths relative to ``root`` using ``/`` as the separator.
    The pattern is applied with ``re.match`` after ``"$"`` is appended, so
    it is anchored at the start of the fileid and (for a pattern without
    top-level alternation) at its end.  NOTE: with a top-level ``a|b``
    pattern only the last alternative is end-anchored; that long-standing
    behavior is preserved here.

    :param root: A ``ZipFilePathPointer`` or ``FileSystemPathPointer``
        identifying the corpus root.
    :param regexp: Regular expression (string) selecting the fileids.
    :return: Sorted list of matching fileids.
    :raises TypeError: If ``root`` is not a ``PathPointer``.
    :raises AssertionError: If ``root`` is a ``PathPointer`` of a kind
        this function does not handle.
    """
    if not isinstance(root, PathPointer):
        raise TypeError("find_corpus_fileids: expected a PathPointer")

    # Compile once instead of re-resolving the pattern for every file.
    pattern = re.compile(regexp + "$")

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        entry = root.entry
        # Only members located under ``entry`` belong to this corpus;
        # stripping the prefix from any other member would produce a
        # meaningless fileid that could still match the pattern.
        fileids = [
            name[len(entry):]
            for name in root.zipfile.namelist()
            if name.startswith(entry) and not name.endswith("/")
        ]
        return sorted(name for name in fileids if pattern.match(name))

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        # workaround for py25 which doesn't support followlinks
        kwargs = {}
        if not py25():
            kwargs = {"followlinks": True}
        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
            # presumably _path_from yields the path components of
            # ``dirname`` relative to ``root.path`` -- confirm in helper.
            prefix = "".join("%s/" % p for p in _path_from(root.path, dirname))
            items.extend(
                prefix + fileid
                for fileid in fileids
                if pattern.match(prefix + fileid)
            )
            # Don't visit svn directories: pruning ``subdirs`` in place
            # stops os.walk (top-down) from descending into them.
            if ".svn" in subdirs:
                subdirs.remove(".svn")
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)
def find_corpus_fileids(root, regexp):
    """List, in sorted order, the fileids below ``root`` that match ``regexp``.

    A fileid is the path of a file relative to ``root``, written with
    ``/`` separators.  ``'$'`` is appended to ``regexp`` and the result is
    applied with ``re.match``, so matching starts at the beginning of the
    fileid.  NOTE: if ``regexp`` contains a top-level ``|``, only its last
    alternative ends up anchored at the end; this behavior is unchanged.

    :param root: The corpus root, either a ``ZipFilePathPointer`` or a
        ``FileSystemPathPointer``.
    :param regexp: Regular expression string used to select fileids.
    :return: A sorted list of the matching fileids.
    :raises TypeError: When ``root`` is not a ``PathPointer``.
    :raises AssertionError: When ``root`` is some other ``PathPointer``
        subtype that is not handled here.
    """
    if not isinstance(root, PathPointer):
        raise TypeError('find_corpus_fileids: expected a PathPointer')

    # The anchored pattern is loop-invariant, so build it a single time.
    matcher = re.compile(regexp + '$').match

    # Find fileids in a zipfile: scan the zipfile's namelist.  Filter
    # out entries that end in '/' -- they're directories.
    if isinstance(root, ZipFilePathPointer):
        base = root.entry
        skip = len(base)
        items = []
        for name in root.zipfile.namelist():
            # Ignore archive members outside ``base``: slicing them by
            # ``len(base)`` would yield bogus fileids.
            if not name.startswith(base) or name.endswith('/'):
                continue
            fileid = name[skip:]
            if matcher(fileid):
                items.append(fileid)
        return sorted(items)

    # Find fileids in a directory: use os.walk to search all (proper
    # or symlinked) subdirectories, and match paths against the regexp.
    elif isinstance(root, FileSystemPathPointer):
        items = []
        # workaround for py25 which doesn't support followlinks
        kwargs = {}
        if not py25():
            kwargs = {'followlinks': True}
        for dirname, subdirs, fileids in os.walk(root.path, **kwargs):
            # presumably _path_from returns the components of ``dirname``
            # relative to ``root.path`` -- verify against the helper.
            prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname))
            for fileid in fileids:
                candidate = prefix + fileid
                if matcher(candidate):
                    items.append(candidate)
            # Don't visit svn directories: removing the entry in place
            # keeps the top-down os.walk from descending into it.
            if '.svn' in subdirs:
                subdirs.remove('.svn')
        return sorted(items)

    else:
        raise AssertionError("Don't know how to handle %r" % root)