def magic_open(filename, verbose=False, cpus=None): """ To read uncompressed zip gzip bzip2 or tar.xx files :param filename: either a path to a file, or a file handler :returns: opened file ready to be iterated """ textchars = bytearray({7,8,9,10,12,13,27} | set(range(0x20, 0x100)) - {0x7f}) is_binary_string = lambda bytes: bool(bytes.translate(None, textchars)) if isinstance(filename, basestring) or isinstance(filename, basestring): fhandler = open(filename, 'rb') inputpath = True if tarfile.is_tarfile(filename): print('tar') thandler = tarfile.open(filename) if len(thandler.members) != 1: raise NotImplementedError( 'Not exactly one file in this tar archieve.') return magic_open(thandler.extractfile(thandler.getnames()[0])) else: fhandler = filename filename = fhandler.name inputpath = False start_of_file = '' if filename.endswith('.dsrc'): dsrc_binary = which('dsrc') if not dsrc_binary: raise Exception('\n\nERROR: DSRC binary not found, install it from:' '\nhttps://github.com/lrog/dsrc/releases') proc = Popen([dsrc_binary, 'd', '-t%d' % (cpus or cpu_count()), '-s', filename], stdout=PIPE, universal_newlines=True) return proc.stdout if inputpath: start_of_file = fhandler.read(1024) fhandler.seek(0) if is_binary_string(start_of_file): if start_of_file.startswith(b'\x50\x4b\x03\x04'): if verbose: print('zip') zhandler = TextIOWrapper(zipfile.ZipFile(fhandler)) if len(zhandler.NameToInfo) != 1: raise NotImplementedError( 'Not exactly one file in this zip archieve.') return TextIOWrapper(BufferedReader(zhandler.open(list(zhandler.NameToInfo.keys())[0]))) if is_binary_string(start_of_file) and start_of_file.startswith(b'\x42\x5a\x68'): if verbose: print('bz2') fhandler.close() return TextIOWrapper(BufferedReader(bz2.BZ2File(filename))) if is_binary_string(start_of_file) and start_of_file.startswith(b'\x1f\x8b\x08'): if verbose: print('gz') return TextIOWrapper(BufferedReader(gzip.GzipFile(fileobj=fhandler))) else: if verbose: print('text') fhandler.close() fhandler = open(filename, 'r') return fhandler