def read_df(file, nrows=None):
    """Read ``file`` into a DataFrame, picking the reader from the file name.

    Dispatch is purely name based:

    * ``*.pickle`` is handed to the module's ``load`` helper (``nrows`` is
      ignored on this path).
    * ``*.7z`` is opened through ``libarchive`` and parsed as CSV.
    * ``*.zip`` must contain exactly one member, which is parsed as CSV.
    * anything else goes straight to ``pd.read_csv``, with gzip
      decompression when the name ends in ``.gz``.

    The separator is a tab when ``'.tsv'`` occurs anywhere in the name
    (so ``data.tsv.gz`` works), otherwise a comma.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).

    Raises:
        Exception: If a ``.zip`` archive does not hold exactly one member.
    """
    start('reading dataframe: ' + file)
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Coerce once so every CSV branch passes pandas an integer limit
        # (previously only the plain-file branch did).
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith('.zip'):
            import zipfile
            # The context manager closes the archive handle even when the
            # member check or the CSV parse raises.
            with zipfile.ZipFile(file) as zf:
                members = zf.infolist()
                if len(members) != 1:
                    raise Exception('zip files with multiple files not supported')
                with zf.open(members[0].filename) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop('done reading dataframe')
    return df
def read_df(file, nrows=None, sheetname=None, header=0):
    """Read ``file`` into a DataFrame, picking the reader from the file name.

    Dispatch is purely name based:

    * ``*.pickle`` is handed to the module's ``load`` helper (``nrows`` is
      ignored on this path).
    * ``*.xls`` / ``*.xlsx`` go through ``pd.read_excel``; a row limit is
      emulated by skipping rows at the bottom of the sheet.
    * ``*.7z`` is opened through ``libarchive`` and parsed as CSV.
    * ``*.zip`` must contain exactly one member, which is parsed as CSV.
    * anything else goes straight to ``pd.read_csv``, with gzip
      decompression when the name ends in ``.gz``.

    For the CSV paths the separator is a tab when ``'.tsv'`` occurs
    anywhere in the name, otherwise a comma.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.
        sheetname: Excel only. Sheet name forwarded to ``pd.read_excel``;
            when ``None`` the first sheet is used to size the row limit.
        header: Excel only. Forwarded to ``pd.read_excel``.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).

    Raises:
        Exception: If a ``.zip`` archive does not hold exactly one member.
    """
    start('reading dataframe 2: ' + file)
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Coerce once so every branch works with an integer limit
        # (previously only the plain-file branch did).
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(('.xls', '.xlsx')):
            skip_footer = 0
            if nrows is not None:
                xl = pd.ExcelFile(file)
                if sheetname is not None:
                    sheet = xl.book.sheet_by_name(sheetname)
                else:
                    sheet = xl.book.sheet_by_index(0)
                # Clamp at zero: asking for more rows than the sheet holds
                # must not produce a negative footer count.
                # NOTE(review): sheet.nrows presumably includes the header
                # row, so with header=0 this keeps nrows - 1 data rows --
                # confirm whether that is intended.
                skip_footer = max(sheet.nrows - nrows, 0)
            # NOTE(review): ``sheetname`` / ``skip_footer`` are the older
            # pandas keyword spellings; kept as-is to match whatever pandas
            # version this file targets. sheetname=None is forwarded
            # unchanged -- confirm it selects the first sheet there.
            df = pd.read_excel(file, sheetname=sheetname, skip_footer=skip_footer, header=header)
        elif file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith('.zip'):
            import zipfile
            # The context manager closes the archive handle even when the
            # member check or the CSV parse raises.
            with zipfile.ZipFile(file) as zf:
                members = zf.infolist()
                if len(members) != 1:
                    raise Exception('zip files with multiple files not supported')
                with zf.open(members[0].filename) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop('done reading dataframe')
    return df
def read_df(file, nrows=None):
    """Read ``file`` into a DataFrame and log how long the read took.

    Dispatch is purely name based:

    * ``*.pickle`` is handed to the module's ``load`` helper (``nrows`` is
      ignored on this path).
    * ``*.7z`` is opened through ``libarchive`` and parsed as CSV.
    * anything else goes straight to ``pd.read_csv``, with gzip
      decompression when the name ends in ``.gz``.

    The separator is a tab when ``".tsv"`` occurs anywhere in the name;
    otherwise ``sep=None`` is passed and delimiter handling is left to
    pandas.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).
    """
    t0 = time.time()
    if file.endswith(".pickle"):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = "\t" if ".tsv" in file else None
        # Coerce once so both CSV branches pass pandas an integer limit
        # (previously only the plain-file branch did).
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(".7z"):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = "gzip" if file.endswith(".gz") else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    elapsed = datetime.timedelta(seconds=time.time() - t0)
    dbg(
        "data frame ["
        + file
        + "] read in "
        + str(elapsed)
        + " shape: "
        + str(df.shape)
    )
    return df
def read_df(file, nrows=None):
    """Load a DataFrame from ``file``, selecting the reader by file name.

    * ``*.pickle`` -> the module's ``load`` helper (``nrows`` is ignored).
    * ``*.7z`` -> opened through ``libarchive``, parsed as CSV.
    * ``*.zip`` -> must hold exactly one member, parsed as CSV.
    * otherwise -> ``pd.read_csv`` on the path, gzip-decompressed when the
      name ends in ``.gz``.

    A tab separator is used when ``".tsv"`` appears anywhere in the name,
    a comma otherwise.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).

    Raises:
        Exception: If a ``.zip`` archive does not hold exactly one member.
    """
    start("reading dataframe: " + file)
    if file.endswith(".pickle"):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = "\t" if ".tsv" in file else ","
        # Normalise the limit up front so the archive branches get the same
        # integer coercion the plain-file branch always had.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(".7z"):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith(".zip"):
            import zipfile
            # ``with`` guarantees the archive is closed on every exit path.
            with zipfile.ZipFile(file) as zf:
                members = zf.infolist()
                if len(members) != 1:
                    raise Exception("zip files with multiple files not supported")
                with zf.open(members[0].filename) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = "gzip" if file.endswith(".gz") else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop("done reading dataframe")
    return df
def read_df(file, nrows=None):
    """Load a DataFrame from ``file``, selecting the reader by file name.

    * ``*.pickle`` -> the module's ``load`` helper (``nrows`` is ignored).
    * ``*.7z`` -> opened through ``libarchive``, parsed as CSV.
    * otherwise -> ``pd.read_csv`` on the path, gzip-decompressed when the
      name ends in ``.gz``.

    A tab separator is used when ``'.tsv'`` appears anywhere in the name,
    a comma otherwise.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).
    """
    start('reading dataframe: ' + file)
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Normalise the limit up front so the .7z branch gets the same
        # integer coercion the plain-file branch always had.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop('done reading dataframe')
    return df
def read_df(file, nrows=None):
    """Load a DataFrame from ``file`` and report the elapsed read time.

    * ``*.pickle`` -> the module's ``load`` helper (``nrows`` is ignored).
    * ``*.7z`` -> opened through ``libarchive``, parsed as CSV.
    * otherwise -> ``pd.read_csv`` on the path, gzip-decompressed when the
      name ends in ``.gz``.

    A tab separator is used when ``'.tsv'`` appears anywhere in the name;
    otherwise ``sep=None`` is forwarded and pandas decides the delimiter.

    Once loaded, the file name, elapsed wall-clock time and ``df.shape``
    are passed to ``dbg`` as a single message.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).
    """
    t0 = time.time()
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else None
        # Normalise the limit up front so the .7z branch gets the same
        # integer coercion the plain-file branch always had.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    elapsed = datetime.timedelta(seconds=time.time() - t0)
    dbg('data frame [' + file + '] read in ' + str(elapsed) + ' shape: ' + str(df.shape))
    return df
def read_df(file, nrows=None):
    """Return the contents of ``file`` as a DataFrame.

    The reader is chosen from the file name alone:

    * ``*.pickle``: delegated to the module's ``load`` helper; ``nrows``
      has no effect here.
    * ``*.7z``: opened through ``libarchive`` and parsed as CSV.
    * everything else: ``pd.read_csv`` on the path, with gzip
      decompression for names ending in ``.gz``.

    Names containing ``'.tsv'`` are parsed tab-separated, all others
    comma-separated.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).
    """
    start('reading dataframe: ' + file)
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Integer coercion used to happen only on the plain-file branch;
        # doing it here keeps the .7z branch consistent.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop('done reading dataframe')
    return df
def read_df(file, nrows=None, sheetname=None, header=0):
    """Return the contents of ``file`` as a DataFrame.

    The reader is chosen from the file name alone:

    * ``*.pickle``: delegated to the module's ``load`` helper; ``nrows``
      has no effect here.
    * ``*.xls`` / ``*.xlsx``: ``pd.read_excel``; a row limit is emulated by
      skipping rows at the bottom of the sheet.
    * ``*.7z``: opened through ``libarchive`` and parsed as CSV.
    * ``*.zip``: must contain exactly one member, parsed as CSV.
    * everything else: ``pd.read_csv`` on the path, with gzip
      decompression for names ending in ``.gz``.

    For the CSV paths, names containing ``'.tsv'`` are parsed
    tab-separated, all others comma-separated.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.
        sheetname: Excel only. Sheet name forwarded to ``pd.read_excel``;
            when ``None`` the first sheet is used to size the row limit.
        header: Excel only. Forwarded to ``pd.read_excel``.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).

    Raises:
        Exception: If a ``.zip`` archive does not hold exactly one member.
    """
    start('reading dataframe 2: ' + file)
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Integer coercion used to happen only on the plain-file branch;
        # doing it here keeps every branch consistent.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(('.xls', '.xlsx')):
            skip_footer = 0
            if nrows is not None:
                xl = pd.ExcelFile(file)
                # Look up only the sheet that will actually be read.
                if sheetname is None:
                    sheet = xl.book.sheet_by_index(0)
                else:
                    sheet = xl.book.sheet_by_name(sheetname)
                # Never let the footer count go negative when the caller
                # asks for more rows than the sheet contains.
                # NOTE(review): sheet.nrows presumably includes the header
                # row, so with header=0 this keeps nrows - 1 data rows --
                # confirm whether that is intended.
                skip_footer = max(sheet.nrows - nrows, 0)
            # NOTE(review): ``sheetname`` / ``skip_footer`` are the older
            # pandas keyword spellings; kept as-is to match whatever pandas
            # version this file targets. sheetname=None is forwarded
            # unchanged -- confirm it selects the first sheet there.
            df = pd.read_excel(file, sheetname=sheetname, skip_footer=skip_footer, header=header)
        elif file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith('.zip'):
            import zipfile
            # ``with`` guarantees the archive is closed on every exit path.
            with zipfile.ZipFile(file) as zf:
                members = zf.infolist()
                if len(members) != 1:
                    raise Exception('zip files with multiple files not supported')
                with zf.open(members[0].filename) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop('done reading dataframe')
    return df
def read_df(file, nrows=None):
    """Return the contents of ``file`` as a DataFrame.

    The reader is chosen from the file name alone:

    * ``*.pickle``: delegated to the module's ``load`` helper; ``nrows``
      has no effect here.
    * ``*.7z``: opened through ``libarchive`` and parsed as CSV.
    * ``*.zip``: must contain exactly one member, parsed as CSV.
    * everything else: ``pd.read_csv`` on the path, with gzip
      decompression for names ending in ``.gz``.

    Names containing ``'.tsv'`` are parsed tab-separated, all others
    comma-separated.

    Args:
        file: Path of the file to read.
        nrows: Optional row limit; anything ``int()`` accepts, or ``None``
            for no limit.

    Returns:
        The loaded DataFrame (for ``.pickle``, whatever ``load`` returns).

    Raises:
        Exception: If a ``.zip`` archive does not hold exactly one member.
    """
    start('reading dataframe: ' + file)
    if file.endswith('.pickle'):
        # NOTE(review): ``load`` is defined elsewhere; presumably it
        # unpickles, so only use it on trusted files -- confirm.
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Integer coercion used to happen only on the plain-file branch;
        # doing it here keeps the archive branches consistent.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith('.zip'):
            import zipfile
            # Opening the archive in a ``with`` block releases its file
            # handle whether or not the read succeeds.
            with zipfile.ZipFile(file) as zf:
                members = zf.infolist()
                if len(members) != 1:
                    raise Exception('zip files with multiple files not supported')
                with zf.open(members[0].filename) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop('done reading dataframe')
    return df