Beispiel #1
0
def read_df(file, nrows=None):
    """Read a data frame from ``file``, choosing the reader by file name.

    ``.pickle`` files are handed to ``load``. Everything else is parsed with
    ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.zip`` files
    through ``zipfile``, and ``.gz`` files with gzip compression.

    Args:
        file: Path of the file to read. A name containing ``.tsv`` is read
            tab-separated, anything else comma-separated.
        nrows: Optional maximum number of rows to parse. Ignored for
            ``.pickle`` files; coerced with ``int`` for every CSV branch.

    Returns:
        The result of ``load`` for ``.pickle`` files, otherwise the frame
        produced by ``pd.read_csv``.

    Raises:
        Exception: If a ``.zip`` archive does not contain exactly one file.
    """
    start('reading dataframe: ' + file)
    if file.endswith('.pickle'):
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Coerce once so the archive branches accept the same values as the
        # plain-file branch.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive

            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith('.zip'):
            import zipfile

            # The context manager closes the archive even when validation
            # or parsing raises.
            with zipfile.ZipFile(file) as zf:
                members = zf.namelist()
                if len(members) != 1:
                    raise Exception(
                        'zip files with multiple files not supported')
                with zf.open(members[0]) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file,
                             compression=compression,
                             nrows=nrows,
                             sep=sep)
    stop('done reading dataframe')
    return df
Beispiel #2
0
def read_df(file, nrows=None, sheetname=None, header=0):
  """Read a data frame from ``file``, choosing the reader by file name.

  ``.pickle`` files are handed to ``load``; ``.xls``/``.xlsx`` files go to
  ``pd.read_excel``; everything else is parsed with ``pd.read_csv``
  (``.7z`` through ``libarchive``, ``.zip`` through ``zipfile``, ``.gz``
  with gzip compression).

  Args:
    file: Path of the file to read. For CSV-style input, a name containing
      ``.tsv`` is read tab-separated, anything else comma-separated.
    nrows: Optional row limit, coerced with ``int``. Ignored for
      ``.pickle`` files. For Excel files it is turned into a number of
      trailing rows to skip.
    sheetname: Excel sheet name forwarded to ``pd.read_excel``; when None
      the first sheet is used to compute the row count.
    header: Forwarded to ``pd.read_excel``.

  Returns:
    The result of ``load``, ``pd.read_excel`` or ``pd.read_csv``.

  Raises:
    Exception: If a ``.zip`` archive does not contain exactly one file.
  """
  start('reading dataframe 2: ' + file)
  if file.endswith('.pickle'):
    df = load(file)
  else:
    sep = '\t' if '.tsv' in file else ','
    # Coerce once so every branch sees the same integer row limit.
    if nrows is not None:
      nrows = int(nrows)
    if file.endswith(('.xls', '.xlsx')):
      skip_footer = 0
      if nrows is not None:
        xl = pd.ExcelFile(file)
        if sheetname is not None:
          sheet = xl.book.sheet_by_name(sheetname)
        else:
          sheet = xl.book.sheet_by_index(0)
        # Clamp at zero: asking for more rows than the sheet holds must not
        # produce a negative footer count.
        # NOTE(review): sheet.nrows presumably counts the header row too, so
        # the frame may hold nrows - 1 data rows -- confirm intent.
        skip_footer = max(0, sheet.nrows - nrows)
      df = pd.read_excel(file, sheetname=sheetname, skip_footer=skip_footer,
                         header=header)
    elif file.endswith('.7z'):
      import libarchive
      with libarchive.reader(file) as reader:
        df = pd.read_csv(reader, nrows=nrows, sep=sep)
    elif file.endswith('.zip'):
      import zipfile
      # The context manager closes the archive even when validation or
      # parsing raises.
      with zipfile.ZipFile(file) as zf:
        members = zf.namelist()
        if len(members) != 1:
          raise Exception('zip files with multiple files not supported')
        with zf.open(members[0]) as reader:
          df = pd.read_csv(reader, nrows=nrows, sep=sep)
    else:
      compression = 'gzip' if file.endswith('.gz') else None
      df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
  stop('done reading dataframe')
  return df
Beispiel #3
0
def read_df(file, nrows=None):
    """Read a data frame from ``file`` and log how long the read took.

    ``.pickle`` files are handed to ``load``. Everything else is parsed with
    ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.gz`` files
    with gzip compression.

    Args:
        file: Path of the file to read. A name containing ``.tsv`` is read
            tab-separated; otherwise ``sep=None`` is passed to pandas.
        nrows: Optional maximum number of rows to parse. Ignored for
            ``.pickle`` files; coerced with ``int`` for every CSV branch.

    Returns:
        The result of ``load`` for ``.pickle`` files, otherwise the frame
        produced by ``pd.read_csv``.
    """
    t0 = time.time()
    if file.endswith(".pickle"):
        df = load(file)
    else:
        sep = "\t" if ".tsv" in file else None
        # Coerce once so the .7z branch accepts the same values as the
        # plain-file branch.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(".7z"):
            import libarchive

            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = "gzip" if file.endswith(".gz") else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    elapsed = datetime.timedelta(seconds=time.time() - t0)
    dbg(
        "data frame ["
        + file
        + "] read in "
        + str(elapsed)
        + " shape: "
        + str(df.shape)
    )
    return df
Beispiel #4
0
def read_df(file, nrows=None):
    """Load ``file`` into a data frame, dispatching on the file name.

    ``.pickle`` files are handed to ``load``. Everything else is parsed with
    ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.zip`` files
    through ``zipfile``, and ``.gz`` files with gzip compression.

    Args:
        file: Path of the file to read. A name containing ``.tsv`` is read
            tab-separated, anything else comma-separated.
        nrows: Optional maximum number of rows to parse. Ignored for
            ``.pickle`` files; coerced with ``int`` for every CSV branch.

    Returns:
        The result of ``load`` for ``.pickle`` files, otherwise the frame
        produced by ``pd.read_csv``.

    Raises:
        Exception: If a ``.zip`` archive does not contain exactly one file.
    """
    start("reading dataframe: " + file)
    if file.endswith(".pickle"):
        df = load(file)
    else:
        sep = "\t" if ".tsv" in file else ","
        # Normalise the row limit once so all CSV branches behave alike.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(".7z"):
            import libarchive

            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith(".zip"):
            import zipfile

            # Closing the archive via ``with`` avoids leaking the file
            # handle, including on the error path below.
            with zipfile.ZipFile(file) as zf:
                members = zf.namelist()
                if len(members) != 1:
                    raise Exception("zip files with multiple files not supported")
                with zf.open(members[0]) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = "gzip" if file.endswith(".gz") else None
            df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
    stop("done reading dataframe")
    return df
Beispiel #5
0
def read_df(file, nrows=None):
  """Read a data frame from ``file``, choosing the reader by file name.

  ``.pickle`` files are handed to ``load``. Everything else is parsed with
  ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.gz`` files with
  gzip compression.

  Args:
    file: Path of the file to read. A name containing ``.tsv`` is read
      tab-separated, anything else comma-separated.
    nrows: Optional maximum number of rows to parse. Ignored for
      ``.pickle`` files; coerced with ``int`` for every CSV branch.

  Returns:
    The result of ``load`` for ``.pickle`` files, otherwise the frame
    produced by ``pd.read_csv``.
  """
  start('reading dataframe: ' + file)
  if file.endswith('.pickle'):
    df = load(file)
  else:
    sep = '\t' if '.tsv' in file else ','
    # Coerce once so the .7z branch accepts the same values as the
    # plain-file branch.
    if nrows is not None:
      nrows = int(nrows)
    if file.endswith('.7z'):
      import libarchive

      with libarchive.reader(file) as reader:
        df = pd.read_csv(reader, nrows=nrows, sep=sep)
    else:
      compression = 'gzip' if file.endswith('.gz') else None
      df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
  stop('done reading dataframe')
  return df
Beispiel #6
0
def read_df(file, nrows=None):
  """Read a data frame from ``file`` and log how long the read took.

  ``.pickle`` files are handed to ``load``. Everything else is parsed with
  ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.gz`` files with
  gzip compression.

  Args:
    file: Path of the file to read. A name containing ``.tsv`` is read
      tab-separated; otherwise ``sep=None`` is passed to pandas.
    nrows: Optional maximum number of rows to parse. Ignored for
      ``.pickle`` files; coerced with ``int`` for every CSV branch.

  Returns:
    The result of ``load`` for ``.pickle`` files, otherwise the frame
    produced by ``pd.read_csv``.
  """
  t0 = time.time()
  if file.endswith('.pickle'):
    df = load(file)
  else:
    sep = '\t' if '.tsv' in file else None
    # Coerce once so the .7z branch accepts the same values as the
    # plain-file branch.
    if nrows is not None:
      nrows = int(nrows)
    if file.endswith('.7z'):
      import libarchive

      with libarchive.reader(file) as reader:
        df = pd.read_csv(reader, nrows=nrows, sep=sep)
    else:
      compression = 'gzip' if file.endswith('.gz') else None
      df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
  elapsed = datetime.timedelta(seconds=time.time() - t0)
  dbg('data frame [' + file + '] read in ' +
      str(elapsed) + ' shape: ' + str(df.shape))
  return df
Beispiel #7
0
def read_df(file, nrows=None):
  """Load ``file`` into a data frame and report the elapsed time via ``dbg``.

  ``.pickle`` files are handed to ``load``. Everything else is parsed with
  ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.gz`` files with
  gzip compression.

  Args:
    file: Path of the file to read. A name containing ``.tsv`` is read
      tab-separated; otherwise ``sep=None`` is passed to pandas.
    nrows: Optional maximum number of rows to parse. Ignored for
      ``.pickle`` files; coerced with ``int`` for every CSV branch.

  Returns:
    The result of ``load`` for ``.pickle`` files, otherwise the frame
    produced by ``pd.read_csv``.
  """
  t0 = time.time()
  if file.endswith('.pickle'):
    df = load(file)
  else:
    sep = '\t' if '.tsv' in file else None
    # Normalise the row limit up front so both CSV branches treat it alike.
    if nrows is not None:
      nrows = int(nrows)
    if file.endswith('.7z'):
      import libarchive

      with libarchive.reader(file) as reader:
        df = pd.read_csv(reader, nrows=nrows, sep=sep)
    else:
      compression = 'gzip' if file.endswith('.gz') else None
      df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
  took = datetime.timedelta(seconds=time.time() - t0)
  dbg('data frame [' + file + '] read in ' +
      str(took) + ' shape: ' + str(df.shape))
  return df
Beispiel #8
0
def read_df(file, nrows=None):
    """Load ``file`` into a data frame, dispatching on the file name.

    ``.pickle`` files are handed to ``load``. Everything else is parsed with
    ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.gz`` files
    with gzip compression.

    Args:
        file: Path of the file to read. A name containing ``.tsv`` is read
            tab-separated, anything else comma-separated.
        nrows: Optional maximum number of rows to parse. Ignored for
            ``.pickle`` files; coerced with ``int`` for every CSV branch.

    Returns:
        The result of ``load`` for ``.pickle`` files, otherwise the frame
        produced by ``pd.read_csv``.
    """
    start('reading dataframe: ' + file)
    if file.endswith('.pickle'):
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Normalise the row limit up front so both CSV branches treat it
        # alike.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith('.7z'):
            import libarchive

            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file,
                             compression=compression,
                             nrows=nrows,
                             sep=sep)
    stop('done reading dataframe')
    return df
Beispiel #9
0
def read_df(file, nrows=None, sheetname=None, header=0):
    """Load ``file`` into a data frame, dispatching on the file name.

    ``.pickle`` files are handed to ``load``; ``.xls``/``.xlsx`` files go to
    ``pd.read_excel``; everything else is parsed with ``pd.read_csv``
    (``.7z`` through ``libarchive``, ``.zip`` through ``zipfile``, ``.gz``
    with gzip compression).

    Args:
        file: Path of the file to read. For CSV-style input, a name
            containing ``.tsv`` is read tab-separated, anything else
            comma-separated.
        nrows: Optional row limit, coerced with ``int``. Ignored for
            ``.pickle`` files. For Excel files it is turned into a number
            of trailing rows to skip.
        sheetname: Excel sheet name forwarded to ``pd.read_excel``; when
            None the first sheet is used to compute the row count.
        header: Forwarded to ``pd.read_excel``.

    Returns:
        The result of ``load``, ``pd.read_excel`` or ``pd.read_csv``.

    Raises:
        Exception: If a ``.zip`` archive does not contain exactly one file.
    """
    start('reading dataframe 2: ' + file)
    if file.endswith('.pickle'):
        df = load(file)
    else:
        sep = '\t' if '.tsv' in file else ','
        # Coerce once so every branch sees the same integer row limit.
        if nrows is not None:
            nrows = int(nrows)
        if file.endswith(('.xls', '.xlsx')):
            skip_footer = 0
            if nrows is not None:
                xl = pd.ExcelFile(file)
                if sheetname is not None:
                    sheet = xl.book.sheet_by_name(sheetname)
                else:
                    sheet = xl.book.sheet_by_index(0)
                # Clamp at zero: requesting more rows than the sheet holds
                # must not yield a negative footer count.
                # NOTE(review): sheet.nrows presumably counts the header row
                # too, so the frame may hold nrows - 1 data rows -- confirm.
                skip_footer = max(0, sheet.nrows - nrows)
            df = pd.read_excel(file,
                               sheetname=sheetname,
                               skip_footer=skip_footer,
                               header=header)
        elif file.endswith('.7z'):
            import libarchive
            with libarchive.reader(file) as reader:
                df = pd.read_csv(reader, nrows=nrows, sep=sep)
        elif file.endswith('.zip'):
            import zipfile
            # The context manager closes the archive even when validation
            # or parsing raises.
            with zipfile.ZipFile(file) as zf:
                members = zf.namelist()
                if len(members) != 1:
                    raise Exception(
                        'zip files with multiple files not supported')
                with zf.open(members[0]) as reader:
                    df = pd.read_csv(reader, nrows=nrows, sep=sep)
        else:
            compression = 'gzip' if file.endswith('.gz') else None
            df = pd.read_csv(file,
                             compression=compression,
                             nrows=nrows,
                             sep=sep)
    stop('done reading dataframe')
    return df
Beispiel #10
0
def read_df(file, nrows=None):
  """Read a data frame from ``file``, choosing the reader by file name.

  ``.pickle`` files are handed to ``load``. Everything else is parsed with
  ``pd.read_csv``: ``.7z`` files through ``libarchive``, ``.zip`` files
  through ``zipfile``, and ``.gz`` files with gzip compression.

  Args:
    file: Path of the file to read. A name containing ``.tsv`` is read
      tab-separated, anything else comma-separated.
    nrows: Optional maximum number of rows to parse. Ignored for
      ``.pickle`` files; coerced with ``int`` for every CSV branch.

  Returns:
    The result of ``load`` for ``.pickle`` files, otherwise the frame
    produced by ``pd.read_csv``.

  Raises:
    Exception: If a ``.zip`` archive does not contain exactly one file.
  """
  start('reading dataframe: ' + file)
  if file.endswith('.pickle'):
    df = load(file)
  else:
    sep = '\t' if '.tsv' in file else ','
    # Coerce once so the archive branches accept the same values as the
    # plain-file branch.
    if nrows is not None:
      nrows = int(nrows)
    if file.endswith('.7z'):
      import libarchive

      with libarchive.reader(file) as reader:
        df = pd.read_csv(reader, nrows=nrows, sep=sep)
    elif file.endswith('.zip'):
      import zipfile
      # Closing the archive via ``with`` avoids leaking the file handle,
      # including on the error path below.
      with zipfile.ZipFile(file) as zf:
        members = zf.namelist()
        if len(members) != 1:
          raise Exception('zip files with multiple files not supported')
        with zf.open(members[0]) as reader:
          df = pd.read_csv(reader, nrows=nrows, sep=sep)
    else:
      compression = 'gzip' if file.endswith('.gz') else None
      df = pd.read_csv(file, compression=compression, nrows=nrows, sep=sep)
  stop('done reading dataframe')
  return df