Example #1
import csv
from collections import defaultdict
from contextlib import closing

# dataio is the project's own helper module that provides fopen()


def count_big_file_value_counts(fname, colname):
    '''
    Count the number of occurrences of each unique value in a column.
    Returns a defaultdict mapping each value to its count.
    '''
    value_counts = defaultdict(int)
    with closing(dataio.fopen(fname)) as fin:
        reader = csv.DictReader(fin)
        for row in reader:
            value_counts[row[colname]] += 1
    return value_counts
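
A minimal usage sketch for the function above; the file name and column name below are placeholders chosen for illustration, not part of the original example:

# Hypothetical call: count how often each value of the 'Label' column occurs
counts = count_big_file_value_counts('data.csv', 'Label')
for value, n in sorted(counts.items(), key=lambda kv: kv[1], reverse=True):
    print('{}\t{}'.format(value, n))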
Example #2
    def test_fopen(self):
        """Test getting file handle based on file type"""

        import gzip
        import tarfile
        import zipfile

        ######################################################
        # Reading from file

        # txt file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.txt')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f, io.TextIOWrapper))
            self.assertEqual(next(f).strip().split('\t')[0], 'a')
            self.assertEqual(next(f).strip().split('\t')[1], '2')
            self.assertEqual(next(f).strip().split('\t')[2], '33')

        # csv file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f, io.TextIOWrapper))
            self.assertEqual(next(f).strip().split(',')[0], 'a')
            self.assertEqual(next(f).strip().split(',')[1], '2')
            self.assertEqual(next(f).strip().split(',')[2], '33')

        # tsv file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.tsv')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f, io.TextIOWrapper))
            self.assertEqual(next(f).strip().split('\t')[0], 'a')
            self.assertEqual(next(f).strip().split('\t')[1], '2')
            self.assertEqual(next(f).strip().split('\t')[2], '33')

        # zip file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.zip')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f.buffer, zipfile.ZipExtFile))
            self.assertEqual(next(f).strip().split(',')[0], 'a')
            self.assertEqual(next(f).strip().split(',')[1], '2')
            self.assertEqual(next(f).strip().split(',')[2], '33')

        # gz file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.gz')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f.buffer, gzip.GzipFile))
            self.assertEqual(next(f).strip().split(',')[0], 'a')
            self.assertEqual(next(f).strip().split(',')[1], '2')
            self.assertEqual(next(f).strip().split(',')[2], '33')

        # tar.gz
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.tar.gz')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f.buffer, tarfile.ExFileObject))
            self.assertEqual(f.readline().strip().split(',')[0], 'a')
            self.assertEqual(f.readline().strip().split(',')[1], '2')
            self.assertEqual(f.readline().strip().split(',')[2], '33')

        # tar.bz2
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.tar.bz2')
        with dataio.fopen(sample_file) as f:
            self.assertTrue(isinstance(f.buffer, tarfile.ExFileObject))
            self.assertEqual(f.readline().strip().split(',')[0], 'a')
            self.assertEqual(f.readline().strip().split(',')[1], '2')
            self.assertEqual(f.readline().strip().split(',')[2], '33')

        ######################################################
        # Writing to file

        # tsv file
        # Read from file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.tsv')
        with dataio.fopen(sample_file) as f:
            data = f.read()
        # Write to second duplicate file
        sample_outfile = os.path.join(CURDIR, 'res', 'sample1.dup.tsv')
        with dataio.fopen(sample_outfile, 'w') as fout:
            self.assertTrue(isinstance(fout.buffer, io.BufferedWriter))
            fout.write(data)
        # Read from second duplicate file
        with open(sample_outfile, 'r') as f:
            data2 = f.read()
        self.assertEqual(data, data2)
        os.remove(sample_outfile)

        # gz file
        # Read from file
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.gz')
        with dataio.fopen(sample_file) as f:
            data = f.read()
        # Write to second duplicate file
        sample_outfile = os.path.join(CURDIR, 'res', 'sample1.csv.dup.gz')
        with dataio.fopen(sample_outfile, 'w') as fout:
            self.assertTrue(isinstance(fout.buffer, gzip.GzipFile))
            fout.write(data)
        # Read from second duplicate file
        with dataio.fopen(sample_outfile, 'r') as f:
            data2 = f.read()
        self.assertEqual(data, data2)
        os.remove(sample_outfile)
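
The assertions above pin down dataio.fopen's observable behavior: plain .txt/.csv/.tsv files come back as text streams, while .zip, .gz, and .tar.* files are transparently decompressed and exposed through a text wrapper whose .buffer is the underlying compressed stream. Below is a sketch of how such an extension-based dispatcher could be written; the name open_by_extension and its body are assumptions for illustration (read-only for archives), not the actual dataio.fopen implementation:

import gzip
import io
import tarfile
import zipfile


def open_by_extension(path, mode='r'):
    """Illustrative extension-based dispatcher returning a text stream."""
    if path.endswith(('.tar.gz', '.tar.bz2')):
        # Expose the first archive member as a text stream
        tar = tarfile.open(path)
        member = tar.getmembers()[0]
        return io.TextIOWrapper(tar.extractfile(member))
    if path.endswith('.gz'):
        # gzip.open in binary mode yields a GzipFile, visible as .buffer
        return io.TextIOWrapper(gzip.open(path, mode + 'b'))
    if path.endswith('.zip'):
        # ZipFile.open yields a ZipExtFile, visible as .buffer
        zf = zipfile.ZipFile(path)
        return io.TextIOWrapper(zf.open(zf.namelist()[0]))
    # Plain .txt / .csv / .tsv files
    return open(path, mode)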
Example #3
    def test_load_subset(self):
        """Test loading a subset of data from filehandle to a dataframe"""
        # Setup test parameters
        sample_file = os.path.join(CURDIR, 'res', 'sample1.csv')
        sample_file_length = 0
        with dataio.fopen(sample_file, 'r') as f:
            for i, line in enumerate(f):
                if i == 0:
                    sample_cols = line.strip().split(',')
                sample_file_length += 1
        sample_cols_len = len(sample_cols)
        sample_data_shape = (sample_file_length - 1, sample_cols_len)
        sample_data_nocols_shape = (sample_file_length, sample_cols_len)
        mycols = ['x', 'y', 'z']
        autocols = [0, 1, 2]

        #===========================================================
        # Reading entire file

        #-----------------------------------------------------------
        # colnames defined

        # k = None
        df = dataio.load_subset(sample_file, colnames=mycols)
        self.assertEqual(df.shape, sample_data_shape)
        self.assertEqual(list(df.columns), mycols)

        # k = length(file)
        df = dataio.load_subset(
            sample_file, k=sample_file_length, colnames=mycols)
        self.assertEqual(df.shape, sample_data_shape)
        self.assertEqual(list(df.columns), mycols)

        #-----------------------------------------------------------
        # colnames not defined, use first row of file as colnames

        # k = None
        with open(sample_file, 'r') as f:
            df = dataio.load_subset(f)
        self.assertEqual(df.shape, sample_data_shape)
        self.assertEqual(list(df.columns), sample_cols)

        # k > length(file)
        df = dataio.load_subset(sample_file, k=9)
        self.assertEqual(df.shape, sample_data_shape)
        self.assertEqual(list(df.columns), sample_cols)

        #-----------------------------------------------------------
        # header = None, treat entire file as data

        # k = None
        with open(sample_file, 'r') as f:
            df = dataio.load_subset(f, header=None)
        self.assertEqual(df.shape, sample_data_nocols_shape)
        self.assertEqual(list(df.columns), autocols)

        # k = None, colnames defined
        df = dataio.load_subset(sample_file, header=None, colnames=mycols)
        self.assertEqual(df.shape, sample_data_nocols_shape)
        self.assertEqual(list(df.columns), mycols)

        # k = length(file)
        df = dataio.load_subset(sample_file, k=9, header=None)
        self.assertEqual(df.shape, sample_data_nocols_shape)
        self.assertEqual(list(df.columns), autocols)

        # k > length(file), colnames defined
        df = dataio.load_subset(sample_file, k=100, header=None, colnames=mycols)
        self.assertEqual(df.shape, sample_data_nocols_shape)
        self.assertEqual(list(df.columns), mycols)

        #===========================================================
        # Reading subset

        # colnames defined
        df = dataio.load_subset(sample_file, k=5, colnames=mycols)
        self.assertEqual(df.shape, (5, sample_cols_len))
        self.assertEqual(list(df.columns), mycols)

        # colnames defined, header = None
        df = dataio.load_subset(sample_file, k=5, colnames=mycols, header=None)
        self.assertEqual(df.shape, (5, sample_cols_len))
        self.assertEqual(list(df.columns), mycols)
        # Test that the first line of file is read in as data
        num_iter = 10 * sample_file_length
        num_1st_line = 0
        for i in range(num_iter):
            df = dataio.load_subset(sample_file, k=sample_file_length // 2, colnames=mycols, header=None)
            if sample_cols[0] in df[mycols[0]].values:
                num_1st_line += 1
        self.assertTrue(0 < num_1st_line < num_iter)

        # colnames = None
        df = dataio.load_subset(sample_file, k=5)
        self.assertEqual(df.shape, (5, sample_cols_len))
        self.assertEqual(list(df.columns), sample_cols)

        # colnames = None, header = None
        df = dataio.load_subset(sample_file, k=5, header=None)
        self.assertEqual(df.shape, (5, sample_cols_len))
        self.assertEqual(list(df.columns), autocols)
        # Test that the first line of file is read in as data
        num_1st_line = 0
        for i in range(num_iter):
            df = dataio.load_subset(sample_file, k=sample_file_length // 2, header=None)
            if sample_cols[0] in df[autocols[0]].values:
                num_1st_line += 1
        self.assertTrue(0 < num_1st_line < num_iter)


        #===========================================================
        # Delimiters and rseed

        sample_file_txt = os.path.join(CURDIR, 'res', 'sample1.txt')

        # sep = '\t', k = None
        df = dataio.load_subset(sample_file_txt, sep='\t')
        self.assertEqual(df.shape, sample_data_shape)
        self.assertEqual(list(df.columns), sample_cols)

        # sep = '\t', k = 5
        df = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None)
        self.assertEqual(df.shape, (5, sample_cols_len))
        self.assertEqual(list(df.columns), autocols)

        # rseed = 1, k = 5
        df = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None, rseed=1)
        self.assertEqual(df.shape, (5, sample_cols_len))
        self.assertEqual(list(df.columns), autocols)
        df2 = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None, rseed=1)
        a1 = df.values.ravel()
        a2 = df2.values.ravel()
        self.assertTrue((a1 == a2).all())

        # rseed = None, k = 5
        df = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None)
        df2 = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None)
        a1 = df.values.ravel()
        a2 = df2.values.ravel()
        self.assertFalse((a1 == a2).all())
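
Taken together, the assertions imply that load_subset draws k rows uniformly at random (so repeated unseeded calls differ, while a fixed rseed reproduces the same sample), that header=None makes the first line eligible as data, and that colnames overrides any header. The reservoir-sampling sketch below is an assumption about how such a sampler could be structured; the name sample_rows and its signature are hypothetical, and it is not the actual dataio.load_subset code (which also accepts open file handles):

import random

import pandas as pd


def sample_rows(path, k=None, colnames=None, header='infer', sep=',', rseed=None):
    """Illustrative reservoir sampler returning a DataFrame of up to k rows."""
    rng = random.Random(rseed)
    file_cols = None
    with open(path, 'r') as f:
        if header is not None:
            # Consume the first line as column names
            file_cols = next(f).strip().split(sep)
        reservoir = []
        for i, line in enumerate(f):
            row = line.strip().split(sep)
            if k is None or len(reservoir) < k:
                reservoir.append(row)
            else:
                # Standard reservoir sampling: keep the new row with probability k/(i+1)
                j = rng.randint(0, i)
                if j < k:
                    reservoir[j] = row
    if colnames is not None:
        cols = colnames
    elif file_cols is not None:
        cols = file_cols
    else:
        cols = list(range(len(reservoir[0])))
    return pd.DataFrame(reservoir, columns=cols)

Seeding random.Random with a fixed rseed is what would make the two rseed=1 samples above compare equal element for element, while unseeded calls differ.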
Example #4
import csv
import pickle
import sys
from collections import defaultdict
from contextlib import closing
from datetime import datetime

import numpy as np
import pandas as pd

# dataio is the project's own helper module that provides fopen()


def summarize_big_training_data(fname,
                                y_name='Label',
                                n_uniq_toomany=1000,
                                progress_int=None,
                                summary_pkl='summary_data.pkl'):
    '''
    Summarize columnar data

    Input:
      fname: input file name
      y_name: column name of the class labels or target y values
      n_uniq_toomany: number of unique values in a column considered too many
                      to keep counting
      progress_int: write a progress line every progress_int rows of input
      summary_pkl: name of the output .pkl file for storing summary data;
                   set to None to skip writing the file

    Returns a tuple containing:
      DataFrame of per-column summaries
      Total number of rows
      Dictionary of y (label) value counts
    '''
    # Number of rows total
    n_rows = 0
    # Total number of instances for each class label
    label_counts = defaultdict(int)
    # Total number of null values per column
    null_counts = defaultdict(int)
    # Max and min values per column
    col_max = defaultdict(lambda: -np.inf)
    col_min = defaultdict(lambda: np.inf)
    col_numeric = defaultdict(lambda: True)
    # Number of unique values
    col_uniq_vals = defaultdict(set)
    col_uniq_vals_toomany = set()

    with closing(dataio.fopen(fname)) as fin:
        reader = csv.reader(fin)
        # Store colnames
        colnames = next(reader)
        for t, row in enumerate(reader):
            # Output progress
            if progress_int is not None and t % progress_int == 0:
                sys.stdout.write('{}\tencountered: {}\n'.format(datetime.now(), t))

            # Increment count of rows
            n_rows += 1

            # Create dictionary mapping colnames to each row value
            row_dict = dict(zip(colnames, row))

            # Update label counts
            if y_name not in col_uniq_vals_toomany:
                label_counts[row_dict[y_name]] += 1

            # Loop through cols
            for colname in colnames:

                # Update null counts
                col_val = row_dict[colname].strip()
                if not col_val:
                    null_counts[colname] += 1

                # Update max and min values
                if col_val and col_numeric[colname]:
                    try:
                        col_val = float(col_val)
                        col_max[colname] = max(col_max[colname],
                                               col_val)
                        col_min[colname] = min(col_min[colname],
                                               col_val)
                    except ValueError:
                        col_numeric[colname] = False

                # Update unique values per column
                uniq_vals_thiscol = col_uniq_vals[colname]
                if colname not in col_uniq_vals_toomany:
                    uniq_vals_thiscol.add(col_val)
                if len(uniq_vals_thiscol) > n_uniq_toomany:
                    col_uniq_vals_toomany.add(colname)

    summary_data = defaultdict(list)
    for colname in colnames:
        summary_data['attribute'].append(colname)
        summary_data['n_null'].append(null_counts[colname])
        summary_data['perc_null'].append(float(null_counts[colname])/n_rows)
        colmax, colmin = None, None
        if col_numeric[colname]:
            colmax = col_max[colname] if not np.isinf(col_max[colname]) else None
            colmin = col_min[colname] if not np.isinf(col_min[colname]) else None
        summary_data['max'].append(colmax)
        summary_data['min'].append(colmin)

        # Count number of unique values
        if colname in col_uniq_vals_toomany:
            n_uniq = '> {}'.format(n_uniq_toomany)
        else:
            n_uniq = len(col_uniq_vals[colname])
        summary_data['n_uniq'].append(n_uniq)

    # If there are too many y-values, set label_counts to None
    if y_name in col_uniq_vals_toomany:
        label_counts = None

    df_summary = pd.DataFrame(summary_data)

    if summary_pkl is not None:
        summary_data = {'summary': df_summary,
                        'n_rows': n_rows,
                        'label_counts': label_counts}
        with open(summary_pkl, 'wb') as f:
            pickle.dump(summary_data, f)

    return df_summary, n_rows, label_counts
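
A minimal usage sketch for the summarizer above; the file name train.csv and the keyword values are placeholders chosen for illustration, not taken from the original example:

# Hypothetical call: summarize a large training CSV without loading it all into memory
df_summary, n_rows, label_counts = summarize_big_training_data(
    'train.csv', y_name='Label', progress_int=100000, summary_pkl=None)
print('total rows:', n_rows)
print(df_summary[['attribute', 'n_null', 'perc_null', 'max', 'min', 'n_uniq']])
print('label counts:', dict(label_counts) if label_counts is not None else None)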