import csv
from collections import defaultdict
from contextlib import closing

import dataio  # project-local I/O helpers providing fopen()


def count_big_file_value_counts(fname, colname):
    '''
    Count the number of occurrences of each unique value in a column

    Returns a defaultdict containing the value counts
    '''
    value_counts = defaultdict(int)
    with closing(dataio.fopen(fname)) as fin:
        reader = csv.DictReader(fin)
        for row in reader:
            value_counts[row[colname]] += 1
    return value_counts
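
# Hypothetical usage sketch for the helper above. The file name and column
# are stand-ins (not files that ship with this repo); dataio.fopen lets the
# same call work on a plain or gzipped CSV without loading it into memory.
#
#     counts = count_big_file_value_counts('transactions.csv.gz', 'country')
#     top10 = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
#     for value, n in top10:
#         print('{}\t{}'.format(value, n))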
def test_fopen(self):
    """Test getting a file handle based on the file type"""
    import gzip
    import tarfile
    import zipfile

    ######################################################
    # Reading from file

    # txt file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.txt')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f, io.TextIOWrapper))
        self.assertEqual(next(f).strip().split('\t')[0], 'a')
        self.assertEqual(next(f).strip().split('\t')[1], '2')
        self.assertEqual(next(f).strip().split('\t')[2], '33')

    # csv file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f, io.TextIOWrapper))
        self.assertEqual(next(f).strip().split(',')[0], 'a')
        self.assertEqual(next(f).strip().split(',')[1], '2')
        self.assertEqual(next(f).strip().split(',')[2], '33')

    # tsv file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.tsv')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f, io.TextIOWrapper))
        self.assertEqual(next(f).strip().split('\t')[0], 'a')
        self.assertEqual(next(f).strip().split('\t')[1], '2')
        self.assertEqual(next(f).strip().split('\t')[2], '33')

    # zip file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.zip')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f.buffer, zipfile.ZipExtFile))
        self.assertEqual(next(f).strip().split(',')[0], 'a')
        self.assertEqual(next(f).strip().split(',')[1], '2')
        self.assertEqual(next(f).strip().split(',')[2], '33')

    # gz file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.gz')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f.buffer, gzip.GzipFile))
        self.assertEqual(next(f).strip().split(',')[0], 'a')
        self.assertEqual(next(f).strip().split(',')[1], '2')
        self.assertEqual(next(f).strip().split(',')[2], '33')

    # tar.gz file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.tar.gz')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f.buffer, tarfile.ExFileObject))
        self.assertEqual(f.readline().strip().split(',')[0], 'a')
        self.assertEqual(f.readline().strip().split(',')[1], '2')
        self.assertEqual(f.readline().strip().split(',')[2], '33')

    # tar.bz2 file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.tar.bz2')
    with dataio.fopen(sample_file) as f:
        self.assertTrue(isinstance(f.buffer, tarfile.ExFileObject))
        self.assertEqual(f.readline().strip().split(',')[0], 'a')
        self.assertEqual(f.readline().strip().split(',')[1], '2')
        self.assertEqual(f.readline().strip().split(',')[2], '33')

    ######################################################
    # Writing to file

    # tsv file
    # Read from file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.tsv')
    with dataio.fopen(sample_file) as f:
        data = f.read()
    # Write to a duplicate file
    sample_outfile = os.path.join(CURDIR, 'res', 'sample1.dup.tsv')
    with dataio.fopen(sample_outfile, 'w') as fout:
        self.assertTrue(isinstance(fout.buffer, io.BufferedWriter))
        fout.write(data)
    # Read back from the duplicate file
    with open(sample_outfile, 'r') as f:
        data2 = f.read()
    self.assertEqual(data, data2)
    os.remove(sample_outfile)

    # gz file
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv.gz')
    with dataio.fopen(sample_file) as f:
        data = f.read()
    # Write to a duplicate file
    sample_outfile = os.path.join(CURDIR, 'res', 'sample1.csv.dup.gz')
    with dataio.fopen(sample_outfile, 'w') as fout:
        self.assertTrue(isinstance(fout.buffer, gzip.GzipFile))
        fout.write(data)
    # Read back from the duplicate file
    with dataio.fopen(sample_outfile, 'r') as f:
        data2 = f.read()
    self.assertEqual(data, data2)
    os.remove(sample_outfile)
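
# The assertions above pin down fopen's contract: a text-mode handle whose
# underlying .buffer is the raw GzipFile / ZipExtFile / ExFileObject for
# compressed inputs. Below is a minimal sketch of one way to satisfy that
# contract -- it is NOT the actual dataio.fopen. It assumes dispatch on the
# file extension and single-member archives, and it only supports reading
# from zip/tar archives.
import gzip
import io
import tarfile
import zipfile


def fopen_sketch(fname, mode='r'):
    """Return a text-mode handle for plain, gzip, zip, or tar archives."""
    if fname.endswith(('.tar.gz', '.tar.bz2')):
        comp = 'gz' if fname.endswith('.gz') else 'bz2'
        tf = tarfile.open(fname, 'r:' + comp)
        member = tf.getmembers()[0]  # assumes a single-file archive
        return io.TextIOWrapper(tf.extractfile(member))
    if fname.endswith('.gz'):
        # 'r' -> 'rb', 'w' -> 'wb'; the GzipFile becomes the .buffer
        return io.TextIOWrapper(gzip.open(fname, mode + 'b'))
    if fname.endswith('.zip'):
        zf = zipfile.ZipFile(fname)
        return io.TextIOWrapper(zf.open(zf.namelist()[0]))
    # Plain text files already come back as io.TextIOWrapper
    return open(fname, mode)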
def test_load_subset(self):
    """Test loading a subset of data from a filehandle into a dataframe"""
    # Set up test parameters
    sample_file = os.path.join(CURDIR, 'res', 'sample1.csv')
    sample_file_length = 0
    with dataio.fopen(sample_file, 'r') as f:
        for i, line in enumerate(f):
            if i == 0:
                sample_cols = line.strip().split(',')
            sample_file_length += 1
    sample_cols_len = len(sample_cols)
    sample_data_shape = (sample_file_length - 1, sample_cols_len)
    sample_data_nocols_shape = (sample_file_length, sample_cols_len)
    mycols = ['x', 'y', 'z']
    autocols = [0, 1, 2]

    #===========================================================
    # Reading entire file

    #-----------------------------------------------------------
    # colnames defined

    # k = None
    df = dataio.load_subset(sample_file, colnames=mycols)
    self.assertEqual(df.shape, sample_data_shape)
    self.assertEqual(list(df.columns), mycols)

    # k = length(file)
    df = dataio.load_subset(
        sample_file, k=sample_file_length, colnames=mycols)
    self.assertEqual(df.shape, sample_data_shape)
    self.assertEqual(list(df.columns), mycols)

    #-----------------------------------------------------------
    # colnames not defined, use first row of file as colnames

    # k = None
    with open(sample_file, 'r') as f:
        df = dataio.load_subset(f)
    self.assertEqual(df.shape, sample_data_shape)
    self.assertEqual(list(df.columns), sample_cols)

    # k > length(file)
    df = dataio.load_subset(sample_file, k=9)
    self.assertEqual(df.shape, sample_data_shape)
    self.assertEqual(list(df.columns), sample_cols)

    #-----------------------------------------------------------
    # header = None, treat entire file as data

    # k = None
    with open(sample_file, 'r') as f:
        df = dataio.load_subset(f, header=None)
    self.assertEqual(df.shape, sample_data_nocols_shape)
    self.assertEqual(list(df.columns), autocols)

    # k = None, colnames defined
    df = dataio.load_subset(sample_file, header=None, colnames=mycols)
    self.assertEqual(df.shape, sample_data_nocols_shape)
    self.assertEqual(list(df.columns), mycols)

    # k = length(file)
    df = dataio.load_subset(sample_file, k=9, header=None)
    self.assertEqual(df.shape, sample_data_nocols_shape)
    self.assertEqual(list(df.columns), autocols)

    # k > length(file), colnames defined
    df = dataio.load_subset(
        sample_file, k=100, header=None, colnames=mycols)
    self.assertEqual(df.shape, sample_data_nocols_shape)
    self.assertEqual(list(df.columns), mycols)

    #===========================================================
    # Reading subset

    # colnames defined
    df = dataio.load_subset(sample_file, k=5, colnames=mycols)
    self.assertEqual(df.shape, (5, sample_cols_len))
    self.assertEqual(list(df.columns), mycols)

    # colnames defined, header = None
    df = dataio.load_subset(sample_file, k=5, colnames=mycols, header=None)
    self.assertEqual(df.shape, (5, sample_cols_len))
    self.assertEqual(list(df.columns), mycols)

    # Test that the first line of the file is read in as data
    num_iter = 10 * sample_file_length
    num_1st_line = 0
    for i in range(num_iter):
        df = dataio.load_subset(sample_file, k=sample_file_length // 2,
                                colnames=mycols, header=None)
        if sample_cols[0] in df[mycols[0]].values:
            num_1st_line += 1
    self.assertTrue(0 < num_1st_line < num_iter)

    # colnames = None
    df = dataio.load_subset(sample_file, k=5)
    self.assertEqual(df.shape, (5, sample_cols_len))
    self.assertEqual(list(df.columns), sample_cols)

    # colnames = None, header = None
    df = dataio.load_subset(sample_file, k=5, header=None)
    self.assertEqual(df.shape, (5, sample_cols_len))
    self.assertEqual(list(df.columns), autocols)

    # Test that the first line of the file is read in as data
    num_1st_line = 0
    for i in range(num_iter):
        df = dataio.load_subset(sample_file, k=sample_file_length // 2,
                                header=None)
        if sample_cols[0] in df[autocols[0]].values:
            num_1st_line += 1
    self.assertTrue(0 < num_1st_line < num_iter)

    #===========================================================
    # Delimiters and rseed
    sample_file_txt = os.path.join(CURDIR, 'res', 'sample1.txt')

    # sep = '\t', k = None
    df = dataio.load_subset(sample_file_txt, sep='\t')
    self.assertEqual(df.shape, sample_data_shape)
    self.assertEqual(list(df.columns), sample_cols)

    # sep = '\t', k = 5
    df = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None)
    self.assertEqual(df.shape, (5, sample_cols_len))
    self.assertEqual(list(df.columns), autocols)

    # rseed = 1, k = 5; the same seed must reproduce the same sample
    df = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None,
                            rseed=1)
    self.assertEqual(df.shape, (5, sample_cols_len))
    self.assertEqual(list(df.columns), autocols)
    df2 = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None,
                             rseed=1)
    a1 = np.squeeze(df.values.reshape(np.multiply(*df.shape), 1))
    a2 = np.squeeze(df2.values.reshape(np.multiply(*df.shape), 1))
    self.assertTrue((a1 == a2).all())

    # rseed = None, k = 5; unseeded samples should (almost always) differ
    df = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None)
    df2 = dataio.load_subset(sample_file_txt, sep='\t', k=5, header=None)
    a1 = np.squeeze(df.values.reshape(np.multiply(*df.shape), 1))
    a2 = np.squeeze(df2.values.reshape(np.multiply(*df.shape), 1))
    self.assertFalse((a1 == a2).all())
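
# The subset tests above constrain load_subset's behavior: a uniform random
# sample of k rows (every line, including the first, must sometimes appear),
# the whole file when k is None or exceeds the file length, and reproducible
# output for a fixed rseed. A classic way to meet all three is reservoir
# sampling (Algorithm R). The sketch below is an illustration under those
# assumptions, not the project's actual load_subset; it only accepts a file
# name and splits lines naively on the separator.
import random

import pandas as pd


def load_subset_sketch(fname, k=None, sep=',', header=0, colnames=None,
                       rseed=None):
    """Reservoir-sample up to k data rows of a delimited file into a frame."""
    rng = random.Random(rseed)
    file_cols = None
    rows = []
    with open(fname, 'r') as f:
        if header is not None:
            # Consume the first line as the header row
            file_cols = next(f).strip().split(sep)
        for i, line in enumerate(f):
            vals = line.strip().split(sep)
            if k is None or len(rows) < k:
                # Fill the reservoir (or take everything when k is None)
                rows.append(vals)
            else:
                # Keep row i with probability k / (i + 1)
                j = rng.randint(0, i)
                if j < k:
                    rows[j] = vals
    if colnames is not None:
        cols = colnames
    elif file_cols is not None:
        cols = file_cols
    else:
        cols = list(range(len(rows[0])))
    return pd.DataFrame(rows, columns=cols)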
# Additional imports for the summary helper below (csv, defaultdict,
# closing, and dataio are imported at the top of this module)
import pickle
import sys
from datetime import datetime

import numpy as np
import pandas as pd


def summarize_big_training_data(fname, y_name='Label', n_uniq_toomany=1000,
                                progress_int=None,
                                summary_pkl='summary_data.pkl'):
    '''
    Summarize columnar data

    Input:
        fname:          input file name
        y_name:         column name of class labels or target y values
        n_uniq_toomany: number of unique column values considered too many
                        to continue counting
        progress_int:   output progress every progress_int rows of input
        summary_pkl:    name of the output .pkl file for storing summary
                        data; set to None to prevent output

    Returns a tuple containing the following:
        DataFrame containing column summaries
        Number of total rows
        Dictionary containing y (label) value counts
    '''
    # Number of rows total
    n_rows = 0
    # Total number of instances for each class label
    label_counts = defaultdict(int)
    # Total number of null values per column
    null_counts = defaultdict(int)
    # Max and min values per column
    col_max = defaultdict(lambda: -np.inf)
    col_min = defaultdict(lambda: np.inf)
    col_numeric = defaultdict(lambda: True)
    # Unique values seen per column, and columns with too many to keep counting
    col_uniq_vals = defaultdict(set)
    col_uniq_vals_toomany = set()

    with closing(dataio.fopen(fname)) as fin:
        reader = csv.reader(fin)
        # Store colnames
        colnames = next(reader)
        for t, row in enumerate(reader):
            # Output progress
            if progress_int is not None and t % progress_int == 0:
                sys.stdout.write(
                    '{}\tencountered: {}\n'.format(datetime.now(), t))

            # Increment count of rows
            n_rows += 1

            # Create dictionary mapping colnames to each row value
            row_dict = dict(zip(colnames, row))

            # Update label counts
            if y_name not in col_uniq_vals_toomany:
                label_counts[row_dict[y_name]] += 1

            # Loop through cols
            for colname in colnames:
                # Update null counts
                col_val = row_dict[colname].strip()
                if not col_val:
                    null_counts[colname] += 1

                # Update max and min values
                if col_val and col_numeric[colname]:
                    try:
                        col_val = float(col_val)
                        col_max[colname] = max(col_max[colname], col_val)
                        col_min[colname] = min(col_min[colname], col_val)
                    except ValueError:
                        col_numeric[colname] = False

                # Update unique values per column
                uniq_vals_thiscol = col_uniq_vals[colname]
                if colname not in col_uniq_vals_toomany:
                    uniq_vals_thiscol.add(col_val)
                    if len(uniq_vals_thiscol) > n_uniq_toomany:
                        col_uniq_vals_toomany.add(colname)

    summary_data = defaultdict(list)
    for colname in colnames:
        summary_data['attribute'].append(colname)
        summary_data['n_null'].append(null_counts[colname])
        summary_data['perc_null'].append(float(null_counts[colname]) / n_rows)

        colmax, colmin = None, None
        if col_numeric[colname]:
            colmax = col_max[colname] if not np.isinf(col_max[colname]) else None
            colmin = col_min[colname] if not np.isinf(col_min[colname]) else None
        summary_data['max'].append(colmax)
        summary_data['min'].append(colmin)

        # Count number of unique values
        if colname in col_uniq_vals_toomany:
            n_uniq = '> {}'.format(n_uniq_toomany)
        else:
            n_uniq = len(col_uniq_vals[colname])
        summary_data['n_uniq'].append(n_uniq)

    # If there are too many y-values, set label_counts to None
    if y_name in col_uniq_vals_toomany:
        label_counts = None

    df_summary = pd.DataFrame(summary_data)

    if summary_pkl is not None:
        summary_data = {'summary': df_summary,
                        'n_rows': n_rows,
                        'label_counts': label_counts}
        with open(summary_pkl, 'wb') as f:
            pickle.dump(summary_data, f)

    return df_summary, n_rows, label_counts
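
# Hypothetical usage sketch for the summary helper above; 'train.csv.gz' is
# a stand-in file name. Passing summary_pkl=None skips the pickle side
# effect, and progress_int throttles the progress lines on large files.
#
#     df_summary, n_rows, label_counts = summarize_big_training_data(
#         'train.csv.gz', y_name='Label', progress_int=1000000,
#         summary_pkl=None)
#     print('rows: {}'.format(n_rows))
#     print(df_summary[['attribute', 'n_null', 'perc_null', 'n_uniq']])
#     if label_counts is not None:
#         for label, n in sorted(label_counts.items()):
#             print('{}\t{}'.format(label, n))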