def make_grid_for_file(results_dir, list_file, grid_file):
    POOL_DIRECTIONS = ['H', 'V']
    POOL_RANGE = range(1, 13)
    logger.info('%s => %s', list_file, grid_file)
    df = DataFrame(filename=os.path.join(results_dir, list_file))
    (pair, id) = df.get_columns('pair', 'id')
    logger.info('%d hits', len(pair))
    col_names = ['V' + str(i) for i in POOL_RANGE]
    row_names = ['H' + str(i) for i in POOL_RANGE]
    # will hold a list of the hits for each (row, column) pair
    data_dict = dict()
    for r in row_names:
        for c in col_names:
            data_dict[(r, c)] = []
    for (mypair, myid) in zip(pair, id):
        (horiz, vert) = mypair.split(' x ')
        data_dict[(horiz, vert)] = data_dict[(horiz, vert)] + [myid]
    # now build a new data frame as a list of (column name, column list) tuples
    data_by_column = []
    # first column is the row names
    data_by_column += [(grid_file, row_names)]
    # subsequent columns are by vertical pool
    for c in col_names:
        col_data = []
        for r in row_names:
            col_data.append(' '.join(sorted(data_dict[(r, c)])))
        data_by_column += [(c, col_data)]
    grid_dataframe = DataFrame(data=data_by_column)
    grid_dataframe.write(os.path.join(results_dir, grid_file))

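# Usage sketch (hedged): the call below assumes a hit list file whose 'pair'
# column holds strings like 'H3 x V7' and whose 'id' column holds probe ids;
# the directory and file names are hypothetical, not defined in this module.
#
#   make_grid_for_file(results_dir='results',
#                      list_file='hits_list.txt',
#                      grid_file='hits_grid.txt')
#
# The output is a 12 x 12 grid with one row per H pool and one column per V
# pool, where each cell holds the ids hitting that (H, V) well, joined by spaces.
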
def create_map_file(data_dir, map_filename):
    file_list = sorted(os.listdir(data_dir))
    pool_list = []
    base_list = []
    for file_name in file_list:
        (base, ext) = os.path.splitext(file_name)
        if (ext == '.gpr') or (ext == '.GPR'):
            logger.info('dir %s file %s base %s ext %s', data_dir, file_name, base, ext)
            toks = re.split('_|-', base)  # split on underscore or dash
            # start looking from the end for a token that is a valid pool name
            toks.reverse()
            pool_str = ''
            for tok in toks:
                if is_valid_pool_name(tok):
                    pool_str = tok
                    break
            if not is_valid_pool_name(pool_str):
                logger.warning('%s has no valid pool name, skipping %s', base, file_name)
                continue
            if pool_str in pool_list:
                logger.error('pool %s is repeated, ignoring', pool_str)
            pool_list.append(pool_str)
            base_list.append(base)
    df = DataFrame(data=[('pool', pool_list), ('file', base_list)])
    df.write(map_filename)
    return None

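# Usage sketch (hedged): create_map_file scans a directory of .gpr scans and
# maps each file to the pool named in its file name; the pool token is whatever
# is_valid_pool_name accepts (e.g. 'H3' or 'V11' under the H1-H12 / V1-V12
# convention used elsewhere in this module). The directory, file names, and
# output name below are hypothetical examples.
#
#   # 'scan_01_H3.gpr'  -> base 'scan_01_H3',  pool 'H3'
#   # 'scan-02-V11.gpr' -> base 'scan-02-V11', pool 'V11'
#   create_map_file('raw_gpr', 'pool_to_file.txt')
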
def get_control_from_file(filename, simple=True):
    """
    read the file as a data frame
    for each id, check how many times it occurs as control or experimental
    make a dict keyed by (id, name) for pairs seen as controls at least as often
    as experimental, or whose name marks them as ND
    """
    logger.info('reading controls from %s', filename)
    control = DataFrame(filename=filename)
    control_dict = dict()
    if (simple):
        (ids, names) = control.get_columns('id', 'name')
        for (id, name) in zip(ids, names):
            control_dict[(id, name)] = True
    else:
        (id, name, control, exptl) = control.get_columns('id', 'name', 'control', 'exptl')
        id_to_name = dict()
        for (i, n, c, e) in zip(id, name, control, exptl):
            isND = n in ['ND', 'nd', 'N.D.']
            isControl = (i == 'CONTROL')
            isIgg = (n == 'IgG')
            if ((c >= e) or isND or isControl or isIgg):
                control_dict[(i, n)] = True
        # insert some special cases
        control_dict[('CONTROL', 'IgG')] = True
        for (i, n) in zip(id, name):
            if i not in id_to_name:
                id_to_name[i] = dict()
            id_to_name[i][n] = True
        id_to_names = dict()
        for i in id_to_name:
            names = sorted(id_to_name[i].keys())
            cnt = len(names)
            name_str = ','.join(names)
            id_to_names[i] = dict()
            id_to_names[i]['cnt'] = cnt
            id_to_names[i]['names'] = name_str
        ids = sorted(id_to_names.keys())
        cnts = [id_to_names[x]['cnt'] for x in ids]
        names = [id_to_names[x]['names'] for x in ids]
        df = DataFrame(data=[('id', ids), ('cnt', cnts), ('names', names)])
        df.write('id_to_names.txt')
    return control_dict

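# Usage sketch (hedged): in simple mode the control file only needs 'id' and
# 'name' columns and every listed pair is treated as a control; in full mode it
# also needs 'control' and 'exptl' counts, and a pair becomes a control when it
# was seen as a control at least as often as experimental, or when the id/name
# marks it as CONTROL, ND, or IgG. The filename below is hypothetical.
#
#   control_dict = get_control_from_file('controls.txt', simple=True)
#   # control_dict keys are (id, name) tuples; values are True
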
def write_pool_hit(pool_to_file, pool_hit):
    pool_list = []
    file_list = []
    id_list = []
    zscore_list = []
    ratio_list = []
    for (p, f) in zip(pool_to_file.data['pool'], pool_to_file.data['file']):
        # some pools have no hits
        if p not in pool_hit:
            continue
        for h in pool_hit[p]:
            pool_list.append(p)
            file_list.append(f)
            id_list.append(h)
            zscore_list.append(pool_hit[p][h]['zscore'])
            ratio_list.append(pool_hit[p][h]['ratio'])
    df = DataFrame(data=[('pool', pool_list), ('file', file_list), ('id', id_list),
                         ('zscore', zscore_list), ('ratio', ratio_list)])
    df.write('pool_hit.txt')

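# Data-structure sketch (hedged): write_pool_hit expects pool_to_file to be a
# DataFrame with 'pool' and 'file' columns, and pool_hit to be a nested dict
# keyed by pool, then by hit id, with 'zscore' and 'ratio' entries, e.g.
#
#   pool_hit = {'H3': {'ID_0042': {'zscore': 3.1, 'ratio': 4.5}}}
#   write_pool_hit(pool_to_file, pool_hit)
#
# (The example pool, id, and values are made up for illustration.)
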
def print_control_dict(control_dict, control_dict_filename):
    keys = sorted(control_dict.keys())
    ids = [x[0] for x in keys]
    names = [x[1] for x in keys]
    df = DataFrame(data=[('id', ids), ('name', names)])
    df.write(filename=control_dict_filename)

def process_gpr_file(input_file, output_file, summary_file,
                     signal_fg, signal_bg, norm_fg, norm_bg,
                     do_norm, do_log,
                     control_dict=None):
    """
    open input_file as a gpr
    extract columns corresponding to F635 Median and B635 Median (fore- and back-ground)
    add new column fg/bg ratio
    extract Flags column as a mask
    if control_dict is None:
        mask out values with Flags == -100
    else:
        mask out values based on control_dict
    calculate mean and standard deviation of the ratio
    calculate z-score for each row
    calculate stouffer's z-score ?or mean z-score? for probes with same ID
    print probes with (mean) z-score >= 2.5
    """
    FLAG_BAD = -100
    logger.info('%s => %s', input_file, output_file)
    gpr = GPR(input_file)
    # print debug information for a gpr file
    # gpr.print_summary()

    # keep track of which columns we've added
    columns_added = []

    # start by extracting the flags and adding an index for the original row number
    (flags, ids, names, fg, bg) = gpr.get_columns(['Flags', 'ID', 'Name', signal_fg, signal_bg])
    if do_norm:
        (n_fg, n_bg) = gpr.get_columns([norm_fg, norm_bg])
    n_row_orig = len(flags)
    logger.info('n_row_orig %d', n_row_orig)
    row_number_orig = np.array(range(1, n_row_orig + 1))
    gpr.add_columns(('row_number_orig', row_number_orig))
    columns_added += ['row_number_orig']

    # identify rows with bad flags and delete them
    # follow the semantics of a numpy masked array: delete where mask is True

    # controls from a dictionary
    mask_control = [False for x in ids]
    if (control_dict is not None):
        control_ids = dict()
        # for controls, just worry about ID, not name
        for (i, n) in control_dict.keys():
            control_ids[i] = True
        mask_control = [i in control_ids for i in ids]
    # user interface permits manual flagging of bad data, usually -100
    mask_flag = flags <= FLAG_BAD
    # some text values are clearly controls
    mask_text = [id == 'CONTROL' for id in ids]
    # bad signal
    mask_signal = [(x[0] <= 0) or (x[1] <= 0) for x in zip(fg, bg)]
    mask_norm = [False for x in fg]
    if do_norm:
        mask_norm = [(x[0] <= 0) or (x[1] <= 0) for x in zip(n_fg, n_bg)]
    mask = [x[0] or x[1] or x[2] or x[3] or x[4]
            for x in zip(mask_control, mask_flag, mask_text, mask_signal, mask_norm)]
    logger.info('deleting %d control rows', sum(mask))
    gpr.delete_rows(mask)

    # re-extract just the good columns
    columns_extracted = ['Name', 'ID', signal_fg, signal_bg]
    (name, id, fg, bg) = gpr.get_columns(columns_extracted)
    n_fg = None
    n_bg = None
    if do_norm:
        columns_norm = [norm_fg, norm_bg]
        columns_extracted = columns_extracted + columns_norm
        (n_fg, n_bg) = gpr.get_columns(columns_norm)
    n_row = len(name)
    assert(sum(bg == 0) == 0), 'bg has %d zero values' % sum(bg == 0)

    # create a new index, idname, combining id with name
    # this avoids having one id map to multiple names, which could reflect a difference in probes, etc.
    idname = ['_'.join([i, n]) for (i, n) in zip(id, name)]
    idname_to_id = dict()
    idname_to_name = dict()
    for (idn, i, n) in zip(idname, id, name):
        idname_to_id[idn] = i
        idname_to_name[idn] = n
    gpr.add_columns(('idname', idname))
    columns_added += ['idname']

    (ratio, zscore) = get_ratio_zscore(fg, bg, n_fg, n_bg, do_norm, do_log)
    (id_to_mean_zscore, row_to_mean_zscore, id_to_zscores) = apply_by_group(np.mean, idname, zscore)
    (id_to_mean_ratio, row_to_mean_ratio, id_to_ratios) = apply_by_group(np.mean, idname, ratio)
    gpr.add_columns(('ratio', ratio), ('zscore', zscore), ('zscore_mean', row_to_mean_zscore))
    columns_added += ['ratio', 'zscore', 'zscore_mean']

    # collect rows where the flag is good and the zscore is above a threshold
    (id_subset, row_subset) = get_good_ids_rows(idname, zscore)
    columns_display = columns_extracted + columns_added
    gpr.write(output_file, rows=row_subset, columns=columns_display)

    # gather data for each good id:
    # id, name, zscore_mean, zscores
    id_list = [idname_to_id[i] for i in id_subset]
    name_list = [idname_to_name[i] for i in id_subset]
    zscore_list = [id_to_mean_zscore[i] for i in id_subset]
    ratio_list = [id_to_mean_ratio[i] for i in id_subset]
    zscores_list = [';'.join([str(x) for x in id_to_zscores[i]]) for i in id_subset]
    ratios_list = [';'.join([str(x) for x in id_to_ratios[i]]) for i in id_subset]
    id_data = DataFrame(data=[('IDName', id_subset), ('ID', id_list), ('Name', name_list),
                              ('zscore', zscore_list), ('ratio', ratio_list),
                              ('zscores', zscores_list), ('ratios', ratios_list)])
    id_data.write(summary_file)
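
# Usage sketch (hedged): a typical call, using the F635/B635 column names named
# in the docstring; the file names and the 532-channel normalization columns
# below are hypothetical examples, not values required by this module.
#
#   process_gpr_file('scan_01_H3.gpr', 'scan_01_H3_processed.txt',
#                    'scan_01_H3_summary.txt',
#                    signal_fg='F635 Median', signal_bg='B635 Median',
#                    norm_fg='F532 Median', norm_bg='B532 Median',
#                    do_norm=False, do_log=True,
#                    control_dict=control_dict)
#
# The per-row z-score is presumably the usual (ratio - mean(ratio)) / std(ratio),
# computed in get_ratio_zscore, and rows sharing an ID_Name are summarized by
# their mean z-score via apply_by_group.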