def make_grid_for_file(results_dir, list_file, grid_file):
    """Pivot a list of hits into a 12 x 12 grid of horizontal x vertical pools.

    Reads ``list_file`` (inside ``results_dir``) as a DataFrame and takes its
    'pair' and 'id' columns.  Each pair is a string of the form
    '<row> x <col>', e.g. 'H3 x V7'.  The ids of all hits sharing a pair are
    sorted, joined with single spaces, and placed in the matching grid cell.
    The grid is written to ``grid_file`` inside ``results_dir``.

    Args:
        results_dir: directory holding the input file and receiving the output.
        list_file: name of the hit-list file to read.
        grid_file: name of the grid file to write; it is also used as the
            header of the first (row-name) column.

    Raises:
        ValueError: if a pair does not split into exactly two parts on ' x '.
        KeyError: if a pair names a pool outside H1..H12 / V1..V12.
    """
    pool_range = range(1, 13)

    logger.info('%s => %s', list_file, grid_file)
    hits = DataFrame(filename=os.path.join(results_dir, list_file))
    pairs, hit_ids = hits.get_columns('pair', 'id')
    logger.info('%d hits', len(pairs))

    col_names = ['V' + str(i) for i in pool_range]
    row_names = ['H' + str(i) for i in pool_range]

    # One list of hit ids per (row, column) cell.  Every cell is created up
    # front so that a pair outside the grid fails loudly with a KeyError
    # instead of silently adding a new cell.
    cells = {(row, col): [] for row in row_names for col in col_names}
    for pair, hit_id in zip(pairs, hit_ids):
        horiz, vert = pair.split(' x ')
        cells[(horiz, vert)].append(hit_id)

    # Build the output frame as (column name, column values) tuples.
    # The first column holds the row names, headed by the grid file name;
    # the remaining columns are one per vertical pool.
    # NOTE(review): joining with ' ' assumes the ids are strings -- confirm
    # against what DataFrame.get_columns returns.
    data_by_column = [(grid_file, row_names)]
    for col in col_names:
        col_data = [' '.join(sorted(cells[(row, col)])) for row in row_names]
        data_by_column.append((col, col_data))

    grid_dataframe = DataFrame(data=data_by_column)
    grid_dataframe.write(os.path.join(results_dir, grid_file))
def get_control_from_file(filename, simple=True,
                          id_to_names_file='id_to_names.txt'):
    """Read a control file and return the (id, name) pairs treated as controls.

    The file is loaded as a DataFrame.

    In simple mode every (id, name) row of the file is a control.

    Otherwise the 'control' and 'exptl' columns are also read, and a row is
    a control when any of the following holds:

    * its 'control' value is >= its 'exptl' value,
    * its name is one of 'ND', 'nd', 'N.D.',
    * its id is 'CONTROL',
    * its name is 'IgG'.

    ('CONTROL', 'IgG') is always added in this mode.  As a side effect this
    mode also writes a summary table with columns 'id', 'cnt' (number of
    distinct names seen for the id) and 'names' (those names, sorted and
    comma-joined) to ``id_to_names_file``.

    Args:
        filename: path of the control file to read.
        simple: if true, treat every listed (id, name) row as a control.
        id_to_names_file: where the non-simple mode writes its id -> names
            summary; defaults to 'id_to_names.txt' in the working directory.

    Returns:
        dict mapping each control (id, name) tuple to True.
    """
    logger.info('reading controls from %s', filename)
    frame = DataFrame(filename=filename)
    control_dict = dict()

    if simple:
        ids, names = frame.get_columns('id', 'name')
        for key in zip(ids, names):
            control_dict[key] = True
        return control_dict

    ids, names, control_vals, exptl_vals = frame.get_columns(
        'id', 'name', 'control', 'exptl')

    nd_names = ('ND', 'nd', 'N.D.')
    for (i, n, c, e) in zip(ids, names, control_vals, exptl_vals):
        # NOTE(review): c >= e is only a numeric comparison if DataFrame
        # yields numbers for these columns; if they are strings it compares
        # lexicographically -- confirm against the DataFrame implementation.
        if c >= e or n in nd_names or i == 'CONTROL' or n == 'IgG':
            control_dict[(i, n)] = True

    # insert some special cases
    control_dict[('CONTROL', 'IgG')] = True

    # Collect the distinct names seen for each id.
    names_by_id = dict()
    for (i, n) in zip(ids, names):
        names_by_id.setdefault(i, set()).add(n)

    # Summarise as three parallel columns ordered by id.
    sorted_ids = sorted(names_by_id)
    sorted_names = [sorted(names_by_id[i]) for i in sorted_ids]
    cnts = [len(ns) for ns in sorted_names]
    name_strs = [','.join(ns) for ns in sorted_names]

    summary = DataFrame(data=[('id', sorted_ids),
                              ('cnt', cnts),
                              ('names', name_strs)])
    summary.write(id_to_names_file)

    return control_dict