Example 1
def make_grid_for_file(results_dir, list_file, grid_file):
    # pools are laid out as H1..H12 (horizontal rows) and V1..V12 (vertical columns)
    POOL_DIRECTIONS = ['H', 'V']
    POOL_RANGE = range(1, 13)
    logger.info('%s => %s', list_file, grid_file)
    df = DataFrame(filename=os.path.join(results_dir, list_file))
    (pair, ids) = df.get_columns('pair', 'id')
    logger.info('%d hits', len(pair))
    col_names = [ 'V' + str(i) for i in POOL_RANGE ]
    row_names = [ 'H' + str(i) for i in POOL_RANGE ]
    data_dict = dict() # will hold a list of the hits for each row, column pair
    for r in row_names:
        for c in col_names:
            data_dict[(r,c)] = [ ]
    for (mypair, myid) in zip(pair, ids):
        (horiz, vert) = mypair.split(' x ')
        data_dict[(horiz, vert)].append(myid)
    # now build a new data frame as a list of tuples, column name and column list
    data_by_column = [ ]
    # first column is the row names
    data_by_column += [(grid_file, row_names)]
    # subsequent columns are by vertical pool
    for c in col_names:
        col_data = [ ]
        for r in row_names:
            col_data.append(' '.join(sorted(data_dict[(r,c)])))
        data_by_column += [ (c, col_data)]
    grid_dataframe = DataFrame(data=data_by_column)
    grid_dataframe.write(os.path.join(results_dir, grid_file))
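These examples use a small DataFrame helper rather than pandas: it loads a delimited file via filename=, builds from a list of (column_name, values) tuples via data=, and exposes get_columns, write, and (per Example 4) a .data dict. A minimal sketch of that assumed interface, inferred from usage only; the real class likely does more (type conversion, quoting, validation):

class DataFrame:
    def __init__(self, filename=None, data=None):
        self.columns = []   # column names, in order
        self.data = {}      # column name -> list of values
        if data is not None:
            # data is a list of (column_name, column_values) tuples
            for (name, values) in data:
                self.columns.append(name)
                self.data[name] = list(values)
        elif filename is not None:
            # assume a tab-delimited file with a header row
            with open(filename) as fp:
                self.columns = fp.readline().rstrip('\n').split('\t')
                self.data = { name: [] for name in self.columns }
                for line in fp:
                    toks = line.rstrip('\n').split('\t')
                    for (name, tok) in zip(self.columns, toks):
                        self.data[name].append(tok)

    def get_columns(self, *names):
        # return the requested columns as a tuple of value lists
        return tuple(self.data[n] for n in names)

    def write(self, filename):
        # write all columns back out as a tab-delimited file
        with open(filename, 'w') as fp:
            fp.write('\t'.join(self.columns) + '\n')
            for row in zip(*(self.data[n] for n in self.columns)):
                fp.write('\t'.join(str(x) for x in row) + '\n')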
Example 2
def create_map_file(data_dir, map_filename):
    file_list = sorted(os.listdir(data_dir))
    pool_list = [ ]
    base_list = [ ]
    for file_name in file_list:
        (base, ext) = os.path.splitext(file_name)
        if (ext == '.gpr') or (ext == '.GPR'):
            logger.info('dir %s file %s base %s ext %s', data_dir, file_name, base, ext)
            toks = re.split('_|-', base) # split on underscore or dash
            
            # start looking from the end for a token that is a valid pool name
            toks.reverse()
            pool_str = ''
            for tok in toks:
                if is_valid_pool_name(tok):
                    pool_str = tok
                    break
            if not pool_str:
                logger.warning('%s has no valid pool name, skipping', file_name)
                continue
            if pool_str in pool_list:
                logger.error('pool %s is repeated, ignoring %s', pool_str, file_name)
                continue
            pool_list.append(pool_str)
            base_list.append(base)
    df = DataFrame(data = [('pool', pool_list), ('file', base_list)])
    df.write(map_filename)
    return None
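The is_valid_pool_name helper is referenced above but not shown. Given the H1..H12 and V1..V12 pools of Example 1, a plausible sketch (hypothetical; the real validator may accept other names):

import re

def is_valid_pool_name(tok):
    # Hypothetical check based on Example 1: pools are named
    # H1..H12 (horizontal) or V1..V12 (vertical).
    m = re.fullmatch(r'([HV])([0-9]{1,2})', tok)
    return m is not None and 1 <= int(m.group(2)) <= 12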
Example 3
def get_control_from_file(filename, simple=True):
    """
    read the file as a data frame
    for each id, check how many times it occurs as control or experimental
    make a dict with (id, name) as key where pair is often as controls, or name is nd
    """
    logger.info('reading controls from %s', filename)
    control = DataFrame(filename=filename)
    control_dict = dict()
    
    if simple:
        (ids, names) = control.get_columns('id','name')
        for (id, name) in zip(ids, names):
            control_dict[(id,name)] = True
    else:
        # avoid rebinding 'control' (the DataFrame) to one of its own columns
        (id, name, ctrl, exptl) = control.get_columns('id', 'name', 'control', 'exptl')
        id_to_name = dict()
        for (i, n, c, e) in zip(id, name, ctrl, exptl):
            isND = n in [ 'ND', 'nd', 'N.D.' ]
            isControl = (i == 'CONTROL')
            isIgg = (n == 'IgG')
            if ((c >= e) or isND or isControl or isIgg):
                control_dict[(i, n)] = True
                
        # insert some special cases
        control_dict[('CONTROL', 'IgG')] = True
    
        for (i, n) in zip(id, name):
            if i not in id_to_name:
                id_to_name[i] = dict()
            id_to_name[i][n] = True
        
        id_to_names = dict()
        for i in id_to_name:
            names = sorted(id_to_name[i].keys())
            cnt = len(names)
            name_str = ','.join(names)
            id_to_names[i] = dict()
            id_to_names[i]['cnt'] = cnt
            id_to_names[i]['names'] = name_str
        ids = sorted(id_to_names.keys())
        cnts = [ id_to_names[x]['cnt'] for x in ids ]
        names = [ id_to_names[x]['names'] for x in ids ]
        df = DataFrame(data=[ ('id', ids), ('cnt', cnts), ('names', names)])
        df.write('id_to_names.txt')
    return control_dict
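For reference, a short usage sketch; 'controls.txt' is a hypothetical tab-delimited file with the columns each mode expects:

# simple mode: the file needs only 'id' and 'name' columns;
# every (id, name) pair in the file becomes a control
control_dict = get_control_from_file('controls.txt', simple=True)

# full mode: the file also needs 'control' and 'exptl' count columns;
# writes id_to_names.txt as a side effect
control_dict = get_control_from_file('controls.txt', simple=False)
if ('CONTROL', 'IgG') in control_dict:
    print('IgG control present')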
Example 4
def write_pool_hit(pool_to_file, pool_hit):
    pool_list = [ ]
    file_list = [ ]
    id_list = [ ]
    zscore_list = [ ]
    ratio_list = [ ]
    for (p, f) in zip(pool_to_file.data['pool'], pool_to_file.data['file']):
        # some pools have no hits
        if p not in pool_hit:
            continue
        for h in pool_hit[p]:
            pool_list.append(p)
            file_list.append(f)
            id_list.append(h)
            zscore_list.append(pool_hit[p][h]['zscore'])
            ratio_list.append(pool_hit[p][h]['ratio'])
    df = DataFrame( data=[('pool', pool_list), ('file', file_list), ('id', id_list), ('zscore', zscore_list), ('ratio', ratio_list)] )
    df.write('pool_hit.txt')
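write_pool_hit expects pool_hit to be a nested dict keyed by pool, then probe id, with per-hit statistics inside. An illustration of the assumed shapes (file name and values hypothetical):

# pool_to_file: the pool -> file map written by create_map_file in Example 2
# (a DataFrame with parallel 'pool' and 'file' columns)
pool_to_file = DataFrame(filename='map.txt')

# pool_hit: pool -> probe id -> per-hit statistics
pool_hit = {
    'H3': {
        'GENE1': {'zscore': 3.2, 'ratio': 1.8},
        'GENE2': {'zscore': 2.7, 'ratio': 1.5},
    },
}
write_pool_hit(pool_to_file, pool_hit)   # writes pool_hit.txt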
Example 5
def print_control_dict(control_dict, control_dict_filename):
    keys = sorted(control_dict.keys())
    ids = [ x[0] for x in keys ]
    names = [ x[1] for x in keys ]
    df = DataFrame(data=[('id', ids), ('name', names)])
    df.write(filename=control_dict_filename)
Example 6
def process_gpr_file(input_file, output_file, summary_file,
                     signal_fg, signal_bg, norm_fg, norm_bg,
                     do_norm, do_log,
                     control_dict=None):
    """
    open input_file as a gpr
    extract columns corresponding to F635 Median and B635 Median (fore- and back-ground)
    add new column fg/bg ratio
    extract Flags column as a mask
    
    if control_dict is None:
        mask out values with Flags == -100
    else:
        mask out values based on control_dict
    
    calculate mean and standard deviation of the ratio
    calculate z-score for each row
    calculate stouffer's z-score ?or mean z-score? for probes with same ID
    print probes with (mean) z-score >= 2.5
    """
    FLAG_BAD = -100
    logger.info('%s => %s', input_file, output_file)
    gpr = GPR(input_file)
    # print debug information for a gpr file
    # gpr.print_summary()

    # keep track of which columns we've added
    columns_added = [ ]

    # start by extracting the flags and adding an index for the original row number
    (flags, ids, names, fg, bg) = gpr.get_columns(['Flags', 'ID', 'Name', signal_fg, signal_bg])
    if do_norm:
        (n_fg, n_bg) = gpr.get_columns([norm_fg, norm_bg])
    n_row_orig = len(flags)
    logger.info('n_row_orig %d', n_row_orig)
    row_number_orig = np.array(range(1, n_row_orig + 1))
    
    gpr.add_columns( ('row_number_orig', row_number_orig))
    columns_added += ['row_number_orig']

    # identify rows with bad flags and delete them
    # follow the semantics of a numpy masked array: delete where mask is True
    
    # controls from a dictionary
    mask_control = [ False for x in ids ]
    if (control_dict is not None):
        control_ids = dict()
        # for controls, just worry about ID, not name
        for (i, n) in control_dict.keys():
            control_ids[i] = True
        mask_control = [ i in control_ids for i in ids ]
        
    # user interface permits manual flagging of bad data, usually -100
    mask_flag = flags <= FLAG_BAD
    
    # some text values are clearly controls
    mask_text = [ id == 'CONTROL' for id in ids ]
    
    # bad signal
    mask_signal = [ (x[0] <= 0) or (x[1] <= 0) for x in zip(fg, bg) ]
    
    mask_norm = [ False for x in fg ]
    if do_norm:
        mask_norm = [ (x[0] <= 0) or (x[1] <= 0) for x in zip(n_fg, n_bg) ]
    
    mask = [ any(flags_tuple) for flags_tuple in zip(mask_control, mask_flag, mask_text, mask_signal, mask_norm) ]
    logger.info('deleting %d masked rows', sum(mask))
    gpr.delete_rows(mask)


    # re-extract just the good columns
    columns_extracted = [ 'Name', 'ID', signal_fg, signal_bg ]
    (name, id, fg, bg) = gpr.get_columns(columns_extracted)
    n_fg = None
    n_bg = None
    if do_norm:
        columns_norm = [norm_fg, norm_bg]
        columns_extracted = columns_extracted + columns_norm
        (n_fg, n_bg) = gpr.get_columns(columns_norm)
    n_row = len(name)
    assert sum(bg == 0) == 0, 'bg has %d zero values' % sum(bg == 0)
    
    # create a new index, idname, combining id with name
    # this avoids having one id map to multiple names, which could reflect a difference in probes, etc.
    idname = [ '_'.join([i,n]) for (i, n) in zip(id, name) ]
    idname_to_id = dict()
    idname_to_name = dict()
    for (idn, i, n) in zip(idname, id, name):
        idname_to_id[idn] = i
        idname_to_name[idn] = n
    
    gpr.add_columns( ('idname', idname))
    columns_added += ['idname']
    
    (ratio, zscore) = get_ratio_zscore(fg, bg, n_fg, n_bg, do_norm, do_log)    
    (id_to_mean_zscore, row_to_mean_zscore, id_to_zscores) = apply_by_group(np.mean, idname, zscore)
    (id_to_mean_ratio, row_to_mean_ratio, id_to_ratios) = apply_by_group(np.mean, idname, ratio)

    gpr.add_columns(('ratio', ratio),
        ('zscore', zscore),
        ('zscore_mean', row_to_mean_zscore))
    columns_added += ['ratio', 'zscore', 'zscore_mean' ]
    
    # collect the ids and rows whose z-score is above the threshold
    (id_subset, row_subset) = get_good_ids_rows(idname, zscore)
    
    columns_display = columns_extracted + columns_added
    gpr.write(output_file, rows=row_subset, columns=columns_display)
    
    # gather data for each good id:
    # id, name, zscore_mean, zscores
    id_list = [ idname_to_id[i] for i in id_subset ]
    name_list = [ idname_to_name[i] for i in id_subset ]
    zscore_list = [ id_to_mean_zscore[i] for i in id_subset ]
    ratio_list = [ id_to_mean_ratio[i] for i in id_subset ]
    zscores_list = [ ';'.join([ str(x) for x in id_to_zscores[i] ]) for i in id_subset]
    ratios_list = [ ';'.join([ str(x) for x in id_to_ratios[i] ]) for i in id_subset]
    id_data = DataFrame( data=[('IDName', id_subset),
        ('ID', id_list), ('Name', name_list),
        ('zscore', zscore_list), ('ratio', ratio_list),
        ('zscores', zscores_list), ('ratios', ratios_list)] )
    id_data.write(summary_file)
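The helpers get_ratio_zscore and apply_by_group are called above but not shown. Minimal reconstructions, inferred from the docstring and from how the return values are used (a sketch under those assumptions, not the original implementations):

import numpy as np

def get_ratio_zscore(fg, bg, n_fg, n_bg, do_norm, do_log):
    # Hypothetical reconstruction following the docstring:
    # ratio = fg/bg, optionally divided by the normalization ratio,
    # optionally log-transformed, then standardized to z-scores.
    ratio = np.asarray(fg, dtype=float) / np.asarray(bg, dtype=float)
    if do_norm:
        ratio = ratio / (np.asarray(n_fg, dtype=float) / np.asarray(n_bg, dtype=float))
    if do_log:
        ratio = np.log2(ratio)
    zscore = (ratio - np.mean(ratio)) / np.std(ratio)
    return (ratio, zscore)

def apply_by_group(func, keys, values):
    # Hypothetical reconstruction: group 'values' by the parallel
    # 'keys' list and apply 'func' (e.g. np.mean) per group.
    # Returns (key -> summary, per-row summary array, key -> raw values).
    key_to_values = dict()
    for (k, v) in zip(keys, values):
        key_to_values.setdefault(k, []).append(v)
    key_to_summary = { k: func(vals) for (k, vals) in key_to_values.items() }
    row_to_summary = np.array([ key_to_summary[k] for k in keys ])
    return (key_to_summary, row_to_summary, key_to_values)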