def filter(p_bed, region_bed, max_p=None, p_col_name="P.Value"):
    """Summarize p-value rows falling inside each region of `region_bed`.

    Yields a header row, then one row per region: the region's own columns
    followed by counts of positive/negative t-scores, the t-score sum, and
    counts of p-values exceeding 0.05 and 0.1. If `max_p` is given, any
    region containing a p-value above it is skipped entirely.
    `p_col_name` may be a column name or a numeric index into the header
    of `p_bed`.
    """
    p_header = ['p' + col for col in get_header(p_bed)]
    r_header = get_header(region_bed)
    # a numeric p_col_name indexes into the p-bed header ([1:] strips the
    # 'p' prefix added above)
    if isinstance(p_col_name, (int, long)):
        p_col_name = p_header[p_col_name][1:]
    p_key = 'p' + p_col_name

    args = dict(p_bed=p_bed, region_bed=region_bed)
    args['p_bed'] = fix_header(args['p_bed'])

    yield r_header + ["t-pos", "t-neg", "t-sum", "n_gt_p05", "n_gt_p1"]

    cmd = '|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % args
    rows = reader(cmd, header=r_header + p_header)
    for _, hits in groupby(rows, itemgetter('chrom', 'start', 'end')):
        # keep only probes fully contained in the region:
        # start <= pstart <= pend <= end
        contained = [h for h in hits
                     if int(h['start']) <= int(h['pstart'])
                        <= int(h['pend']) <= int(h['end'])]

        tvals = [float(h['pt']) for h in contained if 'pt' in h]

        if max_p:
            if any(float(h[p_key]) > max_p for h in contained):
                continue

        n_gt_05 = sum(1 for h in contained if float(h[p_key]) > 0.05)
        n_gt_1 = sum(1 for h in contained if float(h[p_key]) > 0.1)
        n_t_pos = sum(1 for t in tvals if t > 0)
        n_t_neg = sum(1 for t in tvals if t < 0)
        t_sum = sum(tvals)

        yield ([contained[0][col] for col in r_header]
               + [str(n_t_pos), str(n_t_neg), str(t_sum),
                  str(n_gt_05), str(n_gt_1)])
def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
    """Sort rows within groups defined by `group_cols`, on `sort_cols`.

    Columns listed in `sort_convertors` (index -> conversion function,
    e.g. float) are converted and appended to the end of each row before
    sorting, then stripped afterwards. Sorting on the converted copies
    avoids problems with floating-point string representations.

    `afile` may be "-" for stdin, in which case the already-consumed
    header line is chained back onto the stream.
    """
    the_first_line = get_header(afile)
    row_len = len(the_first_line)

    # Maintain the requested sort order, but for converted columns sort on
    # the appended (converted) copy rather than the raw string column.
    actual_sort_cols = []
    n_extra = 0
    for c in sort_cols:
        if c not in sort_convertors:
            actual_sort_cols.append(c)
        else:
            actual_sort_cols.append(row_len + n_extra)
            n_extra += 1

    # The converted columns must be appended in the same order that
    # `actual_sort_cols` assigned their indices (the order of `sort_cols`),
    # NOT in dict-iteration order of `sort_convertors` -- otherwise the
    # sort keys can point at the wrong converted column.
    converted_cols = [c for c in sort_cols if c in sort_convertors]

    # if it was stdin, we already read one line to get the header length,
    # so chain it back onto the front of the stream.
    lines = reader(afile, header=header) if afile != "-" \
        else chain([the_first_line], reader(afile, header))

    # group by the requested columns
    for keyed, group in groupby(lines,
                                lambda toks: [toks[i] for i in group_cols]):

        # generate the rows with the converted sort columns appended.
        def gen_converted_group():
            for toks in group:
                yield toks + [sort_convertors[c](toks[c])
                              for c in converted_cols]

        # iterate over the sorted rows, stripping the extra columns.
        for toks in sorted(gen_converted_group(),
                           key=itemgetter(*actual_sort_cols)):
            yield toks[:row_len]
def partsort(afile, group_cols, sort_cols, sort_convertors, header=False):
    """Within each group of `group_cols`, sort rows on `sort_cols`.

    Columns named in `sort_convertors` are converted (e.g. to float) and
    appended to the end of each row; the sort runs on those appended
    copies and they are stripped from the output afterwards. This avoids
    problems with floating-point string representations.
    """
    first_line = get_header(afile)
    width = len(first_line)
    appended = len(sort_convertors)

    # Build the effective sort-key indices: raw columns keep their index,
    # converted columns point at their appended copy at the row's end.
    # NOTE(review): the appended copies are produced in
    # `sort_convertors.items()` order below -- this assumes that order
    # matches the converted columns' order within `sort_cols`; confirm
    # with callers.
    key_indices = []
    appended = 0
    for col in sort_cols:
        if col in sort_convertors:
            key_indices.append(width + appended)
            appended += 1
        else:
            key_indices.append(col)

    # For stdin we already consumed the header line above; chain it back.
    if afile == "-":
        lines = chain([first_line], reader(afile, header))
    else:
        lines = reader(afile, header=header)

    group_key = lambda toks: [toks[i] for i in group_cols]
    for _, group in groupby(lines, group_key):

        def with_converted():
            # append the converted copies onto each row of this group
            for toks in group:
                tail = [fn(toks[idx])
                        for idx, fn in sort_convertors.items()]
                yield toks + tail

        for toks in sorted(with_converted(), key=itemgetter(*key_indices)):
            # strip the appended sort-only columns
            yield toks[:width]
def filter(p_bed, region_bed, max_p=None, region_p=None, p_col_name="P.Value",
           coef_col_name="logFC"):
    """Intersect p-value rows with regions and yield a summary per region.

    A region is dropped when its corrected region-level p-value
    ('slk_sidak_p' or 'z_sidak_p') exceeds `region_p`, or when any
    contained probe's p-value exceeds `max_p`. `p_col_name` may be a
    column name or numeric index into the `p_bed` header.

    The header row is yielded lazily, just before the first surviving
    region, because the extra columns depend on which optional source
    columns ('pt' t-scores, a coefficient column) are present.
    """
    p_header = ['p' + col for col in get_header(p_bed)]
    r_header = get_header(region_bed)
    # a numeric p_col_name indexes into the p-bed header ([1:] strips the
    # 'p' prefix added above)
    if isinstance(p_col_name, (int, long)):
        p_col_name = p_header[p_col_name][1:]
    p_key = 'p' + p_col_name

    args = dict(p_bed=p_bed, region_bed=region_bed)
    args['p_bed'] = fix_header(args['p_bed'])

    header_done = False
    cmd = '|bedtools intersect -b %(p_bed)s -a %(region_bed)s -wo' % args
    for group, plist in groupby(reader(cmd, header=r_header + p_header),
                                itemgetter('chrom', 'start', 'end')):
        plist = list(plist)

        if region_p:
            r = plist[0]  # region columns are identical within a group
            region_p_key = ('slk_sidak_p' if 'slk_sidak_p' in r
                            else 'z_sidak_p' if 'z_sidak_p' in r
                            else None)
            if region_p_key is None:
                raise Exception("no 'slk_sidak_p' or 'z_sidak_p' column "
                                "found in %s" % region_bed)
            if float(r[region_p_key]) > region_p:
                continue

        # keep only probes fully contained within the region
        plist = [x for x in plist
                 if int(x['start']) <= int(x['pstart']) <= int(x['pend'])
                 and int(x['start']) <= int(x['pend']) <= int(x['end'])]

        tscores = [float(row['pt']) for row in plist if 'pt' in row]

        if max_p:  # NOTE: max_p == 0 is treated the same as None
            if any(float(row[p_key]) > max_p for row in plist):
                continue

        # build the optional extra columns; they are only emitted when the
        # corresponding source columns exist in the intersected rows
        extra_header = []
        extra = []
        if tscores:
            tpos = sum(1 for ts in tscores if ts > 0)
            tneg = sum(1 for ts in tscores if ts < 0)
            extra_header += ["t.pos/t.neg", "t.sum"]
            extra += ["%i/%i" % (tpos, tneg), str(sum(tscores))]

        # fall back to a generic 'coefficient' column when the requested
        # coefficient column is absent
        if 'p' + coef_col_name not in plist[0] and 'pcoefficient' in plist[0]:
            coef_col_name = 'coefficient'
        coef_key = 'p' + coef_col_name
        if coef_key in plist[0]:
            coef = sum(float(row[coef_key]) for row in plist) / len(plist)
            # the data were probably logit-transformed, so take the inverse
            # and subtract 0.5 since ilogit(0) == 0.5
            icoef = (sum(ilogit(float(row[coef_key]))
                         for row in plist) / len(plist)) - 0.5
            extra_header += ["avg.diff", "ilogit.diff"]
            extra += ["%.3f" % coef, "%.3f" % icoef]

        frow = [plist[0][h] for h in r_header] + extra
        if not header_done:
            yield r_header + extra_header
            header_done = True
        yield frow