def _probe_number(text, convert, default=0):
    '''Convert text with convert (int or float); return default if it cannot be parsed.'''
    try:
        return convert(text)
    except (TypeError, ValueError, OverflowError):
        return default


def readTilingProbes(ftile, group1, group2):
    '''
    Read the probes of a WebArrayDB transposon analysis result file.

    ftile: result file with columns: unique_id (or id), probe_strand,
           probe_start, probe_end, M, A, followed directly by
           group1 + group2 channel columns.
    group1, group2: number of channel columns belonging to each group.

    return: a dict with two sorted lists per strand value found in the file:
        {strand:         [(end, start, (unique_id, M, A, ch1, ch2, ...)), ...],
         strand + 'rev': [(start, end, (unique_id, M, A, ch1, ch2, ...)), ...]}
      - start <= end on every strand except '-', where end < start.
      - values that cannot be parsed as numbers are replaced by 0
        (an unparseable start falls back to end, and vice versa).

    Raises ValueError if a required column is missing from the header.
    '''
    ftile = TableFile(ftile)
    head = next(ftile)
    if 'unique_id' not in head and 'id' in head:
        head[head.index('id')] = 'unique_id'
    i_cols = [head.index(name) for name in
              ('unique_id', 'probe_strand', 'probe_start', 'probe_end', 'M', 'A')]
    # the channel columns are the ones right after 'A'
    first_ch = i_cols[-1] + 1
    i_chs = range(first_ch, first_ch + group1 + group2)

    rlt = {}
    for line in ftile:
        unique_id, strand, start, end, M, A = [line[i] for i in i_cols]
        raw_chs = [line[i] for i in i_chs]

        # start falls back to end (then 0); end falls back to start
        start = _probe_number(start, int, None)
        if start is None:
            start = _probe_number(end, int, 0)
        end = _probe_number(end, int, start)
        M = _probe_number(M, float)
        A = _probe_number(A, float)

        if start > end:
            start, end = end, start
        if strand == '-':  # keep end < start on the negative strand
            start, end = end, start

        v = (unique_id, M, A) + tuple(_probe_number(ch, float) for ch in raw_chs)
        rlt.setdefault(strand, []).append((end, start, v))
        rlt.setdefault(strand + 'rev', []).append((start, end, v))

    for probes in rlt.values():
        probes.sort()
    return rlt
def Main(fsrc, fobj, fkey, ksep=None, sep='\t', linesep=os.linesep):
    '''
    Sort the cells of every row of fsrc by the order given in fkey.

    fsrc: table file; each row is read as a list of cells via TableFile.
    fobj: output target, passed to OutputFile.
    fkey: file listing one key per line, in the wanted order.
          Nothing is done (None is returned) when fkey is empty/None.
    ksep: optional separator; when given, only the part of a cell before
          the first ksep is used as its key. A cell without ksep is used
          as a whole.
    sep, linesep: cell and line separators of the output.

    Cells whose key is not listed in fkey are placed after all listed ones.
    '''
    if not fkey:
        return

    names = [raw.replace('\r', '').replace('\n', '') for raw in InputFile(fkey)]
    order = dict(zip(names, range(len(names))))  # {key: rank, ...}
    # rank for unknown keys; len(names) is above every listed rank even when
    # fkey contains duplicated keys
    unknown = len(names)

    if ksep:
        def getkey(cell):
            # split() instead of index(): a cell without ksep must not raise
            return order.get(cell.split(ksep, 1)[0], unknown)
    else:
        def getkey(cell):
            return order.get(cell, unknown)

    fobj = OutputFile(fobj)
    for line in TableFile(fsrc):
        line.sort(key=getkey)
        fobj.write(sep.join(line) + linesep)
def readMaps(fmap):
    '''
    Load every map file listed in an index table.

    fmap: a Tab-delimited file with columns "No" and "FileName"
    return a dict like { No:{(loc_start, loc_end):Locus_tag, ...}, ...};
    rows whose FileName does not exist on disk are ignored, and an empty
    dict is returned when either column is missing from the header.
    '''
    col_no, col_file = 'No', 'FileName'
    table = TableFile(fmap)
    header = table.next()

    maps = {}  # { No:{(loc_start, loc_end):Locus_tag, ...}, ...}
    if not (col_no in header and col_file in header):
        return maps

    idx_no = header.index(col_no)
    idx_file = header.index(col_file)
    width = max(idx_no, idx_file) + 1

    for row in table:
        # pad short rows so that both columns can be indexed
        missing = width - len(row)
        if missing > 0:
            row.extend([''] * missing)
        number = row[idx_no]
        path = row[idx_file]
        if os.path.exists(path):
            maps[number] = readOneMap(path)
    return maps
def readOneMap(fmap):
    '''
    Read one feature map.

    fmap: a Tab-delimited file with columns: 'Feature ID', 'Start', 'Stop', 'Strand'
    return a dict like {(loc_start, loc_end):Locus_tag, ...}, in which
    Locus_tag comes from the "Feature ID" column (text up to and including
    the first "|" is removed) and loc_start <= loc_end.

    Rows with an empty Start or Stop cell (blank or truncated lines) are
    skipped. Raises ValueError if a column is missing from the header or a
    non-empty coordinate is not an integer.
    '''
    c_cols = ('Feature ID', 'Start', 'Stop', 'Strand')
    flocus = TableFile(fmap)
    head = next(flocus)
    i_locus, i_start, i_end, _i_strand = [head.index(c) for c in c_cols]
    i_max = max(i_locus, i_start, i_end, _i_strand) + 1

    rlt = {}  # {(loc_start, loc_end):Locus_tag, ...}
    for line in flocus:
        if len(line) < i_max:
            line.extend([''] * (i_max - len(line)))
        locus, start, end = line[i_locus], line[i_start], line[i_end]
        # padded/blank rows carry no location; int('') would raise on them
        if not start.strip() or not end.strip():
            continue
        if '|' in locus:  # remove leading "fig|" or "fid|"
            locus = locus.split('|', 1)[1]
        start, end = int(start), int(end)
        if start > end:  # let start <= end
            start, end = end, start
        rlt[(start, end)] = locus
    return rlt
def _gene_coord(text):
    '''Parse a coordinate cell as int; return '' when it is not an integer.'''
    try:
        return int(text)
    except (TypeError, ValueError):
        return ''


def _gene_ordered_pair(a, b, descending=False):
    '''
    Return (a, b) ordered ascending (or descending).

    The '' placeholder of an unparseable coordinate is ordered after every
    integer, so no int/str comparison is ever performed.
    '''
    low, high = sorted((a, b), key=lambda v: (v == '', v))
    if descending:
        return (high, low)
    return (low, high)


def readGeneTab(fanno, default_chr=None, chromosome='chromosome', start='start',
                end='end', strand='strand', col_prefix='gene_', asc_loc=False):
    '''
    Read a gene annotation table.

    fanno: the annotation file has columns:
           chromosome start end strand gene_type gene_symbol gene_title
    default_chr: chromosome name used for every row when the file has no
           chromosome column.
    chromosome, start, end, strand: names of the location columns. Pass an
           empty strand to read a file without strand information.
    col_prefix: prefix added to the names of the location columns in the
           returned column names (nothing is added if empty).
    asc_loc: force st < ed for locs (st, ed) if True. Otherwise st > ed on
           negative strand.

    return a tuple: (info_col_names, info_dict)
      info_col_names: all column names of the file (location columns prefixed).
      info_dict can be
        { chr:{'pos':{(start, end):[info_cols], ...},
               'neg':{(start, end):[info_cols], ...}}, ...} if strand is provided, or
        { chr:[((start, end), [info_cols]), ...], ... } if no strand provided.
        - a list is used in the second situation because theoretically it is
          possible to have replicated (start, end) pairs (on different strands)
      info_cols holds the cells of all columns of the row.
      Any strand value other than '+' is stored under 'neg'.
      A coordinate that is not an integer is stored as ''.

    Raises ValueError if start, end or a non-empty strand is missing from the header.
    '''
    fanno = TableFile(fanno)
    head = next(fanno)
    has_chr = chromosome in head
    has_strand = bool(strand)

    locnms = [start, end]
    if has_chr:
        locnms.insert(0, chromosome)
    if has_strand:
        locnms.append(strand)
    locidxs = [head.index(name) for name in locnms]

    if col_prefix:
        # add prefix "gene_" to the location column names in head
        for i in locidxs:
            head[i] = col_prefix + head[i]
    n_cols = len(head)
    infonms = list(head)

    chrdic = {}
    chrom = default_chr
    for line in fanno:
        fields = [line[i] for i in locidxs]
        if has_chr:
            chrom = fields[0]
            fields = fields[1:]
        st, ed = _gene_coord(fields[0]), _gene_coord(fields[1])
        infos = [line[i] for i in range(n_cols)]

        if has_strand:
            if fields[2] == '+':
                strand_key = 'pos'
                loc = _gene_ordered_pair(st, ed)
            else:
                strand_key = 'neg'
                loc = _gene_ordered_pair(st, ed, descending=not asc_loc)
            chrdic.setdefault(chrom, {}).setdefault(strand_key, {})[loc] = infos
        else:
            chrdic.setdefault(chrom, []).append((_gene_ordered_pair(st, ed), infos))
    return infonms, chrdic
def _tile_probes_before(pbs, loc, end, n_probes):
    '''
    Return the n_probes entries of the sorted list pbs that precede index loc.

    The entry at loc is included too when it overlaps position end (its
    second field is smaller than end). The list is treated as circular, so
    the selection wraps to the tail of pbs when fewer than n_probes entries
    precede loc.
    '''
    if loc < len(pbs) and pbs[loc][1] < end:
        loc += 1  # include the overlapping tile
    n = min(n_probes, len(pbs))
    if loc >= n:
        return pbs[loc - n:loc]
    return pbs[loc - n:] + pbs[:loc]  # loc - n < 0: take the missing ones from the tail


def _tile_probes_from(pbs, loc, end, n_probes):
    '''
    Return the n_probes entries of the sorted list pbs starting at index loc.

    The entry before loc is included too when it overlaps position end (its
    second field is larger than end). The list is treated as circular, so
    the selection wraps to the head of pbs when fewer than n_probes entries
    follow loc.
    '''
    if loc > 0 and pbs[loc - 1][1] > end:
        loc -= 1  # include the overlapping tile
    n = min(n_probes, len(pbs))
    overflow = loc + n - len(pbs)
    if overflow > 0:
        return pbs[loc:] + pbs[:overflow]
    return pbs[loc:loc + n]


def _tile_result_cells(ids, m_val, m1, m2, median1, median2, threshold):
    '''Build the nine result cells (as strings) of one strand.'''
    dm1, dm2 = m1 - median1, m2 - median2
    if m_val < -1:
        regulation = -1
    elif m_val > 1:
        regulation = 1
    else:
        regulation = 0
    call1 = 'present' if dm1 > threshold else 'absent'
    call2 = 'present' if dm2 > threshold else 'absent'
    return [str(c) for c in (ids, m_val, m1, m2, dm1, dm2, regulation, call1, call2)]


def _tile_fill(row, idxs, cells, join_str):
    '''Put cells into row at idxs, appending with join_str to non-empty cells.'''
    for j, cell in zip(idxs, cells):
        if not row[j].strip():
            row[j] = cell
        else:
            row[j] = row[j] + join_str + cell


def Main(fsrc, fobj, ftile, chr_name, group1=1, group2=1, n_probes=5,
         col_chr='chromosome', col_strand='strand', col_start='probe_start',
         col_end='probe_end', threshold=1, single_row=False, sep='\t',
         linesep=os.linesep, join_str=' /// '):
    '''
    Annotate the rows of fsrc with the tiling probes found next to them.

    fsrc: a file with columns for col_chr, col_strand, col_start, col_end.
          A cell may hold several values separated by join_str.
    fobj: output file name or an object with a write() method. A file opened
          here from a name is closed before returning.
    ftile: WebArrayDB transposon analysis result file, with columns:
           unique_id, probe_strand, probe_start, probe_end, M, A
    chr_name: the chromosome name related to the specific ftile.
           (chromosome and plasmid are separated in WebArrayDB analysis!)
           Locations on other chromosomes are skipped.
    group1, group2: number of channel columns of each group in ftile.
    n_probes: number of probes selected per strand for each location.
    threshold: a group is called 'present' when its median exceeds the
           general median of that group by more than threshold.
    single_row: if True, the selected probes are summarised by their median
           and each input row gives one output row; otherwise one output
           row is written per selected probe pair (+ and - strand).
    sep, linesep: cell and line separators of the output.

    For a location on the '-' strand the probes starting at its lower
    coordinate are selected; otherwise the probes ending at its upper
    coordinate. Probe lists are treated as circular. Rows without any
    selected probe are written unchanged.
    '''
    from bisect import bisect_left, bisect_right

    # read Tn result first
    # {'+':[(end, start, (unique_id, M, A, ch...)), ...], '+rev':[(start, end, (...)), ...], ...}
    # end is smaller than start on '-' strand.
    probes = readTilingProbes(ftile, group1, group2)
    pbs_pos = probes.get('+', [])
    pbs_pos_rev = probes.get('+rev', [])
    pbs_neg = probes.get('-', [])
    pbs_neg_rev = probes.get('-rev', [])
    # sort keys of each list; searching plain integers avoids comparing the
    # probe tuples with a partial tuple
    keys_pos = [p[0] for p in pbs_pos]
    keys_pos_rev = [p[0] for p in pbs_pos_rev]
    keys_neg = [p[0] for p in pbs_neg]
    keys_neg_rev = [p[0] for p in pbs_neg_rev]

    # channel positions inside a probe value tuple (unique_id, M, A, ch1, ...)
    ch_lo = 3
    ch_mid = ch_lo + group1
    ch_hi = ch_mid + group2

    # get the general median of intensity values for group1 and group2
    all_probes = pbs_pos + pbs_neg
    median1 = numpy.median([x for p in all_probes for x in p[2][ch_lo:ch_mid]])
    median2 = numpy.median([x for p in all_probes for x in p[2][ch_mid:ch_hi]])

    fsrc = TableFile(fsrc)
    head = next(fsrc)
    i_cols = [head.index(c) for c in (col_chr, col_strand, col_start, col_end)]
    col_rlts = (
        'unique_id (+)', 'tuple median of M (+)', 'tuple median (group1, +)',
        'tuple median (group2, +)', 'tuple median (group1, +) - general median',
        'tuple median (group2, +) - general median',
        'island regulation (group1/group2, +)', 'signal call (group1, +)',
        'signal call (group2, +)',  # 0-8
        'unique_id (-)', 'tuple median of M (-)', 'tuple median (group1, -)',
        'tuple median (group2, -)', 'tuple median (group1, -) - general median',
        'tuple median (group2, -) - general median',
        'island regulation (group1/group2, -)', 'signal call (group1, -)',
        'signal call (group2, -)'  # 9-17
    )
    for colnm in col_rlts:
        if colnm not in head:
            head.append(colnm)
    i_rlts = [head.index(colnm) for colnm in col_rlts]
    n_col = len(head)

    opened_here = isinstance(fobj, str)
    if opened_here:
        fobj = open(fobj, 'w')
    try:
        fobj.write(sep.join(head) + linesep)
        for line in fsrc:
            n_dif = n_col - len(line)
            if n_dif > 0:
                line.extend([''] * n_dif)
            chrs, strands, starts, ends = [line[i].split(join_str) for i in i_cols]

            vals = []  # one list of 18 result cells per output row
            for chrom, strand, start, end in zip(chrs, strands, starts, ends):
                if chrom != chr_name:
                    continue  # skip other chromosomes
                start, end = int(start), int(end)
                if start > end:
                    start, end = end, start
                if strand == '-':
                    start, end = end, start  # end is the lower coordinate now
                    secs_neg = _tile_probes_from(
                        pbs_neg, bisect_left(keys_neg, end), end, n_probes)
                    secs_pos = _tile_probes_from(
                        pbs_pos_rev, bisect_left(keys_pos_rev, end), end, n_probes)
                else:  # positive strand
                    secs_pos = _tile_probes_before(
                        pbs_pos, bisect_right(keys_pos, end), end, n_probes)
                    secs_neg = _tile_probes_before(
                        pbs_neg_rev, bisect_right(keys_neg_rev, end), end, n_probes)

                if single_row:
                    cells = []
                    for secs in (secs_pos, secs_neg):
                        ids = ', '.join([p[2][0] for p in secs])
                        # filter out M values 0, which are converted from NA
                        m_vals = [p[2][1] for p in secs if p[2][1]]
                        g1 = [x for p in secs for x in p[2][ch_lo:ch_mid]]
                        g2 = [x for p in secs for x in p[2][ch_mid:ch_hi]]
                        # the median of an empty selection is nan
                        cells.extend(_tile_result_cells(
                            ids, numpy.median(m_vals), numpy.median(g1),
                            numpy.median(g2), median1, median2, threshold))
                    vals.append(cells)
                else:
                    for secpair in zip(secs_pos, secs_neg):
                        cells = []
                        for probe in secpair:
                            v = probe[2]
                            cells.extend(_tile_result_cells(
                                v[0], v[1], numpy.median(v[ch_lo:ch_mid]),
                                numpy.median(v[ch_mid:ch_hi]),
                                median1, median2, threshold))
                        vals.append(cells)

            if single_row or not vals:
                if len(vals) > 1:
                    # several locations in one row: join their cells column-wise
                    vals = [[join_str.join(column) for column in zip(*vals)]]
                if vals:
                    _tile_fill(line, i_rlts, vals[0], join_str)
                fobj.write(sep.join(line) + linesep)
            else:
                for cells in vals:
                    linetmp = line[:]
                    _tile_fill(linetmp, i_rlts, cells, join_str)
                    fobj.write(sep.join(linetmp) + linesep)
    finally:
        if opened_here:
            fobj.close()