예제 #1
0
def readTilingProbes(ftile, group1, group2):
    '''
    Read the probes of a WebArrayDB transposon analysis result file, grouped by strand.

    ftile: result file with the columns unique_id (or id), probe_strand,
        probe_start, probe_end, M and A. The group1 + group2 columns that
        directly follow column A are read as the channel values.
    group1, group2: number of channel columns belonging to each group.
    return: a dict with two sorted lists per strand value found in the file:
        {strand: [(end, start, values), ...],
         strand + 'rev': [(start, end, values), ...]}
        where values is (unique_id, M, A, ch1, ch2, ...).
        start <= end, except on the '-' strand where the two are swapped
        (end <= start).
    Values that cannot be parsed as numbers become 0. An unparsable start
    falls back to end (then to 0) and an unparsable end falls back to start.
    '''

    def to_float(text):
        # non-numeric cells (e.g. "NA") count as 0
        try:
            return float(text)
        except (TypeError, ValueError):
            return 0

    ftile = TableFile(ftile)
    head = ftile.next()
    if 'unique_id' not in head and 'id' in head:
        # accept "id" as an alias of "unique_id"
        head[head.index('id')] = 'unique_id'
    i_cols = [
        head.index(name)
        for name in ('unique_id', 'probe_strand', 'probe_start', 'probe_end',
                     'M', 'A')
    ]
    first_ch = i_cols[-1] + 1  # the channel columns start right after 'A'
    i_chs = range(first_ch, first_ch + group1 + group2)
    rlt = {}
    for line in ftile:
        unique_id, strand, start, end, M, A = [line[i] for i in i_cols]
        chs = [to_float(line[i]) for i in i_chs]
        try:
            start = int(start)
        except (TypeError, ValueError):
            try:
                start = int(end)
            except (TypeError, ValueError):
                start = 0
        try:
            end = int(end)
        except (TypeError, ValueError):
            end = start
        if start > end:
            start, end = end, start
        if strand == '-':
            # on the '-' strand the probe runs from the larger coordinate down
            start, end = end, start
        v = (unique_id, to_float(M), to_float(A)) + tuple(chs)
        rlt.setdefault(strand, []).append((end, start, v))
        rlt.setdefault(strand + 'rev', []).append((start, end, v))
    for probes in rlt.values():
        probes.sort()
    return rlt
예제 #2
0
def Main(fsrc, fobj, fkey, ksep=None, sep='\t', linesep=os.linesep):
    '''
    Reorder the fields of every row of fsrc according to a key list.

    fsrc: source table, read row by row through TableFile.
    fobj: destination, opened through OutputFile.
    fkey: key list read through InputFile, one key per item; the position of
        a key in this list is its sort rank. Nothing is done when fkey is empty.
    ksep: optional separator; when given, only the part of a field before its
        first occurrence is looked up as the key. A field that does not
        contain ksep is looked up as a whole.
    sep: string used to join the fields of an output row.
    linesep: string appended to every output row.

    Fields whose key is not listed are placed after all listed keys. The sort
    is stable, so fields with the same rank keep their original order.
    '''
    if not fkey:
        return
    names = [a.replace('\r', '').replace('\n', '') for a in InputFile(fkey)]
    order = dict(zip(names, range(len(names))))  # {key: rank, ...}
    # Rank for unknown keys: taken from the raw list length so it is larger
    # than every assigned rank even when the key list contains duplicates.
    unknown = len(names)
    if ksep:
        def getkey(field):
            return order.get(field.partition(ksep)[0], unknown)
    else:
        def getkey(field):
            return order.get(field, unknown)
    fobj = OutputFile(fobj)
    for line in TableFile(fsrc):
        line.sort(key=getkey)
        fobj.write(sep.join(line) + linesep)
예제 #3
0
def readMaps(fmap):
    '''
    Load every locus map listed in an index file.

    fmap: a Tab-delimited file with columns "No" and "FileName"
    return: {No: map, ...} where map is the value readOneMap returns for the
        file named in that row, i.e. {(loc_start, loc_end): Locus_tag, ...}.
        Rows whose FileName does not exist on disk are skipped. An empty dict
        is returned when either column is missing from the header.
    '''
    maps = {}
    wanted = ('No', 'FileName')
    table = TableFile(fmap)
    header = table.next()
    if not all(name in header for name in wanted):
        return maps
    no_idx, file_idx = [header.index(name) for name in wanted]
    width = max(no_idx, file_idx) + 1
    for row in table:
        shortfall = width - len(row)
        if shortfall > 0:
            # pad short rows so both columns can be indexed
            row.extend([''] * shortfall)
        path = row[file_idx]
        if os.path.exists(path):
            maps[row[no_idx]] = readOneMap(path)
    return maps
예제 #4
0
def readOneMap(fmap):
    '''
    Read one locus map file.

    fmap: a Tab-delimited file with columns: 'Feature ID', 'Start', 'Stop', 'Strand'
    return: {(loc_start, loc_end): Locus_tag, ...} with loc_start <= loc_end.
        Locus_tag is the "Feature ID" value with everything up to and
        including its first "|" removed (e.g. a leading "fig|" or "fid|").
    All four columns must be present in the header, although the Strand value
    is not used. A Start or Stop value that is not an integer raises ValueError.
    '''
    table = TableFile(fmap)
    header = table.next()
    col_idxs = [
        header.index(name)
        for name in ('Feature ID', 'Start', 'Stop', 'Strand')
    ]
    width = max(col_idxs) + 1
    loci = {}
    for row in table:
        shortfall = width - len(row)
        if shortfall > 0:
            # pad short rows so every column can be indexed
            row.extend([''] * shortfall)
        locus, start, end, _strand = [row[i] for i in col_idxs]
        if '|' in locus:
            locus = locus.split('|', 1)[1]
        bounds = sorted((int(start), int(end)))
        loci[tuple(bounds)] = locus
    return loci
예제 #5
0
def readGeneTab(fanno,
                default_chr=None,
                chromosome='chromosome',
                start='start',
                end='end',
                strand='strand',
                col_prefix='gene_',
                asc_loc=False):
    '''
    Read a gene annotation table and index its rows by chromosome and location.

    fanno: the annotation file, with columns such as
        chromosome  start  end  strand  gene_type  gene_symbol  gene_title
    default_chr: chromosome key used for every row when the file has no
        chromosome column.
    chromosome, start, end, strand: names of the location columns. Pass an
        empty strand to ignore strands. The start, end and (if given) strand
        columns must exist in the header.
    col_prefix: prefix added to the names of the location columns in the
        returned column names; nothing is renamed when it is empty.
    asc_loc: force st <= ed for locs (st, ed) if True. Otherwise st >= ed on
        the negative strand. Every strand value other than '+' is treated as
        the negative strand.
    return: a tuple (info_col_names, info_dict)
        info_col_names is the full header (after prefixing).
        info_dict is
            {chr: {'pos': {(st, ed): [info_cols], ...},
                   'neg': {(st, ed): [info_cols], ...}}, ...}
        if strand is provided (a later row replaces an earlier one with the
        same location), or
            {chr: [((st, ed), [info_cols]), ...], ...}
        if no strand is provided. A list is used in the second case because
        replicated (st, ed) pairs are possible there.
        [info_cols] holds all columns of the row, in header order.
        A start or end value that is not an integer is stored as ''.
    '''

    def to_int(text):
        # unparsable positions are kept as an empty string
        try:
            return int(text)
        except (TypeError, ValueError):
            return ''

    fanno = TableFile(fanno)
    head = fanno.next()
    has_chr = chromosome in head
    has_strand = bool(strand)

    locnms = [start, end]
    if has_chr:
        locnms.insert(0, chromosome)
    if has_strand:
        locnms.append(strand)
    locidxs = [head.index(name) for name in locnms]
    n_cols = len(head)
    if col_prefix:
        # add the prefix (e.g. "gene_") to the location column names
        for i in locidxs:
            head[i] = col_prefix + head[i]
    infonms = list(head)

    chrdic = {}
    chrom = default_chr
    for line in fanno:
        locvals = [line[i] for i in locidxs]
        if has_chr:
            chrom = locvals.pop(0)
        st, ed = to_int(locvals[0]), to_int(locvals[1])
        infos = [line[i] for i in range(n_cols)]
        ascending = (st, ed) if st <= ed else (ed, st)
        if not has_strand:
            chrdic.setdefault(chrom, []).append((ascending, infos))
            continue
        thischr = chrdic.setdefault(chrom, {})
        if locvals[2] == '+':
            thischr.setdefault('pos', {})[ascending] = infos
        else:
            if asc_loc:
                loc = ascending
            else:
                loc = (ascending[1], ascending[0])
            thischr.setdefault('neg', {})[loc] = infos
    return infonms, chrdic
예제 #6
0
def _probes_after(probes, loc, n_probes):
    '''Return up to n_probes entries of probes starting at index loc, wrapping round to the head of the list.'''
    window = probes[loc:loc + n_probes]
    missing = n_probes - len(window)
    if missing > 0:
        # ran off the end: continue with the first probes, but never re-use
        # an entry that is already part of the window
        window = window + probes[:min(missing, loc)]
    return window


def _probes_before(probes, loc, n_probes):
    '''Return up to n_probes entries of probes ending just before index loc, wrapping round to the tail of the list.'''
    window = probes[max(loc - n_probes, 0):loc]
    missing = n_probes - len(window)
    if missing > 0:
        # ran off the start: prepend the last probes; only entries at or
        # after loc are candidates, so nothing is taken twice
        window = probes[loc:][-missing:] + window
    return window


def _result_values(ids, mM, m1, m2, median1, median2, threshold):
    '''Return the nine values written for one strand, in result column order.'''
    dm1, dm2 = m1 - median1, m2 - median2
    if mM < -1:
        sM = -1
    elif mM > 1:
        sM = 1
    else:
        sM = 0
    s1 = 'present' if dm1 > threshold else 'absent'
    s2 = 'present' if dm2 > threshold else 'absent'
    return [ids, mM, m1, m2, dm1, dm2, sM, s1, s2]


def _merge_values(row, idxs, values, join_str):
    '''Store values in row at idxs; a cell that already holds text gets the value appended with join_str.'''
    for j, value in zip(idxs, values):
        if row[j].strip():
            row[j] = row[j] + join_str + value
        else:
            row[j] = value


def Main(fsrc,
         fobj,
         ftile,
         chr_name,
         group1=1,
         group2=1,
         n_probes=5,
         col_chr='chromosome',
         col_strand='strand',
         col_start='probe_start',
         col_end='probe_end',
         threshold=1,
         single_row=False,
         sep='\t',
         linesep=os.linesep,
         join_str=' /// '):
    '''
    Annotate every row of fsrc with the tiling probes found at the end of its locations.

    fsrc: a file with columns for col_chr, col_strand, col_start, col_end.
        A cell may hold several locations separated by join_str.
    fobj: output file name, or an object with a write() method. A file that is
        opened here is also closed here.
    ftile: WebArrayDB transposon analysis result file, with columns:
        unique_id, probe_strand, probe_start, probe_end, M, A, followed by
        group1 + group2 channel columns.
    chr_name: the chromosome name related to the specific ftile (chromosome
        and plasmid are separated in WebArrayDB analysis!). Locations on
        other chromosomes are skipped.
    group1, group2: number of channel columns in each group.
    n_probes: number of probes collected per strand for each location.
    threshold: a group is called 'present' when its median exceeds the general
        median of that group by more than threshold, otherwise 'absent'.
    single_row: if True, write one output row per input row, summarising the
        n_probes probes of each strand by medians (several locations in one
        row are joined with join_str). If False, write one output row per
        pair of ('+' probe, '-' probe) with the values of the single probes.
    sep, linesep: field and row separators of the output.

    For a location on the '-' strand the probes are taken from the smaller
    coordinate upwards; otherwise they are the probes up to the larger
    coordinate. A probe overlapping that coordinate is included. When the
    probe list ends before n_probes are found, collection continues at the
    other end of the list.

    Nine result columns per strand are filled (added to the header when
    missing): probe ids, median of M, group medians, group medians minus the
    general medians, regulation call (-1 if M < -1, 1 if M > 1, else 0) and
    the two signal calls. Rows without any matching location are written
    unchanged.
    '''
    # read Tn result first; the '-' lists hold (end, start, values) with end
    # being the smaller coordinate, the 'rev' lists hold (start, end, values)
    probes = readTilingProbes(ftile, group1, group2)
    pbs_pos = probes.get('+', [])
    pbs_pos_rev = probes.get('+rev', [])
    pbs_neg = probes.get('-', [])
    pbs_neg_rev = probes.get('-rev', [])
    ln_pos, ln_neg = len(pbs_pos), len(pbs_neg)

    # values is (unique_id, M, A, ch1, ...): the channels start at index 3
    g1_from, g1_to = 3, 3 + group1
    g2_from, g2_to = g1_to, g1_to + group2

    # general median of the intensity values of group1 and of group2
    all_g1, all_g2 = [], []
    for probe in pbs_pos + pbs_neg:
        all_g1.extend(probe[2][g1_from:g1_to])
        all_g2.extend(probe[2][g2_from:g2_to])
    median1 = numpy.median(all_g1)
    median2 = numpy.median(all_g2)

    fsrc = TableFile(fsrc)
    head = fsrc.next()
    i_cols = [
        head.index(name) for name in (col_chr, col_strand, col_start, col_end)
    ]
    col_rlts = (
        'unique_id (+)',
        'tuple median of M (+)',
        'tuple median (group1, +)',
        'tuple median (group2, +)',
        'tuple median (group1, +) - general median',
        'tuple median (group2, +) - general median',
        'island regulation (group1/group2, +)',
        'signal call (group1, +)',
        'signal call (group2, +)',  # 0-8
        'unique_id (-)',
        'tuple median of M (-)',
        'tuple median (group1, -)',
        'tuple median (group2, -)',
        'tuple median (group1, -) - general median',
        'tuple median (group2, -) - general median',
        'island regulation (group1/group2, -)',
        'signal call (group1, -)',
        'signal call (group2, -)'  # 9-17
    )
    for colnm in col_rlts:
        if colnm not in head:
            head.append(colnm)
    i_rlts = [head.index(colnm) for colnm in col_rlts]
    n_col = len(head)

    # Sorts after every (coordinate, int, ...) entry with the same first item,
    # so bisect() lands behind all probes sharing that coordinate.
    after_all = float('inf')
    nan = float('nan')

    opened_here = isinstance(fobj, str)
    if opened_here:
        fobj = open(fobj, 'w')
    try:
        fobj.write(sep.join(head) + linesep)
        for line in fsrc:
            n_dif = n_col - len(line)
            if n_dif > 0:
                line.extend([''] * n_dif)
            chrs, strands, starts, ends = [
                line[i].split(join_str) for i in i_cols
            ]
            vals = []
            for chrom, strand, start, end in zip(chrs, strands, starts, ends):
                if chrom != chr_name:
                    continue  # skip other chromosomes
                start, end = int(start), int(end)
                if start > end:
                    start, end = end, start
                if strand == '-':
                    gene_end = start  # the smaller coordinate
                    loc = bisect(pbs_neg, (gene_end, ))
                    if loc > 0 and pbs_neg[loc - 1][1] > gene_end:
                        loc -= 1  # check the overlapping tile
                    secs_neg = _probes_after(pbs_neg, loc, n_probes)
                    loc = bisect(pbs_pos_rev, (gene_end, ))
                    if loc > 0 and pbs_pos_rev[loc - 1][1] > gene_end:
                        loc -= 1  # check the overlapping tile
                    secs_pos = _probes_after(pbs_pos_rev, loc, n_probes)
                else:  # positive strand
                    gene_end = end
                    loc = bisect(pbs_pos, (gene_end, after_all))
                    if loc < ln_pos and pbs_pos[loc][1] < gene_end:
                        loc += 1  # check the overlapping tile
                    secs_pos = _probes_before(pbs_pos, loc, n_probes)
                    loc = bisect(pbs_neg_rev, (gene_end, after_all))
                    if loc < ln_neg and pbs_neg_rev[loc][1] < gene_end:
                        loc += 1  # check the overlapping tile
                    secs_neg = _probes_before(pbs_neg_rev, loc, n_probes)

                if single_row:
                    val = []
                    for secs in (secs_pos, secs_neg):
                        ids = ', '.join([probe[2][0] for probe in secs])
                        if secs:
                            # columns of DT: M, A, ch1, ch2, ...
                            DT = numpy.array([probe[2][1:] for probe in secs])
                            # drop the values 0, which are converted from NA
                            mM = numpy.median([m for m in DT[:, 0] if m])
                            m1 = numpy.median(DT[:, g1_from - 1:g1_to - 1])
                            m2 = numpy.median(DT[:, g2_from - 1:g2_to - 1])
                        else:
                            # no probe on this strand: nothing to summarise
                            mM = m1 = m2 = nan
                        val.extend(
                            _result_values(ids, mM, m1, m2, median1, median2,
                                           threshold))
                    vals.append([str(x) for x in val])
                else:
                    for secpair in zip(secs_pos, secs_neg):
                        vsp = []
                        for probe in secpair:
                            rec = probe[2]  # (unique_id, M, A, ch1, ...)
                            m1 = numpy.median(rec[g1_from:g1_to])
                            m2 = numpy.median(rec[g2_from:g2_to])
                            vsp.extend(
                                _result_values(rec[0], rec[1], m1, m2,
                                               median1, median2, threshold))
                        vals.append([str(x) for x in vsp])

            if not vals:
                fobj.write(sep.join(line) + linesep)
            elif single_row:
                # join the values of all locations of this row column by column
                merged = [join_str.join(column) for column in zip(*vals)]
                _merge_values(line, i_rlts, merged, join_str)
                fobj.write(sep.join(line) + linesep)
            else:
                for val in vals:
                    linetmp = line[:]
                    _merge_values(linetmp, i_rlts, val, join_str)
                    fobj.write(sep.join(linetmp) + linesep)
    finally:
        if opened_here:
            fobj.close()