Example #1
File: label.py Project: csiu/promi2
import re

def main(infile, labelfile, outfile):
    label_dict = _labelfile2dict_matchMirna(labelfile)

    with open(outfile, 'w') as out:
        with open(infile) as f:
            for line in f:
                line = line.strip().split('\t')
                chrom  = line[0]
                strand = line[6]

                info   = line[8].strip(';')
                info   = re.split('[;@]', info)

                mstart = get_value_from_keycolonvalue_list('mirna_start', info)
                mstop  = get_value_from_keycolonvalue_list('mirna_stop', info)

                #mirna = ','.join([chrom,mstart,mstop,strand])

                if mstart == '' and mstop == '':
                    label = 'NA'
                else:
                    mirna = get_value_from_keycolonvalue_list('mirbase_id', info)

                    mirna = re.match(r'^(\w*-\w*-\d*)', mirna).group(1)
                    if mirna in label_dict:
                        label = label_dict[mirna]
                    else:
                        label = 'unknown'

                info.append('mirna_label:'+label)
                line[8] = ';'.join(info)
                newline = '\t'.join(line)
                out.write(newline + '\n')
    print outfile
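
Every example in this collection relies on the project helper get_value_from_keycolonvalue_list, which is never shown. A minimal sketch, assuming the optional third argument overrides the ':' separator (Example #4 passes '=') and that a missing key yields an empty string, as the callers expect:

def get_value_from_keycolonvalue_list(key, kv_list, sep=':'):
    ## look up `key` in a list like ['start:100', 'stop:200'];
    ## return '' when the key is absent
    for item in kv_list:
        k, _, v = item.partition(sep)
        if k == key:
            return v
    return ''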
Example #2
def _cleanup_extra_positions(infile, outfile):
    ## cleanup of extra positions
    ## compare miRNA positions in PROX & CORR
    with open(outfile, 'w') as out:
        with open(infile) as f:
            for line in f:
                l = line.split('\t')
                descript = l[8].split('@')

                if (descript[1] != '') and (descript[2] != '\n'):
                    info_mprox = descript[1].split(';')
                    prox_start = get_value_from_keycolonvalue_list(
                        'mirna_start', info_mprox)
                    prox_stop = get_value_from_keycolonvalue_list(
                        'mirna_stop', info_mprox)

                    info_corr = descript[2].split(';')
                    corr_start = get_value_from_keycolonvalue_list(
                        'mirna_start', info_corr)
                    corr_stop = get_value_from_keycolonvalue_list(
                        'mirna_stop', info_corr)

                    if (prox_start == corr_start) and \
                           (prox_stop == corr_stop):
                        out.write(line)
                else:
                    out.write(line)
    return outfile
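
The ninth GFF column these parsers split on '@' holds up to three key:value blocks; per the comments above, the second is the PROX block and the third the CORR block. An illustrative value with made-up coordinates:

## hypothetical info column; real values come from the promi2 pipeline
info = 'start:100;stop:200@mirna_start:150;mirna_stop:170@mirna_start:150;mirna_stop:170;mirbase_id:hsa-mir-21'
descript = info.split('@')   ## descript[1] -> PROX, descript[2] -> CORR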
Example #3
import os

def _verify_valid_distance(infile):
    out_good = infile + '.validdistance'
    out_bad = infile + '.badpair'

    with open(out_bad, 'w') as outB:
        with open(out_good, 'w') as outG:
            with open(infile) as f:
                for l in f:
                    l = l.strip().split('\t')

                    info = l[8].split(';')
                    d = get_value_from_keycolonvalue_list('distance', info)

                    if d == 'NA':
                        chrom = l[0]
                        start = l[3]
                        stop = l[4]
                        strand = l[6]
                        mirna = get_value_from_keycolonvalue_list(
                            'mirna_query', info)

                        badpair = 'chr%s:%s..%s,%s\t%s' % (chrom, start, stop,
                                                           strand, mirna)
                        outB.write(badpair + '\n')
                    else:
                        outG.write('\t'.join(l) + '\n')

    if os.stat(out_bad).st_size != 0:
        print "## There are some bad positions in your input file:"
        print "## chromosome or strand differences between TSS and miRNA pair"
        print out_bad
    else:
        os.remove(out_bad)

    return out_good
Example #4
import re

def _make_newline(l, d):
    ## output in gff format
    mirna_proximity = str(distance_score(d))

    chrom      = l[0]
    peak_start = l[3]
    peak_stop  = l[4]
    strand     = l[6]

    info = l[8].split(';')
    region_up   = get_value_from_keycolonvalue_list('region_start', info)
    region_down = get_value_from_keycolonvalue_list('region_stop', info)

    mirna_start = l[12]
    mirna_stop  = l[13]

    mirna_info  = re.sub(' |"', '', l[17]).strip().split(';')
    mirna_acc   = get_value_from_keycolonvalue_list('ACC', mirna_info, '=')
    mirbase_id  = get_value_from_keycolonvalue_list('ID', mirna_info, '=')


    new_info = ';'.join(['distance:'+str(d),
                         #'region_start:'+region_up, 'region_stop:'+region_down,
                         'mirna_acc:'+mirna_acc, 'mirbase_id:'+mirbase_id,
                         'mirna_start:'+mirna_start, 'mirna_stop:'+mirna_stop])

    newline = '\t'.join([chrom, l[1], l[2], #'overlap', 'putative_tss',
                         peak_start, peak_stop, mirna_proximity, strand, '.',
                         new_info])
    return newline
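
distance_score is not shown in any example. A placeholder sketch, assuming a score that decays linearly to zero across the 50 kb window used elsewhere in this collection (Example #18 keeps pairs with 0 <= d <= 50000); the actual promi2 formula may differ:

def distance_score(d, window=50000):
    ## hypothetical scoring: 1.0 at the miRNA, 0.0 at or beyond `window` bp;
    ## the real promi2 formula is not shown in these examples
    d = abs(float(d))
    if d >= window:
        return 0.0
    return 1.0 - d / window

## e.g. distance_score(25000) -> 0.5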
Example #5
def _swap_columns(f_cage, f_out):
    with open(f_out, 'w') as out:
        with open(f_cage) as f:
            for l in f:
                l = l.strip().split('\t')
                tss_up = l[3]
                tss_down = l[4]
                info = l[8].split(';')

                start = get_value_from_keycolonvalue_list('start', info)
                stop = get_value_from_keycolonvalue_list('stop', info)

                ## new info column
                info = filter(
                    lambda x: not (x.startswith('start:') or x.startswith(
                        'stop:')), info)
                info.append('region_start:%s;region_stop:%s' %
                            (tss_up, tss_down))
                l[8] = ';'.join(info)

                ## new start & stop
                l[3] = start
                l[4] = stop

                out.write('\t'.join(l) + '\n')
    return
Example #6
def _pull_putative_prom(somefile, somelist, is_strict):
    with open(somefile) as f:
        for l in f:
            l = l.strip().split('\t')
            label = l[13]

            if label.startswith('prom'):
                if is_strict:
                    mprox = float(
                        get_value_from_keycolonvalue_list(
                            'mirna_prox', l[7].split(';')))
                    if mprox == 0:
                        continue

                chrom = l[0]
                start = l[3]
                stop = l[4]
                count = l[5]
                strand = l[6]
                prob_prom = l[11]

                info = l[8].split('@')[1].split(';')
                mirna = '%s-%s' % (
                    get_value_from_keycolonvalue_list('mirna_start', info),
                    get_value_from_keycolonvalue_list('mirna_stop', info))
                if mirna == '-':
                    mirna = 'NA-NA'

                try:
                    info = l[8].split('@')[2].split(';')
                    mirnaid = get_value_from_keycolonvalue_list(
                        'mirbase_id', info)
                    if mirnaid != '':
                        mirna = '%s:%s' % (mirna, mirnaid)
                    else:
                        mirna = '%s:%s' % (mirna, 'NA')
                except IndexError:
                    ## info column has no third '@' block
                    pass

                position = '%s,%s,%s,%s,%s' % (chrom, start, stop, strand,
                                               mirna)

                ## consider position with max prob
                if position in somelist:
                    old_count, old_prob, nlib, _ = somelist[position]
                    ## compare numerically: these fields were read as strings
                    if (float(old_prob) > float(prob_prom)) or \
                           (float(old_prob) == float(prob_prom) and
                            float(old_count) >= float(count)):
                        somelist[position][2] += 1
                        continue
                    else:
                        somelist[position] = [count, prob_prom, nlib + 1, l[7]]
                else:
                    somelist[position] = [count, prob_prom, 1, l[7]]
    return somelist
Example #7
def _verify_mirbaseID(gff_infile, gff_outfile):
    with open(gff_outfile, 'w') as out:
        with open(gff_infile) as f:
            for l in f:
                info = l.strip().split('\t')[8].split('@')
                _x = info[-2].split(';')
                _y = info[-1].split(';')

                _x = get_value_from_keycolonvalue_list('mirbase_id', _x)
                _y = get_value_from_keycolonvalue_list('mirbase_id', _y)

                if _x == _y or _x == '' or _y == '':
                    out.write(l)
    return
Example #8
def _average_conservation(f_cons, f_aver_cons):
    with open(f_aver_cons, 'w') as out:
        with open(f_cons) as f:
            for l in f:
                l = l.split(',')

                info = l[0].split(';')
                start_pos = get_value_from_keycolonvalue_list('start', info)
                stop_pos  = get_value_from_keycolonvalue_list('stop', info)

                try:
                    ## lmean: arithmetic mean (presumably the legacy stats.py module)
                    av_conservation = lmean([float(i) for i in l[1:]])
                except (ValueError, ZeroDivisionError):
                    av_conservation = 0.0

                out.write('\t'.join([start_pos, stop_pos, str(av_conservation)]) +'\n')
Example #9
import linecache
import sys

def build_features_matrix(sorted_gff, sorted_cpg, sorted_avcons, sorted_tata, f_out):
    ## check that all in files contain same number of data lines
    n_g = line_count(sorted_gff)
    n_c = line_count(sorted_cpg)
    n_a = line_count(sorted_avcons)
    n_t = line_count(sorted_tata)
    if not all_same([n_g, n_c, n_a, n_t]):
        sys.exit('Error: line counts of the feature files are not all equal: %s,%s,%s,%s' %
                 (n_g, n_c, n_a, n_t))

    ## create matrix
    lcount = 0
    with open(f_out, 'w') as out:
        with open(sorted_gff) as f:
            for l in f:
                lcount += 1

                l = l.strip().split('\t')
                c      = l[0]
                region_up   = l[3] #500bp   upstream of start; not used
                region_down = l[4] #500bp downstream of start; not used
                count  = l[5]
                strand = l[6]

                info = l[8].split(';')
                #dist_score = '?'

                peak_start = get_value_from_keycolonvalue_list('start', info)
                peak_stop  = get_value_from_keycolonvalue_list('stop', info)

                CpG_value    = linecache.getline(sorted_cpg,lcount).strip().split('\t')[3]
                try:
                    conservation = linecache.getline(sorted_avcons,lcount).strip().split('\t')[2]
                except IndexError:
                    conservation = '0'

                affinity     = linecache.getline(sorted_tata,lcount).strip().split('\t')[7]

                features = ';'.join(['cpg:'+CpG_value, 'cons:'+conservation, 'tata:'+affinity])
                new_info = ';'.join(['region_start:'+region_up, 'region_stop:'+region_down])
                line = '\t'.join([c, l[1], l[2],
                                  peak_start, peak_stop, count, strand,
                                  features, new_info])
                out.write(line + '\n')
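
line_count and all_same are assumed helpers that never appear in these examples; minimal sketches consistent with how they are called:

def line_count(fname):
    ## number of lines in a file
    with open(fname) as f:
        return sum(1 for _ in f)

def all_same(items):
    ## True when every element is equal
    return len(set(items)) <= 1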
Example #10
import os

def gff_unify_features(gff_a,
                       gff_b,
                       fname,
                       dfvalue,
                       f_out,
                       retainSourceFeature=False):
    ## unify
    f_out_tmp = f_out + '.tmp'
    bedtools_intersect(gff_a, gff_b, f_out_tmp)

    ## parse
    with open(f_out, 'w') as out:
        with open(f_out_tmp) as f:
            for l in f:
                l = l.strip().split('\t')

                chrom = l[0]
                start = l[3]
                stop = l[4]
                count = l[5]
                strand = l[6]
                features = l[7]
                info_a = l[8]
                _chrom = l[9]

                if chrom == _chrom:
                    ## yes overlap of features w/ mirna_proximity
                    x_b = l[14]
                    info_b = l[17]
                    mirbase_id = get_value_from_keycolonvalue_list(
                        'mirbase_id', info_b.split(';'))
                else:
                    x_b = dfvalue
                    info_b = ''
                    mirbase_id = '.'

                features = '%s;%s:%s' % (features, fname, x_b)
                new_info = info_a + '@' + info_b

                if retainSourceFeature:
                    newline = '\t'.join([
                        chrom, l[1], l[2], start, stop, count, strand,
                        features, new_info
                    ])
                else:
                    newline = '\t'.join([
                        chrom, 'putative_tss', mirbase_id, start, stop, count,
                        strand, features, new_info
                    ])
                out.write(newline + '\n')

    os.remove(f_out_tmp)
    return
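
bedtools_intersect is another unshown wrapper. A sketch in the same os.system style the examples already use; the flags are an assumption (-wao keeps every A record and pads non-overlapping ones with '.', which matches the chrom == _chrom overlap test above):

def bedtools_intersect(gff_a, gff_b, f_out):
    ## hypothetical wrapper; flags are assumed, not taken from promi2
    os.system('bedtools intersect -a %s -b %s -wao > %s' % (gff_a, gff_b, f_out))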
Example #11
import os

def estimate_betas(f_trainingset, trainingfeatures):
    if 'mirna_prox' in trainingfeatures:
        mprox = trainingfeatures.index('mirna_prox')
        trainingfeatures.pop(mprox)
        add_mprox = True
    else:
        add_mprox = False

    ## preprocess to right format for Rscript
    f_intermediate = f_trainingset + '.intermediate.tmp'
    with open(f_intermediate, 'w') as out:
        with open(f_trainingset) as f:
            for l in f:
                l = l.strip().split('\t')
                features = l[7].split(';')

                fvalues = []
                for i in trainingfeatures:
                    try:
                        fvalues.append(
                            float(
                                get_value_from_keycolonvalue_list(i,
                                                                  features)))
                    except ValueError:
                        fvalues.append(0)

                fvalues = [str(i) for i in fvalues]

                if 'back' in l[2].lower():
                    label = '0'
                else:
                    label = '1'

                out.write('\t'.join([label] + fvalues) + '\n')

    ## estimating the beta parameters
    ## note: mirna_proximity is not considered in the fitting of betas
    f_beta_tmp = f_trainingset + '.parameters_beta.tmp'
    os.system('R --slave --vanilla --args '+f_intermediate+' '+f_beta_tmp+\
              ' < external/choose_beta_params.R')
    betas = []
    with open(f_beta_tmp) as f:
        for l in f:
            betas.append(l.strip())

    ## estimate beta4 (for mirna_proximity)
    betas = [float(b) for b in betas]
    if add_mprox:
        beta_mprox = min(betas[1:])
        betas.insert(mprox + 1, beta_mprox)

    os.remove(f_intermediate)
    os.remove(f_beta_tmp)
    return betas
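
A hypothetical call, using the feature keys that appear in the 8th column throughout this collection; note that estimate_betas pops 'mirna_prox' from the list it is given, and that external/choose_beta_params.R must exist relative to the working directory:

betas = estimate_betas('TrainingSet.gff',
                       ['cpg', 'cons', 'tata', 'mirna_prox'])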
Example #12
def _check_labelling(infile, labelfile):
    ## simple check
    isLabelled = False
    with open(infile) as f:
        for l in f:
            info = l.strip().split('\t')[8].split(';')
            label = get_value_from_keycolonvalue_list('mirna_label', info)
            if label != '':
                isLabelled = True
            break

    if isLabelled:
        return infile
    else:
        print '## No labelling is found, proceed with labelling...'
        outfile = '%s.label' % infile

        lb.main(infile, labelfile, outfile)
        return outfile
Example #13
def _readcount_finder(somefile, somelist, get_id=False):
    with open(somefile) as f:
        for l in f:
            l = l.split('\t')
            chrom = l[0]
            start = l[3]
            stop = l[4]
            strand = l[6]

            position = '%s,%s,%s,%s' % (chrom, start, stop, strand)

            if position in somelist:
                x = l[5]

                if get_id:
                    info = l[8].strip().split(';')
                    sid = get_value_from_keycolonvalue_list('id', info)
                    somelist[position].append(sid + ':' + x)
                else:
                    somelist[position].append(x)
    return somelist
Example #14
def read_data(f_trainingset, trainingfeatures):
    data = []
    with open(f_trainingset) as f:
        for l in f:
            l = l.split('\t')
            count = float(l[5])

            if count != 0:
                chrom = l[0]
                start = int(l[3])
                stop = int(l[4])
                strand = l[6]

                features = l[7].split(';')

                fvalues = []
                for i in trainingfeatures:
                    try:
                        fvalues.append(
                            float(
                                get_value_from_keycolonvalue_list(i,
                                                                  features)))
                    except ValueError:
                        fvalues.append(0)

                label = l[2].lower()
                ##probability of promoter (z1) & background (z2)
                if 'back' in label:
                    z1 = 0.0
                    z2 = 1.0
                else:
                    z1 = 0.5
                    z2 = 0.5

                item = (chrom, start, stop, strand, count, trainingfeatures,
                        fvalues, z1, z2)
                data.append(item)
    return data
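
A hypothetical call with the same feature keys; each returned item carries the EM starting probabilities z1/z2 set above:

data = read_data('TrainingSet.gff', ['cpg', 'cons', 'tata', 'mirna_prox'])
## item: (chrom, start, stop, strand, count, features, fvalues, z1, z2)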
Example #15
import sys

def promi2(f_param, listoffeatures, infile, outfile):
    mu1, mu2, lambda1, lambda2, betas = _read_params(f_param)

    if len(betas) != len(listoffeatures) + 1:
        sys.exit("ERROR: number of betas does not match number of features")

    with open(outfile, 'w') as out:
        with open(infile) as f:
            for line in f:
                line = line.strip()

                l = line.split('\t')
                x = float(l[5])

                _features = l[7].split(';')

                fvalues = []
                for lof in listoffeatures:
                    try:
                        fvalues.append(
                            float(
                                get_value_from_keycolonvalue_list(
                                    lof, _features)))
                    except ValueError:
                        fvalues.append(0)

                p_prom, p_back, prior_prom, prior_back = promirna.promirna(
                    x, mu1, mu2, lambda1, lambda2, betas, fvalues)
                prediction = _make_prediction(prior_prom, p_prom, p_back)

                #line = '\t'.join([line,
                #                  ';'.join(['prior_prom:'+str(prior_prom), 'prior_back:'+str(prior_back),
                #                            'prob_prom:'+str(p_prom), 'prob_back:'+str(p_back)]),
                #                  prediction]) + '\n'
                line = line + '\t%s\t%s\t%s\t%s\t%s\n' % (
                    prior_prom, prior_back, p_prom, p_back, prediction)
                out.write(line)
    return
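
_read_params and _make_prediction are project-local and not shown. A guess at the prediction rule, assuming the label simply follows the larger posterior; the real promi2 logic may add priors or thresholds:

def _make_prediction(prior_prom, p_prom, p_back):
    ## hypothetical rule; `prior_prom` is presumably used for
    ## thresholding in the real code
    if p_prom >= p_back:
        return 'prom'
    return 'back'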
Example #16
import pandas as pd

def _read_dat(gff_infile):
    dat = {}
    n = 0
    with open(gff_infile) as f:
        for l in f:
            n += 1

            l = l.strip().split('\t')
            chrom = l[0]
            tstart = l[3]
            tstop = l[4]
            strand = l[6]
            tss = ','.join([chrom, tstart, tstop, strand])

            info = l[8].split(';')
            mirbase_id = get_value_from_keycolonvalue_list('mirbase_id', info)
            mstart = get_value_from_keycolonvalue_list('mirna_start', info)
            mstop = get_value_from_keycolonvalue_list('mirna_stop', info)
            label = get_value_from_keycolonvalue_list('mirna_label', info)

            if label == '': label = 'NA'
            mirna = ','.join([chrom, mstart, mstop, strand])

            features = l[7].split(';')
            corr = get_value_from_keycolonvalue_list('corr', features)
            distance = 0
            ## the helper returns strings, so compare against '0'
            if get_value_from_keycolonvalue_list('mirna_prox', features) != '0':
                distance = get_value_from_keycolonvalue_list('distance', info)
                if distance == '': distance = 0

            dat[n] = [
                tss, mirna, mirbase_id, label, distance,
                abs(float(distance)), corr
            ]

    dat = pd.DataFrame.from_dict(dat, orient='index')
    dat.columns = [
        'tss', 'mirna', 'mirbase_id', 'label', 'Distance', 'distance',
        'correlation'
    ]
    return dat
Example #17
import re

def _index_feat(gff_ufeat, has_mirna):
    pairid_index = {}
    with open(gff_ufeat) as f:
        c = 0
        for l in f:
            c += 1

            chrom, _, _, start, stop, _, strand, _, info = l.strip().rsplit('\t')
            info    = re.split('[;@]', info)

            pid = '.'.join([chrom, start, stop, strand])
            if has_mirna:
                mirna = get_value_from_keycolonvalue_list('mirna_query', info)
                val = '%s:%s' % (mirna, c)
            else:
                val = c

            try:
                pairid_index[pid].append(val)
            except KeyError:
                pairid_index[pid] = [val]

    return pairid_index
Example #18
import os
import re

def feature_closest_corr(
        f_querygff,
        f_mirbasegff,  ##sRNAseq to gff format
        m_mirna,
        m_tss,
        m_back,  ## matrices with expression value
        f_tcfilesinput,  ## will determine columnID for m_back
        method,  ## correlation method
        outfile,
        verbose=False):
    ## files
    d = outfile + '_intermediates'
    ensure_dir(d, False)

    fo_mirnagff = os.path.join(d, 'srnaseq_pos.gff')
    fo_closest = os.path.join(d, 'closest_tss-mirna.txt')
    f_pos_pairing = os.path.join(d, 'pairing_position.txt')
    f_sample_pairing = os.path.join(d, 'pairing_sample.txt')
    fo_corr = os.path.join(d, 'closest_corr.gff')

    ## 1. sRNAseq to gff format
    _find_miRNA_pos(m_mirna, f_mirbasegff, fo_mirnagff)

    ## 2a. find closest pair
    cmd = 'bedtools closest -a ' + f_querygff + ' -b ' + fo_mirnagff + ' -s -iu -D a -t first > ' + fo_closest
    if verbose: print "STATUS: finding closest pair..."
    if verbose: print cmd
    os.system(cmd)

    ## 2b. get pairing info: position
    ## -> seq_id, seq_line, mirna_info, mirna_line, label
    if verbose: print 'STATUS: identifying pairing info: position...'
    with open(f_pos_pairing + '.posSet', 'w') as out_pos:
        with open(f_pos_pairing + '.negSet', 'w') as out_neg:

            cageseq_dict = {}
            with open(m_tss) as f:
                linenum = 0
                for l in f:
                    linenum += 1
                    if l.startswith('#') \
                           or l.startswith('00Annotation') \
                           or l.startswith('01STAT'):
                        continue

                    pos, _ = l.split('\t', 1)

                    chrom, start, _, stop, strand = re.split('[:.,]', pos)
                    start, stop = _determine_region(start, stop)

                    pos = '%s:%s..%s,%s' % (chrom, start, stop, strand)

                    cageseq_dict[pos] = linenum

            background_dict = {}
            with open(m_back) as f:
                linenum = 0
                for l in f:
                    linenum += 1
                    pos = l.split('\t')[3]
                    background_dict[pos] = linenum

            with open(fo_closest) as f:
                for l in f:
                    l = l.strip()
                    _, d = l.rsplit('\t', 1)
                    d = int(d)

                    if (d >= 0) and (d <= 50000):
                        l = l.split('\t')
                        label = l[2]

                        pos = 'chr%s:%s..%s,%s' % (l[0], l[3], l[4], l[6])
                        seq_id = 'title=%s' % pos

                        mirna_line = l[16]
                        mirna_info = ','.join([
                            'title=' + l[17].split(':')[1],
                            'mirbase_id=' + l[11], 'mirna_start=' + l[12],
                            'mirna_stop=' + l[13]
                        ])

                        if label == 'BACK':
                            info = l[8].split(';')
                            pos = 'chr%s:%s..%s,%s' % (
                                l[0],
                                get_value_from_keycolonvalue_list(
                                    'region_start', info),
                                get_value_from_keycolonvalue_list(
                                    'region_stop', info), l[6])
                            try:
                                seq_line = str(background_dict[pos])
                                newline = '\t'.join(
                                    [seq_id, seq_line, mirna_info, mirna_line])
                                out_neg.write(newline + '\n')
                            except KeyError:
                                continue

                        else:
                            try:
                                seq_line = str(cageseq_dict[pos])
                                newline = '\t'.join(
                                    [seq_id, seq_line, mirna_info, mirna_line])
                                out_pos.write(newline + '\n')
                            except KeyError:
                                continue

    ## 3. get pairing info: sample
    ## -> sampleID, cage_column_index, srnaseq_matrix_column_index, (cid,mid)
    if verbose: print 'STATUS: identifying pairing info: samples...'
    cage_id_pattern = re.compile(r'^tpm.*(CNhs.*\..*)$')
    back_id_pattern = re.compile(r'^.*(CNhs.*?\..*?)\..*$')
    srnaseq_id_pattern = re.compile(r'^.*(SRh.*?\..*?)\.')

    cage_index = {}
    with open(m_tss) as f:
        for l in f:
            if l.startswith('00Annotation'):
                l = l.strip().split('\t')
                c = 0
                for header in l:
                    if header.startswith('tpm'):
                        cage_sample_id = cage_id_pattern.match(header).group(1)
                        cage_id = cage_sample_id.split('.')[1]
                        try:
                            cage_index[cage_id].append('%s:%s' %
                                                       (cage_sample_id, c))
                        except KeyError:
                            cage_index[cage_id] = [
                                '%s:%s' % (cage_sample_id, c)
                            ]
                    c += 1
                break

    back_index = {}
    with open(f_tcfilesinput) as f:
        line = 6
        for l in f:
            cage_sample_id = back_id_pattern.match(l).group(1)
            cage_id = cage_sample_id.split('.')[1]
            try:
                back_index[cage_id].append('%s:%s' % (cage_sample_id, line))
            except KeyError:
                back_index[cage_id] = ['%s:%s' % (cage_sample_id, line)]
            line += 1

    srnaseq_index = {}
    with open(m_mirna) as f:
        for l in f:
            if l.startswith('ID'):
                l = l.strip().split('\t')
                c = 0
                for header in l:
                    if header.endswith('.bam'):
                        srnaseq_sample_id = srnaseq_id_pattern.match(
                            header).group(1)
                        srnaseq_id = srnaseq_sample_id.split('.')[1]
                        try:
                            srnaseq_index[srnaseq_id].append(
                                '%s:%s' % (srnaseq_sample_id, c))
                        except KeyError:
                            srnaseq_index[srnaseq_id] = [
                                '%s:%s' % (srnaseq_sample_id, c)
                            ]
                    c += 1
                break

    ## combine
    with open(f_sample_pairing + '.posSet', 'w') as out:
        sample_ids = set(cage_index.keys()).intersection(srnaseq_index.keys())
        for k in sample_ids:
            for c in cage_index[k]:
                for m in srnaseq_index[k]:
                    cid, cindex = c.split(':')
                    mid, mindex = m.split(':')
                    out.write(
                        '\t'.join([k, cindex, mindex,
                                   '%s,%s' % (cid, mid)]) + '\n')

    with open(f_sample_pairing + '.negSet', 'w') as out:
        sample_ids = set(back_index.keys()).intersection(srnaseq_index.keys())
        for k in sample_ids:
            for c in back_index[k]:
                for m in srnaseq_index[k]:
                    cid, cindex = c.split(':')
                    mid, mindex = m.split(':')
                    out.write(
                        '\t'.join([k, cindex, mindex,
                                   '%s,%s' % (cid, mid)]) + '\n')

    ## 4. compute correlation
    if verbose: print 'STATUS: computing correlation (%s)...' % method
    _compute_correlation(f_pos_pairing + '.posSet',
                         f_sample_pairing + '.posSet', m_tss, m_mirna,
                         fo_corr + '.posSet', method, 'putative_tss')
    _compute_correlation(f_pos_pairing + '.negSet',
                         f_sample_pairing + '.negSet', m_back, m_mirna,
                         fo_corr + '.negSet', method, 'background')

    with open(fo_corr, 'w') as out:
        with open(fo_corr + '.negSet') as f:
            for l in f:
                out.write(l)
        with open(fo_corr + '.posSet') as f:
            for l in f:
                out.write(l)

    os.remove(fo_corr + '.negSet')
    os.remove(fo_corr + '.posSet')

    ## 5. unify
    if verbose: print 'STATUS: creating "%s"' % outfile
    gff_unify_features.main(f_querygff, fo_corr, 'corr', '0', outfile, True)

    return fo_corr
Example #19
import glob
import linecache
import os
import random
import shutil
import sys
from ConfigParser import SafeConfigParser

def main(files, outdir, N, percent_lib, is_get_id, f_config, verbose=False):
    if os.path.isdir(outdir):
        sys.exit('## ERROR: "%s" already exists' % outdir)

    cparser = SafeConfigParser()
    cparser.read(f_config)
    verbose = True  ## forces verbose output, overriding the argument

    f_mirbasegff = cparser.get('mirbase', 'gff2')
    f_chromsizes = cparser.get('genome', 'chromsizes')
    f_repeats = cparser.get('genome', 'repeats')
    f_ensembl = cparser.get('genome', 'ensemblgtf')
    f_fasta = cparser.get('genome', 'fasta')
    d_phastcons = cparser.get('cons', 'phastcons')
    TRAP = cparser.get('tata', 'trap')
    f_psemmatrix = cparser.get('tata', 'psem')
    f_traincfg = cparser.get('configs', 'tcconfig')
    m_mirna = cparser.get('correlation', 'srnaseqmatrix')
    m_tss = cparser.get('correlation', 'cageseqmatrix')
    corrmethod = cparser.get('correlation', 'corrmethod')

    f_trainingset = os.path.join(outdir, 'TrainingSet.gff')
    outdir1 = f_trainingset + '_intermediates'

    ensure_dir(outdir, False)
    ensure_dir(outdir1, False)

    _files = glob.glob(files)

    ## creating auxiliary file for negative set
    f_fiveprimegff = '../data/hsa.five_prime.gff'
    if not os.path.exists(f_fiveprimegff):
        if verbose:
            print 'STATUS: creating "%s" auxiliary file...' % f_fiveprimegff
        extract_tss_from_ensembl(f_ensembl, f_fiveprimegff)

    ## create training set
    gff_ts_pos = os.path.join(outdir1, 'trainingset_pos.gff')
    gff_ts_neg = os.path.join(outdir1, 'trainingset_neg.gff')
    if verbose: print 'STATUS: creating positive candidate set...'
    create_positiveset(percent_lib, _files, f_mirbasegff, N, gff_ts_pos,
                       is_get_id)
    if verbose: print 'STATUS: creating negative candidate set...'
    create_negativeset(f_chromsizes, f_repeats, f_fiveprimegff, f_traincfg, N,
                       gff_ts_neg)

    shutil.move(os.path.join(outdir1, 'tc-norm_negSet'),
                os.path.join(outdir, 'tc-norm_negSet'))

    ## feature extraction: cpg, cons, tata (features.py)
    if verbose: print 'STATUS: extracting features cpg/cons/tata...'
    gff_1kbfeatures_pos = os.path.join(outdir1, 'features1kb_ts_pos.gff')
    gff_1kbfeatures_neg = os.path.join(outdir1, 'features1kb_ts_neg.gff')

    features.main(gff_ts_pos, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_pos)

    features.main(gff_ts_neg, outdir1, f_fasta, f_chromsizes, d_phastcons,
                  TRAP, f_psemmatrix, gff_1kbfeatures_neg)

    ## feature extraction: mirna_proximity
    if verbose: print 'STATUS: extracting features mirna_proximity...'
    gff_mirnaprox_pos = os.path.join(outdir1, 'featureMprox_ts_pos.gff')
    gff_mirnaprox_neg = os.path.join(outdir1, 'featureMprox_ts_neg.gff')
    mirna_proximity.main(gff_ts_pos, f_mirbasegff, gff_mirnaprox_pos)
    mirna_proximity.main(gff_ts_neg, f_mirbasegff, gff_mirnaprox_neg)

    gff_features_pos = os.path.join(outdir1, 'Features_ts_pos.gff')
    gff_features_neg = os.path.join(outdir1, 'Features_ts_neg.gff')
    gff_unify_features.main(gff_1kbfeatures_pos, gff_mirnaprox_pos,
                            'mirna_prox', '0', gff_features_pos, True)
    gff_unify_features.main(gff_1kbfeatures_neg, gff_mirnaprox_neg,
                            'mirna_prox', '0', gff_features_neg, True)

    ## create final training set ...
    ## where background must pass criteria: cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mirna_prox == 0
    if verbose: print 'STATUS: creating final training set...'
    good_background = gff_features_neg + '_cpglt0.5-conslt0.2-tatalt0.1-mproxeq0.gff'
    with open(good_background, 'w') as out:
        with open(gff_features_neg) as f:
            for line in f:
                info = line.strip().split('\t')[7].split(';')
                cpg = float(get_value_from_keycolonvalue_list('cpg', info))
                cons = float(get_value_from_keycolonvalue_list('cons', info))
                tata = float(get_value_from_keycolonvalue_list('tata', info))
                mprx = float(
                    get_value_from_keycolonvalue_list('mirna_prox', info))

                if cpg <= 0.5 and cons <= 0.2 and tata <= 0.1 and mprx == 0:
                    out.write(line)

    wc = line_count(good_background)
    selectedlines = random.sample(range(1, wc + 1), N)

    with open(f_trainingset, 'w') as out:
        ## writing negative set
        for l in selectedlines:
            out.write(linecache.getline(good_background, l))

        ## writing positive set
        with open(gff_features_pos) as f:
            ## when the mirna_prox feature extraction was used,
            ## all pairs within 50kb upstream of a mirna were extracted
            ## -> a single tss could have many mirna
            ## take the pair with min distance
            ## -> essentially the first entry
            pos_list = []
            for line in f:
                l = line.split('\t')
                pos = ','.join([l[0], l[3], l[4], l[6]])
                if not (pos in pos_list):
                    pos_list.append(pos)
                    out.write(line)

    if not (os.path.isfile(m_mirna) and os.path.isfile(m_tss)):
        return f_trainingset

    ## create final training set with feature:correlation of closest tss->miRNA ...
    if verbose:
        print 'STATUS: creating final training set with correlation of closest tss->miRNA...'
    f_trainingset2 = os.path.join(outdir, 'TrainingSet-corr.gff')
    m_back = glob.glob('%s/tc-norm_negSet/*tpm_rle.matrix' % outdir)[0]
    f_tcfilesinput = os.path.join(outdir, 'tc-norm_negSet', 'files.txt')

    feature_closest_corr(f_trainingset, f_mirbasegff, m_mirna, m_tss, m_back,
                         f_tcfilesinput, corrmethod, f_trainingset2)

    return f_trainingset2
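
main() expects an INI-style file read by SafeConfigParser; a skeleton assembled from the cparser.get calls above, with placeholder paths and an assumed correlation method:

[mirbase]
gff2 = /path/to/mirbase.gff2

[genome]
chromsizes = /path/to/genome.chrom.sizes
repeats = /path/to/repeats.txt
ensemblgtf = /path/to/ensembl.gtf
fasta = /path/to/genome.fa

[cons]
phastcons = /path/to/phastcons/

[tata]
trap = /path/to/TRAP
psem = /path/to/tata.psem

[configs]
tcconfig = /path/to/tc-norm.config

[correlation]
srnaseqmatrix = /path/to/srnaseq.matrix
cageseqmatrix = /path/to/cageseq.matrix
corrmethod = pearson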