Ejemplo n.º 1
0
def fetch_read2(bam, chrom):
    """Collect barcode-bearing mate records for one chromosome.

    Iterates ``bam.fetch(chrom)`` and keeps alignments that are not flagged
    as read1, are not secondary, belong to a proper pair, carry ``NH == 1``,
    map to the same reference as their mate, and face the opposite
    orientation from their mate.

    Args:
        bam: Object exposing ``fetch(chrom)`` that yields alignment records
            (presumably a ``pysam.AlignmentFile`` -- confirm with caller).
        chrom: Reference name passed to ``bam.fetch``.

    Returns:
        dict mapping ``"name\\tmate_chrom\\tmate_pos\\tmate_strand"`` to
        ``[chrom, start, end, barcode]``; coordinates are stored as strings.
        A later record with the same key overwrites an earlier one.
    """
    collected = {}
    for aln in bam.fetch(chrom):
        # Skip read1 records and secondary alignments.
        if aln.is_read1 or aln.is_secondary:
            continue
        # Skip anything that is not part of a proper pair.
        if not aln.is_proper_pair:
            continue
        # Skip reads whose NH tag is not exactly 1 (not uniquely mapped).
        if aln.get_tag('NH') != 1:
            continue
        ref_name = aln.reference_name
        mate_ref_name = aln.next_reference_name
        # Both mates must map to the same reference.
        if ref_name != mate_ref_name:
            continue
        on_reverse = aln.is_reverse
        mate_on_reverse = aln.mate_is_reverse
        # Mates must point in opposite directions.
        if bool(on_reverse) == bool(mate_on_reverse):
            continue
        if on_reverse:
            strand, mate_strand = '-', '+'
        else:
            strand, mate_strand = '+', '-'
        mate_pos = str(aln.next_reference_start)
        name = aln.query_name
        start = str(aln.reference_start)
        end = str(aln.reference_end)
        sequence = aln.query_sequence
        # The barcode is 15 bases taken from the start of the stored
        # sequence for '+' reads and from its end for '-' reads.
        if strand == '+':
            barcode = dna_to_rna(sequence[:15])
        else:
            barcode = dna_to_rna(sequence[-15:], strand=strand)
        key = '\t'.join([name, mate_ref_name, mate_pos, mate_strand])
        collected[key] = [ref_name, start, end, barcode]
    return collected
Ejemplo n.º 2
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find 'AGGT' sites in one intron that pass all configured filters.

    Each 'AGGT' match in the intron sequence is kept only if it passes the
    distance check (``cal_distance``), its surrounding 23-base window
    contains no 'N', the window passes the score check (``cal_score``), and
    the bigWig statistic around the site is at least ``--min-phastcons``.

    Args:
        options: Mapping of command-line options. Reads '--genome',
            '--bigwig', '--min-distance', '--min-score' and
            '--min-phastcons'.
        chrom: Reference name used for FASTA and bigWig lookups.
        start: Intron start passed to ``fa.fetch`` (assumed 0-based,
            half-open -- TODO confirm against caller).
        end: Intron end passed to ``fa.fetch``.
        strand: '+' selects forward-strand coordinates; any other value is
            handled as the minus strand.
        intron_info: Opaque value returned unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passes, where each
        entry is ``'pos|left_dist|right_dist|score|phastcons'``; otherwise
        ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # load matrix
    matrix3 = load_matrix3()
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # The bigWig handle is opened on every call, so it must be released on
    # every exit path; otherwise one file handle leaks per intron.
    try:
        # parse options
        min_distance = int(options['--min-distance'])
        min_score = float(options['--min-score'])
        min_phastcons = float(options['--min-phastcons'])
        # start to parse rs sites
        rs_list = []
        for m in re.finditer('AGGT', intron_fa):
            # Genomic coordinate two bases into the match. The minus-strand
            # arithmetic counts back from ``end`` (presumably because
            # dna_to_rna reverse-complements for '-' -- confirm).
            if strand == '+':
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            # The distance filter runs before any further FASTA fetch so
            # sites rejected here never trigger a window lookup.
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            if strand == '+':
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 20, pos + 3))
            else:
                ss3_seq = dna_to_rna(fa.fetch(chrom, pos - 3, pos + 20),
                                     strand='-')
            if 'N' in ss3_seq:  # ensure there is no N
                continue
            ss3, score_flag = cal_score(ss3_seq, matrix3, min_score)
            if not score_flag:  # not high score
                continue
            # stats() yields None for the window when the bigWig has no
            # data there; such sites are treated as not conserved.
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                continue
            rs_feature = '%d|%d|%d|%f|%f' % (pos, left_dist, right_dist, ss3,
                                             phastcons)
            rs_list.append(rs_feature)
    finally:
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
Ejemplo n.º 3
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Collect bigWig scores for every 4-mer occurrence in one intron.

    For each of the 256 motifs over 'ATCG', every match in the intron
    sequence that passes the distance check (``cal_distance``) and has a
    bigWig value contributes that value to the motif's list.

    Args:
        options: Mapping of command-line options. Reads '--genome',
            '--bigwig' and '--min-distance'.
        chrom: Reference name used for FASTA and bigWig lookups.
        start: Intron start passed to ``fa.fetch`` (assumed 0-based,
            half-open -- TODO confirm against caller).
        end: Intron end passed to ``fa.fetch``.
        strand: '+' selects forward-strand coordinates; any other value is
            handled as the minus strand.
        intron_info: Unused here; kept for signature compatibility.

    Returns:
        ``(rs_list, intron_length)``. ``rs_list`` is a ``defaultdict(list)``
        keyed by motif whose lists start with the placeholder ``'x'``
        followed by the collected scores. ``intron_length`` is
        ``end - start - 2 * min_distance``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # The bigWig handle is opened on every call, so it must be released on
    # every exit path; otherwise one file handle leaks per intron.
    try:
        # parse options
        min_distance = int(options['--min-distance'])
        # start to parse rs sites
        rs_list = defaultdict(list)
        for bases in product('ATCG', repeat=4):
            motif = ''.join(bases)
            scores = rs_list[motif]
            # Placeholder guarantees every motif has an entry even when it
            # never yields a score.
            scores.append('x')
            # NOTE(review): finditer reports non-overlapping matches only,
            # so self-overlapping motifs such as 'AAAA' are undercounted in
            # runs -- confirm whether that is intended.
            for m in re.finditer(motif, intron_fa):
                if strand == '+':
                    pos = start + m.start() + 2
                else:
                    pos = end - m.start() - 2
                _, _, dist_flag = cal_distance(pos, start, end, min_distance)
                if not dist_flag:  # not enough distance
                    continue
                phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
                if phastcons is None:  # no conservation score
                    continue
                scores.append(phastcons)
    finally:
        phastcons_f.close()
    intron_length = end - start - 2 * min_distance
    return (rs_list, intron_length)
Ejemplo n.º 4
0
def parse_intron(options, chrom, start, end, strand, intron_info):
    """Find occurrences of a user-supplied motif in one intron.

    Each match of ``options['-m']`` (used as a regular expression) in the
    intron sequence is kept only if it passes the distance check
    (``cal_distance``) and the bigWig statistic around the site is at least
    ``--min-phastcons``.

    Args:
        options: Mapping of command-line options. Reads '--genome', '-m',
            '--bigwig', '--min-distance' and '--min-phastcons'.
        chrom: Reference name used for FASTA and bigWig lookups.
        start: Intron start passed to ``fa.fetch`` (assumed 0-based,
            half-open -- TODO confirm against caller).
        end: Intron end passed to ``fa.fetch``.
        strand: '+' selects forward-strand coordinates; any other value is
            handled as the minus strand.
        intron_info: Opaque value returned unchanged when sites are found.

    Returns:
        ``(intron_info, rs_list)`` when at least one site passes, where each
        entry is ``'pos|left_dist|right_dist|phastcons'``; otherwise
        ``(None, None)``.
    """
    # fetch fasta
    fa = check_fasta(options['--genome'])
    intron_fa = dna_to_rna(fa.fetch(chrom, start, end), strand)
    # parse options
    motif = options['-m']
    phastcons_f = pyBigWig.open(options['--bigwig'])
    # The bigWig handle is opened on every call, so it must be released on
    # every exit path; otherwise one file handle leaks per intron.
    try:
        min_distance = int(options['--min-distance'])
        min_phastcons = float(options['--min-phastcons'])
        # start to parse rs sites
        rs_list = []
        for m in re.finditer(motif, intron_fa):
            # Genomic coordinate two bases into the match; the minus-strand
            # arithmetic counts back from ``end``.
            if strand == '+':
                pos = start + m.start() + 2
            else:
                pos = end - m.start() - 2
            left_dist, right_dist, dist_flag = cal_distance(
                pos, start, end, min_distance)
            if not dist_flag:  # not enough distance
                continue
            # stats() yields None for the window when the bigWig has no
            # data there; such sites are treated as not conserved.
            phastcons = phastcons_f.stats(chrom, pos - 2, pos + 2)[0]
            if phastcons is None or phastcons < min_phastcons:
                continue
            rs_feature = '%d|%d|%d|%f' % (pos, left_dist, right_dist,
                                          phastcons)
            rs_list.append(rs_feature)
    finally:
        phastcons_f.close()
    if rs_list:
        return (intron_info, rs_list)
    return (None, None)
Ejemplo n.º 5
0
Archivo: csf.py Proyecto: kepbod/stuff
def build_index(fa, chrom, site, strand, rlen, thread, out_dir, seq, seq_flag):
    """Create ``sgRNA.fa`` in ``out_dir`` and build a bowtie2 index for it.

    The reference is either a symlink to a user-supplied FASTA
    (``seq_flag`` true) or a freshly written record holding the window
    around ``site`` fetched from ``fa``.

    Args:
        fa: Object exposing ``fetch(chrom, start, end)``; only used when
            ``seq_flag`` is false.
        chrom: Reference name of the target site.
        site: Coordinate of the target site on ``chrom``.
        strand: '+' puts ``rlen - 10`` bases before the site and
            ``rlen - 20`` after; any other value swaps the two.
        rlen: Read length used to size the window.
        thread: Value substituted into ``--threads``.
        out_dir: Directory that receives ``sgRNA.fa`` and the index files.
        seq: Path to an existing FASTA; only used when ``seq_flag`` is true.
        seq_flag: Whether to link ``seq`` instead of fetching from ``fa``.

    Returns:
        ``(index_path, offset)`` where ``offset`` is the distance from the
        window start to ``site``.

    Exits the process via ``sys.exit`` when ``bowtie2-build`` is not found.
    """
    print('Build index...')
    if strand == '+':
        before, after = rlen - 10, rlen - 20
    else:
        before, after = rlen - 20, rlen - 10
    start = site - before
    end = site + after
    offset = before
    index_path = os.path.join(out_dir, 'sgRNA.fa')
    # Clear any leftover from a previous run. Without this, re-linking
    # raises FileExistsError, and writing through a stale symlink would
    # truncate the FASTA it points to.
    if os.path.lexists(index_path):
        os.remove(index_path)
    if seq_flag:
        # A symlink target is resolved relative to the link's directory,
        # not the working directory, so a relative ``seq`` must be made
        # absolute or the link dangles.
        os.symlink(os.path.abspath(seq), index_path)
    else:
        # fetch sgRNA region sequence
        region = dna_to_rna(fa.fetch(chrom, start, end), strand=strand)
        with open(index_path, 'w') as out:
            out.write('>sgRNA_region\n')
            out.write(region + '\n')
    # build index
    if not which('bowtie2-build'):
        sys.exit('Error: no bowtie2-build installed!')
    # NOTE(review): paths are interpolated unquoted; whether spaces are safe
    # depends on how run_command executes the string -- confirm.
    command = 'bowtie2-build -q --threads %s %s %s'
    command = command % (thread, index_path, index_path)
    run_command(command, 'Error: cannot build index for sgRNA!')
    return index_path, offset