コード例 #1
0
def get_one_hot_sequence(chrname, start, stop, nuc, HPC_MODE):
    """Return a 0/1 indicator array for one nucleotide over a genomic window.

    Opens the hg19 2bit reference (the HPC copy when ``HPC_MODE`` is truthy,
    the local copy otherwise), reads ``chr<chrname>[start:stop]`` and marks
    every position with 1 when its base equals ``nuc`` (case-insensitive)
    and 0 otherwise. The chromosome name 'MT' is looked up as 'M'.
    """
    if HPC_MODE:
        # 2bit version of the human reference genome (hg19) on the HPC
        reference_path = '/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/genomes/hg19.2bit'
    else:
        # 2bit version of the human reference genome (hg19) on the local machine
        reference_path = '/Users/lsantuari/Documents/Data/GiaB/reference/hg19.2bit'
    genome = twobit.TwoBitFile(reference_path)

    if chrname == 'MT':
        chrname = 'M'

    window = genome['chr' + chrname][start:stop]
    indicator = [int(base.lower() == nuc.lower()) for base in window]
    return np.array(indicator)
コード例 #2
0
def get_one_hot_sequence_by_list(chrname, positions, HPC_MODE):
    """One-hot encode the reference base found at each requested position.

    Args:
        chrname: Chromosome name without the 'chr' prefix; 'MT' is looked up
            as 'M'.
        positions: Sized iterable of integer indices into the chromosome
            string.
        HPC_MODE: When truthy, read hg19 from the HPC path, otherwise from
            the local-machine path.

    Returns:
        A ``uint32`` array of shape ``(len(positions), 5)`` whose columns
        correspond to A, T, C, G, N (in that order). A base matching none of
        these (case-insensitively) leaves its row all zeros.
    """
    if HPC_MODE:
        # Path on the HPC of the 2bit version of the human reference genome (hg19)
        reference_path = '/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/genomes/hg19.2bit'
    else:
        # Path on the local machine of the 2bit version of hg19. The original
        # repeated the HPC path here, contradicting this comment and
        # get_one_hot_sequence(), which uses this location for local runs.
        reference_path = '/Users/lsantuari/Documents/Data/GiaB/reference/hg19.2bit'
    genome = twobit.TwoBitFile(reference_path)

    if chrname == 'MT':
        chrname = 'M'

    whole_chrom = str(genome['chr' + chrname])

    nuc_list = ['A', 'T', 'C', 'G', 'N']
    column_of = {nuc.lower(): col for col, nuc in enumerate(nuc_list)}
    res = np.zeros(shape=(len(positions), len(nuc_list)), dtype=np.uint32)
    # Single pass over the positions instead of one full pass per nucleotide.
    for row, pos in enumerate(positions):
        col = column_of.get(whole_chrom[pos].lower())
        if col is not None:
            res[row, col] = 1

    return res
コード例 #3
0
def load_human_genome_sequence(genomic_build):
    """Open the 2bit genome configured for ``genomic_build``.

    The file path is read from ``json_data[genomic_build]["genome"]``.
    Raises ``Exception`` with the configured GENOME_NOT_PRESENT message when
    that path does not exist; otherwise returns a ``TwoBitFile`` for it.
    """
    genome_file = json_data[genomic_build]["genome"]
    if os.path.exists(genome_file):
        return twobitreader.TwoBitFile(genome_file)

    template = messages['error_messages']['GENOME_NOT_PRESENT']
    raise Exception(template.format(genome_file))
コード例 #4
0
def loadChromInfo(twoBit):
    """Return a dict mapping chromosome names to their sizes.

    The 2bit file is opened only for the duration of the call and is closed
    even if reading the sizes fails.
    """
    reader = twobitreader.TwoBitFile(twoBit)
    try:
        sizes = dict(reader.sequence_sizes())
    finally:
        reader.close()
    return sizes
def get_regionLevel_simplex_parameters(inputbed, outputbed, plusbw, minusbw,
                                       biasmat, ext, genome2bit):
    """Append cut-weighted bias parameter sums to every region of a BED file.

    For each line of ``inputbed`` a window of ``ext`` bases on both sides of
    the region centre is taken. Per-base values are read from the plus- and
    minus-strand bigWig files; for every position with a value > 0 the
    surrounding ``2 * flank`` bases are mapped through ``permuteSeq8mer.txt``
    and ``seq2biasParm`` and the resulting vector, multiplied by that value,
    is accumulated. The original fields followed by the plus and minus
    accumulators are written tab-separated to ``outputbed``.

    Args:
        inputbed: Path of the input BED file (chrom, start, end, ...).
        outputbed: Path of the file to write.
        plusbw: Path of the plus-strand bigWig file.
        minusbw: Path of the minus-strand bigWig file.
        biasmat: Bias matrix path handed to ``readBG``.
        ext: Half-width of the window around each region centre.
        genome2bit: Path of the 2bit genome.

    Regions for which either bigWig returns no summary are skipped.
    """
    simplex_code = encoding()
    biasdict, flank = readBG(biasmat)
    B, B0, B1, B2 = paramest(biasdict)

    permuteSeq = {}
    with open("permuteSeq8mer.txt") as inf:
        for line in inf:
            ll = line.split()
            permuteSeq[ll[0]] = ll[1]

    genome = twobitreader.TwoBitFile(genome2bit)

    with open(plusbw, 'rb') as plus_fh, open(minusbw, 'rb') as minus_fh, \
            open(inputbed) as inf, open(outputbed, 'w') as outf:
        plusBWH = BigWigFile(plus_fh)
        minusBWH = BigWigFile(minus_fh)

        for line in inf:
            ll = line.split()
            chrm = ll[0]
            # Floor division keeps the centre an int on Python 2 and 3 alike.
            center = (int(ll[1]) + int(ll[2])) // 2
            start = max(0, center - ext)
            end = center + ext

            # Test the summary objects themselves: the original compared
            # type(...) with None (never true) and only after it had already
            # dereferenced .sum_data.
            plus_summary = plusBWH.summarize(chrm, start, end, end - start)
            minus_summary = minusBWH.summarize(chrm, start, end, end - start)
            if plus_summary is None or minus_summary is None:
                continue
            plusSig = plus_summary.sum_data
            minusSig = minus_summary.sum_data

            # NOTE(review): start - flank can be negative near the chromosome
            # start; behaviour then depends on the 2bit slice implementation.
            plusSequence = genome[chrm][(start - flank):(end + flank)].upper()
            # The minus-strand window is shifted one base to the right.
            minusSequence = genome[chrm][(start - flank + 1):(end + flank +
                                                              1)].upper()
            plus_data = numpy.zeros(len(B))
            minus_data = numpy.zeros(len(B))

            for i, pcuts in enumerate(plusSig):
                if pcuts > 0:
                    pseq = plusSequence[i:(i + 2 * flank)]
                    if "N" not in pseq:
                        p_out = seq2biasParm(permuteSeq[pseq], B, simplex_code)
                        plus_data += pcuts * p_out

            for i, mcuts in enumerate(minusSig):
                if mcuts > 0:
                    tmpseq = minusSequence[i:(i + 2 * flank)]
                    if "N" not in tmpseq:
                        mseq = revcomp(tmpseq).upper()
                        m_out = seq2biasParm(permuteSeq[mseq], B, simplex_code)
                        minus_data += mcuts * m_out

            newll = ll + list(plus_data) + list(minus_data)
            outf.write("\t".join(map(str, newll)) + "\n")
コード例 #6
0
def dupFeature(usename, seq2bit, datatype):
    """Split ``<usename>_uniq.bed`` fragments into four files by end bias.

    Each fragment's left and right values come from ``PEreads_feature`` and
    are compared with a data-type specific cutoff:

        left > cutoff,  right > cutoff   -> <usename>_g1.bed
        left <= cutoff, right > cutoff   -> <usename>_g2.bed
        left <= cutoff, right <= cutoff  -> <usename>_g3.bed
        left > cutoff,  right <= cutoff  -> <usename>_g4.bed

    The written record is the 1-bp interval at the fragment midpoint.
    chrM fragments and fragments for which either value is "NA" are skipped.

    Args:
        usename: Prefix of the input and output BED file names.
        seq2bit: Path of the 2bit genome.
        datatype: "ATAC" or "DNase"; any other value is printed and the
            process exits.
    """
    if datatype == "ATAC":
        biasfile = "/scratch/sh8tv/Project/scATAC/Data/Summary_Data/bias_matrix/summary36bp/singleMat/NakedYeast_ATAC_Enc8mer.txt"
        cutoff = 3.5
    elif datatype == "DNase":
        biasfile = "/scratch/sh8tv/Project/scATAC/Data/Summary_Data/bias_matrix/summary36bp/singleMat/NakedIMR90_DNase_Enc8mer.txt"
        cutoff = -2.2
    else:
        # Parenthesised single argument prints identically on Python 2 and 3.
        print(datatype)
        sys.exit()

    bias = {}
    with open(biasfile) as inf:
        for line in inf:
            if line.startswith("seqtype"):
                continue
            ll = line.split()
            bias[ll[0]] = round(float(ll[2]), 4)
    # Flank length is half the length of the last sequence key read above.
    flankLen = len(ll[0]) // 2
    genome = twobitreader.TwoBitFile(seq2bit)

    # The original never closed the input handle and leaked all five files
    # if an exception was raised while processing.
    with open(usename + "_uniq.bed") as inf, \
            open(usename + "_g1.bed", 'w') as outf1, \
            open(usename + "_g2.bed", 'w') as outf2, \
            open(usename + "_g3.bed", 'w') as outf3, \
            open(usename + "_g4.bed", 'w') as outf4:
        for line in inf:
            ll = line.split()
            if ll[0] == "chrM":
                continue

            left, right = PEreads_feature(ll, flankLen, genome, bias, datatype)

            if left == "NA" or right == "NA":
                continue

            # Integer midpoint on both Python 2 and Python 3.
            midpoint = (int(ll[1]) + int(ll[2])) // 2
            newll = [ll[0], midpoint, midpoint + 1, ".", ".", "+"]
            newline = "\t".join(map(str, newll)) + "\n"

            if left > cutoff:
                if right > cutoff:
                    outf1.write(newline)
                else:
                    outf4.write(newline)
            else:
                if right > cutoff:
                    outf2.write(newline)
                else:
                    outf3.write(newline)
def load_genomes(UTRfilestring, twobitfile):
	"""
	Load the UTR dictionary and open the 2bit genome in one place, so that
	callers only have to load them a single time.

	Returns the tuple (UTRdict, genome).
	"""
	# do we actually need to load the genome in here?
	return (
		rph.readindict(open(UTRfilestring, "rU")),
		twobitreader.TwoBitFile(twobitfile),
	)
コード例 #8
0
 def __init__(self):
     """Set the fixture paths and populate the module-level test globals.

     Resolves the BAM and 2bit test files under ``test/test_corrGC/`` next
     to this module, resets ``debug`` to 0 and fills ``global_vars`` with
     the settings used by the test, including the mapped read count of the
     BAM file and the summed sequence sizes of the 2bit file.
     """
     global debug, global_vars
     import os

     module_dir = os.path.dirname(os.path.abspath(__file__))
     self.root = module_dir + "/test/test_corrGC/"
     self.tbitFile = self.root + "sequence.2bit"
     self.bamFile = self.root + "test.bam"
     self.chrNameBam = '2L'
     self.chrNameBit = 'chr2L'

     bam = pysam.Samfile(self.bamFile)
     tbit = twobit.TwoBitFile(self.tbitFile)

     debug = 0
     mapped_reads = bam.mapped
     genome_size = sum(tbit.sequence_sizes().values())
     # 'min_reads' was listed twice with the same value; one entry suffices.
     global_vars = {
         '2bit': self.tbitFile,
         'bam': self.bamFile,
         'filter_out': None,
         'extra_sampling_file': None,
         'max_reads': 5,
         'min_reads': 0,
         'reads_per_bp': 0.3,
         'total_reads': mapped_reads,
         'genome_size': genome_size,
     }
コード例 #9
0
def count_cut_nmers(fp, w_minus, lflank, rflank, single_nmer_cutoff, sequence,
                    offset):
    """Count minus-strand cuts and occurrences per n-mer over BED regions.

    For every region read from ``fp`` the per-base values of the minus-strand
    bigWig are paired with the n-mer (length ``lflank + rflank``) taken from
    the genome around that base. ``offset`` is the position of the cut to be
    associated with each n-mer; with ``offset = 0`` the first base of the tag
    is lined up with the n-mer start. N-mers containing 'N' and positions
    whose value exceeds ``single_nmer_cutoff`` are ignored. Each n-mer is
    passed through ``rev`` before it is used as a key.

    Args:
        fp: Iterable of BED lines (chrom, start, end, ...).
        w_minus: Path of the minus-strand bigWig file.
        lflank: Bases taken to the left of each position.
        rflank: Bases taken to the right of each position.
        single_nmer_cutoff: Largest per-position value that is still counted.
        sequence: Path of the 2bit genome.
        offset: Shift applied to the sequence window.

    Returns:
        ``(seq_nmer_dict, cut_nmer_dict)``: number of occurrences and summed
        cut values per n-mer.
    """
    seq_nmer_dict = {}
    cut_nmer_dict = {}
    nmer_len = lflank + rflank

    # Context manager so the bigWig handle is released (it was never closed).
    with open(w_minus, 'rb') as bw_handle:
        w_minus_H = BigWigFile(bw_handle)
        genome = twobitreader.TwoBitFile(sequence)

        # Stream the regions instead of materialising them with readlines().
        for line in fp:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            nseq = genome[chrm][(start - lflank - offset):(end + rflank -
                                                           offset)].upper()
            cn = list(
                w_minus_H.summarize(chrm, start, end, end - start).sum_data)

            for k, n_cut in enumerate(cn):
                # Minus-strand window is shifted one base to the right.
                n_seq = nseq[(k + 1):(k + nmer_len + 1)]
                if 'N' not in n_seq and n_cut <= single_nmer_cutoff:
                    rev_n_seq = rev(n_seq)
                    # dict.get replaces the bare try/except increments, which
                    # would also have swallowed unrelated errors.
                    cut_nmer_dict[rev_n_seq] = (
                        cut_nmer_dict.get(rev_n_seq, 0) + n_cut)
                    seq_nmer_dict[rev_n_seq] = (
                        seq_nmer_dict.get(rev_n_seq, 0) + 1)
    return seq_nmer_dict, cut_nmer_dict
コード例 #10
0
def getSeq(gene_tsv_path, twobitpath):
    """Yield a base array and an intron/exon label array for every gene.

    Genes are read from a tab-separated file with columns
    ``GENE_TSV_HEADER``. For each gene the transcribed region is fetched
    from the 2bit genome and upper-cased; for '-' strand genes it is flipped
    and mapped through ``REVERSE_COMPLEMENT_MAP``. Labels default to 'I' and
    exon positions are overwritten with 'F' (first exon), 'M' (middle exons)
    or 'L' (last exon), using the comma-terminated ``exonStart`` and
    ``exonSize`` columns as indices into the sequence array.

    Yields:
        ``(seq, labels)``: two 1-D arrays, each one element longer than the
        transcribed region because a terminal '^' (sequence) and 'L'
        (labels) element is appended.
    """
    genes_df = pd.read_csv(gene_tsv_path, sep='\t', names=GENE_TSV_HEADER)
    genome_reader = twobitreader.TwoBitFile(twobitpath)
    for _, gene in genes_df.iterrows():
        tx_start = int(gene['startTranscription'])
        tx_end = int(gene['endTranscription'])
        seq = genome_reader[gene['chrom']][tx_start:tx_end]
        seq = np.array(list(seq.upper()))

        if gene['strand'] == '-':
            seq = np.flip(seq)
            seq = np.vectorize(REVERSE_COMPLEMENT_MAP.get)(seq)

        # Both columns end with a trailing comma, hence the [:-1].
        exon_starts = np.array(gene['exonStart'].split(',')[:-1],
                               dtype=np.int64)
        exon_sizes = np.array(gene['exonSize'].split(',')[:-1], dtype=np.int64)
        intron_exon_lbl = np.full(len(seq), "I")
        last_exon = len(exon_starts) - 1
        for exon_idx, (start, size) in enumerate(zip(exon_starts, exon_sizes)):
            if exon_idx == 0:
                exon_lbl = 'F'
            elif exon_idx == last_exon:
                exon_lbl = 'L'
            else:
                exon_lbl = 'M'
            intron_exon_lbl[start:start + size] = exon_lbl

        # `array + ['^']` is list syntax: on an ndarray it is an elementwise
        # add (an error or a per-element concatenation, depending on the
        # NumPy version), never an append. np.append adds the terminal
        # element that the expression was evidently meant to add.
        yield np.append(seq, '^'), np.append(intron_exon_lbl, 'L')
コード例 #11
0
def bias_correct_flank(inbdg,outname,biasMat,Gen,strand):
    """Write a per-base, bias-corrected copy of a bedGraph file.

    Lines whose signal is 0 are copied unchanged. Every other interval is
    expanded to single-base records; the signal of each base is multiplied
    by ``BGenc[<n-mer around the base>]`` when that n-mer is a key of
    ``BGraw`` and is left unchanged otherwise.

    Args:
        inbdg: Input bedGraph path (chrom, start, end, signal).
        outname: Output prefix; the result is written to ``outname + ".bdg"``.
        biasMat: Bias matrix path handed to ``readBG``.
        Gen: Path of the 2bit genome.
        strand: "+" uses the window [pos-flank, pos+flank); any other value
            uses the window shifted by one base and passed through ``rev``.
    """
    genome = twobitreader.TwoBitFile(Gen)

    BGraw,BGenc,Nmer = readBG(biasMat)
    # Floor division: identical on Python 2, keeps flank an int on Python 3.
    flank = int(Nmer) // 2

    # Context managers close both files even if a line fails to parse.
    with open(inbdg) as inf, open(outname + ".bdg",'w') as outf_encCorrect:
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            raw_sig = float(ll[3])
            if raw_sig == 0:
                outf_encCorrect.write(line)
                continue

            for pos in range(start,end):
                if strand == "+":
                    this_seq = genome[chrm][(pos-flank):(pos+flank)].upper()
                else:
                    this_seq = rev(genome[chrm][(pos-flank+1):(pos+flank+1)].upper())
                # `in` replaces dict.has_key, which no longer exists on
                # Python 3. Membership is still tested on BGraw, as before.
                if this_seq in BGraw:
                    enc_correct_sig = raw_sig * BGenc[this_seq]
                else:
                    enc_correct_sig = raw_sig
                newllenc = [chrm,pos,pos+1,enc_correct_sig]
                outf_encCorrect.write("\t".join(map(str,newllenc))+"\n")
コード例 #12
0
    def _examinePSL(self, querySeq):
        """Collect the genomic and query sequence of every aligned block.

        Reads the target span [tStart, tEnd) of ``self.Chromosome`` from its
        per-chromosome 2bit file, then for each block records its genomic
        interval in ``self.genomicFlats`` and the block's query and genomic
        sequences in ``self.DelGenomicSeqs`` (keyed by a location string).
        Genomic sequences are reverse-complemented for '-' strand targets.
        Blocks raising ``ValueError`` are skipped. Finishes by calling
        ``self.missingRange()``.
        """
        twobit_path = os.path.join(self.ChromosomesDir,
                                   self.Chromosome + ".2bit")
        reader = tbr.TwoBitFile(twobit_path)
        chrom_seq = reader[self.Chromosome]
        target_start = int(self.tStart)
        genomicSequence = chrom_seq[target_start:int(self.tEnd)]

        self.genomicFlats = []
        self.DelGenomicSeqs = {}
        for block_idx in range(len(self.qBlocks)):
            try:
                query_from = int(self.qCuts[block_idx])
                block_len = int(self.qBlocks[block_idx])
                target_from = int(self.tCuts[block_idx])
                target_to = target_from + block_len

                # Block coordinates are genomic; shift into the fetched span.
                local_from = target_from - target_start
                genomic_piece = genomicSequence[local_from:local_from +
                                                block_len]
                self.genomicFlats.append([target_from, target_to])

                if self.tStrand == "-":
                    genomic_text = str(Seq(genomic_piece).reverse_complement())
                else:
                    genomic_text = genomic_piece
                query_text = querySeq[query_from:(query_from + block_len)]

                location = (self.Chromosome + "(" + self.tStrand + "):" +
                            str(target_from) + "-" + str(target_to))
                self.DelGenomicSeqs[location] = {
                    'Del': query_text,
                    'Genomic': genomic_text,
                }
            except ValueError:
                pass
        # Hand everything to the routine that joins overlapping ranges and
        # finds the deletions in the genomic sequence.
        self.missingRange()
コード例 #13
0
ファイル: Utility.py プロジェクト: TongjiZhanglab/CAM
def ATprofile(inputbed, genome2bit):
    """Return the fraction of reads with an A/T dinucleotide at each offset.

    For every read in ``inputbed`` a 141-base window is fetched from the
    genome: it starts 4 bases after the read start for '+' reads, and for
    all other reads it ends 4 bases before the read end and is reversed.
    Reads whose window is not exactly 141 bases long are ignored. For each
    of the 140 dinucleotide offsets the number of reads showing AA, TT, AT
    or TA is counted and divided by the number of reads used.

    Args:
        inputbed: Tab-separated BED path with the strand in column 6.
        genome2bit: Path of the 2bit genome.

    Returns:
        A list of 140 floats. If no read qualifies, a list of 140 zeros is
        returned (previously this raised ZeroDivisionError).
    """
    at_dinucleotides = ("AA", "TT", "AT", "TA")
    ATfrac = [0] * 140
    reads_count = 0

    # Context manager closes the BED file even if a line is malformed.
    with open(inputbed) as inf:
        genome = twobitreader.TwoBitFile(genome2bit)
        for line in inf:
            ll = line.strip().split('\t')
            if ll[5] == "+":
                seq = genome[ll[0]][(int(ll[1]) + 4):(int(ll[1]) + 145)].upper()
            else:
                # Reversed only, not complemented: the A/T dinucleotide set
                # tested below is unchanged by complementing.
                seq = genome[ll[0]][(int(ll[2]) - 145):(
                    int(ll[2]) - 4)].upper()[::-1]
            if len(seq) != 141:
                continue
            reads_count += 1
            for i in range(140):
                if seq[i:(i + 2)] in at_dinucleotides:
                    ATfrac[i] += 1

    if reads_count == 0:
        return [0.0] * 140
    return [(count * 1.0) / (reads_count * 1.0) for count in ATfrac]
コード例 #14
0
def seqbias(out, sequence, rflank, lflank):
    """Count genome-wide occurrences of every n-mer and write them to a file.

    The n-mer length is ``lflank + rflank``. Chromosomes whose name contains
    an underscore are skipped. Every window of the chromosome sequence and
    of ``rev(sequence)`` is counted unless it contains a lower-case base or
    an 'n'/'N'. Output is one ``<n-mer>\\t<count>`` line per n-mer, sorted
    by n-mer.
    """
    genome = twobitreader.TwoBitFile(sequence)
    nmer_len = lflank + rflank

    # keeps the number of occurrences of each n-mer
    seq_nmer_dict = make_nmer_dict(nmer_len)
    for chrom in genome.keys():
        if '_' in chrom:
            continue
        forward = genome[chrom][:]
        backward = rev(forward)
        for offset in range(len(forward) - nmer_len + 1):
            fwd_nmer = forward[offset:(offset + nmer_len)]
            rev_nmer = backward[offset:(offset + nmer_len)]
            if ('a' not in fwd_nmer and 't' not in fwd_nmer
                    and 'c' not in fwd_nmer and 'g' not in fwd_nmer
                    and 'n' not in fwd_nmer and 'N' not in fwd_nmer):
                seq_nmer_dict[fwd_nmer] += 1
            if ('a' not in rev_nmer and 't' not in rev_nmer
                    and 'c' not in rev_nmer and 'g' not in rev_nmer
                    and 'n' not in rev_nmer and 'N' not in rev_nmer):
                seq_nmer_dict[rev_nmer] += 1

    outf = open(out, 'w')
    for seqtype in sorted(seq_nmer_dict):
        fields = [str(seqtype), str(seq_nmer_dict[seqtype])]
        outf.write("\t".join(fields) + "\n")
    outf.close()
コード例 #15
0
ファイル: Scan_6c_matrix.py プロジェクト: Tarela/RivanaCode
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,Ipcut,Incut,pspan,tspan,gen,left,right,fetch_length=100):
    """Annotate every BED site with cut signals and per-base background.

    For each site in ``inputfile`` the function collects the outputs of
    ``make_cut`` for four bigWig tracks (``pcut``, ``ncut``, ``Ipcut``,
    ``Incut``), the ``TC``/``FOS`` values from ``makeTCFOS`` and, for every
    ``left + right``-mer of the surrounding sequence, the plus and minus
    background values looked up in the tables returned by ``readBG``. For
    '-' strand sites the plus/minus tracks are swapped and the background
    lists are swapped and reversed. Sites whose plus output is 'NA' or whose
    sequence contains 'N' are skipped.

    Args:
        inputfile: BED path (chrom, start, end, ..., strand in column 6).
        outputfile: Path of the tab-separated output.
        BGmatrix: Background matrix path handed to ``readBG``.
        pcut, ncut, Ipcut, Incut: bigWig paths.
        pspan: Span around the site; reduced by half the length of the first
            site before use.
        tspan: Span handed to ``makeTCFOS``.
        gen: Path of the 2bit genome.
        left, right: Bases of the background k-mer on each side.
        fetch_length: Passed through to ``make_cut``.
    """
    genome = twobitreader.TwoBitFile(gen)
    kmer_len = left + right

    # All handles are managed so they are closed on success and on error;
    # the original never closed the four bigWig files.
    with open(pcut, 'rb') as pcut_fh, open(ncut, 'rb') as ncut_fh, \
            open(Ipcut, 'rb') as Ipcut_fh, open(Incut, 'rb') as Incut_fh, \
            open(inputfile) as inf:
        pcutbw = BigWigFile(pcut_fh)
        ncutbw = BigWigFile(ncut_fh)
        Ipcutbw = BigWigFile(Ipcut_fh)
        Incutbw = BigWigFile(Incut_fh)

        # Site length is taken from the first record only.
        testll = inf.readline().split()
        ml = int(testll[2]) - int(testll[1])
        # Floor division: same result on Python 2, stays an int on Python 3.
        pspan = pspan - ml // 2
        inf.seek(0)
        pBG,nBG = readBG(BGmatrix)

        with open(outputfile,'w') as outf:
            for line in inf:
                ll = line.split()

                chrom = ll[0]
                start = int(ll[1])
                end = int(ll[2])
                strand = ll[5]
                seq = genome[chrom][(start-pspan-left):(end + pspan+right)]
                pout = make_cut(pcutbw,ll,pspan,fetch_length)
                nout = make_cut(ncutbw,ll,pspan,fetch_length)
                Ipout = make_cut(Ipcutbw,ll,pspan,fetch_length)
                Inout = make_cut(Incutbw,ll,pspan,fetch_length)

                if strand == "-":
                    pout,nout = nout,pout
                    Ipout,Inout = Inout,Ipout
                # NOTE(review): only pout is tested (after the swap); confirm
                # that make_cut cannot return 'NA' for just one of the tracks.
                if pout == 'NA':
                    continue

                if 'N' in seq.upper():
                    continue

                # The minus-strand sequence is offset by one base.
                pseq = seq[:-1]
                nseq = seq[1:]
                p=[]
                n=[]
                for k in range(len(pseq) + 1 - kmer_len):
                    p.append(pBG[pseq[k:k+kmer_len].upper()])
                    n.append(nBG[nseq[k:k+kmer_len].upper()])
                if strand != '-':
                    pbglist = p
                    nbglist = n
                else:
                    pbglist = n[::-1]
                    nbglist = p[::-1]
                TC,FOS = makeTCFOS(pcutbw,ncutbw,ll,tspan,ml)
                newll = ll  + [TC,FOS] + pout + nout + Ipout + Inout + pbglist + nbglist
                outf.write("\t".join(map(str,newll))+"\n")
コード例 #16
0
def render_indel_html(chrom,
                      pos,
                      ref_seq,
                      alt_seq,
                      twobit_file='ref/mm10.2bit',
                      boundary=20):
    """Render an indel with its flanking reference sequence as an HTML <pre>.

    The variant is classified as 'sub' (equal lengths), 'insert' (ALT longer)
    or 'delete' (REF longer); that word is used as the CSS class of the
    highlighted alleles. REF and ALT are padded with '-' to each other's
    length and shown between the flanking bases read from ``twobit_file``
    around ``pos`` (``boundary`` sets the flank width). The first line
    reports the size of the change in bp.
    """
    import twobitreader

    flank = boundary + 1
    reference = twobitreader.TwoBitFile(twobit_file)

    ref_length = len(ref_seq)
    alt_length = len(alt_seq)

    chrom_seq = reference[chrom]
    prefix = chrom_seq[pos - flank:pos - 1]
    suffix = chrom_seq[pos + ref_length - 1:pos + flank + ref_length - 2]

    # is the indel a direct sub, a delete or an insert?
    # this also decides the colours rendered in the sequence
    if ref_length > alt_length:
        in_or_del = 'delete'
        bp_text = "<strong class='%s'>%sbp deletion</strong>" \
            % (in_or_del, ref_length - alt_length)
    elif ref_length < alt_length:
        in_or_del = 'insert'
        bp_text = "<strong class='%s'>%sbp insert</strong>" \
            % (in_or_del, alt_length - ref_length)
    else:
        in_or_del = 'sub'
        bp_text = "<strong class='%s'>%sbp substitution</strong>" \
            % (in_or_del, alt_length)

    def _allele_row(label_html, allele, width):
        # One line: label, left flank, highlighted padded allele, right flank.
        return label_html + prefix + \
            '<strong class="' + in_or_del + '">' + \
            allele.ljust(width, '-') + '</strong>' + suffix

    ref_html = _allele_row('<strong>REF: </strong>', ref_seq, alt_length)
    alt_html = _allele_row('<strong>ALT: </strong>', alt_seq, ref_length)

    return '<pre>%s<br/>%s<br/>%s</pre>' % (bp_text, ref_html, alt_html)
コード例 #17
0
def seqbias(peak, tag, out, sequence, kmer):
    """Compute the cut bias of every k-mer: cuts observed / background count.

    Background counts come from every full 17-base window inside the peak
    regions; cut counts come from the 17-base window around the 5' end of
    each tag ('+' and '.' strand: around column 2; '-' and '.' strand:
    around column 3, passed through ``rev``). Windows are reduced to the
    k-mer key by ``SOBfetchSEQ``; keys absent from the k-mer tables are
    ignored. Once a tag line with fewer than 6 columns is seen, that line
    and all following lines are counted on both ends.

    Args:
        peak: BED path of the background regions.
        tag: BED path of the tags.
        out: Output path; one line per k-mer with the columns
            k-mer, bias, cut count, background count. Bias is -1 where the
            background count is 0.
        sequence: Path of the 2bit genome.
        kmer: k-mer size handed to ``make_nmer_dict`` and ``SOBfetchSEQ``.
    """
    genome = twobitreader.TwoBitFile(sequence)
    pcut = make_nmer_dict(kmer)
    bgseq = make_nmer_dict(kmer)

    with open(peak) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            seq = genome[ll[0]][int(ll[1]):int(ll[2])]

            # Only positions with 6 bases to the left and 11 to the right
            # give a full 17-base window; the original visited every position
            # and discarded the short (or wrapped-around) slices afterwards.
            for i in range(6, len(seq) - 10):
                subseq = SOBfetchSEQ(seq[(i - 6):(i + 11)], kmer)
                # `in` replaces dict.has_key (removed in Python 3).
                if subseq in bgseq:
                    bgseq[subseq] += 1

    PEtag = 0
    with open(tag) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            if len(ll) < 6:
                PEtag = 1
            if PEtag == 1 or ll[5] == '+' or ll[5] == ".":
                rawseq = genome[ll[0]][(int(ll[1]) - 6):(int(ll[1]) + 11)].upper()
                if len(rawseq) != 17:
                    # NOTE(review): as in the original, this also skips the
                    # '-' end of the same line; confirm that is intended.
                    continue
                seq = SOBfetchSEQ(rawseq, kmer)
                if seq in pcut:
                    pcut[seq] += 1
            if PEtag == 1 or ll[5] == '-' or ll[5] == ".":
                rawseq = rev(genome[ll[0]][(int(ll[2]) - 11):(int(ll[2]) +
                                                              6)].upper())
                if len(rawseq) != 17:
                    continue
                seq = SOBfetchSEQ(rawseq, kmer)
                if seq in pcut:
                    pcut[seq] += 1

    with open(out, 'w') as outf:
        for seqtype in sorted(pcut.keys()):
            if bgseq[seqtype] == 0:
                pbias = -1
            else:
                pbias = float(pcut[seqtype]) / float(bgseq[seqtype])
            outf.write("\t".join(
                map(str, [seqtype, pbias, pcut[seqtype], bgseq[seqtype]])) + "\n")
コード例 #18
0
def fetch_subseq(path, chrom, start, end, strand):
    """Return ``chrom[start:end]`` from the 2bit file at ``path``.

    The slice is passed through ``reverse`` when ``strand`` is "-".
    """
    chrom_seq = twobitreader.TwoBitFile(path)[chrom]
    fragment = chrom_seq.get_slice(start, end)
    return reverse(fragment) if strand == "-" else fragment
コード例 #19
0
 def test_twobit_chr1_sequence(self):
     """chr1 of the fixture file reads back as the expected sequence."""
     t = twobitreader.TwoBitFile(self.filename)
     try:
         chr1 = str(t['chr1'])
         self.assertEqual(
             chr1,
             'GAACATGTACAACCTGACCTTCCACgaacatgtacaacctgaccttccacNNNNATGTACAACCTGACCTTCCAC'
         )
     finally:
         # Close the reader even when the assertion fails.
         t.close()
コード例 #20
0
def load_genomes(UTRfilestring, firstStopsCSV, twobitfile):
	"""
	Load the UTR dictionary, the first-stops table and the 2bit genome in
	one place, so that callers only have to load them a single time.

	Returns the tuple (UTRdict, utr3adj, genome).
	"""
	# do we actually need to load the genome in here?
	return (
		rph.readindict(open(UTRfilestring, "rU")),
		pd.read_csv(firstStopsCSV, index_col=0),
		twobitreader.TwoBitFile(twobitfile),
	)
コード例 #21
0
ファイル: genome_accessor.py プロジェクト: rajewsky-lab/byo
    def __init__(self,
                 path,
                 chrom,
                 sense,
                 system=None,
                 split_chrom="",
                 **kwargs):
        """Open ``<path>/<system>.2bit`` as the sequence source.

        Forces ``sense_specific=False`` before delegating to the parent
        constructor. On success the sequence sizes are cached on the reader
        as ``chrom_stats`` and, if ``split_chrom`` is given, a lookup from
        the part of each name before that separator to the full name is
        built. If the file cannot be opened (``IOError``) the accessor
        switches to dummy mode: ``get_data`` and ``get_oriented`` are
        replaced by ``get_dummy`` and ``no_data`` is set.
        """
        kwargs['sense_specific'] = False
        super(TwoBitAccessor, self).__init__(path,
                                             chrom,
                                             sense,
                                             system=system,
                                             **kwargs)
        import twobitreader as TB
        from time import time

        self.system = system
        self.data = None

        # try to access the whole genome, using indexing for fast lookup
        fname = os.path.join(path, "{0}.2bit".format(system))
        logger_name = 'byo.io.TwoBitAccessor({0})'.format(fname)
        self.logger = logging.getLogger(logger_name)

        t0 = time()
        try:
            self.data = TB.TwoBitFile(fname)
            t1 = time()
            sizes = self.data.sequence_sizes()
            self.data.chrom_stats = sizes

            lookup = {}
            if split_chrom:
                for full_name in sizes.keys():
                    lookup[full_name.split(split_chrom)[0]] = full_name
            self.chrom_lookup = lookup

            self.covered_strands = '*'
            self.logger.info(
                "file provides {0} sequences.".format(len(sizes)))
        except IOError:
            t1 = time()
            # all fails: return Ns only
            self.logger.warning(
                "Could not access '{0}'. Switching to dummy mode (only Ns)".
                format(fname))
            self.get_data = self.get_dummy
            self.get_oriented = self.get_dummy
            self.covered_strands = [chrom + '+', chrom + '-']
            self.no_data = True

        # TODO: maybe remove this if not needed
        self.get = self.get_oriented
        t2 = time()
        open_ms = (t1 - t0) * 1000.
        total_ms = (t2 - t0) * 1000.
        self.logger.debug(
            "opening 2bit file took {0:.1f}ms, entire constructor {1:.1f}ms".
            format(open_ms, total_ms))
コード例 #22
0
def main():
    """Score reference and mutated sequences at CTCF sites against the PWM.

    For every cancer type and binding type ('gained', 'lost') the mutation
    table is read, and for each site with at least one mutation the 19-base
    sequence around ``mid`` is fetched from hg38 and scored (both strands),
    before and after applying the mutations, with the log2 likelihood ratio
    of the CTCF position weight matrix. Sites without mutations get zero
    scores. One CSV per cancer type / binding type is written to the output
    directory.
    """
    outdir = 'f6_mutation_matrix_score'
    os.makedirs(outdir,exist_ok=True)
    indir = 'f2_mutation_on_cancer_specific_CTCF'

    # CTCF position weight matrix, columns in A, C, G, T order
    pwm_file ='/nv/vol190/zanglab/shared/Motif/pwm/jaspar_vertebrates/CTCF_MA0139.1.txt' # ACGT
    pwm = pd.read_csv(pwm_file,header=None,sep='\t')
    pwm.columns=['A','C','G','T']

    # this is only tested bg_var from /nv/vol190/zanglab/shared/Motif/sites/hg38_fimo_jarspar/raw_results/CTCF/fimo.txt
    bg_var=0.025
    bg=[0.25+bg_var,0.25-bg_var,0.25-bg_var,0.25+bg_var]
    # small pseudo-count keeps log2 finite for zero entries
    log2LikelihoodRatio = np.log2((pwm+0.0000001)/bg)
    # hg38 sequence
    genome = twobitreader.TwoBitFile('/nv/vol190/zanglab/zw5j/work2017/fusion_analysis/111_fusion_append/4_CTCFpairing_GCskew_RLoop_RNAprotein_GTExFusion/f0_infiles/hg38.2bit')

    cancertype_mutation_matchness = {'BRCA':'BRCA','CRC':'COAD','LUAD':'LUAD','PRAD':'PRAD','AML':'AML','PRAD_TissueAdded':'PRAD'}
    cancertypes=['BRCA','CRC','LUAD','AML','PRAD','PRAD_TissueAdded']
    score_columns = ('seq_score','alt_seq_score','rev_seq_score','rev_alt_seq_score')

    for celltype in cancertypes:
        mutation_source = cancertype_mutation_matchness[celltype]
        for binding_type in ['gained','lost']:
            mutation_info_file ='{}/{}_mutation_on_{}_{}_expand9.csv'.format(indir,mutation_source,celltype,binding_type)
            df = pd.read_csv(mutation_info_file,sep='\t',index_col=1,low_memory=False)
            # for each binding position with motif, get the DNA sequence
            matching_score_df = pd.DataFrame()
            for site_id in df.index:
                chrom = df.loc[site_id,'chr']
                mid = df.loc[site_id,'mid']
                strand = df.loc[site_id,'strand']
                mutation_info = df.loc[site_id,'mutation'].split(',')
                mutation_info_len = [len(entry) for entry in mutation_info]

                matching_score_df.loc[site_id,'chr'] = chrom
                matching_score_df.loc[site_id,'mid'] = mid
                matching_score_df.loc[site_id,'strand'] = strand

                # check if there is a mutation
                if sum(mutation_info_len) !=0:
                    sequence = Seq(genome[chrom][mid-9-1:mid+9]).upper()
                    assert if_all_ATCG(sequence)
                    alt_sequence = get_altered_sequencing(sequence,mutation_info,mutation_info_len)
                    seq_score,rev_seq_score = return_matching_score(log2LikelihoodRatio,sequence)
                    alt_seq_score,rev_alt_seq_score = return_matching_score(log2LikelihoodRatio,alt_sequence)
                    scores = (seq_score,alt_seq_score,rev_seq_score,rev_alt_seq_score)
                else:
                    scores = (0,0,0,0)

                for column,value in zip(score_columns,scores):
                    matching_score_df.loc[site_id,column] = value

            matching_score_df.to_csv('{}/{}_{}.csv'.format(outdir,celltype,binding_type))
コード例 #23
0
def list_sgrnas(genes_file, input_prefix, GC_cutoff, spacing, guides_per_gene,
                gecko, sam):
    """
    Return a list of (on-target) sgRNA guides, using a genome file and a list
    of target regions read from a .csv file.

    The genome is read from ``input_prefix + '.2bit'``. The first row of
    ``genes_file`` supplies the column names; every following row is one
    gene and must provide at least "chrom", "start", "end" and "name".
    Genes whose region contains an N, or for which no guide is found, are
    skipped. At most ``guides_per_gene`` guides are kept per gene, ranked by
    the off-target score appended to each guide. When ``gecko`` or ``sam``
    is truthy an oligo (flank + spacer + flank) is appended to every guide;
    if both are truthy the SAM flanks are used.
    """
    genome_2bit_file = input_prefix + '.2bit'
    tbf = twobitreader.TwoBitFile(genome_2bit_file)
    final_guides = []

    with open(genes_file, 'rb') as gf:
        f = [row for row in csv.reader(gf.read().splitlines())]
        for i, l in enumerate(f):
            # the header row supplies the column names
            if i == 0:
                columns = l
                continue

            # fetch the current gene and its region (end coordinate inclusive)
            gene = dict([(columns[i], e) for i, e in enumerate(l)])
            region_bounds = [long(gene["start"]), long(gene["end"]) + 1]
            region = tbf[gene["chrom"]][region_bounds[0]:region_bounds[1]]
            region = region.upper()
            if "N" in region:
                print "found N in target region of", gene["name"]
                continue

            # identify and filter the guides that target the region
            guides = get_sorted_guides(region, gene, GC_cutoff, spacing,
                                       input_prefix)
            if len(guides) == 0:
                continue

            # append the off-target score to each filtered guide; the spacer
            # (guide[1]) is the lookup key
            ot_guides_sql = get_ot_guides(guides, input_prefix)
            ot_guides_dict = dict(ot_guides_sql)
            for g in guides:
                spacer = g[1]
                g.append(ot_guides_dict[spacer])

            # sort by the appended score, highest first, and keep the top ones
            guides = sorted(guides, key=itemgetter(-1), reverse=True)

            if len(guides) <= guides_per_gene:
                final_guides.extend(guides)
            else:
                final_guides.extend(guides[:guides_per_gene])

    # add gecko or sam flanking sequences to the spacer for the oligo library
    if sam or gecko:
        for guide in final_guides:
            spacer = guide[1]
            if gecko:
                oligo = gecko_flank[0] + spacer + gecko_flank[1]
            # evaluated after gecko, so sam wins when both flags are set
            if sam:
                oligo = sam_flank[0] + spacer + sam_flank[1]
            guide.append(oligo)

    return final_guides
コード例 #24
0
def getTwoBits():
	"""
	Open every .2bit file found in the MM10 directory and return the readers
	in a dict keyed by the fifth dot-separated token of each file name.
	"""
	import twobitreader as tbr
	basedir="/home/clarkg/MouseGenomes/MM10/"
	genome={}
	for twobit_path in glob.glob(basedir+"*.2bit"):
		file_name=twobit_path.rsplit("/",1)[-1]
		chrom_key=file_name.split(".")[4]
		genome[chrom_key]=tbr.TwoBitFile(twobit_path)
	return genome
コード例 #25
0
def _examinePSL(querySeq, blatinfo):
    """Return the deletions found for the first alignment that has any.

    For every alignment record the target span is read from the matching
    per-chromosome 2bit file, the genomic interval of each aligned block is
    collected, and ``missingRange`` is asked for the deletions within the
    target span. The target strand is inserted in front of each non-empty
    result.

    Returns the first such strand-prefixed list, or ``[]`` if no alignment
    yielded deletions.

    NOTE(review): the loop iterates over ``results``, which is not assigned
    in this function, and rebinds ``blatinfo``, so the ``blatinfo`` argument
    is never read. Confirm whether ``results`` is an intended module-level
    name or whether the argument was meant to be iterated.
    """

    multiDels = []
    for blatinfo in results:

        # one alignment record, unpacked into its ten fields
        qsize, qCuts, qblocks, tStart, tEnd, tStrand, tCuts, matches, coverage, chromstr = blatinfo
        chromfile = os.path.join(dirset.chromosomes, chromstr + ".2bit")
        bitfile = tbr.TwoBitFile(chromfile)

        # target span of this alignment
        genomicSequence = bitfile[chromstr][int(tStart):int(tEnd)]
        #flagged=_initAlign(querySeq,genomicSequence,tStrand)
        #if flagged:
        #	genomeGaps=map(lambda l :int(tStart)+l,flagged)
        #	gapSpans=group_consecutives(genomeGaps)
        #	#print "INSERTIONS INTO QUERY at points:"
        #	for i,gp in enumerate(gapSpans):
        #		if len(gp) == 2:
        #			print "\t\t"+str(i+1)+".)\t"+chromstr+":"+"-".join([str(gp[0]),str(gp[1])])
        #		elif len(gp) == 1:
        #			print "\t\t"+str(i+1)+".)\t"+chromstr+":"+str(gp[0])
        #	qblocks=[]

        # [start, end] genomic interval of every aligned block
        genomicFlats = []
        for x in range(len(qblocks)):
            try:
                qB = int(qCuts[x])
                lenB = int(qblocks[x])
                tS = int(tCuts[x])
                # block coordinates are genomic; shift into the fetched span
                genomicCut = genomicSequence[(int(tS) -
                                              int(tStart)):(int(tS) -
                                                            int(tStart) +
                                                            lenB)]
                genomicFlats.append([int(tS), int(tS) + lenB])
                # location, Gseq and Qseq are computed but not used below
                location = chromstr + "(" + tStrand + "):" + str(tS)
                if tStrand == "-":
                    Gseq = str(Seq(genomicCut).reverse_complement())
                else:
                    Gseq = genomicCut
                Qseq = querySeq[qB:(qB + lenB)]
            #	print ">Query\n",Qseq
            #	print ">Genom\n",Gseq
            except ValueError:
                # a block that raises ValueError is skipped silently
                pass

        deletions = missingRange(genomicFlats, tStart, tEnd)
        if len(deletions):
            deletions.insert(0, tStrand)
            multiDels.append(deletions)


#	print "\n\n"
#	print len(multiDels[0])
#	time.sleep(5)
    # only the first alignment with deletions is reported
    if len(multiDels):
        return multiDels[0]
    else:
        return []
コード例 #26
0
def main(args):
    """Join a variant table with Mutalyzer results and write them out.

    Reads the tab-separated variant table ``args.dat_file`` and the
    tab-separated Mutalyzer output ``args.mutalyzer_results``, filters and
    de-duplicates the variants, joins both tables on ``'Input Variant'``,
    and passes the result to ``write_tab`` together with the 2bit genome
    opened from ``args.twobit_file`` and the output target ``args.vcf_out``.

    Args:
        args: Object exposing the attributes ``dat_file``,
            ``mutalyzer_results``, ``twobit_file`` and ``vcf_out``
            (presumably an ``argparse.Namespace`` -- confirm with caller).
    """
    dat_file = args.dat_file
    mutalyzer_results = args.mutalyzer_results
    twobit_file = args.twobit_file
    vcf_out = args.vcf_out

    # NOTE: the BLAT coordinate-hash correction of the Mutalyzer output
    # (load_coord_hash / fix_mutalyzer) is currently disabled.
    mut_df = pandas.read_csv(mutalyzer_results, sep='\t')

    genome = twobitreader.TwoBitFile(twobit_file)

    df_init = pandas.read_csv(dat_file, sep='\t')
    # Keep rows that have a reference allele, whose Alt is not one of the
    # 'dup'/'del'/'ins' markers, and whose Transcript does not contain 'Leu'.
    crit = df_init.apply(lambda row: row['Ref'] != '-'
                         and row['Alt'] not in ('dup', 'del', 'ins')
                         and 'Leu' not in row['Transcript'],
                         axis=1)
    # Explicit copy: columns are added below, and assigning into a filtered
    # selection of df_init would otherwise raise SettingWithCopyWarning.
    df_pre = df_init[crit].copy()

    # Remove rows that are duplicates once the raw 'Transcript' column is
    # ignored (the derived 'simple_nm' column takes part in the comparison).
    df_pre.loc[:, 'simple_nm'] = df_pre.apply(fix_transcript, axis=1)
    cols = [x for x in df_pre.columns.values if x != 'Transcript']
    idx_vals = df_pre[cols].drop_duplicates().index.values
    # Back to the original column set (drops 'simple_nm'); copy for the same
    # reason as above.
    df = df_pre.loc[idx_vals, df_init.columns.values].copy()

    df.loc[:, 'ref'] = df.apply(fix_ref, axis=1)
    df.loc[:, 'alt'] = df.apply(fix_alt, axis=1)
    df.loc[:, 'c.'] = df.apply(fix_cdot, axis=1)
    df.loc[:, 'Input Variant'] = df.apply(mk_var_new, axis=1)
    # Collapse whitespace in the classification into underscores.
    df.loc[:, 'clinical_class'] = df.apply(
        lambda row: '_'.join(str(row['Classification']).split()), axis=1)

    # Left join followed by dropna: variants with a missing or bad Mutalyzer
    # result have no 'Chromosomal Variant' and are dropped, so the net effect
    # is that of an inner join on usable Mutalyzer rows.
    m = pandas.merge(df, mut_df, on='Input Variant', how='left').dropna(
        subset=['Chromosomal Variant']).copy()
    m.loc[:, 'chrom'] = m.apply(get_chrom, axis=1)
    m.loc[:, 'pos'] = m.apply(get_pos, axis=1)

    uc = ['chrom', 'pos', 'ref', 'alt',
          'clinical_class', 'Pos Fam Cnt', 'Neg Fam Cnt', 'Homozygous Fam Cnt',
          'Hemizygous Fam Cnt', 'Heterozygous Fam Cnt', 'Chromosomal Variant',
          'Input Variant']
    new_cols = {
        'Pos Fam Cnt': 'pos_fam_count',
        'Neg Fam Cnt': 'neg_fam_count',
        'Homozygous Fam Cnt': 'hom_fam_count',
        'Hemizygous Fam Cnt': 'hemi_fam_count',
        'Heterozygous Fam Cnt': 'het_fam_count',
    }
    df_final = m[uc].rename(columns=new_cols).sort_values(by=['chrom', 'pos'])
    # Second de-duplication pass, this time ignoring 'Input Variant'.
    final_cols = [x for x in df_final.columns.values
                  if x != 'Input Variant']
    idx_vals = df_final[final_cols].drop_duplicates().index.values
    df_fix = df_final.loc[idx_vals]
    write_tab(df_fix, genome, vcf_out)
コード例 #27
0
def seqbias(peak, tag, out, sequence, flank):
    """Write per-n-mer cut-site bias relative to a background count.

    For every n-mer of length ``2 * flank`` tracked by ``make_nmer_dict``,
    counts how often it occurs (a) as a sliding window inside the regions
    listed in ``peak`` and (b) centred on the read ends listed in ``tag``,
    then writes one tab-separated line per n-mer to ``out``:
    ``nmer, bias, cut_count, background_count`` where ``bias`` is
    ``cut_count / background_count`` or ``-1`` when the background is zero.

    Args:
        peak: Path of a tab-separated file with chromosome, start, end in
            the first three columns.
        tag: Path of a tab-separated file with chromosome, start, end in
            the first three columns and the strand in the sixth.
        out: Path of the output table.
        sequence: Path of the 2bit genome file.
        flank: Number of bases taken on each side of a cut position.
    """
    genome = twobitreader.TwoBitFile(sequence)
    width = 2 * flank
    # presumably dicts keyed by every n-mer of this width, initialised to 0
    # -- confirm against make_nmer_dict.
    pcut = make_nmer_dict(width)
    bgseq = make_nmer_dict(width)

    # Background: sliding windows over each peak region.
    # NOTE(review): the sequence is not upper-cased here (unlike the cut
    # sites below), and range() stops one window short of the region end;
    # both are kept as in the original -- confirm whether intended.
    with open(peak) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            seq = genome[ll[0]][int(ll[1]):int(ll[2])]
            for i in range(len(seq) - width):
                subseq = seq[i:(i + width)]
                # `in` instead of dict.has_key(), which Python 3 removed.
                if subseq in bgseq:
                    bgseq[subseq] += 1

    # Foreground: n-mers centred on read ends.
    PEtag = 0
    with open(tag) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            # Skip contigs whose name contains '.' or '_'.
            if "." in ll[0] or "_" in ll[0]:
                continue
            chrm = 'chrM' if ll[0] == "chrMT" else ll[0]
            # Fewer than six columns means there is no strand column. The
            # flag is never reset, so every later line is then counted on
            # both strands as well (original behaviour, kept).
            if len(ll) < 6:
                PEtag = 1
            if PEtag == 1 or ll[5] == '+' or ll[5] == ".":
                seq = genome[chrm][(int(ll[1]) - flank):(int(ll[1]) +
                                                         flank)].upper()
                if seq in pcut:
                    pcut[seq] += 1
            if PEtag == 1 or ll[5] == '-' or ll[5] == ".":
                seq = rev(genome[chrm][(int(ll[2]) - flank):(int(ll[2]) +
                                                             flank)].upper())
                if seq in pcut:
                    pcut[seq] += 1

    with open(out, 'w') as outf:
        for seqtype in sorted(pcut):
            if bgseq[seqtype] == 0:
                # Sentinel for "no background observations".
                pbias = -1
            else:
                pbias = float(pcut[seqtype]) / float(bgseq[seqtype])
            outf.write("\t".join(
                map(str, [seqtype, pbias, pcut[seqtype], bgseq[seqtype]])) +
                       "\n")
コード例 #28
0
def divideReadsFromBias(infile, outname, BiasfileA, BiasfileD, sequence,
                        flank):
    """Split reads into four files by ATAC/DNase bias of their cut n-mer.

    For each line of ``infile`` the n-mer of length ``2 * flank`` around the
    read end is looked up in both bias tables. Reads whose n-mer is missing
    from either table are dropped. The remaining reads are written to
    ``<outname>_all.bed`` and to exactly one of
    ``<outname>_biasG{11,10,01,00}.bed``, where the first digit is 1 when
    the ATAC bias exceeds ``medATAC`` and the second is 1 when the DNase
    bias exceeds ``medDNase``.

    Reads whose strand column is neither ``'+'`` nor ``'-'`` are assigned a
    strand at random (fixed seed 1228) and rewritten as a 1-bp interval at
    the chosen end.

    Args:
        infile: Path of a tab-separated file with at least six columns
            (chromosome, start, end, two pass-through fields, strand).
        outname: Prefix for the five output files.
        BiasfileA: Bias table passed to ``readDict`` for ATAC.
        BiasfileD: Bias table passed to ``readDict`` for DNase.
        sequence: Path of the 2bit genome file.
        flank: Number of bases taken on each side of the cut position.
    """
    genome = twobitreader.TwoBitFile(sequence)
    # readDict presumably returns (n-mer -> bias mapping, median bias)
    # -- confirm against its definition.
    biasATAC, medATAC = readDict(BiasfileA)
    biasDNase, medDNase = readDict(BiasfileD)

    # A single `with` closes every handle even if a lookup raises mid-file.
    with open(infile) as inf, \
            open(outname + "_all.bed", 'w') as outf, \
            open(outname + "_biasG11.bed", 'w') as outf11, \
            open(outname + "_biasG10.bed", 'w') as outf10, \
            open(outname + "_biasG01.bed", 'w') as outf01, \
            open(outname + "_biasG00.bed", 'w') as outf00:
        # Fixed seed so the random strand assignment is reproducible.
        # Note that this reseeds the module-level RNG.
        random.seed(1228)

        for line in inf:
            ll = line.strip().split("\t")
            if ll[5] == '+':
                seq = genome[ll[0]][(int(ll[1]) - flank):(int(ll[1]) +
                                                          flank)].upper()
            elif ll[5] == '-':
                seq = rev(genome[ll[0]][(int(ll[2]) - flank):(int(ll[2]) +
                                                              flank)].upper())
            else:
                # Unstranded read: pick an end at random and emit it as a
                # stranded 1-bp record instead of the original line.
                rdnum = random.randint(0, 1)
                if rdnum == 0:
                    seq = genome[ll[0]][(int(ll[1]) - flank):(int(ll[1]) +
                                                              flank)].upper()
                    newll = [ll[0], ll[1], str(int(ll[1]) + 1),
                             ll[3], ll[4], "+"]
                else:
                    seq = rev(
                        genome[ll[0]][(int(ll[2]) - flank):(int(ll[2]) +
                                                            flank)].upper())
                    newll = [ll[0], str(int(ll[2]) - 1), ll[2],
                             ll[3], ll[4], "-"]
                line = "\t".join(newll) + "\n"

            # `in` instead of dict.has_key(), which Python 3 removed.
            if seq not in biasATAC or seq not in biasDNase:
                continue

            biasA = biasATAC[seq]
            biasD = biasDNase[seq]
            outf.write(line)
            if biasA > medATAC:
                if biasD > medDNase:
                    outf11.write(line)
                else:
                    outf10.write(line)
            else:
                if biasD > medDNase:
                    outf01.write(line)
                else:
                    outf00.write(line)
コード例 #29
0
 def test_pickle(self):
     """Check that a TwoBitFile survives a pickle round trip.

     Dumps the file object opened from ``self.filename`` into an in-memory
     buffer, loads it back, and asserts that both objects expose the same
     keys and that every sequence renders to the same string.
     """
     t = twobitreader.TwoBitFile(self.filename)
     try:
         # NOTE(review): under Python 3 pickle needs a bytes buffer, so
         # StringIO here presumably is (an alias of) a bytes-capable
         # buffer -- confirm against the module imports.
         buf = StringIO()
         pickle.dump(t, buf)
         buf.seek(0)
         t2 = pickle.load(buf)
         self.assertListEqual(sorted(t.keys()), sorted(t2.keys()))
         for k in t:
             self.assertEqual(str(t[k]), str(t2[k]))
     finally:
         # Close the handle even when one of the assertions above fails.
         t.close()
コード例 #30
0
ファイル: seq_parsers.py プロジェクト: ptrebert/creepiest
def get_twobit_seq(fpath, chrom):
    """ Open a 2bit file and hand back the entry stored under a given name.

    Note that the result is not a plain string: as the return type below
    says, it is a TwoBitSequence object, which allows sliced access.

    :param fpath: path of the 2bit file passed to ``tbr.TwoBitFile``
    :param chrom: key used to select the sequence inside the 2bit file
    :return: the sequence object found under ``chrom``
     :rtype: TwoBitSequence
    """
    return tbr.TwoBitFile(fpath)[chrom]