コード例 #1
0
ファイル: test_pileup.py プロジェクト: alimanfoo/pysamstats
def test_load_cov_long_contig_name():
    """Check the width of the ``chrom`` field for a long contig name.

    Without an explicit dtype the field must be exactly as wide as the
    requested label; with a ``dtype`` override the requested width wins.
    """
    bam_fixture = 'fixture/longcontignames.bam'
    contig = 'AS2_scf7180000696055'

    # Default dtype, BAM given as a path: itemsize equals the label length.
    auto_sized = pysamstats.load_coverage(bam_fixture, chrom=contig)
    assert auto_sized.dtype["chrom"].itemsize == len(contig)

    # Explicit dtype, BAM given as an open Samfile: itemsize is the
    # requested 10 bytes.
    overridden = pysamstats.load_coverage(
        Samfile(bam_fixture),
        chrom=contig,
        dtype={"chrom": "a10"},
    )
    assert overridden.dtype["chrom"].itemsize == 10
コード例 #2
0
def _print_cnv(chrom, position, first, last, output):
    """Write one tab-separated CNV record spanning rows ``first``..``last``."""
    print(chrom[first].decode("utf-8"),
          position[first],
          position[last],
          'cnv',
          sep='\t',
          file=output)


def cnv_calling(bam_file, output):
    """Report runs of high-coverage windows for every contig of a BAM file.

    For each reference in the BAM header the ``reads_pp`` coverage is loaded
    and scanned in consecutive windows of 1000 rows of the coverage array.
    Adjacent windows whose mean (as computed by ``mean_of_array``) is at
    least the upper threshold returned by ``threshold`` are merged into one
    run, and each run is written to ``output`` as
    ``chrom<TAB>start_pos<TAB>end_pos<TAB>cnv``.

    Args:
        bam_file: Path of the BAM file, passed to ``pysam.AlignmentFile``.
        output: Writable text stream that receives the CNV records.

    Notes:
        * A run that is still open when the last full window of a contig has
          been processed is now reported as well; previously it was dropped.
        * Rows after the last multiple of 1000 are not scanned (unchanged).
        * The BAM file is closed when the function returns.
    """
    window = 1000
    with pysam.AlignmentFile(bam_file) as mybam:
        for contig in mybam.header.references:
            stats = pysamstats.load_coverage(mybam, chrom=contig)
            reads = stats.reads_pp
            position = stats.pos
            chrom = stats.chrom
            # threshold() returns an (upper, lower) pair; only the upper
            # bound is used here.
            thresh_up, _thresh_down = threshold(reads)

            in_run = False
            run_start = 0
            run_end = 0
            start = 0
            for i in range(window, len(reads), window):
                # presumably the mean of reads[start:i] -- verify against
                # mean_of_array's definition
                mean = mean_of_array(reads, start, i)
                if mean >= thresh_up:
                    if not in_run:
                        # First high window after a gap opens a new run.
                        run_start = start
                        in_run = True
                    run_end = i
                elif in_run:
                    _print_cnv(chrom, position, run_start, run_end, output)
                    in_run = False
                start = i

            # Flush a run that extends to the last scanned window.
            if in_run:
                _print_cnv(chrom, position, run_start, run_end, output)
コード例 #3
0
def get_coverage(bamfile: str, chrom: str, start: int, end: int) -> pd.DataFrame:
    """Get per-position read depth from a BAM file for one region.

    Args:
        bamfile (str): path of an indexed BAM file (passed directly to
            ``pysam.AlignmentFile``)
        chrom (str): chromosome name
        start (int): start position of interest
        end (int): end position of interest

    Returns:
        pd.DataFrame: A dataframe with the columns ``chrom`` (decoded to
        ``str``), ``pos`` and ``depth`` (the ``reads_all`` field of
        ``pysamstats.load_coverage``).

    The coverage is loaded with ``truncate=True`` and ``pad=True``; see the
    pysamstats documentation for the exact meaning of these flags.  The BAM
    file is closed before the function returns.
    """
    with pysam.AlignmentFile(bamfile) as mybam:
        coverage = pysamstats.load_coverage(
            mybam,
            chrom=chrom,
            start=start,
            end=end,
            truncate=True,
            pad=True,
            fields=["chrom", "pos", "reads_all"],
        )

    df = pd.DataFrame(coverage)
    # load_coverage yields contig names as bytes; expose them as text.
    df["chrom"] = df["chrom"].apply(lambda x: x.decode())
    df.rename({"reads_all": "depth"}, inplace=True, axis=1)

    return df
コード例 #4
0
ファイル: module.py プロジェクト: Aucomte/snake-Mapping
def check_mapping_stats(bam, out_csv, sep="\t"):
    """Write mean/median depth and genome coverage of one BAM file to a CSV.

    The BAM file is indexed first if ``<bam>.bai`` does not exist.  Coverage
    is loaded with ``pysamstats.load_coverage(..., pad=True)`` and three
    statistics are computed over the positions with ``reads_all >= 1``:

    * "Mean mapping Depth coverage"   -- mean of ``reads_all``
    * "Median mapping Depth coverage" -- median of ``reads_all``
    * "Mean Genome Coverage"          -- number of such positions divided by
      the summed ``LN`` of all ``@SQ`` header records, as a percentage

    Args:
        bam: Path of the BAM file; its stem is used as the sample name.
        out_csv: Path of the CSV file to write (one row, indexed by sample).
        sep: Field separator for the CSV output.

    Raises:
        IndexError: if the ``CL`` field of the first ``@PG`` header record
            contains no ``/...fasta`` path.
    """
    from numpy import median, mean
    from pysamstats import load_coverage

    dico_size_ref_genome = {}
    dicoResume = defaultdict(OrderedDict)

    bam_path = Path(bam).as_posix()
    if not os.path.exists(bam_path + ".bai"):
        pysam.index(bam_path)
    sample = Path(bam).stem
    print(f"\n\n{'*'*30}\nSAMPLE NAME: {sample}\n{'*'*30}\n\n")

    # The context manager closes the BAM handle once coverage is loaded.
    with pysam.AlignmentFile(bam, "r") as bam_file:
        # Reference name = stem of the fasta path found in the command line
        # of the first @PG record.  Raw string: "\." is a regex escape, not
        # a Python string escape.
        name_fasta_ref = Path(
            re.findall(r"[/].*\.fasta",
                       bam_file.header["PG"][0]["CL"],
                       flags=re.IGNORECASE)[0]).stem
        dico_size_ref_genome[name_fasta_ref] = sum(
            dico["LN"] for dico in bam_file.header["SQ"])
        a = load_coverage(bam_file, pad=True)

    df = pd.DataFrame(a)
    df.chrom = df.chrom.str.decode(encoding='UTF-8')
    # Depth values of every position covered by at least one read.
    listMap = df[df.reads_all >= 1].reads_all

    dicoResume[sample]["Mean mapping Depth coverage"] = f"{mean(listMap):.2f}"
    dicoResume[sample][
        "Median mapping Depth coverage"] = f"{median(listMap):.2f}"
    dicoResume[sample][
        "Mean Genome Coverage"] = f"{(len(listMap)/dico_size_ref_genome[name_fasta_ref])*100:.2f}%"

    dataframe_mapping_stats = pd.DataFrame.from_dict(dicoResume,
                                                     orient='index')
    with open(out_csv, "w") as out_csv_file:
        dataframe_mapping_stats.to_csv(out_csv_file, index=True, sep=sep)
コード例 #5
0
ファイル: test_pileup.py プロジェクト: alimanfoo/pysamstats
def test_load_cov_using_steppers():
    """Check that each stepper yields its expected counts at one position.

    Also checks that an unknown stepper name raises ``ValueError``.
    """
    # this is the only bam file that differs between all/nofilter
    bam_fixture = "fixture/longcontignames.bam"
    contig = 'AS2_scf7180000695891'
    row_index = 14311

    # (stepper, expected reads_all, expected reads_pp) at ``row_index``
    expectations = (
        ("all", 7, 4),
        ("nofilter", 8, 5),
        ("samtools", 4, 4),
    )

    for stepper, want_all, want_pp in expectations:
        coverage = pysamstats.load_coverage(
            Samfile(bam_fixture), chrom=contig, stepper=stepper, pad=True)
        eq_(want_all, coverage[row_index]["reads_all"])
        eq_(want_pp, coverage[row_index]["reads_pp"])

    with assert_raises(ValueError):
        pysamstats.load_coverage(
            Samfile(bam_fixture), chrom=contig, stepper="notastepper")
コード例 #6
0
def _report_region(chrom, position, first, last, out):
    """Print one high-coverage region to stdout and to the stream ``out``."""
    line = (f'{chrom[first].decode("utf-8")}:{position[first]} '
            f'{chrom[last].decode("utf-8"):>20}:{position[last]}')
    print(line)
    print(line, file=out)


def main():
    """Scan 'UMB.aligned.sorted.bam' for runs of high-coverage windows.

    For every contig the ``reads_pp`` coverage is scanned in consecutive
    windows of 1000 rows of the coverage array.  Adjacent windows whose mean
    (as computed by ``mean_of_array``) reaches the upper threshold returned
    by ``threshold`` are merged into one run; each run is printed to stdout
    and written to 'output_std_umb_contig_final.txt'.

    Notes:
        * A run still open after the last full window of a contig is now
          reported too; previously it was silently dropped.
        * Rows after the last multiple of 1000 are not scanned (unchanged).
        * The BAM file is closed when the function returns.
    """
    file = 'output_std_umb_contig_final.txt'
    window = 1000

    with pysam.AlignmentFile('UMB.aligned.sorted.bam') as mybam, \
            open(file, "w") as f:
        print("Index Chrom Pos Thresh Diff", file=f)
        for contig in mybam.header.references:  # Do the analysis per contig
            stats = pysamstats.load_coverage(mybam, chrom=contig)
            reads = stats.reads_pp
            position = stats.pos
            chrom = stats.chrom

            # threshold() returns an (upper, lower) pair; only the upper
            # bound is used here.
            thresh_up, _thresh_down = threshold(reads)

            in_run = False
            run_start = 0
            run_end = 0
            start = 0
            for i in range(window, len(reads), window):
                # presumably the mean of reads[start:i] -- verify against
                # mean_of_array's definition
                mean = mean_of_array(reads, start, i)
                if mean >= thresh_up:
                    if not in_run:
                        # First high window after a gap opens a new run.
                        run_start = start
                        in_run = True
                    run_end = i
                elif in_run:
                    _report_region(chrom, position, run_start, run_end, f)
                    in_run = False
                start = i

            # Flush a run that extends to the last scanned window.
            if in_run:
                _report_region(chrom, position, run_start, run_end, f)
コード例 #7
0
def CalculateCoverage(in_bamFile, in_bamLegend, in_selectTrans,
                      in_transLengthDict, in_startCodonCoorDict,
                      in_stopCodonCoorDict, in_readLengths, in_readOffset,
                      output_prefix):
    """Write raw and RPM-normalised per-position depth for selected transcripts.

    Two files are produced, ``<name>_raw_depth.txt`` and
    ``<name>_RPM_depth.txt``, where ``<name>`` is ``in_bamLegend`` optionally
    prefixed by ``output_prefix + "_"``.  Each line holds a transcript id
    followed by one tab-terminated value per row of the padded coverage
    array.  RPM values are the raw values divided by the sum of
    ``total_reads`` (third item returned by ``get_trans_frame_counts``) over
    all transcripts in ``in_startCodonCoorDict``, times 10**6.

    Only transcripts that are both references of the BAM file and members of
    ``in_selectTrans`` are written.  The number of written transcripts is
    reported on stderr.
    """
    pysamFile = pysam.AlignmentFile(in_bamFile, "rb")
    selected = set(pysamFile.references).intersection(in_selectTrans)

    base_name = in_bamLegend
    if output_prefix:
        base_name = output_prefix + "_" + in_bamLegend

    # Normalisation constant: total reads summed at transcript level.
    all_counts = 0
    for trans in in_startCodonCoorDict:
        leftCoor = int(in_startCodonCoorDict[trans]) - 1
        rightCoor = int(in_stopCodonCoorDict[trans]) - 3
        _, _, total_reads, _ = get_trans_frame_counts(
            pysamFile, trans, in_readLengths, in_readOffset,
            in_transLengthDict[trans], leftCoor, rightCoor)
        all_counts += total_reads

    written = set()
    raw_path = base_name + "_raw_depth.txt"
    rpm_path = base_name + "_RPM_depth.txt"
    with open(raw_path, 'w') as raw_out, open(rpm_path, 'w') as rpm_out:
        for trans in selected:
            coverage = pysamstats.load_coverage(pysamFile,
                                                chrom=trans,
                                                pad=True)
            # Third field of every coverage record.
            raw_depth = np.array([record[2] for record in coverage])
            rpm_depth = 10**6 * (raw_depth / all_counts)
            written.add(trans)

            prefix = "%s\t" % (trans)
            raw_fields = "".join(str(value) + "\t" for value in raw_depth)
            rpm_fields = "".join(str(value) + "\t" for value in rpm_depth)
            raw_out.write(prefix + raw_fields + "\n")
            rpm_out.write(prefix + rpm_fields + "\n")

        print("There are about " + str(len(written)) +
              " transcripts used for coverage calculation!",
              file=sys.stderr)
コード例 #8
0
def main():
    """Report positions where coverage jumps by more than a local threshold.

    Loads ``reads_all`` coverage for the whole of 'UMB.aligned.sorted.bam',
    then compares every row with its successor.  When the absolute
    difference exceeds ``thresh`` (recomputed by ``threshold(reads, i)``
    every 200 rows) a line ``index chrom pos thresh diff`` is printed to
    stdout and written to 'output9.txt'.  A progress line is printed every
    50000 rows.

    Contig names are decoded from bytes with UTF-8.  The previous
    ``str(...).lstrip("b'").rstrip("'")`` approach stripped character *sets*
    and therefore mangled names that start with ``b`` or a quote, or end
    with a quote.
    """
    # The coverage array is fully loaded, so the BAM handle can be closed
    # before the scan starts.
    with pysam.AlignmentFile('UMB.aligned.sorted.bam') as mybam:
        stats = pysamstats.load_coverage(mybam)

    reads = stats.reads_all
    position = stats.pos
    chrom = stats.chrom
    n_rows = len(reads)

    print(len(reads))
    print(len(position))
    print(len(chrom))

    with open("output9.txt", "w") as f:
        print("Index Chrom Pos Thresh Diff", file=f)
        for i in range(n_rows):
            if (i % 200) == 0:
                thresh = threshold(reads, i)
            if (i % 50000) == 0:
                print('currently: ' + str(i) + ' ' +
                      chrom[i].decode("utf-8") + ' ' +
                      str(position[i]) + ' ' +
                      str(datetime.datetime.now().time()))

            current = reads[i]
            # NOTE(review): the last row is compared with the first one
            # (wrap-around), as in the original -- double check this is
            # intended.
            next_one = reads[(i + 1) % n_rows]
            diff = next_one - current

            if abs(diff) > thresh:
                line = (str(i) + ' ' + chrom[i].decode("utf-8") + ' ' +
                        str(position[i]) + ' ' + str(thresh) + ' ' +
                        str(diff))
                print(line, file=f)
                print(line)
コード例 #9
0
ファイル: core.py プロジェクト: dridk/iscard
def get_coverage(bamfile: str,
                 chrom: str,
                 start: int,
                 end: int,
                 sample_rate=1) -> pd.DataFrame:
    """Get per-position read depth from a BAM file for one region.

    Args:
        bamfile (str): a bam file with index
        chrom (str): chromosome name
        start (int): start position of interest
        end (int): end position of interest
        sample_rate (int): keep only the rows whose index is a multiple of
            this value; the default of 1 keeps every row

    Returns:
        pd.DataFrame: A dataframe with the columns ``chrom`` (decoded to
        ``str``), ``pos`` and ``depth`` (the ``reads_all`` field of
        ``pysamstats.load_coverage``).

    The coverage is loaded with ``truncate=True`` and ``pad=True``; see the
    pysamstats documentation for the exact meaning of these flags.  The BAM
    file is closed before the function returns.
    """
    with pysam.AlignmentFile(bamfile) as mybam:
        coverage = pysamstats.load_coverage(
            mybam,
            chrom=chrom,
            start=start,
            end=end,
            truncate=True,
            pad=True,
            fields=["chrom", "pos", "reads_all"],
        )

    df = pd.DataFrame(coverage)
    # load_coverage yields contig names as bytes; expose them as text.
    df["chrom"] = df["chrom"].apply(lambda x: x.decode())
    df.rename({"reads_all": "depth"}, inplace=True, axis=1)

    # keep one row every sample_rate rows
    df = df[df.index % sample_rate == 0]

    return df
コード例 #10
0
ファイル: isclipped.py プロジェクト: sleyn/ijump
 def _av_depth(self, chrom, start, stop):
     """Return the mean ``reads_all`` depth of ``chrom:start-stop``.

     Coverage is read from ``self.aln`` with ``truncate=True`` and
     ``max_depth=300000``.  This supersedes an earlier implementation
     based on ``self.aln.count_coverage``.
     """
     coverage = pysamstats.load_coverage(
         self.aln,
         chrom=chrom,
         start=start,
         end=stop,
         truncate=True,
         max_depth=300000,
     )
     depths = coverage.reads_all
     return mean(depths)
コード例 #11
0
ファイル: checkcov.py プロジェクト: fai-k/hrp
# Read the tab-separated sample sheet. As used below, column 0 names the
# output file, column 1 is the BAM path, and column 2 is the value each exon
# mean is divided by for normalisation.
samples = []
with open(inputfile, 'r') as f:
    for line in csv.reader(f, delimiter='\t'):
        samples.append(line)

suspectdel = []
for i in range(0, len(samples)):
    bam = pysam.AlignmentFile(samples[i][1], 'rb')
    # Coverage of HRP and neighboring gene: one [mean, normalised mean] pair
    # per entry of `exons` (columns 0-2 are chrom, start, end).
    exoncov = []
    for k in range(0, len(exons)):
        ecoverage = pss.load_coverage(bam,
                                      reffile,
                                      pad=True,
                                      truncate=True,
                                      chrom=exons[k][0],
                                      start=exons[k][1],
                                      end=exons[k][2])
        exonmean = np.mean(ecoverage.reads_all)
        exoncov.append([exonmean, exonmean / float(samples[i][2])])
    # Write one per-sample report with a row for every exon.
    with open('%s/HRPcov_%s.txt' % (outfolder, samples[i][0]), 'w') as f:
        f.write(
            'chromosome\tstart\tend\tgeneID\tname\tcoverage\tnormalizedcov\n')
        for m in range(0, len(exons)):
            f.write('%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\n' %
                    (exons[m][0], exons[m][1], exons[m][2], exons[m][3],
                     exons[m][4], exoncov[m][0], exoncov[m][1]))
    # Flag suspected deletion: exons whose normalised coverage is below
    # `cutoff`. NOTE(review): the body of this check is not part of the
    # visible source.
    for n in range(0, len(exoncov)):
        if exoncov[n][1] < cutoff:
コード例 #12
0
def _depth_by_position(coverage):
    """Map each position of ``coverage`` to its ``reads_all`` value.

    If a position occurs more than once, the first occurrence wins.
    """
    depth = {}
    for pos, reads in zip(coverage.pos, coverage.reads_all):
        depth.setdefault(int(pos), reads)
    return depth


def pull_down(name, row):
    """Record one precursor row and plot its pooled read coverage.

    ``row[0]`` is prefixed with ``name`` (in place), the row is written via
    the module-level ``csvwriter`` and appended to ``rows``.  Coverage for
    the resulting contig name is loaded from ``mybam``, ``G006_bam``,
    ``G002_bam`` and ``BTIRed_bam``.  When the mature (``row[3]``) and star
    (``row[5]``) sequences contain no lower-case characters and at least one
    of the three sample BAMs has coverage, the summed ``reads_all`` depth
    over positions ``1..len(precursor)`` (``row[10]``) is drawn as a bar
    chart (mature red, star green, loop yellow), saved as SVG and shown.

    Args:
        name: Prefix prepended to ``row[0]``.
        row: Mutable sequence of fields; indices 0, 3, 5, 10 and 12 are read.

    Returns:
        The coverage array loaded from ``mybam`` for the contig.

    Raises:
        ValueError: if the mature or star sequence is not a substring of the
            precursor.

    Fixes relative to the previous version:
        * the region slices at the end were bare ``[a : b]`` expressions
          (a syntax error); they now slice ``read_counts``;
        * rows that are skipped (lower-case sequence, or no coverage) return
          early instead of reaching code that uses undefined names;
        * per-position lookups use a dict instead of ``list.index``.
    """
    row[0] = '{0}_{1}'.format(name, row[0])
    csvwriter.writerow(row)
    rows.append(row)
    chrom = row[0]
    start = 1

    mybam_lc = pysamstats.load_coverage(mybam, chrom=chrom)
    G006_bam_lc = pysamstats.load_coverage(G006_bam, chrom=chrom)
    G002_bam_lc = pysamstats.load_coverage(G002_bam, chrom=chrom)
    BTIRed_bam_lc = pysamstats.load_coverage(BTIRed_bam, chrom=chrom)

    mature = row[3]
    star = row[5]
    precursor = row[10]
    end = len(precursor)

    # Rows whose mature/star sequence contains lower-case letters are skipped.
    if any(m.islower() for m in mature) or any(s.islower() for s in star):
        return mybam_lc

    m_start = precursor.index(mature)
    m_end = m_start + len(mature)
    s_start = precursor.index(star)
    s_end = s_start + len(star)

    sample_coverages = (G006_bam_lc, G002_bam_lc, BTIRed_bam_lc)
    if not any(len(cov.pos) > 0 for cov in sample_coverages):
        # Nothing mapped in any sample: nothing to plot.
        return mybam_lc

    # NOTE(review): positions run 1..len(precursor); confirm that the
    # ``pos`` values returned by load_coverage use the same base.
    positions = np.arange(start, end + 1)
    read_counts = [0] * len(positions)
    for cov in sample_coverages:
        depth = _depth_by_position(cov)
        for idx, entry in enumerate(positions):
            read_counts[idx] += depth.get(int(entry), 0)

    plt.figure(figsize=(15, 15))
    ax = plt.subplot(1, 1, 1)
    barlist = plt.bar(positions, read_counts, width=1.0, color='black')
    # NOTE(review): m_end/s_end are already exclusive ends, so the "+ 1"
    # slices colour one extra bar -- kept as in the original, confirm.
    for bar in barlist[m_start:m_end + 1]:
        bar.set_color('red')
    for bar in barlist[s_start:s_end + 1]:
        bar.set_color('green')
    if m_start > s_start:
        for bar in barlist[s_end + 1:m_start]:
            bar.set_color('yellow')
    elif s_start > m_start:
        for bar in barlist[m_end + 1:s_start]:
            bar.set_color('yellow')

    ind = np.arange(max(read_counts) + 1)
    plt.yticks(ind, ind)
    plt.xlabel('Position')
    plt.ylabel('Reads Mapped')
    plt.title('{}\n'.format(row[0]))
    ax.spines['right'].set_visible(False)  # Removes right axis
    ax.spines['top'].set_visible(False)  # Removes top axis
    ax.yaxis.set_ticks_position('none')  # Keeps vertical ticks hidden
    ax.xaxis.set_ticks_position('bottom')  # Keeps horizontal ticks hidden on top
    plt.savefig('/nethome/mct30/mapping_svgs/{0}_plant-precursors_bowtie1_a_T__mm0_gap0_split.svg'.format(row[12]),
                bbox_inches='tight', format='svg')
    plt.show()
    plt.close('all')

    # Per-region read counts. NOTE(review): these are computed but not yet
    # returned or used; the bounds are kept exactly as originally written
    # (including ``start`` == 1 used as a list index) -- confirm before
    # relying on them.
    mature_read_counts = read_counts[m_start:m_end + 1]
    star_read_counts = read_counts[s_start:s_end + 1]
    if m_start > s_start:
        loop_read_counts = read_counts[s_end + 1:m_start]
        mature_tail_read_counts = read_counts[m_end + 1:end]
        star_tail_read_counts = read_counts[start:s_start]
    elif s_start > m_start:
        loop_read_counts = read_counts[m_end + 1:s_start]
        star_tail_read_counts = read_counts[s_end + 1:end]
        mature_tail_read_counts = read_counts[start:m_start]

    return mybam_lc