Example #1
    def test_write_to_read_grouped_sorted(self):
        write_path = './data/write_test_rg.bam'
        read_groups = set()
        with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:

            input_header = f.header.as_dict()
            write_program_tag(input_header,
                program_name='test_bam_util_test1',
                command_line=' '.join(sys.argv),
                version=singlecellmultiomics.__version__,
                description='a description'
                )

            write_program_tag(input_header,
                program_name='test_bam_util_test2',
                command_line=' '.join(sys.argv),
                version=singlecellmultiomics.__version__,
                description='a description'
                )
            with sorted_bam_file(write_path, header=input_header, read_groups=read_groups) as out:
                for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=f,
                    molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                    fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                    fragment_class_args={'umi_hamming_distance':0},
                    pooling_method=0,
                    yield_invalid=True
                ):
                    molecule.write_pysam(out)
                    for frag in molecule:
                        read_groups.add(frag.get_read_group())

        self.assertTrue(os.path.exists(write_path))

        # Now test if the program tag is there...
        with pysam.AlignmentFile(write_path) as f:
            self.assertEqual(1, len([x for x in f.header['PG'] if 'test_bam_util_test1' in x.get('PN', '')]))
            self.assertEqual(1, len([x for x in f.header['PG'] if 'test_bam_util_test2' in x.get('PN', '')]))
            i = 0

            # Test if the file has reads.
            for read in f:
                if read.is_read1:
                    i += 1
            self.assertEqual(i, 293)

        try:
            os.remove(write_path)
        except OSError:
            pass

        try:
            os.remove(write_path + '.bai')
        except OSError:
            pass
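
Distilled from the test above, the core write pattern looks roughly like this (a minimal sketch; the paths and program name are hypothetical, and sorted_bam_file / write_program_tag are assumed to be importable from singlecellmultiomics.bamProcessing.bamFunctions as exercised by these tests):

import sys

import pysam
import singlecellmultiomics
from singlecellmultiomics.bamProcessing.bamFunctions import (
    sorted_bam_file, write_program_tag)

read_groups = set()  # filled while writing; emitted into the output header on close
with pysam.AlignmentFile('input.bam') as fin:  # hypothetical input path
    header = fin.header.as_dict()
    write_program_tag(header,
                      program_name='my_tool',  # hypothetical program name
                      command_line=' '.join(sys.argv),
                      version=singlecellmultiomics.__version__,
                      description='a description')
    # sorted_bam_file sorts, writes and indexes the output on context exit:
    with sorted_bam_file('output.bam', header=header, read_groups=read_groups) as out:
        for read in fin:
            out.write(read)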
Example #2
def run_tagging_tasks(args: tuple):
    """ Run tagging for one or more tasks

    Args:
        args (tuple): (alignments_path, temp_dir, timeout_time), arglist

    """

    (alignments_path, temp_dir, timeout_time), arglist = args

    target_file = f"{temp_dir}/{uuid4()}.bam"

    timeout_tasks = []
    total_molecules = 0
    read_groups = dict()

    with AlignmentFile(alignments_path) as alignments:
        with sorted_bam_file(target_file, origin_bam=alignments, mode='wb', fast_compression=False,
                             read_groups=read_groups) as output:
            for task in arglist:
                try:
                    statistics = run_tagging_task(alignments, output, read_groups=read_groups, timeout_time=timeout_time, **task)
                    total_molecules += statistics.get('total_molecules_written', 0)
                except TimeoutError:
                    timeout_tasks.append(task)


    meta = {
        'timeout_tasks' : timeout_tasks,
        'total_molecules' : total_molecules,
    }

    if total_molecules > 0:
        return target_file, meta
    else:
        # No molecules were written; clean up the empty output file
        try:
            remove(target_file)
            remove(f'{target_file}.bai')
        except Exception as e:
            print(f'Cleaning up failed for {target_file}')
            print(e)

    return None, meta
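
A sketch of how the nested tuple described in the docstring might be assembled and consumed (the paths, timeout and task contents are hypothetical; run_tagging_task and the task keyword arguments are defined elsewhere in this module):

settings = ('input.bam', '/tmp/tagging', 300)     # (alignments_path, temp_dir, timeout_time)
tasks = [{'contig': 'chr1'}, {'contig': 'chr2'}]  # hypothetical task kwargs for run_tagging_task

target_file, meta = run_tagging_tasks((settings, tasks))
if target_file is not None:
    print(target_file, meta['total_molecules'])
if meta['timeout_tasks']:
    print('tasks to retry:', meta['timeout_tasks'])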
Example #3
    def test_write_to_sorted_custom_compression(self):
        write_path = './data/write_test.bam'
        with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
            with sorted_bam_file(write_path, origin_bam=f, fast_compression=True) as out:
                for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=f,
                    molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                    fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                    fragment_class_args={'umi_hamming_distance':0},
                    pooling_method=0,
                    yield_invalid=True
                ):
                    molecule.write_pysam(out)

        self.assertTrue(os.path.exists(write_path))
        try:
            os.remove(write_path)
            os.remove(write_path + '.bai')
        except OSError:
            pass
Example #4
    def test_write_to_sorted_non_existing_folder(self):
        write_folder = './data/non_yet_existing_folder/'
        write_path = write_folder + 'write_test.bam'
        if os.path.exists(write_path):
            os.remove(write_path)

        rmtree(write_folder, ignore_errors=True)

        with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
            with sorted_bam_file(write_path, origin_bam=f) as out:
                for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                        alignments=f,
                        molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                        fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                        fragment_class_args={'umi_hamming_distance': 0},
                        pooling_method=0,
                        yield_invalid=True):
                    molecule.write_pysam(out)

        self.assertTrue(os.path.exists(write_path))

        with pysam.AlignmentFile(write_path) as f:
            i = 0
            # Test if the file has reads.
            for read in f:
                if read.is_read1:
                    i += 1
            self.assertEqual(i, 293)

        try:
            os.remove(write_path)
            os.remove(write_path + '.bai')
        except OSError:
            pass

        rmtree(write_folder, ignore_errors=True)
Example #5
def main():
    parser = argparse.ArgumentParser(
        description=
        'Dual signal unmixing: uses a per-cell, per-bin probability matrix (the probability that a read in a bin is assigned to signal 1) together with a bam file to split the bam file into signal 1 and signal 2'
    )
    parser.add_argument('-inbam', metavar='INFILE', help='Input bam file')
    parser.add_argument(
        '-inprobmat',
        metavar='INFILE',
        help=
        'Tab separated matrix file. Columns are cell names (first column is ""). Rows are genomic bins. Values are the probability that reads in the bin are assigned to mark1.'
    )
    parser.add_argument(
        '-outdir',
        metavar='OUTDIR',
        help='Output directory for bams. Full name to be specified in script')
    parser.add_argument('-mapq',
                        metavar='INTEGER 0 to 60',
                        default=0,
                        type=int,
                        help='Minimum quality of read to be considered')
    parser.add_argument(
        '-binsize',
        metavar='Genomic binsize',
        default=50000,
        type=int,
        help=
        'Binsize of genomic bins to consider (assumes row names are defined by nearest 50kb bins)'
    )
    parser.add_argument(
        '--interpolation',
        action='store_true',
        help=
        'Makes a linear interpolation of the bins in your probability matrix (no interpolation across chromosomes).'
    )
    parser.add_argument('--quiet',
                        '-q',
                        action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('--logfile',
                        '-l',
                        metavar='LOGFILE',
                        default=None,
                        help='Write arguments to logfile')
    args = parser.parse_args()

    # store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    ARG_INPUTS = ' '.join('%s=%s' % (key, val) for key, val in args_dic.items())

    # Print arguments supplied by user
    if not args.quiet:
        if args.logfile is not None:
            sys.stdout = open(args.logfile, "w+")
        print(datetime.datetime.now().strftime('Code output on %c'))
        print('Command line inputs:')
        print(CMD_INPUTS)
        print('Argparse variables:')
        print(ARG_INPUTS)

    p = pd.read_csv(args.inprobmat, sep="\t", index_col=0)

    def parse_bin_name(binname):
        chrname, coords = binname.split(':')
        start, end = coords.split('-')
        return chrname, int(start), int(end)
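    # For example (hypothetical bin name):
    #   parse_bin_name('1:150000-200000') -> ('1', 150000, 200000)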

    if not args.interpolation:
        prob = p
    else:

        def interpolate_prob_mat(p):

            new_rows = []
            # p.index holds (contig, start, end) tuples, parsed before this call:
            for binA, binB in windowed(p.index, 2):

                if binA[0] != binB[0]:
                    continue

                if binA[2] > binB[1]:
                    raise ValueError('The input is not sorted')

                contig = binA[0]

                binSize = binA[2] - binA[1]

                new_rows.append(p.loc[binA, :])

                # Interpolate over the gap between the end of bin A and the start of bin B:
                start, end = binA[2], binB[1]

                for new_bin_start in range(binA[2], binB[1], binSize):

                    new_bin_end = new_bin_start + binSize
                    new_bin_centroid = new_bin_start + binSize * 0.5

                    # Linear interpolation, for every cell, between the two
                    # flanking bins: value = p_A + (p_B - p_A) * d / dx
                    dx = end - start
                    d = new_bin_centroid - start
                    dy = p.loc[binB, :] - p.loc[binA, :]

                    interpolated = (dy / dx) * d + p.loc[binA, :]
                    interpolated.name = (contig, new_bin_start, new_bin_end)

                    new_rows.append(interpolated)

            prob = pd.DataFrame(new_rows)

            prob.index = [
                f'{contig}:{start}-{end}'
                for contig, start, end in prob.index
            ]

            return prob

        p.index = pd.MultiIndex.from_tuples(
            [parse_bin_name(t) for t in p.index])
        p = p.sort_index()

        prob = interpolate_prob_mat(p)

        prob.to_csv(os.path.join(args.outdir,
                                 "probabilityMatrix_linearInterpolated.csv"),
                    sep='\t')

    #==========End interpolation============================================

    prob.index = pd.MultiIndex.from_tuples(
        [parse_bin_name(t.replace('chr', '')) for t in prob.index])
    prob.index.set_names(["chr", "start", "end"], inplace=True)

    bamFile = args.inbam
    wrote = 0

    infboth = os.path.join(args.outdir, "both.bam")
    infA = os.path.join(args.outdir, "splitted_A.bam")
    infB = os.path.join(args.outdir, "splitted_B.bam")

    with pysam.AlignmentFile(bamFile) as f:
        with sorted_bam_file(infboth, origin_bam=f) as both, \
             sorted_bam_file(infA, origin_bam=f) as a, \
             sorted_bam_file(infB, origin_bam=f) as b:
            for readId, (R1, R2) in enumerate(pysamiterators.MatePairIterator(f)):
                # Skip the pair only when both mates fail the MAPQ threshold;
                # a single mate with sufficient MAPQ is enough:
                if R1.mapping_quality < args.mapq and R2.mapping_quality < args.mapq:
                    continue

                if R1.is_duplicate:
                    continue

                bin_start, bin_end = coordinate_to_bins(
                    R1.get_tag('DS'), args.binsize, args.binsize)[0]

                # Obtain the probability for this read's bin and cell:
                bin_name = (R1.reference_name, bin_start, bin_end)
                if bin_name not in prob.index:
                    continue
                if R1.get_tag('SM') not in prob.columns:
                    continue
                p = prob.loc[bin_name, R1.get_tag('SM')]
                wrote += 1
                group = 'A' if np.random.random() <= p else 'B'
                R1.set_tag('Gr', group)
                R2.set_tag('Gr', group)
                if group == 'A':
                    a.write(R1)
                    a.write(R2)
                else:
                    b.write(R1)
                    b.write(R2)
                both.write(R1)
                both.write(R2)
    print("Number of reads written:" + str(wrote))
Example #6
def main():
    parser = argparse.ArgumentParser(
        description=
        'After downstream unmixing we obtain a per-cell, per-bin probability matrix (the probability that a read in a bin is assigned to mark1). Use this probability matrix with a bam file to split the bam file into mark1 and mark2'
    )
    parser.add_argument('-inbam', metavar='INFILE', help='Input bam file')
    # /hpc/hub_oudenaarden/jyeung/data/dblchic/double_staining_output_downstream/unfixed_louvain2/SplitReads/MF_BM_unfixed_louvain2_clstr_by_louvain_K4m1_K27m3.removeNA_FALSE-prob_mat.K4m1-K27m3_to_K4m1.txt
    parser.add_argument(
        '-inprobmat',
        metavar='INFILE',
        help=
        'Tab separated matrix file. Columns are cell names (first column is ""). Rows are genomic bins. Values are the probability that reads in the bin are assigned to mark1.'
    )
    parser.add_argument(
        '-outdir',
        metavar='OUTDIR',
        help='Output directory for bams. Full name to be specified in script')
    parser.add_argument('-mapq',
                        metavar='INTEGER 0 to 60',
                        default=40,
                        type=int,
                        help='Minimum quality of read to be considered')
    parser.add_argument(
        '-binsize',
        metavar='Genomic binsize',
        default=50000,
        type=int,
        help=
        'Binsize of genomic bins to consider (assumes row names are defined by nearest 50kb bins)'
    )
    parser.add_argument('--quiet',
                        '-q',
                        action='store_true',
                        help='Suppress some print statements')
    parser.add_argument('--logfile',
                        '-l',
                        metavar='LOGFILE',
                        default=None,
                        help='Write arguments to logfile')
    args = parser.parse_args()

    # store command line arguments for reproducibility
    CMD_INPUTS = ' '.join(['python'] + sys.argv)  # easy printing later
    # store argparse inputs for reproducibility / debugging purposes
    args_dic = vars(args)
    ARG_INPUTS = ' '.join('%s=%s' % (key, val) for key, val in args_dic.items())

    # Print arguments supplied by user
    if not args.quiet:
        if args.logfile is not None:
            sys.stdout = open(args.logfile, "w+")
        print(datetime.datetime.now().strftime('Code output on %c'))
        print('Command line inputs:')
        print(CMD_INPUTS)
        print('Argparse variables:')
        print(ARG_INPUTS)

    prob = pd.read_csv(args.inprobmat, sep="\t")

    # Row names look like 'chr1:3000000-3050000'; split them into chr, start, end:
    new = prob["Unnamed: 0"].str.split(':|-', n=3, expand=True)

    prob['chr'] = new[0]
    prob['start'] = new[1]
    prob['end'] = new[2]

    prob.set_index(['chr', 'start', 'end'], inplace=True)

    prob.drop(["Unnamed: 0"], axis=1, inplace=True)

    bamFile = args.inbam
    wrote = 0

    infboth = os.path.join(args.outdir, "both.bam")
    infA = os.path.join(args.outdir, "splitted_A.bam")
    infB = os.path.join(args.outdir, "splitted_B.bam")

    with pysam.AlignmentFile(bamFile) as f:
        with sorted_bam_file(infboth, origin_bam=f) as both, \
             sorted_bam_file(infA, origin_bam=f) as a, \
             sorted_bam_file(infB, origin_bam=f) as b:
            for readId, (R1, R2) in enumerate(pysamiterators.MatePairIterator(f)):
                # Skip the pair only when both mates fail the MAPQ threshold;
                # a single mate with sufficient MAPQ is enough:
                if R1.mapping_quality < args.mapq and R2.mapping_quality < args.mapq:
                    continue

                if R1.is_duplicate:
                    continue

                bin_start, bin_end = coordinate_to_bins(
                    R1.get_tag('DS'), args.binsize, args.binsize)[0]
                # Obtain the probability; start/end are stored as strings here:
                bin_name = (f'chr{R1.reference_name}', str(bin_start),
                            str(bin_end))
                if bin_name not in prob.index:
                    continue
                if R1.get_tag('SM') not in prob.columns:
                    continue
                p = prob.loc[bin_name, R1.get_tag('SM')]
                wrote += 1
                group = 'A' if np.random.random() <= p else 'B'
                R1.set_tag('Gr', group)
                R2.set_tag('Gr', group)
                if group == 'A':
                    a.write(R1)
                    a.write(R2)
                else:
                    b.write(R1)
                    b.write(R2)
                both.write(R1)
                both.write(R2)
    print(f"Number of reads written: {wrote}")
Example #7
    def obtain_conversions(contig: str):
        """ Create conversion dictionary for the suppled contig

        Args:
            contig (str)

        Returns:
            conversions_per_library (defaultdict(conversion_dict_stranded)) : per-library conversion dictionary
            n_molecules_per_library (Counter) : observed molecules per library
            contig (str) : the contig passed to the method
            temp_bam_path (str) : path to the tagged bam file, tagged with gene annotations and 4su mutation count

        """

        conversions_per_library = defaultdict(conversion_dict_stranded)
        n_molecules_per_library = Counter()

        from singlecellmultiomics.molecule import might_be_variant

        # Create temp directory to write tagged bam file to:
        temp_dir = args.temp_dir
        temp_bam_path = f'{temp_dir}/{contig}.bam'
        os.makedirs(temp_dir, exist_ok=True)

        # Load gene annotations for the selected contig:
        transcriptome_features = FeatureContainer()
        transcriptome_features.loadGTF(path=exons_gtf_path,
                                       select_feature_type=['exon'],
                                       identifierFields=('exon_id', 'gene_id'),
                                       store_all=True,
                                       contig=contig,
                                       head=None)

        transcriptome_features.loadGTF(path=introns_gtf_path,
                                       select_feature_type=['intron'],
                                       identifierFields=['transcript_id'],
                                       store_all=True,
                                       contig=contig,
                                       head=None)

        colormap = plt.get_cmap('RdYlBu_r')
        colormap.set_bad((0, 0, 0))

        read_groups = {}
        try:
            with pysam.AlignmentFile(single_cell_bam_path, threads=4) as alignments, \
                 pysam.VariantFile(known_vcf_path) as known, \
                 sorted_bam_file(temp_bam_path, origin_bam=single_cell_bam_path, read_groups=read_groups, fast_compression=True) as out, \
                 pysam.FastaFile(reference_path) as reference_handle:

                # Cache the sequence of the contig: (faster)
                reference = CachedFasta(reference_handle)

                for n_molecules, molecule in enumerate(
                        MoleculeIterator(alignments,
                                         TranscriptMolecule,
                                         SingleEndTranscriptFragment,
                                         fragment_class_args={
                                             'stranded': True,
                                             'features': transcriptome_features
                                         },
                                         molecule_class_args={
                                             'reference': reference,
                                             'features':
                                             transcriptome_features,
                                             'auto_set_intron_exon_features':
                                             True
                                         },
                                         contig=contig)):
                    # Read out mut spectrum
                    consensus = molecule.get_consensus()
                    if args.R2_based:
                        molecule.strand = not molecule.strand  # Invert because the strand is R2-based.
                    n_molecules_per_library[molecule.library] += 1

                    n_4su_mutations = 0
                    n_4su_contexts = 0

                    for (chrom, pos), base in consensus.items():
                        context = reference.fetch(chrom, pos - 1,
                                                  pos + 2).upper()
                        if len(context) != 3:
                            continue

                        if ((context[1] == 'A' and not molecule.strand)
                                or (context[1] == 'T' and molecule.strand)):
                            n_4su_contexts += 1

                        # Skip when the base matches the reference or the context contains N:
                        if context[1] == base or 'N' in context:
                            continue

                        # Ignore germline variants:
                        if might_be_variant(chrom, pos, known):
                            continue

                        if not molecule.strand:  # reverse template
                            context = reverse_complement(context)
                            base = complement(base)

                        # Count 4SU specific mutations, and write to molecule later
                        if context[1] == 'T' and base == 'C':
                            n_4su_mutations += 1

                        conversions_per_library[molecule.library][(context, base)] += 1

                    # Write 4su modification to molecule
                    molecule.set_meta('4S', n_4su_mutations)
                    molecule.set_meta('4c', n_4su_contexts)
                    # Set read color based on conversion rate:

                    try:
                        # The max color value corresponds to a 10% modification rate
                        cfloat = colormap(
                            np.clip(10 * (n_4su_mutations / n_4su_contexts), 0,
                                    1))[:3]
                    except ZeroDivisionError:
                        # No 4su-informative contexts observed: fall back to the 'bad' color
                        cfloat = colormap._rgba_bad[:3]
                    molecule.set_meta(
                        'YC', '%s,%s,%s' % tuple(
                            int(x * 255) for x in cfloat))
                    molecule.write_tags()

                    for fragment in molecule:
                        rgid = fragment.get_read_group()
                        if rgid not in read_groups:
                            read_groups[rgid] = fragment.get_read_group(True)[1]

                    # Write tagged molecule to output file
                    molecule.write_pysam(out)

        except KeyboardInterrupt:
            # This allows you to cancel the analysis (CTRL+C) and get the current result
            pass

        return conversions_per_library, n_molecules_per_library, contig, temp_bam_path
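
obtain_conversions closes over the paths and settings defined in the enclosing script, so a driver in that same scope can map it over the contigs of interest; a minimal sequential sketch (contig names hypothetical):

results = [obtain_conversions(contig) for contig in ('1', '2', 'X')]
for conversions_per_library, n_molecules_per_library, contig, temp_bam_path in results:
    print(contig, sum(n_molecules_per_library.values()), temp_bam_path)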