Ejemplo n.º 1
0
    def test_write_to_read_grouped_sorted(self):
        """Write molecules to a sorted BAM with program tags and read groups.

        Reads ``./data/mini_nla_test.bam``, appends two program entries to a
        copy of its header with ``write_program_tag`` and writes every
        molecule (invalid ones included) through ``sorted_bam_file``.
        Afterwards the output must exist, each program name must occur
        exactly once in the ``PG`` header section and the file must hold 293
        read-1 alignments. The output BAM and its index are removed even when
        an assertion fails.
        """
        write_path = './data/write_test_rg.bam'
        program_names = ('test_bam_util_test1', 'test_bam_util_test2')
        # This exact set object is handed to sorted_bam_file before any read
        # group is known and is filled while molecules are written, so it
        # must be mutated in place and never rebound.
        read_groups = set()

        try:
            with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
                input_header = f.header.as_dict()
                for program_name in program_names:
                    write_program_tag(
                        input_header,
                        program_name=program_name,
                        command_line=" ".join(sys.argv),
                        version=singlecellmultiomics.__version__,
                        description='a description')

                with sorted_bam_file(write_path,
                                     header=input_header,
                                     read_groups=read_groups) as out:
                    for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                            alignments=f,
                            molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                            fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                            fragment_class_args={'umi_hamming_distance': 0},
                            pooling_method=0,
                            yield_invalid=True):
                        molecule.write_pysam(out)
                        for frag in molecule:
                            read_groups.add(frag.get_read_group())

            self.assertTrue(os.path.exists(write_path))

            with pysam.AlignmentFile(write_path) as f:
                # Each program tag must be present exactly once.
                for program_name in program_names:
                    matching_entries = [
                        entry for entry in f.header['PG']
                        if program_name in entry.get('PN', '')]
                    self.assertEqual(len(matching_entries), 1)

                # Test if the file has reads.
                read1_count = sum(1 for read in f if read.is_read1)
                self.assertEqual(read1_count, 293)
        finally:
            # Best-effort cleanup: either file may be missing, and a failure
            # to remove one must not prevent removal of the other.
            for leftover in (write_path, write_path + '.bai'):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
Ejemplo n.º 2
0
    def test_write_to_sorted_custom_compression(self):
        """Write molecules to a sorted BAM with ``fast_compression=True``.

        Every molecule from ``./data/mini_nla_test.bam`` (invalid ones
        included) is written through ``sorted_bam_file``; the test passes
        when the output file exists afterwards. The output BAM and its index
        are removed even when the assertion fails.
        """
        write_path = './data/write_test.bam'
        try:
            with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
                with sorted_bam_file(write_path,
                                     origin_bam=f,
                                     fast_compression=True) as out:
                    for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                            alignments=f,
                            molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                            fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                            fragment_class_args={'umi_hamming_distance': 0},
                            pooling_method=0,
                            yield_invalid=True):
                        molecule.write_pysam(out)

            self.assertTrue(os.path.exists(write_path))
        finally:
            # Best-effort cleanup. The files are removed independently so a
            # missing BAM does not leave a stale index behind.
            for leftover in (write_path, write_path + '.bai'):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
Ejemplo n.º 3
0
    def test_write_to_sorted_non_existing_folder(self):
        """Write a sorted BAM into a folder that does not exist yet.

        The target folder is removed up front so ``sorted_bam_file`` is
        called with a path whose parent directory is missing. After writing
        every molecule (invalid ones included) the output must exist and
        hold 293 read-1 alignments. The output files and the folder are
        removed even when an assertion fails.
        """
        write_folder = './data/non_yet_existing_folder/'
        write_path = write_folder + 'write_test.bam'

        # Start from a clean slate: no output file and no output folder.
        if os.path.exists(write_path):
            os.remove(write_path)
        rmtree(write_folder, ignore_errors=True)

        try:
            with pysam.AlignmentFile('./data/mini_nla_test.bam') as f:
                with sorted_bam_file(write_path, origin_bam=f) as out:
                    for molecule in singlecellmultiomics.molecule.MoleculeIterator(
                            alignments=f,
                            molecule_class=singlecellmultiomics.molecule.NlaIIIMolecule,
                            fragment_class=singlecellmultiomics.fragment.NlaIIIFragment,
                            fragment_class_args={'umi_hamming_distance': 0},
                            pooling_method=0,
                            yield_invalid=True):
                        molecule.write_pysam(out)

            self.assertTrue(os.path.exists(write_path))

            with pysam.AlignmentFile(write_path) as f:
                # Test if the file has reads.
                read1_count = sum(1 for read in f if read.is_read1)
                self.assertEqual(read1_count, 293)
        finally:
            # Best-effort cleanup. The files are removed independently so a
            # missing BAM does not leave a stale index behind.
            for leftover in (write_path, write_path + '.bai'):
                try:
                    os.remove(leftover)
                except OSError:
                    pass
            rmtree(write_folder, ignore_errors=True)
Ejemplo n.º 4
0
        for i, molecule in enumerate(
                singlecellmultiomics.molecule.MoleculeIterator(
                    alignments=alignments,
                    moleculeClass=moleculeClass,
                    yield_invalid=(output is not None),
                    fragmentClass=fragmentClass,
                    fragment_class_args={'umi_hamming_distance': 1},
                    molecule_class_args=molecule_class_args,
                    contig=args.contig)):

            if args.head and (i - 1) >= args.head:
                break

            if not molecule.is_valid(set_rejection_reasons=True):
                if output is not None:
                    molecule.write_pysam(output)
                continue

            # Skip sample if not selected
            if samples is not None and molecule.sample not in samples:
                molecule.set_rejection_reason('sample_not_selected')
                if output is not None:
                    molecule.write_pysam(output)
                continue

            for (chromosome,
                 location), call in molecule.methylation_call_dict.items():
                if call['context'] == '.':  # Only print calls concerning C's
                    continue

                # Skip non-selected contexts
def _danio_contig_mapping():
    """Return the contig-name table '1'..'25' -> 'CM002885.2'..'CM002909.2'."""
    # The accession numbers are consecutive: key N maps to CM<2884 + N>.2,
    # zero padded to six digits.
    return {str(number): f'CM{2884 + number:06d}.2' for number in range(1, 26)}


def _add_feature_counts(counts_per_cell, sample, feature_ids, allele,
                        count_to_add, gene_set):
    """Add ``count_to_add`` to ``counts_per_cell[sample]`` for each feature id.

    When ``allele`` is not None the key becomes ``f'{allele}_{feature_id}'``.
    Every key that received a count is also recorded in ``gene_set``.
    """
    for feature_id in feature_ids:
        key = feature_id if allele is None else f'{allele}_{feature_id}'
        counts_per_cell[sample][key] += count_to_add
        gene_set.add(key)


def count_transcripts(cargs):
    """Count gene, exon, intron and junction hits per cell for one contig.

    Every valid molecule contributes a total weight of one to the gene
    counts: the weight is split evenly over all genes it was assigned to.
    The same per-gene fraction is added for each intron, exon and junction
    of the molecule. When ``args.producebam`` is set, all molecules (valid or
    not) are also written to a BAM file which is sorted and indexed at the
    end.

    Args:
        cargs: Tuple ``(args, contig)``. ``args`` is the argument namespace;
            the attributes read here are ``alleles``, ``loadAllelesToMem``,
            ``contigmapping``, ``gtfexon``, ``gtfintron``, ``hf``,
            ``method``, ``producebam``, ``o``, ``alignmentfiles``, ``ref``,
            ``minmq``, ``umi_hamming_distance``, ``annotmethod`` and
            ``head``. ``contig`` is handed to the GTF loader and to the
            molecule iterator.

    Returns:
        tuple: ``(gene_set, sample_set, gene_counts_per_cell,
        junction_counts_per_cell, exon_counts_per_cell,
        intron_counts_per_cell, annotated_molecules, read_molecules,
        contig)``. The ``*_counts_per_cell`` values map
        cell -> feature -> count, ``annotated_molecules`` is the number of
        molecules that received at least one gene and ``read_molecules`` is
        the number of molecules yielded by the iterators.

    Raises:
        ValueError: If ``args.method`` is not ``'nla'``, ``'vasa'`` or
            ``'cs'``.
    """
    args, contig = cargs

    # What is used for assignment of molecules? Checked first so an invalid
    # method fails before the GTF files are loaded.
    if args.method == 'nla':
        moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment
        pooling_method = 1  # all data from the same cell can be dealt with separately
        stranded = None  # data is not stranded
    elif args.method in ('vasa', 'cs'):
        moleculeClass = singlecellmultiomics.molecule.VASA
        fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript
        pooling_method = 1
        stranded = 1  # data is stranded, mapping to other strand
    else:
        raise ValueError("Supply a valid method")

    allele_resolver = None
    if args.alleles is not None:
        allele_resolver = alleleTools.AlleleResolver(
            args.alleles, lazyLoad=(not args.loadAllelesToMem))

    # The original code reset contig_mapping to None right after building
    # the danio table, which silently disabled the option.
    # NOTE(review): enabling it assigns the table to features.remapKeys;
    # confirm the remap direction against FeatureContainer.
    contig_mapping = None
    if args.contigmapping == 'danio':
        contig_mapping = _danio_contig_mapping()

    # Load features
    features = singlecellmultiomics.features.FeatureContainer()
    if contig_mapping is not None:
        features.remapKeys = contig_mapping
    features.loadGTF(
        args.gtfexon,
        select_feature_type=['exon'],
        identifierFields=('exon_id', 'transcript_id'),
        store_all=True,
        head=args.hf,
        contig=contig)
    features.loadGTF(
        args.gtfintron,
        select_feature_type=['intron'],
        identifierFields=['transcript_id'],
        store_all=True,
        head=args.hf,
        contig=contig)

    # COUNT: all tables are cell->gene->umiCount
    exon_counts_per_cell = collections.defaultdict(collections.Counter)
    intron_counts_per_cell = collections.defaultdict(collections.Counter)
    junction_counts_per_cell = collections.defaultdict(collections.Counter)
    gene_counts_per_cell = collections.defaultdict(collections.Counter)

    gene_set = set()
    sample_set = set()
    annotated_molecules = 0
    read_molecules = 0

    ref = None
    if args.ref is not None:
        ref = pysamiterators.iterators.CachedFasta(pysam.FastaFile(args.ref))

    output_bam = None
    if args.producebam:
        bam_path_produced = f'{args.o}/output_bam_{contig}.unsorted.bam'
        # The header of the first input file is reused for the output.
        with pysam.AlignmentFile(args.alignmentfiles[0]) as alignments:
            output_bam = pysam.AlignmentFile(
                bam_path_produced, "wb", header=alignments.header)

    try:
        for alignmentfile_path in args.alignmentfiles:
            molecules_seen = 0
            with pysam.AlignmentFile(alignmentfile_path) as alignments:
                molecule_iterator = MoleculeIterator(
                    alignments=alignments,
                    check_eject_every=5000,
                    moleculeClass=moleculeClass,
                    molecule_class_args={
                        'features': features,
                        'stranded': stranded,
                        'min_max_mapping_quality': args.minmq,
                        'reference': ref,
                        'allele_resolver': allele_resolver
                    },
                    fragmentClass=fragmentClass,
                    fragment_class_args={
                        'umi_hamming_distance': args.umi_hamming_distance,
                        'R1_primer_length': 4,
                        'R2_primer_length': 6},
                    # when the reads have not been tagged yet, this flag is
                    # very much required
                    perform_qflag=True,
                    pooling_method=pooling_method,
                    contig=contig
                )

                for i, molecule in enumerate(molecule_iterator):
                    # i is the zero based index, so i + 1 molecules were read.
                    molecules_seen = i + 1

                    if not molecule.is_valid():
                        if output_bam is not None:
                            molecule.write_tags()
                            molecule.write_pysam(output_bam)
                        continue

                    molecule.annotate(args.annotmethod)
                    molecule.set_intron_exon_features()

                    if output_bam is not None:
                        molecule.write_tags()
                        molecule.write_pysam(output_bam)

                    allele = None
                    if allele_resolver is not None:
                        allele = molecule.allele
                        if allele is None:
                            allele = 'noAllele'

                    total_count_for_molecule = len(molecule.genes)
                    if total_count_for_molecule == 0:
                        continue  # we didn't find any gene counts

                    # Distribute the count over the amount of gene hits so
                    # the gene counts of one molecule sum to 1:
                    count_to_add = 1 / total_count_for_molecule
                    _add_feature_counts(
                        gene_counts_per_cell, molecule.sample,
                        molecule.genes, allele, count_to_add, gene_set)
                    # At least one gene is present at this point, so the
                    # sample is always recorded.
                    sample_set.add(molecule.get_sample())

                    # Obtain introns/exons/splice junction information:
                    _add_feature_counts(
                        intron_counts_per_cell, molecule.sample,
                        molecule.introns, allele, count_to_add, gene_set)
                    _add_feature_counts(
                        exon_counts_per_cell, molecule.sample,
                        molecule.exons, allele, count_to_add, gene_set)
                    _add_feature_counts(
                        junction_counts_per_cell, molecule.sample,
                        molecule.junctions, allele, count_to_add, gene_set)

                    annotated_molecules += 1
                    if args.head and (i + 1) > args.head:
                        print(
                            f"-head was supplied, {i} molecules discovered, stopping")
                        break

            read_molecules += molecules_seen
    finally:
        # Close the output even when iteration fails, so the handle is not
        # leaked and buffered data is flushed.
        if output_bam is not None:
            output_bam.close()

    if output_bam is not None:
        final_bam_path = bam_path_produced.replace('.unsorted', '')
        sort_and_index(bam_path_produced, final_bam_path, remove_unsorted=True)

    return (
        gene_set,
        sample_set,
        gene_counts_per_cell,
        junction_counts_per_cell,
        exon_counts_per_cell,
        intron_counts_per_cell,
        annotated_molecules,
        read_molecules,
        contig
    )
def run_multiome_tagging(args):
    """
    Run multiome tagging adds molecule information

    Arguments:

        bamin (str) : bam file to process

        o(str) : path to output bam file

        method(str): Protocol to tag, select from:nla, qflag, chic, nla_transcriptome, vasa, cs, nla_taps ,chic_taps, nla_no_overhang, scartrace

        qflagger(str): Query flagging algorithm to use, this algorithm extracts UMI and sample information from your reads. When no query flagging algorithm is specified, the `singlecellmultiomics.universalBamTagger.universalBamTagger.QueryNameFlagger` is used

        method(str) : Method name, what kind of molecules need to be extracted. Select from:
            nla (Data with digested by Nla III enzyme)
            qflag (Only add basic tags like sample and UMI, no molecule assignment)
            chic (Data digested using mnase fusion)
            nla_transcriptome (Data with transcriptome and genome digested by Nla III )
            vasa (VASA transcriptomic data)
            cs (CELseq data, 1 and 2)
            cs_feature_counts (Single end, deduplicate using a bam file tagged using featurecounts, deduplicates a umi per gene)
            fl_feature_counts (deduplicate using a bam file tagged using featurecounts, deduplicates based on fragment position)
            nla_taps (Data with digested by Nla III enzyme and methylation converted by TAPS)
            chic_taps (Data with digested by mnase enzyme and methylation converted by TAPS)
            chic_nla
            scartrace  (lineage tracing protocol)


        custom_flags(str): Arguments passed to the query name flagger, comma separated "MI,RX,bi,SM"

        ref(str) : Path to reference fasta file, autodetected from bam header when not supplied

        umi_hamming_distance(int) : Max hamming distance on UMI's

        head (int) : Amount of molecules to process

        contig (str) : only process this contig

        region_start(int) : Zero based start coordinate of single region to process

        region_end(int) : Zero based end coordinate of single region to process, None: all contigs when contig is not set, complete contig when contig is set.

        alleles (str) : path to allele VCF

        allele_samples(str): Comma separated samples to extract from the VCF file. For example B6,SPRET

        unphased_alleles(str) : Path to VCF containing unphased germline SNPs

        mapfile (str) : 'Path to \*.safe.bgzf file, used to decide if molecules are uniquely mappable, generate one using createMapabilityIndex.py

        annotmethod (int) : Annotation resolving method. 0: molecule consensus aligned blocks. 1: per read per aligned base

        cluster (bool) : Run contigs in separate cluster jobs

        resolve_unproperly_paired_reads(bool) : When enabled bamtagmultiome will look through the complete bam file in a hunt for the mate, the two mates will always end up in 1 molecule if both present in the bam file. This also works when the is_proper_pair bit is not set. Use this option when you want to find the breakpoints of genomic re-arrangements.

        no_rejects(bool) : Do not write rejected reads

        mem (int) : Amount of gigabytes to request for cluster jobs

        time(int) : amount of wall clock hours to request for cluster jobs

        exons(str): Path to exon annotation GTF file

        introns(str): Path to intron annotation GTF file

        consensus(bool) : Calculate molecule consensus read, this feature is _VERY_ experimental

        consensus_model(str) : Path to consensus calling model, when none specified, this is learned based on the supplied bam file, ignoring sites supplied by -consensus_mask_variants

        consensus_mask_variants(str): Path VCF file masked for training on consensus caller

        consensus_n_train(int) : Amount of bases used for training the consensus model

        no_source_reads(bool) :  Do not write original reads, only consensus

        scartrace_r1_primers(str) : comma separated list of R1 primers used in scartrace protocol


    """

    MISC_ALT_CONTIGS_SCMO = 'MISC_ALT_CONTIGS_SCMO'
    every_fragment_as_molecule = args.every_fragment_as_molecule
    skip_contig = set(args.skip_contig.split(',')) if args.skip_contig is not None else set()


    if not args.o.endswith('.bam'):
        raise ValueError(
            "Supply an output which ends in .bam, for example -o output.bam")

    write_status(args.o,'unfinished')

    # Verify wether the input file is indexed and sorted...
    if not args.ignore_bam_issues:
        verify_and_fix_bam(args.bamin)

    for remove_existing_path in [args.o, f'{args.o}.bai']:
        if os.path.exists(remove_existing_path):
            print(f"Removing existing file {remove_existing_path}")
            os.remove(remove_existing_path)

    input_bam = pysam.AlignmentFile(args.bamin, "rb", ignore_truncation=args.ignore_bam_issues, threads=4)

    # autodetect reference:
    reference = None
    if args.ref is None:
        args.ref = get_reference_from_pysam_alignmentFile(input_bam)

    if args.ref is not None:
        try:
            reference = CachedFasta(
                pysam.FastaFile(args.ref))
            print(f'Loaded reference from {args.ref}')
        except Exception as e:
            print("Error when loading the reference file, continuing without a reference")
            reference = None

    ##### Define fragment and molecule class arguments and instances: ####

    queryNameFlagger = None
    if args.qflagger is not None:
        if args.qflagger == 'custom_flags':
            queryNameFlagger = CustomAssingmentQueryNameFlagger(
                args.custom_flags.split(','))
        else:
            raise ValueError("Select from 'custom_flags, ..' ")

    molecule_class_args = {
        'umi_hamming_distance': args.umi_hamming_distance,
        'reference': reference
    }

    fragment_class_args = {
        'read_group_format' : args.read_group_format

    }
    yield_invalid = True  # if invalid reads should be written
    yield_overflow = True  # if overflow reads should be written

    if args.max_fragment_size is not None:
        fragment_class_args['max_fragment_size'] = args.max_fragment_size

    if args.no_rejects:
        yield_invalid = False

    if args.no_overflow:
        yield_overflow = False


    ignore_conversions = None
    if args.method == 'nla_taps' or args.method == 'chic_taps':
        ignore_conversions = set([('C', 'T'), ('G', 'A')])

    if args.alleles is not None:
        molecule_class_args['allele_resolver'] = singlecellmultiomics.alleleTools.AlleleResolver(
            args.alleles,
            select_samples=args.allele_samples.split(',') if args.allele_samples is not None else None,
            lazyLoad=True,
            use_cache=args.use_allele_cache,
            verbose = args.set_allele_resolver_verbose,
            ignore_conversions=ignore_conversions)

    if args.mapfile is not None:
        molecule_class_args['mapability_reader'] = MapabilityReader(
            args.mapfile)

    ### Transcriptome configuration ###
    if args.method in ('nla_transcriptome', 'cs', 'vasa'):
        print(
            colorama.Style.BRIGHT +
            'Running in transcriptome annotation mode' +
            colorama.Style.RESET_ALL)
        if args.exons is None :
            raise ValueError("Supply an exon GTF file")

        if args.introns is not None and args.exons is None:
            raise ValueError("Please supply both intron and exon GTF files")

        transcriptome_features = singlecellmultiomics.features.FeatureContainer()
        print("Loading exons", end='\r')
        transcriptome_features.loadGTF(
            args.exons,
            select_feature_type=['exon'],
            identifierFields=(
                'exon_id',
                'gene_id'),
            store_all=True,
            contig=args.contig,
            head=None)

        if args.introns is not None:
            print("Loading introns", end='\r')
            transcriptome_features.loadGTF(
                args.introns,
                select_feature_type=['intron'],
                identifierFields=['transcript_id'],
                store_all=True,
                contig=args.contig,
                head=None)
        print("All features loaded")

        # Add more molecule class arguments
        molecule_class_args.update({
            'features': transcriptome_features,
            'auto_set_intron_exon_features': True
        })

    ### Method specific configuration ###
    if args.method == 'qflag':
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.Fragment
        # Write all reads
        yield_invalid = True

    elif args.method == 'chic':
        moleculeClass = singlecellmultiomics.molecule.CHICMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment

    elif args.method == 'nla' or args.method == 'nla_no_overhang':
        moleculeClass = singlecellmultiomics.molecule.NlaIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        if args.method == 'nla_no_overhang':
            assert reference is not None, 'Supply a reference fasta using -ref!'
            fragment_class_args.update({
                    'reference': reference,
                    'no_overhang': True
                })

    elif args.method == 'chic_nla':
        moleculeClass=singlecellmultiomics.molecule.CHICNLAMolecule
        fragmentClass=singlecellmultiomics.fragment.CHICFragment
        assert reference is not None, 'Supply a reference fasta using -ref!'
        molecule_class_args.update({
                'reference': reference,
        })

    elif args.method == 'cs_feature_counts' :
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsSingleEndFragment

    elif args.method == 'fl_feature_counts':

        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsFullLengthFragment

    elif args.method == 'episeq' :
        moleculeClass = singlecellmultiomics.molecule.Molecule
        fragmentClass = singlecellmultiomics.fragment.FeatureCountsSingleEndFragment

    elif args.method == 'nla_transcriptome':
        moleculeClass = singlecellmultiomics.molecule.AnnotatedNLAIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        molecule_class_args.update({
            'pooling_method': 1,  # all data from the same cell can be dealt with separately
            'stranded': None  # data is not stranded
        })

    elif args.method == 'nla_taps':
        moleculeClass = singlecellmultiomics.molecule.TAPSNlaIIIMolecule
        fragmentClass = singlecellmultiomics.fragment.NLAIIIFragment

        molecule_class_args.update({
            'reference': reference,
            'taps': singlecellmultiomics.molecule.TAPS(reference=reference)
        })

    elif args.method == 'chic_taps':

        molecule_class_args.update({
            'reference': reference,
            'taps': singlecellmultiomics.molecule.TAPS(reference=reference)
        })
        moleculeClass = singlecellmultiomics.molecule.TAPSCHICMolecule
        fragmentClass = singlecellmultiomics.fragment.CHICFragment

    elif args.method == 'vasa' or args.method == 'cs':
        moleculeClass = singlecellmultiomics.molecule.VASA
        fragmentClass = singlecellmultiomics.fragment.SingleEndTranscript

        molecule_class_args.update({
            'pooling_method': 1,  # all data from the same cell can be dealt with separately
            'stranded': 1  # data is stranded
        })

    elif args.method == 'scartrace':

        moleculeClass = singlecellmultiomics.molecule.ScarTraceMolecule
        fragmentClass = singlecellmultiomics.fragment.ScarTraceFragment

        r1_primers = args.scartrace_r1_primers.split(',')
        fragment_class_args.update({
                'scartrace_r1_primers': r1_primers,
                #'reference': reference
            })


    else:
        raise ValueError("Supply a valid method")

    # Allow or disallow cycle shift:
    if args.allow_cycle_shift and fragmentClass is singlecellmultiomics.fragment.NLAIIIFragment:
        fragment_class_args['allow_cycle_shift'] = True

    # This disables umi_cigar_processing:
    if args.no_umi_cigar_processing:
        fragment_class_args['no_umi_cigar_processing'] = True

    if args.max_associated_fragments is not None:
        molecule_class_args['max_associated_fragments'] = args.max_associated_fragments

    # This decides what molecules we will traverse
    if args.contig == MISC_ALT_CONTIGS_SCMO:
        contig = None
    else:
        contig = args.contig

    # This decides to only extract a single genomic region:
    if args.region_start is not None:
        if args.region_end is None:
            raise ValueError('When supplying -region_start then also supply -region_end')
        region_start = args.region_start
        region_end = args.region_end
    else:
        region_start = None
        region_end = None



    last_update = datetime.now()
    init_time = datetime.now()
    if args.molecule_iterator_verbosity_interval is not None and (args.molecule_iterator_verbose or (args.stats_file_path is not None )):

        stats_handle = None
        if args.stats_file_path is not None:
            stats_handle = open(args.stats_file_path,'w')

        def progress_callback_function( iteration, mol_iter, reads ):
            nonlocal last_update
            nonlocal init_time
            nonlocal stats_handle

            now = datetime.now()
            diff = (datetime.now()-last_update).total_seconds()
            if diff>args.molecule_iterator_verbosity_interval:

                diff_from_init = (datetime.now()-init_time).total_seconds()
                _contig, _pos = None, None
                for read in reads:
                    if read is not None:
                        _contig, _pos = read.reference_name, read.reference_start

                if args.molecule_iterator_verbose:
                    print( f'{mol_iter.yielded_fragments} fragments written, {mol_iter.deleted_fragments} fragments deleted ({(mol_iter.deleted_fragments/(mol_iter.deleted_fragments + mol_iter.yielded_fragments))*100:.2f} %), current pos: {_contig}, {_pos}, {mol_iter.waiting_fragments} fragments waiting             ' , end='\r')
                if stats_handle is not None:
                    stats_handle.write(f'{diff_from_init}\t{mol_iter.waiting_fragments}\t{mol_iter.yielded_fragments}\t{mol_iter.deleted_fragments}\t{_contig}\t{_pos}\n')
                    stats_handle.flush()
                last_update = now

    else:
        progress_callback_function = None



    molecule_iterator_args = {
        'alignments': input_bam,
        'queryNameFlagger': queryNameFlagger,
        'moleculeClass': moleculeClass,
        'fragmentClass': fragmentClass,
        'molecule_class_args': molecule_class_args,
        'fragment_class_args': fragment_class_args,
        'yield_invalid': yield_invalid,
        'yield_overflow': yield_overflow,
        'start':region_start,
        'end':region_end,
        'contig': contig,
        'every_fragment_as_molecule': every_fragment_as_molecule,
        'skip_contigs':skip_contig,
        'progress_callback_function':progress_callback_function
    }

    if args.resolve_unproperly_paired_reads:
        molecule_iterator_args['iterator_class'] = MatePairIteratorIncludingNonProper

    if args.contig == MISC_ALT_CONTIGS_SCMO:
        # When MISC_ALT_CONTIGS_SCMO is set as argument, all molecules with reads
        # mapping to a contig returning True from the is_main_chromosome
        # function are used

        def Misc_contig_molecule_generator(molecule_iterator_args):
            for reference in input_bam.references:
                if not is_main_chromosome(reference):
                    molecule_iterator_args['contig'] = reference
                    yield from MoleculeIterator(**molecule_iterator_args)

        molecule_iterator = Misc_contig_molecule_generator(
            molecule_iterator_args)
    else:
        molecule_iterator = MoleculeIterator(**molecule_iterator_args)

    #####
    consensus_model_path = None

    if args.consensus:
        # Load from path if available:

        if args.consensus_model is not None:
            if os.path.exists(args.consensus_model):
                model_path = args.consensus_model
            else:
                model_path = pkg_resources.resource_filename(
                    'singlecellmultiomics', f'molecule/consensus_model/{args.consensus_model}')

            if model_path.endswith('.h5'):
                try:
                    from tensorflow.keras.models import load_model
                except ImportError:
                    print("Please install tensorflow")
                    raise
                consensus_model = load_model(model_path)

            else:
                with open(model_path, 'rb') as f:
                    consensus_model = pickle.load(f)
        else:
            skip_already_covered_bases = not args.consensus_allow_train_location_oversampling
            if args.consensus_mask_variants is None:
                mask_variants = None
            else:
                mask_variants = pysam.VariantFile(args.consensus_mask_variants)
            print("Fitting consensus model, this may take a long time")
            consensus_model = singlecellmultiomics.molecule.train_consensus_model(
                molecule_iterator,
                mask_variants=mask_variants,
                n_train=args.consensus_n_train,
                skip_already_covered_bases=skip_already_covered_bases
                )
            # Write the consensus model to disk
            consensus_model_path = os.path.abspath(
                os.path.dirname(args.o)) + '/consensus_model.pickle.gz'
            print(f'Writing consensus model to {consensus_model_path}')
            with open(consensus_model_path, 'wb') as f:
                pickle.dump(consensus_model, f)

    # We needed to check if every argument is properly placed. If so; the jobs
    # can be sent to the cluster

    if args.cluster:
        if args.contig is None:
            write_status(args.o,'Submitting jobs. If this file remains, a job failed.')
            # Create jobs for all chromosomes:
            unique_id = str(uuid.uuid4())
            temp_prefix = os.path.abspath(os.path.dirname(
                args.o)) + '/SCMO_' + unique_id
            hold_merge = []

            ## Create folder to store cluster files:
            if args.clusterdir is None:
                cluster_file_folder = os.path.abspath(os.path.dirname(
                    args.o)) + '/cluster'
            else:
                cluster_file_folder = args.clusterdir
            print(f'Writing cluster scripts and standard out and error to {cluster_file_folder}')
            if not os.path.exists(cluster_file_folder):
                try:
                    os.makedirs(cluster_file_folder,exist_ok=True)
                except Exception as e:
                    print(e)
                    pass

            found_alts = 0
            files_to_merge = []
            for ci,chrom in enumerate([_chrom  for _chrom in
                        (list(input_bam.references) + [MISC_ALT_CONTIGS_SCMO])
                        if not _chrom in skip_contig]):

                if not is_main_chromosome(chrom):
                    found_alts += 1
                    continue
                if chrom == MISC_ALT_CONTIGS_SCMO and found_alts == 0:
                    continue

                temp_bam_path = f'{temp_prefix}_{chrom}.bam'

                if os.path.exists(temp_bam_path):
                    print(f"Removing existing temporary file {temp_bam_path}")
                    os.remove(temp_bam_path)

                arguments = " ".join(
                    [x for x in sys.argv if not x == args.o and x != '-o']) + f" -contig {chrom} -o {temp_bam_path}"
                files_to_merge.append(temp_bam_path)
                if consensus_model_path is not None:
                    arguments += f' -consensus_model {consensus_model_path}'
                job = f'SCMULTIOMICS_{ci}_{unique_id}'
                write_status(temp_bam_path,'SUBMITTED')
                job_id = submit_job(f'{arguments};', job_name=job, target_directory=cluster_file_folder,  working_directory=None,
                               threads_n=1, memory_gb=args.mem, time_h=args.time, scheduler=args.sched, copy_env=True,
                               email=None, mail_when_finished=False, hold=None,submit=True)


                print(f'Job for contig {chrom} submitted with job id: {job_id}')
                hold_merge.append(job_id)

            hold = hold_merge

            job = f'SCMULTIOMICS_MERGE_{unique_id}'

            if args.sched == 'local':
                hold = None

            final_status = args.o.replace('.bam','.status.txt')
            # Create list of output files
            command = f'samtools merge -@ 4 -c {args.o} {" ".join(files_to_merge)} && samtools index {args.o} && rm {temp_prefix}*.ba* && rm {temp_prefix}*.status.txt && echo "All done" > {final_status}'

            final_job_id = submit_job(f'{command};', job_name=job, target_directory=cluster_file_folder,  working_directory=None,
                           threads_n=4, memory_gb=10, time_h=args.time, scheduler=args.sched, copy_env=True,
                           email=None, mail_when_finished=False, hold=hold,submit=True)
            print(f'final job id is:{final_job_id}')
            exit()

    #####
    # Load unphased variants to memory
    unphased_allele_resolver = None
    if args.unphased_alleles is not None:
        unphased_allele_resolver = singlecellmultiomics.alleleTools.AlleleResolver(
            use_cache=args.use_allele_cache,
            phased=False, ignore_conversions=ignore_conversions,verbose = args.set_allele_resolver_verbose)
        try:
            for i, variant in enumerate(
                pysam.VariantFile(
                    args.unphased_alleles).fetch(
                    args.contig)):
                if 'PASS' not in list(variant.filter):
                    continue
                if not all(
                        len(allele) == 1 for allele in variant.alleles) or len(
                        variant.alleles) != 2:
                    continue
                if sum([len(set(variant.samples[sample].alleles))
                        == 2 for sample in variant.samples]) < 2:
                    # Not heterozygous
                    continue

                unphased_allele_resolver.locationToAllele[variant.chrom][variant.pos - 1] = {
                    variant.alleles[0]: {'U'}, variant.alleles[1]: {'V'}}
        except Exception as e:  # todo catch this more nicely
            print(e)
    out_bam_path = args.o

    # Copy the header
    input_header = input_bam.header.as_dict()

    # Write provenance information to BAM header
    write_program_tag(
        input_header,
        program_name='bamtagmultiome',
        command_line=" ".join(
            sys.argv),
        version=singlecellmultiomics.__version__,
        description=f'SingleCellMultiOmics molecule processing, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}')

    print(f'Started writing to {out_bam_path}')

    read_groups = dict()  # Store unique read groups in this dict
    with sorted_bam_file(out_bam_path, header=input_header, read_groups=read_groups) as out:

        try:
            for i, molecule in enumerate(molecule_iterator):

                # Stop when enough molecules are processed
                if args.head is not None and (i - 1) >= args.head:
                    break

                # set unique molecule identifier
                molecule.set_meta('mi', f'{molecule.get_a_reference_id()}_{i}')

                # Write tag values
                molecule.write_tags()

                if unphased_allele_resolver is not None:  # write unphased allele tag:
                    molecule.write_allele_phasing_information_tag(
                        unphased_allele_resolver, 'ua')

                # Update read groups
                for fragment in molecule:
                    rgid = fragment.get_read_group()
                    if not rgid in read_groups:
                        read_groups[rgid] = fragment.get_read_group(True)[1]

                # Calculate molecule consensus
                if args.consensus:
                    try:
                        consensus_reads = molecule.deduplicate_to_single_CIGAR_spaced(
                            out,
                            f'consensus_{molecule.get_a_reference_id()}_{i}',
                            consensus_model,
                            NUC_RADIUS=args.consensus_k_rad
                            )
                        for consensus_read in consensus_reads:
                            consensus_read.set_tag('RG', molecule[0].get_read_group())
                            consensus_read.set_tag('mi', i)
                            out.write(consensus_read)
                    except Exception as e:

                        #traceback.print_exc()
                        #print(e)
                        molecule.set_rejection_reason('CONSENSUS_FAILED',set_qcfail=True)
                        molecule.write_pysam(out)


                # Write the reads to the output file
                if not args.no_source_reads:
                    molecule.write_pysam(out)
        except Exception as e:
            write_status(args.o,'FAIL, The file is not complete')
            raise e

        # Reached the end of the generator
        write_status(args.o,'Reached end. All ok!')
def tag_multiome_single_thread(
        input_bam_path,
        out_bam_path,
        molecule_iterator=None,
        molecule_iterator_args=None,
        consensus_model=None,
        consensus_model_args=None,  # Clearly the consensus model class and arguments should be part of molecule
        ignore_bam_issues=False,
        head=None,
        no_source_reads=False):
    """Tag the molecules of one BAM file and write them to a sorted BAM file.

    The input BAM is opened, its header is copied and extended with a
    'bamtagmultiome' program entry, and every molecule produced by
    ``molecule_iterator`` is tagged and written to ``out_bam_path``.

    Args:
        input_bam_path: Path of the BAM file to read.
        out_bam_path: Path of the BAM file to write; also the path passed
            to ``write_status``.
        molecule_iterator: Callable invoked as
            ``molecule_iterator(input_bam, **molecule_iterator_args)``;
            it must yield molecule objects. Required despite the ``None``
            default.
        molecule_iterator_args: Keyword arguments for ``molecule_iterator``.
            An 'alignments' entry is dropped because the freshly opened
            input BAM is passed positionally instead. ``None`` is treated
            as no extra arguments.
        consensus_model: When not ``None``, ``calculate_consensus`` is
            called for every molecule with this model.
        consensus_model_args: Extra keyword arguments forwarded to
            ``calculate_consensus``. ``None`` is treated as no extra
            arguments.
        ignore_bam_issues: Forwarded to pysam as ``ignore_truncation``.
        head: When not ``None``, stop iterating once the molecule index
            exceeds this value.
        no_source_reads: When true, the molecules themselves are not
            written by this function.

    Raises:
        Exception: Any exception raised while processing molecules is
            re-raised after a failure status has been written.
    """
    # Normalise the optional dictionaries; None defaults avoid sharing one
    # mutable default object between calls.
    if molecule_iterator_args is None:
        molecule_iterator_args = {}
    if consensus_model_args is None:
        consensus_model_args = {}

    # The context manager guarantees the input handle is closed, also when
    # processing fails half way.
    with pysam.AlignmentFile(input_bam_path,
                             "rb",
                             ignore_truncation=ignore_bam_issues,
                             threads=4) as input_bam:
        input_header = input_bam.header.as_dict()

        # Write provenance information to BAM header
        write_program_tag(
            input_header,
            program_name='bamtagmultiome',
            command_line=" ".join(sys.argv),
            version=singlecellmultiomics.__version__,
            description=
            f'SingleCellMultiOmics molecule processing, executed at {datetime.now().strftime("%d/%m/%Y %H:%M:%S")}'
        )

        print(f'Started writing to {out_bam_path}')

        # The alignments source is always the BAM opened above, so a
        # caller supplied 'alignments' entry is ignored.
        molecule_iterator_exec = molecule_iterator(
            input_bam, **{
                k: v
                for k, v in molecule_iterator_args.items()
                if k != 'alignments'
            })

        read_groups = dict()  # Store unique read groups in this dict
        with sorted_bam_file(out_bam_path,
                             header=input_header,
                             read_groups=read_groups) as out:
            try:
                for i, molecule in enumerate(molecule_iterator_exec):

                    # Stop when enough molecules are processed
                    # NOTE(review): this lets head + 1 molecules through;
                    # kept as is to match the other tagging loop in this
                    # file -- confirm whether that is intended.
                    if head is not None and (i - 1) >= head:
                        break

                    # set unique molecule identifier
                    molecule.set_meta(
                        'mi', f'{molecule.get_a_reference_id()}_{i}')

                    # Write tag values
                    molecule.write_tags()

                    # NOTE: unphased allele tagging (the 'ua' tag) is not
                    # performed here; this function has no allele resolver
                    # parameter.

                    # Update read groups
                    for fragment in molecule:
                        rgid = fragment.get_read_group()
                        if rgid not in read_groups:
                            read_groups[rgid] = fragment.get_read_group(True)[1]

                    # Calculate molecule consensus
                    if consensus_model is not None:
                        calculate_consensus(molecule, consensus_model, i, out,
                                            **consensus_model_args)

                    # Write the reads to the output file
                    if not no_source_reads:
                        molecule.write_pysam(out)
            except Exception:
                # Record the failure before propagating the original error
                write_status(out_bam_path, 'FAIL, The file is not complete')
                raise

            # Reached the end of the generator
            write_status(out_bam_path, 'Reached end. All ok!')