Beispiel #1
0
    samples = lssp()
    if pnames is not None:
        samples = samples.loc[samples.patient.isin(pnames)]
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    counts_all = []
    for protein in proteins:
        counts = []
        for samplename, sample in samples.iterrows():
            if submit:
                fork_self(samplename,
                          protein,
                          VERBOSE=VERBOSE,
                          qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print protein, samplename

            sample = SamplePat(sample)

            # NOTE: How do we find what fragment covers the protein? Well, a
            # protein can happily cross fragments. Since each
            # codon is independent, we should iterate over codons. We do not
            # do that for efficiency reasons. Instead, we identify all potential
            # fragments and split the protein into full codon chunks covered by
            # a single fragment.
            fragment_rois = sample.get_fragments_covered(
Beispiel #2
0
        default=1,
        help='Include details on number of reads, length of consensus')
    parser.add_argument('--submit',
                        action='store_true',
                        help='Execute the script in parallel on the cluster')

    args = parser.parse_args()
    seq_runs = args.runs
    adaIDs = args.adaIDs
    use_pats = args.use_pats
    use_interactive = args.interactive
    detail = args.detail
    submit = args.submit

    if submit:
        fork_self(seq_runs, adaIDs=adaIDs, pats=use_pats, detail=detail)
        sys.exit()

    samples_pat = lssp(include_wrong=True)
    samples = lss()

    samples = samples.loc[samples['seq run'].isin(seq_runs)]
    if adaIDs is not None:
        samples = samples.loc[samples.adapter.isin(adaIDs)]

    if len(seq_runs) >= 2:
        samples.sort(columns=['patient sample', 'seq run'], inplace=True)

    for isa, (samplename, sample) in enumerate(samples.iterrows()):
        sample = SampleSeq(sample)
        print sample.name, 'seq:', sample['seq run'], sample.adapter,
                if PCR is None:
                    PCRs_sample = (1, 2)
                else:
                    PCRs_sample = [PCR]
                for PCR_sample in PCRs_sample:
                    bamfilename = sample.get_mapped_filtered_filename(
                        fragment, PCR=PCR_sample, decontaminated=False)
                    if not os.path.isfile(bamfilename):
                        continue

                    #if check_already_decontaminated(sample, fragment, PCR_sample):
                    #    continue

                    fork_self(samplename,
                              fragment,
                              VERBOSE=VERBOSE,
                              maxreads=maxreads,
                              summary=summary,
                              PCR=PCR_sample)

        sys.exit()

    for fragment in fragments:
        consensi = {
            refname: ''.join(load_custom_reference(refname + '_' + fragment))
            for refname in refnames
        }
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            try:
                consensi[samplename] = sample.get_consensus(fragment, PCR=1)
            except IOError:
Beispiel #4
0
        if VERBOSE >= 3:
            print 'adaID ' + adaID + ': fragments ' + ' '.join(
                fragments_sample)

        # Iterate over fragments
        for fragment in fragments_sample:

            frag_gen = fragment[:2]

            # Submit to the cluster self if requested
            if submit:
                fork_self(seq_run,
                          adaID,
                          frag_gen,
                          VERBOSE=VERBOSE,
                          threads=threads,
                          maxreads=maxreads,
                          filter_reads=filter_reads,
                          summary=summary,
                          rescue=use_rescue)
                continue

            if summary:
                sfn = get_map_summary_filename(data_folder,
                                               adaID,
                                               frag_gen,
                                               rescue=use_rescue)
                with open(sfn, 'w') as f:
                    f.write('Call: python map_to_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
    submit = args.submit
    summary = args.summary

    # Specify the dataset
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset["folder"]

    # Branch to the cluster if required
    if submit:
        # If no adaID is specified (adaID == 0), use every ID found in the
        # adapter table of the data folder; otherwise use only the given one.
        if adaID == 0:
            adaIDs = load_adapter_table(data_folder)["ID"]
        else:
            adaIDs = [adaID]
        # The loop must sit outside the if/else: nested in the else branch it
        # never ran for the "use all" case, so no job was ever submitted.
        for adaID in adaIDs:
            fork_self(seq_run, adaID, VERBOSE=VERBOSE, summary=summary)
        # This process only dispatches the jobs, so stop here.
        sys.exit()

    ###########################################################################
    # The actual script starts here
    ###########################################################################
    # Open BAM
    bamfilename = get_last_mapped(data_folder, adaID, type="bam", filtered=True)
    # Try to convert to BAM if needed: when the BAM file is missing, copy
    # every read from the SAM file with the same basename into a new BAM.
    if not os.path.isfile(bamfilename):
        samfile = pysam.Samfile(bamfilename[:-3] + "sam", "r")
        bamfile = pysam.Samfile(bamfilename, "wb", template=samfile)
        try:
            for s in samfile:
                bamfile.write(s)
        finally:
            # Close both handles so the BAM is flushed and complete on disk
            # before the same path is reopened for reading.
            bamfile.close()
            samfile.close()
    bamfile = pysam.Samfile(bamfilename, "rb")
    samples = lssp()
    if pnames is not None:
        samples = samples.loc[samples.patient.isin(pnames)]
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    counts_all = []
    for protein in proteins:
        counts = []
        for samplename, sample in samples.iterrows():
            if submit:
                fork_self(samplename, protein, VERBOSE=VERBOSE, qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print protein, samplename

            sample = SamplePat(sample)

            # NOTE: How do we find what fragment covers the protein? Well, a
            # protein can happily cross fragments. Since each
            # codon is independent, we should iterate over codons. We do not
            # do that for efficiency reasons. Instead, we identify all potential
            # fragments and split the protein into full codon chunks covered by
            # a single fragment.
            fragment_rois = sample.get_fragments_covered(protein,
                                                         include_coordinates=True)
    for adaID in adaIDs:

        # If the script is called with no fragment, iterate over all
        samplename = dataset['samples'][dataset['adapters'].index(adaID)]
        if fragments:
            # For each requested fragment, keep the first sample fragment
            # whose name contains it; requests without a match are dropped.
            from re import findall
            fragments_all = samples[samplename]['fragments']
            fragments_sample = []
            for fragment in fragments:
                matches = [name for name in fragments_all if fragment in name]
                if matches:
                    fragments_sample.append(matches[0])
        else:
            # No fragment requested: take all fragments of this sample
            fragments_sample = samples[samplename]['fragments']

        if VERBOSE >= 3:
            print 'adaID '+adaID+': fragments '+' '.join(fragments_sample)

        # Iterate over fragments
        for fragment in fragments_sample:

            if submit:
                fork_self(seq_run, adaID, fragment,
                          VERBOSE=VERBOSE,
                          maxreads=maxreads, chunk_size=chunksize)
                continue


            split_reads(data_folder, adaID, fragment, chunk_size=chunksize,
                        maxreads=maxreads, VERBOSE=VERBOSE)
Beispiel #8
0
            fragments_sample = [
                fr[:2] for fr in samples[samplename]['fragments']
            ]
        else:
            fragments_sample = fragments

        if VERBOSE >= 3:
            print 'adaID:', adaID + ', fragments:', fragments_sample

        for fragment in fragments_sample:

            # Submit to the cluster self if requested
            if submit:
                fork_self(data_folder,
                          adaID,
                          fragment,
                          VERBOSE=VERBOSE,
                          summary=summary)
                continue

            # Get coverage and counts
            counts = np.load(
                get_allele_counts_filename(data_folder, adaID, fragment))
            if len(counts.shape) == 2:
                import warnings
                warnings.warn(
                    'Counts not divided by read type: will normalize instead of filter!'
                )
                nu_filtered = 1.0 * counts / counts.sum(axis=0)

            else:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    for (samplename_pat, PCR), samplenames_seq in samples_groups.groups.iteritems():
        sample_pat = samples_pat.loc[samplename_pat].copy()
        samples_seq_group = samples_seq.loc[samples_seq.index.isin(samplenames_seq)]
        sample_pat.samples_seq = samples_seq_group
        pname = sample_pat.patient
        PCR = int(PCR)

        for fragment in fragments:
            if submit:
                fork_self(samplename_pat, fragment,
                          VERBOSE=VERBOSE,
                          n_pairs=n_pairs,
                          PCR=PCR,
                          summary=summary)
                continue

            if summary:
                sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR)
                with open(sfn, 'w') as f:
                    f.write('Call: python filter_mapped_reads.py'+\
                            ' --samples '+samplename_pat+\
                            ' --fragments '+fragment+\
                            ' --verbose '+str(VERBOSE))
                    if n_pairs != -1:
                        f.write(' --maxreads '+str(n_pairs))
                    f.write('\n')
Beispiel #10
0
            from re import findall
            fragments_all = samples[samplename]['fragments']
            fragments_sample = []
            for fragment in fragments:
                frs = filter(lambda x: fragment in x, fragments_all)
                if len(frs):
                    fragments_sample.append(frs[0])

        if VERBOSE >= 3:
            print 'adaID ' + adaID + ': fragments ' + ' '.join(
                fragments_sample)

        # Iterate over fragments
        for fragment in fragments_sample:

            if submit:
                fork_self(seq_run,
                          adaID,
                          fragment,
                          VERBOSE=VERBOSE,
                          maxreads=maxreads,
                          chunk_size=chunksize)
                continue

            split_reads(data_folder,
                        adaID,
                        fragment,
                        chunk_size=chunksize,
                        maxreads=maxreads,
                        VERBOSE=VERBOSE)
                        help='Fork the job to the cluster via qsub')
    parser.add_argument('--no-savefig',
                        action='store_false',
                        dest='savefig',
                        help='Show figure instead of saving it')

    args = parser.parse_args()
    seq_run = args.run
    VERBOSE = args.verbose
    submit = args.submit
    maxreads = args.maxreads
    adaID = args.adaID
    savefig = args.savefig

    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, savefig=savefig)
        sys.exit()

    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder
    read_len = dataset.cycles // 2

    reads_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(reads_filenames[0]):
        reads_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    title = seq_run + ', ' + adaID

    quality = quality_score_along_reads(read_len,
                                        reads_filenames,
                                        randomreads=(maxreads >= 1),
                                        maxreads=maxreads,
    submit = args.submit
    summary = args.summary

    # Specify the dataset
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']

    # Branch to the cluster if required
    if submit:
        # If no adaID is specified (adaID == 0), use every ID found in the
        # adapter table of the data folder; otherwise use only the given one.
        if adaID == 0:
            adaIDs = load_adapter_table(data_folder)['ID']
        else:
            adaIDs = [adaID]
        # The loop must sit outside the if/else: nested in the else branch it
        # never ran for the "use all" case, so no job was ever submitted.
        for adaID in adaIDs:
            fork_self(seq_run, adaID, VERBOSE=VERBOSE, summary=summary)
        # This process only dispatches the jobs, so stop here.
        sys.exit()

    ###########################################################################
    # The actual script starts here
    ###########################################################################
    # Open BAM
    bamfilename = get_last_mapped(data_folder,
                                  adaID,
                                  type='bam',
                                  filtered=True)
    # Try to convert to BAM if needed
    if not os.path.isfile(bamfilename):
        samfile = pysam.Samfile(bamfilename[:-3] + 'sam', 'r')
        bamfile = pysam.Samfile(bamfilename, 'wb', template=samfile)
        for s in samfile:
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    if submit:
        for fragment in fragments:
            for samplename, sample in samples.iterrows():
                fork_self(samplename, fragment, VERBOSE=VERBOSE,
                          qual_min=qual_min, PCR=PCR,
                          maxreads=maxreads, use_tests=use_tests)
        sys.exit()

    counts_all = []
    for fragment in fragments:
        counts = []
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            pname = sample.patient

            if VERBOSE >= 2:
                print pname, fragment, samplename

            refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')
                                                         fragments_sample_gen)
                                if frg in fragments]

        if VERBOSE >= 3:
            print 'adaID '+adaID+': fragments '+' '.join(fragments_sample)

        # Iterate over fragments
        for fragment in fragments_sample:

            frag_gen = fragment[:2]

            # Submit to the cluster self if requested
            if submit:
                fork_self(seq_run, adaID, frag_gen,
                          VERBOSE=VERBOSE,
                          threads=threads, maxreads=maxreads,
                          filter_reads=filter_reads,
                          summary=summary,
                          rescue=use_rescue)
                continue

            if summary:
                sfn = get_map_summary_filename(data_folder, adaID, frag_gen,
                                               rescue=use_rescue)
                with open(sfn, 'w') as f:
                    f.write('Call: python map_to_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+frag_gen+\
                            ' --threads '+str(threads)+\
                            ' --verbose '+str(VERBOSE))
                    if maxreads != -1:
Beispiel #15
0
            ind |= samples.index.isin(samples_run.index)

        samples = samples.loc[ind]

    for i, (samplename, sample) in enumerate(samples.iterrows()):
        sample = SampleSeq(sample)
        seq_run = sample['seq run']
        data_folder = sample.sequencing_run.folder
        adaID = sample.adapter
        fragments = sample.regions_complete

        if VERBOSE:
            print seq_run, adaID

        if submit:
            fork_self(samplename, maxreads=maxreads, VERBOSE=VERBOSE)
            continue

        if titles is not None:
            title = titles[i]
        else:
            title = None

        (counts, inserts) = check_premap(data_folder,
                                         adaID,
                                         fragments,
                                         seq_run,
                                         samplename,
                                         maxreads=maxreads,
                                         VERBOSE=VERBOSE,
                                         title=title)
Beispiel #16
0
    # If the script is called with no adaID, iterate over all
    samples = dataset.samples
    if adaIDs is not None:
        samples = samples.loc[samples.adapter.isin(adaIDs)]
    if VERBOSE >= 2:
        print samples.index.tolist()

    # Iterate over all adaIDs
    for samplename, sample in samples.iterrows():
        adaID = str(sample.adapter)

        # Submit to the cluster self if requested
        if submit:
            fork_self(seq_run,
                      adaID,
                      VERBOSE=VERBOSE,
                      threads=threads,
                      reference=refname,
                      summary=summary)
            continue

        if summary:
            with open(get_trim_summary_filename(data_folder, adaID), 'w') as f:
                f.write('Call: python trim_reads_lowq.py --run '+seq_run+\
                        ' --adaIDs '+adaID+\
                        ' --threads '+str(threads)+\
                        ' --reference '+refname+\
                        ' --verbose '+str(VERBOSE)+'\n')

        trim_reads(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)
Beispiel #17
0
    parser.add_argument('--submit', action='store_true', default=False,
                        help='Fork the job to the cluster via qsub')
    parser.add_argument('--no-summary', action='store_false',
                        dest='summary',
                        help='Do not save results in a summary file')

    args = parser.parse_args()
    seq_run = args.run
    VERBOSE = args.verbose
    maxreads = args.maxreads
    submit = args.submit
    summary = args.summary

    # If submit, outsource to the cluster
    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary)
        sys.exit()

    # Specify the dataset
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']

    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'w') as f:
            f.write('Call: python demultiplex.py --run '+seq_run+' --verbose '+str(VERBOSE)+'\n')

    adapters_designed = get_adapters_designed(dataset, VERBOSE=VERBOSE, summary=summary)

    make_output_folders(data_folder, adapters_designed, VERBOSE=VERBOSE,
                        summary=summary)
    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    if submit:
        for fragment in fragments:
            for samplename, sample in samples.iterrows():
                fork_self(samplename,
                          fragment,
                          VERBOSE=VERBOSE,
                          qual_min=qual_min,
                          PCR=PCR,
                          maxreads=maxreads,
                          use_tests=use_tests)
        sys.exit()

    counts_all = []
    for fragment in fragments:
        counts = []
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            pname = sample.patient

            if VERBOSE >= 2:
                print pname, fragment, samplename
Beispiel #19
0
    if VERBOSE >= 3:
        print 'fragments', fragments

    counts_all = []
    for fragment in fragments:
        counts = []
        for samplename, sample in samples.iterrows():
            if VERBOSE >= 1:
                print samplename, fragment,
                if VERBOSE >= 2:
                    print ''

            if submit:
                fork_self(samplename,
                          fragment,
                          VERBOSE=VERBOSE,
                          PCR=PCR,
                          block_len=block_len,
                          n_reads_per_ali=n_reads_per_ali)
                continue

            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(
                get_initial_reference_filename(pname, fragment), 'fasta')
            refm = np.array(refseq)
            len_reference = len(refseq)

            # NOTE: we need consensi to decontaminate, so
            bamfilename = sample.get_mapped_filtered_filename(
                fragment, PCR=PCR, decontaminated=(not use_raw_reads))
            if not os.path.isfile(bamfilename):
    elif samplenames is not None:
        samples = samples.loc[samples.index.isin(samplenames)]

    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    for fragment in fragments:
        inses = []
        for samplename, sample in samples.iterrows():
            if submit:
                fork_self(samplename, fragment, VERBOSE=VERBOSE, qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print fragment, samplename

            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')

            fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR)
            if not os.path.isfile(fn):
                warn('No BAM file found', NoDataWarning)
                continue

            _, inse = gac(fn, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
                # If the input file if missing, skip
                input_filename = get_input_filename(sample.seqrun_folder,
                                                    sample.adapter,
                                                    sample.convert_region(fragment),
                                                    type='bam',
                                                    only_chunk=only_chunk,
                                                    filtered=filtered)
                if not os.path.isfile(input_filename):
                    if VERBOSE:
                        print 'WARNING: input file not found'
                    continue

                if submit:
                    fork_self(samplename, fragment,
                              VERBOSE=VERBOSE, threads=threads,
                              n_pairs=n_pairs,
                              summary=summary,
                              only_chunks=[only_chunk],
                              filtered=filtered)
                    continue

                if summary:
                    sfn = get_map_initial_summary_filename(pname, samplename_pat,
                                                           samplename, fragment,
                                                           PCR=PCR, only_chunk=only_chunk)
                    with open(sfn, 'w') as f:
                        f.write('Call: python map_to_initial_reference.py'+\
                                ' --samples '+samplename+\
                                ' --fragments '+fragment+\
                                ' --threads '+str(threads)+\
                                ' --verbose '+str(VERBOSE))
                        if n_pairs != -1:
    use_save = args.save

    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    data = []
    for pname, patient in patients.iterrows():
        if VERBOSE >= 1:
            print patient.code, start, end

        if submit:
            fork_self(patient.code,
                      width,
                      gap,
                      start,
                      end,
                      VERBOSE=VERBOSE,
                      freqmin=freqmin,
                      countmin=countmin)
            continue

        patient = Patient(patient)
        ref = patient.get_reference('genomewide')
        L = len(ref)

        win_start = start
        while win_start + width - gap < min(L, end):
            win_end = min(win_start + width, end, L)

            if VERBOSE >= 1:
                print patient.code, win_start, win_end
        print 'fragments', fragments

    for (samplename_pat,
         PCR), samplenames_seq in samples_groups.groups.iteritems():
        sample_pat = samples_pat.loc[samplename_pat].copy()
        samples_seq_group = samples_seq.loc[samples_seq.index.isin(
            samplenames_seq)]
        sample_pat.samples_seq = samples_seq_group
        pname = sample_pat.patient
        PCR = int(PCR)

        for fragment in fragments:
            if submit:
                fork_self(samplename_pat,
                          fragment,
                          VERBOSE=VERBOSE,
                          n_pairs=n_pairs,
                          PCR=PCR,
                          summary=summary)
                continue

            if summary:
                sfn = get_filter_mapped_init_summary_filename(pname,
                                                              samplename_pat,
                                                              fragment,
                                                              PCR=PCR)
                with open(sfn, 'w') as f:
                    f.write('Call: python filter_mapped_reads.py'+\
                            ' --samples '+samplename_pat+\
                            ' --fragments '+fragment+\
                            ' --verbose '+str(VERBOSE))
                    if n_pairs != -1:
            fragments_sample = []
            for fragment in fragments:
                frs = filter(lambda x: fragment in x, fragments_all)
                if len(frs):
                    fragments_sample.append(frs[0])

        if VERBOSE >= 3:
            print 'adaID '+adaID+': fragments '+' '.join(fragments_sample)

        make_output_folders(data_folder, adaID, VERBOSE=VERBOSE)

        for fragment in fragments_sample:

            # Submit to the cluster self if requested
            if submit:
                fork_self(seq_run, adaID, fragment, n_reads, iterations_max,
                          VERBOSE=VERBOSE)
                continue

            if summary:
                sfn = get_build_consensus_summary_filename(data_folder, adaID,
                                                              fragment)
                with open(sfn, 'w') as f:
                    f.write('Call: python build_consensus_iterative.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --iterations '+str(iterations_max)+\
                            ' -n '+str(n_reads)+\
                            ' --verbose '+str(VERBOSE))
                    f.write('\n')
    # Iterate over all requested samples
    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        adaID = sample.adapter

        if not fragments:
            fragments_sample = sample.regions_generic
        else:
            fragments_sample = sorted(
                set(fragments) & set(sample.regions_generic))

        for fragment in fragments_sample:

            # Submit to the cluster self if requested
            if submit:
                fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE)
                continue

            counts, inserts = get_allele_counts(data_folder,
                                                adaID,
                                                fragment,
                                                VERBOSE=VERBOSE)
            write_counts_files(data_folder,
                               adaID,
                               fragment,
                               counts,
                               inserts,
                               VERBOSE=VERBOSE)

            if summary:
                plot_coverage(data_folder,
            for samplename, sample in samples_focal.iterrows():

                sample = SamplePat(sample)
                if PCR is None:
                    PCRs_sample = (1, 2)
                else:
                    PCRs_sample = [PCR]
                for PCR_sample in PCRs_sample:
                    bamfilename = sample.get_mapped_filtered_filename(fragment, PCR=PCR_sample, decontaminated=False)
                    if not os.path.isfile(bamfilename):
                        continue

                    # if check_already_decontaminated(sample, fragment, PCR_sample):
                    #    continue

                    fork_self(samplename, fragment, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary, PCR=PCR_sample)

        sys.exit()

    for fragment in fragments:
        consensi = {refname: "".join(load_custom_reference(refname + "_" + fragment)) for refname in refnames}
        for samplename, sample in samples.iterrows():
            sample = SamplePat(sample)
            try:
                consensi[samplename] = sample.get_consensus(fragment, PCR=1)
            except IOError:
                print samplename, "file not found"
                continue

        for samplename, sample in samples_focal.iterrows():
            sample = SamplePat(sample)
Beispiel #27
0
            if VERBOSE >= 1:
                print fragment

            # There is a blacklist of samples which are probably contaminated,
            # we want to discard those altogether
            contstr = sample['suspected contamination']
            if pd.notnull(contstr) and (fragment in contstr):
                print 'WARNING: This sample has a suspected contamination! Skipping.'
                continue

            # Submit to the cluster self if requested
            if submit:
                fork_self(seq_run,
                          adaID,
                          fragment,
                          VERBOSE=VERBOSE,
                          summary=summary,
                          maxreads=maxreads,
                          max_mismatches=max_mismatches,
                          susp_mismatches=susp_mismatches)
                continue

            if summary:
                sfn = get_filter_mapped_summary_filename(
                    data_folder, adaID, fragment)
                with open(sfn, 'w') as f:
                    f.write('Call: python filter_mapped_reads.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --max-mismatches '+str(max_mismatches)+\
                            ' --suspicious-mismatches '+str(susp_mismatches)+\
Beispiel #28
0
        samples = samples.loc[samples.index.isin(samplenames)]

    # At high verbosity, list the names of the selected samples
    if VERBOSE >= 2:
        print 'samples', samples.index.tolist()

    # With no fragments requested, fall back to F1 ... F6
    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    for fragment in fragments:
        # Per-fragment list; it is not filled in the visible part of the loop
        # -- presumably results are appended further down (TODO confirm)
        inses = []
        for samplename, sample in samples.iterrows():
            # In submit mode, hand this (sample, fragment) pair to fork_self
            # and do no local work for it
            if submit:
                fork_self(samplename,
                          fragment,
                          VERBOSE=VERBOSE,
                          qual_min=qual_min)
                continue

            if VERBOSE >= 1:
                print fragment, samplename

            # Wrap the table row in SamplePat to reach the patient name and
            # the file name helper used below
            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(
                get_initial_reference_filename(pname, fragment), 'fasta')

            # A sample whose mapped/filtered file is missing on disk is
            # skipped with a NoDataWarning rather than aborting the run
            fn = sample.get_mapped_filtered_filename(fragment, PCR=PCR)
            if not os.path.isfile(fn):
                warn('No BAM file found', NoDataWarning)
                continue
            ind |= samples.index.isin(samples_run.index)

        samples = samples.loc[ind]

    # Run check_premap on every selected sample; the enumerate index `i` is
    # only used to pick the matching entry of `titles`
    for i, (samplename, sample) in enumerate(samples.iterrows()):
        # Wrap the table row in SampleSeq and pull out what check_premap needs
        sample = SampleSeq(sample)
        seq_run = sample['seq run']
        data_folder = sample.sequencing_run.folder
        adaID = sample.adapter
        fragments = sample.regions_complete

        if VERBOSE:
            print seq_run, adaID
        
        # In submit mode, delegate this sample to fork_self and move on
        if submit:
            fork_self(samplename, maxreads=maxreads, VERBOSE=VERBOSE)
            continue

        # Optional per-sample plot title, matched by position in `titles`
        if titles is not None:
            title = titles[i]
        else:
            title = None

        (counts, inserts) = check_premap(data_folder, adaID,
                                         fragments, seq_run, samplename,
                                         maxreads=maxreads,
                                         VERBOSE=VERBOSE,
                                         title=title)

        # Interactive plotting only when requested and counts were returned.
        # NOTE(review): `not submit` is always true here, because the submit
        # case has already hit `continue` above.
        if show and (not submit) and (counts is not None):
            plt.ion()
    # With no fragments requested, fall back to F1 ... F6
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Accumulators: `counts_all` is created once, `counts` once per fragment.
    # Neither is filled in the visible part of the loop -- presumably that
    # happens further down (TODO confirm).
    counts_all = []
    for fragment in fragments:
        counts = []
        for samplename, sample in samples.iterrows():
            # The trailing comma keeps the cursor on the same line (Python 2
            # print statement); at VERBOSE >= 2 the line is ended right away
            if VERBOSE >= 1:
                print samplename, fragment,
                if VERBOSE >= 2:
                    print ''

            # In submit mode, delegate this (sample, fragment) pair to
            # fork_self and do no local work for it
            if submit:
                fork_self(samplename, fragment, VERBOSE=VERBOSE, PCR=PCR,
                          block_len=block_len, n_reads_per_ali=n_reads_per_ali)
                continue

            # Load the patient's initial reference for this fragment, both as
            # a sequence record and as a numpy array
            sample = SamplePat(sample)
            pname = sample.patient
            refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')
            refm = np.array(refseq)
            len_reference = len(refseq)

            # NOTE: we need consensi to decontaminate, so
            # (the original note is truncated at this point)
            # The decontaminated file is requested unless use_raw_reads is
            # set; samples whose file is missing are skipped silently.
            bamfilename = sample.get_mapped_filtered_filename(fragment,
                                            PCR=PCR,
                                            decontaminated=(not use_raw_reads))
            if not os.path.isfile(bamfilename):
                continue
            
        samples = samples.loc[samples.adapter.isin(adaIDs)]
    # At high verbosity, list the names of the selected samples
    if VERBOSE >= 2:
        print samples.index.tolist()

    # Iterate over all adaIDs
    for samplename, sample in samples.iterrows():
        # The adapter ID is converted to a string before use
        adaID = str(sample.adapter)

        # If requested, resubmit this script to the cluster for this adapter,
        # forwarding all options, and do no local work for it
        if submit:
            fork_self(
                seq_run,
                adaID,
                VERBOSE=VERBOSE,
                threads=threads,
                reference=refname,
                summary=summary,
                trimmed=use_trimmed,
                subsrate=subsrate,
                gapopen=gapopen,
                gapextend=gapextend,
                maxreads=maxreads)
            continue

        # Output folders are prepared before anything is written for this
        # adapter
        make_output_folders(
            data_folder, adaID, VERBOSE=VERBOSE, summary=summary)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'w') as f:
                outstr = 'Call: python premap_to_reference.py --run '+seq_run+\
                        ' --adaIDs '+adaID+\
    # Iterate over all requested samples
    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        adaID = sample.adapter

        # Without a fragment selection use every generic region of the
        # sample; otherwise keep only the requested ones the sample actually
        # has, in sorted order
        if not fragments:
            fragments_sample = sample.regions_generic
        else:
            fragments_sample = sorted(set(fragments) & set(sample.regions_generic))

        for fragment in fragments_sample:

            # If requested, resubmit this script to the cluster for this
            # fragment and do no local work for it
            if submit:
                fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE)
                continue

            # Compute the counts and insertions and write them to disk
            counts, inserts = get_allele_counts(data_folder, adaID, fragment, VERBOSE=VERBOSE)
            write_counts_files(data_folder, adaID, fragment, counts, inserts, VERBOSE=VERBOSE)

            # Summary mode additionally saves a coverage figure
            if summary:
                plot_coverage(data_folder, adaID, fragment, counts, VERBOSE=VERBOSE, savefig=True)

            # Optionally derive filtered frequencies from the counts and
            # store them; in summary mode a folded SFS figure is saved as well
            if write_frequencies:
                nu_filtered = filter_nus(counts)
                write_frequency_files(data_folder, adaID, fragment, nu_filtered, VERBOSE=VERBOSE)

                if summary:
                    plot_SFS_folded(data_folder, adaID, fragment, nu_filtered, VERBOSE=VERBOSE, savefig=True)
Beispiel #33
0
                if len(frs):
                    fragments_sample.append(frs[0])

        # At the highest verbosity, show which fragments were resolved for
        # this adapter
        if VERBOSE >= 3:
            print 'adaID ' + adaID + ': fragments ' + ' '.join(
                fragments_sample)

        make_output_folders(data_folder, adaID, VERBOSE=VERBOSE)

        for fragment in fragments_sample:

            # If requested, resubmit this script to the cluster for this
            # fragment and do no local work for it
            if submit:
                fork_self(seq_run,
                          adaID,
                          fragment,
                          n_reads,
                          iterations_max,
                          VERBOSE=VERBOSE)
                continue

            # In summary mode, start a fresh summary file ('w' truncates any
            # previous one) recording the equivalent command line.
            # Note that no trailing newline is written here.
            if summary:
                sfn = get_build_consensus_summary_filename(
                    data_folder, adaID, fragment)
                with open(sfn, 'w') as f:
                    f.write('Call: python build_consensus_iterative.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --iterations '+str(iterations_max)+\
                            ' -n '+str(n_reads)+\
                            ' --verbose '+str(VERBOSE))
    # Specify the dataset
    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder

    # If the script is called with no adaID, iterate over all
    samples = dataset.samples
    if adaIDs is not None:
        samples = samples.loc[samples.adapter.isin(adaIDs)]
    if VERBOSE >= 2:
        print samples.index.tolist()

    # Iterate over all adaIDs
    for samplename, sample in samples.iterrows():
        # The adapter ID is converted to a string before use
        adaID = str(sample.adapter)

        # If requested, resubmit this script to the cluster for this adapter
        # and do no local work for it
        if submit:
            fork_self(seq_run, adaID, VERBOSE=VERBOSE, threads=threads,
                      reference=refname, summary=summary)
            continue

        # In summary mode, start a fresh summary file ('w' truncates any
        # previous one) recording the equivalent command line
        if summary:
            with open(get_trim_summary_filename(data_folder, adaID), 'w') as f:
                f.write('Call: python trim_reads_lowq.py --run '+seq_run+\
                        ' --adaIDs '+adaID+\
                        ' --threads '+str(threads)+\
                        ' --reference '+refname+\
                        ' --verbose '+str(VERBOSE)+'\n')

        # NOTE(review): `threads` and `refname` are logged and forwarded to
        # fork_self above, but are not passed to trim_reads -- confirm that
        # this is intended.
        trim_reads(data_folder, adaID, VERBOSE=VERBOSE, summary=summary)
    if VERBOSE >= 3:
        print 'adaIDs', adaIDs

    # If the script is called with no fragment, iterate over all
    # (the default set is F1 ... F6)
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over all requested samples
    for adaID in adaIDs:
        for fragment in fragments:

            # If requested, resubmit this script to the cluster for this
            # (adapter, fragment) pair and do no local work for it.
            # NOTE(review): the first argument here is `data_folder`, not a
            # run name -- confirm this matches what fork_self expects.
            if submit:
                fork_self(data_folder, adaID, fragment, VERBOSE=VERBOSE,
                          summary=summary)
                continue

            # Get cocounts
            cocounts = get_coallele_counts(data_folder, adaID, fragment,
                                           VERBOSE=VERBOSE)

            ## Check using the allele counts and the diagonal cocounts
            #counts, _ = get_allele_counts(data_folder, adaID, fragment,
            #                              VERBOSE=VERBOSE,
            #                              maxreads=2 * maxreads)

            #cocount = cocounts.sum(axis=0)
            #count = counts.sum(axis=0)

            ## Read reference
Beispiel #36
0
                    fragments_sample.append(frs[0])
            if 'genomewide' in fragments:
                fragments_sample.append('genomewide')

        # At the highest verbosity, show which fragments were resolved for
        # this adapter
        if VERBOSE >= 3:
            print 'adaID ' + adaID + ': fragments ' + ' '.join(
                fragments_sample)

        for fragment in fragments_sample:

            # If requested, resubmit this script to the cluster for this
            # fragment and do no local work for it. The block length,
            # reads-per-alignment and allele-count settings are forwarded
            # positionally, so their order matters.
            if submit:
                fork_self(seq_run,
                          adaID,
                          fragment,
                          block_len_initial,
                          n_reads_per_ali,
                          store_allele_counts,
                          VERBOSE=VERBOSE)
                continue

            if summary:
                sfn = get_build_consensus_summary_filename(data_folder,
                                                           adaID,
                                                           fragment,
                                                           iterative=False)
                with open(sfn, 'w') as f:
                    f.write('Call: python build_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
            # Map each requested fragment onto the first entry of
            # `fragments_all` that contains it (a containment test with
            # `in`); requests without any match are dropped.
            # NOTE: `len(frs)` relies on `filter` returning a list, as it
            # does on Python 2.
            fragments_sample = []
            for fragment in fragments:
                frs = filter(lambda x: fragment in x, fragments_all)
                if len(frs):
                    fragments_sample.append(frs[0])
            # 'genomewide' is passed through as is, without being looked up
            # in `fragments_all`
            if 'genomewide' in fragments:
                fragments_sample.append('genomewide')

        # At the highest verbosity, show which fragments were resolved for
        # this adapter
        if VERBOSE >= 3:
            print 'adaID '+adaID+': fragments '+' '.join(fragments_sample)

        for fragment in fragments_sample:

            # If requested, resubmit this script to the cluster for this
            # fragment and do no local work for it. The block length,
            # reads-per-alignment and allele-count settings are forwarded
            # positionally, so their order matters.
            if submit:
                fork_self(seq_run, adaID, fragment, block_len_initial, n_reads_per_ali,
                          store_allele_counts, VERBOSE=VERBOSE)
                continue

            # In summary mode, start a fresh summary file ('w' truncates any
            # previous one) recording the equivalent command line; the
            # --allele-counts flag is appended only when that option is on.
            # No trailing newline is written in the visible code.
            if summary:
                sfn = get_build_consensus_summary_filename(data_folder, adaID,
                                                           fragment, iterative=False)
                with open(sfn, 'w') as f:
                    f.write('Call: python build_consensus.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --block-length '+str(block_len_initial)+\
                            ' --reads-per-alignment '+str(n_reads_per_ali)+\
                            ' --verbose '+str(VERBOSE))
                    if store_allele_counts:
                        f.write(' --allele-counts')
Beispiel #38
0
        # Translate the sample's PCR number into the one-letter suffix used
        # in fragment names: 1 -> 'o', 2 -> 'i'; anything else is an error
        PCR = int(sample.PCR)
        if PCR == 1:
            PCR_suffix = 'o'
        elif PCR ==2:
            PCR_suffix = 'i'
        else:
            raise ValueError('PCR should be only 1 or 2')

        # Fragment names are 'F' + region + suffix, one for each
        # space-separated entry of sample.regions
        fragments = [str('F'+fr+PCR_suffix) for fr in sample.regions.split(' ')]
        adaID = sample.adapter

        # If requested, resubmit this script to the cluster for this adapter
        # and do no local work for it. Combining submission with
        # include_tests is rejected with a ValueError.
        if submit:
            if include_tests:
                raise ValueError('Tests require an interactive shell')
            fork_self(seq_run, adaID, VERBOSE=VERBOSE, maxreads=maxreads,
                      minisize=minisize, summary=summary)
            continue

        make_output_folders(data_folder, adaID, VERBOSE=VERBOSE)

        # In summary mode, start a fresh summary file ('w' truncates any
        # previous one) recording the equivalent command line. --maxreads is
        # logged only when it differs from -1, --include_tests only when set.
        if summary:
            with open(get_divide_summary_filename(data_folder, adaID), 'w') as f:
                f.write('Call: python trim_and_divide.py --run '+seq_run+\
                        ' --adaIDs '+adaID+\
                        ' --minisize '+str(minisize)+\
                        ' --verbose '+str(VERBOSE))
                if maxreads != -1:
                    f.write(' --maxreads '+str(maxreads))
                if include_tests:
                    f.write(' --include_tests')
                f.write('\n')
Beispiel #39
0
                        help='Fork the job to the cluster via qsub')
    # --no-summary is a store_false flag writing to `summary`, so summaries
    # are on unless the flag is given
    parser.add_argument('--no-summary',
                        action='store_false',
                        dest='summary',
                        help='Do not save results in a summary file')

    # Copy the parsed options into local names
    args = parser.parse_args()
    seq_run = args.run
    VERBOSE = args.verbose
    maxreads = args.maxreads
    submit = args.submit
    summary = args.summary

    # If submit, outsource to the cluster
    # (the script exits right after forking, so nothing below runs locally)
    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, summary=summary)
        sys.exit()

    # Specify the dataset
    # (looked up by run name in MiSeq_runs; the entry is indexed like a
    # mapping to obtain its 'folder')
    dataset = MiSeq_runs[seq_run]
    data_folder = dataset['folder']

    # In summary mode, start a fresh summary file ('w' truncates any previous
    # one) recording the equivalent command line
    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'w') as f:
            f.write('Call: python demultiplex.py --run ' + seq_run +
                    ' --verbose ' + str(VERBOSE) + '\n')

    adapters_designed = get_adapters_designed(dataset,
                                              VERBOSE=VERBOSE,
                                              summary=summary)
        samples = samples.loc[samples.adapter.isin(adaIDs)]
    # At high verbosity, list the names of the selected samples
    if VERBOSE >= 2:
        print samples.index.tolist()

    # Iterate over all adaIDs
    for samplename, sample in samples.iterrows():
        # The adapter ID is converted to a string before use
        adaID = str(sample.adapter)

        # If requested, resubmit this script to the cluster for this adapter,
        # forwarding all options, and do no local work for it
        if submit:
            fork_self(seq_run,
                      adaID,
                      VERBOSE=VERBOSE,
                      threads=threads,
                      reference=refname,
                      summary=summary,
                      trimmed=use_trimmed,
                      subsrate=subsrate,
                      gapopen=gapopen,
                      gapextend=gapextend,
                      maxreads=maxreads)
            continue

        # Output folders are prepared before anything is written for this
        # adapter
        make_output_folders(data_folder,
                            adaID,
                            VERBOSE=VERBOSE,
                            summary=summary)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID),
                      'w') as f:
    # Command line options. --no-savefig is a store_false flag writing to
    # `savefig`, so the figure is saved unless the flag is given.
    parser.add_argument("--adaID", required=True, help="Adapter ID to analyze (e.g. TS2)")
    parser.add_argument("--verbose", type=int, default=0, help="Verbosity level [0-3]")
    parser.add_argument("--maxreads", type=int, default=-1, help="Maximal number of reads to analyze")
    parser.add_argument("--submit", action="store_true", default=False, help="Fork the job to the cluster via qsub")
    parser.add_argument("--no-savefig", action="store_false", dest="savefig", help="Show figure instead of saving it")

    # Copy the parsed options into local names
    args = parser.parse_args()
    seq_run = args.run
    VERBOSE = args.verbose
    submit = args.submit
    maxreads = args.maxreads
    adaID = args.adaID
    savefig = args.savefig

    # In submit mode, fork to the cluster and exit without local work.
    # NOTE(review): `adaID` is a required option but is not forwarded to
    # fork_self here -- confirm the forked job still receives it.
    if submit:
        fork_self(seq_run, VERBOSE=VERBOSE, maxreads=maxreads, savefig=savefig)
        sys.exit()

    dataset = load_sequencing_run(seq_run)
    data_folder = dataset.folder
    # Read length is taken as half the run's cycle count (integer division);
    # presumably because the run is paired-end -- TODO confirm
    read_len = dataset.cycles // 2

    # Prefer the gzipped read files; fall back to the uncompressed names if
    # the first gzipped file does not exist
    reads_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(reads_filenames[0]):
        reads_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    title = seq_run + ", " + adaID

    # `randomreads` is switched on only when a positive maxreads was given
    quality = quality_score_along_reads(
        read_len, reads_filenames, randomreads=(maxreads >= 1), maxreads=maxreads, VERBOSE=VERBOSE
    )
    # Remaining parsed options
    countmin = args.countmin
    submit = args.submit
    use_plot = args.plot
    use_save = args.save

    # Load the patient table and, if names were given, keep only those rows
    # (selected by index label)
    patients = load_patients()
    if pnames is not None:
        patients = patients.loc[pnames]

    # Not filled in the visible part of the loop -- presumably results are
    # appended further down (TODO confirm)
    data = []
    for pname, patient in patients.iterrows():
        if VERBOSE >= 1:
            print patient.code, start, end

        # In submit mode, delegate this patient to fork_self (window
        # parameters are forwarded positionally) and do no local work for it
        if submit:
            fork_self(patient.code, width, gap, start, end, VERBOSE=VERBOSE,
                      freqmin=freqmin, countmin=countmin)
            continue

        # Wrap the table row in Patient and take the length of its
        # 'genomewide' reference
        patient = Patient(patient)
        ref = patient.get_reference('genomewide')
        L = len(ref)

        # Walk over windows of at most `width` positions, starting at
        # `start`; each window end is clipped to both `end` and the reference
        # length. The update of `win_start` is outside the visible code.
        win_start = start
        while win_start + width - gap < min(L, end):
            win_end = min(win_start + width, end, L)

            if VERBOSE >= 1:
                print patient.code, win_start, win_end
    
            if VERBOSE >= 2:
                print 'Get region haplotypes'
        # Handle every fragment resolved for this sample
        for fragment in fragments_sample:
            if VERBOSE >= 1:
                print fragment

            # There is a blacklist of samples which are probably contaminated,
            # we want to discard those altogether.
            # The check is a containment test of `fragment` in the sample's
            # 'suspected contamination' entry; null entries never match.
            contstr = sample['suspected contamination']
            if pd.notnull(contstr) and (fragment in contstr):
                print 'WARNING: This sample has a suspected contamination! Skipping.'
                continue

            # If requested, resubmit this script to the cluster for this
            # fragment (forwarding the filtering parameters unchanged) and do
            # no local work for it
            if submit:
                fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE, summary=summary,
                          maxreads=maxreads,
                          max_mismatches=max_mismatches, susp_mismatches=susp_mismatches)
                continue

            # In summary mode, start a fresh summary file ('w' truncates any
            # previous one) recording the equivalent command line; --maxreads
            # is logged only when it is positive. No trailing newline is
            # written in the visible code.
            if summary:
                sfn = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
                with open(sfn, 'w') as f:
                    f.write('Call: python filter_mapped_reads.py'+\
                            ' --run '+seq_run+\
                            ' --adaIDs '+adaID+\
                            ' --fragments '+fragment+\
                            ' --max-mismatches '+str(max_mismatches)+\
                            ' --suspicious-mismatches '+str(susp_mismatches)+\
                            ' --verbose '+str(VERBOSE))
                    if maxreads > 0:
                        f.write(' --maxreads '+str(maxreads))