Example #1
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip or args.is_multi_genome:
        return

    chunk_out = chunk_outs[0]
    cr_utils.copy(chunk_out.pca_h5, outs.pca_h5)
    cr_utils.copytree(chunk_out.pca_csv, outs.pca_csv)
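Many of the join() stages in this listing follow the same pattern: the stage ran as a single chunk, so join() simply promotes that chunk's files to the stage-level outs. The cr_utils helpers are defined elsewhere in Cell Ranger; a minimal sketch of what copy and copytree presumably do, assuming thin wrappers over shutil:

import shutil

# Hypothetical sketch, not the actual cr_utils implementation: copy a
# file, or a directory tree, to the path Martian has allocated for the
# out (corner cases such as pre-existing destinations are ignored here).
def copy(src, dest):
    shutil.copyfile(src, dest)

def copytree(src, dest):
    shutil.copytree(src, dest)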
Example #2
def main(args, outs):
    if args.skip:
        return

    if args.random_seed is not None:
        np.random.seed(args.random_seed)

    # detect barnyard
    genomes = cr_matrix.GeneBCMatrices.load_genomes_from_h5(args.matrix_h5)
    if len(genomes) > 1:
        outs.is_multi_genome = True
        cr_utils.copy(args.matrix_h5, outs.preprocessed_matrix_h5)
        return
    else:
        outs.is_multi_genome = False

    genome = genomes[0]
    matrix = cr_matrix.GeneBCMatrices.load_h5(
        args.matrix_h5).get_matrix(genome)
    matrix = cr_matrix.GeneBCMatrix.preprocess_matrix(
        matrix,
        num_bcs=args.num_bcs,
        use_bcs=args.use_bcs,
        use_genes=args.use_genes,
        force_cells=args.force_cells)

    gbm = cr_matrix.GeneBCMatrices()
    gbm.matrices[genome] = matrix
    matrix_attrs = cr_matrix.get_matrix_attrs(args.matrix_h5)
    gbm.save_h5(outs.preprocessed_matrix_h5, extra_attrs=matrix_attrs)
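The np.random.seed call above is what makes any downstream steps that draw random numbers reproducible across runs. A minimal demonstration of the property being relied on:

import numpy as np

# Seeding the global RNG makes subsequent draws deterministic.
np.random.seed(0)
a = np.random.rand(3)
np.random.seed(0)
b = np.random.rand(3)
assert np.array_equal(a, b)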
Example #3
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    chunk_out = chunk_outs[0]
    cr_utils.copy(chunk_out.preprocessed_matrix_h5,
                  outs.preprocessed_matrix_h5)
    outs.is_multi_genome = chunk_out.is_multi_genome
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip or not args.is_multi_genome:
        return

    chunk_out = chunk_outs[0]
    cr_utils.copy(chunk_out.multi_genome_summary, outs.multi_genome_summary)
    cr_utils.copytree(chunk_out.multi_genome_csv, outs.multi_genome_csv)
    cr_utils.copytree(chunk_out.multi_genome_json, outs.multi_genome_json)
Example #5
def main(args, outs):
    parsed = parse_parameters(args.params_csv)
    for param in ANALYSIS_PARAMS:
        if param in parsed:
            setattr(outs, param, parsed[param])
        else:
            setattr(outs, param, None)

    if args.params_csv is not None:
        cr_utils.copy(args.params_csv, outs.params_csv)
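parse_parameters and ANALYSIS_PARAMS are defined elsewhere in this stage. A plausible sketch of the parser, assuming a two-column "name,value" CSV (this is an assumption, not the actual implementation):

import csv

# Hypothetical sketch: read "name,value" rows into a dict; a missing
# CSV yields no overrides, so every ANALYSIS_PARAMS entry above falls
# through to None.
def parse_parameters(params_csv):
    if params_csv is None:
        return {}
    with open(params_csv) as f:
        return dict((row[0].strip(), row[1].strip())
                    for row in csv.reader(f) if len(row) >= 2)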
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    cr_utils.copy(args.extract_reads_summary, outs.summary)
    cr_utils.copy(args.barcode_counts, outs.barcode_counts)

    outs.gem_groups = args.gem_groups
    outs.read_groups = args.read_groups
    outs.align = args.align
    outs.bam_comments = args.bam_comments

    outs.read1s = [co.read1s for co in chunk_outs]
    outs.read2s = [co.read2s for co in chunk_outs]
Example #7
def get_gem_group_index_json(args, outs):
    if args.gem_group_index_json:
        cr_utils.copy(args.gem_group_index_json, outs.gem_group_index_json)
    else:
        generated_index = cr_matrix.get_gem_group_index(
            args.filtered_gene_bc_matrices_h5)
        if generated_index:
            with open(outs.gem_group_index_json, 'w') as outfile:
                tk_json.dump_numpy({"gem_group_index": generated_index},
                                   outfile)
    return outs.gem_group_index_json
Example #8
def write_genome_fasta(self, out_fasta_fn):
    if len(self.genomes) > 1:
        with open(out_fasta_fn, 'w') as f:
            for genome_prefix, in_fasta_fn in itertools.izip(self.genome_prefixes, self.in_fasta_fns):
                with open(in_fasta_fn, 'r') as g:
                    for line in g:
                        line = line.strip()
                        if line.startswith('>'):
                            line = '>' + genome_prefix + '_' + line[1:]
                        f.write(line + '\n')
    else:
        cr_utils.copy(self.in_fasta_fns[0], out_fasta_fn)
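For multi-genome ("barnyard") references, the method prefixes each FASTA header with its genome name so contigs from different genomes cannot collide. A worked example of the rewrite:

# A header from the hg19 component of a barnyard reference:
line = '>chr1 AC:CM000663.2'
genome_prefix = 'hg19'
print('>' + genome_prefix + '_' + line[1:])  # >hg19_chr1 AC:CM000663.2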
Example #9
def main(args, outs):
    if args.read1 is not None:
        # Ensure same extension
        out_path, _ = cr_utils.splitexts(outs.read1s)
        _, in_ext = cr_utils.splitexts(args.read1)
        outs.read1s = out_path + in_ext
        cr_utils.copy(args.read1, outs.read1s)
    if args.read2 is not None:
        out_path, _ = cr_utils.splitexts(outs.read2s)
        _, in_ext = cr_utils.splitexts(args.read2)
        outs.read2s = out_path + in_ext
        cr_utils.copy(args.read2, outs.read2s)
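cr_utils.splitexts is not shown here; the way it is used above, carrying a possibly compound extension such as .fastq.gz from input to output, suggests it peels off the whole extension chain, unlike os.path.splitext. A sketch under that assumption:

import os

# Hypothetical sketch: 'reads.fastq.gz' -> ('reads', '.fastq.gz'),
# whereas os.path.splitext would return ('reads.fastq', '.gz').
def splitexts(path):
    base, exts = path, ''
    while True:
        base, ext = os.path.splitext(base)
        if not ext:
            return base, exts
        exts = ext + exts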
Example #10
def join(args, outs, chunk_defs, chunk_outs):
    summary_files = [
        args.reads_summary,
        args.filter_umis_summary,
        args.filter_barcodes_summary,
        args.trim_reads_summary,
        args.filter_reads_summary,
        args.filter_contigs_summary,
        args.report_contigs_summary,
        args.report_contig_alignments_summary,
        args.raw_consensus_summary,
        args.group_clonotypes_summary,
    ]

    summary_files = [sum_file for sum_file in summary_files if sum_file is not None]

    cr_report.merge_jsons(summary_files, outs.metrics_summary_json)

    # Copy barcode summary h5
    if args.barcode_summary:
        cr_utils.copy(args.barcode_summary, outs.barcode_summary)

    # Copy cell barcodes
    if args.cell_barcodes:
        cr_utils.copy(args.cell_barcodes, outs.cell_barcodes)

    # Copy barcode support
    if args.barcode_support:
        cr_utils.copy(args.barcode_support, outs.barcode_support)

    # Copy barcode umi summary
    if args.barcode_umi_summary:
        cr_utils.copy(args.barcode_umi_summary, outs.barcode_umi_summary)

    # Copy umi info
    if args.umi_info:
        cr_utils.copy(args.umi_info, outs.umi_info)

    sample_data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.metrics_summary_json,
        barcode_summary_path=args.barcode_summary,
        vdj_clonotype_summary_path=args.clonotype_summary,
        vdj_barcode_support_path=args.barcode_support,
    )

    sample_properties = cr_webshim.get_sample_properties(args.sample_id, args.sample_desc, [], version=martian.get_pipelines_version())

    sample_data = cr_webshim.load_sample_data(sample_properties, sample_data_paths)

    if args.barcode_whitelist is not None:
        cr_webshim.build_web_summary_html(outs.web_summary, sample_properties, sample_data, PIPELINE_VDJ,
                                          alerts_output_filename=outs.alerts)
        cr_webshim.build_metrics_summary_csv(outs.metrics_summary_csv, sample_properties, sample_data, PIPELINE_VDJ)
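cr_report.merge_jsons collapses the per-stage metric JSONs into the single metrics_summary_json consumed by the web-summary builder above. A minimal sketch of the merge semantics assumed here (flat dicts, later files winning on key collisions):

import json

# Assumed semantics, not the actual cr_report implementation.
def merge_jsons(in_filenames, out_filename):
    merged = {}
    for filename in in_filenames:
        with open(filename) as f:
            merged.update(json.load(f))
    with open(out_filename, 'w') as f:
        json.dump(merged, f, indent=4, sort_keys=True)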
Example #11
def join(args, outs, chunk_defs, chunk_outs):
    # Copy files from single chunk to join
    for out_name in ['summary',
                     'contig_annotations',
                     'filtered_contig_fasta',
                     'filtered_contig_fastq',
    ]:
        src = getattr(chunk_outs[0], out_name)
        dest = getattr(outs, out_name)
        if os.path.isfile(src):
            cr_utils.copy(src, dest)
        else:
            setattr(outs, out_name, None)
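The getattr/setattr pattern above generalizes to any list of out names; a small helper (hypothetical, not part of the source) that factors it out:

import os

# Promote each named chunk out to the join-level outs, nulling any out
# the chunk never produced.
def copy_chunk_outs(chunk_out, outs, out_names):
    for out_name in out_names:
        src = getattr(chunk_out, out_name)
        if src is not None and os.path.isfile(src):
            cr_utils.copy(src, getattr(outs, out_name))
        else:
            setattr(outs, out_name, None)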
Example #12
def join(args, outs, chunk_defs, chunk_outs):
    chunk_out = chunk_outs[0]

    cr_utils.copy(chunk_out.web_summary, outs.web_summary)
    cr_utils.copy(chunk_out.alerts, outs.alerts)
    cr_utils.copy(chunk_out.metrics_summary_json, outs.metrics_summary_json)
    cr_utils.copy(chunk_out.metrics_summary_csv, outs.metrics_summary_csv)
Example #13
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        outs.analysis = None
        outs.analysis_csv = None
        outs.summary = None
        return

    chunk_out = chunk_outs[0]
    cr_utils.copytree(chunk_out.analysis, outs.analysis)
    cr_utils.copytree(chunk_out.analysis_csv, outs.analysis_csv)

    if args.is_multi_genome:
        cr_utils.copy(args.multi_genome_summary, outs.summary)
    else:
        outs.summary = None
Example #14
def main(args, outs):
    # NOOP if no vdj ref path specified
    if args.vdj_reference_path is None:
        outs.recombinome = None
        outs.recombinome_index = None
        return

    fasta_filename = vdj_reference.get_vdj_reference_fasta(
        args.vdj_reference_path)
    cr_utils.copy(fasta_filename, outs.recombinome)
    os.makedirs(outs.recombinome_index)

    # Build a bowtie2 index
    subprocess.check_call([
        'bowtie2-build', outs.recombinome,
        os.path.join(outs.recombinome_index, 'recombinome')
    ])
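bowtie2-build writes its index as a set of .bt2 files under the basename passed as its second argument, which is why recombinome_index is a directory rather than a single file. A quick post-condition check that could run inside the same stage (for small references; very large ones would use a .bt2l suffix):

import os

index_prefix = os.path.join(outs.recombinome_index, 'recombinome')
expected = ['%s.%s.bt2' % (index_prefix, suffix)
            for suffix in ('1', '2', '3', '4', 'rev.1', 'rev.2')]
assert all(os.path.exists(fn) for fn in expected)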
Example #15
def join(args, outs, chunk_defs, chunk_outs):
    downsample = chunk_defs[0].downsample
    downsample_map = chunk_defs[0].downsample_map
    if downsample and len(downsample_map) > 1:
        input_h5_filenames = [
            chunk_out.out_molecules for chunk_out in chunk_outs
        ]
        cr_mol_counter.MoleculeCounter.concatenate(outs.out_molecules,
                                                   input_h5_filenames)
    else:
        # just copy input molecules
        cr_utils.copy(args.molecules, outs.out_molecules)

    # merge summaries
    summary = merge_summaries(chunk_outs)
    summary['downsample_info'] = downsample_map
    with open(outs.summary, 'w') as f:
        json.dump(summary, f, indent=4, sort_keys=True)
Example #16
def join(args, outs, chunk_defs, chunk_outs):
    chunk_out = chunk_outs[0]
    cr_utils.copy(chunk_out.summary, outs.summary)
    cr_utils.copy(chunk_out.filtered_matrices_h5, outs.filtered_matrices_h5)
    cr_utils.copy(chunk_out.filtered_barcodes, outs.filtered_barcodes)
    cr_utils.copytree(chunk_out.filtered_matrices_mex,
                      outs.filtered_matrices_mex)
Example #17
def join(args, outs, chunk_defs, chunk_outs):
    summary_files = [
        args.extract_reads_summary,
        args.correct_barcodes_summary,
    ]

    summary_files = [sum_file for sum_file in summary_files if sum_file is not None]

    cr_report.merge_jsons(summary_files, outs.summary)

    cr_utils.copy(args.raw_barcode_counts, outs.raw_barcode_counts)
    cr_utils.copy(args.corrected_barcode_counts, outs.corrected_barcode_counts)
    cr_utils.copy(args.barcode_summary, outs.barcode_summary)
    outs.gem_groups = args.gem_groups
    outs.read_groups = args.read_groups
    outs.align = args.align
    outs.bam_comments = args.bam_comments

    outs.bc_corrected_read1s = [out.bc_corrected_read1s for out in chunk_outs]
    outs.bc_corrected_read2s = [out.bc_corrected_read2s for out in chunk_outs]
Example #18
def main(args, outs):
    if args.read1s is not None:
        cr_utils.copy(args.read1s, outs.bc_corrected_read1s)
    if args.read2s is not None:
        cr_utils.copy(args.read2s, outs.bc_corrected_read2s)
Example #19
def join(args, outs, chunk_defs, chunk_outs):
    outs.chain_type = chunk_outs[0].chain_type
    cr_utils.copy(chunk_outs[0].summary, outs.summary)
Example #20
def main(args, outs):
    reporter = vdj_report.VdjReporter(
        vdj_reference_path=args.vdj_reference_path)
    gene_umi_counts_per_bc = {}

    strand = cr_chem.get_strandedness(args.chemistry_def)

    paired_end = cr_chem.is_paired_end(args.chemistry_def)
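    # read2_chunk must be present exactly when the chemistry is paired-end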
    assert paired_end != (args.read2_chunk is None)

    # For the entire chunk, match reads against the V(D)J reference
    ref_fasta = vdj_reference.get_vdj_reference_fasta(args.vdj_reference_path)

    # The filtering code will write this bam. Then we'll read it, correct the UMIs
    # and write outs.chunked_bams.
    filter_bam = martian.make_path('tmp.bam')

    vdj_filt.run_read_match(args.read1_chunk, args.read2_chunk, ref_fasta,
                            filter_bam, strand, args.sw_params)

    # Make two passes over the BAM file, processing one barcode at a time
    bam1 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bam2 = pysam.AlignmentFile(filter_bam, check_sq=False)
    bc_iter1 = get_bc_grouped_pair_iter(bam1, paired_end)
    bc_iter2 = get_bc_grouped_pair_iter(bam2, paired_end)

    reads_per_bc = open(outs.reads_per_bc, 'w')
    out_bam, _ = tk_bam.create_bam_outfile(outs.barcode_chunked_bams,
                                           None,
                                           None,
                                           template=bam1)

    for (bc, pair_iter1), (_, pair_iter2) in itertools.izip(bc_iter1, bc_iter2):
        nreads = 0

        # Pass 1: UMI correction
        umi_counts = defaultdict(int)
        for header, (read1, read2) in pair_iter1:
            nreads += 2
            umi_counts[header.get_tag(cr_constants.RAW_UMI_TAG)] += 1

        corrected_umis = correct_umis(umi_counts)

        # Pass 2: Write the UMI-corrected records
        process_bam_barcode(bam1, pair_iter2, bc, corrected_umis, reporter,
                            gene_umi_counts_per_bc, strand, out_bam,
                            paired_end)

        reads_per_bc.write('{}\t{}\n'.format(bc, nreads))

    bam1.close()
    bam2.close()
    out_bam.close()

    # Write bc-gene-umi counts
    with open(outs.chunked_gene_umi_counts, 'w') as f:
        cPickle.dump(gene_umi_counts_per_bc, f)

    # Copy the input barcodes
    if args.barcodes_chunk is not None:
        cr_utils.copy(args.barcodes_chunk, outs.barcodes_in_chunks)
    else:
        outs.barcodes_in_chunks = None

    reporter.save(outs.chunked_reporter)
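correct_umis is defined elsewhere in this stage. A common correction scheme, assumed here purely for illustration and not taken from the source, maps each UMI to its most abundant neighbor within Hamming distance 1:

# Hypothetical sketch of correct_umis.
def correct_umis(umi_counts):
    corrected = {}
    for umi, count in umi_counts.iteritems():
        best_umi, best_count = umi, count
        for i in xrange(len(umi)):
            for base in 'ACGT':
                if base == umi[i]:
                    continue
                neighbor = umi[:i] + base + umi[i + 1:]
                if umi_counts.get(neighbor, 0) > best_count:
                    best_umi = neighbor
                    best_count = umi_counts[neighbor]
        corrected[umi] = best_umi
    return corrected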
Example #21
def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)

        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)

        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_utils.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta,
                              shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'
    else:
        outs.contig_fasta_fai = None

    cr_utils.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_utils.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_utils.concatenate_headered_files(outs.umi_summary_tsv,
                                            umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        #  risks hitting the filehandle limit.
        n_merged = 0

        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]

            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1

            print "Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam)
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_utils.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_utils.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_utils.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_utils.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
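The batched merge above bounds the number of BAMs open at once at MERGE_BAMS_N per samtools invocation. A dry run with MERGE_BAMS_N = 2 and five chunks shows how the queue shrinks to a single merged file:

MERGE_BAMS_N = 2
bams = ['chunk0.bam', 'chunk1.bam', 'chunk2.bam', 'chunk3.bam', 'chunk4.bam']
n_merged = 0
while len(bams) > 1:
    to_merge, bams = bams[:MERGE_BAMS_N], bams[MERGE_BAMS_N:]
    bams.append('merged-%04d.bam' % n_merged)  # stand-in for tk_bam.merge
    n_merged += 1
assert bams == ['merged-0003.bam']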
Example #22
def join(args, outs, chunk_defs, chunk_outs):
    cr_utils.copy(chunk_outs[0].summary, outs.summary)
Example #23
def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path,
                                       genome_fasta_path, reference_path,
                                       reference_name, ref_version,
                                       mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated. No attempt to merge/reconcile information
    across them is made. Providing the files in a different order might change the
    output in cases where there are multiple entries with the same transcript id
    and the same feature type (e.g., V-region).
    """

    transcripts = collections.defaultdict(list)

    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set(line.strip() for line in f)
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    #       do not support symlinks.
    print 'Copying genome reference sequence...'
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_utils.copy(genome_fasta_path, tmp_genome_fa_path)
    print '...done.\n'

    print 'Indexing genome reference sequence...'
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print '...done.\n'

    print 'Loading genome reference sequence...'
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print '...done.\n'

    print 'Computing hash of genome FASTA file...'
    fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path)
    print '...done.\n'

    for gtf in gtf_paths:
        print 'Reading GTF {}'.format(gtf)

        for line_no, entry in enumerate(get_gtf_iter(open(gtf))):
            if entry.feature not in [
                    ENSEMBL_FIVE_PRIME_UTR_FEATURE, ENSEMBL_CDS_FEATURE
            ]:
                continue
            entry = parse_attributes(entry)
            transcript_id = entry.attributes.get('transcript_id')
            transcript_biotype = entry.attributes.get('transcript_biotype')
            gene_biotype = entry.attributes.get('gene_biotype')
            gene_name = entry.attributes.get('gene_name')

            # Skip irrelevant biotypes
            if transcript_biotype not in ENSEMBL_VDJ_BIOTYPES and gene_biotype not in ENSEMBL_VDJ_BIOTYPES:
                continue

            # Skip blacklisted transcript IDs
            if transcript_id in rm_transcripts:
                continue

            # Warn and skip if transcript_id missing
            if transcript_id is None:
                print 'Warning: Entry on row %d has no transcript_id' % line_no
                continue

            # Warn and skip if gene_name missing
            if gene_name is None:
                print 'Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (
                    transcript_id, line_no, transcript_biotype)
                continue

            # Infer region type from biotype
            if transcript_biotype in ENSEMBL_VDJ_BIOTYPES:
                vdj_feature = infer_ensembl_vdj_feature_type(
                    entry.feature, transcript_biotype)
            else:
                vdj_feature = infer_ensembl_vdj_feature_type(
                    entry.feature, gene_biotype)

            # Warn and skip if region type could not be inferred
            if vdj_feature is None:
                print 'Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (
                    transcript_id, transcript_biotype)
                continue

            # Features that share a transcript_id and feature type are presumably exons
            # so keep them together.
            transcripts[(transcript_id, vdj_feature)].append(entry)

        print '...done.\n'

    print 'Computing hash of genes GTF files...'
    digest = hashlib.sha1()
    # concatenate all the hashes into a string and then hash that string
    digest.update(''.join(
        cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths))
    gtf_hash = digest.hexdigest()
    print '...done.\n'

    print 'Fetching sequences...'
    out_fasta = open(get_vdj_reference_fasta(reference_path), 'w')

    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        if not all(r.chrom == regions[0].chrom for r in regions):
            chroms = sorted(set(r.chrom for r in regions))
            print 'Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                transcript_id, str(chroms))
            continue

        if not all(r.strand == regions[0].strand for r in regions):
            print 'Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id
            continue

        chrom = regions[0].chrom
        strand = regions[0].strand
        ens_gene_name = standardize_ensembl_gene_name(
            regions[0].attributes['gene_name'])
        transcript_id = regions[0].attributes['transcript_id']

        if chrom not in genome_fasta:
            print 'Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                transcript_id, chrom)
            continue

        # Build sequence
        regions.sort(key=lambda r: r.start)
        seq = ''
        for region in regions:
            # GTF coordinates are 1-based
            start, end = int(region.start) - 1, int(region.end)
            seq += genome_fasta.fetch(chrom, start, end)

        # Revcomp if transcript on reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print 'Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                (ens_gene_name, transcript_id, region_type))
            seq = seq.strip('N')

        if len(seq) == 0:
            print 'Warning: Feature %s is all Ns. Skipping.' % str(
                (ens_gene_name, transcript_id, region_type))
            continue

        # Infer various attributes from the Ensembl gene name
        record_id = transcript_id
        gene_name = ens_gene_name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' %
                             region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' %
                             gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' %
                             record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print ('Warning: Could not infer chain type for: %s. '
                   'Expected the first two characters of the gene name to be in %s. '
                   'Feature skipped.') % \
                (str((gene_name, record_id, region_type)),
                 str(tuple(vdj_constants.VDJ_CHAIN_TYPES)))
            continue

        if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and chain in vdj_constants.CHAINS_WITH_ISOTYPES:
            isotype = infer_ensembl_isotype(ens_gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(
            feature_id=feature_id,
            record_id=record_id,
            display_name=display_name,
            gene_name=gene_name,
            region_type=region_type,
            chain_type=chain_type,
            chain=chain,
            isotype=isotype,
            allele_name=allele_name,
            sequence=seq,
        )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print 'Warning: Skipping duplicate entry for %s (%s, %s).' % (
                display_name, region_type, record_id)
            continue
        seen_features.add(feat_key)

        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    print '...done.\n'

    print 'Deleting copy of genome fasta...'
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print '...done.\n'

    print 'Writing metadata JSON file into reference folder...'
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY:
        reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY:
        fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY:
        gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY:
        os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY:
        ','.join([os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY:
        ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY:
        mkref_version,
        cr_constants.REFERENCE_TYPE_KEY:
        vdj_constants.REFERENCE_TYPE,
    }
    with open(
            os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE),
            'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print '...done.\n'
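cr_utils.compute_hash_of_file is used above for both the FASTA hash and the combined GTF hash. A sketch of the assumed behavior, streaming the file through sha1 (the digest used for the GTF combination; the actual function may differ):

import hashlib

# Assumed implementation, for illustration only.
def compute_hash_of_file(filename, block_size=2 ** 20):
    digest = hashlib.sha1()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(block_size), ''):
            digest.update(chunk)
    return digest.hexdigest()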
Example #24
def main(args, outs):
    if args.summary is not None:
        cr_utils.copy(args.summary, outs.summary)
    if args.barcodes_detected is not None:
        cr_utils.copy(args.barcodes_detected, outs.barcodes_detected)
Example #25
def join(args, outs, chunk_defs, chunk_outs):
    if chunk_outs[0].output_for_cloupe is None:
        # Set output to null if noloupe is set, or if we ran on a barnyard
        outs.output_for_cloupe = None
    else:
        cr_utils.copy(chunk_outs[0].output_for_cloupe, outs.output_for_cloupe)
Example #26
def join(args, outs, chunk_defs, chunk_outs):
    cr_utils.copy(chunk_outs[0].summary, outs.summary)
    if chunk_outs[0].report is not None:
        cr_utils.copy(chunk_outs[0].report, outs.report)
    outs.chemistry_type = chunk_outs[0].chemistry_type
Example #27
def main(args, outs):
    cr_utils.copy(args.trim_reads_summary, outs.summary)
Example #28
def join(args, outs, chunk_defs, chunk_outs):
    chunk_out = chunk_outs[0]

    cr_utils.copy(chunk_out.web_summary, outs.web_summary)
    cr_utils.copy(chunk_out.summary, outs.summary)
Example #29
def join(args, outs, chunk_defs, chunk_outs):
    cr_utils.copy(chunk_outs[0].cell_barcodes, outs.cell_barcodes)
    cr_utils.copy(chunk_outs[0].barcode_support, outs.barcode_support)
    cr_utils.copy(chunk_outs[0].summary, outs.summary)
    cr_utils.copy(chunk_outs[0].barcode_umi_summary, outs.barcode_umi_summary)
Example #30
def join(args, outs, chunk_defs, chunk_outs):
    outs.min_readpairs_per_umi = chunk_outs[0].min_readpairs_per_umi
    cr_utils.copy(chunk_outs[0].cell_barcodes, outs.cell_barcodes)
    cr_utils.copy(chunk_outs[0].barcode_support, outs.barcode_support)
    cr_utils.copy(chunk_outs[0].summary, outs.summary)
    cr_utils.copy(chunk_outs[0].barcode_umi_summary, outs.barcode_umi_summary)