Ejemplo n.º 1
0
    def build_reference(self):
        """Assemble a complete reference folder under ``self.out_dir``.

        Runs these steps in order, printing progress to stdout:

        1. create the output folder,
        2. write the genome FASTA and hash it,
        3. write the genes GTF and hash it,
        4. write the gene index,
        5. dump the reference metadata as JSON,
        6. build the STAR genome index.

        Every folder is created with ``os.mkdir``, so an output folder (or
        sub-folder) that already exists raises ``OSError``.
        """

        def in_fresh_subdir(relative_path):
            # Resolve a path inside the reference folder and create its parent.
            target = os.path.join(self.out_dir, relative_path)
            os.mkdir(os.path.dirname(target))
            return target

        print("Creating new reference folder at %s" % self.out_dir)
        os.mkdir(self.out_dir)
        print("...done\n")

        print("Writing genome FASTA file into reference folder...")
        fasta_path = in_fresh_subdir(cr_constants.REFERENCE_FASTA_PATH)
        self.write_genome_fasta(fasta_path)
        print("...done\n")

        print("Computing hash of genome FASTA file...")
        fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
        print("...done\n")

        print("Writing genes GTF file into reference folder...")
        gtf_path = in_fresh_subdir(cr_constants.REFERENCE_GENES_GTF_PATH)
        self.write_genome_gtf(gtf_path)
        print("...done\n")

        print("Computing hash of genes GTF file...")
        gtf_hash = cr_utils.compute_hash_of_file(gtf_path)
        print("...done\n")

        print("Writing genes index file into reference folder (may take over 10 minutes for a 3Gb genome)...")
        index_path = in_fresh_subdir(cr_constants.REFERENCE_GENES_INDEX_PATH)
        self.write_genome_gene_index(index_path, gtf_path, fasta_path)
        print("...done\n")

        print("Writing genome metadata JSON file into reference folder...")
        # The thread count stored in the metadata is derived from the memory
        # budget: one thread per 8 GB, rounded up.
        metadata_threads = int(math.ceil(float(self.mem_gb) / 8.0))
        input_fasta_names = [os.path.basename(fn) for fn in self.in_fasta_fns]
        input_gtf_names = [os.path.basename(fn) for fn in self.in_gtf_fns]
        metadata = {
            cr_constants.REFERENCE_GENOMES_KEY: self.genomes,
            cr_constants.REFERENCE_NUM_THREADS_KEY: metadata_threads,
            cr_constants.REFERENCE_MEM_GB_KEY: self.mem_gb,
            cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
            cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
            cr_constants.REFERENCE_INPUT_FASTA_KEY: input_fasta_names,
            cr_constants.REFERENCE_INPUT_GTF_KEY: input_gtf_names,
            cr_constants.REFERENCE_VERSION_KEY: self.ref_version,
            cr_constants.REFERENCE_MKREF_VERSION_KEY: self.mkref_version,
        }
        metadata_path = os.path.join(self.out_dir, cr_constants.REFERENCE_METADATA_FILE)
        with open(metadata_path, 'w') as metadata_file:
            sanitized = tk_safe_json.json_sanitize(metadata)
            json.dump(sanitized, metadata_file, sort_keys=True, indent=4)
        print("...done\n")

        print("Generating STAR genome index (may take over 8 core hours for a 3Gb genome)...")
        star_dir = os.path.join(self.out_dir, cr_constants.REFERENCE_STAR_PATH)
        star_index = STAR(star_dir)
        star_index.index_reference_with_mem_gb(fasta_path, gtf_path,
                                               num_threads=self.num_threads,
                                               mem_gb=self.mem_gb)
        print("...done.\n")

        print(">>> Reference successfully created! <<<\n")
        print("You can now specify this reference on the command line:")
        print("cellranger --transcriptome=%s ..." % self.out_dir)
Ejemplo n.º 2
0
def build_reference_fasta_from_fasta(fasta_path, reference_path,
                                     reference_name, ref_version,
                                     mkref_version):
    """Build cellranger-compatible V(D)J reference files from a segment FASTA.

    Each FASTA entry is parsed and validated, its sequence is stripped of
    leading/trailing Ns, and the accepted features are written to the
    reference FASTA under ``reference_path`` together with a metadata JSON
    file. Progress and warnings are printed to stdout.

    Entries are skipped (with a warning) when they duplicate an already
    accepted feature, consist only of Ns, or have a chain type outside
    ``vdj_constants.VDJ_CHAIN_TYPES``.

    Raises:
        ValueError: if a feature ID repeats one already accepted, or if the
            region type, gene name or record ID contains a space.
    """

    accepted = []
    used_ids = set()
    used_keys = set()

    print('Checking FASTA entries...')

    with open(fasta_path) as fasta_in:
        for header, sequence in cr_utils.get_fasta_iter(fasta_in):
            feat = parse_fasta_entry(header, sequence)

            # Feature IDs must be unique among the accepted entries
            if feat.feature_id in used_ids:
                raise ValueError(
                    'Duplicate feature ID found in input FASTA: %d.' %
                    feat.feature_id)

            # None of these fields may contain a space
            if ' ' in feat.region_type:
                raise ValueError('Spaces not allowed in region type: "%s"' %
                                 feat.region_type)
            if ' ' in feat.gene_name:
                raise ValueError('Spaces not allowed in gene name: "%s"' %
                                 feat.gene_name)
            if ' ' in feat.record_id:
                raise ValueError('Spaces not allowed in record ID: "%s"' %
                                 feat.record_id)

            dup_key = get_duplicate_feature_key(feat)
            if dup_key in used_keys:
                print('Warning: Skipping duplicate entry for %s (%s, %s).' % (
                    feat.display_name, feat.region_type, feat.record_id))
                continue

            # Shared identifier used by the warnings below
            label = str((feat.display_name, feat.record_id, feat.region_type))

            # Trim Ns off both ends of the sequence
            trimmed = feat.sequence
            if 'N' in trimmed:
                print('Warning: Feature %s contains Ns. Stripping from the ends.' % label)
                trimmed = trimmed.strip('N')

            if not trimmed:
                print('Warning: Feature %s is all Ns. Skipping.' % label)
                continue

            # Features with an unrecognised chain type are dropped
            if feat.chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                print('Warning: Unknown chain type for: %s. Expected name to be in %s. Skipping.' %
                      (label, str(tuple(vdj_constants.VDJ_CHAIN_TYPES))))
                continue

            used_ids.add(feat.feature_id)
            used_keys.add(dup_key)

            # Rebuild the feature so it carries the (possibly trimmed) sequence
            fields = dict(feat._asdict(), sequence=trimmed)
            accepted.append(VdjAnnotationFeature(**fields))
    print('...done.\n')

    print('Writing sequences...')
    out_fasta_path = get_vdj_reference_fasta(reference_path)
    os.makedirs(os.path.dirname(out_fasta_path))
    with open(out_fasta_path, 'w') as fasta_out:
        fasta_out.writelines(
            convert_vdj_feature_to_fasta_entry(item) + '\n'
            for item in accepted)
    print('...done.\n')

    print('Computing hash of input FASTA file...')
    fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
    print('...done.\n')

    print('Writing metadata JSON file into reference folder...')
    # No GTF is involved in this code path, so both GTF fields are null
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: None,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: None,
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    metadata_path = os.path.join(reference_path,
                                 cr_constants.REFERENCE_METADATA_FILE)
    with open(metadata_path, 'w') as json_file:
        sanitized = tk_safe_json.json_sanitize(metadata)
        json.dump(sanitized, json_file, sort_keys=True, indent=4)
    print('...done.\n')
Ejemplo n.º 3
0
def _collect_ensembl_vdj_regions(gtf_paths, rm_transcripts):
    """Group V(D)J-relevant GTF entries by (transcript_id, VDJ feature type).

    Only 5' UTR and CDS entries whose transcript or gene biotype is in
    ENSEMBL_VDJ_BIOTYPES are considered. Entries are dropped when their
    transcript ID is in ``rm_transcripts``, when the transcript ID or gene
    name is missing, or when no VDJ feature type can be inferred; all but
    the first case print a warning.

    Args:
        gtf_paths: paths of the GTF files to read, processed in order.
        rm_transcripts: set of transcript IDs to ignore.

    Returns:
        A ``collections.defaultdict(list)`` mapping
        ``(transcript_id, vdj_feature)`` to the list of matching entries.
    """
    transcripts = collections.defaultdict(list)
    wanted_features = (ENSEMBL_FIVE_PRIME_UTR_FEATURE, ENSEMBL_CDS_FEATURE)

    for gtf in gtf_paths:
        print('Reading GTF {}'.format(gtf))

        # Use a context manager so each GTF handle is closed once consumed.
        with open(gtf) as gtf_file:
            # NOTE: line_no is the zero-based index of the entry yielded by
            # get_gtf_iter; it may not equal the physical line in the file.
            for line_no, entry in enumerate(get_gtf_iter(gtf_file)):
                if entry.feature not in wanted_features:
                    continue
                entry = parse_attributes(entry)
                transcript_id = entry.attributes.get('transcript_id')
                transcript_biotype = entry.attributes.get('transcript_biotype')
                gene_biotype = entry.attributes.get('gene_biotype')
                gene_name = entry.attributes.get('gene_name')

                # Skip irrelevant biotypes
                if (transcript_biotype not in ENSEMBL_VDJ_BIOTYPES
                        and gene_biotype not in ENSEMBL_VDJ_BIOTYPES):
                    continue

                # Skip blacklisted transcripts
                if transcript_id in rm_transcripts:
                    continue

                # Warn and skip if transcript_id missing
                if transcript_id is None:
                    print('Warning: Entry on row %d has no transcript_id' % line_no)
                    continue

                # Warn and skip if gene_name missing
                if gene_name is None:
                    print('Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (
                        transcript_id, line_no, transcript_biotype))
                    continue

                # Infer region type from the transcript biotype when it is a
                # VDJ biotype, otherwise fall back to the gene biotype
                if transcript_biotype in ENSEMBL_VDJ_BIOTYPES:
                    vdj_feature = infer_ensembl_vdj_feature_type(
                        entry.feature, transcript_biotype)
                else:
                    vdj_feature = infer_ensembl_vdj_feature_type(
                        entry.feature, gene_biotype)

                # Warn and skip if region type could not be inferred
                if vdj_feature is None:
                    print('Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (
                        transcript_id, transcript_biotype))
                    continue

                # Features that share a transcript_id and feature type are
                # presumably exons so keep them together.
                transcripts[(transcript_id, vdj_feature)].append(entry)

        print('...done.\n')

    return transcripts


def _write_ensembl_vdj_features(transcripts, genome_fasta, out_fasta):
    """Fetch each grouped transcript's sequence and write it as a FASTA entry.

    Transcripts are skipped (with a warning) when their regions span several
    contigs or strands, when the contig is absent from ``genome_fasta``, when
    the sequence is all Ns, when the chain type cannot be inferred, or when
    they duplicate a feature already written. Feature IDs start at 1 and are
    only consumed by features that are actually written.

    Args:
        transcripts: mapping of ``(transcript_id, region_type)`` to a list of
            GTF entries, as built by ``_collect_ensembl_vdj_regions``. The
            lists are sorted in place by start coordinate.
        genome_fasta: open ``pysam.FastaFile`` used to fetch sequences.
        out_fasta: writable file object receiving the FASTA entries.

    Raises:
        ValueError: if a region type, gene name or record ID contains a space.
    """
    feature_id = 1
    seen_features = set()

    for (transcript_id, region_type), regions in transcripts.iteritems():
        first = regions[0]

        if not all(r.chrom == first.chrom for r in regions):
            chroms = sorted(set(r.chrom for r in regions))
            print('Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                transcript_id, str(chroms)))
            continue

        if not all(r.strand == first.strand for r in regions):
            print('Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id)
            continue

        chrom = first.chrom
        strand = first.strand
        gene_name = standardize_ensembl_gene_name(first.attributes['gene_name'])
        # The transcript ID doubles as the record ID
        record_id = transcript_id

        if chrom not in genome_fasta:
            print('Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                transcript_id, chrom))
            continue

        # Build sequence. Sort numerically: the coordinates are converted
        # with int() below, so they may be strings, and a plain string sort
        # would put "1000" before "999" and scramble the exon order.
        regions.sort(key=lambda r: int(r.start))
        # GTF coordinates are 1-based and end-inclusive; fetch() is given a
        # 0-based start and an exclusive end.
        seq = ''.join(
            genome_fasta.fetch(chrom, int(r.start) - 1, int(r.end))
            for r in regions)

        # Revcomp if transcript on reverse strand
        if strand == '-':
            seq = tk_seq.get_rev_comp(seq)

        # Strip Ns from termini
        if 'N' in seq:
            print('Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                (gene_name, transcript_id, region_type)))
            seq = seq.strip('N')

        if len(seq) == 0:
            print('Warning: Feature %s is all Ns. Skipping.' % str(
                (gene_name, transcript_id, region_type)))
            continue

        # Infer various attributes from the Ensembl gene name
        display_name = make_display_name(gene_name=gene_name, allele_name=None)
        chain = infer_ensembl_vdj_chain(gene_name)
        chain_type = infer_ensembl_vdj_chain_type(gene_name)
        # Ensembl doesn't encode alleles
        allele_name = '00'

        # Disallow spaces in these fields
        if ' ' in region_type:
            raise ValueError('Spaces not allowed in region type: "%s"' %
                             region_type)
        if ' ' in gene_name:
            raise ValueError('Spaces not allowed in gene name: "%s"' %
                             gene_name)
        if ' ' in record_id:
            raise ValueError('Spaces not allowed in record ID: "%s"' %
                             record_id)

        # Warn on features we couldn't classify properly
        if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
            print(('Warning: Could not infer chain type for: %s. ' +
                   'Expected the first two characters of the gene name to be in %s. Feature skipped.') %
                  (str((gene_name, record_id, region_type)),
                   str(tuple(vdj_constants.VDJ_CHAIN_TYPES))))
            continue

        # Only constant-region features of chains that have isotypes get one
        if (region_type in vdj_constants.VDJ_C_FEATURE_TYPES
                and chain in vdj_constants.CHAINS_WITH_ISOTYPES):
            isotype = infer_ensembl_isotype(gene_name)
        else:
            isotype = None

        feature = VdjAnnotationFeature(
            feature_id=feature_id,
            record_id=record_id,
            display_name=display_name,
            gene_name=gene_name,
            region_type=region_type,
            chain_type=chain_type,
            chain=chain,
            isotype=isotype,
            allele_name=allele_name,
            sequence=seq,
        )

        # Don't add duplicate entries
        feat_key = get_duplicate_feature_key(feature)
        if feat_key in seen_features:
            print('Warning: Skipping duplicate entry for %s (%s, %s).' % (
                display_name, region_type, record_id))
            continue
        seen_features.add(feat_key)

        feature_id += 1

        out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')


def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path,
                                       genome_fasta_path, reference_path,
                                       reference_name, ref_version,
                                       mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated. No attempt to merge/reconcile information
    across them is made. Providing the files in a different order might change the
    output in cases where there are multiple entries with the same transcript id
    and the same feature type (eg. V-region).

    Args:
        gtf_paths: sequence of GTF file paths. It is traversed more than once,
            so it must not be a one-shot iterator.
        transcripts_to_remove_path: optional path to a text file with one
            transcript ID per line; those transcripts are ignored. A falsy
            value disables the filter.
        genome_fasta_path: path of the genome FASTA to pull sequences from.
            It is copied into ``reference_path`` while the function runs and
            the copy (plus its ``.fai`` index) is removed afterwards.
        reference_path: output folder for the reference FASTA and metadata.
        reference_name: stored under the genomes key of the metadata.
        ref_version: stored under the reference version key of the metadata.
        mkref_version: stored under the mkref version key of the metadata.

    Raises:
        ValueError: if a region type, gene name or record ID contains a space.
        OSError: from ``os.makedirs`` if the reference FASTA folder already
            exists.
    """

    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set(line.strip() for line in f)
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    #       do not support symlinks.
    print('Copying genome reference sequence...')
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_utils.copy(genome_fasta_path, tmp_genome_fa_path)
    print('...done.\n')

    print('Indexing genome reference sequence...')
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print('...done.\n')

    print('Loading genome reference sequence...')
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print('...done.\n')

    print('Computing hash of genome FASTA file...')
    fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path)
    print('...done.\n')

    transcripts = _collect_ensembl_vdj_regions(gtf_paths, rm_transcripts)

    print('Computing hash of genes GTF files...')
    digest = hashlib.sha1()
    # concatenate all the hashes into a string and then hash that string
    digest.update(
        ''.join(cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths))
    gtf_hash = digest.hexdigest()
    print('...done.\n')

    print('Fetching sequences...')
    try:
        # The context manager flushes and closes the reference FASTA before
        # the metadata is written.
        with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
            _write_ensembl_vdj_features(transcripts, genome_fasta, out_fasta)
    finally:
        # Release the indexed FASTA handle before its backing files are
        # deleted below.
        genome_fasta.close()
    print('...done.\n')

    print('Deleting copy of genome fasta...')
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print('...done.\n')

    print('Writing metadata JSON file into reference folder...')
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY:
        reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY:
        fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY:
        gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY:
        os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY:
        ','.join([os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY:
        ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY:
        mkref_version,
        cr_constants.REFERENCE_TYPE_KEY:
        vdj_constants.REFERENCE_TYPE,
    }
    with open(
            os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE),
            'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata),
                  json_file,
                  sort_keys=True,
                  indent=4)
    print('...done.\n')