def build_reference(self):
    """Assemble a complete reference folder under ``self.out_dir``.

    The steps run strictly in this order, each announced on stdout:

    1. create ``self.out_dir`` with ``os.mkdir``;
    2. write the genome FASTA and compute its hash;
    3. write the genes GTF and compute its hash;
    4. write the gene index (built from the GTF and FASTA just written);
    5. dump the metadata JSON;
    6. generate the STAR index.

    The parent directory of each of the FASTA, GTF and gene-index files is
    created with ``os.mkdir`` right before the file is written.

    Reads ``self.out_dir``, ``self.genomes``, ``self.mem_gb``,
    ``self.num_threads``, ``self.in_fasta_fns``, ``self.in_gtf_fns``,
    ``self.ref_version`` and ``self.mkref_version``.

    Raises:
        OSError: propagated from ``os.mkdir`` when a directory it is asked
            to create already exists or its parent is missing.
    """
    print("Creating new reference folder at %s" % self.out_dir)
    os.mkdir(self.out_dir)
    print("...done\n")

    # --- genome FASTA ---------------------------------------------------
    print("Writing genome FASTA file into reference folder...")
    fasta_path = os.path.join(self.out_dir, cr_constants.REFERENCE_FASTA_PATH)
    os.mkdir(os.path.dirname(fasta_path))
    self.write_genome_fasta(fasta_path)
    print("...done\n")

    print("Computing hash of genome FASTA file...")
    fasta_digest = cr_utils.compute_hash_of_file(fasta_path)
    print("...done\n")

    # --- genes GTF ------------------------------------------------------
    print("Writing genes GTF file into reference folder...")
    gtf_path = os.path.join(self.out_dir, cr_constants.REFERENCE_GENES_GTF_PATH)
    os.mkdir(os.path.dirname(gtf_path))
    self.write_genome_gtf(gtf_path)
    print("...done\n")

    print("Computing hash of genes GTF file...")
    gtf_digest = cr_utils.compute_hash_of_file(gtf_path)
    print("...done\n")

    # --- gene index -----------------------------------------------------
    print("Writing genes index file into reference folder (may take over 10 minutes for a 3Gb genome)...")
    gene_index_path = os.path.join(self.out_dir, cr_constants.REFERENCE_GENES_INDEX_PATH)
    os.mkdir(os.path.dirname(gene_index_path))
    self.write_genome_gene_index(gene_index_path, gtf_path, fasta_path)
    print("...done\n")

    # --- metadata JSON --------------------------------------------------
    print("Writing genome metadata JSON file into reference folder...")
    metadata = {}
    metadata[cr_constants.REFERENCE_GENOMES_KEY] = self.genomes
    # Thread count recorded in the metadata: one thread per 8 GB, rounded up.
    metadata[cr_constants.REFERENCE_NUM_THREADS_KEY] = int(math.ceil(float(self.mem_gb) / 8.0))
    metadata[cr_constants.REFERENCE_MEM_GB_KEY] = self.mem_gb
    metadata[cr_constants.REFERENCE_FASTA_HASH_KEY] = fasta_digest
    metadata[cr_constants.REFERENCE_GTF_HASH_KEY] = gtf_digest
    # Only the base names of the input files are recorded, not their full paths.
    metadata[cr_constants.REFERENCE_INPUT_FASTA_KEY] = [
        os.path.basename(fn) for fn in self.in_fasta_fns
    ]
    metadata[cr_constants.REFERENCE_INPUT_GTF_KEY] = [
        os.path.basename(fn) for fn in self.in_gtf_fns
    ]
    metadata[cr_constants.REFERENCE_VERSION_KEY] = self.ref_version
    metadata[cr_constants.REFERENCE_MKREF_VERSION_KEY] = self.mkref_version

    metadata_path = os.path.join(self.out_dir, cr_constants.REFERENCE_METADATA_FILE)
    with open(metadata_path, 'w') as metadata_file:
        json.dump(tk_safe_json.json_sanitize(metadata), metadata_file, sort_keys=True, indent=4)
    print("...done\n")

    # --- STAR index -----------------------------------------------------
    print("Generating STAR genome index (may take over 8 core hours for a 3Gb genome)...")
    star_dir = os.path.join(self.out_dir, cr_constants.REFERENCE_STAR_PATH)
    star_indexer = STAR(star_dir)
    star_indexer.index_reference_with_mem_gb(fasta_path, gtf_path,
                                             num_threads=self.num_threads,
                                             mem_gb=self.mem_gb)
    print("...done.\n")

    print(">>> Reference successfully created! <<<\n")
    print("You can now specify this reference on the command line:")
    print("cellranger --transcriptome=%s ..." % self.out_dir)
def build_reference_fasta_from_fasta(fasta_path, reference_path, reference_name, ref_version, mkref_version):
    """Create cellranger-compatible vdj reference files from a V(D)J segment FASTA file.

    Every entry of the input FASTA is parsed and validated. Entries that are
    duplicates (by ``get_duplicate_feature_key``), that are empty after
    stripping terminal Ns, or whose chain type is not in
    ``vdj_constants.VDJ_CHAIN_TYPES`` are skipped with a warning on stdout.
    The surviving entries are written to the reference FASTA, followed by a
    metadata JSON file in ``reference_path``.

    Args:
        fasta_path: path of the input V(D)J segment FASTA.
        reference_path: root folder of the reference being created.
        reference_name: value stored under the genomes key of the metadata.
        ref_version: reference version stored in the metadata.
        mkref_version: version of the reference builder stored in the metadata.

    Raises:
        ValueError: if a feature ID occurs twice, or if a region type, gene
            name or record ID contains a space.
    """
    kept_keys = set()
    kept_ids = set()
    kept_features = []

    print('Checking FASTA entries...')

    with open(fasta_path) as fasta_file:
        for header, sequence in cr_utils.get_fasta_iter(fasta_file):
            entry = parse_fasta_entry(header, sequence)

            # Feature IDs must be unique among the entries kept so far.
            if entry.feature_id in kept_ids:
                raise ValueError(
                    'Duplicate feature ID found in input FASTA: %d.' % entry.feature_id)

            # These fields must not contain spaces; checked in this order.
            space_free_fields = (
                (entry.region_type, 'Spaces not allowed in region type: "%s"'),
                (entry.gene_name, 'Spaces not allowed in gene name: "%s"'),
                (entry.record_id, 'Spaces not allowed in record ID: "%s"'),
            )
            for value, template in space_free_fields:
                if ' ' in value:
                    raise ValueError(template % value)

            dup_key = get_duplicate_feature_key(entry)
            if dup_key in kept_keys:
                print('Warning: Skipping duplicate entry for %s (%s, %s).' % (
                    entry.display_name, entry.region_type, entry.record_id))
                continue

            # Tuple rendered into the warnings below to identify the feature.
            description = str((entry.display_name, entry.record_id, entry.region_type))

            # Remove Ns at either end of the sequence.
            trimmed = entry.sequence
            if 'N' in trimmed:
                print('Warning: Feature %s contains Ns. Stripping from the ends.' % description)
                trimmed = trimmed.strip('N')

            if not trimmed:
                print('Warning: Feature %s is all Ns. Skipping.' % description)
                continue

            # Features whose chain type is not recognised are dropped.
            if entry.chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                print('Warning: Unknown chain type for: %s. Expected name to be in %s. Skipping.' % (
                    description, str(tuple(vdj_constants.VDJ_CHAIN_TYPES))))
                continue

            kept_ids.add(entry.feature_id)
            kept_keys.add(dup_key)

            # Rebuild the feature because the sequence may have been trimmed.
            fields = entry._asdict()
            fields['sequence'] = trimmed
            kept_features.append(VdjAnnotationFeature(**fields))

    print('...done.\n')

    print('Writing sequences...')
    out_fasta_path = get_vdj_reference_fasta(reference_path)
    os.makedirs(os.path.dirname(out_fasta_path))
    with open(out_fasta_path, 'w') as out_fasta:
        for feature in kept_features:
            out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    print('...done.\n')

    print('Computing hash of input FASTA file...')
    fasta_hash = cr_utils.compute_hash_of_file(fasta_path)
    print('...done.\n')

    print('Writing metadata JSON file into reference folder...')
    # There is no GTF input in this mode, so both GTF entries are None.
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: None,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: None,
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    metadata_path = os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE)
    with open(metadata_path, 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata), json_file, sort_keys=True, indent=4)
    print('...done.\n')
def build_reference_fasta_from_ensembl(gtf_paths, transcripts_to_remove_path, genome_fasta_path, reference_path, reference_name, ref_version, mkref_version):
    """Create cellranger-compatible vdj reference files from a list of ENSEMBL-like GTF files.

    Input files are concatenated. No attempt to merge/reconcile information
    across them is made. Providing the files in a different order might change
    the output in cases where there are multiple entries with the same
    transcript id and the same feature type (eg. V-region).

    Args:
        gtf_paths: list of GTF file paths. It is iterated more than once
            (parsing, hashing, metadata), so it must not be a one-shot iterator.
        transcripts_to_remove_path: path of a text file with one transcript ID
            per line to exclude, or a falsy value to exclude nothing.
        genome_fasta_path: path of the genome FASTA. It is copied into
            ``reference_path`` as ``genome.fasta``, indexed with
            ``samtools faidx``, and the copy and its ``.fai`` are deleted
            again once the sequences have been extracted.
        reference_path: root folder of the reference being created.
        reference_name: value stored under the genomes key of the metadata.
        ref_version: reference version stored in the metadata.
        mkref_version: version of the reference builder stored in the metadata.

    Raises:
        ValueError: if a region type, gene name or record ID contains a space.
    """
    transcripts = collections.defaultdict(list)

    if transcripts_to_remove_path:
        with open(transcripts_to_remove_path) as f:
            rm_transcripts = set(line.strip() for line in f)
    else:
        rm_transcripts = set()

    # Note: We cannot symlink here because some filesystems in the wild
    #       do not support symlinks.
    print('Copying genome reference sequence...')
    os.makedirs(os.path.dirname(get_vdj_reference_fasta(reference_path)))
    tmp_genome_fa_path = os.path.join(reference_path, 'genome.fasta')
    cr_utils.copy(genome_fasta_path, tmp_genome_fa_path)
    print('...done.\n')

    print('Indexing genome reference sequence...')
    tk_subproc.check_call(['samtools', 'faidx', tmp_genome_fa_path])
    print('...done.\n')

    print('Loading genome reference sequence...')
    genome_fasta = pysam.FastaFile(tmp_genome_fa_path)
    print('...done.\n')

    print('Computing hash of genome FASTA file...')
    fasta_hash = cr_utils.compute_hash_of_file(tmp_genome_fa_path)
    print('...done.\n')

    for gtf in gtf_paths:
        print('Reading GTF {}'.format(gtf))

        # Open the GTF in a context manager so the handle is closed once parsed.
        with open(gtf) as gtf_file:
            for line_no, entry in enumerate(get_gtf_iter(gtf_file)):
                if entry.feature not in (ENSEMBL_FIVE_PRIME_UTR_FEATURE, ENSEMBL_CDS_FEATURE):
                    continue

                entry = parse_attributes(entry)
                transcript_id = entry.attributes.get('transcript_id')
                transcript_biotype = entry.attributes.get('transcript_biotype')
                gene_biotype = entry.attributes.get('gene_biotype')
                gene_name = entry.attributes.get('gene_name')

                # Skip irrelevant biotypes
                if transcript_biotype not in ENSEMBL_VDJ_BIOTYPES and \
                        gene_biotype not in ENSEMBL_VDJ_BIOTYPES:
                    continue

                # Skip transcripts listed in the removal file
                if transcript_id in rm_transcripts:
                    continue

                # Warn and skip if transcript_id missing
                if transcript_id is None:
                    print('Warning: Entry on row %d has no transcript_id' % line_no)
                    continue

                # Warn and skip if gene_name missing
                if gene_name is None:
                    print('Warning: Transcript %s on row %d has biotype %s but no gene_name. Skipping.' % (
                        transcript_id, line_no, transcript_biotype))
                    continue

                # Infer region type from biotype; the transcript biotype takes
                # precedence over the gene biotype when both qualify.
                if transcript_biotype in ENSEMBL_VDJ_BIOTYPES:
                    vdj_feature = infer_ensembl_vdj_feature_type(
                        entry.feature, transcript_biotype)
                else:
                    vdj_feature = infer_ensembl_vdj_feature_type(
                        entry.feature, gene_biotype)

                # Warn and skip if region type could not be inferred
                if vdj_feature is None:
                    print('Warning: Transcript %s has biotype %s. Could not infer VDJ gene type. Skipping.' % (
                        transcript_id, transcript_biotype))
                    continue

                # Features that share a transcript_id and feature type are presumably exons
                # so keep them together.
                transcripts[(transcript_id, vdj_feature)].append(entry)

    print('...done.\n')

    print('Computing hash of genes GTF files...')
    digest = hashlib.sha1()
    # Concatenate all the per-file hashes into one string and hash that string.
    # join (unlike reduce without an initial value) also copes with an empty list.
    digest.update(''.join(cr_utils.compute_hash_of_file(gtf) for gtf in gtf_paths))
    gtf_hash = digest.hexdigest()
    print('...done.\n')

    print('Fetching sequences...')
    feature_id = 1
    seen_features = set()

    try:
        # The with-block guarantees the reference FASTA is flushed and closed.
        with open(get_vdj_reference_fasta(reference_path), 'w') as out_fasta:
            for (transcript_id, region_type), regions in transcripts.iteritems():
                if not all(r.chrom == regions[0].chrom for r in regions):
                    chroms = sorted(set(r.chrom for r in regions))
                    print('Warning: Transcript %s spans multiple contigs: %s. Skipping.' % (
                        transcript_id, str(chroms)))
                    continue

                if not all(r.strand == regions[0].strand for r in regions):
                    print('Warning: Transcript %s spans multiple strands. Skipping.' % transcript_id)
                    continue

                chrom = regions[0].chrom
                strand = regions[0].strand
                ens_gene_name = standardize_ensembl_gene_name(
                    regions[0].attributes['gene_name'])
                transcript_id = regions[0].attributes['transcript_id']

                if chrom not in genome_fasta:
                    print('Warning: Transcript %s is on contig "%s" which is not in the provided reference fasta. Skipping.' % (
                        transcript_id, chrom))
                    continue

                # Build sequence. Sort numerically: the coordinates are passed
                # through int() below, so they may be strings, and a plain
                # string sort would order '100' before '99'.
                regions.sort(key=lambda r: int(r.start))
                # GTF coordinates are 1-based, fetch() takes a 0-based start.
                seq = ''.join(
                    genome_fasta.fetch(chrom, int(region.start) - 1, int(region.end))
                    for region in regions)

                # Revcomp if transcript on reverse strand
                if strand == '-':
                    seq = tk_seq.get_rev_comp(seq)

                # Strip Ns from termini
                if 'N' in seq:
                    print('Warning: Feature %s contains Ns. Stripping from the ends.' % str(
                        (ens_gene_name, transcript_id, region_type)))
                    seq = seq.strip('N')

                if len(seq) == 0:
                    print('Warning: Feature %s is all Ns. Skipping.' % str(
                        (ens_gene_name, transcript_id, region_type)))
                    continue

                # Infer various attributes from the Ensembl gene name
                record_id = transcript_id
                gene_name = ens_gene_name
                display_name = make_display_name(gene_name=gene_name, allele_name=None)
                chain = infer_ensembl_vdj_chain(gene_name)
                chain_type = infer_ensembl_vdj_chain_type(gene_name)
                # Ensembl doesn't encode alleles
                allele_name = '00'

                # Disallow spaces in these fields
                if ' ' in region_type:
                    raise ValueError('Spaces not allowed in region type: "%s"' % region_type)
                if ' ' in gene_name:
                    raise ValueError('Spaces not allowed in gene name: "%s"' % gene_name)
                if ' ' in record_id:
                    raise ValueError('Spaces not allowed in record ID: "%s"' % record_id)

                # Warn on features we couldn't classify properly
                if chain_type not in vdj_constants.VDJ_CHAIN_TYPES:
                    print(('Warning: Could not infer chain type for: %s. '
                           'Expected the first two characters of the gene name to be in %s. Feature skipped.') % (
                        str((gene_name, record_id, region_type)),
                        str(tuple(vdj_constants.VDJ_CHAIN_TYPES))))
                    continue

                # An isotype is only inferred for constant-region features of
                # chains listed in CHAINS_WITH_ISOTYPES.
                if region_type in vdj_constants.VDJ_C_FEATURE_TYPES and \
                        chain in vdj_constants.CHAINS_WITH_ISOTYPES:
                    isotype = infer_ensembl_isotype(ens_gene_name)
                else:
                    isotype = None

                feature = VdjAnnotationFeature(
                    feature_id=feature_id,
                    record_id=record_id,
                    display_name=display_name,
                    gene_name=gene_name,
                    region_type=region_type,
                    chain_type=chain_type,
                    chain=chain,
                    isotype=isotype,
                    allele_name=allele_name,
                    sequence=seq,
                )

                # Don't add duplicate entries
                feat_key = get_duplicate_feature_key(feature)
                if feat_key in seen_features:
                    print('Warning: Skipping duplicate entry for %s (%s, %s).' % (
                        display_name, region_type, record_id))
                    continue
                seen_features.add(feat_key)

                # The ID only advances for features that are actually written.
                feature_id += 1

                out_fasta.write(convert_vdj_feature_to_fasta_entry(feature) + '\n')
    finally:
        # Release the indexed FASTA before its backing files are deleted.
        genome_fasta.close()
    print('...done.\n')

    print('Deleting copy of genome fasta...')
    os.remove(tmp_genome_fa_path)
    os.remove(tmp_genome_fa_path + '.fai')
    print('...done.\n')

    print('Writing metadata JSON file into reference folder...')
    metadata = {
        cr_constants.REFERENCE_GENOMES_KEY: reference_name,
        cr_constants.REFERENCE_FASTA_HASH_KEY: fasta_hash,
        cr_constants.REFERENCE_GTF_HASH_KEY: gtf_hash,
        cr_constants.REFERENCE_INPUT_FASTA_KEY: os.path.basename(genome_fasta_path),
        cr_constants.REFERENCE_INPUT_GTF_KEY: ','.join(
            [os.path.basename(gtf_path) for gtf_path in gtf_paths]),
        cr_constants.REFERENCE_VERSION_KEY: ref_version,
        cr_constants.REFERENCE_MKREF_VERSION_KEY: mkref_version,
        cr_constants.REFERENCE_TYPE_KEY: vdj_constants.REFERENCE_TYPE,
    }
    with open(os.path.join(reference_path, cr_constants.REFERENCE_METADATA_FILE), 'w') as json_file:
        json.dump(tk_safe_json.json_sanitize(metadata), json_file, sort_keys=True, indent=4)
    print('...done.\n')