def _as_seqdict_genome_regions(regions, minsize=None):
    """Convert genome-qualified regions to a dictionary of sequences.

    Accepts list of regions where the genome is encoded in the region,
    using the genome@chrom:start-end format.

    Parameters
    ----------
    regions : list of str
        Regions in ``genome@chrom:start-end`` format.
    minsize : int, optional
        Passed unchanged to ``_check_minsize``.

    Returns
    -------
    The value returned by ``_check_minsize`` for a dictionary that maps
    every entry of `regions` (in the original order) to its sequence.
    """
    # Group the regions per genome so each genome is processed in one go.
    genomic_regions = {}
    for region in regions:
        genome, genome_region = region.split("@")
        if genome not in genomic_regions:
            # Instantiated here so that an unusable genome name presumably
            # fails before any sequence is extracted.
            Genome(genome)
            genomic_regions[genome] = []
        genomic_regions[genome].append(genome_region)

    # delete=False: the file is re-opened by name below, so it has to
    # survive closing the handle. The context manager guarantees the data
    # is flushed and the handle released before the file is read back.
    # NOTE(review): the temporary file itself is not removed afterwards.
    with NamedTemporaryFile(mode="w", delete=False) as tmpfa:
        for genome, g_regions in genomic_regions.items():
            g = Genome(genome)
            fa = g.track2fasta(g_regions)
            for seq in fa:
                # Put the genome prefix back so names match the input regions.
                seq.name = f"{genome}@{seq.name}"
                print(seq.__repr__(), file=tmpfa)

    # Open tempfile and restore original sequence order
    fa = as_seqdict(tmpfa.name)
    fa = {region: fa[region] for region in regions}
    return _check_minsize(fa, minsize)
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions to same size; split in test and validation set;
    converted to FASTA.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.
    params : dict
        Dictionary with parameters. The keys "width", "abs_max",
        "fraction", "genome", "lwidth" and "use_strand" are read.
    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")

    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile,
        pred_bedfile,
        val_bedfile,
    )
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # The result is not used below; the call is kept in case constructing
    # the configuration has side effects -- TODO confirm.
    MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Swap only the file extension. A plain str.replace(".bed", ".fa")
        # would also rewrite a ".bed" that happens to occur in `outdir`.
        fasta_file = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(infile, fasta_file)

    # Create file for location plots
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2
    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object.

    Parameters
    ----------
    seqs
        Input whose kind is determined by ``get_seqs_type``: returned as-is
        for "fasta", loaded with ``Fasta`` for "fastafile", and converted
        with ``track2fasta`` for anything else.
    genome : str or genome object, optional
        Genome name (a ``Genome`` is created from it) or an object that
        provides ``track2fasta``. Required when `seqs` is not FASTA.

    Returns
    -------
    Fasta

    Raises
    ------
    ValueError
        If `seqs` needs conversion and no genome is given.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    elif ftype == "fastafile":
        return Fasta(seqs)
    else:
        if genome is None:
            raise ValueError("need genome to convert to FASTA")

        # Kept referenced until the function returns, i.e. until after the
        # Fasta object has been created from the file.
        tmpfa = NamedTemporaryFile()
        # isinstance also accepts str subclasses, unlike a type() comparison.
        if isinstance(genome, str):
            genome = Genome(genome)
        genome.track2fasta(seqs, tmpfa.name)
        return Fasta(tmpfa.name)
def __call__(self, parser, args, name, option_string=None):
    """Resolve a genome name and store the Genome object on `args`.

    Looks like an argparse action callback (uses ``self.dest``) -- TODO
    confirm against the enclosing class. ``genomes_dir`` and
    ``auto_install`` come from the enclosing scope.

    When the genome cannot be found it is either installed through
    ``install_genome`` (if ``auto_install`` is set) or the process exits
    with status 1.
    """
    try:
        resolved = Genome(name, genomes_dir=genomes_dir)
    except FileNotFoundError:
        logger.warning(f"Genome {name} not found!")
        if not auto_install:
            logger.info("You can install it using `genomepy install`.")
            sys.exit(1)

        logger.info("Trying to install it automatically using genomepy...")
        install_genome(name, annotation=True, genomes_dir=genomes_dir)
        resolved = Genome(name, genomes_dir=genomes_dir)

    setattr(args, self.dest, resolved)
def load_mapping(genome_name):
    """Build a table that maps alternative sequence names to chromosome names.

    Parameters
    ----------
    genome_name : str
        Name of the genome; its provider must be "UCSC" or "NCBI".

    Returns
    -------
    pandas.DataFrame or None
        DataFrame with a single column "chrom", indexed by the alternative
        names taken from the assembly report; None (after logging an error)
        for any other provider.
    """
    logger.info("Loading chromosome mapping.")
    genome = Genome(genome_name)
    asm_acc = genome.assembly_accession

    provider = genome.provider
    if provider not in ("UCSC", "NCBI"):
        logger.error(f"Can't map to provider {genome.provider}")
        return None

    report = ncbi_assembly_report(asm_acc)
    # Only assembled molecules keep their assigned molecule name.
    not_assembled = report["Sequence-Role"] != "assembled-molecule"
    report.loc[not_assembled, "Assigned-Molecule"] = "na"

    name_columns = [
        "Sequence-Name",
        "UCSC-style-name",
        "Assigned-Molecule",
        "GenBank-Accn",
    ]
    mapping = report[name_columns]

    if provider == "NCBI":
        logger.info("Mapping to NCBI sequence names")
        id_column = "Sequence-Name"
    else:
        logger.info("Mapping to UCSC sequence names")
        id_column = "UCSC-style-name"

    # Long format: one row per (target name, alternative name) pair.
    mapping = pd.melt(mapping, id_vars=[id_column])
    mapping = mapping[mapping["value"] != "na"]
    mapping = mapping.drop_duplicates().set_index("value")[[id_column]]
    mapping.columns = ["chrom"]
    return mapping
def check_denovo_input(inputfile, params):
    """Determine the input type and the usable background types.

    Parameters
    ----------
    inputfile : str
        Input file; treated as FASTA if ``Fasta`` can load it, otherwise
        as BED.
    params : dict
        Must contain "background". "genome" is only read for BED input.

    Returns
    -------
    (str, list)
        The input type ("FASTA" or "BED") and the background types that
        are valid for that input type.

    Exits the process with status 1 when no valid background remains.
    """
    background = params["background"]

    input_type = "BED"
    # If we can load it as fasta then it is a fasta, yeh?
    try:
        Fasta(inputfile)
        logger.debug("Inputfile is a FASTA file")
        input_type = "FASTA"
    except Exception:
        # Deliberate best-effort probe: anything that does not load as
        # FASTA is left to the BED checks below.
        pass

    if input_type == "FASTA":
        valid_bg = FA_VALID_BGS
    elif input_type == "BED":
        # The genome is only needed for BED input, so FASTA input does not
        # require the "genome" key.
        genome = params["genome"]
        valid_bg = BED_VALID_BGS
        if "genomic" in background:
            Genome(genome)
        # is it a valid bed-file etc.
        check_bed_file(inputfile)  # bed-specific

    for bg in background:
        if bg not in valid_bg:
            logger.info(
                "Input type is %s, ignoring background type '%s'",
                input_type,
                bg,
            )
    background = [bg for bg in background if bg in valid_bg]

    if not background:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, background
def set_peak_size(self, peak_bed, seqlen=200):
    """Write a BED file in which peaks are resized to `seqlen` bp.

    Peaks whose length differs from `seqlen` are re-centered on their
    midpoint and given ``seqlen // 2`` bp on either side (so an odd
    `seqlen` yields ``seqlen - 1`` bp). Peaks that already have the
    requested length are kept unchanged.

    Arguments:
        peak_bed -- input peak BED file

    Keyword Arguments:
        seqlen {int} -- peak length (default: {200})

    Returns:
        str -- name of a temporary BED file with the resized peaks. The
        file is created with delete=False, so removing it is left to the
        caller.
    """
    gsizedic = Genome(self.genome).sizes
    peaks = BedTool(peak_bed)

    # The context manager flushes and closes the file before its name is
    # handed back, so a caller never reads a partially written file.
    with NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False) as fl2:
        for peak in peaks:
            if peak.length != seqlen:
                # get the summit and the flanking low and high sequences
                summit = (peak.start + peak.end) // 2
                start, end = summit - seqlen // 2, summit + seqlen // 2
            else:
                start, end = peak.start, peak.end

            # Skip regions that do not lie strictly inside the chromosome.
            # NOTE(review): this also drops start == 0 and
            # end == chromosome size -- confirm that is intended.
            if start > 0 and end < int(gsizedic[peak.chrom]):
                fl2.write(f"{peak.chrom}\t{start}\t{end}\n")

    return fl2.name
def _local_gene_annotation(genes: Iterable[str], genome: str) -> pd.DataFrame:
    """Retrieve gene location from local annotation.

    Parameters
    ----------
    genes : Iterable
        List of gene names or gene identifiers such as ensembl_id.
    genome : str
        Genome name

    Returns
    -------
    pandas.DataFrame with gene annotation (columns chrom, start, end, name,
    strand), or None when fewer than half of the genes were found.
    """
    columns = ["chrom", "start", "end", "name", "strand"]

    g = Genome(genome)
    gene_list = list(genes)

    # The annotation is expected next to the genome FASTA file.
    genome_dir = os.path.dirname(g.filename)
    bed = os.path.join(genome_dir, f"{genome}.annotation.bed.gz")

    if os.path.exists(bed):
        annotation = pd.read_table(
            bed,
            index_col=3,
            usecols=[0, 1, 2, 3, 5],
            names=columns,
        )
        # NOTE(review): label-based lookup; recent pandas versions raise a
        # KeyError when a requested gene is absent -- confirm this is the
        # intended behaviour given the "half of the genes" check below.
        gene_info = annotation.loc[gene_list]
    else:
        gene_info = pd.DataFrame()

    # If we find more than half of the genes we assume this worked.
    if gene_info.shape[0] < 0.5 * len(gene_list):
        return None
    return gene_info.reset_index()[columns]
def peak2fasta(peak_ids, ref_genome):
    '''
    Convert peak_id into fasta object.

    Args:
        peak_ids (str or list of str): Peak_id.  e.g. "chr5_0930303_9499409"
            or it can be a list of peak_id.  e.g. ["chr5_0930303_9499409", "chr11_123445555_123445577"]

        ref_genome (str): Reference genome name.   e.g. "mm9", "mm10", "hg19" etc

    Returns:
        gimmemotifs fasta object: DNA sequence in fasta format

    '''
    genome_data = Genome(ref_genome)

    def peak2seq(peak_id):
        # Split the peak id into its parts and fetch the matching sequence.
        chromosome_name, start, end = decompose_chrstr(peak_id)
        tmp = genome_data[chromosome_name][int(start):int(end)]
        # The name is rebuilt from what the genome lookup reports.
        name = f"{tmp.name}_{tmp.start}_{tmp.end}"
        return name, tmp.seq

    # A single peak id is handled like a one-element list; isinstance also
    # covers str subclasses.
    if isinstance(peak_ids, str):
        peak_ids = [peak_ids]

    fasta = Fasta()
    for peak_id in peak_ids:
        name, seq = peak2seq(peak_id)
        fasta.add(name, seq)

    return fasta
def create_gc_bin_index(genome, fname, min_bin_size=100):
    """Create index of GC content for a genome.

    The genome is cut into windows of `min_bin_size`; for every window the
    GC fraction ("w" columns) and number of N's ("n" columns) are stored,
    together with rolling values over 2 and 5 consecutive windows. The
    result is written to `fname` in feather format.

    Parameters
    ----------
    genome : str
        Genome name.
    fname : str
        Name of the index file.
    min_bin_size : int
        Minimum bin size (default 100). Warning: setting to a small value
        will result in a very large index file!
    """
    logger.info("Creating index for genomic GC frequencies.")
    g = Genome(genome)
    fasta = g.filename
    sizes = g.filename + ".sizes"  # props["sizes"]["sizes"]

    read_dtypes = {
        "#1_usercol": "string",
        "2_usercol": np.int64,
        "3_usercol": np.int64,
        "5_pct_gc": np.float32,
        "10_num_N": np.int8,
    }
    with NamedTemporaryFile() as tmp:
        # pylint: disable=unexpected-keyword-arg
        windows = pybedtools.BedTool().window_maker(g=sizes, w=min_bin_size)
        windows.nucleotide_content(fi=fasta).saveas(tmp.name)
        df = pd.read_csv(
            tmp.name,
            sep="\t",
            usecols=[0, 1, 2, 4, 9],
            dtype=read_dtypes,
        )

    cols = ["chrom", "start", "end", f"w{min_bin_size}", f"n{min_bin_size}"]
    for t in (2, 5):
        w_name = f"w{min_bin_size * t}"
        n_name = f"n{min_bin_size * t}"
        # Column 3 holds the GC fraction and column 4 the N count of the
        # smallest bin; larger bins are rolling aggregates of those.
        df[w_name] = df.iloc[:, 3].rolling(t, min_periods=t).mean()
        df[n_name] = df.iloc[:, 4].rolling(t, min_periods=t).sum()
        cols.extend([w_name, n_name])
    df.columns = cols

    # Make really sure that column 'chrom' is a string
    df.dropna(subset=["chrom"], inplace=True)
    df["chrom"] = df["chrom"].apply(str).astype("string")

    df.reset_index()[cols].to_feather(fname)
def create_random_genomic_bedfile(out, genome, size, n):
    """Write `n` random genomic regions of length `size` to a BED file.

    Parameters
    ----------
    out : str
        Name of the output BED file (overwritten if it exists).
    genome : str
        Genome name.
    size : int
        Size of the regions.
    n : int
        Number of regions.
    """
    features = Genome(genome).get_random_sequences(n, size)

    # Write result to bedfile; the context manager closes the handle, which
    # the previous implementation never did.
    with open(out, "w") as bed:
        for chrom, start, end in features:
            bed.write("%s\t%d\t%d\n" % (chrom, start, end))
def _genomepy_convert(to_convert, genome, minsize=None): """ Convert a variety of inputs using track2fasta(). """ if genome is None: raise ValueError("input file is not a FASTA file, need a genome!") if isinstance(genome, Genome): g = genome else: g = Genome(genome) tmpfile = NamedTemporaryFile() g.track2fasta(to_convert, tmpfile.name) fa = as_seqdict(tmpfile.name) return _check_minsize(fa, minsize)
def _scan_regions(self, regions, nreport, scan_rc):
    """Scan regions for motifs, yielding one result per region.

    Generator. Without cache the results are yielded in the order in which
    ``self._scan_jobs`` produces them. With cache (``self.use_cache``) only
    regions that are not cached yet are scanned and stored; afterwards the
    results for all `regions` are yielded from the cache in input order.

    Parameters
    ----------
    regions : list
        Regions to scan.
    nreport : int
        Passed on to ``scan_region_mult``.
    scan_rc : bool
        Passed on to ``scan_region_mult``.

    Raises
    ------
    Exception
        If a result cannot be read back from the cache.
    """
    genome = self.genome
    motif_file = self.motifs
    motif_digest = self.checksum.get(motif_file, None)

    def cache_key(region):
        # One key format for probing, storing and reading back. The probe
        # used to leave out the threshold, so it never matched a stored
        # entry and every region was scanned again on every call.
        return str(
            (
                region,
                genome,
                motif_digest,
                nreport,
                scan_rc,
                self.threshold_str,
            )
        )

    # determine which regions are not in the cache
    scan_regions = regions
    if self.use_cache:
        scan_regions = []
        for region in regions:
            ret = self.cache.get(cache_key(region))
            # Same "missing" test as the read-back below, so anything that
            # would fail there is (re)scanned here.
            if ret == NO_VALUE or ret is None:
                scan_regions.append(region)

    # scan the regions that are not in the cache
    if len(scan_regions) > 0:
        g = Genome(genome)
        motifs = [(m, self.threshold[m.id]) for m in read_motifs(self.motifs)]
        scan_func = partial(
            scan_region_mult,
            genome=g,
            motifs=motifs,
            nreport=nreport,
            scan_rc=scan_rc,
        )

        for region, ret in self._scan_jobs(scan_func, scan_regions):
            if self.use_cache:
                # store values in cache
                self.cache.set(cache_key(region), ret)
            else:
                # return values
                yield ret

    if self.use_cache:
        # return results from cache
        for region in regions:
            ret = self.cache.get(cache_key(region))
            if ret == NO_VALUE or ret is None:
                raise Exception("cache is not big enough to hold all "
                                "results, try increasing the cache size "
                                "or disable cache")
            yield ret
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object.

    What happens depends on the type reported by ``get_seqs_type``:
    "fasta" input is returned unchanged, "fastafile" is loaded with
    ``Fasta``, and everything else is converted to FASTA with the genome's
    ``track2fasta``. In the last case `genome` (a name or a genome object)
    is required; a ValueError is raised when it is missing.
    """
    ftype = get_seqs_type(seqs)

    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    # Everything else has to be extracted from a genome.
    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    genome_obj = Genome(genome) if isinstance(genome, str) else genome
    # numpy arrays are handed over as a plain list
    to_convert = list(seqs) if isinstance(seqs, np.ndarray) else seqs
    genome_obj.track2fasta(to_convert, tmpfa.name)
    return Fasta(tmpfa.name)
def ensembl_genome_info(genome_name: str) -> Tuple[str, str, str]:
    """Return Ensembl genome information for a local genome managed by genomepy.

    Parameters
    ----------
    genome_name : str
        Name of local genome.

    Returns
    -------
    (str, str, str)
        Ensembl name, accession, taxonomy_id. None is returned when the
        genome is not installed locally, when the Ensembl search gives no
        result, or when the first Ensembl hit has a different assembly
        accession than the local genome.
    """
    # Fast lookup for some common queries
    common_names = {
        "danRer11": "GRCz11",
        "hg38": "GRCh38",
        "mm10": "GRCm38",
        "dm6": "BDGP6.28",
    }

    genome = None
    if genome_name in common_names:
        search_term = common_names[genome_name]
    else:
        try:
            genome = Genome(genome_name)
            search_term = genome.tax_id
        except FileNotFoundError:
            logger.info(f"Genome {genome_name} not installed locally")
            p = ProviderBase.create("Ensembl")
            for name, *_rest in p.search(genome_name):
                if name == genome_name:
                    logger.info(
                        f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
                    )
                    return None
            return None

    # search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
    p = ProviderBase.create("Ensembl")
    # Only the first hit is used. An empty search result used to end in an
    # IndexError; it is now reported like any other missing match.
    first_hit = next(iter(p.search(search_term)), None)
    if first_hit is None:
        print("Could not find a matching genome in Ensembl")
        return None
    name, accession, _species, tax_id, *_rest = first_hit

    # Check if the assembly_id of the current Ensembl genome is the same as the
    # local genome. If it is identical, we can correctly assume that the genomes
    # sequences are identical.
    # For the genomes in the lookup table, we already know they match.
    if genome_name in common_names or accession == genome.assembly_accession:
        return name, accession, tax_id

    print("Could not find a matching genome in Ensembl")
    return None
def set_genome(self, genome):
    """Set the genome to use.

    The genome is used for:
        - converting regions to sequences
        - background for MOODS

    A falsy `genome` leaves the current setting untouched.
    """
    if genome:
        # raises error if checks fail
        Genome(genome)
        self.genome = genome
def __init__(self, matchfile, genome="hg19", number=None, size=None):
    """Create a Fasta object with sequences that are GC%-matched to `matchfile`.

    Parameters
    ----------
    matchfile : str
        Input file; passed to ``matched_gc_bedfile``.
    genome : str, optional
        Genome name.
    number : int, optional
        Number of sequences; passed to ``matched_gc_bedfile``.
    size : int, optional
        Size of the sequences; passed to ``matched_gc_bedfile``.
    """
    tmpfiles = []
    try:
        # Create temporary files. delete=False keeps each file on disk
        # under its own name; taking the name of an already deleted
        # NamedTemporaryFile would allow that name to be reused by
        # something else before it is written here.
        for _ in range(2):
            with NamedTemporaryFile(dir=mytmpdir(), delete=False) as handle:
                tmpfiles.append(handle.name)
        tmpbed, tmpfasta = tmpfiles

        # Create bed-file with coordinates of random sequences
        matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed
        for path in tmpfiles:
            if os.path.exists(path):
                os.remove(path)
def check_genome(genome):
    """Check if genome is a valid FASTA file or genomepy genome.

    Parameters
    ----------
    genome : str
        Genome name or file to check.

    Returns
    -------
    is_genome : bool
        True when a ``Genome`` can be created from `genome`, False when
        that raises any exception.
    """
    try:
        Genome(genome)
    except Exception:
        # Any failure simply means "not a usable genome".
        return False
    return True
def create_gc_bin_index(genome, fname, min_bin_size=100):
    """Create index of GC content for a genome.

    The genome is cut into windows of `min_bin_size`; for every window the
    GC fraction ("w" columns) and number of N's ("n" columns) are stored,
    together with rolling values over 2 and 5 consecutive windows. The
    result is written to `fname` in feather format.

    Parameters
    ----------
    genome : str
        Genome name.
    fname : str
        Name of the index file.
    min_bin_size : int
        Minimum bin size (default 100). Warning: setting to a small value
        will result in a very large index file!
    """
    logger.info("Creating index for genomic GC frequencies.")
    g = Genome(genome)
    fasta = g.filename
    sizes = g.props["sizes"]["sizes"]

    with NamedTemporaryFile() as tmp:
        # pylint: disable=unexpected-keyword-arg
        windows = pybedtools.BedTool().window_maker(g=sizes, w=min_bin_size)
        windows.nucleotide_content(fi=fasta).saveas(tmp.name)
        df = pd.read_csv(tmp.name, sep="\t", usecols=[0, 1, 2, 4, 9])

    cols = ["chrom", "start", "end", f"w{min_bin_size}", f"n{min_bin_size}"]
    for t in (2, 5):
        w_name = f"w{min_bin_size * t}"
        n_name = f"n{min_bin_size * t}"
        # Column 3 holds the GC fraction and column 4 the N count of the
        # smallest bin; larger bins are rolling aggregates of those.
        df[w_name] = df.iloc[:, 3].rolling(t, min_periods=t).mean()
        df[n_name] = df.iloc[:, 4].rolling(t, min_periods=t).sum()
        cols.extend([w_name, n_name])
    df.columns = cols

    df.reset_index()[cols].to_feather(fname)
def __init__(self, genome, size=None, n=None):
    """Create a Fasta object with random genomic sequences.

    Parameters
    ----------
    genome : str
        Genome name.
    size : int
        Size of the sequences. Converted with ``int()``, so despite the
        default it has to be provided.
    n : int, optional
        Number of sequences; passed to ``create_random_genomic_bedfile``.
    """
    size = int(size)

    tmpfiles = []
    try:
        # Create temporary files. delete=False keeps each file on disk
        # under its own name; taking the name of an already deleted
        # NamedTemporaryFile would allow that name to be reused by
        # something else before it is written here.
        for _ in range(2):
            with NamedTemporaryFile(dir=mytmpdir(), delete=False) as handle:
                tmpfiles.append(handle.name)
        tmpbed, tmpfasta = tmpfiles

        # Create bed-file with coordinates of random sequences
        create_random_genomic_bedfile(tmpbed, genome, size, n)

        # Convert track to fasta
        Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta, stranded=True)

        # Initialize super Fasta object
        Fasta.__init__(self, tmpfasta)
    finally:
        # Delete the temporary files, also when one of the steps failed
        for path in tmpfiles:
            if os.path.exists(path):
                os.remove(path)
def __init__(self, name):
    """Load the dataset `name` from its data directory.

    Reads ``info.yaml`` from the directory returned by ``locate_data``,
    loads the optional "source" dataset named in that file, and logs
    installation instructions when the genome of the dataset is not
    available.
    """
    self.name = str(name)
    self.data_dir = Path(locate_data(name))

    info_file = os.path.join(self.data_dir, "info.yaml")
    with open(info_file) as f:
        self.config = yaml.load(f, Loader=yaml.FullLoader)

    # A dataset can build on another ("source") dataset.
    self.source = None
    source = self.config.get("source")
    if source:
        self.source = ScepiaDataset(source)

    try:
        Genome(self.genome)
    except FileNotFoundError:
        # Only reported here; nothing is raised.
        for message in (
            f"Genome {self.genome} is needed for this dataset.",
            "Please install it with genomepy.",
            f"Command-line: genomepy install {self.genome}",
            f'Python: import genomepy; genomepy.install_genome("{self.genome}")',
        ):
            logger.error(message)
def create_link_file(meanstd_file: str,
                     genes_file: str,
                     genome: Optional[str] = "hg38") -> pd.DataFrame:
    """Link genes to the enhancers that lie within 100 kb of them.

    Parameters
    ----------
    meanstd_file : str
        Feather or tab-separated file with enhancer locations in a column
        named "index"; the locations are split on ":" and "-".
    genes_file : str
        File with gene locations, loaded with BedTool.
    genome : str, optional
        Genome name, used for the chromosome sizes.

    Returns
    -------
    pandas.DataFrame
        Columns "gene", "loc" (chrom:start-end of the enhancer) and "pos"
        (center of the enhancer).
    """
    meanstd_file = str(meanstd_file)

    # Read enhancer locations
    if meanstd_file.endswith("feather"):
        table = pd.read_feather(meanstd_file)
    else:
        table = pd.read_csv(meanstd_file, sep="\t")
    locations = table["index"]
    enhancers = BedTool.from_dataframe(
        locations.str.split("[-:]", expand=True))

    # Calculating overlap with certain distance
    sizes_file = Genome(genome).sizes_file
    genes = BedTool(genes_file).slop(b=100000, g=sizes_file).cut([0, 1, 2, 3])
    hits = genes.intersect(b=enhancers, wo=True)
    hits = hits.to_dataframe().iloc[:, 3:7]
    hits.columns = ["gene", "chrom", "start", "end"]

    start_text = hits["start"].astype(str)
    end_text = hits["end"].astype(str)
    hits["loc"] = hits["chrom"] + ":" + start_text + "-" + end_text
    hits["pos"] = ((hits["start"] + hits["end"]) / 2).astype(int)
    return hits[["gene", "loc", "pos"]]
def is_genome_installed(ref_genome):
    """
    Celloracle motif_analysis module uses gimmemotifs and genomepy internally.
    Reference genome files should be installed in the PC to use gimmemotifs and genomepy.
    This function checks the installation status of the reference genome.

    Args:
        ref_genome (str): names of reference genome. i.e., "mm10", "hg19"

    Returns:
        bool: True if a Genome object can be created for ref_genome,
        otherwise False (after printing installation instructions).
    """
    try:
        Genome(ref_genome)
    except Exception:
        # Any error while loading counts as "not installed". Unlike a bare
        # except, this lets KeyboardInterrupt and SystemExit through.
        print(f"genome {ref_genome} is not installed in this environment.")
        print("Please install genome using genomepy.")
        print('e.g.\n    >>> import genomepy\n    >>> genomepy.install_genome("mm9", "UCSC")')
        return False
    return True
def check_denovo_input(inputfile, params):
    """Check if an input file is valid, which means BED, narrowPeak or FASTA.

    Returns the detected input type together with the entries of
    ``params["background"]`` that are valid for that type. Exits with
    status 1 if the format is not recognized or if no valid background
    type is left.
    """
    background = params["background"]
    input_type = determine_file_type(inputfile)

    if input_type == "fasta":
        valid_bg = FA_VALID_BGS
    elif input_type in ("bed", "narrowpeak"):
        genome = params["genome"]
        valid_bg = BED_VALID_BGS
        # These background types need a genome
        if any(bg_type in background for bg_type in ("genomic", "gc")):
            Genome(genome)
        # is it a valid bed-file etc.
        # bed-specific, will also work for narrowPeak
        check_bed_file(inputfile)
    else:
        for message in (
            "Format of inputfile {} not recognized.\n".format(inputfile),
            "Input should be FASTA, BED or narrowPeak.\n",
            "See https://genome.ucsc.edu/FAQ/FAQformat.html for specifications.\n",
        ):
            sys.stderr.write(message)
        sys.exit(1)

    # Keep the valid background types and report the others
    accepted = []
    for bg in background:
        if bg in valid_bg:
            accepted.append(bg)
        else:
            logger.info("Input type is %s, ignoring background type '%s'",
                        input_type, bg)

    if not accepted:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, accepted
def test2_as_fasta(self):
    """ convert bed, regions, etc to Fasta """
    tmpdir = mkdtemp()

    g = Genome("genome", genome_dir=self.genome_dir)

    fafile = os.path.join(self.datadir, "test.fa")
    fa = Fasta(fafile)
    bedfile = os.path.join(self.datadir, "test.bed")
    regionfile = os.path.join(self.datadir, "test.txt")
    with open(regionfile) as handle:
        regions = [line.strip() for line in handle]

    # Every supported kind of input has to come back as a Fasta object
    supported_inputs = (
        (fa,),
        (fafile,),
        (bedfile, g),
        (regionfile, g),
        (regions, g),
    )
    for call_args in supported_inputs:
        self.assertTrue(isinstance(as_fasta(*call_args), Fasta))

    # A BED file cannot be converted without a genome
    with self.assertRaises(ValueError):
        as_fasta(bedfile)

    rmtree(tmpdir)
def background(args):
    """Create a background file as requested on the command line.

    Reads ``inputfile``, ``outputfile``, ``bg_type``, ``outformat``,
    ``length``, ``genome``, ``number`` and (for random backgrounds)
    ``markov_order`` from `args`. Prints a message and exits with status 1
    on invalid combinations of arguments.
    """
    inputfile = args.inputfile
    out = args.outputfile
    bg_type = args.bg_type
    outformat = args.outformat.lower()
    length = args.length

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" % (",".join(BG_TYPES)))
        sys.exit(1)

    if outformat == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and outformat == "fasta":
        Genome(args.genome)

    # Gene definition. Only promoter backgrounds use it, so the genome is
    # not touched for the other types (a random background does not
    # need a genome at all).
    gene_file = None
    if bg_type in ["promoter"]:
        fname = Genome(args.genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            # Fall back to the gene directory that the message below
            # points to.
            gene_file = os.path.join(config.get_gene_dir(), "{}.bed".format(args.genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(args.genome))
            print("Did you use the --annotation flag for genomepy?")
            print("Alternatively make sure there is a file called {}.bed in {}".format(args.genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    number = None
    if args.number:
        number = args.number
    elif inputfile:
        number = number_of_seqs_in_file(inputfile)
    else:
        sys.stderr.write("please provide either a number or an inputfile\n")
        sys.exit(1)

    if bg_type == "random":
        f = Fasta(inputfile)
        m = bg.MarkovFasta(f, n=number, k=args.markov_order)
        m.writefasta(out)
    elif bg_type == "gc":
        if outformat in ["fasta", "fa"]:
            m = bg.MatchedGcFasta(inputfile, args.genome, number=number)
            m.writefasta(out)
        else:
            bg.matched_gc_bedfile(out, inputfile, args.genome, number)
    elif bg_type == "promoter":
        if outformat in ["fasta", "fa"]:
            m = bg.PromoterFasta(gene_file, args.genome, length=length, n=number)
            m.writefasta(out)
        else:
            bg.create_promoter_bedfile(out, gene_file, length, number)
    elif bg_type == "genomic":
        if outformat in ["fasta", "fa"]:
            m = bg.RandomGenomicFasta(args.genome, length, number)
            m.writefasta(out)
        else:
            bg.create_random_genomic_bedfile(out, args.genome, length, number)
def matched_gc_bedfile(bedfile, matchfile, genome, number, size=None, min_bin_size=100):
    """Create a BED file with GC% matched to input file.

    Parameters
    ----------
    bedfile : str
        Name of the output BED file.
    matchfile : str
        Name of input file (BED or FASTA format)
    genome : str
        Genome name.
    number : int
        Number of sequences to retrieve.
    size : int, optional
        Size of the generated sequenced. If not provided, the input size is used.
    min_bin_size : int, optional
        Passed on to ``gc_bin_bedfile``.

    Raises
    ------
    ValueError
        If none of the input sequences falls into a GC bin, so the
        requested number of sequences cannot be distributed.
    """
    # Local import: only needed for removing the temporary file.
    import os

    g = Genome(genome)
    genome_fa = g.filename
    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / len(seq)
              for seq in fa.seqs]
        sizes = [len(seq) for seq in fa.seqs]
    except Exception:
        # Not a FASTA file, try it as a BED file
        try:
            # pylint: disable=unexpected-keyword-arg
            fields = pd.read_csv(matchfile, comment="#", nrows=10,
                                 sep="\t").shape[1]
            tmp = (pybedtools.BedTool(matchfile).filter(
                lambda x: len(x) >= 10).saveas().fn)
            bed = pybedtools.BedTool(tmp)
            gc = np.array([
                float(x[fields + 1])
                for x in bed.nucleotide_content(fi=genome_fa)
            ])
            sizes = np.array([x.length for x in bed])
            gc = [round(x, 2) for x in gc]
        except Exception:
            sys.stderr.write(
                "Please provide input file in BED or FASTA format\n")
            raise

    # Get the median size of the sequences
    if size is None or size == 0:
        size = int(np.median(sizes))
        if np.std(sizes) > size * 0.05:
            sys.stderr.write("Sequences do not seem to be of equal size.\n")
            sys.stderr.write(("GC% matched sequences of the median size ({}) "
                              "will be created\n").format(size))

    bins = [(0.0, 0.2), (0.8, 1)]
    for b in np.arange(0.2, 0.799, 0.05):
        bins.append((b, b + 0.05))

    fraction = number / len(gc)
    gc = np.array(gc)

    # Number of input sequences per bin, and that number scaled (rounded
    # down) to the requested total.
    raw_count = [
        np.sum((gc > round(b_start, 2)) & (gc <= round(b_end, 2)))
        for b_start, b_end in bins
    ]
    bin_count = [int(count * fraction) for count in raw_count]

    # To make the requested number, divide remaining over
    # all bins that have counts
    rest = number - sum(bin_count)
    if rest > 0:
        eligible = [i for i, count in enumerate(bin_count) if count > 0]
        if not eligible:
            # Every bin was rounded down to zero. Use the bins that contain
            # input sequences; searching for a non-zero count would never
            # end in this situation.
            eligible = [i for i, count in enumerate(raw_count) if count > 0]
        if not eligible:
            raise ValueError(
                "none of the input sequences falls into a GC bin")
        for k in range(rest):
            bin_count[eligible[k % len(eligible)]] += 1

    nseqs = max(bin_count) * len(bins)

    # Only the name is needed; gc_bin_bedfile writes to the file itself.
    with NamedTemporaryFile(delete=False) as tmp:
        tmp_name = tmp.name
    try:
        gc_bin_bedfile(
            tmp_name,
            genome,
            nseqs,
            length=size,
            bins=bins,
            random_state=None,
            min_bin_size=min_bin_size,
        )
        df = pd.read_csv(tmp_name,
                         sep="\t",
                         names=["chrom", "start", "end", "bin"])
    finally:
        # The file was created with delete=False, so remove it here.
        if os.path.exists(tmp_name):
            os.unlink(tmp_name)

    with open(bedfile, "w") as f:
        for (b_start, b_end), n in zip(bins, bin_count):
            if n == 0:
                continue
            b = "{:.2f}-{:.2f}".format(b_start, b_end)
            df.loc[df["bin"] == b, ["chrom", "start", "end"]].sample(n).to_csv(
                f, sep="\t", header=False, index=False)
def create_background_file(outfile, bg_type, fmt="fasta", size=None, genome=None, inputfile=None, number=10000):
    """
    Create a background file for motif analysis.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'.
    size : int, optional
        Size of the generated sequences, is determined from the inputfile if not
        given.
    genome : str, optional
        Genome name; required for FASTA output of gc, genomic and promoter
        backgrounds.
    inputfile : str, optional
        Input file; required for gc backgrounds.
    number : int, optional
        Number of sequences. If None, the number of sequences in the input
        file is used, or 10,000 without an input file.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" %
              (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        Genome(genome)

    if bg_type in ["promoter"]:
        # Gene definition
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            # Fall back to the gene directory that the message below
            # points to.
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            # np.median returns a float; a sequence size has to be an integer
            size = int(np.median(
                [len(seq) for seq in as_fasta(inputfile, genome=genome).seqs]))
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
def create_background(bg_type, fafile, outfile, genome="hg18", width=200, nr_times=10, custom_background=None):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type (random, genomic, gc, promoter or custom).

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    width : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        FASTA file that is used for the "custom" background type.

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If no gene file is found for a promoter background, or if
        `bg_type` is not one of the supported types.
    IOError
        If the custom background file is not specified or does not exist.
    """
    width = int(width)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_times * len(fg))
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, width, nr_times * len(fg))
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_times * len(fg))
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            # Fall back to the gene directory that the message below
            # points to.
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print("Alternatively make sure there is a file called {}.bed in {}".format(genome, config.get_gene_dir()))
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome))

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome, gene_file)
        f = PromoterFasta(gene_file, genome, width, nr_times * len(fg))
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Exceptions do not apply %-style arguments like logging calls
            # do, so the message is formatted here.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.",
                    bg_file, outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        if median_length < (width * 0.95) or median_length > (width * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median length of %s, while GimmeMotifs predicts motifs in sequences "
                "of length %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same length.",
                bg_file, median_length, width)
    else:
        # Without this the function would fail further down on the
        # unassigned background object.
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
def matched_gc_bedfile(bedfile, matchfile, genome, number):
    """Create a BED file with random regions, GC%-matched to the input file.

    The GC fractions of the input are binned in a 20-bin histogram; random
    genomic regions of the median input length are generated and, per bin,
    written to `bedfile` until the count of that bin is reached.

    Parameters
    ----------
    bedfile : str
        Name of the output BED file.
    matchfile : str
        Name of input file (BED or FASTA format).
    genome : str
        Genome name.
    number : int
        Number of sequences to retrieve.
    """
    # Random regions with a larger fraction of N's than this are not used
    N_FRACTION = 0.1

    g = Genome(genome)
    genome_fa = g.filename
    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / len(seq) for seq in fa.seqs]
        lengths = [len(seq) for seq in fa.seqs]
    except Exception:
        # Not a FASTA file, try it as a BED file
        try:
            # pylint: disable=unexpected-keyword-arg
            bed = pybedtools.BedTool(matchfile)
            gc = [float(x[4]) for x in bed.nucleotide_content(fi=genome_fa)]
            lengths = [x.length for x in bed]
        except Exception:
            sys.stderr.write("Please provide input file in BED or FASTA format\n")
            raise

    gc_hist, bins = np.histogram(gc, range=(0, 1), bins=20)

    length = np.median(lengths)
    if np.std(lengths) > length * 0.05:
        sys.stderr.write("Sequences do not seem to be of equal length.\n")
        sys.stderr.write("GC% matched sequences of the median length ({}) will be created\n".format(length))

    if number:
        # Scale the histogram to the requested number and correct the
        # rounding so the counts add up to exactly `number`.
        norm = number * gc_hist / (float(sum(gc_hist))) + 0.5
        # The builtin int replaces the np.int alias, which recent NumPy
        # versions no longer provide.
        inorm = norm.astype(int)

        s = np.argsort(norm - inorm)
        while sum(inorm) > number:
            if inorm[np.argmin(s)] > 0:
                inorm[np.argmin(s)] -= 1
            s[np.argmin(s)] = len(s)
        while sum(inorm) < number:
            inorm[np.argmax(s)] += 1
            s[np.argmax(s)] = 0
        gc_hist = inorm

    rnd = pybedtools.BedTool()
    # The context manager closes the output file on errors as well.
    with open(bedfile, "w") as out:
        # Create a file with chromosome sizes if it doesn't exist yet
        genome_size = genome_fa + ".sizes"
        del_size = False
        if not os.path.exists(genome_size):
            genome_size = NamedTemporaryFile().name
            del_size = True
            with open(genome_size, "w") as f:
                for seqname in g.keys():
                    f.write("{}\t{}\n".format(seqname, len(g[seqname])))

        # pylint: disable=unexpected-keyword-arg
        r = rnd.random(l=length, n=number * 30, g=genome_size).nucleotide_content(fi=genome_fa)
        if del_size:
            os.unlink(genome_size)

        # Keep chrom, start, end plus the value of field 7 (used as GC
        # fraction) for regions whose field 12 (used as N count) is low.
        features = [f[:3] + [float(f[7])] for f in r if float(f[12]) <= length * N_FRACTION]

        for bin_start, bin_end, count in zip(bins[:-1], bins[1:], gc_hist):
            if count > 0:
                rcount = 0
                for f in features:
                    if (f[3] >= bin_start and f[3] < bin_end):
                        out.write("{}\t{}\t{}\n".format(*f[:3]))
                        rcount += 1
                        if rcount >= count:
                            break

                if count != rcount:
                    sys.stderr.write("not enough random sequences found for {} <= GC < {} ({} instead of {})\n".format(bin_start, bin_end, rcount, count))
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type. One of "random", "genomic", "gc",
        "promoter" or "custom".

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of a FASTA file to use as background; required when `bg_type`
        is "custom".

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If `bg_type` is not recognized, or if no gene file can be found for
        a "promoter" background.
    IOError
        If a "custom" background is requested but the file is not specified
        or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)

    if bg_type in ["genomic", "gc"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_times * len(fg))
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_times * len(fg))
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_times * len(fg))
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        # Fall back to the configured gene directory when the genomepy
        # annotation is missing (the path string itself is never empty).
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".format(
                    genome, config.get_gene_dir()
                )
            )
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome)
            )

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_times * len(fg))
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            raise IOError("Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file, outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the background deviates more than 5% from the motif size
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        # Fail with a clear message instead of an UnboundLocalError on `f`.
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
def gene_annotation(genes: Iterable[str], genome: str) -> pd.DataFrame:
    """Retrieve genomic annotation of a set of genes.

    If the annotation is not present locally, then mygene.info is used.
    All genome assemblies that are present in the latest version of
    Ensembl are supported by mygene.info.

    Parameters
    ----------
    genes : Iterable
        List of gene names or gene identifiers such as ensembl_id.
    genome : str
        Genome name

    Returns
    -------
    pandas.DataFrame with gene annotation.
        When the annotation comes from mygene.info the columns are
        "chrom", "start", "end", "name" and "strand". If a local annotation
        is found, the result of `_local_gene_annotation` is returned as-is.
        None is returned when `ensembl_genome_info` returns None for the
        genome.

    Notes
    -----
    The process exits (`sys.exit()`) when mygene.info finds none of the
    genes.
    """
    # First try to find the genes in the local annotation installed by genomepy.
    gene_info = _local_gene_annotation(genes, genome)
    if gene_info is not None:
        return gene_info

    # Genes are not identified locally.
    # Retrieve the gene information using the mygene.info API
    logger.info("No local matching genes found for %s, trying mygene.info", genome)

    # mygene.info only queries the most recent version of the Ensembl database
    # We can only safely continue if the local genome matched the Ensembl genome.
    # Even if the local genome was installed via Ensembl, we still need to check
    # if it is the same version
    if ensembl_genome_info(genome) is None:
        return None

    # Run the actual query
    g = Genome(genome)
    logger.info("Querying mygene.info...")
    mg = mygene.MyGeneInfo()
    result = mg.querymany(
        genes,
        scopes="symbol,name,ensemblgene,entrezgene",
        fields="genomic_pos",
        species=g.tax_id,
        as_dataframe=True,
        verbose=False,
    )

    # A single "notfound" column means that none of the queries matched
    if "notfound" in result and result.shape[1] == 1:
        logger.error("No matching genes found")
        sys.exit()

    if g.provider == "Ensembl":
        result = result.rename(columns={"genomic_pos.chr": "chrom"})
    else:
        # Ensembl, UCSC and NCBI chromosome names can all be different :-/
        logger.info("Local genome is not an Ensembl genome.")
        mapping = load_mapping(g.name)
        result = result.join(mapping, on="genomic_pos.chr")

    # Drop genes without a (mapped) chromosome before the integer conversion
    result = result.dropna(subset=["chrom"])

    # Convert genomic positions from string to integer
    result["genomic_pos.start"] = result["genomic_pos.start"].astype(int)
    result["genomic_pos.end"] = result["genomic_pos.end"].astype(int)

    # For each gene use the match with the highest score
    result = result.reset_index().sort_values("_score").groupby("query").last()

    # Map the Ensembl 1/-1 strand to +/- strand
    strand_df = pd.DataFrame(
        {"ens_strand": [-1, 1], "strand": ["-", "+"]}
    ).set_index("ens_strand")
    result = result.join(strand_df, on="genomic_pos.strand")

    # Select the correct columns and name them
    result = result.reset_index()[
        ["chrom", "genomic_pos.start", "genomic_pos.end", "query", "strand"]
    ]
    # Assign a flat list: a nested list would create MultiIndex columns
    result.columns = ["chrom", "start", "end", "name", "strand"]

    return result