Example #1
0
def _as_seqdict_genome_regions(regions, minsize=None):
    """
    Convert regions in ``genome@chrom:start-end`` format to a sequence dict.

    Regions are grouped per genome, extracted with ``Genome.track2fasta()``,
    written to a temporary FASTA file and read back with ``as_seqdict()``.
    The result is returned in the same order as the input regions.

    Parameters
    ----------
    regions : iterable of str
        Regions with the genome encoded as ``genome@chrom:start-end``.
    minsize : int, optional
        Passed on to ``_check_minsize()``.

    Returns
    -------
    The result of ``_check_minsize()`` applied to a dict keyed by the
    original region strings.
    """
    # The input is traversed twice (grouping and re-ordering), so make sure
    # a one-shot iterable is not exhausted after the first pass.
    regions = list(regions)

    # Group regions per genome. Genome objects are created here so that an
    # unknown genome fails before any sequence is extracted, and so that each
    # genome is only opened once.
    genomes = {}
    genomic_regions = {}
    for full_region in regions:
        genome, region = full_region.split("@")
        if genome not in genomic_regions:
            genomes[genome] = Genome(genome)
            genomic_regions[genome] = []
        genomic_regions[genome].append(region)

    # The context manager guarantees the handle is flushed and closed before
    # the file is read back. delete=False keeps the file on disk after close.
    with NamedTemporaryFile(mode="w", delete=False) as tmpfa:
        for genome, g_regions in genomic_regions.items():
            for seq in genomes[genome].track2fasta(g_regions):
                seq.name = f"{genome}@{seq.name}"
                print(repr(seq), file=tmpfa)

    # Open tempfile and restore original sequence order
    fa = as_seqdict(tmpfa.name)
    fa = {region: fa[region] for region in regions}
    return _check_minsize(fa, minsize)
Example #2
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions to same size; split in test and validation set;
    converted to FASTA.

    The following files are written to ``outdir``: ``input.bed``,
    ``prediction.bed``, ``validation.bed``, the FASTA counterparts of the
    prediction and validation sets, and ``localization.fa``.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys "width", "abs_max", "fraction",
        "genome", "lwidth" and "use_strand" are read.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): the result is unused here; the call is kept in case
    # constructing MotifConfig has side effects -- confirm before removing.
    config = MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Only swap the extension: str.replace() would also rewrite any
        # ".bed" that happens to occur in the output directory name.
        fafile = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(infile, fafile)

    # Create file for location plots
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
Example #3
0
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object.

    Parameters
    ----------
    seqs :
        Input of any type recognised by ``get_seqs_type()``.
    genome : str or Genome, optional
        Genome name or object; required when `seqs` is neither a Fasta
        object nor a FASTA file.

    Returns
    -------
    Fasta

    Raises
    ------
    ValueError
        If a genome is needed for the conversion but none was given.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    # isinstance() also accepts str subclasses, unlike a type() comparison.
    if isinstance(genome, str):
        genome = Genome(genome)

    # Keep a reference to the temporary file until Fasta() has been built;
    # NamedTemporaryFile removes the file once it is closed/collected.
    tmpfa = NamedTemporaryFile()
    genome.track2fasta(seqs, tmpfa.name)
    return Fasta(tmpfa.name)
Example #4
0
def as_fasta(seqs, genome=None):
    """Convert sequences, a FASTA file or genomic regions to a Fasta object.

    A Fasta object is returned unchanged and a FASTA file name is loaded
    directly. Anything else is converted via ``Genome.track2fasta()``, which
    requires `genome` (a genome name or a Genome object).

    Raises
    ------
    ValueError
        If `seqs` needs a genome for conversion and `genome` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    elif ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    tmpfa = NamedTemporaryFile()
    # Use isinstance() rather than comparing type objects, so that str
    # subclasses are treated as genome names too.
    if isinstance(genome, str):
        genome = Genome(genome)
    genome.track2fasta(seqs, tmpfa.name)
    # tmpfa is still referenced here, so the file exists while it is read.
    return Fasta(tmpfa.name)
 def __call__(self, parser, args, name, option_string=None):
     """Resolve `name` to a Genome and store it on `args` under `self.dest`.

     If the genome is not found and `auto_install` (from the enclosing
     scope) is set, it is installed first; otherwise the program exits with
     status 1.
     """
     try:
         genome = Genome(name, genomes_dir=genomes_dir)
     except FileNotFoundError:
         logger.warning(f"Genome {name} not found!")
         if not auto_install:
             logger.info("You can install it using `genomepy install`.")
             sys.exit(1)
         logger.info(
             "Trying to install it automatically using genomepy...")
         install_genome(name, annotation=True, genomes_dir=genomes_dir)
         # Second attempt, now that the genome has been installed.
         genome = Genome(name, genomes_dir=genomes_dir)
     setattr(args, self.dest, genome)
Example #6
0
def load_mapping(genome_name):
    """Build a table that maps alternative sequence names to genome names.

    The NCBI assembly report of the genome is reshaped so that every known
    alias (sequence name, UCSC-style name, assigned molecule, GenBank
    accession) is an index value pointing to the name used by the genome's
    provider.

    Returns
    -------
    pandas.DataFrame with a single column "chrom", indexed by alias, or
    None if the genome's provider is neither UCSC nor NCBI.
    """
    logger.info("Loading chromosome mapping.")
    genome = Genome(genome_name)
    asm_acc = genome.assembly_accession

    # provider -> (column holding that provider's names, log message)
    providers = {
        "NCBI": ("Sequence-Name", "Mapping to NCBI sequence names"),
        "UCSC": ("UCSC-style-name", "Mapping to UCSC sequence names"),
    }
    provider = genome.provider
    if provider not in providers:
        logger.error(f"Can't map to provider {provider}")
        return None

    asm_report = ncbi_assembly_report(asm_acc)
    # Only assembled molecules keep their "Assigned-Molecule" alias.
    not_assembled = asm_report["Sequence-Role"] != "assembled-molecule"
    asm_report.loc[not_assembled, "Assigned-Molecule"] = "na"

    alias_columns = [
        "Sequence-Name", "UCSC-style-name", "Assigned-Molecule", "GenBank-Accn"
    ]
    mapping = asm_report[alias_columns]

    id_column, message = providers[provider]
    logger.info(message)

    # Long format: one row per (provider name, alias) pair.
    melted = pd.melt(mapping, id_vars=[id_column])
    melted = melted[melted["value"] != "na"]
    unique_rows = melted.drop_duplicates()
    mapping = unique_rows.set_index("value")[[id_column]]
    mapping.columns = ["chrom"]
    return mapping
Example #7
0
def check_denovo_input(inputfile, params):
    """Determine the input type and the usable background types.

    The input is treated as FASTA if ``Fasta(inputfile)`` succeeds and as
    BED otherwise. BED input is additionally checked with
    ``check_bed_file()``, and the genome is loaded when a "genomic"
    background is requested.

    Parameters
    ----------
    inputfile : str
        Input file name.
    params : dict
        Parameters; the keys "genome" and "background" are read.

    Returns
    -------
    (str, list)
        Input type ("FASTA" or "BED") and the backgrounds that are valid
        for that type. Exits with status 1 if no valid background remains.
    """
    genome = params["genome"]
    background = params["background"]

    # If we can load it as fasta then it is a fasta, otherwise assume BED.
    try:
        Fasta(inputfile)
    except Exception:
        input_type = "BED"
    else:
        logger.debug("Inputfile is a FASTA file")
        input_type = "FASTA"

    if input_type == "FASTA":
        valid_bg = FA_VALID_BGS
    else:
        valid_bg = BED_VALID_BGS
        if "genomic" in background:
            # Raises if the genome cannot be loaded.
            Genome(genome)
        # is it a valid bed-file etc.
        check_bed_file(inputfile)  # bed-specific

    for bg in background:
        if bg not in valid_bg:
            logger.info("Input type is %s, ignoring background type '%s'",
                        input_type, bg)
    # Filter once, after reporting, instead of rebuilding the list on every
    # iteration of the loop above.
    background = [bg for bg in background if bg in valid_bg]

    if not background:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, background
Example #8
0
    def set_peak_size(self, peak_bed, seqlen=200):
        """Write the input peaks, resized to `seqlen`, to a temporary file.

        Peaks whose length differs from `seqlen` are replaced by a window of
        ``seqlen // 2`` on either side of the peak midpoint. Peaks that do
        not satisfy ``start > 0 and end < chromosome size`` after resizing
        are dropped.

        Arguments:
            peak_bed {[bed]} -- [input peak bed file]

        Keyword Arguments:
            seqlen {int} -- [peak length] (default: {200})

        Returns:
            [str] -- [name of the temporary file with chrom, start, end]
        """
        gsizedic = Genome(self.genome).sizes
        peaks = BedTool(peak_bed)
        flank = seqlen // 2

        # The context manager flushes and closes the file before its name is
        # handed to the caller; delete=False keeps it on disk.
        with NamedTemporaryFile(mode="w", dir=mytmpdir(),
                                delete=False) as fl2:
            for peak in peaks:
                if peak.length != seqlen:
                    # get the summit and the flanking low and high sequences
                    summit = (peak.start + peak.end) // 2
                    start, end = summit - flank, summit + flank
                else:
                    start, end = peak.start, peak.end

                # skip regions that extend past the chromosome boundaries
                if start > 0 and end < int(gsizedic[peak.chrom]):
                    fl2.write(f"{peak.chrom}\t{start}\t{end}\n")

        return fl2.name
Example #9
0
def _local_gene_annotation(genes: Iterable[str], genome: str) -> pd.DataFrame:
    """Retrieve gene location from local annotation.

    The annotation is read from ``<genome>.annotation.bed.gz`` in the same
    directory as the genome FASTA file.

    Parameters
    ----------
    genes : Iterable
        List of gene names or gene identifiers such as ensembl_id.
    genome : str
        Genome name

    Returns
    -------
    pandas.DataFrame with gene annotation (columns chrom, start, end, name,
    strand), or None if the annotation file does not exist or fewer than
    half of the genes were found in it.
    """
    g = Genome(genome)
    gene_list = list(genes)
    bed = os.path.join(os.path.dirname(g.filename),
                       f"{genome}.annotation.bed.gz")
    if not os.path.exists(bed):
        return None

    df = pd.read_table(
        bed,
        index_col=3,
        usecols=[0, 1, 2, 3, 5],
        names=["chrom", "start", "end", "name", "strand"],
    )

    # Selecting with .loc and a list that contains unknown labels raises a
    # KeyError, so restrict the lookup to genes present in the annotation.
    # Order (and repeats) of the requested genes are preserved.
    annotated = set(df.index)
    found = [gene for gene in gene_list if gene in annotated]
    gene_info = df.loc[found]

    # If we find more than half of the genes we assume this worked.
    if gene_info.shape[0] >= 0.5 * len(gene_list):
        return gene_info.reset_index()[[
            "chrom", "start", "end", "name", "strand"
        ]]
    return None
Example #10
0
def peak2fasta(peak_ids, ref_genome):
    '''
    Convert peak_id into fasta object.

    Args:
        peak_ids (str or list of str): Peak_id.  e.g. "chr5_0930303_9499409"
            or it can be a list of peak_id.  e.g. ["chr5_0930303_9499409", "chr11_123445555_123445577"]

        ref_genome (str): Reference genome name.   e.g. "mm9", "mm10", "hg19" etc

    Returns:
        gimmemotifs fasta object: DNA sequence in fasta format

    '''
    genome_data = Genome(ref_genome)

    # A single peak id is handled like a one-element list. isinstance() also
    # covers str subclasses, which an exact type() check would iterate over
    # character by character.
    if isinstance(peak_ids, str):
        peak_ids = [peak_ids]

    fasta = Fasta()
    for peak_id in peak_ids:
        chromosome_name, start, end = decompose_chrstr(peak_id)
        region = genome_data[chromosome_name][int(start):int(end)]
        # The name is rebuilt from the coordinates reported by the genome
        # object rather than copied from the peak id.
        name = f"{region.name}_{region.start}_{region.end}"
        fasta.add(name, region.seq)

    return fasta
def create_gc_bin_index(genome, fname, min_bin_size=100):
    """Create index of GC content for a genome.

    The genome is split in windows of `min_bin_size`; for every window the
    GC fraction ("w<size>") and the number of N's ("n<size>") are stored.
    Rolling aggregates over 2 and 5 consecutive windows are added as
    additional columns. The result is written in feather format.

    Parameters
    ----------
    genome : str
        Genome name.
    fname : str
        Name of the index file.
    min_bin_size : int
        Minimum bin size (default 100). Warning: setting to a small value
        will result in a very large index file!
    """
    logger.info("Creating index for genomic GC frequencies.")
    g = Genome(genome)
    fasta = g.filename
    sizes = g.filename + ".sizes"  # props["sizes"]["sizes"]

    # The number of N's in a window can be as large as the window itself.
    # int8 is enough for the default bin size, but would overflow for
    # windows larger than 127 bp.
    if min_bin_size <= np.iinfo(np.int8).max:
        n_dtype = np.int8
    else:
        n_dtype = np.int32

    with NamedTemporaryFile() as tmp:
        # pylint: disable=unexpected-keyword-arg
        pybedtools.BedTool().window_maker(
            g=sizes,
            w=min_bin_size).nucleotide_content(fi=fasta).saveas(tmp.name)
        df = pd.read_csv(
            tmp.name,
            sep="\t",
            usecols=[0, 1, 2, 4, 9],
            dtype={
                "#1_usercol": "string",
                "2_usercol": np.int64,
                "3_usercol": np.int64,
                "5_pct_gc": np.float32,
                "10_num_N": n_dtype,
            },
        )

    cols = [
        "chrom",
        "start",
        "end",
        "w{}".format(min_bin_size),
        "n{}".format(min_bin_size),
    ]
    # Columns 3 and 4 are addressed by position because the frame is only
    # renamed after all rolling columns have been appended.
    for t in (2, 5):
        df["w{}".format(min_bin_size * t)] = (df.iloc[:, 3].rolling(
            t, min_periods=t).mean())
        df["n{}".format(min_bin_size * t)] = (df.iloc[:, 4].rolling(
            t, min_periods=t).sum())
        cols += [
            "w{}".format(min_bin_size * t), "n{}".format(min_bin_size * t)
        ]

    df.columns = cols

    # Make really sure that column 'chrom' is a string
    df.dropna(subset=["chrom"], inplace=True)
    df["chrom"] = df["chrom"].apply(str).astype("string")

    df.reset_index()[cols].to_feather(fname)
def create_random_genomic_bedfile(out, genome, size, n):
    """Write `n` random genomic regions of length `size` to a BED file.

    Parameters
    ----------
    out : str
        Name of the output BED file.
    genome : str
        Genome name, passed to ``Genome()``.
    size : int
        Length of the regions.
    n : int
        Number of regions.
    """
    features = Genome(genome).get_random_sequences(n, size)

    # Write result to bedfile; the context manager closes the file, which
    # the original flush()-only version never did.
    with open(out, "w") as bed:
        for chrom, start, end in features:
            bed.write("%s\t%d\t%d\n" % (chrom, start, end))
Example #13
0
def _genomepy_convert(to_convert, genome, minsize=None):
    """
    Convert `to_convert` to a sequence dict by way of track2fasta().

    `genome` is either a Genome instance or something accepted by the
    Genome constructor. The sequences are written to a temporary FASTA file,
    loaded with as_seqdict() and passed through _check_minsize().

    Raises ValueError when no genome is supplied.
    """
    if genome is None:
        raise ValueError("input file is not a FASTA file, need a genome!")

    g = genome if isinstance(genome, Genome) else Genome(genome)

    # The temporary file lives as long as this local reference does.
    tmpfile = NamedTemporaryFile()
    g.track2fasta(to_convert, tmpfile.name)

    return _check_minsize(as_seqdict(tmpfile.name), minsize)
Example #14
0
    def _scan_regions(self, regions, nreport, scan_rc):
        """Scan genomic regions for motifs, optionally using the cache.

        Generator. Without cache the results are yielded as the scan jobs
        produce them. With cache, regions not yet cached are scanned and
        stored, after which the results for all regions are yielded from the
        cache in input order.

        Raises
        ------
        Exception
            If a result cannot be retrieved from the cache afterwards.
        """
        genome = self.genome
        motif_file = self.motifs
        motif_digest = self.checksum.get(motif_file, None)

        def cache_key(region):
            # One key definition for lookup, storage and retrieval. The
            # threshold is part of the key; the initial lookup previously
            # left it out and therefore never matched a stored result.
            return str((
                region,
                genome,
                motif_digest,
                nreport,
                scan_rc,
                self.threshold_str,
            ))

        # determine which regions are not in the cache
        scan_regions = regions
        if self.use_cache:
            scan_regions = []
            for region in regions:
                ret = self.cache.get(cache_key(region))
                if ret == NO_VALUE:
                    scan_regions.append(region)

        # scan the regions that are not in the cache
        if len(scan_regions) > 0:

            g = Genome(genome)

            motifs = [(m, self.threshold[m.id])
                      for m in read_motifs(self.motifs)]
            scan_func = partial(
                scan_region_mult,
                genome=g,
                motifs=motifs,
                nreport=nreport,
                scan_rc=scan_rc,
            )

            for region, ret in self._scan_jobs(scan_func, scan_regions):
                # return values or store values in cache
                if self.use_cache:
                    # store values in cache
                    self.cache.set(cache_key(region), ret)
                else:
                    # return values
                    yield ret

        if self.use_cache:
            # return results from cache
            for region in regions:
                ret = self.cache.get(cache_key(region))
                if ret == NO_VALUE or ret is None:
                    raise Exception("cache is not big enough to hold all "
                                    "results, try increasing the cache size "
                                    "or disable cache")
                yield ret
Example #15
0
def as_fasta(seqs, genome=None):
    """Return `seqs` as a Fasta object.

    Fasta objects are passed through and FASTA file names are loaded. Any
    other input is converted with ``Genome.track2fasta()`` and therefore
    needs `genome` (name or Genome object); a ValueError is raised when it
    is missing.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)
    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    # Referenced until the function returns, so the file outlives Fasta().
    tmpfa = NamedTemporaryFile()
    genome_obj = Genome(genome) if isinstance(genome, str) else genome
    # numpy arrays are handed over as plain lists
    to_convert = list(seqs) if isinstance(seqs, np.ndarray) else seqs

    genome_obj.track2fasta(to_convert, tmpfa.name)
    return Fasta(tmpfa.name)
Example #16
0
def ensembl_genome_info(genome_name: str) -> Tuple[str, str, str]:
    """Return Ensembl genome information for a local genome managed by genomepy.

    Parameters
    ----------
    genome_name : str
        Name of local genome.

    Returns
    -------
    (str, str, str)
        Ensembl name, accession, taxonomy_id. None is returned if the genome
        is not installed locally, if the Ensembl search gives no result, or
        if the accession of the first result does not match the local genome.
    """
    # Fast lookup for some common queries
    common_names = {
        "danRer11": "GRCz11",
        "hg38": "GRCh38",
        "mm10": "GRCm38",
        "dm6": "BDGP6.28",
    }
    genome = None
    if genome_name in common_names:
        search_term = common_names[genome_name]
    else:
        try:
            genome = Genome(genome_name)
            search_term = genome.tax_id
        except FileNotFoundError:
            logger.info(f"Genome {genome_name} not installed locally")
            p = ProviderBase.create("Ensembl")
            for name, *_rest in p.search(genome_name):
                if name == genome_name:
                    logger.info(
                        f"It can be downloaded from Ensembl: genomepy install {name} Ensembl --annotation"
                    )
                    break
            return None

    # search Ensembl by taxonomy_id or by specific Ensembl name (if we know it)
    p = ProviderBase.create("Ensembl")
    results = list(p.search(search_term))
    if not results:
        # Previously an empty search result ended in an IndexError.
        print("Could not find a matching genome in Ensembl")
        return None
    name, accession, _species, tax_id, *_ = results[0]

    # Check if the assembly_id of the current Ensembl genome is the same as the
    # local genome. If it is identical, we can correctly assume that the genomes
    # sequences are identical.
    # For the genomes in the lookup table, we already know they match.
    if genome_name in common_names or accession == genome.assembly_accession:
        return name, accession, tax_id

    print("Could not find a matching genome in Ensembl")
    return None
Example #17
0
    def set_genome(self, genome):
        """
        Store the genome that is used for:
            - converting regions to sequences
            - background for MOODS

        A falsy `genome` leaves the current setting untouched.
        """
        if genome:
            # Constructing Genome raises if the genome cannot be loaded, in
            # which case the attribute is not updated.
            Genome(genome)
            self.genome = genome
Example #18
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions to same size; split in test and validation set;
    converted to FASTA.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. Reads "width", "abs_max", "fraction",
        "genome", "lwidth" and "use_strand".

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): unused result; kept because constructing MotifConfig may
    # have side effects -- confirm before removing.
    config = MotifConfig()

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Replace the extension only. A plain str.replace(".bed", ".fa")
        # would also alter a ".bed" occurring in the directory part.
        base, _ext = os.path.splitext(infile)
        genome.track2fasta(infile, base + ".fa")

    # Create file for location plots
    lwidth = int(params["lwidth"])
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
    def __init__(self, matchfile, genome="hg19", number=None, size=None):
        """Create a Fasta with genomic sequences GC%-matched to `matchfile`.

        `matchfile`, `genome`, `number` and `size` are passed on to
        ``matched_gc_bedfile()``; the resulting regions are converted to
        FASTA with ``Genome.track2fasta()`` and loaded into this object.
        """
        # Create temporary files. delete=False reserves the names on disk;
        # taking the name of an already deleted temporary file would allow
        # another process to claim the same path in the meantime.
        with NamedTemporaryFile(dir=mytmpdir(), delete=False) as handle:
            tmpbed = handle.name
        with NamedTemporaryFile(dir=mytmpdir(), delete=False) as handle:
            tmpfasta = handle.name

        try:
            # Create bed-file with coordinates of random sequences
            matched_gc_bedfile(tmpbed, matchfile, genome, number, size=size)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed, fastafile=tmpfasta)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, also when one of the steps failed
            for tmpname in (tmpbed, tmpfasta):
                if os.path.exists(tmpname):
                    os.remove(tmpname)
Example #20
0
def check_genome(genome):
    """Report whether `genome` can be loaded as a genome.

    Parameters
    ----------
    genome : str
        Genome name or file to check.

    Returns
    -------
    is_genome : bool
        True if ``Genome(genome)`` succeeds, False if it raises any
        exception.
    """
    try:
        Genome(genome)
    except Exception:
        # Any failure to load means "not a usable genome".
        return False
    return True
Example #21
0
def create_gc_bin_index(genome, fname, min_bin_size=100):
    """Write a feather index with the GC content of genomic windows.

    The genome is cut in windows of `min_bin_size`. Per window the columns
    "w<size>" and "n<size>" are stored, together with rolling aggregates
    (mean resp. sum) over 2 and 5 consecutive windows.

    Parameters
    ----------
    genome : str
        Genome name.
    fname : str
        Name of the index file.
    min_bin_size : int
        Minimum bin size (default 100). Warning: setting to a small value
        will result in a very large index file!
    """
    logger.info("Creating index for genomic GC frequencies.")
    g = Genome(genome)
    fasta = g.filename
    sizes = g.props["sizes"]["sizes"]

    with NamedTemporaryFile() as tmp:
        # pylint: disable=unexpected-keyword-arg
        windows = pybedtools.BedTool().window_maker(g=sizes, w=min_bin_size)
        windows.nucleotide_content(fi=fasta).saveas(tmp.name)
        df = pd.read_csv(tmp.name, sep="\t", usecols=[0, 1, 2, 4, 9])

    cols = ["chrom", "start", "end", f"w{min_bin_size}", f"n{min_bin_size}"]
    # Positional access: the frame still carries the column names from the
    # file until the rename below.
    for t in (2, 5):
        span = min_bin_size * t
        w_name = f"w{span}"
        n_name = f"n{span}"
        df[w_name] = df.iloc[:, 3].rolling(t, min_periods=t).mean()
        df[n_name] = df.iloc[:, 4].rolling(t, min_periods=t).sum()
        cols.extend([w_name, n_name])

    df.columns = cols
    df.reset_index()[cols].to_feather(fname)
    def __init__(self, genome, size=None, n=None):
        """Create a Fasta with `n` random genomic sequences of length `size`.

        The regions come from ``create_random_genomic_bedfile()`` and are
        converted to (stranded) FASTA with ``Genome.track2fasta()``.
        """
        size = int(size)

        # Create temporary files. delete=False reserves the names on disk
        # instead of reusing the name of an already deleted file.
        with NamedTemporaryFile(dir=mytmpdir(), delete=False) as handle:
            tmpbed = handle.name
        with NamedTemporaryFile(dir=mytmpdir(), delete=False) as handle:
            tmpfasta = handle.name

        try:
            # Create bed-file with coordinates of random sequences
            create_random_genomic_bedfile(tmpbed, genome, size, n)

            # Convert track to fasta
            Genome(genome).track2fasta(tmpbed,
                                       fastafile=tmpfasta,
                                       stranded=True)

            # Initialize super Fasta object
            Fasta.__init__(self, tmpfasta)
        finally:
            # Delete the temporary files, also when one of the steps failed
            for tmpname in (tmpbed, tmpfasta):
                if os.path.exists(tmpname):
                    os.remove(tmpname)
Example #23
0
    def __init__(self, name):
        """Load the dataset called `name`.

        Reads ``info.yaml`` from the directory returned by
        ``locate_data(name)``, loads the optional "source" dataset named in
        it, and logs installation instructions if the dataset's genome
        cannot be found.
        """
        self.name = str(name)
        self.data_dir = Path(locate_data(name))

        info_file = os.path.join(self.data_dir, "info.yaml")
        with open(info_file) as f:
            self.config = yaml.load(f, Loader=yaml.FullLoader)

        # A dataset may build on another one, referenced by name.
        self.source = None
        source = self.config.get("source", None)
        if source:
            self.source = ScepiaDataset(source)

        try:
            Genome(self.genome)
        except FileNotFoundError:
            # Missing genome is reported, not raised.
            for message in (
                    f"Genome {self.genome} is needed for this dataset.",
                    "Please install it with genomepy.",
                    f"Command-line: genomepy install {self.genome}",
                    f'Python: import genomepy; genomepy.install_genome("{self.genome}")',
            ):
                logger.error(message)
Example #24
0
def create_link_file(meanstd_file: str,
                     genes_file: str,
                     genome: Optional[str] = "hg38") -> pd.DataFrame:
    """Link genes to the enhancers that overlap their extended region.

    Enhancer locations are taken from the "index" column of `meanstd_file`
    (feather if the name ends with "feather", tab-separated otherwise).
    Genes from `genes_file` are extended by 100000 on both sides using the
    sizes file of `genome` and intersected with the enhancers.

    Returns
    -------
    pandas.DataFrame with columns "gene", "loc" (chrom:start-end of the
    enhancer) and "pos" (enhancer midpoint).
    """
    meanstd_file = str(meanstd_file)
    # Read enhancer locations
    if meanstd_file.endswith("feather"):
        table = pd.read_feather(meanstd_file)
    else:
        table = pd.read_csv(meanstd_file, sep="\t")
    locations = table["index"]
    enhancers = BedTool.from_dataframe(
        locations.str.split("[-:]", expand=True))

    # Calculating overlap with certain distance
    sizes_file = Genome(genome).sizes_file
    extended = BedTool(genes_file).slop(b=100000, g=sizes_file)
    genes = extended.cut([0, 1, 2, 3])
    hits = genes.intersect(b=enhancers, wo=True)
    # Keep the gene name and the enhancer coordinates.
    hits = hits.to_dataframe().iloc[:, 3:7]
    hits.columns = ["gene", "chrom", "start", "end"]

    start_txt = hits["start"].astype(str)
    end_txt = hits["end"].astype(str)
    hits["loc"] = (hits["chrom"] + ":" + start_txt + "-" + end_txt)
    hits["pos"] = ((hits["start"] + hits["end"]) / 2).astype(int)
    return hits[["gene", "loc", "pos"]]
Example #25
0
def is_genome_installed(ref_genome):
    """
    Celloracle motif_analysis module uses gimmemotifs and genomepy internally.
    Reference genome files should be installed in the PC to use gimmemotifs and genomepy.
    This function checks the installation status of the reference genome.

    Args:
        ref_genome (str): names of reference genome. i.e., "mm10", "hg19"

    Returns:
        bool: True if ``Genome(ref_genome)`` can be loaded. Otherwise
            installation instructions are printed and False is returned.

    """
    try:
        Genome(ref_genome)
    except Exception:
        # "except Exception" rather than a bare "except": a bare clause
        # would also swallow KeyboardInterrupt and SystemExit.
        print(f"genome {ref_genome} is not installed in this environment.")
        print("Please install genome using genomepy.")
        print('e.g.\n    >>> import genomepy\n    >>> genomepy.install_genome("mm9", "UCSC")')
        return False

    return True
Example #26
0
def check_denovo_input(inputfile, params):
    """
    Check if an input file is valid, which means BED, narrowPeak or FASTA

    Parameters
    ----------
    inputfile : str
        Input file name.
    params : dict
        Parameters; "background" is always read, "genome" only for BED or
        narrowPeak input.

    Returns
    -------
    (str, list)
        The input type as reported by ``determine_file_type()`` and the
        requested backgrounds that are valid for that type. Exits with
        status 1 if the format is not recognized or no valid background
        remains.
    """
    background = params["background"]

    input_type = determine_file_type(inputfile)

    if input_type == "fasta":
        valid_bg = FA_VALID_BGS
    elif input_type in ["bed", "narrowpeak"]:
        genome = params["genome"]
        valid_bg = BED_VALID_BGS
        if "genomic" in background or "gc" in background:
            # Raises if the genome cannot be loaded.
            Genome(genome)
        # is it a valid bed-file etc.
        check_bed_file(
            inputfile)  # bed-specific, will also work for narrowPeak
    else:
        sys.stderr.write(
            "Format of inputfile {} not recognized.\n".format(inputfile))
        sys.stderr.write("Input should be FASTA, BED or narrowPeak.\n")
        sys.stderr.write(
            "See https://genome.ucsc.edu/FAQ/FAQformat.html for specifications.\n"
        )
        sys.exit(1)

    for bg in background:
        if bg not in valid_bg:
            logger.info("Input type is %s, ignoring background type '%s'",
                        input_type, bg)
    # Filter once after reporting; the list used to be rebuilt (and the name
    # rebound) on every iteration of the loop above.
    background = [bg for bg in background if bg in valid_bg]

    if not background:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, background
Example #27
0
    def test2_as_fasta(self):
        """ convert bed, regions, etc to Fasta """
        tmpdir = mkdtemp()
        # try/finally: the directory is removed even if an assertion fails.
        try:
            g = Genome("genome", genome_dir=self.genome_dir)

            fafile = os.path.join(self.datadir, "test.fa")
            fa = Fasta(fafile)
            bedfile = os.path.join(self.datadir, "test.bed")
            regionfile = os.path.join(self.datadir, "test.txt")
            with open(regionfile) as f:
                regions = [line.strip() for line in f]

            # Inputs that are already FASTA need no genome
            self.assertIsInstance(as_fasta(fa), Fasta)
            self.assertIsInstance(as_fasta(fafile), Fasta)

            # BED file, region file and list of regions need a genome
            self.assertIsInstance(as_fasta(bedfile, g), Fasta)
            self.assertIsInstance(as_fasta(regionfile, g), Fasta)
            self.assertIsInstance(as_fasta(regions, g), Fasta)

            with self.assertRaises(ValueError):
                as_fasta(bedfile)
        finally:
            rmtree(tmpdir)
Example #28
0
def background(args):
    """Create a background file as specified by the command-line arguments.

    Reads `inputfile`, `outputfile`, `bg_type`, `outformat`, `length`,
    `genome`, `number` and (for random backgrounds) `markov_order` from
    `args`. Invalid argument combinations are reported and the program
    exits with status 1.
    """
    inputfile = args.inputfile
    out = args.outputfile
    bg_type = args.bg_type
    outformat = args.outformat.lower()
    length = args.length

    # "fa" is accepted as FASTA further down; normalize it here so that the
    # checks below treat both spellings the same way.
    if outformat == "fa":
        outformat = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" % (",".join(BG_TYPES)))
        sys.exit(1)

    if outformat == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    if bg_type == "gc" and not inputfile:
        print("need a FASTA formatted input file for background gc")
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and outformat == "fasta":
        Genome(args.genome)

    # Gene definition: only promoter backgrounds need it, so other
    # background types (e.g. "random") do not require a genome here.
    gene_file = None
    if bg_type == "promoter":
        fname = Genome(args.genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            # Fall back to the gene directory from the configuration. The
            # old test ("if not gene_file") could never be true.
            gene_file = os.path.join(config.get_gene_dir(), "{}.bed".format(args.genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(args.genome))
            print("Did you use the --annotation flag for genomepy?")
            print("Alternatively make sure there is a file called {}.bed in {}".format(args.genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    number = None
    if args.number:
        number = args.number
    elif inputfile:
        number = number_of_seqs_in_file(inputfile)
    else:
        sys.stderr.write("please provide either a number or an inputfile\n")
        sys.exit(1)

    as_fasta_output = outformat == "fasta"
    if bg_type == "random":
        f = Fasta(inputfile)
        m = bg.MarkovFasta(f, n=number, k=args.markov_order)
        m.writefasta(out)
    elif bg_type == "gc":
        if as_fasta_output:
            m = bg.MatchedGcFasta(inputfile, args.genome, number=number)
            m.writefasta(out)
        else:
            bg.matched_gc_bedfile(out, inputfile, args.genome, number)
    elif bg_type == "promoter":
        if as_fasta_output:
            m = bg.PromoterFasta(gene_file, args.genome, length=length, n=number)
            m.writefasta(out)
        else:
            bg.create_promoter_bedfile(out, gene_file, length, number)
    elif bg_type == "genomic":
        if as_fasta_output:
            m = bg.RandomGenomicFasta(args.genome, length, number)
            m.writefasta(out)
        else:
            bg.create_random_genomic_bedfile(out, args.genome, length, number)
def matched_gc_bedfile(bedfile,
                       matchfile,
                       genome,
                       number,
                       size=None,
                       min_bin_size=100):
    """Create a BED file with GC% matched to input file.

    The GC fraction of every input sequence is assigned to a GC bin, the
    requested `number` is divided over the bins proportionally, and for each
    bin that many regions are sampled from the output of
    ``gc_bin_bedfile()``.

    Parameters
    ----------
    bedfile : str
        Name of the output BED file.
    matchfile : str
        Name of input file (BED or FASTA format)
    genome : str
        Genome name.
    number : int
        Number of sequences to retrieve.
    size : int, optional
        Size of the generated sequenced. If not provided, the input size is used.
    min_bin_size : int, optional
        Passed on to ``gc_bin_bedfile()``.

    Raises
    ------
    ValueError
        If sequences are requested but none of the input sequences falls in
        any GC bin.
    """
    import os  # local import: only needed for temporary file cleanup

    g = Genome(genome)
    genome_fa = g.filename
    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / len(seq)
              for seq in fa.seqs]
        sizes = [len(seq) for seq in fa.seqs]
    except Exception:
        try:
            # pylint: disable=unexpected-keyword-arg
            fields = pd.read_csv(matchfile, comment="#", nrows=10,
                                 sep="\t").shape[1]
            filtered = (pybedtools.BedTool(matchfile).filter(
                lambda x: len(x) >= 10).saveas().fn)
            bed = pybedtools.BedTool(filtered)
            gc = np.array([
                float(x[fields + 1])
                for x in bed.nucleotide_content(fi=genome_fa)
            ])
            sizes = np.array([x.length for x in bed])
            gc = [round(x, 2) for x in gc]
        except Exception:
            sys.stderr.write(
                "Please provide input file in BED or FASTA format\n")
            raise

    # Get the median size of the sequences
    if size is None or size == 0:
        size = int(np.median(sizes))
        if np.std(sizes) > size * 0.05:
            sys.stderr.write("Sequences do not seem to be of equal size.\n")
            sys.stderr.write(("GC% matched sequences of the median size ({}) "
                              "will be created\n").format(size))

    bins = [(0.0, 0.2), (0.8, 1)]
    for b in np.arange(0.2, 0.799, 0.05):
        bins.append((b, b + 0.05))

    fraction = number / len(gc)
    gc = np.array(gc)

    # Number of input sequences per bin, and that number scaled (rounded
    # down) to the requested total.
    observed = [
        np.sum((gc > round(b_start, 2)) & (gc <= round(b_end, 2)))
        for b_start, b_end in bins
    ]
    bin_count = [int(count * fraction) for count in observed]

    # To make the requested number, divide remaining over
    # all bins that have counts
    rest = number - sum(bin_count)
    if rest > 0:
        eligible = [i for i, count in enumerate(bin_count) if count > 0]
        if not eligible:
            # Every scaled count was rounded down to zero (fewer sequences
            # requested than needed to fill a bin). Fall back to the bins
            # that hold input sequences; this used to loop forever.
            eligible = [i for i, count in enumerate(observed) if count > 0]
        if not eligible:
            raise ValueError(
                "None of the input sequences falls in a GC bin, "
                "cannot create GC% matched sequences")
        # Round-robin over the eligible bins, in bin order.
        for k in range(rest):
            bin_count[eligible[k % len(eligible)]] += 1

    nseqs = max(bin_count) * len(bins)

    # Reserve a temporary file name, let gc_bin_bedfile() fill the file and
    # remove it again once it has been read.
    with NamedTemporaryFile(delete=False) as tmp:
        tmp_name = tmp.name
    try:
        gc_bin_bedfile(
            tmp_name,
            genome,
            nseqs,
            length=size,
            bins=bins,
            random_state=None,
            min_bin_size=min_bin_size,
        )
        df = pd.read_csv(tmp_name,
                         sep="\t",
                         names=["chrom", "start", "end", "bin"])
    finally:
        if os.path.exists(tmp_name):
            os.remove(tmp_name)

    # Mode "w" creates or truncates the output file, including when no bin
    # contributes any region.
    with open(bedfile, "w") as f:
        for (b_start, b_end), n in zip(bins, bin_count):
            if n == 0:
                continue
            # Label format of the "bin" column in the gc_bin_bedfile output
            b = "{:.2f}-{:.2f}".format(b_start, b_end)
            df.loc[df["bin"] == b,
                   ["chrom", "start", "end"]].sample(n).to_csv(f,
                                                               sep="\t",
                                                               header=False,
                                                               index=False)
def create_background_file(outfile,
                           bg_type,
                           fmt="fasta",
                           size=None,
                           genome=None,
                           inputfile=None,
                           number=10000):
    """
    Create a background file for motif analysis.

    Invalid argument combinations are reported on stdout, after which the
    program is terminated with exit status 1.

    Parameters
    ----------
    outfile : str
        Name of the output file.
    bg_type : str
        Type of background (gc, genomic, random or promoter).
    fmt : str, optional
        Either 'fasta' or 'bed'. 'fa' and 'fsa' are accepted as aliases
        for 'fasta'. The random background only supports 'fasta'.
    size : int, optional
        Size of the generated sequences, is determined from the inputfile
        (median sequence length) if not given.
    genome : str, optional
        Genome name. Required when a FASTA file is created for the gc,
        genomic or promoter background.
    inputfile : str, optional
        Input file. Required for the gc and random background, and for the
        genomic and promoter background when `size` is not given.
    number : int, optional
        Number of background sequences. If None, the number of sequences
        in `inputfile` is used, or 10,000 when there is no input file.
    """
    fmt = fmt.lower()
    if fmt in ["fa", "fsa"]:
        fmt = "fasta"

    if bg_type not in BG_TYPES:
        print("The argument 'type' should be one of: %s" %
              (",".join(BG_TYPES)))
        sys.exit(1)

    if fmt == "bed" and bg_type == "random":
        print("Random background can only be generated in FASTA format!")
        sys.exit(1)

    # Both of these backgrounds are derived from the input sequences.
    if bg_type in ["gc", "random"] and not inputfile:
        print("need a FASTA formatted input file for background {}".format(
            bg_type))
        sys.exit(1)

    # GimmeMotifs configuration for file and directory locations
    config = MotifConfig()

    # Genome index location for creation of FASTA files
    if bg_type in ["gc", "genomic", "promoter"] and fmt == "fasta":
        if genome is None:
            print("Need a genome to create background file")
            sys.exit(1)
        Genome(genome)

    gene_file = None
    if bg_type == "promoter":
        # Gene definition: first look for an annotation file next to the
        # genome FASTA, then fall back to the configured gene directory.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(),
                                     "{}.bed".format(genome))

        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            sys.exit(1)

    # Number of sequences
    if number is None:
        if inputfile:
            number = number_of_seqs_in_file(inputfile)
            logger.info("Using %s of background sequences based on input file",
                        number)
        else:
            number = 10000
            logger.info(
                "Number of background sequences not specified, using 10,000 sequences"
            )

    if bg_type == "random":
        f = Fasta(inputfile)
        m = MarkovFasta(f, n=number, k=1)
        m.writefasta(outfile)
    elif bg_type == "gc":
        if fmt == "fasta":
            m = MatchedGcFasta(inputfile, genome, number=number, size=size)
            m.writefasta(outfile)
        else:
            matched_gc_bedfile(outfile, inputfile, genome, number, size=size)
    else:
        if size is None:
            if not inputfile:
                print("Need a size or an input file to determine the "
                      "sequence size")
                sys.exit(1)
            # np.median returns a float; the region size has to be a whole
            # number of bases.
            size = int(
                np.median([
                    len(seq)
                    for seq in as_fasta(inputfile, genome=genome).seqs
                ]))
        if bg_type == "promoter":
            if fmt == "fasta":
                m = PromoterFasta(gene_file, genome, size=size, n=number)
                m.writefasta(outfile)
            else:
                create_promoter_bedfile(outfile, gene_file, size, number)
        elif bg_type == "genomic":
            if fmt == "fasta":
                m = RandomGenomicFasta(genome, size, number)
                m.writefasta(outfile)
            else:
                create_random_genomic_bedfile(outfile, genome, size, number)
Example #31
0
def create_background(bg_type, fafile, outfile, genome="hg18", width=200, nr_times=10, custom_background=None):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type: 'random', 'genomic', 'gc', 'promoter'
        or 'custom'.

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name. Required for the 'genomic', 'gc' and 'promoter'
        background types.

    width : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of an existing FASTA file that is used as the background when
        bg_type is 'custom'.

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If bg_type is not recognised, or if no gene file can be found for
        the 'promoter' background.
    IOError
        If bg_type is 'custom' and the custom background file is not
        specified or does not exist.
    """
    width = int(width)
    config = MotifConfig()
    fg = Fasta(fafile)
    nr_seqs = nr_times * len(fg)

    # All of these background types read sequences from the genome.
    if bg_type in ["genomic", "gc", "promoter"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_seqs)
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, width, nr_seqs)
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_seqs)
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        # First look for an annotation file next to the genome FASTA,
        # then fall back to the configured gene directory.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print("Alternatively make sure there is a file called {}.bed in {}".format(genome, config.get_gene_dir()))
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome))

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome, gene_file)
        f = PromoterFasta(gene_file, genome, width, nr_seqs)
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Interpolate explicitly: exceptions do not apply %-formatting
            # to their arguments the way logging calls do.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.",
                    bg_file, outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the background deviates more than 5% from the width.
        if median_length < (width * 0.95) or median_length > (width * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median length of %s, while GimmeMotifs predicts motifs in sequences "
                "of length %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same length.",
                bg_file, median_length, width)
    else:
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
Example #32
0
def matched_gc_bedfile(bedfile, matchfile, genome, number):
    """Write random genomic regions with a GC% distribution matching the input.

    The GC fraction of the input sequences is binned in 20 equal bins
    between 0 and 1. Random genomic regions of the median input length are
    generated and, for every bin, written to `bedfile` until the count for
    that bin is reached. A message is written to stderr for every bin for
    which not enough random regions were found.

    Parameters
    ----------
    bedfile : str
        Name of the output BED file.
    matchfile : str
        Input file in FASTA or BED format. FASTA is tried first.
    genome : str
        Genome name, passed to Genome().
    number : int
        Number of regions to generate. If None or 0, the per-bin counts of
        the input are used unscaled.
    """
    # Random regions are only kept if the value in nucleotide_content
    # column 12 is at most this fraction of the region length.
    # NOTE(review): column 12 is presumably the N count - confirm.
    N_FRACTION = 0.1

    g = Genome(genome)
    genome_fa = g.filename
    try:
        fa = Fasta(matchfile)
        gc = [(seq.upper().count("C") + seq.upper().count("G")) / len(seq) for seq in fa.seqs]
        lengths = [len(seq) for seq in fa.seqs]
    except Exception:
        # Not readable as FASTA, try BED instead
        try:
            # pylint: disable=unexpected-keyword-arg
            bed = pybedtools.BedTool(matchfile)
            gc = [float(x[4]) for x in bed.nucleotide_content(fi=genome_fa)]
            lengths = [x.length for x in bed]
        except Exception:
            sys.stderr.write("Please provide input file in BED or FASTA format\n")
            raise
    gc_hist, bins = np.histogram(gc, range=(0, 1), bins=20)

    # np.median returns a float; the region length has to be a whole
    # number of bases.
    length = int(np.median(lengths))
    if np.std(lengths) > length * 0.05:
        sys.stderr.write("Sequences do not seem to be of equal length.\n")
        sys.stderr.write("GC% matched sequences of the median length ({}) will be created\n".format(length))

    if number:
        # Scale the histogram to the requested total, then adjust the
        # rounded counts until they add up to exactly `number`.
        norm = number * gc_hist / (float(sum(gc_hist))) + 0.5
        inorm = norm.astype(int)

        s = np.argsort(norm - inorm)
        while sum(inorm) > number:
            if inorm[np.argmin(s)] > 0:
                inorm[np.argmin(s)] -= 1
            s[np.argmin(s)] = len(s)
        while sum(inorm) < number:
            inorm[np.argmax(s)] += 1
            s[np.argmax(s)] = 0
        gc_hist = inorm
        total = number
    else:
        # Without a requested number the unscaled input counts are used.
        total = int(sum(gc_hist))

    # Create a file with chromosome sizes if it doesn't exist yet
    genome_size = genome_fa + ".sizes"
    del_size = False
    try:
        if not os.path.exists(genome_size):
            # delete=False keeps the file on disk until it is explicitly
            # removed in the finally clause below.
            with NamedTemporaryFile(mode="w", delete=False) as f:
                genome_size = f.name
                del_size = True
                for seqname in g.keys():
                    f.write("{}\t{}\n".format(seqname, len(g[seqname])))

        # Oversample, as only part of the random regions ends up in a bin
        # that still needs sequences.
        # pylint: disable=unexpected-keyword-arg
        r = pybedtools.BedTool().random(
            l=length, n=total * 30, g=genome_size
        ).nucleotide_content(fi=genome_fa)

        # Keep chrom, start, end and the value of column 7, which is used
        # as the GC fraction of the region.
        features = [
            feature[:3] + [float(feature[7])]
            for feature in r
            if float(feature[12]) <= length * N_FRACTION
        ]
    finally:
        if del_size:
            os.unlink(genome_size)

    with open(bedfile, "w") as out:
        for bin_start, bin_end, count in zip(bins[:-1], bins[1:], gc_hist):
            if count <= 0:
                continue

            rcount = 0
            for feature in features:
                if bin_start <= feature[3] < bin_end:
                    out.write("{}\t{}\t{}\n".format(*feature[:3]))
                    rcount += 1
                    if rcount >= count:
                        break

            if count != rcount:
                sys.stderr.write("not enough random sequences found for {} <= GC < {} ({} instead of {})\n".format(bin_start, bin_end, rcount, count))
Example #33
0
def create_background(
    bg_type,
    fafile,
    outfile,
    genome="hg18",
    size=200,
    nr_times=10,
    custom_background=None,
):
    """Create background of a specific type.

    Parameters
    ----------
    bg_type : str
        Name of background type: 'random', 'genomic', 'gc', 'promoter'
        or 'custom'.

    fafile : str
        Name of input FASTA file.

    outfile : str
        Name of output FASTA file.

    genome : str, optional
        Genome name. Required for the 'genomic', 'gc' and 'promoter'
        background types.

    size : int, optional
        Size of regions.

    nr_times : int, optional
        Generate this times as many background sequences as compared to
        input file.

    custom_background : str, optional
        Name of an existing FASTA file that is used as the background when
        bg_type is 'custom'.

    Returns
    -------
    nr_seqs  : int
        Number of sequences created.

    Raises
    ------
    ValueError
        If bg_type is not recognised, or if no gene file can be found for
        the 'promoter' background.
    IOError
        If bg_type is 'custom' and the custom background file is not
        specified or does not exist.
    """
    size = int(size)
    config = MotifConfig()
    fg = Fasta(fafile)
    nr_seqs = nr_times * len(fg)

    # All of these background types read sequences from the genome.
    if bg_type in ["genomic", "gc", "promoter"]:
        if not genome:
            logger.error("Need a genome to create background")
            sys.exit(1)

    if bg_type == "random":
        f = MarkovFasta(fg, k=1, n=nr_seqs)
        logger.debug("Random background: %s", outfile)
    elif bg_type == "genomic":
        logger.debug("Creating genomic background")
        f = RandomGenomicFasta(genome, size, nr_seqs)
    elif bg_type == "gc":
        logger.debug("Creating GC matched background")
        f = MatchedGcFasta(fafile, genome, nr_seqs)
        logger.debug("GC matched background: %s", outfile)
    elif bg_type == "promoter":
        # First look for an annotation file next to the genome FASTA,
        # then fall back to the configured gene directory.
        fname = Genome(genome).filename
        gene_file = fname.replace(".fa", ".annotation.bed.gz")
        if not os.path.exists(gene_file):
            gene_file = os.path.join(config.get_gene_dir(), "%s.bed" % genome)
        if not os.path.exists(gene_file):
            print("Could not find a gene file for genome {}".format(genome))
            print("Did you use the --annotation flag for genomepy?")
            print(
                "Alternatively make sure there is a file called {}.bed in {}".
                format(genome, config.get_gene_dir()))
            raise ValueError(
                "Could not find a gene file for genome {}".format(genome))

        logger.info(
            "Creating random promoter background (%s, using genes in %s)",
            genome,
            gene_file,
        )
        f = PromoterFasta(gene_file, genome, size, nr_seqs)
        logger.debug("Random promoter background: %s", outfile)
    elif bg_type == "custom":
        bg_file = custom_background
        if not bg_file:
            raise IOError("Background file not specified!")

        if not os.path.exists(bg_file):
            # Interpolate explicitly: exceptions do not apply %-formatting
            # to their arguments the way logging calls do.
            raise IOError(
                "Custom background file %s does not exist!" % bg_file)

        logger.info("Copying custom background file %s to %s.", bg_file,
                    outfile)
        f = Fasta(bg_file)
        median_length = np.median([len(seq) for seq in f.seqs])
        # Warn when the background deviates more than 5% from the size.
        if median_length < (size * 0.95) or median_length > (size * 1.05):
            logger.warning(
                "The custom background file %s contains sequences with a "
                "median size of %s, while GimmeMotifs predicts motifs in sequences "
                "of size %s. This will influence the statistics! It is recommended "
                "to use background sequences of the same size.",
                bg_file,
                median_length,
                size,
            )
    else:
        raise ValueError("Unknown background type: {}".format(bg_type))

    f.writefasta(outfile)
    return len(f)
Example #34
0
def gene_annotation(genes: Iterable[str], genome: str) -> pd.DataFrame:
    """Retrieve genomic annotation of a set of genes.

    If the annotation is not present locally, then mygene.info is used.
    All genome assemblies that are present in the latest version of
    Ensembl are supported by mygene.info.

    Parameters
    ----------
    genes : Iterable
        List of gene names or gene identifiers such as ensembl_id.
    genome : str
        Genome name

    Returns
    -------
    pandas.DataFrame with gene annotation. When the annotation is retrieved
    from mygene.info it has the columns chrom, start, end, name and strand.
    None is returned if there is no local annotation and
    ensembl_genome_info() returns None for the genome.

    Raises
    ------
    SystemExit
        If mygene.info does not return a match for any of the genes.
    """

    # First try to find the genes in the local annotation installed by genomepy.
    gene_info = _local_gene_annotation(genes, genome)
    if gene_info is not None:
        return gene_info

    # Genes are not identified locally.
    # Retrieve the gene information using the mygene.info API
    logger.info("No local matching genes found for %s, trying mygene.info",
                genome)

    # mygene.info only queries the most recent version of the Ensembl database
    # We can only safely continue if the local genome matched the Ensembl genome.
    # Even if the local genome was installed via Ensembl, we still need to check
    # if it is the same version
    if ensembl_genome_info(genome) is None:
        return None

    # Run the actual query
    g = Genome(genome)
    logger.info("Querying mygene.info...")
    mg = mygene.MyGeneInfo()
    result = mg.querymany(genes,
                          scopes="symbol,name,ensemblgene,entrezgene",
                          fields="genomic_pos",
                          species=g.tax_id,
                          as_dataframe=True,
                          verbose=False)

    if "notfound" in result and result.shape[1] == 1:
        logger.error("No matching genes found")
        sys.exit(1)

    if g.provider == "Ensembl":
        result = result.rename(columns={"genomic_pos.chr": "chrom"})
    else:
        # Ensembl, UCSC and NCBI chromosome names can all be different :-/
        logger.info("Local genome is not an Ensembl genome.")
        mapping = load_mapping(g.name)
        result = result.join(mapping, on="genomic_pos.chr")

    # Rows without a chromosome or position (e.g. queries without a match
    # when other queries did match) cannot be converted to integer
    # coordinates, so they are removed first.
    result = result.dropna(
        subset=["chrom", "genomic_pos.start", "genomic_pos.end"])

    # Convert genomic positions from string to integer
    result["genomic_pos.start"] = result["genomic_pos.start"].astype(int)
    result["genomic_pos.end"] = result["genomic_pos.end"].astype(int)

    # For each gene use the match with the highest score
    result = result.reset_index().sort_values("_score").groupby("query").last()

    # Map the Ensembl 1/-1 strand to +/- strand
    strand_df = pd.DataFrame({
        "ens_strand": [-1, 1],
        "strand": ["-", "+"]
    }).set_index("ens_strand")
    result = result.join(strand_df, on="genomic_pos.strand")

    # Select the correct columns and name them
    result = result.reset_index()[[
        "chrom", "genomic_pos.start", "genomic_pos.end", "query", "strand"
    ]]
    # A flat list is required here; a nested list would turn the columns
    # into a MultiIndex.
    result.columns = ["chrom", "start", "end", "name", "strand"]

    return result