Example #1
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    All regions are resized to the same width, the result is split into a
    prediction and a validation set, and both sets are converted to FASTA.
    The validation regions are additionally converted to a FASTA file
    extended towards ``lwidth`` for the location plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. The keys read here are ``width``,
        ``abs_max``, ``fraction``, ``genome``, ``lwidth`` and
        ``use_strand``.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile,
        pred_bedfile,
        val_bedfile,
    )
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): `config` is never used below. The call is kept in case
    # constructing MotifConfig has side effects -- confirm, then remove.
    config = MotifConfig()  # noqa: F841

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Swap only the file extension. str.replace(".bed", ".fa") would
        # also rewrite a ".bed" that happens to occur inside `outdir`.
        fasta_file = os.path.splitext(infile)[0] + ".fa"
        genome.track2fasta(infile, fasta_file)

    # Create file for location plots
    lwidth = int(params["lwidth"])
    # Half of the extra width is added on each side of a region.
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
Example #2
0
def prepare_denovo_input_bed(inputfile, params, outdir):
    """Prepare a BED file for de novo motif prediction.

    The input regions are written with equal width, divided into a
    prediction set and a validation set, and each set is converted to
    FASTA. A FASTA file of the validation regions, extended towards
    ``lwidth``, is also produced for the location plots.

    Parameters
    ----------
    inputfile : str
        BED file with input regions.

    params : dict
        Dictionary with parameters. This function reads the keys
        ``width``, ``abs_max``, ``fraction``, ``genome``, ``lwidth`` and
        ``use_strand``.

    outdir : str
        Output directory to save files.
    """
    logger.info("preparing input (BED)")

    # Create BED file with regions of equal size
    width = int(params["width"])
    bedfile = os.path.join(outdir, "input.bed")
    write_equalwidth_bedfile(inputfile, width, bedfile)

    abs_max = int(params["abs_max"])
    fraction = float(params["fraction"])
    pred_bedfile = os.path.join(outdir, "prediction.bed")
    val_bedfile = os.path.join(outdir, "validation.bed")
    # Split input into prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        bedfile, pred_bedfile, val_bedfile)
    divide_file(bedfile, pred_bedfile, val_bedfile, fraction, abs_max)

    # NOTE(review): this object is not used afterwards. It is still
    # constructed because MotifConfig() may have side effects -- verify
    # before deleting the line.
    config = MotifConfig()  # noqa: F841

    genome = Genome(params["genome"])
    for infile in [pred_bedfile, val_bedfile]:
        # Derive the FASTA name from the extension only; replacing every
        # ".bed" substring would corrupt an output directory whose own
        # path contains ".bed".
        root, _ext = os.path.splitext(infile)
        genome.track2fasta(
            infile,
            root + ".fa",
        )

    # Create file for location plots
    lwidth = int(params["lwidth"])
    # The difference between both widths is spread evenly over both sides.
    extend = (lwidth - width) // 2

    genome.track2fasta(
        val_bedfile,
        os.path.join(outdir, "localization.fa"),
        extend_up=extend,
        extend_down=extend,
        stranded=params["use_strand"],
    )
Example #3
0
def as_fasta(seqs, genome=None):
    """Return `seqs` as FASTA, converting through a genome when needed.

    Parameters
    ----------
    seqs
        Input sequences or regions; get_seqs_type() decides how they are
        handled.

    genome : str or object with a track2fasta() method, optional
        A str is passed to Genome(); any other value is used as-is. Only
        needed when `seqs` is neither of type "fasta" nor "fastafile".

    Returns
    -------
    `seqs` itself when its type is "fasta", otherwise a Fasta instance.

    Raises
    ------
    ValueError
        If a conversion is required and `genome` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    # isinstance() also accepts str subclasses, unlike a type() comparison.
    if isinstance(genome, str):
        genome = Genome(genome)

    # The context manager closes and removes the temporary file
    # deterministically instead of leaving that to garbage collection.
    # NOTE(review): assumes Fasta() reads the file during construction;
    # the file is gone once this function returns -- confirm.
    with NamedTemporaryFile() as tmpfa:
        genome.track2fasta(seqs, tmpfa.name)
        return Fasta(tmpfa.name)
Example #4
0
def as_fasta(seqs, genome=None):
    """Convert `seqs` to FASTA unless it already is FASTA.

    Parameters
    ----------
    seqs
        Sequences or regions to convert; the type reported by
        get_seqs_type() selects the conversion.

    genome : str or object with a track2fasta() method, optional
        Used only when `seqs` is neither "fasta" nor "fastafile". A str is
        wrapped in Genome(); other values are used directly.

    Returns
    -------
    The unchanged `seqs` for type "fasta", otherwise a Fasta instance.

    Raises
    ------
    ValueError
        If `genome` is None while a conversion is required.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    # Prefer isinstance() over comparing type objects so that str
    # subclasses are treated as genome names too.
    if isinstance(genome, str):
        genome = Genome(genome)

    # Scope the temporary file with `with` so it is always closed and
    # deleted, also when track2fasta() or Fasta() raises.
    # NOTE(review): presumably Fasta() loads the file eagerly, since the
    # file no longer exists after returning -- verify.
    with NamedTemporaryFile() as tmpfa:
        genome.track2fasta(seqs, tmpfa.name)
        return Fasta(tmpfa.name)
Example #5
0
def _as_seqdict_genome_regions(regions, minsize=None):
    """
    Accepts list of regions where the genome is encoded in the region,
    using the genome@chrom:start-end format.

    The regions are grouped per genome, converted with track2fasta(),
    written to one temporary FASTA file and read back with as_seqdict().
    The result is returned in the order of `regions`, after passing
    through _check_minsize() together with `minsize`.

    `regions` is iterated twice, so it must be re-iterable (e.g. a list).
    Each region must contain exactly one "@", otherwise the unpacking of
    the split raises a ValueError.
    """
    # Group the regions by genome, keeping the order within each genome.
    genomic_regions = {}
    for full_region in regions:
        genome, region = full_region.split("@")
        if genome not in genomic_regions:
            # The instance is discarded on purpose; presumably this makes
            # an unknown genome fail before anything is written -- confirm.
            Genome(genome)
            genomic_regions[genome] = []
        genomic_regions[genome].append(region)

    # Write inside a context manager so the handle is flushed and closed
    # before the file is read back by name. delete=False keeps the file on
    # disk after closing.
    # NOTE(review): the temporary file is never removed; left as-is because
    # it is unclear whether as_seqdict() reads it lazily -- confirm.
    with NamedTemporaryFile(mode="w", delete=False) as tmpfa:
        for genome, g_regions in genomic_regions.items():
            g = Genome(genome)

            for seq in g.track2fasta(g_regions):
                # Put the genome back into the name so it matches the input.
                seq.name = f"{genome}@{seq.name}"
                print(repr(seq), file=tmpfa)

    # Open tempfile and restore original sequence order
    fa = as_seqdict(tmpfa.name)
    fa = {region: fa[region] for region in regions}
    return _check_minsize(fa, minsize)
Example #6
0
def _genomepy_convert(to_convert, genome, minsize=None):
    """
    Convert a variety of inputs using track2fasta().

    `genome` may be a Genome instance or any value accepted by Genome().
    The converted sequences are written to a temporary file, read back
    with as_seqdict() and returned via _check_minsize() with `minsize`.

    Raises ValueError when `genome` is None.
    """
    if genome is None:
        raise ValueError("input file is not a FASTA file, need a genome!")

    g = genome if isinstance(genome, Genome) else Genome(genome)

    # Keep every use of the temporary file inside the `with` block: it is
    # closed and deleted on exit, also when one of the calls raises,
    # rather than whenever the object happens to be garbage collected.
    with NamedTemporaryFile() as tmpfile:
        g.track2fasta(to_convert, tmpfile.name)

        fa = as_seqdict(tmpfile.name)
        return _check_minsize(fa, minsize)
Example #7
0
def as_fasta(seqs, genome=None):
    """Return `seqs` as FASTA, converting through a genome when needed.

    Parameters
    ----------
    seqs
        Input sequences or regions; get_seqs_type() decides how they are
        handled. A numpy array is turned into a list before conversion.

    genome : str or object with a track2fasta() method, optional
        A str is passed to Genome(); any other value is used as-is. Only
        needed when `seqs` is neither of type "fasta" nor "fastafile".

    Returns
    -------
    `seqs` itself when its type is "fasta", otherwise a Fasta instance.

    Raises
    ------
    ValueError
        If a conversion is required and `genome` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    if isinstance(genome, str):
        genome = Genome(genome)

    if isinstance(seqs, np.ndarray):
        seqs = list(seqs)

    # The context manager closes and removes the temporary file
    # deterministically instead of leaving that to garbage collection.
    # NOTE(review): assumes Fasta() reads the file during construction;
    # the file is gone once this function returns -- confirm.
    with NamedTemporaryFile() as tmpfa:
        genome.track2fasta(seqs, tmpfa.name)
        return Fasta(tmpfa.name)