Example #1
0
def scan_it_moods(infile,
                  motifs,
                  cutoff,
                  bgfile,
                  nreport=1,
                  scan_rc=True,
                  pvalue=None,
                  count=False):
    """Scan the sequences of a FASTA file with motifs using MOODS.

    This is a generator: the work is distributed over a multiprocessing
    pool in chunks of sequences and the per-chunk results are yielded
    item by item, in submission order.

    Parameters
    ----------
    infile : str
        FASTA file with the sequences to scan.
    motifs : iterable
        Motif objects; ``motif.id`` and ``motif.pwm`` are read here.
    cutoff : float or str
        Passed (as float) to ``calc_threshold_moods`` for every matrix
        when ``pvalue`` is None.
    bgfile : str
        FASTA file used to estimate the background distribution.
    nreport : int, optional
        Forwarded unchanged to the worker function.
    scan_rc : bool, optional
        Forwarded unchanged to the worker function.
    pvalue : float or str, optional
        If given, thresholds are derived with
        ``MOODS.tools.threshold_from_p`` instead of from ``cutoff``.
    count : bool, optional
        Use ``scan_fa_with_motif_moods_count`` as worker instead of
        ``scan_fa_with_motif_moods``.

    Yields
    ------
    The individual items returned by the worker function for each chunk.
    """
    import shutil

    pseudocount = 1e-3
    bg = MOODS.tools.bg_from_sequence_dna("".join(Fasta(bgfile).seqs), 1)

    # The MOODS parser reads matrices from disk, so each motif is written
    # to a temporary PFM file first. The parsed matrices are kept in
    # memory, so the directory can be removed as soon as parsing is done.
    tmpdir = mkdtemp()
    matrices = []
    try:
        for motif in motifs:
            pfmname = os.path.join(tmpdir, "{}.pfm".format(motif.id))
            with open(pfmname, "w") as f:
                for row in np.array(motif.pwm).transpose():
                    f.write("{}\n".format(" ".join([str(x) for x in row])))

            matrices.append(
                MOODS.parsers.pfm_log_odds(pfmname, bg, pseudocount))
    finally:
        shutil.rmtree(tmpdir, ignore_errors=True)

    if pvalue is not None:
        thresholds = [
            MOODS.tools.threshold_from_p(m, bg, float(pvalue))
            for m in matrices
        ]
    else:
        thresholds = [calc_threshold_moods(m, float(cutoff)) for m in matrices]

    # NOTE(review): this scanner is configured but not used below; the
    # workers receive matrices, bg and thresholds directly.
    scanner = MOODS.scan.Scanner(7)
    scanner.set_motifs(matrices, bg, thresholds)

    config = MotifConfig()
    ncpus = int(config.get_default_params()["ncpus"])
    fa = Fasta(infile)
    nseqs = len(fa)

    # Use smaller chunks when there are too few sequences to keep every
    # CPU busy. Integer division keeps the value usable as a range() step
    # and the lower bound of 1 avoids a zero step for tiny inputs.
    chunk = 500
    if (nseqs // chunk) < ncpus:
        chunk = max(1, nseqs // (ncpus + 1))

    func = scan_fa_with_motif_moods
    if count:
        func = scan_fa_with_motif_moods_count

    pool = mp.Pool()
    try:
        jobs = [
            pool.apply_async(
                func,
                (fa[i:i + chunk], motifs, matrices, bg, thresholds, nreport,
                 scan_rc),
            ) for i in range(0, nseqs, chunk)
        ]

        for job in jobs:
            for ret in job.get():
                yield ret
    finally:
        # All results have been collected on the normal path; on an error
        # or an abandoned generator the outstanding work is discarded.
        pool.terminate()
        pool.join()
Example #2
0
    def __init__(self,
                 outfile,
                 fg_file=None,
                 background=None,
                 do_counter=True,
                 job_server=None):
        """Set up result bookkeeping and create an empty output file.

        A pool with two workers is created when no ``job_server`` is
        passed. Statistics are enabled only when both ``fg_file`` and
        ``background`` (a mapping of background name to FASTA file name)
        are given; in that case the files are loaded as Fasta objects.
        """
        self.lock = thread.allocate_lock()

        # Result bookkeeping
        self.motifs = []
        self.finished = []
        self.stats = {}
        self.stat_jobs = []
        self.counter = 0
        self.do_counter = do_counter

        self.outfile = outfile
        self.job_server = job_server if job_server else Pool(2)

        # Create the output file, or truncate an existing one.
        with open(outfile, "w"):
            pass

        if not (fg_file and background):
            self.do_stats = False
            return

        self.fg_fa = Fasta(fg_file)
        self.background = {
            name: Fasta(path) for name, path in background.items()
        }
        self.do_stats = True
Example #3
0
def as_fasta(seqs, index_dir=None):
    """Return ``seqs`` as a Fasta object, converting it if necessary.

    The input type is determined with ``get_seqs_type``. Fasta objects are
    returned unchanged and FASTA files are loaded directly. BED files,
    region files and lists of regions are converted with ``track2fasta``
    using ``index_dir``.

    Parameters
    ----------
    seqs : Fasta, str or list
        Fasta object, file name, or list of ``chrom:start-end`` regions.
    index_dir : str, optional
        Genome index passed to ``track2fasta``; required for anything that
        is not already FASTA.

    Raises
    ------
    ValueError
        If a conversion is needed and ``index_dir`` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if index_dir is None:
        raise ValueError("need index_dir / genome to convert to FASTA")

    # Only the name is used; the file must stay alive until it is read.
    tmpfa = NamedTemporaryFile()

    if ftype == "bedfile":
        track2fasta(index_dir, seqs, tmpfa.name)
        return Fasta(tmpfa.name)

    if ftype == "regionfile":
        with open(seqs) as f:
            seqs = [line.strip() for line in f]

    # Text mode: the default binary mode rejects str on Python 3.
    tmpbed = NamedTemporaryFile(mode="w")
    for seq in seqs:
        vals = re.split(r'[:-]', seq)
        tmpbed.write("{}\t{}\t{}\n".format(*vals))
    tmpbed.flush()
    track2fasta(index_dir, tmpbed.name, tmpfa.name)
    return Fasta(tmpfa.name)
Example #4
0
	def calculate_enrichment(self, motif_file, fg, bg):
		"""Scan foreground and background sets, then compute motif enrichment.

		fg: [sample_fa, sample_gff]
		bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc]

		Every FASTA file is scanned with the motifs in motif_file and
		self.SCAN_THRESHOLD, either through the job server (when
		self.parallel is set) or serially. Afterwards gff_enrichment is
		called once per background set. The process exits with status 1
		when a parallel job reports an error.
		"""

		self.logger.info("Scanning background sequences with motifs")
		scan_cmd = scan_fasta_file_with_motifs

		# The foreground set is scanned exactly like every background set.
		# (The serial foreground scan used to reference a misspelled
		# attribute name and raised AttributeError.)
		scan_targets = [(fg[0], fg[1])] + [tuple(x[:2]) for x in bg]

		jobs = []
		for fasta_file, gff_file in scan_targets:
			if self.parallel:
				jobs.append(self.job_server().submit(scan_cmd, (fasta_file, motif_file, self.SCAN_THRESHOLD, gff_file,), (), ()))
			else:
				scan_cmd(fasta_file, motif_file, self.SCAN_THRESHOLD, gff_file)

		# Calling a job returns the result of the submitted function;
		# a truthy result is treated as an error.
		for job in jobs:
			error = job()
			if error:
				self.logger.error("Error in thread: %s" % error)
				sys.exit(1)

		self.logger.info("Calculating enrichment")
		enrichment_cmd = gff_enrichment
		num_sample = len(Fasta(fg[0]).items())
		for fasta_file, gff_file, out_file in bg:
			num_bg = len(Fasta(fasta_file).items())
			enrichment_cmd(fg[1], gff_file, num_sample, num_bg, out_file)
Example #5
0
 def setUp(self):
     """Load the test motif and set the paths of the pwmscan fixtures."""
     self.data_dir = "test/data/pwmscan"

     # Close the motif file once it has been parsed.
     with open(os.path.join(self.data_dir, "TATA.pwm")) as f:
         self.motif = read_motifs(f, fmt="pwm")[0]
     self.prom = Fasta(os.path.join(self.data_dir, "promoters.fa"))
     self.prom_gff = os.path.join(self.data_dir, "promoters_result.gff")
     self.random = Fasta(os.path.join(self.data_dir, "random_sequences.fa"))
     self.random_gff = os.path.join(self.data_dir, "random_result.gff")
     self.enrichment = os.path.join(self.data_dir, "enrichment.txt")
     # Only the name is kept; the temporary file object is discarded.
     self.tmp = NamedTemporaryFile().name
Example #6
0
def get_roc_values(motif, fg_file, bg_file):
    """Compute ROC curve values for a motif on a foreground/background set.

    Both FASTA files are scanned with ``motif.pwm_scan_score`` (cutoff 0.0,
    one match per sequence) and the best score per sequence is passed to
    ``ROC_values``.

    Returns
    -------
    tuple
        ``(None, x, y)`` on success, or ``(error, [], [])`` when any
        exception is raised; the exception is returned, not re-raised.
    """
    try:
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(scores)[-1] for scores in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(scores)[-1] for scores in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
        return None, x, y
    except Exception as e:
        # "except ... as" is valid on Python 2.6+ and Python 3.
        return e, [], []
Example #7
0
def as_fasta(seqs, genome=None):
    """Return ``seqs`` as a Fasta object, converting it if necessary.

    Fasta objects are returned unchanged and FASTA files are loaded
    directly. Any other input type reported by ``get_seqs_type`` is
    converted with ``genome.track2fasta``.

    Parameters
    ----------
    seqs : Fasta, str or list
        Sequences, or something that can be converted to sequences.
    genome : str or Genome, optional
        Genome name or Genome object; a string is wrapped in ``Genome``.

    Raises
    ------
    ValueError
        If a conversion is needed and ``genome`` is None.
    """
    ftype = get_seqs_type(seqs)
    if ftype == "fasta":
        return seqs
    if ftype == "fastafile":
        return Fasta(seqs)

    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    # Only the name is used; the file must stay alive until it is read.
    tmpfa = NamedTemporaryFile()
    # isinstance also accepts str subclasses, unlike a type() comparison.
    if isinstance(genome, str):
        genome = Genome(genome)
    genome.track2fasta(seqs, tmpfa.name)
    return Fasta(tmpfa.name)
Example #8
0
def location(args):
    """Create a positional histogram for each selected motif.

    Reads ``fastafile``, ``pwmfile``, ``width``, ``ids`` and ``cutoff``
    from ``args``. When no width is given, the length of the first
    sequence in the FASTA file is used. One ``motif_localization`` job per
    selected motif is submitted to the module-level ``pool`` and all jobs
    are awaited before returning.
    """
    fastafile = args.fastafile

    lwidth = args.width
    if not lwidth:
        lwidth = len(Fasta(fastafile).items()[0][1])

    motifs = pwmfile_to_motifs(args.pwmfile)
    if args.ids:
        ids = args.ids.split(",")
    else:
        ids = [motif.id for motif in motifs]

    selected = [motif for motif in motifs if motif.id in ids]
    jobs = [
        pool.apply_async(
            motif_localization,
            (fastafile, motif, lwidth,
             os.path.join("%s_histogram" % motif.id), args.cutoff))
        for motif in selected
    ]

    for job in jobs:
        job.get()
Example #9
0
def prepare_denovo_input_fa(inputfile, params, outdir):
    """Create all the FASTA files for de novo motif prediction and validation.

    The input is split into ``prediction.fa`` and ``validation.fa`` with
    ``divide_fa_file``; the validation set is then copied to
    ``localization.fa``. A warning is logged when the sequences in the
    localization file do not all have the same length.

    Parameters
    ----------
    inputfile : str
        Input FASTA file that is split.
    params : dict
        Must contain ``"fraction"`` (converted to float) and ``"abs_max"``
        (converted to int); both are passed on to ``divide_fa_file``.
    outdir : str
        Directory in which the three output files are created.
    """
    fraction = float(params["fraction"])
    abs_max = int(params["abs_max"])

    logger.info("preparing input (FASTA)")

    pred_fa = os.path.join(outdir, "prediction.fa")
    val_fa = os.path.join(outdir, "validation.fa")
    loc_fa = os.path.join(outdir, "localization.fa")

    # Split inputfile in prediction and validation set
    logger.debug(
        "Splitting %s into prediction set (%s) and validation set (%s)",
        inputfile,
        pred_fa,
        val_fa,
    )

    divide_fa_file(inputfile, pred_fa, val_fa, fraction, abs_max)

    # File for location plots
    shutil.copy(val_fa, loc_fa)
    seqs = Fasta(loc_fa).seqs

    # An empty validation set has no reference length to compare with.
    if len(seqs) == 0:
        return

    lsize = len(seqs[0])
    if any(len(seq) != lsize for seq in seqs):
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(
            "PLEASE NOTE: FASTA file contains sequences of different sizes. "
            "Positional preference plots might be incorrect!")
Example #10
0
def location(args):
    """
    Create a histogram of motif locations for each selected motif.

    Parameters
    ----------
    args : argparse object
        Command line arguments. The attributes read here are
        ``fastafile``, ``pwmfile``, ``width``, ``ids`` and ``cutoff``.
    """
    fasta_path = args.fastafile
    pwm_path = args.pwmfile

    # Without an explicit width, use the length of the first sequence.
    width = args.width
    if not width:
        width = len(Fasta(fasta_path).items()[0][1])

    all_motifs = pwmfile_to_motifs(pwm_path)
    wanted = args.ids.split(",") if args.ids else [m.id for m in all_motifs]

    pending = []
    for candidate in all_motifs:
        if candidate.id not in wanted:
            continue
        histogram = os.path.join("%s_histogram" % candidate.id)
        pending.append(
            pool.apply_async(
                motif_localization,
                (fasta_path, candidate, width, histogram, args.cutoff)))

    # Wait for all submitted jobs to finish.
    for result in pending:
        result.get()
Example #11
0
 def test_track2fasta_exons(self):
     """track2fasta should turn BED12 records into the expected FASTA."""
     from gimmemotifs.fasta import Fasta

     bed_path = os.path.join(self.fasta_dir, "genes.bed")
     expected_path = os.path.join(self.fasta_dir, "genes.out")

     # Build the index, then convert the BED file using strand information.
     self.g.create_index(self.fasta_dir, self.index_dir)
     track2fasta(self.index_dir, bed_path, self.temp_file, use_strand=True)

     expected = Fasta(expected_path)
     observed = Fasta(self.temp_file)
     for seq_id in observed.ids:
         # The expected file is keyed by the last word of the sequence id.
         key = seq_id.split(" ")[-1]
         self.assertEqual(len(observed[seq_id]), len(expected[key]))
         self.assertEqual(observed[seq_id].upper(), expected[key].upper())
Example #12
0
    def test1_scan_sequences(self):
        """Scanner: match counts per sequence for several scan settings."""
        # (threshold, nreport, scan_rc, expected number of matches)
        cases = [
            (0.0, 1, False, [1, 1, 1]),
            (0.99, 1, False, [0, 1, 1]),
            (0.99, 10, False, [0, 1, 2]),
            (0.99, 10, True, [0, 2, 4]),
        ]

        for ncpus in [1, 2, 3]:
            scanner = Scanner(ncpus=ncpus)
            scanner.set_motifs(self.motifs)

            fasta = Fasta(self.fa)

            for threshold, nreport, scan_rc, expected in cases:
                scanner.set_threshold(threshold=threshold)
                found = scanner._scan_sequences(fasta.seqs, nreport, scan_rc)
                nmatches = [len(m[0]) for m in found]
                self.assertEqual(expected, nmatches)
Example #13
0
def check_denovo_input(inputfile, params):
    """Determine the input type and the usable background types.

    The input counts as FASTA when ``Fasta(inputfile)`` succeeds;
    otherwise it is treated as BED and validated with ``check_bed_file``.
    Background types that are not valid for the detected input type are
    logged and dropped.

    Parameters
    ----------
    inputfile : str
        Input file name.
    params : dict
        Must contain ``"genome"`` and ``"background"`` (an iterable of
        background type names).

    Returns
    -------
    tuple
        ``(input_type, background)``, where ``input_type`` is ``"FASTA"``
        or ``"BED"`` and ``background`` is the filtered list.

    The process exits with status 1 when no valid background remains.
    """
    genome = params["genome"]
    background = params["background"]

    # If we can load it as FASTA then it is FASTA, otherwise assume BED.
    try:
        Fasta(inputfile)
    except Exception:
        input_type = "BED"
    else:
        logger.debug("Inputfile is a FASTA file")
        input_type = "FASTA"

    if input_type == "FASTA":
        valid_bg = FA_VALID_BGS
    else:
        valid_bg = BED_VALID_BGS
        if "genomic" in background:
            # Instantiated for its side effect only; presumably this fails
            # when the genome is not available -- confirm against Genome.
            Genome(genome)
        # is it a valid bed-file etc.
        check_bed_file(inputfile)    # bed-specific

    for bg in background:
        if bg not in valid_bg:
            logger.info("Input type is %s, ignoring background type '%s'",
                        input_type, bg)

    # Filter once, after reporting, instead of once per loop iteration.
    background = [bg for bg in background if bg in valid_bg]

    if len(background) == 0:
        logger.error("No valid backgrounds specified!")
        sys.exit(1)

    return input_type, background
Example #14
0
def peak2fasta(peak_ids, ref_genome):
    '''
    Convert peak ids into a fasta object.

    Args:
        peak_ids (str or list of str): A single peak id, e.g.
            "chr5_0930303_9499409", or a list of peak ids, e.g.
            ["chr5_0930303_9499409", "chr11_123445555_123445577"]

        ref_genome (str): Reference genome name.   e.g. "mm9", "mm10", "hg19" etc

    Returns:
        gimmemotifs fasta object: one entry per peak, named
        "<name>_<start>_<end>" after the retrieved genome slice.

    '''
    genome_data = Genome(ref_genome)

    def peak2seq(peak_id):
        # Split the peak id into its chromosome and coordinates.
        chromosome_name, start, end = decompose_chrstr(peak_id)
        locus = (int(start), int(end))

        tmp = genome_data[chromosome_name][locus[0]:locus[1]]
        name = f"{tmp.name}_{tmp.start}_{tmp.end}"
        seq = tmp.seq
        return (name, seq)

    # isinstance also covers str subclasses (e.g. numpy.str_), which would
    # otherwise be iterated character by character.
    if isinstance(peak_ids, str):
        peak_ids = [peak_ids]

    fasta = Fasta()
    for peak_id in peak_ids:
        name, seq = peak2seq(peak_id)
        fasta.add(name, seq)

    return fasta
Example #15
0
def get_roc_values(motif, fg_file, bg_file):
	"""Compute ROC curve values for a motif on a foreground/background set.

	Both FASTA files are scanned with motif.pwm_scan_score (cutoff 0.0,
	one match per sequence) and the best score per sequence is passed to
	ROC_values. The imports are done inside the function, so an import
	failure is reported like any other error.

	Returns a tuple (error, x, y): error is None on success, otherwise the
	exception that was raised, in which case x and y are empty lists.
	"""
	error = None
	x = []
	y = []
	try:
		from gimmemotifs.fasta import Fasta
		from gimmemotifs.rocmetrics import ROC_values

		# The loop variable is deliberately not called x: on Python 2 a
		# list-comprehension variable leaks and would overwrite the result.
		fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
		fg_vals = [sorted(scores)[-1] for scores in fg_result.values()]

		bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
		bg_vals = [sorted(scores)[-1] for scores in bg_result.values()]

		(x, y) = ROC_values(fg_vals, bg_vals)
	except Exception as e:
		error = e
		x = []
		y = []
	# The results used to be computed and then dropped.
	return error, x, y
Example #16
0
	def _create_background(self, bg_type, bedfile, fafile, outfile, organism="hg18", width=200, nr_times=10):
		"""Create a background FASTA file of the requested type in outfile.

		bg_type selects the kind of background:
		  "random"          - MarkovFasta built from fafile with order self.markov_model
		  "genomic_matched" - MatchedGenomicFasta built from bedfile
		  "promoter"        - PromoterFasta with nr_times * len(fafile) sequences
		  "user"            - copy of self.params["user_background"]

		Returns the number of sequences in the created background. Any
		other bg_type falls through and returns None. The process exits
		with status 1 when the user-specified background file is missing.
		"""
		if bg_type == "random":
			if int(self.markov_model) >= 6:
				self.logger.warning("Are you sure about the Markov model? It seems too high!")
			else:
				# Fall back to a generic "<n>th" for orders without an
				# explicit name (e.g. 0) instead of raising KeyError.
				order = {"1":"1st","2":"2nd", "3":"3rd", "4":"4th", "5":"5th"}.get(str(self.markov_model), "%sth" % self.markov_model)
				self.logger.info("Creating random background (%s order Markov)" % order)

			f = Fasta(fafile)
			m = MarkovFasta(f, k=int(self.markov_model))
			m.writefasta(outfile)
			self.logger.debug("Random background: %s" % (outfile))
			# return the number of random sequences created
			return len(m)
		elif bg_type == "genomic_matched":
			gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
			index_dir = os.path.join(self.config.get_index_dir(), organism)
			self.logger.info("Creating matched genomic background (%s, using genes in %s)" % (organism, gene_file))

			f = MatchedGenomicFasta(bedfile, gene_file, index_dir, width, nr_times)
			f.writefasta(outfile)
			self.logger.debug("Matched genomic background: %s" % (outfile))
			return len(f)
		elif bg_type == "promoter":
			gene_file = os.path.join(self.config.get_gene_dir(), "%s.bed" % organism)
			index_dir = os.path.join(self.config.get_index_dir(), organism)

			self.logger.info("Creating random promoter background (%s, using genes in %s)" % (organism, gene_file))
			fg = Fasta(fafile)
			f = PromoterFasta(gene_file, index_dir, width, nr_times * len(fg))
			f.writefasta(outfile)
			self.logger.debug("Random promoter background: %s" % (outfile))
			return len(f)
		elif bg_type == "user":
			bg_file = self.params["user_background"]
			if not os.path.exists(bg_file):
				self.logger.error("User-specified background file %s does not exist!" % bg_file)
				sys.exit(1)

			self.logger.info("Copying user-specified background file %s to %s." % (bg_file, outfile))
			fa = Fasta(bg_file)
			# Warn when the median length deviates more than 5% from width.
			median_length = median([len(seq) for seq in fa.seqs])
			if median_length < width * 0.95 or median_length > width * 1.05:
				self.logger.warning("The user-specified background file %s contains sequences with a median length of %s, while GimmeMotifs predicts motifs in sequences of length %s. This will influence the statistics! It is recommended to use background sequences of the same length." % (bg_file, median_length, width))
			fa.writefasta(outfile)
			return len(fa)
Example #17
0
def as_fasta(seqs, genome=None):
    """Return ``seqs`` as a Fasta object, converting it if necessary.

    Fasta objects pass through unchanged and FASTA files are loaded
    directly. Everything else is written to a temporary FASTA file with
    ``genome.track2fasta``; a genome name is wrapped in ``Genome`` and a
    numpy array of regions is converted to a list first.

    Raises ValueError when a conversion is needed but no genome is given.
    """
    kind = get_seqs_type(seqs)
    if kind == "fasta":
        return seqs
    if kind == "fastafile":
        return Fasta(seqs)
    if genome is None:
        raise ValueError("need genome to convert to FASTA")

    fasta_tmp = NamedTemporaryFile()
    genome_obj = Genome(genome) if isinstance(genome, str) else genome
    regions = list(seqs) if isinstance(seqs, np.ndarray) else seqs

    genome_obj.track2fasta(regions, fasta_tmp.name)
    return Fasta(fasta_tmp.name)
Example #18
0
def remove_zero_seq(fasta_object):
    """
    Return a new Fasta object without the zero-length sequences.

    Entries are copied in their original order; the input is not modified.
    """
    kept = Fasta()
    for index, sequence in enumerate(fasta_object.seqs):
        if not sequence:
            continue
        kept.add(fasta_object.ids[index], sequence)
    return kept
Example #19
0
    def __init__(
        self,
        outfile,
        genome=None,
        fg_file=None,
        background=None,
        gc=False,
        do_counter=True,
        job_server=None,
    ):
        """Set up result bookkeeping and create an empty output file.

        A pool with two workers is created when no ``job_server`` is
        passed. Statistics are enabled only when both ``fg_file`` and
        ``background`` (a mapping of background name to FASTA file name)
        are given. With ``gc`` set, a genome is required as well.

        Raises
        ------
        ValueError
            If statistics are enabled with ``gc`` set and ``genome`` is None.
        """
        self.lock = thread.allocate_lock()

        # Result bookkeeping
        self.motifs = []
        self.finished = []
        self.stats = {}
        self.stat_jobs = []
        self.counter = 0
        self.do_counter = do_counter

        self.outfile = outfile
        self.genome = genome
        self.job_server = job_server if job_server else Pool(2)

        # Create the output file, or truncate an existing one.
        with open(outfile, "w"):
            pass

        if not (fg_file and background):
            self.do_stats = False
            return

        self.fg_fa = Fasta(fg_file)
        self.background = {
            name: Fasta(path) for name, path in background.items()
        }
        self.do_stats = True
        self.gc = gc
        self.zscore = gc
        if gc and genome is None:
            raise ValueError(
                "Need a genome when calculating GC% zscores for motif statistics"
            )
Example #20
0
def download_genome(genomebuild, genome_dir):
    """Download a genome from UCSC and unpack it into ``genome_dir``.

    Every URL template in ``UCSC_GENOME_URLS`` is tried in order until a
    download passes ``check_genome_file``. The archive is unpacked with
    ``tar``, ``unzip`` or ``gunzip`` depending on its extension. If
    exactly one ``.fa`` file results, it is split into one file per
    sequence. The process exits with status 1 when no download succeeds.

    Parameters
    ----------
    genomebuild : str
        Genome build name, substituted into each URL template.
    genome_dir : str
        Directory that receives the download and the unpacked files.
    """
    # download genome based on URL + genomebuild
    sys.stderr.write("Downloading {} genome\n".format(genomebuild))

    genome_fa = None
    downloaded = False
    for genome_url in UCSC_GENOME_URLS:
        remote = genome_url.format(genomebuild)
        genome_fa = os.path.join(genome_dir, os.path.split(remote)[-1])

        sys.stderr.write("Trying to download {}\n".format(remote))

        try:
            urlretrieve(remote, genome_fa)
            if check_genome_file(genome_fa):
                downloaded = True
                break
            os.unlink(genome_fa)
        except Exception as e:
            # Best effort: report and try the next URL. Unlike a bare
            # except, this lets KeyboardInterrupt/SystemExit propagate.
            sys.stderr.write("Download failed: {}\n".format(e))

    if not downloaded:
        sys.stderr.write("Failed to download genome\n")
        sys.exit(1)

    sys.stderr.write("Unpacking\n")
    archive = os.path.basename(genome_fa)
    # Argument lists avoid passing the file name through a shell. The
    # commands run inside genome_dir, so tar needs no -C option; the
    # archive itself is removed at the end of this function.
    if archive.endswith("tar.gz"):
        cmd = ["tar", "-xvzf", archive]
    elif archive.endswith(".zip"):
        cmd = ["unzip", archive]
    else:
        cmd = ["gunzip", archive]

    sp.call(cmd, cwd=genome_dir)

    fa_files = glob("{}/*.fa".format(genome_dir))
    if len(fa_files) == 1:
        # A single FASTA file: split it into one file per sequence.
        combined = fa_files[0]
        written = set()
        for name, seq in Fasta(combined).items():
            seq_fa = "{}/{}.fa".format(genome_dir, name)
            with open(seq_fa, "w") as out:
                out.write(">{}\n{}\n".format(name, seq))
            written.add(os.path.abspath(seq_fa))

        # Do not delete the combined file if a per-sequence file with the
        # same name has just replaced it.
        if os.path.abspath(combined) not in written:
            os.unlink(combined)

    archive_path = os.path.join(genome_dir, archive)
    if os.path.exists(archive_path):
        os.unlink(archive_path)
Example #21
0
    def set_background(self, fname=None, genome=None, length=200, nseq=10000):
        """Set the background to use for FPR and z-score calculations.

        Background can be specified either as a genome name or as the 
        name of a FASTA file. Without either, ``self.genome`` is used.
        
        Parameters
        ----------
        fname : str, optional
            Name of FASTA file to use as background.

        genome : str, optional
            Name of genome to use to retrieve random sequences.

        length : int, optional
            Length of genomic sequences to retrieve. The default
            is 200.

        nseq : int, optional
            Number of genomic sequences to retrieve.

        Raises
        ------
        ValueError
            If both ``genome`` and ``fname`` are given, or if neither is
            given and ``self.genome`` is not set.
        IOError
            If ``fname`` does not exist.
        """
        length = int(length)

        if genome and fname:
            raise ValueError("Need either genome or filename for background.")

        if fname:
            if not os.path.exists(fname):
                raise IOError(
                    "Background file {} does not exist!".format(fname))

            self.background = Fasta(fname)
            self.background_hash = file_checksum(fname)
            return

        if not genome:
            if self.genome:
                genome = self.genome
                logger.info(
                    "Using default background: genome {} with length {}".
                    format(genome, length))
            else:
                raise ValueError(
                    "Need either genome or filename for background.")

        logger.info("Using background: genome {} with length {}".format(
            genome, length))
        with Cache(CACHE_DIR) as cache:
            # The key is "<genome>\<length>". The backslash is escaped
            # explicitly; the resulting key is unchanged, so existing
            # cache entries stay valid.
            # NOTE(review): nseq is not part of the key, so a cached
            # background is reused regardless of nseq -- confirm intent.
            self.background_hash = "{}\\{}".format(genome, int(length))
            fa = cache.get(self.background_hash)
            if not fa:
                fa = RandomGenomicFasta(genome, length, nseq)
                cache.set(self.background_hash, fa)
        self.background = fa
Example #22
0
def get_scores(motif, fg_file, bg_file):
    """Compute ROC AUC, MNCP and maximum F-measure for a motif.

    Both FASTA files are scanned with ``motif.pwm_scan_score`` (cutoff 0.0,
    one match per sequence) and the best score per sequence is used.

    Returns
    -------
    tuple
        ``(error, auc, mncp, max_f, y)``. ``error`` is None on success and
        the raised exception otherwise; values that were not computed
        before a failure are None.
        NOTE(review): no return statement was visible in the original
        code; this tuple mirrors the variables initialised up front --
        confirm against the callers.
    """
    error = None
    auc = None
    mncp = None
    max_f = None
    y = None
    try:
        fg_result = motif.pwm_scan_score(Fasta(fg_file), cutoff=0.0, nreport=1)
        fg_vals = [sorted(scores)[-1] for scores in fg_result.values()]

        bg_result = motif.pwm_scan_score(Fasta(bg_file), cutoff=0.0, nreport=1)
        bg_vals = [sorted(scores)[-1] for scores in bg_result.values()]

        (x, y) = ROC_values(fg_vals, bg_vals)
        auc = ROC_AUC(fg_vals, bg_vals)
        mncp = MNCP(fg_vals, bg_vals)
        max_f, y = max_fmeasure(x, y)

    except Exception as e:
        # "except ... as" is valid on Python 2.6+ and Python 3.
        error = e

    return error, auc, mncp, max_f, y
Example #23
0
def scan_fasta_file_with_motifs(fastafile, motiffile, threshold, gfffile, scan_rc=True):
	"""Scan a FASTA file with every motif in motiffile, appending hits to gfffile.

	The imports are done inside the function, so an import failure is
	reported like any other error. Each motif is scanned with nreport=1
	and float(threshold) as cutoff.

	Returns None on success, or the exception that was raised. The
	exception is returned rather than re-raised so that a caller can
	test the result of a submitted job for an error.
	"""
	error = None
	try:
		from gimmemotifs.fasta import Fasta
		from gimmemotifs.motif import pwmfile_to_motifs
		motifs = pwmfile_to_motifs(motiffile)
		fa = Fasta(fastafile)
		for motif in motifs:
			motif.pwm_scan_to_gff(fa, gfffile, nreport=1, cutoff=float(threshold), scan_rc=scan_rc, append=True)
	except Exception as e:
		error = e
	# The captured error used to be dropped.
	return error
Example #24
0
def motif_localization(fastafile, motif, width, outfile, cutoff=0.9):
    """Plot a histogram of motif match positions and test their distribution.

    The FASTA file is scanned with ``motif.pwm_scan`` (at most 100 matches
    per sequence). All match positions are pooled, passed to ``ks_pvalue``
    and plotted relative to the sequence centre.

    Returns ``(motif.id, p)``, or ``(motif.id, 1.0)`` when there are no
    matches (no plot is made in that case).
    """
    NR_HIST_MATCHES = 100

    hits_per_seq = motif.pwm_scan(
        Fasta(fastafile), cutoff=cutoff, nreport=NR_HIST_MATCHES)
    if len(hits_per_seq) == 0:
        return motif.id, 1.0

    # Pool the positions of all sequences into one array.
    positions = np.array(
        [pos for hits in hits_per_seq.values() for pos in hits])
    pvalue = ks_pvalue(positions, width - len(motif))

    centred = positions - width / 2 + len(motif) / 2
    plot_histogram(
        centred,
        outfile,
        xrange=(-width / 2, width / 2),
        breaks=21,
        title="%s (p=%0.2e)" % (motif.id, pvalue),
        xlabel="Position",
    )
    return motif.id, pvalue
Example #25
0
def get_seqs_type(seqs):
    """
    automagically determine input type
    the following types are detected:
        - Fasta object   -> "fasta"
        - FASTA file     -> "fastafile"
        - list of regions -> "regions"
        - region file    -> "regionfile"
        - BED file       -> "bedfile"

    For files, only the first line that does not start with "#" is
    inspected. Raises ValueError for an empty list, a missing file or any
    input that cannot be classified.
    """

    region_p = re.compile(r'^(.+):(\d+)-(\d+)$')
    if isinstance(seqs, Fasta):
        return "fasta"

    if isinstance(seqs, list):
        if len(seqs) == 0:
            raise ValueError("empty list of sequences to scan")
        if region_p.search(seqs[0]):
            return "regions"
        raise ValueError("unknown region type")

    # type(u"") is unicode on Python 2 and str on Python 3, so this works
    # on both without referencing the Python-2-only name "unicode".
    if not isinstance(seqs, (str, type(u""))):
        raise ValueError("unknown type {}".format(type(seqs).__name__))

    if not os.path.isfile(seqs):
        raise ValueError("no file found with name {}".format(seqs))

    # If it loads as FASTA, it is FASTA.
    try:
        Fasta(seqs)
        return "fastafile"
    except Exception:
        pass

    # Otherwise classify by the first non-comment line; an empty file
    # leaves this as "" and ends up as "unknown type".
    line = ""
    try:
        with open(seqs) as f:
            for line in f:
                line = line.strip()
                if not line.startswith("#"):
                    break
    except Exception:
        raise ValueError("unknown type")

    if region_p.search(line):
        return "regionfile"

    vals = line.split("\t")
    if len(vals) >= 3:
        # BED needs integer start and end columns.
        try:
            int(vals[1])
            int(vals[2])
        except ValueError:
            raise ValueError("unknown type")
        return "bedfile"

    raise ValueError("unknown type")
Example #26
0
def number_of_seqs_in_file(fname):
    """Return the number of sequences (FASTA) or features (BED) in a file.

    The file is first read as FASTA and, if that fails, as BED. When
    neither works a message is written to stderr and the process exits
    with status 1.
    """
    # "except Exception" instead of a bare except, so KeyboardInterrupt
    # and SystemExit are not swallowed while probing the file type.
    try:
        fa = Fasta(fname)
        return len(fa)
    except Exception:
        pass

    try:
        bed = pybedtools.BedTool(fname)
        # Count without building a throwaway list.
        return sum(1 for _ in bed)
    except Exception:
        pass

    sys.stderr.write("unknown filetype {}\n".format(fname))
    sys.exit(1)
Example #27
0
    def calculate_enrichment(self, motif_file, fg, bg):
        """Scan foreground and background sets, then compute motif enrichment.

        fg: [sample_fa, sample_gff]
        bg: [[bg1_fa, bg1_gff, bg1_enrichment], [bg2_fa, bg2_gff, bg2_enrichment], .. etc]

        Each FASTA file is scanned with ``command_scan`` and the resulting
        lines are written to the matching GFF file. ``gff_enrichment`` is
        then called once per background set.
        """
        self.logger.debug("Scanning background sequences with motifs")

        # (input FASTA, output GFF) pairs: foreground first, then backgrounds
        scan_pairs = [(fg[0], fg[1])]
        scan_pairs += [entry[:2] for entry in bg]

        for fasta_path, gff_path in scan_pairs:
            with open(gff_path, "w") as gff:
                hits = command_scan(
                    fasta_path,
                    motif_file,
                    nreport=1,
                    cutoff=self.SCAN_THRESHOLD,
                    bed=False,
                    scan_rc=True,
                )
                for hit in hits:
                    gff.write(hit + "\n")

        self.logger.debug("Calculating enrichment")
        sample_gff = fg[1]
        num_sample = len(Fasta(fg[0]).items())
        for bg_fasta, bg_gff, enrichment_out in bg:
            num_bg = len(Fasta(bg_fasta).items())
            gff_enrichment(sample_gff, bg_gff, num_sample, num_bg,
                           enrichment_out)
Example #28
0
def create_denovo_motif_report(inputfile,
                               pfmfile,
                               fgfa,
                               background,
                               locfa,
                               outdir,
                               params,
                               stats=None):
    """Create text and graphical (.html) motif reports.

    Parameters
    ----------
    inputfile : str
        Input file name, passed on to the report writers.
    pfmfile : str
        File with the motifs to report on.
    fgfa : str
        Foreground FASTA file.
    background : dict
        Mapping of background name to background FASTA file.
    locfa : str
        FASTA file used for the localization plots.
    outdir : str
        Output directory; histograms go to ``images/`` inside it.
    params : dict or None
        Optional settings; ``"genome"`` and ``"cutoff_fpr"`` are read.
    stats : dict, optional
        Precomputed statistics (motif -> background -> stats). Computed
        with ``calc_stats`` when None.
    """
    logger.info("creating de novo reports")

    # Normalise params before its first use; the genome used to be read
    # before this check, which failed for params=None.
    if not params:
        params = {}

    motifs = read_motifs(pfmfile, fmt="pwm")

    # ROC plots
    create_roc_plots(pfmfile, fgfa, background, outdir, params.get("genome"))

    # Closest match in database
    mc = MotifComparer()
    closest_match = mc.get_closest_match(motifs)

    if stats is None:
        stats = {}
        for bg, bgfa in background.items():
            for m, s in calc_stats(fg_file=fgfa, bg_file=bgfa,
                                   motifs=motifs).items():
                stats.setdefault(m, {})[bg] = s

    stats = add_star(stats)

    cutoff_fpr = params.get("cutoff_fpr", 0.9)
    lsize = np.median([len(seq) for seq in Fasta(locfa).seqs])

    # Location plots
    logger.debug("Creating localization plots")
    for motif in motifs:
        logger.debug("  {} {}".format(motif.id, motif))
        outfile = os.path.join(outdir,
                               "images/{}_histogram.svg".format(motif.id))
        motif_localization(locfa, motif, lsize, outfile, cutoff=cutoff_fpr)

    # Create reports
    _create_text_report(inputfile, motifs, closest_match, stats, outdir)
    _create_graphical_report(inputfile, pfmfile, background, closest_match,
                             outdir, stats)
Example #29
0
    def _prepare_files(self, fastafile):
        """Copy the HMS input files into the temporary directory.

        The four ``theta<N>.txt`` files from ``self.dir()`` and the input
        FASTA file are copied to ``self.tmpdir``, and a summit file with
        the midpoint of every sequence (one per line) is written.

        Returns
        -------
        tuple
            ``(fgfile, summitfile, outfile)``: the copied FASTA file, the
            summit file and the path ``thetafinal.txt`` in ``self.tmpdir``.
        """
        hmsdir = self.dir()
        thetas = ["theta%s.txt" % i for i in [0, 1, 2, 3]]
        for t in thetas:
            shutil.copy(os.path.join(hmsdir, t), self.tmpdir)

        summitfile = os.path.join(self.tmpdir, "HMS.in.summits.txt")
        outfile = os.path.join(self.tmpdir, "thetafinal.txt")
        fgfile = os.path.join(self.tmpdir, "HMS.in.fa")

        shutil.copy(fastafile, fgfile)
        fa = Fasta(fgfile)
        with open(summitfile, "w") as out:
            for seq in fa.seqs:
                # Floor division keeps the summit an integer on Python 3
                # (true division would write e.g. "100.0").
                out.write("%s\n" % (len(seq) // 2))
        return fgfile, summitfile, outfile
Example #30
0
    def _run_program(self, bin, fastafile, savedir="", params=None):
        """Run HMS on a FASTA file and parse the motifs it produces.

        The input and the ``theta<N>.txt`` files are copied to
        ``self.tmpdir``, a summit file with the midpoint of every sequence
        is written, and the program is run from within ``self.tmpdir``.

        Parameters
        ----------
        bin : str
            HMS executable (inserted into a shell command line).
        fastafile : str
            Input FASTA file.
        savedir : str, optional
            Not used by this method.
        params : dict, optional
            Not used by this method.

        Returns
        -------
        tuple
            ``(motifs, stdout, stderr)``; ``motifs`` is an empty list when
            the program did not write ``thetafinal.txt``.
        """
        hms = bin
        thetas = ["theta%s.txt" % i for i in [0, 1, 2, 3]]

        fastafile = os.path.abspath(fastafile)

        fgfile = os.path.join(self.tmpdir, "HMS.in.fa")
        summitfile = os.path.join(self.tmpdir, "HMS.in.summits.txt")
        outfile = os.path.join(self.tmpdir, "thetafinal.txt")

        hmsdir = os.path.join(self.config.get_tools_dir(), "HMS")
        shutil.copy(fastafile, fgfile)
        for t in thetas:
            shutil.copy(os.path.join(hmsdir, t), self.tmpdir)

        fa = Fasta(fgfile)
        with open(summitfile, "w") as summits:
            for seq in fa.seqs:
                # Floor division keeps the summit an integer on Python 3.
                summits.write("%s\n" % (len(seq) // 2))

        cmd = "%s -i %s -w 21 -dna 4 -iteration 50 -chain 20 -seqprop -0.1 -strand 2 -peaklocation %s -t_dof 3 -dep 2" % (
            hms, fgfile, summitfile)

        # Run from the temporary directory and always restore the previous
        # working directory, even when the program cannot be started.
        current_path = os.getcwd()
        os.chdir(self.tmpdir)
        try:
            # universal_newlines=True yields text on Python 2 and 3 alike.
            p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE,
                      universal_newlines=True)
            stdout, stderr = p.communicate()
        finally:
            os.chdir(current_path)

        motifs = []
        if os.path.exists(outfile):
            with open(outfile) as f:
                motifs = self.parse(f)

        return motifs, stdout, stderr