def Compute_MAD(a, c=0.6745, axis=None):
        """
        Median Absolute Deviation along given axis of an array:

        median(abs(a - median(a))) / c

        c = 0.6745 is the constant to convert from MAD to std; it is used by
        default

        Copied from http://code.google.com/p/agpy/source/browse/trunk/agpy/mad.py
        Downloaded 7-Dec-2012.
        """

        LE.debug("Computing MAD for {0} #elements ({1}bytes)".format(
            len(a), sys.getsizeof(a)))

        d = VcfAnnotator.Median(a)
        nary = array.array('d')
        for i in a:
            nary.append(float(abs(i - d)) / c)
        nary = list(nary)
        nary.sort()
        mad = VcfAnnotator.Median(nary)
        del nary

        return mad
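# A minimal standalone sketch of the same computation, assuming only the
# standard library (VcfAnnotator.Median and LE are external to this
# snippet); handy for sanity-checking Compute_MAD on small inputs.
def _median(values):
    s = sorted(values)
    n = len(s)
    mid = n // 2
    return s[mid] if n % 2 else (s[mid - 1] + s[mid]) / 2.0

def mad_sketch(a, c=0.6745):
    m = _median(a)
    return _median([abs(x - m) / c for x in a])

# mad_sketch([1, 2, 3, 4, 100]) -> ~1.4826 (robust against the outlier)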
Example #2
    def merge(self):
        LE.debug("Doing merge, writing in " + self.output)
        filestomerge = [pysam.Samfile(i) for i in self.newsams]

        newHeader = {}

        rgs = {}
        for j in itertools.chain(*[i.header["RG"] for i in filestomerge]):
            rgs.setdefault(j["ID"], j)

        sqs = OrderedDict()
        for j in itertools.chain(*[i.header["SQ"] for i in filestomerge]):
            sqs.setdefault(j["SN"], j)

        newHeader["HD"] = filestomerge[0].header["HD"]
        newHeader["RG"] = rgs.values()
        newHeader["SQ"] = sqs.values()
        newHeader["CO"] = list(
            itertools.chain(*[i.header["CO"] for i in filestomerge]))

        pgs = list(itertools.chain(*[i.header["PG"] for i in filestomerge]))
        for i in pgs:
            newHeader["CO"].append("\t".join([":".join(k) for k in i.items()]))
        newHeader["CO"] = list(set(newHeader["CO"]))
        newHeader["CO"].append("CMD:{0}".format(" ".join(sys.argv)))
        for i in self.commandsHistory:
            newHeader["CO"].append("CMD:{0}".format(i))

        outBam = pysam.Samfile(self.output, "wb", header=newHeader)
        for j in filestomerge:
            for i in j:
                outBam.write(i)
        outBam.close()
        # legacy pysam API: sort(in_bam, out_prefix) writes out_prefix + ".bam"
        pysam.sort(self.output, self.output)
        shutil.move(self.output + ".bam", self.output)
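# Standalone sketch of the dedup-by-key pattern used for the RG/SQ header
# lists above: chain the per-file entries and keep the first occurrence of
# each ID. The input dicts here are hypothetical examples.
import itertools
from collections import OrderedDict

rg_lists = [[{"ID": "rg1"}, {"ID": "rg2"}], [{"ID": "rg1"}]]
rgs = OrderedDict()
for entry in itertools.chain(*rg_lists):
    rgs.setdefault(entry["ID"], entry)
print(list(rgs.values()))  # [{'ID': 'rg1'}, {'ID': 'rg2'}]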
	def Index_Fasta( self ):
		"""
        Create/update .fasta.fai file using samtools faidx.
        """
		LE.info('Creating index {0}.fai.'.format(self.outfastapath))
		self.cleanUpExecution(
			*COMPASSCFG['tools']['samtools'].execute(append="faidx {0}".format(self.outfastapath)))
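# Hedged sketch of what the wrapped call above amounts to, assuming
# samtools is on PATH (in the pipeline COMPASSCFG resolves the binary):
import subprocess

def index_fasta_sketch(fasta_path):
    # writes fasta_path + ".fai" next to the input
    subprocess.check_call(["samtools", "faidx", fasta_path])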
Example #4
 def substitutePars(self, cad):
     vardict = dict([(i, getattr(self, i)) for i in dir(self)
                     if i.startswith("mpileup") or i.startswith("bcftools")
                     or i.startswith("opts")])
     LE.debug("Running command: [{0}]".format(cad.format(**vardict)))
     cmdStr = cad.format(**vardict)
     return cmdStr
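# Usage sketch of the substitution above: attribute names become format()
# keys, so "{mpileup_q}" in a template pulls self.mpileup_q. The attribute
# names below are hypothetical examples.
class _Caller(object):
    mpileup_q = 20
    opts_threads = 4

vardict = dict((i, getattr(_Caller, i)) for i in dir(_Caller)
               if i.startswith("mpileup") or i.startswith("opts"))
print("samtools mpileup -q {mpileup_q} --threads {opts_threads}".format(**vardict))
# -> samtools mpileup -q 20 --threads 4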
    def merge(self):
        LE.info("Merging SAMfiles from different readgroup mappings")

        inbam = pysam.Samfile(self.input)

        self.seqstat = self.generateSeqStats(self.input)

        newheaders = dict(inbam.header.items())
        newheaders["SQ"] = inbam.header["SQ"]
        inbam.close()

        newheaders["PG"] = [{
            "PN": "bwa",
            "VN": "0.7.10",
            "CL": self.commandsHistory[0]
        }]
        if "CO" in newheaders.keys():
            newheaders["CO"] = list(set(newheaders["CO"]))
        else:
            newheaders["CO"] = []
        newheaders["CO"].append("CMD:{0}".format(" ".join(sys.argv)))
        for i in self.commandsHistory:
            newheaders["CO"].append("CMD:{0}".format(i))

        LE.debug("Doing merge, writing in " + self.output)
        unsortedBamName = self.output + "_unsorted.bam"

        output = pysam.Samfile(unsortedBamName, "wb", header=newheaders)

        for i in self.readgroups:
            with pysam.Samfile(i + "_alignment.sam") as source:
                for j in source:
                    if not j.flag & 2048:  # skip supplementary alignments
                        output.write(j)
            # os.unlink(self.tmpdir + "/" + i + ".sam")
        output.close()
        # legacy pysam API: sort(in_bam, out_prefix) writes out_prefix + ".bam"
        pysam.sort(unsortedBamName, self.output)

        os.unlink(unsortedBamName)
        shutil.move(self.output + ".bam", self.output)
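# Sketch of the SAM flag test used in the merge above: bit 2048 (0x800)
# marks supplementary alignments, so `flag & 2048` is nonzero exactly for
# the records the merge skips.
def is_supplementary(flag):
    return bool(flag & 2048)

assert is_supplementary(2048)
assert not is_supplementary(99)  # 99 = paired, proper pair, mate reverse, first in pair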
	def cleanUpExecution( self, cmd, stdout, stderr, errcode ):
		"""

        Args:
            cmd:
            stdout:
            stderr:
            errcode:
        """
		LE.debug(StringIO(stdout))
		if errcode:
			LE.error(StringIO(stderr))
			raise Exception(
				"CMD [{0}] exit with status [{1}]".format(cmd, errcode))
Example #7
    def map(self):
        sf = pysam.Samfile(self.input)
        rgs = [i['ID'] for i in sf.header["RG"]]
        sf.close()
        self.newsams = [
            os.path.join(os.getcwd(), str(uuid.uuid4())) + ".sam"
            for i in range(len(rgs))
        ]

        for readgroup, samout in zip(rgs, self.newsams):
            append = "--substitutionrate={0} -g {1} -h {1} -M {2} -o {3} --logfile={3}.log --readgroup=ID:{4} --outputformat=sam -v 3 ".format(
                self.subrate, self.ref, self.input, samout, readgroup)

            if self.keepgoodreads: append += " --bamkeepgoodreads "
            if self.alignquals: append += " --alignquals "
            if self.baq: append += " --baq "

            cmd, stdout, stderr, errcode = COMPASSCFG["tools"][
                "stampy"].execute(append=append)
            LE.debug(StringIO(stdout), "stdout")
            LE.debug(StringIO(stderr), "stderr")

            if errcode:
                LE.critical("Stampy execution failed {0}".format(errcode))
                raise Exception("Stampy execution failed {0}".format(errcode))

            self.insertsizes[readgroup] = self.generateSeqStats(samout)
    def markDuplicates(self):
        cmd, stdout, stderr, errcode = COMPASSCFG["tools"]["picard"].execute(
            source="path",
            prepend="java -jar",
            file="MarkDuplicates.jar",
            append=
            "I={0} O={1}.dedup METRICS_FILE=metrics.txt ASSUME_SORTED=true VERBOSITY=DEBUG VALIDATION_STRINGENCY=SILENT"
            .format(self.output, self.output))

        LE.debug(StringIO(stdout), "stdout")
        LE.debug(StringIO(stderr), "stderr")

        if errcode:
            LE.critical("MarkDuplicates execution failed {0}".format(errcode))
            raise Exception("MarkDuplicates execution failed {0}".format(errcode))

        shutil.move(self.output + ".dedup", self.output)
	def Fix_Fasta_Headers( self ):
		"""
        Create correctly formatted fasta file. Contigs must be in the form REFID[, REFID-2, REFID-3,...].
        """
		
		LE.info('Creating master fasta file {0}.'.format(self.outfastapath))
		
		# helper function to reformat each fasta record on the fly
		def _fixed_records():
			"""

            """
			for i, contig in enumerate(SeqIO.parse(self.infasta, 'fasta'), 1):
				# the first contig keeps the bare refid; later ones get a -N suffix
				correct_name = contig.id = self.newrefid + \
				                           ('-{0}'.format(i), '')[i == 1]
				if contig.name != correct_name:
					contig.name = correct_name
					contig.description = '{0} {1} {2}'.format(
						correct_name, self.newrefid, contig.description)
				yield contig
		
		SeqIO.write(_fixed_records(), self.outfastapath, 'fasta')
		return
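# Sketch of the renaming scheme above: the first contig keeps the bare
# refid, later contigs get a -N suffix. newrefid is a hypothetical value.
newrefid = "REF1"
names = [newrefid + ('-{0}'.format(i), '')[i == 1] for i in range(1, 4)]
print(names)  # ['REF1', 'REF1-2', 'REF1-3']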
    def map(self):
        for i in self.readgroups:
            fq1 = self.tmpdir + "/" + i + "-A.fq"
            fq2 = self.tmpdir + "/" + i + "-B.fq"

            output = open(self.tmpdir + "/" + i + ".sam", "w")
            p = COMPASSCFG["tools"]["bwa"].popen(
                append="mem -R '@RG\\tID:{0}' {1} {2} {3} -L 20 -B 3 -O 6 -T 20"
                .format(i, self.ref, fq1, fq2),
                stderr=subprocess.PIPE,
                stdout=output)
            self.commandsHistory.append(p.cmd)

            LE.debug(p.stderr, "stderr")
            errcode = p.wait()
            output.close()

            if errcode:
                LE.error("BWA tool failed")
                raise Exception("BWA tool failed")
            os.unlink(fq1)
            os.unlink(fq2)
            samout = self.tmpdir + "/" + i + ".sam"
            self.insertsizes[i] = self.generateSeqStats(samout)
Example #11
    def markDuplicates(self):
        cmd, stdout, stderr, errcode = COMPASSCFG['tools']['picard'].execute(
            source='path',
            file='MarkDuplicates.jar',
            prepend='java -jar',
            append=
            "I={0} O={0}.dedup METRICS_FILE={1}_metrics.txt ASSUME_SORTED=true VERBOSITY=DEBUG VALIDATION_STRINGENCY=SILENT"
            .format(self.output, self.input))
        LE.debug(StringIO(stdout), "stdout")
        LE.debug(StringIO(stderr), "stderr")

        if errcode:
            LE.critical("MarkDuplicates execution failed {0}".format(errcode))
            raise Exception(
                "MarkDuplicates execution failed {0}".format(errcode))

        shutil.move(self.output + ".dedup", self.output)
	def execCommand( self, cmd ):
		"""

        Args:
            cmd:
        """
		origcmd = cmd
		LE.debug("Running command: [{0}]".format(cmd))
		cmd = shlex.split(cmd)
		p = subprocess.Popen(cmd, stdout=subprocess.PIPE,
		                     stderr=subprocess.PIPE)
		stdout, stderr = p.communicate()
		errcode = p.wait()
		LE.debug(StringIO(stdout))
		if errcode:
			LE.error(StringIO(stderr))
			raise Exception(
				"CMD [{0}] exit with status [{1}]".format(origcmd, errcode))
	def generateAll( self ):
		self.Fix_Fasta_Headers()
		self.Create_Indexes()
		self.Make_Repeat_Mask_Txt()
		LE.info("Everything went OKAY!")
Example #14
    parser.add_argument(
        '-dh',
        dest="headerinfo",
        help=
        "Default unmapped header; specify [readgroup,platform,lib,sample,SeqCentre], e.g.: -dh RG0045,ILLUMINA,LIB03,SN123,Sanger",
        default=None)
    parser.add_argument(
        '-o',
        dest="output",
        help="Output BAM file ('-' writes to stdout)",
        default="-")
    args = parser.parse_args()

    # print " ".join(sys.argv)

    LE.info("Input fastq files in {0} , {1}".format(args.fq1, args.fq2))
    LE.info("Output bam in {0}".format(args.output))

    if args.headerinfo == "None":
        args.headerinfo = None

    if (not args.header and not args.headerinfo) or (args.header
                                                     and args.headerinfo):
        print(
            "You must specify either a header file for the SAM header or default header information (-H/-dh)"
        )
        sys.exit(-1)
    '''if args.output == "-":
        output = sys.stdout
    else:
        output = open(args.output, "w")'''
Example #15
            parser.add_argument(i[0],
                                dest=i[1],
                                help=i[4],
                                default=False,
                                action='store_true')
        else:
            parser.add_argument(i[0], dest=i[1], help=i[4], default="DISABLED")
    args = parser.parse_args()

    try:
        for i in FILTERS.availableFilters():
            value = getattr(args, i[1])
            if value and value != "DISABLED":
                FILTERS.setUpFilter(i[1], value)
    except ParameterError as e:
        LE.critical("Parameter setup Error: {0}".format(e.message))
        dump_exc()

    try:
        vcfFile = GormVcf(args.invcf)
        FILTERS.filterVcf(vcfFile,
                          outvcf_path=args.outvcf,
                          outvcfIndel=args.outvcfIndel,
                          outfasta=args.outfasta,
                          stats=args.outstats,
                          guuid=args.guuid,
                          refid=args.ref_id)
        print("Done")
    except:
        dump_exc()
	def Make_Repeat_Mask_Txt( self, word_size=17, gapopen=5, e_thresh=0.0001, perc_identity=90, gapextend=2,
	                          min_length=75 ):
		"""
        Run blastn on contigs in input fasta file against database dbname. Parameters set to NCBI recommended defaults for blastn.
        """
		outfastapath = os.path.join(
			self.outdir, '{0}.fasta'.format(self.newrefid))
		prefix = os.path.join(self.outdir, self.newrefid)
		maskpath = prefix + '_repmask.array'
		regionspath = prefix + '_repregions.array'
		statspath = prefix + '.stats'
		
		blastn_cline = blastn(cmd=COMPASSCFG['tools']['blast']['path'] + "blastn", db=prefix, query=outfastapath,
		                      dust='no', word_size=word_size, gapopen=gapopen, gapextend=gapextend, evalue=e_thresh,
		                      perc_identity=perc_identity,
		                      outfmt='"6 qseqid sseqid pident length qstart qend sstart send"')
		try:
			blast_out, blast_err = blastn_cline()
			assert not blast_err
		except (AppError, AssertionError) as err:
			raise Exception(
				'Error: Blast failed during construction of repeat mask: {0}'.format(err))
		
		repmask_fp = open(maskpath, 'w')
		repregions_fp = open(regionspath, 'w')
		total_bp = 0
		repetitive_bp = 0
		num_regions = 0
		
		# blast tabular output is read line by line; hits are grouped by query contig
		blast_stream = StringIO(blast_out)
		prev_header = None
		for contig_count, contig in enumerate(SeqIO.parse(outfastapath, 'fasta'), 1):
			if prev_header != contig.name:
				repregions_fp.write('>{0}\n'.format(contig.name))
				prev_header = contig.name
			total_bp += len(contig)
			repmask = np.zeros(len(contig), dtype=np.bool)
			try:
				fields = blast_stream.next().split()
			except StopIteration:
				fields = None
			while fields and fields[0] == contig.name:
				contig_name, match_name = fields[:2]
				hit_perc_ident = float(fields[2])
				hit_length, q_start, q_end, s_start, s_end = (
					int(x) for x in fields[3:])
				(x1, y1), (x2, y2) = sorted(
					((q_start, q_end), sorted((s_start, s_end))))
				if hit_length >= min_length and (contig_name != match_name or not (x2 <= x1 <= y2 and x2 <= y1 <= y2)):
					repmask[q_start - 1:q_end] = True
				try:
					fields = blast_stream.next().split()
				except StopIteration:  # end of blast hits
					fields = None
			# output repmask as 1 and 0, 100 per line
			repmask_fp.write('>{0}\n'.format(contig.name))
			for i in xrange(0, len(repmask), 100):
				j = min(i + 100, len(repmask))
				repmask_fp.write('{0}\n'.format(''.join(str(b)
				                                        for b in repmask[i:j].astype(int))))
			# identify positions of repetitive regions (runs of 1s in the
			# repmask array)
			# 0-based numbering
			region_starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)
			region_ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)
			# special case: full blast hit for this contig against another
			# contig
			if repmask.all():
				region_starts = [0]
				region_ends = [len(repmask)]
			# fix ends, in case regions start from the first position in the
			# sequence or end at the last
			if region_starts and ((not region_ends) or (region_starts[-1] > region_ends[-1])):
				region_ends.append(len(repmask))
			if region_ends and ((not region_starts) or (region_starts[0] > region_ends[0])):
				region_starts = [0] + region_starts
			repregions_fp.writelines('{0}\t{1}\n'.format(
				rs, re) for rs, re in izip(region_starts, region_ends))
			repetitive_bp += repmask.sum()
			num_regions += len(region_starts)
		
		repmask_fp.close()
		repregions_fp.close()
		pct_repetitive = '{0:.2f}'.format(
			(float(repetitive_bp) / total_bp) * 100)
		LE.debug(
			'Info: Repetitive regions for all of {0}: {1}/{2} bp ({3}%)'.format(self.newrefid, repetitive_bp, total_bp,
			                                                                    pct_repetitive))
		
		# save result summary
		statsvalues = '\t'.join((self.newrefid, self.newrefid, str(contig_count), str(total_bp), str(repetitive_bp),
		                         str(num_regions), pct_repetitive))
		with open(statspath, 'w') as o:
			o.write('refid\trefcd\tcontigs\tnumbp\trepetitivebp\trepregions\trepetitivepct\n{values}\n'.format(
				values=statsvalues))
		return
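# Standalone sketch of the boundary detection above: comparing the mask to
# a shifted copy of itself finds 0->1 (region start) and 1->0 (region end)
# transitions; the fix-up code above handles runs touching either end.
import numpy as np

repmask = np.array([0, 1, 1, 0, 0, 1, 1, 1, 0], dtype=bool)
starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)  # [1, 5]
ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)    # [3, 8]
# repetitive regions, 0-based half-open: [1, 3) and [5, 8)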
Example #17
 def dumpStdError(self):
     self.stderror.seek(0)
     LE.error(self.stderror)
    def createFQs(self):
        fileList = {}
        for i in self.readgroups:
            fileList[i] = [
                open(self.tmpdir + "/" + i + "-1.fq", "w"),
                open(self.tmpdir + "/" + i + "-2.fq", "w")
            ]

        for i in pysam.Samfile(self.input):
            rg = dict(i.tags)["RG"]
            if i.flag & 64:  # first read in pair
                fileList[rg][0].write("@{0}\n{1}\n+\n{2}\n".format(
                    i.qname, i.seq, i.qual))
            else:
                fileList[rg][1].write("@{0}\n{1}\n+\n{2}\n".format(
                    i.qname, i.seq, i.qual))

        names = os.listdir(self.path)
        pattern = re.compile(
            'output[0-9]_[0-9a-z]{8}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{4}-[0-9a-z]{12}_bam.bam'
        )
        for name in names:
            match = re.search(pattern, name)
            if match:
                bam = os.path.join(self.path, name)
                for i in pysam.Samfile(bam):
                    rg = dict(i.tags)["RG"]
                    if i.flag & 64:  # first read in pair
                        fileList[rg][0].write("@{0}\n{1}\n+\n{2}\n".format(
                            i.qname, i.seq, i.qual))
                    else:
                        fileList[rg][1].write("@{0}\n{1}\n+\n{2}\n".format(
                            i.qname, i.seq, i.qual))

        for i in fileList.values():
            i[0].close()
            i[1].close()

        for i in self.readgroups:
            fqSort(FastQReader(self.tmpdir + "/" + i + "-1.fq"),
                   self.tmpdir + "/" + i + "-1.sort")
            fqSort(FastQReader(self.tmpdir + "/" + i + "-2.fq"),
                   self.tmpdir + "/" + i + "-2.sort")
            os.unlink(self.tmpdir + "/" + i + "-1.fq")
            os.unlink(self.tmpdir + "/" + i + "-2.fq")

        for i in self.readgroups:
            pn = PairFqNormalizer(self.tmpdir + "/" + i + "-1.sort",
                                  self.tmpdir + "/" + i + "-2.sort",
                                  FastQWriter(self.tmpdir + "/" + i + "-A.fq"),
                                  FastQWriter(self.tmpdir + "/" + i + "-B.fq"),
                                  True, 1)
            pn.normalize()
            os.unlink(self.tmpdir + "/" + i + "-1.sort")
            os.unlink(self.tmpdir + "/" + i + "-2.sort")

        self.fqfiles = [(self.tmpdir + "/" + i + "-A.fq",
                         self.tmpdir + "/" + i + "-B.fq")
                        for i in self.readgroups]
        LE.debug("FQs created")
Example #19
        calcLDforadjacentsites=options.bcftools_calcLDforadjacentsites,
        scaledsubstmutrate=options.bcftools_scaledsubstmutrate,
        indeltosubstratio=options.bcftools_indeltosubstratio,
        variantifprobltint=options.bcftools_variantifprobltint,
        typeofprior=options.bcftools_typeofprior,
        inbam=options.inbam,
        inref=options.ref_id,
        pileup_out=options.outpileup)
    try:
        c.runPileup()

        if c.annotate():
            print "Error with annotation!"
            c.dumpStdError()
            sys.exit(-1)
        c.merge(options.output)
        c.clean()
        LE.info("Finished!")

    except:
        try:
            c.dumpStdError()
            c.clean()
        except:
            pass
        dump_exc()