Example #1
        def testFromTabix(self):

            # use ascii encoding - should raise error
            t = pysam.TabixFile(self.tmpfilename + ".gz", encoding="ascii")
            results = list(t.fetch(parser=pysam.asVCF()))
            self.assertRaises(UnicodeDecodeError, getattr, results[1], "id")

            t = pysam.TabixFile(self.tmpfilename + ".gz", encoding="utf-8")
            results = list(t.fetch(parser=pysam.asVCF()))
            self.assertEqual(getattr(results[1], "id"), u"Rene\xe9")
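For orientation, a minimal sketch of the pattern these tests exercise, assuming a hypothetical example.vcf.gz that is bgzip-compressed and tabix-indexed:

    import pysam

    # hypothetical path; any bgzipped, tabix-indexed VCF works
    tbx = pysam.TabixFile("example.vcf.gz", encoding="utf-8")
    for rec in tbx.fetch("chr1", 1000, 2000, parser=pysam.asVCF()):
        # rec.pos is zero-based; the POS column in the file is one-based
        print(rec.contig, rec.pos + 1, rec.id, rec.ref, rec.alt)
    tbx.close()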
Example #2
 def fetch(self, chrom, start, end):
     """ yield tuples (vcf object + info dict key->val) for a range 
     vcf row attributes are: 
     contig
     pos chromosomal position, zero-based
     id
     ref reference
     alt alt
     qual qual
     filter filter
     info info
     format format specifier.
     """
     chrom = chrom.replace("chr", "")
     #fname = self.fnameDict[chrom]
     #vcf = pysam.VCF()
     #vcf.connect(fname)
     tbi = self.fhDict[chrom]
     it = tbi.fetch(chrom, start, end, parser=pysam.asVCF())
     for row in it:
         infoDict = {}
         infoStr = row.info
         for keyVal in infoStr.split(";"):
             if "=" not in keyVal:
                 continue
              key, val = keyVal.split("=", 1)  # values may themselves contain '='
             infoDict[key] = val
         yield row, infoDict
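A hedged usage sketch for the wrapper above (assuming reader is an instance of the surrounding class, which keeps one Tabix handle per chromosome in fhDict):

    # INFO values arrive as strings; convert as needed
    for row, info in reader.fetch("chr1", 10000, 20000):
        depth = int(info["DP"]) if "DP" in info else None
        print(row.contig, row.pos + 1, row.ref, row.alt, depth)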
Example #3
    def testRead(self):

        ncolumns = len(self.columns)

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asVCF())):
            c = self.compare[x]
            for y, field in enumerate(self.columns):
                # it is ok to have a missing format column
                if y == 8 and y == len(c):
                    continue
                if field == "pos":
                    self.assertEqual(int(c[y]) - 1, getattr(r, field))
                    self.assertEqual(int(c[y]) - 1, r.pos)
                else:
                    self.assertEqual(
                        c[y], getattr(r, field),
                        "mismatch in field %s: %s != %s" %
                        (field, c[y], getattr(r, field)))
            if len(c) == 8:
                self.assertEqual(0, len(r))
            else:
                self.assertEqual(len(c), len(r) + ncolumns)

            for y in range(len(c) - ncolumns):
                self.assertEqual(c[ncolumns + y], r[y])
Example #4
 def __init__(self, file_path, parser=pysam.asVCF()):
     self.vcf_file_path = file_path
     self.tabix_file = pysam.TabixFile(file_path, parser=parser)
     self.sample_names = self.read_sample_names()
     self.clens = self.contig_lengths()
     self.indexDelta = -1 if tuple(map(
         int, pysam.__version__.split('.'))) > (0, 5, 0) else 0
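The last line compares version tuples rather than strings, so that e.g. '0.10.0' sorts after '0.5.0'; a standalone sketch of the same check, assuming a purely numeric pysam version string:

    import pysam

    version = tuple(map(int, pysam.__version__.split('.')))
    # per the class above: for releases newer than 0.5.0 an offset of -1
    # is applied to fetch coordinates (pysam intervals are zero-based)
    index_delta = -1 if version > (0, 5, 0) else 0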
Example #5
    def testRead(self):

        ncolumns = len(self.columns)

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asVCF())):
            c = self.compare[x]
            for y, field in enumerate(self.columns):
                # it is ok to have a missing format column
                if y == 8 and y == len(c):
                    continue
                if field == "pos":
                    self.assertEqual(int(c[y]) - 1, getattr(r, field))
                    self.assertEqual(int(c[y]) - 1, r.pos)
                else:
                    self.assertEqual(c[y], getattr(r, field),
                                     "mismatch in field %s: %s != %s" %
                                     (field, c[y], getattr(r, field)))
            if len(c) == 8:
                self.assertEqual(0, len(r))
            else:
                self.assertEqual(len(c), len(r) + ncolumns)

            for y in range(len(c) - ncolumns):
                self.assertEqual(c[ncolumns + y], r[y])
            self.assertEqual("\t".join(map(str, c)),
                             str(r))
Example #6
class VcfReader:
    """
    Read comfortably from VCF style files with main focus on chr, start, ref and alt fields.
    Note that this API uses 1-based coordinates with both start and end included in the interval. 
    PySam API uses 0-based half-open intervals, so we have to convert internally.
    """
    def __init__(self, input_file):
        self.filename = input_file
        self.indexed = False

        if input_file.strip() == "-":
            ifile = sys.stdin
        elif input_file.endswith(".bz2"):
            try:
                ifile = bz2file.BZ2File(input_file, "r", buffering=0)
            except Exception, e:
                raise e
        elif input_file.endswith(".gz") or input_file.endswith(".bgz"):
            # try to open the file with Tabix
            try:
                ifile = pysam.Tabixfile(input_file, parser=pysam.asVCF())
                self.indexed = True
            except Exception, e:
                try:
                    ifile = gzip.GzipFile(input_file, "r")
                except Exception, e:
                    raise e
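The docstring's coordinate conversion amounts to shifting the start down by one; a minimal helper sketch (the function name is illustrative):

    def to_pysam_interval(start_1based, end_1based):
        # 1-based closed [start, end] -> 0-based half-open [start - 1, end)
        return start_1based - 1, end_1based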
Example #7
def filter_vcfs(genotype, contig, start, end):
    if contig in genotype.contigs:
        parser = pysam.asVCF()
        # This raises a ValueError if the VCF does not
        # contain any entries for the specified contig.
        for vcf in genotype.fetch(contig, start, end, parser=parser):
            if vcf.filter in ("PASS", "."):
                yield vcf
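Given the ValueError noted in the comment, callers may want a guard; a hedged sketch, with genotype assumed to be an open pysam.TabixFile as above:

    try:
        passing = list(filter_vcfs(genotype, "chr20", 0, 1000000))
    except ValueError:
        # region invalid or absent from the index
        passing = []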
Example #8
    def fetch(self,
              reference = None,
              start = None, 
              end = None, 
              region = None ):
        """ Parse a stream of VCF-formatted lines.  Initializes class instance and return generator """

        iter = self.tabixfile.fetch( reference, start, end, region, parser = pysam.asVCF() )
        for x in iter:
            yield VCFRecord( x, self )
Example #9
def get_contigs(fns):
    contigs = set()
    vcfs = []
    for v in fns:
        tbx = pysam.TabixFile(v, parser=pysam.asVCF())
        cs = tbx.contigs
        contigs.update(cs)
        vcfs.append(tbx)
    return (sorted(contigs, key=cmp_to_key(chrom_cmp)), vcfs)
Example #10
    def records(self, chromosome_num, start_pos_bp, end_pos_bp, parser=pysam.asVCF()):
        '''
            Returns an iterator for the file records.
        '''

        start_pos_bp = 1 if start_pos_bp is None else start_pos_bp
        end_pos_bp = self.clens[str(chromosome_num)] if end_pos_bp is None else end_pos_bp

        # subtract one to convert to pysam's 0-based coordinates
        records = self.tabix_file.fetch( str(chromosome_num), start_pos_bp+self.indexDelta, end_pos_bp+self.indexDelta, parser)

        return records
Example #11
    def records(self, chromosome_num, start_pos_bp, end_pos_bp, parser=pysam.asVCF()):
        '''
            Returns an iterator for the file records.
        '''

        start_pos_bp = 1 if start_pos_bp is None else start_pos_bp
        end_pos_bp = self.clens[str(chromosome_num)] if end_pos_bp is None else end_pos_bp

        # subtract one to convert to pysam's 0-based coordinates
        records = self.tabix_file.fetch( str(chromosome_num), start=start_pos_bp+self.indexDelta, end=end_pos_bp+self.indexDelta, parser=parser)

        return records
Example #12
def check_nth_sample(options, genotype):
    parser = pysam.asVCF()

    for contig in genotype.contigs:
        for record in text.parse_lines(genotype.fetch(contig), parser):
            if len(record) <= options.nth_sample:
                sys.stderr.write("ERROR: Sample %i selected with --nth-sample,"
                                 " but file only contains %i sample(s)!\n"
                                 % (options.nth_sample + 1, len(record)))
                return False
            return True
    return True
Example #13
def check_nth_sample(options, genotype):
    parser = pysam.asVCF()

    for contig in genotype.contigs:
        for record in genotype.fetch(contig, parser=parser):
            if len(record) <= options.nth_sample:
                sys.stderr.write("ERROR: Sample %i selected with --nth-sample,"
                                 " but file only contains %i sample(s)!\n" %
                                 (options.nth_sample + 1, len(record)))
                return False
            return True
    return True
Example #14
def _read_files(filenames, args):
    in_header = True
    has_filters = False
    vcf_parser = pysam.asVCF()
    for line in fileinput.input(filenames):
        if not line.startswith("#"):
            in_header = False
            line = line.rstrip("\n\r")
            yield vcf_parser(line, len(line))
        elif in_header:
            if not (line.startswith("##") or has_filters):
                has_filters = True
                for item in sorted(vcffilter.describe_filters(args).items()):
                    print('##FILTER=<ID=%s,Description="%s">' % item)

            print(line, end="")
Example #15
 def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()):
     TabixReader.__init__(self, inFile, parser=parser)
     assert ploidy in (1, 2)
     self.ploidy = ploidy
     self.clens = []
     self.sample_names = None
     for line in self.header:
         if line.startswith('##contig=<ID=') and line.endswith('>'):
             line = line[13:-1]
             c = line.split(',')[0]
             clen = int(line.split('=')[1])
             self.clens.append((c, clen))
         elif line.startswith('#CHROM'):
             row = line.split('\t')
             self.sample_names = row[9:]
     self.clens = dict(self.clens)
     assert self.sample_names
Example #16
    def testRead(self):

        ncolumns = len(self.columns)

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asVCF())):
            c = self.compare[x]
            for y, field in enumerate(self.columns):
                if field == "pos":
                    self.assertEqual(int(c[y]) - 1, getattr(r, field))
                    self.assertEqual(int(c[y]) - 1, r.pos)
                else:
                    self.assertEqual( c[y], getattr( r, field ),
                                      "mismatch in field %s: %s != %s" %\
                                          ( field,c[y], getattr( r, field ) ) )
            self.assertEqual(len(c), len(r) + ncolumns)

            for y in range(len(c) - ncolumns):
                self.assertEqual(c[ncolumns + y], r[y])
Example #17
    def testWrite(self):

        ncolumns = len(self.columns)

        for x, r in enumerate(self.tabix.fetch(parser=pysam.asVCF())):
            c = self.compare[x]
            # check unmodified string
            cmp_string = str(r)
            ref_string = "\t".join([x for x in c])

            self.assertEqual(ref_string, cmp_string)

            # set fields and compare field-wise
            for y, field in enumerate(self.columns):
                # it is ok to have a missing format column
                if y == 8 and y == len(c):
                    continue
                if field == "pos":
                    rpos = getattr(r, field)
                    self.assertEqual(int(c[y]) - 1, rpos)
                    self.assertEqual(int(c[y]) - 1, r.pos)
                    # increment pos by 1
                    setattr(r, field, rpos + 1)
                    self.assertEqual(getattr(r, field), rpos + 1)
                    c[y] = str(int(c[y]) + 1)
                else:
                    setattr(r, field, "test_%i" % y)
                    c[y] = "test_%i" % y
                    self.assertEqual(
                        c[y], getattr(r, field),
                        "mismatch in field %s: %s != %s" %
                        (field, c[y], getattr(r, field)))

            if len(c) == 8:
                self.assertEqual(0, len(r))
            else:
                self.assertEqual(len(c), len(r) + ncolumns)

            for y in range(len(c) - ncolumns):
                c[ncolumns + y] = "test_%i" % y
                r[y] = "test_%i" % y
                self.assertEqual(c[ncolumns + y], r[y])
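As the test demonstrates, asVCF records are writable proxies: fields can be reassigned, and str() re-renders the whole tab-separated line. A minimal sketch (file name hypothetical):

    import pysam

    tbx = pysam.TabixFile("example.vcf.gz")
    for rec in tbx.fetch(parser=pysam.asVCF()):
        rec.id = "rs0"       # fields can be reassigned in place
        print(str(rec))      # the modified row, re-rendered
        break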
Example #18
def _get_hits(coords, annotation, parser_type):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    if parser_type == "bed":
        parser = pysam.asBed()
    elif parser_type == "vcf":
        parser = pysam.asVCF()
    elif parser_type == "tuple":
        parser = pysam.asTuple()
    elif parser_type is None:
        parser = None
    else:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
        hit_iter = []
    return hit_iter
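A hedged usage sketch (annotation path hypothetical; the BED file must be bgzipped and tabix-indexed):

    import pysam

    annotation = pysam.TabixFile("annotations.bed.gz")
    for hit in _get_hits(("chr1", 1000, 2000), annotation, "bed"):
        print(hit.contig, hit.start, hit.end)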
Example #19
def GetSumOfDifferencesFromTheReference(vcfpath):
    from subprocess import check_call
    from utilBMF.HTSUtils import TrimExt
    import pysam
    import numpy as np
    from sys import stderr
    from itertools import chain
    cfi = chain.from_iterable
    bgvcfpath = TrimExt(vcfpath) + ".gz"
    check_call("bgzip -c %s > %s" % (vcfpath, bgvcfpath), shell=True)
    stderr.write("bgvcf now at %s" % bgvcfpath)
    tabixstr = "tabix " + bgvcfpath
    stderr.write("Now calling tabixstr: '%s'" % tabixstr)
    check_call("tabix %s" % bgvcfpath, shell=True)
    infh = open(bgvcfpath, "rb")
    tabixhandle = pysam.tabix_iterator(infh, pysam.asVCF())
    return np.sum(np.array(list(cfi([dict(tup.split("=") for
                                          tup in i.info.split(";"))[
        'I16'].split(",")[2:4] for i in tabixhandle if
                                     "INDEL" not in i.info])), dtype=np.int64))
Example #20
def select_vcf_records(bed_records, vcf_records):
    """Returns an iterable of VCF records, corresponding to the contents of each
    region specified by the BED records. Records are returned at most once, even
    if covered by multiple BED records."""
    contigs = frozenset(vcf_records.contigs)
    vcf_parser = pysam.asVCF()

    # Timer class used to report processing progress; meant primarily for BAM files
    progress = timer.BAMTimer(None)

    # Cache of positions observed for this contig, to prevent returning
    # positions in overlapping regions multiple times
    contig_cache = None
    contig_cache_name = None

    for bed in sorted(bed_records):
        if bed.contig not in contigs:
            # Skip contigs for which no calls have been made (e.g., due to
            # low coverage); otherwise pysam raises an exception.
            continue
        elif contig_cache_name != bed.contig:
            # Reset cache per contig, to save memory
            contig_cache = set()
            contig_cache_name = bed.contig

        for record in vcf_records.fetch(bed.contig,
                                        bed.start,
                                        bed.end,
                                        parser=vcf_parser):
            progress.increment()

            if record.pos in contig_cache:
                # We've already reported this VCF record
                continue

            contig_cache.add(record.pos)
            # Skip records filtered by VCF_filter
            if record.filter in ('.', "PASS"):
                yield record
    progress.finalize()
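A hedged usage sketch (paths hypothetical; read_bed_records stands in for whatever BED reader the surrounding project supplies):

    import pysam

    vcf_records = pysam.TabixFile("calls.vcf.gz")
    bed_records = read_bed_records("regions.bed")  # assumed helper
    for record in select_vcf_records(bed_records, vcf_records):
        print(record.contig, record.pos + 1, record.filter)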
Example #21
def _read_files(args):
    in_header = True
    has_filters = False
    vcf_parser = pysam.asVCF()
    for filename in args.filenames:
        with open_ro(filename, "rb") as handle:
            for line in handle:
                if not line.startswith(b"#"):
                    in_header = False
                    line = line.rstrip(b"\n\r")
                    vcf = vcf_parser(line, len(line))
                    if args.reset_filter:
                        vcf.filter = "."

                    yield vcf
                elif in_header:
                    if not (line.startswith(b"##") or has_filters):
                        has_filters = True
                        for item in sorted(
                                vcffilter.describe_filters(args).items()):
                            print('##FILTER=<ID=%s,Description="%s">' % item)

                    print(line.decode("utf-8"), end="")
Example #22
 def __init__(self, inFile, ploidy=1, parser=pysam.asVCF()):
     # using old-style class superclass calling here
     # since TabixReader is derived from pysam.TabixFile
     # which is itself an old-style class (due to Cython version?)
     TabixReader.__init__(self, inFile, parser=parser)
     # when pysam uses new-style classes, we can replace with:
     #super(VcfReader, self).__init__(inFile, parser=parser)
     assert ploidy in (1, 2)
     self.ploidy = ploidy
     self.clens = []
     self.sample_names = None
     for line in self.header:
         line = bytes_to_string(line)
         if line.startswith('##contig=<ID=') and line.endswith('>'):
             line = line[13:-1]
             c = line.split(',')[0]
             clen = int(line.split('=')[1])
             self.clens.append((c, clen))
         elif line.startswith('#CHROM'):
             row = line.split('\t')
             self.sample_names = row[9:]
     self.clens = dict(self.clens)
     assert self.sample_names
Example #23
def RunTools(args):
	cfg = Parse.generate_tools_cfg(args)
	Parse.print_tools_options(cfg)

	if not cfg['debug']:
		logging.disable(logging.CRITICAL)

	regions_df = pd.read_table(cfg['region_file'], compression='gzip' if cfg['region_file'].split('.')[-1] == 'gz' else None)
	regions_df = regions_df[regions_df['job'] == int(cfg['job'])].reset_index(drop=True)
	return_values = {}
	print ''
	print "initializing out file"
	try:
		bgzfile = bgzf.BgzfWriter(cfg['out'] + '.gz', 'wb')
	except:
		print Process.Error("failed to initialize bgzip format out file " + cfg['out'] + '.gz').out
		return 1

	if cfg['cpus'] > 1:
		pool = mp.Pool(cfg['cpus']-1)
		for i in xrange(1,cfg['cpus']):
			return_values[i] = pool.apply_async(process_regions, args=(regions_df,cfg,i,True,))
			print "submitting job on cpu " + str(i) + " of " + str(cfg['cpus'])
		pool.close()
		print "executing job for cpu " + str(cfg['cpus']) + " of " + str(cfg['cpus']) + " via main process"
		main_return = process_regions(regions_df,cfg,cfg['cpus'],True)
		pool.join()

		if 1 in [return_values[i].get() for i in return_values] or main_return == 1:
			print Process.Error("error detected, see log files").out
			return 1

	else:
		main_return = process_regions(regions_df,cfg,1,True)
		if main_return == 1:
			print Process.Error("error detected, see log files").out
			return 1

	for i in xrange(1,cfg['cpus']+1):
		try:
			logfile = open(cfg['out'] + '.cpu' + str(i) + '.log', 'r')
		except:
			print Process.Error("failed to initialize log file " + cfg['out'] + '.cpu' + str(i) + '.log').out
			return 1
		print logfile.read()
		logfile.close()
		os.remove(cfg['out'] + '.cpu' + str(i) + '.log')

	written = False
	for i in xrange(1,cfg['cpus']+1):
		cpu_regions_df = regions_df[regions_df['cpu'] == i].reset_index()
		for j in xrange(0,len(cpu_regions_df.index)):
			f_temp=glob.glob(cfg['out'] + '.cpu' + str(i) + '.chr' + cpu_regions_df['region'][j].replace(':','bp') + '*.gz')[0]
			try:
				h=pysam.TabixFile(filename=f_temp,parser=pysam.asVCF())
			except:
				print Process.Error("failed to load vcf file " + f_temp)
				return 1
			if not written:
				for row in h.header:
					bgzfile.write(str(row) + '\n')
				written = True
			h_iter = h.fetch(region=str(cpu_regions_df['chr'][j]))
			for row in h_iter:
				bgzfile.write(str(row) + '\n')
			for f in glob.glob(cfg['out'] + '.cpu' + str(i) + '.chr' + cpu_regions_df['region'][j].replace(':','bp') + '.*'):
				os.remove(f)

	bgzfile.close()

	print "indexing out file"
	try:
		pysam.tabix_index(cfg['out'] + '.gz',preset="vcf",force=True)
	except:
		print Process.Error('failed to generate index').out
		return 1

	print "process complete"
	return 0
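The script above writes with Bio.bgzf and indexes with pysam; for reference, an all-pysam sketch of the same compress-then-index round trip (file names hypothetical):

    import pysam

    # bgzip-compress a plain-text VCF, then build the .tbi index
    pysam.tabix_compress("calls.vcf", "calls.vcf.gz", force=True)
    pysam.tabix_index("calls.vcf.gz", preset="vcf", force=True)

    # the result can now be fetched region by region
    tbx = pysam.TabixFile("calls.vcf.gz", parser=pysam.asVCF())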
Example #24
def process_piece(merge_info):

    stats = {}

    try:
        output_file_left = None
        output_file_right = None

        if merge_info.output_file_left:
            output_file_left = open(merge_info.output_file_left, "w")

        if merge_info.output_file_right:
            output_file_right = open(merge_info.output_file_right, "w")

        mi = ['L']
        if merge_info.diploid:
            mi = ['L', 'R']

        LOG.info("Processing Chromosome {0}...".format(merge_info.chromosome))

        iterators = []
        discard_functions = []

        for i, file_info in enumerate(merge_info.vcf_files):
            vcf_tabix = pysam.TabixFile(file_info.file_name)
            try:
                vcf_iterator = vcf_tabix.fetch(merge_info.chromosome,
                                               parser=pysam.asVCF())
                iterators.append(vcf_iterator)
            except ValueError as ve:
                iterators.append(None)

            if file_info.discard_file:
                vcf_discard = open(file_info.discard_file, "w")

                def discard_record(rec):
                    vcf_discard.write(str(rec))
                    vcf_discard.write("\n")

                discard_functions.append(discard_record)
            else:
                discard_functions.append(lambda rec: None)

        n = 0

        line_numbers = 0
        for vcf_records in walk_vcfs_together(iterators):
            for i, vcf_record in enumerate(vcf_records):
                #LOG.debug(vcf_record)
                if vcf_record is None:
                    continue
                #LOG.debug(vcf_record.alt)
                #LOG.debug(type(vcf_record.alt))

                gt = vcf.parse_gt_tuple(vcf_record,
                                        merge_info.vcf_files[i].sample_index)
                #LOG.debug(gt)

                line_numbers = line_numbers + 1
                if gt.is_snp:
                    # snp
                    if merge_info.passed and 'PASS' not in vcf_record.filter:
                        discard_functions[i](vcf_record)

                    #LOG.debug("Processing SNP {}".format(vcf_record))
                    n += 1

                    if merge_info.quality and gt.fi == '0':
                        discard_functions[i](vcf_record)
                    elif gt.left is None or gt.right is None:
                        discard_functions[i](vcf_record)
                    else:
                        if merge_info.diploid:
                            # 0 is the same as REF and does not need to be written
                            if gt.gt_left != 0:
                                output_file_left.write(
                                    "{}_L\t{}\t{}\t{}\t{}\t{}\n".format(
                                        merge_info.chromosome,
                                        vcf_record.pos + 1, '.',
                                        vcf_record.ref, gt.left, '.'))
                            if gt.gt_right != 0:
                                output_file_right.write(
                                    "{}_R\t{}\t{}\t{}\t{}\t{}\n".format(
                                        merge_info.chromosome,
                                        vcf_record.pos + 1, '.',
                                        vcf_record.ref, gt.right, '.'))
                        else:
                            if gt.gt_left == gt.gt_right and gt.gt_left != 0:
                                # ignore heterozygotes 0/1, 1/0, only process 0/0 and 1/1
                                #LOG.debug("ACCEPTED")

                                #LOG.debug('pos {} : ref {}, left {}, right {}'.format(vcf_snp.pos, vcf_snp.ref, gt.left, gt.right))
                                output_file_left.write(
                                    "{}\t{}\t{}\t{}\t{}\t{}\n".format(
                                        merge_info.chromosome,
                                        vcf_record.pos + 1, '.',
                                        vcf_record.ref, gt.left, '.'))
                else:
                    # indel
                    LOG.debug("Processing INDEL {}".format(vcf_record))

                    if merge_info.passed and 'PASS' not in vcf_record.filter:

                        LOG.debug("TOSSED: FILTERED ON PASS")
                        LOG.debug(vcf_record)
                        stats = update_stats(stats, 'FILTERED ON PASS')

                        discard_functions[i](vcf_record)
                        continue

                    elif merge_info.quality and gt.fi == '0':

                        # FI: whether a sample was a pass (1) or fail (0) based on FILTER values

                        LOG.debug("TOSSED: FILTERED ON QUALITY")
                        LOG.debug(vcf_record)
                        stats = update_stats(stats, 'FILTERED ON QUALITY')
                        discard_functions[i](vcf_record)
                        continue

                    elif gt.left is None and gt.right is None:

                        LOG.debug("TOSSED: NO STRAIN DATA")
                        LOG.debug(vcf_record)
                        stats = update_stats(stats, 'NO STRAIN DATA')
                        LOG.debug(i)
                        LOG.debug(type(vcf_record))
                        discard_functions[i](vcf_record)
                        continue

                    elif not merge_info.diploid and gt.left != gt.right:
                        # haploid or hexaploid
                        # gt must be equal

                        LOG.debug("TOSSED: HETEROZYGOUS")
                        LOG.debug(vcf_record)
                        stats = update_stats(stats, 'HETEROZYGOUS')
                        discard_functions[i](vcf_record)
                        continue

                    # START L AND R, ONLY R IF DIPLOID

                    for l_or_r in mi:
                        #LOG.debug("******************")
                        #LOG.debug(l_or_r)
                        lr_out = ''
                        if l_or_r == 'L':
                            #LOG.debug("->LEFT")
                            lr_out = '_L' if merge_info.diploid else ''
                            alt_seq = str(gt.left)
                            stats = merge_info.stats_left
                            output_file = output_file_left
                            prev_next_ref_pos = merge_info.prev_next_ref_pos_left
                        else:
                            #LOG.debug("->RIGHT")
                            lr_out = '_R' if merge_info.diploid else ''
                            alt_seq = str(gt.right)
                            stats = merge_info.stats_right
                            output_file = output_file_right
                            prev_next_ref_pos = merge_info.prev_next_ref_pos_right

                        LOG.debug(
                            "prev_next_ref_pos={}".format(prev_next_ref_pos))

                        if gt.ref == alt_seq:

                            LOG.debug("TOSSED, REF AND ALT ARE EQUAL")
                            LOG.debug(vcf_record)
                            stats = update_stats(stats,
                                                 'REF AND ALT ARE EQUAL')
                            discard_functions[i](vcf_record)
                            continue

                        orig_alt_seq = alt_seq

                        LOG.debug("SAMPLE: {0}".format(
                            vcf_record[merge_info.vcf_files[i].sample_index]))
                        LOG.debug(
                            "REF='{0}', ALT_L='{1}', ALT_R='{2}'. POS={3}".
                            format(gt.ref, gt.left, gt.right, vcf_record.pos))

                        position = vcf_record.pos + 1

                        ref_seq = str(gt.ref)
                        len_ref = len(ref_seq)
                        len_alt = len(alt_seq)

                        base_changes = len_ref - len_alt
                        base_pos_diff = 0

                        if position < prev_next_ref_pos:
                            LOG.debug(
                                "TOSSED: VCF ROLLBACK: {0}".format(vcf_record))
                            LOG.debug(vcf_record)

                            stats = update_stats(stats, 'VCF ROLLBACK')
                            discard_functions[i](vcf_record)
                            continue

                        # find the position where the first base change is
                        for n in xrange(min(len_ref, len_alt)):
                            if ref_seq[n] != alt_seq[n]:
                                base_pos_diff = n
                                break

                        # if it is 0, take the minimum length
                        if base_pos_diff == 0:
                            base_pos_diff = min(len_ref, len_alt)

                        # add the base position difference
                        position += base_pos_diff

                        # recalculate the strings
                        shared_bases = ref_seq[:base_pos_diff]
                        ref_seq = ref_seq[base_pos_diff:]
                        alt_seq = alt_seq[base_pos_diff:]

                        dt = len(ref_seq)
                        dq = len(alt_seq)

                        next_ref_pos = position + len(ref_seq)
                        fragment_size = position - prev_next_ref_pos
                        '''

                        LOG.debug('           gt.ref: {0}'.format(gt.ref))
                        LOG.debug('          ref_seq: {0}'.format(ref_seq))
                        LOG.debug('               dt: {0}'.format(dt))
                        LOG.debug('           gt.alt: {0}'.format(orig_alt_seq))
                        LOG.debug('          alt_seq: {0}'.format(alt_seq))
                        LOG.debug('               dq: {0}'.format(dq))
                        LOG.debug('         position: {0}'.format(position))
                        LOG.debug('prev_next_ref_pos: {0}'.format(prev_next_ref_pos))
                        LOG.debug('     next_ref_pos: {0}'.format(next_ref_pos))
                        LOG.debug('    fragment_size: {0}'.format(fragment_size))
                        LOG.debug('     base_changes: {0}'.format(base_changes))
                        LOG.debug('    base_pos_diff: {0}'.format(base_pos_diff))
                        LOG.debug('     shared_bases: {0}'.format(shared_bases))
                        '''
                        # discard entries that would produce a negative fragment size
                        if fragment_size < 0:
                            #LOG.debug("TOSSED: FRAGMENT: {0}".format(vcf_record))

                            stats = update_stats(stats, 'FRAGMENT SIZE < 0')
                            discard_functions[i](vcf_record)
                            continue

                        if fragment_size != 0:
                            ref_str = ref_seq if ref_seq else '.'
                            alt_str = alt_seq if alt_seq else '.'
                            out = "{}{}\t{}\t{}\t{}\t{}\t{}\n".format(
                                merge_info.chromosome, lr_out,
                                vcf_record.pos + 1, shared_bases, ref_str,
                                alt_str, fragment_size)
                            LOG.debug(out)
                            output_file.write(out)
                        else:
                            #
                            # THIS SHOULD NOT HAPPEN
                            #
                            raise exceptions.G2GVCFError(
                                'Conflicting VCF entries')

                        stats = update_stats(stats, 'ACCEPTED')

                        if l_or_r == 'L':
                            merge_info.stats_left = stats
                            merge_info.prev_next_ref_pos_left = next_ref_pos
                            LOG.debug(
                                'setting merge_info.prev_next_ref_pos_left={}'.
                                format(merge_info.prev_next_ref_pos_left))
                        else:
                            merge_info.stats_right = stats
                            merge_info.prev_next_ref_pos_right = next_ref_pos
                            LOG.debug(
                                'setting merge_info.prev_next_ref_pos_right={}'
                                .format(merge_info.prev_next_ref_pos_right))

        if merge_info.output_file_left:
            output_file_left.close()

        if merge_info.output_file_right:
            output_file_right.close()

    except KeyboardInterrupt:
        raise exceptions.KeyboardInterruptError()
    except Exception as e:
        g2g_utils._show_error()
        raise Exception("Unknown exception")

    ret = {}
    ret['chrom'] = merge_info.chromosome
    ret['stats'] = stats
    ret['merge_info'] = merge_info
    ret['line_numbers'] = line_numbers

    return ret
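The indel branch above left-trims the bases that REF and ALT share before emitting a record; a standalone sketch of that step, mirroring the loop's logic (including its assumption that REF and ALT share their first base):

    def trim_shared_prefix(ref_seq, alt_seq):
        # find the first position where REF and ALT disagree
        base_pos_diff = 0
        for n in range(min(len(ref_seq), len(alt_seq))):
            if ref_seq[n] != alt_seq[n]:
                base_pos_diff = n
                break
        # no internal mismatch: the shorter allele is a prefix of the
        # longer one, so trim the whole overlap
        if base_pos_diff == 0:
            base_pos_diff = min(len(ref_seq), len(alt_seq))
        shared = ref_seq[:base_pos_diff]
        return shared, ref_seq[base_pos_diff:], alt_seq[base_pos_diff:]

    # trim_shared_prefix("CAAA", "CA") -> ("CA", "AA", "")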
Example #25
        except IOError:
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files?  If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "#installation.html\#installing-annotation-files\n" %
                     anno_files[anno])


# ## Standard access to Tabix indexed files

PARSERS = {
    "bed": pysam.asBed(),
    "vcf": pysam.asVCF(),
    "tuple": pysam.asTuple(),
    None: None
}


def _get_hits(coords, annotation, parser_type, _parsers=PARSERS):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    try:
        parser = _parsers[parser_type]
    except KeyError:
        raise ValueError("Unexpected parser type: %s" % parser_type)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
Example #26
def process_piece(filename_vcf, chrom, chrom_length, sample_index, chain_info, diploid, passed, quality, vcf_keep, vcf_discard_file):
    ret = {'chrom': chrom, 'stats': {}, 'chain_info': chain_info}

    stats = OrderedDict()
    stats['ACCEPTED'] = 0

    if vcf_keep:
        vcf_discard = open(vcf_discard_file, "w")

    line_no = 0

    try:
        LOG.info("Processing Chromosome {0}...".format(chrom))
        tb = pysam.TabixFile(filename_vcf)

        for vcf_rec in tb.fetch(chrom, parser=pysam.asVCF()):
            line_no += 1

            try:
                gt = parse_gt_new(vcf_rec, sample_index)
            except:
                LOG.info("Unable to parse record, improper VCF file?")
                continue

            LOG.debug('\n')
            LOG.debug(vcf_rec)
            LOG.debug(gt)
            LOG.debug(vcf_rec[sample_index])

            if passed and 'PASS' not in vcf_rec.filter:

                LOG.debug("TOSSED: FILTERED ON PASS")
                stats = update_stats(stats, 'FILTERED ON PASS')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            elif quality and gt.fi == '0':

                # FI: whether a sample was a pass (1) or fail (0) based on FILTER values

                LOG.debug("TOSSED: FILTERED ON QUALITY")
                stats = update_stats(stats, 'FILTERED ON QUALITY')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            elif gt.left is None and gt.right is None:

                LOG.debug("TOSSED: NOT RELEVANT")
                stats = update_stats(stats, 'NOT RELEVANT')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            elif not diploid and gt.left != gt.right:
                # haploid or hexaploid
                # gt must be equal

                LOG.debug("TOSSED: HETEROZYGOUS")
                stats = update_stats(stats, 'HETEROZYGOUS')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            # START L AND R, ONLY R IF DIPLOID

            for ci, lr in chain_info.iteritems():
                if ci == 'left':
                    alt_seq = str(gt.left)
                else:
                    alt_seq = str(gt.right)

                if gt.ref == alt_seq:

                    LOG.debug("TOSSED, SAME AS REF")
                    lr.stats = update_stats(lr.stats, 'SAME AS REF')

                    if vcf_keep:
                        vcf_discard.write(vcf_rec)
                        vcf_discard.write("\n")
                    continue

                orig_alt_seq = alt_seq

                LOG.debug("SAMPLE: {0}".format(vcf_rec[sample_index]))
                LOG.debug("REF='{0}', ALT_L='{1}', ALT_R='{2}'. POS={3}".format(gt.ref, gt.left, gt.right, vcf_rec.pos))

                position = vcf_rec.pos + 1

                ref_seq = str(gt.ref)
                len_ref = len(ref_seq)
                len_alt = len(alt_seq)

                base_changes = len_ref - len_alt
                base_pos_diff = 0

                if position < lr.prev_next_ref_pos:
                    LOG.debug("TOSSED: CONFLICTING VCF ENTRIES: {0}".format(vcf_rec))

                    lr.stats = update_stats(lr.stats, 'CONFLICTING VCF ENTRIES')

                    if vcf_keep:
                        vcf_discard.write(vcf_rec)
                        vcf_discard.write("\n")

                    continue

                # find the position where the first base change is
                for n in xrange(min(len_ref, len_alt)):
                    if ref_seq[n] != alt_seq[n]:
                        base_pos_diff = n
                        break

                # if it is 0, take the minimum length
                if base_pos_diff == 0:
                    base_pos_diff = min(len_ref, len_alt)

                # add the base position difference
                position += base_pos_diff

                # recalculate the strings
                shared_bases = ref_seq[:base_pos_diff]
                ref_seq = ref_seq[base_pos_diff:]
                alt_seq = alt_seq[base_pos_diff:]

                dt = len(ref_seq)
                dq = len(alt_seq)

                next_ref_pos = position + len(ref_seq)
                fragment_size = position - lr.prev_next_ref_pos

                LOG.debug('           gt.ref: {0}'.format(gt.ref))
                LOG.debug('          ref_seq: {0}'.format(ref_seq))
                LOG.debug('               dt: {0}'.format(dt))
                LOG.debug('           gt.alt: {0}'.format(orig_alt_seq))
                LOG.debug('          alt_seq: {0}'.format(alt_seq))
                LOG.debug('               dq: {0}'.format(dq))
                LOG.debug('         position: {0}'.format(position))
                LOG.debug('prev_next_ref_pos: {0}'.format(lr.prev_next_ref_pos))
                LOG.debug('     next_ref_pos: {0}'.format(next_ref_pos))
                LOG.debug('    fragment_size: {0}'.format(fragment_size))
                LOG.debug('     base_changes: {0}'.format(base_changes))
                LOG.debug('    base_pos_diff: {0}'.format(base_pos_diff))
                LOG.debug('     shared_bases: {0}'.format(shared_bases))

                # discard entries that would produce a negative fragment size
                if fragment_size < 0:
                    LOG.debug("TOSSED: CONFLICTING VCF ENTRIES: {0}".format(vcf_rec))

                    lr.stats = update_stats(lr.stats, 'CONFLICTING VCF ENTRIES')

                    if vcf_keep:
                        vcf_discard.write(vcf_rec)
                        vcf_discard.write("\n")

                    continue

                if fragment_size != 0:
                    ref_str = ref_seq if ref_seq else '.'
                    alt_str = alt_seq if alt_seq else '.'
                    lr.chain_entries.append([fragment_size, len(ref_seq), len(alt_seq), shared_bases, ref_str, alt_str, vcf_rec.pos+1])
                else:
                    #
                    # THIS SHOULD NOT HAPPEN
                    #
                    raise G2GChainFileError('Unable to create chain file due to conflicting VCF entries')

                lr.stats = update_stats(lr.stats, 'ACCEPTED')

                LOG.debug(lr.chain_entries[-1])

                last_position = position
                lr.prev_next_ref_pos = next_ref_pos
                lr.sums[0] += fragment_size
                lr.sums[1] += dt
                lr.sums[2] += dq
                prev_line = vcf_rec

                #lr.prev_chrom = vcf_rec.contig
            chain_info[ci] = lr

        for ci, lr in chain_info.iteritems():
            #LOG.debug("CHROMOSOME[{0}] LENGTH = {1}".format(lr.prev_chrom, chrom_length))

            lr.chromosome = chrom
            lr.chromosome_length = chrom_length
            lr.last_fragment_size = chrom_length - lr.sums[0] - lr.sums[1]
            lr.end_length = lr.sums[0] + lr.last_fragment_size + lr.sums[2]
            lr.number_vcf_lines = line_no

            chain_info[ci] = lr

        if vcf_keep:
            vcf_discard.close()

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
    except Exception, e:
        pass
Example #27
    def process_vcf_into_selscan_tped(cls, vcf_file, gen_map_file, outfile_location,
        outfile_prefix, chromosome_num, samples_to_include=None, start_pos_bp=None, end_pos_bp=None, ploidy=2, 
        consider_multi_allelic=True, rescale_genetic_distance=False, include_variants_with_low_qual_ancestral=False, coding_function=None, 
        multi_alleli_merge_function="AND"):
        """
            Process a bgzipped-VCF (such as those included in the Phase 3 1000 Genomes release) into a gzip-compressed
            tped file of the sort expected by selscan. 
        """
        assert ploidy > 0

        processor = VCFReader(vcf_file)

        end_pos = processor.clens[str(chromosome_num)] if end_pos_bp is None else end_pos_bp

        records = processor.records( str(chromosome_num), start_pos_bp, end_pos, pysam.asVCF())

        outTpedFile = outfile_location + "/" + outfile_prefix + ".tped.gz"
        outTpedMetaFile = outfile_location + "/" + outfile_prefix + ".tped.allele_metadata.gz"

        if samples_to_include is not None and len(samples_to_include) > 0:
            indices_of_matching_samples = sorted([processor.sample_names.index(x) for x in samples_to_include])
        else:
            indices_of_matching_samples = list(range(0,len(processor.sample_names)))

        indices_of_matching_genotypes = [(x*2, (x*2)+1) for x in indices_of_matching_samples]
        indices_of_matching_genotypes = list(np.ravel(np.array(indices_of_matching_genotypes)))

        rm = RecomMap(gen_map_file)

        for filePath in [outTpedFile, outTpedMetaFile]:
            assert not os.path.exists(filePath), "File {} already exists. Consider removing this file or specifying a different output prefix. Processing aborted.".format(filePath)

        mergeOperatorString = ""
        if multi_alleli_merge_function == "OR":
            mergeOperatorString = "|"
        if multi_alleli_merge_function == "AND":
            mergeOperatorString = "&"
        if multi_alleli_merge_function == "XOR":
            mergeOperatorString = "^"

        startTime = datetime.now()
        sec_remaining_avg = 0
        current_pos_bp = 1

        with util.file.open_or_gzopen(outTpedFile, 'wt') as of1, util.file.open_or_gzopen(outTpedMetaFile, 'wt') as of2:
            # WRITE header for metadata file here with selected subset of sample_names
            headerString = "CHROM VARIANT_ID POS_BP MAP_POS_CM REF_ALLELE ALT_ALLELE ANCESTRAL_CALL ALLELE_FREQ_IN_POP\n".replace(" ","\t")
            of2.write(headerString)

            of1linesToWrite = []
            of2linesToWrite = []

            recordLengths = set()

            recordCount = 0
            mostRecentRecordPosSeen = -1
            positionHasBeenSeenBefore = False
            previouslyCodedGenotypes = GenoRecord([])
            lineToWrite1 = None
            lineToWrite2 = None
            previousAncestral = None
            ancestralDiffersFromPrevious = True
            for record in records:
                # in some cases, there may be records with duplicate positions in the VCF file
                # to account for that we collapse rows that pass our filter and then write out the rows when we 
                # encounter a record with a new position 
                if record.pos != mostRecentRecordPosSeen:

                    if positionHasBeenSeenBefore and not consider_multi_allelic:
                        lineToWrite1 = None
                        lineToWrite2 = None

                    if lineToWrite1 is not None and lineToWrite2 is not None:
                        if len(previouslyCodedGenotypes) == ploidy*len(indices_of_matching_samples):
                            # write the output line
                            of1.write(lineToWrite1)
                            of2.write(lineToWrite2)
                               
                        else:
                            genotypesCount            = len(previouslyCodedGenotypes)
                            countSpecificTpedName     = outTpedFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                            countSpecificMetafileName = outTpedMetaFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                            with util.file.open_or_gzopen(countSpecificTpedName, 'at') as of1l, util.file.open_or_gzopen(countSpecificMetafileName, 'at') as of2l:
                                of1l.write(lineToWrite1)
                                of2l.write(lineToWrite2)

                        lineToWrite1 = None
                        lineToWrite2 = None
                    mostRecentRecordPosSeen = record.pos
                    positionHasBeenSeenBefore = False
                else:
                    positionHasBeenSeenBefore = True

                # if the variant is a SNP
                # OLD style looking at INFO VT value: 
                # processor.variant_is_type(record.info, "SNP"):
                VALID_BASES = ["A","C","G","T","a","c","g","t"] #"N","n"
                if (len(record.ref) == 1 and len(record.alt) == 1) or ( all(variant in VALID_BASES for variant in record.ref.split(",")) and 
                     all(variant in VALID_BASES for variant in record.alt.split(",")) ):

                    alternateAlleles = [record.alt]
                    if record.alt not in ['A','T','C','G']:
                        #print record.alt
                        if consider_multi_allelic:
                            pass
                            alternateAlleles = record.alt.split(",")
                        else:
                            # continue on to next variant record
                            continue

                    ancestral_allele = processor.parse_ancestral(record.info)
                    chromStr = "chr{}".format(record.contig)

                    # if the AA is populated, and the call meets the specified criteria
                    if (ancestral_allele in ['A','T','C','G']) or include_variants_with_low_qual_ancestral:#(include_variants_with_low_qual_ancestral and ancestral_allele in ['a','t','c','g',]): 
                        
                        if include_variants_with_low_qual_ancestral:
                            if ancestral_allele in ['a','t','c','g',]:
                                ancestral_allele = ancestral_allele.upper()
                            else: #if no info, encode ref as ancestral
                                ancestral_allele = record.ref


                        if previousAncestral != ancestral_allele:
                            previousAncestral = ancestral_allele
                            ancestralDiffersFromPrevious = True
                        else:
                            ancestralDiffersFromPrevious = False

                        recordString = record.__str__()

                        match = cls.genoRegex.match(recordString)
                        if match:
                            rawGenos = match.group("genos")
                            genos = rawGenos[::2]
                            recordPosition = record.pos+1
                            if chromosome_num.upper() != "X":
                                try:
                                    genotypes_for_selected_samples = operator.itemgetter(*indices_of_matching_genotypes)(genos)
                                except IndexError:
                                    # if something about this locus makes the genotype line incomplete
                                    # we can continue to the next record. This can happen if '.' is incorrectly used in place of '.|.' in phased VCFs.
                                    # anything that causes the genotype line length to mismatch with the number of genotypes
                                    print("\ngenotype line length appears to be off:")
                                    print(record)
                                    continue
                                except Exception: # if this is a record of mixed ploidy, that is to say the X chromosome
                                    # raise an exception
                                    raise
                            else:
                                matching_genotypes = np.array(record[:len(record)])[indices_of_matching_samples]
                                genotypes_for_selected_samples_split = [x.split("|") for x in matching_genotypes]
                                genotypes_for_selected_samples = [y for x in genotypes_for_selected_samples_split for y in x]

                            recordLengths.add(len(genotypes_for_selected_samples))

                            map_pos_cm = rm.physToMap(chromStr, record.pos, rescale=rescale_genetic_distance)

                            numberOfHaplotypes = float(len(genotypes_for_selected_samples))
                            
                            codingFunc = np.vectorize(coding_function)

                            coded_genotypes_for_selected_samples = GenoRecord(["0"] * len(genotypes_for_selected_samples))
                            if consider_multi_allelic:
                                #coded_genotypes_for_selected_samples = GenoRecord(["1"] * len(genotypes_for_selected_samples))
                                for idx, altAllele in enumerate(alternateAlleles):
                                    value_of_current_allele = str(idx+1)
                                    coded_genotypes_for_selected_samples_for_allele = GenoRecord(codingFunc(genotypes_for_selected_samples, record.ref, altAllele, ancestral_allele, value_of_current_allele))
                                    #coded_genotypes_for_selected_samples |= coded_genotypes_for_selected_samples_for_allele
                                    if idx==0:
                                        coded_genotypes_for_selected_samples = coded_genotypes_for_selected_samples_for_allele
                                    else:
                                        coded_genotypes_for_selected_samples = coded_genotypes_for_selected_samples.f[mergeOperatorString](coded_genotypes_for_selected_samples_for_allele)
                                    #coded_genotypes_for_selected_samples = np.array(list(str(bin(int("".join(coded_genotypes_for_selected_samples),2) | int("".join(coded_genotypes_for_selected_samples_for_allele),2)))[2:].zfill(len(coded_genotypes_for_selected_samples))))
                            else:
                                coded_genotypes_for_selected_samples = GenoRecord(codingFunc(genotypes_for_selected_samples, record.ref, record.alt, ancestral_allele, "1"))

                            # if this is the first record in the file, create an array filled with zeros for the previously coded alleles
                            if recordCount == 0:
                                previouslyCodedGenotypes = GenoRecord(["1"] * len(genotypes_for_selected_samples))
                                
                            # bitwise OR coded genotypes for duplicate records, merge variants
                            # ...except selscan logic is inverted, so bitwise AND
                            # TODO: invert?
                            # record @ pos1 = 001001
                            # record @ pos1 = 100001
                            #                -------
                            # coded result  = 101001
                            #log.debug(genotypes_for_selected_samples)
                            #log.debug(coded_genotypes_for_selected_samples)

                            if positionHasBeenSeenBefore:
                                #coded_genotypes_for_selected_samples |= previouslyCodedGenotypes
                                coded_genotypes_for_selected_samples = coded_genotypes_for_selected_samples.f[mergeOperatorString](previouslyCodedGenotypes)
                                #coded_genotypes_for_selected_samples = np.array(list(str(bin(int("".join(coded_genotypes_for_selected_samples),2) & int("".join(previouslyCodedGenotypes),2)))[2:].zfill(len(coded_genotypes_for_selected_samples))))
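                            # note (added): GenoRecord.f appears to map the configured
                            # mergeOperatorString to a bitwise merge of two coded records,
                            # e.g. "110110" with "011110" gives "111110" under __or__ and
                            # "010110" under __and__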

                            previouslyCodedGenotypes = coded_genotypes_for_selected_samples

                            allele_freq_for_pop = float(list(coded_genotypes_for_selected_samples).count("1")) / numberOfHaplotypes

                            outStrDict = cls._build_variant_output_strings(record.contig, str(1), recordPosition, 
                                map_pos_cm, coded_genotypes_for_selected_samples, record.ref, record.alt, 
                                ancestral_allele, allele_freq_for_pop)

                            lineToWrite1 = outStrDict["tpedString"]
                            lineToWrite2 = outStrDict["metadataString"].replace(" ","\t")

                            recordCount += 1
                            current_pos_bp = int(recordPosition)

                            if recordCount % 1000 == 0:
                                number_of_seconds_elapsed = (datetime.now() - startTime).total_seconds()
                                bp_per_sec = float(current_pos_bp) / float(number_of_seconds_elapsed)
                                bp_remaining = end_pos - current_pos_bp
                                sec_remaining = bp_remaining / bp_per_sec
                                sec_remaining_avg = cls._moving_avg(sec_remaining, sec_remaining_avg, 10)
                                time_left = timedelta(seconds=sec_remaining_avg)
                            
                                if sec_remaining > 10:
                                    human_time_remaining = relative_time(datetime.utcnow()+time_left)
                                    print("")
                                    print(("Completed: {:.2%}".format(float(current_pos_bp)/float(end_pos))))
                                    print(("Estimated time of completion: {}".format(human_time_remaining)))
                                    #log.info("Genotype counts found: %s", str(list(recordLengths)))


            if positionHasBeenSeenBefore and not consider_multi_allelic:
                lineToWrite1 = None
                lineToWrite2 = None

            if lineToWrite1 is not None and lineToWrite2 is not None:
                if len(previouslyCodedGenotypes) == ploidy*len(indices_of_matching_samples):
                    # write the output lines
                    of1.write(lineToWrite1)
                    of2.write(lineToWrite2)
                       
                else:
                    genotypesCount            = len(previouslyCodedGenotypes)
                    countSpecificTpedName     = outTpedFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                    countSpecificMetafileName = outTpedMetaFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                    with util.file.open_or_gzopen(countSpecificTpedName, 'a') as of1l, util.file.open_or_gzopen(countSpecificMetafileName, 'a') as of2l:
                        of1l.write(lineToWrite1)
                        of2l.write(lineToWrite2)

            log.info("Genotype counts found: %s", str(list(recordLengths)))
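
The coding step in this example reduces each haplotype character to '0'/'1' relative to the ancestral allele. A minimal sketch of a compatible five-argument coding function follows (the call signature matches the codingFunc(...) invocations above, but the body is an assumption, not the project's coding_function):

import numpy as np

def code_genotype(geno, ref, alt, ancestral, alt_value="1"):
    # hypothetical stand-in for coding_function: in a phased VCF genotype,
    # '0' denotes the REF allele and '1' the ALT allele; emit alt_value when
    # the carried allele differs from the ancestral allele, '0' otherwise
    allele = ref if geno == "0" else alt
    return alt_value if allele != ancestral else "0"

codingFunc = np.vectorize(code_genotype)
print(codingFunc(np.array(list("0101")), "A", "G", "A"))  # -> ['0' '1' '0' '1']

np.vectorize broadcasts the scalar ref/alt/ancestral arguments across the genotype array, which is how the example codes all selected haplotypes in one call.
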
Example #37
0
def RunSnvplot(args):
	cfg = Parse.generate_snvplot_cfg(args)
	Parse.print_snvplot_options(cfg)

	if not cfg['debug']:
		logging.disable(logging.CRITICAL)

	ro.r('suppressMessages(library(ggplot2))')
	ro.r('suppressMessages(library(grid))')

	handle=pysam.TabixFile(filename=cfg['file'],parser=pysam.asVCF())
	header = [x for x in handle.header]
	skip_rows = len(header)-1
	cols = header[-1].split()
	pcols = cfg['pcol'].split(',')
	cols_extract = [cfg['chrcol'],cfg['bpcol']] + pcols
	if cfg['qq_strat_freq']:
		if cfg['freqcol'] not in cols:
			print Process.Error("frequency column " + cfg['freqcol'] + " not found, unable to proceed with frequency stratified plots").out
			return 1
		else:
			cols_extract = cols_extract + [cfg['freqcol']]
			print "frequency column " + cfg['freqcol'] + " found"
	if cfg['qq_strat_mac']:
		if cfg['maccol'] not in cols:
			print Process.Error("minor allele count column " + cfg['maccol'] + " not found, unable to proceed with minor allele count stratified plots").out
			return 1
		else:
			cols_extract = cols_extract + [cfg['maccol']]
			print "minor allele count column " + cfg['maccol'] + " found"

	print "importing data"
	r = pd.read_table(cfg['file'],sep='\t',skiprows=skip_rows,usecols=cols_extract,compression='gzip')
	print str(r.shape[0]) + " total variants found"

	for pcol in pcols:
		print "plotting p-values for column " + pcol + " ..."
		results = r[[cfg['chrcol'],cfg['bpcol'],cfg['freqcol'],pcol]] if cfg['freqcol'] in r else r[[cfg['chrcol'],cfg['bpcol'],pcol]]
		results.dropna(inplace=True)
		results = results[(results[pcol] > 0) & (results[pcol] <= 1)].reset_index(drop=True)
		print "   " + str(results.shape[0]) + " variants with plottable p-values"

		results['logp'] = -1 * np.log10(results[pcol]) + 0.0

		ro.globalenv['results'] = results
		l = np.median(scipy.chi2.ppf([1-x for x in results[pcol].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		# in R: median(qchisq(results$p, df=1, lower.tail=FALSE))/qchisq(0.5,1)
		print "   genomic inflation (all variants) = " + str(l)

		if cfg['qq']:
			print "   generating standard qq plot"
			print "   minimum p-value: " + str(np.min(results[pcol]))
			a = -1 * np.log10(ro.r('ppoints(' + str(len(results.index)) + ')'))
			a.sort()
			results.sort_values(by=['logp'], inplace=True)
			print "   maximum -1*log10(p-value): " + str(np.max(results['logp']))

			ci_upper = -1 * np.log10(scipy.beta.ppf(0.95, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1)))
			ci_upper.sort()
			ci_lower = -1 * np.log10(scipy.beta.ppf(0.05, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1)))
			ci_lower.sort()
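			# note (added): the i-th smallest of n uniform p-values follows a
			# Beta(i, n - i + 1) distribution, so these 5th/95th beta percentiles
			# trace a pointwise 90% confidence band for the expected quantiles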
			
			ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(results['logp']), 'ci_lower': ro.FloatVector(ci_lower), 'ci_upper': ro.FloatVector(ci_upper)})
			dftext_label = 'lambda %~~% ' + str(l)
			ro.globalenv['dftext'] = ro.DataFrame({'x': ro.r('Inf'), 'y': 0.5, 'lab': dftext_label})

			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.tiff')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white",horizontal=TRUE)' % (cfg['out'] + '.' + pcol + '.qq.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.pdf')
			ro.r("""
				gp<-ggplot(df)
				pp<-gp + 
					aes_string(x='a',y='b') +
					geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + 
					geom_point(size=2) +
					geom_abline(intercept=0, slope=1, alpha=0.5) + 
					scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
					scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
					coord_fixed() +
					theme_bw(base_size = 12) + 
					geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) +
					theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', 
						panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
						panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
				%s
				""" % (ggsave))

			if np.max(results['logp']) > cfg['crop']:
				print "   generating cropped standard qq plot"
				ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1')
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.tiff')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white",horizontal=TRUE)' % (cfg['out'] + '.' + pcol + '.qq.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.pdf')
				ro.r("""
					gp<-ggplot(df)
					pp<-gp + 
						aes_string(x='a',y='b') +
						geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + 
						geom_point(aes(shape=factor(shape)),size=2) +
						geom_abline(intercept=0, slope=1, alpha=0.5) + 
						scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
						scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
						coord_fixed() +
						theme_bw(base_size = 12) + 
						geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) +
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', 
							panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
							panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
					%s
					""" % (ggsave))

		if cfg['qq_strat_freq']:
			print "   generating frequency stratified qq plot"
			
			
			
			strat_ticks = [0.005, 0.01, 0.03, 0.05]
			
			
			
			
			results['UGA___QQ_BIN___'] = 'E'
			
			results.loc[(results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] <= 0.99),'UGA___QQ_BIN___'] = 'D'
			results.loc[(results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] <= 0.97),'UGA___QQ_BIN___'] = 'C'
			results.loc[(results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] <= 0.95),'UGA___QQ_BIN___'] = 'B'
			results.loc[(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9),'UGA___QQ_BIN___'] = 'A'
			lA='NA'
			lB='NA'
			lC='NA'
			lD='NA'
			lE='NA'
			lE_n=len(results[pcol][(results[cfg['freqcol']] < 0.01) | (results[cfg['freqcol']] > 0.99)])
			lD_n=len(results[pcol][((results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] < 0.03)) | ((results[cfg['freqcol']] <= 0.99) & (results[cfg['freqcol']] > 0.97))])
			lC_n=len(results[pcol][((results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] < 0.05)) | ((results[cfg['freqcol']] <= 0.97) & (results[cfg['freqcol']] > 0.95))])
			lB_n=len(results[pcol][((results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] < 0.1)) | ((results[cfg['freqcol']] <= 0.95) & (results[cfg['freqcol']] > 0.9))])
			lA_n=len(results[pcol][(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9)])
			if lE_n > 0:
				lE=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['freqcol']] < 0.01) | (results[cfg['freqcol']] > 0.99)].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
			if lD_n > 0:
				lD=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['freqcol']] >= 0.01) & (results[cfg['freqcol']] < 0.03)) | ((results[cfg['freqcol']] <= 0.99) & (results[cfg['freqcol']] > 0.97))].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
			if lC_n > 0:
				lC=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['freqcol']] >= 0.03) & (results[cfg['freqcol']] < 0.05)) | ((results[cfg['freqcol']] <= 0.97) & (results[cfg['freqcol']] > 0.95))].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
			if lB_n > 0:
				lB=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['freqcol']] >= 0.05) & (results[cfg['freqcol']] < 0.1)) | ((results[cfg['freqcol']] <= 0.95) & (results[cfg['freqcol']] > 0.9))].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
			if lA_n > 0:
				lA=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['freqcol']] >= 0.1) & (results[cfg['freqcol']] <= 0.9)].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
			print "   genomic inflation (MAF >= 10%, n=" + str(lA_n) + ") = " + str(lA)
			print "   genomic inflation (5% <= MAF < 10%, n=" + str(lB_n) + ") = " + str(lB)
			print "   genomic inflation (3% <= MAF < 5%, n=" + str(lC_n) + ") = " + str(lC)
			print "   genomic inflation (1% <= MAF < 3%, n=" + str(lD_n) + ") = " + str(lD)
			print "   genomic inflation (MAF < 1%, n=" + str(lE_n) + ") = " + str(lE)

			a = np.array([])
			b = np.array([])
			c = np.array([])
			results.sort_values(by=['logp'], inplace=True)
			if len(results[results['UGA___QQ_BIN___'] == 'E'].index) > 0:
				aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'E'].index)) + ')'))
				aa.sort()
				bb = results['logp'][results['UGA___QQ_BIN___'] == 'E']
				#bb.sort()
				cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'E']
				a = np.append(a,aa)
				b = np.append(b,bb)
				c = np.append(c,cc)
				print "   minimum p-value (MAF < 1%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'E']))
				print "   maximum -1*log10(p-value) (MAF < 1%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'E']))
			if len(results[results['UGA___QQ_BIN___'] == 'D'].index) > 0:
				aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'D'].index)) + ')'))
				aa.sort()
				bb = results['logp'][results['UGA___QQ_BIN___'] == 'D']
				#bb.sort()
				cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'D']
				a = np.append(a,aa)
				b = np.append(b,bb)
				c = np.append(c,cc)
				print "   minimum p-value (1% <= MAF < 3%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'D']))
				print "   maximum -1*log10(p-value) (1% <= MAF < 3%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'D']))
			if len(results[results['UGA___QQ_BIN___'] == 'C'].index) > 0:
				aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'C'].index)) + ')'))
				aa.sort()
				bb = results['logp'][results['UGA___QQ_BIN___'] == 'C']
				#bb.sort()
				cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'C']
				a = np.append(a,aa)
				b = np.append(b,bb)
				c = np.append(c,cc)
				print "   minimum p-value (3% <= MAF < 5%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'C']))
				print "   maximum -1*log10(p-value) (3% <= MAF < 5%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'C']))
			if len(results[results['UGA___QQ_BIN___'] == 'B'].index) > 0:
				aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'B'].index)) + ')'))
				aa.sort()
				bb = results['logp'][results['UGA___QQ_BIN___'] == 'B']
				#bb.sort()
				cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'B']
				a = np.append(a,aa)
				b = np.append(b,bb)
				c = np.append(c,cc)
				print "   minimum p-value (5% <= MAF < 10%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'B']))
				print "   maximum -1*log10(p-value) (5% <= MAF < 10%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'B']))
			if len(results[results['UGA___QQ_BIN___'] == 'A'].index) > 0:
				aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA___QQ_BIN___'] == 'A'].index)) + ')'))
				aa.sort()
				bb = results['logp'][results['UGA___QQ_BIN___'] == 'A']
				#bb.sort()
				cc = results['UGA___QQ_BIN___'][results['UGA___QQ_BIN___'] == 'A']
				a = np.append(a,aa)
				b = np.append(b,bb)
				c = np.append(c,cc)
				print "   minimum p-value (MAF >= 10%): " + str(np.min(results[pcol][results['UGA___QQ_BIN___'] == 'A']))
				print "   maximum -1*log10(p-value) (MAF >= 10%): " + str(np.max(results['logp'][results['UGA___QQ_BIN___'] == 'A']))
        
			ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(b), 'UGA___QQ_BIN___': ro.StrVector(c)})
        
			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.tiff')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=TRUE)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.pdf')
			ro.r("""
				gp<-ggplot(df, aes_string(x='a',y='b')) +
					geom_point(aes_string(color='UGA___QQ_BIN___'), size=2) +
					scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) +
					geom_abline(intercept=0, slope=1, alpha=0.5) + 
					scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
					scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
					coord_fixed() +
					theme_bw(base_size = 12) + 
					theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
						legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), 
						legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
						panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
				%s
				""" % (ggsave))
        
			if np.max(results['logp']) > cfg['crop']:
				print "   generating cropped frequency stratified qq plot"
				ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1')
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.tiff')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=TRUE)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.pdf')
				ro.r("""
					gp<-ggplot(df, aes_string(x='a',y='b')) +
						geom_point(aes(shape=factor(shape), color=UGA___QQ_BIN___), size=2) +
						scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) +
						geom_abline(intercept=0, slope=1, alpha=0.5) + 
						scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
						scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
						coord_fixed() +
						theme_bw(base_size = 12) + 
						guides(shape=FALSE) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
							legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), 
							legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
							panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
					%s
					""" % (ggsave))
					
		#if cfg['qq_strat_mac']:
		#	print "   generating minor allele count stratified qq plot"
		#
		#	results['UGA_MAC'] = 'E'
		#	results.loc[results[cfg['maccol']] < 5,'UGA_MAC'] = 'D'
		#	results.loc[(results[cfg['maccol']] >= 0.03) & (results[cfg['maccol']] <= 0.97),'UGA_MAC'] = 'C'
		#	results.loc[(results[cfg['maccol']] >= 0.05) & (results[cfg['maccol']] <= 0.95),'UGA_MAC'] = 'B'
		#	results.loc[(results[cfg['maccol']] >= 0.1) & (results[cfg['maccol']] <= 0.9),'UGA_MAC'] = 'A'
		#	lA='NA'
		#	lB='NA'
		#	lC='NA'
		#	lD='NA'
		#	lE='NA'
		#	lE_n=len(results[pcol][(results[cfg['maccol']] < 0.01) | (results[cfg['maccol']] > 0.99)])
		#	lD_n=len(results[pcol][((results[cfg['maccol']] >= 0.01) & (results[cfg['maccol']] < 0.03)) | ((results[cfg['maccol']] <= 0.99) & (results[cfg['maccol']] > 0.97))])
		#	lC_n=len(results[pcol][((results[cfg['maccol']] >= 0.03) & (results[cfg['maccol']] < 0.05)) | ((results[cfg['maccol']] <= 0.97) & (results[cfg['maccol']] > 0.95))])
		#	lB_n=len(results[pcol][((results[cfg['maccol']] >= 0.05) & (results[cfg['maccol']] < 0.1)) | ((results[cfg['maccol']] <= 0.95) & (results[cfg['maccol']] > 0.9))])
		#	lA_n=len(results[pcol][(results[cfg['maccol']] >= 0.1) & (results[cfg['maccol']] <= 0.9)])
		#	if lE_n > 0:
		#		lE=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['maccol']] < 0.01) | (results[cfg['maccol']] > 0.99)].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		#	if lD_n > 0:
		#		lD=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['maccol']] >= 0.01) & (results[cfg['maccol']] < 0.03)) | ((results[cfg['maccol']] <= 0.99) & (results[cfg['maccol']] > 0.97))].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		#	if lC_n > 0:
		#		lC=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['maccol']] >= 0.03) & (results[cfg['maccol']] < 0.05)) | ((results[cfg['maccol']] <= 0.97) & (results[cfg['maccol']] > 0.95))].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		#	if lB_n > 0:
		#		lB=np.median(scipy.chi2.ppf([1-x for x in results[pcol][((results[cfg['maccol']] >= 0.05) & (results[cfg['maccol']] < 0.1)) | ((results[cfg['maccol']] <= 0.95) & (results[cfg['maccol']] > 0.9))].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		#	if lA_n > 0:
		#		lA=np.median(scipy.chi2.ppf([1-x for x in results[pcol][(results[cfg['maccol']] >= 0.1) & (results[cfg['maccol']] <= 0.9)].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		#	print "   genomic inflation (MAF >= 10%, n=" + str(lA_n) + ") = " + str(lA)
		#	print "   genomic inflation (5% <= MAF < 10%, n=" + str(lB_n) + ") = " + str(lB)
		#	print "   genomic inflation (3% <= MAF < 5%, n=" + str(lC_n) + ") = " + str(lC)
		#	print "   genomic inflation (1% <= MAF < 3%, n=" + str(lD_n) + ") = " + str(lD)
		#	print "   genomic inflation (MAF < 1%, n=" + str(lE_n) + ") = " + str(lE)
        #
		#	a = np.array([])
		#	b = np.array([])
		#	c = np.array([])
		#	results.sort_values(by=['logp'], inplace=True)
		#	if len(results[results['UGA_MAC'] == 'E'].index) > 0:
		#		aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'E'].index)) + ')'))
		#		aa.sort()
		#		bb = results['logp'][results['UGA_MAC'] == 'E']
		#		#bb.sort()
		#		cc = results['UGA_MAC'][results['UGA_MAC'] == 'E']
		#		a = np.append(a,aa)
		#		b = np.append(b,bb)
		#		c = np.append(c,cc)
		#		print "   minimum p-value (MAF < 1%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'E']))
		#		print "   maximum -1*log10(p-value) (MAF < 1%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'E']))
		#	if len(results[results['UGA_MAC'] == 'D'].index) > 0:
		#		aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'D'].index)) + ')'))
		#		aa.sort()
		#		bb = results['logp'][results['UGA_MAC'] == 'D']
		#		#bb.sort()
		#		cc = results['UGA_MAC'][results['UGA_MAC'] == 'D']
		#		a = np.append(a,aa)
		#		b = np.append(b,bb)
		#		c = np.append(c,cc)
		#		print "   minimum p-value (1% <= MAF < 3%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'D']))
		#		print "   maximum -1*log10(p-value) (1% <= MAF < 3%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'D']))
		#	if len(results[results['UGA_MAC'] == 'C'].index) > 0:
		#		aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'C'].index)) + ')'))
		#		aa.sort()
		#		bb = results['logp'][results['UGA_MAC'] == 'C']
		#		#bb.sort()
		#		cc = results['UGA_MAC'][results['UGA_MAC'] == 'C']
		#		a = np.append(a,aa)
		#		b = np.append(b,bb)
		#		c = np.append(c,cc)
		#		print "   minimum p-value (3% <= MAF < 5%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'C']))
		#		print "   maximum -1*log10(p-value) (3% <= MAF < 5%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'C']))
		#	if len(results[results['UGA_MAC'] == 'B'].index) > 0:
		#		aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'B'].index)) + ')'))
		#		aa.sort()
		#		bb = results['logp'][results['UGA_MAC'] == 'B']
		#		#bb.sort()
		#		cc = results['UGA_MAC'][results['UGA_MAC'] == 'B']
		#		a = np.append(a,aa)
		#		b = np.append(b,bb)
		#		c = np.append(c,cc)
		#		print "   minimum p-value (5% <= MAF < 10%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'B']))
		#		print "   maximum -1*log10(p-value) (5% <= MAF < 10%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'B']))
		#	if len(results[results['UGA_MAC'] == 'A'].index) > 0:
		#		aa = -1 * np.log10(ro.r('ppoints(' + str(len(results[results['UGA_MAC'] == 'A'].index)) + ')'))
		#		aa.sort()
		#		bb = results['logp'][results['UGA_MAC'] == 'A']
		#		#bb.sort()
		#		cc = results['UGA_MAC'][results['UGA_MAC'] == 'A']
		#		a = np.append(a,aa)
		#		b = np.append(b,bb)
		#		c = np.append(c,cc)
		#		print "   minimum p-value (MAF >= 10%): " + str(np.min(results[pcol][results['UGA_MAC'] == 'A']))
		#		print "   maximum -1*log10(p-value) (MAF >= 10%): " + str(np.max(results['logp'][results['UGA_MAC'] == 'A']))
        #
		#	ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(b), 'UGA_MAC': ro.StrVector(c)})
        #
		#	if cfg['ext'] == 'tiff':
		#		ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat.tiff')
		#	elif cfg['ext'] == 'eps':
		#		ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat.eps')
		#	else:
		#		ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat.pdf')
		#	ro.r("""
		#		gp<-ggplot(df, aes_string(x='a',y='b')) +
		#			geom_point(aes_string(color='UGA_MAC'), size=2) +
		#			scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) +
		#			geom_abline(intercept=0, slope=1, alpha=0.5) + 
		#			scale_x_continuous(expression(Expected~~-log[10](italic(p)))) +
		#			scale_y_continuous(expression(Observed~~-log[10](italic(p)))) +
		#			theme_bw(base_size = 12) + 
		#			theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
		#				legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), 
		#				legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
		#				panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
		#		%s
		#		""" % (ggsave))
        #
		#	if np.max(results['logp']) > cfg['crop']:
		#		print "   generating cropped frequency stratified qq plot"
		#		ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
		#		ro.r('df$shape<-0')
		#		ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1')
		#		if cfg['ext'] == 'tiff':
		#			ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat.cropped.tiff')
		#		elif cfg['ext'] == 'eps':
		#			ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white",horizontal=True)' % (cfg['out'] + '.' + pcol + '.qq_strat.cropped.eps')
		#		else:
		#			ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat.cropped.pdf')
		#		ro.r("""
		#			gp<-ggplot(df, aes_string(x='a',y='b')) +
		#				geom_point(aes(shape=factor(shape), color=UGA_MAC), size=2) +
		#				scale_colour_manual(values=c("E"="#a8ddb5", "D"="#7bccc4", "C"="#4eb3d3", "B"="#2b8cbe", "A"="#08589e"), labels=c("E"="MAF < 1%%","D"="1%% <= MAF < 3%%","C"="3%% <= MAF < 5%%","B"="5%% <= MAF < 10%%","A"="MAF >= 10%%")) +
		#				geom_abline(intercept=0, slope=1, alpha=0.5) + 
		#				scale_x_continuous(expression(Expected~~-log[10](italic(p)))) +
		#				scale_y_continuous(expression(Observed~~-log[10](italic(p)))) +
		#				theme_bw(base_size = 12) + 
		#				guides(shape=FALSE) + 
		#				theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
		#					legend.key.height = unit(0.1,"in"), legend.text = element_text(size=5), legend.key = element_blank(), legend.justification = c(0,1), 
		#					legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
		#					panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
		#			%s
		#			""" % (ggsave))

		if cfg['mht']:
			print "   generating standard manhattan plot"
			print "   minimum p-value: " + str(np.min(results[pcol]))
			print "   maximum -1*log10(p-value): " + str(np.max(results['logp']))
			if cfg['gc'] and l > 1:
				print "   adjusting p-values for genomic inflation for p-value column " + pcol
				results[pcol]=2 * scipy.norm.cdf(-1 * np.abs(scipy.norm.ppf(0.5*results[pcol]) / math.sqrt(l)))
				results['logp'] = -1 * np.log10(results[pcol])
				print "   minimum post-gc adjustment p-value: " + str(np.min(results[pcol]))
				print "   maximum post-gc adjustment -1*log10(p-value): " + str(np.max(results['logp']))
			else:
				print "   skipping genomic inflation correction"

			print "   calculating genomic positions"
			results.sort_values(by=[cfg['chrcol'],cfg['bpcol']], inplace=True)
			ticks = []
			lastbase = 0
			results['gpos'] = 0
			nchr = len(list(np.unique(results[cfg['chrcol']].values)))
			chrs = np.unique(results[cfg['chrcol']].values)
			if cfg['color']:
				colours = ["#08306B","#41AB5D","#000000","#F16913","#3F007D","#EF3B2C","#08519C","#238B45","#252525","#D94801","#54278F","#CB181D","#2171B5","#006D2C","#525252","#A63603","#6A51A3","#A50F15","#4292C6","#00441B","#737373","#7F2704","#807DBA","#67000D"]
			else:
				colours = ["#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3"]
			if nchr == 1:
				results['gpos'] = results[cfg['bpcol']]
				results['colours'] = "#08589e"
				# choose a tick spacing appropriate to the span of positions plotted
				span = results['gpos'].max() - results['gpos'].min()
				for limit, step in [(1000,100),(10000,1000),(100000,10000),(200000,20000),
						(300000,30000),(400000,40000),(500000,50000),(600000,60000),
						(700000,70000),(800000,80000),(900000,90000),(1000000,100000),
						(10000000,1000000),(100000000,10000000)]:
					if span <= limit:
						break
				else:
					step = 25000000
				ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % step == 0]
			else:
				results['colours'] = "#000000"
				for i in range(len(chrs)):
					print "      processed chromosome " + str(int(chrs[i]))
					if i == 0:
						results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]
					else:
						lastbase = lastbase + results.loc[results[cfg['chrcol']] == chrs[i-1],cfg['bpcol']].iloc[-1]
						results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = (results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]) + lastbase
					if results.loc[results[cfg['chrcol']] == chrs[i]].shape[0] > 1:
						ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0] + (results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[-1] - results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])/2)
					else:
						ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])
					results.loc[results[cfg['chrcol']] == chrs[i],'colours'] = colours[int(chrs[i])]
			results['logp'] = -1 * np.log10(results[pcol])
			if results.shape[0] >= 1000000:
				sig = 5.4e-8
			else:
				sig = 0.05 / results.shape[0]
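			# note (added): 0.05 / n is a Bonferroni correction over the variants
			# plotted; the fixed 5.4e-8 threshold takes over at genome-wide scale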
			print "   significance level set to p-value = " + str(sig) + " (-1*log10(p-value) = " + str(-1 * np.log10(sig)) + ")"
			print "   " + str(len(results[pcol][results[pcol] <= sig])) + " genome wide significant variants"
			chr = results[cfg['chrcol']][0]
			maxy=int(max(np.ceil(-1 * np.log10(sig)),np.ceil(results['logp'].max())))
			if maxy > 20:
				y_breaks = range(0,maxy,5)
				y_labels = range(0,maxy,5)
			else:
				y_breaks = range(0,maxy)
				y_labels = range(0,maxy)
			ro.globalenv['df'] = ro.DataFrame({'gpos': ro.FloatVector(results['gpos']), 'logp': ro.FloatVector(results['logp']), 'colours': ro.FactorVector(results['colours'])})
			ro.globalenv['ticks'] = ro.FloatVector(ticks)
			ro.globalenv['labels'] = ro.Vector(["{:,}".format(x/1000) for x in ticks])
			ro.globalenv['colours'] = ro.StrVector(colours)
			ro.globalenv['chrs'] = ro.FloatVector(chrs)

			print "   generating manhattan plot"
			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.tiff')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white",horizontal=TRUE)' % (cfg['out'] + '.' + pcol + '.mht.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.pdf')
			if nchr == 1:
				ro.r("""
					gp<-ggplot(df, aes_string(x='gpos',y='logp')) +
						geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
						geom_point(size=1.5) + 
						scale_x_continuous(expression(Chromosome~~%d~~(kb)),breaks=ticks,labels=labels) + 
						scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
						theme_bw(base_size = 8) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
								panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
								panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), 
								axis.text = element_text(size=12), legend.position = 'none')
					%s
					""" % (sig, chr, maxy, maxy, ggsave))
			else:
				ro.r("""
					gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + 
						geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
						geom_point(size=1.5) + 
						scale_colour_manual(values=colours) + 
						scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + 
						scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
						theme_bw(base_size = 8) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
								panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
								panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), 
								axis.text = element_text(size=12), legend.position = 'none')
					%s
					""" % (sig, maxy, maxy, ggsave))

			if maxy > cfg['crop']:
				maxy = cfg['crop']
				ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1')
				print "   generating cropped manhattan plot"
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.tiff')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white",horizontal=TRUE)' % (cfg['out'] + '.' + pcol + '.mht.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.cropped.pdf')
				if nchr == 1:
					ro.r("""
						gp<-ggplot(df, aes_string(x='gpos',y='logp')) +
							geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
							geom_point(aes(shape=factor(shape)),size=1.5) + 
							scale_x_continuous(expression(Chromosome~~%d~~(kb)),breaks=ticks,labels=labels) + 
							scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
							theme_bw(base_size = 8) + 
							theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
									panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
									panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), 
									axis.text = element_text(size=12), legend.position = 'none')
						%s
						""" % (sig, chr, maxy, maxy, ggsave))
				else:
					ro.r("""
						gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + 
							geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
							geom_point(aes(shape=factor(shape)),size=1.5) + 
							scale_colour_manual(values=colours) + 
							scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + 
							scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
							theme_bw(base_size = 8) + 
							theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
									panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
									panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=8), 
									axis.text = element_text(size=12), legend.position = 'none')
						%s
						""" % (sig, maxy, maxy, ggsave))

	print "process complete"
	return 0
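
Both RunSnvplot variants estimate genomic inflation the same way: each p-value is converted to a 1-df chi-square statistic and the observed median is divided by the expected median of chi-square(1). A self-contained sketch of that calculation (written against scipy.stats directly; the examples above appear to import scipy.stats under the name scipy):

import numpy as np
from scipy import stats

def genomic_inflation(pvals):
    # median observed chi-square(1) statistic over its expected median (~0.4549)
    chisq = stats.chi2.ppf(1 - np.asarray(pvals), df=1)
    return np.median(chisq) / stats.chi2.ppf(0.5, df=1)

# uniform p-values (no inflation) should give a lambda close to 1
print(genomic_inflation(np.random.uniform(size=100000)))
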
Example #38
0
 def __init__(self, file_path, parser=pysam.asVCF()):
     self.vcf_file_path = file_path
     self.tabix_file    = pysam.TabixFile(file_path, parser=parser)
     self.sample_names  = self.read_sample_names()
     self.clens         = self.contig_lengths()
     self.indexDelta    = -1 if tuple(map(int, pysam.__version__.split('.'))) > (0,5) else 0
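
The indexDelta above presumably compensates for a coordinate-convention change between pysam releases. Note that the check compares tuples of ints rather than version strings, which avoids the lexicographic trap illustrated below:

# '0.10.0' < '0.9.0' when compared as strings, but not as int tuples
version = tuple(map(int, '0.10.0'.split('.')))
assert version > (0, 9) and version > (0, 5)
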
Example #39
0
def RunSnvplot(args):
	cfg = Parse.generate_snvplot_cfg(args)
	Parse.print_snvplot_options(cfg)

	if not cfg['debug']:
		logging.disable(logging.CRITICAL)

	ro.r('suppressMessages(library(ggplot2))')
	ro.r('suppressMessages(library(grid))')
	ro.r('suppressMessages(library(RColorBrewer))')

	handle=pysam.TabixFile(filename=cfg['file'],parser=pysam.asVCF())
	header = [x for x in handle.header]
	skip_rows = len(header)-1
	cols = header[-1].split()
	pcols = cfg['pcol'].split(',')
	cols_extract = [cfg['chrcol'],cfg['bpcol']] + pcols
	if cfg['qq_strat_freq']:
		if cfg['freqcol'] not in cols:
			print Process.Error("frequency column " + cfg['freqcol'] + " not found, unable to proceed with frequency stratified plots").out
			return 1
		else:
			cols_extract = cols_extract + [cfg['freqcol']]
			print "frequency column " + cfg['freqcol'] + " found"
	if cfg['qq_strat_mac']:
		if cfg['maccol'] not in cols:
			print Process.Error("minor allele count column " + cfg['maccol'] + " not found, unable to proceed with minor allele count stratified plots").out
			return 1
		else:
			cols_extract = cols_extract + [cfg['maccol']]
			print "minor allele count column " + cfg['maccol'] + " found"

	print "importing data"
	r = pd.read_table(cfg['file'],sep='\t',skiprows=skip_rows,usecols=cols_extract,compression='gzip')
	print str(r.shape[0]) + " total variants found"

	for pcol in pcols:
		print "plotting p-values for column " + pcol + " ..."
		extract_cols = [cfg['chrcol'],cfg['bpcol'],pcol]
		if cfg['freqcol'] in r:
			extract_cols = extract_cols + [cfg['freqcol']]
		if cfg['maccol'] in r:
			extract_cols = extract_cols + [cfg['maccol']]
		results = r[extract_cols]
		results.dropna(inplace=True)
		results = results[(results[pcol] > 0) & (results[pcol] <= 1)].reset_index(drop=True)
		print "   " + str(results.shape[0]) + " variants with plottable p-values"

		results['logp'] = -1 * np.log10(results[pcol]) + 0.0

		ro.globalenv['results'] = results
		l = np.median(scipy.chi2.ppf([1-x for x in results[pcol].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
		# in R: median(qchisq(results$p, df=1, lower.tail=FALSE))/qchisq(0.5,1)
		print "   genomic inflation (all variants) = " + str(l)

		if cfg['qq']:
			print "   generating standard qq plot"
			print "   minimum p-value: " + str(np.min(results[pcol]))
			a = -1 * np.log10(ro.r('ppoints(' + str(len(results.index)) + ')'))
			a.sort()
			results.sort_values(by=['logp'], inplace=True)
			print "   maximum -1*log10(p-value): " + str(np.max(results['logp']))

			ci_upper = -1 * np.log10(scipy.beta.ppf(0.95, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1)))
			ci_upper.sort()
			ci_lower = -1 * np.log10(scipy.beta.ppf(0.05, range(1,len(results[pcol]) + 1), range(len(results[pcol]),0,-1)))
			ci_lower.sort()
			
			ro.globalenv['df'] = ro.DataFrame({'a': ro.FloatVector(a), 'b': ro.FloatVector(results['logp']), 'ci_lower': ro.FloatVector(ci_lower), 'ci_upper': ro.FloatVector(ci_upper)})
			dftext_label = 'lambda %~~% ' + str(round(l,3))
			ro.globalenv['dftext'] = ro.DataFrame({'x': ro.r('Inf'), 'y': 0.5, 'lab': dftext_label})

			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.tiff')
			elif cfg['ext'] == 'png':
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.png')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.pdf')
			ro.r("""
				gp<-ggplot(df)
				pp<-gp + 
					aes_string(x='a',y='b') +
					geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + 
					geom_point(size=2) +
					geom_abline(intercept=0, slope=1, alpha=0.5) + 
					scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
					scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
					coord_fixed() +
					theme_bw(base_size = 12) + 
					geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) +
					theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', 
						panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
						panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
				%s
				""" % (ggsave))

			if np.max(results['logp']) > cfg['crop']:
				print "   generating cropped standard qq plot"
				ro.r('df$b[df$b > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$b == ' + str(cfg['crop']) + ']<-1')
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.tiff')
				elif cfg['ext'] == 'png':
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq.cropped.png')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=pp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq.cropped.pdf')
				ro.r("""
					gp<-ggplot(df)
					pp<-gp + 
						aes_string(x='a',y='b') +
						geom_ribbon(aes_string(x='a',ymin='ci_lower',ymax='ci_upper'), data=df, alpha=0.25, fill='black') + 
						geom_point(aes(shape=factor(shape)),size=2) +
						geom_abline(intercept=0, slope=1, alpha=0.5) + 
						scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
						scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
						coord_fixed() +
						theme_bw(base_size = 12) + 
						geom_text(aes_string(x='x', y='y', label='lab'), data = dftext, colour="black", vjust=0, hjust=1, size = 4, parse=TRUE) +
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.position = 'none', 
							panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
							panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
					%s
					""" % (ggsave))

		def ppoints(n, a):
			try:
				n = np.float(len(n))
			except TypeError:
				n = np.float(n)
			return (np.arange(n) + 1 - a)/(n + 1 - 2*a)
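		# note (added): a is the plotting-position offset in (i - a)/(n + 1 - 2a);
		# the calls below pass a=0, i.e. the i/(n + 1) rule for expected uniform quantiles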

		if cfg['qq_strat_freq']:
			print "   generating frequency stratified qq plot"

			strat_ticks = np.sort([np.float(x) for x in cfg['freq_ticks'].split(',')])
			results['UGA___QQ_BIN___'] = 0
			for i in xrange(len(strat_ticks)):
				results.loc[(results[cfg['freqcol']] >= strat_ticks[i]) & (results[cfg['freqcol']] <= 1-strat_ticks[i]),'UGA___QQ_BIN___'] = i+1
			bin_values = results['UGA___QQ_BIN___'].value_counts()
			for i in xrange(len(strat_ticks)+1):
				if i not in bin_values.index:
					bin_values[i] = 0
			counts = pd.DataFrame(bin_values)
			counts['lambda'] = np.nan
			results['description'] = 'NA'
			for i in xrange(len(strat_ticks)+1):
				if counts.loc[i,'UGA___QQ_BIN___'] > 0:
					counts.loc[i,'lambda'] = np.median(scipy.chi2.ppf([1-x for x in results[pcol][results['UGA___QQ_BIN___'] == i].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
				else:
					counts.loc[i,'lambda'] = np.nan
				if i == 0:
					results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "(0," + str(strat_ticks[i]) + ") ~" + str(round(counts.loc[i,'lambda'],3))
					print "   MAF (0," + str(strat_ticks[i]) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda'])
				elif i < len(strat_ticks):
					results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(strat_ticks[i-1]) + "," + str(strat_ticks[i]) + ") ~" + str(round(counts.loc[i,'lambda'],3))
					print "   MAF [" + str(strat_ticks[i-1]) + "," + str(strat_ticks[i]) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda'])
				else:
					results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(strat_ticks[i-1]) + ",0.5]  ~" + str(round(counts.loc[i,'lambda'],3))
					print "   MAF [" + str(strat_ticks[i-1]) + ",0.5]: n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda'])
			results.sort_values(['UGA___QQ_BIN___','logp'],inplace=True)
			results['expected'] = 0
			for i in counts.index:
				if counts.loc[i,'UGA___QQ_BIN___'] > 0:
					results.loc[results['UGA___QQ_BIN___'] == i,'expected'] = np.sort(-1 * np.log10(ppoints(len(results.loc[results['UGA___QQ_BIN___'] == i,'expected']),0)))
			ro.globalenv['df'] = ro.DataFrame({'expected': ro.FloatVector(results['expected']), 'logp': ro.FloatVector(results['logp']), 'UGA___QQ_BIN___': ro.IntVector(results['UGA___QQ_BIN___']), 'description': ro.StrVector(results['description'])})
			ro.r("df<-df[order(df$UGA___QQ_BIN___),]")
			ro.r("df$description<-ordered(df$description,levels=unique(df$description))")

			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.tiff')
			elif cfg['ext'] == 'png':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.png')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.pdf')
			ro.r("""
				gp<-ggplot(df, aes_string(x='expected',y='logp')) +
					geom_point(aes_string(color='description'), size=2) +
					scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) +
					geom_abline(intercept=0, slope=1, alpha=0.5) + 
					scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
					scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
					coord_fixed() +
					theme_bw(base_size = 12) + 
					theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
						legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), 
						legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
						panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
				%s
				""" % (ggsave))

			if np.max(results['logp']) > cfg['crop']:
				print "   generating cropped frequency stratified qq plot"
				ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1')
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.tiff')
				elif cfg['ext'] == 'png':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.png')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_freq.cropped.pdf')
				ro.r("""
					gp<-ggplot(df, aes_string(x='expected',y='logp')) +
						geom_point(aes(shape=factor(shape), color=description), size=2) +
						scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) +
						geom_abline(intercept=0, slope=1, alpha=0.5) + 
						scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
						scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
						coord_fixed() +
						theme_bw(base_size = 12) + 
						guides(shape=FALSE) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
							legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), 
							legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
							panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
					%s
					""" % (ggsave))

		if cfg['qq_strat_mac']:
			print "   generating minor allele count stratified qq plot"

			strat_ticks = np.sort([np.float(x) for x in cfg['mac_ticks'].split(',')])
			results['UGA___QQ_BIN___'] = 0
			for i in xrange(len(strat_ticks)):
				results.loc[results[cfg['maccol']] >= strat_ticks[i],'UGA___QQ_BIN___'] = i+1
			bin_values = results['UGA___QQ_BIN___'].value_counts()
			for i in xrange(len(strat_ticks)+1):
				if i not in bin_values.index:
					bin_values[i] = 0
			counts = pd.DataFrame(bin_values)
			counts['lambda'] = 0
			results['description'] = 'NA'
			for i in np.sort(counts.index):
				if counts.loc[i,'UGA___QQ_BIN___'] > 0:
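					# genomic inflation factor for this bin: lambda = median(qchisq(1 - p, df = 1)) / qchisq(0.5, df = 1); values near 1 indicate little inflation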
					counts.loc[i,'lambda'] = np.median(scipy.chi2.ppf([1-x for x in results[pcol][results['UGA___QQ_BIN___'] == i].tolist()], df=1))/scipy.chi2.ppf(0.5,1)
				else:
					counts.loc[i,'lambda'] = np.nan
				if i == 0:
					results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "(0," + str(int(strat_ticks[i])) + ") ~" + str(round(counts.loc[i,'lambda'],3))
					print "   MAC (0," + str(int(strat_ticks[i])) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda'])
				elif i < len(strat_ticks):
					results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(int(strat_ticks[i-1])) + "," + str(int(strat_ticks[i])) + ") ~" + str(round(counts.loc[i,'lambda'],3))
					print "   MAC [" + str(int(strat_ticks[i-1])) + "," + str(int(strat_ticks[i])) + "): n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda'])
				else:
					results.loc[results['UGA___QQ_BIN___'] == i,'description'] = "[" + str(int(strat_ticks[i-1])) + ",...] ~" + str(round(counts.loc[i,'lambda'],3))
					print "   MAC [" + str(int(strat_ticks[i-1])) + ",...]: n=" + str(np.int(counts.loc[i,'UGA___QQ_BIN___'])) + ", lambda=" + str(counts.loc[i,'lambda'])
			results.sort_values(['UGA___QQ_BIN___','logp'],inplace=True)
			results['expected'] = 0
			for i in counts.index:
				results.loc[results['UGA___QQ_BIN___'] == i,'expected'] = np.sort(-1 * np.log10(ppoints(len(results.loc[results['UGA___QQ_BIN___'] == i,'expected']),0)))

			ro.globalenv['df'] = ro.DataFrame({'expected': ro.FloatVector(results['expected']), 'logp': ro.FloatVector(results['logp']), 'UGA___QQ_BIN___': ro.IntVector(results['UGA___QQ_BIN___']), 'description': ro.StrVector(results['description'])})
			ro.r("df<-df[order(df$UGA___QQ_BIN___),]")
			ro.r("df$description<-ordered(df$description,levels=unique(df$description))")

			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.tiff')
			elif cfg['ext'] == 'png':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.png')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.pdf')
			ro.r("""
				gp<-ggplot(df, aes_string(x='expected',y='logp')) +
					geom_point(aes_string(color='description'), size=2) +
					scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) +
					geom_abline(intercept=0, slope=1, alpha=0.5) + 
					scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
					scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
					coord_fixed() +
					theme_bw(base_size = 12) + 
					theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
						legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), 
						legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
						panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
				%s
				""" % (ggsave))
        
			if np.max(results['logp']) > cfg['crop']:
				print "   generating cropped frequency stratified qq plot"
				ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1')
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.tiff')
				elif cfg['ext'] == 'png':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.png')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=gp,width=4,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.qq_strat_mac.cropped.pdf')
				ro.r("""
					gp<-ggplot(df, aes_string(x='expected',y='logp')) +
						geom_point(aes(shape=factor(shape), color=description), size=2) +
						scale_colour_manual(values=colorRampPalette(brewer.pal(9,"Blues"))(length(unique(df$description))+2)[3:(length(unique(df$description))+2)]) +
						geom_abline(intercept=0, slope=1, alpha=0.5) + 
						scale_x_discrete(expression(Expected~~-log[10](italic(p)))) +
						scale_y_discrete(expression(Observed~~-log[10](italic(p)))) +
						coord_fixed() +
						theme_bw(base_size = 12) + 
						guides(shape=FALSE) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), legend.title = element_blank(), 
							legend.key.height = unit(0.1,"in"), legend.text = element_text(size=6), legend.key = element_blank(), legend.justification = c(0,1), 
							legend.position = c(0,1), panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
							panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.text = element_text(size=12))
					%s
					""" % (ggsave))

		if cfg['mht']:
			print "   generating standard manhattan plot"
			print "   minimum p-value: " + str(np.min(results[pcol]))
			print "   maximum -1*log10(p-value): " + str(np.max(results['logp']))
			if cfg['gc'] and l > 1:
				print "   adjusting p-values for genomic inflation for p-value column " + pcol
				results[pcol]=2 * scipy.norm.cdf(-1 * np.abs(scipy.norm.ppf(0.5*results[pcol]) / math.sqrt(l)))
				print "   minimum post-gc adjustment p-value: " + str(np.min(results[pcol]))
				print "   maximum post-gc adjustment -1*log10(p-value): " + str(np.max(results['logp']))
			else:
				print "   skipping genomic inflation correction"

			print "   calculating genomic positions"
			results.sort_values(by=[cfg['chrcol'],cfg['bpcol']], inplace=True)
			ticks = []
			lastbase = 0
			results['gpos'] = 0
			nchr = len(list(np.unique(results[cfg['chrcol']].values)))
			chrs = np.unique(results[cfg['chrcol']].values)
			if cfg['color']:
				colours = ["#08306B","#41AB5D","#000000","#F16913","#3F007D","#EF3B2C","#08519C","#238B45","#252525","#D94801","#54278F","#CB181D","#2171B5","#006D2C","#525252","#A63603","#6A51A3","#A50F15","#4292C6","#00441B","#737373","#7F2704","#807DBA","#67000D"]
			else:
				colours = ["#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3","#08589e","#4eb3d3"]
			if nchr == 1:
				results['gpos'] = results[cfg['bpcol']]
				results['colours'] = "#08589e"
				gpos_range = results['gpos'].max() - results['gpos'].min()
				tick_step = 25000000
				for limit, step in [(1000,100),(10000,1000),(100000,10000),(200000,20000),(300000,30000),
									(400000,40000),(500000,50000),(600000,60000),(700000,70000),(800000,80000),
									(900000,90000),(1000000,100000),(10000000,1000000),(100000000,10000000)]:
					if gpos_range <= limit:
						tick_step = step
						break
				ticks = [x for x in range(results['gpos'].min(),results['gpos'].max()) if x % tick_step == 0]
			else:
				results['colours'] = "#000000"
				for i in range(len(chrs)):
					print "      processed chromosome " + str(int(chrs[i]))
					if i == 0:
						results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]
					else:
						lastbase = lastbase + results.loc[results[cfg['chrcol']] == chrs[i-1],cfg['bpcol']].iloc[-1]
						results.loc[results[cfg['chrcol']] == chrs[i],'gpos'] = (results.loc[results[cfg['chrcol']] == chrs[i],cfg['bpcol']]) + lastbase
					if results.loc[results[cfg['chrcol']] == chrs[i]].shape[0] > 1:
						ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0] + (results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[-1] - results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])/2)
					else:
						ticks.append(results.loc[results[cfg['chrcol']] == chrs[i],'gpos'].iloc[0])
					results.loc[results[cfg['chrcol']] == chrs[i],'colours'] = colours[int(chrs[i])]
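			# recompute -log10(p) so downstream plotting reflects any genomic-control adjustment above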
			results['logp'] = -1 * np.log10(results[pcol])
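			# genome-wide significance: fixed threshold for 1M+ variants, otherwise Bonferroni 0.05 / number of tests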
			if results.shape[0] >= 1000000:
				sig = 5.4e-8
			else:
				sig = 0.05 / results.shape[0]
			print "   significance level set to p-value = " + str(sig) + " (-1*log10(p-value) = " + str(-1 * np.log10(sig)) + ")"
			print "   " + str(len(results[pcol][results[pcol] <= sig])) + " genome wide significant variants"
			chr = results[cfg['chrcol']][0]
			maxy=int(max(np.ceil(-1 * np.log10(sig)),np.ceil(results['logp'].max())))
			if maxy > 20:
				y_breaks = range(0,maxy,5)
				y_labels = range(0,maxy,5)
			else:
				y_breaks = range(0,maxy)
				y_labels = range(0,maxy)
			ro.globalenv['df'] = ro.DataFrame({'gpos': ro.FloatVector(results['gpos']), 'logp': ro.FloatVector(results['logp']), 'colours': ro.FactorVector(results['colours'])})
			ro.globalenv['ticks'] = ro.FloatVector(ticks)
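			# x-axis tick labels in kb; note x/1000 is integer division under Python 2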
			ro.globalenv['labels'] = ro.Vector(["{:,}".format(x/1000) for x in ticks])
			ro.globalenv['colours'] = ro.StrVector(colours)
			ro.globalenv['chrs'] = ro.FloatVector(chrs)

			print "   generating manhattan plot"
			if cfg['ext'] == 'tiff':
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.tiff')
			elif cfg['ext'] == 'png':
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.png')
			elif cfg['ext'] == 'eps':
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.eps')
			else:
				ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.pdf')
			if nchr == 1:
				ro.r("""
					gp<-ggplot(df, aes_string(x='gpos',y='logp')) +
						geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
						geom_point(size=1.5) + 
						scale_x_continuous(expression(Chromosome~~%d~~(kb)),breaks=ticks,labels=labels) + 
						scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
						theme_bw(base_size = 8) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
								panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
								panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), 
								axis.text = element_text(size=12), legend.position = 'none')
					%s
					""" % (sig, chr, maxy, maxy, ggsave))
			else:
				ro.r("""
					gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + 
						geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
						geom_point(size=1.5) + 
						scale_colour_manual(values=colours) + 
						scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + 
						scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
						theme_bw(base_size = 8) + 
						theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
								panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
								panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), 
								axis.text = element_text(size=12), legend.position = 'none')
					%s
					""" % (sig, maxy, maxy, ggsave))

			if maxy > cfg['crop']:
				maxy = cfg['crop']
				ro.r('df$logp[df$logp > ' + str(cfg['crop']) + ']<-' + str(cfg['crop']))
				ro.r('df$shape<-0')
				ro.r('df$shape[df$logp == ' + str(cfg['crop']) + ']<-1')
				print "   generating cropped manhattan plot"
				if cfg['ext'] == 'tiff':
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",compression="lzw",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.tiff')
				elif cfg['ext'] == 'png':
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,units="in",bg="white",dpi=300)' % (cfg['out'] + '.' + pcol + '.mht.cropped.png')
				elif cfg['ext'] == 'eps':
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.cropped.eps')
				else:
					ggsave = 'ggsave(filename="%s",plot=gp,width=16,height=4,bg="white")' % (cfg['out'] + '.' + pcol + '.mht.cropped.pdf')
				if nchr == 1:
					ro.r("""
						gp<-ggplot(df, aes_string(x='gpos',y='logp')) +
							geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
							geom_point(aes(shape=factor(shape)),size=1.5) + 
							scale_x_continuous(expression(Chromosome~~%d~~(kb)),breaks=ticks,labels=labels) + 
							scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
							theme_bw(base_size = 8) + 
							theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
									panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
									panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=10), 
									axis.text = element_text(size=12), legend.position = 'none')
						%s
						""" % (sig, chr, maxy, maxy, ggsave))
				else:
					ro.r("""
						gp = ggplot(df, aes_string(x='gpos',y='logp',colour='colours')) + 
							geom_hline(yintercept = -1 * log10(%g),colour="#B8860B", linetype=5, size = 0.25) + 
							geom_point(aes(shape=factor(shape)),size=1.5) + 
							scale_colour_manual(values=colours) + 
							scale_x_continuous(expression(Chromosome),breaks=ticks,labels=chrs) + 
							scale_y_continuous(expression(-log[10](italic(p))),breaks=seq(0,%d,1),limits=c(0,%d)) + 
							theme_bw(base_size = 8) + 
							theme(axis.title.x = element_text(vjust=-0.5,size=14), axis.title.y = element_text(vjust=1,angle=90,size=14), 
									panel.background = element_blank(), panel.border = element_blank(), panel.grid.minor = element_blank(), 
									panel.grid.major = element_blank(), axis.line = element_line(colour="black"), axis.title = element_text(size=8), 
									axis.text = element_text(size=12), legend.position = 'none')
						%s
						""" % (sig, maxy, maxy, ggsave))

	print "process complete"
	return 0
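
Note: the ppoints() helper called when building the expected quantiles above is not part of this excerpt; it presumably mirrors R's ppoints(). A minimal sketch under that assumption (the two-argument signature is inferred from the calls ppoints(n, 0) above):

import numpy as np

def ppoints(n, a=0.5):
	# R-style plotting positions: (i - a) / (n + 1 - 2*a) for i = 1..n
	return (np.arange(1, n + 1) - a) / (n + 1 - 2.0 * a)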
Example #40
0
File: selscan.py Project: quank/cms
    def process_vcf_into_selscan_tped(cls, vcf_file, gen_map_file, outfile_location,
        outfile_prefix, chromosome_num, samples_to_include=None, start_pos_bp=None, end_pos_bp=None, ploidy=2, 
        consider_multi_allelic=True, rescale_genetic_distance=False, include_variants_with_low_qual_ancestral=False, coding_function=None, 
        multi_alleli_merge_function="AND"):
        """
            Process a bgzipped-VCF (such as those included in the Phase 3 1000 Genomes release) into a gzip-compressed
            tped file of the sort expected by selscan. 
        """
        assert ploidy > 0

        processor = VCFReader(vcf_file)

        end_pos = processor.clens[str(chromosome_num)] if end_pos_bp is None else end_pos_bp

        records = processor.records( str(chromosome_num), start_pos_bp, end_pos, pysam.asVCF())

        outTpedFile = outfile_location + "/" + outfile_prefix + ".tped.gz"
        outTpedMetaFile = outfile_location + "/" + outfile_prefix + ".tped.allele_metadata.gz"

        if samples_to_include is not None and len(samples_to_include) > 0:
            indices_of_matching_samples = sorted([processor.sample_names.index(x) for x in samples_to_include])
        else:
            indices_of_matching_samples = range(0,len(processor.sample_names))

        indices_of_matching_genotypes = [(x*2, (x*2)+1) for x in indices_of_matching_samples]
        indices_of_matching_genotypes = list(np.ravel(np.array(indices_of_matching_genotypes)))

        rm = RecomMap(gen_map_file)

        for filePath in [outTpedFile, outTpedMetaFile]:
            assert not os.path.exists(filePath), "File {} already exists. Consider removing this file or specifying a different output prefix. Processing aborted.".format(filePath)

        mergeOperatorString = ""
        if multi_alleli_merge_function == "OR":
            mergeOperatorString = "|"
        if multi_alleli_merge_function == "AND":
            mergeOperatorString = "&"
        if multi_alleli_merge_function == "XOR":
            mergeOperatorString = "^"

        startTime = datetime.now()
        sec_remaining_avg = 0
        current_pos_bp = 1

        with util.file.open_or_gzopen(outTpedFile, 'w') as of1, util.file.open_or_gzopen(outTpedMetaFile, 'w') as of2:
            # WRITE header for metadata file here with selected subset of sample_names
            headerString = "CHROM VARIANT_ID POS_BP MAP_POS_CM REF_ALLELE ALT_ALLELE ANCESTRAL_CALL ALLELE_FREQ_IN_POP\n".replace(" ","\t")
            of2.write(headerString)

            of1linesToWrite = []
            of2linesToWrite = []

            recordLengths = set()  # distinct genotype-vector lengths encountered (logged at the end)

            recordCount = 0
            mostRecentRecordPosSeen = -1
            positionHasBeenSeenBefore = False
            previouslyCodedGenotypes = GenoRecord([])
            lineToWrite1 = None
            lineToWrite2 = None
            previousAncestral = None
            ancestralDiffersFromPrevious = True
            for record in records:
                # in some cases, there may be records with duplicate positions in the VCF file
                # to account for that we collapse rows that pass our filter and then write out the rows when we 
                # encounter a record with a new position 
                if record.pos != mostRecentRecordPosSeen:

                    if positionHasBeenSeenBefore and not consider_multi_allelic:
                        lineToWrite1 = None
                        lineToWrite2 = None

                    if lineToWrite1 is not None and lineToWrite2 is not None:
                        if len(previouslyCodedGenotypes) == ploidy*len(indices_of_matching_samples):
                            # write the output line
                            of1.write(lineToWrite1)
                            of2.write(lineToWrite2)
                               
                        else:
                            genotypesCount            = len(previouslyCodedGenotypes)
                            countSpecificTpedName     = outTpedFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                            countSpecificMetafileName = outTpedMetaFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                            with util.file.open_or_gzopen(countSpecificTpedName, 'a') as of1l, util.file.open_or_gzopen(countSpecificMetafileName, 'a') as of2l:
                                of1l.write(lineToWrite1)
                                of2l.write(lineToWrite2)

                        lineToWrite1 = None
                        lineToWrite2 = None
                    mostRecentRecordPosSeen = record.pos
                    positionHasBeenSeenBefore = False
                else:
                    positionHasBeenSeenBefore = True

                # if the variant is a SNP
                # OLD style looking at INFO VT value: 
                # processor.variant_is_type(record.info, "SNP"):
                VALID_BASES = ["A","C","G","T","N","a","c","g","t","n"]
                if (len(record.ref) == 1 and len(record.alt) == 1) or ( all(variant in VALID_BASES for variant in record.ref.split(",")) and 
                     all(variant in VALID_BASES for variant in record.alt.split(",")) ):

                    alternateAlleles = [record.alt]
                    if record.alt not in ['A','T','C','G']:
                        #print record.alt
                        if consider_multi_allelic:
                            alternateAlleles = record.alt.split(",")
                        else:
                            # continue on to next variant record
                            continue

                    ancestral_allele = processor.parse_ancestral(record.info)
                    chromStr = "chr{}".format(record.contig)

                    # if the AA is populated, and the call meets the specified criteria
                    if (ancestral_allele in ['A','T','C','G']) or (include_variants_with_low_qual_ancestral and ancestral_allele in ['a','t','c','g']):
                        
                        if previousAncestral != ancestral_allele:
                            previousAncestral = ancestral_allele
                            ancestralDiffersFromPrevious = True
                        else:
                            ancestralDiffersFromPrevious = False

                        recordString = record.__str__()

                        match = cls.genoRegex.match(recordString)
                        if match:
                            rawGenos = match.group("genos")
                            genos = rawGenos[::2]
                            recordPosition = record.pos+1
                            if chromosome_num.upper() != "X":
                                try:
                                    genotypes_for_selected_samples = operator.itemgetter(*indices_of_matching_genotypes)(genos)
                                except Exception: # if this is a record of mixed ploidy, that is to say the X chromosome
                                    raise
                            else:
                                matching_genotypes = np.array(record[:len(record)])[indices_of_matching_samples]
                                genotypes_for_selected_samples_split = [x.split("|") for x in matching_genotypes]
                                genotypes_for_selected_samples = [y for x in genotypes_for_selected_samples_split for y in x]

                            recordLengths.add(len(genotypes_for_selected_samples))

                            map_pos_cm = rm.physToMap(chromStr, record.pos, rescale=rescale_genetic_distance)

                            numberOfHaplotypes = float(len(genotypes_for_selected_samples))
                            
                            codingFunc = np.vectorize(coding_function)

                            coded_genotypes_for_selected_samples = GenoRecord(["0"] * len(genotypes_for_selected_samples))
                            if consider_multi_allelic:
                                #coded_genotypes_for_selected_samples = GenoRecord(["1"] * len(genotypes_for_selected_samples))
                                for idx, altAllele in enumerate(alternateAlleles):
                                    value_of_current_allele = str(idx+1)
                                    coded_genotypes_for_selected_samples_for_allele = GenoRecord(codingFunc(genotypes_for_selected_samples, record.ref, altAllele, ancestral_allele, value_of_current_allele))
                                    #coded_genotypes_for_selected_samples |= coded_genotypes_for_selected_samples_for_allele
                                    if idx==0:
                                        coded_genotypes_for_selected_samples = coded_genotypes_for_selected_samples_for_allele
                                    else:
                                        coded_genotypes_for_selected_samples = coded_genotypes_for_selected_samples.f[mergeOperatorString](coded_genotypes_for_selected_samples_for_allele)
                                    #coded_genotypes_for_selected_samples = np.array(list(str(bin(int("".join(coded_genotypes_for_selected_samples),2) | int("".join(coded_genotypes_for_selected_samples_for_allele),2)))[2:].zfill(len(coded_genotypes_for_selected_samples))))
                            else:
                                coded_genotypes_for_selected_samples = GenoRecord(codingFunc(genotypes_for_selected_samples, record.ref, record.alt, ancestral_allele, "1"))

                            # if this is the first record in the file, create an array filled with zeros for the previously coded alleles
                            if recordCount == 0:
                                previouslyCodedGenotypes = GenoRecord(["1"] * len(genotypes_for_selected_samples))
                                
                            # bitwise OR coded genotypes for duplicate records, merge variants
                            # ...except selscan logic is inverted, so bitwise AND
                            # TODO: invert?
                            # record @ pos1 = 001001
                            # record @ pos1 = 100001
                            #                -------
                            # coded result  = 101001
                            #log.debug(genotypes_for_selected_samples)
                            #log.debug(coded_genotypes_for_selected_samples)

                            if positionHasBeenSeenBefore:
                                #coded_genotypes_for_selected_samples |= previouslyCodedGenotypes
                                coded_genotypes_for_selected_samples = coded_genotypes_for_selected_samples.f[mergeOperatorString](previouslyCodedGenotypes)
                                #coded_genotypes_for_selected_samples = np.array(list(str(bin(int("".join(coded_genotypes_for_selected_samples),2) & int("".join(previouslyCodedGenotypes),2)))[2:].zfill(len(coded_genotypes_for_selected_samples))))

                            previouslyCodedGenotypes = coded_genotypes_for_selected_samples

                            allele_freq_for_pop = float(list(coded_genotypes_for_selected_samples).count("1")) / numberOfHaplotypes

                            outStrDict = cls._build_variant_output_strings(record.contig, str(1), recordPosition, 
                                map_pos_cm, coded_genotypes_for_selected_samples, record.ref, record.alt, 
                                ancestral_allele, allele_freq_for_pop)

                            lineToWrite1 = outStrDict["tpedString"]
                            lineToWrite2 = outStrDict["metadataString"].replace(" ","\t")

                            recordCount += 1
                            current_pos_bp = int(recordPosition)

                            if recordCount % 1000 == 0:
                                number_of_seconds_elapsed = (datetime.now() - startTime).total_seconds()
                                bp_per_sec = float(current_pos_bp) / float(number_of_seconds_elapsed)
                                bp_remaining = end_pos - current_pos_bp
                                sec_remaining = bp_remaining / bp_per_sec
                                sec_remaining_avg = cls._moving_avg(sec_remaining, sec_remaining_avg, 10)
                                time_left = timedelta(seconds=sec_remaining_avg)
                            
                                if sec_remaining > 10:
                                    human_time_remaining = relative_time(datetime.utcnow()+time_left)
                                    print("")
                                    print("Completed: {:.2%}".format(float(current_pos_bp)/float(end_pos)))
                                    print("Estimated time of completion: {}".format(human_time_remaining))
                                    #log.info("Genotype counts found: %s", str(list(recordLengths)))

            if positionHasBeenSeenBefore and not consider_multi_allelic:
                lineToWrite1 = None
                lineToWrite2 = None

            if lineToWrite1 is not None and lineToWrite2 is not None:
                if len(previouslyCodedGenotypes) == ploidy*len(indices_of_matching_samples):
                    # write the output lines
                    of1.write(lineToWrite1)
                    of2.write(lineToWrite2)
                       
                else:
                    genotypesCount            = len(previouslyCodedGenotypes)
                    countSpecificTpedName     = outTpedFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                    countSpecificMetafileName = outTpedMetaFile.replace(outfile_prefix, outfile_prefix + "_" + str(genotypesCount))
                    with util.file.open_or_gzopen(countSpecificTpedName, 'a') as of1l, util.file.open_or_gzopen(countSpecificMetafileName, 'a') as of2l:
                        of1l.write(lineToWrite1)
                        of2l.write(lineToWrite2)

            log.info("Genotype counts found: %s", str(list(recordLengths)))
Example #41
0
    def __init__(self, vcf_files):
        self._readers = []

        for file_name in vcf_files:
            self._readers.append(
                pysam.Tabixfile(file_name, parser=pysam.asVCF()))
Example #42
0
                annos[anno] = BigWigFile(open(anno_files[anno]))

        except IOError:
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files?  If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "#installation.html\#installing-annotation-files\n"
                     % anno_files[anno])

# ## Standard access to Tabix indexed files


PARSERS = {"bed": pysam.asBed(),
           "vcf": pysam.asVCF(),
           "tuple": pysam.asTuple(),
           None: None}

def _get_hits(coords, annotation, parser_type, _parsers=PARSERS):
    """Retrieve BED information, recovering if BED annotation file does have a chromosome.
    """
    try:
        parser = _parsers[parser_type]
    except KeyError:
        raise ValueError("Unexpected parser type: %s" % parser)
    chrom, start, end = coords
    try:
        hit_iter = annotation.fetch(str(chrom), start, end, parser=parser)
    # catch invalid region errors raised by ctabix
    except ValueError:
Example #43
0
    def handle(
        self,
        file: str,
        organism: str,
        doi: str = None,
        cpu: int = 1,
        verbosity: int = 1,
        **options
    ):
        """Execute the main function."""
        # retrieve only the file name
        filename = os.path.basename(file)
        if verbosity > 0:
            self.stdout.write("Processing file: {}".format(filename))

        try:
            FileValidator().validate(file)
        except ImportingError as e:
            raise CommandError(e)

        try:
            index_file = "{}.tbi".format(file)
            FileValidator().validate(index_file)
        except ImportingError:
            try:
                index_file = "{}.csi".format(file)
                FileValidator().validate(index_file)
            except ImportingError:
                raise CommandError("No index found (.tbi/.csi)")

        try:
            feature_file = FeatureLoader(
                filename=filename, source="VCF_SOURCE", doi=doi
            )
        except ImportingError as e:
            raise CommandError(e)

        pool = ThreadPoolExecutor(max_workers=cpu)
        tasks = list()

        chunk_size = cpu * 2

        # Load the GFF3 file
        with open(file) as tbx_file:
            tbx = pysam.TabixFile(filename=tbx_file.name, index=index_file)
            for row in tqdm(tbx.fetch(parser=pysam.asVCF()), total=get_num_lines(file)):
                tasks.append(
                    pool.submit(feature_file.store_tabix_VCF_feature, row, organism)
                )

                if len(tasks) >= chunk_size:
                    for task in as_completed(tasks):
                        try:
                            task.result()
                        except ImportingError as e:
                            raise CommandError(e)
                    tasks.clear()
            else:
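                # for/else: after the fetch loop completes, wait on any tasks still pending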
                for task in as_completed(tasks):
                    try:
                        task.result()
                    except ImportingError as e:
                        raise CommandError(e)
                tasks.clear()

        pool.shutdown()

        if verbosity > 0:
            self.stdout.write(self.style.SUCCESS("Done with {}".format(filename)))
Example #44
0
    def Variants(self, chromosome, start, end, freqAsPrior=False):
        """
        Generator function. Yields variants in order of
        genomic co-ordinate.
        """
        varList = []

        for vcfFile in self.vcfFiles:
            try:
                vcfLines = vcfFile.fetch(chromosome,
                                         start,
                                         end,
                                         parser=pysam.asVCF())
            except Exception, e:
                logger.warning(
                    "Could not retrieve variants from source file in region %s:%s-%s. Error was %s"
                    % (chromosome, start, end, e.message))
                continue

            for line in vcfLines:

                pos = line.pos
                ref = line.ref
                alt = line.alt
                info = line.info

                infoEntries = info.split(";")
                freq = None

                if freqAsPrior:
                    for entry in infoEntries:

                        cols = entry.split("=")

                        if cols[0] == "AF":
                            if len(cols) == 2:
                                freq = float(cols[1])
                            else:
                                # Skip multi-allelic sites for now
                                continue

                lenRef = len(ref)
                lenAlt = len(alt)

                # SNP
                if lenRef == 1 and lenAlt == 1:
                    var = Variant(chromosome, pos, ref, alt, 0, 0, 0, 0, 0)

                    if freqAsPrior:
                        var.freqPrior = freq

                    varList.append(var)

                # Anything else
                else:

                    # VCF4 includes the preceding (anchor) reference base for indels, so trim off the first base
                    ref = ref[1:]
                    alt = alt[1:]
                    removed = ref
                    added = alt

                    # Trim the matching bits off and shift position. This will decompose
                    # multi-variant sites into individual alleles at different positions.
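                    # e.g. REF=TCG, ALT=TCA at pos p: the anchor trim gives CG/CA, then the shared C is trimmed -> SNP G/A at pos p+1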
                    while len(ref) > 0 and len(alt) > 0 and ref[0] == alt[0]:
                        ref = ref[1:]
                        alt = alt[1:]
                        removed = ref
                        added = alt
                        pos += 1

                    var = Variant(chromosome, pos, removed, added, 0, 0, 0, 0,
                                  0)

                    if freqAsPrior:
                        var.freqPrior = freq

                    varList.append(var)
Example #45
0
def process_piece(filename_vcf, chrom, chrom_length, sample_index, chain_info,
                  diploid, passed, quality, vcf_keep, vcf_discard_file):
    ret = {'chrom': chrom, 'stats': {}, 'chain_info': chain_info}

    stats = OrderedDict()
    stats['ACCEPTED'] = 0

    if vcf_keep:
        vcf_discard = open(vcf_discard_file, "w")

    line_no = 0

    try:
        LOG.info("Processing Chromosome {0}...".format(chrom))
        tb = pysam.TabixFile(filename_vcf)

        for vcf_rec in tb.fetch(chrom, parser=pysam.asVCF()):
            line_no += 1

            try:
                gt = parse_gt_new(vcf_rec, sample_index)
            except:
                LOG.info("Unable to parse record, improper VCF file?")
                continue

            LOG.debug('\n')
            LOG.debug(vcf_rec)
            LOG.debug(gt)
            LOG.debug(vcf_rec[sample_index])

            if passed and 'PASS' not in vcf_rec.FILTER:

                LOG.debug("TOSSED: FILTERED ON PASS")
                stats = update_stats(stats, 'FILTERED ON PASS')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            elif quality and gt.fi == '0':

                # FI : Whether a sample was a Pass(1) or fail (0) based on FILTER values

                LOG.debug("TOSSED: FILTERED ON QUALITY")
                stats = update_stats(stats, 'FILTERED ON QUALITY')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            elif gt.left is None and gt.right is None:

                LOG.debug("TOSSED: NOT RELEVANT")
                stats = update_stats(stats, 'NOT RELEVANT')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            elif not diploid and gt.left != gt.right:
                # haploid or hexaploid
                # gt must be equal

                LOG.debug("TOSSED: HETEROZYGOUS")
                stats = update_stats(stats, 'HETEROZYGOUS')

                if vcf_keep:
                    vcf_discard.write(vcf_rec)
                    vcf_discard.write("\n")
                continue

            # START L AND R, ONLY R IF DIPLOID

            for ci, lr in chain_info.iteritems():
                if ci == 'left':
                    alt_seq = str(gt.left)
                else:
                    alt_seq = str(gt.right)

                if gt.ref == alt_seq:

                    LOG.debug("TOSSED, SAME AS REF")
                    lr.stats = update_stats(lr.stats, 'SAME AS REF')

                    if vcf_keep:
                        vcf_discard.write(vcf_rec)
                        vcf_discard.write("\n")
                    continue

                orig_alt_seq = alt_seq

                LOG.debug("SAMPLE: {0}".format(vcf_rec[sample_index]))
                LOG.debug(
                    "REF='{0}', ALT_L='{1}', ALT_R='{2}'. POS={3}".format(
                        gt.ref, gt.left, gt.right, vcf_rec.pos))

                position = vcf_rec.pos + 1

                ref_seq = str(gt.ref)
                len_ref = len(ref_seq)
                len_alt = len(alt_seq)

                base_changes = len_ref - len_alt
                base_pos_diff = 0

                if position < lr.prev_next_ref_pos:
                    LOG.debug(
                        "TOSSED: CONFLICTING VCF ENTRIES: {0}".format(vcf_rec))

                    lr.stats = update_stats(lr.stats,
                                            'CONFLICTING VCF ENTRIES')

                    if vcf_keep:
                        vcf_discard.write(vcf_rec)
                        vcf_discard.write("\n")

                    continue

                # find the position where the first base change is
                for n in xrange(min(len_ref, len_alt)):
                    if ref_seq[n] != alt_seq[n]:
                        base_pos_diff = n
                        break

                # if it is 0, take the minimum length
                if base_pos_diff == 0:
                    base_pos_diff = min(len_ref, len_alt)
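                # e.g. REF=AAC, ALT=AA: no mismatch within min(3, 2) = 2 leading bases, so base_pos_diff = 2, shared_bases = "AA", leaving a 1 bp deletion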

                # add the base position difference
                position += base_pos_diff

                # recalculate the strings
                shared_bases = ref_seq[:base_pos_diff]
                ref_seq = ref_seq[base_pos_diff:]
                alt_seq = alt_seq[base_pos_diff:]

                dt = len(ref_seq)
                dq = len(alt_seq)

                next_ref_pos = position + len(ref_seq)
                fragment_size = position - lr.prev_next_ref_pos

                LOG.debug('           gt.ref: {0}'.format(gt.ref))
                LOG.debug('          ref_seq: {0}'.format(ref_seq))
                LOG.debug('               dt: {0}'.format(dt))
                LOG.debug('           gt.alt: {0}'.format(orig_alt_seq))
                LOG.debug('          alt_seq: {0}'.format(alt_seq))
                LOG.debug('               dq: {0}'.format(dq))
                LOG.debug('         position: {0}'.format(position))
                LOG.debug('prev_next_ref_pos: {0}'.format(
                    lr.prev_next_ref_pos))
                LOG.debug('     next_ref_pos: {0}'.format(next_ref_pos))
                LOG.debug('    fragment_size: {0}'.format(fragment_size))
                LOG.debug('     base_changes: {0}'.format(base_changes))
                LOG.debug('    base_pos_diff: {0}'.format(base_pos_diff))
                LOG.debug('     shared_bases: {0}'.format(shared_bases))

                # fix any 0 length
                if fragment_size < 0:
                    LOG.debug(
                        "TOSSED: CONFLICTING VCF ENTRIES: {0}".format(vcf_rec))

                    lr.stats = update_stats(lr.stats,
                                            'CONFLICTING VCF ENTRIES')

                    if vcf_keep:
                        vcf_discard.write(vcf_rec)
                        vcf_discard.write("\n")

                    continue

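                # each chain entry: [unchanged fragment length, ref segment length, alt segment length, shared bases, ref allele, alt allele, original VCF position]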
                if fragment_size != 0:
                    ref_str = ref_seq if ref_seq else '.'
                    alt_str = alt_seq if alt_seq else '.'
                    lr.chain_entries.append([
                        fragment_size,
                        len(ref_seq),
                        len(alt_seq), shared_bases, ref_str, alt_str,
                        vcf_rec.pos + 1
                    ])
                else:
                    #
                    # THIS SHOULD NOT HAPPEN
                    #
                    raise G2GChainFileError(
                        'Unable to create chain file due to conflicting VCF entries'
                    )

                lr.stats = update_stats(lr.stats, 'ACCEPTED')

                LOG.debug(lr.chain_entries[-1])

                last_position = position
                lr.prev_next_ref_pos = next_ref_pos
                lr.sums[0] += fragment_size
                lr.sums[1] += dt
                lr.sums[2] += dq
                prev_line = vcf_rec

                #lr.prev_chrom = vcf_rec.contig
            chain_info[ci] = lr

        for ci, lr in chain_info.iteritems():
            #LOG.debug("CHROMOSOME[{0}] LENGTH = {1}".format(lr.prev_chrom, chrom_length))

            lr.chromosome = chrom
            lr.chromosome_length = chrom_length
            lr.last_fragment_size = chrom_length - lr.sums[0] - lr.sums[1]
            lr.end_length = lr.sums[0] + lr.last_fragment_size + lr.sums[2]
            lr.number_vcf_lines = line_no

            chain_info[ci] = lr

        if vcf_keep:
            vcf_discard.close()

    except KeyboardInterrupt:
        raise KeyboardInterruptError()
    except Exception, e:
        pass
Example #46
0
subprocess.check_output("samtools faidx {}/ref.fa".format(outdir), shell=True)
logger.info("Lifter over the reference to %s/ref.fa" % outdir)

# This will liftover the VCFs to the new reference
for invcf in invcfs:
  if not os.path.isfile(invcf):
    logger.error("%s not found" % invcf)
    continue
  # get the base name and use it in the output
  outvcf = os.path.join(outdir, os.path.splitext(os.path.basename(invcf))[0])
  vcf_template_reader = vcf.Reader(open(invcf, "r"))
  vcf_template_reader.metadata["reference"] = os.path.join(outdir, "ref.fa")
  vcf_template_reader.contigs = OrderedDict([(contig_name, vcf.parser._Contig(contig_name, contig_length)) for (contig_name, contig_length) in contigs])
  vcf_writer = vcf.Writer(open(outvcf, "w"), vcf_template_reader)

  tabix_vcf = pysam.TabixFile(invcf, parser=pysam.asVCF())
  info_warned = False
  for region_index, region in enumerate(regions_bedtool, start=1):
    records = None
    try: records = tabix_vcf.fetch(reference=str(region.chrom), start=region.start, end=region.end)
    except ValueError: logger.error("Failed to retrieve %s from %s" % (str(region).strip(), invcf))
    if records is None: continue
    for record in records:
      if record.pos <= region.start + args.flank or record.pos + len(record.ref) + args.flank - 1 >= region.end: continue
      record.contig = str(region_index) if args.short_contig_names else ("%s_%d_%d" % (str(region.chrom), region.start, region.end))
      # record.pos seems to be zero-based, at least in the infinite wisdom of my version of pysam
      record.pos = record.pos - region.start
      fmt = "GT" if len(record) else None
      sample_indexes = [0] if len(record) else []
      info = None
      try: info = vcf_template_reader._parse_info(record.info)