Example #1
0
 def testSamples(self):
     """Check the loaded VCF's sample count, parsed fields and carrier list.

     Verifies the number of samples, the first entries of ``info``,
     ``gformat`` and ``novel``, the samples reported as carrying the
     alternate allele of ``rs11575897``, and the ``hardyweinberg`` result
     for that variant.
     """
     rsid = 'rs11575897'
     self.assertEqual(len(self.VCF.samples), 526)
     self.assertEqual(self.VCF.info[0][1], '1')
     self.assertEqual(self.VCF.gformat[0], 'GT')
     self.assertEqual(self.VCF.novel[0], 'Y_2649856_A')
     # Query the carriers once and reuse the result for both checks;
     # assertIn reports the container contents on failure, unlike
     # assertEqual(x in y, True).
     carriers = VCF.list_samples_with_alternate_allele(self.VCF, rsid)
     self.assertIn('NA19088', carriers)
     self.assertEqual(len(carriers), 99)
     # Kept as an equality check so only a value equal to False passes.
     self.assertEqual(self.VCF.hardyweinberg(rsid), False)
Example #2
0
 def test_sample_summary(self):
     """Check the loaded VCF's sample count, parsed fields and carrier list.

     Verifies the number of samples, the first entries of ``info``,
     ``gformat`` and ``novel``, and the samples reported as carrying the
     alternate allele of ``rs11575897``.
     """
     rsid = 'rs11575897'
     self.assertEqual(len(self.VCF.samples), 526)
     # The Y chromosome shouldn't have any homozygous calls, but this is
     # only printed, not asserted.
     # NOTE(review): if ``geno`` is a pandas DataFrame, the builtin any()
     # iterates over column labels rather than cell values -- confirm the
     # type of ``geno`` before turning this into an assertion.
     print(any(self.VCF.geno == 2))
     self.assertEqual(self.VCF.info[0][1], '1')
     self.assertEqual(self.VCF.gformat[0], 'GT')
     self.assertEqual(self.VCF.novel[0], 'Y_2649856_A')
     # Query the carriers once and reuse the result for both checks;
     # assertIn reports the container contents on failure, unlike
     # assertEqual(x in y, True).
     carriers = VCF.list_samples_with_alternate_allele(self.VCF, rsid)
     self.assertIn('NA19088', carriers)
     self.assertEqual(len(carriers), 99)
Example #3
0
 def test_sample_summary(self):
     """Check the loaded VCF's sample count, parsed fields and carrier list.

     Verifies the number of samples, the first entries of ``info``,
     ``gformat`` and ``novel``, and the samples reported as carrying the
     alternate allele of ``rs11575897``.
     """
     rsid = 'rs11575897'
     self.assertEqual(len(self.VCF.samples), 526)
     # The Y chromosome shouldn't have any homozygous calls, but this is
     # only printed, not asserted.
     # NOTE(review): if ``geno`` is a pandas DataFrame, the builtin any()
     # iterates over column labels rather than cell values -- confirm the
     # type of ``geno`` before turning this into an assertion.
     print(any(self.VCF.geno == 2))
     self.assertEqual(self.VCF.info[0][1], '1')
     self.assertEqual(self.VCF.gformat[0], 'GT')
     self.assertEqual(self.VCF.novel[0], 'Y_2649856_A')
     # One lookup serves both the membership and the size assertion;
     # assertIn gives a more useful failure message than comparing a
     # boolean against True.
     carriers = VCF.list_samples_with_alternate_allele(self.VCF, rsid)
     self.assertIn('NA19088', carriers)
     self.assertEqual(len(carriers), 99)
Example #4
0
class TestLoadingVCF(unittest.TestCase):
    """ Testing loading VCF files into pandas
    """

    def setUp(self):
        """Load the chrY test VCF from the ``data`` directory beside this file."""
        # Local import: the module-level import block is outside this class.
        import os

        # Resolve the fixture relative to this module rather than the current
        # working directory, so the tests can be launched from anywhere.
        here = os.path.dirname(os.path.abspath(__file__))
        self.filename = os.path.join(here, 'data', 'chrY.test.vcf')
        self.VCF = VCF(self.filename)

    def testSamples(self):
        """Check the sample count, parsed fields and carrier list of the VCF."""
        rsid = 'rs11575897'
        self.assertEqual(len(self.VCF.samples), 526)
        self.assertEqual(self.VCF.info[0][1], '1')
        self.assertEqual(self.VCF.gformat[0], 'GT')
        self.assertEqual(self.VCF.novel[0], 'Y_2649856_A')
        # Query the carriers once and reuse the result for both checks;
        # assertIn reports the container contents on failure, unlike
        # assertEqual(x in y, True).
        carriers = VCF.list_samples_with_alternate_allele(self.VCF, rsid)
        self.assertIn('NA19088', carriers)
        self.assertEqual(len(carriers), 99)
        # Kept as an equality check so only a value equal to False passes.
        self.assertEqual(self.VCF.hardyweinberg(rsid), False)

    def testInfo(self):
        """Placeholder: no assertions are implemented yet."""
        pass
Example #5
0
def _build_option_parser():
    """Create the command-line option parser used by ``main``."""
    p = optparse.OptionParser(__doc__)
    p.add_option("-C",
                 "--chrom",
                 dest="Chrom",
                 help="Chromosome to restrict counting alleles on")
    p.add_option("-o",
                 "--output",
                 dest="filename",
                 help="write report to FILE")
    p.add_option("-a",
                 "--annot",
                 dest="annot",
                 help="Annotation file for SNPs")
    p.add_option("-G",
                 "--genotype",
                 dest="G",
                 action='store',
                 help=("Use imputed/genotypes if available, "
                       "should be in VCF file format"),
                 default=None)
    p.add_option("-v",
                 "--vcf_file",
                 action="store_true",
                 dest="inputisvcffile",
                 help="the input is a VCF file",
                 default=False)
    p.add_option("-q",
                 "--quality_threshold",
                 type="int",
                 dest="qual",
                 help="base quality threshold to take allele counts from")
    p.add_option("-P", "--variant_positions", action="store", help="")
    p.add_option("-p",
                 "--pileup",
                 action="store_true",
                 dest="p",
                 help="Input files are pileup files")
    p.add_option("-D", "--debug", action="store_true", dest="D", help="debug")
    p.add_option("-c",
                 "--count-threshold",
                 action="store",
                 type="int",
                 dest="c",
                 help="Set the count threshold for making AEI calls")
    return p


def _read_sample_to_file(path):
    """Parse a tab-delimited file into a ``{column 1: column 2}`` dict."""
    sample_to_file = {}
    # A context manager guarantees the handle is closed; the legacy 'rU'
    # mode is not used because Python 3.11+ rejects it.
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.rstrip("\r\n").split("\t")
            sample_to_file[fields[0]] = fields[1]
    return sample_to_file


def main():
    """ Main loop that iterates over the desired loci to calculate Allelic
    Imbalance

    Positional command-line arguments:

    1. The variant source. If the name contains ``vcf`` it is loaded with
       ``VCF``; otherwise, when ``-a/--annot`` is given, it is opened as a
       tabix-indexed annotation file.
    2. A tab-delimited file mapping sample names to bams.

    The per-sample count matrix returned by ``aei_count_samples`` is wrapped
    in a DataFrame and pickled to the path given with ``-o/--output``.

    Exits through ``optparse``'s error handling (status 2) when required
    arguments are missing, and through ``sys.exit`` when the annotation file
    cannot be opened.
    """
    # Local import: the module-level import block is outside this function.
    import sys

    #################################################################
    # Argument and Options Parsing
    #################################################################
    p = _build_option_parser()
    options, args = p.parse_args()
    if len(args) < 2:
        p.error("expected two positional arguments: the variant file "
                "(VCF or annotation) and the sample-to-bam mapping file")
    if not options.filename:
        # Fail before the expensive counting rather than at the final write.
        p.error("an output file is required (-o/--output)")
    if not options.qual:
        options.qual = 20

    # A tab delimited file mapping sample names to bams #############
    sample_to_file = _read_sample_to_file(args[1])
    samples = list(sample_to_file)

    if 'vcf' in args[0]:
        vcf = VCF(args[0])
        chrm = vcf.vcf['#CHROM']
        pos = vcf.vcf['POS']
        geno = pd.DataFrame(np.zeros((len(pos), len(samples))),
                            columns=pd.Index(samples))
    elif options.annot:
        # This is the annotation file that I actually use
        # for the paper
        # NOTE(review): the chromosome is hard-coded and -C/--chrom is not
        # consulted here -- confirm whether the option should override it.
        chrom = 18
        print('Right annotation file')
        try:
            file_a = pysam.Tabixfile(args[0])
        except IOError as err:
            sys.exit("Could not open annotation file %s: %s" % (args[0], err))
        # :TODO fix this
        # Currently counts at all SNPs
        rsIDs = []
        chrm = []
        pos = []
        for record in file_a.fetch(str(chrom)):
            fields = record.split("\t")
            # Keep only records whose fifth-from-last column is non-zero.
            if int(fields[-5]) != 0:
                rsIDs.append(fields[3])
                chrm.append('chr' + str(fields[0]))
                pos.append(int(fields[1]))
        geno = pd.DataFrame(np.zeros((len(rsIDs), len(samples))),
                            index=pd.Index(rsIDs),
                            columns=pd.Index(samples))
        pos = np.asarray(pos, dtype=np.uint32)
        # Probably inefficient to pass in one array
        chrm = np.asarray(chrm)
        print('Chrom length')
        print(len(chrm))
        print(len(pos))
        if len(pos):
            print(pos[-1])
    else:
        p.error("the first argument must be a VCF file, or -a/--annot must "
                "be given for a tabix-indexed annotation file")

    # NOTE(review): count_threshold is computed but not passed to
    # aei_count_samples -- confirm whether it should be.
    count_threshold = options.c if options.c else 20

    # .loc replaces DataFrame.ix, which was removed in pandas 1.0.
    subset_geno = geno.loc[:, samples].copy()
    subset_geno.rename(columns=sample_to_file, inplace=True)

    # Four allele slots (0-3) per column, in column order.
    multi = [(column, allele)
             for column in subset_geno.columns
             for allele in (0, 1, 2, 3)]
    multi_index = pd.MultiIndex.from_tuples(multi, names=['sample', 'alleles'])
    counts_matrix = pd.DataFrame(
        np.zeros((subset_geno.shape[0], len(subset_geno.columns) * 4),
                 dtype=np.uint32),
        index=subset_geno.index,
        columns=multi_index)

    c_m = aei_count_samples(subset_geno.values, subset_geno.columns.values,
                            counts_matrix.values, np.asarray(chrm), pos)
    c_m = pd.DataFrame(c_m, index=subset_geno.index, columns=multi_index)
    c_m.to_pickle(options.filename)
Example #6
0
 def setUp(self):
     """Load the chrY test VCF from the ``data`` directory beside this file."""
     here = os.path.dirname(os.path.abspath(__file__))
     # os.path.join builds the path portably instead of concatenating a
     # hard-coded '/'-separated suffix.
     self.filename = os.path.join(here, 'data', 'chrY.test.vcf')
     self.VCF = VCF(self.filename)
Example #7
0
 def setUp(self):
     """Load the chrY test VCF from the ``data`` directory beside this file."""
     # Local import: the module-level import block is outside this method.
     import os

     # Resolve the fixture relative to this module rather than the current
     # working directory, so the tests can be launched from anywhere.
     here = os.path.dirname(os.path.abspath(__file__))
     self.filename = os.path.join(here, 'data', 'chrY.test.vcf')
     self.VCF = VCF(self.filename)