Exemple #1
0
    def __iter__(self):
        """Iterate over the tabix-indexed file as tuples of fields.

        The header row is yielded first: ``self.header`` when it is set,
        otherwise the last header line of the file split on tabs (nothing is
        yielded if the file has no header lines). After that, one tuple is
        yielded per record returned by ``fetch`` for the configured
        ``reference``, ``start``, ``stop`` and ``region``.

        The file is closed when iteration finishes, fails, or the generator
        is closed early.
        """
        from pysam import Tabixfile, asTuple
        f = Tabixfile(self.filename, mode='r')
        try:
            # header row
            if self.header is not None:
                yield self.header
            else:
                # assume last header line has fields
                h = list(f.header)
                if h:
                    header_line = h[-1]
                    # Only decode raw bytes: passing already-decoded text to
                    # text_type(..., encoding=...) raises TypeError.
                    if isinstance(header_line, bytes):
                        header_line = text_type(header_line, encoding='ascii')
                    yield tuple(header_line.split('\t'))

            # data rows
            for row in f.fetch(reference=self.reference,
                               start=self.start,
                               end=self.stop,
                               region=self.region,
                               parser=asTuple()):
                yield tuple(row)
        finally:
            # Runs on normal exhaustion, on errors and on generator close.
            f.close()
Exemple #2
0
    def __init__(self, task_queue, results_queue, family, args):
        """Store the queues, the family and the run options on the worker.

        ``args`` supplies ``verbose``, ``phased``, ``chr_prefix`` and
        ``cadd_file``. Only the first item of ``args.cadd_file`` is used; when
        it is truthy it is opened with ``Tabixfile`` and the ``asTuple``
        parser, otherwise the falsy value itself is kept on ``self.cadd_file``.
        """
        multiprocessing.Process.__init__(self)
        self.task_queue, self.family = task_queue, family
        self.results_queue = results_queue
        self.verbosity = args.verbose
        self.phased = args.phased
        cadd_source = args.cadd_file[0]
        self.cadd_file = cadd_source
        self.chr_prefix = args.chr_prefix

        # Swap the path for an open tabix handle only when a path was given.
        if cadd_source:
            self.cadd_file = Tabixfile(cadd_source, parser=asTuple())
Exemple #3
0
    def __init__(self, *filenames, **kwargs):  #data_format=None,printer=None):
        """Create a |TabixGenomeHash|
        
        Parameters
        ----------
        filenames : str or list of str
            Filename or list of filenames of `Tabix`_-compressed files

        data_format : str
            Format of tabix-compressed file(s). Choices are:
            `'GTF2'`,`'GFF3'`,`'BED'`,`'PSL'` (Default: `GTF2`)

        printer : object with a ``write()`` method, optional
            Receives the error message when `data_format` is not supported.
            (Default: a ``NullWriter`` instance)

        Raises
        ------
        ValueError
            If `data_format` is not a key of ``TabixGenomeHash._READERS``
        """
        from pysam import Tabixfile
        # Accept both TabixGenomeHash("a", "b") and TabixGenomeHash(["a", "b"])
        if len(filenames) == 1 and isinstance(filenames[0], list):
            filenames = filenames[0]

        self.filenames = list(multiopen(filenames))
        self.printer = kwargs.get("printer", NullWriter())
        data_format = kwargs.get("data_format", "GTF2")
        try:
            self._reader_class = TabixGenomeHash._READERS[data_format]
        except KeyError:
            # A failed mapping lookup raises KeyError, not ValueError; turn it
            # into the documented ValueError carrying the list of formats.
            msg = "Supported file formats for TabixGenomeHash are: %s" % ", ".join(
                sorted(TabixGenomeHash._READERS.keys()))
            self.printer.write(msg)
            raise ValueError(msg)

        self.tabix_readers = [Tabixfile(X) for X in self.filenames]
Exemple #4
0
 def __init__(self, task_queue, results_queue, family, args):
     """Initialise the worker process from its queues, family and options.

     The options object ``args`` is read for ``verbose``, ``phased``,
     ``cadd_file`` (first item only) and ``chr_prefix``. A truthy CADD
     path is replaced on ``self.cadd_file`` by a ``Tabixfile`` opened
     with the ``asTuple`` parser.
     """
     multiprocessing.Process.__init__(self)
     self.task_queue = task_queue
     self.family = family
     self.results_queue = results_queue
     self.verbosity, self.phased = args.verbose, args.phased
     cadd_path = args.cadd_file[0]
     self.cadd_file = cadd_path
     self.chr_prefix = args.chr_prefix

     if not cadd_path:
         return
     self.cadd_file = Tabixfile(cadd_path, parser=asTuple())
Exemple #5
0
 def __init__(self, chromosome, position, annotation_table_file):
     """Load one annotation record and expose its columns as attributes.

     Opens ``annotation_table_file`` with ``Tabixfile``, queries the
     interval ``start=position - 1``, ``end=position`` on ``chromosome``
     and keeps the first record returned as ``self.line``. The record is
     split on tabs into exactly 32 fields, each stored on an attribute
     of the same name as in the unpacking below; ``self.position`` is
     then converted to ``int``.

     Raises:
         StopIteration: if the query returns no record.
         ValueError: if the record does not have exactly 32 tab-separated
             fields, or the position field is not an integer.
     """
     annotation_table = Tabixfile(annotation_table_file)
     try:
         records = annotation_table.fetch(reference=chromosome,
                                          start=position - 1,
                                          end=position)
         # Built-in next() works on both Python 2 and 3 iterators,
         # unlike the Python 2 only .next() method.
         self.line = next(records)
     finally:
         # Close the handle even when the query fails or is empty.
         annotation_table.close()
     (self.chromosome,
      self.position,
      self.reference_base,
      self.genic,
      self.exonic,
      self.intronic,
      self.intergenic,
      self.utr5,
      self.utr3,
      self.fold0,
      self.fold4,
      self.fold2,
      self.fold3,
      self.CDS,
      self.mRNA,
      self.rRNA,
      self.tRNA,
      self.feature_names,
      self.feature_types,
      self.feature_ID,
      self.cds_position,
      self.strand,
      self.frame,
      self.codon,
      self.aa,
      self.degen,
      self.FPKM,
      self.rho,
      self.FAIRE,
      self.recombination,
      self.mutability,
      self.quebec_alleles) = self.line.split('\t')
     self.position = int(self.position)
Exemple #6
0
 def __iter__(self):
     """Yield the header row, then every fetched record, as tuples.

     ``self.header`` is used as the header row when it is set; otherwise
     the last header line of the file is split on tabs (and no header row
     is produced when the file has no header lines). Records come from
     ``fetch`` with the configured ``reference``, ``start``, ``end`` and
     ``region``. The file is always closed afterwards.

     Raises ``UnsatisfiedDependency`` when pysam cannot be imported.
     """
     try:
         from pysam import Tabixfile, asTuple
     except ImportError as e:
         raise UnsatisfiedDependency(e, dep_message)

     tabix = Tabixfile(self.filename, mode='r')
     try:
         if self.header is None:
             # No explicit header: fall back to the file's last header line.
             header_lines = list(tabix.header)
             if header_lines:
                 yield tuple(header_lines[-1].split('\t'))
         else:
             yield self.header

         records = tabix.fetch(reference=self.reference,
                               start=self.start,
                               end=self.end,
                               region=self.region,
                               parser=asTuple())
         for record in records:
             yield tuple(record)
     finally:
         tabix.close()
Exemple #7
0
    def __iter__(self):
        """Iterate over the tabix-indexed file as tuples of fields.

        Yields ``self.header`` first when it is set; otherwise the last
        header line of the file, split on tabs, is used as the header row (no
        header row is yielded when the file has no header lines). Then yields
        one tuple per record returned by ``fetch`` for the configured
        ``reference``, ``start``, ``stop`` and ``region``.

        The file is closed once iteration ends for any reason.
        """
        from pysam import Tabixfile, asTuple
        f = Tabixfile(self.filename, mode='r')
        try:
            # header row
            if self.header is not None:
                yield self.header
            else:
                # assume last header line has fields
                h = list(f.header)
                if h:
                    header_line = h[-1]
                    # Decoding is only valid for bytes; text_type(text,
                    # encoding=...) raises TypeError on already-decoded text.
                    if isinstance(header_line, bytes):
                        header_line = text_type(header_line, encoding='ascii')
                    yield tuple(header_line.split('\t'))

            # data rows
            for row in f.fetch(reference=self.reference, start=self.start,
                               end=self.stop, region=self.region,
                               parser=asTuple()):
                yield tuple(row)
        finally:
            # Also reached when the consumer stops iterating early.
            f.close()
Exemple #8
0
 def __iter__(self):
     """Generate the table rows: a header tuple followed by data tuples.

     The header is ``self.header`` if given, else the tab-split last
     header line of the file (skipped when the file has none). Data rows
     are the records returned by ``fetch`` for ``self.reference``,
     ``self.start``, ``self.end`` and ``self.region``. The file handle is
     closed when the generator finishes or is discarded.

     Raises ``UnsatisfiedDependency`` when pysam is not importable.
     """
     try:
         from pysam import Tabixfile, asTuple
     except ImportError as e:
         raise UnsatisfiedDependency(e, dep_message)

     source = Tabixfile(self.filename, mode="r")
     try:
         explicit_header = self.header
         if explicit_header is not None:
             yield explicit_header
         else:
             # Assume the last header line carries the field names.
             file_header = list(source.header)
             if file_header:
                 yield tuple(file_header[-1].split("\t"))

         query = dict(reference=self.reference, start=self.start,
                      end=self.end, region=self.region, parser=asTuple())
         for fields in source.fetch(**query):
             yield tuple(fields)
     finally:
         source.close()
Exemple #9
0
class VariantConsumer(multiprocessing.Process):
    """Worker process that prepares batches of variants for printing.

    Batches are read from ``task_queue`` until a ``None`` sentinel arrives.
    Every batch is passed through ``genetic_models.check_genetic_models``,
    collapsed to one entry per variant, formatted for printing and put on
    ``results_queue``.
    """

    def __init__(self, task_queue, results_queue, family, args):
        """Store the queues, the family and the run options.

        Args:
            task_queue: Queue delivering variant batches; ``get()`` and
                ``task_done()`` are called on it.
            results_queue: Queue that receives the processed batches.
            family: Object whose ``individuals`` attribute is iterated to
                look up the per-individual genotype columns.
            args: Options object providing ``verbose``, ``phased``,
                ``chr_prefix`` and ``cadd_file``; only the first item of
                ``cadd_file`` is used and, when truthy, it is opened with
                ``Tabixfile`` and the ``asTuple`` parser.
        """
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.family = family
        self.results_queue = results_queue
        self.verbosity = args.verbose
        self.phased = args.phased
        self.cadd_file = args.cadd_file[0]
        self.chr_prefix = args.chr_prefix

        if self.cadd_file:
            self.cadd_file = Tabixfile(self.cadd_file, parser=asTuple())

    @staticmethod
    def _as_text(value):
        """Return ``value`` as ``str``, decoding it as UTF-8 if it is bytes."""
        if isinstance(value, bytes):
            value = value.decode('utf-8')
        return str(value)

    def fix_variants(self, variant_batch):
        """Merge the variants of all features into one dictionary.

        ``variant_batch`` has the form ``{feature: {variant_id: variant}}``.
        The first occurrence of a variant is kept; when the same variant
        shows up under another feature its non-empty ``'Compounds'`` are
        merged into the kept entry (existing keys keep their value).

        Returns:
            dict: ``{variant_id: variant}`` with one entry per variant.
        """
        fixed_variants = {}
        for feature in variant_batch:
            for variant_id, variant in variant_batch[feature].items():
                if variant_id not in fixed_variants:
                    fixed_variants[variant_id] = variant
                    continue
                # We need to add compound information from different features
                if len(variant['Compounds']) > 0:
                    merged = dict(variant['Compounds'])
                    merged.update(fixed_variants[variant_id]['Compounds'])
                    fixed_variants[variant_id]['Compounds'] = merged

        return fixed_variants

    def get_cadd_score(self, variant):
        """Look up the CADD score of ``variant`` in the CADD file.

        Only single-base REF/ALT variants are looked up. The records
        fetched for ``[POS - 1, POS)`` on the variant's chromosome are
        scanned and the sixth column of the first record whose fourth
        column equals the first ALT allele is returned.

        Returns:
            str: The score, or ``'-'`` when there is no CADD file, the
            variant is not a SNP, no record matches, or the lookup raised
            ``IndexError``/``KeyError``.
        """
        cadd_score = '-'
        alternatives = variant['ALT'].split(',')
        # CADD values are only for snps:
        is_snp = (max(len(alt) for alt in alternatives) == 1
                  and len(variant['REF']) == 1)
        if not (is_snp and self.cadd_file):
            return cadd_score

        cadd_key = int(variant['POS'])
        try:
            for tpl in self.cadd_file.fetch(str(variant['CHROM']),
                                            cadd_key - 1, cadd_key):
                # Fields may be bytes or text; normalise before comparing
                # so bytes alleles can match and text scores are not
                # "decoded" a second time.
                if alternatives[0] == self._as_text(tpl[3]):
                    return self._as_text(tpl[5])
        except (IndexError, KeyError) as e:
            if self.verbosity:
                print(e, variant['CHROM'], variant['POS'])

        return cadd_score

    def make_print_version(self, variant_dict):
        """Get the variants ready for printing.

        Every variant in ``variant_dict`` is modified in place: the
        ``'Genotypes'``, ``'Compounds'``, ``'Inheritance_model'`` and
        ``'Annotation'`` entries are removed, ``'CHROM'`` gets a ``'chr'``
        prefix when ``self.chr_prefix`` is set, and ``ANN``, ``Comp``,
        ``GM``, ``MS`` (plus ``CADD`` when a CADD file is configured) are
        appended to the ``'INFO'`` string.
        """
        for variant_id, variant in variant_dict.items():
            if self.cadd_file:
                variant['CADD'] = self.get_cadd_score(variant)
            # Remove the 'Genotypes' post since we will not need them for now
            variant.pop('Genotypes', 0)

            feature_list = variant['Annotation']

            if len(variant['Compounds']) > 0:
                # We do not want reference to itself as a compound:
                variant['Compounds'].pop(variant_id, 0)
                compounds_list = list(variant['Compounds'].keys())
            else:
                compounds_list = ['-']

            # Keep only the inheritance models flagged as true.
            models = variant['Inheritance_model']
            model_list = [model for model in models if models[model]]
            if not model_list:
                model_list = ['NA']

            model_score = '-'
            genotype_scores = []
            for individual in self.family.individuals:
                gt_call = variant[individual].split(':')
                gt_info = variant['FORMAT'].split(':')
                if len(gt_call) == 1:
                    gt_call = {'GT': gt_call[0]}
                else:
                    gt_call = dict(zip(gt_info, gt_call))
                if 'GQ' in gt_call:
                    # Add the error probabilities to genotype scores
                    genotype_scores.append(10**-(float(gt_call['GQ']) / 10))
            if genotype_scores:
                # Probability that at least one genotype call is wrong.
                error_prob = 1 - reduce(
                    operator.mul, [1 - score for score in genotype_scores])
                # Very high GQ values make the product round to 1.0, so
                # error_prob is 0 and log10 would raise ValueError; the
                # score then stays at the '-' placeholder.
                if error_prob > 0:
                    model_score = str(round(-10 * log10(error_prob)))
            variant.pop('Compounds', 0)
            variant.pop('Inheritance_model', 0)
            variant.pop('Annotation', 0)
            vcf_info = variant['INFO'].split(';')

            if self.chr_prefix:
                variant['CHROM'] = 'chr' + variant['CHROM']
            # if we should include the annotation:
            vcf_info.append('ANN=' + ':'.join(feature_list))
            # if we should include compounds:
            vcf_info.append('Comp=' + ':'.join(compounds_list))
            # if we should include genetic models:
            vcf_info.append('GM=' + ':'.join(model_list))
            if model_list == ['NA']:
                model_score = '-'
            vcf_info.append('MS=' + model_score)
            if self.cadd_file:
                vcf_info.append('CADD=%s' % str(variant.pop('CADD', '-')))
            variant['INFO'] = ';'.join(vcf_info)
        return

    def run(self):
        """Consume batches from the task queue until ``None`` is received."""
        proc_name = self.name
        if self.verbosity:
            print('%s: Starting!' % proc_name)
        while True:
            # A batch is a dictionary on the form {gene:{variant_id:variant_dict}}
            next_batch = self.task_queue.get()
            if next_batch is None:
                # Sentinel: acknowledge it and stop this worker.
                self.task_queue.task_done()
                if self.verbosity:
                    print('%s: Exiting' % proc_name)
                break
            genetic_models.check_genetic_models(next_batch, self.family,
                                                self.verbosity, self.phased,
                                                proc_name)
            # Make sure we only have one copy of each variant:
            fixed_variants = self.fix_variants(next_batch)

            # Now we want to make versions of the variants that are ready for printing.
            self.make_print_version(fixed_variants)
            self.results_queue.put(fixed_variants)
            self.task_queue.task_done()
        return
Exemple #10
0
class VariantConsumer(multiprocessing.Process):
    """Yeilds all unordered pairs from a list of objects as tuples, like (obj_1, obj_2)"""
    
    def __init__(self, task_queue, results_queue, family, args):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.family = family
        self.results_queue = results_queue
        self.verbosity = args.verbose
        self.phased = args.phased
        self.cadd_file = args.cadd_file[0]
        self.chr_prefix = args.chr_prefix
                    
        if self.cadd_file:
            self.cadd_file = Tabixfile(self.cadd_file, parser = asTuple())
    
    def fix_variants(self, variant_batch):
        """Merge the variants into one dictionary, make shure that the compounds are treated right."""
        fixed_variants = {}
        for feature in variant_batch:
            for variant_id in variant_batch[feature]:
                if variant_id in fixed_variants:
                    # We need to add compound information from different features
                    if len(variant_batch[feature][variant_id]['Compounds']) > 0:
                        fixed_variants[variant_id]['Compounds'] = (
                         dict(list(variant_batch[feature][variant_id]['Compounds'].items()) +
                                    list(fixed_variants[variant_id]['Compounds'].items())))
                else:
                    fixed_variants[variant_id] = variant_batch[feature][variant_id]
        
        return fixed_variants
    
    def get_cadd_score(self, variant):
        """Get the cadd score and add it to the variant."""
        cadd_score = '-'
        alternatives = variant['ALT'].split(',')
        # CADD vales are only for snps:
        if max([len(alt) for alt in alternatives]) == 1 and len(variant['REF']) == 1:
            if self.cadd_file:
                cadd_key = int(variant['POS'])
                try:
                    for tpl in self.cadd_file.fetch(str(variant['CHROM']), cadd_key-1, cadd_key):
                        if alternatives[0] == str(tpl[3]):
                            try:
                                return str(tpl[5], encoding='utf-8')
                            except TypeError:
                                return str(unicode(tpl[5], encoding='utf-8'))
                except (IndexError, KeyError) as e:
                    if self.verbosity:
                        print(e, variant['CHROM'], variant['POS'])
                            
        return cadd_score

    
    def make_print_version(self, variant_dict):
        """Get the variants ready for printing"""
        for variant_id in variant_dict:
            if self.cadd_file:
                variant_dict[variant_id]['CADD'] = self.get_cadd_score(variant_dict[variant_id])
            model_list = []
            compounds_list = []
            #Remove the 'Genotypes' post since we will not need them for now
            variant_dict[variant_id].pop('Genotypes', 0)
            
            feature_list = variant_dict[variant_id]['Annotation']
                            
            if len(variant_dict[variant_id]['Compounds']) > 0:
                #We do not want reference to itself as a compound:
                variant_dict[variant_id]['Compounds'].pop(variant_id, 0)
                compounds_list = list(variant_dict[variant_id]['Compounds'].keys())
            else:
                compounds_list = ['-']
            
            for model in variant_dict[variant_id]['Inheritance_model']:
                if variant_dict[variant_id]['Inheritance_model'][model]:
                    model_list.append(model)
            if len(model_list) == 0:
                model_list = ['NA']
            model_score = '-'
            genotype_scores = []
            for individual in self.family.individuals:
                gt_call = variant_dict[variant_id][individual].split(':')
                gt_info = variant_dict[variant_id]['FORMAT'].split(':')
                if len(gt_call) == 1:
                    gt_call = {'GT':gt_call[0]}
                else:
                    gt_call = dict(zip(gt_info, gt_call))
                if 'GQ' in gt_call:
                    # Add the error probabilities to genotype scores
                    genotype_scores.append(10**-(float(gt_call['GQ'])/10))
            if len(genotype_scores) > 0:
                model_score = (str(round(-10*log10(1-reduce(operator.mul, [1-score for score in genotype_scores])))))
            variant_dict[variant_id].pop('Compounds',0)
            variant_dict[variant_id].pop('Inheritance_model',0)
            variant_dict[variant_id].pop('Annotation',0)
            vcf_info = variant_dict[variant_id]['INFO'].split(';')
            
            if self.chr_prefix:
                variant_dict[variant_id]['CHROM'] = 'chr'+variant_dict[variant_id]['CHROM']
            # if we should include the annotation:
            vcf_info.append('ANN=' + ':'.join(feature_list))
            # if we should include compounds:
            vcf_info.append('Comp=' + ':'.join(compounds_list))
            # if we should include genetic models:
            vcf_info.append('GM=' + ':'.join(model_list))
            if model_list == ['NA']:
                model_score = '-'
            vcf_info.append('MS=' + model_score)
            if self.cadd_file:
                vcf_info.append('CADD=%s' % str(variant_dict[variant_id].pop('CADD', '-')))
            variant_dict[variant_id]['INFO'] = ';'.join(vcf_info)
        return
    
    def run(self):
        """Run the consuming"""
        proc_name = self.name
        if self.verbosity:
            print('%s: Starting!' % proc_name)
        while True:
            # A batch is a dictionary on the form {gene:{variant_id:variant_dict}}
            next_batch = self.task_queue.get()
            # if self.verbosity:
                # if self.results_queue.full():
                #     print('Batch results queue Full! %s' % proc_name)
                # if self.task_queue.full():
                #     print('Variant queue full! %s' % proc_name)
            if next_batch is None:
                self.task_queue.task_done()
                if self.verbosity:
                    print('%s: Exiting' % proc_name)
                break
            genetic_models.check_genetic_models(next_batch, self.family, self.verbosity, self.phased, proc_name)
            # Make shure we only have one copy of each variant:
            fixed_variants = self.fix_variants(next_batch)
            
            # Now we want to make versions of the variants that are ready for printing.
            self.make_print_version(fixed_variants)
            self.results_queue.put(fixed_variants)
            self.task_queue.task_done()
        return
genofins = []
for line in genofinfile:
	genofins.append(line.strip().split()[1])

genofinfile.close()

genoinds = [genofins.index(x) + 6 for x in officialfindivs]
y = {}
currbimbam = open(currfiles + '.bimbam','w')
#t0 = time.time()
for snp in masterdic.keys():
#for snp in masterdic.keys()[0:1000]:
	chrm = masterdic[snp][0]
	if chrm == 'chrm':
		continue
	tabixer = Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.all.imputed.' + chrm + '.txt.gz')
	tempgenos = [x.split('\t') for x in tabixer.fetch(chrm,int(masterdic[snp][1])-1,int(masterdic[snp][2]))][0]
	genos = [tempgenos[x] for x in range(0,6) + genoinds]
	tabixer.close()
	y[snp] = [genos[3], 'A', 'G'] + genos[6:]
	print >> currbimbam, ", ".join(y)

#t1 = time.time()
#print t1-t0
currbimbam.close()

#genomat = matrix_reader(genodir + 'hutt.imputed.dhssnps.bimbam',sep=",")
print "Running GEMMA..."
gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covariates -lmm 4 -maf 0.05 -o curr_' + pheno)
t0 = time.time()
ifier(gemmer)
                   fmt='%s')
 # Without PC regression, build a `cut` command that extracts this gene's
 # field from the space-delimited expression file of the chromosome and
 # redirects it to the .pheno file (the + 1 is presumably a 0-based to
 # 1-based field conversion - confirm).
 if not regressPCs:
     phener = ('cut -f' + str(int(exprcoldic[gene]) + 1) + ' -d" " ' +
               hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor +
               '.ordered.' + chrm + '.bimbam > ' + currfiles + '.pheno')
     # NOTE(review): ifier presumably executes the command string - confirm.
     ifier(phener)
 currgenos = []
 ####Pull genotypes for the SNPs in cis, if genotypes not already in dictionary: go to geno file and pull in appropriate data
 for snp in masterdic[gene]:
     try:
         # Reuse the row cached in genodic when this SNP was already loaded.
         currgenos.append(", ".join(genodic[snp]))
     except KeyError:
         #tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.imputed.' + chrm + '.txt.gz')
         #tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/' + mapper + '/ByChr/hutt.' + mapper + '.' + distance + '.' + chrm + '.txt.gz')
         tabixer = Tabixfile(
             '/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt' +
             mapper + '.' + chrm + '.txt.gz')
         # Take the first record returned for the SNP's interval, split on
         # tabs; an empty result raises IndexError here.
         genos = [
             x.split('\t') for x in tabixer.fetch(chrm, int(snpdic[snp][1]),
                                                  int(snpdic[snp][2]))
         ][0]
         tabixer.close()
         # Row layout: fourth column, the literals 'A' and 'G', then every
         # column from the seventh onwards.
         y = [genos[3], 'A', 'G'] + genos[6:len(genos)]
         # Cache the row so later genes sharing this SNP skip the lookup.
         genodic[snp] = y
         currgenos.append(", ".join(genodic[snp]))
 # Write one line per SNP to the .bimbam file.
 currbimbam = open(currfiles + '.bimbam', 'w')
 print >> currbimbam, "\n".join(currgenos)
 currbimbam.close()
 #print "Running GEMMA..."
 if regressPCs:
     gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles +
	# With PC regression, write the row of Yfit selected by this gene's index
	# to the .pheno file, one value per line.
	if regressPCs:
		numpy.savetxt(currfiles + '.pheno',Yfit[exprcoldic[gene],],delimiter='\n',fmt='%s')
	# Otherwise build a `cut` command that extracts the gene's field from the
	# space-delimited expression file and redirects it to the .pheno file.
	if not regressPCs:
		phener = ('cut -f' + str(int(exprcoldic[gene]) + 1) + ' -d" " ' +
			hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor +
			'.ordered.' + chrm + '.bimbam > ' + currfiles + '.pheno')
		# NOTE(review): ifier presumably executes the command string - confirm.
		ifier(phener)
	currgenos = []
	####Pull genotypes for the SNPs in cis, if genotypes not already in dictionary: go to geno file and pull in appropriate data
	for snp in masterdic[gene]:
		try:
			# Reuse the row cached in genodic when this SNP was already loaded.
			currgenos.append(", ".join(genodic[snp]))
		except KeyError:
			#tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.imputed.' + chrm + '.txt.gz')
			#tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/' + mapper + '/ByChr/hutt.' + mapper + '.' + distance + '.' + chrm + '.txt.gz')
			tabixer = Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt' + mapper + '.' + chrm + '.txt.gz')
			# First record returned for the SNP's interval, split on tabs; an
			# empty result raises IndexError here.
			genos = [x.split('\t') for x in tabixer.fetch(chrm,int(snpdic[snp][1]),int(snpdic[snp][2]))][0]
			tabixer.close()
			# Row layout: fourth column, the literals 'A' and 'G', then every
			# column from the seventh onwards.
			y = [genos[3], 'A', 'G'] + genos[6:len(genos)]
			# Cache the row so later genes sharing this SNP skip the lookup.
			genodic[snp] = y
			currgenos.append(", ".join(genodic[snp]))
	# Write one line per SNP to the .bimbam file.
	currbimbam = open(currfiles + '.bimbam','w')
	print >> currbimbam, "\n".join(currgenos)
	currbimbam.close()
	#print "Running GEMMA..."
	# The two GEMMA commands differ only in the covariate option: with PC
	# regression no -c file is passed, without it the .pcs.txt file is passed.
	if regressPCs:
		gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -lmm 4 -maf 0.05 -o curr_' + chrm + '_pc' + str(pcs) + '_' + correction)
		ifier(gemmer)
	if not regressPCs:
		gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.pcs.txt -lmm 4 -maf 0.05 -o curr_' + chrm + '_pc' + str(pcs) + '_' + correction)
		ifier(gemmer)