def __iter__(self):
    """Yield the header and then the data rows of a tabix-indexed file.

    The file named by ``self.filename`` is opened with pysam. If
    ``self.header`` is set it is yielded as the header row; otherwise the
    last header line of the file (when there is one) is decoded as ASCII
    and split on tabs. Data rows come from a fetch restricted by
    ``self.reference``, ``self.start``, ``self.stop`` and ``self.region``
    and are yielded as tuples. The file is closed when iteration ends,
    whether normally or through an exception.
    """
    from pysam import Tabixfile, asTuple

    tabix = Tabixfile(self.filename, mode='r')
    try:
        # header row
        if self.header is None:
            # assume last header line has fields
            header_lines = list(tabix.header)
            if header_lines:
                last_line = text_type(header_lines[-1], encoding='ascii')
                yield tuple(last_line.split('\t'))
        else:
            yield self.header

        # data rows
        records = tabix.fetch(reference=self.reference,
                              start=self.start,
                              end=self.stop,
                              region=self.region,
                              parser=asTuple())
        for record in records:
            yield tuple(record)
    finally:
        tabix.close()
def __init__(self, task_queue, results_queue, family, args):
    """Store the queues, family and options this process works with.

    Args:
        task_queue: queue stored on ``self.task_queue``.
        results_queue: queue stored on ``self.results_queue``.
        family: stored on ``self.family``.
        args: options object; its ``verbose``, ``phased``, ``cadd_file``
            and ``chr_prefix`` attributes are read. The first element of
            ``cadd_file`` is used as the path of the CADD tabix file.
    """
    multiprocessing.Process.__init__(self)
    self.task_queue = task_queue
    self.family = family
    self.results_queue = results_queue
    self.verbosity = args.verbose
    self.phased = args.phased
    cadd_source = args.cadd_file[0]
    self.chr_prefix = args.chr_prefix
    # A truthy path is replaced by an open tabix handle whose rows are
    # parsed as tuples; a falsy value is stored unchanged.
    self.cadd_file = (Tabixfile(cadd_source, parser=asTuple())
                      if cadd_source else cadd_source)
def __init__(self, *filenames, **kwargs):  # data_format=None,printer=None):
    """Create a |TabixGenomeHash|

    Parameters
    ----------
    filenames : str or list of str
        Filename or list of filenames of `Tabix`_-compressed files

    data_format : str
        Format of tabix-compressed file(s). Choices are:
        `'GTF2'`,`'GFF3'`,`'BED'`,`'PSL'` (Default: `GTF2`)

    printer : file-like, optional
        Object whose ``write`` method receives the error message when
        `data_format` is not supported (Default: a ``NullWriter``)

    Raises
    ------
    ValueError
        If `data_format` is not a key of ``TabixGenomeHash._READERS``
    """
    from pysam import Tabixfile

    # Accept both TabixGenomeHash(a, b) and TabixGenomeHash([a, b])
    if len(filenames) == 1 and isinstance(filenames[0], list):
        filenames = filenames[0]

    self.filenames = list(multiopen(filenames))
    self.printer = kwargs.get("printer", NullWriter())
    data_format = kwargs.get("data_format", "GTF2")
    try:
        self._reader_class = TabixGenomeHash._READERS[data_format]
    except KeyError:
        # A failed mapping lookup raises KeyError; report it to callers as
        # a ValueError listing the supported formats.
        msg = "Supported file formats for TabixGenomeHash are: %s" % ", ".join(
            sorted(TabixGenomeHash._READERS.keys()))
        self.printer.write(msg)
        raise ValueError(msg)

    self.tabix_readers = [Tabixfile(X) for X in self.filenames]
def __init__(self, task_queue, results_queue, family, args):
    """Initialise the process with its queues, family and run options.

    Args:
        task_queue: queue kept as ``self.task_queue``.
        results_queue: queue kept as ``self.results_queue``.
        family: kept as ``self.family``.
        args: options object providing ``verbose``, ``phased``,
            ``cadd_file`` (indexed with ``[0]`` to get the CADD file path)
            and ``chr_prefix``.
    """
    multiprocessing.Process.__init__(self)

    # Work queues and the family under analysis
    self.task_queue = task_queue
    self.family = family
    self.results_queue = results_queue

    # Run options
    self.verbosity = args.verbose
    self.phased = args.phased
    cadd_path = args.cadd_file[0]
    self.chr_prefix = args.chr_prefix

    # Keep a falsy value as is; otherwise open the tabix file with rows
    # parsed as tuples.
    if cadd_path:
        self.cadd_file = Tabixfile(cadd_path, parser=asTuple())
    else:
        self.cadd_file = cadd_path
def __init__(self, chromosome, position, annotation_table_file):
    """Load the annotation record of one site from a tabix-indexed table.

    The first record returned for the interval ``[position - 1, position)``
    on ``chromosome`` is kept as ``self.line`` and its 32 tab-separated
    columns are unpacked into attributes of the same names. ``position``
    is then converted to ``int``; every other attribute stays a string.

    Args:
        chromosome: reference name passed to the tabix fetch.
        position: integer site position; ``position - 1`` is the fetch start.
        annotation_table_file: path of the tabix-indexed annotation table.

    Raises:
        StopIteration: if the table has no record for the site.
        ValueError: if the record does not have exactly 32 columns.
    """
    annotation_table = Tabixfile(annotation_table_file)
    try:
        records = annotation_table.fetch(reference=chromosome,
                                         start=position - 1,
                                         end=position)
        # The next() builtin works for both Python 2 and Python 3 iterators.
        self.line = next(records)
        (self.chromosome,
         self.position,
         self.reference_base,
         self.genic,
         self.exonic,
         self.intronic,
         self.intergenic,
         self.utr5,
         self.utr3,
         self.fold0,
         self.fold4,
         self.fold2,
         self.fold3,
         self.CDS,
         self.mRNA,
         self.rRNA,
         self.tRNA,
         self.feature_names,
         self.feature_types,
         self.feature_ID,
         self.cds_position,
         self.strand,
         self.frame,
         self.codon,
         self.aa,
         self.degen,
         self.FPKM,
         self.rho,
         self.FAIRE,
         self.recombination,
         self.mutability,
         self.quebec_alleles) = self.line.split('\t')
        self.position = int(self.position)
    finally:
        # Release the file handle even when the lookup or unpacking fails.
        annotation_table.close()
def __iter__(self):
    """Yield the header and then the data rows of a tabix-indexed file.

    pysam is imported lazily; if the import fails an
    ``UnsatisfiedDependency`` is raised with ``dep_message``. If
    ``self.header`` is set it is yielded as the header row; otherwise the
    last header line of the file (when there is one) is split on tabs.
    Data rows come from a fetch restricted by ``self.reference``,
    ``self.start``, ``self.end`` and ``self.region`` and are yielded as
    tuples. The file is closed when iteration ends, whether normally or
    through an exception.
    """
    try:
        from pysam import Tabixfile, asTuple
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    tabix = Tabixfile(self.filename, mode='r')
    try:
        # header row
        if self.header is None:
            # assume last header line has fields
            header_lines = list(tabix.header)
            if header_lines:
                yield tuple(header_lines[-1].split('\t'))
        else:
            yield self.header

        # data rows
        records = tabix.fetch(reference=self.reference,
                              start=self.start,
                              end=self.end,
                              region=self.region,
                              parser=asTuple())
        for record in records:
            yield tuple(record)
    finally:
        tabix.close()
def __iter__(self):
    """Iterate over a tabix-indexed file: header row first, then data rows.

    Raises ``UnsatisfiedDependency`` (with ``dep_message``) when pysam
    cannot be imported. The header row is ``self.header`` when that is set,
    else the tab-split last header line of the file, if it has any. Data
    rows are fetched for ``self.reference`` / ``self.start`` / ``self.end``
    / ``self.region`` and yielded as tuples. The file handle is always
    closed once the generator finishes or is torn down.
    """
    try:
        from pysam import Tabixfile, asTuple
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    handle = Tabixfile(self.filename, mode="r")
    try:
        # header row
        if self.header is None:
            # assume last header line has fields
            file_header = list(handle.header)
            if file_header:
                yield tuple(file_header[-1].split("\t"))
        else:
            yield self.header

        # data rows
        fetch_options = dict(
            reference=self.reference,
            start=self.start,
            end=self.end,
            region=self.region,
            parser=asTuple(),
        )
        for row in handle.fetch(**fetch_options):
            yield tuple(row)
    finally:
        handle.close()
class VariantConsumer(multiprocessing.Process):
    """Worker process that turns batches of variants into printable records.

    Batches are taken from ``task_queue``. Each batch is passed to
    ``genetic_models.check_genetic_models``, merged into one dictionary
    keyed by variant id, formatted by ``make_print_version`` and put on
    ``results_queue``. A ``None`` batch stops the process.
    """

    def __init__(self, task_queue, results_queue, family, args):
        """Store the queues, family and options this process works with.

        Args:
            task_queue: queue the batches are read from.
            results_queue: queue the processed batches are put on.
            family: object whose ``individuals`` are iterated when scoring.
            args: options object; ``verbose``, ``phased``, ``chr_prefix``
                and ``cadd_file`` are read. ``cadd_file[0]`` is the path
                of the CADD tabix file, or a falsy value for no CADD
                annotation.
        """
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.family = family
        self.results_queue = results_queue
        self.verbosity = args.verbose
        self.phased = args.phased
        self.cadd_file = args.cadd_file[0]
        self.chr_prefix = args.chr_prefix
        if self.cadd_file:
            self.cadd_file = Tabixfile(self.cadd_file, parser=asTuple())

    @staticmethod
    def _as_text(field):
        """Return a tabix field as a native string, decoding bytes as UTF-8."""
        if isinstance(field, bytes):
            field = field.decode('utf-8')
        return str(field)

    def fix_variants(self, variant_batch):
        """Merge a batch into one dictionary with a single entry per variant.

        Args:
            variant_batch: dictionary of the form
                ``{feature: {variant_id: variant_dict}}``.

        Returns:
            dict: ``{variant_id: variant_dict}``. The first occurrence of a
            variant is kept; the ``'Compounds'`` of later occurrences are
            merged into it, with already stored compound values winning on
            key clashes.
        """
        fixed_variants = {}
        for feature in variant_batch:
            for variant_id in variant_batch[feature]:
                variant = variant_batch[feature][variant_id]
                if variant_id not in fixed_variants:
                    fixed_variants[variant_id] = variant
                elif len(variant['Compounds']) > 0:
                    # We need to add compound information from different
                    # features
                    merged = dict(variant['Compounds'])
                    merged.update(fixed_variants[variant_id]['Compounds'])
                    fixed_variants[variant_id]['Compounds'] = merged
        return fixed_variants

    def get_cadd_score(self, variant):
        """Look up the CADD score of a variant.

        Only single-base REF/ALT variants are looked up, and only the first
        alternative allele is compared with the fourth column of the CADD
        records; the sixth column of the matching record is returned.

        Args:
            variant: dictionary with ``'CHROM'``, ``'POS'``, ``'REF'`` and
                ``'ALT'`` entries.

        Returns:
            str: the score, or ``'-'`` when no CADD file is configured, the
            variant is not a SNV, no record matches, or the lookup fails.
        """
        cadd_score = '-'
        alternatives = variant['ALT'].split(',')
        # CADD values are only for snps:
        is_snv = (max(len(alt) for alt in alternatives) == 1
                  and len(variant['REF']) == 1)
        if not (is_snv and self.cadd_file):
            return cadd_score
        cadd_key = int(variant['POS'])
        try:
            for tpl in self.cadd_file.fetch(str(variant['CHROM']),
                                            cadd_key - 1, cadd_key):
                # Fields may be bytes or text depending on the Python and
                # pysam versions; normalise before comparing/returning.
                if alternatives[0] == self._as_text(tpl[3]):
                    return self._as_text(tpl[5])
        except (IndexError, KeyError, ValueError) as e:
            # Best effort: a region the CADD file cannot serve (pysam
            # reports invalid regions with ValueError) leaves the score
            # at '-'.
            if self.verbosity:
                print(e, variant['CHROM'], variant['POS'])
        return cadd_score

    def make_print_version(self, variant_dict):
        """Get the variants ready for printing.

        Every variant dictionary is modified in place: the ``'Genotypes'``,
        ``'Compounds'``, ``'Inheritance_model'`` and ``'Annotation'``
        entries are removed and summarised as ``ANN=``, ``Comp=``, ``GM=``,
        ``MS=`` (and ``CADD=`` when a CADD file is configured) fields
        appended to ``'INFO'``. ``'CHROM'`` gets a ``'chr'`` prefix when
        ``self.chr_prefix`` is set.

        Args:
            variant_dict: dictionary ``{variant_id: variant_dict}``.
        """
        for variant_id in variant_dict:
            variant = variant_dict[variant_id]
            if self.cadd_file:
                variant['CADD'] = self.get_cadd_score(variant)
            # Remove the 'Genotypes' post since we will not need them for now
            variant.pop('Genotypes', 0)
            feature_list = variant['Annotation']

            if len(variant['Compounds']) > 0:
                # We do not want reference to itself as a compound:
                variant['Compounds'].pop(variant_id, 0)
                compounds_list = list(variant['Compounds'].keys())
            else:
                compounds_list = ['-']

            model_list = [model for model in variant['Inheritance_model']
                          if variant['Inheritance_model'][model]]
            if len(model_list) == 0:
                model_list = ['NA']

            model_score = '-'
            genotype_scores = []
            gt_info = variant['FORMAT'].split(':')
            for individual in self.family.individuals:
                gt_call = variant[individual].split(':')
                if len(gt_call) == 1:
                    gt_call = {'GT': gt_call[0]}
                else:
                    gt_call = dict(zip(gt_info, gt_call))
                if 'GQ' in gt_call:
                    try:
                        genotype_quality = float(gt_call['GQ'])
                    except ValueError:
                        # Missing quality (e.g. '.'): nothing to add
                        continue
                    # Add the error probabilities to genotype scores
                    genotype_scores.append(10 ** -(genotype_quality / 10))
            if len(genotype_scores) > 0:
                error_probability = 1 - reduce(
                    operator.mul, [1 - score for score in genotype_scores])
                # log10 is undefined for 0 (all error probabilities
                # underflowed); keep '-' in that case instead of crashing.
                if error_probability > 0:
                    model_score = str(round(-10 * log10(error_probability)))

            variant.pop('Compounds', 0)
            variant.pop('Inheritance_model', 0)
            variant.pop('Annotation', 0)

            vcf_info = variant['INFO'].split(';')

            if self.chr_prefix:
                variant['CHROM'] = 'chr' + variant['CHROM']

            # if we should include the annotation:
            vcf_info.append('ANN=' + ':'.join(feature_list))
            # if we should include compounds:
            vcf_info.append('Comp=' + ':'.join(compounds_list))
            # if we should include genetic models:
            vcf_info.append('GM=' + ':'.join(model_list))
            if model_list == ['NA']:
                model_score = '-'
            vcf_info.append('MS=' + model_score)
            if self.cadd_file:
                vcf_info.append('CADD=%s' % str(variant.pop('CADD', '-')))

            variant['INFO'] = ';'.join(vcf_info)
        return

    def run(self):
        """Consume batches from the task queue until ``None`` is received."""
        proc_name = self.name
        if self.verbosity:
            print('%s: Starting!' % proc_name)
        while True:
            # A batch is a dictionary on the form
            # {gene:{variant_id:variant_dict}}
            next_batch = self.task_queue.get()
            if next_batch is None:
                self.task_queue.task_done()
                if self.verbosity:
                    print('%s: Exiting' % proc_name)
                break
            genetic_models.check_genetic_models(next_batch, self.family,
                                                self.verbosity, self.phased,
                                                proc_name)
            # Make sure we only have one copy of each variant:
            fixed_variants = self.fix_variants(next_batch)

            # Now we want to make versions of the variants that are ready
            # for printing.
            self.make_print_version(fixed_variants)
            self.results_queue.put(fixed_variants)
            self.task_queue.task_done()
        return
class VariantConsumer(multiprocessing.Process):
    """Worker process that turns batches of variants into printable records.

    Batches are taken from ``task_queue``. Each batch is passed to
    ``genetic_models.check_genetic_models``, merged into one dictionary
    keyed by variant id, formatted by ``make_print_version`` and put on
    ``results_queue``. A ``None`` batch stops the process.
    """

    def __init__(self, task_queue, results_queue, family, args):
        """Store the queues, family and options this process works with.

        Args:
            task_queue: queue the batches are read from.
            results_queue: queue the processed batches are put on.
            family: object whose ``individuals`` are iterated when scoring.
            args: options object; ``verbose``, ``phased``, ``chr_prefix``
                and ``cadd_file`` are read. ``cadd_file[0]`` is the path
                of the CADD tabix file, or a falsy value for no CADD
                annotation.
        """
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.family = family
        self.results_queue = results_queue
        self.verbosity = args.verbose
        self.phased = args.phased
        self.cadd_file = args.cadd_file[0]
        self.chr_prefix = args.chr_prefix
        if self.cadd_file:
            self.cadd_file = Tabixfile(self.cadd_file, parser=asTuple())

    @staticmethod
    def _as_text(field):
        """Return a tabix field as a native string, decoding bytes as UTF-8."""
        if isinstance(field, bytes):
            field = field.decode('utf-8')
        return str(field)

    def fix_variants(self, variant_batch):
        """Merge a batch into one dictionary with a single entry per variant.

        Args:
            variant_batch: dictionary of the form
                ``{feature: {variant_id: variant_dict}}``.

        Returns:
            dict: ``{variant_id: variant_dict}``. The first occurrence of a
            variant is kept; the ``'Compounds'`` of later occurrences are
            merged into it, with already stored compound values winning on
            key clashes.
        """
        fixed_variants = {}
        for feature in variant_batch:
            for variant_id in variant_batch[feature]:
                variant = variant_batch[feature][variant_id]
                if variant_id not in fixed_variants:
                    fixed_variants[variant_id] = variant
                elif len(variant['Compounds']) > 0:
                    # We need to add compound information from different
                    # features
                    merged = dict(variant['Compounds'])
                    merged.update(fixed_variants[variant_id]['Compounds'])
                    fixed_variants[variant_id]['Compounds'] = merged
        return fixed_variants

    def get_cadd_score(self, variant):
        """Look up the CADD score of a variant.

        Only single-base REF/ALT variants are looked up, and only the first
        alternative allele is compared with the fourth column of the CADD
        records; the sixth column of the matching record is returned.

        Args:
            variant: dictionary with ``'CHROM'``, ``'POS'``, ``'REF'`` and
                ``'ALT'`` entries.

        Returns:
            str: the score, or ``'-'`` when no CADD file is configured, the
            variant is not a SNV, no record matches, or the lookup fails.
        """
        cadd_score = '-'
        alternatives = variant['ALT'].split(',')
        # CADD values are only for snps:
        is_snv = (max(len(alt) for alt in alternatives) == 1
                  and len(variant['REF']) == 1)
        if not (is_snv and self.cadd_file):
            return cadd_score
        cadd_key = int(variant['POS'])
        try:
            for tpl in self.cadd_file.fetch(str(variant['CHROM']),
                                            cadd_key - 1, cadd_key):
                # Fields may be bytes or text depending on the Python and
                # pysam versions; normalise before comparing/returning.
                if alternatives[0] == self._as_text(tpl[3]):
                    return self._as_text(tpl[5])
        except (IndexError, KeyError, ValueError) as e:
            # Best effort: a region the CADD file cannot serve (pysam
            # reports invalid regions with ValueError) leaves the score
            # at '-'.
            if self.verbosity:
                print(e, variant['CHROM'], variant['POS'])
        return cadd_score

    def make_print_version(self, variant_dict):
        """Get the variants ready for printing.

        Every variant dictionary is modified in place: the ``'Genotypes'``,
        ``'Compounds'``, ``'Inheritance_model'`` and ``'Annotation'``
        entries are removed and summarised as ``ANN=``, ``Comp=``, ``GM=``,
        ``MS=`` (and ``CADD=`` when a CADD file is configured) fields
        appended to ``'INFO'``. ``'CHROM'`` gets a ``'chr'`` prefix when
        ``self.chr_prefix`` is set.

        Args:
            variant_dict: dictionary ``{variant_id: variant_dict}``.
        """
        for variant_id in variant_dict:
            variant = variant_dict[variant_id]
            if self.cadd_file:
                variant['CADD'] = self.get_cadd_score(variant)
            # Remove the 'Genotypes' post since we will not need them for now
            variant.pop('Genotypes', 0)
            feature_list = variant['Annotation']

            if len(variant['Compounds']) > 0:
                # We do not want reference to itself as a compound:
                variant['Compounds'].pop(variant_id, 0)
                compounds_list = list(variant['Compounds'].keys())
            else:
                compounds_list = ['-']

            model_list = [model for model in variant['Inheritance_model']
                          if variant['Inheritance_model'][model]]
            if len(model_list) == 0:
                model_list = ['NA']

            model_score = '-'
            genotype_scores = []
            gt_info = variant['FORMAT'].split(':')
            for individual in self.family.individuals:
                gt_call = variant[individual].split(':')
                if len(gt_call) == 1:
                    gt_call = {'GT': gt_call[0]}
                else:
                    gt_call = dict(zip(gt_info, gt_call))
                if 'GQ' in gt_call:
                    try:
                        genotype_quality = float(gt_call['GQ'])
                    except ValueError:
                        # Missing quality (e.g. '.'): nothing to add
                        continue
                    # Add the error probabilities to genotype scores
                    genotype_scores.append(10 ** -(genotype_quality / 10))
            if len(genotype_scores) > 0:
                error_probability = 1 - reduce(
                    operator.mul, [1 - score for score in genotype_scores])
                # log10 is undefined for 0 (all error probabilities
                # underflowed); keep '-' in that case instead of crashing.
                if error_probability > 0:
                    model_score = str(round(-10 * log10(error_probability)))

            variant.pop('Compounds', 0)
            variant.pop('Inheritance_model', 0)
            variant.pop('Annotation', 0)

            vcf_info = variant['INFO'].split(';')

            if self.chr_prefix:
                variant['CHROM'] = 'chr' + variant['CHROM']

            # if we should include the annotation:
            vcf_info.append('ANN=' + ':'.join(feature_list))
            # if we should include compounds:
            vcf_info.append('Comp=' + ':'.join(compounds_list))
            # if we should include genetic models:
            vcf_info.append('GM=' + ':'.join(model_list))
            if model_list == ['NA']:
                model_score = '-'
            vcf_info.append('MS=' + model_score)
            if self.cadd_file:
                vcf_info.append('CADD=%s' % str(variant.pop('CADD', '-')))

            variant['INFO'] = ';'.join(vcf_info)
        return

    def run(self):
        """Consume batches from the task queue until ``None`` is received."""
        proc_name = self.name
        if self.verbosity:
            print('%s: Starting!' % proc_name)
        while True:
            # A batch is a dictionary on the form
            # {gene:{variant_id:variant_dict}}
            next_batch = self.task_queue.get()
            if next_batch is None:
                self.task_queue.task_done()
                if self.verbosity:
                    print('%s: Exiting' % proc_name)
                break
            genetic_models.check_genetic_models(next_batch, self.family,
                                                self.verbosity, self.phased,
                                                proc_name)
            # Make sure we only have one copy of each variant:
            fixed_variants = self.fix_variants(next_batch)

            # Now we want to make versions of the variants that are ready
            # for printing.
            self.make_print_version(fixed_variants)
            self.results_queue.put(fixed_variants)
            self.task_queue.task_done()
        return
# Script fragment (Python 2 syntax): build a .bimbam genotype file for the
# SNPs in masterdic and then launch GEMMA on it.

# Collect the second whitespace-separated field of every line of the
# already-open genofinfile.
genofins = []
for line in genofinfile:
    genofins.append(line.strip().split()[1])

genofinfile.close()
# Position of each entry of officialfindivs in genofins, shifted by 6 so it
# can index a genotype record whose first six columns are taken separately
# below (range(0,6)).
genoinds = [genofins.index(x) + 6 for x in officialfindivs]
y = {}
currbimbam = open(currfiles + '.bimbam','w')
#t0 = time.time()
for snp in masterdic.keys():
#for snp in masterdic.keys()[0:1000]:
    chrm = masterdic[snp][0]
    # NOTE(review): 'chrm' looks like a header value being skipped -- confirm.
    if chrm == 'chrm':
        continue
    # A tabix handle on the per-chromosome genotype file is opened and closed
    # again for every SNP.
    tabixer = Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.all.imputed.' + chrm + '.txt.gz')
    # First record overlapping the SNP interval, split into columns; raises
    # IndexError if the fetch returns nothing.
    tempgenos = [x.split('\t') for x in tabixer.fetch(chrm,int(masterdic[snp][1])-1,int(masterdic[snp][2]))][0]
    # Python 2 only: range() returns a list here, so + concatenates the six
    # leading column indices with the selected individual columns.
    genos = [tempgenos[x] for x in range(0,6) + genoinds]
    tabixer.close()
    y[snp] = [genos[3], 'A', 'G'] + genos[6:]
    # NOTE(review): y is a dict, so ", ".join(y) joins its keys (every SNP
    # stored so far), not the list y[snp] built just above -- confirm this is
    # intended. The nesting of this statement inside the loop was
    # reconstructed from flattened source -- confirm.
    print >> currbimbam, ", ".join(y)

#t1 = time.time()
#print t1-t0
currbimbam.close()
#genomat = matrix_reader(genodir + 'hutt.imputed.dhssnps.bimbam',sep=",")
print "Running GEMMA..."
# Command line for gemma0.94, built from the files sharing the currfiles
# prefix (.bimbam, .pheno, .square.txt, .covariates).
gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.covariates -lmm 4 -maf 0.05 -o curr_' + pheno)
t0 = time.time()
# NOTE(review): ifier is defined outside this view; presumably it executes the
# command string -- confirm.
ifier(gemmer)
fmt='%s') if not regressPCs: phener = ('cut -f' + str(int(exprcoldic[gene]) + 1) + ' -d" " ' + hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor + '.ordered.' + chrm + '.bimbam > ' + currfiles + '.pheno') ifier(phener) currgenos = [] ####Pull genotypes for the SNPs in cis, if genotypes not already in dictionary: go to geno file and pull in appropriate data for snp in masterdic[gene]: try: currgenos.append(", ".join(genodic[snp])) except KeyError: #tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.imputed.' + chrm + '.txt.gz') #tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/' + mapper + '/ByChr/hutt.' + mapper + '.' + distance + '.' + chrm + '.txt.gz') tabixer = Tabixfile( '/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt' + mapper + '.' + chrm + '.txt.gz') genos = [ x.split('\t') for x in tabixer.fetch(chrm, int(snpdic[snp][1]), int(snpdic[snp][2])) ][0] tabixer.close() y = [genos[3], 'A', 'G'] + genos[6:len(genos)] genodic[snp] = y currgenos.append(", ".join(genodic[snp])) currbimbam = open(currfiles + '.bimbam', 'w') print >> currbimbam, "\n".join(currgenos) currbimbam.close() #print "Running GEMMA..." if regressPCs: gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles +
# Script fragment (Python 2 syntax): write the phenotype and genotype files
# for the current gene and launch GEMMA on them.
# NOTE(review): gene, chrm and the dictionaries used here are defined outside
# this view; this looks like the body of a per-gene loop -- confirm.

# Phenotype file: taken from row exprcoldic[gene] of Yfit when regressPCs is
# set, otherwise cut out of the ordered expression file by a shell command.
if regressPCs:
    numpy.savetxt(currfiles + '.pheno',Yfit[exprcoldic[gene],],delimiter='\n',fmt='%s')

if not regressPCs:
    phener = ('cut -f' + str(int(exprcoldic[gene]) + 1) + ' -d" " ' + hmdir + '500HT/Exprs/qqnorm.500ht' + gccor + covcor + '.ordered.' + chrm + '.bimbam > ' + currfiles + '.pheno')
    # NOTE(review): ifier is defined outside this view; presumably it executes
    # the command string -- confirm.
    ifier(phener)

currgenos = []
####Pull genotypes for the SNPs in cis, if genotypes not already in dictionary: go to geno file and pull in appropriate data
for snp in masterdic[gene]:
    try:
        currgenos.append(", ".join(genodic[snp]))
    except KeyError:
        # SNP not cached in genodic yet: read it from the per-chromosome
        # tabix file and cache the resulting row.
        #tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt.imputed.' + chrm + '.txt.gz')
        #tabixer = pysam.Tabixfile('/mnt/lustre/home/cusanovich/500HT/' + mapper + '/ByChr/hutt.' + mapper + '.' + distance + '.' + chrm + '.txt.gz')
        tabixer = Tabixfile('/mnt/lustre/home/cusanovich/500HT/Imputed1415/ByChr/hutt' + mapper + '.' + chrm + '.txt.gz')
        # First record returned for the SNP interval, split into columns;
        # raises IndexError if the fetch returns nothing.
        genos = [x.split('\t') for x in tabixer.fetch(chrm,int(snpdic[snp][1]),int(snpdic[snp][2]))][0]
        tabixer.close()
        # Output row: column 3 of the record, the literals 'A' and 'G', then
        # every column from index 6 on.
        y = [genos[3], 'A', 'G'] + genos[6:len(genos)]
        genodic[snp] = y
        currgenos.append(", ".join(genodic[snp]))

# One line per SNP in the .bimbam file.
currbimbam = open(currfiles + '.bimbam','w')
print >> currbimbam, "\n".join(currgenos)
currbimbam.close()
#print "Running GEMMA..."
# The two GEMMA command lines differ only in the extra
# "-c <currfiles>.pcs.txt" option used when regressPCs is not set.
if regressPCs:
    gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -lmm 4 -maf 0.05 -o curr_' + chrm + '_pc' + str(pcs) + '_' + correction)
    ifier(gemmer)

if not regressPCs:
    gemmer = (hmdir + 'Programs/gemma0.94 -g ' + currfiles + '.bimbam -p ' + currfiles + '.pheno -k ' + currfiles + '.square.txt -c ' + currfiles + '.pcs.txt -lmm 4 -maf 0.05 -o curr_' + chrm + '_pc' + str(pcs) + '_' + correction)
    ifier(gemmer)