def configure(self, line, sample_id_to_population, dir_launch='..',
              ancestral_check=False):
    '''
    Set up this object from a line of whitespace-separated column labels.

    Populates n_lineages, column_index_to_population, refseq and
    human_chimp_differences on self, then configures every entry of
    self.included_regions with the same column labels.  Progress
    messages are printed along the way.

    line: string of whitespace-separated column labels
    sample_id_to_population: mapping forwarded unchanged to
        get_column_index_to_population() and to each region's configure()
    dir_launch: forwarded to reference_sequence() and
        get_human_chimp_differences()
    ancestral_check: when true, load the differences to self.short;
        otherwise human_chimp_differences is left as an empty dict
    '''
    labels = line.split()
    n_columns = len(labels)

    # Two lineages for every column after the first nine
    # (presumably the fixed, non-sample columns -- confirm against caller).
    self.n_lineages = 2 * (n_columns - 9)
    print('\t{} columns'.format(n_columns))

    print('\tcolumn index to pop')
    self.column_index_to_population = get_column_index_to_population(
        labels, sample_id_to_population)

    self.refseq = reference_sequence(self.chrom, self.ref,
                                     dir_launch=dir_launch)
    print('len of reference seq: {}'.format(len(self.refseq)))

    if not ancestral_check:
        self.human_chimp_differences = {}
    else:
        print('\tdifferences to {}'.format(self.short))
        self.human_chimp_differences = get_human_chimp_differences(
            self.chrom, self.ref, self.short, dir_launch=dir_launch)

    print('\tconfigure regions')
    for region in self.included_regions:
        region.configure(labels, sample_id_to_population)
def configure(self, line):
    '''
    Set up this object from a line of whitespace-separated column labels.

    Populates n_lineages, column_index_to_population, refseq and
    human_chimp_differences on self, then configures every entry of
    self.included_regions with the same column labels.

    line: string of whitespace-separated column labels
    '''
    labels = line.split()

    # Two lineages for every column after the first nine
    # (presumably the fixed, non-sample columns -- confirm against caller).
    n_sample_columns = len(labels) - 9
    self.n_lineages = 2 * n_sample_columns

    self.column_index_to_population = get_column_index_to_population(labels)
    self.refseq = reference_sequence(self.chrom)
    self.human_chimp_differences = get_human_chimp_differences(self.chrom)

    for region in self.included_regions:
        region.configure(labels)
def process_chromosome(chrom, ref, short, vcf_file, suff='chr', vcf_dir='vcfs',
                       dir_launch='..', outfile_dir='..'):
    '''
    Platform function: count mutations per haplotype for one chromosome.

    Steps:
      1. load the reference sequence with reference_sequence() and decode it;
      2. load the human/chimp differences with get_human_chimp_differences();
      3. stream the gzipped VCF, ignoring every line up to and including
         the '#CHROM' header row, which supplies the sample ids;
      4. dispatch each later line to process_line() and fold the result
         into the count table with update_counts(); lines for which
         process_line() raises BadDataQualityError are skipped;
      5. hand the count table to write_output().

    chrom: chromosome label; used in the VCF file name and passed to helpers
    ref, short: passed through to reference_sequence() /
        get_human_chimp_differences()
    vcf_file, suff: file-name pieces; the VCF is read from
        <dir_launch>/data/<vcf_dir>/<vcf_file><suff><chrom>.vcf.gz
    vcf_dir: sub-directory of <dir_launch>/data/ holding the VCF
    dir_launch: base directory for the input data
    outfile_dir: passed through to write_output()

    Raises ValueError if the VCF contains no '#CHROM' header row.
    '''
    refseq = reference_sequence(chrom, ref, dir_launch=dir_launch).decode()
    chimp_alleles = get_human_chimp_differences(chrom, ref, short,
                                                dir_launch=dir_launch)

    gzip_path = ''.join(
        [dir_launch, '/data/', vcf_dir, '/', vcf_file, suff, chrom, '.vcf.gz'])

    # sample_ids stays None until the '#CHROM' header row has been seen.
    sample_ids = None
    n_lineages = 0
    mutation_counts = {}

    with gzip.open(gzip_path) as infile:
        for raw_line in infile:
            line = raw_line.decode()

            # A blank line carries no record; indexing its split() would fail.
            if not line.strip():
                continue

            if sample_ids is None:
                fields = line.split()
                if fields[0] == '#CHROM':
                    # Sample ids follow the nine leading columns;
                    # two lineages (haplotypes) are counted per sample.
                    sample_ids = fields[9:]
                    n_lineages = 2 * len(sample_ids)
                    # NOTE(review): `mutations` is not defined in this
                    # function; it is expected to exist at module level.
                    mutation_counts = {(mutation, haplotype_index): 0
                                       for haplotype_index in range(n_lineages)
                                       for mutation in mutations}
                # Meta lines and the header row itself are not genotype
                # records, so they are never dispatched to process_line().
                continue

            try:
                derived_allele, derived_count, this_mut, alleles = process_line(
                    line, refseq, chimp_alleles)
            except BadDataQualityError:
                continue
            update_counts(alleles, mutation_counts, derived_count,
                          derived_allele, n_lineages, this_mut)

    if sample_ids is None:
        raise ValueError(
            "no '#CHROM' header row found in {}".format(gzip_path))

    write_output(mutation_counts, sample_ids, mutations, n_lineages, chrom,
                 outfile_dir)