Esempio n. 1
0
    def configure(self,
                  line,
                  sample_id_to_population,
                  dir_launch='..',
                  ancestral_check=False):
        '''
        Initialise per-file state from a whitespace-separated column-label line.

        Sets self.n_lineages, self.column_index_to_population, self.refseq
        and self.human_chimp_differences, then configures every entry of
        self.included_regions with the same column labels. Progress
        messages are printed along the way.

        line: text whose whitespace-separated tokens are the column labels.
        sample_id_to_population: forwarded unchanged to
            get_column_index_to_population() and to each region's
            configure().
        dir_launch: forwarded to reference_sequence() and
            get_human_chimp_differences().
        ancestral_check: when truthy, load the differences via
            get_human_chimp_differences(); otherwise use an empty dict.
        '''
        headers = line.split()
        n_columns = len(headers)

        # Two lineages per column beyond the first nine
        # (assumes the nine leading columns are not samples -- TODO confirm).
        self.n_lineages = 2 * (n_columns - 9)

        print('\t{} columns'.format(n_columns))
        print('\tcolumn index to pop')
        self.column_index_to_population = get_column_index_to_population(
            headers, sample_id_to_population)

        self.refseq = reference_sequence(
            self.chrom, self.ref, dir_launch=dir_launch)
        print('len of reference seq: {}'.format(len(self.refseq)))

        if not ancestral_check:
            self.human_chimp_differences = {}
        else:
            print('\tdifferences to {}'.format(self.short))
            self.human_chimp_differences = get_human_chimp_differences(
                self.chrom, self.ref, self.short, dir_launch=dir_launch)

        print('\tconfigure regions')
        for region in self.included_regions:
            region.configure(headers, sample_id_to_population)
    def configure(self, line):
        '''
        Initialise per-file state from a whitespace-separated column-label line.

        Sets self.n_lineages, self.column_index_to_population, self.refseq
        and self.human_chimp_differences (the latter two looked up by
        self.chrom), then passes the column labels to every entry of
        self.included_regions.
        '''
        labels = line.split()

        # Two lineages per column beyond the first nine
        # (assumes the nine leading columns are not samples -- TODO confirm).
        self.n_lineages = (len(labels) - 9) * 2
        self.column_index_to_population = get_column_index_to_population(labels)

        chromosome = self.chrom
        self.refseq = reference_sequence(chromosome)
        self.human_chimp_differences = get_human_chimp_differences(chromosome)

        for region in self.included_regions:
            region.configure(labels)
def process_chromosome(chrom,
                       ref,
                       short,
                       vcf_file,
                       suff='chr',
                       vcf_dir='vcfs',
                       dir_launch='..',
                       outfile_dir='..'):
    '''
    Count mutations per haplotype for one chromosome and write the result.

    Steps:
        load the reference sequence with reference_sequence() and decode it;
        load the reference differences with get_human_chimp_differences();
        stream the gzipped VCF located at
            <dir_launch>/data/<vcf_dir>/<vcf_file><suff><chrom>.vcf.gz;
        on the '#CHROM' header line, take the sample ids (columns 10+) and
            reset the (mutation, haplotype_index) counters to zero;
        dispatch every later non-blank line to process_line();
        update the counters with update_counts();
        pass the totals to write_output().

    Lines before the '#CHROM' header and blank lines are ignored. Lines
    for which process_line() raises BadDataQualityError are skipped.

    NOTE(review): `mutations` is not defined in this function; it is
    presumably a module-level iterable of mutation labels -- verify.

    chrom, ref, short: forwarded to the reference/difference loaders;
        chrom is also part of the VCF file name and is given to
        write_output().
    vcf_file, suff, vcf_dir, dir_launch: components of the VCF path.
    outfile_dir: forwarded to write_output().

    Raises ValueError if the VCF contains no '#CHROM' header line.
    '''
    refseq = reference_sequence(chrom, ref, dir_launch=dir_launch)
    refseq = refseq.decode()

    chimp_alleles = get_human_chimp_differences(chrom,
                                                ref,
                                                short,
                                                dir_launch=dir_launch)

    gzip_path = ''.join(
        [dir_launch, '/data/', vcf_dir, '/', vcf_file, suff, chrom, '.vcf.gz'])

    # None until the '#CHROM' header has been seen.
    sample_ids = None
    n_lineages = 0
    mutation_counts = {}

    with gzip.open(gzip_path) as infile:
        for raw_line in infile:
            line = raw_line.decode()
            fields = line.split()

            # A blank line has no first field to inspect.
            if not fields:
                continue

            if fields[0] == '#CHROM':
                sample_ids = fields[9:]
                n_lineages = 2 * len(sample_ids)
                mutation_counts = {(mutation, haplotype_index): 0
                                   for haplotype_index in range(n_lineages)
                                   for mutation in mutations}
                # The header itself is not a genotype line; do not hand it
                # to process_line().
                continue

            # Everything before the header is file metadata.
            if sample_ids is None:
                continue

            try:
                derived_allele, derived_count, this_mut, alleles = process_line(
                    line, refseq, chimp_alleles)
            except BadDataQualityError:
                continue

            update_counts(alleles, mutation_counts, derived_count,
                          derived_allele, n_lineages, this_mut)

    if sample_ids is None:
        raise ValueError(
            "no '#CHROM' header line found in {}".format(gzip_path))

    write_output(mutation_counts, sample_ids, mutations, n_lineages, chrom,
                 outfile_dir)