Ejemplo n.º 1
0
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference):
    """Tally the allele strings observed in the reads of one contig.

    For every read of ``curr_reference`` (duplicates and unmapped reads
    excluded by samtools), the bases at each variant position of
    ``vcf_record_in_one_contig`` are concatenated into one allele string and
    counted both under the read's sample (RG tag) and under the 'all' key.
    After filtering, the contig name is printed when the counts look invalid:
    any sample with more than two alleles, or more than four alleles overall.

    :param vcf_record_in_one_contig: dict keyed by 1-based variant position.
    :param sample_names: sample names used to seed the per-sample tallies.
    :param curr_reference: contig/region name passed to samtools view.

    NOTE(review): reads the module-level global ``bam_file`` -- confirm it is
    set before this function is called.
    """
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    # -F 1028 skips reads flagged 0x4 (unmapped) or 0x400 (PCR/optical dup).
    command = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record = Sam_record(line)
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        # NOTE: a genotype-quality (>20) filter per position was previously
        # disabled here; every position is taken unconditionally.
        # Build the allele string once per read instead of joining twice.
        allele = ''.join(sequence[position - 1]
                         for position in vcf_record_in_one_contig.keys())
        count_with_hash(sample_to_allele[sample], allele)
        count_with_hash(sample_to_allele['all'], allele)
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    all_alleles = set()
    valid = True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(alleles.keys())
        if len(alleles) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        # print(x) with a single argument behaves identically in py2 and py3.
        print(curr_reference)
Ejemplo n.º 2
0
def process_alleles(vcf_record_in_one_contig, sample_names,curr_reference):
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    command = "samtools view -F 1028 %s %s"%(bam_file,curr_reference)
    stream,process=utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record=Sam_record(line)
        allele_array=[]
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        for position in vcf_record_in_one_contig.keys():
            #if vcf_record_in_one_contig.get(position).get_genotype_quality(sample)>20:
            allele_array.append(sequence[position-1])
            #else:
            #    allele_array.append('.')
        count_with_hash(sample_to_allele[sample], ''.join(allele_array))
        count_with_hash(sample_to_allele['all'], ''.join(allele_array))
    process.wait()
    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)
    all_alleles=set()
    valid=True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(set(alleles.keys()))
        if len(alleles)>2:
            valid=False
    if len(all_alleles)>4:
        valid=False
    if not valid:
        print curr_reference
Ejemplo n.º 3
0
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Stream ``bam_file`` through ``samtools view -h -F 132`` and tally,
    per contig and per sample (resolved from the @RG header lines), the
    number of counted reads and of duplicate reads, pushing each finished
    contig's totals into ``all_contigs_info.add_values(...)``.

    NOTE(review): if the stream is empty, ``line`` is never bound and
    ``Sam_record(line.strip())`` below raises NameError; if the file is
    header-only, ``line`` still holds a header line at that point -- confirm
    inputs always contain at least one read.
    """
    # -F 132 filters out reads flagged 0x4 (unmapped) or 0x80 (second in pair).
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)  # NOTE(review): unused below
    read_groups = {}  # read-group ID -> sample name
    try:
        # Header pass: stops (via break) at the first non-header line, which
        # stays bound to `line` for the "first read" handling further down.
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    # Fields look like "ID:x" / "SM:x" / "LB:x"; [3:] drops the tag.
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                # Prefer sample (SM), then library (LB), then the ID itself.
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id
        # Per-sample running counters, reset at every contig boundary.
        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0
            # process the first read
            # if line.startswith("@"):
        #    #Still in the header. There's no read, exit
        #    return
        # Process the first read, left over from the header loop above.
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)] += 1
            all_sample_coverage[read_groups.get(rg_id)] += 1
        i = 1
        # process all the others
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                # Progress marker every million reads.
                print i
            sam_record = Sam_record(line.strip())
            # Contig boundary: flush the finished contig's per-sample counts
            # and reset the counters (records are assumed contig-sorted).
            if current_contig != sam_record.get_reference_name() and current_contig != None:
                for sample in read_groups.values():
                    all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                                all_sample_duplicate.get(sample), sample=sample)
                    all_sample_coverage[sample] = 0
                    all_sample_duplicate[sample] = 0
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1
        # Flush the final contig's counters.
        if current_contig != None:
            for sample in read_groups.values():
                all_contigs_info.add_values(current_contig, all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample), sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0
    finally:
        open_stream.close()
Ejemplo n.º 4
0
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info,
                                                samtools_bin):
    """Stream ``bam_file`` through ``samtools view -h -F 132`` and tally,
    per contig and per sample (resolved from the @RG header lines), the
    number of counted reads and of duplicate reads, pushing each finished
    contig's totals into ``all_contigs_info.add_values(...)``.

    :param bam_file: path to the bam file to process.
    :param all_contigs_info: accumulator receiving per-contig counts via its
        ``add_values(contig, coverage, duplicates, sample=...)`` method.
    :param samtools_bin: path to the samtools executable.
    """
    # -F 132 filters out reads flagged 0x4 (unmapped) or 0x80 (second in pair).
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}  # read-group ID -> sample name
    try:
        # Header pass: stops at the first non-header line, which stays bound
        # to `line` for the "first read" handling below.
        line = None
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                _parse_read_group_line(line, read_groups)
        if line is None or line.startswith("@"):
            # Empty stream or header-only file: there is no read to count.
            # (The original crashed here with NameError / a mis-parsed record.)
            return
        # Per-sample running counters, reset at every contig boundary.
        all_sample_coverage = dict((s, 0) for s in read_groups.values())
        all_sample_duplicate = dict((s, 0) for s in read_groups.values())
        # Process the first read, left over from the header loop.
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        _count_read(sam_record, read_groups, all_sample_coverage,
                    all_sample_duplicate)
        i = 1
        # Process all the other reads (records are assumed contig-sorted).
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                # Progress marker every million reads.
                print(i)
            sam_record = Sam_record(line.strip())
            contig = sam_record.get_reference_name()
            if current_contig != contig and current_contig is not None:
                # Contig boundary: flush and reset the per-sample counters.
                _flush_contig(current_contig, read_groups, all_sample_coverage,
                              all_sample_duplicate, all_contigs_info)
            current_contig = contig
            _count_read(sam_record, read_groups, all_sample_coverage,
                        all_sample_duplicate)
        # Flush the final contig's counters.
        if current_contig is not None:
            _flush_contig(current_contig, read_groups, all_sample_coverage,
                          all_sample_duplicate, all_contigs_info)
    finally:
        open_stream.close()


def _parse_read_group_line(line, read_groups):
    """Record one @RG header line in `read_groups`, mapping the group ID to
    the sample (SM), else the library (LB), else the ID itself."""
    rg_id = rg_sample = rg_library = None
    for value in line.strip().split():
        # Fields look like "ID:x" / "SM:x" / "LB:x"; [3:] drops the tag.
        if value.startswith("ID"):
            rg_id = value[3:]
        elif value.startswith("SM"):
            rg_sample = value[3:]
        elif value.startswith("LB"):
            rg_library = value[3:]
    if rg_id:
        read_groups[rg_id] = rg_sample or rg_library or rg_id


def _count_read(sam_record, read_groups, all_sample_coverage,
                all_sample_duplicate):
    """Add one mapped read to its sample's coverage (and duplicate) tally."""
    if not sam_record.is_unmapped():
        sample = read_groups.get(sam_record.get_tag("RG"))
        if sam_record.is_duplicate_read():
            all_sample_duplicate[sample] += 1
        all_sample_coverage[sample] += 1


def _flush_contig(contig, read_groups, all_sample_coverage,
                  all_sample_duplicate, all_contigs_info):
    """Report the accumulated counts for `contig` and reset them to zero."""
    for sample in read_groups.values():
        all_contigs_info.add_values(contig,
                                    all_sample_coverage.get(sample),
                                    all_sample_duplicate.get(sample),
                                    sample=sample)
        all_sample_coverage[sample] = 0
        all_sample_duplicate[sample] = 0
Ejemplo n.º 5
0
def process_single_samtools_run_with_read_group(bam_file,all_contigs_info,samtools_bin):
    """Per-locus variant of the read-group tally: stream ``bam_file`` through
    ``samtools view -h -F 132`` and, per contig, per sample and per locus key
    (``str(get_loci_from_read(...))``), count reads, duplicates, and the
    distinct read sequences observed, reporting each finished contig via
    ``all_contigs_info.add_values(...)``.

    NOTE(review): if the stream is empty, ``line`` is never bound and
    ``Sam_record(line.strip())`` below raises NameError; if the file is
    header-only, ``line`` still holds a header line at that point -- confirm
    inputs always contain at least one read.
    """
    # -F 132 filters out reads flagged 0x4 (unmapped) or 0x80 (second in pair).
    command="%s view -h -F 132 %s"%(samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig=None
    sample_name, ext = os.path.splitext(bam_file)  # NOTE(review): unused below
    read_groups={}  # read-group ID -> sample name
    try:
        # Header pass: stops (via break) at the first non-header line, which
        # stays bound to `line` for the "first read" handling further down.
        for line in open_stream:
            if not line.startswith("@"):
                break
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id=rg_sample=rg_library=None
                for value in sp_line:
                    # Fields look like "ID:x" / "SM:x" / "LB:x"; [3:] drops the tag.
                    if value.startswith("ID"):
                        rg_id=value[3:]
                    elif value.startswith("SM"):
                        rg_sample=value[3:]
                    elif value.startswith("LB"):
                        rg_library=value[3:]
                # Prefer sample (SM), then library (LB), then the ID itself.
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id]=rg_sample
                    elif rg_library:
                        read_groups[rg_id]=rg_library
                    else:
                        read_groups[rg_id]=rg_id
        # Per-sample structures, reset at every contig boundary:
        #   all_sample_coverage[sample][locus]       -> read count
        #   all_sample_duplicate[sample][locus]      -> duplicate count
        #   all_sample_coverage_reads[sample][locus] -> Counter of read sequences
        all_sample_coverage={}
        all_sample_coverage_reads = {}
        all_sample_duplicate={}
        for sample in read_groups.values():
            all_sample_coverage[sample]=Counter()
            all_sample_duplicate[sample]=Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)
        #process the first read (left over from the header loop above)
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        if not sam_record.is_unmapped():
            rg_id = sam_record.get_tag("RG")
            read_sequence = sam_record.get_query_sequence()
            loci = get_loci_from_read(sam_record)
            if sam_record.is_duplicate_read():
                all_sample_duplicate[read_groups.get(rg_id)][str(loci)]+=1
            all_sample_coverage[read_groups.get(rg_id)][str(loci)]+=1
            all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] +=1
        i=1
        #process all the others (records are assumed contig-sorted)
        for line in open_stream:
            i+=1
            if i%1000000==0:
                # Progress marker every million reads.
                print i
            sam_record = Sam_record(line.strip())
            # Contig boundary: flush every sample's per-locus counts for the
            # finished contig, then reset all three structures.
            if current_contig != sam_record.get_reference_name() and current_contig != None:
                for sample in read_groups.values():
                    for loci in all_sample_coverage.get(sample):
                        alleles = all_sample_coverage_reads[sample].get(loci)
                        all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                    all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                    sample=sample)

                    all_sample_coverage[sample]=Counter()
                    all_sample_duplicate[sample]=Counter()
                    all_sample_coverage_reads[sample] = defaultdict(Counter)
            current_contig = sam_record.get_reference_name()

            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                loci = get_loci_from_read(sam_record)
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)][str(loci)]+=1
                all_sample_coverage[read_groups.get(rg_id)][str(loci)]+=1
                all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] +=1
        # Flush the final contig's per-locus counts.
        if current_contig != None:
            for sample in read_groups.values():
                for loci in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(loci)
                    all_contigs_info.add_values(current_contig, loci, all_sample_coverage.get(sample).get(loci, 0),
                                                all_sample_duplicate.get(sample).get(loci, 0), alleles=alleles,
                                                sample=sample)
                all_sample_coverage[sample]=Counter()
                all_sample_duplicate[sample]=Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)
    finally:
        open_stream.close()