def get_readgroup_and_seq_dict_from_bam(bam_files, allow_collision=False):
    """Collect the @RG and @SQ header entries of several bam files.

    The header of each file is read with "samtools view -H". Read groups are
    keyed on their ID tag and the first definition seen is kept. Sequences
    are keyed on their SN tag, kept in order of first appearance, and their
    LN value is converted to an int.

    @param bam_files: the bam files whose headers are read.
    @param allow_collision: when False, a sequence name seen more than once
    raises; when True the later entry replaces the earlier one.
    @return: a tuple (read group dictionaries, sequence dictionaries).
    @raise StandardError: if a sequence name is repeated and allow_collision
    is False.
    """
    print("Gather seq dict and read groups from %s bam files" % len(bam_files))
    all_read_groups = {}
    all_seq_dict = OrderedDict()
    for bam_file in bam_files:
        command = "samtools view -H %s | egrep '@RG|@SQ' " % bam_file
        stdout, process = utils_commands.get_output_stream_from_command(command)
        for line in stdout:
            if line.startswith('@RG'):
                read_group_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@RG':
                        # Split on the first colon only: values such as
                        # timestamps or descriptions may contain colons.
                        key, value = element.split(':', 1)
                        read_group_dict[key] = value
                if 'ID' in read_group_dict and read_group_dict.get('ID') not in all_read_groups:
                    all_read_groups[read_group_dict.get('ID')] = read_group_dict
            if line.startswith('@SQ'):
                seq_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@SQ':
                        # Same as above: a UR value is a URI and holds colons.
                        key, value = element.split(':', 1)
                        if key == 'LN':
                            value = int(value)
                        seq_dict[key] = value
                if 'SN' in seq_dict:
                    name = seq_dict.get('SN')
                    if name in all_seq_dict and not allow_collision:
                        raise StandardError(
                            "Identical sequence dictionary name %s in %s and previous bam entry and collision not allowed" % (
                                name, bam_file))
                    all_seq_dict[name] = seq_dict
    return all_read_groups.values(), all_seq_dict.values()
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference):
    """Count, per sample, the alleles carried by the reads of one contig.

    The reads aligned to curr_reference are streamed with
    "samtools view -F 1028". For each read, the bases found at the positions
    listed in vcf_record_in_one_contig (position - 1 is used as the index in
    the read sequence) are joined into one allele string. That string is
    counted for the RG tag of the read and for the 'all' entry.

    The counts are printed before and after filter_alleles is applied.
    curr_reference is printed when one entry holds more than two alleles or
    when more than four distinct alleles are seen over all entries.

    NOTE(review): bam_file is not a parameter of this function, it has to be
    resolved from the enclosing module -- confirm where it is set.
    """
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    command = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record = Sam_record(line)
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        # One base per variant position, in the iteration order of the records
        allele = ''.join(
            sequence[position - 1] for position in vcf_record_in_one_contig.keys())
        count_with_hash(sample_to_allele[sample], allele)
        count_with_hash(sample_to_allele['all'], allele)
    process.wait()

    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)

    all_alleles = set()
    valid = True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(alleles.keys())
        if len(alleles) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        print(curr_reference)
def get_readgroup_and_seq_dict_from_bam(bam_files, allow_collision=False):
    """Collect the @RG and @SQ header entries of several bam files.

    The header of each file is read with "samtools view -H". Read groups are
    keyed on their ID tag and the first definition seen is kept. Sequences
    are keyed on their SN tag, kept in order of first appearance, and their
    LN value is converted to an int.

    @param bam_files: the bam files whose headers are read.
    @param allow_collision: when False, a sequence name seen more than once
    raises; when True the later entry replaces the earlier one.
    @return: a tuple (read group dictionaries, sequence dictionaries).
    @raise StandardError: if a sequence name is repeated and allow_collision
    is False.
    """
    print("Gather seq dict and read groups from %s bam files" % len(bam_files))
    all_read_groups = {}
    all_seq_dict = OrderedDict()
    for bam_file in bam_files:
        command = "samtools view -H %s | egrep '@RG|@SQ' " % bam_file
        stdout, process = utils_commands.get_output_stream_from_command(command)
        for line in stdout:
            if line.startswith('@RG'):
                read_group_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@RG':
                        # Split on the first colon only: values such as
                        # timestamps or descriptions may contain colons.
                        key, value = element.split(':', 1)
                        read_group_dict[key] = value
                if 'ID' in read_group_dict and read_group_dict.get('ID') not in all_read_groups:
                    all_read_groups[read_group_dict.get('ID')] = read_group_dict
            if line.startswith('@SQ'):
                seq_dict = {}
                for element in line.strip().split('\t'):
                    if element != '@SQ':
                        # Same as above: a UR value is a URI and holds colons.
                        key, value = element.split(':', 1)
                        if key == 'LN':
                            value = int(value)
                        seq_dict[key] = value
                if 'SN' in seq_dict:
                    name = seq_dict.get('SN')
                    if name in all_seq_dict and not allow_collision:
                        raise StandardError(
                            "Identical sequence dictionary name %s in %s and previous bam entry and collision not allowed" % (
                                name, bam_file))
                    all_seq_dict[name] = seq_dict
    return all_read_groups.values(), all_seq_dict.values()
def createDirectories(baseDir, directories, server=None):
    """
    This function creates the directories listed in the directories array.
    Directories that already exist are left untouched.
    @param baseDir: the parent directory.
    @param directories: the list of directories to create, joined to baseDir
    one after the other.
    @param server: the server on which the directories will be created
    through ssh. When None they are created on the local file system.
    """
    for directory in directories:
        dir_path = os.path.join(baseDir, directory)
        if server is not None:
            command = 'ssh %s "ls -d1 %s"' % (server, dir_path)
            stream, process = utils_commands.get_output_stream_from_command(
                command, logger_name=None)
            # The directory is considered present when ls prints its path back
            found = False
            for line in stream:
                if line.strip() == dir_path:
                    found = True
                    break
            if not found:
                logging.info('%s does not exists on %s: create it' % (dir_path, server))
                command = 'ssh %s "mkdir %s"' % (server, dir_path)
                utils_commands.launchCommandLocally(command)
        elif not os.path.exists(dir_path):
            logging.info('%s does not exists: create it' % dir_path)
            # 0o775 is the octal spelling accepted by python 2.6+ and python 3
            os.mkdir(dir_path, 0o775)
def check_file_or_dir(filePath, server=None):
    """Tell whether the given path is a file or a directory.

    @param filePath: the path to check.
    @param server: when set, the path is checked on that server with
    "ls -ld" through ssh instead of on the local file system.
    @return: 'file' or 'dir'. False when the local path is neither a file
    nor a directory, or when the remote ls printed nothing.
    """
    if not server:
        if os.path.isfile(filePath):
            return 'file'
        if os.path.isdir(filePath):
            return 'dir'
        return False

    command = 'ssh %s "ls -ld %s"' % (server, filePath)
    stream, process = utils_commands.get_output_stream_from_command(
        command, logger_name=None)
    for line in stream:
        line = line.strip()
        if line:
            # Only the first non-empty line is looked at: a long listing
            # starting with 'd' describes a directory.
            if line.startswith('d'):
                return 'dir'
            return 'file'
    return False
def check_file_or_dir(filePath, server=None):
    """Tell whether the given path is a file or a directory.

    @param filePath: the path to check.
    @param server: when set, the path is checked on that server with
    "ls -ld" through ssh instead of on the local file system.
    @return: 'file' or 'dir'. False when the local path is neither a file
    nor a directory, or when the remote ls printed nothing.
    """
    if not server:
        if os.path.isfile(filePath):
            return 'file'
        if os.path.isdir(filePath):
            return 'dir'
        return False

    command = 'ssh %s "ls -ld %s"' % (server, filePath)
    stream, process = utils_commands.get_output_stream_from_command(
        command, logger_name=None)
    for line in stream:
        line = line.strip()
        if line:
            # Only the first non-empty line is looked at: a long listing
            # starting with 'd' describes a directory.
            if line.startswith('d'):
                return 'dir'
            return 'file'
    return False
def process_alleles(vcf_record_in_one_contig, sample_names, curr_reference):
    """Count, per sample, the alleles carried by the reads of one contig.

    The reads aligned to curr_reference are streamed with
    "samtools view -F 1028". For each read, the bases found at the positions
    listed in vcf_record_in_one_contig (position - 1 is used as the index in
    the read sequence) are joined into one allele string. That string is
    counted for the RG tag of the read and for the 'all' entry.

    The counts are printed before and after filter_alleles is applied.
    curr_reference is printed when one entry holds more than two alleles or
    when more than four distinct alleles are seen over all entries.

    NOTE(review): bam_file is not a parameter of this function, it has to be
    resolved from the enclosing module -- confirm where it is set.
    """
    sample_to_allele = generate_empty_hash_with_sample(sample_names)
    command = "samtools view -F 1028 %s %s" % (bam_file, curr_reference)
    stream, process = utils_commands.get_output_stream_from_command(command)
    for line in stream:
        sam_record = Sam_record(line)
        sequence = sam_record.get_query_sequence()
        sample = sam_record.get_tag("RG")
        # One base per variant position, in the iteration order of the records
        allele = ''.join(
            sequence[position - 1] for position in vcf_record_in_one_contig.keys())
        count_with_hash(sample_to_allele[sample], allele)
        count_with_hash(sample_to_allele['all'], allele)
    process.wait()

    pprint.pprint(sample_to_allele)
    filter_alleles(sample_to_allele)
    pprint.pprint(sample_to_allele)

    all_alleles = set()
    valid = True
    for sample in sample_to_allele.keys():
        alleles = sample_to_allele.get(sample)
        all_alleles.update(alleles.keys())
        if len(alleles) > 2:
            valid = False
    if len(all_alleles) > 4:
        valid = False
    if not valid:
        print(curr_reference)
def get_readgroup_from_bam(bam_files):
    """Return the @RG header lines of the given bam files.

    The header of each file is read with "samtools view -H" and filtered
    with grep. The lines are stripped and returned in one list, in the order
    of the bam files.
    """
    read_group_lines = []
    for bam_file in bam_files:
        command = "samtools view -H %s | grep '^@RG' " % bam_file
        stdout, process = utils_commands.get_output_stream_from_command(command)
        read_group_lines.extend(line.strip() for line in stdout)
    return read_group_lines
def run_velvetk(fastq_file_name, estimated_size=600, **kwarg):
    """Ask velvetk for the best kmer length to use with a fastq file.

    The last output line made only of digits is taken as the kmer length,
    29 is used when there is none. The value is then kept between 19 and 99.
    Extra keyword arguments are accepted and ignored.

    @return: the kmer length as an int.
    """
    command = "%s --size %s --best %s 2> /dev/null" % (
        velvetk_bin, estimated_size, fastq_file_name)
    logging.info(command)
    stream, process = utils_commands.get_output_stream_from_command(command)
    kmer_length = 29
    for line in stream:
        candidate = line.strip()
        if candidate.isdigit():
            kmer_length = int(candidate)
    # Clamp to the supported range
    kmer_length = min(99, max(19, kmer_length))
    logging.info("velvetk kmer: %s" % kmer_length)
    return kmer_length
def run_velvetk(fastq_file_name, estimated_size=600, **kwarg):
    """Ask velvetk for the best kmer length to use with a fastq file.

    The last output line made only of digits is taken as the kmer length,
    29 is used when there is none. The value is then kept between 19 and 99.
    Extra keyword arguments are accepted and ignored.

    @return: the kmer length as an int.
    """
    command = "%s --size %s --best %s 2> /dev/null" % (
        velvetk_bin, estimated_size, fastq_file_name)
    logging.info(command)
    stream, process = utils_commands.get_output_stream_from_command(command)
    kmer_length = 29
    for line in stream:
        candidate = line.strip()
        if candidate.isdigit():
            kmer_length = int(candidate)
    # Clamp to the supported range
    kmer_length = min(99, max(19, kmer_length))
    logging.info("velvetk kmer: %s" % kmer_length)
    return kmer_length
def count_reads_in_fastq(fastq_file):
    """Count the reads of a fastq file, in total and per read group.

    An awk command prints, for the first line of every four, what follows
    "RGID:" in the first field, and "uniq -c" counts the consecutive
    identical ids. Output lines that do not hold exactly a count and an id
    are ignored, so they add nothing to the total either.

    @param fastq_file: the fastq file to read.
    @return: a tuple (total number of reads, Counter of reads per read
    group id).
    """
    command = '''awk '{if (NR%%4==1){split($1,array,"RGID:"); print array[2]}}' %s| uniq -c''' % (fastq_file)
    logging.info(command)
    stream, process = get_output_stream_from_command(command)
    total = 0
    all_read_groups = Counter()
    for line in stream:
        fields = line.strip().split()
        if len(fields) == 2:
            count = int(fields[0])
            rgid = fields[1]
            total += count
            # uniq -c only merges adjacent lines, so the same id can be
            # reported several times: accumulate instead of assigning.
            all_read_groups[rgid] += count
    return total, all_read_groups
def count_reads_in_fastq(fastq_file):
    """Count the reads of a fastq file, in total and per read group.

    An awk command prints, for the first line of every four, what follows
    "RGID:" in the first field, and "uniq -c" counts the consecutive
    identical ids. Output lines that do not hold exactly a count and an id
    are ignored, so they add nothing to the total either.

    @param fastq_file: the fastq file to read.
    @return: a tuple (total number of reads, Counter of reads per read
    group id).
    """
    command = '''awk '{if (NR%%4==1){split($1,array,"RGID:"); print array[2]}}' %s| uniq -c''' % (fastq_file)
    logging.info(command)
    stream, process = get_output_stream_from_command(command)
    total = 0
    all_read_groups = Counter()
    for line in stream:
        fields = line.strip().split()
        if len(fields) == 2:
            count = int(fields[0])
            rgid = fields[1]
            total += count
            # uniq -c only merges adjacent lines, so the same id can be
            # reported several times: accumulate instead of assigning.
            all_read_groups[rgid] += count
    return total, all_read_groups
def process_double_digest_rad_run(bam_file, all_sites_info, samtools_bin):
    """Record in all_sites_info the locus of every mapped read pair of a bam file.

    The bam file is streamed with "samtools view -h". The @RG header lines
    map each read group id to its sample (SM), falling back on the library
    (LB) and then on the id itself. For each pair yielded by
    load_from_sites_generator in which both reads are mapped, the locus given
    by get_dd_RAD_loci_from_read_pair is added with a coverage of 1, a
    duplicate value of 1 when the first read is flagged as duplicate (0
    otherwise), and the sample of the first read's RG tag.

    The stream is always closed at the end.
    """
    command = "%s view -h %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    sample_name, ext = os.path.splitext(bam_file)
    read_groups = {}
    try:
        for line in open_stream:
            if not line.startswith("@"):
                # NOTE(review): this first alignment line is consumed here and
                # is not handed to load_from_sites_generator below -- confirm
                # this is intended.
                break
            if line.startswith("@RG"):
                rg_id = rg_sample = rg_library = None
                for value in line.strip().split():
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    # sample first, then library, then the id itself
                    read_groups[rg_id] = rg_sample or rg_library or rg_id
        all_sample_coverage = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = Counter()

        i = 0
        for sam_record_r1, sam_record_r2 in load_from_sites_generator(open_stream):
            i += 1
            if i % 1000000 == 0:
                print(i)
            if sam_record_r1.is_unmapped() or sam_record_r2.is_unmapped():
                continue
            loci = get_dd_RAD_loci_from_read_pair(sam_record_r1, sam_record_r2)
            duplicate = 1 if sam_record_r1.is_duplicate_read() else 0
            all_sites_info.add_values(loci, coverage=1, duplicate=duplicate,
                                      sample=read_groups.get(sam_record_r1.get_tag("RG")))
    finally:
        open_stream.close()
def process_single_samtools_run(bam_file, all_contigs_info, samtools_bin):
    """Add the per-contig coverage and duplicate counts of a bam file to all_contigs_info.

    The alignments are streamed with "samtools view -F 132". Only the
    alignments starting at position 1 are counted; among those, the ones
    whose flag has the 1024 bit set are also counted as duplicates. The
    counts are handed to all_contigs_info.add_values each time the contig
    changes and once more at the end of the stream, so the alignments are
    expected to be grouped by contig.

    @param bam_file: the bam file to read. Its path without the extension is
    used as the sample name.
    @param all_contigs_info: object whose add_values method receives the counts.
    @param samtools_bin: the samtools executable.
    """
    command = "%s view -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    coverage = 0
    duplicate = 0
    sample_name, ext = os.path.splitext(bam_file)
    try:
        for line in open_stream:
            sp_line = line.strip().split()
            contig = sp_line[2]
            if current_contig != contig and current_contig is not None:
                all_contigs_info.add_values(current_contig, coverage, duplicate,
                                            sample=sample_name)
                coverage = 0
                duplicate = 0
            current_contig = contig
            if int(sp_line[3]) == 1:
                if int(sp_line[1]) & 1024 == 1024:
                    duplicate += 1
                coverage += 1
        # The last contig has no following contig to trigger its report
        if current_contig is not None:
            all_contigs_info.add_values(current_contig, coverage, duplicate,
                                        sample=sample_name)
    finally:
        open_stream.close()
def process_single_samtools_run(bam_file, all_contigs_info, samtools_bin):
    """Add the per-contig coverage and duplicate counts of a bam file to all_contigs_info.

    The alignments are streamed with "samtools view -F 132". Only the
    alignments starting at position 1 are counted; among those, the ones
    whose flag has the 1024 bit set are also counted as duplicates. The
    counts are handed to all_contigs_info.add_values each time the contig
    changes and once more at the end of the stream, so the alignments are
    expected to be grouped by contig.

    @param bam_file: the bam file to read. Its path without the extension is
    used as the sample name.
    @param all_contigs_info: object whose add_values method receives the counts.
    @param samtools_bin: the samtools executable.
    """
    command = "%s view -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    coverage = 0
    duplicate = 0
    sample_name, ext = os.path.splitext(bam_file)
    try:
        for line in open_stream:
            sp_line = line.strip().split()
            contig = sp_line[2]
            if current_contig != contig and current_contig is not None:
                all_contigs_info.add_values(current_contig, coverage, duplicate,
                                            sample=sample_name)
                coverage = 0
                duplicate = 0
            current_contig = contig
            if int(sp_line[3]) == 1:
                if int(sp_line[1]) & 1024 == 1024:
                    duplicate += 1
                coverage += 1
        # The last contig has no following contig to trigger its report
        if current_contig is not None:
            all_contigs_info.add_values(current_contig, coverage, duplicate,
                                        sample=sample_name)
    finally:
        open_stream.close()
def generate_readgroup_exclusion_file_per_samples(bam_file):
    """Write, for each sample of a bam file, the read group ids of all the other samples.

    Samples (SM) and read group ids (ID) are taken from the @RG header lines
    through parse_RG_line. One file named exclusion_id_for_<sample>.txt is
    written per sample in the directory of the bam file, with one read group
    id per line.

    @param bam_file: the bam file whose header is read.
    @return: a dictionary with the samples as keys and the path of their
    exclusion file as values.
    """
    directory = os.path.dirname(os.path.abspath(bam_file))
    # The pattern is anchored so that other header lines quoting an @RG
    # string (e.g. a command line kept in a @PG line) are not picked up.
    command = "samtools view -H %s | grep '^@RG'" % (bam_file)
    stream, process = get_output_stream_from_command(command)
    all_samples = set()
    all_samples2id = defaultdict(list)
    for line in stream:
        RG_dict = parse_RG_line(line)
        all_samples.add(RG_dict.get('SM'))
        all_samples2id[RG_dict.get('SM')].append(RG_dict.get('ID'))
    all_samples2exclusion_id_file = {}
    for sample in all_samples:
        exclusion_id = []
        for exclusion_sample in all_samples.difference(set([sample])):
            exclusion_id.extend(all_samples2id.get(exclusion_sample))
        sample_exclusion_file = os.path.join(directory, 'exclusion_id_for_%s.txt' % sample)
        with open(sample_exclusion_file, 'w') as open_file:
            open_file.write('\n'.join(exclusion_id))
        all_samples2exclusion_id_file[sample] = sample_exclusion_file
    return all_samples2exclusion_id_file
def generate_readgroup_exclusion_file_per_samples(bam_file):
    """Write, for each sample of a bam file, the read group ids of all the other samples.

    Samples (SM) and read group ids (ID) are taken from the @RG header lines
    through parse_RG_line. One file named exclusion_id_for_<sample>.txt is
    written per sample in the directory of the bam file, with one read group
    id per line.

    @param bam_file: the bam file whose header is read.
    @return: a dictionary with the samples as keys and the path of their
    exclusion file as values.
    """
    directory = os.path.dirname(os.path.abspath(bam_file))
    # The pattern is anchored so that other header lines quoting an @RG
    # string (e.g. a command line kept in a @PG line) are not picked up.
    command = "samtools view -H %s | grep '^@RG'" % (bam_file)
    stream, process = get_output_stream_from_command(command)
    all_samples = set()
    all_samples2id = defaultdict(list)
    for line in stream:
        RG_dict = parse_RG_line(line)
        all_samples.add(RG_dict.get('SM'))
        all_samples2id[RG_dict.get('SM')].append(RG_dict.get('ID'))
    all_samples2exclusion_id_file = {}
    for sample in all_samples:
        exclusion_id = []
        for exclusion_sample in all_samples.difference(set([sample])):
            exclusion_id.extend(all_samples2id.get(exclusion_sample))
        sample_exclusion_file = os.path.join(directory, 'exclusion_id_for_%s.txt' % sample)
        with open(sample_exclusion_file, 'w') as open_file:
            open_file.write('\n'.join(exclusion_id))
        all_samples2exclusion_id_file[sample] = sample_exclusion_file
    return all_samples2exclusion_id_file
def createDirectories(baseDir, directories, server=None):
    """
    This function creates the directories listed in the directories array.
    Directories that already exist are left untouched.
    @param baseDir: the parent directory.
    @param directories: the list of directories to create, joined to baseDir
    one after the other.
    @param server: the server on which the directories will be created
    through ssh. When None they are created on the local file system.
    """
    for directory in directories:
        dir_path = os.path.join(baseDir, directory)
        if server is not None:
            command = 'ssh %s "ls -d1 %s"' % (server, dir_path)
            stream, process = utils_commands.get_output_stream_from_command(
                command, logger_name=None)
            # The directory is considered present when ls prints its path back
            found = False
            for line in stream:
                if line.strip() == dir_path:
                    found = True
                    break
            if not found:
                logging.info('%s does not exists on %s: create it' % (dir_path, server))
                command = 'ssh %s "mkdir %s"' % (server, dir_path)
                utils_commands.launchCommandLocally(command)
        elif not os.path.exists(dir_path):
            logging.info('%s does not exists: create it' % dir_path)
            # 0o775 is the octal spelling accepted by python 2.6+ and python 3
            os.mkdir(dir_path, 0o775)
def get_mpileup_from_bam(bam_file, options=''):
    """Run "samtools mpileup -A" on a bam file and return the stream of its output.

    samtools is taken from the directory given by the pipeline parameters,
    or from the path when the configuration file cannot be found.

    @param bam_file: the bam file to read, or 'PIPE' to pass '-' to samtools
    in place of a file name.
    @param options: extra options appended to the command line.
    @return: the output stream of the command.
    """
    try:
        pipeline_parm = utils_param.get_pipeline_parameters()
        samtools_bin = os.path.join(pipeline_parm.get_samtools_dir(), 'samtools')
    except Config_file_error:
        logging.warning(
            "Can't find the configuration file you'll need to have samtools in you path."
        )
        samtools_bin = 'samtools'
    if bam_file == 'PIPE':
        bam_file = '-'
    # The command is built for both cases: it used to be skipped for 'PIPE',
    # which left nothing to run.
    command = '%s mpileup -A %s %s' % (samtools_bin, bam_file, options)
    stream, process = utils_commands.get_output_stream_from_command(
        command, logger_name=None)
    return stream


def allele_freq_from_bam_and_list_pos(output_file, input_file, list_position_file, all_positions_loaded, exclusion_id_file, bas_qual_threshold=20, map_qual_threshold=10, coverage_threshold=6):
    input_stream = get_mpileup_from_bam(
        input_file, options='-s -l %s -G %s' % (list_position_file, exclusion_id_file))
    all_positions_loaded = copy.copy(all_positions_loaded)
def get_mapview_stream(maq_bin, map_file):
    """Run "mapview" of Maq on a .map file and return the stream of its output.

    The process object returned along with the stream is not kept.
    NOTE(review): what happens to the standard error depends on
    utils_commands.get_output_stream_from_command -- confirm there.
    """
    mapview_command = '%s mapview %s' % (maq_bin, map_file)
    stream, process = utils_commands.get_output_stream_from_command(mapview_command)
    return stream
chomosome_and_position=''): """This method opens a .bam file with samtools and returns an open file. The std error will be output in the console through another thread.""" if samtools_bin == None: try: pipeline_parm = utils_param.get_pipeline_parameters() samtools_bin = os.path.join(pipeline_parm.get_samtools_dir(), 'samtools') except Config_file_error, e: logging.warning( "Can't find the configuration file you'll need to have samtools in you path." ) samtools_bin = 'samtools' command = '%s view %s %s %s' % (samtools_bin, options, bam_file, chomosome_and_position) stdout, process = utils_commands.get_output_stream_from_command(command) return stdout, process def get_pileup_from_bam(bam_file, genome_file=None, samtools_bin=None, options=''): if samtools_bin == None: try: pipeline_parm = utils_param.get_pipeline_parameters() samtools_bin = os.path.join(pipeline_parm.get_samtools_dir(), 'samtools') except Config_file_error, e: logging.warning( "Can't find the configuration file you'll need to have samtools in you path."
pipeline_param=utils_param.get_pipeline_parameters() samtools_dir=pipeline_param.get_samtools_dir() except Config_file_error, e: #logging.exception('Config_file_error:') logging.critical("You need to have the environment variable properly set to use that script") return False samtools_bin=os.path.join(samtools_dir,'samtools') name, ext = os.path.splitext(output_bam_file) if ext=='.bam': output_bam_file=name #change_consensus_on_read2 command ="%s view -h %s "%(samtools_bin,input_bam_file) logging.info(command) input_stream,process_input = utils_commands.get_output_stream_from_command(command) command ="%s view -bS - | %s sort - %s"%(samtools_bin, samtools_bin, output_bam_file) logging.info(command) output_stream,process_output= utils_commands.get_input_stream_from_command(command) #get the header line = input_stream.readline() while line.startswith("@"): output_stream.write(line) line = input_stream.readline() while line: read1=Sam_record(line) line = input_stream.readline() read2=Sam_record(line) if read1.get_query_name() == read2.get_query_name():
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Add the per-contig and per-sample coverage and duplicate counts of a bam file to all_contigs_info.

    The bam file is streamed with "samtools view -h -F 132". The @RG header
    lines map each read group id to its sample (SM), falling back on the
    library (LB) and then on the id itself. Each mapped read adds one to the
    coverage of its sample, and one to its duplicate count when the read is
    flagged as duplicate. The counts are handed to all_contigs_info.add_values
    and reset each time the contig changes and once more at the end of the
    stream, so the reads are expected to be grouped by contig.

    Nothing is added when the stream holds no read. The stream is always
    closed at the end.

    @param bam_file: the bam file to read.
    @param all_contigs_info: object whose add_values method receives the counts.
    @param samtools_bin: the samtools executable.
    """
    from itertools import chain

    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    read_groups = {}
    try:
        # Read the header; the first line that is not a header is the first read
        first_read_line = None
        for line in open_stream:
            if not line.startswith("@"):
                first_read_line = line
                break
            if line.startswith("@RG"):
                rg_id = rg_sample = rg_library = None
                for value in line.strip().split():
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    read_groups[rg_id] = rg_sample or rg_library or rg_id
        if first_read_line is None:
            # Empty stream or header only: there is no read to parse
            return

        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0

        def report_and_reset(contig):
            # Hand over the counts of a finished contig and start from zero
            for sample in read_groups.values():
                all_contigs_info.add_values(contig,
                                            all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample),
                                            sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0

        i = 0
        # The first read was taken off the stream by the header loop
        for line in chain([first_read_line], open_stream):
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            contig = sam_record.get_reference_name()
            if current_contig != contig and current_contig is not None:
                report_and_reset(current_contig)
            current_contig = contig
            if not sam_record.is_unmapped():
                sample = read_groups.get(sam_record.get_tag("RG"))
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[sample] += 1
                all_sample_coverage[sample] += 1
        if current_contig is not None:
            report_and_reset(current_contig)
    finally:
        open_stream.close()
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Add the per-contig and per-sample coverage and duplicate counts of a bam file to all_contigs_info.

    The bam file is streamed with "samtools view -h -F 132". The @RG header
    lines map each read group id to its sample (SM), falling back on the
    library (LB) and then on the id itself. Each mapped read adds one to the
    coverage of its sample, and one to its duplicate count when the read is
    flagged as duplicate. The counts are handed to all_contigs_info.add_values
    and reset each time the contig changes and once more at the end of the
    stream, so the reads are expected to be grouped by contig.

    Nothing is added when the stream holds no read. The stream is always
    closed at the end.

    @param bam_file: the bam file to read.
    @param all_contigs_info: object whose add_values method receives the counts.
    @param samtools_bin: the samtools executable.
    """
    from itertools import chain

    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    read_groups = {}
    try:
        # Read the header; the first line that is not a header is the first read
        first_read_line = None
        for line in open_stream:
            if not line.startswith("@"):
                first_read_line = line
                break
            if line.startswith("@RG"):
                rg_id = rg_sample = rg_library = None
                for value in line.strip().split():
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    read_groups[rg_id] = rg_sample or rg_library or rg_id
        if first_read_line is None:
            # Empty stream or header only: there is no read to parse
            return

        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0

        def report_and_reset(contig):
            # Hand over the counts of a finished contig and start from zero
            for sample in read_groups.values():
                all_contigs_info.add_values(contig,
                                            all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample),
                                            sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0

        i = 0
        # The first read was taken off the stream by the header loop
        for line in chain([first_read_line], open_stream):
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            contig = sam_record.get_reference_name()
            if current_contig != contig and current_contig is not None:
                report_and_reset(current_contig)
            current_contig = contig
            if not sam_record.is_unmapped():
                sample = read_groups.get(sam_record.get_tag("RG"))
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[sample] += 1
                all_sample_coverage[sample] += 1
        if current_contig is not None:
            report_and_reset(current_contig)
    finally:
        open_stream.close()
command='%s mapview %s'%(maq_bin, map_file) stdout, process = utils_commands.get_output_stream_from_command(command) return stdout def get_sam_stream(bam_file, samtools_bin=None, options='', chomosome_and_position=''): """This method opens a .bam file with samtools and returns an open file. The std error will be output in the console through another thread.""" if samtools_bin==None: try: pipeline_parm=utils_param.get_pipeline_parameters() samtools_bin=os.path.join(pipeline_parm.get_samtools_dir(),'samtools') except Config_file_error, e: logging.warning("Can't find the configuration file you'll need to have samtools in you path.") samtools_bin='samtools' command='%s view %s %s %s'%(samtools_bin, options, bam_file, chomosome_and_position) stdout, process = utils_commands.get_output_stream_from_command(command) return stdout, process def get_pileup_from_bam(bam_file, genome_file=None, samtools_bin=None, options=''): if samtools_bin==None: try: pipeline_parm=utils_param.get_pipeline_parameters() samtools_bin=os.path.join(pipeline_parm.get_samtools_dir(),'samtools') except Config_file_error, e: logging.warning("Can't find the configuration file you'll need to have samtools in you path.") samtools_bin='samtools' if bam_file=='PIPE': bam_file='-' if genome_file: command = '%s pileup -f %s %s %s'%(samtools_bin, genome_file, bam_file, options)
out.append('C:%s:%s'%(ATCG['C'],ATCG_filtered['C'])) out.append('G:%s:%s'%(ATCG['G'],ATCG_filtered['G'])) return '\t'.join(out) def get_mpileup_from_bam(bam_file, options=''): try: pipeline_parm=utils_param.get_pipeline_parameters() samtools_bin=os.path.join(pipeline_parm.get_samtools_dir(),'samtools') except Config_file_error, e: logging.warning("Can't find the configuration file you'll need to have samtools in you path.") samtools_bin='samtools' if bam_file=='PIPE': bam_file='-' else: command = '%s mpileup -A %s %s'%(samtools_bin, bam_file, options) stream, process = utils_commands.get_output_stream_from_command(command, logger_name=None) return stream def allele_freq_from_bam_and_list_pos(output_file, input_file, list_position_file, all_positions_loaded, exclusion_id_file, bas_qual_threshold=20, map_qual_threshold=10, coverage_threshold=6): input_stream = get_mpileup_from_bam(input_file, options='-s -l %s -G %s'%(list_position_file,exclusion_id_file)) all_positions_loaded=copy.copy(all_positions_loaded) if input_stream is not None: open_output=open(output_file,'w') for line in input_stream: sp_line = line.strip().split() position = '%s\t%s'%(sp_line[0],sp_line[1]) if position in all_positions_loaded : all_positions_loaded.remove(position)
def get_mapview_stream(maq_bin, map_file):
    """Run "mapview" of Maq on a .map file and return the stream of its output.

    The process object returned along with the stream is not kept.
    NOTE(review): what happens to the standard error depends on
    utils_commands.get_output_stream_from_command -- confirm there.
    """
    mapview_command = '%s mapview %s' % (maq_bin, map_file)
    stream, process = utils_commands.get_output_stream_from_command(mapview_command)
    return stream
def process_single_samtools_run_with_read_group(bam_file,all_contigs_info,samtools_bin):
    """Add the per-contig, per-locus and per-sample read counts of a bam file to all_contigs_info.

    The bam file is streamed with "samtools view -h -F 132". The @RG header
    lines map each read group id to its sample (SM), falling back on the
    library (LB) and then on the id itself. For each mapped read, the locus
    given by get_loci_from_read is used (as a string) to count the coverage,
    the duplicates and the occurrences of each read sequence of its sample.
    The counts are handed to all_contigs_info.add_values and reset each time
    the contig changes and once more at the end of the stream, so the reads
    are expected to be grouped by contig.

    Nothing is added when the stream holds no read. The stream is always
    closed at the end.

    @param bam_file: the bam file to read.
    @param all_contigs_info: object whose add_values method receives the counts.
    @param samtools_bin: the samtools executable.
    """
    from itertools import chain

    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    read_groups = {}
    try:
        # Read the header; the first line that is not a header is the first read
        first_read_line = None
        for line in open_stream:
            if not line.startswith("@"):
                first_read_line = line
                break
            if line.startswith("@RG"):
                rg_id = rg_sample = rg_library = None
                for value in line.strip().split():
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    read_groups[rg_id] = rg_sample or rg_library or rg_id
        if first_read_line is None:
            # Empty stream or header only: there is no read to parse
            return

        all_sample_coverage = {}
        all_sample_coverage_reads = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = Counter()
            all_sample_duplicate[sample] = Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)

        def report_and_reset(contig):
            # Hand over the counts of a finished contig and start afresh
            for sample in read_groups.values():
                for loci in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(loci)
                    all_contigs_info.add_values(contig, loci,
                                                all_sample_coverage.get(sample).get(loci, 0),
                                                all_sample_duplicate.get(sample).get(loci, 0),
                                                alleles=alleles, sample=sample)
                all_sample_coverage[sample] = Counter()
                all_sample_duplicate[sample] = Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)

        i = 0
        # The first read was taken off the stream by the header loop
        for line in chain([first_read_line], open_stream):
            i += 1
            if i % 1000000 == 0:
                print(i)
            sam_record = Sam_record(line.strip())
            contig = sam_record.get_reference_name()
            if current_contig != contig and current_contig is not None:
                report_and_reset(current_contig)
            current_contig = contig
            if not sam_record.is_unmapped():
                sample = read_groups.get(sam_record.get_tag("RG"))
                locus_key = str(get_loci_from_read(sam_record))
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[sample][locus_key] += 1
                all_sample_coverage[sample][locus_key] += 1
                all_sample_coverage_reads[sample][locus_key][read_sequence] += 1
        if current_contig is not None:
            report_and_reset(current_contig)
    finally:
        open_stream.close()