def abyssmap_rmdups(in_fa, out_fa, strand_specific=False, cleanup=False, threads=1):
    """Write a copy of `in_fa` to `out_fa` without the ids reported by `abyss-map --dup`.

    The fasta is mapped against itself with `abyss-map --dup`; its stdout is
    redirected to `<in_fa>.dup_ids`. Every non-blank line of that file is
    treated as a sequence id and handed to `filter_fasta` as `remove_set`.

    Args:
        in_fa: path of the input fasta.
        out_fa: path of the filtered fasta to write.
        strand_specific: when true, `--SS` is added to the abyss-map command.
        cleanup: when true, the intermediate `.dup_ids` file is deleted.
        threads: when greater than 1, passed to abyss-map as `--threads=N`.
    """
    dup_ids_path = in_fa + '.dup_ids'

    # Assemble the abyss-map command line
    abyssmap_cmd = ['abyss-map', '--dup']
    if strand_specific:
        abyssmap_cmd.append('--SS')
    #endif
    if threads > 1:
        abyssmap_cmd.append('--threads=%d' % threads)
    #endif

    # The same fasta serves as both arguments; stdout goes to the ids file
    abyssmap_cmd += [in_fa, in_fa, '> %s' % dup_ids_path]
    run_shell_cmd(' '.join(abyssmap_cmd))

    # Collect the reported ids, ignoring blank lines
    with open(dup_ids_path, 'r') as ids_in:
        dup_ids = {cid for cid in (raw.strip() for raw in ids_in) if cid}
    #endwith

    filter_fasta(in_fa, out_fa, remove_set=dup_ids)

    if cleanup:
        os.remove(dup_ids_path)
    #endif
def bowtie2_self_align(fasta, outputsam, threads=1, strand_specific=False, path_strip_sam_seq_qual=None, preset='--sensitive', k=2):
    """Align all sequences of a fasta file against themselves with Bowtie2.

    Two shell commands are run through `run_shell_cmd`: `bowtie2-build` to
    index `fasta` (the fasta path doubles as the index basename), then
    `bowtie2` with the fasta as both index and query. The alignment output is
    piped through `gzip -c` into `outputsam`.

    Args:
        fasta: path of the fasta file; also used as the index basename.
        outputsam: path the gzip-compressed alignment output is written to.
        threads: value passed to bowtie2 as `-p`.
        strand_specific: when true, `--norc` is added to the bowtie2 command.
        path_strip_sam_seq_qual: optional command inserted into the pipeline
            between bowtie2 and gzip.
        preset: bowtie2 preset option, inserted verbatim.
        k: value passed to bowtie2 as `-k`.
    """
    # Index the fasta in place
    run_shell_cmd(' '.join(['bowtie2-build --quiet', fasta, fasta]))

    # Assemble the self-alignment pipeline
    pipeline = ['set -euo pipefail && bowtie2']
    if strand_specific:
        pipeline.append('--norc')
    #endif

    pipeline += [
        preset,
        '-k %d' % k,
        '--omit-sec-seq --end-to-end -f',
        '-p %d' % threads,
        fasta,
        fasta,
    ]

    # Optional filter stage ahead of compression
    if path_strip_sam_seq_qual:
        pipeline.append('|' + path_strip_sam_seq_qual)
    #endif

    pipeline.append('|gzip -c >' + outputsam)

    run_shell_cmd(' '.join(pipeline))
def blat_merge_fastas(path_prefix_map, merged_fa, concat_fa=None, concat_fa_selfalign_psl=None, percent_identity=0.95, strand_specific=False, cleanup=False, minoverlap=0, threads=1, indel_size_tolerance=1, min_seq_len=32):
    """Merge fasta files into a single fasta file by removing redundant sequences.

    Steps:
      1. concatenate the input fastas (`concat_fastas`),
      2. self-align the concatenated fasta (`blat_self_align`),
      3. extract the ids to keep from the PSL (`psl_cid_extractor.extract_cids`
         with `report_redundant=False`),
      4. filter the concatenated fasta down to those ids,
      5. if `minoverlap` > 0, additionally run abyss-overlap, abyss-layout and
         MergeContigs on the long sequences and append the short ones.

    Args:
        path_prefix_map: passed through to `concat_fastas`.
        merged_fa: path of the final merged fasta; also the prefix of every
            intermediate file name.
        concat_fa: path for the concatenated fasta; defaults to
            `<merged_fa>.tmp.concat.fa`.
        concat_fa_selfalign_psl: path for the self-alignment PSL; defaults to
            `<merged_fa>.tmp.concat.psl`.
        percent_identity: identity threshold given to both the aligner and
            the id extractor.
        strand_specific: forwarded as `samestrand` to the id extractor and as
            `--SS` to abyss-overlap / abyss-layout.
        cleanup: when true, intermediate files that still exist are deleted.
        minoverlap: when greater than 0, enables the overlap-layout stage.
        threads: thread count for the aligner and abyss-overlap.
        indel_size_tolerance: forwarded as `max_consecutive_edits`.
        min_seq_len: forwarded to `blat_self_align`.
    """
    if concat_fa is None:
        concat_fa = merged_fa + '.tmp.concat.fa'
    #endif
    if concat_fa_selfalign_psl is None:
        concat_fa_selfalign_psl = merged_fa + '.tmp.concat.psl'
    #endif

    # Concatenate the fastas together and give the contigs of each set a prefix
    concat_fastas(path_prefix_map, concat_fa)

    # Self-align the concatenated fasta (blat_self_align)
    blat_self_align(
        concat_fa,
        concat_fa_selfalign_psl,
        percent_id=percent_identity,
        max_consecutive_edits=indel_size_tolerance,
        min_seq_len=min_seq_len,
        threads=threads,
    )

    # Identify NON-redundant contigs
    keep_ids = psl_cid_extractor.extract_cids(
        psl=concat_fa_selfalign_psl,
        samestrand=strand_specific,
        min_percent_identity=percent_identity,
        max_consecutive_edits=indel_size_tolerance,
        report_redundant=False,
    )

    do_overlap_layout = minoverlap > 0
    long_len = minoverlap + 1

    nr_fa_long = merged_fa + '.tmp.long.fa'
    # The short partition is only written when overlap-layout is requested
    nr_fa_short = (merged_fa + '.tmp.short.fa') if do_overlap_layout else None

    # Gather the non-contained sequences and split into 2 partitions:
    # 1. shorter than (min overlap + 1)
    # 2. longer than or equal to (min overlap + 1)
    filter_fasta(concat_fa, nr_fa_long, min_length=long_len, keep_set=keep_ids, fasta_out_st=nr_fa_short)

    if do_overlap_layout:
        # overlap-layout the long sequences
        strand_opts = ['--SS'] if strand_specific else []
        kmer_opt = '--kmer=%d' % long_len

        # generate the sequence overlap graph
        overlap_dot = merged_fa + '.tmp.long.dot'
        overlap_cmd = ['abyss-overlap', '--threads=%d' % threads, '--min=%d' % minoverlap]
        overlap_cmd += strand_opts
        overlap_cmd += [nr_fa_long, '>' + overlap_dot]
        run_shell_cmd(' '.join(overlap_cmd))

        # layout contigs using the overlap graph
        layout_path = merged_fa + '.tmp.long.path'
        layout_cmd = ['abyss-layout', kmer_opt, '--out=%s' % layout_path]
        layout_cmd += strand_opts
        layout_cmd.append(overlap_dot)
        run_shell_cmd(' '.join(layout_cmd))

        # generate fasta for O-L
        overlap_fa = merged_fa + '.incomplete'
        run_shell_cmd(' '.join([
            'MergeContigs',
            kmer_opt,
            '--out=%s' % overlap_fa,
            nr_fa_long,
            overlap_dot,
            layout_path,
        ]))

        # append the short sequences to the same fasta
        with open(overlap_fa, 'a') as fout, open(nr_fa_short, 'r') as fin:
            fout.writelines(fin)
        #endwith

        # Only expose the result under its final name once it is complete
        shutil.move(overlap_fa, merged_fa)

        tmpfiles = [nr_fa_short, concat_fa, concat_fa_selfalign_psl, nr_fa_long, overlap_dot, layout_path]
    else:
        # No overlap-layout: the filtered fasta is the final product
        shutil.move(nr_fa_long, merged_fa)

        tmpfiles = [concat_fa, concat_fa_selfalign_psl]
    #endif

    if cleanup:
        for tmp_path in tmpfiles:
            # Skip anything that was never created or has already been moved
            if os.path.isfile(tmp_path):
                os.remove(tmp_path)
            #endif
        #endfor
    #endif