def make_output_folders(data_folder, adaIDs, VERBOSE=0):
    '''Make output folders for symlinking.

    Creates data_folder itself, then one subfolder for every adapter ID in
    adaIDs plus one for the extra ID -1. Subfolder names are produced by
    foldername_adapter() and appended to data_folder by plain concatenation.

    Parameters:
        data_folder: base path (expected to end with a path separator).
        adaIDs: iterable of adapter IDs.
        VERBOSE: if >= 1, print every folder after it has been created.
    '''
    from hivwholeseq.utils.generic import mkdirs

    mkdirs(data_folder)
    if VERBOSE >= 1:
        # Single-string print works as a statement (py2) and a function (py3)
        print('Folder created: %s' % (data_folder,))

    # list() lets callers pass any iterable (tuple, array, ...), not only a list.
    # NOTE(review): -1 presumably stands for unclassified reads; confirm
    # against foldername_adapter.
    for adaID in list(adaIDs) + [-1]:
        dirname = data_folder + foldername_adapter(adaID)
        mkdirs(dirname)
        if VERBOSE >= 1:
            print('Folder created: %s' % (dirname,))
def make_output_folders(data_folder, adapters_designed, VERBOSE=0, summary=True):
    '''Make output folders for all adapters and unclassified (e.g. PhiX).

    Parameters:
        data_folder: base path; folder names are appended by concatenation.
        adapters_designed: iterable of 2-item entries whose first item is the
            adapter ID; the second item is not used here.
        VERBOSE: if true, print every folder after it has been created.
        summary: if true, append a short note to the demultiplex summary file
            (path from get_demultiplex_summary_filename).
    '''
    from hivwholeseq.utils.generic import mkdirs

    # Make folders for the samples
    for (adaID, _) in adapters_designed:
        dirname = foldername_adapter(adaID)
        mkdirs(data_folder + dirname)
        if VERBOSE:
            # Single-string print works as a statement (py2) and a function (py3)
            print('Folder created: %s' % (dirname,))

    # Make a default directory for unclassified reads
    mkdirs(data_folder + 'unclassified_reads')
    if VERBOSE:
        print('Folder created: unclassified reads')

    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'a') as f:
            f.write('\n'
                    'Folders created for samples and unclassified reads (including phix).'
                    '\n')
def get_reference_premap_hash_filename(data_folder, adaID, ext=True):
    '''Path of the stampy hash of the reference used for premapping.

    The file lives in the 'premapped/' subfolder of the adapter folder and is
    called 'reference'; '.sthash' is appended only when ext is true.
    '''
    basename = 'reference.sthash' if ext else 'reference'
    return data_folder + foldername_adapter(adaID) + 'premapped/' + basename
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make the 'map_iter/' output folder for one adapter.

    Parameters:
        data_folder: base path; the adapter folder name is appended to it.
        adaID: adapter ID passed to foldername_adapter().
        VERBOSE: if true, print the folder after it has been created.
    '''
    from hivwholeseq.utils.generic import mkdirs

    dirname = data_folder + foldername_adapter(adaID) + 'map_iter/'
    mkdirs(dirname)
    if VERBOSE:
        # Single-string print works as a statement (py2) and a function (py3)
        print('Folder created: %s' % (dirname,))
def get_reference_all_filename(data_folder, adaID, fragment, ext=True):
    '''Path of the file with the cumulated consensi of all iterations.

    The name is 'consensus_alliters_{fragment}' inside the 'map_iter/'
    subfolder of the adapter folder; '.fasta' is appended when ext is true.
    '''
    stem = '_'.join(('consensus', 'alliters', fragment))
    path = data_folder + foldername_adapter(adaID) + 'map_iter/' + stem
    return (path + '.fasta') if ext else path
def get_mapped_filename(data_folder, adaID=None, fragment=None, type='bam',
                        bwa=False, filtered=False, sort=False, part=None,
                        unsorted=False, rescue=False, trashed=False):
    '''Path of the reads mapped onto the consensus.

    The basename starts with the fragment name, followed by the optional tags
    '_rescue', '_bwa', '_filtered', '_trashed' (in that order) and then by at
    most one of '_sorted', '_part{part}' or '_unsorted'. The file sits in
    'mapped/', inside the adapter folder when adaID is given.

    Raises:
        ValueError: if no fragment is given.
    '''
    if fragment is None:
        raise ValueError('Select a fragment')

    # Independent tags, appended in this fixed order
    stem = fragment
    for enabled, tag in ((rescue, '_rescue'),
                         (bwa, '_bwa'),
                         (filtered, '_filtered'),
                         (trashed, '_trashed')):
        if enabled:
            stem = stem + tag

    # Mutually exclusive: sort wins over part, which wins over unsorted
    if sort:
        stem = stem + '_sorted'
    elif part is not None:
        stem = stem + '_part' + str(part)
    elif unsorted:
        stem = stem + '_unsorted'

    relpath = 'mapped/' + stem + '.' + type
    if adaID is not None:
        relpath = foldername_adapter(adaID) + relpath
    return data_folder + relpath
def get_reference_premap_hash_filename(data_folder, adaID, ext=True):
    '''Path of the stampy hash of the reference used for premapping.

    Built as data_folder + adapter folder + 'premapped/reference', with the
    '.sthash' extension added only when ext is true.
    '''
    suffix = '.sthash' if ext else ''
    basename = 'reference' + suffix
    return data_folder + foldername_adapter(adaID) + 'premapped/' + basename
def get_figure_folder(data_folder, adaID=None):
    '''Folder for figures: 'figures/' under data_folder.

    When adaID is given, the adapter folder is inserted between data_folder
    and 'figures/'.
    '''
    if adaID is None:
        return data_folder + 'figures/'
    return data_folder + (foldername_adapter(adaID) + 'figures/')
def make_output_folders(data_folder, adaID, VERBOSE=0):
    '''Make the 'map_iter/' output folder for one adapter.

    Parameters:
        data_folder: base path; the adapter folder name is appended to it.
        adaID: adapter ID passed to foldername_adapter().
        VERBOSE: if true, print the folder after it has been created.
    '''
    from hivwholeseq.utils.generic import mkdirs

    dirname = data_folder + foldername_adapter(adaID) + 'map_iter/'
    mkdirs(dirname)
    if VERBOSE:
        # Single-string print works as a statement (py2) and a function (py3)
        print('Folder created: %s' % (dirname,))
def get_mapped_filename(data_folder, adaID=None, fragment=None, type='bam',
                        bwa=False, filtered=False, sort=False, part=None,
                        unsorted=False, rescue=False, trashed=False):
    '''Path of the reads mapped onto the consensus.

    Basename: the fragment name, then the optional tags '_rescue', '_bwa',
    '_filtered', '_trashed' (in that order), then at most one of '_sorted',
    '_part{part}' or '_unsorted', then '.' + type. The file sits in 'mapped/',
    inside the adapter folder when adaID is given.

    Raises:
        ValueError: if no fragment is given.
    '''
    if fragment is None:
        raise ValueError('Select a fragment')

    pieces = [fragment]
    if rescue:
        pieces.append('_rescue')
    if bwa:
        pieces.append('_bwa')
    if filtered:
        pieces.append('_filtered')
    if trashed:
        pieces.append('_trashed')

    # Only one of these applies: sort beats part, part beats unsorted
    if sort:
        pieces.append('_sorted')
    elif part is not None:
        pieces.append('_part' + str(part))
    elif unsorted:
        pieces.append('_unsorted')

    stem = pieces[0]
    for piece in pieces[1:]:
        stem = stem + piece

    relpath = 'mapped/' + stem + '.' + type
    if adaID is not None:
        relpath = foldername_adapter(adaID) + relpath
    return data_folder + relpath
def get_reference_all_filename(data_folder, adaID, fragment, ext=True):
    '''Path of the file with the cumulated consensi of all iterations.

    Named 'consensus_alliters_{fragment}' in the 'map_iter/' subfolder of the
    adapter folder, with '.fasta' appended when ext is true.
    '''
    stem = '_'.join(('consensus', 'alliters', fragment))
    path = data_folder + foldername_adapter(adaID) + 'map_iter/' + stem
    if not ext:
        return path
    return path + '.fasta'
def get_premapped_filename(data_folder, adaID=None, type='bam', bwa=False,
                           part=None, unsorted=False):
    '''Path of the reads premapped to the reference (before fragment split).

    The basename is 'premapped' in the 'premapped/' folder (inside the adapter
    folder when adaID is given), followed by '_part{part}' or else
    '_unsorted', then '_bwa', then the extension for type.

    Raises:
        ValueError: if type is neither 'sam' nor 'bam'.
    '''
    relpath = 'premapped/premapped'
    if adaID is not None:
        relpath = foldername_adapter(adaID) + relpath

    # A part number takes precedence over the unsorted tag
    if part is not None:
        relpath += '_part' + str(part)
    elif unsorted:
        relpath += '_unsorted'

    if bwa:
        relpath += '_bwa'

    if type == 'sam':
        extension = '.sam'
    elif type == 'bam':
        extension = '.bam'
    else:
        raise ValueError('Type of mapped reads file not recognized')

    return data_folder + (relpath + extension)
def get_figure_folder(data_folder, adaID=None):
    '''Folder for figures: 'figures/' under data_folder.

    With an adaID, the adapter folder is placed between data_folder and
    'figures/'.
    '''
    subfolder = 'figures/'
    if adaID is not None:
        subfolder = foldername_adapter(adaID) + subfolder
    return data_folder + subfolder
def get_reference_premap_filename(data_folder, adaID, fragment=None):
    '''Path of the reference FASTA used for premapping.

    The file is 'reference.fasta', or 'reference_{fragment}.fasta' when a
    fragment is given, in the 'premapped/' subfolder of the adapter folder.
    '''
    basename = 'reference'
    if fragment is not None:
        basename += '_' + fragment
    basename += '.fasta'
    return data_folder + foldername_adapter(adaID) + 'premapped/' + basename
def get_consensus_old_filename(data_folder, adaID, fragment, trim_primers=True):
    '''Path of the 'consensus_old_{fragment}' FASTA file in the adapter folder.

    When trim_primers is false, '_with_primers' is inserted before the
    '.fasta' extension.
    '''
    primer_tag = '' if trim_primers else '_with_primers'
    basename = 'consensus_old_' + fragment + primer_tag + '.fasta'
    return data_folder + (foldername_adapter(adaID) + basename)
def get_hash_file(data_folder, adaID, fragment, ext=True):
    '''Path of the index (hash) file, with or without extension.

    The file is 'hash/consensus_{fragment}' inside the adapter folder;
    '.sthash' is appended when ext is true.
    '''
    index_name = 'hash/consensus_' + fragment
    relpath = foldername_adapter(adaID) + index_name
    if ext:
        relpath += '.sthash'
    return data_folder + relpath
def get_merged_consensus_filename(data_folder, adaID=None,
                                  fragments=('F1', 'F2', 'F3', 'F4', 'F5', 'F6')):
    '''Get the path of the merged consensus of several fragments.

    Parameters:
        data_folder: base path the name is appended to.
        adaID: optional adapter ID; when given, the adapter folder is inserted
            after data_folder.
        fragments: iterable of fragment names, joined with '-' in the file
            name. The default is an immutable tuple so that it cannot be
            altered between calls.

    Returns:
        data_folder [+ adapter folder] + 'consensus_{F1-F2-...}.fasta'.
    '''
    filename = 'consensus_' + '-'.join(fragments) + '.fasta'
    if adaID is not None:
        filename = foldername_adapter(adaID) + filename
    return data_folder + filename
def get_hash_file(data_folder, adaID, fragment, ext=True):
    '''Path of the index (hash) file, with or without extension.

    Located at 'hash/consensus_{fragment}' in the adapter folder, with
    '.sthash' appended when ext is true.
    '''
    index_name = 'hash/' + ('consensus_' + fragment)
    relpath = foldername_adapter(adaID) + index_name
    return data_folder + (relpath + '.sthash' if ext else relpath)
def get_reference_premap_filename(data_folder, adaID, fragment=None):
    '''Path of the reference FASTA used for premapping.

    'reference.fasta', or 'reference_{fragment}.fasta' when a fragment is
    given, in the 'premapped/' subfolder of the adapter folder.
    '''
    if fragment is None:
        basename = 'reference' + '.fasta'
    else:
        basename = 'reference' + '_' + fragment + '.fasta'
    return data_folder + foldername_adapter(adaID) + 'premapped/' + basename
def get_build_consensus_summary_filename(data_folder, adaID, fragment='general',
                                         iterative=True):
    '''Path of the summary file of the consensus building.

    The file is 'summary_build_consensus_{fragment}.txt' in the adapter
    folder, placed under 'map_iter/' when iterative is true.
    '''
    basename = 'summary_build_consensus_' + fragment + '.txt'
    subdir = 'map_iter/' if iterative else ''
    return data_folder + foldername_adapter(adaID) + (subdir + basename)
def get_mapped_filename(data_folder, adaID, fragment, n_iter, type='bam'):
    '''Path of the reads mapped during iteration n_iter.

    Iteration 1 is named after the reference ('mapped_to_reference_...');
    any other iteration is named after the previous consensus
    ('mapped_to_consensus_{n_iter - 1}_...'). Files live in 'map_iter/'
    inside the adapter folder.
    '''
    if n_iter == 1:
        target = 'reference'
    else:
        target = 'consensus_' + str(n_iter - 1)
    basename = 'mapped_to_' + target + '_' + fragment + '.' + type
    return data_folder + foldername_adapter(adaID) + 'map_iter/' + basename
def get_map_summary_filename(data_folder, adaID, fragment, rescue=False):
    '''Path of the mapping summary text file for one fragment.

    The file is 'summary_map{fragment}.txt' ('..._rescue.txt' when rescue is
    true) in the 'mapped/' subfolder of the adapter folder.
    '''
    # NOTE(review): there is no separator between 'summary_map' and the
    # fragment name (e.g. 'summary_mapF1.txt'); kept to preserve the path.
    stem = 'summary_map' + fragment
    if rescue:
        stem += '_rescue'
    relpath = 'mapped/' + (stem + '.txt')
    return data_folder + foldername_adapter(adaID) + relpath
def get_mapped_filename(data_folder, adaID, fragment, n_iter, type='bam'):
    '''Path of the reads mapped during iteration n_iter.

    The basename is 'mapped_to_reference_{fragment}.{type}' for iteration 1
    and 'mapped_to_consensus_{n_iter - 1}_{fragment}.{type}' otherwise; the
    file lives in 'map_iter/' inside the adapter folder.
    '''
    target = 'reference' if n_iter == 1 else 'consensus_' + str(n_iter - 1)
    basename = 'mapped_to_' + target + '_' + fragment + '.' + type
    return data_folder + foldername_adapter(adaID) + 'map_iter/' + basename
def get_map_summary_filename(data_folder, adaID, fragment, rescue=False):
    '''Path of the mapping summary text file for one fragment.

    'summary_map{fragment}.txt', with '_rescue' before the extension when
    rescue is true, in the 'mapped/' subfolder of the adapter folder.
    '''
    # NOTE(review): there is no separator between 'summary_map' and the
    # fragment name (e.g. 'summary_mapF1.txt'); kept to preserve the path.
    rescue_tag = '_rescue' if rescue else ''
    basename = 'summary_map' + fragment + rescue_tag + '.txt'
    return data_folder + foldername_adapter(adaID) + ('mapped/' + basename)
def __init__(self, *args, **kwargs):
    '''Initialize a sequenced sample.

    After the parent initializer has run, reads the 'seq run' and 'adapter'
    entries through self.loc and stores two derived entries on self:
    'folder' (sequencing run folder + adapter folder) and 'seqrun_folder'
    (sequencing run folder), both converted with str().
    '''
    super(SampleSeq, self).__init__(*args, **kwargs)

    from hivwholeseq.sequencing.filenames import get_seqrun_foldername
    from hivwholeseq.sequencing.adapter_info import foldername_adapter

    run_name = self.loc['seq run']
    adapter_id = self.loc['adapter']

    sample_folder = get_seqrun_foldername(run_name) + foldername_adapter(adapter_id)
    # 'folder' is set before 'seqrun_folder', as in the original order
    self['folder'] = str(sample_folder)
    self['seqrun_folder'] = str(get_seqrun_foldername(run_name))
def get_build_consensus_summary_filename(data_folder, adaID, fragment='general',
                                         iterative=True):
    '''Path of the summary file of the consensus building.

    'summary_build_consensus_{fragment}.txt' in the adapter folder, under
    'map_iter/' when iterative is true.
    '''
    relpath = 'summary_build_consensus_' + fragment + '.txt'
    if iterative:
        relpath = 'map_iter/' + relpath
    return data_folder + foldername_adapter(adaID) + relpath
def get_merged_allele_frequencies_filename(data_folder, adaID,
                                           fragments=('F1', 'F2', 'F3', 'F4', 'F5', 'F6')):
    '''Get the path of the merged allele frequencies of several fragments.

    Parameters:
        data_folder: base path the name is appended to.
        adaID: adapter ID passed to foldername_adapter().
        fragments: iterable of fragment names, joined with '-' in the file
            name. The default is an immutable tuple so that it cannot be
            altered between calls.

    Returns:
        data_folder + adapter folder + 'allele_frequencies_{F1-F2-...}.fasta'.
    '''
    # NOTE(review): the '.fasta' extension looks unusual for allele
    # frequencies; kept to preserve the path. Confirm it is intended.
    filename = 'allele_frequencies_' + '-'.join(fragments) + '.fasta'
    return data_folder + foldername_adapter(adaID) + filename
def __init__(self, *args, **kwargs):
    '''Initialize a sequenced sample.

    Runs the parent initializer, then derives two entries from the
    'seq run' and 'adapter' values read via self.loc: 'folder' (sequencing
    run folder + adapter folder) and 'seqrun_folder' (sequencing run folder).
    Both are stored as str.
    '''
    super(SampleSeq, self).__init__(*args, **kwargs)

    from hivwholeseq.sequencing.filenames import get_seqrun_foldername
    from hivwholeseq.sequencing.adapter_info import foldername_adapter

    run_name = self.loc['seq run']
    adapter_id = self.loc['adapter']

    # 'folder' is set before 'seqrun_folder', as in the original order
    self['folder'] = str(get_seqrun_foldername(run_name)
                         + foldername_adapter(adapter_id))
    self['seqrun_folder'] = str(get_seqrun_foldername(run_name))
def get_consensus_old_filename(data_folder, adaID, fragment, trim_primers=True):
    '''Path of the 'consensus_old_{fragment}' FASTA file in the adapter folder.

    '_with_primers' is inserted before '.fasta' when trim_primers is false.
    '''
    stem = 'consensus_old_' + fragment
    if not trim_primers:
        stem += '_with_primers'
    return data_folder + (foldername_adapter(adaID) + (stem + '.fasta'))
def get_merged_consensus_filename(data_folder, adaID=None,
                                  fragments=('F1', 'F2', 'F3', 'F4', 'F5', 'F6')):
    '''Get the path of the merged consensus of several fragments.

    Parameters:
        data_folder: base path the name is appended to.
        adaID: optional adapter ID; when given, the adapter folder is inserted
            after data_folder.
        fragments: iterable of fragment names, joined with '-' in the file
            name. The default is an immutable tuple so that it cannot be
            altered between calls.

    Returns:
        data_folder [+ adapter folder] + 'consensus_{F1-F2-...}.fasta'.
    '''
    filename = 'consensus_' + '-'.join(fragments) + '.fasta'
    if adaID is not None:
        filename = foldername_adapter(adaID) + filename
    return data_folder + filename
def get_reference_filename(data_folder, adaID, fragment, n_iter, ext=True):
    '''Path of the reference for the intermediate mapping n_iter.

    Iteration 1 uses the premap reference (get_reference_premap_filename);
    any other iteration uses 'consensus_{n_iter - 1}_{fragment}' in
    'map_iter/' inside the adapter folder. The '.fasta' extension is present
    only when ext is true.
    '''
    if n_iter == 1:
        path = get_reference_premap_filename(data_folder, adaID, fragment)
        # The premap reference path ends in '.fasta' (6 characters)
        return path if ext else path[:-6]

    stem = '_'.join(['consensus', str(n_iter - 1), fragment])
    path = data_folder + foldername_adapter(adaID) + 'map_iter/' + stem
    if ext:
        path = path + '.fasta'
    return path
def get_divided_filename(data_folder, adaID=None, fragment=None, type='bam', chunk=None):
    '''Get the filename of the BAM file divided for a single fragment.

    Parameters:
        data_folder: base path the name is appended to.
        adaID: optional adapter ID; when given, the adapter folder is inserted
            after data_folder.
        fragment: fragment name (required despite the None default).
        type: file extension without the dot.
        chunk: optional chunk number, added as '_chunk_{chunk}'.

    Returns:
        data_folder [+ adapter folder]
        + 'divided/divided_{fragment}[_chunk_{chunk}].{type}'.

    Raises:
        ValueError: if no fragment is given (previously this surfaced as an
            opaque TypeError from the string concatenation).
    '''
    if fragment is None:
        raise ValueError('Select a fragment')

    filename = 'divided/divided'
    if adaID is not None:
        filename = foldername_adapter(adaID) + filename
    filename = data_folder + filename + '_' + fragment
    if chunk is not None:
        filename = filename + '_chunk_' + str(chunk)
    return filename + '.' + type
def get_reference_filename(data_folder, adaID, fragment, n_iter, ext=True):
    '''Path of the reference for the intermediate mapping n_iter.

    For n_iter == 1 this is the premap reference from
    get_reference_premap_filename; otherwise it is
    'consensus_{n_iter - 1}_{fragment}' in 'map_iter/' inside the adapter
    folder. '.fasta' is present only when ext is true.
    '''
    if n_iter != 1:
        stem = '_'.join(['consensus', str(n_iter - 1), fragment])
        path = data_folder + foldername_adapter(adaID) + 'map_iter/' + stem
        return (path + '.fasta') if ext else path

    path = get_reference_premap_filename(data_folder, adaID, fragment)
    if not ext:
        # The premap reference path ends in '.fasta' (6 characters)
        path = path[:-6]
    return path
def get_divided_filenames(data_folder, adaID=None, fragments=None, type='bam'):
    '''Paths of the files obtained by dividing the reads by fragment.

    Returns one path per entry of fragments, followed by one path for each of
    the extra categories 'ambiguous', 'crossmapped', 'unmapped' and
    'low_quality'. Every path has the form
    data_folder [+ adapter folder] + 'divided/divided_{name}.{type}'.
    '''
    prefix = 'divided/divided'
    if adaID is not None:
        prefix = foldername_adapter(adaID) + prefix
    prefix = data_folder + prefix

    categories = list(fragments) + ['ambiguous', 'crossmapped',
                                    'unmapped', 'low_quality']
    return [prefix + '_' + category + '.' + type for category in categories]
def get_divided_filenames(data_folder, adaID=None, fragments=None, type='bam'):
    '''Paths of the files obtained by dividing the reads by fragment.

    One path per entry of fragments, then one for each of 'ambiguous',
    'crossmapped', 'unmapped' and 'low_quality'; each path is
    data_folder [+ adapter folder] + 'divided/divided_{name}.{type}'.
    '''
    if adaID is None:
        prefix = data_folder + 'divided/divided'
    else:
        prefix = data_folder + (foldername_adapter(adaID) + 'divided/divided')

    extra_categories = ['ambiguous', 'crossmapped', 'unmapped', 'low_quality']
    paths = []
    for category in list(fragments) + extra_categories:
        paths.append(prefix + '_' + category + '.' + type)
    return paths
def get_read_filenames(data_folder, adaID=None, fragment=None, suffix='',
                       gzip=False, trimmed=False):
    '''Paths of the two demultiplexed read files (read1, read2).

    Each path is data_folder [+ adapter folder] + 'read1' or 'read2', then
    '_trimmed' when trimmed is true, then suffix, then '.fastq', then '.gz'
    when gzip is true. The fragment argument is accepted but not used.
    '''
    paths = []
    for stem in ('read1', 'read2'):
        if adaID is not None:
            stem = foldername_adapter(adaID) + stem
        path = data_folder + stem
        if trimmed:
            path += '_trimmed'
        path = path + suffix + '.fastq'
        if gzip:
            path += '.gz'
        paths.append(path)
    return paths
def get_divided_filename(data_folder, adaID=None, fragment=None, type='bam',
                         chunk=None):
    '''Get the filename of the BAM file divided for a single fragment.

    Parameters:
        data_folder: base path the name is appended to.
        adaID: optional adapter ID; when given, the adapter folder is inserted
            after data_folder.
        fragment: fragment name (required despite the None default).
        type: file extension without the dot.
        chunk: optional chunk number, added as '_chunk_{chunk}'.

    Returns:
        data_folder [+ adapter folder]
        + 'divided/divided_{fragment}[_chunk_{chunk}].{type}'.

    Raises:
        ValueError: if no fragment is given (previously this surfaced as an
            opaque TypeError from the string concatenation).
    '''
    if fragment is None:
        raise ValueError('Select a fragment')

    filename = 'divided/divided'
    if adaID is not None:
        filename = foldername_adapter(adaID) + filename
    filename = data_folder + filename + '_' + fragment
    if chunk is not None:
        filename = filename + '_chunk_' + str(chunk)
    return filename + '.' + type
def get_read_filenames(data_folder, adaID=None, fragment=None, suffix='',
                       gzip=False, trimmed=False):
    '''Paths of the two demultiplexed read files (read1, read2).

    Each path is data_folder [+ adapter folder] + 'read1' or 'read2',
    followed by '_trimmed' (if trimmed), suffix, '.fastq' and '.gz' (if
    gzip). The fragment argument is accepted but not used.
    '''
    def build(stem):
        # Assemble the full path for one read file
        if adaID is not None:
            stem = foldername_adapter(adaID) + stem
        path = data_folder + stem
        if trimmed:
            path = path + '_trimmed'
        path = path + suffix + '.fastq'
        if gzip:
            path = path + '.gz'
        return path

    return [build('read1'), build('read2')]
def make_output_folders(data_folder, adapters_designed, VERBOSE=0, summary=True):
    '''Make output folders for all adapters and unclassified (e.g. PhiX).

    Parameters:
        data_folder: base path; folder names are appended by concatenation.
        adapters_designed: iterable of 2-item entries whose first item is the
            adapter ID; the second item is not used here.
        VERBOSE: if true, print every folder after it has been created.
        summary: if true, append a short note to the demultiplex summary file
            (path from get_demultiplex_summary_filename).
    '''
    from hivwholeseq.utils.generic import mkdirs

    # Make folders for the samples
    for (adaID, _) in adapters_designed:
        dirname = foldername_adapter(adaID)
        mkdirs(data_folder + dirname)
        if VERBOSE:
            # Single-string print works as a statement (py2) and a function (py3)
            print('Folder created: %s' % (dirname,))

    # Make a default directory for unclassified reads
    mkdirs(data_folder + 'unclassified_reads')
    if VERBOSE:
        print('Folder created: unclassified reads')

    if summary:
        with open(get_demultiplex_summary_filename(data_folder), 'a') as f:
            f.write('\n'
                    'Folders created for samples and unclassified reads (including phix).'
                    '\n')
def get_premapped_filename(data_folder, adaID=None, type='bam', bwa=False,
                           part=None, unsorted=False):
    '''Path of the reads premapped to the reference (before fragment split).

    Basename 'premapped' in the 'premapped/' folder (inside the adapter folder
    when adaID is given), followed by '_part{part}' or else '_unsorted', then
    '_bwa', then the extension for type.

    Raises:
        ValueError: if type is neither 'sam' nor 'bam'.
    '''
    relpath = 'premapped/' + 'premapped'
    if adaID is not None:
        relpath = foldername_adapter(adaID) + relpath

    # A part number takes precedence over the unsorted tag
    if part is not None:
        relpath = relpath + '_part' + str(part)
    elif unsorted:
        relpath = relpath + '_unsorted'

    if bwa:
        relpath = relpath + '_bwa'

    if type == 'sam':
        return data_folder + (relpath + '.sam')
    if type == 'bam':
        return data_folder + (relpath + '.bam')
    raise ValueError('Type of mapped reads file not recognized')
def get_fragment_positions_filename(data_folder, adaID):
    '''Path of the file with the fragment positions in the premap reference.

    The file is 'fragment_positions_premapped.dat' in the 'divided/' subfolder
    of the adapter folder.
    '''
    relpath = 'divided/fragment_positions_premapped.dat'
    return data_folder + foldername_adapter(adaID) + relpath
def get_mutations_file(data_folder, adaID, fragment):
    '''Path of the file with the mutations for all reads.

    The file is 'mutations_{fragment}.pickle' in the adapter folder.
    '''
    basename = 'mutations_' + fragment + '.pickle'
    return data_folder + (foldername_adapter(adaID) + basename)
def get_filter_mapped_summary_filename(data_folder, adaID, fragment):
    '''Path of the filter summary text file for one fragment.

    The file is 'summary_filter_{fragment}.txt' in the 'mapped/' subfolder of
    the adapter folder.
    '''
    relpath = 'mapped/' + ('summary_filter_' + fragment + '.txt')
    return data_folder + foldername_adapter(adaID) + relpath
def get_filter_mapped_summary_filename(data_folder, adaID, fragment):
    '''Path of the filter summary text file for one fragment.

    'summary_filter_{fragment}.txt' in the 'mapped/' subfolder of the adapter
    folder.
    '''
    basename = 'summary_filter_' + fragment + '.txt'
    adapter_folder = foldername_adapter(adaID)
    return data_folder + adapter_folder + ('mapped/' + basename)
def get_fragment_positions_filename(data_folder, adaID):
    '''Path of the file with the fragment positions in the premap reference.

    'fragment_positions_premapped.dat' in the 'divided/' subfolder of the
    adapter folder.
    '''
    adapter_folder = foldername_adapter(adaID)
    return data_folder + adapter_folder + 'divided/fragment_positions_premapped.dat'
def get_premap_summary_filename(data_folder, adaID):
    '''Path of the summary text file of the premapping to the reference.

    The file is 'summary_premapped.txt' in the 'premapped/' subfolder of the
    adapter folder.
    '''
    relpath = 'premapped/summary_premapped.txt'
    return data_folder + foldername_adapter(adaID) + relpath
def get_divide_summary_filename(data_folder, adaID):
    '''Path of the summary text file of the division into fragments.

    The file is 'summary_divide.txt' in the 'divided/' subfolder of the
    adapter folder.
    '''
    relpath = 'divided/summary_divide.txt'
    return data_folder + foldername_adapter(adaID) + relpath
def get_premap_summary_filename(data_folder, adaID):
    '''Path of the summary text file of the premapping to the reference.

    'summary_premapped.txt' in the 'premapped/' subfolder of the adapter
    folder.
    '''
    adapter_folder = foldername_adapter(adaID)
    return data_folder + adapter_folder + 'premapped/summary_premapped.txt'
def get_trim_summary_filename(data_folder, adaID):
    '''Path of the summary text file of the low-quality trimming.

    The file is 'summary_trim.txt', directly in the adapter folder.
    '''
    return data_folder + foldername_adapter(adaID) + 'summary_trim.txt'
def get_divide_summary_filename(data_folder, adaID):
    '''Path of the summary text file of the division into fragments.

    'summary_divide.txt' in the 'divided/' subfolder of the adapter folder.
    '''
    adapter_folder = foldername_adapter(adaID)
    return data_folder + adapter_folder + 'divided/summary_divide.txt'
def get_trim_summary_filename(data_folder, adaID):
    '''Path of the summary text file of the low-quality trimming.

    'summary_trim.txt', directly in the adapter folder.
    '''
    adapter_folder = foldername_adapter(adaID)
    return data_folder + adapter_folder + 'summary_trim.txt'
def get_read_unpaired_filename(data_folder, adaID):
    '''Path of the read pairs for which one read is low quality.

    The file is 'reads_unpaired.fastq', directly in the adapter folder.
    '''
    return data_folder + (foldername_adapter(adaID) + 'reads_unpaired.fastq')
def get_mapped_suspicious_filename(data_folder, adaID, fragment, type='bam'):
    '''Path of the mapped reads with many mutations from the consensus.

    The file is '{fragment}_suspicious.{type}' in the 'mapped/' subfolder of
    the adapter folder.
    '''
    basename = fragment + '_suspicious.' + type
    return data_folder + foldername_adapter(adaID) + 'mapped/' + basename
def get_merged_allele_frequencies_filename(data_folder, adaID,
                                           fragments=('F1', 'F2', 'F3', 'F4', 'F5', 'F6')):
    '''Get the path of the merged allele frequencies of several fragments.

    Parameters:
        data_folder: base path the name is appended to.
        adaID: adapter ID passed to foldername_adapter().
        fragments: iterable of fragment names, joined with '-' in the file
            name. The default is an immutable tuple so that it cannot be
            altered between calls.

    Returns:
        data_folder + adapter folder + 'allele_frequencies_{F1-F2-...}.fasta'.
    '''
    # NOTE(review): the '.fasta' extension looks unusual for allele
    # frequencies; kept to preserve the path. Confirm it is intended.
    filename = 'allele_frequencies_' + '-'.join(fragments) + '.fasta'
    return data_folder + foldername_adapter(adaID) + filename
def get_read_unpaired_filename(data_folder, adaID):
    '''Path of the read pairs for which one read is low quality.

    'reads_unpaired.fastq', directly in the adapter folder.
    '''
    relpath = foldername_adapter(adaID) + 'reads_unpaired.fastq'
    return data_folder + relpath
def get_allele_frequencies_filename(data_folder, adaID, fragment):
    '''Path of the file with the corrected allele frequencies.

    The file is 'allele_frequencies_{fragment}.npy' in the adapter folder.
    '''
    basename = 'allele_frequencies_' + fragment + '.npy'
    return data_folder + (foldername_adapter(adaID) + basename)
def get_mapped_suspicious_filename(data_folder, adaID, fragment, type='bam'):
    '''Path of the mapped reads with many mutations from the consensus.

    '{fragment}_suspicious.{type}' in the 'mapped/' subfolder of the adapter
    folder.
    '''
    basename = fragment + '_suspicious.' + type
    mapped_folder = data_folder + foldername_adapter(adaID) + 'mapped/'
    return mapped_folder + basename
def get_allele_frequencies_filename(data_folder, adaID, fragment):
    '''Path of the file with the corrected allele frequencies.

    'allele_frequencies_{fragment}.npy' in the adapter folder.
    '''
    basename = 'allele_frequencies_' + fragment + '.npy'
    relpath = foldername_adapter(adaID) + basename
    return data_folder + relpath
def get_mutations_file(data_folder, adaID, fragment):
    '''Path of the file with the mutations for all reads.

    'mutations_{fragment}.pickle' in the adapter folder.
    '''
    basename = 'mutations_' + fragment + '.pickle'
    relpath = foldername_adapter(adaID) + basename
    return data_folder + relpath