def generate_uxi_library(self):
    # Perform sequence analysis (read-parsing, clustering, pairing UEIs/UMIs,
    # sub-sampling data for rarefaction analyses)
    if not sysOps.check_file_exists('uxi_lib_tasklist.csv'):
        # create task list for library processing
        [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
        with open(sysOps.globaldatapath + 'uxi_lib_tasklist.csv', 'w') as task_input_file_handle:
            for subdir in subdirnames:
                if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                    task_input_file_handle.write('generate_uxi_library;' + sysOps.globaldatapath + subdir + '//\n')
    original_datapath = str(sysOps.globaldatapath)
    [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'uxi_lib_tasklist.csv',
                                                           'generate_uxi_library')
    if not (my_task is None):
        sysOps.initiate_runpath(str(my_task[1]))
        myLibObj = libOps.libObj(settingsfilename='libsettings.txt', output_prefix='_')
        if not sysOps.check_file_exists(myLibObj.output_prefix + 'lib_stats.txt'):
            myLibObj.partition_fastq_library(discarded_sequence_path="discarded_sequences.fastq",
                                             mean_phred_score_path="mean_phred_scores.txt")
        self.generate_cluster_analysis()
        libOps.subsample(myLibObj.seqform_for_params, myLibObj.seqform_rev_params, myLibObj.output_prefix)
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()
        dirnames = list([subdirname for subdirname in subdirnames if subdirname.startswith('sub')])
        sysOps.throw_status('Performing cluster analysis on sub-directories: ' + str(dirnames))
        for dirname in dirnames:
            sysOps.initiate_runpath(str(my_task[1]) + dirname + '//')
            self.generate_cluster_analysis()
        sysOps.globaldatapath = str(original_datapath)
        if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
            sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log '
                                   + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
            sysOps.exitProgram()
def consolidate_uxi(uxi_file, start_index=0, prefix='', include_inv_amp=False):
    # Generates file ("identical_" + uxi_file) listing identical unique uxi's (perfectly matched),
    # with indices and the number of corresponding reads.
    # aux_info_file, if provided, contains line-by-line auxiliary assignments
    # (stagger + amplicon-identity, if either exist)
    uxi_lib = dict()  # build dictionary directly in memory
    with open(sysOps.globaldatapath + uxi_file, 'rU') as fasta_handle:
        sysOps.throw_status('Proceeding with consolidation ...')
        for my_record in SeqIO.parse(fasta_handle, "fasta"):
            my_seq = str(my_record.seq)
            if my_seq in uxi_lib:
                uxi_lib[my_seq].append(str(my_record.id))
            else:
                uxi_lib[my_seq] = [my_record.id]
            uxi_len = len(my_seq)  # final sequence used
    if include_inv_amp:
        with open(sysOps.globaldatapath + uxi_file[:uxi_file.find('.')] + '_amp_inv'
                  + uxi_file[uxi_file.find('.'):], 'rU') as fasta_handle:
            sysOps.throw_status('Proceeding with consolidation, including invalid amplicons ...')
            for my_record in SeqIO.parse(fasta_handle, "fasta"):
                my_seq = str(my_record.seq)
                if my_seq in uxi_lib:
                    uxi_lib[my_seq].append(str(my_record.id))
                else:
                    uxi_lib[my_seq] = [my_record.id]
    uxi_list_handle = open(sysOps.globaldatapath + prefix + "identical_" + uxi_file, 'w')
    uxi_index = int(start_index)
    for my_uxi_key, my_uxi_record_ids in sorted(uxi_lib.items()):  # alphabetize by uxi sequence
        # output line includes uxi index and number of reads
        uxi_list_handle.write(str(my_uxi_key) + '_' + str(uxi_index) + '_' + str(len(my_uxi_record_ids)) + '\n')
        for my_record_id in my_uxi_record_ids:
            uxi_list_handle.write(my_record_id + '\n')
        uxi_index += 1
    uxi_list_handle.close()
    del uxi_lib
    return [uxi_index, uxi_len]  # returns total number of unique entries and length of uxi itself
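# Illustrative sketch (not called by the pipeline) of the consolidation pattern above, reduced
# to plain strings: group identical sequences, then emit one header line per unique sequence
# ("<seq>_<index>_<readcount>") followed by its read IDs, mirroring the "identical_" file
# format. All names here are hypothetical.
def _demo_consolidate(records):
    # records: list of (read_id, sequence) tuples
    groups = dict()
    for read_id, seq in records:
        groups.setdefault(seq, []).append(read_id)
    lines = []
    for index, (seq, ids) in enumerate(sorted(groups.items())):  # alphabetize by sequence
        lines.append('%s_%d_%d' % (seq, index, len(ids)))
        lines.extend(ids)
    return lines

# Example: _demo_consolidate([('r1', 'ACGT'), ('r2', 'ACGT'), ('r3', 'TTAA')])
# -> ['ACGT_0_2', 'r1', 'r2', 'TTAA_1_1', 'r3']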
def threshold_cluster_uxi_prelinked(uxi_list, identical_uxi_filename, threshold, P=0, subsample=-1, prefix=''):
    # Called while loading linkage_file into uxi_list through
    # load_linkage_file_to_list(linkage_file) in hashAlignments.py
    # Format of linkage file:
    #   uxi-sequence, self-read-number, RND, list of linked-to indices with self-index first in line
    # uxi_list elements: [uxi-sequence, self-read-number, RND, [list of linked-to indices with self-index first in line]]
    # sort uxi_list by decreasing RND
    num_uxi = len(uxi_list)
    sysOps.throw_status('Starting uxi list sort. List size = ' + str(num_uxi))
    # note: sorted() copies the outer list only; elements remain shared with uxi_list
    sorted_uxi_list = sorted(uxi_list, key=lambda row: -row[2])
    index_vals = [-1 for i in range(num_uxi)]
    sysOps.throw_status('Completed uxi list sort. Assigning EASL-clusters ...')
    for sorted_uxi_el in sorted_uxi_list:
        # index_vals, with indices corresponding to _original_ positions in the pre-sorted uxi_list,
        # are initiated at -1 (self-indices stored at row[3][0])
        # uxi's accepted into the cluster with seed of index i are given value i in index_vals;
        # uxi's rejected from all classification retain index -1
        if index_vals[sorted_uxi_el[3][0]] < 0:  # this seed has index -1 (has not been assigned to any seed itself)
            index_vals[sorted_uxi_el[3][0]] = int(sorted_uxi_el[3][0])  # set cluster seed to itself
        my_index_val = int(index_vals[sorted_uxi_el[3][0]])
        for i in range(1, len(sorted_uxi_el[3])):
            if index_vals[sorted_uxi_el[3][i]] < 0:  # connected read is unassigned -- assign to current cluster seed
                index_vals[sorted_uxi_el[3][i]] = my_index_val
    sysOps.throw_status('Consolidating clustered uxis ...')
    # consolidate clustered uxi's
    if -1 in index_vals:
        sysOps.throw_exception('Error: UNASSIGNED/UNCLUSTERED uxis. Exiting program')
        sysOps.exitProgram()
    index_str_vals = [str(int(x)) for x in index_vals]
    new_uxi_dict = dict()
    for i in range(num_uxi):
        my_index_str = index_str_vals[i]
        if my_index_str in new_uxi_dict:
            new_uxi_dict[my_index_str].append(uxi_list[i][0] + "_" + str(uxi_list[i][1]))
        else:
            new_uxi_dict[my_index_str] = [(uxi_list[i][0] + "_" + str(uxi_list[i][1]))]
    if subsample <= 0:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold)
                              + "_" + identical_uxi_filename, 'w')
    else:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold)
                              + "_sub" + str(subsample) + identical_uxi_filename, 'w')
    i = 0
    for dict_el in new_uxi_dict:
        for el in new_uxi_dict[dict_el]:
            new_uxi_handle.write(str(i) + "_" + el + "\n")
        i += 1
    new_uxi_handle.close()
    sysOps.throw_status('Completed clustering.')
    return True
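# Minimal sketch of the seed-assignment pass above (hypothetical toy data, not pipeline code):
# entries are [sequence, read-count, RND, [self-index, linked-to indices...]]; processing in
# decreasing-RND order lets high-abundance entries seed clusters that absorb their still
# unassigned linked neighbors.
def _demo_easl(linkage):
    index_vals = [-1] * len(linkage)
    for el in sorted(linkage, key=lambda row: -row[2]):
        if index_vals[el[3][0]] < 0:
            index_vals[el[3][0]] = el[3][0]      # unassigned entry seeds its own cluster
        seed = index_vals[el[3][0]]
        for j in el[3][1:]:
            if index_vals[j] < 0:                # unassigned neighbor joins this seed's cluster
                index_vals[j] = seed
    return index_vals

# _demo_easl([['AAAA', 9, 9.0, [0, 1]], ['AAAT', 2, 2.0, [1, 0]], ['CCCC', 5, 5.0, [2]]])
# -> [0, 0, 2]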
def generate_wmat(consensus_pairing_csv_file, minreadcount, min_uei_count, outfilename='wmat.csv'):
    # consensus_pairing_csv_file has elements:
    #   uei index, beacon-umi index, target-umi index, read-count
    # if outfilename == None, does not print data to new files
    [bcn_dict, trg_dict,
     bcn_abund_dict, trg_abund_dict,
     bcn_div_dict, trg_div_dict] = get_umi_uei_matrices(consensus_pairing_csv_file, minreadcount)
    if len(trg_dict) == 0 or len(bcn_dict) == 0:
        sysOps.throw_exception(consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    sysOps.throw_status(['Generating feature list.', sysOps.statuslogfilename])
    # collect salient pieces of information on targets for printing to file later
    trg_feature_dict_list = get_features_from_dict(trg_dict)
    [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict] = filter_mats(bcn_dict, trg_dict,
                                                                   bcn_div_dict, trg_div_dict,
                                                                   min_uei_count)
    sysOps.throw_status(['Replacing matrix elements with UEI numbers (scalars).', sysOps.statuslogfilename])
    del bcn_dict
    sysOps.throw_status(['Generating weight matrix.', sysOps.statuslogfilename])
    if len(trg_dict) == 0:
        sysOps.throw_exception('After filtering, ' + consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    if outfilename != None:
        print_features(trg_dict, 'trg_' + outfilename, trg_feature_dict_list)
    return trg_dict
def __init__(self, paramfilename):
    # Open parameter-file
    sysOps.throw_status("Reading from " + sysOps.globaldatapath + paramfilename + " ...")
    sim_settings = fileOps.read_settingsfile_to_dictionary(sysOps.globaldatapath + paramfilename)
    self.effic_monomer = float(sim_settings['-effic_monomer'][0])
    self.effic_dimer = float(sim_settings['-effic_dimer'][0])
    self.diffconst = float(sim_settings['-diffconst'][0])
    self.lin_cycles = int(sim_settings['-lin_cycles'][0])
    self.exp_cycles = int(sim_settings['-exp_cycles'][0])
    self.posfilename = str(sim_settings['-posfilename'][0])
    # position file contains columns: UMI-index (stored as-is for later),
    # 0 for bcn / 1 for trg, x-coordinate, y-coordinate
    raw_image_csv = np.loadtxt(sysOps.globaldatapath + self.posfilename, delimiter=',')
    raw_image_csv = raw_image_csv[np.argsort(raw_image_csv[:, 1]), :]  # arranged as beacons followed by targets
    self.sim_pos = np.array(raw_image_csv[:, 2:], dtype=np.float64)
    self.sim_dims = self.sim_pos.shape[1]
    self.Nbcn = np.sum(raw_image_csv[:, 1] == 0)
    self.Ntrg = np.sum(raw_image_csv[:, 1] == 1)
    self.Nuei = int(sim_settings['-uei_per_bcn_umi'][0]) * self.Nbcn
    self.N_reads = int(sim_settings['-reads_per_uei'][0]) * self.Nuei
    self.index_key = np.int64(raw_image_csv[:, 0])
    self.sim_pos = np.append(self.sim_pos,
                             np.ones([self.Nbcn + self.Ntrg, 1], dtype=np.float64),
                             axis=1)  # number of starting molecules is always = 1
    sysOps.throw_status("Assigned point-dimensionality to " + str(self.sim_dims))
    return
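# Hedged sketch of the position-file layout read above (columns: UMI-index, 0 for beacon /
# 1 for target, then coordinates). A toy generator for testing, not pipeline code; all names
# are hypothetical.
def _demo_write_posfile(path, n_bcn=3, n_trg=2, dims=2):
    import numpy as np
    n = n_bcn + n_trg
    rows = np.zeros([n, 2 + dims])
    rows[:, 0] = np.arange(n)                # UMI-index, stored as-is
    rows[n_bcn:, 1] = 1                      # beacon rows get 0, target rows get 1
    rows[:, 2:] = np.random.rand(n, dims)    # uniform toy coordinates
    np.savetxt(path, rows, delimiter=',')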
def initiate_hash_alignment(uxi_file, P=0.0):
    '''
    Takes in a specific uxi_file, already formatted from source, consolidates identical
    sequences, performs hash-alignment, and clusters them. Each of these tasks is skipped,
    in order, if it is found up-to-date based on dates-of-modification.
    '''
    identical_uxi_file = 'identical_' + uxi_file
    consolidation_up_to_date = False
    clustering_up_to_date = False
    alignment_up_to_date = False
    [dirnames, filenames] = sysOps.get_directory_and_file_list()
    if identical_uxi_file in filenames:
        # consolidation is up-to-date if the identical-consolidation file was modified
        # later than uxi_file itself
        consolidation_up_to_date = (os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime
                                    > os.stat(sysOps.globaldatapath + uxi_file).st_mtime)
    sysOps.throw_status(['Consolidation up-to-date = ' + str(consolidation_up_to_date),
                         sysOps.statuslogfilename])
    if ('linked_' + identical_uxi_file) in filenames:
        # alignment is up-to-date if the linkage file was modified later than the
        # identical-consolidation file
        alignment_up_to_date = (os.stat(sysOps.globaldatapath + 'linked_' + identical_uxi_file).st_mtime
                                > os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime)
    sysOps.throw_status(['Alignment up-to-date = ' + str(alignment_up_to_date),
                         sysOps.statuslogfilename])
    if ('thresh1_' + identical_uxi_file) in filenames:
        # clustering is up-to-date if the threshold-clustering file was modified later
        # than the identical-consolidation file
        clustering_up_to_date = (os.stat(sysOps.globaldatapath + 'thresh1_' + identical_uxi_file).st_mtime
                                 > os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime)
    sysOps.throw_status(['Clustering up-to-date = ' + str(clustering_up_to_date),
                         sysOps.statuslogfilename])
    if not (consolidation_up_to_date and alignment_up_to_date):
        # write placeholder file
        with open(sysOps.globaldatapath + 'thresh1_' + identical_uxi_file, 'w') as placeholderfile:
            placeholderfile.write('In progress.')
        if not consolidation_up_to_date:
            sysOps.throw_status(['Consolidation not up to date, consolidating file '
                                 + sysOps.globaldatapath + uxi_file, sysOps.statuslogfilename])
            [num_elements, uxi_len] = alignOps.consolidate_uxi(uxi_file, start_index=0,
                                                               prefix='', include_inv_amp=False)
        else:
            # fetch uxi_len
            sysOps.throw_status(['Consolidation up to date, reading from file '
                                 + sysOps.globaldatapath + identical_uxi_file, sysOps.statuslogfilename])
            with open(sysOps.globaldatapath + identical_uxi_file, 'rU') as uxi_handle:
                for uxi_line in uxi_handle:
                    split_str = uxi_line.split('_')
                    if len(split_str) == 3:
                        # first element of identical-sequence file is the U(M/E)I sequence itself
                        uxi_len = len(split_str[0])
                        break
        for mismatch_pos in range(uxi_len):
            # output members (and abundances) of substrings corresponding to all characters
            # except the one at mismatch_pos
            # format as follows -- substring: member1-index_abundance1,member2-index_abundance2,...
            sysOps.throw_status(['Performing hash alignment on position ' + str(mismatch_pos),
                                 sysOps.statuslogfilename])
            output_hashed_mismatch_alignment(identical_uxi_file, mismatch_pos,
                                             'mis' + str(mismatch_pos) + '_' + identical_uxi_file)
        sysOps.throw_status(['Hash alignments complete. Proceeding to assemble linked file.',
                             sysOps.statuslogfilename])
        generate_linkage_file(identical_uxi_file,
                              ['mis' + str(mismatch_pos) + '_' + identical_uxi_file
                               for mismatch_pos in range(uxi_len)],
                              "linked_" + identical_uxi_file, P)
        # now that the linkage file has been constructed, delete hash-alignment files
        for hash_filename in ['mis' + str(mismatch_pos) + '_' + identical_uxi_file
                              for mismatch_pos in range(uxi_len)]:
            os.remove(sysOps.globaldatapath + hash_filename)
    if not clustering_up_to_date:
        sysOps.delay_with_alertfile('_cluster_inprog' + uxi_file)
        clustOps.threshold_cluster_uxi_prelinked(alignOps.load_linkage_file_to_list("linked_" + identical_uxi_file),
                                                 identical_uxi_file, 1, P)
        clustering_up_to_date = True
        sysOps.remove_alertfile('_cluster_inprog' + uxi_file)
    return clustering_up_to_date
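# Illustrative sketch of the hash-alignment idea used above (toy code, hypothetical names):
# for each position p, bucket sequences by the substring formed by deleting position p;
# two sequences sharing a bucket differ by at most a substitution at p, so candidate
# single-mismatch links are found without all-pairs comparison.
def _demo_hash_align(seqs, mismatch_pos):
    buckets = dict()
    for index, seq in enumerate(seqs):
        key = seq[:mismatch_pos] + seq[(mismatch_pos + 1):]
        buckets.setdefault(key, []).append(index)
    return buckets

# _demo_hash_align(['ACGT', 'AGGT', 'TTTT'], 1) -> {'AGT': [0, 1], 'TTT': [2]}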
def print_final_results(trgcalls_filename, trgseq_filename):
    # output final_*.csv containing columns: index, -1 (beacon) / target-amplicon match, x, y, ..., segment
    # output final_feat*.csv containing columns: index, features, consensus sequence (if target)
    [dirnames, filenames] = sysOps.get_directory_and_file_list()
    seq_dat_filename = [filename for filename in filenames if filename.startswith('seq_params')]
    seq_dat_filename = seq_dat_filename[0][len('seq_params_'):]
    for result_dat_file in filenames:
        if (result_dat_file.startswith('Xumi_')
                and not (sysOps.check_file_exists('final_' + result_dat_file))):
            key_dat_file = 'key' + seq_dat_filename[(seq_dat_filename.find('_')):]
            if sysOps.check_file_exists(key_dat_file):
                coords_dict = dict()
                sysOps.throw_status('Generating final output for ' + sysOps.globaldatapath + str(result_dat_file))
                result = np.loadtxt(sysOps.globaldatapath + result_dat_file, delimiter=',')
                for i in range(result.shape[0]):
                    coords_dict[str(int(result[i, 0]))] = ','.join([str(x) for x in result[i, 1:]])
                trg_match_dict = dict()
                trg_match_file = open(sysOps.globaldatapath + trgcalls_filename, 'rU')
                trg_seq_file = open(sysOps.globaldatapath + trgseq_filename, 'rU')
                for line, fasta_record in itertools.izip(trg_match_file, SeqIO.parse(trg_seq_file, "fasta")):
                    [trg_umi_index, max_match, max_tally, tot_tally] = line.strip('\n').split(',')
                    trg_match_dict[trg_umi_index] = [str(max_match), str(max_tally),
                                                     str(tot_tally), str(fasta_record.seq)]
                trg_match_file.close()
                trg_seq_file.close()
                outfile = open(sysOps.globaldatapath + '//final_' + result_dat_file, 'w')
                outfile_feat = open(sysOps.globaldatapath + '//final_feat_' + result_dat_file, 'w')
                bcn_excluded = 0
                trg_excluded = 0
                with open(sysOps.globaldatapath + key_dat_file, 'rU') as key_file:
                    # key file columns: 0 or 1 (for beacon or target, respectively),
                    # cluster-index, MLE processing index
                    for line in key_file:
                        [bcn0trg1, orig_index, mle_index] = line.strip('\n').split(',')
                        if mle_index in coords_dict:
                            outfile.write(orig_index + ',' + coords_dict[mle_index] + '\n')
                            if bcn0trg1 == '0':
                                outfile_feat.write(orig_index + ',-1,-1,-1,N\n')
                            else:
                                outfile_feat.write(orig_index + ','
                                                   + ','.join(trg_match_dict[orig_index]) + '\n')
                        else:
                            if bcn0trg1 == '0':
                                bcn_excluded += 1
                            else:
                                trg_excluded += 1
                sysOps.throw_status(str(bcn_excluded) + ' beacons, ' + str(trg_excluded)
                                    + ' targets excluded from final estimation')
                outfile.close()
                outfile_feat.close()
            else:
                sysOps.throw_exception(sysOps.globaldatapath + key_dat_file + ' does not exist.')
    return
def assign_consensus_pairs(pairing_csv_file, min_pairing_readcount):
    '''
    Assumes CSV file with columns:
    1. UEI cluster-index
    2. Beacon UMI cluster-index
    3. Target UMI cluster-index
    4. Read-number
    5. Set-index (0 throughout if invalid-amplicon reads are excluded)
    '''
    sysOps.throw_status('Loading pairing file ' + pairing_csv_file + ' ...')
    uei_clust_index_dict = dict()
    with open(sysOps.globaldatapath + pairing_csv_file, 'rU') as csvfile:
        for line in csvfile:
            row = line.strip('\n').split(',')
            index_str = str(row[0])  # UEI cluster-index
            # each dictionary entry is a list of rows holding the beacon- and target-umi
            # cluster indices, the read-number, and the set-index
            if index_str in uei_clust_index_dict:
                uei_clust_index_dict[index_str].append([int(row[1]), int(row[2]), int(row[3]), int(row[4])])
            else:
                uei_clust_index_dict[index_str] = [[int(row[1]), int(row[2]), int(row[3]), int(row[4])]]
    # replace each entry with the umi pairing having a plurality of reads, in the same indexed format
    sysOps.throw_status('Generating consensus-pairs ...')
    discarded_ueis = 0
    accepted_ueis = 0
    for uei_clust_el in uei_clust_index_dict:
        maxcount = 0
        secondmaxcount = 0  # tracked to detect ties; discard if a tie exists
        maxcount_pair_bcn_index = -1
        maxcount_pair_trg_index = -1
        maxcount_set_index = -1
        for row in uei_clust_index_dict[uei_clust_el]:
            if row[2] >= min_pairing_readcount and row[2] > maxcount:
                secondmaxcount = int(maxcount)
                if maxcount_set_index >= 0 and maxcount_set_index != row[3]:
                    sysOps.throw_exception('Error: set-index mismatch.')
                    sysOps.exitProgram()
                maxcount_pair_bcn_index = int(row[0])
                maxcount_pair_trg_index = int(row[1])
                maxcount = int(row[2])
                maxcount_set_index = int(row[3])
            elif row[2] >= min_pairing_readcount and row[2] > secondmaxcount:
                secondmaxcount = int(row[2])
        if maxcount >= min_pairing_readcount and maxcount > secondmaxcount:
            # note: this condition requires not only that the uei have at least
            # min_pairing_readcount reads, but that the plurality-tally itself must be at
            # least min_pairing_readcount as well
            uei_clust_index_dict[uei_clust_el] = list([int(maxcount_pair_bcn_index),
                                                       int(maxcount_pair_trg_index),
                                                       int(maxcount),
                                                       int(maxcount_set_index)])
            accepted_ueis += 1
        else:
            uei_clust_index_dict[uei_clust_el] = list()
            discarded_ueis += 1
    sysOps.throw_status('Outputting consensus-pairs with at least ' + str(min_pairing_readcount)
                        + ' read-plurality. Accepted ' + str(accepted_ueis)
                        + ' UEIs, discarded ' + str(discarded_ueis) + ' UEIs ...')
    # output rows: uei-index, beacon-umi-index, target-umi-index, read-count, set-index
    outfile_handle = open(sysOps.globaldatapath + "consensus_" + str(min_pairing_readcount)
                          + "r_" + pairing_csv_file, 'w')
    for uei_clust_el in uei_clust_index_dict:
        if len(uei_clust_index_dict[uei_clust_el]) > 0:
            outfile_handle.write(uei_clust_el + ","
                                 + ",".join([str(s) for s in uei_clust_index_dict[uei_clust_el]]) + "\n")
    outfile_handle.close()
    return
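# Minimal sketch of the plurality vote with tie-discard above (toy rows, hypothetical names):
# rows are (bcn, trg, reads); a consensus is returned only when the top read-count clears
# min_reads and strictly beats the runner-up.
def _demo_plurality(rows, min_reads):
    best, second, best_pair = 0, 0, None
    for bcn, trg, reads in rows:
        if reads >= min_reads and reads > best:
            second = best
            best, best_pair = reads, (bcn, trg)
        elif reads >= min_reads and reads > second:
            second = reads
    if best >= min_reads and best > second:
        return best_pair
    return None  # discarded: tie or below threshold

# _demo_plurality([(5, 7, 4), (5, 8, 4)], 2) -> None (tie)
# _demo_plurality([(5, 7, 6), (5, 8, 2)], 2) -> (5, 7)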
def assign_umi_pairs(uei_cluster_file, bcn_umi_cluster_file, trg_umi_cluster_file,
                     uei_fasta_file, bcn_umi_fasta_file, trg_umi_fasta_file,
                     outfile_prefix, filter_val=0.75, include_inv_amp=False):
    # at most filter_val fraction of total bases in a given uxi are allowed to be the same
    # Cluster-files have row-format: uxi-cluster-index_uxi-sequence_read-number
    # load_cluster_file_to_dictionary outputs a dictionary with entries
    # {uxi-sequence: [uxi-cluster-index, read-number]}
    sysOps.throw_status("Finalizing consensus UMI sequences ...")
    uei_cluster_dict = fileOps.load_cluster_file_to_dictionary(uei_cluster_file)
    bcn_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(bcn_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(trg_umi_cluster_file)
    uei_clust_readcount_tally = dict()
    bcn_umi_clust_readcount_tally = dict()
    trg_umi_clust_readcount_tally = dict()
    # initiate tally dictionaries addressed by cluster index, one element per file_set_index
    # (if invalid-amplicon sequences are being excluded, only the first index will be populated)
    for uei_seq in uei_cluster_dict:
        uei_clust_readcount_tally[str(uei_cluster_dict[uei_seq][0])] = [0, 0]
    for umi_seq in bcn_umi_cluster_dict:
        bcn_umi_clust_readcount_tally[str(bcn_umi_cluster_dict[umi_seq][0])] = [0, 0]
    for umi_seq in trg_umi_cluster_dict:
        trg_umi_clust_readcount_tally[str(trg_umi_cluster_dict[umi_seq][0])] = [0, 0]
    # build dictionary keyed by uei cluster-index, with nested
    # [pair-string list, tally list, file_set_index] entries
    uei_umi_dict = dict()
    inadmis_seq_count = 0
    admis_seq_count = 0
    uei_fasta_list = [uei_fasta_file]
    bcn_umi_fasta_list = [bcn_umi_fasta_file]
    trg_umi_fasta_list = [trg_umi_fasta_file]
    if include_inv_amp:
        uei_fasta_list.append(uei_fasta_file[:uei_fasta_file.find('.')] + '_amp_inv'
                              + uei_fasta_file[uei_fasta_file.find('.'):])
        bcn_umi_fasta_list.append(bcn_umi_fasta_file[:bcn_umi_fasta_file.find('.')] + '_amp_inv'
                                  + bcn_umi_fasta_file[bcn_umi_fasta_file.find('.'):])
        trg_umi_fasta_list.append(trg_umi_fasta_file[:trg_umi_fasta_file.find('.')] + '_amp_inv'
                                  + trg_umi_fasta_file[trg_umi_fasta_file.find('.'):])
    file_set_index = 0
    sysOps.throw_status("Inputting data to UEI-UMI dictionary using file-sets: "
                        + str(uei_fasta_list) + ", " + str(bcn_umi_fasta_list) + ", " + str(trg_umi_fasta_list))
    for uei_fasta, bcn_umi_fasta, trg_umi_fasta in itertools.izip(uei_fasta_list,
                                                                  bcn_umi_fasta_list,
                                                                  trg_umi_fasta_list):
        uei_handle = open(sysOps.globaldatapath + uei_fasta, "rU")
        bcn_umi_handle = open(sysOps.globaldatapath + bcn_umi_fasta, "rU")
        trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
        for uei_record, bcn_umi_record, trg_umi_record in itertools.izip(SeqIO.parse(uei_handle, "fasta"),
                                                                         SeqIO.parse(bcn_umi_handle, "fasta"),
                                                                         SeqIO.parse(trg_umi_handle, "fasta")):
            uei_seq = str(uei_record.seq)
            bcn_umi_seq = str(bcn_umi_record.seq)
            trg_umi_seq = str(trg_umi_record.seq)
            max_uei_frac = max(np.bincount([('ACGT').index(s) for s in uei_seq])) / float(len(uei_seq))
            max_bcn_umi_frac = max(np.bincount([('ACGT').index(s) for s in bcn_umi_seq])) / float(len(bcn_umi_seq))
            max_trg_umi_frac = max(np.bincount([('ACGT').index(s) for s in trg_umi_seq])) / float(len(trg_umi_seq))
            if max_uei_frac <= filter_val and max_bcn_umi_frac <= filter_val and max_trg_umi_frac <= filter_val:
                uei_clust_ind = str(uei_cluster_dict[uei_seq][0])
                bcn_umi_clust_ind = str(bcn_umi_cluster_dict[bcn_umi_seq][0])
                trg_umi_clust_ind = str(trg_umi_cluster_dict[trg_umi_seq][0])
                uei_clust_readcount_tally[uei_clust_ind][file_set_index] += 1
                bcn_umi_clust_readcount_tally[bcn_umi_clust_ind][file_set_index] += 1
                trg_umi_clust_readcount_tally[trg_umi_clust_ind][file_set_index] += 1
                pair_str = bcn_umi_clust_ind + "_" + trg_umi_clust_ind
                if uei_clust_ind in uei_umi_dict and uei_umi_dict[uei_clust_ind][2] == file_set_index:
                    # uei from read has already been inserted into the uei-umi dictionary
                    if pair_str in uei_umi_dict[uei_clust_ind][0]:
                        # bcn-trg pair has already been added to this uei entry
                        pair_ind = uei_umi_dict[uei_clust_ind][0].index(pair_str)
                        uei_umi_dict[uei_clust_ind][1][pair_ind] += 1
                    else:
                        # uei in uei_umi_dict, but corresponding UMI-pair not found in existing list
                        uei_umi_dict[uei_clust_ind][0].append(pair_str)
                        uei_umi_dict[uei_clust_ind][1].append(1)
                    admis_seq_count += 1
                elif uei_clust_ind not in uei_umi_dict:
                    # uei not yet in uei_umi_dict -- create new list
                    uei_umi_dict[uei_clust_ind] = [[pair_str], [1], int(file_set_index)]
                    admis_seq_count += 1
                else:
                    # UEI found, but not as part of the first file_set_index for which it was
                    # detected (this depends on ordering any invalid-amplicon files second in
                    # the fasta-lists above) -- disregard
                    inadmis_seq_count += 1
            else:
                inadmis_seq_count += 1
        uei_handle.close()
        bcn_umi_handle.close()
        trg_umi_handle.close()
        file_set_index += 1
    sysOps.throw_status('Did not use ' + str(inadmis_seq_count) + '/'
                        + str(admis_seq_count + inadmis_seq_count)
                        + ' pairings due to repetitive base-usage in UMI or UEI sequence.')
    # elements of uei_umi_dict are now [pair-string list, tally list, file_set_index];
    # convert embedded dictionaries into a flat list of cluster indices
    # (ordered uei, bcn, trg, number of times that element has been called, set-index)
    list_output = list()
    for uei_el in uei_umi_dict:
        for i in range(len(uei_umi_dict[uei_el][0])):
            pair_str = uei_umi_dict[uei_el][0][i]
            [bcn_umi_el, trg_umi_el] = pair_str.split('_')
            list_output.append([int(uei_el), int(bcn_umi_el), int(trg_umi_el),
                                int(uei_umi_dict[uei_el][1][i]), int(uei_umi_dict[uei_el][2])])
    del uei_umi_dict
    # sort by uei-cluster, then beacon-umi-cluster, then target-umi-cluster, then decreasing read-count
    list_output.sort(key=lambda row: (row[0], row[1], row[2], -row[3], row[4]))
    sysOps.throw_status("Writing file ...")
    with open(sysOps.globaldatapath + outfile_prefix + "_filter" + str(filter_val) + "_uei_umi.csv",
              'w') as outfile_handle:
        for row in list_output:
            outfile_handle.write(','.join([str(s) for s in row]) + "\n")
    sysOps.throw_status("Tallying clusters ...")
    uei_clust_counts = [[0, 0], [0, 0], [0, 0]]
    bcn_umi_clust_counts = [[0, 0], [0, 0], [0, 0]]
    trg_umi_clust_counts = [[0, 0], [0, 0], [0, 0]]
    uei_clust_counts_inclusive = [[0, 0], [0, 0], [0, 0]]
    bcn_umi_clust_counts_inclusive = [[0, 0], [0, 0], [0, 0]]
    trg_umi_clust_counts_inclusive = [[0, 0], [0, 0], [0, 0]]
    # file_set_index = 0 corresponds to amplicon-valid and file_set_index = 1 to amplicon-invalid;
    # a cluster is counted under file_set_index = 1 only if none of its members are amplicon-valid
    for uei_clust_ind in uei_clust_readcount_tally:
        file_set_index = 0
        this_readcount = uei_clust_readcount_tally[uei_clust_ind][file_set_index]
        if this_readcount == 0:
            file_set_index = 1
            this_readcount = uei_clust_readcount_tally[uei_clust_ind][file_set_index]
        if this_readcount > 0:
            uei_clust_counts[min(this_readcount, 3) - 1][file_set_index] += 1
            uei_clust_counts_inclusive[min(uei_clust_readcount_tally[uei_clust_ind][0], 3) - 1][0] += 1
            uei_clust_counts_inclusive[min(uei_clust_readcount_tally[uei_clust_ind][1], 3) - 1][1] += 1
    for umi_clust_ind in bcn_umi_clust_readcount_tally:
        file_set_index = 0
        this_readcount = bcn_umi_clust_readcount_tally[umi_clust_ind][file_set_index]
        if this_readcount == 0:
            file_set_index = 1
            this_readcount = bcn_umi_clust_readcount_tally[umi_clust_ind][file_set_index]
        if this_readcount > 0:
            bcn_umi_clust_counts[min(this_readcount, 3) - 1][file_set_index] += 1
            bcn_umi_clust_counts_inclusive[min(bcn_umi_clust_readcount_tally[umi_clust_ind][0], 3) - 1][0] += 1
            bcn_umi_clust_counts_inclusive[min(bcn_umi_clust_readcount_tally[umi_clust_ind][1], 3) - 1][1] += 1
    for umi_clust_ind in trg_umi_clust_readcount_tally:
        file_set_index = 0
        this_readcount = trg_umi_clust_readcount_tally[umi_clust_ind][file_set_index]
        if this_readcount == 0:
            file_set_index = 1
            this_readcount = trg_umi_clust_readcount_tally[umi_clust_ind][file_set_index]
        if this_readcount > 0:
            trg_umi_clust_counts[min(this_readcount, 3) - 1][file_set_index] += 1
            trg_umi_clust_counts_inclusive[min(trg_umi_clust_readcount_tally[umi_clust_ind][0], 3) - 1][0] += 1
            trg_umi_clust_counts_inclusive[min(trg_umi_clust_readcount_tally[umi_clust_ind][1], 3) - 1][1] += 1
    with open(sysOps.globaldatapath + outfile_prefix + '_clust_stats.txt', 'w') as out_stats:
        tot_file_sets = 1
        if include_inv_amp:
            tot_file_sets = 2
        for file_set_index in range(tot_file_sets):
            out_stats.write('uei:' + str(file_set_index) + ':'
                            + ','.join([str(uei_clust_counts[i][file_set_index]) for i in range(3)]) + '\n')
            out_stats.write('bcn_umi:' + str(file_set_index) + ':'
                            + ','.join([str(bcn_umi_clust_counts[i][file_set_index]) for i in range(3)]) + '\n')
            out_stats.write('trg_umi:' + str(file_set_index) + ':'
                            + ','.join([str(trg_umi_clust_counts[i][file_set_index]) for i in range(3)]) + '\n')
    sysOps.throw_status("Completed.")
    return
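# Minimal sketch of the repetitive-base filter above (toy code, assuming a pure ACGT
# sequence): a sequence is admissible only if its most frequent base accounts for at most
# filter_val of its length.
def _demo_base_filter(seq, filter_val=0.75):
    import numpy as np
    counts = np.bincount([('ACGT').index(s) for s in seq], minlength=4)
    return max(counts) / float(len(seq)) <= filter_val

# _demo_base_filter('AAAT') -> True  (max fraction 0.75)
# _demo_base_filter('AAAA') -> False (max fraction 1.0)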
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file, amp_seq_fasta, outfilename):
    # Tally the reads counted for each target umi across each amplicon-call, and write a
    # csv file with the following columns:
    # (target umi cluster-index), (leading amplicon-call),
    # (reads for leading amplicon-call), (total reads counted)
    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath + trg_umi_cluster_file)
    # trg_umi_cluster_dict has entries {uxi-sequence: [uxi-cluster-index, read-number]}
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(trg_umi_cluster_file)
    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    try:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath + amp_match_file)
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except:
        sysOps.throw_status(sysOps.globaldatapath + amp_match_file
                            + ' not found. Alignments will occur from sequence-consensuses directly.')
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath + 'amplicon_refs.txt not found.')
            sysOps.exitProgram()
    trg_umi_dict = dict()
    trg_amp_seq_dict = dict()
    for trg_umi_record, amp_seq_record in itertools.izip(SeqIO.parse(trg_umi_handle, "fasta"),
                                                         SeqIO.parse(amp_seq_handle, "fasta")):
        if not realign_amplicons:
            amp_match = int(amp_match_handle.readline().strip('\n'))
        else:
            amp_match = -1
        trg_umi_seq = str(trg_umi_record.seq)
        if trg_umi_seq in trg_umi_cluster_dict:
            trg_umi_index = str(trg_umi_cluster_dict[trg_umi_seq][0])  # uxi cluster-index
            if trg_umi_index in trg_umi_dict:
                if amp_match in trg_umi_dict[trg_umi_index]:
                    trg_umi_dict[trg_umi_index][amp_match] += 1  # add 1, because every read is being entered
                else:
                    trg_umi_dict[trg_umi_index][amp_match] = 1
            else:
                trg_umi_dict[trg_umi_index] = dict()
                trg_amp_seq_dict[trg_umi_index] = baseTally()
                trg_umi_dict[trg_umi_index][amp_match] = 1
            trg_amp_seq_dict[trg_umi_index].add_record(str(amp_seq_record.seq), 1)
    trg_umi_handle.close()
    amp_seq_handle.close()
    if not realign_amplicons:
        amp_match_handle.close()
    csvfile = open(sysOps.globaldatapath + outfilename, 'w')
    fastafile = open(sysOps.globaldatapath + outfilename[:outfilename.rfind('.')] + '.fasta', 'w')
    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt', 'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt contains sequences in reverse-complementary orientation;
                # we therefore reverse both complementarity and order
                ref_sequences.append([str(Seq.Seq(my_ref_seq).reverse_complement())
                                      for my_ref_seq in reversed(ref_seq.split(','))])
    mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
    max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])
    trg_umi_index_dict = dict()
    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    for trg_umi_index in trg_umi_dict:
        max_tally = 0
        tot_tally = 0
        for amp_match in trg_umi_dict[trg_umi_index]:
            my_tally = trg_umi_dict[trg_umi_index][amp_match]
            if my_tally >= max_tally:
                max_tally = int(my_tally)
                max_match = int(amp_match)
            tot_tally += int(my_tally)
        consensus_seq = str(trg_amp_seq_dict[trg_umi_index].get_str_consensus())
        if realign_amplicons:
            # perform direct, un-gapped alignment of consensus_seq against reference options
            # to obtain max_match
            max_match = -1
            max_tally = -1  # exclude max_tally as a count, since alignment is happening post-consensus
            min_mismatch_count = -1
            for i in range(len(ref_sequences)):
                all_subamplicons_pass = True
                start_index = 0
                tot_mismatches = 0
                for j in range(len(ref_sequences[i])):  # loop through sub-amplicon-sequences
                    ref_subamplicon_len = len(ref_sequences[i][j])
                    my_mismatches, minlen = alignOps.count_mismatches(ref_sequences[i][j],
                                                                      consensus_seq[start_index:(start_index + ref_subamplicon_len)])
                    if minlen == 0:
                        all_subamplicons_pass = False
                        break
                    all_subamplicons_pass = all_subamplicons_pass and (my_mismatches / float(minlen)
                                                                       <= max_mismatch_amplicon)
                    start_index += ref_subamplicon_len
                    tot_mismatches += my_mismatches
                # keep the reference with the fewest total mismatches (comparison corrected
                # from the original, which retained the most-mismatched passing reference)
                if all_subamplicons_pass and (max_match < 0 or tot_mismatches < min_mismatch_count):
                    max_match = int(i)
                    min_mismatch_count = int(tot_mismatches)
        if max_match >= 0:
            csvfile.write(trg_umi_index + "," + str(max_match) + "," + str(max_tally) + ","
                          + str(tot_tally) + "\n")
            fastafile.write(">" + trg_umi_index + '\n')
            fastafile.write(consensus_seq + '\n')
            if realign_amplicons:
                trg_umi_index_dict[trg_umi_index] = True
            accepted_consensus_sequences += 1
        else:
            inadmis_consensus_sequences += 1
    csvfile.close()
    fastafile.close()
    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/'
                        + str(accepted_consensus_sequences + inadmis_consensus_sequences)
                        + ' sequences in writing ' + sysOps.globaldatapath + outfilename
                        + ' due to inadequate amplicon match.')
    if realign_amplicons:
        # create new consensus pairing files filtered by the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [filename for filename in filenames if filename.startswith('consensus')]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(sysOps.globaldatapath + consensus_filename,
                      sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename, 'w') as new_consensus_file:
                with open(sysOps.globaldatapath + 'unfiltered_' + consensus_filename,
                          'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        # line format: [uei_index, bcn_umi_index, trg_umi_index, read_count,
                        # (additional variables)]
                        consensus_list = old_consensus_file_line.strip('\n').split(',')
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/'
                                + str(accepted_consensus_sequences + inadmis_consensus_sequences)
                                + ' consensus-pairings in writing ' + sysOps.globaldatapath
                                + consensus_filename + ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception('Error: no consensus files available to update with realigned '
                                   'amplicon information. Exiting.')
            sysOps.exitProgram()
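# Minimal sketch of the un-gapped, piecewise reference comparison above (toy code;
# _demo_count_mismatches stands in for alignOps.count_mismatches): each reference is a list
# of sub-amplicon strings laid end-to-end against the consensus, and a candidate passes only
# if every piece stays within the per-piece mismatch fraction.
def _demo_count_mismatches(a, b):
    minlen = min(len(a), len(b))
    mismatches = sum(1 for i in range(minlen) if a[i] != b[i])
    return mismatches, minlen

def _demo_piecewise_pass(ref_pieces, consensus, max_mismatch_frac):
    start = 0
    for piece in ref_pieces:
        mismatches, minlen = _demo_count_mismatches(piece, consensus[start:start + len(piece)])
        if minlen == 0 or mismatches / float(minlen) > max_mismatch_frac:
            return False
        start += len(piece)
    return True

# _demo_piecewise_pass(['ACG', 'TT'], 'ACGTT', 0.0) -> True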
def gather_rarefaction_data(conditions_filename='conditions.csv',
                            outfilename='rarefaction_file.txt',
                            raw_uxi_files=['_for_uxi0.fasta', '_for_uxi1.fasta', '_rev_uxi0.fasta']):
    # use conditions_filename to specify output order
    dirnames = list()
    with open(sysOps.globaldatapath + conditions_filename, 'rU') as conditions_handle:
        for myline in conditions_handle:
            thisline = myline.strip('\n').split(',')
            dirnames.append('lib_' + str(thisline[0]) + '_' + str(thisline[1]) + '_' + str(thisline[2]))
    outfile_1r = open(sysOps.globaldatapath + '1r_' + outfilename, 'w')
    outfile_2r = open(sysOps.globaldatapath + '2r_' + outfilename, 'w')
    outfile_3r = open(sysOps.globaldatapath + '3r_' + outfilename, 'w')
    for dir in dirnames:
        sysOps.throw_status('Gathering rarefaction data for directory ' + sysOps.globaldatapath + dir)
        sum_reads_raw = 0
        # first UMI/UEI file in the list is used to count raw reads
        with open(sysOps.globaldatapath + dir + '/' + raw_uxi_files[0], 'rU') as uxi_file_handle:
            for uxi_record in SeqIO.parse(uxi_file_handle, 'fasta'):
                sum_reads_raw += 1
        subsample = 500
        terminate = False
        while not terminate:
            all_diversities = []
            try:
                for my_raw_uxi_file in raw_uxi_files:
                    try:
                        cluster_file_handle = open(sysOps.globaldatapath + dir + '/thresh1_identical_sub'
                                                   + str(subsample) + my_raw_uxi_file, 'rU')
                        consensus_pairing_csv_file = (dir + '/consensus_2r_sub' + str(subsample)
                                                      + 'pairing_filter0.75_uei_umi.csv')
                    except:
                        terminate = True
                        try:
                            cluster_file_handle = open(sysOps.globaldatapath + dir
                                                       + '/thresh1_identical_' + my_raw_uxi_file, 'rU')
                            consensus_pairing_csv_file = dir + '/consensus_2r_pairing_filter0.75_uei_umi.csv'
                        except:
                            sysOps.throw_exception('Directory ' + sysOps.globaldatapath + dir
                                                   + ' does not contain clustered file '
                                                   + sysOps.globaldatapath + dir + '/thresh1_identical_'
                                                   + my_raw_uxi_file + '. Skipping ...')
                            break
                        subsample = sum_reads_raw
                    cluster_dict = dict()
                    for myline in cluster_file_handle:
                        thisline = myline.strip('\n').split('_')
                        if thisline[0] in cluster_dict:
                            cluster_dict[thisline[0]] += int(thisline[2])
                        else:
                            cluster_dict[thisline[0]] = int(thisline[2])
                    cluster_file_handle.close()
                    # first element is 1-read-gated diversity, second is 2-read-gated, third is 3-read-gated
                    diversity = [0, 0, 0]
                    for el in cluster_dict:
                        if cluster_dict[el] >= 3:
                            diversity[0] += 1
                            diversity[1] += 1
                            diversity[2] += 1
                        elif cluster_dict[el] >= 2:
                            diversity[0] += 1
                            diversity[1] += 1
                        else:
                            diversity[0] += 1
                    all_diversities.append(diversity)
                # if sysOps.check_file_exists(consensus_pairing_csv_file):
                if False:  # temporarily disabled (see commented condition above)
                    sysOps.throw_status('Found ' + sysOps.globaldatapath + consensus_pairing_csv_file + '.')
                    min_uei_count = 2
                    min_umi_readcount = 2
                    outname = ('minb' + str(min_uei_count) + 'v' + str(0) + '_'
                               + str(min_umi_readcount) + 'r_filter0.75')
                    wmat_outfilename = 'noabundcorr_wmat_' + outname + '.csv'
                    sysOps.throw_status('Calling matOps.generate_wmat()')
                    [num_unique_trg, num_unique_bcn, trg_dict] = matOps.generate_wmat(consensus_pairing_csv_file,
                                                                                      min_umi_readcount,
                                                                                      min_umi_readcount,
                                                                                      min_uei_count,
                                                                                      wmat_outfilename=None)
                    if num_unique_bcn > 0:
                        filtered_minb_diversity_2r = [num_unique_bcn,
                                                      sum([trg_dict[trg_el] for trg_el in trg_dict]),
                                                      num_unique_trg]
                    else:
                        filtered_minb_diversity_2r = [0, 0, 0]
                else:
                    sysOps.throw_status(sysOps.globaldatapath + consensus_pairing_csv_file + ' not found.')
                    filtered_minb_diversity_2r = []
                outfile_1r.write(','.join([dir, str(subsample),
                                           ','.join([str(my_diversity[0]) for my_diversity in all_diversities])]) + '\n')
                outfile_2r.write(','.join([dir, str(subsample),
                                           ','.join([str(my_diversity[1]) for my_diversity in all_diversities]),
                                           ','.join([str(s) for s in filtered_minb_diversity_2r])]) + '\n')
                outfile_3r.write(','.join([dir, str(subsample),
                                           ','.join([str(my_diversity[2]) for my_diversity in all_diversities])]) + '\n')
            except:
                terminate = True
            subsample *= 2
    outfile_1r.close()
    outfile_2r.close()
    outfile_3r.close()
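# Minimal sketch of the read-gated diversity tally above (toy code): given per-cluster
# read-counts, count the clusters with at least 1, 2, and 3 reads respectively.
def _demo_gated_diversity(cluster_readcounts):
    diversity = [0, 0, 0]
    for count in cluster_readcounts:
        for gate in range(3):
            if count >= gate + 1:
                diversity[gate] += 1
    return diversity

# _demo_gated_diversity([1, 2, 5]) -> [3, 2, 1]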
def generate_cluster_analysis(self):
    # Perform clustering analysis of UMI and UEI sequences, consolidate pairings,
    # and determine consensuses of these pairings
    sysOps.initiate_statusfilename()
    missing_uxi_files = sysOps.find_missing_uxi_files('libsettings.txt', '_')
    if len(missing_uxi_files) > 0:
        sysOps.throw_exception('Missing uxi files: ' + str(missing_uxi_files))
    if sysOps.check_file_exists('_for_uxi0.fasta'):
        sysOps.throw_status("Clustering for_uxi0")
        clustering_up_to_date_1 = hashAlignments.initiate_hash_alignment('_for_uxi0.fasta')
    else:
        clustering_up_to_date_1 = True
        sysOps.throw_status(sysOps.globaldatapath + '_for_uxi0.fasta does not exist. Skipping.')
    if sysOps.check_file_exists('_for_uxi1.fasta'):
        sysOps.throw_status("Clustering for_uxi1")
        clustering_up_to_date_2 = hashAlignments.initiate_hash_alignment('_for_uxi1.fasta')
    else:
        clustering_up_to_date_2 = True
        sysOps.throw_status(sysOps.globaldatapath + '_for_uxi1.fasta does not exist. Skipping.')
    if sysOps.check_file_exists('_rev_uxi0.fasta'):
        sysOps.throw_status("Clustering rev_uxi0")
        clustering_up_to_date_3 = hashAlignments.initiate_hash_alignment('_rev_uxi0.fasta')
    else:
        clustering_up_to_date_3 = True
        sysOps.throw_status(sysOps.globaldatapath + '_rev_uxi0.fasta does not exist. Skipping.')
    if clustering_up_to_date_1 and clustering_up_to_date_2 and clustering_up_to_date_3:
        filter_val = 0.75  # maximum fraction of same-base usage permitted in a single UMI/UEI
        min_pairing_readcount = 2
        sysOps.throw_status('Clustering completed. Beginning final output.')
        if (sysOps.check_file_exists('thresh1_identical__for_uxi0.fasta')
                and sysOps.check_file_exists('thresh1_identical__for_uxi1.fasta')
                and sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta')
                and not sysOps.check_file_exists('consensus_pairing_filter' + str(filter_val) + '_uei_umi.csv')):
            if not sysOps.check_file_exists("pairing_filter" + str(filter_val) + "_uei_umi.csv"):
                dnamicOps.assign_umi_pairs('thresh1_identical__for_uxi1.fasta',
                                           'thresh1_identical__for_uxi0.fasta',
                                           'thresh1_identical__rev_uxi0.fasta',
                                           '_for_uxi1.fasta', '_for_uxi0.fasta', '_rev_uxi0.fasta',
                                           'pairing', filter_val,
                                           False)  # final parameter = False: exclude invalid amplicon sequences
                dnamicOps.assign_consensus_pairs("pairing_filter" + str(filter_val) + "_uei_umi.csv",
                                                 min_pairing_readcount)
            else:
                sysOps.throw_status('Consensus-pairing file found pre-computed.')
        if (sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta')
                and not sysOps.check_file_exists('trg_amplicon_calls.csv')):
            # assign amplicon-identities to target umi's
            sysOps.throw_status('Assigning amplicon-identities and consensus sequences to target umis.')
            dnamicOps.assign_umi_amplicons('thresh1_identical__rev_uxi0.fasta', '_rev_uxi0.fasta',
                                           '_amp_match.txt', '_rev_amp0.fasta', 'trg_amplicon_calls.csv')
def sim_physics(self):
    sysOps.throw_status("Running DNA microscopy simulation.")
    # bcn-indices must range from 0 to Nbcn-1, trg-indices must range from Nbcn to Nbcn+Ntrg-1
    phys_dims = 3.0
    sysOps.throw_status("Using num_dims = " + str(self.sim_dims) + ", Nbcn = " + str(self.Nbcn)
                        + ", Ntrg = " + str(self.Ntrg))
    tot_rate = 0.0
    # initialize all molecule numbers to 1
    self.sim_pos[:, self.sim_dims] = 1.0
    np.savetxt(sysOps.globaldatapath + "sim_index_key.csv",
               np.reshape(self.index_key, [self.Nbcn + self.Ntrg, 1]), delimiter=',')
    sysOps.throw_status("BEGINNING PART 1")
    # PART 1: (a) estimate pairwise reaction rates (and record their sum)
    #         (b) simulate amplification with stochasticity (parameterized by effic_monomer)
    for C in range(1, self.lin_cycles + self.exp_cycles + 1):
        # amplify with effic_monomer <= 1
        for n in range(self.Nbcn + self.Ntrg):
            if C <= self.lin_cycles:
                # linear amplification step: keep template number constant at 1
                binom_res = np.random.binomial(1, self.effic_monomer)
            else:
                # exponential amplification step: template number varies over time
                binom_res = np.random.binomial(int(self.sim_pos[n, self.sim_dims]), self.effic_monomer)
            self.sim_pos[n, self.sim_dims] += binom_res
        np.savetxt(sysOps.globaldatapath + "molcountfile_" + str(C) + ".csv",
                   np.reshape(np.int64(self.sim_pos[:, self.sim_dims]), [self.Nbcn + self.Ntrg, 1]),
                   delimiter=',')
        if C > self.lin_cycles:
            # only sum/print UEI-formation rates for exponential amplification steps;
            # iterate through simulated PCR cycles, print relative rates to rate_file
            sysOps.throw_status("C=" + str(C) + " --> 8*D*d*t = "
                                + str(8 * self.diffconst * phys_dims * float(C)))
            # diffconst multiplies time to incorporate that dependence
            t_term = 8 * self.diffconst * phys_dims * float(C)
            tot_rate += sum_partition_function(self.sim_pos, self.Nbcn, self.Ntrg,
                                               self.sim_dims, phys_dims, t_term)
    sysOps.throw_status("BEGINNING PART 2")
    # PART 2: simulate UEI generation
    sysOps.throw_status("Generating " + str(self.Nuei) + " random numbers.")
    sorted_unif_rand_res = np.sort(np.random.rand(self.Nuei))
    uei_arr = np.zeros([self.Nuei, 2], dtype=np.int64)
    uei_ind = 0
    inp_cumul_rate = np.array([0.0], dtype=np.float64)
    for C in range(self.lin_cycles + 1, self.lin_cycles + self.exp_cycles + 1):
        self.sim_pos[:, self.sim_dims] = np.int64(np.loadtxt(sysOps.globaldatapath + "molcountfile_"
                                                             + str(C) + ".csv", delimiter=','))
        t_term = 8 * self.diffconst * phys_dims * float(C)
        uei_arr[:] = -1
        prev_uei_ind = int(uei_ind)
        uei_ind = generate_ueis(uei_arr, self.sim_pos, sorted_unif_rand_res, inp_cumul_rate,
                                self.Nbcn, self.Ntrg, self.Nuei, self.sim_dims, phys_dims,
                                t_term, tot_rate, uei_ind)
        if uei_ind > prev_uei_ind:
            np.savetxt(sysOps.globaldatapath + "ueifile_" + str(C) + ".csv",
                       uei_arr[:(uei_ind - prev_uei_ind), :], delimiter=',')
        sysOps.throw_status("C = " + str(C) + ". Current UEI-count: " + str(uei_ind) + '.')
    del sorted_unif_rand_res
    sysOps.throw_status("BEGINNING PART 3")
    # PART 3: simulate UEI amplification
    all_uei = np.array([])
    for C in range(self.lin_cycles + 1, self.lin_cycles + self.exp_cycles + 1):
        for i in range(all_uei.shape[0]):
            all_uei[i, 2] += np.random.binomial(all_uei[i, 2], self.effic_dimer)
        if sysOps.check_file_exists(sysOps.globaldatapath + "ueifile_" + str(C) + ".csv"):
            this_uei_arr = np.int64(np.loadtxt(sysOps.globaldatapath + "ueifile_" + str(C) + ".csv",
                                               delimiter=','))
            if len(this_uei_arr.shape) == 1:
                this_uei_arr = np.array([this_uei_arr])
            if this_uei_arr.shape[0] > 0:
                this_uei_arr = np.append(this_uei_arr, np.ones([this_uei_arr.shape[0], 1]), axis=1)
                if all_uei.shape[0] == 0:
                    all_uei = np.array(this_uei_arr)
                else:
                    all_uei = np.concatenate([all_uei, this_uei_arr], axis=0)
    sysOps.throw_status("BEGINNING PART 4")
    # PART 4: output simulated reads
    tot_mol = np.sum(all_uei[:, 2])
    my_N_reads = self.N_reads
    sysOps.throw_status('my_N_reads = ' + str(my_N_reads) + '/' + str(self.N_reads) + ','
                        + sysOps.globaldatapath + 'r' + str(my_N_reads) + '_sim_ueifile.csv')
    with open(sysOps.globaldatapath + "r" + str(my_N_reads) + "_sim_ueifile.csv",
              'w') as finalsimdata_outfile:
        sorted_unif_rand_reads = np.sort(np.random.rand(my_N_reads))
        # Downstream processing needs a consensus-pairing file with the following
        # comma-separated columns:
        # 1. uei index
        # 2. beacon-umi index
        # 3. target-umi index
        # 4. read-count
        read_ind = 0
        cumul_read_frac = 0.0
        # iterate over all accumulated UEIs (indexing uei_arr here, as the original did,
        # would overrun all_uei when fewer than Nuei UEIs were generated)
        for uei_index in range(all_uei.shape[0]):
            cumul_read_frac += all_uei[uei_index, 2] / tot_mol
            my_reads = 0
            while cumul_read_frac >= sorted_unif_rand_reads[read_ind]:
                my_reads += 1
                read_ind += 1
                if read_ind == my_N_reads:
                    break  # no more reads to generate
            if my_reads > 1:  # only include those UEI's with at least 2 reads
                finalsimdata_outfile.write(str(uei_index) + "," + str(all_uei[uei_index, 0]) + ","
                                           + str(all_uei[uei_index, 1]) + "," + str(my_reads) + '\n')
            if read_ind >= my_N_reads:
                break
    del sorted_unif_rand_reads
    my_N_reads *= 2
    sysOps.throw_status("SIMULATION COMPLETE")
    return
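# Minimal sketch of the read-sampling pass above (toy code): reads are drawn in proportion
# to per-UEI molecule counts by walking sorted uniform draws against the cumulative molecule
# fraction (an inverse-CDF pass over pre-sorted random numbers).
def _demo_sample_reads(mol_counts, n_reads):
    import numpy as np
    draws = np.sort(np.random.rand(n_reads))
    reads = np.zeros(len(mol_counts), dtype=np.int64)
    total = float(np.sum(mol_counts))
    cumul, read_ind = 0.0, 0
    for i, count in enumerate(mol_counts):
        cumul += count / total
        while read_ind < n_reads and draws[read_ind] <= cumul:
            reads[i] += 1
            read_ind += 1
    return reads  # reads[i] is UEI i's share of the n_reads draws

# _demo_sample_reads(np.array([100.0, 1.0, 100.0]), 1000) -> roughly [500, 5, 500]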
def sim_reads(self):
    simLibObj = libOps.libObj(settingsfilename='libsettings.txt', output_prefix='_')
    enforced_rev_read_len = 100
    [for_read_len, rev_read_len] = simLibObj.get_min_allowed_readlens(simLibObj.filter_amplicon_window)
    rev_read_len = int(enforced_rev_read_len)
    '''
    simLibObj.seqform_for_params and simLibObj.seqform_rev_params are already stored in
    the current object's memory. Each of these variables is a list of the following:
    Element 1: [start_pos, end_pos]
    Element 2: np.ndarray(seq_bool_vec, dtype=np.bool_)
    Element 3: np.ndarray(capital_bool_vec, dtype=np.bool_)
    Element 4: np.ndarray(ambig_vec, dtype=np.bool_)
    '''
    [subdirnames, filenames] = sysOps.get_directory_and_file_list()
    for_umi_seqs = list()
    rev_umi_seqs = list()
    rev_umi_amplicon_list = list()
    uei_seqs = list()
    base_order = 'ACGT'
    sysOps.throw_status('Generating simulated sequences ...')
    amplicon_list = list()
    if "-amplicon" in simLibObj.mySettings:
        amplicon_list = [simLibObj.mySettings["-amplicon"][i].upper().split(',')
                         for i in range(len(simLibObj.mySettings["-amplicon"]))]
    for for_umi_i in range(self.Nbcn):
        for_param_index = np.random.randint(len(simLibObj.seqform_for_params))
        if len(simLibObj.seqform_for_params[for_param_index]) > 1:
            sysOps.throw_exception('Error: len(simLibObj.seqform_for_params[for_param_index]) = '
                                   + str(len(simLibObj.seqform_for_params[for_param_index])))
            sysOps.exitProgram()
        my_for_umi_param = simLibObj.seqform_for_params[for_param_index][0]['U'][0]
        [start_pos, end_pos] = my_for_umi_param[0]
        seq_bool_vec = my_for_umi_param[1]
        my_for_umi = str('')
        for pos in range(end_pos - start_pos):
            possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) * 4)])[0]
            my_for_umi += base_order[possible_bases[np.random.randint(possible_bases.shape[0])]]
        for_umi_seqs.append([int(for_param_index), str(my_for_umi)])
    for for_uei_i in range(self.Nuei):
        for_param_index = 0  # there should be no difference across UMIs
        my_for_uei_param = simLibObj.seqform_for_params[for_param_index][0]['U'][1]
        [start_pos, end_pos] = my_for_uei_param[0]
        seq_bool_vec = my_for_uei_param[1]
        my_for_uei = str('')
        for pos in range(end_pos - start_pos):
            possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) * 4)])[0]
            my_for_uei += base_order[possible_bases[np.random.randint(possible_bases.shape[0])]]
        uei_seqs.append(str(my_for_uei))
    for rev_umi_i in range(self.Ntrg):
        rev_param_index = np.random.randint(len(simLibObj.seqform_rev_params))
        my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][0]['U'][0]
        [start_pos, end_pos] = my_rev_umi_param[0]
        seq_bool_vec = my_rev_umi_param[1]
        my_rev_umi = str('')
        for pos in range(end_pos - start_pos):
            possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) * 4)])[0]
            my_rev_umi += base_order[possible_bases[np.random.randint(possible_bases.shape[0])]]
        if len(amplicon_list) == 0:
            encoded_amplicon = str('')
        else:
            # already properly oriented
            this_gsp_primer_amplicon_pair = list(amplicon_list[np.random.randint(len(amplicon_list))])
            # generate a single error on the amplicon
            lenamp = len(this_gsp_primer_amplicon_pair[1])
            rand_loc = np.random.randint(lenamp)
            this_gsp_primer_amplicon_pair[1] = str(this_gsp_primer_amplicon_pair[1][:rand_loc]
                                                   + base_order[np.random.randint(4)]
                                                   + this_gsp_primer_amplicon_pair[1][(rand_loc + 1):])
            encoded_amplicon = ''.join(this_gsp_primer_amplicon_pair)
        # encode the target umi index into the amplicon tail as a base-4 numeral over ACGT
        tmp_umi_index = float(rev_umi_i)
        if tmp_umi_index == 0:
            encoded_amplicon += base_order[0]
        else:
            for myexponent in range(int(np.floor(np.log(tmp_umi_index) / np.log(4.0))), -1, -1):
                mydigit = np.floor(tmp_umi_index / np.power(4.0, myexponent))
                encoded_amplicon += base_order[int(mydigit)]
                tmp_umi_index -= mydigit * np.power(4.0, myexponent)
        rev_umi_seqs.append([int(rev_param_index), str(my_rev_umi), str(encoded_amplicon)])
    sysOps.throw_status('Writing simulated reads ...')
    for filename in filenames:
        if filename.endswith('_sim_ueifile.csv'):
            ueifile = np.int64(np.loadtxt(sysOps.globaldatapath + filename, delimiter=','))
            newdirname = filename[:filename.find('_')]
            read_list = list()
            for i in range(ueifile.shape[0]):
                for myread in range(ueifile[i, 3]):
                    read_list.append(np.array([ueifile[i, :3]]))
            # re-write array so that there is now one row per read
            read_list = np.concatenate(read_list, axis=0)
            # randomly permute:
            read_list = read_list[np.random.permutation(read_list.shape[0]), :]
            for_chararray = np.chararray((for_read_len))
            rev_chararray = np.chararray((rev_read_len))
            for_fastq_outfile = open(newdirname + '_for.fastq', "w")
            rev_fastq_outfile = open(newdirname + '_rev.fastq', "w")
            for i in range(read_list.shape[0]):
                for_param_index = for_umi_seqs[read_list[i, 1]][0]
                for_umi_seq = for_umi_seqs[read_list[i, 1]][1]
                # both beacon and target indices are, at this point, independently indexed from 0
                rev_param_index = rev_umi_seqs[read_list[i, 2]][0]
                rev_umi_seq = rev_umi_seqs[read_list[i, 2]][1]
                rev_amp_seq = rev_umi_seqs[read_list[i, 2]][2]
                uei_seq = uei_seqs[read_list[i, 0]]
                for j in range(for_read_len):
                    for_chararray[j] = 'N'
                for j in range(rev_read_len):
                    rev_chararray[j] = 'N'
                my_for_umi_param = simLibObj.seqform_for_params[for_param_index][0]['U'][0]
                [start_pos, end_pos] = my_for_umi_param[0]
                for j in range(end_pos - start_pos):
                    for_chararray[j + start_pos] = for_umi_seq[j]
                my_for_uei_param = simLibObj.seqform_for_params[for_param_index][0]['U'][1]
                [start_pos, end_pos] = my_for_uei_param[0]
                for j in range(end_pos - start_pos):
                    for_chararray[j + start_pos] = uei_seq[j]
                for my_for_param in simLibObj.seqform_for_params[for_param_index][0]['P']:
                    [start_pos, end_pos] = my_for_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = base_order[np.where(my_for_param[1][(4 * j):(4 * (j + 1))])[0][0]]
                my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][0]['U'][0]
                [start_pos, end_pos] = my_rev_umi_param[0]
                for j in range(end_pos - start_pos):
                    rev_chararray[j + start_pos] = rev_umi_seq[j]
                my_rev_amp_param = simLibObj.seqform_rev_params[rev_param_index][0]['A'][0]
                start_pos = my_rev_amp_param[0][0]
                for j in range(len(rev_amp_seq)):
                    rev_chararray[j + start_pos] = rev_amp_seq[j]
                if 'P' in simLibObj.seqform_rev_params[rev_param_index][0]:
                    for my_rev_param in simLibObj.seqform_rev_params[rev_param_index][0]['P']:
                        [start_pos, end_pos] = my_rev_param[0]
                        for j in range(end_pos - start_pos):
                            rev_chararray[j + start_pos] = base_order[np.where(my_rev_param[1][(4 * j):(4 * (j + 1))])[0][0]]
                for_record = SeqIO.SeqRecord(Seq.Seq(for_chararray.tostring()))
                for_record.id = '-' + str(i) + '-' + str(read_list[i, 1])
                for_record.description = ''
                for_record.letter_annotations['phred_quality'] = list([30 for j in range(for_read_len)])
                rev_record = SeqIO.SeqRecord(Seq.Seq(rev_chararray.tostring()))
                rev_record.id = '-' + str(i) + '-' + str(read_list[i, 2])
                rev_record.description = ''
                rev_record.letter_annotations['phred_quality'] = list([30 for j in range(rev_read_len)])
                SeqIO.write(for_record, for_fastq_outfile, "fastq")
                SeqIO.write(rev_record, rev_fastq_outfile, "fastq")
            for_fastq_outfile.close()
            rev_fastq_outfile.close()
            os.mkdir(newdirname)
            with open('libsettings.txt', 'rU') as oldsettingsfile:
                with open(newdirname + '//libsettings.txt', 'w') as newsettingsfile:
                    for oldsettings_row in oldsettingsfile:
                        if oldsettings_row.startswith('-source_for'):
                            newsettingsfile.write('-source_for ..//' + newdirname + '_for.fastq\n')
                        elif oldsettings_row.startswith('-source_rev'):
                            newsettingsfile.write('-source_rev ..//' + newdirname + '_rev.fastq\n')
                        else:
                            newsettingsfile.write(oldsettings_row)
    sysOps.throw_status('Done.')
    return
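# Minimal sketch of the base-4 index encoding above (toy code): a UMI index is appended to
# the amplicon as digits over the alphabet ACGT, most-significant digit first.
def _demo_encode_base4(index, base_order='ACGT'):
    if index == 0:
        return base_order[0]
    digits = ''
    while index > 0:
        digits = base_order[index % 4] + digits
        index //= 4
    return digits

# _demo_encode_base4(0) -> 'A'; _demo_encode_base4(6) -> 'CG' (6 = 1*4 + 2)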
def generate_data_layout(data_layout_file = 'data_layout.csv'): # Format of data_layout_file as follows: # Sample Sample name, description, etc # Barcode Number/search-term for run directories # Run Run directory-1 # Run Run directory-2 # Run etc # Beacon Beacon oligo-1 # Beacon Beacon oligo-2 # Beacon etc # Target Target oligo-1 # Target Target oligo-2 # Target etc # OE1a OE-primer-1 # OE4b OE-primer-2 # Amplicon Amplicon file-1 # Amplicon Amplicon file-2 # Standardize-amplicon-start TRUE/left blank data_layout_dict = dict() with open(data_layout_file,'rU') as csvfile: curr_sample = None for myline in csvfile: thisline = rm_hidden_char(myline).strip('\n').split(',') if len(thisline) >= 2: if thisline[0].lower() == 'sample': curr_sample = thisline[1] data_layout_dict[curr_sample] = dict() else: if thisline[0].lower() not in data_layout_dict[curr_sample]: data_layout_dict[curr_sample][thisline[0].lower()] = list() data_layout_dict[curr_sample][thisline[0].lower()].append(thisline[1]) final10_sbs12_sbs3 = 'CTTCCGATCT' for sample in data_layout_dict: missing_keys = [my_key for my_key in ['barcode','run','beacon','target','oe1a','oe4b','amplicon'] if my_key not in data_layout_dict[sample]] if (len(missing_keys)>0): sysOps.throw_status('Skipping sample ' + str(sample) + ' due to missing keys:' + str(missing_keys)) else: source_for = list() source_rev = list() for run_index in range(len(data_layout_dict[sample]['run'])): run_dir = data_layout_dict[sample]['run'][run_index] if not run_dir.endswith('//'): run_dir += '//' run_dir_exists = False try: # try opening run_dir for writing with open(run_dir + 'test.txt','w'): run_dir_exists = True os.remove(run_dir + 'test.txt') except: sysOps.throw_status('Skipping run-directory ' + str(run_dir)) if run_dir_exists: [subdirnames, filenames] = sysOps.get_directory_and_file_list(run_dir) this_sample_run_R1 = list(['..//' + run_dir + filename for filename in filenames if (data_layout_dict[sample]['barcode'][0]+'_' in filename and 'R1' in filename)]) this_sample_run_R2 = list([filename[:(filename.find('R1'))] + 'R2' + filename[(filename.find('R1')+2):] for filename in this_sample_run_R1]) source_for.extend(this_sample_run_R1) # since new directory is being created, adding an additional level to the path source_rev.extend(this_sample_run_R2) source_for = ','.join(source_for) source_rev = ','.join(source_rev) # join oe sequences seqform_for = list() find_index, conjoined_oe_seq = rigid_conjoin(get_revcomp(data_layout_dict[sample]['oe1a'][0]),data_layout_dict[sample]['oe4b'][0],10) for beacon_oligo_index in range(len(data_layout_dict[sample]['beacon'])): revcomp_bcn_oligo = get_revcomp(data_layout_dict[sample]['beacon'][beacon_oligo_index]) revcomp_bcn_oligo = revcomp_bcn_oligo[(revcomp_bcn_oligo.find(final10_sbs12_sbs3)+len(final10_sbs12_sbs3)):] oe_start_index, conjoined_bcn_oe_seq = rigid_conjoin(revcomp_bcn_oligo,conjoined_oe_seq,10) uei_start_index = np.min(np.array([(oe_start_index+conjoined_bcn_oe_seq[oe_start_index:].upper().find(my_char)) for my_char in 'NWSRY' if my_char in conjoined_bcn_oe_seq[oe_start_index:].upper()])) uei_end_index = 1+np.max(np.array([(oe_start_index+conjoined_bcn_oe_seq[oe_start_index:].upper().rfind(my_char)) for my_char in 'NWSRY' if my_char in conjoined_bcn_oe_seq[oe_start_index:].upper()])) my_seqform_for = list() my_seqform_for.append('U_' + conjoined_bcn_oe_seq[1:oe_start_index] + '_1:' + str(oe_start_index)) my_seqform_for.append('P_' + conjoined_bcn_oe_seq[oe_start_index:uei_start_index] + '_' + str(oe_start_index) + ':' 
+ str(uei_start_index))
        my_seqform_for.append('U_' + conjoined_bcn_oe_seq[uei_start_index:uei_end_index] + '_' + str(uei_start_index) + ':' + str(uei_end_index))
        my_seqform_for.append('P_' + conjoined_bcn_oe_seq[uei_end_index:(uei_end_index+2)] + '_' + str(uei_end_index) + ':' + str(uei_end_index+2))
        my_seqform_for = '|'.join(my_seqform_for)
        if my_seqform_for not in seqform_for:
            seqform_for.append(str(my_seqform_for))
        my_amplicons = list()
        for amplicon_file in data_layout_dict[sample]['amplicon']:
            if amplicon_file.upper() == 'N': # amplicon left blank
                my_amplicons.append(list(['N','N']))
            else:
                [subdirnames, filenames] = sysOps.get_directory_and_file_list()
                if amplicon_file in filenames:
                    for record in SeqIO.parse(amplicon_file, "fasta"):
                        my_amplicons.append(list([str(record.id), str(record.seq)]))
                else:
                    sysOps.throw_status('Skipping ' + str(amplicon_file))
        primer_amplicon_pairs = list()
        primer_amplicon_starts = list()
        seqform_rev = list()
        sysOps.throw_status('my_amplicons = ' + str(my_amplicons))
        for amplicon in my_amplicons:
            revcomp_amplicon = get_revcomp(amplicon[1].lower())
            for target_oligo_index in range(len(data_layout_dict[sample]['target'])):
                target_oligo = data_layout_dict[sample]['target'][target_oligo_index]
                target_oligo = target_oligo[(target_oligo.find(final10_sbs12_sbs3)+len(final10_sbs12_sbs3)):]
                if revcomp_amplicon == 'n':
                    randprim_len = len(target_oligo) - (1 + np.max(np.array([target_oligo.upper().rfind(my_char) for my_char in 'ACGT'])))
                    target_oligo = target_oligo[:(len(target_oligo) - randprim_len)]
                    my_seqform_rev = list()
                    my_seqform_rev.append('U_' + target_oligo[1:len(target_oligo)] + '_1:' + str(len(target_oligo)))
                    my_seqform_rev.append('A_' + str(len(target_oligo)+randprim_len) + ':')
                    my_seqform_rev = '|'.join(my_seqform_rev)
                    if my_seqform_rev not in seqform_rev:
                        seqform_rev.append(str(my_seqform_rev))
                else:
                    find_index, conjoined_amplicon_seq = rigid_conjoin(target_oligo, revcomp_amplicon, 10)
                    if find_index >= 0:
                        primer_overlap = len(target_oligo) + len(revcomp_amplicon) - len(conjoined_amplicon_seq)
                        primer_amplicon_pairs.append(amplicon[0] + '|' + get_revcomp(conjoined_amplicon_seq[(find_index+primer_overlap):]) + ',' + get_revcomp(conjoined_amplicon_seq[find_index:(find_index+primer_overlap)]))
                        my_seqform_rev = list()
                        my_seqform_rev.append('U_' + conjoined_amplicon_seq[1:find_index] + '_1:' + str(find_index))
                        my_seqform_rev.append('A_' + str(find_index) + ':')
                        primer_amplicon_starts.append(int(find_index))
                        my_seqform_rev = '|'.join(my_seqform_rev)
                        if my_seqform_rev not in seqform_rev:
                            seqform_rev.append(str(my_seqform_rev))
        # finally, write libsettings.txt
        my_libdir = 'lib_' + str(sample) + '//'
        os.mkdir(my_libdir)
        if ('standardize-amplicon-start' in data_layout_dict[sample]
                and data_layout_dict[sample]['standardize-amplicon-start'][0].lower() == 'true'):
            max_amplicon_start = int(np.max(np.array(primer_amplicon_starts)))
            with open(my_libdir + 'amplicon_refs.txt','w') as outfile:
                new_seqform_rev = list()
                for my_seqform_rev in seqform_rev:
                    elements = my_seqform_rev.split('|')
                    elements[len(elements)-1] = 'A_' + str(max_amplicon_start) + ':'
                    elements = '|'.join(elements)
                    if elements not in new_seqform_rev:
                        new_seqform_rev.append(str(elements))
                seqform_rev = list(new_seqform_rev)
                for primer_amplicon_pair, primer_amplicon_start in itertools.izip(primer_amplicon_pairs, primer_amplicon_starts):
                    outfile.write(primer_amplicon_pair[:(len(primer_amplicon_pair) + primer_amplicon_start - max_amplicon_start)] + '\n')
                primer_amplicon_pairs = list() # omit from libsettings
        with open(my_libdir + 'libsettings.txt','w') as outfile:
            outfile.write('-source_for ' + source_for + '\n')
            outfile.write('-source_rev ' + source_rev + '\n')
            for this_seqform_for in seqform_for:
                outfile.write('-seqform_for ' + this_seqform_for + '\n')
            for this_seqform_rev in seqform_rev:
                outfile.write('-seqform_rev ' + this_seqform_rev + '\n')
            for primer_amplicon_pair in primer_amplicon_pairs:
                outfile.write('-amplicon ' + primer_amplicon_pair + '\n')
            if 'max-mismatch' in data_layout_dict[sample]:
                outfile.write('-max_mismatch ' + data_layout_dict[sample]['max-mismatch'][0] + '\n')
            if 'max-mismatch-amplicon' in data_layout_dict[sample]:
                outfile.write('-max_mismatch_amplicon ' + data_layout_dict[sample]['max-mismatch-amplicon'][0] + '\n')
            if 'min-mean-qual' in data_layout_dict[sample]:
                outfile.write('-min_mean_qual ' + data_layout_dict[sample]['min-mean-qual'][0] + '\n')
            if 'filter-amplicon-window' in data_layout_dict[sample]:
                outfile.write('-filter_amplicon_window ' + data_layout_dict[sample]['filter-amplicon-window'][0] + '\n')
            if 'amplicon-terminate' in data_layout_dict[sample]:
                for this_amplicon_terminate in data_layout_dict[sample]['amplicon-terminate']:
                    outfile.write('-amplicon_terminate ' + this_amplicon_terminate + '\n')
    return
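# For reference, a hypothetical libsettings.txt as produced by the writer above. The tag names
# and the '|'-joined field grammar are taken directly from the write statements here and from
# parse_seqform() further below; the sequences and read-file names are invented placeholders.
#
#   -source_for forward_reads.fastq
#   -source_rev reverse_reads.fastq
#   -seqform_for P_GACT_0:4|U_NNNNNNNN_4:12|P_GG_12:14
#   -seqform_rev U_NNNNNNN_1:8|A_8:
#   -max_mismatch 2
#   -min_mean_qual 30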
def filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count):
    # prune UEI data to exclude UMIs with UEI counts < min_uei_count
    if len(bcn_dict) == 0:
        return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
    deletion_iteration = 0
    is_list = None
    sysOps.throw_status('Filtering matrices with ' + str(len(bcn_div_dict)) + '+' + str(len(trg_div_dict)) + ' UMIs.')
    while True:
        bcn_retained = 0
        trg_retained = 0
        bcn_deleted = list()
        trg_deleted = list()
        for bcn_el in bcn_div_dict:
            if bcn_div_dict[bcn_el] < min_uei_count:
                bcn_deleted.append(bcn_el)
            else:
                bcn_retained += 1
        for trg_el in trg_div_dict:
            if trg_div_dict[trg_el] < min_uei_count:
                trg_deleted.append(trg_el)
            else:
                trg_retained += 1
        # check whether bcn_dict/trg_dict entries are still lists or already converted to scalar values
        if is_list is None:
            for bcn_el in bcn_dict:
                for trg_el in bcn_dict[bcn_el]:
                    is_list = (type(bcn_dict[bcn_el][trg_el]) is list)
                    break
                break
        if len(bcn_deleted) == 0 and len(trg_deleted) == 0:
            sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ', all retained.')
            break
        sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ' deleting ' + str(len(bcn_deleted)) + '+' + str(len(trg_deleted))
                            + ', retained ' + str(bcn_retained) + '+' + str(trg_retained) + '. is_list=' + str(is_list))
        if is_list is None:
            sysOps.throw_exception('Error, could not find any elements: len(bcn_dict) = ' + str(len(bcn_dict)))
            sysOps.exitProgram()
        for bcn_el in bcn_deleted:
            for trg_el in bcn_dict[bcn_el]:
                if is_list:
                    trg_div_dict[trg_el] -= len(trg_dict[trg_el][bcn_el])
                else:
                    trg_div_dict[trg_el] -= trg_dict[trg_el][bcn_el]
                del trg_dict[trg_el][bcn_el]
            del bcn_dict[bcn_el]
            del bcn_div_dict[bcn_el]
        for trg_el in trg_deleted:
            for bcn_el in trg_dict[trg_el]:
                if bcn_el in bcn_div_dict: # if not already deleted above
                    if is_list:
                        bcn_div_dict[bcn_el] -= len(bcn_dict[bcn_el][trg_el])
                    else:
                        bcn_div_dict[bcn_el] -= bcn_dict[bcn_el][trg_el]
                    del bcn_dict[bcn_el][trg_el]
            del trg_dict[trg_el]
            del trg_div_dict[trg_el]
        deletion_iteration += 1
    # check for consistency
    for bcn_el in bcn_dict:
        for trg_el in bcn_dict[bcn_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el]) != len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
    for trg_el in trg_dict:
        for bcn_el in trg_dict[trg_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el]) != len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
    return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
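# A minimal, self-contained sketch (toy data, not pipeline output) of the pruning rule above.
# The loop must iterate because each deletion can push a partner UMI below threshold: here b2
# is deleted first (only 1 UEI); that drops t2 to 1 UEI, so t2 is deleted on the next
# iteration, which in turn drops b1 to 2 UEIs -- still enough to retain the b1-t1 pair.
def _toy_filter_demo():
    bcn_dict = {'b1': {'t1': 2, 't2': 1}, 'b2': {'t2': 1}} # UEI counts per beacon-target pair
    trg_dict = {'t1': {'b1': 2}, 't2': {'b1': 1, 'b2': 1}} # same counts, keyed target-first
    bcn_div_dict = {'b1': 3, 'b2': 1}                      # total UEI count per beacon UMI
    trg_div_dict = {'t1': 2, 't2': 2}                      # total UEI count per target UMI
    # Returns [{'b1': {'t1': 2}}, {'t1': {'b1': 2}}, {'b1': 2}, {'t1': 2}]
    return filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, 2)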
def crosscomparison_analysis(self, args):
    sysOps.initiate_statusfilename()
    list_of_dirs = list()
    file_to_compare = args[1]
    with open(sysOps.globaldatapath + args[2], 'rU') as csvfile:
        for myline in csvfile:
            thisline = myline.strip('\n').split(',')
            subdir = 'lib_' + str(thisline[0]) + '_' + str(thisline[1]) + '_' + str(thisline[2])
            list_of_dirs.append(subdir)
    print "Beginning comparison analysis"
    print "File to compare = " + file_to_compare
    print "Directories = " + ",".join(list_of_dirs)
    try:
        os.mkdir(sysOps.globaldatapath + 'cross_comparisons')
    except:
        sysOps.throw_exception('cross_comparisons directory already exists. Terminating comparison analysis.')
        sysOps.exitProgram()
    shared_num_unique_matrix = list()
    unshared_num_unique_matrix = list()
    shared_read_abund_matrix = list()
    unshared_read_abund_matrix = list()
    for i in range(len(list_of_dirs)):
        shared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
        unshared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
        shared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
        unshared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
    for ind1 in range(len(list_of_dirs)):
        for ind2 in range(ind1):
            dir1 = list_of_dirs[ind1]
            dir2 = list_of_dirs[ind2]
            clustfile1 = dir1 + "//" + file_to_compare
            clustfile2 = dir2 + "//" + file_to_compare
            # remove superdirectory structure of path -- requires individual directories to have unique names
            dir1_abbrev = dir1[(dir1.rfind('/') + 1):]
            dir2_abbrev = dir2[(dir2.rfind('/') + 1):]
            sysOps.throw_status('Began writing cross_comparisons//' + dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare)
            [num_unique_shared, num_unique_unshared, read_abundance_shared, read_abundance_unshared] = alignOps.compare(
                clustfile1, clustfile2, dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare, False)
            sysOps.throw_status('Completed writing cross_comparisons//' + dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare)
            shared_num_unique_matrix[ind1][ind2] = num_unique_shared[0]
            shared_num_unique_matrix[ind2][ind1] = num_unique_shared[1]
            unshared_num_unique_matrix[ind1][ind2] = num_unique_unshared[0]
            unshared_num_unique_matrix[ind2][ind1] = num_unique_unshared[1]
            print str(num_unique_unshared[0]) + ' -> unshared_num_unique_matrix[' + str(ind1) + '][' + str(ind2) + ']'
            shared_read_abund_matrix[ind1][ind2] = read_abundance_shared[0]
            shared_read_abund_matrix[ind2][ind1] = read_abundance_shared[1]
            unshared_read_abund_matrix[ind1][ind2] = read_abundance_unshared[0]
            unshared_read_abund_matrix[ind2][ind1] = read_abundance_unshared[1]
    print shared_num_unique_matrix
    print unshared_num_unique_matrix
    print shared_read_abund_matrix
    print unshared_read_abund_matrix
    with open('comparison_matrices.csv', 'w') as compare_matrix_file:
        for i1 in range(len(list_of_dirs)):
            compare_matrix_file.write(','.join([str(j) for j in shared_num_unique_matrix[i1]]) + '\n')
        for i2 in range(len(list_of_dirs)):
            compare_matrix_file.write(','.join([str(j) for j in unshared_num_unique_matrix[i2]]) + '\n')
        for i3 in range(len(list_of_dirs)):
            compare_matrix_file.write(','.join([str(j) for j in shared_read_abund_matrix[i3]]) + '\n')
        for i4 in range(len(list_of_dirs)):
            compare_matrix_file.write(','.join([str(j) for j in unshared_read_abund_matrix[i4]]) + '\n')
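# A minimal reader sketch for comparison_matrices.csv (layout inferred from the writer above;
# this helper is illustrative and not part of the pipeline). The file holds four N x N blocks
# stacked vertically -- shared-unique, unshared-unique, shared-read-abundance,
# unshared-read-abundance -- with -1 marking entries never filled in (the diagonal).
def read_comparison_matrices(path, num_dirs):
    with open(path, 'rU') as infile:
        rows = [[float(x) for x in myline.strip('\n').split(',')] for myline in infile]
    return [rows[i * num_dirs:(i + 1) * num_dirs] for i in range(4)]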
def dnamic_inference(self, smle_infer=False, msmle_infer=False, segment_infer=False, compute_local_solutions_only=True):
    # Perform image inference on the basis of raw output of DNA microscopy sequence analysis
    # Basic settings
    read_thresh = 2
    min_uei_count = 2
    output_dim = 2
    version = 1.0
    infer_dir = ''
    # raw data files
    consensus_pairing_csv_file = "..//consensus_" + str(read_thresh) + "r_pairing_filter0.75_uei_umi.csv"
    outname = 'minuei' + str(min_uei_count) + 'DMv' + str(version) + '_' + str(read_thresh) + 'r_filter0.75'
    wmat_outfilename = 'wmat_' + outname + '.csv'
    param_name = 'minuei' + str(min_uei_count) + 'dim' + str(output_dim) + 'DMv' + str(version) + '_.csv'
    imagemodule_input_filename = 'data_' + param_name
    key_filename = 'key_' + param_name
    if not sysOps.check_file_exists('microscopy_tasklist.csv'):
        [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
        with open(sysOps.globaldatapath + 'microscopy_tasklist.csv', 'w') as task_input_file_handle:
            for subdir in subdirnames:
                if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                    task_input_file_handle.write('infer_smle;' + sysOps.globaldatapath + subdir + '//\n')
                    task_input_file_handle.write('infer_msmle;' + sysOps.globaldatapath + subdir + '//\n')
                    task_input_file_handle.write('infer_segment;' + sysOps.globaldatapath + subdir + '//\n')
                    task_input_file_handle.write('infer_ptmle;' + sysOps.globaldatapath + subdir + '//\n')
    original_datapath = str(sysOps.globaldatapath)
    if smle_infer:
        infer_dir = 'infer_smle//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_smle')
    elif msmle_infer:
        infer_dir = 'infer_msmle//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_msmle')
    elif segment_infer:
        infer_dir = 'infer_segment//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_segment')
    else:
        infer_dir = 'infer_ptmle//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_ptmle')
    if not (my_task is None):
        sysOps.initiate_runpath(str(my_task[1]))
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()
        dirnames = list(["."])
        subdirnames_nodatayet = [subdirname for subdirname in subdirnames
                                 if subdirname.startswith('sub')
                                 and (not sysOps.check_file_exists(subdirname + '//' + imagemodule_input_filename))]
        # sort by descending read count
        subdirnames_nodatayet = [subdirnames_nodatayet[i]
                                 for i in np.argsort(-np.array([int(subdirname[3:].strip('/'))
                                                                for subdirname in subdirnames_nodatayet]))]
        subdirnames_dataalready = [subdirname for subdirname in subdirnames
                                   if subdirname.startswith('sub')
                                   and (sysOps.check_file_exists(subdirname + '//' + imagemodule_input_filename))]
        # sort by descending read count
        subdirnames_dataalready = [subdirnames_dataalready[i]
                                   for i in np.argsort(-np.array([int(subdirname[3:].strip('/'))
                                                                  for subdirname in subdirnames_dataalready]))]
        dirnames.extend(subdirnames_nodatayet)
        dirnames.extend(subdirnames_dataalready)
        sysOps.throw_status('Checking directories ' + sysOps.globaldatapath + ' ... ' + str(dirnames) + ' for infer-subdirectories.')
        for dirname in dirnames: # make inference directories
            try:
                with open(sysOps.globaldatapath + dirname + '//' + infer_dir + 'tmpfile.txt', 'w') as tmpfile:
                    tmpfile.write('test')
                os.remove(sysOps.globaldatapath + dirname + '//' + infer_dir + 'tmpfile.txt')
                sysOps.throw_status('Directory ' + sysOps.globaldatapath + dirname + '//' + infer_dir + ' found already created.')
            except:
                os.mkdir(sysOps.globaldatapath + dirname + '//' + infer_dir)
                sysOps.throw_status('Created directory ' + sysOps.globaldatapath + dirname + '//' + infer_dir)
        for dirname in dirnames:
            sysOps.initiate_runpath(str(my_task[1]) + dirname + '//' + infer_dir)
            sysOps.initiate_statusfilename()
            sysOps.throw_status('Assigned path ' + sysOps.globaldatapath)
            if not (sysOps.check_file_exists(key_filename)
                    and sysOps.check_file_exists(imagemodule_input_filename)
                    and sysOps.check_file_exists('read_' + imagemodule_input_filename)
                    and sysOps.check_file_exists('seq_params_' + imagemodule_input_filename)):
                sysOps.throw_status('Calling matOps.generate_wmat()')
                trg_dict = matOps.generate_wmat(consensus_pairing_csv_file, read_thresh, min_uei_count, wmat_outfilename)
                sysOps.throw_status('Completed matOps.generate_wmat()')
                # print_imagemodule_input outputs:
                # 1. key_filename, containing 3 columns: 0 or 1 (for beacon or target, respectively), cluster-index, MLE processing index
                # 2. imagemodule_input_filename, containing 4 columns: MLE processing index for beacon, MLE processing index for target, UEI count, max UEI read count
                # 3. a summary file containing the number of beacons and the number of targets inputted to the MLE
                matOps.print_imagemodule_input(trg_dict, imagemodule_input_filename, key_filename, output_dim)
            else:
                sysOps.throw_status('Image-module input pre-computed. Proceeding ...')
            #optimOps.test_ffgt()
            if sysOps.check_file_exists(imagemodule_input_filename):
                if segment_infer:
                    optimOps.run_mle(imagemodule_input_filename, False, False, True, compute_local_solutions_only) # segmentation only
                elif msmle_infer:
                    optimOps.run_mle(imagemodule_input_filename, False, True, False, compute_local_solutions_only) # msMLE
                elif smle_infer:
                    optimOps.run_mle(imagemodule_input_filename, True, False, False, compute_local_solutions_only) # sMLE
                else:
                    optimOps.run_mle(imagemodule_input_filename, False, False, False, compute_local_solutions_only) # ptMLE
                if not compute_local_solutions_only:
                    dnamicOps.print_final_results('..//trg_amplicon_calls.csv', '..//trg_amplicon_calls.fasta')
                else:
                    sysOps.exitProgram()
            else:
                sysOps.throw_status('Could not locate ' + sysOps.globaldatapath + imagemodule_input_filename)
        sysOps.globaldatapath = str(original_datapath)
        if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
            sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
            sysOps.exitProgram()
    return
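# Illustrative rows (values hypothetical; column meanings taken from the comments above,
# comma-separated fields assumed) for the two inference input files named in dnamic_inference():
#   key_minuei2dim2DMv1.0_.csv:  0,5713,0   -> beacon (0), cluster-index 5713, MLE processing index 0
#   data_minuei2dim2DMv1.0_.csv: 0,12,3,2   -> beacon MLE index 0, target MLE index 12, 3 UEIs, max UEI read count 2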
def parse_seqform(parseable, amplicon_option=None):
    '''
    Parse input from a -seqform_for or -seqform_rev tag in the settings file.
    parseable must contain fields separated by '|' characters, each of the form X_position1:position2,
    where X is one of the following characters:
        1. P -- primer
        2. S -- spacer
        3. A -- amplicon
        4. U -- uxi
    X's may be redundant (there may be multiple primers, spacers, and amplicons).
    If a field has the form X_N_position1:position2 (with a string N between 2 underscores), N represents
    a sequence to which the input is aligned and its match-score stored (N's in the case of a uxi).
    Final form of a returned my_seqform dictionary entry is:
        Character1: [[[positionA1,positionA2], filter-sequence A (="" if none given)],
                     [[positionB1,positionB2], filter-sequence B (="" if none given)]]
    where each filter sequence is stored as numpy boolean vectors (seq_bool_vec, capital_bool_vec, ambig_vec).
    '''
    my_seqform = dict()
    parseable = parseable.split("|")
    for this_parseable in parseable:
        my_elements = this_parseable.split("_")
        try:
            if (len(my_elements) < 3):
                my_char = my_elements[0].upper()
                seq = ""
                boundaries = my_elements[1].split(":")
            else:
                my_char = my_elements[0].upper()
                seq = my_elements[1]
                boundaries = my_elements[2].split(":")
            if (len(boundaries[0]) == 0):
                boundaries = [None, int(boundaries[1])]
            elif (len(boundaries[1]) == 0):
                boundaries = [int(boundaries[0]), None]
            else:
                boundaries = [int(boundaries[0]), int(boundaries[1])]
            # check boundary/filter-length agreement only when a filter sequence and both boundaries are given
            # (checking len(my_elements) first avoids a spurious TypeError on open-ended fields such as 'A_8:')
            if (len(my_elements) == 3 and boundaries[0] is not None and boundaries[1] is not None
                    and boundaries[1] - boundaries[0] != len(seq)):
                sysOps.throw_exception('Error: mismatch between filter boundary-indices and filter string-size, boundaries='
                                       + str(boundaries) + ", seq=" + seq)
        except:
            sysOps.throw_exception("Error parsing seqform " + this_parseable)
        if my_char not in "PSAU":
            sysOps.throw_status("Ignoring this_parseable=" + this_parseable + " -- unrecognized character-type.")
        else:
            if my_char == "A" and type(amplicon_option) == str and type(boundaries[1]) != int:
                start_pos = int(boundaries[0])
                for sub_seq in amplicon_option.split(','):
                    len_sub_seq = len(sub_seq)
                    seq_bool_vec = np.zeros(4 * len_sub_seq, dtype=np.bool_)
                    capital_bool_vec = np.zeros(4 * len_sub_seq, dtype=np.bool_)
                    ambig_vec = np.zeros(len_sub_seq, dtype=np.bool_)
                    ambig_seq_to_np(sub_seq, seq_bool_vec, capital_bool_vec, ambig_vec)
                    if my_char in my_seqform:
                        my_seqform[my_char].append([[start_pos, start_pos + len_sub_seq], seq_bool_vec[:], capital_bool_vec, ambig_vec])
                    else:
                        my_seqform[my_char] = [[[start_pos, start_pos + len_sub_seq], seq_bool_vec, capital_bool_vec, ambig_vec]]
                    start_pos += len_sub_seq
                # since original type(boundaries[1]) != int, re-set the final boundary to open-ended
                my_seqform[my_char][len(my_seqform[my_char]) - 1][0][1] = None
            else:
                seq_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
                capital_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
                ambig_vec = np.zeros(len(seq), dtype=np.bool_)
                ambig_seq_to_np(seq, seq_bool_vec, capital_bool_vec, ambig_vec)
                if my_char in my_seqform:
                    my_seqform[my_char].append([boundaries, seq_bool_vec, capital_bool_vec, ambig_vec])
                else:
                    my_seqform[my_char] = [[boundaries, seq_bool_vec, capital_bool_vec, ambig_vec]]
    return my_seqform
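# Usage sketch (the seqform string is hypothetical but follows the grammar documented above):
#   my_seqform = parse_seqform('P_GACT_0:4|U_NNNNNN_4:10|A_10:')
# returns a dict keyed by 'P', 'U', 'A', each value a list of
# [boundaries, seq_bool_vec, capital_bool_vec, ambig_vec] entries; e.g.
# my_seqform['U'][0][0] == [4, 10] and my_seqform['A'][0][0] == [10, None] (open-ended amplicon).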
    globaldatapath
    '''
    global statuslogfilename
    global globaldatapath

if __name__ == '__main__': # Calls sub-routines
    #optimOps.test_ffgt()
    sys.argv[len(sys.argv) - 1] = sys.argv[len(sys.argv) - 1].strip('\r')
    sysOps.initiate_runpath('')
    sysOps.initiate_statusfilename('', make_file=False)
    sys.argv = sys.argv[1:] # remove first argument (script call)
    sysOps.throw_status('sys.argv = ' + str(sys.argv))
    if len(sys.argv) > 0 and sys.argv[0][(len(sys.argv[0]) - 2):] == '//':
        # if the first argument is a directory, use it as the data directory for all subsequent operations
        sysOps.initiate_runpath(sys.argv[0]) # initiate data run path
        sys.argv = sys.argv[1:] # remove directory from argument list
    sysOps.globalmasterProcess = masterProcesses.masterProcess([])
    if len(sys.argv) == 0 or sys.argv[0] == 'data_layout.csv':
        sysOps.globalmasterProcess.generate_uxi_library()
    elif sys.argv[0].endswith('infer'):
        compute_local_solutions_only = False
        if len(sys.argv) > 1 and sys.argv[1] == 'local':
            sysOps.throw_status('Performing local computing function alone.')
            compute_local_solutions_only = True
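# Invocation sketch (the script name is hypothetical; argument forms follow the dispatch above --
# a first argument ending in '//' selects the data directory, any mode string ending in 'infer'
# enters the inference branch, and a trailing 'local' restricts computation to local solutions):
#   python dnamic.py mydata//                  -> run generate_uxi_library() on mydata//
#   python dnamic.py mydata// smle_infer       -> inference branch
#   python dnamic.py mydata// smle_infer local -> inference, local computing only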