def generate_wmat(consensus_pairing_csv_file, minreadcount, min_uei_count, outfilename = 'wmat.csv'):
    """Build the filtered target-keyed UEI matrix from a consensus-pairing file.

    consensus_pairing_csv_file has elements:
        uei index, beacon-umi index, target-umi index, read-count
    The matrices come from get_umi_uei_matrices() and are pruned with
    filter_mats() using min_uei_count.  If outfilename is None, no feature
    file is printed; otherwise print_features() is called with the name
    'trg_' + outfilename.

    Returns the filtered trg_dict.
    """
    matrices = get_umi_uei_matrices(consensus_pairing_csv_file, minreadcount)
    [bcn_dict, trg_dict, bcn_abund_dict, trg_abund_dict, bcn_div_dict, trg_div_dict] = matrices

    both_populated = len(trg_dict) > 0 and len(bcn_dict) > 0
    if not both_populated:
        sysOps.throw_exception(consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()

    sysOps.throw_status(['Generating feature list.', sysOps.statuslogfilename])
    # Features are collected before filtering so they can be printed later.
    trg_feature_dict_list = get_features_from_dict(trg_dict)

    filtered = filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count)
    [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict] = filtered

    sysOps.throw_status(['Replacing matrix elements with UEI numbers (scalars).', sysOps.statuslogfilename])
    del bcn_dict  # only the target-keyed matrix is used from here on

    sysOps.throw_status(['Generating weight matrix.', sysOps.statuslogfilename])
    if not len(trg_dict) > 0:
        sysOps.throw_exception('After filtering, ' + consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()

    if outfilename is not None:
        print_features(trg_dict, 'trg_' + outfilename, trg_feature_dict_list)

    return trg_dict
def define_nuc_degeneracy(c1):
    """Expand one nucleotide code into the list of bases it stands for.

    c1 is upper-cased first.  'A', 'C', 'G', 'T' and 'U' map to a
    one-element list holding themselves; the degenerate codes
    N, W, S, M, K, R, Y, B, D, H and V map to their base lists.

    Any other input (including the empty string or a multi-character
    string) is reported through sysOps.throw_exception() followed by
    sysOps.exitProgram().
    """
    degenerate_codes = {
        'N': ['A', 'C', 'G', 'T'],
        'W': ['A', 'T'],
        'S': ['C', 'G'],
        'M': ['A', 'C'],
        'K': ['G', 'T'],
        'R': ['A', 'G'],
        'Y': ['C', 'T'],
        'B': ['C', 'G', 'T'],
        'D': ['A', 'G', 'T'],
        'H': ['A', 'C', 'T'],
        'V': ['A', 'C', 'G'],
    }
    c1 = c1.upper()
    # Require exactly one character: a bare substring test against 'ACGTU'
    # would also accept '' and multi-base strings such as 'AC'.
    if len(c1) == 1 and c1 in 'ACGTU':
        return [c1]
    if c1 in degenerate_codes:
        return list(degenerate_codes[c1])
    sysOps.throw_exception([
        'Error: ' + c1 + ' does not code for a single- or degenerate-nucleotide'
    ])
    sysOps.exitProgram()
def generate_uxi_library(self):
    """Claim one open library task and run its sequence analysis.

    Perform sequence analysis (read-parsing, clustering, pairing UEIs/UMIs,
    sub-sampling data for rarefaction analyses).  When
    'uxi_lib_tasklist.csv' does not exist yet it is created with one
    'generate_uxi_library' entry per sub-directory that contains a
    libsettings.txt.  If parallelOps.get_next_open_task() hands back no
    task, nothing further is done.
    """
    tasklist_filename = 'uxi_lib_tasklist.csv'
    if not sysOps.check_file_exists(tasklist_filename):
        # create task list for library processing
        [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
        with open(sysOps.globaldatapath + tasklist_filename, 'w') as tasklist_handle:
            for subdir in subdirnames:
                if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                    tasklist_handle.write('generate_uxi_library;' + sysOps.globaldatapath + subdir + '//\n')

    # Remembered so the global data path can be restored once the task is done.
    original_datapath = str(sysOps.globaldatapath)
    [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', tasklist_filename, 'generate_uxi_library')
    if my_task is None:
        return

    sysOps.initiate_runpath(str(my_task[1]))
    myLibObj = libOps.libObj(settingsfilename = 'libsettings.txt', output_prefix = '_')
    stats_present = sysOps.check_file_exists(myLibObj.output_prefix + 'lib_stats.txt')
    if not stats_present:
        myLibObj.partition_fastq_library(discarded_sequence_path = "discarded_sequences.fastq",
                                         mean_phred_score_path = "mean_phred_scores.txt")

    self.generate_cluster_analysis()

    libOps.subsample(myLibObj.seqform_for_params, myLibObj.seqform_rev_params, myLibObj.output_prefix)
    [subdirnames, filenames] = sysOps.get_directory_and_file_list()
    dirnames = [name for name in subdirnames if name.startswith('sub')]
    sysOps.throw_status('Performing cluster analysis on sub-directories: ' + str(dirnames))
    for dirname in dirnames:
        # Re-point the run path at each 'sub*' directory before clustering it.
        sysOps.initiate_runpath(str(my_task[1]) + dirname + '//')
        self.generate_cluster_analysis()

    sysOps.globaldatapath = str(original_datapath)
    task_closed = parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start)
    if not task_closed:
        sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
        sysOps.exitProgram()
def threshold_cluster_uxi_prelinked(uxi_list,identical_uxi_filename,threshold,P=0,subsample = -1, prefix = ''):
    """Assign every uxi to a cluster seed and write the clusters to file.

    Function will be called while loading linkage_file into uxi_list through
    load_linkage_file_to_list(linkage_file) in hashAlignments.py.
    Format of linkage file:
        uxi-sequence, self-read-number, RND: list of linked-to indices with
        self-index as first in line
    uxi_list elements:
        [uxi-sequence, self-read-number, RND,
         [list of linked-to indices with self-index as first in line]]

    Rows are visited by decreasing RND (row[2]).  A row whose own index is
    still unassigned becomes its own seed; each still-unassigned linked index
    is then given the seed value of the visiting row.

    Output lines have the form '<cluster number>_<uxi-sequence>_<read-number>'
    and go to sysOps.globaldatapath + prefix + 'thresh<threshold>_' +
    identical_uxi_filename, or, when subsample > 0, to
    ... + 'thresh<threshold>_sub<subsample>' + identical_uxi_filename.

    P is accepted but not used by this function.  Returns True.
    """
    num_uxi = len(uxi_list)
    sysOps.throw_status('Starting uxi list sort. List size = ' + str(num_uxi))
    # sorted() builds a new list, but its rows are the same objects as in uxi_list.
    sorted_uxi_list = sorted(uxi_list, key=lambda row: -row[2])
    # index_vals is indexed by _original_ position in uxi_list; -1 means unassigned.
    index_vals = [-1] * num_uxi
    sysOps.throw_status('Completed uxi list sort. Assigning EASL-clusters ...')

    for sorted_uxi_el in sorted_uxi_list:
        linked_indices = sorted_uxi_el[3]
        self_index = linked_indices[0]
        if index_vals[self_index] < 0:
            # not yet claimed by any seed: this row seeds its own cluster
            index_vals[self_index] = int(self_index)
        my_index_val = int(index_vals[self_index])
        for i in range(1, len(linked_indices)):
            if index_vals[linked_indices[i]] < 0:
                # connected read is unassigned -- assign to current cluster seed
                index_vals[linked_indices[i]] = my_index_val

    sysOps.throw_status('Consolidating clustered uxis ...')
    if -1 in index_vals:
        sysOps.throw_exception('Error: UNASSIGNED/UNCLUSTERED uxis. Exiting program')
        sysOps.exitProgram()

    # Group '<sequence>_<read-number>' labels by the seed index they were given.
    new_uxi_dict = dict()
    for i in range(num_uxi):
        my_index_str = str(int(index_vals[i]))
        new_uxi_dict.setdefault(my_index_str, []).append(uxi_list[i][0] + "_" + str(uxi_list[i][1]))

    if subsample <= 0:
        out_name = prefix + "thresh" + str(threshold) + "_" + identical_uxi_filename
    else:
        out_name = prefix + "thresh" + str(threshold) + "_sub" + str(subsample) + identical_uxi_filename

    # Context manager so the handle is closed even if a write fails.
    with open(sysOps.globaldatapath + out_name, 'w') as new_uxi_handle:
        cluster_number = 0
        for dict_el in new_uxi_dict:
            for el in new_uxi_dict[dict_el]:
                new_uxi_handle.write(str(cluster_number) + "_" + el + "\n")
            cluster_number += 1

    # Single-argument call form prints identically under Python 2 and 3.
    print("Completed clustering.")
    return True
def generate_cluster_analysis(self):
    """Cluster UMI/UEI sequences, then build pairing and amplicon-call files.

    Perform clustering analysis of UMI and UEI sequences, consolidate
    pairings and determine consenses of these pairings.

    Each of '_for_uxi0.fasta', '_for_uxi1.fasta' and '_rev_uxi0.fasta' that
    exists is passed to hashAlignments.initiate_hash_alignment(); a missing
    file is skipped and counts as up to date.  Only when all three results
    are truthy are the downstream files produced:
      * the pairing file (dnamicOps.assign_umi_pairs) if absent,
      * the consensus-pairing file (dnamicOps.assign_consensus_pairs) if
        absent,
      * 'trg_amplicon_calls.csv' (dnamicOps.assign_umi_amplicons) if absent.
    """
    sysOps.initiate_statusfilename()
    missing_uxi_files = sysOps.find_missing_uxi_files('libsettings.txt', '_')
    if len(missing_uxi_files) > 0:
        sysOps.throw_exception('Missing uxi files: ' + str(missing_uxi_files))

    clustering_up_to_date = []
    for uxi_name in ('for_uxi0', 'for_uxi1', 'rev_uxi0'):
        fasta_name = '_' + uxi_name + '.fasta'
        if sysOps.check_file_exists(fasta_name):
            sysOps.throw_status("Clustering " + uxi_name)
            clustering_up_to_date.append(hashAlignments.initiate_hash_alignment(fasta_name))
        else:
            clustering_up_to_date.append(True)
            sysOps.throw_status(sysOps.globaldatapath + fasta_name + ' does not exist. Skipping.')

    if not all(clustering_up_to_date):
        return

    filter_val = 0.75  # maximum fraction of same-base permitted in a single UMI/UEI
    min_pairing_readcount = 2
    pairing_filename = "pairing_filter" + str(filter_val) + "_uei_umi.csv"
    # assign_consensus_pairs() names its output
    # 'consensus_<min_pairing_readcount>r_<pairing file>'.  The pre-computed
    # check must look for that same name, otherwise it never matches and the
    # consensus file is regenerated on every call.
    consensus_filename = 'consensus_' + str(min_pairing_readcount) + 'r_' + pairing_filename

    sysOps.throw_status('Clustering completed. Beginning final output.')

    if (sysOps.check_file_exists('thresh1_identical__for_uxi0.fasta')
            and sysOps.check_file_exists('thresh1_identical__for_uxi1.fasta')
            and sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta')
            and not sysOps.check_file_exists(consensus_filename)):
        if not sysOps.check_file_exists(pairing_filename):
            # final parameter = False: excluding invalid amplicon sequences
            dnamicOps.assign_umi_pairs('thresh1_identical__for_uxi1.fasta',
                                       'thresh1_identical__for_uxi0.fasta',
                                       'thresh1_identical__rev_uxi0.fasta',
                                       '_for_uxi1.fasta', '_for_uxi0.fasta', '_rev_uxi0.fasta',
                                       'pairing', filter_val, False)
        dnamicOps.assign_consensus_pairs(pairing_filename, min_pairing_readcount)
    else:
        sysOps.throw_status('Consensus-pairing file found pre-computed.')

    if (sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta')
            and not sysOps.check_file_exists('trg_amplicon_calls.csv')):
        # assign amplicon-identities to target umi's
        sysOps.throw_status('Assigning amplicon-identities and consensus sequences to target umis.')
        dnamicOps.assign_umi_amplicons('thresh1_identical__rev_uxi0.fasta', '_rev_uxi0.fasta',
                                       '_amp_match.txt', '_rev_amp0.fasta',
                                       'trg_amplicon_calls.csv')
def get_next_uxi_file_entry(handle):
    """Read one header-plus-ids record from an open uxi file.

    The header line is split on '_' and is expected to have three fields;
    the third is the number of id lines that follow.  Returns
    [header_fields, id_list], or [[], []] once the handle is exhausted.
    A header without exactly three fields is reported through
    sysOps.throw_exception().
    """
    raw_header = handle.readline()
    if not raw_header:
        # readline() yields '' only at end of file
        return [[], []]

    fields = raw_header.strip('\n').split('_')
    if len(fields) != 3:
        sysOps.throw_exception(
            'Error in get_next_uxi_file_entry(): new line = ' + '_'.join(fields))

    id_count = int(fields[2])
    id_list = [handle.readline().strip('\n') for _ in range(id_count)]
    return [fields, id_list]
def group_uxi_reads(uxi_clust_file, uxi_list_file):
    """Group read ids by cluster.

    Takes in clustered-file and identically-matched file, generates look-up
    of reads based on identically-matched file.

    Each line of uxi_clust_file (read from sysOps.globaldatapath) is split on
    '_' into cluster-index, uxi and read-number.  The entries that
    load_uxi_dict(uxi_list_file) holds for that uxi are appended to the list
    at position cluster-index.

    Returns a list of lists, one inner list per cluster index.
    """
    uxi_dict = load_uxi_dict(uxi_list_file)
    read_id_grouping = []
    # Context manager so the handle is released even if a line fails to parse.
    with open(sysOps.globaldatapath + uxi_clust_file, 'rU') as uxi_clust_handle:
        for uxi_clust_line in uxi_clust_handle:
            [clust_index, my_uxi, read_num] = uxi_clust_line.strip('\n').split("_")
            clust_index = int(clust_index)
            if clust_index >= len(read_id_grouping):
                # first line seen for a new cluster index
                read_id_grouping.append([])
            if my_uxi not in uxi_dict:
                # Single-argument call form prints identically under Python 2 and 3.
                print("Error: could not find " + my_uxi)
                sysOps.throw_exception("Could not find " + my_uxi)
            read_id_grouping[clust_index].extend(uxi_dict[my_uxi])
    return read_id_grouping
def filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count):
    """Prune UEI data to exclude UMIs with UEI counts < min_uei_count.

    bcn_dict[bcn][trg] and trg_dict[trg][bcn] hold either lists or scalar
    values; which one is detected once from the first element found.
    bcn_div_dict / trg_div_dict hold the per-UMI counts compared against
    min_uei_count.  UMIs below the threshold are removed, their partners'
    counts are reduced accordingly, and the pass repeats until nothing more
    is removed.  All four dictionaries are modified in place.

    Returns [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict].
    """
    if not len(bcn_dict):
        return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]

    deletion_iteration = 0
    is_list = None
    sysOps.throw_status('Filtering matrices with ' + str(len(bcn_div_dict)) + '+' + str(len(trg_div_dict)) + ' UMIs.')

    while True:
        bcn_deleted = [bcn for bcn in bcn_div_dict if bcn_div_dict[bcn] < min_uei_count]
        trg_deleted = [trg for trg in trg_div_dict if trg_div_dict[trg] < min_uei_count]
        bcn_retained = len(bcn_div_dict) - len(bcn_deleted)
        trg_retained = len(trg_div_dict) - len(trg_deleted)

        # check if bcn_dict and trg_dict are still list or already converted to values
        if is_list is None:
            for probe_bcn in bcn_dict:
                for probe_trg in bcn_dict[probe_bcn]:
                    is_list = (type(bcn_dict[probe_bcn][probe_trg]) is list)
                    break
                break  # only the first beacon entry is inspected

        if not bcn_deleted and not trg_deleted:
            sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ', all retained.')
            break

        sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration)
                            + ' deleting ' + str(len(bcn_deleted)) + '+' + str(len(trg_deleted))
                            + ', retained ' + str(bcn_retained) + '+' + str(trg_retained)
                            + '. is_list=' + str(is_list))

        if is_list is None:
            sysOps.throw_exception('Error, could not find any elements: len(bcn_dict) = ' + str(len(bcn_dict)))
            sysOps.exitProgram()

        # Remove under-threshold beacons and discount them from their targets.
        for bcn in bcn_deleted:
            for trg in bcn_dict[bcn]:
                if is_list:
                    trg_div_dict[trg] -= len(trg_dict[trg][bcn])
                else:
                    trg_div_dict[trg] -= trg_dict[trg][bcn]
                del trg_dict[trg][bcn]
            del bcn_dict[bcn]
            del bcn_div_dict[bcn]

        # Remove under-threshold targets and discount them from their beacons.
        for trg in trg_deleted:
            for bcn in trg_dict[trg]:
                if bcn in bcn_div_dict:  # if not already deleted above
                    if is_list:
                        bcn_div_dict[bcn] -= len(bcn_dict[bcn][trg])
                    else:
                        bcn_div_dict[bcn] -= bcn_dict[bcn][trg]
                    del bcn_dict[bcn][trg]
            del trg_dict[trg]
            del trg_div_dict[trg]

        deletion_iteration += 1

    # check for consistency (list lengths must agree in both directions)
    for bcn in bcn_dict:
        for trg in bcn_dict[bcn]:
            if is_list and len(bcn_dict[bcn][trg]) != len(trg_dict[trg][bcn]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()

    for trg in trg_dict:
        for bcn in trg_dict[trg]:
            if is_list and len(bcn_dict[bcn][trg]) != len(trg_dict[trg][bcn]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()

    return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
def print_final_results(trgcalls_filename, trgseq_filename):
    """Write final_* and final_feat_* files for every unprocessed Xumi_* result.

    output final_*.csv containing columns (index, x, y, ...): the original
    cluster index followed by the remaining columns of the matching result row
    output final_feat_*.csv containing columns (index, features, consensus
    sequence (if target)); beacons get the fixed features '-1,-1,-1,N'

    For each file in the current directory listing whose name starts with
    'Xumi_' and for which no 'final_' counterpart exists, the key file
    (name derived from the first 'seq_params*' file) maps cluster indices to
    the indices used in the result file.  trgcalls_filename and
    trgseq_filename are read in lockstep, one CSV line per fasta record, to
    obtain the target features.

    If the key file is missing, sysOps.throw_exception() is called for that
    result file.
    """
    [dirnames, filenames] = sysOps.get_directory_and_file_list()
    seq_dat_filename = [
        filename for filename in filenames
        if filename.startswith('seq_params')
    ]
    seq_dat_filename = seq_dat_filename[0][len('seq_params_'):]

    for result_dat_file in filenames:
        if (not result_dat_file.startswith('Xumi_')
                or sysOps.check_file_exists('final_' + result_dat_file)):
            continue

        key_dat_file = 'key' + seq_dat_filename[(seq_dat_filename.find('_')):]
        if not sysOps.check_file_exists(key_dat_file):
            sysOps.throw_exception(sysOps.globaldatapath + key_dat_file + ' does not exist.')
            continue

        sysOps.throw_status('Generating final output for ' + sysOps.globaldatapath + str(result_dat_file))

        # First result column is the index; the rest is kept as a CSV fragment.
        coords_dict = dict()
        result = np.loadtxt(sysOps.globaldatapath + result_dat_file, delimiter=',')
        for i in range(result.shape[0]):
            coords_dict[str(int(result[i, 0]))] = ','.join(
                [str(x) for x in result[i, 1:]])

        # Context managers guarantee the handles are closed if parsing fails.
        trg_match_dict = dict()
        with open(sysOps.globaldatapath + trgcalls_filename, 'rU') as trg_match_file, \
                open(sysOps.globaldatapath + trgseq_filename, 'rU') as trg_seq_file:
            for line, fasta_record in itertools.izip(
                    trg_match_file, SeqIO.parse(trg_seq_file, "fasta")):
                [trg_umi_index, max_match, max_tally, tot_tally] = line.strip('\n').split(',')
                trg_match_dict[trg_umi_index] = [
                    str(max_match), str(max_tally), str(tot_tally),
                    str(fasta_record.seq)
                ]

        bcn_excluded = 0
        trg_excluded = 0
        with open(sysOps.globaldatapath + '//final_' + result_dat_file, 'w') as outfile, \
                open(sysOps.globaldatapath + '//final_feat_' + result_dat_file, 'w') as outfile_feat, \
                open(sysOps.globaldatapath + key_dat_file, 'rU') as key_file:
            for line in key_file:
                # key file columns: 0 or 1 (for beacon or target, respectively),
                # cluster-index, MLE processing index
                [bcn0trg1, orig_index, mle_index] = line.strip('\n').split(',')
                if mle_index in coords_dict:
                    outfile.write(orig_index + ',' + coords_dict[mle_index] + '\n')
                    if bcn0trg1 == '0':
                        outfile_feat.write(orig_index + ',-1,-1,-1,N\n')
                    else:
                        outfile_feat.write(
                            orig_index + ',' + ','.join(trg_match_dict[orig_index]) + '\n')
                elif bcn0trg1 == '0':
                    bcn_excluded += 1
                else:
                    trg_excluded += 1

        sysOps.throw_status(
            str(bcn_excluded) + ' beacons, ' + str(trg_excluded) +
            ' targets excluded from final estimation')
    return
def assign_consensus_pairs(pairing_csv_file, min_pairing_readcount):
    '''
    Reduce every UEI cluster to its plurality beacon/target UMI pairing.

    Assumes CSV file (read from sysOps.globaldatapath) with columns:
        1. UEI cluster-index
        2. Beacon UMI cluster-index
        3. Target UMI cluster-index
        4. Read-number
        5. Set-index (per the original note: will all be 0 if
           invalid-amplicon reads are excluded)

    A UEI is accepted only if its best pairing has at least
    min_pairing_readcount reads and strictly more reads than any other
    pairing that also reaches min_pairing_readcount; ties are discarded.

    Accepted UEIs are written to
    'consensus_<min_pairing_readcount>r_<pairing_csv_file>' as
    uei-index, beacon-umi-index, target-umi-index, read-count, set-index.
    '''
    sysOps.throw_status('Loading pairing file ' + pairing_csv_file + ' ...')
    uei_clust_index_dict = dict()
    with open(sysOps.globaldatapath + pairing_csv_file, 'rU') as csvfile:
        for line in csvfile:
            row = line.strip('\n').split(',')
            index_str = str(row[0])  # UEI cluster-index
            # entry row: beacon-umi index, target-umi index, read-number, set-index
            uei_clust_index_dict.setdefault(index_str, []).append(
                [int(row[1]), int(row[2]), int(row[3]), int(row[4])])

    # replace each entry with umi pairing having plurality of reads, in same indexed format
    sysOps.throw_status('Generating consensus-pairs ...')
    discarded_ueis = 0
    accepted_ueis = 0
    for uei_clust_el in uei_clust_index_dict:
        maxcount = 0
        secondmaxcount = 0  # detect ties, discard if tie exists
        maxcount_pair_bcn_index = -1
        maxcount_pair_trg_index = -1
        maxcount_set_index = -1
        for row in uei_clust_index_dict[uei_clust_el]:
            if row[2] < min_pairing_readcount:
                continue  # pairings below the read threshold never compete
            if row[2] > maxcount:
                secondmaxcount = int(maxcount)
                if maxcount_set_index >= 0 and maxcount_set_index != row[3]:
                    sysOps.throw_exception('Error: set-index mismatch.')
                    sysOps.exitProgram()
                maxcount_pair_bcn_index = int(row[0])
                maxcount_pair_trg_index = int(row[1])
                maxcount = int(row[2])
                maxcount_set_index = int(row[3])
            elif row[2] > secondmaxcount:
                secondmaxcount = int(row[2])

        if maxcount >= min_pairing_readcount and maxcount > secondmaxcount:
            # note: this condition requires that not only must the uei have at least
            # min_pairing_readcount, but the plurality-tally must be at least
            # min_pairing_readcount as well
            uei_clust_index_dict[uei_clust_el] = [
                int(maxcount_pair_bcn_index),
                int(maxcount_pair_trg_index),
                int(maxcount),
                int(maxcount_set_index)
            ]
            accepted_ueis += 1
        else:
            uei_clust_index_dict[uei_clust_el] = list()
            discarded_ueis += 1

    sysOps.throw_status('Outputting consensus-pairs with at least ' +
                        str(min_pairing_readcount) + ' read-plurality. Accepted ' +
                        str(accepted_ueis) + ' UEIs, discarded ' +
                        str(discarded_ueis) + ' UEIs ...')

    # Context manager so the output is flushed and closed even if a write fails.
    out_path = (sysOps.globaldatapath + "consensus_" +
                str(min_pairing_readcount) + "r_" + pairing_csv_file)
    with open(out_path, 'w') as outfile_handle:
        for uei_clust_el in uei_clust_index_dict:
            if len(uei_clust_index_dict[uei_clust_el]) > 0:
                outfile_handle.write(
                    uei_clust_el + "," +
                    ",".join([str(s) for s in uei_clust_index_dict[uei_clust_el]]) +
                    "\n")
    return
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file, amp_seq_fasta, outfilename):
    """Call an amplicon identity and consensus sequence for each target umi.

    function will tally reads counted for each target umi across each
    amplicon-call, and write a csv file (outfilename) with the following
    columns:
    (target umi cluster-index),(leading amplicon-call),
    (reads for leading amplicon-call),(total reads counted)
    A fasta file with the same stem as outfilename receives the consensus
    sequence of every target umi that was written to the csv.

    If amp_match_file cannot be opened, amplicon calls are instead made by
    direct, un-gapped alignment of each consensus sequence against the
    references in 'amplicon_refs.txt' (lines of the form
    'name|subseq1,subseq2,...'); in that mode the reads-for-leading-call
    column is -1, and every 'consensus*' file in the current directory is
    renamed to 'unfiltered_<name>' and rewritten to keep only rows whose
    target umi index was accepted.
    """
    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath + trg_umi_cluster_file)
    # dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(trg_umi_cluster_file)

    trg_umi_dict = dict()       # umi index -> {amplicon-call: read tally}
    trg_amp_seq_dict = dict()   # umi index -> baseTally of amplicon reads
    realign_amplicons = False
    amp_match_handle = None

    with open(sysOps.globaldatapath + trg_umi_fasta, "rU") as trg_umi_handle, \
            open(sysOps.globaldatapath + amp_seq_fasta, "rU") as amp_seq_handle:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath + amp_match_file)
        try:
            amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
        except IOError:
            # Only a failure to open the match file switches to realignment.
            sysOps.throw_status(
                sysOps.globaldatapath + amp_match_file +
                ' not found. Alignments will occur from sequence-consenses directly.'
            )
            realign_amplicons = True
            if not sysOps.check_file_exists('amplicon_refs.txt'):
                sysOps.throw_exception('Error: ' + sysOps.globaldatapath + 'amplicon_refs.txt not found.')
                sysOps.exitProgram()

        try:
            for trg_umi_record, amp_seq_record in itertools.izip(
                    SeqIO.parse(trg_umi_handle, "fasta"),
                    SeqIO.parse(amp_seq_handle, "fasta")):
                if not realign_amplicons:
                    amp_match = int(amp_match_handle.readline().strip('\n'))
                else:
                    amp_match = -1
                trg_umi_seq = str(trg_umi_record.seq)
                if trg_umi_seq in trg_umi_cluster_dict:
                    trg_umi_index = str(trg_umi_cluster_dict[trg_umi_seq][0])  # uxi cluster-index
                    if trg_umi_index not in trg_umi_dict:
                        trg_umi_dict[trg_umi_index] = dict()
                        trg_amp_seq_dict[trg_umi_index] = baseTally()
                    call_tally = trg_umi_dict[trg_umi_index]
                    # add 1, because every read is being entered
                    call_tally[amp_match] = call_tally.get(amp_match, 0) + 1
                    trg_amp_seq_dict[trg_umi_index].add_record(str(amp_seq_record.seq), 1)
        finally:
            if amp_match_handle is not None:
                amp_match_handle.close()

    ref_sequences = list()
    max_mismatch_amplicon = None
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt', 'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse complementary
                # orientation. We therefore reverse both complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])

    trg_umi_index_dict = dict()
    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0

    fasta_path = sysOps.globaldatapath + outfilename[:outfilename.rfind('.')] + '.fasta'
    with open(sysOps.globaldatapath + outfilename, 'w') as csvfile, \
            open(fasta_path, 'w') as fastafile:
        for trg_umi_index in trg_umi_dict:
            max_tally = 0
            tot_tally = 0
            for amp_match in trg_umi_dict[trg_umi_index]:
                my_tally = trg_umi_dict[trg_umi_index][amp_match]
                if my_tally >= max_tally:
                    max_tally = int(my_tally)
                    max_match = int(amp_match)
                tot_tally += int(my_tally)

            consensus_seq = str(trg_amp_seq_dict[trg_umi_index].get_str_consensus())

            if realign_amplicons:
                # perform direct, un-gapped alignment of consensus_seq to
                # reference options to obtain max_match
                max_match = -1
                max_tally = -1  # exclude max_tally as count, since alignment is happening post-consensus
                min_mismatch_count = -1
                for i in range(len(ref_sequences)):
                    all_subamplicons_pass = True
                    start_index = 0
                    tot_mismatches = 0
                    for j in range(len(ref_sequences[i])):  # loop through sub-amplicon-sequences
                        ref_subamplicon_len = len(ref_sequences[i][j])
                        my_mismatches, minlen = alignOps.count_mismatches(
                            ref_sequences[i][j],
                            consensus_seq[start_index:(start_index + ref_subamplicon_len)])
                        if minlen == 0:
                            all_subamplicons_pass = False
                            break
                        all_subamplicons_pass = all_subamplicons_pass and (
                            my_mismatches / float(minlen) <= max_mismatch_amplicon)
                        start_index += ref_subamplicon_len
                        tot_mismatches += my_mismatches
                    # Keep the passing reference with the FEWEST mismatches.  The
                    # comparison was previously reversed, which selected the
                    # passing reference with the most mismatches.
                    if all_subamplicons_pass and (
                            max_match < 0 or tot_mismatches < min_mismatch_count):
                        max_match = int(i)
                        min_mismatch_count = int(tot_mismatches)

            if max_match >= 0:
                csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                              str(max_tally) + "," + str(tot_tally) + "\n")
                fastafile.write(">" + trg_umi_index + '\n')
                fastafile.write(consensus_seq + '\n')
                if realign_amplicons:
                    trg_umi_index_dict[trg_umi_index] = True
                accepted_consensus_sequences += 1
            else:
                inadmis_consensus_sequences += 1

    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                        str(accepted_consensus_sequences + inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath + outfilename +
                        ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(sysOps.globaldatapath + consensus_filename,
                      sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename, 'w') as new_consensus_file:
                with open(sysOps.globaldatapath + 'unfiltered_' + consensus_filename,
                          'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        # [uei_index, bcn_umi_index, trg_umi_index, read_count, (additional variables)]
                        consensus_list = old_consensus_file_line.strip('\n').split(',')
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences + inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' + sysOps.globaldatapath +
                                consensus_filename + ' due to inadequate amplicon match.')

        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
def dnamic_inference(self, smle_infer=False, msmle_infer=False, segment_infer=False, compute_local_solutions_only=True):
    # Perform image inference on the basis of raw output of DNA microscopy sequence analysis.
    #
    # Exactly one inference mode is used per call, chosen in this priority order:
    # smle_infer, msmle_infer, segment_infer, otherwise ptMLE. The mode selects both
    # the task label requested from parallelOps and the 'infer_*//' sub-directory used.
    # If no open task is returned, the function does nothing further.
    # compute_local_solutions_only is forwarded as the last argument of optimOps.run_mle().

    # Basic settings
    read_thresh = 2
    min_uei_count = 2
    output_dim = 2
    version = 1.0
    infer_dir = ''

    # raw data files (the consensus file is addressed relative to the infer_* directory)
    consensus_pairing_csv_file = "..//consensus_" + str(read_thresh) + "r_pairing_filter0.75_uei_umi.csv"
    outname = 'minuei' + str(min_uei_count) + 'DMv' + str(version) + '_' + str(read_thresh) + 'r_filter0.75'
    wmat_outfilename = 'wmat_' + outname + '.csv'
    param_name = 'minuei' + str(min_uei_count) + 'dim' + str(output_dim) + 'DMv' + str(version) + '_.csv'
    imagemodule_input_filename = 'data_' + param_name
    key_filename = 'key_' + param_name

    if not sysOps.check_file_exists('microscopy_tasklist.csv'):
        # No task list yet: write four tasks (one per inference mode) for every
        # sub-directory that contains a libsettings.txt.
        [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
        with open(sysOps.globaldatapath + 'microscopy_tasklist.csv', 'w') as task_input_file_handle:
            for subdir in subdirnames:
                if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                    task_input_file_handle.write('infer_smle;' + sysOps.globaldatapath + subdir + '//\n')
                    task_input_file_handle.write('infer_msmle;' + sysOps.globaldatapath + subdir + '//\n')
                    task_input_file_handle.write('infer_segment;' + sysOps.globaldatapath + subdir + '//\n')
                    task_input_file_handle.write('infer_ptmle;' + sysOps.globaldatapath + subdir + '//\n')

    # Saved so the global data path can be restored after the task has run.
    original_datapath = str(sysOps.globaldatapath)
    if smle_infer:
        infer_dir = 'infer_smle//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_smle')
    elif msmle_infer:
        infer_dir = 'infer_msmle//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_msmle')
    elif segment_infer:
        infer_dir = 'infer_segment//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_segment')
    else:
        infer_dir = 'infer_ptmle//'
        [my_task, time_start] = parallelOps.get_next_open_task('tasklog.csv', 'microscopy_tasklist.csv', 'infer_ptmle')

    if not (my_task is None):
        sysOps.initiate_runpath(str(my_task[1]))
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()

        # Processing order: the task directory itself ("."), then 'sub*' directories
        # without an image-module input file, then those that already have one.
        # NOTE(review): the existence test looks directly under each 'sub*' directory,
        # while the file is later generated under '<dir>//<infer_dir>' -- confirm intended.
        dirnames = list(["."])
        subdirnames_nodatayet = [subdirname for subdirname in subdirnames if subdirname.startswith('sub') and (not sysOps.check_file_exists(subdirname + '//' + imagemodule_input_filename))]
        # order by descending integer suffix of the directory name
        # (original comment: sort by descending read count)
        subdirnames_nodatayet = [subdirnames_nodatayet[i] for i in np.argsort(-np.array([int(subdirname[3:].strip('/')) for subdirname in subdirnames_nodatayet]))]
        subdirnames_dataalready = [subdirname for subdirname in subdirnames if subdirname.startswith('sub') and (sysOps.check_file_exists(subdirname + '//' + imagemodule_input_filename))]
        # same ordering as above (original comment: sort by descending read count)
        subdirnames_dataalready = [subdirnames_dataalready[i] for i in np.argsort(-np.array([int(subdirname[3:].strip('/')) for subdirname in subdirnames_dataalready]))]
        dirnames.extend(subdirnames_nodatayet)
        dirnames.extend(subdirnames_dataalready)
        sysOps.throw_status('Checking directories ' + sysOps.globaldatapath + ' ... ' + str(dirnames) + ' for infer-subdirectories.')

        for dirname in dirnames:
            # make inference directories: probe by writing and removing a temporary
            # file; any failure of the probe is treated as "directory missing".
            try:
                with open(sysOps.globaldatapath + dirname + '//' + infer_dir + 'tmpfile.txt', 'w') as tmpfile:
                    tmpfile.write('test')
                os.remove(sysOps.globaldatapath + dirname + '//' + infer_dir + 'tmpfile.txt')
                sysOps.throw_status('Directory ' + sysOps.globaldatapath + dirname + '//' + infer_dir + ' found already created.')
            except:
                os.mkdir(sysOps.globaldatapath + dirname + '//' + infer_dir)
                sysOps.throw_status('Created directory ' + sysOps.globaldatapath + dirname + '//' + infer_dir)

        for dirname in dirnames:
            # Re-point the run path at this directory's inference sub-directory.
            sysOps.initiate_runpath(str(my_task[1]) + dirname + '//' + infer_dir)
            sysOps.initiate_statusfilename()
            sysOps.throw_status('Assigned path ' + sysOps.globaldatapath)

            # Regenerate the image-module input unless all four expected files exist.
            if not (sysOps.check_file_exists(key_filename) and sysOps.check_file_exists(imagemodule_input_filename) and sysOps.check_file_exists('read_' + imagemodule_input_filename) and sysOps.check_file_exists('seq_params_' + imagemodule_input_filename)):
                sysOps.throw_status('Calling matOps.generate_wmat()')
                trg_dict = matOps.generate_wmat(consensus_pairing_csv_file, read_thresh, min_uei_count, wmat_outfilename)
                sysOps.throw_status('Completed matOps.generate_wmat()')
                matOps.print_imagemodule_input(trg_dict, imagemodule_input_filename, key_filename, output_dim)
                #print_imagemodule_input outputs
                # 1. File key_filename containing 3 columns: 0 or 1 (for beacon or target, respectively), cluster-index, MLE processing index
                # 2. imagemodule_input_filename containing 3 columns: MLE processing index for beacon, MLE processing index for target, uei-count, max UEI read count
                # 3. Summary file containing: Number of beacons inputted to MLE, number of targets inputted to MLE,
            else:
                sysOps.throw_status('Image-module input pre-computed. Proceeding ...')

            #optimOps.test_ffgt()
            if sysOps.check_file_exists(imagemodule_input_filename):
                # The three boolean arguments of run_mle are set one-hot per mode below;
                # all False selects ptMLE (per the original trailing comments).
                if segment_infer:
                    optimOps.run_mle(imagemodule_input_filename, False, False, True, compute_local_solutions_only, ) # segmentation only
                elif msmle_infer:
                    optimOps.run_mle(imagemodule_input_filename, False, True, False, compute_local_solutions_only) # msMLE
                elif smle_infer:
                    optimOps.run_mle(imagemodule_input_filename, True, False, False, compute_local_solutions_only) # sMLE
                else:
                    optimOps.run_mle(imagemodule_input_filename, False, False, False, compute_local_solutions_only) # ptMLE

                if not compute_local_solutions_only:
                    dnamicOps.print_final_results('..//trg_amplicon_calls.csv', '..//trg_amplicon_calls.fasta')
                else:
                    # With compute_local_solutions_only set, sysOps.exitProgram() is
                    # called right after the first run_mle call.
                    # NOTE(review): presumably this terminates the process, so the
                    # remaining directories and close_task() are never reached -- confirm.
                    sysOps.exitProgram()
            else:
                sysOps.throw_status('Could not locate ' + sysOps.globaldatapath + imagemodule_input_filename)

        sysOps.globaldatapath = str(original_datapath)
        if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
            sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
            sysOps.exitProgram()

    return
def sim_reads(self):
    """Write simulated paired-end FASTQ libraries from '*_sim_ueifile.csv' files.

    Random forward UMIs (self.Nbcn of them), UEIs (self.Nuei) and reverse UMIs
    (self.Ntrg) are drawn from the sequence forms held by a libOps.libObj built
    from 'libsettings.txt'. For every file name ending in '_sim_ueifile.csv'
    returned by sysOps.get_directory_and_file_list(), the rows are expanded to
    one row per read, shuffled, and written as '<prefix>_for.fastq' and
    '<prefix>_rev.fastq', where <prefix> is the file name up to its first '_'.
    A directory named <prefix> is then created containing a copy of
    'libsettings.txt' whose '-source_for'/'-source_rev' lines point at the new
    FASTQ files.

    Each row of a ueifile is read as: UEI index, forward-UMI index,
    reverse-UMI index, read count (columns 0-3 as indexed below).
    """
    simLibObj = libOps.libObj(settingsfilename='libsettings.txt', output_prefix='_')
    enforced_rev_read_len = 100
    [for_read_len, rev_read_len] = simLibObj.get_min_allowed_readlens(
        simLibObj.filter_amplicon_window)
    # The reverse read length returned above is overridden by the fixed value.
    rev_read_len = int(enforced_rev_read_len)
    ''' simLibObj.seqform_for_params and simLibObj.seqform_rev_params are already stored in current object's memory Form of these variables is a list of the following: Element 1: [start_pos,end_pos] Element 2: np.ndarray(seq_bool_vec, dtype=np.bool_) Element 3: np.ndarray(capital_bool_vec, dtype=np.bool_) Element 4: np.ndarray(ambig_vec, dtype=np.bool_) '''
    # NOTE(review): subdirnames and rev_umi_amplicon_list are never used below.
    [subdirnames, filenames] = sysOps.get_directory_and_file_list()
    for_umi_seqs = list()  # entries: [forward param index, UMI sequence]
    rev_umi_seqs = list()  # entries: [reverse param index, UMI sequence, amplicon sequence]
    rev_umi_amplicon_list = list()
    uei_seqs = list()  # entries: UEI sequence
    base_order = 'ACGT'  # position k within each group of 4 booleans maps to base_order[k]
    sysOps.throw_status('Generating simulated sequences ...')
    amplicon_list = list()
    if "-amplicon" in simLibObj.mySettings:
        # Each '-amplicon' setting is upper-cased and split on ',' into its parts.
        amplicon_list = [
            simLibObj.mySettings["-amplicon"][i].upper().split(',')
            for i in range(len(simLibObj.mySettings["-amplicon"]))
        ]

    # --- forward (beacon) UMIs ---
    for for_umi_i in range(self.Nbcn):
        for_param_index = np.random.randint(
            len(simLibObj.seqform_for_params))
        if len(simLibObj.seqform_for_params[for_param_index]) > 1:
            sysOps.throw_exception(
                'Error: len(simLibObj.seqform_for_params[for_param_index]) = '
                + str(len(simLibObj.seqform_for_params[for_param_index])))
            sysOps.exitProgram()
        my_for_umi_param = simLibObj.seqform_for_params[for_param_index][
            0]['U'][0]
        [start_pos, end_pos] = my_for_umi_param[0]
        seq_bool_vec = my_for_umi_param[1]
        my_for_umi = str('')
        for pos in range(end_pos - start_pos):
            # Draw uniformly among the bases flagged True at this position.
            possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) * 4)])[0]
            my_for_umi += base_order[possible_bases[np.random.randint(
                possible_bases.shape[0])]]
        for_umi_seqs.append([int(for_param_index), str(my_for_umi)])

    # --- UEIs (second 'U' element of the forward sequence form) ---
    for for_uei_i in range(self.Nuei):
        for_param_index = 0  # there should be no difference across UMI's
        my_for_uei_param = simLibObj.seqform_for_params[for_param_index][
            0]['U'][1]
        [start_pos, end_pos] = my_for_uei_param[0]
        seq_bool_vec = my_for_uei_param[1]
        my_for_uei = str('')
        for pos in range(end_pos - start_pos):
            possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) * 4)])[0]
            my_for_uei += base_order[possible_bases[np.random.randint(
                possible_bases.shape[0])]]
        uei_seqs.append(str(my_for_uei))

    # --- reverse (target) UMIs, each with an amplicon sequence ---
    for rev_umi_i in range(self.Ntrg):
        rev_param_index = np.random.randint(
            len(simLibObj.seqform_rev_params))
        my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][
            0]['U'][0]
        [start_pos, end_pos] = my_rev_umi_param[0]
        seq_bool_vec = my_rev_umi_param[1]
        my_rev_umi = str('')
        for pos in range(end_pos - start_pos):
            possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) * 4)])[0]
            my_rev_umi += base_order[possible_bases[np.random.randint(
                possible_bases.shape[0])]]
        if len(amplicon_list) == 0:
            encoded_amplicon = str('')
        else:
            this_gsp_primer_amplicon_pair = list(
                amplicon_list[np.random.randint(len(amplicon_list))]
            )  # already properly oriented
            # generate single error on amplicon: one random position of element 1
            # is overwritten with a random base (which may equal the original base)
            lenamp = len(this_gsp_primer_amplicon_pair[1])
            rand_loc = np.random.randint(lenamp)
            this_gsp_primer_amplicon_pair[1] = str(
                this_gsp_primer_amplicon_pair[1][:rand_loc] +
                base_order[np.random.randint(4)] +
                this_gsp_primer_amplicon_pair[1][(rand_loc + 1):])
            encoded_amplicon = ''.join(this_gsp_primer_amplicon_pair)
        # Append the target index written in base 4 (digits A,C,G,T, most
        # significant first) to the amplicon sequence.
        # NOTE(review): the digit count comes from a floating-point log ratio;
        # confirm it cannot round down for exact powers of 4.
        tmp_umi_index = float(rev_umi_i)
        if tmp_umi_index == 0:
            encoded_amplicon += base_order[0]
        else:
            for myexponent in range(
                    int(np.floor(np.log(tmp_umi_index) / np.log(4.0))), -1,
                    -1):
                mydigit = np.floor(tmp_umi_index / np.power(4.0, myexponent))
                encoded_amplicon += base_order[int(mydigit)]
                tmp_umi_index -= mydigit * np.power(4.0, myexponent)
        rev_umi_seqs.append(
            [int(rev_param_index),
             str(my_rev_umi),
             str(encoded_amplicon)])

    sysOps.throw_status('Writing simulated reads ...')
    for filename in filenames:
        if filename.endswith('_sim_ueifile.csv'):
            ueifile = np.int64(
                np.loadtxt(sysOps.globaldatapath + filename, delimiter=','))
            newdirname = filename[:filename.find('_')]
            read_list = list()
            for i in range(ueifile.shape[0]):
                # Column 3 is the read count: repeat columns 0-2 once per read.
                for myread in range(ueifile[i, 3]):
                    read_list.append(np.array([ueifile[i, :3]]))
            read_list = np.concatenate(
                read_list, axis=0
            )  # re-write array so that there is now one row per read
            # randomly permute:
            read_list = read_list[
                np.random.permutation(read_list.shape[0]), :]
            for_chararray = np.chararray((for_read_len))
            rev_chararray = np.chararray((rev_read_len))
            # NOTE(review): the FASTQ files (and the directory/settings file
            # below) use paths relative to the working directory, whereas the
            # ueifile is read from sysOps.globaldatapath -- confirm intended.
            for_fastq_outfile = open(newdirname + '_for.fastq', "w")
            rev_fastq_outfile = open(newdirname + '_rev.fastq', "w")
            for i in range(read_list.shape[0]):
                # read_list columns: 0 = UEI index, 1 = forward-UMI index,
                # 2 = reverse-UMI index.
                for_param_index = for_umi_seqs[read_list[i, 1]][0]
                for_umi_seq = for_umi_seqs[read_list[i, 1]][1]
                rev_param_index = rev_umi_seqs[read_list[i, 2]][
                    0]  # both beacon and target indices are at this point are independently indexed from 0
                rev_umi_seq = rev_umi_seqs[read_list[i, 2]][1]
                rev_amp_seq = rev_umi_seqs[read_list[i, 2]][2]
                uei_seq = uei_seqs[read_list[i, 0]]
                # Reset both reads to all-'N' before filling in the known segments.
                for j in range(for_read_len):
                    for_chararray[j] = 'N'
                for j in range(rev_read_len):
                    rev_chararray[j] = 'N'
                # Forward read: UMI, then UEI, then every 'P' element.
                my_for_umi_param = simLibObj.seqform_for_params[
                    for_param_index][0]['U'][0]
                [start_pos, end_pos] = my_for_umi_param[0]
                for j in range(end_pos - start_pos):
                    for_chararray[j + start_pos] = for_umi_seq[j]
                my_for_uei_param = simLibObj.seqform_for_params[
                    for_param_index][0]['U'][1]
                [start_pos, end_pos] = my_for_uei_param[0]
                for j in range(end_pos - start_pos):
                    for_chararray[j + start_pos] = uei_seq[j]
                for my_for_param in simLibObj.seqform_for_params[
                        for_param_index][0]['P']:
                    [start_pos, end_pos] = my_for_param[0]
                    for j in range(end_pos - start_pos):
                        # The first base flagged True at each position is written.
                        for_chararray[j + start_pos] = base_order[np.where(
                            my_for_param[1][(4 * j):(4 * (j + 1))])[0][0]]
                # Reverse read: UMI, amplicon, then any 'P' elements.
                my_rev_umi_param = simLibObj.seqform_rev_params[
                    rev_param_index][0]['U'][0]
                [start_pos, end_pos] = my_rev_umi_param[0]
                for j in range(end_pos - start_pos):
                    rev_chararray[j + start_pos] = rev_umi_seq[j]
                my_rev_amp_param = simLibObj.seqform_rev_params[
                    rev_param_index][0]['A'][0]
                # Only the amplicon start position is used; its length comes
                # from the simulated amplicon sequence itself.
                start_pos = my_rev_amp_param[0][0]
                for j in range(len(rev_amp_seq)):
                    rev_chararray[j + start_pos] = rev_amp_seq[j]
                if 'P' in simLibObj.seqform_rev_params[rev_param_index][0]:
                    for my_rev_param in simLibObj.seqform_rev_params[
                            rev_param_index][0]['P']:
                        [start_pos, end_pos] = my_rev_param[0]
                        for j in range(end_pos - start_pos):
                            rev_chararray[j + start_pos] = base_order[np.where(
                                my_rev_param[1][(4 * j):(
                                    4 * (j + 1))])[0][0]]
                # Record ids are '-<read number>-<UMI index>'; every base gets
                # a phred quality of 30.
                for_record = SeqIO.SeqRecord(
                    Seq.Seq(for_chararray.tostring()))
                for_record.id = '-' + str(i) + '-' + str(read_list[i, 1])
                for_record.description = ''
                for_record.letter_annotations['phred_quality'] = list(
                    [30 for j in range(for_read_len)])
                rev_record = SeqIO.SeqRecord(
                    Seq.Seq(rev_chararray.tostring()))
                rev_record.id = '-' + str(i) + '-' + str(read_list[i, 2])
                rev_record.description = ''
                rev_record.letter_annotations['phred_quality'] = list(
                    [30 for j in range(rev_read_len)])
                SeqIO.write(for_record, for_fastq_outfile, "fastq")
                SeqIO.write(rev_record, rev_fastq_outfile, "fastq")
            for_fastq_outfile.close()
            rev_fastq_outfile.close()
            # Create the per-library directory and give it a settings file that
            # points back at the FASTQ files just written.
            os.mkdir(newdirname)
            with open('libsettings.txt', 'rU') as oldsettingsfile:
                with open(newdirname + '//libsettings.txt',
                          'w') as newsettingsfile:
                    for oldsettings_row in oldsettingsfile:
                        if oldsettings_row.startswith('-source_for'):
                            newsettingsfile.write('-source_for ..//' +
                                                  newdirname + '_for.fastq\n')
                        elif oldsettings_row.startswith('-source_rev'):
                            newsettingsfile.write('-source_rev ..//' +
                                                  newdirname + '_rev.fastq\n')
                        else:
                            newsettingsfile.write(oldsettings_row)
    sysOps.throw_status('Done.')
    return
def _compare_read_clusters(clustfile):
    """Yield (index-string, uxi-sequence, read-count) per well-formed line of clustfile.

    Lines are split on '_'; lines that do not yield exactly three fields are ignored.
    """
    with open(sysOps.globaldatapath + clustfile, 'rU') as clust_handle:
        for clust_line in clust_handle:
            my_el = clust_line.strip('\n').split('_')
            if len(my_el) == 3:
                yield my_el[0], str(my_el[1]), int(my_el[2])


def _compare_sum_reads(clustfile):
    """Return {index-string: {'reads': total reads, 'is_shared': False}} for clustfile."""
    abund_dict = dict()
    for index_str, _, my_numreads in _compare_read_clusters(clustfile):
        if index_str not in abund_dict:
            abund_dict[index_str] = {'reads': my_numreads, 'is_shared': False}
        else:
            abund_dict[index_str]['reads'] += my_numreads
    return abund_dict


def _compare_passes_filters(uxi, filter_substr_list, filter_val):
    """Return True if uxi has no disallowed substring and no over-represented base."""
    if any(my_substr in uxi for my_substr in filter_substr_list):
        return False
    # Counting per base (rather than indexing into 'ACGT') tolerates ambiguous
    # characters such as 'N' and empty sequences instead of raising.
    return max(uxi.count(base) for base in 'ACGT') <= filter_val * len(uxi)


def _compare_tally(abund_dict):
    """Return (unique shared, unique unshared, reads shared, reads unshared) for abund_dict."""
    unique_shared = unique_unshared = reads_shared = reads_unshared = 0
    for uxi_index in abund_dict:
        if abund_dict[uxi_index]['is_shared']:
            unique_shared += 1
            reads_shared += abund_dict[uxi_index]['reads']
        else:
            unique_unshared += 1
            reads_unshared += abund_dict[uxi_index]['reads']
    return unique_shared, unique_unshared, reads_shared, reads_unshared


def compare(clustfile1,
            clustfile2,
            comparison_file_name,
            rev_comp,
            read_thresh=2,
            filter_substr_list=None,
            filter_val=0.75):
    """Compare the UXI sequences of two cluster files and write the shared ones.

    Both files are read relative to sysOps.globaldatapath; each useful line has
    the form '<cluster index>_<uxi sequence>_<read count>'. A sequence takes
    part in the comparison only if its cluster's total read count is at least
    read_thresh, it contains none of the substrings in filter_substr_list, and
    no single base makes up more than filter_val of its length. All filtering
    happens here, at the front end.

    For every sequence of clustfile2 found in clustfile1 a CSV line is written
    to 'cross_comparisons//<comparison_file_name>' containing: sequence,
    cluster index / sequence reads / cluster reads for file 1, then the same
    three values for file 2.

    Args:
        clustfile1, clustfile2: cluster file names.
        comparison_file_name: name of the output file inside cross_comparisons.
        rev_comp: if True, clustfile2 sequences are reverse-complemented before
            filtering and matching against clustfile1.
        read_thresh: minimum total reads per cluster.
        filter_substr_list: disallowed substrings (None means no substrings).
        filter_val: maximum fraction of bases in a uxi allowed to be the same.

    Returns:
        [num_unique_shared, num_unique_unshared, read_abundance_shared,
        read_abundance_unshared], each a two-element list holding the value
        for clustfile1 and clustfile2 respectively, counted per cluster index.
    """
    if filter_substr_list is None:
        filter_substr_list = []

    print("Beginning comparison between " + clustfile1 + " and " + clustfile2)

    # Stage 1: total read-abundance of every cluster in each file.
    abund_dict1 = _compare_sum_reads(clustfile1)
    abund_dict2 = _compare_sum_reads(clustfile2)

    # Stage 2: map each accepted uxi sequence to
    # [cluster index, reads of this sequence, is-shared flag].
    dict_clust1 = dict()
    for index_str, this_uxi, my_numreads in _compare_read_clusters(clustfile1):
        uxi_index = int(index_str)
        if (abund_dict1[index_str]['reads'] >= read_thresh
                and _compare_passes_filters(this_uxi, filter_substr_list,
                                            filter_val)):
            dict_clust1[this_uxi] = [uxi_index, my_numreads, False]

    print(
        "Completed first cluster-file input. Second cluster-file being read, output to cross_comparisons//"
        + comparison_file_name)

    dict_clust2 = dict()
    # The context manager guarantees the output file is closed even if an
    # exception is raised or sysOps.exitProgram() is called part-way through.
    with open(
            sysOps.globaldatapath + 'cross_comparisons//' +
            comparison_file_name, 'w') as comparison_handle:
        for index_str, my_uxi, my_numreads in _compare_read_clusters(
                clustfile2):
            uxi_index = int(index_str)  # references clustfile2
            # my_uxi references clustfile2 uxi sequences,
            # this_uxi references clustfile1 uxi sequences
            this_uxi = str(my_uxi)
            if rev_comp:
                this_uxi = str(Seq.Seq(this_uxi).reverse_complement())

            if not (abund_dict2[index_str]['reads'] >= read_thresh
                    and _compare_passes_filters(this_uxi, filter_substr_list,
                                                filter_val)):
                continue
            dict_clust2[my_uxi] = [uxi_index, my_numreads, False]
            if this_uxi not in dict_clust1:
                continue

            dict_clust1[this_uxi][2] = True
            dict_clust2[my_uxi][2] = True
            index1_str = str(dict_clust1[this_uxi][0])
            index2_str = str(uxi_index)
            # The abundance dictionaries are keyed by the raw index text, so an
            # index that does not round-trip through int() is reported here.
            if index1_str not in abund_dict1:
                sysOps.throw_exception('A: ' + index1_str +
                                       ' not in dict_uxi_indices1')
                sysOps.exitProgram()
            if index2_str not in abund_dict2:
                sysOps.throw_exception('B: ' + index2_str +
                                       ' not in dict_uxi_indices2')
                sysOps.exitProgram()
            abund_dict1[index1_str]['is_shared'] = True
            abund_dict2[index2_str]['is_shared'] = True
            comparison_handle.write(
                str(this_uxi) + "," + str(dict_clust1[this_uxi][0]) + "," +
                str(dict_clust1[this_uxi][1]) + "," +
                str(abund_dict1[index1_str]['reads']) + "," +
                str(dict_clust2[my_uxi][0]) + "," +
                str(dict_clust2[my_uxi][1]) + "," +
                str(abund_dict2[str(dict_clust2[my_uxi][0])]['reads']) + "\n")

    # count number unique shared and unique unshared
    tally1 = _compare_tally(abund_dict1)
    tally2 = _compare_tally(abund_dict2)
    num_unique_shared = [tally1[0], tally2[0]]
    num_unique_unshared = [tally1[1], tally2[1]]
    read_abundance_shared = [tally1[2], tally2[2]]
    read_abundance_unshared = [tally1[3], tally2[3]]

    return [
        num_unique_shared, num_unique_unshared, read_abundance_shared,
        read_abundance_unshared
    ]
def gather_rarefaction_data(conditions_filename = 'conditions.csv', outfilename = 'rarefaction_file.txt', raw_uxi_files = ['_for_uxi0.fasta','_for_uxi1.fasta','_rev_uxi0.fasta']):
    """Write cluster-diversity rarefaction tables for every library directory.

    Library directory names ('lib_<a>_<b>_<c>') are built from the first three
    comma-separated fields of each line of conditions_filename. For each
    directory and each sub-sampling depth (500, doubling each round) the
    clustered files 'thresh1_identical_sub<depth><raw uxi file>' are read; when
    one cannot be opened the un-subsampled 'thresh1_identical_<raw uxi file>'
    is used instead, the depth is reported as the raw read count, and the
    directory is finished after that round.

    Three files are written under sysOps.globaldatapath, prefixed '1r_', '2r_'
    and '3r_' to outfilename. Each line holds: directory, depth, then for every
    entry of raw_uxi_files the number of clusters with at least 1, 2 or 3 reads
    respectively.

    Args:
        conditions_filename: CSV file listing the libraries, in output order.
        outfilename: suffix shared by the three output files.
        raw_uxi_files: file-name suffixes of the UMI/UEI files to summarize;
            the first one is also used to count raw reads. The default list is
            never modified here.
    """
    #use conditions conditions_filename to specify output order
    dirnames = list()
    with open(sysOps.globaldatapath + conditions_filename, 'rU') as conditions_handle:
        for myline in conditions_handle:
            thisline = myline.strip('\n').split(',')
            dirnames.append('lib_' + str(thisline[0]) + '_' + str(thisline[1]) + '_' + str(thisline[2]))

    # NOTE(review): these handles are only closed at the very end of the function;
    # an exception raised outside the try-block below leaves them open.
    outfile_1r = open(sysOps.globaldatapath +'1r_' + outfilename,'w')
    outfile_2r = open(sysOps.globaldatapath +'2r_' + outfilename,'w')
    outfile_3r = open(sysOps.globaldatapath +'3r_' + outfilename,'w')

    for dir in dirnames:  # NOTE(review): 'dir' shadows the builtin of the same name
        print 'Gathering rarefaction data for directory ' + sysOps.globaldatapath + dir
        sum_reads_raw = 0
        with open(sysOps.globaldatapath +dir + '/' + raw_uxi_files[0],'rU') as uxi_file_handle: #first UMI/UEI file in list to count raw reads
            for uxi_record in SeqIO.parse(uxi_file_handle,'fasta'):
                sum_reads_raw += 1

        subsample = 500
        terminate = False
        while not terminate:
            all_diversities = []  # one [1r, 2r, 3r] diversity triple per raw uxi file
            try:
                for my_raw_uxi_file in raw_uxi_files:
                    try:
                        cluster_file_handle = open(sysOps.globaldatapath +dir + '/thresh1_identical_sub' + str(subsample) + my_raw_uxi_file,'rU')
                        consensus_pairing_csv_file = dir + '/consensus_2r_sub' + str(subsample) + 'pairing_filter0.75_uei_umi.csv'
                    except:
                        # No file at this sub-sampling depth: fall back to the
                        # full data set and stop after this round.
                        terminate = True
                        try:
                            cluster_file_handle = open(sysOps.globaldatapath +dir + '/thresh1_identical_' + my_raw_uxi_file,'rU')
                            consensus_pairing_csv_file = dir + '/consensus_2r_pairing_filter0.75_uei_umi.csv'
                        except:
                            sysOps.throw_exception('Directory ' + sysOps.globaldatapath + dir + ' does not contain clustered file' + sysOps.globaldatapath +dir + '/thresh1_identical_' + my_raw_uxi_file + '. Skipping ...')
                            # NOTE(review): after this break the code below still
                            # runs with whatever all_diversities and
                            # consensus_pairing_csv_file currently hold (the latter
                            # may be undefined or left over from an earlier
                            # directory) -- confirm this is the intended "skip".
                            break
                        # The full data set is reported at a depth equal to the raw read count.
                        subsample = sum_reads_raw

                    # Sum reads per cluster index (field 0); field 2 is the read count.
                    cluster_dict = dict()
                    for myline in cluster_file_handle:
                        thisline = myline.strip('\n').split('_')
                        if thisline[0] in cluster_dict:
                            cluster_dict[thisline[0]] += int(thisline[2])
                        else:
                            cluster_dict[thisline[0]] = int(thisline[2])
                    cluster_file_handle.close()

                    diversity = [0,0,0] #first element is 1-read-gated diversity, second is 2-read-gated, third is 3-read-gated
                    for el in cluster_dict:
                        if cluster_dict[el]>=3:
                            diversity[0] += 1
                            diversity[1] += 1
                            diversity[2] += 1
                        elif cluster_dict[el]>=2:
                            diversity[0] += 1
                            diversity[1] += 1
                        else:
                            diversity[0] += 1

                    all_diversities.append(diversity)

                #if sysOps.check_file_exists(consensus_pairing_csv_file):
                if False: #temp
                    # Disabled branch: never executed.
                    # NOTE(review): this call passes more arguments, and unpacks
                    # more return values, than the generate_wmat definition
                    # visible elsewhere in this file -- verify before re-enabling.
                    sysOps.throw_status('Found ' + sysOps.globaldatapath + consensus_pairing_csv_file + '.')
                    min_uei_count = 2
                    min_umi_readcount = 2
                    outname = 'minb' + str(min_uei_count) + 'v' + str(0) + '_' + str(min_umi_readcount) + 'r_filter0.75'
                    wmat_outfilename = 'noabundcorr_wmat_' + outname + '.csv'
                    sysOps.throw_status('Calling matOps.generate_wmat()')
                    [num_unique_trg, num_unique_bcn, trg_dict] = matOps.generate_wmat(consensus_pairing_csv_file, min_umi_readcount, min_umi_readcount, min_uei_count, wmat_outfilename = None)
                    if num_unique_bcn>0:
                        filtered_minb_diversity_2r = [num_unique_bcn, sum([trg_dict[trg_el] for trg_el in trg_dict]), num_unique_trg]
                    else:
                        filtered_minb_diversity_2r = [0,0,0]
                else:
                    # Always taken while the branch above is disabled, so the
                    # "not found" status is emitted without any existence check
                    # and the 2r line ends with an empty trailing field.
                    sysOps.throw_status(sysOps.globaldatapath + consensus_pairing_csv_file + ' not found.')
                    filtered_minb_diversity_2r = []

                outfile_1r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[0]) for my_diversity in all_diversities])]) + '\n')
                outfile_2r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[1]) for my_diversity in all_diversities]), ','.join([str(s) for s in filtered_minb_diversity_2r])]) + '\n')
                outfile_3r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[2]) for my_diversity in all_diversities])]) + '\n')

            except:
                # NOTE(review): any error in this round (including malformed
                # cluster lines) silently ends processing of this directory.
                terminate = True

            subsample *= 2

    outfile_1r.close()
    outfile_2r.close()
    outfile_3r.close()
def crosscomparison_analysis(self, args):
    """Run alignOps.compare on every pair of library directories and tabulate results.

    args[1] is the name of the file to compare inside each library directory;
    args[2] names a CSV file (relative to sysOps.globaldatapath) whose first
    three fields per line give the library directory 'lib_<a>_<b>_<c>'.

    For each pair (ind1 > ind2) the per-pair output is written by
    alignOps.compare into the freshly created 'cross_comparisons' directory.
    Four square matrices (unique shared, unique unshared, shared reads,
    unshared reads) are filled with the values for directory ind1 at
    [ind1][ind2] and for directory ind2 at [ind2][ind1]; unfilled cells stay
    -1. The matrices are printed and written, one after another, to
    'comparison_matrices.csv'.
    """
    sysOps.initiate_statusfilename()
    file_to_compare = args[1]

    list_of_dirs = []
    with open(sysOps.globaldatapath + args[2], 'rU') as csvfile:
        for row in csvfile:
            fields = row.strip('\n').split(',')
            list_of_dirs.append('_'.join(
                ['lib', str(fields[0]), str(fields[1]), str(fields[2])]))

    print("Beginning comparison analysis")
    print("File to compare = " + file_to_compare)
    print("Directories = " + ",".join(list_of_dirs))

    try:
        os.mkdir(sysOps.globaldatapath + 'cross_comparisons')
    except:
        sysOps.throw_exception(
            'cross_comparisons directory already exists. Terminating comparison analysis.'
        )
        sysOps.exitProgram()

    # Square matrices initialised to -1 (diagonal cells are never filled).
    num_dirs = len(list_of_dirs)
    shared_num_unique_matrix = [[-1] * num_dirs for _ in range(num_dirs)]
    unshared_num_unique_matrix = [[-1] * num_dirs for _ in range(num_dirs)]
    shared_read_abund_matrix = [[-1] * num_dirs for _ in range(num_dirs)]
    unshared_read_abund_matrix = [[-1] * num_dirs for _ in range(num_dirs)]
    all_matrices = (shared_num_unique_matrix, unshared_num_unique_matrix,
                    shared_read_abund_matrix, unshared_read_abund_matrix)

    for ind1, dir1 in enumerate(list_of_dirs):
        for ind2 in range(ind1):
            dir2 = list_of_dirs[ind2]
            #remove superdirectory structure of path -- requires individual directories have unique names
            dir1_abbrev = dir1[(dir1.rfind('/') + 1):]
            dir2_abbrev = dir2[(dir2.rfind('/') + 1):]
            comparison_name = (dir1_abbrev + "_" + dir2_abbrev + "_" +
                               file_to_compare)

            sysOps.throw_status('Began writing cross_comparisons//' +
                                comparison_name)
            [
                num_unique_shared, num_unique_unshared,
                read_abundance_shared, read_abundance_unshared
            ] = alignOps.compare(dir1 + "//" + file_to_compare,
                                 dir2 + "//" + file_to_compare,
                                 comparison_name, False)
            sysOps.throw_status('Completed writing cross_comparisons//' +
                                comparison_name)

            # Element 0 of each result belongs to dir1, element 1 to dir2.
            pair_results = (num_unique_shared, num_unique_unshared,
                            read_abundance_shared, read_abundance_unshared)
            for matrix, pair in zip(all_matrices, pair_results):
                matrix[ind1][ind2] = pair[0]
                matrix[ind2][ind1] = pair[1]
            print(
                str(num_unique_unshared[0]) +
                '-> unshared_num_unique_matrix[ ' + str(ind1) + '][' +
                str(ind2) + ']')

    for matrix in all_matrices:
        print(matrix)

    with open('comparison_matrices.csv', 'w') as compare_matrix_file:
        for matrix in all_matrices:
            for matrix_row in matrix:
                compare_matrix_file.write(
                    ','.join([str(j) for j in matrix_row]) + '\n')
def _seqform_entry(boundaries, seq):
    '''Return [boundaries, seq_bool_vec, capital_bool_vec, ambig_vec] for one element.

    The three zero-initialised boolean arrays (4, 4 and 1 entries per base of
    seq) are handed to ambig_seq_to_np, which is expected to fill them in place.
    '''
    seq_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
    capital_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
    ambig_vec = np.zeros(len(seq), dtype=np.bool_)
    ambig_seq_to_np(seq, seq_bool_vec, capital_bool_vec, ambig_vec)
    return [boundaries, seq_bool_vec, capital_bool_vec, ambig_vec]


def parse_seqform(parseable, amplicon_option=None):
    '''
    Parse the value of a -seqform_for or -seqform_rev tag from the settings file.

    parseable is a '|'-separated list of elements of the form
        X_position1:position2      or      X_N_position1:position2
    where X is one of
        1. P -- primer
        2. S -- spacer
        3. A -- amplicon
        4. U -- uxi
    (X's may be repeated: there may be multiple primers, spacers, amplicons),
    N is an optional sequence against which input is later matched, and either
    position may be left empty to denote an open-ended boundary (stored as None).

    If amplicon_option is a string and an 'A' element has an open end position,
    one entry is created per comma-separated sub-sequence of amplicon_option,
    laid end to end from position1; the end of the last entry is left as None.

    Elements that cannot be parsed are reported and skipped; elements whose
    type character is not exactly one of P, S, A, U are ignored with a status
    message. A mismatch between the boundary span and the length of N is
    reported, but the element is still stored.

    Returns a dictionary mapping each type character to a list of entries
        [[position1, position2], seq_bool_vec, capital_bool_vec, ambig_vec]
    in order of appearance.
    '''
    my_seqform = dict()

    for this_parseable in parseable.split("|"):
        my_elements = this_parseable.split("_")
        try:
            my_char = my_elements[0].upper()
            if len(my_elements) < 3:
                seq = ""
                boundary_text = my_elements[1].split(":")
            else:
                seq = my_elements[1]
                boundary_text = my_elements[2].split(":")

            if len(boundary_text[0]) == 0:
                boundaries = [None, int(boundary_text[1])]
            elif len(boundary_text[1]) == 0:
                boundaries = [int(boundary_text[0]), None]
            else:
                boundaries = [int(boundary_text[0]), int(boundary_text[1])]
        except (IndexError, ValueError):
            print("Error parsing seqform " + this_parseable)
            sysOps.throw_exception(["Error parsing seqform " + this_parseable])
            # Skip the element: continuing would reuse undefined or stale
            # values of my_char/seq/boundaries from a previous element.
            continue

        # Only compare sizes when both boundaries are known; an open-ended
        # boundary (None) must not take part in the subtraction.
        if (len(my_elements) == 3 and None not in boundaries
                and boundaries[1] - boundaries[0] != len(seq)):
            sysOps.throw_exception(
                'Error: mismatch between filter boundary-indices and filter string-size, boundaries='
                + str(boundaries) + ", seq=" + seq)

        # Exact membership test: a substring test against "PSAU" would also
        # accept '' and multi-character types such as 'SA'.
        if my_char not in ("P", "S", "A", "U"):
            sysOps.throw_status([
                "Ignoring this_parseable=" + this_parseable +
                " -- unrecognized character-type."
            ])
            continue

        entries = my_seqform.setdefault(my_char, [])
        if (my_char == "A" and isinstance(amplicon_option, str)
                and boundaries[1] is None):
            # Lay the amplicon sub-sequences end to end from the start position.
            start_pos = int(boundaries[0])
            for sub_seq in amplicon_option.split(','):
                len_sub_seq = len(sub_seq)
                entries.append(
                    _seqform_entry([start_pos, start_pos + len_sub_seq],
                                   sub_seq))
                start_pos += len_sub_seq
            # since the original end boundary was open, re-set the final end to None
            entries[-1][0][1] = None
        else:
            entries.append(_seqform_entry(boundaries, seq))

    return my_seqform
elif sys.argv[0].endswith('infer'): compute_local_solutions_only = False if len(sys.argv) > 1 and sys.argv[1] == 'local': sysOps.throw_status('Performing local computing function alone.') compute_local_solutions_only = True if sys.argv[0] == 'smle_infer': sysOps.globalmasterProcess.dnamic_inference( True, False, False, compute_local_solutions_only) elif sys.argv[0] == 'msmle_infer': sysOps.globalmasterProcess.dnamic_inference( False, True, False, compute_local_solutions_only) elif sys.argv[0] == 'ptmle_infer': sysOps.globalmasterProcess.dnamic_inference( False, False, False, compute_local_solutions_only) elif sys.argv[0] == 'segment_infer': sysOps.globalmasterProcess.dnamic_inference( False, False, True, compute_local_solutions_only) elif sys.argv[0] == 'layout': upstream.generate_data_layout() elif (len(sys.argv) > 2 and sys.argv[0] == 'compare'): sysOps.globalmasterProcess.crosscomparison_analysis(sys.argv) elif (sys.argv[0] == 'stats'): summaryAnalysis.gather_rarefaction_data() summaryAnalysis.gather_raw_read_stats() summaryAnalysis.gather_stats() summaryAnalysis.gather_cluster_stats() else: sysOps.throw_exception('Unrecognized pipeline input: ' + str(sys.argv)) print "Completed run."