Code example #1
    def generate_uxi_library(self):
        # Perform sequence analysis (read-parsing, clustering, pairing UEIs/UMIs, sub-sampling data for rarefaction analyses)
        
        if not sysOps.check_file_exists('uxi_lib_tasklist.csv'):
            # create task list for library processing
            [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'uxi_lib_tasklist.csv','w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('generate_uxi_library;' + sysOps.globaldatapath + subdir + '//\n')
                            
        original_datapath = str(sysOps.globaldatapath)
        [my_task,time_start] = parallelOps.get_next_open_task('tasklog.csv', 'uxi_lib_tasklist.csv', 'generate_uxi_library')
        if not (my_task is None):

            sysOps.initiate_runpath(str(my_task[1]))
            myLibObj = libOps.libObj(settingsfilename = 'libsettings.txt', output_prefix = '_')
            if not sysOps.check_file_exists(myLibObj.output_prefix + 'lib_stats.txt'):
                myLibObj.partition_fastq_library(discarded_sequence_path = "discarded_sequences.fastq", mean_phred_score_path = "mean_phred_scores.txt")
            self.generate_cluster_analysis()
                
            libOps.subsample(myLibObj.seqform_for_params,myLibObj.seqform_rev_params, myLibObj.output_prefix)
            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list([subdirname for subdirname in subdirnames if subdirname.startswith('sub')])
            sysOps.throw_status('Performing cluster analysis on sub-directories: ' + str(dirnames))
            for dirname in dirnames:
                sysOps.initiate_runpath(str(my_task[1]) + dirname + '//')
                self.generate_cluster_analysis()
        
            sysOps.globaldatapath = str(original_datapath)   
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
                sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
                sysOps.exitProgram()
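
The task list consumed by parallelOps.get_next_open_task above is a plain semicolon-delimited text file with one task per line ('generate_uxi_library;<library sub-directory>/'). A minimal sketch of reading such a file back, assuming only that layout (list_pending_library_tasks is illustrative and not part of the project):

def list_pending_library_tasks(tasklist_path):
    # Each non-empty line: '<task-name>;<library sub-directory>/'
    tasks = []
    with open(tasklist_path) as task_file:
        for line in task_file:
            line = line.strip()
            if line:
                task_name, task_dir = line.split(';', 1)
                tasks.append((task_name, task_dir))
    return tasks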
Code example #2
File: alignOps.py Project: richardzhu/dnamic
def consolidate_uxi(uxi_file, start_index=0, prefix='', include_inv_amp=False):
    # Function generates file ("identical_" + uxi_file) with a list of identical unique uxi's (perfectly matched), with indices and the number of corresponding reads

    #aux_info_file, if provided, contains line-by-line auxiliary assignments (stagger + amplicon-identity, if either exist)
    uxi_lib = dict()
    #build dictionary directly in memory

    with open(sysOps.globaldatapath + uxi_file, 'rU') as fasta_handle:
        sysOps.throw_status('Proceeding with consolidation ...')
        for my_record in SeqIO.parse(fasta_handle, "fasta"):
            my_seq = str(my_record.seq)
            if my_seq in uxi_lib:
                uxi_lib[my_seq].append(str(my_record.id))
            else:
                uxi_lib[my_seq] = [my_record.id]
    uxi_len = len(my_seq)  # uxi length taken from the last sequence read (assumes a non-empty input file)

    if include_inv_amp:
        with open(
                sysOps.globaldatapath + uxi_file[:uxi_file.find('.')] +
                '_amp_inv' + uxi_file[uxi_file.find('.'):],
                'rU') as fasta_handle:
            sysOps.throw_status(
                'Proceeding with consolidation, including invalid amplicons ...'
            )
            for my_record in SeqIO.parse(fasta_handle, "fasta"):
                my_seq = str(my_record.seq)
                if my_seq in uxi_lib:
                    uxi_lib[my_seq].append(str(my_record.id))
                else:
                    uxi_lib[my_seq] = [my_record.id]

    uxi_list_handle = open(
        sysOps.globaldatapath + prefix + "identical_" + uxi_file, 'w')
    uxi_index = int(start_index)
    for my_uxi_key, my_uxi_record_ids in sorted(
            uxi_lib.items()):  #alphabetize by uxi sequence
        uxi_list_handle.write(
            str(my_uxi_key) + '_' + str(uxi_index) + '_' +
            str(len(my_uxi_record_ids)) +
            '\n')  #output line includes uxi index and number of reads
        for my_record_id in my_uxi_record_ids:
            uxi_list_handle.write(my_record_id + '\n')
        uxi_index += 1

    uxi_list_handle.close()

    del uxi_lib

    return [uxi_index, uxi_len
            ]  #returns total number of unique entries and length of uxi itself
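
Each block of the "identical_" file written above begins with a header line of the form '<uxi-sequence>_<cluster-index>_<read-count>', followed by one read ID per line. A hypothetical reader for that layout, using the same two-underscore header test the project applies elsewhere (it assumes read IDs do not themselves contain exactly two '_' characters):

def read_identical_uxi_file(path):
    # Returns {uxi sequence: (index, [read IDs])} for a file written by consolidate_uxi.
    entries = {}
    current_key = None
    with open(path) as handle:
        for line in handle:
            line = line.rstrip('\n')
            fields = line.split('_')
            if len(fields) == 3:  # header line: sequence_index_readcount
                current_key = fields[0]
                entries[current_key] = (int(fields[1]), [])
            elif current_key is not None:  # a read ID belonging to the current uxi
                entries[current_key][1].append(line)
    return entries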
Code example #3
def threshold_cluster_uxi_prelinked(uxi_list,identical_uxi_filename,threshold,P=0,subsample = -1, prefix = ''):
    
    # Function is called with uxi_list produced by load_linkage_file_to_list(linkage_file) in hashAlignments.py
    # Format of linkage file:
    #    uxi-sequence, self-read-number, RND: list of linked-to indices with self-index as first in line
    # linkage_list elements: [uxi-sequence,self-read-number,RND,[list of linked-to indices with self-index as first in line]])
            
    #sort uxi_list by decreasing RND
    num_uxi = len(uxi_list)
    sysOps.throw_status('Starting uxi list sort. List size = ' + str(num_uxi))
    sorted_uxi_list = sorted(uxi_list, key=lambda row: -row[2]) #note: sorted_uxi_list shares its row objects with uxi_list (sorting copies only the outer list)
    index_vals = [-1 for i in range(num_uxi)]
    sysOps.throw_status('Completed uxi list sort. Assigning EASL-clusters ...')
        
    for sorted_uxi_el in sorted_uxi_list: 
        #index_vals, with indices corresponding to _original_ positions in pre-sorted uxi_list, are initiated at -1 (stored in list at row[3])
        #uxi's accepted into a cluster whose seed has index i are given value i in index_vals
        #uxi's rejected from all classification remain at index -1 (flagged as an error below)
        if index_vals[sorted_uxi_el[3][0]] < 0: #if this seed has index -1 (has not been assigned to any seed itself)
            index_vals[sorted_uxi_el[3][0]] = int(sorted_uxi_el[3][0]) # set cluster seed to itself
            
        my_index_val = int(index_vals[sorted_uxi_el[3][0]])
        
        for i in range(1,len(sorted_uxi_el[3])):
            if index_vals[sorted_uxi_el[3][i]] < 0: #connected read is unassigned -- assign to current cluster seed
                index_vals[sorted_uxi_el[3][i]] = my_index_val

    sysOps.throw_status('Consolidating clustered uxis ...')
    #consolidate clustered uxi's
    
    if -1 in index_vals:
        sysOps.throw_exception('Error: UNASSIGNED/UNCLUSTERED uxis. Exiting program')
        sysOps.exitProgram()
        
    index_str_vals = [str(int(x)) for x in index_vals]
    new_uxi_dict= dict()
    
    for i in range(num_uxi):
        my_index_str = index_str_vals[i] 
        if my_index_str in new_uxi_dict:
            new_uxi_dict[my_index_str].append(uxi_list[i][0] + "_" + str(uxi_list[i][1]))
        else:
            new_uxi_dict[my_index_str] = [(uxi_list[i][0] + "_" + str(uxi_list[i][1]))]
            
    if(subsample<=0):
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_" + identical_uxi_filename,'w')
    else:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_sub" + str(subsample) + identical_uxi_filename,'w')
    
    i = 0
    for dict_el in new_uxi_dict:
        for el in new_uxi_dict[dict_el]:
            new_uxi_handle.write(str(i) + "_" + el + "\n")     
        i += 1   
        
    new_uxi_handle.close()
    
    print "Completed clustering."
    
    return True
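
Each line written to the 'thresh...' output above has the form '<cluster-index>_<uxi-sequence>_<read-number>', since every element appended to new_uxi_dict is already '<uxi-sequence>_<read-number>'. A small, hypothetical tally of cluster sizes from such a file:

def tally_cluster_sizes(thresh_file_path):
    # Number of distinct uxi sequences assigned to each cluster index.
    sizes = {}
    with open(thresh_file_path) as handle:
        for line in handle:
            cluster_index = line.strip('\n').split('_')[0]
            sizes[cluster_index] = sizes.get(cluster_index, 0) + 1
    return sizes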
Code example #4
File: matOps.py Project: richardzhu/dnamic
def generate_wmat(consensus_pairing_csv_file, minreadcount, min_uei_count, outfilename = 'wmat.csv'):
    #consensus_pairing_csv_file has elements: 
    #uei index, beacon-umi index, target-umi index, read-count
    #if outfilename == None, does not print data to new files
    
    [bcn_dict,trg_dict,
     bcn_abund_dict,trg_abund_dict,
     bcn_div_dict,trg_div_dict] = get_umi_uei_matrices(consensus_pairing_csv_file, minreadcount)       
    if len(trg_dict)==0 or len(bcn_dict)==0:
        sysOps.throw_exception(consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    sysOps.throw_status(['Generating feature list.',sysOps.statuslogfilename])
    trg_feature_dict_list = get_features_from_dict(trg_dict) #collects salient pieces of information on targets for printing in file later
    [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict] = filter_mats(bcn_dict, trg_dict,
                                                                   bcn_div_dict, trg_div_dict, min_uei_count)

    sysOps.throw_status(['Replacing matrix elements with UEI numbers (scalars).',sysOps.statuslogfilename])
    del bcn_dict
    sysOps.throw_status(['Generating weight matrix.',sysOps.statuslogfilename])
    
    if len(trg_dict)==0:
        sysOps.throw_exception('After filtering, ' + consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    if outfilename != None:
        print_features(trg_dict, 'trg_' + outfilename, trg_feature_dict_list)
    
    return trg_dict
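
The consensus pairing file consumed here is a plain CSV whose first four columns follow the layout in the comment above (UEI index, beacon-UMI index, target-UMI index, read-count). A minimal sketch of loading and read-count-filtering those rows, assuming only that layout (get_umi_uei_matrices itself is not reproduced in this listing):

import csv

def load_consensus_pairs(path, minreadcount):
    # Keep only pairings whose read-count meets the threshold.
    pairs = []
    with open(path) as handle:
        for row in csv.reader(handle):
            uei_index, bcn_umi, trg_umi, reads = (int(x) for x in row[:4])
            if reads >= minreadcount:
                pairs.append((uei_index, bcn_umi, trg_umi, reads))
    return pairs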
Code example #5
    def __init__(self, paramfilename):

        #Open parameter-file

        sysOps.throw_status("Reading from " + sysOps.globaldatapath +
                            paramfilename + " ...")
        sim_settings = fileOps.read_settingsfile_to_dictionary(
            sysOps.globaldatapath + paramfilename)
        self.effic_monomer = float(sim_settings['-effic_monomer'][0])
        self.effic_dimer = float(sim_settings['-effic_dimer'][0])
        self.diffconst = float(sim_settings['-diffconst'][0])
        self.lin_cycles = int(sim_settings['-lin_cycles'][0])
        self.exp_cycles = int(sim_settings['-exp_cycles'][0])
        self.posfilename = str(sim_settings['-posfilename'][0])
        # position file contains columns: UMI-index (stored as-is for later), 0 for bcn/1 for trg, x-coordinate, y-coordinate

        raw_image_csv = np.loadtxt(sysOps.globaldatapath + self.posfilename,
                                   delimiter=',')
        raw_image_csv = raw_image_csv[np.argsort(
            raw_image_csv[:, 1]), :]  # arranged as beacons followed by targets
        self.sim_pos = np.array(raw_image_csv[:, 2:], dtype=np.float64)
        self.sim_dims = self.sim_pos.shape[1]
        self.Nbcn = np.sum(raw_image_csv[:, 1] == 0)
        self.Ntrg = np.sum(raw_image_csv[:, 1] == 1)
        self.Nuei = int(sim_settings['-uei_per_bcn_umi'][0]) * self.Nbcn
        self.N_reads = int(sim_settings['-reads_per_uei'][0]) * self.Nuei
        self.index_key = np.int64(raw_image_csv[:, 0])

        self.sim_pos = np.append(
            self.sim_pos,
            np.ones([self.Nbcn + self.Ntrg, 1], dtype=np.float64),
            axis=1)  #number of starting molecules is always = 1
        sysOps.throw_status("Assigned point-dimensionality to " +
                            str(self.sim_dims))

        return
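
For orientation, the position file read above is a comma-delimited table with one row per UMI: index, 0 (beacon) or 1 (target), then the spatial coordinates. A hypothetical two-point, two-dimensional file could be produced as follows (file name and values are illustrative only):

import numpy as np

# columns: UMI-index, 0 (beacon)/1 (target), x, y
demo_positions = np.array([[0, 0, 1.5, 2.0],
                           [1, 1, 1.6, 2.1]], dtype=np.float64)
np.savetxt('sim_positions_demo.csv', demo_positions, delimiter=',')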
Code example #6
File: hashAlignments.py Project: richardzhu/dnamic
def initiate_hash_alignment(uxi_file, P=0.0):
    '''
    Takes in specific uxi_file, already formatted from source, consolidates identical sequences, performs hash-alignment, 
    and clusters them. Each of these tasks is skipped, in order, if it's found up-to-date based on dates-of-modification. 
    
    '''

    identical_uxi_file = 'identical_' + uxi_file

    consolidation_up_to_date = False
    clustering_up_to_date = False
    alignment_up_to_date = False

    [dirnames, filenames] = sysOps.get_directory_and_file_list()

    if identical_uxi_file in filenames:
        consolidation_up_to_date = (
            os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime >
            os.stat(sysOps.globaldatapath + uxi_file).st_mtime)
        #if time of last modification of identical-consolidation file is later than time of modification/writing of uxi_file
        sysOps.throw_status([
            'Consolidation up-to-date = ' + str(consolidation_up_to_date),
            sysOps.statuslogfilename
        ])

    if ('linked_' + identical_uxi_file) in filenames:
        alignment_up_to_date = (
            os.stat(sysOps.globaldatapath + 'linked_' +
                    identical_uxi_file).st_mtime >
            os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime)
        #if time of last modification of the hash-alignment linkage file is later than that of the identical-consolidation file
        sysOps.throw_status([
            'Alignment up-to-date = ' + str(alignment_up_to_date),
            sysOps.statuslogfilename
        ])

    if ('thresh1_' + identical_uxi_file) in filenames:
        clustering_up_to_date = (
            os.stat(sysOps.globaldatapath + 'thresh1_' +
                    identical_uxi_file).st_mtime >
            os.stat(sysOps.globaldatapath + identical_uxi_file).st_mtime)
        #if time of last modification of the threshold-clustering file is later than that of the identical-consolidation file
        sysOps.throw_status([
            'Clustering up-to-date = ' + str(clustering_up_to_date),
            sysOps.statuslogfilename
        ])

    if not (consolidation_up_to_date and alignment_up_to_date):

        #write placeholder file
        with open(sysOps.globaldatapath + 'thresh1_' + identical_uxi_file,
                  'w') as placeholderfile:
            placeholderfile.write('In progress.')

        if not consolidation_up_to_date:
            sysOps.throw_status([
                'Consolidation not up to date, consolidating file ' +
                sysOps.globaldatapath + uxi_file, sysOps.statuslogfilename
            ])
            [num_elements,
             uxi_len] = alignOps.consolidate_uxi(uxi_file,
                                                 start_index=0,
                                                 prefix='',
                                                 include_inv_amp=False)
        else:  #fetch uxi_len
            sysOps.throw_status([
                'Consolidation up to date, reading from file ' +
                sysOps.globaldatapath + identical_uxi_file,
                sysOps.statuslogfilename
            ])
            with open(sysOps.globaldatapath + identical_uxi_file,
                      'rU') as uxi_handle:
                for uxi_line in uxi_handle:
                    split_str = uxi_line.split('_')
                    if (len(split_str) == 3):
                        uxi_len = len(
                            split_str[0]
                        )  #first element of identical-sequence file is U(M/E)I sequence itself
                        break

        for mismatch_pos in range(
                uxi_len
        ):  #output members (referenced by index) and abundances of substrings corresponding to all characters except for the one at mismatch_pos
            #format as follows -- substring: member1-index_abundance1,member2-index_abundance2,...
            sysOps.throw_status([
                'Performing hash alignment on position ' + str(mismatch_pos),
                sysOps.statuslogfilename
            ])
            output_hashed_mismatch_alignment(
                identical_uxi_file, mismatch_pos,
                'mis' + str(mismatch_pos) + '_' + identical_uxi_file)

        sysOps.throw_status([
            'Hash alignments complete. Proceeding to assemble linked file.',
            sysOps.statuslogfilename
        ])
        generate_linkage_file(identical_uxi_file, [
            'mis' + str(mismatch_pos) + '_' + identical_uxi_file
            for mismatch_pos in range(uxi_len)
        ], "linked_" + identical_uxi_file, P)

        #now that linkage file has been constructed, delete hash-alignment files
        for hash_filename in [
                'mis' + str(mismatch_pos) + '_' + identical_uxi_file
                for mismatch_pos in range(uxi_len)
        ]:
            os.remove(sysOps.globaldatapath + hash_filename)

    if not clustering_up_to_date:
        sysOps.delay_with_alertfile('_cluster_inprog' + uxi_file)
        clustOps.threshold_cluster_uxi_prelinked(
            alignOps.load_linkage_file_to_list("linked_" + identical_uxi_file),
            identical_uxi_file, 1, P)
        clustering_up_to_date = True
        sysOps.remove_alertfile('_cluster_inprog' + uxi_file)

    return clustering_up_to_date
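
All three up-to-date checks above apply the same rule: an output file is considered current if its modification time is later than that of the file it was derived from. A small, hypothetical helper expressing that rule in one place:

import os

def output_is_current(input_path, output_path):
    # True if output_path exists and was modified more recently than input_path.
    return (os.path.isfile(output_path)
            and os.stat(output_path).st_mtime > os.stat(input_path).st_mtime)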
Code example #7
File: dnamicOps.py Project: richardzhu/dnamic
def print_final_results(trgcalls_filename, trgseq_filename):

    #output final_*.csv containing columns (index, -1 (beacon) / target-amplicon match, x, y, ..., segment)
    #output final_feat*.csv containing columns (index, features, consensus sequence (if target))
    #
    [dirnames, filenames] = sysOps.get_directory_and_file_list()
    seq_dat_filename = [
        filename for filename in filenames if filename.startswith('seq_params')
    ]
    seq_dat_filename = seq_dat_filename[0][len('seq_params_'):]

    for result_dat_file in filenames:
        if (result_dat_file.startswith('Xumi_') and
                not (sysOps.check_file_exists('final_' + result_dat_file))):
            key_dat_file = 'key' + seq_dat_filename[
                (seq_dat_filename.find('_')):]
            if sysOps.check_file_exists(key_dat_file):
                coords_dict = dict()
                sysOps.throw_status('Generating final output for ' +
                                    sysOps.globaldatapath +
                                    str(result_dat_file))
                result = np.loadtxt(sysOps.globaldatapath + result_dat_file,
                                    delimiter=',')
                for i in range(result.shape[0]):
                    coords_dict[str(int(result[i, 0]))] = ','.join(
                        [str(x) for x in result[i, 1:]])

                trg_match_dict = dict()
                trg_match_file = open(
                    sysOps.globaldatapath + trgcalls_filename, 'rU')
                trg_seq_file = open(sysOps.globaldatapath + trgseq_filename,
                                    'rU')

                for line, fasta_record in itertools.izip(
                        trg_match_file, SeqIO.parse(trg_seq_file, "fasta")):
                    [trg_umi_index, max_match, max_tally,
                     tot_tally] = line.strip('\n').split(',')
                    trg_match_dict[trg_umi_index] = [
                        str(max_match),
                        str(max_tally),
                        str(tot_tally),
                        str(fasta_record.seq)
                    ]

                trg_match_file.close()
                trg_seq_file.close()

                outfile = open(
                    sysOps.globaldatapath + '//final_' + result_dat_file, 'w')
                outfile_feat = open(
                    sysOps.globaldatapath + '//final_feat_' + result_dat_file,
                    'w')

                bcn_excluded = 0
                trg_excluded = 0
                with open(sysOps.globaldatapath + key_dat_file,
                          'rU') as key_file:
                    for line in key_file:
                        [bcn0trg1, orig_index,
                         mle_index] = line.strip('\n').split(',')
                        #key file columns: 0 or 1 (for beacon or target, respectively), cluster-index, MLE processing index
                        if mle_index in coords_dict:
                            outfile.write(orig_index + ',' +
                                          coords_dict[mle_index] + '\n')
                            if bcn0trg1 == '0':
                                outfile_feat.write(orig_index +
                                                   ',-1,-1,-1,N\n')
                            else:
                                outfile_feat.write(
                                    orig_index + ',' +
                                    ','.join(trg_match_dict[orig_index]) +
                                    '\n')
                        else:
                            if bcn0trg1 == '0':
                                bcn_excluded += 1
                            else:
                                trg_excluded += 1
                sysOps.throw_status(
                    str(bcn_excluded) + ' beacons, ' + str(trg_excluded) +
                    ' targets excluded from final estimation')
                outfile.close()
                outfile_feat.close()

            else:
                sysOps.throw_exception(sysOps.globaldatapath + key_dat_file +
                                       ' does not exist.')
    return
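
Each line of the final_*.csv written above pairs an original cluster index with the comma-joined coordinate (and segment) columns taken from the matching Xumi_ row. A hypothetical reader for that output:

import numpy as np

def load_final_coordinates(path):
    # Column 0 holds the original cluster index; the remaining columns are numeric output fields.
    data = np.atleast_2d(np.loadtxt(path, delimiter=','))
    indices = data[:, 0].astype(int)
    values = data[:, 1:]
    return indices, values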
Code example #8
File: dnamicOps.py Project: richardzhu/dnamic
def assign_consensus_pairs(pairing_csv_file, min_pairing_readcount):
    '''
    Assumes CSV file with columns:
    1. UEI cluster-index
    2. Beacon UMI cluster-index
    3. Target UMI cluster-index
    4. Read-number
    5. Set-index (0 throughout if invalid-amplicon reads are excluded)
    '''

    sysOps.throw_status('Loading pairing file ' + pairing_csv_file + ' ...')
    uei_clust_index_dict = dict()

    with open(sysOps.globaldatapath + pairing_csv_file, 'rU') as csvfile:
        for line in csvfile:
            row = line.strip('\n').split(',')
            index_str = str(row[0])  #UEI cluster-index
            if index_str in uei_clust_index_dict:
                uei_clust_index_dict[index_str].append(
                    [int(row[1]),
                     int(row[2]),
                     int(row[3]),
                     int(row[4])]
                )  #append dictionary entry as list with row holding the beacon- and target-umi cluster indices, the read-number, and the set-index (the set-index is 0 throughout if invalid-amplicon reads are excluded)
            else:
                uei_clust_index_dict[index_str] = [[
                    int(row[1]),
                    int(row[2]),
                    int(row[3]),
                    int(row[4])
                ]]

    #replace each entry with umi pairing having plurality of reads, in same indexed format
    sysOps.throw_status('Generating consensus-pairs ...')
    discarded_ueis = 0
    accepted_ueis = 0
    for uei_clust_el in uei_clust_index_dict:
        maxcount = 0
        secondmaxcount = 0  #detect ties, discard if tie exists
        maxcount_pair_bcn_index = -1
        maxcount_pair_trg_index = -1
        maxcount_set_index = -1
        for row in uei_clust_index_dict[uei_clust_el]:
            if (row[2] >= min_pairing_readcount and row[2] > maxcount):
                secondmaxcount = int(maxcount)
                if maxcount_set_index >= 0 and maxcount_set_index != row[3]:
                    sysOps.throw_exception('Error: set-index mismatch.')
                    sysOps.exitProgram()
                maxcount_pair_bcn_index = int(row[0])
                maxcount_pair_trg_index = int(row[1])
                maxcount = int(row[2])
                maxcount_set_index = int(row[3])
            elif (row[2] >= min_pairing_readcount and row[2] > secondmaxcount):
                secondmaxcount = int(row[2])

        if maxcount >= min_pairing_readcount and maxcount > secondmaxcount:
            # note: this condition requires not only that the uei have at least min_pairing_readcount reads,
            # but also that the plurality-tally itself be at least min_pairing_readcount
            uei_clust_index_dict[uei_clust_el] = list([
                int(maxcount_pair_bcn_index),
                int(maxcount_pair_trg_index),
                int(maxcount),
                int(maxcount_set_index)
            ])
            accepted_ueis += 1
        else:
            uei_clust_index_dict[uei_clust_el] = list()
            discarded_ueis += 1

    sysOps.throw_status('Outputting consensus-pairs with at least ' +
                        str(min_pairing_readcount) +
                        ' read-plurality. Accepted ' + str(accepted_ueis) +
                        ' UEIs, discarded ' + str(discarded_ueis) +
                        ' UEIs ...')
    #index outputted as uei-index, beacon-umi-index, target-umi-index, read-count
    outfile_handle = open(
        sysOps.globaldatapath + "consensus_" + str(min_pairing_readcount) +
        "r_" + pairing_csv_file, 'w')

    for uei_clust_el in uei_clust_index_dict:
        if len(uei_clust_index_dict[uei_clust_el]) > 0:
            outfile_handle.write(
                uei_clust_el + "," +
                ",".join([str(s)
                          for s in uei_clust_index_dict[uei_clust_el]]) + "\n")

    outfile_handle.close()

    return
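
As a small illustration of the plurality rule above (numbers hypothetical): a UEI cluster seen with pairing (beacon 5, target 9) at 4 reads and (beacon 5, target 11) at 2 reads keeps (5, 9); the same two pairings tied at 4 reads each would be discarded. A compact sketch of that decision for one UEI cluster:

def consensus_pair(rows, min_pairing_readcount):
    # rows: [bcn_index, trg_index, read_count] candidates for one UEI cluster.
    # Returns the winning row, or None on a tie or insufficient read-plurality.
    rows = sorted(rows, key=lambda r: -r[2])
    if not rows or rows[0][2] < min_pairing_readcount:
        return None
    if len(rows) > 1 and rows[1][2] == rows[0][2]:
        return None  # tie between the top two pairings
    return rows[0]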
Code example #9
File: dnamicOps.py Project: richardzhu/dnamic
def assign_umi_pairs(uei_cluster_file,
                     bcn_umi_cluster_file,
                     trg_umi_cluster_file,
                     uei_fasta_file,
                     bcn_umi_fasta_file,
                     trg_umi_fasta_file,
                     outfile_prefix,
                     filter_val=0.75,
                     include_inv_amp=False):

    #at most filter_val fraction of total bases in given uxi allowed to be the same

    #Cluster-files have row-formats: uxi-cluster-index_uxi-sequence_read-number
    #load_cluster_file_to_dictionary outputs dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}

    sysOps.throw_status("Finalizing consensus UMI sequences ...")
    uei_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        uei_cluster_file)
    bcn_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        bcn_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)

    uei_clust_readcount_tally = dict()
    bcn_umi_clust_readcount_tally = dict()
    trg_umi_clust_readcount_tally = dict()

    # initiate tally dictionaries addressed by clust index
    # one element per file_set_index (if invalid amplicon sequences are being excluded, only first index will be populated)
    for uei_seq in uei_cluster_dict:
        uei_clust_readcount_tally[str(uei_cluster_dict[uei_seq][0])] = [0, 0]
    for umi_seq in bcn_umi_cluster_dict:
        bcn_umi_clust_readcount_tally[str(
            bcn_umi_cluster_dict[umi_seq][0])] = [0, 0]
    for umi_seq in trg_umi_cluster_dict:
        trg_umi_clust_readcount_tally[str(
            trg_umi_cluster_dict[umi_seq][0])] = [0, 0]

    #outputs dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}
    #generate list of list of lists with index-order uei,umi
    uei_umi_dict = dict()
    inadmis_seq_count = 0
    admis_seq_count = 0

    uei_fasta_list = [uei_fasta_file]
    bcn_umi_fasta_list = [bcn_umi_fasta_file]
    trg_umi_fasta_list = [trg_umi_fasta_file]
    if include_inv_amp:
        uei_fasta_list.append(uei_fasta_file[:uei_fasta_file.find('.')] +
                              '_amp_inv' +
                              uei_fasta_file[uei_fasta_file.find('.'):])
        bcn_umi_fasta_list.append(
            bcn_umi_fasta_file[:bcn_umi_fasta_file.find('.')] + '_amp_inv' +
            bcn_umi_fasta_file[bcn_umi_fasta_file.find('.'):])
        trg_umi_fasta_list.append(
            trg_umi_fasta_file[:trg_umi_fasta_file.find('.')] + '_amp_inv' +
            trg_umi_fasta_file[trg_umi_fasta_file.find('.'):])

    file_set_index = 0
    sysOps.throw_status(
        "Inputting data to UEI-UMI dictionary using file-sets: " +
        str(uei_fasta_list) + ", " + str(bcn_umi_fasta_list) + ", " +
        str(trg_umi_fasta_list))

    for uei_fasta, bcn_umi_fasta, trg_umi_fasta in itertools.izip(
            uei_fasta_list, bcn_umi_fasta_list, trg_umi_fasta_list):
        uei_handle = open(sysOps.globaldatapath + uei_fasta, "rU")
        bcn_umi_handle = open(sysOps.globaldatapath + bcn_umi_fasta, "rU")
        trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")

        for uei_record, bcn_umi_record, trg_umi_record in itertools.izip(
                SeqIO.parse(uei_handle, "fasta"),
                SeqIO.parse(bcn_umi_handle, "fasta"),
                SeqIO.parse(trg_umi_handle, "fasta")):
            uei_seq = str(uei_record.seq)
            bcn_umi_seq = str(bcn_umi_record.seq)
            trg_umi_seq = str(trg_umi_record.seq)
            max_uei_frac = max(
                np.bincount([('ACGT').index(s)
                             for s in uei_seq])) / float(len(uei_seq))
            max_bcn_umi_frac = max(
                np.bincount([('ACGT').index(s)
                             for s in bcn_umi_seq])) / float(len(bcn_umi_seq))
            max_trg_umi_frac = max(
                np.bincount([('ACGT').index(s)
                             for s in trg_umi_seq])) / float(len(trg_umi_seq))
            if max_uei_frac <= filter_val and max_bcn_umi_frac <= filter_val and max_trg_umi_frac <= filter_val:
                uei_clust_ind = str(uei_cluster_dict[uei_seq][0])
                bcn_umi_clust_ind = str(bcn_umi_cluster_dict[bcn_umi_seq][0])
                trg_umi_clust_ind = str(trg_umi_cluster_dict[trg_umi_seq][0])
                uei_clust_readcount_tally[uei_clust_ind][file_set_index] += 1
                bcn_umi_clust_readcount_tally[bcn_umi_clust_ind][
                    file_set_index] += 1
                trg_umi_clust_readcount_tally[trg_umi_clust_ind][
                    file_set_index] += 1

                pair_str = bcn_umi_clust_ind + "_" + trg_umi_clust_ind
                if uei_clust_ind in uei_umi_dict and uei_umi_dict[
                        uei_clust_ind][2] == file_set_index:
                    #if uei from read has already been inserted into uei-umi dictionary
                    if pair_str in uei_umi_dict[uei_clust_ind][
                            0]:  #if bcn-trg pair has already been added to this uei entry
                        pair_ind = uei_umi_dict[uei_clust_ind][0].index(
                            pair_str)
                        uei_umi_dict[uei_clust_ind][1][pair_ind] += 1
                    else:  # uei in uei_umi_dict -- but corresponding UMI-pair not found in existing list
                        uei_umi_dict[uei_clust_ind][0].append(pair_str)
                        uei_umi_dict[uei_clust_ind][1].append(1)
                    admis_seq_count += 1
                elif uei_clust_ind not in uei_umi_dict:  # uei not yet in uei_umi_dict -- create new list
                    uei_umi_dict[uei_clust_ind] = [[pair_str], [1],
                                                   int(file_set_index)]
                    admis_seq_count += 1
                else:
                    inadmis_seq_count += 1  # if UEI has been found but not as part of the first file_set_index for which it was detected (this depends on ordering any invalid-amplicon files second in the fasta-lists above), then disregard
            else:
                inadmis_seq_count += 1
        uei_handle.close()
        bcn_umi_handle.close()
        trg_umi_handle.close()
        file_set_index += 1

    sysOps.throw_status(
        'Did not use ' + str(inadmis_seq_count) + '/' +
        str(admis_seq_count + inadmis_seq_count) +
        ' pairings due to repetitive base-usage in UMI or UEI sequence.')
    #elements of uei_umi_dict are now list of cluster indices (ordered uei,bcn,trg,# times that element has been called)

    #convert embedded dictionaries into list
    list_output = list()
    for uei_el in uei_umi_dict:
        for i in range(len(uei_umi_dict[uei_el][0])):
            pair_str = uei_umi_dict[uei_el][0][i]
            [bcn_umi_el, trg_umi_el] = pair_str.split('_')
            list_output.append([
                int(uei_el),
                int(bcn_umi_el),
                int(trg_umi_el),
                int(uei_umi_dict[uei_el][1][i]),
                int(uei_umi_dict[uei_el][2])
            ])

    del uei_umi_dict
    list_output.sort(
        key=lambda row: (row[0], row[1], row[2], -row[3], row[4])
    )  #sort by uei-cluster, then beacon-umi-cluster, then target-umi-cluster, then decreasing read-count

    sysOps.throw_status("Writing file ...")
    with open(
            sysOps.globaldatapath + outfile_prefix + "_filter" +
            str(filter_val) + "_uei_umi.csv", 'w') as outfile_handle:
        for row in list_output:
            outfile_handle.write(','.join([str(s) for s in row]) + "\n")

    sysOps.throw_status("Tallying clusters ...")
    uei_clust_counts = [[0, 0], [0, 0], [0, 0]]
    bcn_umi_clust_counts = [[0, 0], [0, 0], [0, 0]]
    trg_umi_clust_counts = [[0, 0], [0, 0], [0, 0]]
    uei_clust_counts_inclusive = [[0, 0], [0, 0], [0, 0]]
    bcn_umi_clust_counts_inclusive = [[0, 0], [0, 0], [0, 0]]
    trg_umi_clust_counts_inclusive = [[0, 0], [0, 0], [0, 0]]

    # file_set_index=0 corresponds to amplicon-valid reads and file_set_index=1 to amplicon-invalid reads;
    # a cluster is counted under file_set_index=1 only if none of its members are amplicon-valid
    for uei_clust_ind in uei_clust_readcount_tally:
        file_set_index = 0
        this_readcount = uei_clust_readcount_tally[uei_clust_ind][
            file_set_index]
        if this_readcount == 0:
            file_set_index = 1
            this_readcount = uei_clust_readcount_tally[uei_clust_ind][
                file_set_index]
        if this_readcount > 0:
            uei_clust_counts[min(this_readcount, 3) - 1][file_set_index] += 1
        uei_clust_counts_inclusive[
            min(uei_clust_readcount_tally[uei_clust_ind][0], 3) - 1][0] += 1
        uei_clust_counts_inclusive[
            min(uei_clust_readcount_tally[uei_clust_ind][1], 3) - 1][1] += 1

    for umi_clust_ind in bcn_umi_clust_readcount_tally:
        file_set_index = 0
        this_readcount = bcn_umi_clust_readcount_tally[umi_clust_ind][
            file_set_index]
        if this_readcount == 0:
            file_set_index = 1
            this_readcount = bcn_umi_clust_readcount_tally[umi_clust_ind][
                file_set_index]
        if this_readcount > 0:
            bcn_umi_clust_counts[min(this_readcount, 3) -
                                 1][file_set_index] += 1
        bcn_umi_clust_counts_inclusive[
            min(bcn_umi_clust_readcount_tally[umi_clust_ind][0], 3) -
            1][0] += 1
        bcn_umi_clust_counts_inclusive[
            min(bcn_umi_clust_readcount_tally[umi_clust_ind][1], 3) -
            1][1] += 1

    for umi_clust_ind in trg_umi_clust_readcount_tally:
        file_set_index = 0
        this_readcount = trg_umi_clust_readcount_tally[umi_clust_ind][
            file_set_index]
        if this_readcount == 0:
            file_set_index = 1
            this_readcount = trg_umi_clust_readcount_tally[umi_clust_ind][
                file_set_index]
        if this_readcount > 0:
            trg_umi_clust_counts[min(this_readcount, 3) -
                                 1][file_set_index] += 1
        trg_umi_clust_counts_inclusive[
            min(trg_umi_clust_readcount_tally[umi_clust_ind][0], 3) -
            1][0] += 1
        trg_umi_clust_counts_inclusive[
            min(trg_umi_clust_readcount_tally[umi_clust_ind][1], 3) -
            1][1] += 1

    with open(sysOps.globaldatapath + outfile_prefix + '_clust_stats.txt',
              'w') as out_stats:
        tot_file_sets = 1
        if include_inv_amp:
            tot_file_sets = 2
        for file_set_index in range(tot_file_sets):
            out_stats.write('uei:' + str(file_set_index) + ':' + ','.join(
                [str(uei_clust_counts[i][file_set_index])
                 for i in range(3)]) + '\n')
            out_stats.write('bcn_umi:' + str(file_set_index) + ':' + ','.join([
                str(bcn_umi_clust_counts[i][file_set_index]) for i in range(3)
            ]) + '\n')
            out_stats.write('trg_umi:' + str(file_set_index) + ':' + ','.join([
                str(trg_umi_clust_counts[i][file_set_index]) for i in range(3)
            ]) + '\n')

    sysOps.throw_status("Completed.")
    return
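
The repetitive-base filter used above can be read on its own: a sequence is admissible when its most common base accounts for no more than filter_val of its length. A standalone sketch of the same check (it assumes the sequence contains only A/C/G/T, as in the code above):

import numpy as np

def passes_base_filter(seq, filter_val=0.75):
    # Reject sequences dominated by a single base, e.g. 'AAAAAAAA'.
    base_counts = np.bincount([('ACGT').index(s) for s in seq])
    return max(base_counts) / float(len(seq)) <= filter_val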
Code example #10
File: dnamicOps.py Project: richardzhu/dnamic
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file,
                         amp_seq_fasta, outfilename):
    #function tallies the reads counted for each target umi across each amplicon-call, and writes a csv file with the following columns:
    #(target umi cluster-index),(leading amplicon-call),(reads for leading amplicon-call),(total reads counted)

    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath +
                        trg_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)
    #outputs dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}

    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    try:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath +
                            amp_match_file)
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except:
        sysOps.throw_status(
            sysOps.globaldatapath + amp_match_file +
            ' not found. Alignments will occur from consensus sequences directly.'
        )
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath +
                                   'amplicon_refs.txt not found.')
            sysOps.exitProgram()

    trg_umi_dict = dict()
    trg_amp_seq_dict = dict()

    for trg_umi_record, amp_seq_record in itertools.izip(
            SeqIO.parse(trg_umi_handle, "fasta"),
            SeqIO.parse(amp_seq_handle, "fasta")):

        if not realign_amplicons:
            amp_match = int(amp_match_handle.readline().strip('\n'))
        else:
            amp_match = -1

        trg_umi_seq = str(trg_umi_record.seq)
        if trg_umi_seq in trg_umi_cluster_dict:
            trg_umi_index = str(
                trg_umi_cluster_dict[trg_umi_seq][0])  #uxi cluster-index
            if trg_umi_index in trg_umi_dict:
                if amp_match in trg_umi_dict[trg_umi_index]:
                    trg_umi_dict[trg_umi_index][
                        amp_match] += 1  #add 1, because every read is being entered
                else:
                    trg_umi_dict[trg_umi_index][amp_match] = 1
            else:
                trg_umi_dict[trg_umi_index] = dict()
                trg_amp_seq_dict[trg_umi_index] = baseTally()
                trg_umi_dict[trg_umi_index][amp_match] = 1

            trg_amp_seq_dict[trg_umi_index].add_record(str(amp_seq_record.seq),
                                                       1)

    trg_umi_handle.close()
    amp_seq_handle.close()
    if not realign_amplicons:
        amp_match_handle.close()

    csvfile = open(sysOps.globaldatapath + outfilename, 'w')
    fastafile = open(
        sysOps.globaldatapath + outfilename[:outfilename.rfind('.')] +
        '.fasta', 'w')
    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt',
                  'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse complementary orientation. We therefore reverse both complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])
        trg_umi_index_dict = dict()

    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    for trg_umi_index in trg_umi_dict:
        max_tally = 0
        tot_tally = 0

        for amp_match in trg_umi_dict[trg_umi_index]:

            my_tally = trg_umi_dict[trg_umi_index][amp_match]

            if my_tally >= max_tally:
                max_tally = int(my_tally)
                max_match = int(amp_match)

            tot_tally += int(my_tally)

        consensus_seq = str(
            trg_amp_seq_dict[trg_umi_index].get_str_consensus())

        if realign_amplicons:
            # perform direct, un-gapped alignment of consensus_seq to reference options to obtain max_match
            max_match = -1
            max_tally = -1  # exclude max_tally as count, since alignment is happening post-consensus
            min_mismatch_count = -1
            for i in range(len(ref_sequences)):
                all_subamplicons_pass = True
                start_index = 0
                tot_mismatches = 0
                for j in range(len(ref_sequences[i])
                               ):  # loop through sub-amplicon-sequences
                    ref_subamplicon_len = len(ref_sequences[i][j])
                    my_mismatches, minlen = alignOps.count_mismatches(
                        ref_sequences[i][j],
                        consensus_seq[start_index:(start_index +
                                                   ref_subamplicon_len)])
                    if minlen == 0:
                        all_subamplicons_pass = False
                        break
                    all_subamplicons_pass = all_subamplicons_pass and (
                        my_mismatches / float(minlen) <= max_mismatch_amplicon)
                    start_index += ref_subamplicon_len
                    tot_mismatches += my_mismatches
                if all_subamplicons_pass and (
                        max_match < 0 or tot_mismatches < min_mismatch_count):
                    # keep the reference with the fewest total mismatches
                    max_match = int(i)
                    min_mismatch_count = int(tot_mismatches)

        if max_match >= 0:
            csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                          str(max_tally) + "," + str(tot_tally) + "\n")
            fastafile.write(">" + trg_umi_index + '\n')
            fastafile.write(consensus_seq + '\n')
            if realign_amplicons:
                trg_umi_index_dict[trg_umi_index] = True
            accepted_consensus_sequences += 1
        else:
            inadmis_consensus_sequences += 1

    csvfile.close()
    fastafile.close()
    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                        str(accepted_consensus_sequences +
                            inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath +
                        outfilename + ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(
                sysOps.globaldatapath + consensus_filename,
                sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename,
                      'w') as new_consensus_file:
                with open(
                        sysOps.globaldatapath + 'unfiltered_' +
                        consensus_filename, 'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        consensus_list = old_consensus_file_line.strip(
                            '\n'
                        ).split(
                            ','
                        )  # [uei_index, bcn_umi_index, trg_umi_index, read_count, (additional variables)]
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' +
                                str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences +
                                    inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' +
                                sysOps.globaldatapath + consensus_filename +
                                ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
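
alignOps.count_mismatches is not reproduced in this listing; as used above it only needs to return the number of mismatching positions and the length over which the two strings were compared (zero meaning no overlap). A plausible stand-in, hypothetical rather than the project's implementation:

def count_mismatches(ref_seq, query_seq):
    # Ungapped comparison over the shared prefix; returns (mismatches, compared length).
    minlen = min(len(ref_seq), len(query_seq))
    mismatches = sum(1 for a, b in zip(ref_seq[:minlen], query_seq[:minlen]) if a != b)
    return mismatches, minlen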
Code example #11
File: summaryAnalysis.py Project: richardzhu/dnamic
def gather_rarefaction_data(conditions_filename = 'conditions.csv', outfilename = 'rarefaction_file.txt', raw_uxi_files = ['_for_uxi0.fasta','_for_uxi1.fasta','_rev_uxi0.fasta']):
    
    #use conditions_filename to specify output order
    dirnames = list()
    with open(sysOps.globaldatapath + conditions_filename, 'rU') as conditions_handle:
        for myline in conditions_handle:
            thisline = myline.strip('\n').split(',')
            dirnames.append('lib_' + str(thisline[0]) + '_' + str(thisline[1]) + '_' + str(thisline[2]))
    
    outfile_1r = open(sysOps.globaldatapath +'1r_' + outfilename,'w')
    outfile_2r = open(sysOps.globaldatapath +'2r_' + outfilename,'w')
    outfile_3r = open(sysOps.globaldatapath +'3r_' + outfilename,'w')
    
    for dir in dirnames:
        print 'Gathering rarefaction data for directory ' + sysOps.globaldatapath + dir
        sum_reads_raw = 0
        with open(sysOps.globaldatapath +dir + '/' + raw_uxi_files[0],'rU') as uxi_file_handle:
            #first UMI/UEI file in list to count raw reads
            for uxi_record in SeqIO.parse(uxi_file_handle,'fasta'):
                sum_reads_raw += 1
        
        subsample = 500
        terminate = False
        while not terminate:
            all_diversities = []
            try:
                for my_raw_uxi_file in raw_uxi_files:
                    try:
                        cluster_file_handle = open(sysOps.globaldatapath +dir + '/thresh1_identical_sub' + str(subsample) + my_raw_uxi_file,'rU')
                        consensus_pairing_csv_file = dir + '/consensus_2r_sub' + str(subsample) + 'pairing_filter0.75_uei_umi.csv'
                    except:
                        terminate = True
                        try:
                            cluster_file_handle = open(sysOps.globaldatapath +dir + '/thresh1_identical_' + my_raw_uxi_file,'rU')
                            consensus_pairing_csv_file = dir + '/consensus_2r_pairing_filter0.75_uei_umi.csv'
                        except:
                            sysOps.throw_exception('Directory ' + sysOps.globaldatapath + dir + ' does not contain clustered file ' + sysOps.globaldatapath + dir + '/thresh1_identical_' + my_raw_uxi_file + '. Skipping ...')
                            break
                        
                        subsample = sum_reads_raw
                        
                    cluster_dict = dict()
                    for myline in cluster_file_handle:
                        thisline = myline.strip('\n').split('_')
                        if thisline[0] in cluster_dict:
                            cluster_dict[thisline[0]] += int(thisline[2])
                        else:
                            cluster_dict[thisline[0]] = int(thisline[2])
                            
                    cluster_file_handle.close()
    
                    diversity = [0,0,0] #first element is 1-read-gated diversity, second is 2-read-gated, third is 3-read-gated
                    for el in cluster_dict:
                        if cluster_dict[el]>=3:
                            diversity[0] += 1
                            diversity[1] += 1
                            diversity[2] += 1
                        elif cluster_dict[el]>=2:
                            diversity[0] += 1
                            diversity[1] += 1
                        else:
                            diversity[0] += 1
                            
                    all_diversities.append(diversity)
    
                #if sysOps.check_file_exists(consensus_pairing_csv_file):
                if False: #temp
                    sysOps.throw_status('Found ' + sysOps.globaldatapath + consensus_pairing_csv_file + '.')
                    min_uei_count = 2  
                    min_umi_readcount = 2
                    outname = 'minb' + str(min_uei_count) + 'v' + str(0) + '_' + str(min_umi_readcount) + 'r_filter0.75'
                    wmat_outfilename = 'noabundcorr_wmat_' + outname + '.csv'
                    sysOps.throw_status('Calling matOps.generate_wmat()')
                    [num_unique_trg, num_unique_bcn, trg_dict] = matOps.generate_wmat(consensus_pairing_csv_file, min_umi_readcount, min_umi_readcount, min_uei_count, wmat_outfilename = None)
                    
                    if num_unique_bcn>0:
                        filtered_minb_diversity_2r = [num_unique_bcn, sum([trg_dict[trg_el] for trg_el in trg_dict]), num_unique_trg]
                    else:
                        filtered_minb_diversity_2r = [0,0,0]
                else:
                    sysOps.throw_status(sysOps.globaldatapath + consensus_pairing_csv_file + ' not found.')
                    filtered_minb_diversity_2r = []
                    
                outfile_1r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[0]) for my_diversity in all_diversities])]) + '\n')
                outfile_2r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[1]) for my_diversity in all_diversities]), ','.join([str(s) for s in filtered_minb_diversity_2r])]) + '\n')
                outfile_3r.write(','.join([dir, str(subsample), ','.join([str(my_diversity[2]) for my_diversity in all_diversities])]) + '\n')                 
            
            except:
                terminate = True
                
            subsample *= 2
        
    outfile_1r.close()
    outfile_2r.close()
    outfile_3r.close()
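
The conditions file consumed above only needs its first three comma-separated fields per line, which are concatenated into library directory names of the form 'lib_<field1>_<field2>_<field3>'. A tiny, hypothetical illustration of that naming:

# Hypothetical conditions.csv contents; only the first three fields are used.
demo_conditions = ['sampleA,rep1,20min', 'sampleB,rep1,20min']
for line in demo_conditions:
    fields = line.strip('\n').split(',')
    print('lib_' + fields[0] + '_' + fields[1] + '_' + fields[2])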
Code example #12
    def generate_cluster_analysis(self):
        # Perform clustering analysis of UMI and UEI sequences, consolidate pairings and determine consensuses of these pairings

        sysOps.initiate_statusfilename()
        missing_uxi_files = sysOps.find_missing_uxi_files(
            'libsettings.txt', '_')
        if len(missing_uxi_files) > 0:
            sysOps.throw_exception('Missing uxi files: ' +
                                   str(missing_uxi_files))

        if (sysOps.check_file_exists('_for_uxi0.fasta')):
            sysOps.throw_status("Clustering for_uxi0")
            clustering_up_to_date_1 = hashAlignments.initiate_hash_alignment(
                '_for_uxi0.fasta')
        else:
            clustering_up_to_date_1 = True
            sysOps.throw_status(sysOps.globaldatapath +
                                '_for_uxi0.fasta does not exist. Skipping.')

        if (sysOps.check_file_exists('_for_uxi1.fasta')):
            sysOps.throw_status("Clustering for_uxi1")
            clustering_up_to_date_2 = hashAlignments.initiate_hash_alignment(
                '_for_uxi1.fasta')
        else:
            clustering_up_to_date_2 = True
            sysOps.throw_status(sysOps.globaldatapath +
                                '_for_uxi1.fasta does not exist. Skipping.')

        if (sysOps.check_file_exists('_rev_uxi0.fasta')):
            sysOps.throw_status("Clustering rev_uxi0")
            clustering_up_to_date_3 = hashAlignments.initiate_hash_alignment(
                '_rev_uxi0.fasta')
        else:
            clustering_up_to_date_3 = True
            sysOps.throw_status(sysOps.globaldatapath +
                                '_rev_uxi0.fasta does not exist. Skipping.')

        if (clustering_up_to_date_1 and clustering_up_to_date_2
                and clustering_up_to_date_3):

            filter_val = 0.75  #maximum fraction of same-base permitted in a single UMI/UEI
            min_pairing_readcount = 2
            sysOps.throw_status(
                'Clustering completed. Beginning final output.')

            if (sysOps.check_file_exists('thresh1_identical__for_uxi0.fasta')
                    and sysOps.check_file_exists(
                        'thresh1_identical__for_uxi1.fasta')
                    and sysOps.check_file_exists(
                        'thresh1_identical__rev_uxi0.fasta') and
                    not (sysOps.check_file_exists('consensus_pairing_filter' +
                                                  str(filter_val) +
                                                  '_uei_umi.csv'))):
                if not sysOps.check_file_exists("pairing_filter" +
                                                str(filter_val) +
                                                "_uei_umi.csv"):
                    dnamicOps.assign_umi_pairs(
                        'thresh1_identical__for_uxi1.fasta',
                        'thresh1_identical__for_uxi0.fasta',
                        'thresh1_identical__rev_uxi0.fasta', '_for_uxi1.fasta',
                        '_for_uxi0.fasta', '_rev_uxi0.fasta', 'pairing',
                        filter_val, False
                    )  # final parameter = False: excluding invalid amplicon sequences

                dnamicOps.assign_consensus_pairs(
                    "pairing_filter" + str(filter_val) + "_uei_umi.csv",
                    min_pairing_readcount)
            else:
                sysOps.throw_status(
                    'Consensus-pairing file found pre-computed.')

            if (sysOps.check_file_exists('thresh1_identical__rev_uxi0.fasta')
                    and
                    not sysOps.check_file_exists('trg_amplicon_calls.csv')):
                #assign amplicon-identities to target umi's
                sysOps.throw_status(
                    'Assigning amplicon-identities and consensus sequences to target umis.'
                )
                dnamicOps.assign_umi_amplicons(
                    'thresh1_identical__rev_uxi0.fasta', '_rev_uxi0.fasta',
                    '_amp_match.txt', '_rev_amp0.fasta',
                    'trg_amplicon_calls.csv')
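
The intermediate files this method checks for follow a fixed naming convention built from the raw uxi FASTA names: consolidation prepends 'identical_' and threshold-clustering prepends 'thresh1_'. A small sketch of that naming, assuming the '_' output prefix used above:

def expected_cluster_outputs(raw_fasta_names=('_for_uxi0.fasta', '_for_uxi1.fasta', '_rev_uxi0.fasta')):
    # e.g. '_for_uxi0.fasta' -> 'thresh1_identical__for_uxi0.fasta'
    return ['thresh1_identical_' + name for name in raw_fasta_names]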
Code example #13
    def sim_physics(self):
        sysOps.throw_status("Running DNA microscopy simulation.")
        # bcn-indices must range from 0 to Nbcn-1, trg-indices must range from Nbcn to Nbcn+Ntrg-1
        phys_dims = 3.0
        sysOps.throw_status("Using num_dims = " + str(self.sim_dims) +
                            ", Nbcn = " + str(self.Nbcn) + ", Ntrg = " +
                            str(self.Ntrg))
        tot_rate = 0.0
        # initialize all molecule numbers to 1
        self.sim_pos[:, self.sim_dims] = 1.0
        np.savetxt(sysOps.globaldatapath + "sim_index_key.csv",
                   np.reshape(self.index_key, [self.Nbcn + self.Ntrg, 1]),
                   delimiter=',')

        sysOps.throw_status("BEGINNING PART 1")
        # PART 1: (a) estimate pairwise reaction rates (and record their sum)
        #         (b) simulate amplification with stochasticity (parameterized by effic_monomer)
        for C in range(1, self.lin_cycles + self.exp_cycles + 1):
            # amplify with effic_monomer <=1
            for n in range(self.Nbcn + self.Ntrg):
                if C <= self.lin_cycles:
                    binom_res = np.random.binomial(
                        1, self.effic_monomer
                    )  # if linear amplification step, keep template number constant at 1
                else:
                    binom_res = np.random.binomial(
                        int(self.sim_pos[n, self.sim_dims]),
                        self.effic_monomer)
                    # if exponential amplification step, template number varies over time
                self.sim_pos[n, self.sim_dims] += binom_res

            np.savetxt(sysOps.globaldatapath + "molcountfile_" + str(C) +
                       ".csv",
                       np.reshape(np.int64(self.sim_pos[:, self.sim_dims]),
                                  [self.Nbcn + self.Ntrg, 1]),
                       delimiter=',')

            if C > self.lin_cycles:  #only sum/print UEI-formation rates for exponential amplification steps

                # iterate through simulated PCR cycles, print relative rates to rate_file
                sysOps.throw_status("C=" + str(C) + " --> 8*D*d*t = " +
                                    str(8 * self.diffconst * phys_dims *
                                        float(C)))
                # the diffusion constant multiplies the elapsed time (cycle number) to set the reaction length-scale
                t_term = 8 * self.diffconst * phys_dims * float(C)
                # accumulate the total UEI-formation rate (partition function) over the exponential cycles
                tot_rate += sum_partition_function(self.sim_pos, self.Nbcn,
                                                   self.Ntrg, self.sim_dims,
                                                   phys_dims, t_term)

        sysOps.throw_status("BEGINNING PART 2")
        #PART 2: simulate UEI generation
        sysOps.throw_status("Generating " + str(self.Nuei) +
                            " random numbers.")
        sorted_unif_rand_res = np.sort(np.random.rand(self.Nuei))
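        # The sorted uniforms implement inverse-CDF sampling (a reasonable reading of generate_ueis below):
        # as the cumulative normalized UEI-formation rate sweeps from 0 to 1 over the exponential cycles,
        # each uniform value it crosses allocates one UEI to the beacon/target pair reacting at that point.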
        uei_arr = np.zeros([self.Nuei, 2], dtype=np.int64)
        uei_ind = 0
        inp_cumul_rate = np.array([0.0], dtype=np.float64)

        for C in range(self.lin_cycles + 1,
                       self.lin_cycles + self.exp_cycles + 1):
            self.sim_pos[:, self.sim_dims] = np.int64(
                np.loadtxt(sysOps.globaldatapath + "molcountfile_" + str(C) +
                           ".csv",
                           delimiter=','))
            t_term = 8 * self.diffconst * phys_dims * float(C)
            uei_arr[:] = -1
            prev_uei_ind = int(uei_ind)
            uei_ind = generate_ueis(uei_arr, self.sim_pos,
                                    sorted_unif_rand_res, inp_cumul_rate,
                                    self.Nbcn, self.Ntrg, self.Nuei,
                                    self.sim_dims, phys_dims, t_term, tot_rate,
                                    uei_ind)

            if uei_ind > prev_uei_ind:
                np.savetxt(sysOps.globaldatapath + "ueifile_" + str(C) +
                           ".csv",
                           uei_arr[:(uei_ind - prev_uei_ind), :],
                           delimiter=',')
            sysOps.throw_status("C = " + str(C) + ". Current UEI-count: " +
                                str(uei_ind) + '.')

        del sorted_unif_rand_res

        sysOps.throw_status("BEGINNING PART 3")
        #PART 3: simulate UEI amplification
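        # Each previously formed UEI (a row of all_uei) gains Binomial(count, effic_dimer) copies per cycle,
        # while UEIs newly formed in cycle C enter with a count of 1 (the appended column of ones below).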

        all_uei = np.array([])
        for C in range(self.lin_cycles + 1,
                       self.lin_cycles + self.exp_cycles + 1):
            for i in range(all_uei.shape[0]):
                all_uei[i, 2] += np.random.binomial(all_uei[i, 2],
                                                    self.effic_dimer)

            if sysOps.check_file_exists(sysOps.globaldatapath + "ueifile_" +
                                        str(C) + ".csv"):
                this_uei_arr = np.int64(
                    np.loadtxt(sysOps.globaldatapath + "ueifile_" + str(C) +
                               ".csv",
                               delimiter=','))
                if len(this_uei_arr.shape) == 1:
                    this_uei_arr = np.array([this_uei_arr])
                if this_uei_arr.shape[0] > 0:
                    this_uei_arr = np.append(this_uei_arr,
                                             np.ones(
                                                 [this_uei_arr.shape[0], 1]),
                                             axis=1)
                    if all_uei.shape[0] == 0:
                        all_uei = np.array(this_uei_arr)
                    else:
                        all_uei = np.concatenate([all_uei, this_uei_arr],
                                                 axis=0)

        sysOps.throw_status("BEGINNING PART 4")
        tot_mol = np.sum(all_uei[:, 2])
        #PART 4: output simulated reads
        my_N_reads = self.N_reads
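        # Reads are distributed across UEIs in proportion to their final amplified molecule counts,
        # using the same sorted-uniform (inverse-CDF) allocation as in PART 2.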

        sysOps.throw_status('my_N_reads = ' + str(my_N_reads) + '/' +
                            str(self.N_reads) + ',' + sysOps.globaldatapath +
                            'r' + str(my_N_reads) + '_sim_ueifile.csv')

        with open(
                sysOps.globaldatapath + "r" + str(my_N_reads) +
                "_sim_ueifile.csv", 'w') as finalsimdata_outfile:
            sorted_unif_rand_reads = np.sort(np.random.rand(my_N_reads))

            #For downstream processing, need consensus-pairing file with the following comma-separated columns:
            #1. uei index
            #2. beacon-umi index
            #3. target-umi index
            #4. read-count
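            #For example, a row listing uei index 17, beacon-umi index 245, target-umi index 982, and
            #read-count 6 (hypothetical values) denotes UEI 17 joining beacon UMI 245 to target UMI 982 with 6 reads.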

            read_ind = 0
            cumul_read_frac = 0.0
            for uei_index in range(uei_arr.shape[0]):
                cumul_read_frac += all_uei[uei_index, 2] / tot_mol
                my_reads = 0
                while cumul_read_frac >= sorted_unif_rand_reads[read_ind]:
                    my_reads += 1
                    read_ind += 1
                    if read_ind == my_N_reads:
                        break  #no more reads to generate

                if my_reads > 1:
                    #only include those UEI's with at least 2 reads
                    finalsimdata_outfile.write(
                        str(uei_index) + "," + str(all_uei[uei_index, 0]) +
                        "," + str(all_uei[uei_index, 1]) + "," +
                        str(my_reads) + '\n')

                if read_ind >= my_N_reads:
                    break

        del sorted_unif_rand_reads

        my_N_reads *= 2

        sysOps.throw_status("SIMULATION COMPLETE")

        return
Code example #14
0
    def sim_reads(self):
        simLibObj = libOps.libObj(settingsfilename='libsettings.txt',
                                  output_prefix='_')
        enforced_rev_read_len = 100
        [for_read_len, rev_read_len] = simLibObj.get_min_allowed_readlens(
            simLibObj.filter_amplicon_window)
        rev_read_len = int(enforced_rev_read_len)
        '''
        simLibObj.seqform_for_params and simLibObj.seqform_rev_params are already stored in the current object's memory.
        Each parameter entry in these variables is a list of the following:
            Element 1: [start_pos,end_pos]
            Element 2: np.ndarray(seq_bool_vec, dtype=np.bool_)
            Element 3: np.ndarray(capital_bool_vec, dtype=np.bool_)
            Element 4: np.ndarray(ambig_vec, dtype=np.bool_)
        '''
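        # Hypothetical illustration of one such parameter entry (values for illustration only):
        #   [[0, 10],                      # Element 1: positions 0-10 of the read
        #    np.ones(40, dtype=np.bool_),  # Element 2: 4 flags per base marking which of A/C/G/T are allowed
        #    np.zeros(40, dtype=np.bool_), # Element 3: per-base capitalization flags
        #    np.zeros(10, dtype=np.bool_)] # Element 4: per-base ambiguity flags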
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()

        for_umi_seqs = list()
        rev_umi_seqs = list()
        rev_umi_amplicon_list = list()
        uei_seqs = list()
        base_order = 'ACGT'

        sysOps.throw_status('Generating simulated sequences ...')
        amplicon_list = list()
        if "-amplicon" in simLibObj.mySettings:
            amplicon_list = [
                simLibObj.mySettings["-amplicon"][i].upper().split(',')
                for i in range(len(simLibObj.mySettings["-amplicon"]))
            ]

        for for_umi_i in range(self.Nbcn):
            for_param_index = np.random.randint(
                len(simLibObj.seqform_for_params))
            if len(simLibObj.seqform_for_params[for_param_index]) > 1:
                sysOps.throw_exception(
                    'Error: len(simLibObj.seqform_for_params[for_param_index]) = '
                    + str(len(simLibObj.seqform_for_params[for_param_index])))
                sysOps.exitProgram()
            my_for_umi_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_for_umi_param[0]
            seq_bool_vec = my_for_umi_param[1]
            my_for_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            for_umi_seqs.append([int(for_param_index), str(my_for_umi)])

        for for_uei_i in range(self.Nuei):
            for_param_index = 0  # there should be no difference across UMI's
            my_for_uei_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][1]
            [start_pos, end_pos] = my_for_uei_param[0]
            seq_bool_vec = my_for_uei_param[1]
            my_for_uei = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_uei += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            uei_seqs.append(str(my_for_uei))

        for rev_umi_i in range(self.Ntrg):
            rev_param_index = np.random.randint(
                len(simLibObj.seqform_rev_params))
            my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_rev_umi_param[0]
            seq_bool_vec = my_rev_umi_param[1]
            my_rev_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_rev_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            if len(amplicon_list) == 0:
                encoded_amplicon = str('')
            else:
                this_gsp_primer_amplicon_pair = list(
                    amplicon_list[np.random.randint(len(amplicon_list))]
                )  # already properly oriented
                # generate single error on amplicon
                lenamp = len(this_gsp_primer_amplicon_pair[1])
                rand_loc = np.random.randint(lenamp)
                this_gsp_primer_amplicon_pair[1] = str(
                    this_gsp_primer_amplicon_pair[1][:rand_loc] +
                    base_order[np.random.randint(4)] +
                    this_gsp_primer_amplicon_pair[1][(rand_loc + 1):])
                encoded_amplicon = ''.join(this_gsp_primer_amplicon_pair)

            tmp_umi_index = float(rev_umi_i)

            if tmp_umi_index == 0:
                encoded_amplicon += base_order[0]
            else:
                for myexponent in range(
                        int(np.floor(np.log(tmp_umi_index) / np.log(4.0))), -1,
                        -1):
                    mydigit = np.floor(tmp_umi_index /
                                       np.power(4.0, myexponent))
                    encoded_amplicon += base_order[int(mydigit)]
                    tmp_umi_index -= mydigit * np.power(4.0, myexponent)
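            # e.g. rev_umi_i = 27 (hypothetical) has base-4 digits 1,2,3, so "CGT" is appended,
            # encoding the target-UMI index within the simulated amplicon sequence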

            rev_umi_seqs.append(
                [int(rev_param_index),
                 str(my_rev_umi),
                 str(encoded_amplicon)])

        sysOps.throw_status('Writing simulated reads ...')

        for filename in filenames:
            if filename.endswith('_sim_ueifile.csv'):
                ueifile = np.int64(
                    np.loadtxt(sysOps.globaldatapath + filename,
                               delimiter=','))
                newdirname = filename[:filename.find('_')]
                read_list = list()
                for i in range(ueifile.shape[0]):
                    for myread in range(ueifile[i, 3]):
                        read_list.append(np.array([ueifile[i, :3]]))
                read_list = np.concatenate(
                    read_list, axis=0
                )  # re-write array so that there is now one row per read
                # randomly permute:
                read_list = read_list[
                    np.random.permutation(read_list.shape[0]), :]

                for_chararray = np.chararray((for_read_len))
                rev_chararray = np.chararray((rev_read_len))
                for_fastq_outfile = open(newdirname + '_for.fastq', "w")
                rev_fastq_outfile = open(newdirname + '_rev.fastq', "w")
                for i in range(read_list.shape[0]):
                    for_param_index = for_umi_seqs[read_list[i, 1]][0]
                    for_umi_seq = for_umi_seqs[read_list[i, 1]][1]
                    rev_param_index = rev_umi_seqs[read_list[i, 2]][
                        0]  # both beacon and target indices are, at this point, independently indexed from 0
                    rev_umi_seq = rev_umi_seqs[read_list[i, 2]][1]
                    rev_amp_seq = rev_umi_seqs[read_list[i, 2]][2]
                    uei_seq = uei_seqs[read_list[i, 0]]

                    for j in range(for_read_len):
                        for_chararray[j] = 'N'
                    for j in range(rev_read_len):
                        rev_chararray[j] = 'N'

                    my_for_umi_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_for_umi_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = for_umi_seq[j]

                    my_for_uei_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][1]
                    [start_pos, end_pos] = my_for_uei_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = uei_seq[j]

                    for my_for_param in simLibObj.seqform_for_params[
                            for_param_index][0]['P']:
                        [start_pos, end_pos] = my_for_param[0]
                        for j in range(end_pos - start_pos):
                            for_chararray[j + start_pos] = base_order[np.where(
                                my_for_param[1][(4 * j):(4 * (j + 1))])[0][0]]

                    my_rev_umi_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_rev_umi_param[0]
                    for j in range(end_pos - start_pos):
                        rev_chararray[j + start_pos] = rev_umi_seq[j]
                    my_rev_amp_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['A'][0]
                    start_pos = my_rev_amp_param[0][0]
                    for j in range(len(rev_amp_seq)):
                        rev_chararray[j + start_pos] = rev_amp_seq[j]

                    if 'P' in simLibObj.seqform_rev_params[rev_param_index][0]:
                        for my_rev_param in simLibObj.seqform_rev_params[
                                rev_param_index][0]['P']:
                            [start_pos, end_pos] = my_rev_param[0]
                            for j in range(end_pos - start_pos):
                                rev_chararray[j +
                                              start_pos] = base_order[np.where(
                                                  my_rev_param[1][(4 * j):(
                                                      4 * (j + 1))])[0][0]]

                    for_record = SeqIO.SeqRecord(
                        Seq.Seq(for_chararray.tostring()))
                    for_record.id = '-' + str(i) + '-' + str(read_list[i, 1])
                    for_record.description = ''
                    for_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(for_read_len)])
                    rev_record = SeqIO.SeqRecord(
                        Seq.Seq(rev_chararray.tostring()))
                    rev_record.id = '-' + str(i) + '-' + str(read_list[i, 2])
                    rev_record.description = ''
                    rev_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(rev_read_len)])
                    SeqIO.write(for_record, for_fastq_outfile, "fastq")
                    SeqIO.write(rev_record, rev_fastq_outfile, "fastq")

                for_fastq_outfile.close()
                rev_fastq_outfile.close()
                os.mkdir(newdirname)
                with open('libsettings.txt', 'rU') as oldsettingsfile:
                    with open(newdirname + '//libsettings.txt',
                              'w') as newsettingsfile:
                        for oldsettings_row in oldsettingsfile:
                            if oldsettings_row.startswith('-source_for'):
                                newsettingsfile.write('-source_for ..//' +
                                                      newdirname +
                                                      '_for.fastq\n')
                            elif oldsettings_row.startswith('-source_rev'):
                                newsettingsfile.write('-source_rev ..//' +
                                                      newdirname +
                                                      '_rev.fastq\n')
                            else:
                                newsettingsfile.write(oldsettings_row)

        sysOps.throw_status('Done.')
        return
Code example #15
0
File: upstream.py Project: richardzhu/dnamic
def generate_data_layout(data_layout_file = 'data_layout.csv'): 
    # Format of data_layout_file is as follows:
    # Sample     Sample name, description, etc
    # Barcode    Number/search-term for run directories
    # Run        Run directory-1
    # Run        Run directory-2
    # Run        etc
    # Beacon     Beacon oligo-1
    # Beacon     Beacon oligo-2 
    # Beacon     etc
    # Target     Target oligo-1
    # Target     Target oligo-2 
    # Target     etc
    # OE1a       OE-primer-1
    # OE4b       OE-primer-2
    # Amplicon   Amplicon file-1
    # Amplicon   Amplicon file-2
    # Standardize-amplicon-start    TRUE/left blank
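    # A minimal, purely illustrative data_layout.csv (hypothetical values) might contain rows such as:
    #   Sample,sample1
    #   Barcode,S1
    #   Run,run_directory_1
    #   Beacon,<beacon oligo sequence>
    #   Target,<target oligo sequence>
    #   OE1a,<OE primer 1 sequence>
    #   OE4b,<OE primer 2 sequence>
    #   Amplicon,N
    #   Standardize-amplicon-start,TRUE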
    
    data_layout_dict = dict()
    with open(data_layout_file,'rU') as csvfile:
        curr_sample = None
        for myline in csvfile:
            thisline = rm_hidden_char(myline).strip('\n').split(',')
            if len(thisline) >= 2:
                if thisline[0].lower() == 'sample':
                    curr_sample = thisline[1]
                    data_layout_dict[curr_sample] = dict()
                else:
                    if thisline[0].lower() not in data_layout_dict[curr_sample]:
                        data_layout_dict[curr_sample][thisline[0].lower()] = list()
                    data_layout_dict[curr_sample][thisline[0].lower()].append(thisline[1])

    final10_sbs12_sbs3 = 'CTTCCGATCT'     
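    # final 10 bases shared by the SBS3/SBS12 (Illumina sequencing-primer) sites; used below to trim each
    # oligo down to the portion downstream of the sequencing-primer binding site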
    for sample in data_layout_dict:
        missing_keys = [my_key for my_key in ['barcode','run','beacon','target','oe1a','oe4b','amplicon'] if my_key not in data_layout_dict[sample]]
        if (len(missing_keys)>0):
            sysOps.throw_status('Skipping sample ' + str(sample) + ' due to missing keys:' + str(missing_keys))
        else:
            source_for = list()
            source_rev = list()
            for run_index in range(len(data_layout_dict[sample]['run'])):
                run_dir = data_layout_dict[sample]['run'][run_index]
                if not run_dir.endswith('//'):
                    run_dir += '//'
                run_dir_exists = False
                try: # try opening run_dir for writing
                    with open(run_dir + 'test.txt','w'):
                        run_dir_exists = True
                    os.remove(run_dir + 'test.txt')
                except:
                    sysOps.throw_status('Skipping run-directory ' + str(run_dir))
                if run_dir_exists:
                    [subdirnames, filenames] = sysOps.get_directory_and_file_list(run_dir)
                    this_sample_run_R1 = list(['..//' + run_dir + filename 
                                               for filename in filenames if (data_layout_dict[sample]['barcode'][0]+'_' in filename
                                                                             and 'R1' in filename)])
                    this_sample_run_R2 = list([filename[:(filename.find('R1'))] 
                                               + 'R2' 
                                               +  filename[(filename.find('R1')+2):] for filename in this_sample_run_R1])
                    source_for.extend(this_sample_run_R1) # since new directory is being created, adding an additional level to the path
                    source_rev.extend(this_sample_run_R2)
            source_for =  ','.join(source_for)
            source_rev =  ','.join(source_rev)  
            # join oe sequences
            seqform_for = list()
            find_index, conjoined_oe_seq = rigid_conjoin(get_revcomp(data_layout_dict[sample]['oe1a'][0]),data_layout_dict[sample]['oe4b'][0],10)

            for beacon_oligo_index in range(len(data_layout_dict[sample]['beacon'])):
                revcomp_bcn_oligo = get_revcomp(data_layout_dict[sample]['beacon'][beacon_oligo_index])
                revcomp_bcn_oligo = revcomp_bcn_oligo[(revcomp_bcn_oligo.find(final10_sbs12_sbs3)+len(final10_sbs12_sbs3)):]
                oe_start_index, conjoined_bcn_oe_seq = rigid_conjoin(revcomp_bcn_oligo,conjoined_oe_seq,10)
                
                uei_start_index = np.min(np.array([(oe_start_index+conjoined_bcn_oe_seq[oe_start_index:].upper().find(my_char))
                                                    for my_char in 'NWSRY' if my_char in conjoined_bcn_oe_seq[oe_start_index:].upper()]))
                uei_end_index = 1+np.max(np.array([(oe_start_index+conjoined_bcn_oe_seq[oe_start_index:].upper().rfind(my_char))
                                                    for my_char in 'NWSRY' if my_char in conjoined_bcn_oe_seq[oe_start_index:].upper()]))
                
                my_seqform_for = list()
                my_seqform_for.append('U_' + conjoined_bcn_oe_seq[1:oe_start_index] + '_1:' + str(oe_start_index))
                my_seqform_for.append('P_' + conjoined_bcn_oe_seq[oe_start_index:uei_start_index] + '_' + str(oe_start_index) + ':' + str(uei_start_index))
                my_seqform_for.append('U_' + conjoined_bcn_oe_seq[uei_start_index:uei_end_index] + '_' + str(uei_start_index) + ':' + str(uei_end_index))
                my_seqform_for.append('P_' + conjoined_bcn_oe_seq[uei_end_index:(uei_end_index+2)] + '_' + str(uei_end_index) + ':' + str(uei_end_index+2))
                my_seqform_for = '|'.join(my_seqform_for)
                if my_seqform_for not in seqform_for:
                    seqform_for.append(str(my_seqform_for))
            
            my_amplicons = list()
            for amplicon_file in data_layout_dict[sample]['amplicon']:
                if amplicon_file.upper() == 'N': # amplicon left blank
                    my_amplicons.append(list(['N','N']))
                else:
                    [subdirnames, filenames] = sysOps.get_directory_and_file_list()
                    if amplicon_file in filenames:
                        for record in SeqIO.parse(amplicon_file, "fasta"):
                            my_amplicons.append(list([str(record.id), str(record.seq)]))
                    else:
                        sysOps.throw_status('Skipping ' + str(amplicon_file))
            
            primer_amplicon_pairs = list()
            primer_amplicon_starts = list()
            seqform_rev = list()
            print str(my_amplicons)
            for amplicon in my_amplicons:
                revcomp_amplicon = get_revcomp(amplicon[1].lower())
                for target_oligo_index in range(len(data_layout_dict[sample]['target'])):
                    target_oligo = data_layout_dict[sample]['target'][target_oligo_index]
                    target_oligo = target_oligo[(target_oligo.find(final10_sbs12_sbs3)+len(final10_sbs12_sbs3)):]
                    if revcomp_amplicon == 'n':
                        randprim_len = len(target_oligo) - (1 + np.max(np.array([target_oligo.upper().rfind(my_char) for my_char in 'ACGT'])))
                        target_oligo = target_oligo[:(len(target_oligo) - randprim_len)]
                        my_seqform_rev = list()
                        my_seqform_rev.append('U_' + target_oligo[1:len(target_oligo)] + '_1:' + str(len(target_oligo)))
                        my_seqform_rev.append('A_' + str(len(target_oligo)+randprim_len) + ':')
                        my_seqform_rev = '|'.join(my_seqform_rev)
                        if my_seqform_rev not in seqform_rev:
                            seqform_rev.append(str(my_seqform_rev))
                    else:
                        find_index, conjoined_amplicon_seq = rigid_conjoin(target_oligo,revcomp_amplicon,10)
                        if find_index >= 0:
                            primer_overlap = len(target_oligo) + len(revcomp_amplicon) - len(conjoined_amplicon_seq)
                            primer_amplicon_pairs.append(amplicon[0] + '|'
                                                         + get_revcomp(conjoined_amplicon_seq[(find_index+primer_overlap):]) + ','
                                                         + get_revcomp(conjoined_amplicon_seq[find_index:(find_index+primer_overlap)]))
                            my_seqform_rev = list()
                            my_seqform_rev.append('U_' + conjoined_amplicon_seq[1:find_index] + '_1:' + str(find_index))
                            my_seqform_rev.append('A_' + str(find_index) + ':')
                            primer_amplicon_starts.append(int(find_index))
                            my_seqform_rev = '|'.join(my_seqform_rev)
                            if my_seqform_rev not in seqform_rev:
                                seqform_rev.append(str(my_seqform_rev))
                    
            # finally, print libsettings.txt
            my_libdir = 'lib_' + str(sample) + '//'
            os.mkdir(my_libdir)
                        
            if ('standardize-amplicon-start' in data_layout_dict[sample] 
                and data_layout_dict[sample]['standardize-amplicon-start'][0].lower() == 'true'):
                
                max_amplicon_start = int(np.max(np.array(primer_amplicon_starts)))
                with open(my_libdir + 'amplicon_refs.txt','w') as outfile:
                    new_seqform_rev = list()
                    for my_seqform_rev in seqform_rev:
                        elements = my_seqform_rev.split('|')
                        elements[len(elements)-1] = 'A_' + str(max_amplicon_start) + ':'
                        elements = '|'.join(elements)
                        if elements not in new_seqform_rev:
                            new_seqform_rev.append(str(elements))
                    seqform_rev = list(new_seqform_rev)
                    for primer_amplicon_pair,primer_amplicon_start in itertools.izip(primer_amplicon_pairs,primer_amplicon_starts):
                        outfile.write(primer_amplicon_pair[:(len(primer_amplicon_pair) + primer_amplicon_start - max_amplicon_start)] + '\n')
                primer_amplicon_pairs = list() # omit from libsettings
                
            with open(my_libdir + 'libsettings.txt','w') as outfile:
                outfile.write('-source_for ' + source_for + '\n')
                outfile.write('-source_rev ' + source_rev + '\n')
                for this_seqform_for in seqform_for:
                    outfile.write('-seqform_for ' + this_seqform_for + '\n')
                for this_seqform_rev in seqform_rev:
                    outfile.write('-seqform_rev ' + this_seqform_rev + '\n')
                for primer_amplicon_pair in primer_amplicon_pairs:
                    outfile.write('-amplicon ' + primer_amplicon_pair + '\n')
                if 'max-mismatch' in data_layout_dict[sample]:
                    outfile.write('-max_mismatch ' + data_layout_dict[sample]['max-mismatch'][0] + '\n')
                if 'max-mismatch-amplicon' in data_layout_dict[sample]:
                    outfile.write('-max_mismatch_amplicon ' + data_layout_dict[sample]['max-mismatch-amplicon'][0] + '\n')
                if 'min-mean-qual' in data_layout_dict[sample]:
                    outfile.write('-min_mean_qual ' + data_layout_dict[sample]['min-mean-qual'][0] + '\n')
                if 'filter-amplicon-window' in data_layout_dict[sample]:
                    outfile.write('-filter_amplicon_window ' + data_layout_dict[sample]['filter-amplicon-window'][0] + '\n')
                if 'amplicon-terminate' in data_layout_dict[sample]:
                    for this_amplicon_terminate in data_layout_dict[sample]['amplicon-terminate']:
                        outfile.write('-amplicon_terminate ' + this_amplicon_terminate + '\n')
            
    return
Code example #16
0
File: matOps.py Project: richardzhu/dnamic
def filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count):
    
    # prune UEI data to exclude UMIs with UEI counts < min_uei_count
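    # For example, with min_uei_count = 2 (illustrative value), a beacon UMI participating in only one UEI
    # is removed, the UEI counts of its partner target UMIs are decremented accordingly, and the loop below
    # repeats until no remaining UMI falls under the threshold.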
    
    if len(bcn_dict) == 0:
        return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
    
    deletion_iteration = 0
    is_list = None
    
    sysOps.throw_status('Filtering matrices with ' + str(len(bcn_div_dict)) + '+' + str(len(trg_div_dict)) + ' UMIs.')
    
    while True:
        
        bcn_retained = 0
        trg_retained = 0
        bcn_deleted = list()
        trg_deleted = list()
        
        for bcn_el in bcn_div_dict:
            if bcn_div_dict[bcn_el]<min_uei_count:
                bcn_deleted.append(bcn_el)
            else:
                bcn_retained += 1
                
        for trg_el in trg_div_dict:
            if trg_div_dict[trg_el]<min_uei_count:
                trg_deleted.append(trg_el)
            else:
                trg_retained += 1
        
        #check whether bcn_dict and trg_dict entries are still lists or have already been converted to values
        if is_list is None:
            for bcn_el in bcn_dict:
                for trg_el in bcn_dict[bcn_el]:
                    is_list = (type(bcn_dict[bcn_el][trg_el]) is list)
                    break
                break
            
        if len(bcn_deleted)==0 and len(trg_deleted)==0:
            sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ', all retained.')
            break
            
        sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ' deleting ' + str(len(bcn_deleted)) + '+' + str(len(trg_deleted)) + ', retained ' + str(bcn_retained) + '+' + str(trg_retained) + '. is_list=' + str(is_list))
        
        if is_list is None:
            sysOps.throw_exception('Error, could not find any elements: len(bcn_dict) = ' + str(len(bcn_dict)))
            sysOps.exitProgram()
            
        for bcn_el in bcn_deleted:
            for trg_el in bcn_dict[bcn_el]:
                if is_list:
                    trg_div_dict[trg_el] -= len(trg_dict[trg_el][bcn_el])
                else:
                    trg_div_dict[trg_el] -= trg_dict[trg_el][bcn_el]
                del trg_dict[trg_el][bcn_el]
                
            del bcn_dict[bcn_el]
            del bcn_div_dict[bcn_el]
            
        for trg_el in trg_deleted:
            for bcn_el in trg_dict[trg_el]:
                if bcn_el in bcn_div_dict: #if not already deleted above
                    if is_list:
                        bcn_div_dict[bcn_el] -= len(bcn_dict[bcn_el][trg_el])
                    else:
                        bcn_div_dict[bcn_el] -= bcn_dict[bcn_el][trg_el]
                    del bcn_dict[bcn_el][trg_el]
                
            del trg_dict[trg_el]
            del trg_div_dict[trg_el]
                        
        deletion_iteration += 1
    
    #check for consistency
    for bcn_el in bcn_dict:
        for trg_el in bcn_dict[bcn_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
                
    for trg_el in trg_dict:
        for bcn_el in trg_dict[trg_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
               
    
    return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
Code example #17
0
    def crosscomparison_analysis(self, args):

        sysOps.initiate_statusfilename()
        list_of_dirs = list()

        file_to_compare = args[1]

        with open(sysOps.globaldatapath + args[2], 'rU') as csvfile:
            for myline in csvfile:
                thisline = myline.strip('\n').split(',')
                subdir = 'lib_' + str(thisline[0]) + '_' + str(
                    thisline[1]) + '_' + str(thisline[2])
                list_of_dirs.append(subdir)

        print "Beginning comparison analysis"
        print "File to compare = " + file_to_compare
        print "Directories = " + ",".join(list_of_dirs)

        try:
            os.mkdir(sysOps.globaldatapath + 'cross_comparisons')
        except:
            sysOps.throw_exception(
                'cross_comparisons directory already exists. Terminating comparison analysis.'
            )
            sysOps.exitProgram()

        shared_num_unique_matrix = list()
        unshared_num_unique_matrix = list()
        shared_read_abund_matrix = list()
        unshared_read_abund_matrix = list()

        for i in range(len(list_of_dirs)):
            shared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            shared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))

        for ind1 in range(len(list_of_dirs)):
            for ind2 in range(ind1):
                dir1 = list_of_dirs[ind1]
                dir2 = list_of_dirs[ind2]
                clustfile1 = dir1 + "//" + file_to_compare
                clustfile2 = dir2 + "//" + file_to_compare
                dir1_abbrev = dir1[(
                    dir1.rfind('/') + 1
                ):]  #remove super-directory structure of the path -- requires that individual directories have unique names
                dir2_abbrev = dir2[(dir2.rfind('/') + 1):]
                sysOps.throw_status('Began writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                [
                    num_unique_shared, num_unique_unshared,
                    read_abundance_shared, read_abundance_unshared
                ] = alignOps.compare(
                    clustfile1, clustfile2,
                    dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare,
                    False)
                sysOps.throw_status('Completed writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                shared_num_unique_matrix[ind1][ind2] = num_unique_shared[0]
                shared_num_unique_matrix[ind2][ind1] = num_unique_shared[1]
                unshared_num_unique_matrix[ind1][ind2] = num_unique_unshared[0]
                unshared_num_unique_matrix[ind2][ind1] = num_unique_unshared[1]
                print str(num_unique_unshared[0]
                          ) + '-> unshared_num_unique_matrix[ ' + str(
                              ind1) + '][' + str(ind2) + ']'
                shared_read_abund_matrix[ind1][ind2] = read_abundance_shared[0]
                shared_read_abund_matrix[ind2][ind1] = read_abundance_shared[1]
                unshared_read_abund_matrix[ind1][
                    ind2] = read_abundance_unshared[0]
                unshared_read_abund_matrix[ind2][
                    ind1] = read_abundance_unshared[1]

        print shared_num_unique_matrix
        print unshared_num_unique_matrix
        print shared_read_abund_matrix
        print unshared_read_abund_matrix

        with open('comparison_matrices.csv', 'w') as compare_matrix_file:
            for i1 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_num_unique_matrix[i1]]) + '\n')

            for i2 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_num_unique_matrix[i2]]) + '\n')

            for i3 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_read_abund_matrix[i3]]) + '\n')

            for i4 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_read_abund_matrix[i4]]) + '\n')
Code example #18
0
    def dnamic_inference(self,
                         smle_infer=False,
                         msmle_infer=False,
                         segment_infer=False,
                         compute_local_solutions_only=True):
        # Perform image inference from the raw output of the DNA microscopy sequence analysis

        # Basic settings
        read_thresh = 2
        min_uei_count = 2
        output_dim = 2
        version = 1.0
        infer_dir = ''

        # raw data files
        consensus_pairing_csv_file = "..//consensus_" + str(
            read_thresh) + "r_pairing_filter0.75_uei_umi.csv"
        outname = 'minuei' + str(min_uei_count) + 'DMv' + str(
            version) + '_' + str(read_thresh) + 'r_filter0.75'
        wmat_outfilename = 'wmat_' + outname + '.csv'
        param_name = 'minuei' + str(min_uei_count) + 'dim' + str(
            output_dim) + 'DMv' + str(version) + '_.csv'
        imagemodule_input_filename = 'data_' + param_name
        key_filename = 'key_' + param_name
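        # With the settings above, these names resolve to, e.g.,
        # wmat_outfilename = 'wmat_minuei2DMv1.0_2r_filter0.75.csv' and
        # imagemodule_input_filename = 'data_minuei2dim2DMv1.0_.csv'.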
        if not sysOps.check_file_exists('microscopy_tasklist.csv'):
            [subdirnames, filenames
             ] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'microscopy_tasklist.csv',
                      'w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('infer_smle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_msmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_segment;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_ptmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')

        original_datapath = str(sysOps.globaldatapath)
        if smle_infer:
            infer_dir = 'infer_smle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_smle')
        elif msmle_infer:
            infer_dir = 'infer_msmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_msmle')
        elif segment_infer:
            infer_dir = 'infer_segment//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_segment')
        else:
            infer_dir = 'infer_ptmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_ptmle')

        if not (my_task is None):

            sysOps.initiate_runpath(str(my_task[1]))

            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list(["."])
            subdirnames_nodatayet = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    not sysOps.check_file_exists(subdirname + '//' +
                                                 imagemodule_input_filename))
            ]
            subdirnames_nodatayet = [
                subdirnames_nodatayet[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_nodatayet
                ]))
            ]  # sort by descending read count
            subdirnames_dataalready = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    sysOps.check_file_exists(subdirname + '//' +
                                             imagemodule_input_filename))
            ]
            subdirnames_dataalready = [
                subdirnames_dataalready[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_dataalready
                ]))
            ]  # sort by descending read count
            dirnames.extend(subdirnames_nodatayet)
            dirnames.extend(subdirnames_dataalready)
            sysOps.throw_status('Checking directories ' +
                                sysOps.globaldatapath + ' ... ' +
                                str(dirnames) + ' for infer-subdirectories.')
            for dirname in dirnames:  # make inference directories
                try:
                    with open(
                            sysOps.globaldatapath + dirname + '//' +
                            infer_dir + 'tmpfile.txt', 'w') as tmpfile:
                        tmpfile.write('test')
                    os.remove(sysOps.globaldatapath + dirname + '//' +
                              infer_dir + 'tmpfile.txt')
                    sysOps.throw_status('Directory ' + sysOps.globaldatapath +
                                        dirname + '//' + infer_dir +
                                        ' found already created.')
                except:
                    os.mkdir(sysOps.globaldatapath + dirname + '//' +
                             infer_dir)
                    sysOps.throw_status('Created directory ' +
                                        sysOps.globaldatapath + dirname +
                                        '//' + infer_dir)

            for dirname in dirnames:
                sysOps.initiate_runpath(
                    str(my_task[1]) + dirname + '//' + infer_dir)
                sysOps.initiate_statusfilename()
                sysOps.throw_status('Assigned path ' + sysOps.globaldatapath)

                if not (sysOps.check_file_exists(key_filename) and
                        sysOps.check_file_exists(imagemodule_input_filename)
                        and sysOps.check_file_exists(
                            'read_' + imagemodule_input_filename) and
                        sysOps.check_file_exists('seq_params_' +
                                                 imagemodule_input_filename)):

                    sysOps.throw_status('Calling matOps.generate_wmat()')

                    trg_dict = matOps.generate_wmat(consensus_pairing_csv_file,
                                                    read_thresh, min_uei_count,
                                                    wmat_outfilename)
                    sysOps.throw_status('Completed matOps.generate_wmat()')
                    matOps.print_imagemodule_input(trg_dict,
                                                   imagemodule_input_filename,
                                                   key_filename, output_dim)
                    #print_imagemodule_input outputs
                    #    1. File key_filename containing 3 columns: 0 or 1 (for beacon or target, respectively), cluster-index, MLE processing index
                    #    2. imagemodule_input_filename containing 4 columns: MLE processing index for beacon, MLE processing index for target, uei-count, max UEI read count
                    #    3. Summary file containing: Number of beacons inputted to MLE, number of targets inputted to MLE,
                else:
                    sysOps.throw_status(
                        'Image-module input pre-computed. Proceeding ...')

                #optimOps.test_ffgt()

                if sysOps.check_file_exists(imagemodule_input_filename):
                    if segment_infer:
                        optimOps.run_mle(
                            imagemodule_input_filename,
                            False,
                            False,
                            True,
                            compute_local_solutions_only,
                        )  # segmentation only
                    elif msmle_infer:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         True, False,
                                         compute_local_solutions_only)  # msMLE
                    elif smle_infer:
                        optimOps.run_mle(imagemodule_input_filename, True,
                                         False, False,
                                         compute_local_solutions_only)  # sMLE
                    else:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         False, False,
                                         compute_local_solutions_only)  # ptMLE

                    if not compute_local_solutions_only:
                        dnamicOps.print_final_results(
                            '..//trg_amplicon_calls.csv',
                            '..//trg_amplicon_calls.fasta')
                    else:
                        sysOps.exitProgram()
                else:
                    sysOps.throw_status('Could not locate ' +
                                        sysOps.globaldatapath +
                                        imagemodule_input_filename)

            sysOps.globaldatapath = str(original_datapath)
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task),
                                          time_start):
                sysOps.throw_exception('Task ' + str(my_task) +
                                       ' no longer exists in log ' +
                                       sysOps.globaldatapath + 'tasklog.csv' +
                                       ' -- exiting.')
                sysOps.exitProgram()

        return
Code example #19
0
def parse_seqform(parseable, amplicon_option=None):
    '''
    parse input from -seqform_for or -seqform_rev tag in settings file
    parseable must contain fields separated by '|' characters, each of the form X_position1:position2
    X is one of the following characters
    1. P -- primer
    2. S -- spacer
    3. A -- amplicon
    4. U -- uxi
    X's may be redundant (there may be multiple primers, spacers, and amplicons)
    If the form is X_N_position1:position2 (with a string between the 2 underscores), N represents a sequence to which the input is aligned and whose match-score is stored (N's in the case of a uxi)
    Final form of returned my_seqform dictionary entry is:
    Character1: [[[positionA1,positionA2],filter-sequence A (="" if none given)],[[positionB1,positionB2],filter-sequence B (="" if none given)]]
    '''
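    # Illustrative (hypothetical) input: parse_seqform('P_AGGTCA_0:6|U_NNNNNN_6:12|A_12:')
    # returns a dict with a primer entry spanning [0, 6], a uxi entry spanning [6, 12],
    # and an amplicon entry with an open right boundary ([12, None]).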
    my_seqform = dict()
    parseable = parseable.split("|")
    for this_parseable in parseable:
        my_elements = this_parseable.split("_")
        try:
            if (len(my_elements) < 3):
                my_char = my_elements[0].upper()
                seq = ""
                boundaries = my_elements[1].split(":")
            else:
                my_char = my_elements[0].upper()
                seq = my_elements[1]
                boundaries = my_elements[2].split(":")

            if (len(boundaries[0]) == 0):
                boundaries = [None, int(boundaries[1])]
            elif (len(boundaries[1]) == 0):
                boundaries = [int(boundaries[0]), None]
            else:
                boundaries = [int(boundaries[0]), int(boundaries[1])]
                if (boundaries[1] - boundaries[0] != len(seq)
                        and len(my_elements) == 3):
                    sysOps.throw_exception(
                        'Error: mismatch between filter boundary-indices and filter string-size, boundaries='
                        + str(boundaries) + ", seq=" + seq)

        except:
            print "Error parsing seqform " + this_parseable
            sysOps.throw_exception(["Error parsing seqform " + this_parseable])

        if my_char not in "PSAU":
            sysOps.throw_status([
                "Ignoring this_parseable=" + this_parseable +
                " -- unrecognized character-type."
            ])
        else:
            if my_char == "A" and type(amplicon_option) == str and type(
                    boundaries[1]) != int:
                start_pos = int(boundaries[0])
                for sub_seq in amplicon_option.split(','):
                    len_sub_seq = len(sub_seq)
                    seq_bool_vec = np.zeros(4 * len_sub_seq, dtype=np.bool_)
                    capital_bool_vec = np.zeros(4 * len_sub_seq,
                                                dtype=np.bool_)
                    ambig_vec = np.zeros(len_sub_seq, dtype=np.bool_)
                    ambig_seq_to_np(sub_seq, seq_bool_vec, capital_bool_vec,
                                    ambig_vec)
                    if my_char in my_seqform:
                        my_seqform[my_char].append(
                            [[start_pos, start_pos + len_sub_seq],
                             seq_bool_vec[:], capital_bool_vec, ambig_vec])
                    else:
                        my_seqform[my_char] = [[[
                            start_pos, start_pos + len_sub_seq
                        ], seq_bool_vec, capital_bool_vec, ambig_vec]]
                    start_pos += len_sub_seq
                # since original type(boundaries[1]) != int, re-set final boundaries[1] = None
                my_seqform[my_char][len(my_seqform[my_char]) - 1][0][1] = None
            else:
                seq_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
                capital_bool_vec = np.zeros(4 * len(seq), dtype=np.bool_)
                ambig_vec = np.zeros(len(seq), dtype=np.bool_)
                ambig_seq_to_np(seq, seq_bool_vec, capital_bool_vec, ambig_vec)
                if my_char in my_seqform:
                    my_seqform[my_char].append([
                        boundaries, seq_bool_vec, capital_bool_vec, ambig_vec
                    ])
                else:
                    my_seqform[my_char] = [[
                        boundaries, seq_bool_vec, capital_bool_vec, ambig_vec
                    ]]

    return my_seqform
Code example #20
0
File: main.py Project: richardzhu/dnamic
globaldatapath
'''
global statuslogfilename
global globaldatapath

if __name__ == '__main__':

    # Calls sub-routines

    #optimOps.test_ffgt()

    sys.argv[len(sys.argv) - 1] = sys.argv[len(sys.argv) - 1].strip('\r')
    sysOps.initiate_runpath('')
    sysOps.initiate_statusfilename('', make_file=False)
    sys.argv = sys.argv[1:]  #remove first argument (script call)
    sysOps.throw_status('sys.argv = ' + str(sys.argv))

    if len(sys.argv) > 0 and sys.argv[0][(len(sys.argv[0]) - 2):] == '//':
        #if first argument is a directory, use this directory as the data directory for all subsequent operations
        sysOps.initiate_runpath(sys.argv[0])  #initiate data run path
        sys.argv = sys.argv[1:]  #remove directory from argument list

    sysOps.globalmasterProcess = masterProcesses.masterProcess([])

    if len(sys.argv) == 0 or sys.argv[0] == 'data_layout.csv':
        sysOps.globalmasterProcess.generate_uxi_library()
    elif sys.argv[0].endswith('infer'):
        compute_local_solutions_only = False
        if len(sys.argv) > 1 and sys.argv[1] == 'local':
            sysOps.throw_status('Performing local computing function alone.')
            compute_local_solutions_only = True