Example #1
def threshold_cluster_uxi_prelinked(uxi_list, identical_uxi_filename, threshold, P=0, subsample=-1, prefix=''):
    
    # Function is called after linkage_file has been loaded into uxi_list by load_linkage_file_to_list(linkage_file) in hashAlignments.py
    # Format of linkage file (per line):
    #    uxi-sequence, self-read-number, RND: list of linked-to indices with self-index first in line
    # uxi_list elements: [uxi-sequence, self-read-number, RND, [list of linked-to indices with self-index first]]
            
    #sort uxi_list by decreasing RND
    num_uxi = len(uxi_list)
    sysOps.throw_status('Starting uxi list sort. List size = ' + str(num_uxi))
    sorted_uxi_list = sorted(uxi_list, key=lambda row: -row[2]) # note: sorted() returns a new list, but its rows are shared references with uxi_list
    index_vals = [-1 for i in range(num_uxi)]
    sysOps.throw_status('Completed uxi list sort. Assigning EASL-clusters ...')
        
    for sorted_uxi_el in sorted_uxi_list: 
        # index_vals entries, indexed by the _original_ (pre-sort) positions in uxi_list (stored in row[3]), are initialized to -1
        # a uxi accepted into the cluster seeded by index i is given value i in index_vals
        # a uxi left unassigned after this pass retains index -1 (flagged as an error below)
        if index_vals[sorted_uxi_el[3][0]] < 0: #if this seed has index -1 (has not been assigned to any seed itself)
            index_vals[sorted_uxi_el[3][0]] = int(sorted_uxi_el[3][0]) # set cluster seed to itself
            
        my_index_val = int(index_vals[sorted_uxi_el[3][0]])
        
        for i in range(1,len(sorted_uxi_el[3])):
            if index_vals[sorted_uxi_el[3][i]] < 0: #connected read is unassigned -- assign to current cluster seed
                index_vals[sorted_uxi_el[3][i]] = my_index_val

    sysOps.throw_status('Consolidating clustered uxis ...')
    #consolidate clustered uxi's
    
    if -1 in index_vals:
        sysOps.throw_exception('Error: UNASSIGNED/UNCLUSTERED uxis. Exiting program')
        sysOps.exitProgram()
        
    index_str_vals = [str(int(x)) for x in index_vals]
    new_uxi_dict = dict()
    
    for i in range(num_uxi):
        my_index_str = index_str_vals[i] 
        if my_index_str in new_uxi_dict:
            new_uxi_dict[my_index_str].append(uxi_list[i][0] + "_" + str(uxi_list[i][1]))
        else:
            new_uxi_dict[my_index_str] = [(uxi_list[i][0] + "_" + str(uxi_list[i][1]))]
            
    if subsample <= 0:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_" + identical_uxi_filename,'w')
    else:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_sub" + str(subsample) + identical_uxi_filename,'w')
    
    i = 0
    for dict_el in new_uxi_dict:
        for el in new_uxi_dict[dict_el]:
            new_uxi_handle.write(str(i) + "_" + el + "\n")     
        i += 1   
        
    new_uxi_handle.close()
    
    print "Completed clustering."
    
    return True
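A minimal illustration (not part of the pipeline) of the linkage-list format and the seed-assignment pass above, using a three-element toy list; sysOps logging and file output are omitted.

# Toy uxi_list rows: [uxi-sequence, self-read-number, RND, [self-index, linked-to indices...]]
toy_uxi_list = [
    ['AAAA', 10, 9.5, [0, 1, 2]],  # highest RND: becomes the cluster seed
    ['AAAT', 3, 2.1, [1, 0]],
    ['AATT', 2, 1.0, [2, 0]],
]
index_vals = [-1] * len(toy_uxi_list)
for row in sorted(toy_uxi_list, key=lambda r: -r[2]):
    if index_vals[row[3][0]] < 0:
        index_vals[row[3][0]] = row[3][0]  # row seeds its own cluster
    seed_index = index_vals[row[3][0]]
    for linked in row[3][1:]:
        if index_vals[linked] < 0:  # unassigned neighbors join the current seed's cluster
            index_vals[linked] = seed_index
print(index_vals)  # -> [0, 0, 0]: all three uxis collapse onto seed 0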
Example #2
def generate_wmat(consensus_pairing_csv_file, minreadcount, min_uei_count, outfilename = 'wmat.csv'):
    # consensus_pairing_csv_file has columns:
    #    uei index, beacon-umi index, target-umi index, read-count
    # if outfilename is None, no data is written to new files
    
    [bcn_dict,trg_dict,
     bcn_abund_dict,trg_abund_dict,
     bcn_div_dict,trg_div_dict] = get_umi_uei_matrices(consensus_pairing_csv_file, minreadcount)       
    if len(trg_dict)==0 or len(bcn_dict)==0:
        sysOps.throw_exception(consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    sysOps.throw_status(['Generating feature list.',sysOps.statuslogfilename])
    trg_feature_dict_list = get_features_from_dict(trg_dict) #collects salient pieces of information on targets for printing in file later
    [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict] = filter_mats(bcn_dict, trg_dict,
                                                                   bcn_div_dict, trg_div_dict, min_uei_count)

    sysOps.throw_status(['Replacing matrix elements with UEI numbers (scalars).',sysOps.statuslogfilename])
    del bcn_dict
    sysOps.throw_status(['Generating weight matrix.',sysOps.statuslogfilename])
    
    if len(trg_dict)==0:
        sysOps.throw_exception('After filtering, ' + consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    if outfilename is not None:
        print_features(trg_dict, 'trg_' + outfilename, trg_feature_dict_list)
    
    return trg_dict
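A rough sketch, based only on the status message 'Replacing matrix elements with UEI numbers (scalars)', of how the nested UMI dictionaries could be collapsed from UEI-index lists to scalar counts; collapse_to_counts is a hypothetical helper, not a pipeline function.

def collapse_to_counts(umi_dict):
    # umi_dict: {umi index: {partner-umi index: [UEI indices...]}}
    # returns the same nesting with each UEI list replaced by its length (a scalar UEI count)
    return dict((umi, dict((partner, len(ueis)) for partner, ueis in partners.items()))
                for umi, partners in umi_dict.items())

# collapse_to_counts({'b0': {'t0': [0, 1], 't1': [2]}}) -> {'b0': {'t0': 2, 't1': 1}}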
Example #3
    def generate_uxi_library(self):
        # Perform sequence analysis (read-parsing, clustering, pairing UEIs/UMIs, sub-sampling data for rarefaction analyses)
        
        if not sysOps.check_file_exists('uxi_lib_tasklist.csv'):
            # create task list for library processing
            [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'uxi_lib_tasklist.csv','w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('generate_uxi_library;' + sysOps.globaldatapath + subdir + '//\n')
                            
        original_datapath = str(sysOps.globaldatapath)
        [my_task,time_start] = parallelOps.get_next_open_task('tasklog.csv', 'uxi_lib_tasklist.csv', 'generate_uxi_library')
        if my_task is not None:

            sysOps.initiate_runpath(str(my_task[1]))
            myLibObj = libOps.libObj(settingsfilename = 'libsettings.txt', output_prefix = '_')
            if not sysOps.check_file_exists(myLibObj.output_prefix + 'lib_stats.txt'):
                myLibObj.partition_fastq_library(discarded_sequence_path = "discarded_sequences.fastq", mean_phred_score_path = "mean_phred_scores.txt")
            self.generate_cluster_analysis()
                
            libOps.subsample(myLibObj.seqform_for_params,myLibObj.seqform_rev_params, myLibObj.output_prefix)
            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list([subdirname for subdirname in subdirnames if subdirname.startswith('sub')])
            sysOps.throw_status('Performing cluster analysis on sub-directories: ' + str(dirnames))
            for dirname in dirnames:
                sysOps.initiate_runpath(str(my_task[1]) + dirname + '//')
                self.generate_cluster_analysis()
        
            sysOps.globaldatapath = str(original_datapath)   
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
                sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
                sysOps.exitProgram()
Example #4
def define_nuc_degeneracy(c1):
    c1 = c1.upper()
    if (c1 in 'ACGTU'):
        return [c1]
    elif (c1 == 'N'):
        return ['A', 'C', 'G', 'T']
    elif (c1 == 'W'):
        return ['A', 'T']
    elif (c1 == 'S'):
        return ['C', 'G']
    elif (c1 == 'M'):
        return ['A', 'C']
    elif (c1 == 'K'):
        return ['G', 'T']
    elif (c1 == 'R'):
        return ['A', 'G']
    elif (c1 == 'Y'):
        return ['C', 'T']
    elif (c1 == 'B'):
        return ['C', 'G', 'T']
    elif (c1 == 'D'):
        return ['A', 'G', 'T']
    elif (c1 == 'H'):
        return ['A', 'C', 'T']
    elif (c1 == 'V'):
        return ['A', 'C', 'G']
    else:
        sysOps.throw_exception([
            'Error: ' + c1 +
            ' does not code for a single- or degenerate-nucleotide'
        ])
        sysOps.exitProgram()
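An equivalent, table-driven sketch of the IUPAC expansion above (my own alternative, not the pipeline's function; the dictionary and helper names are hypothetical):

IUPAC_DEGENERACY = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'U': 'U',
                    'N': 'ACGT', 'W': 'AT', 'S': 'CG', 'M': 'AC', 'K': 'GT',
                    'R': 'AG', 'Y': 'CT', 'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG'}

def define_nuc_degeneracy_table(c1):
    # same mappings as the if/elif chain above, without the error branch
    return list(IUPAC_DEGENERACY[c1.upper()])

# define_nuc_degeneracy_table('R') -> ['A', 'G']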
Example #5
def rand_from_ambig_base(nt):
    nt_isupper = nt.isupper()

    if nt.upper() == 'N':
        chars = 'ACGT'
        return_char = chars[numpy.random.randint(4)]  # uniform draw from A/C/G/T
    elif nt.upper() == 'W':
        chars = 'AT'
        return_char = chars[numpy.random.randint(2)]
    elif nt.upper() == 'S':
        chars = 'CG'
        return_char = chars[numpy.random.randint(2)]
    else:
        print 'Error with base ' + nt
        sysOps.exitProgram()

    if not nt_isupper:
        return return_char.lower()

    return return_char
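A hedged generalization (my own sketch): sampling uniformly from any IUPAC code by reusing define_nuc_degeneracy from Example #4, while preserving the lower-case convention. rand_from_any_base is hypothetical and assumes define_nuc_degeneracy is importable in the same module.

import numpy

def rand_from_any_base(nt):
    bases = define_nuc_degeneracy(nt.upper())  # e.g. 'R' -> ['A', 'G']
    return_char = bases[numpy.random.randint(len(bases))]
    return return_char if nt.isupper() else return_char.lower()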
Example #6
def filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count):
    
    # prune UEI data to exclude UMIs with UEI counts < min_uei_count
    
    if len(bcn_dict) == 0:
        return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
    
    deletion_iteration = 0
    is_list = None
    
    sysOps.throw_status('Filtering matrices with ' + str(len(bcn_div_dict)) + '+' + str(len(trg_div_dict)) + ' UMIs.')
    
    while True:
        
        bcn_retained = 0
        trg_retained = 0
        bcn_deleted = list()
        trg_deleted = list()
        
        for bcn_el in bcn_div_dict:
            if bcn_div_dict[bcn_el]<min_uei_count:
                bcn_deleted.append(bcn_el)
            else:
                bcn_retained += 1
                
        for trg_el in trg_div_dict:
            if trg_div_dict[trg_el]<min_uei_count:
                trg_deleted.append(trg_el)
            else:
                trg_retained += 1
        
        # check whether bcn_dict/trg_dict entries are still lists of UEI indices or have already been collapsed to scalar counts
        if is_list is None:
            for bcn_el in bcn_dict:
                for trg_el in bcn_dict[bcn_el]:
                    is_list = (type(bcn_dict[bcn_el][trg_el]) is list)
                    break
                break
            
        if len(bcn_deleted)==0 and len(trg_deleted)==0:
            sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ', all retained.')
            break
            
        sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ' deleting ' + str(len(bcn_deleted)) + '+' + str(len(trg_deleted)) + ', retained ' + str(bcn_retained) + '+' + str(trg_retained) + '. is_list=' + str(is_list))
        
        if is_list is None:
            sysOps.throw_exception('Error, could not find any elements: len(bcn_dict) = ' + str(len(bcn_dict)))
            sysOps.exitProgram()
            
        for bcn_el in bcn_deleted:
            for trg_el in bcn_dict[bcn_el]:
                if is_list:
                    trg_div_dict[trg_el] -= len(trg_dict[trg_el][bcn_el])
                else:
                    trg_div_dict[trg_el] -= trg_dict[trg_el][bcn_el]
                del trg_dict[trg_el][bcn_el]
                
            del bcn_dict[bcn_el]
            del bcn_div_dict[bcn_el]
            
        for trg_el in trg_deleted:
            for bcn_el in trg_dict[trg_el]:
                if bcn_el in bcn_div_dict: #if not already deleted above
                    if is_list:
                        bcn_div_dict[bcn_el] -= len(bcn_dict[bcn_el][trg_el])
                    else:
                        bcn_div_dict[bcn_el] -= bcn_dict[bcn_el][trg_el]
                    del bcn_dict[bcn_el][trg_el]
                
            del trg_dict[trg_el]
            del trg_div_dict[trg_el]
                        
        deletion_iteration += 1
    
    #check for consistency
    for bcn_el in bcn_dict:
        for trg_el in bcn_dict[bcn_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
                
    for trg_el in trg_dict:
        for bcn_el in trg_dict[trg_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
               
    
    return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
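A toy trace of the pruning loop above, with the dictionary shapes as I read them from the code (bcn_dict[bcn][trg] and trg_dict[trg][bcn] hold lists of UEI indices; the *_div_dict entries hold each UMI's total UEI count):

bcn_dict = {'b0': {'t0': [0, 1], 't1': [2]}, 'b1': {'t1': [3]}}
trg_dict = {'t0': {'b0': [0, 1]}, 't1': {'b0': [2], 'b1': [3]}}
bcn_div_dict = {'b0': 3, 'b1': 1}
trg_div_dict = {'t0': 2, 't1': 2}
# filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count=2)
# Iteration 0: 'b1' falls below min_uei_count and is pruned, dropping 't1' to 1 UEI.
# Iteration 1: 't1' is pruned in turn, removing its remaining link to 'b0'.
# Iteration 2: everything is retained; only the b0-t0 pair (2 UEIs each) survives.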
Example #7
def assign_consensus_pairs(pairing_csv_file, min_pairing_readcount):
    '''
    Assumes CSV file with columns:
    1. UEI cluster-index
    2. Beacon UMI cluster-index
    3. Target UMI cluster-index
    4. Read-number
    5. Set-index
    '''

    sysOps.throw_status('Loading pairing file ' + pairing_csv_file + ' ...')
    uei_clust_index_dict = dict()

    with open(sysOps.globaldatapath + pairing_csv_file, 'rU') as csvfile:
        for line in csvfile:
            row = line.strip('\n').split(',')
            index_str = str(row[0])  #UEI cluster-index
            if index_str in uei_clust_index_dict:
                uei_clust_index_dict[index_str].append(
                    [int(row[1]),
                     int(row[2]),
                     int(row[3]),
                     int(row[4])]
                )  #append dictionary entry as list with row having indices of beacon- and target-umi clusters, the read-number, and the set-index (will all be 0 if invalid-amplicon reads are excluded)
            else:
                uei_clust_index_dict[index_str] = [[
                    int(row[1]),
                    int(row[2]),
                    int(row[3]),
                    int(row[4])
                ]]

    #replace each entry with umi pairing having plurality of reads, in same indexed format
    sysOps.throw_status('Generating consensus-pairs ...')
    discarded_ueis = 0
    accepted_ueis = 0
    for uei_clust_el in uei_clust_index_dict:
        maxcount = 0
        secondmaxcount = 0  #detect ties, discard if tie exists
        maxcount_pair_bcn_index = -1
        maxcount_pair_trg_index = -1
        maxcount_set_index = -1
        for row in uei_clust_index_dict[uei_clust_el]:
            if (row[2] >= min_pairing_readcount and row[2] > maxcount):
                secondmaxcount = int(maxcount)
                if maxcount_set_index >= 0 and maxcount_set_index != row[3]:
                    sysOps.throw_exception('Error: set-index mismatch.')
                    sysOps.exitProgram()
                maxcount_pair_bcn_index = int(row[0])
                maxcount_pair_trg_index = int(row[1])
                maxcount = int(row[2])
                maxcount_set_index = int(row[3])
            elif (row[2] >= min_pairing_readcount and row[2] > secondmaxcount):
                secondmaxcount = int(row[2])

        if maxcount >= min_pairing_readcount and maxcount > secondmaxcount:
            # note: this condition requires not only that the uei have at least min_pairing_readcount reads,
            # but also that the plurality tally itself be at least min_pairing_readcount
            uei_clust_index_dict[uei_clust_el] = list([
                int(maxcount_pair_bcn_index),
                int(maxcount_pair_trg_index),
                int(maxcount),
                int(maxcount_set_index)
            ])
            accepted_ueis += 1
        else:
            uei_clust_index_dict[uei_clust_el] = list()
            discarded_ueis += 1

    sysOps.throw_status('Outputting consensus-pairs with at least ' +
                        str(min_pairing_readcount) +
                        ' read-plurality. Accepted ' + str(accepted_ueis) +
                        ' UEIs, discarded ' + str(discarded_ueis) +
                        ' UEIs ...')
    #index outputted as uei-index, beacon-umi-index, target-umi-index, read-count
    outfile_handle = open(
        sysOps.globaldatapath + "consensus_" + str(min_pairing_readcount) +
        "r_" + pairing_csv_file, 'w')

    for uei_clust_el in uei_clust_index_dict:
        if len(uei_clust_index_dict[uei_clust_el]) > 0:
            outfile_handle.write(
                uei_clust_el + "," +
                ",".join([str(s)
                          for s in uei_clust_index_dict[uei_clust_el]]) + "\n")

    outfile_handle.close()

    return
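A worked example (toy numbers) of the plurality rule above, for a single UEI cluster with min_pairing_readcount = 2:

# Rows of [bcn_umi, trg_umi, read_count, set_index] for one UEI cluster:
#   [ 7, 21, 5, 0]   -> maxcount = 5 (pair bcn 7 / trg 21)
#   [ 7, 22, 3, 0]   -> secondmaxcount = 3
#   [ 8, 21, 1, 0]   -> below min_pairing_readcount, ignored
# Since 5 >= 2 and 5 > 3, the UEI is accepted as (bcn 7, trg 21, 5 reads, set 0).
# If the second row had also carried 5 reads, maxcount == secondmaxcount (a tie) and the UEI is discarded.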
Example #8
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file,
                         amp_seq_fasta, outfilename):
    #function will tally reads counted for each target umi across each amplicon-call, and return a csv file with the following columns:
    #(target umi cluster-index),(leading amplicon-call),(reads for leading amplicon-call),(total reads counted)

    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath +
                        trg_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)
    #outputs dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}

    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    try:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath +
                            amp_match_file)
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except:
        sysOps.throw_status(
            sysOps.globaldatapath + amp_match_file +
            ' not found. Alignments will occur from sequence-consensuses directly.'
        )
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath +
                                   'amplicon_refs.txt not found.')
            sysOps.exitProgram()

    trg_umi_dict = dict()
    trg_amp_seq_dict = dict()

    for trg_umi_record, amp_seq_record in itertools.izip(
            SeqIO.parse(trg_umi_handle, "fasta"),
            SeqIO.parse(amp_seq_handle, "fasta")):

        if not realign_amplicons:
            amp_match = int(amp_match_handle.readline().strip('\n'))
        else:
            amp_match = -1

        trg_umi_seq = str(trg_umi_record.seq)
        if trg_umi_seq in trg_umi_cluster_dict:
            trg_umi_index = str(
                trg_umi_cluster_dict[trg_umi_seq][0])  #uxi cluster-index
            if trg_umi_index in trg_umi_dict:
                if amp_match in trg_umi_dict[trg_umi_index]:
                    trg_umi_dict[trg_umi_index][
                        amp_match] += 1  #add 1, because every read is being entered
                else:
                    trg_umi_dict[trg_umi_index][amp_match] = 1
            else:
                trg_umi_dict[trg_umi_index] = dict()
                trg_amp_seq_dict[trg_umi_index] = baseTally()
                trg_umi_dict[trg_umi_index][amp_match] = 1

            trg_amp_seq_dict[trg_umi_index].add_record(str(amp_seq_record.seq),
                                                       1)

    trg_umi_handle.close()
    amp_seq_handle.close()
    if not realign_amplicons:
        amp_match_handle.close()

    csvfile = open(sysOps.globaldatapath + outfilename, 'w')
    fastafile = open(
        sysOps.globaldatapath + outfilename[:outfilename.rfind('.')] +
        '.fasta', 'w')
    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt',
                  'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse complementary orientation. We therefore reverse both complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])
        trg_umi_index_dict = dict()

    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    for trg_umi_index in trg_umi_dict:
        max_tally = 0
        tot_tally = 0

        for amp_match in trg_umi_dict[trg_umi_index]:

            my_tally = trg_umi_dict[trg_umi_index][amp_match]

            if my_tally >= max_tally:
                max_tally = int(my_tally)
                max_match = int(amp_match)

            tot_tally += int(my_tally)

        consensus_seq = str(
            trg_amp_seq_dict[trg_umi_index].get_str_consensus())

        if realign_amplicons:
            # perform direct, un-gapped alignment of consensus_seq to reference options to obtain max_match
            max_match = -1
            max_tally = -1  # exclude max_tally as count, since alignment is happening post-consensus
            min_mismatch_count = -1
            for i in range(len(ref_sequences)):
                all_subamplicons_pass = True
                start_index = 0
                tot_mismatches = 0
                for j in range(len(ref_sequences[i])
                               ):  # loop through sub-amplicon-sequences
                    ref_subamplicon_len = len(ref_sequences[i][j])
                    my_mismatches, minlen = alignOps.count_mismatches(
                        ref_sequences[i][j],
                        consensus_seq[start_index:(start_index +
                                                   ref_subamplicon_len)])
                    if minlen == 0:
                        all_subamplicons_pass = False
                        break
                    all_subamplicons_pass = all_subamplicons_pass and (
                        my_mismatches / float(minlen) <= max_mismatch_amplicon)
                    start_index += ref_subamplicon_len
                    tot_mismatches += my_mismatches
                if all_subamplicons_pass and (
                        max_match < 0 or tot_mismatches < min_mismatch_count):
                    # keep the reference with the fewest total mismatches
                    max_match = int(i)
                    min_mismatch_count = int(tot_mismatches)

        if max_match >= 0:
            csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                          str(max_tally) + "," + str(tot_tally) + "\n")
            fastafile.write(">" + trg_umi_index + '\n')
            fastafile.write(consensus_seq + '\n')
            if realign_amplicons:
                trg_umi_index_dict[trg_umi_index] = True
            accepted_consensus_sequences += 1
        else:
            inadmis_consensus_sequences += 1

    csvfile.close()
    fastafile.close()
    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                        str(accepted_consensus_sequences +
                            inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath +
                        outfilename + ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(
                sysOps.globaldatapath + consensus_filename,
                sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename,
                      'w') as new_consensus_file:
                with open(
                        sysOps.globaldatapath + 'unfiltered_' +
                        consensus_filename, 'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        consensus_list = old_consensus_file_line.strip(
                            '\n'
                        ).split(
                            ','
                        )  # [uei_index, bcn_umi_index, trg_umi_index, read_count, (additional variables)]
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' +
                                str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences +
                                    inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' +
                                sysOps.globaldatapath + consensus_filename +
                                ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
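A minimal sketch of the per-sub-amplicon acceptance test used in the realignment branch above; count_mismatches_stub is a local stand-in whose return values (mismatch count, compared length) are inferred from the alignOps.count_mismatches call site.

def count_mismatches_stub(ref, query):
    minlen = min(len(ref), len(query))
    mismatches = sum(1 for a, b in zip(ref[:minlen], query[:minlen]) if a != b)
    return mismatches, minlen

def passes_amplicon_filter(ref_subamplicons, consensus_seq, max_mismatch_amplicon):
    # ungapped, segment-by-segment comparison; every segment must stay at or below the mismatch fraction
    start = 0
    for ref in ref_subamplicons:
        mismatches, minlen = count_mismatches_stub(ref, consensus_seq[start:start + len(ref)])
        if minlen == 0 or mismatches / float(minlen) > max_mismatch_amplicon:
            return False
        start += len(ref)
    return True

# passes_amplicon_filter(['ACGT', 'TTAA'], 'ACGATTAA', 0.25) -> True (one mismatch in the first segment)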
Example #9
    def dnamic_inference(self,
                         smle_infer=False,
                         msmle_infer=False,
                         segment_infer=False,
                         compute_local_solutions_only=True):
        # Perform image inference on the basis of raw output of DNA microscopy sequence analysis

        # Basic settings
        read_thresh = 2
        min_uei_count = 2
        output_dim = 2
        version = 1.0
        infer_dir = ''

        # raw data files
        consensus_pairing_csv_file = "..//consensus_" + str(
            read_thresh) + "r_pairing_filter0.75_uei_umi.csv"
        outname = 'minuei' + str(min_uei_count) + 'DMv' + str(
            version) + '_' + str(read_thresh) + 'r_filter0.75'
        wmat_outfilename = 'wmat_' + outname + '.csv'
        param_name = 'minuei' + str(min_uei_count) + 'dim' + str(
            output_dim) + 'DMv' + str(version) + '_.csv'
        imagemodule_input_filename = 'data_' + param_name
        key_filename = 'key_' + param_name
        if not sysOps.check_file_exists('microscopy_tasklist.csv'):
            [subdirnames, filenames
             ] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'microscopy_tasklist.csv',
                      'w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('infer_smle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_msmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_segment;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_ptmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')

        original_datapath = str(sysOps.globaldatapath)
        if smle_infer:
            infer_dir = 'infer_smle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_smle')
        elif msmle_infer:
            infer_dir = 'infer_msmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_msmle')
        elif segment_infer:
            infer_dir = 'infer_segment//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_segment')
        else:
            infer_dir = 'infer_ptmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_ptmle')

        if my_task is not None:

            sysOps.initiate_runpath(str(my_task[1]))

            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list(["."])
            subdirnames_nodatayet = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    not sysOps.check_file_exists(subdirname + '//' +
                                                 imagemodule_input_filename))
            ]
            subdirnames_nodatayet = [
                subdirnames_nodatayet[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_nodatayet
                ]))
            ]  # sort by descending read count
            subdirnames_dataalready = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    sysOps.check_file_exists(subdirname + '//' +
                                             imagemodule_input_filename))
            ]
            subdirnames_dataalready = [
                subdirnames_dataalready[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_dataalready
                ]))
            ]  # sort by descending read count
            dirnames.extend(subdirnames_nodatayet)
            dirnames.extend(subdirnames_dataalready)
            sysOps.throw_status('Checking directories ' +
                                sysOps.globaldatapath + ' ... ' +
                                str(dirnames) + ' for infer-subdirectories.')
            for dirname in dirnames:  # make inference directories
                try:
                    with open(
                            sysOps.globaldatapath + dirname + '//' +
                            infer_dir + 'tmpfile.txt', 'w') as tmpfile:
                        tmpfile.write('test')
                    os.remove(sysOps.globaldatapath + dirname + '//' +
                              infer_dir + 'tmpfile.txt')
                    sysOps.throw_status('Directory ' + sysOps.globaldatapath +
                                        dirname + '//' + infer_dir +
                                        ' found already created.')
                except:
                    os.mkdir(sysOps.globaldatapath + dirname + '//' +
                             infer_dir)
                    sysOps.throw_status('Created directory ' +
                                        sysOps.globaldatapath + dirname +
                                        '//' + infer_dir)

            for dirname in dirnames:
                sysOps.initiate_runpath(
                    str(my_task[1]) + dirname + '//' + infer_dir)
                sysOps.initiate_statusfilename()
                sysOps.throw_status('Assigned path ' + sysOps.globaldatapath)

                if not (sysOps.check_file_exists(key_filename) and
                        sysOps.check_file_exists(imagemodule_input_filename)
                        and sysOps.check_file_exists(
                            'read_' + imagemodule_input_filename) and
                        sysOps.check_file_exists('seq_params_' +
                                                 imagemodule_input_filename)):

                    sysOps.throw_status('Calling matOps.generate_wmat()')

                    trg_dict = matOps.generate_wmat(consensus_pairing_csv_file,
                                                    read_thresh, min_uei_count,
                                                    wmat_outfilename)
                    sysOps.throw_status('Completed matOps.generate_wmat()')
                    matOps.print_imagemodule_input(trg_dict,
                                                   imagemodule_input_filename,
                                                   key_filename, output_dim)
                    # print_imagemodule_input outputs:
                    #    1. key_filename, containing 3 columns: 0 or 1 (beacon or target, respectively), cluster-index, MLE processing index
                    #    2. imagemodule_input_filename, containing columns: MLE processing index for beacon, MLE processing index for target, uei-count, max UEI read count
                    #    3. Summary file containing: number of beacons inputted to MLE, number of targets inputted to MLE,
                else:
                    sysOps.throw_status(
                        'Image-module input pre-computed. Proceeding ...')

                #optimOps.test_ffgt()

                if sysOps.check_file_exists(imagemodule_input_filename):
                    if segment_infer:
                        optimOps.run_mle(
                            imagemodule_input_filename,
                            False,
                            False,
                            True,
                            compute_local_solutions_only,
                        )  # segmentation only
                    elif msmle_infer:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         True, False,
                                         compute_local_solutions_only)  # msMLE
                    elif smle_infer:
                        optimOps.run_mle(imagemodule_input_filename, True,
                                         False, False,
                                         compute_local_solutions_only)  # sMLE
                    else:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         False, False,
                                         compute_local_solutions_only)  # ptMLE

                    if not compute_local_solutions_only:
                        dnamicOps.print_final_results(
                            '..//trg_amplicon_calls.csv',
                            '..//trg_amplicon_calls.fasta')
                    else:
                        sysOps.exitProgram()
                else:
                    sysOps.throw_status('Could not locate ' +
                                        sysOps.globaldatapath +
                                        imagemodule_input_filename)

            sysOps.globaldatapath = str(original_datapath)
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task),
                                          time_start):
                sysOps.throw_exception('Task ' + str(my_task) +
                                       ' no longer exists in log ' +
                                       sysOps.globaldatapath + 'tasklog.csv' +
                                       ' -- exiting.')
                sysOps.exitProgram()

        return
Example #10
    def crosscomparison_analysis(self, args):

        sysOps.initiate_statusfilename()
        list_of_dirs = list()

        file_to_compare = args[1]

        with open(sysOps.globaldatapath + args[2], 'rU') as csvfile:
            for myline in csvfile:
                thisline = myline.strip('\n').split(',')
                subdir = 'lib_' + str(thisline[0]) + '_' + str(
                    thisline[1]) + '_' + str(thisline[2])
                list_of_dirs.append(subdir)

        print "Beginning comparison analysis"
        print "File to compare = " + file_to_compare
        print "Directories = " + ",".join(list_of_dirs)

        try:
            os.mkdir(sysOps.globaldatapath + 'cross_comparisons')
        except:
            sysOps.throw_exception(
                'cross_comparisons directory already exists. Terminating comparison analysis.'
            )
            sysOps.exitProgram()

        shared_num_unique_matrix = list()
        unshared_num_unique_matrix = list()
        shared_read_abund_matrix = list()
        unshared_read_abund_matrix = list()

        for i in range(len(list_of_dirs)):
            shared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            shared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))

        for ind1 in range(len(list_of_dirs)):
            for ind2 in range(ind1):
                dir1 = list_of_dirs[ind1]
                dir2 = list_of_dirs[ind2]
                clustfile1 = dir1 + "//" + file_to_compare
                clustfile2 = dir2 + "//" + file_to_compare
                dir1_abbrev = dir1[(
                    dir1.rfind('/') + 1
                ):]  #remove superdirectory structure of path -- requires individual directories have unique names
                dir2_abbrev = dir2[(dir2.rfind('/') + 1):]
                sysOps.throw_status('Began writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                [
                    num_unique_shared, num_unique_unshared,
                    read_abundance_shared, read_abundance_unshared
                ] = alignOps.compare(
                    clustfile1, clustfile2,
                    dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare,
                    False)
                sysOps.throw_status('Completed writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                shared_num_unique_matrix[ind1][ind2] = num_unique_shared[0]
                shared_num_unique_matrix[ind2][ind1] = num_unique_shared[1]
                unshared_num_unique_matrix[ind1][ind2] = num_unique_unshared[0]
                unshared_num_unique_matrix[ind2][ind1] = num_unique_unshared[1]
                print str(num_unique_unshared[0]
                          ) + '-> unshared_num_unique_matrix[ ' + str(
                              ind1) + '][' + str(ind2) + ']'
                shared_read_abund_matrix[ind1][ind2] = read_abundance_shared[0]
                shared_read_abund_matrix[ind2][ind1] = read_abundance_shared[1]
                unshared_read_abund_matrix[ind1][
                    ind2] = read_abundance_unshared[0]
                unshared_read_abund_matrix[ind2][
                    ind1] = read_abundance_unshared[1]

        print shared_num_unique_matrix
        print unshared_num_unique_matrix
        print shared_read_abund_matrix
        print unshared_read_abund_matrix

        with open(sysOps.globaldatapath + 'comparison_matrices.csv', 'w') as compare_matrix_file:
            for i1 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_num_unique_matrix[i1]]) + '\n')

            for i2 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_num_unique_matrix[i2]]) + '\n')

            for i3 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_read_abund_matrix[i3]]) + '\n')

            for i4 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_read_abund_matrix[i4]]) + '\n')
Example #11
    def sim_reads(self):
        simLibObj = libOps.libObj(settingsfilename='libsettings.txt',
                                  output_prefix='_')
        enforced_rev_read_len = 100
        [for_read_len, rev_read_len] = simLibObj.get_min_allowed_readlens(
            simLibObj.filter_amplicon_window)
        rev_read_len = int(enforced_rev_read_len)
        '''
        simLibObj.seqform_for_params and simLibObj.seqform_rev_params are already stored in current object's memory
        Form of these variables is a list of the following:
            Element 1: [start_pos,end_pos]
            Element 2: np.ndarray(seq_bool_vec, dtype=np.bool_)
            Element 3: np.ndarray(capital_bool_vec, dtype=np.bool_)
            Element 4: np.ndarray(ambig_vec, dtype=np.bool_)
        '''
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()

        for_umi_seqs = list()
        rev_umi_seqs = list()
        rev_umi_amplicon_list = list()
        uei_seqs = list()
        base_order = 'ACGT'

        sysOps.throw_status('Generating simulated sequences ...')
        amplicon_list = list()
        if "-amplicon" in simLibObj.mySettings:
            amplicon_list = [
                simLibObj.mySettings["-amplicon"][i].upper().split(',')
                for i in range(len(simLibObj.mySettings["-amplicon"]))
            ]

        for for_umi_i in range(self.Nbcn):
            for_param_index = np.random.randint(
                len(simLibObj.seqform_for_params))
            if len(simLibObj.seqform_for_params[for_param_index]) > 1:
                sysOps.throw_exception(
                    'Error: len(simLibObj.seqform_for_params[for_param_index]) = '
                    + str(len(simLibObj.seqform_for_params[for_param_index])))
                sysOps.exitProgram()
            my_for_umi_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_for_umi_param[0]
            seq_bool_vec = my_for_umi_param[1]
            my_for_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            for_umi_seqs.append([int(for_param_index), str(my_for_umi)])

        for for_uei_i in range(self.Nuei):
            for_param_index = 0  # there should be no difference across UMI's
            my_for_uei_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][1]
            [start_pos, end_pos] = my_for_uei_param[0]
            seq_bool_vec = my_for_uei_param[1]
            my_for_uei = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_uei += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            uei_seqs.append(str(my_for_uei))

        for rev_umi_i in range(self.Ntrg):
            rev_param_index = np.random.randint(
                len(simLibObj.seqform_rev_params))
            my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_rev_umi_param[0]
            seq_bool_vec = my_rev_umi_param[1]
            my_rev_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_rev_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            if len(amplicon_list) == 0:
                encoded_amplicon = str('')
            else:
                this_gsp_primer_amplicon_pair = list(
                    amplicon_list[np.random.randint(len(amplicon_list))]
                )  # already properly oriented
                # generate single error on amplicon
                lenamp = len(this_gsp_primer_amplicon_pair[1])
                rand_loc = np.random.randint(lenamp)
                this_gsp_primer_amplicon_pair[1] = str(
                    this_gsp_primer_amplicon_pair[1][:rand_loc] +
                    base_order[np.random.randint(4)] +
                    this_gsp_primer_amplicon_pair[1][(rand_loc + 1):])
                encoded_amplicon = ''.join(this_gsp_primer_amplicon_pair)

            tmp_umi_index = float(rev_umi_i)

            if tmp_umi_index == 0:
                encoded_amplicon += base_order[0]
            else:
                for myexponent in range(
                        int(np.floor(np.log(tmp_umi_index) / np.log(4.0))), -1,
                        -1):
                    mydigit = np.floor(tmp_umi_index /
                                       np.power(4.0, myexponent))
                    encoded_amplicon += base_order[int(mydigit)]
                    tmp_umi_index -= mydigit * np.power(4.0, myexponent)

            rev_umi_seqs.append(
                [int(rev_param_index),
                 str(my_rev_umi),
                 str(encoded_amplicon)])

        sysOps.throw_status('Writing simulated reads ...')

        for filename in filenames:
            if filename.endswith('_sim_ueifile.csv'):
                ueifile = np.int64(
                    np.loadtxt(sysOps.globaldatapath + filename,
                               delimiter=','))
                newdirname = filename[:filename.find('_')]
                read_list = list()
                for i in range(ueifile.shape[0]):
                    for myread in range(ueifile[i, 3]):
                        read_list.append(np.array([ueifile[i, :3]]))
                read_list = np.concatenate(
                    read_list, axis=0
                )  # re-write array so that there is now one row per read
                # randomly permute:
                read_list = read_list[
                    np.random.permutation(read_list.shape[0]), :]

                for_chararray = np.chararray((for_read_len))
                rev_chararray = np.chararray((rev_read_len))
                for_fastq_outfile = open(newdirname + '_for.fastq', "w")
                rev_fastq_outfile = open(newdirname + '_rev.fastq', "w")
                for i in range(read_list.shape[0]):
                    for_param_index = for_umi_seqs[read_list[i, 1]][0]
                    for_umi_seq = for_umi_seqs[read_list[i, 1]][1]
                    rev_param_index = rev_umi_seqs[read_list[i, 2]][
                        0]  # both beacon and target indices are, at this point, independently indexed from 0
                    rev_umi_seq = rev_umi_seqs[read_list[i, 2]][1]
                    rev_amp_seq = rev_umi_seqs[read_list[i, 2]][2]
                    uei_seq = uei_seqs[read_list[i, 0]]

                    for j in range(for_read_len):
                        for_chararray[j] = 'N'
                    for j in range(rev_read_len):
                        rev_chararray[j] = 'N'

                    my_for_umi_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_for_umi_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = for_umi_seq[j]

                    my_for_uei_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][1]
                    [start_pos, end_pos] = my_for_uei_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = uei_seq[j]

                    for my_for_param in simLibObj.seqform_for_params[
                            for_param_index][0]['P']:
                        [start_pos, end_pos] = my_for_param[0]
                        for j in range(end_pos - start_pos):
                            for_chararray[j + start_pos] = base_order[np.where(
                                my_for_param[1][(4 * j):(4 * (j + 1))])[0][0]]

                    my_rev_umi_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_rev_umi_param[0]
                    for j in range(end_pos - start_pos):
                        rev_chararray[j + start_pos] = rev_umi_seq[j]
                    my_rev_amp_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['A'][0]
                    start_pos = my_rev_amp_param[0][0]
                    for j in range(len(rev_amp_seq)):
                        rev_chararray[j + start_pos] = rev_amp_seq[j]

                    if 'P' in simLibObj.seqform_rev_params[rev_param_index][0]:
                        for my_rev_param in simLibObj.seqform_rev_params[
                                rev_param_index][0]['P']:
                            [start_pos, end_pos] = my_rev_param[0]
                            for j in range(end_pos - start_pos):
                                rev_chararray[j +
                                              start_pos] = base_order[np.where(
                                                  my_rev_param[1][(4 * j):(
                                                      4 * (j + 1))])[0][0]]

                    for_record = SeqIO.SeqRecord(
                        Seq.Seq(for_chararray.tostring()))
                    for_record.id = '-' + str(i) + '-' + str(read_list[i, 1])
                    for_record.description = ''
                    for_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(for_read_len)])
                    rev_record = SeqIO.SeqRecord(
                        Seq.Seq(rev_chararray.tostring()))
                    rev_record.id = '-' + str(i) + '-' + str(read_list[i, 2])
                    rev_record.description = ''
                    rev_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(rev_read_len)])
                    SeqIO.write(for_record, for_fastq_outfile, "fastq")
                    SeqIO.write(rev_record, rev_fastq_outfile, "fastq")

                for_fastq_outfile.close()
                rev_fastq_outfile.close()
                os.mkdir(newdirname)
                with open('libsettings.txt', 'rU') as oldsettingsfile:
                    with open(newdirname + '//libsettings.txt',
                              'w') as newsettingsfile:
                        for oldsettings_row in oldsettingsfile:
                            if oldsettings_row.startswith('-source_for'):
                                newsettingsfile.write('-source_for ..//' +
                                                      newdirname +
                                                      '_for.fastq\n')
                            elif oldsettings_row.startswith('-source_rev'):
                                newsettingsfile.write('-source_rev ..//' +
                                                      newdirname +
                                                      '_rev.fastq\n')
                            else:
                                newsettingsfile.write(oldsettings_row)

        sysOps.throw_status('Done.')
        return
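A standalone version (my own sketch) of the base-4 encoding used above to embed the target-UMI index in the simulated amplicon; digits are emitted most-significant first, and index 0 encodes as a single 'A'.

import numpy as np

def encode_index_base4(index, base_order='ACGT'):
    if index == 0:
        return base_order[0]
    encoded = ''
    remaining = float(index)
    for exponent in range(int(np.floor(np.log(remaining) / np.log(4.0))), -1, -1):
        digit = np.floor(remaining / np.power(4.0, exponent))
        encoded += base_order[int(digit)]
        remaining -= digit * np.power(4.0, exponent)
    return encoded

# encode_index_base4(11) -> 'GT' (11 = 2*4 + 3)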
Example #12
def compare_identical(idfile1, idfile2, comparison_file_name, rev_comp):
    #rev_comp = True/False depending on need of reverse-complement being taken
    print "Beginning comparison between " + idfile1 + " and " + idfile2

    uxi_handle1 = open(sysOps.globaldatapath + idfile1, 'rU')
    uxi_dict1 = dict()
    len_uxi1 = -1
    uxi_index = 0
    for uxi_line1 in uxi_handle1:
        split_str = uxi_line1.strip('\n').split('_')
        if (len(split_str) == 3):

            my_uxi = split_str[0]
            if len_uxi1 < 0:
                len_uxi1 = len(my_uxi)
            elif len_uxi1 != len(my_uxi):
                print 'Error: uxi length-mismatch'
                sysOps.exitProgram()

            my_numreads = int(split_str[2])
            uxi_dict1[my_uxi] = [uxi_index, my_numreads, False
                                 ]  #final entry corresponds to being shared
            uxi_index += 1

    uxi_handle1.close()

    uxi_handle2 = open(sysOps.globaldatapath + idfile2, 'rU')
    uxi_dict2 = dict()
    len_uxi2 = -1
    uxi_index = 0
    comparison_handle = open(sysOps.globaldatapath + comparison_file_name, 'w')
    for uxi_line2 in uxi_handle2:
        split_str = uxi_line2.strip('\n').split('_')
        if (len(split_str) == 3):

            my_uxi = split_str[0]
            if len_uxi2 < 0:
                len_uxi2 = len(my_uxi)
                if len_uxi1 != len_uxi2:
                    print 'Error: uxi1/uxi2 length-mismatch'
                    sysOps.exitProgram()

            my_numreads = int(split_str[2])
            uxi_dict2[my_uxi] = [uxi_index, my_numreads, False]
            this_uxi = str(my_uxi)
            if rev_comp:
                this_uxi = str(Seq.Seq(this_uxi).reverse_complement())

            if this_uxi in uxi_dict1:
                print "Found match " + this_uxi
                uxi_dict1[this_uxi][2] = True
                uxi_dict2[my_uxi][2] = True
                comparison_handle.write(this_uxi + "," +
                                        str(uxi_dict1[this_uxi][0]) + "," +
                                        str(uxi_dict1[this_uxi][1]) + "," +
                                        str(uxi_index) + "," +
                                        str(my_numreads) + "\n")

            uxi_index += 1

    comparison_handle.close()

    unshared_handle = open(
        sysOps.globaldatapath + "unshared_" + comparison_file_name, 'w')
    for dict_el in uxi_dict1:
        if not uxi_dict1[dict_el][2]:
            unshared_handle.write(dict_el + ",0," +
                                  str(uxi_dict1[dict_el][0]) + "," +
                                  str(uxi_dict1[dict_el][1]) + "\n")

    for dict_el in uxi_dict2:
        if not uxi_dict2[dict_el][2]:
            unshared_handle.write(dict_el + ",1," +
                                  str(uxi_dict2[dict_el][0]) + "," +
                                  str(uxi_dict2[dict_el][1]) + "\n")
    unshared_handle.close()

    return True
Example #13
def compare(clustfile1,
            clustfile2,
            comparison_file_name,
            rev_comp,
            read_thresh=2,
            filter_substr_list=[],
            filter_val=0.75):
    #rev_comp = True/False depending on need of reverse-complement being taken
    #filter_val = maximum fraction of bases in uxi allowed to be the same

    #all filtering of legitimate comparison occurs here, at the front end
    print "Beginning comparison between " + clustfile1 + " and " + clustfile2

    #Stage 1 of comparison: determine total read-abundance of clusters in clustfile1 and clustfile2,
    #assign to abund_dict1 and abund_dict2

    abund_dict1 = dict()
    with open(sysOps.globaldatapath + clustfile1, 'rU') as clust1_handle:
        for clust_line in clust1_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = my_el[0]
                my_numreads = int(my_el[2])
                if uxi_index not in abund_dict1:
                    abund_dict1[uxi_index] = {
                        'reads': my_numreads,
                        'is_shared': False
                    }
                else:
                    abund_dict1[uxi_index]['reads'] += my_numreads

    abund_dict2 = dict()
    with open(sysOps.globaldatapath + clustfile2, 'rU') as clust2_handle:
        for clust_line in clust2_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = my_el[0]
                my_numreads = int(my_el[2])
                if uxi_index not in abund_dict2:
                    abund_dict2[uxi_index] = {
                        'reads': my_numreads,
                        'is_shared': False
                    }
                else:
                    abund_dict2[uxi_index]['reads'] += my_numreads

    # Stage 2 of comparison: enter actual uxi sequences into dict_clust1 and dict_clust2,
    # storing each uxi's cluster-index and read-number (plus a shared-flag) as its dictionary value

    dict_clust1 = dict()
    with open(sysOps.globaldatapath + clustfile1, 'rU') as clust1_handle:
        for clust_line in clust1_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = int(my_el[0])
                this_uxi = str(my_el[1])
                my_numreads = int(my_el[2])
                has_disallowed_substr = [
                    my_substr in this_uxi for my_substr in filter_substr_list
                ]
                if abund_dict1[my_el[0]]['reads'] >= read_thresh and (
                        True not in has_disallowed_substr) and max(
                            numpy.bincount([('ACGT').index(s) for s in this_uxi
                                            ])) <= filter_val * len(this_uxi):
                    dict_clust1[this_uxi] = [
                        uxi_index, my_numreads, False
                    ]  #final entry corresponds to being shared

    print "Completed first cluster-file input. Second cluster-file being read, output to cross_comparisons//" + comparison_file_name

    comparison_handle = open(
        sysOps.globaldatapath + 'cross_comparisons//' + comparison_file_name,
        'w')

    dict_clust2 = dict()
    with open(sysOps.globaldatapath + clustfile2, 'rU') as clust2_handle:
        for clust_line in clust2_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = int(my_el[0])  #references clustfile2
                #my_uxi references clustfile2 uxi sequences
                #this_uxi references clustfile1 uxi sequences
                my_uxi = str(my_el[1])
                my_numreads = int(my_el[2])
                this_uxi = str(my_uxi)
                if (rev_comp):
                    this_uxi = str(Seq.Seq(this_uxi).reverse_complement())
                has_disallowed_substr = [
                    my_substr in this_uxi for my_substr in filter_substr_list
                ]

                if abund_dict2[my_el[0]]['reads'] >= read_thresh and (
                        True not in has_disallowed_substr) and max(
                            numpy.bincount([('ACGT').index(s) for s in this_uxi
                                            ])) <= filter_val * len(this_uxi):
                    dict_clust2[my_uxi] = [uxi_index, my_numreads, False]
                    if this_uxi in dict_clust1:
                        dict_clust1[this_uxi][2] = True
                        dict_clust2[my_uxi][2] = True
                        if str(dict_clust1[this_uxi][0]) not in abund_dict1:
                            sysOps.throw_exception(
                                'A: ' + str(dict_clust1[this_uxi][0]) +
                                ' not in abund_dict1')
                            sysOps.exitProgram()
                        if str(uxi_index) not in abund_dict2:
                            sysOps.throw_exception('B: ' + str(uxi_index) +
                                                   ' not in abund_dict2')
                            sysOps.exitProgram()
                            sysOps.exitProgram()

                        abund_dict1[str(
                            dict_clust1[this_uxi][0])]['is_shared'] = True
                        abund_dict2[str(uxi_index)]['is_shared'] = True

                        comparison_handle.write(
                            str(this_uxi) + "," +
                            str(dict_clust1[this_uxi][0]) + "," +
                            str(dict_clust1[this_uxi][1]) + "," +
                            str(abund_dict1[str(dict_clust1[this_uxi][0])]
                                ['reads']) + "," +
                            str(dict_clust2[my_uxi][0]) + "," +
                            str(dict_clust2[my_uxi][1]) + "," +
                            str(abund_dict2[str(dict_clust2[my_uxi][0])]
                                ['reads']) + "\n")

    comparison_handle.close()

    #count number unique shared and unique unshared
    num_unique_shared = [0, 0]
    num_unique_unshared = [0, 0]
    read_abundance_shared = [0, 0]
    read_abundance_unshared = [0, 0]

    for uxi_index1 in abund_dict1:
        if abund_dict1[uxi_index1]['is_shared']:
            num_unique_shared[0] += 1
            read_abundance_shared[0] += abund_dict1[uxi_index1]['reads']
        else:
            num_unique_unshared[0] += 1
            read_abundance_unshared[0] += abund_dict1[uxi_index1]['reads']

    for uxi_index2 in abund_dict2:
        if abund_dict2[uxi_index2]['is_shared']:
            num_unique_shared[1] += 1
            read_abundance_shared[1] += abund_dict2[uxi_index2]['reads']
        else:
            num_unique_unshared[1] += 1
            read_abundance_unshared[1] += abund_dict2[uxi_index2]['reads']

    return [
        num_unique_shared, num_unique_unshared, read_abundance_shared,
        read_abundance_unshared
    ]
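A small worked example of the base-composition filter applied in both cluster-file passes above: a uxi is kept only if its most frequent base accounts for at most filter_val of its length. passes_base_fraction_filter is a hypothetical wrapper around the same expression used in the code.

import numpy

def passes_base_fraction_filter(uxi, filter_val=0.75):
    return max(numpy.bincount([('ACGT').index(s) for s in uxi])) <= filter_val * len(uxi)

# passes_base_fraction_filter('AAAAAAAT') -> False (7/8 of bases are 'A', above 0.75)
# passes_base_fraction_filter('AACGTACG') -> True  (most frequent base is 3/8 of the length)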