Example #1
def threshold_cluster_uxi_prelinked(uxi_list, identical_uxi_filename, threshold, P=0, subsample=-1, prefix=''):
    
    # Function is called after linkage_file has been loaded into uxi_list by load_linkage_file_to_list(linkage_file) in hashAlignments.py
    # Format of linkage file (per line):
    #    uxi-sequence, self-read-number, RND: list of linked-to indices with self-index first in line
    # uxi_list elements: [uxi-sequence, self-read-number, RND, [list of linked-to indices with self-index first]]
            
    #sort uxi_list by decreasing RND
    num_uxi = len(uxi_list)
    sysOps.throw_status('Starting uxi list sort. List size = ' + str(num_uxi))
    sorted_uxi_list = sorted(uxi_list, key=lambda row: -row[2]) # note: sorted() returns a new list, but its rows are shared references with uxi_list
    index_vals = [-1 for i in range(num_uxi)]
    sysOps.throw_status('Completed uxi list sort. Assigning EASL-clusters ...')
        
    for sorted_uxi_el in sorted_uxi_list: 
        # index_vals entries, indexed by the _original_ (pre-sort) positions in uxi_list (stored in row[3]), are initialized to -1
        # a uxi accepted into the cluster seeded by index i is given value i in index_vals
        # a uxi left unassigned after this pass retains index -1 (flagged as an error below)
        if index_vals[sorted_uxi_el[3][0]] < 0: #if this seed has index -1 (has not been assigned to any seed itself)
            index_vals[sorted_uxi_el[3][0]] = int(sorted_uxi_el[3][0]) # set cluster seed to itself
            
        my_index_val = int(index_vals[sorted_uxi_el[3][0]])
        
        for i in range(1,len(sorted_uxi_el[3])):
            if index_vals[sorted_uxi_el[3][i]] < 0: #connected read is unassigned -- assign to current cluster seed
                index_vals[sorted_uxi_el[3][i]] = my_index_val

    sysOps.throw_status('Consolidating clustered uxis ...')
    #consolidate clustered uxi's
    
    if -1 in index_vals:
        sysOps.throw_exception('Error: UNASSIGNED/UNCLUSTERED uxis. Exiting program')
        sysOps.exitProgram()
        
    index_str_vals = [str(int(x)) for x in index_vals]
    new_uxi_dict = dict()
    
    for i in range(num_uxi):
        my_index_str = index_str_vals[i] 
        if my_index_str in new_uxi_dict:
            new_uxi_dict[my_index_str].append(uxi_list[i][0] + "_" + str(uxi_list[i][1]))
        else:
            new_uxi_dict[my_index_str] = [(uxi_list[i][0] + "_" + str(uxi_list[i][1]))]
            
    if subsample <= 0:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_" + identical_uxi_filename,'w')
    else:
        new_uxi_handle = open(sysOps.globaldatapath + prefix + "thresh" + str(threshold) + "_sub" + str(subsample) + identical_uxi_filename,'w')
    
    i = 0
    for dict_el in new_uxi_dict:
        for el in new_uxi_dict[dict_el]:
            new_uxi_handle.write(str(i) + "_" + el + "\n")     
        i += 1   
        
    new_uxi_handle.close()
    
    print "Completed clustering."
    
    return True
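A minimal illustration (not part of the pipeline) of the linkage-list format and the seed-assignment pass above, using a three-element toy list; sysOps logging and file output are omitted.

# Toy uxi_list rows: [uxi-sequence, self-read-number, RND, [self-index, linked-to indices...]]
toy_uxi_list = [
    ['AAAA', 10, 9.5, [0, 1, 2]],  # highest RND: becomes the cluster seed
    ['AAAT', 3, 2.1, [1, 0]],
    ['AATT', 2, 1.0, [2, 0]],
]
index_vals = [-1] * len(toy_uxi_list)
for row in sorted(toy_uxi_list, key=lambda r: -r[2]):
    if index_vals[row[3][0]] < 0:
        index_vals[row[3][0]] = row[3][0]  # row seeds its own cluster
    seed_index = index_vals[row[3][0]]
    for linked in row[3][1:]:
        if index_vals[linked] < 0:  # unassigned neighbors join the current seed's cluster
            index_vals[linked] = seed_index
print(index_vals)  # -> [0, 0, 0]: all three uxis collapse onto seed 0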
Example #2
def generate_wmat(consensus_pairing_csv_file, minreadcount, min_uei_count, outfilename = 'wmat.csv'):
    # consensus_pairing_csv_file has columns:
    #    uei index, beacon-umi index, target-umi index, read-count
    # if outfilename is None, no data is written to new files
    
    [bcn_dict,trg_dict,
     bcn_abund_dict,trg_abund_dict,
     bcn_div_dict,trg_div_dict] = get_umi_uei_matrices(consensus_pairing_csv_file, minreadcount)       
    if len(trg_dict)==0 or len(bcn_dict)==0:
        sysOps.throw_exception(consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    sysOps.throw_status(['Generating feature list.',sysOps.statuslogfilename])
    trg_feature_dict_list = get_features_from_dict(trg_dict) #collects salient pieces of information on targets for printing in file later
    [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict] = filter_mats(bcn_dict, trg_dict,
                                                                   bcn_div_dict, trg_div_dict, min_uei_count)

    sysOps.throw_status(['Replacing matrix elements with UEI numbers (scalars).',sysOps.statuslogfilename])
    del bcn_dict
    sysOps.throw_status(['Generating weight matrix.',sysOps.statuslogfilename])
    
    if len(trg_dict)==0:
        sysOps.throw_exception('After filtering, ' + consensus_pairing_csv_file + ' generated an empty UEI matrix.')
        sysOps.exitProgram()
    
    if outfilename is not None:
        print_features(trg_dict, 'trg_' + outfilename, trg_feature_dict_list)
    
    return trg_dict
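A rough sketch, based only on the status message 'Replacing matrix elements with UEI numbers (scalars)', of how the nested UMI dictionaries could be collapsed from UEI-index lists to scalar counts; collapse_to_counts is a hypothetical helper, not a pipeline function.

def collapse_to_counts(umi_dict):
    # umi_dict: {umi index: {partner-umi index: [UEI indices...]}}
    # returns the same nesting with each UEI list replaced by its length (a scalar UEI count)
    return dict((umi, dict((partner, len(ueis)) for partner, ueis in partners.items()))
                for umi, partners in umi_dict.items())

# collapse_to_counts({'b0': {'t0': [0, 1], 't1': [2]}}) -> {'b0': {'t0': 2, 't1': 1}}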
Example #3
    def generate_uxi_library(self):
        # Perform sequence analysis (read-parsing, clustering, pairing UEIs/UMIs, sub-sampling data for rarefaction analyses)
        
        if not sysOps.check_file_exists('uxi_lib_tasklist.csv'):
            # create task list for library processing
            [subdirnames, filenames] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'uxi_lib_tasklist.csv','w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('generate_uxi_library;' + sysOps.globaldatapath + subdir + '//\n')
                            
        original_datapath = str(sysOps.globaldatapath)
        [my_task,time_start] = parallelOps.get_next_open_task('tasklog.csv', 'uxi_lib_tasklist.csv', 'generate_uxi_library')
        if my_task is not None:

            sysOps.initiate_runpath(str(my_task[1]))
            myLibObj = libOps.libObj(settingsfilename = 'libsettings.txt', output_prefix = '_')
            if not sysOps.check_file_exists(myLibObj.output_prefix + 'lib_stats.txt'):
                myLibObj.partition_fastq_library(discarded_sequence_path = "discarded_sequences.fastq", mean_phred_score_path = "mean_phred_scores.txt")
            self.generate_cluster_analysis()
                
            libOps.subsample(myLibObj.seqform_for_params,myLibObj.seqform_rev_params, myLibObj.output_prefix)
            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list([subdirname for subdirname in subdirnames if subdirname.startswith('sub')])
            sysOps.throw_status('Performing cluster analysis on sub-directories: ' + str(dirnames))
            for dirname in dirnames:
                sysOps.initiate_runpath(str(my_task[1]) + dirname + '//')
                self.generate_cluster_analysis()
        
            sysOps.globaldatapath = str(original_datapath)   
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task), time_start):
                sysOps.throw_exception('Task ' + str(my_task) + ' no longer exists in log ' + sysOps.globaldatapath + 'tasklog.csv' + ' -- exiting.')
                sysOps.exitProgram()
Example #4
def define_nuc_degeneracy(c1):
    c1 = c1.upper()
    if (c1 in 'ACGTU'):
        return [c1]
    elif (c1 == 'N'):
        return ['A', 'C', 'G', 'T']
    elif (c1 == 'W'):
        return ['A', 'T']
    elif (c1 == 'S'):
        return ['C', 'G']
    elif (c1 == 'M'):
        return ['A', 'C']
    elif (c1 == 'K'):
        return ['G', 'T']
    elif (c1 == 'R'):
        return ['A', 'G']
    elif (c1 == 'Y'):
        return ['C', 'T']
    elif (c1 == 'B'):
        return ['C', 'G', 'T']
    elif (c1 == 'D'):
        return ['A', 'G', 'T']
    elif (c1 == 'H'):
        return ['A', 'C', 'T']
    elif (c1 == 'V'):
        return ['A', 'C', 'G']
    else:
        sysOps.throw_exception([
            'Error: ' + c1 +
            ' does not code for a single- or degenerate-nucleotide'
        ])
        sysOps.exitProgram()
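An equivalent, table-driven sketch of the IUPAC expansion above (my own alternative, not the pipeline's function; the dictionary and helper names are hypothetical):

IUPAC_DEGENERACY = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T', 'U': 'U',
                    'N': 'ACGT', 'W': 'AT', 'S': 'CG', 'M': 'AC', 'K': 'GT',
                    'R': 'AG', 'Y': 'CT', 'B': 'CGT', 'D': 'AGT', 'H': 'ACT', 'V': 'ACG'}

def define_nuc_degeneracy_table(c1):
    # same mappings as the if/elif chain above, without the error branch
    return list(IUPAC_DEGENERACY[c1.upper()])

# define_nuc_degeneracy_table('R') -> ['A', 'G']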
Example #5
def rand_from_ambig_base(nt):
    nt_isupper = nt.isupper()

    if nt.upper() == 'N':
        chars = 'ACGT'
        return_char = chars[numpy.random.randint(4)]  # uniform draw from A/C/G/T
    elif nt.upper() == 'W':
        chars = 'AT'
        return_char = chars[numpy.random.randint(2)]
    elif nt.upper() == 'S':
        chars = 'CG'
        return_char = chars[numpy.random.randint(2)]
    else:
        print 'Error with base ' + nt
        sysOps.exitProgram()

    if not nt_isupper:
        return return_char.lower()

    return return_char
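A hedged generalization (my own sketch): sampling uniformly from any IUPAC code by reusing define_nuc_degeneracy from Example #4, while preserving the lower-case convention. rand_from_any_base is hypothetical and assumes define_nuc_degeneracy is importable in the same module.

import numpy

def rand_from_any_base(nt):
    bases = define_nuc_degeneracy(nt.upper())  # e.g. 'R' -> ['A', 'G']
    return_char = bases[numpy.random.randint(len(bases))]
    return return_char if nt.isupper() else return_char.lower()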
Example #6
def filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count):
    
    # prune UEI data to exclude UMIs with UEI counts < min_uei_count
    
    if len(bcn_dict) == 0:
        return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
    
    deletion_iteration = 0
    is_list = None
    
    sysOps.throw_status('Filtering matrices with ' + str(len(bcn_div_dict)) + '+' + str(len(trg_div_dict)) + ' UMIs.')
    
    while True:
        
        bcn_retained = 0
        trg_retained = 0
        bcn_deleted = list()
        trg_deleted = list()
        
        for bcn_el in bcn_div_dict:
            if bcn_div_dict[bcn_el]<min_uei_count:
                bcn_deleted.append(bcn_el)
            else:
                bcn_retained += 1
                
        for trg_el in trg_div_dict:
            if trg_div_dict[trg_el]<min_uei_count:
                trg_deleted.append(trg_el)
            else:
                trg_retained += 1
        
        # check whether bcn_dict/trg_dict entries are still lists of UEI indices or have already been collapsed to scalar counts
        if is_list is None:
            for bcn_el in bcn_dict:
                for trg_el in bcn_dict[bcn_el]:
                    is_list = (type(bcn_dict[bcn_el][trg_el]) is list)
                    break
                break
            
        if len(bcn_deleted)==0 and len(trg_deleted)==0:
            sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ', all retained.')
            break
            
        sysOps.throw_status('On deletion-iteration ' + str(deletion_iteration) + ' deleting ' + str(len(bcn_deleted)) + '+' + str(len(trg_deleted)) + ', retained ' + str(bcn_retained) + '+' + str(trg_retained) + '. is_list=' + str(is_list))
        
        if is_list is None:
            sysOps.throw_exception('Error, could not find any elements: len(bcn_dict) = ' + str(len(bcn_dict)))
            sysOps.exitProgram()
            
        for bcn_el in bcn_deleted:
            for trg_el in bcn_dict[bcn_el]:
                if is_list:
                    trg_div_dict[trg_el] -= len(trg_dict[trg_el][bcn_el])
                else:
                    trg_div_dict[trg_el] -= trg_dict[trg_el][bcn_el]
                del trg_dict[trg_el][bcn_el]
                
            del bcn_dict[bcn_el]
            del bcn_div_dict[bcn_el]
            
        for trg_el in trg_deleted:
            for bcn_el in trg_dict[trg_el]:
                if bcn_el in bcn_div_dict: #if not already deleted above
                    if is_list:
                        bcn_div_dict[bcn_el] -= len(bcn_dict[bcn_el][trg_el])
                    else:
                        bcn_div_dict[bcn_el] -= bcn_dict[bcn_el][trg_el]
                    del bcn_dict[bcn_el][trg_el]
                
            del trg_dict[trg_el]
            del trg_div_dict[trg_el]
                        
        deletion_iteration += 1
    
    #check for consistency
    for bcn_el in bcn_dict:
        for trg_el in bcn_dict[bcn_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
                
    for trg_el in trg_dict:
        for bcn_el in trg_dict[trg_el]:
            if is_list and len(bcn_dict[bcn_el][trg_el])!=len(trg_dict[trg_el][bcn_el]):
                sysOps.throw_exception('ERROR: bcn_dict and trg_dict contain different elements')
                sysOps.exitProgram()
               
    
    return [bcn_dict, trg_dict, bcn_div_dict, trg_div_dict]
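A toy trace of the pruning loop above, with the dictionary shapes as I read them from the code (bcn_dict[bcn][trg] and trg_dict[trg][bcn] hold lists of UEI indices; the *_div_dict entries hold each UMI's total UEI count):

bcn_dict = {'b0': {'t0': [0, 1], 't1': [2]}, 'b1': {'t1': [3]}}
trg_dict = {'t0': {'b0': [0, 1]}, 't1': {'b0': [2], 'b1': [3]}}
bcn_div_dict = {'b0': 3, 'b1': 1}
trg_div_dict = {'t0': 2, 't1': 2}
# filter_mats(bcn_dict, trg_dict, bcn_div_dict, trg_div_dict, min_uei_count=2)
# Iteration 0: 'b1' falls below min_uei_count and is pruned, dropping 't1' to 1 UEI.
# Iteration 1: 't1' is pruned in turn, removing its remaining link to 'b0'.
# Iteration 2: everything is retained; only the b0-t0 pair (2 UEIs each) survives.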
Example #7
def assign_consensus_pairs(pairing_csv_file, min_pairing_readcount):
    '''
    Assumes CSV file with columns:
    1. UEI cluster-index
    2. Beacon UMI cluster-index
    3. Target UMI cluster-index
    4. Read-number
    5. Set-index
    '''

    sysOps.throw_status('Loading pairing file ' + pairing_csv_file + ' ...')
    uei_clust_index_dict = dict()

    with open(sysOps.globaldatapath + pairing_csv_file, 'rU') as csvfile:
        for line in csvfile:
            row = line.strip('\n').split(',')
            index_str = str(row[0])  #UEI cluster-index
            if index_str in uei_clust_index_dict:
                uei_clust_index_dict[index_str].append(
                    [int(row[1]),
                     int(row[2]),
                     int(row[3]),
                     int(row[4])]
                )  #append dictionary entry as list with row having indices of beacon- and target-umi clusters, the read-number, and the set-index (will all be 0 if invalid-amplicon reads are excluded)
            else:
                uei_clust_index_dict[index_str] = [[
                    int(row[1]),
                    int(row[2]),
                    int(row[3]),
                    int(row[4])
                ]]

    #replace each entry with umi pairing having plurality of reads, in same indexed format
    sysOps.throw_status('Generating consensus-pairs ...')
    discarded_ueis = 0
    accepted_ueis = 0
    for uei_clust_el in uei_clust_index_dict:
        maxcount = 0
        secondmaxcount = 0  #detect ties, discard if tie exists
        maxcount_pair_bcn_index = -1
        maxcount_pair_trg_index = -1
        maxcount_set_index = -1
        for row in uei_clust_index_dict[uei_clust_el]:
            if (row[2] >= min_pairing_readcount and row[2] > maxcount):
                secondmaxcount = int(maxcount)
                if maxcount_set_index >= 0 and maxcount_set_index != row[3]:
                    sysOps.throw_exception('Error: set-index mismatch.')
                    sysOps.exitProgram()
                maxcount_pair_bcn_index = int(row[0])
                maxcount_pair_trg_index = int(row[1])
                maxcount = int(row[2])
                maxcount_set_index = int(row[3])
            elif (row[2] >= min_pairing_readcount and row[2] > secondmaxcount):
                secondmaxcount = int(row[2])

        if maxcount >= min_pairing_readcount and maxcount > secondmaxcount:
            # note: this condition requires not only that the uei have at least min_pairing_readcount reads,
            # but also that the plurality tally itself be at least min_pairing_readcount
            uei_clust_index_dict[uei_clust_el] = list([
                int(maxcount_pair_bcn_index),
                int(maxcount_pair_trg_index),
                int(maxcount),
                int(maxcount_set_index)
            ])
            accepted_ueis += 1
        else:
            uei_clust_index_dict[uei_clust_el] = list()
            discarded_ueis += 1

    sysOps.throw_status('Outputting consensus-pairs with at least ' +
                        str(min_pairing_readcount) +
                        ' read-plurality. Accepted ' + str(accepted_ueis) +
                        ' UEIs, discarded ' + str(discarded_ueis) +
                        ' UEIs ...')
    #index outputted as uei-index, beacon-umi-index, target-umi-index, read-count
    outfile_handle = open(
        sysOps.globaldatapath + "consensus_" + str(min_pairing_readcount) +
        "r_" + pairing_csv_file, 'w')

    for uei_clust_el in uei_clust_index_dict:
        if len(uei_clust_index_dict[uei_clust_el]) > 0:
            outfile_handle.write(
                uei_clust_el + "," +
                ",".join([str(s)
                          for s in uei_clust_index_dict[uei_clust_el]]) + "\n")

    outfile_handle.close()

    return
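A worked example (toy numbers) of the plurality rule above, for a single UEI cluster with min_pairing_readcount = 2:

# Rows of [bcn_umi, trg_umi, read_count, set_index] for one UEI cluster:
#   [ 7, 21, 5, 0]   -> maxcount = 5 (pair bcn 7 / trg 21)
#   [ 7, 22, 3, 0]   -> secondmaxcount = 3
#   [ 8, 21, 1, 0]   -> below min_pairing_readcount, ignored
# Since 5 >= 2 and 5 > 3, the UEI is accepted as (bcn 7, trg 21, 5 reads, set 0).
# If the second row had also carried 5 reads, maxcount == secondmaxcount (a tie) and the UEI is discarded.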
Example #8
def assign_umi_amplicons(trg_umi_cluster_file, trg_umi_fasta, amp_match_file,
                         amp_seq_fasta, outfilename):
    #function will tally reads counted for each target umi across each amplicon-call, and return a csv file with the following columns:
    #(target umi cluster-index),(leading amplicon-call),(reads for leading amplicon-call),(total reads counted)

    sysOps.throw_status('Loading cluster-file ' + sysOps.globaldatapath +
                        trg_umi_cluster_file)
    trg_umi_cluster_dict = fileOps.load_cluster_file_to_dictionary(
        trg_umi_cluster_file)
    #outputs dictionary with entries {uxi-sequence: [uxi-cluster-index, read-number]}

    trg_umi_handle = open(sysOps.globaldatapath + trg_umi_fasta, "rU")
    amp_seq_handle = open(sysOps.globaldatapath + amp_seq_fasta, "rU")
    realign_amplicons = False
    amp_match_handle = None
    try:
        sysOps.throw_status('Loading ' + sysOps.globaldatapath +
                            amp_match_file)
        amp_match_handle = open(sysOps.globaldatapath + amp_match_file, "rU")
    except:
        sysOps.throw_status(
            sysOps.globaldatapath + amp_match_file +
            ' not found. Alignments will occur from sequence-consensuses directly.'
        )
        realign_amplicons = True
        if not sysOps.check_file_exists('amplicon_refs.txt'):
            sysOps.throw_exception('Error: ' + sysOps.globaldatapath +
                                   'amplicon_refs.txt not found.')
            sysOps.exitProgram()

    trg_umi_dict = dict()
    trg_amp_seq_dict = dict()

    for trg_umi_record, amp_seq_record in itertools.izip(
            SeqIO.parse(trg_umi_handle, "fasta"),
            SeqIO.parse(amp_seq_handle, "fasta")):

        if not realign_amplicons:
            amp_match = int(amp_match_handle.readline().strip('\n'))
        else:
            amp_match = -1

        trg_umi_seq = str(trg_umi_record.seq)
        if trg_umi_seq in trg_umi_cluster_dict:
            trg_umi_index = str(
                trg_umi_cluster_dict[trg_umi_seq][0])  #uxi cluster-index
            if trg_umi_index in trg_umi_dict:
                if amp_match in trg_umi_dict[trg_umi_index]:
                    trg_umi_dict[trg_umi_index][
                        amp_match] += 1  #add 1, because every read is being entered
                else:
                    trg_umi_dict[trg_umi_index][amp_match] = 1
            else:
                trg_umi_dict[trg_umi_index] = dict()
                trg_amp_seq_dict[trg_umi_index] = baseTally()
                trg_umi_dict[trg_umi_index][amp_match] = 1

            trg_amp_seq_dict[trg_umi_index].add_record(str(amp_seq_record.seq),
                                                       1)

    trg_umi_handle.close()
    amp_seq_handle.close()
    if not realign_amplicons:
        amp_match_handle.close()

    csvfile = open(sysOps.globaldatapath + outfilename, 'w')
    fastafile = open(
        sysOps.globaldatapath + outfilename[:outfilename.rfind('.')] +
        '.fasta', 'w')
    ref_sequences = list()
    if realign_amplicons and sysOps.check_file_exists('amplicon_refs.txt'):
        with open(sysOps.globaldatapath + 'amplicon_refs.txt',
                  'rU') as ref_file_handle:
            for ref_line in ref_file_handle:
                [ref_name, ref_seq] = ref_line.strip('\n').upper().split('|')
                # amplicon_refs.txt will contain sequences in reverse complementary orientation. We therefore reverse both complementarity and order
                ref_sequences.append([
                    str(Seq.Seq(my_ref_seq).reverse_complement())
                    for my_ref_seq in reversed(ref_seq.split(','))
                ])
        mySettings = fileOps.read_settingsfile_to_dictionary('libsettings.txt')
        max_mismatch_amplicon = float(mySettings["-max_mismatch_amplicon"][0])
        trg_umi_index_dict = dict()

    accepted_consensus_sequences = 0
    inadmis_consensus_sequences = 0
    for trg_umi_index in trg_umi_dict:
        max_tally = 0
        tot_tally = 0

        for amp_match in trg_umi_dict[trg_umi_index]:

            my_tally = trg_umi_dict[trg_umi_index][amp_match]

            if my_tally >= max_tally:
                max_tally = int(my_tally)
                max_match = int(amp_match)

            tot_tally += int(my_tally)

        consensus_seq = str(
            trg_amp_seq_dict[trg_umi_index].get_str_consensus())

        if realign_amplicons:
            # perform direct, un-gapped alignment of consensus_seq to reference options to obtain max_match
            max_match = -1
            max_tally = -1  # exclude max_tally as count, since alignment is happening post-consensus
            min_mismatch_count = -1
            for i in range(len(ref_sequences)):
                all_subamplicons_pass = True
                start_index = 0
                tot_mismatches = 0
                for j in range(len(ref_sequences[i])
                               ):  # loop through sub-amplicon-sequences
                    ref_subamplicon_len = len(ref_sequences[i][j])
                    my_mismatches, minlen = alignOps.count_mismatches(
                        ref_sequences[i][j],
                        consensus_seq[start_index:(start_index +
                                                   ref_subamplicon_len)])
                    if minlen == 0:
                        all_subamplicons_pass = False
                        break
                    all_subamplicons_pass = all_subamplicons_pass and (
                        my_mismatches / float(minlen) <= max_mismatch_amplicon)
                    start_index += ref_subamplicon_len
                    tot_mismatches += my_mismatches
                if all_subamplicons_pass and (
                        max_match < 0 or tot_mismatches < min_mismatch_count):
                    # keep the reference with the fewest total mismatches
                    max_match = int(i)
                    min_mismatch_count = int(tot_mismatches)

        if max_match >= 0:
            csvfile.write(trg_umi_index + "," + str(max_match) + "," +
                          str(max_tally) + "," + str(tot_tally) + "\n")
            fastafile.write(">" + trg_umi_index + '\n')
            fastafile.write(consensus_seq + '\n')
            if realign_amplicons:
                trg_umi_index_dict[trg_umi_index] = True
            accepted_consensus_sequences += 1
        else:
            inadmis_consensus_sequences += 1

    csvfile.close()
    fastafile.close()
    sysOps.throw_status('Discarded ' + str(inadmis_consensus_sequences) + '/' +
                        str(accepted_consensus_sequences +
                            inadmis_consensus_sequences) +
                        ' sequences in writing ' + sysOps.globaldatapath +
                        outfilename + ' due to inadequate amplicon match.')

    if realign_amplicons:
        # create a new consensus pairing file that's filtered with the accepted trg umi indices
        [dirnames, filenames] = sysOps.get_directory_and_file_list()
        consensus_filenames = [
            filename for filename in filenames
            if filename.startswith('consensus')
        ]
        for consensus_filename in consensus_filenames:  # find all consensus files present
            accepted_consensus_sequences = 0
            inadmis_consensus_sequences = 0
            os.rename(
                sysOps.globaldatapath + consensus_filename,
                sysOps.globaldatapath + 'unfiltered_' + consensus_filename)
            with open(sysOps.globaldatapath + consensus_filename,
                      'w') as new_consensus_file:
                with open(
                        sysOps.globaldatapath + 'unfiltered_' +
                        consensus_filename, 'rU') as old_consensus_file:
                    for old_consensus_file_line in old_consensus_file:
                        consensus_list = old_consensus_file_line.strip(
                            '\n'
                        ).split(
                            ','
                        )  # [uei_index, bcn_umi_index, trg_umi_index, read_count, (additional variables)]
                        if consensus_list[2] in trg_umi_index_dict:
                            new_consensus_file.write(old_consensus_file_line)
                            accepted_consensus_sequences += 1
                        else:
                            inadmis_consensus_sequences += 1
            sysOps.throw_status('Discarded ' +
                                str(inadmis_consensus_sequences) + '/' +
                                str(accepted_consensus_sequences +
                                    inadmis_consensus_sequences) +
                                ' consensus-pairings in writing ' +
                                sysOps.globaldatapath + consensus_filename +
                                ' due to inadequate amplicon match.')
        if len(consensus_filenames) == 0:
            sysOps.throw_exception(
                'Error: no consensus files available to update with realigned amplicon information. Exiting.'
            )
            sysOps.exitProgram()
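A minimal sketch of the per-sub-amplicon acceptance test used in the realignment branch above; count_mismatches_stub is a local stand-in whose return values (mismatch count, compared length) are inferred from the alignOps.count_mismatches call site.

def count_mismatches_stub(ref, query):
    minlen = min(len(ref), len(query))
    mismatches = sum(1 for a, b in zip(ref[:minlen], query[:minlen]) if a != b)
    return mismatches, minlen

def passes_amplicon_filter(ref_subamplicons, consensus_seq, max_mismatch_amplicon):
    # ungapped, segment-by-segment comparison; every segment must stay at or below the mismatch fraction
    start = 0
    for ref in ref_subamplicons:
        mismatches, minlen = count_mismatches_stub(ref, consensus_seq[start:start + len(ref)])
        if minlen == 0 or mismatches / float(minlen) > max_mismatch_amplicon:
            return False
        start += len(ref)
    return True

# passes_amplicon_filter(['ACGT', 'TTAA'], 'ACGATTAA', 0.25) -> True (one mismatch in the first segment)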
Example #9
    def dnamic_inference(self,
                         smle_infer=False,
                         msmle_infer=False,
                         segment_infer=False,
                         compute_local_solutions_only=True):
        # Perform image inference on the basis of raw output of DNA microscopy sequence analysis

        # Basic settings
        read_thresh = 2
        min_uei_count = 2
        output_dim = 2
        version = 1.0
        infer_dir = ''

        # raw data files
        consensus_pairing_csv_file = "..//consensus_" + str(
            read_thresh) + "r_pairing_filter0.75_uei_umi.csv"
        outname = 'minuei' + str(min_uei_count) + 'DMv' + str(
            version) + '_' + str(read_thresh) + 'r_filter0.75'
        wmat_outfilename = 'wmat_' + outname + '.csv'
        param_name = 'minuei' + str(min_uei_count) + 'dim' + str(
            output_dim) + 'DMv' + str(version) + '_.csv'
        imagemodule_input_filename = 'data_' + param_name
        key_filename = 'key_' + param_name
        if not sysOps.check_file_exists('microscopy_tasklist.csv'):
            [subdirnames, filenames
             ] = sysOps.get_directory_and_file_list(sysOps.globaldatapath)
            with open(sysOps.globaldatapath + 'microscopy_tasklist.csv',
                      'w') as task_input_file_handle:
                for subdir in subdirnames:
                    if sysOps.check_file_exists(subdir + '//libsettings.txt'):
                        task_input_file_handle.write('infer_smle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_msmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_segment;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')
                        task_input_file_handle.write('infer_ptmle;' +
                                                     sysOps.globaldatapath +
                                                     subdir + '//\n')

        original_datapath = str(sysOps.globaldatapath)
        if smle_infer:
            infer_dir = 'infer_smle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_smle')
        elif msmle_infer:
            infer_dir = 'infer_msmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_msmle')
        elif segment_infer:
            infer_dir = 'infer_segment//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_segment')
        else:
            infer_dir = 'infer_ptmle//'
            [my_task, time_start
             ] = parallelOps.get_next_open_task('tasklog.csv',
                                                'microscopy_tasklist.csv',
                                                'infer_ptmle')

        if my_task is not None:

            sysOps.initiate_runpath(str(my_task[1]))

            [subdirnames, filenames] = sysOps.get_directory_and_file_list()
            dirnames = list(["."])
            subdirnames_nodatayet = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    not sysOps.check_file_exists(subdirname + '//' +
                                                 imagemodule_input_filename))
            ]
            subdirnames_nodatayet = [
                subdirnames_nodatayet[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_nodatayet
                ]))
            ]  # sort by descending read count
            subdirnames_dataalready = [
                subdirname for subdirname in subdirnames
                if subdirname.startswith('sub') and (
                    sysOps.check_file_exists(subdirname + '//' +
                                             imagemodule_input_filename))
            ]
            subdirnames_dataalready = [
                subdirnames_dataalready[i] for i in np.argsort(-np.array([
                    int(subdirname[3:].strip('/'))
                    for subdirname in subdirnames_dataalready
                ]))
            ]  # sort by descending read count
            dirnames.extend(subdirnames_nodatayet)
            dirnames.extend(subdirnames_dataalready)
            sysOps.throw_status('Checking directories ' +
                                sysOps.globaldatapath + ' ... ' +
                                str(dirnames) + ' for infer-subdirectories.')
            for dirname in dirnames:  # make inference directories
                try:
                    with open(
                            sysOps.globaldatapath + dirname + '//' +
                            infer_dir + 'tmpfile.txt', 'w') as tmpfile:
                        tmpfile.write('test')
                    os.remove(sysOps.globaldatapath + dirname + '//' +
                              infer_dir + 'tmpfile.txt')
                    sysOps.throw_status('Directory ' + sysOps.globaldatapath +
                                        dirname + '//' + infer_dir +
                                        ' found already created.')
                except:
                    os.mkdir(sysOps.globaldatapath + dirname + '//' +
                             infer_dir)
                    sysOps.throw_status('Created directory ' +
                                        sysOps.globaldatapath + dirname +
                                        '//' + infer_dir)

            for dirname in dirnames:
                sysOps.initiate_runpath(
                    str(my_task[1]) + dirname + '//' + infer_dir)
                sysOps.initiate_statusfilename()
                sysOps.throw_status('Assigned path ' + sysOps.globaldatapath)

                if not (sysOps.check_file_exists(key_filename) and
                        sysOps.check_file_exists(imagemodule_input_filename)
                        and sysOps.check_file_exists(
                            'read_' + imagemodule_input_filename) and
                        sysOps.check_file_exists('seq_params_' +
                                                 imagemodule_input_filename)):

                    sysOps.throw_status('Calling matOps.generate_wmat()')

                    trg_dict = matOps.generate_wmat(consensus_pairing_csv_file,
                                                    read_thresh, min_uei_count,
                                                    wmat_outfilename)
                    sysOps.throw_status('Completed matOps.generate_wmat()')
                    matOps.print_imagemodule_input(trg_dict,
                                                   imagemodule_input_filename,
                                                   key_filename, output_dim)
                    # print_imagemodule_input outputs:
                    #    1. key_filename, containing 3 columns: 0 or 1 (beacon or target, respectively), cluster-index, MLE processing index
                    #    2. imagemodule_input_filename, containing columns: MLE processing index for beacon, MLE processing index for target, uei-count, max UEI read count
                    #    3. Summary file containing: number of beacons inputted to MLE, number of targets inputted to MLE,
                else:
                    sysOps.throw_status(
                        'Image-module input pre-computed. Proceeding ...')

                #optimOps.test_ffgt()

                if sysOps.check_file_exists(imagemodule_input_filename):
                    if segment_infer:
                        optimOps.run_mle(
                            imagemodule_input_filename,
                            False,
                            False,
                            True,
                            compute_local_solutions_only,
                        )  # segmentation only
                    elif msmle_infer:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         True, False,
                                         compute_local_solutions_only)  # msMLE
                    elif smle_infer:
                        optimOps.run_mle(imagemodule_input_filename, True,
                                         False, False,
                                         compute_local_solutions_only)  # sMLE
                    else:
                        optimOps.run_mle(imagemodule_input_filename, False,
                                         False, False,
                                         compute_local_solutions_only)  # ptMLE

                    if not compute_local_solutions_only:
                        dnamicOps.print_final_results(
                            '..//trg_amplicon_calls.csv',
                            '..//trg_amplicon_calls.fasta')
                    else:
                        sysOps.exitProgram()
                else:
                    sysOps.throw_status('Could not locate ' +
                                        sysOps.globaldatapath +
                                        imagemodule_input_filename)

            sysOps.globaldatapath = str(original_datapath)
            if not parallelOps.close_task('tasklog.csv', ';'.join(my_task),
                                          time_start):
                sysOps.throw_exception('Task ' + str(my_task) +
                                       ' no longer exists in log ' +
                                       sysOps.globaldatapath + 'tasklog.csv' +
                                       ' -- exiting.')
                sysOps.exitProgram()

        return
Example #10
    def crosscomparison_analysis(self, args):

        sysOps.initiate_statusfilename()
        list_of_dirs = list()

        file_to_compare = args[1]

        with open(sysOps.globaldatapath + args[2], 'rU') as csvfile:
            for myline in csvfile:
                thisline = myline.strip('\n').split(',')
                subdir = 'lib_' + str(thisline[0]) + '_' + str(
                    thisline[1]) + '_' + str(thisline[2])
                list_of_dirs.append(subdir)

        print "Beginning comparison analysis"
        print "File to compare = " + file_to_compare
        print "Directories = " + ",".join(list_of_dirs)

        try:
            os.mkdir(sysOps.globaldatapath + 'cross_comparisons')
        except:
            sysOps.throw_exception(
                'cross_comparisons directory already exists. Terminating comparison analysis.'
            )
            sysOps.exitProgram()

        shared_num_unique_matrix = list()
        unshared_num_unique_matrix = list()
        shared_read_abund_matrix = list()
        unshared_read_abund_matrix = list()

        for i in range(len(list_of_dirs)):
            shared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_num_unique_matrix.append(list([-1] * len(list_of_dirs)))
            shared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))
            unshared_read_abund_matrix.append(list([-1] * len(list_of_dirs)))

        for ind1 in range(len(list_of_dirs)):
            for ind2 in range(ind1):
                dir1 = list_of_dirs[ind1]
                dir2 = list_of_dirs[ind2]
                clustfile1 = dir1 + "//" + file_to_compare
                clustfile2 = dir2 + "//" + file_to_compare
                dir1_abbrev = dir1[(
                    dir1.rfind('/') + 1
                ):]  #remove superdirectory structure of path -- requires individual directories have unique names
                dir2_abbrev = dir2[(dir2.rfind('/') + 1):]
                sysOps.throw_status('Began writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                [
                    num_unique_shared, num_unique_unshared,
                    read_abundance_shared, read_abundance_unshared
                ] = alignOps.compare(
                    clustfile1, clustfile2,
                    dir1_abbrev + "_" + dir2_abbrev + "_" + file_to_compare,
                    False)
                sysOps.throw_status('Completed writing cross_comparisons//' +
                                    dir1_abbrev + "_" + dir2_abbrev + "_" +
                                    file_to_compare)
                shared_num_unique_matrix[ind1][ind2] = num_unique_shared[0]
                shared_num_unique_matrix[ind2][ind1] = num_unique_shared[1]
                unshared_num_unique_matrix[ind1][ind2] = num_unique_unshared[0]
                unshared_num_unique_matrix[ind2][ind1] = num_unique_unshared[1]
                print str(num_unique_unshared[0]
                          ) + '-> unshared_num_unique_matrix[ ' + str(
                              ind1) + '][' + str(ind2) + ']'
                shared_read_abund_matrix[ind1][ind2] = read_abundance_shared[0]
                shared_read_abund_matrix[ind2][ind1] = read_abundance_shared[1]
                unshared_read_abund_matrix[ind1][
                    ind2] = read_abundance_unshared[0]
                unshared_read_abund_matrix[ind2][
                    ind1] = read_abundance_unshared[1]

        print shared_num_unique_matrix
        print unshared_num_unique_matrix
        print shared_read_abund_matrix
        print unshared_read_abund_matrix

        with open(sysOps.globaldatapath + 'comparison_matrices.csv', 'w') as compare_matrix_file:
            for i1 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_num_unique_matrix[i1]]) + '\n')

            for i2 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_num_unique_matrix[i2]]) + '\n')

            for i3 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in shared_read_abund_matrix[i3]]) + '\n')

            for i4 in range(len(list_of_dirs)):
                compare_matrix_file.write(
                    ','.join([str(j)
                              for j in unshared_read_abund_matrix[i4]]) + '\n')
Example #11
    def sim_reads(self):
        simLibObj = libOps.libObj(settingsfilename='libsettings.txt',
                                  output_prefix='_')
        enforced_rev_read_len = 100
        [for_read_len, rev_read_len] = simLibObj.get_min_allowed_readlens(
            simLibObj.filter_amplicon_window)
        rev_read_len = int(enforced_rev_read_len)
        '''
        simLibObj.seqform_for_params and simLibObj.seqform_rev_params are already stored in current object's memory
        Form of these variables is a list of the following:
            Element 1: [start_pos,end_pos]
            Element 2: np.ndarray(seq_bool_vec, dtype=np.bool_)
            Element 3: np.ndarray(capital_bool_vec, dtype=np.bool_)
            Element 4: np.ndarray(ambig_vec, dtype=np.bool_)
        '''
        [subdirnames, filenames] = sysOps.get_directory_and_file_list()

        for_umi_seqs = list()
        rev_umi_seqs = list()
        rev_umi_amplicon_list = list()
        uei_seqs = list()
        base_order = 'ACGT'

        sysOps.throw_status('Generating simulated sequences ...')
        amplicon_list = list()
        if "-amplicon" in simLibObj.mySettings:
            amplicon_list = [
                simLibObj.mySettings["-amplicon"][i].upper().split(',')
                for i in range(len(simLibObj.mySettings["-amplicon"]))
            ]

        for for_umi_i in range(self.Nbcn):
            for_param_index = np.random.randint(
                len(simLibObj.seqform_for_params))
            if len(simLibObj.seqform_for_params[for_param_index]) > 1:
                sysOps.throw_exception(
                    'Error: len(simLibObj.seqform_for_params[for_param_index]) = '
                    + str(len(simLibObj.seqform_for_params[for_param_index])))
                sysOps.exitProgram()
            my_for_umi_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_for_umi_param[0]
            seq_bool_vec = my_for_umi_param[1]
            my_for_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            for_umi_seqs.append([int(for_param_index), str(my_for_umi)])

        for for_uei_i in range(self.Nuei):
            for_param_index = 0  # there should be no difference across UMI's
            my_for_uei_param = simLibObj.seqform_for_params[for_param_index][
                0]['U'][1]
            [start_pos, end_pos] = my_for_uei_param[0]
            seq_bool_vec = my_for_uei_param[1]
            my_for_uei = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_for_uei += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            uei_seqs.append(str(my_for_uei))

        for rev_umi_i in range(self.Ntrg):
            rev_param_index = np.random.randint(
                len(simLibObj.seqform_rev_params))
            my_rev_umi_param = simLibObj.seqform_rev_params[rev_param_index][
                0]['U'][0]
            [start_pos, end_pos] = my_rev_umi_param[0]
            seq_bool_vec = my_rev_umi_param[1]
            my_rev_umi = str('')
            for pos in range(end_pos - start_pos):
                possible_bases = np.where(seq_bool_vec[(pos * 4):((pos + 1) *
                                                                  4)])[0]
                my_rev_umi += base_order[possible_bases[np.random.randint(
                    possible_bases.shape[0])]]

            if len(amplicon_list) == 0:
                encoded_amplicon = str('')
            else:
                this_gsp_primer_amplicon_pair = list(
                    amplicon_list[np.random.randint(len(amplicon_list))]
                )  # already properly oriented
                # generate single error on amplicon
                lenamp = len(this_gsp_primer_amplicon_pair[1])
                rand_loc = np.random.randint(lenamp)
                this_gsp_primer_amplicon_pair[1] = str(
                    this_gsp_primer_amplicon_pair[1][:rand_loc] +
                    base_order[np.random.randint(4)] +
                    this_gsp_primer_amplicon_pair[1][(rand_loc + 1):])
                encoded_amplicon = ''.join(this_gsp_primer_amplicon_pair)

            tmp_umi_index = float(rev_umi_i)

            if tmp_umi_index == 0:
                encoded_amplicon += base_order[0]
            else:
                for myexponent in range(
                        int(np.floor(np.log(tmp_umi_index) / np.log(4.0))), -1,
                        -1):
                    mydigit = np.floor(tmp_umi_index /
                                       np.power(4.0, myexponent))
                    encoded_amplicon += base_order[int(mydigit)]
                    tmp_umi_index -= mydigit * np.power(4.0, myexponent)

            rev_umi_seqs.append(
                [int(rev_param_index),
                 str(my_rev_umi),
                 str(encoded_amplicon)])

        sysOps.throw_status('Writing simulated reads ...')

        for filename in filenames:
            if filename.endswith('_sim_ueifile.csv'):
                ueifile = np.int64(
                    np.loadtxt(sysOps.globaldatapath + filename,
                               delimiter=','))
                newdirname = filename[:filename.find('_')]
                read_list = list()
                for i in range(ueifile.shape[0]):
                    for myread in range(ueifile[i, 3]):
                        read_list.append(np.array([ueifile[i, :3]]))
                read_list = np.concatenate(
                    read_list, axis=0
                )  # re-write array so that there is now one row per read
                # randomly permute:
                read_list = read_list[
                    np.random.permutation(read_list.shape[0]), :]

                for_chararray = np.chararray((for_read_len))
                rev_chararray = np.chararray((rev_read_len))
                for_fastq_outfile = open(newdirname + '_for.fastq', "w")
                rev_fastq_outfile = open(newdirname + '_rev.fastq', "w")
                for i in range(read_list.shape[0]):
                    for_param_index = for_umi_seqs[read_list[i, 1]][0]
                    for_umi_seq = for_umi_seqs[read_list[i, 1]][1]
                    rev_param_index = rev_umi_seqs[read_list[i, 2]][
                        0]  # both beacon and target indices are, at this point, independently indexed from 0
                    rev_umi_seq = rev_umi_seqs[read_list[i, 2]][1]
                    rev_amp_seq = rev_umi_seqs[read_list[i, 2]][2]
                    uei_seq = uei_seqs[read_list[i, 0]]

                    for j in range(for_read_len):
                        for_chararray[j] = 'N'
                    for j in range(rev_read_len):
                        rev_chararray[j] = 'N'

                    my_for_umi_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_for_umi_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = for_umi_seq[j]

                    my_for_uei_param = simLibObj.seqform_for_params[
                        for_param_index][0]['U'][1]
                    [start_pos, end_pos] = my_for_uei_param[0]
                    for j in range(end_pos - start_pos):
                        for_chararray[j + start_pos] = uei_seq[j]

                    for my_for_param in simLibObj.seqform_for_params[
                            for_param_index][0]['P']:
                        [start_pos, end_pos] = my_for_param[0]
                        for j in range(end_pos - start_pos):
                            for_chararray[j + start_pos] = base_order[np.where(
                                my_for_param[1][(4 * j):(4 * (j + 1))])[0][0]]

                    my_rev_umi_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['U'][0]
                    [start_pos, end_pos] = my_rev_umi_param[0]
                    for j in range(end_pos - start_pos):
                        rev_chararray[j + start_pos] = rev_umi_seq[j]
                    my_rev_amp_param = simLibObj.seqform_rev_params[
                        rev_param_index][0]['A'][0]
                    start_pos = my_rev_amp_param[0][0]
                    for j in range(len(rev_amp_seq)):
                        rev_chararray[j + start_pos] = rev_amp_seq[j]

                    if 'P' in simLibObj.seqform_rev_params[rev_param_index][0]:
                        for my_rev_param in simLibObj.seqform_rev_params[
                                rev_param_index][0]['P']:
                            [start_pos, end_pos] = my_rev_param[0]
                            for j in range(end_pos - start_pos):
                                rev_chararray[j +
                                              start_pos] = base_order[np.where(
                                                  my_rev_param[1][(4 * j):(
                                                      4 * (j + 1))])[0][0]]

                    for_record = SeqIO.SeqRecord(
                        Seq.Seq(for_chararray.tostring()))
                    for_record.id = '-' + str(i) + '-' + str(read_list[i, 1])
                    for_record.description = ''
                    for_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(for_read_len)])
                    rev_record = SeqIO.SeqRecord(
                        Seq.Seq(rev_chararray.tostring()))
                    rev_record.id = '-' + str(i) + '-' + str(read_list[i, 2])
                    rev_record.description = ''
                    rev_record.letter_annotations['phred_quality'] = list(
                        [30 for j in range(rev_read_len)])
                    SeqIO.write(for_record, for_fastq_outfile, "fastq")
                    SeqIO.write(rev_record, rev_fastq_outfile, "fastq")

                for_fastq_outfile.close()
                rev_fastq_outfile.close()
                os.mkdir(newdirname)
                with open('libsettings.txt', 'rU') as oldsettingsfile:
                    with open(newdirname + '//libsettings.txt',
                              'w') as newsettingsfile:
                        for oldsettings_row in oldsettingsfile:
                            if oldsettings_row.startswith('-source_for'):
                                newsettingsfile.write('-source_for ..//' +
                                                      newdirname +
                                                      '_for.fastq\n')
                            elif oldsettings_row.startswith('-source_rev'):
                                newsettingsfile.write('-source_rev ..//' +
                                                      newdirname +
                                                      '_rev.fastq\n')
                            else:
                                newsettingsfile.write(oldsettings_row)

        sysOps.throw_status('Done.')
        return
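A standalone version (my own sketch) of the base-4 encoding used above to embed the target-UMI index in the simulated amplicon; digits are emitted most-significant first, and index 0 encodes as a single 'A'.

import numpy as np

def encode_index_base4(index, base_order='ACGT'):
    if index == 0:
        return base_order[0]
    encoded = ''
    remaining = float(index)
    for exponent in range(int(np.floor(np.log(remaining) / np.log(4.0))), -1, -1):
        digit = np.floor(remaining / np.power(4.0, exponent))
        encoded += base_order[int(digit)]
        remaining -= digit * np.power(4.0, exponent)
    return encoded

# encode_index_base4(11) -> 'GT' (11 = 2*4 + 3)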
Example #12
def compare_identical(idfile1, idfile2, comparison_file_name, rev_comp):
    #rev_comp = True/False depending on need of reverse-complement being taken
    print "Beginning comparison between " + idfile1 + " and " + idfile2

    uxi_handle1 = open(sysOps.globaldatapath + idfile1, 'rU')
    uxi_dict1 = dict()
    len_uxi1 = -1
    uxi_index = 0
    for uxi_line1 in uxi_handle1:
        split_str = uxi_line1.strip('\n').split('_')
        if (len(split_str) == 3):

            my_uxi = split_str[0]
            if len_uxi1 < 0:
                len_uxi1 = len(my_uxi)
            elif len_uxi1 != len(my_uxi):
                print 'Error: uxi length-mismatch'
                sysOps.exitProgram()

            my_numreads = int(split_str[2])
            uxi_dict1[my_uxi] = [uxi_index, my_numreads, False
                                 ]  #final entry corresponds to being shared
            uxi_index += 1

    uxi_handle1.close()

    uxi_handle2 = open(sysOps.globaldatapath + idfile2, 'rU')
    uxi_dict2 = dict()
    len_uxi2 = -1
    uxi_index = 0
    comparison_handle = open(sysOps.globaldatapath + comparison_file_name, 'w')
    for uxi_line2 in uxi_handle2:
        split_str = uxi_line2.strip('\n').split('_')
        if (len(split_str) == 3):

            my_uxi = split_str[0]
            if len_uxi2 < 0:
                len_uxi2 = len(my_uxi)
                if len_uxi1 != len_uxi2:
                    print 'Error: uxi1/uxi2 length-mismatch'
                    sysOps.exitProgram()

            my_numreads = int(split_str[2])
            uxi_dict2[my_uxi] = [uxi_index, my_numreads, False]
            this_uxi = str(my_uxi)
            if rev_comp:
                this_uxi = str(Seq.Seq(this_uxi).reverse_complement())

            if this_uxi in uxi_dict1:
                print "Found match " + this_uxi
                uxi_dict1[this_uxi][2] = True
                uxi_dict2[my_uxi][2] = True
                comparison_handle.write(this_uxi + "," +
                                        str(uxi_dict1[this_uxi][0]) + "," +
                                        str(uxi_dict1[this_uxi][1]) + "," +
                                        str(uxi_index) + "," +
                                        str(my_numreads) + "\n")

            uxi_index += 1

    comparison_handle.close()

    unshared_handle = open(
        sysOps.globaldatapath + "unshared_" + comparison_file_name, 'w')
    for dict_el in uxi_dict1:
        if not uxi_dict1[dict_el][2]:
            unshared_handle.write(dict_el + ",0," +
                                  str(uxi_dict1[dict_el][0]) + "," +
                                  str(uxi_dict1[dict_el][1]) + "\n")

    for dict_el in uxi_dict2:
        if not uxi_dict2[dict_el][2]:
            unshared_handle.write(dict_el + ",1," +
                                  str(uxi_dict2[dict_el][0]) + "," +
                                  str(uxi_dict2[dict_el][1]) + "\n")
    unshared_handle.close()

    return True
Example #13
def compare(clustfile1,
            clustfile2,
            comparison_file_name,
            rev_comp,
            read_thresh=2,
            filter_substr_list=[],
            filter_val=0.75):
    #rev_comp = True/False depending on need of reverse-complement being taken
    #filter_val = maximum fraction of bases in uxi allowed to be the same

    #all filtering of legitimate comparison occurs here, at the front end
    print "Beginning comparison between " + clustfile1 + " and " + clustfile2

    #Stage 1 of comparison: determine total read-abundance of clusters in clustfile1 and clustfile2,
    #assign to abund_dict1 and abund_dict2

    abund_dict1 = dict()
    with open(sysOps.globaldatapath + clustfile1, 'rU') as clust1_handle:
        for clust_line in clust1_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = my_el[0]
                my_numreads = int(my_el[2])
                if uxi_index not in abund_dict1:
                    abund_dict1[uxi_index] = {
                        'reads': my_numreads,
                        'is_shared': False
                    }
                else:
                    abund_dict1[uxi_index]['reads'] += my_numreads

    abund_dict2 = dict()
    with open(sysOps.globaldatapath + clustfile2, 'rU') as clust2_handle:
        for clust_line in clust2_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = my_el[0]
                my_numreads = int(my_el[2])
                if uxi_index not in abund_dict2:
                    abund_dict2[uxi_index] = {
                        'reads': my_numreads,
                        'is_shared': False
                    }
                else:
                    abund_dict2[uxi_index]['reads'] += my_numreads

    # Stage 2 of comparison: enter actual uxi sequences into dict_clust1 and dict_clust2,
    # storing each uxi's cluster-index and read-number (plus a shared-flag) as its dictionary value

    dict_clust1 = dict()
    with open(sysOps.globaldatapath + clustfile1, 'rU') as clust1_handle:
        for clust_line in clust1_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = int(my_el[0])
                this_uxi = str(my_el[1])
                my_numreads = int(my_el[2])
                has_disallowed_substr = [
                    my_substr in this_uxi for my_substr in filter_substr_list
                ]
                if abund_dict1[my_el[0]]['reads'] >= read_thresh and (
                        True not in has_disallowed_substr) and max(
                            numpy.bincount([('ACGT').index(s) for s in this_uxi
                                            ])) <= filter_val * len(this_uxi):
                    dict_clust1[this_uxi] = [
                        uxi_index, my_numreads, False
                    ]  #final entry corresponds to being shared

    print "Completed first cluster-file input. Second cluster-file being read, output to cross_comparisons//" + comparison_file_name

    comparison_handle = open(
        sysOps.globaldatapath + 'cross_comparisons//' + comparison_file_name,
        'w')

    dict_clust2 = dict()
    with open(sysOps.globaldatapath + clustfile2, 'rU') as clust2_handle:
        for clust_line in clust2_handle:
            my_el = clust_line.strip('\n').split('_')
            if (len(my_el) == 3):
                uxi_index = int(my_el[0])  #references clustfile2
                #my_uxi references clustfile2 uxi sequences
                #this_uxi references clustfile1 uxi sequences
                my_uxi = str(my_el[1])
                my_numreads = int(my_el[2])
                this_uxi = str(my_uxi)
                if (rev_comp):
                    this_uxi = str(Seq.Seq(this_uxi).reverse_complement())
                has_disallowed_substr = [
                    my_substr in this_uxi for my_substr in filter_substr_list
                ]

                if abund_dict2[my_el[0]]['reads'] >= read_thresh and (
                        True not in has_disallowed_substr) and max(
                            numpy.bincount([('ACGT').index(s) for s in this_uxi
                                            ])) <= filter_val * len(this_uxi):
                    dict_clust2[my_uxi] = [uxi_index, my_numreads, False]
                    if this_uxi in dict_clust1:
                        dict_clust1[this_uxi][2] = True
                        dict_clust2[my_uxi][2] = True
                        if str(dict_clust1[this_uxi][0]) not in abund_dict1:
                            sysOps.throw_exception(
                                'A: ' + str(dict_clust1[this_uxi][0]) +
                                ' not in abund_dict1')
                            sysOps.exitProgram()
                        if str(uxi_index) not in abund_dict2:
                            sysOps.throw_exception('B: ' + str(uxi_index) +
                                                   ' not in abund_dict2')
                            sysOps.exitProgram()
                            sysOps.exitProgram()

                        abund_dict1[str(
                            dict_clust1[this_uxi][0])]['is_shared'] = True
                        abund_dict2[str(uxi_index)]['is_shared'] = True

                        comparison_handle.write(
                            str(this_uxi) + "," +
                            str(dict_clust1[this_uxi][0]) + "," +
                            str(dict_clust1[this_uxi][1]) + "," +
                            str(abund_dict1[str(dict_clust1[this_uxi][0])]
                                ['reads']) + "," +
                            str(dict_clust2[my_uxi][0]) + "," +
                            str(dict_clust2[my_uxi][1]) + "," +
                            str(abund_dict2[str(dict_clust2[my_uxi][0])]
                                ['reads']) + "\n")

    comparison_handle.close()

    #count number unique shared and unique unshared
    num_unique_shared = [0, 0]
    num_unique_unshared = [0, 0]
    read_abundance_shared = [0, 0]
    read_abundance_unshared = [0, 0]

    for uxi_index1 in abund_dict1:
        if abund_dict1[uxi_index1]['is_shared']:
            num_unique_shared[0] += 1
            read_abundance_shared[0] += abund_dict1[uxi_index1]['reads']
        else:
            num_unique_unshared[0] += 1
            read_abundance_unshared[0] += abund_dict1[uxi_index1]['reads']

    for uxi_index2 in abund_dict2:
        if abund_dict2[uxi_index2]['is_shared']:
            num_unique_shared[1] += 1
            read_abundance_shared[1] += abund_dict2[uxi_index2]['reads']
        else:
            num_unique_unshared[1] += 1
            read_abundance_unshared[1] += abund_dict2[uxi_index2]['reads']

    return [
        num_unique_shared, num_unique_unshared, read_abundance_shared,
        read_abundance_unshared
    ]
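A small worked example of the base-composition filter applied in both cluster-file passes above: a uxi is kept only if its most frequent base accounts for at most filter_val of its length. passes_base_fraction_filter is a hypothetical wrapper around the same expression used in the code.

import numpy

def passes_base_fraction_filter(uxi, filter_val=0.75):
    return max(numpy.bincount([('ACGT').index(s) for s in uxi])) <= filter_val * len(uxi)

# passes_base_fraction_filter('AAAAAAAT') -> False (7/8 of bases are 'A', above 0.75)
# passes_base_fraction_filter('AACGTACG') -> True  (most frequent base is 3/8 of the length)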