Example 1
def gather_and_stitch(seq, tfile):

    total_data = []
    in_file = str(seq) + "_tophits.gzip"

    if os.path.isfile(in_file):
        print "Extracting data from non_refined file:", in_file
        tasks = io.readGzipPickle(in_file)
        for entry in tasks:
            total_data.append(entry)
    else:
        print "Something is wrong in the datafile: ", in_file

    refined_data = combine_data()
    if refined_data:
        for entry in refined_data:
            total_data.append(entry)

    from ranking.NoeStageRank import rank_assembly_with_clustering
    ranked_data = rank_assembly_with_clustering(total_data, args.numhits)
    io.dumpGzipPickle(str(tfile), ranked_data)

    # Delete the intermediate rtx_* dump files.
    rm_list = glob.glob("rtx_*.gzip")
    if rm_list:
        print "Deleting old rtx_* files!"
        for rm_file in rm_list:
            os.remove(rm_file)
    else:
        print "No rtx_* files exist!"
    return True
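The utility.io_util helpers used throughout these examples are not shown. For reference, a minimal sketch of readGzipPickle and dumpGzipPickle, assuming they are thin wrappers around gzip and cPickle (only the names and call signatures come from the call sites above; the bodies are an assumption):

import gzip
import cPickle

def readGzipPickle(filename):
    # Load a pickled object from a gzip-compressed file.
    with gzip.open(filename, 'rb') as fin:
        return cPickle.load(fin)

def dumpGzipPickle(filename, data):
    # Pickle an object and write it to a gzip-compressed file.
    with gzip.open(filename, 'wb') as fout:
        cPickle.dump(data, fout, cPickle.HIGHEST_PROTOCOL)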
Example 2
def combine_data():
    hits = []

    file_list = glob.glob("rtx_*.gzip")
    if not file_list:
        return False
    for f in file_list:
        print "Extracting data from file..", f
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    return hits
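The nested append loops in combine_data flatten one hit list per rtx_*.gzip file into a single list. The same can be written more compactly with itertools.chain (a stylistic sketch, not code from the original source):

import glob
import itertools

import utility.io_util as io

def combine_data_flat():
    # Flatten all per-worker rtx_*.gzip dumps into one hit list.
    file_list = glob.glob("rtx_*.gzip")
    if not file_list:
        return False
    return list(itertools.chain.from_iterable(
        io.readGzipPickle(f) for f in file_list))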
Example 3
def makeTopPickle2Old(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined
    :param previous_smotif_index:
    :param num_hits:
    :param stage:
    :return:
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    new_dict = collections.defaultdict(list)

    for hit in hits:
        # thread_data contains data from each search and filter thread.

        if hit[4][0] == 'NOE_filter':
            no_of_noes = hit[4][2]
            new_dict[no_of_noes].append(hit)

    keys = new_dict.keys()
    keys.sort()
    keys.reverse()
    # Rank based on NOE energy
    non_redundant = collections.defaultdict(list)
    reduced_dump_log = []
    seqs = []
    count_hits = 0
    for i in range(len(keys)):
        entries = new_dict[keys[i]]
        if len(entries) == 1:
            # Only one entry in this no_of_noes bin: just check it against
            # the existing sequences and move on.
            smotif_seq = entries[0][3][1]
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                reduced_dump_log.append(entries[0])
                print "final sele", entries[0][0][1][0][0], keys[i]
                count_hits += 1
        else:
            t_log = collections.defaultdict(list)
            for hit in entries:  # filter on noe_energy
                if hit[4][0] == 'NOE_filter':
                    noe_energy = hit[4][3]
                    noe_energy = round(noe_energy, 2)
                    t_log[noe_energy].append(hit)
            noe_energy_bins = t_log.keys()
            noe_energy_bins.sort()

            for j in range(len(noe_energy_bins)):  # filter on RDC score
                t2_log = collections.defaultdict(list)
                hits = t_log[noe_energy_bins[j]]
                for hit in hits:
                    if hit[5][0] == 'RDC_filter':
                        rdc_tensors = hit[5][1]
                        rdc_score = 0
                        for tensor in rdc_tensors:
                            rdc_score = rdc_score + tensor[0]
                        t2_log[rdc_score].append(hit)
                rdc_score_bins = t2_log.keys()
                rdc_score_bins.sort()
                for k in range(len(rdc_score_bins)):
                    hits = t2_log[rdc_score_bins[k]]
                    for hit in hits:
                        smotif_seq = hit[3][1]
                        if smotif_seq not in seqs:
                            seqs.append(smotif_seq)
                            reduced_dump_log.append(hit)
                            print "final sele", hit[0][1][0][0], keys[
                                i], noe_energy_bins[j], rdc_score_bins[k]
                            count_hits += 1
                if count_hits >= num_hits:
                    break
        if count_hits >= num_hits:
            break
    if count_hits < num_hits:
        print "could only extract ", len(reduced_dump_log), count_hits

    # io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle)
    io.dumpGzipPickle(
        str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log)
    print "actual number in top hits ", len(reduced_dump_log)
    return range(len(reduced_dump_log))
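The nested binning in makeTopPickle2Old amounts to a lexicographic ordering: most NOE restraints first, then lowest NOE energy, then lowest summed RDC tensor score, with redundant Smotif sequences dropped. A condensed sketch of that ordering as a single sort key (the hit indices are assumed from the original; single-entry bins skip the RDC comparison there, so ties may resolve slightly differently):

def rank_noe_hits(hits, num_hits):
    # Order hits: most NOEs first, then lowest NOE energy, then lowest RDC score.
    def sort_key(hit):
        no_of_noes = hit[4][2]
        noe_energy = round(hit[4][3], 2)
        rdc_score = sum(tensor[0] for tensor in hit[5][1])
        return (-no_of_noes, noe_energy, rdc_score)

    seen, selected = set(), []
    for hit in sorted(hits, key=sort_key):
        smotif_seq = hit[3][1]
        if smotif_seq not in seen:  # drop redundant sequences
            seen.add(smotif_seq)
            selected.append(hit)
            if len(selected) >= num_hits:
                break
    return selected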
Example 4
def makeTopPickle2(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined
    :param previous_smotif_index:
    :param num_hits:
    :param stage:
    :return:
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    new_dict = collections.defaultdict(list)

    for hit in hits:
        # thread_data contains data from each search and filter thread.
        noe_energy = hit[5][3]
        noe_energy = round(noe_energy, 4)
        new_dict[noe_energy].append(hit)
    keys = new_dict.keys()
    keys.sort()
    # Rank based on NOE energy
    reduced_dump_log = []
    seqs = []
    count_hits = 0
    for i in range(len(keys)):
        entries = new_dict[keys[i]]
        if count_hits >= num_hits:
            break
        if len(entries) == 1:
            smotif_seq = entries[0][4][1]
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                reduced_dump_log.append(entries[0])
                print "final sele", entries[0][0][1][0][0], keys[i]
                count_hits += 1
                if count_hits >= num_hits:
                    break
        else:
            t2_log = collections.defaultdict(list)
            rdc_score_bins = []
            # Inspect the filter type on the first entry of the bin; reading
            # the loop variable 'hit' here would use a stale value left over
            # from the previous loop.
            if entries[0][6][0] == 'RDC_filter':
                for hit in entries:
                    rdc_tensors = hit[6][1]
                    rdc_score = 0
                    for tensor in rdc_tensors:
                        rdc_score = rdc_score + tensor[0]
                    t2_log[rdc_score].append(hit)
                rdc_score_bins = t2_log.keys()
                rdc_score_bins.sort()
            elif entries[0][6][0] == 'PCS_filter':
                print "Working on PCS filter instead of RDC"
                for hit in entries:
                    rdc_tensors = hit[6][1]
                    rdc_score = 0
                    for tensor in rdc_tensors:
                        rdc_score = rdc_score + tensor[1]
                    t2_log[rdc_score].append(hit)
                rdc_score_bins = t2_log.keys()
                rdc_score_bins.sort()
            else:
                print "Something is wrong with your PCS logic"

            for k in range(len(rdc_score_bins)):
                hits = t2_log[rdc_score_bins[k]]
                for hit in hits:
                    smotif_seq = hit[4][1]
                    if smotif_seq not in seqs:
                        seqs.append(smotif_seq)
                        reduced_dump_log.append(hit)
                        count_hits += 1
                        print "final sele", hit[0][1][0][0], keys[
                            i], rdc_score_bins[k]
                    if count_hits >= num_hits:
                        break
                if count_hits >= num_hits:
                    break
            if count_hits >= num_hits:
                break
    if count_hits < num_hits:
        print "could only extract ", len(reduced_dump_log), count_hits

    io.dumpGzipPickle(
        str(previous_smotif_index) + "_tophits.gzip", reduced_dump_log)
    print "actual number in top hits ", len(reduced_dump_log)
    return range(len(reduced_dump_log))
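The RDC and PCS branches in makeTopPickle2 differ only in which tensor element they accumulate (tensor[0] for RDC, tensor[1] for PCS). A sketch of that shared pattern factored into a helper, with the indices assumed from the original:

def tensor_score(hit, filter_name):
    # Sum one element of each alignment tensor attached to the hit:
    # element 0 for 'RDC_filter', element 1 for 'PCS_filter'.
    idx = 0 if filter_name == 'RDC_filter' else 1
    return sum(tensor[idx] for tensor in hit[6][1])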
Example 5
def makeTopPickle(previous_smotif_index, num_hits, stage):
    """
    Concatenate data from all of the threads, organize, remove redundancies, rank
     and extract top hits as defined
    :param previous_smotif_index:
    :param num_hits:
    :param stage:
    :return:
    """
    hits = []
    # regex = str(previous_smotif_index) + "_*_*.pickle"
    regex = str(previous_smotif_index) + "_*_*.gzip"
    file_list = glob.glob(regex)
    for f in file_list:
        t_hits = io.readGzipPickle(f)
        for t_hit in t_hits:
            hits.append(t_hit)
    """
    identifiers: smotif, smotif_def, seq_filter, contacts_filter, PCS_filter, qcp_rmsd, Evofilter
                 RDC_filter, NOE_filter
    """

    new_dict = collections.defaultdict(list)
    rdc_constant = 0.0  # note: not reset inside the loop, so it carries over between hits
    for hit in hits:
        # thread_data contains data from each search and filter thread.
        # initialize total score array
        total_score = {}
        for data_filter in hit:

            if data_filter[0] == 'PCS_filter':
                pcs_data = data_filter
                pcsscore = getNchiSum(pcs_data, stage)
                total_score['pcs_score'] = pcsscore

            if data_filter[0] == 'Ref_RMSD':
                total_score['rmsd_score'] = data_filter[1]

            if data_filter[0] == 'RDC_filter':
                rdc_data = data_filter
                #Nchi = rdcSumChi(rdc_data, stage)
                log_likelihood = data_filter[2]
                rdc_tensors = data_filter[1]
                for tensor in rdc_tensors:
                    rdc_constant = rdc_constant + tensor[0]
                rdc_constant = rdc_constant * 1e-10
                total_score['rdc_score'] = log_likelihood

            if data_filter[0] == 'NOE_filter':
                noe_probability = data_filter[1]
                log_likelihood = -1 * (math.log(noe_probability))
                total_score['noe_score'] = log_likelihood

                # calculate the total score and append the hit
        if total_score:
            # Only the NOE and RDC terms contribute to the total score here.
            keys = ['noe_score', 'rdc_score']
            # keys = ['rmsd_score']
            tscore = 0
            for key in keys:
                tscore = tscore + total_score[key]
            tscore = tscore + rdc_constant
            if tscore < 999.999:
                new_dict[tscore].append(hit)

    # ************************************************
    # Exclude the redundant entries and rank top hits
    # ************************************************

    keys = new_dict.keys()
    keys.sort()

    # Exclude the redundant data.

    # non_redundant = {}
    non_redundant = collections.defaultdict(list)
    seqs = []
    smotif_seq = ''
    count_hits = 0
    for i in range(0, len(keys)):
        entries = new_dict[keys[i]]
        for entry in entries:
            for ent in entry:
                if ent[0] == 'seq_filter':
                    seq_filter = ent
                    smotif_seq = seq_filter[1]
            if smotif_seq not in seqs:
                seqs.append(smotif_seq)
                non_redundant[keys[i]].append(entry)
                count_hits += 1
        if count_hits >= num_hits:
            break

    # Rank top hits and dump the data
    keys = non_redundant.keys()
    keys.sort()

    dump_pickle = []
    print "Dumping data to disk"
    count_top_hits = 0
    while True:
        for key in keys:
            entries = non_redundant[key]
            for entry in entries:
                dump_pickle.append(entry)
                # print "final selected Smotif: ", entry[0][1][0][0], "with score: ", key
                print "final sele", entry[0][1][0][0], key
                count_top_hits += 1
            if count_top_hits >= num_hits:
                break
        if count_top_hits >= num_hits:
            break
        else:
            print "could only extract ", count_top_hits
            break

    # io.dumpPickle(str(previous_smotif_index) + "_tophits.pickle", dump_pickle)
    io.dumpGzipPickle(
        str(previous_smotif_index) + "_tophits.gzip", dump_pickle)
    print "actual number in top hits ", len(dump_pickle)
    return range(count_top_hits)
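The NOE term in makeTopPickle converts a match probability into a log-likelihood penalty, so scores sort ascending and higher probabilities rank better. A small worked example:

import math

# probability 0.9 -> penalty ~0.105 (near-certain hits cost little)
# probability 0.1 -> penalty ~2.303 (unlikely hits are heavily penalized)
for noe_probability in (0.9, 0.1):
    print noe_probability, -1 * math.log(noe_probability)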
Example 6
import sys, os, copy
sys.path.append('../../main/')
__author__ = 'kalabharath'

import utility.io_util as io

seq = int(sys.argv[1])

top_result = []

t_file = str(seq) + "_refined_tophits.gzip"
if os.path.isfile(t_file):
    top_result = io.readGzipPickle(t_file)

else:
    t_file = str(seq) + "_tophits.gzip"
    if os.path.isfile(t_file):
        top_result = io.readGzipPickle(t_file)
    else:
        print "Somethis is terrribly wrong !"
        exit()

for p in range(0, len(top_result)):
    print 'model_', p,

    top_struct = top_result[p]

    top_struct = copy.copy(top_struct)

    for entry in top_struct:
        if entry[0] == 'cathcodes':
Example 7
                    help='Top number of hits to be selected')
args = parser.parse_args()

# *********************   Define cmd line argument parser *********************

# Rank '0' specifies the master process

if rank == 0:

    # *********************   Extract top hits *********************

    in_file = str(args.infile) + "_tophits.gzip"
    print "infile ", in_file

    try:
        tasks = io.readGzipPickle(in_file)
        print "len of tasks", len(tasks)
    except:
        traceback.print_exc()
        print "There are no entries in the tophits file, nothing to refine"
        killall(size)
        exit()

    # ********************* Generate and distribute job index array *********************

    stime = time.time()

    try:
        if len(tasks):
            pass
    except: