def clean_up_data(data, f, AA_DICT, settings, filters):
    filter_lt_target = 0
    filter_weight_agreement = 0
    filter_empty_data = 0
    for r in reversed(range(len(data))):
        data[r][4] = analyze_mods(data[r][7], data[r][4])  # must do this first or the weight agreement check will kill the peptide
        # this checks that there are enough peaks and that the theoretical and measured m/z match.
        # if not, something is wrong and the row is deleted
        try:
            # peppy = IC.peptide(data[r][4], settings["Heavy Element"], AA_DICT)
            # print("settings[\"Heavy Element\"]", settings["Heavy Element"])
            # print(peppy.Composition(), ":", peppy.MW())

            theory_wt = IC.peptide(data[r][4], settings["Heavy Element"],
                                   AA_DICT).MW()
            if theory_wt > filters["mass cutoff"]:
                target = filters["Peaks included if over mass cutoff"]
            else:
                target = filters["Peaks included if under mass cutoff"]
            x = [float(y) for y in data[r][-2].split()]
            x = [a for a in x if a != 0]  # missing peaks are reported as zero, so drop them

            #debugging
            if len(x) < target:
                filter_lt_target += 1

            if abs(theory_wt - float(data[r][6])) > filters["Weight Agreement"]:
                theory_comp = IC.peptide(data[r][4], settings["Heavy Element"],
                                         AA_DICT).Composition()
                print(data[r][1], data[r][4], "formula:", theory_comp,
                      "theory_wt:", theory_wt, "observed:", data[r][6])
                filter_weight_agreement += 1

            if data[r][4] == '':
                filter_empty_data += 1

            # deals with insufficient peaks, bad weight agreement, and bad mods
            if (len(x) < target
                    or abs(theory_wt - float(data[r][6])) > filters["Weight Agreement"]
                    or data[r][4] == ''):
                del data[r]
            else:
                # column order before this point: Protein ID, file, charge, RT, Peptide,
                # protein name, mass, mod, isotopes, abundances
                data[r] = [
                    f, data[r][0], data[r][5], data[r][4], data[r][6],
                    data[r][2], data[r][3], data[r][-3], data[r][-2],
                    data[r][-1]
                ]  # if all checks pass, reorder the row and keep it
        #bad sequences are removed
        except KeyError:
            del data[r]

    print("clean_up_data() filters:", "filter_lt_target:", filter_lt_target,
          "filter_weight_agreement:", filter_weight_agreement,
          "filter_empty_data:", filter_empty_data)
Example #2
def emass(inqueue, outlist, error_list, AA_DICT, settings, filters):
    while not inqueue.empty():
        sequence = inqueue.get()
        label = settings["Heavy Element"]
        pep = IC.peptide(sequence, settings["Heavy Element"], AA_DICT)
        if pep.MW() > filters["mass cutoff"]:
            size = int(filters["Peaks included if over mass cutoff"])
        else:
            size = int(filters["Peaks included if under mass cutoff"])
        fm = parser(pep.Composition(True))  # this turns the potential labeling sites into Xs so the program knows what they are
        tmp = []
        seq_data = []
        mz_data = []
        working = True
        # this is used here and in the fitter; easier to give it its own variable
        x_range = np.arange(0.0, settings["Maximum Theoretical Percent"],
                            settings["Step Size for Labeling"])
        for x_pct in x_range:
            #increase x here
            #need to reassign since it is a tuple.  consider changing to dictionary or class in the future.
            master_isotope['X'] = [
                isotope(master_isotope[label][0].mass,
                        master_isotope[label][0].abundance - x_pct),
                isotope(master_isotope[label][1].mass,
                        master_isotope[label][1].abundance + x_pct)
            ]
            #does the calculations and saves the data
            result = [isotope(0, 1)]
            result = calculate(tmp, result, fm, limit, charge)
            mz, pre_norm = print_pattern(result, digits)
            if x_pct == 0:
                true_m0 = mz[0]
            elif round(mz[0], 2) != round(true_m0, 2):
                working = False
                break
            # need to normalize the abundance data (it comes out with the highest peak at 1;
            # this makes the sum of the relative abundances across the relevant peaks equal 1)
            norm = normalize(pre_norm[:size])
            seq_data.append(norm)
            mz_data.append(mz[:size])
        if working:
            graph = [sequence,
                     pep.Composition(False),
                     pep.MW(),
                     pep.get_n()]  # save relevant data
            # run through the fitter. do not use elif here: both checks must run when both options are selected
            if settings["Use Abundance"]:
                graph = fitter(graph, seq_data, size, x_range, settings,
                               filters, True)
            if settings["Use neutromer spacing"]:
                graph = fitter(graph, mz_data, size, x_range, settings,
                               filters, False)
            outlist.append(graph)  #add the data to the multiprocessing list
        else:
            error_list.append([sequence, x_pct])
        inqueue.task_done()
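
The isotope container and the normalize helper are not defined in this snippet. A minimal sketch consistent with how they are used above (isotope as an immutable mass/abundance pair that must be rebuilt rather than mutated, and normalize rescaling the kept peaks so they sum to 1) might look like this; it is inferred from the comments, not taken from the project's real definitions.

from collections import namedtuple

# inferred shape: the comment above says the master_isotope entries are tuples that must be reassigned
isotope = namedtuple("isotope", ["mass", "abundance"])

def normalize(abundances):
    # emass-style output has the tallest peak at 1 (per the comment above);
    # rescale so the relative abundances of the kept peaks sum to 1
    total = sum(abundances)
    return [a / total for a in abundances] if total else list(abundances)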
Example #3
def emass(inqueue, seq_dict, outlist, error_list, AA_DICT, settings, filters):
    while not inqueue.empty():
        sequence = inqueue.get()
        big_size = int(filters["Peaks included if over mass cutoff"])
        small_size = int(filters["Peaks included if under mass cutoff"])
        label = settings["Heavy Element"]
        pep = IC.peptide(sequence, settings["Heavy Element"], AA_DICT)
        if pep.MW() > filters["mass cutoff"]:
            size = big_size
        else:
            size = small_size
        fm = parser(pep.Composition(True)) #this turns the potential labeling sites into Xs so the program knows what they are
        master_isotope['X'] = [isotope(master_isotope[label][0].mass, master_isotope[label][0].abundance), isotope(master_isotope[label][1].mass, master_isotope[label][1].abundance)]
        tmp = []
        result = [isotope(0, 1)]
        result = calculate(tmp, result, fm, limit, charge)
        mz, pre_norm  = print_pattern(result, digits)
        true_m0 = mz[0]
        if settings["Use Abundance"]:
            base_abund = normalize(pre_norm[:size])
            base_abund = sizing(base_abund, size, small_size, big_size)
        if settings["Use neutromer spacing"]:
            base_mz = []
            for i in range(1, size):
                base_mz.append(mz[i]-mz[0])
            base_mz = sizing(base_mz, size, small_size, big_size)
        working = True
        for x_pct in seq_dict[sequence]:
            final = [sequence, pep.Composition(False), pep.MW(), pep.get_n(), x_pct]
            if x_pct == 0:
                x_pct = filters["Zero Labeling Check"]
            #increase x here
            #need to reassign since it is a tuple.  consider changing to dictionary or class in the future. 
            master_isotope['X'] = [isotope(master_isotope[label][0].mass, master_isotope[label][0].abundance-x_pct), isotope(master_isotope[label][1].mass, master_isotope[label][1].abundance+x_pct)]
            #does the calculations and saves the data
            result = [isotope(0, 1)]
            result = calculate(tmp, result, fm, limit, charge)
            mz, pre_norm  = print_pattern(result, digits)
            if round(mz[0], 2) != round(true_m0,2):
                working = False
            if working:
                if settings["Use Abundance"]:
                    final.extend(base_abund)
                    abund = normalize(pre_norm[:size])
                    abund = sizing(abund, size, small_size, big_size)
                    final.extend(abund) #need to normalize the abundance data (it comes out normalized with the highest peak as 1.  this makes the sumof the relative abundances in the number of relevant peaks 1)
                if settings["Use neutromer spacing"]:
                    final.extend(base_mz)
                    spacing = []
                    for i in range(1, size):
                        spacing.append(mz[i]-mz[0])
                    spacing = sizing(spacing, size, small_size, big_size)
                    final.extend(spacing)
                outlist.append(final)  #add the data to the multiprocessing list
            else:
                error_list.append([sequence, x_pct])
        inqueue.task_done()
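
A hedged sketch of the multiprocessing setup this worker expects: inqueue.task_done() implies a JoinableQueue, and outlist/error_list behave like shared lists (e.g. from a multiprocessing.Manager). The driver below is illustrative only; the function name and process count are assumptions.

import multiprocessing as mp

def run_emass_workers(sequences, seq_dict, AA_DICT, settings, filters, n_procs=4):
    # hypothetical driver; the real project's process setup may differ
    manager = mp.Manager()
    outlist = manager.list()      # workers append result rows here
    error_list = manager.list()   # sequences whose m0 drifted during labeling
    inqueue = mp.JoinableQueue()  # task_done() in the worker implies a joinable queue
    for seq in sequences:
        inqueue.put(seq)
    workers = [mp.Process(target=emass,
                          args=(inqueue, seq_dict, outlist, error_list,
                                AA_DICT, settings, filters))
               for _ in range(n_procs)]
    for w in workers:
        w.start()
    inqueue.join()   # blocks until every queued sequence has been marked done
    for w in workers:
        w.join()
    return list(outlist), list(error_list)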