def main():
    """Summarize first- and second-pass MSGF+ results at PSM and peptide level.

    Positional command-line arguments (``sys.argv``):
        1.  first-pass MSGF+ results TSV (input)
        2.  second-pass MSGF+ results TSV (input)
        3.  FASTA database filename (read, not used by this function)
        4.  second-pass proteins filename (read, not used by this function)
        5.  output: unique peptides from the first pass
        6.  output: unique peptides from the second pass
        7.  output: first-pass PSMs filtered to FDR 0.01
        8.  output: result of update_psm_set_with_second_pass_psms at its
            default FDR
        9.  output: first-pass PSMs filtered to FDR 0.05
        10. output: result of update_psm_set_with_second_pass_psms at FDR 0.05
    """
    first_pass_results_filename = sys.argv[1]
    second_pass_results_filename = sys.argv[2]
    fasta_db_filename = sys.argv[3]

    second_pass_proteins_filename = sys.argv[4]

    output_first_pass_peptides = sys.argv[5]
    output_second_pass_peptides = sys.argv[6]

    output_psms_first_pass = sys.argv[7]
    output_psms_updated_evalues = sys.argv[8]

    output_original_high_FDR_psms = sys.argv[9]
    output_updated_high_FDR_psms = sys.argv[10]

    def load_deduplicated_psms(results_filename):
        # Each section reloads from disk: the FDR filter below is applied to
        # the PSM set itself, so a filtered set cannot be reused.
        psm_set = ming_psm_library.PSMset(results_filename)
        psm_set.load_MSGF_Plus_tsvfile(results_filename)
        psm_set.remove_duplicated_rows()
        return psm_set

    def write_fdr_filtered_first_pass(output_filename, fdr):
        psm_set = load_deduplicated_psms(first_pass_results_filename)
        psm_set.filter_to_fdr_by_length(fdr)
        with open(output_filename, "w") as output_file:
            psm_set.write_output(output_file)

    def write_unique_peptides(results_filename, output_filename):
        psm_set = load_deduplicated_psms(results_filename)
        peptides = library_creation.create_library_unique_peptides_filtered(
            [psm_set], filter_by_length=True)
        with open(output_filename, "w") as output_file:
            peptides.write_output(output_file)

    # Low FDR: original and updated e-values
    write_fdr_filtered_first_pass(output_psms_first_pass, 0.01)
    update_psm_set_with_second_pass_psms(first_pass_results_filename,
                                         second_pass_results_filename,
                                         output_psms_updated_evalues)

    # High FDR, kept for comparison. The threshold must match the 0.05 passed
    # to the update call below; filtering at 0.01 here would only reproduce
    # the low-FDR output under a different filename.
    write_fdr_filtered_first_pass(output_original_high_FDR_psms, 0.05)
    update_psm_set_with_second_pass_psms(first_pass_results_filename,
                                         second_pass_results_filename,
                                         output_updated_high_FDR_psms, 0.05)

    # Precursor level: unique peptides for each pass
    write_unique_peptides(first_pass_results_filename,
                          output_first_pass_peptides)
    write_unique_peptides(second_pass_results_filename,
                          output_second_pass_peptides)
def get_first_pass_variant_set(first_pass_results_filename):
    """Load first-pass MSGF+ results and build their unique peptide set.

    The PSMs are filtered with ``filter_to_fdr_by_length(0.05)`` before the
    peptide set is created. The PSM count and the peptide count are printed
    to stdout.

    Args:
        first_pass_results_filename: path to the MSGF+ TSV results file.

    Returns:
        The object returned by
        ``library_creation.create_library_unique_peptides_filtered``.
    """
    psm_list_first_pass = ming_psm_library.PSMset(first_pass_results_filename)
    psm_list_first_pass.load_MSGF_Plus_tsvfile(first_pass_results_filename)
    psm_list_first_pass.filter_to_fdr_by_length(0.05)
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("First Pass PSMs: " + str(len(psm_list_first_pass)))

    full_peptides_list_first_pass = library_creation.create_library_unique_peptides_filtered(
        [psm_list_first_pass], filter_by_length=True)
    print("First Pass Variants: " + str(len(full_peptides_list_first_pass)))

    return full_peptides_list_first_pass
Ejemplo n.º 3
0
def save_psms_as_peptides(psm_set, output_peptide_path, fdr):
    """Pickle the best PSM of every unique peptide and return the peptide set.

    Args:
        psm_set: PSM set passed to
            ``library_creation.create_library_unique_peptides_filtered``.
        output_peptide_path: path of the pickle file to write.
        fdr: FDR threshold forwarded to the peptide-set builder.

    Returns:
        The peptide variant set built from ``psm_set``.
    """
    peptide_variant_set = library_creation.create_library_unique_peptides_filtered(
        [psm_set], fdr, filter_by_length=True)

    # Collect one representative PSM per peptide in a fresh set, leaving the
    # caller's psm_set argument untouched and unshadowed.
    best_psm_set = ming_psm_library.PSMset("task results")
    for peptide in peptide_variant_set.peptide_list:
        best_psm_set.psms.append(peptide.get_best_psm())

    # The context manager closes the file even if pickling fails part-way.
    with open(output_peptide_path, 'wb') as output_pickle:
        pickle.dump(best_psm_set, output_pickle, pickle.HIGHEST_PROTOCOL)

    return peptide_variant_set
Ejemplo n.º 4
0
def main():
    """Export identified peptides from an MSGF+ results file.

    Positional command-line arguments (``sys.argv``):
        1. MSGF+ search results TSV (input)
        2. output: de-duplicated stripped peptide sequences, one per line,
           from PSMs filtered to FDR 0.01
        3. output: every peptide variant obtained with an FDR cutoff of 1.0,
           preceded by the ``PeptideVariant`` header row
    """
    input_searchresults_filename = sys.argv[1]
    output_peptide_list = sys.argv[2]
    output_peptide_list_with_decoy_filename = sys.argv[3]

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    psm_list.filter_to_fdr_by_length(0.01)
    print(len(psm_list))

    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], fdr=0.01, filter_by_length=True)

    # Several variants can share a stripped sequence; write each one once.
    unique_sequences = set(
        peptide.get_stripped_sequence()
        for peptide in full_peptides_list.peptide_list)

    with open(output_peptide_list, "w") as output_file:
        for sequence in unique_sequences:
            output_file.write(sequence + "\n")

    # Reload the PSMs unfiltered and keep all variants, so the second output
    # still contains the decoys.
    print("GIVING US FULL RESULT SET")
    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], 1.0)

    with open(output_peptide_list_with_decoy_filename, "w") as output_with_decoys:
        output_with_decoys.write(
            ming_psm_library.PeptideVariant.output_header() + "\n")
        for peptide in full_peptides_list.peptide_list:
            output_with_decoys.write(str(peptide) + "\n")
Ejemplo n.º 5
0
def main():
    """Write a per-protein table of target and decoy peptide counts.

    Positional command-line arguments (``sys.argv``):
        1. FASTA proteome file (input)
        2. MSGF+ search results TSV (input)
        3. output: tab-separated table with columns
           protein, decoy_count, target_count, total_count, length
    """
    input_fasta_filename = sys.argv[1]
    input_searchresults_filename = sys.argv[2]
    output_proteins_as_list = sys.argv[3]

    proteome = ming_protein_library.parse_fasta_proteome_file(
        input_fasta_filename)

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)

    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], 0.01, filter_by_length=True)

    target_peptide_strings = []
    decoy_peptide_strings = []
    for peptide_obj in full_peptides_list.peptide_list:
        stripped_sequence = peptide_obj.get_stripped_sequence()
        if peptide_obj.is_decoy():
            # Decoy sequences are reversed before being matched against the
            # proteome (presumably decoys are reversed targets - confirm).
            decoy_peptide_strings.append(stripped_sequence[::-1])
        else:
            target_peptide_strings.append(stripped_sequence)

    protein_coverage_of_targets = proteome.get_proteins_with_number_of_peptides_covered_map(
        target_peptide_strings)
    protein_coverage_of_decoys = proteome.get_proteins_with_number_of_peptides_covered_map(
        decoy_peptide_strings)

    # The context manager closes the file even if a lookup below raises.
    with open(output_proteins_as_list, "w") as output_file:
        output_file.write(
            "protein\tdecoy_count\ttarget_count\ttotal_count\tlength\n")

        # Rows follow the keys of the target map; the decoy map is indexed
        # with the same keys, so it is expected to contain every one of them.
        for protein in protein_coverage_of_targets:
            decoy_count = protein_coverage_of_decoys[protein]
            target_count = protein_coverage_of_targets[protein]
            fields = [
                protein,
                str(decoy_count),
                str(target_count),
                str(target_count + decoy_count),
                str(len(proteome.protein_map[protein].sequence)),
            ]
            output_file.write("\t".join(fields) + "\n")
def grab_results_from_multipass(task_id, user, output_peptide_directory,
                                output_psm_directory):
    """Export the PSMs and peptides of one multipass task and count them.

    The task is processed only when exactly one
    ``updated_eval_psms_with_kl_with_ambiguity`` result file is found;
    otherwise nothing is written and the counts stay at zero.

    Args:
        task_id: task identifier; also used as the output file stem.
        user: user name passed to the result-file lookups.
        output_peptide_directory: directory for ``<task_id>.peptides``.
        output_psm_directory: directory for ``<task_id>.psms``.

    Returns:
        dict with keys ``"task_id"``, ``"number_psms"`` and
        ``"number_peptides"``.
    """
    return_dict = {
        "number_psms": 0,
        "number_peptides": 0,
        "task_id": task_id,
    }

    path_to_psm_files_list = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "updated_eval_psms_with_kl_with_ambiguity")
    if len(path_to_psm_files_list) != 1:
        return return_dict

    # Writing the PSM file
    output_psm_path = os.path.join(output_psm_directory, task_id + ".psms")
    path_to_param_file = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "params")[0]

    # Look the merged result up once and reuse it for both the log line and
    # the path.
    merged_result_paths = ming_proteosafe_library.get_proteosafe_result_file_path(
        task_id, user, "mergedResult")
    print(merged_result_paths)
    path_to_merged_results = merged_result_paths[0]

    print(path_to_psm_files_list[0] + " to " + output_psm_path)
    name_demangle_filenames_and_instrument_collision(
        path_to_psm_files_list[0], output_psm_path, path_to_param_file,
        path_to_merged_results, "filename", "filename")

    # Now generate the peptide list from the PSM file just written.
    psm_set = ming_psm_library.PSMset("task results")
    psm_set.load_PSM_tsvfile(output_psm_path)
    # Only the peptide path is assigned here; output_psm_path keeps pointing
    # at the .psms file.
    output_peptide_path = os.path.join(output_peptide_directory,
                                       task_id + ".peptides")

    peptide_variant_set = library_creation.create_library_unique_peptides_filtered(
        [psm_set], 0.01)
    with open(output_peptide_path, "w") as output_peptide_file:
        peptide_variant_set.write_output(output_peptide_file)

    return_dict["number_psms"] = len(psm_set.psms)
    return_dict["number_peptides"] = len(peptide_variant_set.peptide_list)

    return return_dict
Ejemplo n.º 7
0
def main():
    """List the proteins returned by ``get_proteins_covered_by_k_peptides``.

    Positional command-line arguments (``sys.argv``):
        1. FASTA proteome file (input)
        2. MSGF+ search results TSV (input)
        3. output: JSON array of protein names (the variable is called
           ``output_fasta_filename``, but JSON is what gets written)
        4. output: plain-text list, a ``Protein`` header then one name per line

    Exits the process with status 0 when done.
    """
    input_fasta_filename = sys.argv[1]
    input_searchresults_filename = sys.argv[2]
    output_fasta_filename = sys.argv[3]
    output_proteins_as_list = sys.argv[4]

    proteome = ming_protein_library.parse_fasta_proteome_file(input_fasta_filename)

    psm_list = ming_psm_library.PSMset(input_searchresults_filename)
    psm_list.load_MSGF_Plus_tsvfile(input_searchresults_filename)
    psm_list.filter_to_fdr_by_length(0.01)
    print(len(psm_list))

    full_peptides_list = library_creation.create_library_unique_peptides_filtered(
        [psm_list], fdr=0.01, filter_by_length=True)

    all_peptide_strings = [
        peptide_obj.get_stripped_sequence()
        for peptide_obj in full_peptides_list.peptide_list
    ]

    # Arguments 2 and True are passed through as in the original call; their
    # meaning is defined by get_proteins_covered_by_k_peptides (the name
    # suggests 2 is the minimum peptide count k - confirm).
    all_proteins = proteome.get_proteins_covered_by_k_peptides(
        all_peptide_strings, 2, True)
    all_protein_names = [protein.protein for protein in all_proteins]

    # Both files are closed explicitly before the process exits.
    with open(output_fasta_filename, "w") as output_protein_file:
        output_protein_file.write(json.dumps(all_protein_names))

    # Outputting the list of proteins
    with open(output_proteins_as_list, "w") as output_protein_list_file:
        output_protein_list_file.write("Protein\n")
        for protein_name in all_protein_names:
            output_protein_list_file.write(protein_name + "\n")

    # sys.exit is always available; the bare exit() helper is injected by the
    # site module and can be missing (e.g. under python -S).
    sys.exit(0)