Example #1
0
def analyse_oligomers(input_file,
                      template_hitchain,
                      oligomers_list,
                      interfaces_dict,
                      tmdata,
                      report,
                      args,
                      entropies=None,
                      z_entropies=None,
                      minx=None,
                      maxx=None):
    """Run ``analyse_model`` on every built oligomer and collect the reports.

    ``analyse_model`` is called with a single oligomer, so the remaining
    inputs are published as module-level globals (``g_*``, ``template``,
    ``template_file`` and, when requested, ``template_molprobity``) before
    any model is analysed -- presumably ``analyse_model`` reads them from
    there; confirm against its definition.

    Args:
        input_file: Accepted for interface compatibility; not used here.
        template_hitchain: Template identifier; the part before the first
            ``':'`` is used as the template code.
        oligomers_list: Iterable of models handed one by one to
            ``analyse_model``.
        interfaces_dict, tmdata, report, entropies, z_entropies, minx, maxx:
            Stored in the corresponding ``g_*`` globals, otherwise untouched.
        args: Runtime arguments; ``args.assessment`` and
            ``args.multiprocess`` are read here.

    Returns:
        list: The first element of every ``analyse_model`` result, in the
        order of ``oligomers_list``.
    """
    global g_template_hitchain
    global g_interfaces_dict
    global g_tmdata
    global g_report
    global g_args
    global g_entropies
    global g_z_entropies
    global g_minx
    global g_maxx
    global template
    global template_file
    global template_molprobity
    g_template_hitchain = template_hitchain
    g_interfaces_dict = interfaces_dict
    g_tmdata = tmdata
    g_report = report
    g_args = args
    g_entropies = entropies
    g_z_entropies = z_entropies
    g_minx = minx
    g_maxx = maxx
    pctools.print_section(3, 'OLIGOMER ANALYSIS')
    # Define template for comparisons
    template = template_hitchain.split(':')[0]
    template_file = template + '_CHOIR_RelevantChains.pdb'
    # NOTE: template_molprobity is only (re)assigned when Molprobity
    # assessment is requested; otherwise any previous value is left as is.
    if 'M' in args.assessment:
        template_molprobity, molprobity_output = pctools.run_molprobity(
            template_file, args)
        print(molprobity_output)

    # Run the analysis for all models, in parallel when requested.
    if args.multiprocess is True:
        # The pool is created only after the globals above are assigned.
        pool = Pool()
        try:
            results = pool.map_async(analyse_model, oligomers_list).get()
        finally:
            # Release the workers even if one of them raised.
            pool.close()
            pool.join()
    else:
        # Lazy map keeps the original "analyse, then print" interleaving.
        results = map(analyse_model, oligomers_list)

    reports = []
    for model_report, output in results:
        print(output)
        reports.append(model_report)

    return reports
Example #2
0
def _map_serial_or_pooled(func, items, multiprocess):
    """Apply *func* to each element of *items*, in a worker pool if asked.

    Returns a lazy ``map`` object in serial mode (so each call happens right
    before its result is consumed) and a fully computed list in pooled mode.
    The pool is always closed and joined, including when a worker raises.
    """
    if multiprocess is not True:
        return map(func, items)
    pool = Pool()
    try:
        return pool.map_async(func, items).get()
    finally:
        pool.close()
        pool.join()


def make_oligomer(input_file,
                  largest_oligo_complexes,
                  report,
                  args,
                  residue_index_mapping=None):
    """Pick an oligomeric template, align against it and build the models.

    The work is split into the three printed subsections:

    * 2[a] -- template selection. In structure mode the candidate with the
      highest average Q-score (from ``analyse_largest_complexes``) is used if
      it reaches ``args.qscore_cutoff``. In sequence mode the candidates are
      tried in order until ``make_local_template`` succeeds.
    * 2[b] -- per-chain alignments, merged by ``generate_ali`` and scored by
      ``score_alignment``.
    * 2[c] -- model generation through ``create_genmodel``/``run_modeller``.

    Several values (``workdir``, ``input_name``, ``verbosity``,
    ``g_input_file``, ``g_args``, ``best_oligo_template_code``,
    ``renamed_chains_file``) are published as module-level globals.

    Args:
        input_file: Path of the protomer input (``.pdb``, or the
            ``_CHOIR_MonomerSequence.fasta`` file in sequence mode).
        largest_oligo_complexes: Mapping of template hit-chain identifier to
            the template chains to use.
        report: Report dictionary; ``report['hits']`` is read and several
            ``best_*`` keys are written.
        args: Runtime arguments.
        residue_index_mapping: Passed through to ``generate_ali``.

    Returns:
        tuple: ``(best_oligo_template, built_oligomers, report)`` on success,
        or ``(None, None, report)`` with ``report['exit']`` set to ``'4'``
        (no structural template reached the cut-off), ``'5'`` (no template
        could be prepared in sequence mode) or ``'6'`` (alignment rejected).
    """
    global workdir
    global input_name
    global verbosity
    global g_input_file
    global g_args
    global best_oligo_template_code
    global renamed_chains_file
    g_input_file = input_file
    g_args = args
    verbosity = args.verbosity
    workdir = os.getcwd()

    # Subsection 2[a] #######################################################################
    if args.sequence_mode is False:
        input_name = os.path.basename(input_file).split(".pdb")[0].replace(
            '.', '_')
        candidate_qscores = {}
        # Select structurally best oligomeric template using GESAMT
        pctools.print_section(2, 'OLIGOMER ASSEMBLING')
        pctools.print_subsection('2[a]', 'Structural template selection')
        for hitchain, average_qscore, output in _map_serial_or_pooled(
                analyse_largest_complexes, largest_oligo_complexes.items(),
                args.multiprocess):
            candidate_qscores[hitchain] = average_qscore
            report['hits'][hitchain]['qscore'] = round(average_qscore, 3)
            print(output)

        # default=None covers the case of no candidates at all, which is
        # reported exactly like "no candidate reached the cut-off".
        best_oligo_template = max(candidate_qscores,
                                  key=candidate_qscores.get,
                                  default=None)
        if (best_oligo_template is None or not
                candidate_qscores[best_oligo_template] >= args.qscore_cutoff):
            print('No template had an average Q-score above cut-off of ' +
                  clrs['c'] + str(args.qscore_cutoff) + clrs['n'] +
                  '\nTry lowering the cutoff or running in sequence mode.\n')
            report['exit'] = '4'
            return None, None, report

        print('Structurally, the best template is: ' + clrs['y'] +
              best_oligo_template + clrs['n'] + '. Using that!\n')
        best_hit = report['hits'][best_oligo_template]
        report['best_template'] = best_oligo_template.split(':')[0]
        report['best_id'] = best_hit['id']
        report['best_cov'] = best_hit['coverage']
        report['best_qscore'] = best_hit['qscore']
        report['best_nchains'] = best_hit['final_homo_chains']
        report['topology_figure'] = './' + best_oligo_template.replace(
            ':', '_') + '_CHOIR_Topology.png'
        template_chains = largest_oligo_complexes[best_oligo_template]
        best_oligo_template_code = best_oligo_template.split(':')[0]
        clean_template_file = make_local_template(best_oligo_template_code)

    elif args.sequence_mode is True:
        if input_file.endswith('.pdb'):
            input_name = os.path.basename(input_file).split(".pdb")[0].replace(
                '.', '_')
            input_file = os.path.join(
                workdir, input_name + '_CHOIR_MonomerSequence.fasta')
            g_input_file = input_file

        elif input_file.endswith('_CHOIR_MonomerSequence.fasta'):
            input_name = os.path.basename(input_file).split(
                "_CHOIR_MonomerSequence.fasta")[0]

        pctools.print_section(2, 'OLIGOMER ASSEMBLING - SEQUENCE MODE')
        print(clrs['y'] +
              "Skipping section 2[a] - Structural template selection" +
              clrs['n'] + "\n")
        # Try the candidates in order; the first one whose local template can
        # be prepared wins.
        candidates = list(largest_oligo_complexes)
        for attempt, best_oligo_template in enumerate(candidates, start=1):
            try:
                report['best_template'] = best_oligo_template.split(':')[0]
                report['best_id'] = report['hits'][best_oligo_template]['id']
                report['best_cov'] = report['hits'][best_oligo_template][
                    'coverage']
                report['best_qscore'] = 'NA'
                report['best_nchains'] = report['hits'][best_oligo_template][
                    'final_homo_chains']
                report['topology_figure'] = './' + best_oligo_template.replace(
                    ':', '_') + '_CHOIR_Topology.png'
                template_chains = largest_oligo_complexes[best_oligo_template]
                best_oligo_template_code = best_oligo_template.split(':')[0]
                clean_template_file = make_local_template(
                    best_oligo_template_code)
                break
            except Exception:
                # Only ordinary errors trigger a retry; KeyboardInterrupt and
                # SystemExit are allowed to propagate.
                if attempt < len(candidates):
                    print('Attempt ' + str(attempt) +
                          ' failed, trying a differente template candidate.')
        else:
            # Every candidate failed, or there were no candidates at all.
            print('Failed to find templates in local databases.')
            report['exit'] = '5'
            return None, None, report

    relevant_chains_file = extract_relevant_chains(clean_template_file,
                                                   template_chains)
    if args.generate_report is True:
        report['template_figure'], pymol_output = pctools.pymol_screenshot(
            relevant_chains_file, args)
        print(pymol_output)
    renamed_chains_file, chains_dict = rename_relevant_chains(
        relevant_chains_file)
    relevant_chains = [
        chains_dict[template_chain] for template_chain in template_chains
    ]

    # Subsection 2[b] #######################################################################
    pctools.print_subsection('2[b]', 'Generating alignment')
    # Generate per chain alignment files
    alignment_files = []
    if args.sequence_mode is False:
        for _qscore, _rmsd, fasta_out, gesamt_output in _map_serial_or_pooled(
                run_gesamt_parallel, chains_dict.values(),
                args.multiprocess):
            alignment_files.append(fasta_out)
            print(gesamt_output)

    elif args.sequence_mode is True:
        for fasta_out, output in _map_serial_or_pooled(
                alignment_from_sequence, chains_dict.values(),
                args.multiprocess):
            alignment_files.append(fasta_out)
            print(output)
    print('Alignment files:\n' + clrs['g'] +
          ('\n').join([os.path.basename(i)
                       for i in alignment_files]) + clrs['n'])

    # Generate final alignment which will be the input for Modeller
    final_alignment, full_residue_mapping = generate_ali(
        alignment_files, best_oligo_template_code, residue_index_mapping, args)
    # Score said alignment and enforce threshold
    (report['relative_alignment_score'], relative_wscores,
     nchains) = score_alignment(final_alignment)
    print('\nFinal average relative score for alignment: ' +
          str(round(report['relative_alignment_score'], 2)) + '%')
    # Count the scored windows falling below the similarity cut-off.
    bad_streches = sum(1 for wscore in relative_wscores
                       if wscore < args.similarity_cutoff)
    if bad_streches >= args.bad_streches * nchains:
        if args.sequence_mode is True:
            print(
                '\nThe alignment score was unacceptable for ' + clrs['r'] +
                str(bad_streches) + clrs['n'] +
                ' 30-res segments of the protein complex.\nTry running the default (structure) mode.\n'
            )
        else:
            print(
                '\nThe alignment score was unacceptable for ' + clrs['r'] +
                str(bad_streches) + clrs['n'] +
                ' 30-res segments of the protein complex.\nTry increasing the number of candidate templates or tweaking the similarity cut-offs.\n'
            )
        report['exit'] = '6'
        return None, None, report

    # Subsection 2[c] #######################################################################
    pctools.print_subsection('2[c]', 'Generating models')
    genmodel_file, expected_models = create_genmodel(final_alignment,
                                                     best_oligo_template_code,
                                                     relevant_chains, args)
    run_modeller(genmodel_file)

    # Record list of oligomers built
    built_oligomers = [
        restore_chain_identifiers(model, chains_dict, full_residue_mapping)
        for model in expected_models
    ]
    print(clrs['b'] + 'ProtCHOIR' + clrs['n'] + ' built ' + clrs['c'] +
          str(len(built_oligomers)) + clrs['n'] + ' model oligomers:')
    for model in built_oligomers:
        print(clrs['g'] + model + clrs['n'])

    return best_oligo_template, built_oligomers, report
Example #3
0
def main():
    """Entry point: update the local databases or run the full protocol.

    Runtime options are taken from the module-level ``initial_args``.

    * With ``args.update`` set, the database locations are shown and, after
      interactive confirmation, ``update_databases`` is run. Without
      confirmation the process exits.
    * Otherwise the arguments and input file are validated, the input is
      normalised (dots/dashes removed from the name; FASTA input switches on
      sequence mode), bookkeeping files are written to the current directory
      and the pipeline ``analyze_protomer`` -> ``make_oligomer`` ->
      ``analyse_oligomers`` -> ``finalize`` is executed. If the first or
      second stage yields nothing, the run is finalized and the process
      exits with status 0.
    """

    args = initial_args

    # Define multiprocessing options
    args.available_cores = cpu_count()

    if args.force_single_core is True:
        args.multiprocess = False
        args.psiblast_threads = 1
        args.modeller_threads = 1
    else:
        if args.psiblast_threads is None:
            args.psiblast_threads = args.available_cores
        if args.modeller_threads is None:
            args.modeller_threads = min([args.available_cores, args.models])

    if args.update is True:
        print(
            tw.dedent("""
                                         !WARNING!

                      You have chosen to updtate the local databases.

              ** The root directory for the database files is: """ +
                      clrs['y'] + choirdb + clrs['n'] + """

              ** The path to local pdb mirror is: """ + clrs['y'] +
                      pdb_archive + clrs['n'] + """

              ** The path to local pdb biounit mirror is: """ + clrs['y'] +
                      pdb1_archive + clrs['n'] + """

              ** The path to local gesamt archive is: """ + clrs['y'] +
                      ges_homo_archive + clrs['n'] + """

              ** The path to local UniRef50 blast database is: """ +
                      clrs['y'] + uniref50 + clrs['n'] + """


              This could take a long time.

              <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

              """))
        option = input('Do you confirm the information above? (y/n)')
        if option in ('y', 'Y', 'YES', 'yes', 'Yes'):
            update_databases(args.verbosity)
            print('\n\nDone updating all databases. Exiting.\n')
        else:
            print('\n\nNo positive confirmation, will not update databases.\n')
            sys.exit()
        # Update mode never runs the oligomerization protocol.
        return

    # Actually run oligomerization protocol
    outdir = os.getcwd()
    input_file = args.input_file
    # NOTE(review): these sanity checks are plain asserts, so they are
    # skipped entirely when Python runs with -O.
    assert os.path.isdir(pdb_archive), (
        clrs['r'] + '\n\n Not able to find PDB directory.\n\n Does "' +
        pdb_archive + '" exist?' + clrs['n'])
    assert os.path.isdir(pdb1_archive), (
        clrs['r'] +
        '\n\n Not able to find PDB1 assemblies directory.\n\n Does "' +
        pdb1_archive + '" exist?' + clrs['n'])
    assert os.path.isdir(pdb_homo_archive), (
        clrs['r'] +
        '\n\n Not able to find ProtCHOIR database directory.\n\n Does "' +
        pdb_homo_archive + '" exist?' + clrs['n'])
    assert os.path.isdir(ges_homo_archive), (
        clrs['r'] +
        '\n\n Not able to find GESAMT archive directory.\n\n Does "' +
        ges_homo_archive + '" exist?' + clrs['n'])
    assert args.refine_level in [0, 1, 2, 3, 4], (
        clrs['r'] +
        '\n\n Refinement level must be an integer number from 0 to 4.\n Run ProtCHOIR -h for more information\n\n'
        + clrs['n'])
    assert args.psiblast_params in psiblast_params, (
        clrs['r'] +
        '\n\n PSI-BLAST parameters invalid.\n Run ProtCHOIR -h for more information\n\n'
        + clrs['n'])
    assert input_file is not None, (
        clrs['r'] +
        '\n\n Please inform the input file name.\n Run ProtCHOIR -h for more information.\n\n'
        + clrs['n'])
    assert os.path.isfile(input_file), (
        clrs['r'] + '\n\n Not able to find input file.\n\n Does "' +
        input_file + '" exist?\n' + clrs['n'])
    assert args.zip_output in [0, 1, 2], (
        clrs['r'] +
        '\n\n Compression level must be an integer number between 0 and 2.\n Run ProtCHOIR -h for more information\n\n'
        + clrs['n'])
    assert all(i in set('MIG') for i in set(args.assessment)
               ) or args.assessment == 'N', (
        clrs['r'] +
        '\n\n Oligomer assessment type do not comply.\n Choose any combination of [G]Gesamt, [M]Molprobity, [I]Interfaces or choose [N] for None\n\n'
        + clrs['n'])

    # Force generation of topologies and all assessments if final report is requested
    if args.generate_report is True:
        args.assessment = 'MIG'
        args.plot_topologies = True

    # Deal with dots and dashes in the input file and remove dots
    if input_file.lower().endswith('.pdb'):
        input_basename = os.path.basename(input_file).split('.pdb')[0]
        input_basename = input_basename.replace(".", "_")
        input_basename = input_basename.replace("-", "_")
        new_input_file = input_basename + '.pdb'
        # NOTE(review): the copy is skipped whenever the base names match,
        # even if input_file lives in another directory -- confirm that the
        # working directory is always the input's directory.
        if os.path.basename(input_file) != os.path.basename(new_input_file):
            shutil.copy(input_file, new_input_file)

    # Also process filename to fasta header if input file is fasta
    elif input_file.lower().endswith('.fasta'):
        input_basename = os.path.basename(input_file).split('.fasta')[0]
        input_basename = input_basename.replace(".", "_")
        input_basename = input_basename.replace("-", "_")
        new_input_file = os.path.join(
            outdir, input_basename + '_CHOIR_MonomerSequence.fasta')
        with open(input_file, 'r') as infile, open(new_input_file,
                                                   'w') as outfile:
            outfile.write('>' + input_basename + '\n')
            # Copy sequence lines up to (not including) the second header,
            # i.e. keep only the first record of the FASTA file.
            n = 0
            for line in infile:
                if not line.startswith('>'):
                    outfile.write(line)
                else:
                    n += 1
                if n == 2:
                    break
        args.sequence_mode = True
    else:
        raise pctools.FileFormatError(
            clrs['r'] +
            '\n\n Input format must be either pdb or fasta\n Run ./ProtCHOIR -h for more information\n\n'
            + clrs['n'])
    if args.allow_monomers:
        assert args.sequence_mode is True, (
            clrs['r'] +
            '\n\n To allow building monomers you must use sequence mode. \n Run ProtCHOIR -h for more information\n\n'
            + clrs['n'])

    # Start recording job progress
    with open('CHOIR_Progress.out', 'w') as f:
        f.write("Starting new ProtCHOIR run\n")

    # Pickle runtime arguments (context manager so the handle is closed)
    with open('CHOIR_Args.pickle', 'wb') as pickle_file:
        pickle.dump(args, pickle_file)

    # Show arguments used and write them to CHOIR.args
    pctools.print_section(0, "Runtime Arguments")
    runtime_arguments = {}
    choir_args = os.path.join(outdir, "CHOIR.args")
    with open(choir_args, 'w') as f:
        for name, value in vars(args).items():
            runtime_arguments[name] = value
            print(name + "=" + str(value))
            f.write(name + "=" + str(value) + "\n")
    print('\nRuntime parameters written to: ' + clrs['g'] +
          os.path.basename(choir_args) + clrs['n'] + '\n')

    # Initialize report
    report = {}
    report['runtime_arguments'] = runtime_arguments
    report['input_filename'] = os.path.basename(new_input_file)

    # Write errorproof placeholder summary: every column not known yet is
    # filled with 'NA'.
    placeholder_report = report.copy()
    report_data = [
        'input_filename', 'sequence_mode', 'templatedmodel',
        'protomer_residues', 'tmspans', 'highest_scoring_state',
        'homo_oligomeric_over_other_score', 'best_template',
        'best_nchains', 'best_id', 'best_cov', 'best_qscore',
        'model_oligomer_name', 'model_molprobity', 'gesamt_rmsd',
        'protchoir_score', 'surface_score', 'interfaces_score',
        'quality_score', 'total_runtime', 'exit'
    ]
    for data in report_data:
        placeholder_report.setdefault(data, 'NA')
    with open(input_basename + '_CHOIR_Summary.tsv', 'w') as f:
        f.write(
            'Input\tSeq.Mode\tTemplated\tLength\tTMSpans\tLikelyState\tH3OScore\tTemplate\tChains\tIdentity\tCoverage\tAv.QScore\tBestModel\tMolprobity\tRMSD\tProtCHOIR\tSurface\tInterfaces\tQuality\tRuntime\tExit\n'
        )
        f.write('\t'.join(
            [str(placeholder_report[data])
             for data in report_data]) + '\n')

    # Start analysis of protomer
    analyse_protomer_results, report, args = analyze_protomer(
        new_input_file, report, args)

    # If no suitable homo-oligomeric template was found, exit nicely.
    if analyse_protomer_results is None:
        finalize(report, input_basename, start_time, start_timestamp, args)
        pctools.print_sorry()
        sys.exit(0)

    # Else, unpack the results; their layout depends on the run mode and on
    # whether conservation was skipped.
    if args.sequence_mode is True:
        residue_index_mapping = None
        minx = None
        maxx = None
        if args.skip_conservation:
            entropies = None
            z_entropies = None
            (pdb_name, clean_input_file, largest_oligo_complexes,
             interfaces_dict, tmdata) = analyse_protomer_results
        else:
            (pdb_name, clean_input_file, largest_oligo_complexes,
             interfaces_dict, entropies, z_entropies,
             tmdata) = analyse_protomer_results
            # No conservation data came back: treat it as skipped.
            if all(value is None
                   for value in (entropies, z_entropies, minx, maxx)):
                args.skip_conservation = True

    elif args.sequence_mode is False:
        if args.skip_conservation:
            minx = None
            maxx = None
            entropies = None
            z_entropies = None
            (pdb_name, clean_input_file, largest_oligo_complexes,
             interfaces_dict, residue_index_mapping,
             tmdata) = analyse_protomer_results
        else:
            (pdb_name, clean_input_file, largest_oligo_complexes,
             interfaces_dict, entropies, z_entropies, residue_index_mapping,
             minx, maxx, tmdata) = analyse_protomer_results
            # No conservation data came back: treat it as skipped.
            if all(value is None
                   for value in (entropies, z_entropies, minx, maxx)):
                args.skip_conservation = True

    report['runtime_arguments'][
        'skip_conservation'] = args.skip_conservation

    new_input_file = clean_input_file

    # Use information of complexes to build oligomers
    best_oligo_template, built_oligomers, report = make_oligomer(
        new_input_file,
        largest_oligo_complexes,
        report,
        args,
        residue_index_mapping=residue_index_mapping)

    # If no models were built, exit nicely.
    if built_oligomers is None:
        finalize(report, input_basename, start_time, start_timestamp, args)
        pctools.print_sorry()
        sys.exit(0)

    # Analyse built models
    reports = analyse_oligomers(new_input_file,
                                best_oligo_template,
                                built_oligomers,
                                interfaces_dict,
                                tmdata,
                                report,
                                args,
                                entropies=entropies,
                                z_entropies=z_entropies,
                                minx=minx,
                                maxx=maxx)
    # NOTE(review): finalize receives the list of per-model reports here but
    # a single report dict on the early-exit paths above -- confirm that it
    # accepts both.
    finalize(reports, input_basename, start_time, start_timestamp, args)