Example 1
def list_new_files(pdb1_archive, assession_log, verbosity):
    '''
    Taps into the pdb1 local repository and checks whether there are new files
    that should be assessed by the curate_homoDB function. It builds a list of
    files from the pdb1 database that are newer than the ones last assessed
    (registered in the dat file); that is, it takes the previous assession log
    as input, in the form of a dictionary whose keys are filenames and whose
    values are the last assessment times.
    Called by: curate_homoDB()
    '''
    new_files = []
    pctools.printv('Assessing files in PDB1 archive...', verbosity)
    assert os.path.isdir(pdb1_archive), clrs[
        'r'] + '\n\n Not able to find PDB archive.\n\n Does "' + pdb1_archive + '" exist?' + clrs[
            'n']
    pdbfiles = [
        os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb1_archive)
        for f in filenames if f.endswith(".pdb1.gz")
    ]
    for f in pdbfiles:
        filename = f.split('/')[-1]
        mod_date = os.path.getctime(f)
        if filename not in assession_log or mod_date > float(
                assession_log[filename]):
            pctools.printv(
                clrs['y'] + f + ' should be assessed' + clrs['n'] + '...\n',
                verbosity)
            new_files.append(f)
    return new_files
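
For reference, the newer-than test above reduces to a self-contained predicate. A minimal sketch (assuming the docstring's filename-to-epoch-seconds log convention; is_new is illustrative, not part of ProtCHOIR):

import os

def is_new(path, assession_log):
    # assession_log maps basenames to epoch seconds stored as strings,
    # e.g. {'1abc.pdb1.gz': '1614712345.0'}
    filename = os.path.basename(path)
    return (filename not in assession_log
            or os.path.getctime(path) > float(assession_log[filename]))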
Example 2
def make_local_template(best_oligo_template):
    middle_letters_best = best_oligo_template[1:3]
    if g_args.allow_monomers:
        best_template_file = os.path.join(
            pdb_archive, middle_letters_best,
            'pdb' + best_oligo_template + ".ent.gz")
        pdb_name, contents = pctools.parse_pdb_contents(best_template_file)
        is_nmr = pctools.is_nmr(contents)
        if is_nmr:
            print(
                clrs['r'] + '\n\n Selected template ' + best_oligo_template +
                ' is an NMR structure \n Will try a different candidate.\n\n'
                + clrs['n'])
            # A bare `raise` has no active exception to re-raise here; raise
            # explicitly so the caller's except block moves to the next candidate.
            raise RuntimeError('NMR template rejected')

    else:
        best_template_file = os.path.join(pdb_homo_archive,
                                          middle_letters_best,
                                          best_oligo_template + ".pdb.gz")
    clean_template_file = os.path.join(
        workdir, best_oligo_template + "_CHOIR_CleanTemplate.pdb")
    pdb_name, structure, nchains = pctools.parse_any_structure(
        best_template_file)
    io.set_structure(structure)
    io.save(clean_template_file, pctools.SelectIfCA())
    return clean_template_file
Example 3
def analyse_oligomers(input_file,
                      template_hitchain,
                      oligomers_list,
                      interfaces_dict,
                      tmdata,
                      report,
                      args,
                      entropies=None,
                      z_entropies=None,
                      minx=None,
                      maxx=None):
    global g_template_hitchain
    global g_interfaces_dict
    global g_tmdata
    global g_report
    global g_args
    global g_entropies
    global g_z_entropies
    global g_minx
    global g_maxx
    global template
    global template_file
    global template_molprobity
    g_template_hitchain = template_hitchain
    g_interfaces_dict = interfaces_dict
    g_tmdata = tmdata
    g_report = report
    g_args = args
    g_entropies = entropies
    g_z_entropies = z_entropies
    g_minx = minx
    g_maxx = maxx
    pctools.print_section(3, 'OLIGOMER ANALYSIS')
    # Define template for comparisons
    template = template_hitchain.split(':')[0]
    template_file = template + '_CHOIR_RelevantChains.pdb'
    reports = []
    if 'M' in args.assessment:
        template_molprobity, molprobity_output = pctools.run_molprobity(
            template_file, args)
        print(molprobity_output)

    # Run the analysis for all models in parallel
    if args.multiprocess is True:
        p = Pool()
        for model_report, output in p.map_async(analyse_model,
                                                oligomers_list).get():
            print(output)
            reports.append(model_report)
        p.close()
        p.join()

    else:
        for oligomer in oligomers_list:
            model_report, output = analyse_model(oligomer)
            print(output)
            reports.append(model_report)

    return reports
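
The parallel branch above relies on Pool.map_async preserving input order and on analyse_model returning (model_report, printable_output) tuples. A minimal self-contained sketch of that pattern:

from multiprocessing import Pool

def work(item):
    # Stand-in for analyse_model: returns (model_report, printable_output).
    return {'model': item}, 'processed ' + str(item)

if __name__ == '__main__':
    reports = []
    with Pool() as p:
        for model_report, output in p.map_async(work, range(4)).get():
            print(output)
            reports.append(model_report)
    print(reports)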
Example 4
def extract_relevant_chains(pdb_file, relevant_chains):
    template_name = os.path.basename(pdb_file).split('_CHOIR_')[0]
    pname, structure, nchains = pctools.parse_any_structure(pdb_file)
    relevant_chains_file = os.path.join(
        workdir, template_name + "_CHOIR_RelevantChains.pdb")
    chains = bpp.Selection.unfold_entities(structure, 'C')
    io.set_structure(structure)
    io.save(relevant_chains_file, pctools.SelectChains(relevant_chains))

    return relevant_chains_file
Example 5
def restore_chain_identifiers(pdb_file, chains_dict, full_residue_mapping):
    pname, structure, nchains = pctools.parse_any_structure(pdb_file)
    restored_chains_file = os.path.join(workdir,
                                        pname + "_CHOIR_CorrectedChains.pdb")
    chains = bpp.Selection.unfold_entities(structure, 'C')
    str_id = structure.id
    new_structure = bpp.Structure.Structure(str_id)
    new_model = bpp.Model.Model(0)
    for original, current in chains_dict.items():
        for chain in chains:
            if chain.id == current:
                new_chain = bpp.Chain.Chain(current)
                new_chain.id = original
                for residue in chain:
                    new_residue = bpp.Residue.Residue(residue.id,
                                                      residue.get_resname(),
                                                      residue.get_segid())
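                    # full_residue_mapping[current] is either an OrderedDict
                    # (old residue number -> new residue number) or an int
                    # offset to add to each residue number.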
                    if type(full_residue_mapping[current]
                            ) is collections.OrderedDict:
                        for atom in residue:
                            new_residue.add(atom)
                        new_residue.id = (
                            ' ', full_residue_mapping[current][residue.id[1]],
                            ' ')
                    if type(full_residue_mapping[current]) is int:
                        for atom in residue:
                            new_residue.add(atom)
                        new_residue.id = (' ', full_residue_mapping[current] +
                                          residue.id[1], ' ')
                    new_chain.add(new_residue)
                new_model.add(new_chain)
    new_structure.add(new_model)
    io.set_structure(new_structure)
    io.save(restored_chains_file)
    return restored_chains_file
Example 6
def update_seqres(verbosity):
    '''
    Runs wget to update the local seqres database, decompresses it and runs
    makeblastdb.
    Called by: update_databases()
    '''
    seqres_dir = os.path.join(choirdb, 'seqres')
    if not os.path.isdir(seqres_dir):
        os.mkdir(seqres_dir)
    seqres_txt = os.path.join(seqres_dir, 'pdb_seqres.txt')
    seqres_fasta = os.path.join(seqres_dir, 'seqres.fasta')
    pctools.printv('Fetching pdb_seqres.txt...', verbosity)
    attempt = 0
    wgetout = b''  # ensure the decode() calls below never hit an unbound name
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=3', '--user=anonymous',
                seqres_ftp, '-P', seqres_dir
            ],
                                              stderr=subprocess.STDOUT)
            break
        except:
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            if attempt == 3:
                print(
                    'Failed to download seqres in 3 attempts. Try again later.'
                )

    no_wget = 'seqres.txt.gz’ -- not retrieving'

    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(seqres_txt):
        pctools.printv('Decompressing pdb_seqres.txt...', verbosity)

        with gzip.open(seqres_txt + '.gz',
                       'rb') as fin, open(seqres_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(seqres_fasta + '.pal'):
        subprocess.run([
            makeblastdb_exe, '-in', seqres_fasta, '-parse_seqids', '-dbtype',
            'prot', '-blastdb_version', '5', '-out', seqres
        ])
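
The three-attempt download loop above recurs in update_uniref below; a generic retry helper capturing the same behaviour could look like this (a sketch, not part of ProtCHOIR):

import subprocess

def run_with_retries(cmd, attempts=3):
    # Return the command's combined stdout/stderr, or None after all attempts fail.
    for attempt in range(1, attempts + 1):
        try:
            return subprocess.check_output(cmd, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            if attempt < attempts:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            else:
                print('Failed after ' + str(attempts) + ' attempts. Try again later.')
    return None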
Example 7
def update_uniref(verbosity):
    '''
    Runs wget to update the local uniref50 database, decompresses it and runs
    makeblastdb.
    Called by: update_databases()
    '''
    uniref50_fasta = os.path.join(choirdb, 'uniref50/uniref50.fasta')
    pctools.printv('Fetching uniref50.fasta...', verbosity)
    attempt = 0
    wgetout = b''  # ensure the decode() calls below never hit an unbound name
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=4', '--user=anonymous',
                uniref50_ftp, '-P', choirdb
            ],
                                              stderr=subprocess.STDOUT)
            break
        except:
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            if attempt == 3:
                print(
                    'Failed to download UniRef50 in 3 attempts. Try again later.'
                )

    no_wget = 'uniref50.fasta.gz’ -- not retrieving'

    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(uniref50_fasta):
        pctools.printv('Decompressing uniref50.fasta...', verbosity)

        with gzip.open(uniref50_fasta + '.gz',
                       'rb') as fin, open(uniref50_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(uniref50_fasta + '.pal'):
        subprocess.run([
            makeblastdb_exe, '-in', uniref50_fasta, '-parse_seqids', '-dbtype',
            'prot', '-out', uniref50
        ])
Example 8
def score_pairwise(seq1, seq2, matrix, gap_s, gap_e):
    score = 0
    gap = False
    ipos = 0
    fpos = 30
    nwindows = -(-len(seq1) // 30)  # ceiling division: number of 30-residue windows
    pctools.printv('Number of 30-residue segments: ' + str(nwindows),
                   g_args.verbosity)
    wscores = []
    for window in range(nwindows):
        wscore = 0
        if fpos > len(seq1):
            fpos = len(seq1)
        pctools.printv(
            str(ipos + 1) + ' ' + seq1[ipos:fpos] + ' ' + str(fpos),
            g_args.verbosity)
        pctools.printv(
            str(ipos + 1) + ' ' + seq2[ipos:fpos] + ' ' + str(fpos),
            g_args.verbosity)
        for i in range(len(seq1))[ipos:fpos]:
            pair = (seq1[i], seq2[i])
            if not gap:
                if pair == ('-', '-'):
                    score += 4
                    wscore += 4
                elif '-' in pair:
                    gap = True
                    score += gap_s
                    wscore += gap_s
                else:
                    score += score_match(pair, matrix)
                    wscore += score_match(pair, matrix)
            else:
                if '-' not in pair:
                    gap = False
                    score += score_match(pair, matrix)
                    wscore += score_match(pair, matrix)
                else:
                    score += gap_e
                    wscore += gap_e

        ipos += 30
        fpos += 30
        pctools.printv('Segment score: ' + str(wscore), g_args.verbosity)
        wscores.append(wscore)

    return score, wscores
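
score_match is not shown in this listing; a dict-backed version consistent with the calls above could look like this (the matrix layout is an assumption):

def score_match(pair, matrix):
    # Substitution matrices are usually keyed on one ordering of each
    # residue pair, so fall back to the swapped tuple.
    if pair in matrix:
        return matrix[pair]
    return matrix[(pair[1], pair[0])]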
Example 9
def record_fasta(pdb_code, seqs, chain_ids, subfolder, type=None):
    if not os.path.isdir(os.path.join(pdb_homo_archive, subfolder)):
        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
    type_folder = os.path.join(pdb_homo_archive, subfolder,
                               type + '_sequences')
    if not os.path.isdir(type_folder):
        os.mkdir(type_folder)
    fasta_file = os.path.join(type_folder, pdb_code + ".fasta")
    with open(fasta_file, 'w+') as f:
        for seq, chain_id in zip(seqs, chain_ids):
            if pctools.is_valid_sequence(seq[1]):
                wrapped_seq = "\n".join(tw.wrap(seq[1]))
                fasta_entry = '>' + pdb_code + ':' + str(
                    chain_id) + '\n' + wrapped_seq + '\n\n'
                f.write(fasta_entry)
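
The tw.wrap call above (assuming tw is the standard textwrap module, as the tw.dedent usage elsewhere suggests) wraps at 70 characters per line by default, which is what produces the FASTA line breaks:

import textwrap as tw

seq = 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ' * 3
print('\n'.join(tw.wrap(seq)))  # 70-character lines, like the fasta_entry above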
Example 10
def rename_relevant_chains(pdb_file):
    template_name = os.path.basename(pdb_file).split('_CHOIR_')[0]
    pname, structure, nchains = pctools.parse_any_structure(pdb_file)
    renamed_chains_file = os.path.join(
        workdir, template_name + "_CHOIR_RenamedChainsTemplate.pdb")
    chains = bpp.Selection.unfold_entities(structure, 'C')
    chains_dict = {}
    n = 1
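    # First pass: assign temporary 'X'-prefixed two-character IDs so a new
    # single-letter ID cannot collide with a chain not yet renamed.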
    for chain in chains:
        original = chain.id
        new = numalpha[str(n)]
        chain.id = 'X' + new
        n += 1
        chains_dict[original] = new
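    # Second pass: strip the temporary 'X' prefix, leaving the final IDs.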
    for chain in chains:
        chain.id = chain.id[1]
    io.set_structure(structure)
    io.save(renamed_chains_file)

    return renamed_chains_file, chains_dict
Example 11
def analyse_largest_complexes(item):
    output = []
    hitchain, chains = item
    template, hit_chain = hitchain.split(':')
    middle_letters = template[1:3]
    template_file = os.path.join(pdb_homo_archive, middle_letters,
                                 template + ".pdb.gz")
    sum_qscore = 0
    chain_n = 0
    for chain in chains:
        chain_n += 1
        qscore, rmsd, fasta_out, gesamt_output = pctools.run_gesamt(
            template, template_file, input_name, g_input_file, chain, g_args)
        sum_qscore += float(qscore)
        output.append(gesamt_output)

    average_qscore = sum_qscore / chain_n
    output.append('--\n\nAverage Q-Score for all candidate chains is ' +
                  clrs['c'] + str(average_qscore) + clrs['n'] + '\n')
    output.append(
        '-------------------------------------------------------------------\n'
    )

    return hitchain, average_qscore, '\n'.join(output)
Example 12
def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local pdb repository.
    The divided scheme adopted by RCSB, in which the subdirectories
    are the two middle characters in the PDB code, is assumed.
    Each database contains three key files: dat, log and fasta.
    * homodb.dat contains only the pdb codes contained in the database.
    * homodb.log contains summarized relevant information about each entry.
    * homodb.fasta contains the sequences of every chain in the database.
    Called by: update_databases()
    '''
    # Create stats folder if does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    if not os.path.isdir(stats_dir):
        os.mkdir(stats_dir)
    # Compare latest assession with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] +
          ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    if not os.path.isfile(dat_file):
        with open(dat_file, 'w+'):
            pass
    # Write files not to be updated to new dat file
    with open(dat_file, 'a') as f:
        for i in assession_log:
            if i not in new_files:
                f.write(i + " " + assession_log[i] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')

    # Read Chain correspondences
    chain_correspondences_file = os.path.join(stats_dir,
                                              'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}

    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = pdb.split('/')[-1]
        subfolder = pdb.split('/')[-2]
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assessment
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Reject files larger than 2 MB
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv(
            'File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb' +
            clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(
                clrs['y'] +
                "Will try to fetch sequences from asymmetric unit." +
                clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(
                    pdb_archive, subfolder,
                    'pdb' + filename.split('.')[0] + '.ent.gz')
                pdb_code, structure, nchains = pctools.parse_pdb_structure(
                    alternative_pdb)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                    structure, 0)
                # Write in fasta file
                pctools.printv(
                    clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                    verbosity)
                record_fasta(pdb_code,
                             seqs,
                             chain_ids,
                             subfolder,
                             type='largepdb')
            except:
                pctools.printv(
                    clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                    verbosity)
            continue

        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv(
                'Number of chains in structure ' + clrs['y'] + pdb_code +
                clrs['n'] + ': ' + str(nchains), verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv(
                    "Number of chains (" + clrs['y'] + str(nchains) +
                    clrs['n'] + ") larger than 60! " + clrs['r'] +
                    "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(
                    clrs['y'] + "Will try to fetch sequences anyway." +
                    clrs['n'], verbosity)
                try:
                    pdb_code, structure, nchains = pctools.parse_pdb_structure(
                        pdb)
                    structure, chain_correspondences[
                        pdb_code] = pctools.split_states(structure)
                    nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                        structure, 0)
                    pctools.printv(
                        clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                        verbosity)
                    # Write in fasta file
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='largepdb')
                except:
                    pctools.printv(
                        clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                        verbosity)
                continue

            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv(
                'Number of chains (' + clrs['c'] + str(nchains) + clrs['n'] +
                ') and file size (' + clrs['c'] + str(file_size) + clrs['n'] +
                ') OK.' + clrs['g'] + ' Proceeding.' + clrs['n'] + '\n',
                verbosity)
            # Try to get info from the canonical pdb header (homonymous to pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except:
                pctools.printv(
                    clrs['r'] +
                    '\n\n Mismatch between pdb and biounit entries...' +
                    clrs['n'], verbosity)
            author, software = pctools.get_annotated_states(contents)
            pctools.printv(
                'Author determined biological unit = ' + str(author),
                verbosity)
            pctools.printv(
                'Software determined quaternary structure= ' + str(software),
                verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for id in ids:
                    if id[0] >= 90:
                        color = clrs['g']
                    else:
                        color = clrs['r']
                    pctools.printv(
                        'Identity between chains ' + clrs['y'] + str(id[1]) +
                        clrs['n'] + ' and ' + clrs['y'] + str(id[2]) +
                        clrs['n'] + ' is ' + color + str(id[0]) + "%" +
                        clrs['n'] + ".", verbosity)
                # Save records for pure homo-oligomers
                if all(id[0] > 90 for id in ids) and proteinpair is True:
                    pctools.printv(
                        "All identities over 90%. Likely " + clrs['b'] +
                        "h**o-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                   verbosity)
                    # Write file to database
                    newfile = os.path.join(pdb_homo_archive, subfolder,
                                           pdb_code + ".pdb")
                    if not os.path.isdir(
                            os.path.join(pdb_homo_archive, subfolder)):
                        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                    io.set_structure(structure)
                    io.save(newfile)
                    pctools.gzip_pdb(newfile)
                    # Write to log file
                    with open(log_file, 'a') as f:
                        f.write(
                            str(pdb_code) + "," + str(nchains) + "," +
                            '/'.join(author) + "," + '/'.join(software) + "," +
                            str(os.path.getctime(newfile + '.gz')) + '\n')
                    # Write in fasta file
                    pctools.printv(
                        clrs['y'] + "Recording h**o-oligomer sequence." +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='homo')

                # Investigate partial homo-oligomers
                elif any(id[0] > 90 for id in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for id in ids:
                        if id[0] > 90:
                            # Check if similar chains share interfaces
                            if pctools.check_interfaces(
                                    structure, id[1], id[2]):
                                at_least_one_interface = True
                                pctools.printv(
                                    'Contacts found between chains ' +
                                    clrs['g'] + str(id[1]) + clrs['n'] +
                                    ' and ' + clrs['g'] + str(id[2]) +
                                    clrs['n'] + ' sharing ' + clrs['g'] +
                                    str(id[0]) + clrs['n'] + " % identity.",
                                    verbosity)
                                pctools.printv(
                                    "At least one putative " + clrs['b'] +
                                    "h**o-oligomeric " + clrs['n'] +
                                    "interface found.", verbosity)
                                pctools.printv(
                                    clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                    verbosity)
                                # Write file to database
                                newfile = os.path.join(pdb_homo_archive,
                                                       subfolder,
                                                       pdb_code + ".pdb")
                                if not os.path.isdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder)):
                                    os.mkdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder))
                                io.set_structure(structure)
                                io.save(newfile)
                                pctools.gzip_pdb(newfile)
                                # Write to log file
                                with open(log_file, 'a') as f:
                                    f.write(
                                        str(pdb_code) + "," + str(nchains) +
                                        "," + '/'.join(author) + "," +
                                        '/'.join(software) + "," +
                                        str(os.path.getctime(newfile +
                                                             '.gz')) + '\n')
                                # Write in fasta file
                                pctools.printv(
                                    clrs['y'] +
                                    "Recording h**o-oligomer sequence." +
                                    clrs['n'], verbosity)
                                record_fasta(pdb_code,
                                             seqs,
                                             chain_ids,
                                             subfolder,
                                             type='homo')

                                break
                    if at_least_one_interface is False:
                        pctools.printv(
                            "No h**o-oligomeric interface found. Likely " +
                            clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".",
                            verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')

                elif proteinpair is False:
                    pctools.printv(
                        clrs['r'] + "No proteic chain pairs found" +
                        clrs['n'] + ".", verbosity)
                    if any([set(seq[1]) != {'X'} for seq in seqs]):
                        pctools.printv(
                            clrs['y'] + "Protein sequences found though" +
                            clrs['n'], verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')
                    else:
                        pctools.printv(
                            clrs['r'] +
                            "Not even a single protein chain. Disregarding." +
                            clrs['n'], verbosity)

                else:
                    pctools.printv(
                        "No similar chains found. Likely " + clrs['r'] +
                        "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(
                        clrs['y'] + "Recording hetero-oligomer sequence" +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='hetero')

            elif int(nchains) == 1:
                pctools.printv(
                    "Only one chain found. Likely " + clrs['r'] + "monomeric" +
                    clrs['n'] + ".", verbosity)
                pctools.printv(
                    clrs['y'] + "Recording monomer sequence." + clrs['n'],
                    verbosity)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')

        except:
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(
                clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue) +
                ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'],
                verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            #pctools.printv(clrs['r']+"UNKNOWN FAULT"+clrs['n']+".", verbosity)
            if not os.path.isfile(err_file):
                with open(err_file, 'w+') as f:
                    pass
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue

    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p, protocol=pickle.HIGHEST_PROTOCOL)

    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated successfully.\n')
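
read_latest_assession is referenced above but not shown in this listing. Given the dat-file format curate_homoDB writes (one "<filename> <epoch-seconds>" pair per line, one timestamped dat file per run), a sketch could be:

import glob
import os

def read_latest_assession(stats_dir):
    # Hypothetical helper: parse the most recent *-choirdb.dat into a dict.
    assession_log = {}
    dat_files = sorted(glob.glob(os.path.join(stats_dir, '*-choirdb.dat')),
                       key=os.path.getmtime)
    if dat_files:
        with open(dat_files[-1]) as f:
            for line in f:
                fields = line.split()
                if len(fields) == 2:
                    assession_log[fields[0]] = fields[1]
    return assession_log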
Example 13
def main():

    args = initial_args

    # Define multiprocessing options
    args.available_cores = cpu_count()

    if args.force_single_core is True:
        args.multiprocess = False
        args.psiblast_threads = 1
        args.modeller_threads = 1
    else:
        if args.psiblast_threads is None:
            args.psiblast_threads = args.available_cores
        if args.modeller_threads is None:
            args.modeller_threads = min([args.available_cores, args.models])

    if args.update is True:
        print(
            tw.dedent("""
                                         !WARNING!

                      You have chosen to update the local databases.

              ** The root directory for the database files is: """ +
                      clrs['y'] + choirdb + clrs['n'] + """

              ** The path to local pdb mirror is: """ + clrs['y'] +
                      pdb_archive + clrs['n'] + """

              ** The path to local pdb biounit mirror is: """ + clrs['y'] +
                      pdb1_archive + clrs['n'] + """

              ** The path to local gesamt archive is: """ + clrs['y'] +
                      ges_homo_archive + clrs['n'] + """

              ** The path to local UniRef50 blast database is: """ +
                      clrs['y'] + uniref50 + clrs['n'] + """


              This could take a long time.

              <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

              """))
        option = input('Do you confirm the information above? (y/n)')
        if option.lower() in ('y', 'yes'):
            update_databases(args.verbosity)
            print('\n\nDone updating all databases. Exiting.\n')
        else:
            print('\n\nNo positive confirmation, will not update databases.\n')
            exit()
    # Actually run oligomerization protocol
    else:
        outdir = os.getcwd()
        input_file = args.input_file
        assert os.path.isdir(pdb_archive), clrs[
            'r'] + '\n\n Not able to find PDB directory.\n\n Does "' + pdb_archive + '" exist?' + clrs[
                'n']
        assert os.path.isdir(pdb1_archive), clrs[
            'r'] + '\n\n Not able to find PDB1 assemblies directory.\n\n Does "' + pdb1_archive + '" exist?' + clrs[
                'n']
        assert os.path.isdir(pdb_homo_archive), clrs[
            'r'] + '\n\n Not able to find ProtCHOIR database directory.\n\n Does "' + pdb_homo_archive + '" exist?' + clrs[
                'n']
        assert os.path.isdir(ges_homo_archive), clrs[
            'r'] + '\n\n Not able to find GESAMT archive directory.\n\n Does "' + ges_homo_archive + '" exist?' + clrs[
                'n']
        assert args.refine_level in [0, 1, 2, 3, 4], clrs[
            'r'] + '\n\n Refinement level must be an integer number from 0 to 4.\n Run ProtCHOIR -h for more information\n\n' + clrs[
                'n']
        assert args.psiblast_params in psiblast_params, clrs[
            'r'] + '\n\n PSI-BLAST parameters invalid.\n Run ProtCHOIR -h for more information\n\n' + clrs[
                'n']
        assert input_file is not None, clrs[
            'r'] + '\n\n Please inform the input file name.\n Run ProtCHOIR -h for more information.\n\n' + clrs[
                'n']
        assert os.path.isfile(input_file), clrs[
            'r'] + '\n\n Not able to find input file.\n\n Does "' + input_file + '" exist?\n' + clrs[
                'n']
        assert args.zip_output in [0, 1, 2], clrs[
            'r'] + '\n\n Compression level must be an integer number between 0 and 2.\n Run ProtCHOIR -h for more information\n\n' + clrs[
                'n']
        assert all([
            i in set('MIG') for i in set(args.assessment)
        ]) or args.assessment == 'N', clrs[
            'r'] + '\n\n Oligomer assessment type does not comply.\n Choose any combination of [G]esamt, [M]olprobity, [I]nterfaces or choose [N] for None\n\n' + clrs[
                'n']

        # Force generation of topologies and all assessments if final report is requested
        if args.generate_report is True:
            args.assessment = 'MIG'
            args.plot_topologies = True

        # Deal with dots and dashes in the input file and remove dots
        if input_file.lower().endswith('.pdb'):
            input_basename = os.path.basename(input_file).split('.pdb')[0]
            input_basename = input_basename.replace(".", "_")
            input_basename = input_basename.replace("-", "_")
            new_input_file = input_basename + '.pdb'
            if os.path.basename(input_file) == os.path.basename(
                    new_input_file):
                pass
            else:
                shutil.copy(input_file, new_input_file)

        # Also process filename to fasta header if input file is fasta
        elif input_file.lower().endswith('.fasta'):
            input_basename = os.path.basename(input_file).split('.fasta')[0]
            input_basename = input_basename.replace(".", "_")
            input_basename = input_basename.replace("-", "_")
            new_input_file = os.path.join(
                outdir, input_basename + '_CHOIR_MonomerSequence.fasta')
            with open(input_file, 'r') as infile, open(new_input_file,
                                                       'w') as outfile:
                outfile.write('>' + input_basename + '\n')
                n = 0
                for line in infile.readlines():
                    if not line.startswith('>'):
                        outfile.write(line)
                    else:
                        n += 1
                    if n == 2:
                        break
            args.sequence_mode = True
        else:
            raise pctools.FileFormatError(
                clrs['r'] +
                '\n\n Input format must be either pdb or fasta\n Run ./ProtCHOIR -h for more information\n\n'
                + clrs['n'])
        if args.allow_monomers:
            assert args.sequence_mode is True, clrs[
                'r'] + '\n\n To allow building monomers you must use sequence mode. \n Run ProtCHOIR -h for more information\n\n' + clrs[
                    'n']

        # Start recording job progress
        with open('CHOIR_Progress.out', 'w') as f:
            f.write("Starting new ProtCHOIR run\n")

        # Pickle Runtime arguments
        pickle.dump(args, open('CHOIR_Args.pickle', 'wb'))

        # Show arguments used and create CHOIR.conf
        pctools.print_section(0, "Runtime Arguments")
        runtime_arguments = {}
        choir_args = os.path.join(outdir, "CHOIR.args")
        with open(choir_args, 'w') as f:
            for name, value in vars(args).items():
                runtime_arguments[name] = value
                print(name + "=" + str(value))
                f.write(name + "=" + str(value) + "\n")
        print('\nRuntime parameters written to: ' + clrs['g'] +
              os.path.basename(choir_args) + clrs['n'] + '\n')

        # Initialize report
        report = {}
        report['runtime_arguments'] = runtime_arguments
        report['input_filename'] = os.path.basename(new_input_file)

        # Write error-proof placeholder summary
        placeholder_report = report.copy()
        report_data = [
            'input_filename', 'sequence_mode', 'templatedmodel',
            'protomer_residues', 'tmspans', 'highest_scoring_state',
            'homo_oligomeric_over_other_score', 'best_template',
            'best_nchains', 'best_id', 'best_cov', 'best_qscore',
            'model_oligomer_name', 'model_molprobity', 'gesamt_rmsd',
            'protchoir_score', 'surface_score', 'interfaces_score',
            'quality_score', 'total_runtime', 'exit'
        ]
        for data in report_data:
            if data not in placeholder_report:
                placeholder_report[data] = 'NA'
        with open(input_basename + '_CHOIR_Summary.tsv', 'w') as f:
            f.write(
                'Input\tSeq.Mode\tTemplated\tLength\tTMSpans\tLikelyState\tH3OScore\tTemplate\tChains\tIdentity\tCoverage\tAv.QScore\tBestModel\tMolprobity\tRMSD\tProtCHOIR\tSurface\tInterfaces\tQuality\tRuntime\tExit\n'
            )
            f.write('\t'.join(
                [str(placeholder_report[data])
                 for data in report_data]) + '\n')

        # Start analysis of protomer
        analyse_protomer_results, report, args = analyze_protomer(
            new_input_file, report, args)

        # If no suitable homo-oligomeric template was found, exit nicely.
        if analyse_protomer_results is None:
            finalize(report, input_basename, start_time, start_timestamp, args)
            pctools.print_sorry()
            sys.exit(0)

        # Else, proceed conditionally on runtime arguments
        elif analyse_protomer_results is not None and args.sequence_mode is True:
            residue_index_mapping = None
            minx = None
            maxx = None
            if args.skip_conservation:
                entropies = None
                z_entropies = None
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, tmdata = analyse_protomer_results
            elif not args.skip_conservation:
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, entropies, z_entropies, tmdata = analyse_protomer_results
                if (entropies is None and z_entropies is None
                        and minx is None and maxx is None):
                    args.skip_conservation = True

        elif analyse_protomer_results is not None and args.sequence_mode is False:
            if args.skip_conservation:
                minx = None
                maxx = None
                entropies = None
                z_entropies = None
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, residue_index_mapping, tmdata = analyse_protomer_results
            elif not args.skip_conservation:
                pdb_name, clean_input_file, largest_oligo_complexes, interfaces_dict, entropies, z_entropies, residue_index_mapping, minx, maxx, tmdata = analyse_protomer_results
                if (entropies is None and z_entropies is None
                        and minx is None and maxx is None):
                    args.skip_conservation = True

        report['runtime_arguments'][
            'skip_conservation'] = args.skip_conservation

        new_input_file = clean_input_file

        # Use information of complexes to build oligomers
        best_oligo_template, built_oligomers, report = make_oligomer(
            new_input_file,
            largest_oligo_complexes,
            report,
            args,
            residue_index_mapping=residue_index_mapping)

        # If no models were built, exit nicely.
        if built_oligomers is None:
            finalize(report, input_basename, start_time, start_timestamp, args)
            pctools.print_sorry()
            sys.exit(0)

        # Analyse built models
        reports = analyse_oligomers(new_input_file,
                                    best_oligo_template,
                                    built_oligomers,
                                    interfaces_dict,
                                    tmdata,
                                    report,
                                    args,
                                    entropies=entropies,
                                    z_entropies=z_entropies,
                                    minx=minx,
                                    maxx=maxx)
        finalize(reports, input_basename, start_time, start_timestamp, args)
Example 14
def finalize(reports, input_basename, start_time, start_timestamp, args):
    report_data = [
        'input_filename', 'sequence_mode', 'templatedmodel',
        'protomer_residues', 'tmspans', 'highest_scoring_state',
        'homo_oligomeric_over_other_score', 'best_template', 'best_nchains',
        'best_id', 'best_cov', 'best_qscore', 'model_oligomer_name',
        'model_molprobity', 'gesamt_rmsd', 'quality_score', 'surface_score',
        'interfaces_score', 'protchoir_score', 'total_runtime', 'exit'
    ]
    if type(reports) is list:
        if args.zip_output == 2:
            # Don't prevent compression of anything
            nozip = []
            for report in reports:
                if args.generate_report is True:
                    report['html_report'] = pctools.html_report(report, args)
        else:
            # Prevent compression of files needed for the report and the models
            nozip = [
                os.path.basename(report['model_filename'])
                for report in reports
            ]
            for report in reports:
                if args.generate_report is True:
                    report['html_report'] = pctools.html_report(report, args)
                    for key, value in report.items():
                        if key in [
                                'html_report', 'molprobity_radar',
                                'comparison_plots', 'protomer_figure',
                                'protomer_plot', 'template_figure',
                                'topology_figure', 'assemblied_protomer_plot',
                                'input_filename'
                        ]:
                            nozip.append(os.path.basename(value))
                        if key == 'model_figures':
                            for figure in value:
                                nozip.append(os.path.basename(figure))

        best_report = sorted(reports,
                             key=operator.itemgetter('protchoir_score'))[-1]

    elif type(reports) is dict:
        nozip = []
        best_report = reports
        for data in report_data:
            if data not in best_report:
                best_report[data] = 'NA'

    # Generate summary tsv file for the best report
    end_time = datetime.now()
    runtime = end_time - start_time
    best_report['total_runtime'] = str(runtime.seconds)
    summary_file = input_basename + '_CHOIR_Summary.tsv'
    nozip.append(summary_file)
    if 'exit' not in best_report:
        best_report['exit'] = '0'
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(datetime.now().strftime("%H:%M:%S") +
                    ": Finished running ProtCHOIR!")
    elif best_report['exit'] == '1':
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(
                datetime.now().strftime("%H:%M:%S") +
                ": ERROR! Indicated template not found in oligomers database..."
            )
    elif best_report['exit'] == '2':
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(datetime.now().strftime("%H:%M:%S") +
                    ": ERROR! Failed to find suitable homologues...")
    elif best_report['exit'] == '3':
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(
                datetime.now().strftime("%H:%M:%S") +
                ": ERROR! Failed to find suitable h**o-oligomeri interfaces..."
            )
    elif best_report['exit'] == '4':
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(
                datetime.now().strftime("%H:%M:%S") +
                ": ERROR! No template had an average Q-score above cut-off...")
    elif best_report['exit'] == '5':
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(datetime.now().strftime("%H:%M:%S") +
                    ": ERROR! Failed to find templates in local databases...")
    elif best_report['exit'] == '6':
        with open('CHOIR_Progress.out', 'a') as f:
            f.write(
                datetime.now().strftime("%H:%M:%S") +
                ": ERROR! Sub-optimal alignment between template and target sequences..."
            )

    with open(summary_file, 'w') as f:
        f.write(
            'Input\tSeq.Mode\tTemplated\tLength\tTMSpans\tLikelyState\tH3OScore\tTemplate\tChains\tIdentity\tCoverage\tAv.QScore\tBestModel\tMolprobity\tRMSD\tQuality\tSurface\tInterfaces\tProtCHOIR\tRuntime\tExit\n'
        )
        f.write('\t'.join([str(best_report[data])
                           for data in report_data]) + '\n')
    # Finalise
    final_end_time = datetime.timestamp(datetime.now())
    time.sleep(1)

    # Compress output
    if args.zip_output > 0:
        try:
            import zlib
            compression = zipfile.ZIP_DEFLATED
        except (ImportError, AttributeError):
            compression = zipfile.ZIP_STORED

        with zipfile.ZipFile(input_basename + '_ProtCHOIR_OUT.zip',
                             'w',
                             compression=compression) as zipf:
            for f in os.listdir(os.getcwd()):
                if f != input_basename + '_ProtCHOIR_OUT.zip' and os.path.getctime(
                        f) > start_timestamp and os.path.getctime(
                            f) < final_end_time:
                    print('Compressing... ' + f)
                    zipf.write(f)
                    if f not in nozip:
                        if os.path.isdir(f):
                            shutil.rmtree(f)
                        elif os.path.isfile(f):
                            os.remove(f)

    print('FINISHED AT: ' + datetime.now().strftime("%d-%m-%Y %H:%M"))
    print('TOTAL RUNTIME: ' + str(runtime.seconds) + ' s')
Example 15
def generate_ali(alignments, best_oligo_template, residue_index_mapping, args):
    best_oligo_template_file = best_oligo_template + "_CHOIR_RenamedChainsTemplate"
    final_alignment = os.path.join(
        workdir,
        input_name + '_' + best_oligo_template + '_CHOIR_Alignment.ali')
    getseq = False
    alignment_dict = {}
    full_residue_mapping = {}
    # Parse individual GESAMT alignments and organize in a per-chain dictionary
    for fasta_alignment in alignments:
        getseq = False
        template = False
        chain = None
        entryseq_dict = {}
        for line in open(fasta_alignment, 'r').readlines():
            # Only record sequence if line above starts with >
            if getseq is True:
                getseq = False
                seq = line.replace('\n', '')
                # If this is the template, count leading and trailing gaps
                if template is True:
                    template = False
                    leading_gaps = 0
                    for r in seq:
                        if r == '-':
                            leading_gaps += 1
                        else:
                            break
                    trailing_gaps = 0
                    for r in seq[::-1]:
                        if r == '-':
                            trailing_gaps += 1
                        else:
                            break
                assert seq is not None, 'Sequence is None'
                assert seq != '', 'Sequence is empty'
                entryseq_dict[entry] = seq.upper()
                del seq
            # If it is an entry line, get details and expect sequence
            if line.startswith('>'):
                entry = line.split('>')[1].split('(')[0].split(
                    '.pdb')[0].replace('\n', '')
                # If entry is template, use chain as reference
                if entry == best_oligo_template_file:
                    chain = line.split('(')[1].split(')')[0]
                    template = True
                getseq = True

        # Remove leading and trailing gaps from the alignment for both template and query
        if trailing_gaps == 0:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[leading_gaps:]
        else:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[
                    leading_gaps:-trailing_gaps] + trailing_gaps * '-'
        if residue_index_mapping is not None:
            full_residue_mapping[chain] = collections.OrderedDict()
            for res, i in residue_index_mapping.items():
                full_residue_mapping[chain][res] = i + leading_gaps
        else:
            full_residue_mapping[chain] = leading_gaps

        alignment_dict[chain] = entryseq_dict
        pctools.printv(
            'Removed ' + clrs['c'] + str(leading_gaps) + clrs['n'] +
            ' leading gaps and ' + clrs['c'] + str(trailing_gaps) + clrs['n'] +
            ' trailing gaps from chain ' + clrs['c'] + chain + clrs['n'] +
            ' alignment.\n', verbosity)

    # If symmetry is desired, reduce all chains to match the size of the smallest
    if args.symmetry:
        max_leading_gaps = 0
        max_trailing_gaps = 0
        for chain, seqs in alignment_dict.items():
            for entry, seq in seqs.items():
                if entry == best_oligo_template_file:
                    leading_gaps = 0
                    for r in seq:
                        if r == '-':
                            leading_gaps += 1
                        else:
                            break
                    if leading_gaps > max_leading_gaps:
                        max_leading_gaps = leading_gaps
                    trailing_gaps = 0
                    for r in seq[::-1]:
                        if r == '-':
                            trailing_gaps += 1
                        else:
                            break
                    if trailing_gaps > max_trailing_gaps:
                        max_trailing_gaps = trailing_gaps
        pctools.printv(
            'To cope with symmetry restraints, the modelled sequence will contain '
            + clrs['c'] + str(max_leading_gaps) + clrs['n'] +
            ' leading gaps and ' + clrs['c'] + str(max_trailing_gaps) +
            clrs['n'] + ' trailing gaps' + clrs['n'] + '.\n', verbosity)
        print(max_trailing_gaps)
        for chain, seqs in alignment_dict.items():
            if max_trailing_gaps == 0:
                seqs[entry] = max_leading_gaps * '-' + seqs[entry][
                    max_leading_gaps:]
            else:
                seqs[entry] = max_leading_gaps * '-' + seqs[
                    entry][max_leading_gaps:
                           -max_trailing_gaps] + max_trailing_gaps * '-'

    # Find out first and last chains
    first_chain = sorted(alignment_dict)[0]
    last_chain = sorted(alignment_dict)[-1]

    # Create strings to write in alignment file
    alignment_string_dict = {}
    for entry in [input_name, best_oligo_template_file]:
        if entry == input_name:
            alignment_string_dict[
                entry] = ">P1;" + input_name + "\nsequence:" + input_name + ":FIRST:" + first_chain + ":LAST:" + last_chain + "::::\n"
        elif entry == best_oligo_template_file:
            alignment_string_dict[
                entry] = ">P1;" + best_oligo_template_file + ".pdb\nstructureX:" + best_oligo_template_file + ".pdb:FIRST:" + first_chain + ":LAST:" + last_chain + "::::\n"
        for chain, entryseq in sorted(alignment_dict.items()):
            if chain == last_chain:
                alignment_string_dict[entry] += entryseq[entry] + '*\n'
            else:
                alignment_string_dict[entry] += entryseq[entry] + '/\n'

    # Write alignment file
    with open(final_alignment, 'w') as f:
        for entry, entrystring in alignment_string_dict.items():
            pctools.printv(entrystring, verbosity)
            f.write(entrystring)

    print('Modeller Alignment written to ' + clrs['g'] +
          os.path.basename(final_alignment) + clrs['n'] + '\n')
    return final_alignment, full_residue_mapping
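
For orientation, the .ali file assembled above follows Modeller's PIR alignment format; for a hypothetical two-chain target and template it would look roughly like this (illustrative values, not actual output):

>P1;query
sequence:query:FIRST:A:LAST:B::::
MKTAYIAKQR/
MKTAYIAKQR*
>P1;1xyz_CHOIR_RenamedChainsTemplate.pdb
structureX:1xyz_CHOIR_RenamedChainsTemplate.pdb:FIRST:A:LAST:B::::
MKTAYIAKQR/
MKTAYIAKQR*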
Example 16
def make_oligomer(input_file,
                  largest_oligo_complexes,
                  report,
                  args,
                  residue_index_mapping=None):
    global workdir
    global input_name
    global verbosity
    global g_input_file
    global g_args
    global best_oligo_template_code
    global renamed_chains_file
    g_input_file = input_file
    g_args = args
    verbosity = args.verbosity
    workdir = os.getcwd()
    symmetry = args.symmetry

    # Subsection 2[a] #######################################################################
    if args.sequence_mode is False:
        input_name = os.path.basename(input_file).split(".pdb")[0].replace(
            '.', '_')
        candidate_qscores = {}
        # Select structurally best oligomeric template using GESAMT
        pctools.print_section(2, 'OLIGOMER ASSEMBLING')
        pctools.print_subsection('2[a]', 'Structural template selection')
        if args.multiprocess is True:
            p = Pool()
            for hitchain, average_qscore, output in p.map_async(
                    analyse_largest_complexes,
                    largest_oligo_complexes.items()).get():
                candidate_qscores[hitchain] = average_qscore
                report['hits'][hitchain]['qscore'] = round(average_qscore, 3)
                print(output)
            p.close()
            p.join()
        else:
            for item in largest_oligo_complexes.items():
                hitchain, average_qscore, output = analyse_largest_complexes(
                    item)
                candidate_qscores[hitchain] = average_qscore
                report['hits'][hitchain]['qscore'] = round(average_qscore, 3)
                print(output)

        best_oligo_template = max(candidate_qscores.keys(),
                                  key=(lambda x: candidate_qscores[x]))
        if candidate_qscores[best_oligo_template] >= args.qscore_cutoff:
            print('Structurally, the best template is: ' + clrs['y'] +
                  best_oligo_template + clrs['n'] + '. Using that!\n')
            report['best_template'] = best_oligo_template.split(':')[0]
            report['best_id'] = report['hits'][best_oligo_template]['id']
            report['best_cov'] = report['hits'][best_oligo_template][
                'coverage']
            report['best_qscore'] = report['hits'][best_oligo_template][
                'qscore']
            report['best_nchains'] = report['hits'][best_oligo_template][
                'final_homo_chains']
        else:
            print('No template had an average Q-score above the cut-off of ' +
                  clrs['c'] + str(args.qscore_cutoff) + clrs['n'] +
                  '\nTry lowering the cutoff or running in sequence mode.\n')
            report['exit'] = '4'
            return None, None, report
        report['topology_figure'] = './' + best_oligo_template.replace(
            ':', '_') + '_CHOIR_Topology.png'
        template_chains = largest_oligo_complexes[best_oligo_template]
        best_oligo_template_code = best_oligo_template.split(':')[0]
        clean_template_file = make_local_template(best_oligo_template_code)

    elif args.sequence_mode is True:
        if input_file.endswith('.pdb'):
            input_name = os.path.basename(input_file).split(".pdb")[0].replace(
                '.', '_')
            input_file = os.path.join(
                workdir, input_name + '_CHOIR_MonomerSequence.fasta')
            g_input_file = input_file

        elif input_file.endswith('_CHOIR_MonomerSequence.fasta'):
            input_name = os.path.basename(input_file).split(
                "_CHOIR_MonomerSequence.fasta")[0]

        pctools.print_section(2, 'OLIGOMER ASSEMBLING - SEQUENCE MODE')
        print(clrs['y'] +
              "Skipping section 2[a] - Structural template selection" +
              clrs['n'] + "\n")
        attempt = 0
        while attempt < len(largest_oligo_complexes):
            try:
                best_oligo_template = list(largest_oligo_complexes)[attempt]
                report['best_template'] = best_oligo_template.split(':')[0]
                report['best_id'] = report['hits'][best_oligo_template]['id']
                report['best_cov'] = report['hits'][best_oligo_template][
                    'coverage']
                report['best_qscore'] = 'NA'
                report['best_nchains'] = report['hits'][best_oligo_template][
                    'final_homo_chains']
                report['topology_figure'] = './' + best_oligo_template.replace(
                    ':', '_') + '_CHOIR_Topology.png'
                template_chains = largest_oligo_complexes[best_oligo_template]
                best_oligo_template_code = best_oligo_template.split(':')[0]
                clean_template_file = make_local_template(
                    best_oligo_template_code)
                break
            except Exception:
                attempt += 1
                if attempt < len(largest_oligo_complexes):
                    print('Attempt ' + str(attempt) +
                          ' failed, trying a different template candidate.')
                if attempt == len(largest_oligo_complexes):
                    print('Failed to find templates in local databases.')
                    report['exit'] = '5'
                    return None, None, report

    relevant_chains_file = extract_relevant_chains(clean_template_file,
                                                   template_chains)
    if args.generate_report is True:
        report['template_figure'], pymol_output = pctools.pymol_screenshot(
            relevant_chains_file, args)
        print(pymol_output)
    renamed_chains_file, chains_dict = rename_relevant_chains(
        relevant_chains_file)
    relevant_chains = [
        chains_dict[template_chain] for template_chain in template_chains
    ]

    # Subsection 2[b] #######################################################################
    pctools.print_subsection('2[b]', 'Generating alignment')
    # Generate per chain alignment files
    alignment_files = []
    if args.sequence_mode is False:
        if args.multiprocess is True:
            p = Pool()
            for qscore, rmsd, fasta_out, gesamt_output in p.map_async(
                    run_gesamt_parallel, chains_dict.values()).get():
                alignment_files.append(fasta_out)
                print(gesamt_output)
            p.close()
            p.join()
        else:
            for chain in chains_dict.values():
                qscore, rmsd, fasta_out, gesamt_output = run_gesamt_parallel(
                    chain)
                alignment_files.append(fasta_out)
                print(gesamt_output)

    elif args.sequence_mode is True:
        if args.multiprocess is True:
            p = Pool()
            for fasta_out, output in p.map_async(alignment_from_sequence,
                                                 chains_dict.values()).get():
                alignment_files.append(fasta_out)
                print(output)
        else:
            for current_chain in chains_dict.values():
                fasta_out, output = alignment_from_sequence(current_chain)
                alignment_files.append(fasta_out)
                print(output)
    print('Alignment files:\n' + clrs['g'] +
          ('\n').join([os.path.basename(i)
                       for i in alignment_files]) + clrs['n'])

    # Generate final alignment which will be the input for Modeller
    final_alignment, full_residue_mapping = generate_ali(
        alignment_files, best_oligo_template_code, residue_index_mapping, args)
    # Score the alignment and enforce the threshold
    report[
        'relative_alignment_score'], relative_wscores, nchains = score_alignment(
            final_alignment)
    print('\nFinal average relative score for alignment: ' +
          str(round(report['relative_alignment_score'], 2)) + '%')
    bad_streches = 0
    for wscore in relative_wscores:
        if wscore < args.similarity_cutoff:
            bad_streches += 1
    if bad_streches >= args.bad_streches * nchains:
        if args.sequence_mode is True:
            print(
                '\nThe alignment score was unacceptable for ' + clrs['r'] +
                str(bad_streches) + clrs['n'] +
                ' 30-res segments of the protein complex.\nTry running the default (structure) mode.\n'
            )
        else:
            print(
                '\nThe alignment score was unacceptable for ' + clrs['r'] +
                str(bad_streches) + clrs['n'] +
                ' 30-res segments of the protein complex.\nTry increasing the number of candidate templates or tweaking the similarity cut-offs.\n'
            )
        report['exit'] = '6'
        return None, None, report

    # Subsection 2[c] #######################################################################
    pctools.print_subsection('2[c]', 'Generating models')
    genmodel_file, expected_models = create_genmodel(final_alignment,
                                                     best_oligo_template_code,
                                                     relevant_chains, args)
    run_modeller(genmodel_file)

    # Record list of oligomers built
    nmodels = 0
    built_oligomers = []
    for model in expected_models:
        built_oligomers.append(
            restore_chain_identifiers(model, chains_dict,
                                      full_residue_mapping))
        nmodels += 1
    print(clrs['b'] + 'ProtCHOIR' + clrs['n'] + ' built ' + clrs['c'] +
          str(nmodels) + clrs['n'] + ' model oligomers:')
    for model in built_oligomers:
        print(clrs['g'] + model + clrs['n'])

    return best_oligo_template, built_oligomers, report
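A minimal sketch of the template-selection logic in subsection 2[a], using made-up Q-scores and cut-off; in a real run the scores come from GESAMT via analyse_largest_complexes and the cut-off from args.qscore_cutoff.

# Toy illustration (hypothetical values): pick the hit:chain with the highest
# average Q-score and enforce the cut-off, as done above.
candidate_qscores = {'1abc:A': 0.42, '2def:B': 0.87, '3ghi:A': 0.55}
qscore_cutoff = 0.7  # hypothetical stand-in for args.qscore_cutoff
best = max(candidate_qscores, key=candidate_qscores.get)
if candidate_qscores[best] >= qscore_cutoff:
    print('Best template: ' + best)  # -> Best template: 2def:B
else:
    print('No template above cut-off; consider sequence mode.')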
Example n. 17
0
def collect_fasta(verbosity):
    '''
    Fetches fasta files in the pdb_homo_archive and creates collected fasta
    files within a "sequences" folder. For each original fasta, it checks the
    identity among the chains and only keeps track of the unique chains, i.e.
    those sharing less than 99% identity with the other chains. These files
    are later used to make the blast databases.
    Called by: update_databases()
    '''
    fastafiles = [
        os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb_homo_archive)
        for f in filenames if f.endswith(".fasta")
    ]
    seqdir = os.path.join(pdb_homo_archive, 'sequences')
    if not os.path.isdir(seqdir):
        os.mkdir(seqdir)

    largepdb_collected_fasta = os.path.join(seqdir,
                                            'largepdb_collected.fastas')
    with open(largepdb_collected_fasta, 'w+'):
        pass

    homo_collected_fasta = os.path.join(seqdir, 'homo_collected.fastas')
    with open(homo_collected_fasta, 'w+'):
        pass

    mono_collected_fasta = os.path.join(seqdir, 'mono_collected.fastas')
    with open(mono_collected_fasta, 'w+'):
        pass

    hetero_collected_fasta = os.path.join(seqdir, 'hetero_collected.fastas')
    with open(hetero_collected_fasta, 'w+'):
        pass

    for fasta in pg(fastafiles, widgets=widgets):
        pctools.printv('Assessing ' + clrs['y'] + fasta + clrs['n'] + '...',
                       verbosity)
        contents = open(fasta, 'r').read()
        contentlines = contents.split('>')
        nchains = str(len(re.findall('>', contents)))
        pctools.printv(
            'With ' + clrs['y'] + nchains + clrs['n'] +
            ' chains to be assessed\n', verbosity)
        uniques = []
        for entry in contentlines:
            if entry:
                splitentry = entry.split('\n', 1)
                pdbch = splitentry[0]
                seq = splitentry[1].replace('\n', '')
                if uniques:
                    percent_ids = []
                    for unique in uniques:
                        alignment = parasail.sg_stats_striped_16(
                            seq, unique[1], 10, 1, parasail.blosum62)
                        if alignment.length == 0:
                            percent_ids.append(0)
                        else:
                            percent_ids.append(
                                (alignment.matches) / alignment.length * 100)
                    if all(percent_id <= 99 for percent_id in percent_ids):
                        uniques.append([pdbch, seq])
                else:
                    uniques.append([pdbch, seq])

        if '/largepdb_sequences/' in fasta:
            with open(largepdb_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

        elif '/mono_sequences/' in fasta:
            with open(mono_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

        elif '/hetero_sequences/' in fasta:
            with open(hetero_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

        elif '/homo_sequences/' in fasta:
            with open(homo_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

    subprocess.run([
        makeblastdb_exe, '-in', largepdb_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'largedb')
    ])

    subprocess.run([
        makeblastdb_exe, '-in', mono_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'monodb')
    ])

    subprocess.run([
        makeblastdb_exe, '-in', hetero_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'heterodb')
    ])

    subprocess.run([
        makeblastdb_exe, '-in', homo_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'homodb')
    ])
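To make the 99%-identity filter above concrete, here is a standalone sketch using the same parasail call; the sequences and identifiers are made up.

import parasail

# Keep a chain only if it shares <= 99% identity with every chain already
# kept, mirroring the filter in collect_fasta above.
seqs = [('1abc_A', 'MKTAYIAKQRQISFVKSHFSRQ'),
        ('1abc_B', 'MKTAYIAKQRQISFVKSHFSRQ'),  # identical chain, dropped
        ('1abc_C', 'GSHMLEDPVDNAARRKQE')]      # distinct chain, kept
uniques = []
for pdbch, seq in seqs:
    percent_ids = []
    for _, kept in uniques:
        aln = parasail.sg_stats_striped_16(seq, kept, 10, 1, parasail.blosum62)
        percent_ids.append(0 if aln.length == 0 else aln.matches / aln.length * 100)
    if all(pid <= 99 for pid in percent_ids):
        uniques.append((pdbch, seq))
print([pdbch for pdbch, _ in uniques])  # -> ['1abc_A', '1abc_C']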
Example n. 18
0
def score_alignment(alignment_file):
    print(clrs['b'] + 'SCORING ALIGNMENT' + clrs['n'] + ' in ' + clrs['y'] +
          os.path.basename(alignment_file) + clrs['n'] + '\n')
    sequences = list(SeqIO.parse(alignment_file, "pir"))
    query_chains = str(sequences[0].seq).split('/')
    template_chains = str(sequences[1].seq).split('/')
    trimmed_query_chains = []
    trimmed_template_chains = []
    for query_chain, template_chain in zip(query_chains, template_chains):

        leading_gaps = 0
        for r in query_chain:
            if r == '-':
                leading_gaps += 1
            else:
                break
        trailing_gaps = 0
        for r in query_chain[::-1]:
            if r == '-':
                trailing_gaps += 1
            else:
                break

        if trailing_gaps == 0:
            trimmed_query_chains.append(query_chain[leading_gaps:])
            trimmed_template_chains.append(template_chain[leading_gaps:])
        else:
            trimmed_query_chains.append(
                query_chain[leading_gaps:-trailing_gaps])
            trimmed_template_chains.append(
                template_chain[leading_gaps:-trailing_gaps])

    relative_wscores = []
    relative_scores = []
    for q_chain, t_chain in zip(trimmed_query_chains, trimmed_template_chains):
        pctools.printv(
            '\nCalculating ' + clrs['y'] + 'maximum scores' + clrs['n'] +
            ' for chain segments:', g_args.verbosity)
        max_score, max_wscores = score_pairwise(t_chain, t_chain,
                                                MatrixInfo.blosum62, 0, 0)
        pctools.printv(
            '\nCalculating ' + clrs['y'] + 'actual scores' + clrs['n'] +
            ' for chain segments:', g_args.verbosity)
        score, wscores = score_pairwise(q_chain, t_chain, MatrixInfo.blosum62,
                                        0, 0)
        relative_scores.append(round(score * 100 / max_score, 2))

        for max_wscore, wscore in zip(max_wscores, wscores):
            if max_wscore != 0:
                relative_wscore = round(wscore * 100 / max_wscore, 2)
            else:
                relative_wscore = 100
            relative_wscores.append(relative_wscore)

    relative_score = sum(relative_scores) / len(relative_scores)
    string = ''
    for relative_wscore in relative_wscores:
        if relative_wscore > g_args.similarity_cutoff:
            color = 'g'
        else:
            color = 'r'
        if string == '':
            string += (clrs[color] + str(relative_wscore) + clrs['n'])
        else:
            string += (' ~ ' + clrs[color] + str(relative_wscore) + clrs['n'])
    print('\nRelative score per 30-res segment: ' + string + clrs['n'])
    return relative_score, relative_wscores, len(query_chains)
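score_pairwise is defined elsewhere in ProtCHOIR; the sketch below only illustrates the windowed scoring it is being called for here, assuming 30-residue windows, BLOSUM62, and the zero gap penalties passed above. The actual implementation may differ.

from Bio.SubsMat import MatrixInfo

def sketch_score_pairwise(seq1, seq2, matrix=MatrixInfo.blosum62, window=30):
    # Score two equal-length aligned strings in 30-residue windows; gapped
    # positions contribute nothing (zero open/extend penalties, as above).
    total, wscores = 0, []
    for start in range(0, len(seq1), window):
        wscore = 0
        for a, b in zip(seq1[start:start + window], seq2[start:start + window]):
            if a != '-' and b != '-':
                wscore += matrix.get((a, b), matrix.get((b, a), 0))
        wscores.append(wscore)
        total += wscore
    return total, wscores

# Each window's relative score is then wscore * 100 / max_wscore, where the
# max_wscores come from scoring the template chain against itself.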
Example n. 19
0
def analyse_model(oligomer):
    output = []
    model_report = g_report.copy()
    model_report['model_filename'] = oligomer
    model_oligomer_name = os.path.basename(oligomer).split(
        "_CHOIR_")[0].replace('.', '_')
    output.append(pctools.subsection('3', model_oligomer_name))
    output.append('Analysing oligomer file: ' + clrs['y'] + oligomer +
                  clrs['n'] + '\n')
    model_report['model_oligomer_name'] = model_oligomer_name
    if g_args.generate_report is True:
        model_report['model_figures'], pymol_output = pctools.pymol_screenshot(
            oligomer, g_args, putty=True)
        output.append(pymol_output)
    pdb_name, structure, nchains = pctools.parse_any_structure(oligomer)
    nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
    relevant_chains = []
    for seq in seqs:
        relevant_chains.append(seq[0])

    pisa_output, pisa_error, protomer_data = pctools.run_pisa(
        oligomer,
        '',
        g_args.verbosity,
        gen_monomer_data=True,
        gen_oligomer_data=True)
    protomer_surface_residues = pctools.get_areas(protomer_data)
    model_report['assemblied_protomer_plot'], model_report[
        'assemblied_protomer_exposed_area'], model_report[
            'assemblied_protomer_hydrophobic_area'], model_report[
                'assemblied_protomer_conserved_area'], minx, maxx, analysis_output = pctools.plot_analysis(
                    pdb_name,
                    protomer_surface_residues,
                    g_entropies,
                    g_z_entropies,
                    g_tmdata,
                    g_args,
                    minx=g_minx,
                    maxx=g_maxx)
    output.append(analysis_output)

    if 'I' in g_args.assessment and not g_args.allow_monomers:
        output.append(
            pctools.subsection('3' + '[I]', 'Interfaces Comparison: ' +
                               model_oligomer_name))
        if g_args.sequence_mode is False and g_args.skip_conservation is False:
            model_report['exposed_area_reduction'] = int(
                100 *
                (float(model_report['assemblied_protomer_exposed_area']) -
                 float(model_report['protomer_exposed_area'])) /
                float(model_report['protomer_exposed_area']))
            model_report['hydrophobic_area_reduction'] = int(
                100 *
                (float(model_report['assemblied_protomer_hydrophobic_area']) -
                 float(model_report['protomer_hydrophobic_area'])) /
                float(model_report['protomer_hydrophobic_area']))
            model_report['conserved_area_reduction'] = int(
                100 *
                (float(model_report['assemblied_protomer_conserved_area']) -
                 float(model_report['protomer_conserved_area'])) /
                float(model_report['protomer_conserved_area']))

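            # Surface scores reward burying hydrophobic/conserved surface:
            # each score is the ratio of that reduction to the total
            # exposed-area reduction, scaled by 10/3 and capped at 10.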
            if model_report['exposed_area_reduction'] < -5:
                if model_report['hydrophobic_area_reduction'] < 0:
                    hydrophobic_surface_score = 10 * (
                        model_report['hydrophobic_area_reduction'] /
                        model_report['exposed_area_reduction']) / 3
                else:
                    hydrophobic_surface_score = 0
                if hydrophobic_surface_score > 10:
                    hydrophobic_surface_score = 10
                output.append('Hydrophobic surface score: ' +
                              str(hydrophobic_surface_score))
                if model_report['conserved_area_reduction'] < 0:
                    conserved_surface_score = 10 * (
                        model_report['conserved_area_reduction'] /
                        model_report['exposed_area_reduction']) / 3
                else:
                    conserved_surface_score = 0
                if conserved_surface_score > 10:
                    conserved_surface_score = 10
                output.append('Conserved surface score: ' +
                              str(conserved_surface_score))
                model_report['surface_score'] = round(
                    (hydrophobic_surface_score + conserved_surface_score) / 2,
                    2)
            else:
                output.append(clrs['r'] + 'Exposed area reduction too small.' +
                              clrs['n'])
                model_report['surface_score'] = 0
            output.append('Final surface score: ' +
                          str(model_report['surface_score']))
        else:
            model_report['surface_score'] = 'NA'

        model_oligomer = oligomer.split('_CHOIR_CorrectedChains')[0]
        xml_out = model_oligomer + '_CHOIR_PisaInterfaces.xml'
        model_interfaces_list, interfaces_output = pctools.parse_interfaces(
            xml_out, relevant_chains, g_args.verbosity)
        template_interfaces_list = g_interfaces_dict[g_template_hitchain]

        if model_interfaces_list and template_interfaces_list:
            if g_args.verbosity > 0:
                output.append(clrs['y'] + 'MODEL INTERFACES' + clrs['n'])
                for model_interface in model_interfaces_list:
                    output.append(clrs['y'] +
                                  ' <> '.join(model_interface['chains']) +
                                  clrs['n'])
                    output.append(clrs['y'] + 'Interface Area: ' + clrs['n'] +
                                  str(model_interface['interface area']) +
                                  ' A^2')
                    output.append(
                        clrs['y'] + 'Interface Solvation Energy: ' +
                        clrs['n'] +
                        str(model_interface['interface solvation energy']) +
                        ' kcal/mol')
                    output.append(clrs['y'] + 'Hydrogen Bonds: ' + clrs['n'] +
                                  str(model_interface['hydrogen bonds']))
                    output.append(clrs['y'] + 'Salt Bridges: ' + clrs['n'] +
                                  str(model_interface['salt bridges']))
                    output.append(clrs['y'] + 'Disulphide Bridges: ' +
                                  clrs['n'] +
                                  str(model_interface['disulphide bridges']) +
                                  "\n\n")

            interfaces_comparison = {}
            for template_interface in template_interfaces_list:
                for model_interface in model_interfaces_list:
                    if set(model_interface['chains']) == set(
                            template_interface['chains']):
                        comparison_data = {}
                        denominator = 12
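                        # The weights used below sum to 12: area (1) +
                        # energy (2) + H-bonds (2) + salt bridges (3) +
                        # disulphides (4); bond types absent from both model
                        # and template subtract their weight from it.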
                        delta_area = round(
                            model_interface['interface area'] -
                            template_interface['interface area'], 2)
                        comparison_data['model area'] = model_interface[
                            'interface area']
                        comparison_data['template area'] = template_interface[
                            'interface area']
                        comparison_data['delta area'] = delta_area
                        delta_energy = round(
                            model_interface['interface solvation energy'] -
                            template_interface['interface solvation energy'],
                            2)
                        comparison_data['model energy'] = model_interface[
                            'interface solvation energy']
                        comparison_data[
                            'template energy'] = template_interface[
                                'interface solvation energy']
                        comparison_data['delta energy'] = delta_energy
                        delta_hb = round(
                            model_interface['hydrogen bonds'] -
                            template_interface['hydrogen bonds'], 2)
                        comparison_data['model hb'] = model_interface[
                            'hydrogen bonds']
                        comparison_data['template hb'] = template_interface[
                            'hydrogen bonds']
                        comparison_data['delta hb'] = delta_hb
                        delta_sb = round(
                            model_interface['salt bridges'] -
                            template_interface['salt bridges'], 2)
                        comparison_data['model sb'] = model_interface[
                            'salt bridges']
                        comparison_data['template sb'] = template_interface[
                            'salt bridges']
                        comparison_data['delta sb'] = delta_sb
                        delta_ss = round(
                            model_interface['disulphide bridges'] -
                            template_interface['disulphide bridges'], 2)
                        comparison_data['model ss'] = model_interface[
                            'disulphide bridges']
                        comparison_data['template ss'] = template_interface[
                            'disulphide bridges']
                        comparison_data['delta ss'] = delta_ss

                        output.append(clrs['y'] + 'INTERFACES COMPARISON' +
                                      clrs['n'])
                        output.append(' <> '.join(model_interface['chains']))
                        if delta_area >= 0:
                            emphasis_color = clrs['g']
                            relative_area = 100
                        else:
                            emphasis_color = clrs['r']
                            relative_area = round(
                                model_interface['interface area'] * 100 /
                                template_interface['interface area'], 2)
                        output.append('Delta Interface Area: ' +
                                      emphasis_color + str(delta_area) +
                                      clrs['n'] + ' A^2 (' +
                                      str(relative_area) + '%)')

                        if delta_energy <= 0:
                            emphasis_color = clrs['g']
                            relative_energy = 100
                        else:
                            emphasis_color = clrs['r']
                            if model_interface[
                                    'interface solvation energy'] < 0 and template_interface[
                                        'interface solvation energy'] < 0:
                                relative_energy = round(
                                    model_interface[
                                        'interface solvation energy'] * 100 /
                                    template_interface[
                                        'interface solvation energy'], 2)
                            elif model_interface[
                                    'interface solvation energy'] > 0 and template_interface[
                                        'interface solvation energy'] < 0:
                                relative_energy = 0
                            elif model_interface[
                                    'interface solvation energy'] < 0 and template_interface[
                                        'interface solvation energy'] > 0:
                                relative_energy = 100
                            elif model_interface[
                                    'interface solvation energy'] > 0 and template_interface[
                                        'interface solvation energy'] > 0:
                                relative_energy = 0
                        output.append('Delta Interface Solvation Energy: ' +
                                      emphasis_color + str(delta_energy) +
                                      clrs['n'] + ' kcal/mol (' +
                                      str(relative_energy) + '%)')

                        if model_interface[
                                'hydrogen bonds'] == template_interface[
                                    'hydrogen bonds'] == 0:
                            relative_hb = 0
                            emphasis_color = clrs['r']
                            denominator -= 2
                        elif delta_hb >= 0:
                            relative_hb = 100
                            emphasis_color = clrs['g']
                        else:
                            emphasis_color = clrs['r']
                            relative_hb = round(
                                model_interface['hydrogen bonds'] * 100 /
                                template_interface['hydrogen bonds'], 2)
                        output.append('Delta Hydrogen Bonds: ' +
                                      emphasis_color + str(delta_hb) +
                                      clrs['n'] + ' (' + str(relative_hb) +
                                      '%)')

                        if model_interface[
                                'salt bridges'] == template_interface[
                                    'salt bridges'] == 0:
                            relative_sb = 0
                            emphasis_color = clrs['r']
                            denominator -= 3
                        elif delta_sb >= 0:
                            relative_sb = 100
                            emphasis_color = clrs['g']
                        else:
                            relative_sb = round(
                                model_interface['salt bridges'] * 100 /
                                template_interface['salt bridges'], 2)
                            emphasis_color = clrs['r']
                        output.append('Delta Salt Bridges: ' + emphasis_color +
                                      str(delta_sb) + clrs['n'] + ' (' +
                                      str(relative_sb) + '%)')

                        if model_interface[
                                'disulphide bridges'] == template_interface[
                                    'disulphide bridges'] == 0:
                            relative_ss = 0
                            emphasis_color = clrs['r']
                            denominator -= 4
                        elif delta_ss >= 0:
                            relative_ss = 100
                            emphasis_color = clrs['g']
                        else:
                            relative_ss = round(
                                model_interface['disulphide bridges'] * 100 /
                                template_interface['disulphide bridges'], 2)
                            emphasis_color = clrs['r']
                        output.append('Delta Disulphide Bridges: ' +
                                      emphasis_color + str(delta_ss) +
                                      clrs['n'] + ' (' + str(relative_ss) +
                                      '%)\n')

                        if denominator == 0:
                            comparison_data['score'] = 0
                        else:
                            comparison_data['score'] = round(
                                (relative_area + 2 * relative_energy +
                                 2 * relative_hb + 3 * relative_sb +
                                 4 * relative_ss) / denominator, 2)
                        output.append('Interface score: ' +
                                      str(comparison_data['score']))
                        interfaces_comparison[''.join(
                            sorted(
                                model_interface['chains']))] = comparison_data

            comparison_plots, interfaces_output = plot_deltas(
                model_oligomer_name, template, interfaces_comparison, g_args)
            model_report['comparison_plots'] = os.path.basename(
                comparison_plots)
            output.append(interfaces_output)
            summed_score = 0
            for interface, data in interfaces_comparison.items():
                summed_score += data['score']

            model_report['interfaces_score'] = round(
                summed_score / (10 * len(interfaces_comparison)), 2)
            output.append('Final interfaces score: ' +
                          str(model_report['interfaces_score']))
        else:
            if 'surface_score' not in model_report:
                model_report['surface_score'] = 0
            model_report['interfaces_score'] = 0

    else:
        model_report['surface_score'] = 'NA'
        model_report['interfaces_score'] = 'NA'
        model_report['comparison_plots'] = 'NA'
        model_report['assemblied_protomer_exposed_area'] = 'NA'
        model_report['assemblied_protomer_hydrophobic_area'] = 'NA'
        model_report['assemblied_protomer_conserved_area'] = 'NA'

    if 'G' in g_args.assessment:
        output.append(pctools.subsection('3' + '[G]', 'GESAMT Comparison'))
        qscore, rmsd, fasta_out, gesamt_output = pctools.run_gesamt(
            template, template_file, model_oligomer_name, oligomer, None,
            g_args)
        output.append(gesamt_output)
        model_report['gesamt_qscore'] = str(qscore)
        model_report['gesamt_rmsd'] = str(rmsd)
    else:
        model_report['gesamt_qscore'] = 'NA'
        model_report['gesamt_rmsd'] = 'NA'

    if 'M' in g_args.assessment:
        output.append(pctools.subsection('3' + '[M]', 'Molprobity Comparison'))
        model_molprobity, molprobity_output = pctools.run_molprobity(
            oligomer, g_args)
        output.append(molprobity_output)
        model_report['model_clashscore'] = str(model_molprobity['clashscore'])
        model_report['model_molprobity'] = str(
            model_molprobity['molprobity_score'])
        output.append(clrs['y'] + 'MOLPROBITY COMPARISON' + clrs['n'])
        output.append('Criterion\tTempl.\tModel')
        output.append('Rama. Fav.\t' + str(template_molprobity['rama_fav']) +
                      '\t' + str(model_molprobity['rama_fav']))
        output.append('Rama. Out.\t' + str(template_molprobity['rama_out']) +
                      '\t' + str(model_molprobity['rama_out']))
        output.append('Rot. Out.\t' + str(template_molprobity['rot_out']) +
                      '\t' + str(model_molprobity['rot_out']))
        output.append('CBeta Dev.\t' + str(template_molprobity['cb_dev']) +
                      '\t' + str(model_molprobity['cb_dev']))
        output.append('Clashscore\t' + str(template_molprobity['clashscore']) +
                      '\t' + str(model_molprobity['clashscore']))
        output.append('Molprob. Score\t' +
                      str(template_molprobity['molprobity_score']) + '\t' +
                      str(model_molprobity['molprobity_score']))
        molprobity_radar, radar_output = plot_molprobity(
            model_oligomer_name, model_molprobity, template,
            template_molprobity)
        output.append(radar_output)
        model_report['molprobity_radar'] = molprobity_radar
        delta_clashscore = (model_molprobity['clashscore'] -
                            template_molprobity['clashscore']) / 10
        output.append('Delta clashscore: ' + str(delta_clashscore))
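        # The quality score decays logarithmically with the clash penalty:
        # 10 - log10(delta**5) == 10 - 5*log10(delta); models with
        # delta_clashscore < 1 keep the maximum score of 10.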
        if delta_clashscore >= 1:
            model_report['quality_score'] = round(
                10 - math.log(delta_clashscore**5, 10), 2)
        else:
            model_report['quality_score'] = 10
        output.append('Final quality score: ' +
                      str(model_report['quality_score']))
    else:
        model_report['model_clashscore'] = 'NA'
        model_report['model_molprobity'] = 'NA'
        model_report['quality_score'] = 'NA'

    if 'M' in g_args.assessment and 'I' in g_args.assessment and not g_args.allow_monomers:
        if g_args.sequence_mode is False and g_args.skip_conservation is False:
            model_report['protchoir_score'] = round(
                sum([
                    model_report['interfaces_score'],
                    model_report['surface_score'],
                    model_report['quality_score']
                ]) / 3, 2)
        else:
            model_report['protchoir_score'] = round(
                sum([
                    model_report['interfaces_score'],
                    model_report['quality_score']
                ]) / 2, 2)
    elif 'M' in g_args.assessment:
        model_report['protchoir_score'] = model_report['quality_score']
    elif 'I' in g_args.assessment:
        if g_args.sequence_mode is False and g_args.skip_conservation is False:
            model_report['protchoir_score'] = round(
                sum([
                    model_report['interfaces_score'],
                    model_report['surface_score']
                ]) / 2, 2)
        else:
            model_report['protchoir_score'] = model_report['interfaces_score']
    else:
        model_report['protchoir_score'] = 'NA'
    if str(model_report['protchoir_score']) == 'NA':
        model_report['score_color'] = 'grey'
    elif model_report['protchoir_score'] <= 5:
        model_report['score_color'] = 'red'
    elif 5 < model_report['protchoir_score'] <= 7:
        model_report['score_color'] = 'orange'
    elif model_report['protchoir_score'] > 7:
        model_report['score_color'] = 'green'

    pickle.dump(model_report,
                open(model_oligomer_name + '_CHOIR_model_report.pickle', 'wb'))

    return model_report, '\n'.join(output)
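As a worked illustration of the per-interface score computed above, with hypothetical relative values and all bond types present (so the denominator stays at 12):

# Hypothetical relative values (percent) for one model/template interface.
relative_area, relative_energy = 95.0, 80.0
relative_hb, relative_sb, relative_ss = 100.0, 100.0, 0.0
denominator = 12  # 1 + 2 + 2 + 3 + 4
score = round((relative_area + 2 * relative_energy + 2 * relative_hb +
               3 * relative_sb + 4 * relative_ss) / denominator, 2)
print(score)  # -> 62.92

# The model-level interfaces_score then averages these per-interface scores
# over all matched interfaces and rescales them from percent to 0-10.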