예제 #1
0
def make_local_template(best_oligo_template):
    '''
    Writes a cleaned local copy of the selected template structure.

    The template file is looked up in a "divided" archive layout, i.e. in
    the subdirectory named after characters 1-2 of the template code:
    * if g_args.allow_monomers is set, 'pdb<code>.ent.gz' under pdb_archive
      is used, and the entry is rejected when pctools.is_nmr() flags it;
    * otherwise '<code>.pdb.gz' under pdb_homo_archive is used.
    The parsed structure is then saved into workdir through the
    pctools.SelectIfCA() selector.

    Parameters:
        best_oligo_template: code of the template; used to build both the
            source file name and the output file name.

    Returns:
        Path of the written '<code>_CHOIR_CleanTemplate.pdb' file.

    Raises:
        RuntimeError: if the selected template is an NMR structure (only
            checked when g_args.allow_monomers is set).
    '''
    middle_letters_best = best_oligo_template[1:3]
    if g_args.allow_monomers:
        best_template_file = os.path.join(
            pdb_archive, middle_letters_best,
            'pdb' + best_oligo_template + ".ent.gz")
        _, contents = pctools.parse_pdb_contents(best_template_file)
        if pctools.is_nmr(contents):
            print(
                clrs['r'] + '\n\n Selected template ' + best_oligo_template +
                ' is an NMR structure \n Will try a a different candidate.\n\n'
                + clrs['n'])
            # A bare `raise` has nothing to re-raise here and only produced
            # an incidental "No active exception" RuntimeError; raise the
            # same exception type explicitly, with a meaningful message.
            raise RuntimeError('Selected template ' + best_oligo_template +
                               ' is an NMR structure')
    else:
        best_template_file = os.path.join(pdb_homo_archive,
                                          middle_letters_best,
                                          best_oligo_template + ".pdb.gz")
    clean_template_file = os.path.join(
        workdir, best_oligo_template + "_CHOIR_CleanTemplate.pdb")
    _, structure, _ = pctools.parse_any_structure(best_template_file)
    io.set_structure(structure)
    io.save(clean_template_file, pctools.SelectIfCA())
    return clean_template_file
예제 #2
0
def _record_large_pdb_sequences(pdb_path, subfolder, chain_correspondences,
                                verbosity):
    '''
    Best-effort recording of the sequences of an entry that is too large
    to be curated (record_fasta type 'largepdb').

    Parses pdb_path, splits its states (storing the resulting chain
    correspondences under the parsed pdb code) and records the sequences.
    Any ordinary failure is reported through pctools.printv and swallowed;
    KeyboardInterrupt/SystemExit are deliberately not caught.
    '''
    try:
        pdb_code, structure, _ = pctools.parse_pdb_structure(pdb_path)
        structure, chain_correspondences[pdb_code] = pctools.split_states(
            structure)
        _, seqs, chain_ids = pctools.extract_seqs(structure, 0)
        pctools.printv(
            clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
            verbosity)
        # Write in fasta file
        record_fasta(pdb_code, seqs, chain_ids, subfolder, type='largepdb')
    except Exception:
        pctools.printv(
            clrs['r'] + "Failed to fetch sequence!" + clrs['n'], verbosity)


def _record_hetero_oligomer(pdb_code, seqs, chain_ids, subfolder, verbosity):
    '''
    Announces and records the sequences of an entry classified as
    hetero-oligomeric (record_fasta type 'hetero').
    '''
    pctools.printv(
        clrs['y'] + "Recording hetero-oligomer sequence" + clrs['n'],
        verbosity)
    # Write in fasta file
    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='hetero')


def _store_homo_oligomer(pdb_code, structure, nchains, author, software,
                         seqs, chain_ids, subfolder, log_file, verbosity):
    '''
    Saves an accepted structure into pdb_homo_archive/<subfolder>, gzips it,
    appends its summary line to log_file and records its sequences.
    '''
    # Write file to database
    target_dir = os.path.join(pdb_homo_archive, subfolder)
    os.makedirs(target_dir, exist_ok=True)
    newfile = os.path.join(target_dir, pdb_code + ".pdb")
    io.set_structure(structure)
    io.save(newfile)
    pctools.gzip_pdb(newfile)
    # Write to log file: Code, Chains, Author, Software, Date
    with open(log_file, 'a') as f:
        f.write(
            str(pdb_code) + "," + str(nchains) + "," +
            '/'.join(author) + "," + '/'.join(software) + "," +
            str(os.path.getctime(newfile + '.gz')) + '\n')
    # Write in fasta file
    pctools.printv(
        clrs['y'] + "Recording h**o-oligomer sequence." + clrs['n'],
        verbosity)
    record_fasta(pdb_code, seqs, chain_ids, subfolder, type='h**o')


def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local pdb repository.
    The divided scheme adopted by RCSB, in which the subdirectories
    are the two middle characters in the PDB code, is assumed.

    Files written under <pdb_homo_archive>/stats, prefixed by a timestamp:
    * <timestamp>-choirdb.dat lists every assessed file with the time of
      its assessment (entries of the previous assessment that are not being
      re-assessed are carried over).
    * <timestamp>-choirdb.log contains one summary line per stored entry
      (Code, Chains, Author, Software, Date).
    * <timestamp>-choirdb.err lists the files whose assessment failed, or a
      success message if none failed.
    * chain_correspondences.pickle maps pdb codes to the chain
      correspondences returned by pctools.split_states().
    Sequences are recorded through record_fasta() as 'h**o', 'hetero',
    'mono' or 'largepdb'.

    For each new file:
    * files larger than 2 Mb and structures with more than 60 chains are
      not curated; only their sequences are recorded (best effort);
    * structures with 2 to 60 chains are stored when all chain pairs are
      over 90% identical, or when at least one such pair shares an
      interface; otherwise only their sequences are recorded;
    * single-chain structures only have their sequence recorded.

    Parameters:
        verbosity: verbosity level forwarded to pctools.printv() and
            list_new_files().

    Called by: update_databases()
    '''
    # Create stats folder if does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    os.makedirs(stats_dir, exist_ok=True)
    # Compare latest assession with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] +
          ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    # Write files not to be updated to new dat file (append mode creates the
    # file when missing). A set keeps the membership test O(1) per entry.
    files_to_assess = set(new_files)
    with open(dat_file, 'a') as f:
        for entry in assession_log:
            if entry not in files_to_assess:
                f.write(entry + " " + assession_log[entry] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')

    # Read Chain correspondences
    # NOTE: pickle must only be loaded from trusted files; this one is
    # written by this very function at the end of a previous run.
    chain_correspondences_file = os.path.join(stats_dir,
                                              'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}

    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = os.path.basename(pdb)
        subfolder = os.path.basename(os.path.dirname(pdb))
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assession
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Reject files larger than 2 Mb
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv(
            'File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb' +
            clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(
                clrs['y'] +
                "Will try to fetch sequences from asymmetric unit." +
                clrs['n'], verbosity)
            alternative_pdb = os.path.join(
                pdb_archive, subfolder,
                'pdb' + filename.split('.')[0] + '.ent.gz')
            _record_large_pdb_sequences(alternative_pdb, subfolder,
                                        chain_correspondences, verbosity)
            continue

        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv(
                'Number of chains in structure ' + clrs['y'] + pdb_code +
                clrs['n'] + ': ' + str(nchains), verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv(
                    "Number of chains (" + clrs['y'] + str(nchains) +
                    clrs['n'] + ") larger than 60! " + clrs['r'] +
                    "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(
                    clrs['y'] + "Will try to fetch sequences anyway." +
                    clrs['n'], verbosity)
                _record_large_pdb_sequences(pdb, subfolder,
                                            chain_correspondences, verbosity)
                continue

            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv(
                'Number of chains (' + clrs['c'] + str(nchains) + clrs['n'] +
                ') and file size (' + clrs['c'] + str(file_size) + clrs['n'] +
                ') OK.' + clrs['g'] + ' Proceeding.' + clrs['n'] + '\n',
                verbosity)
            # Try to get info from the canonic pdb header (homonymous to pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except Exception:
                pctools.printv(
                    clrs['r'] +
                    '\n\n Mismatch between pdb and biounit entries...' +
                    clrs['n'], verbosity)
                # Without the canonical header there is nothing to annotate.
                # Use empty annotations instead of silently reusing the
                # header contents left over from the previous entry.
                author, software = [], []
            else:
                author, software = pctools.get_annotated_states(contents)
            pctools.printv(
                'Author determined biological unit = ' + str(author),
                verbosity)
            pctools.printv(
                'Software determined quaternary structure= ' + str(software),
                verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                # Each item of ids is indexed as (identity, chain, chain)
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for pair in ids:
                    color = clrs['g'] if pair[0] >= 90 else clrs['r']
                    pctools.printv(
                        'Identity between chains ' + clrs['y'] + str(pair[1]) +
                        clrs['n'] + ' and ' + clrs['y'] + str(pair[2]) +
                        clrs['n'] + ' is ' + color + str(pair[0]) + "%" +
                        clrs['n'] + ".", verbosity)
                # Save records for pure homo-oligomers
                if all(pair[0] > 90 for pair in ids) and proteinpair is True:
                    pctools.printv(
                        "All identities over 90%. Likely " + clrs['b'] +
                        "h**o-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                   verbosity)
                    _store_homo_oligomer(pdb_code, structure, nchains, author,
                                         software, seqs, chain_ids, subfolder,
                                         log_file, verbosity)

                # Investigate partial homo-oligomers
                elif any(pair[0] > 90 for pair in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for pair in ids:
                        # Check if similar chains share interfaces
                        if pair[0] > 90 and pctools.check_interfaces(
                                structure, pair[1], pair[2]):
                            at_least_one_interface = True
                            pctools.printv(
                                'Contacts found between chains ' +
                                clrs['g'] + str(pair[1]) + clrs['n'] +
                                ' and ' + clrs['g'] + str(pair[2]) +
                                clrs['n'] + ' sharing ' + clrs['g'] +
                                str(pair[0]) + clrs['n'] + " % identity.",
                                verbosity)
                            pctools.printv(
                                "At least one putative " + clrs['b'] +
                                "h**o-oligomeric " + clrs['n'] +
                                "interface found.", verbosity)
                            pctools.printv(
                                clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                verbosity)
                            _store_homo_oligomer(pdb_code, structure, nchains,
                                                 author, software, seqs,
                                                 chain_ids, subfolder,
                                                 log_file, verbosity)
                            # One interface is enough to accept the entry
                            break
                    if not at_least_one_interface:
                        pctools.printv(
                            "No h**o-oligomeric interface found. Likely " +
                            clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".",
                            verbosity)
                        _record_hetero_oligomer(pdb_code, seqs, chain_ids,
                                                subfolder, verbosity)

                elif proteinpair is False:
                    pctools.printv(
                        clrs['r'] + "No proteic chain pairs found" +
                        clrs['n'] + ".", verbosity)
                    # A sequence made only of 'X' is not counted as protein
                    if any(set(seq[1]) != {'X'} for seq in seqs):
                        pctools.printv(
                            clrs['y'] + "Protein sequences found though" +
                            clrs['n'], verbosity)
                        _record_hetero_oligomer(pdb_code, seqs, chain_ids,
                                                subfolder, verbosity)
                    else:
                        pctools.printv(
                            clrs['r'] +
                            "Not even a single protein chain. Disregarding." +
                            clrs['n'], verbosity)

                else:
                    pctools.printv(
                        "No similar chains found. Likely " + clrs['r'] +
                        "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    _record_hetero_oligomer(pdb_code, seqs, chain_ids,
                                            subfolder, verbosity)

            elif int(nchains) == 1:
                pctools.printv(
                    "Only one chain found. Likely " + clrs['r'] + "monomeric" +
                    clrs['n'] + ".", verbosity)
                pctools.printv(
                    clrs['y'] + "Recording monomer sequence." + clrs['n'],
                    verbosity)
                # NOTE(review): the structure was already split above; this
                # second split overwrites chain_correspondences[pdb_code].
                # Kept as in the original - confirm whether it is intended.
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')

        except BaseException:
            # Deliberately broad: any failure is reported and logged so the
            # remaining files are still processed; KeyboardInterrupt is
            # reported and then terminates the run.
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(
                clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue) +
                ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'],
                verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            # Append mode creates the error file on first failure
            with open(err_file, 'a') as f:
                f.write(filename + '\n')

    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p, protocol=pickle.HIGHEST_PROTOCOL)

    # No error file means that no assessment failed
    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated succesfully.\n')