Example #1
def single_chain(pdb_dir):
    '''
    Iterates through a directory and uses Biopython to
    select and write the first chain from each pdb.
    Called by: AccessPDB.py:main() (Option 6)
    '''
    single_chain_dir = pdb_dir + "/SingleChains/"
    os.makedirs(single_chain_dir, exist_ok=True)
    print('\n\nExtracting first chain of each PDB file...\n')
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(
                ".ent.gz"):
            pdb_name = pdb.split('.')[0].split("/")[-1]
            if pdb.endswith(".ent.gz"):
                pdb_file = gzip.open(pdb_dir + '/' + pdb, 'rt')
            else:
                pdb_file = pdb_dir + '/' + pdb
            try:
                structure = p.get_structure(pdb_name, pdb_file)
            except Exception:
                print("Structure " + pdb_name +
                      " could not be strictly parsed.")
                continue
            # Grab the ID of the first chain in the structure
            chain_id = next(structure.get_chains()).id
            single_chain_name = single_chain_dir + pdb_name + chain_id + ".pdb"
            io.set_structure(structure)
            io.save(single_chain_name, SelectChain(chain_id))
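The snippet relies on a SelectChain filter defined elsewhere in the module. A minimal sketch, assuming it subclasses Bio.PDB's Select interface (the standard mechanism for restricting what io.save() writes); the class name and constructor are taken from the call site above:

from Bio.PDB import Select

class SelectChain(Select):
    '''Accept only the chain whose id matches chain_id.'''

    def __init__(self, chain_id):
        self.chain_id = chain_id

    def accept_chain(self, chain):
        # io.save() queries this for every chain; keep only the requested one
        return chain.id == self.chain_id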
Example #2
def match_parallel(list1, list2, threshold=0.75):
    '''
        Compute the similarity score between strings from list1 and strings
        from list2 based on the Levenshtein distance. Unlike match(), this
        function uses multiprocessing to compute scores.

        For this function to be efficient, list1 should be shorter than
        list2.

        Parameters
            list1, list2: lists of strings
            threshold: minimum score needed to consider two strings as matched
                (between 0 and 1, where 0 is no match and 1 is perfect match)
        Returns
            matched: list of dict
    '''
    if len(list1) > len(list2):
        print('\n[ ! ] WARNING: parameter list1 should be the shorter list.',
              'Otherwise computation time may be extended.\n')
        if input('Are you sure you want to keep going with no changes? [y/n]: ') != 'y':
            exit()

    matched = []

    # Prepare data for parallelization
    n_cores = mp.cpu_count()
    list2_chunks = chunks(list2, n_cores)

    for a, _ in zip(list1, pg(range(len(list1)), widgets=widgets)):

        # Define jobs parameters
        job_parameters = []
        for c in list2_chunks:
            job_parameters.append((a, c, threshold))

        # Parallelize score computation
        with mp.Pool(processes=n_cores) as p:
            results = p.starmap(match_job, job_parameters)

        # Collect results
        for r in results:
            matched += r

    return matched
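The chunks() and match_job() helpers are not shown in this example. Hypothetical sketches consistent with the call sites (both bodies, and the reuse of the alpha_sort and dist helpers from match() in Example #5, are assumptions):

def chunks(lst, n):
    '''Split lst into at most n consecutive chunks of roughly equal size.'''
    size = max(1, -(-len(lst) // n))  # ceiling division
    return [lst[i:i + size] for i in range(0, len(lst), size)]

def match_job(a, chunk, threshold):
    '''Score one string from list1 against one chunk of list2.'''
    matched = []
    for b in chunk:
        a_, b_ = alpha_sort(a), alpha_sort(b)
        d = 1 - dist.eval(a_, b_) / max(len(a_), len(b_))
        if d > threshold:
            matched.append({'list1': a, 'list2': b, 'score': round(d, 3)})
    return matched

Note that the loop above creates a fresh Pool for every element of list1; hoisting the Pool out of the loop would avoid repeated worker startup costs.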
Example #3
def pdb_to_fasta(pdb_dir):
    '''
    Iterates over the PDB files in the given directory and creates a FASTA
    file containing one entry for each chain of every PDB.
    Called by: AccessPDB.py:main() (option 9)
    '''
    fasta_file = pdb_dir.split("/")[-1] + '.fasta'
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        try:
            pdb_name, pdb_file, structure, contents = parse_pdb(pdb_dir, pdb)
        except Exception:
            continue
        nchains, seqs, chain_ids = extract_seqs(structure, 0)
        with open(fasta_file, 'a') as f:
            for seq, chain_id in zip(seqs, chain_ids):
                wrapped_seq = "\n".join(tw.wrap(seq))
                fasta_entry = '>' + pdb_name + ':' + str(
                    chain_id) + '\n' + wrapped_seq + '\n\n'
                f.write(fasta_entry)
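For illustration, each chain becomes one FASTA entry whose header combines the PDB name and the chain id, with the sequence wrapped by textwrap (70 columns by default); the names below are hypothetical:

import textwrap as tw

pdb_name, chain_id, seq = '1abc', 'A', 'MKTAYIAKQR' * 10
entry = '>' + pdb_name + ':' + chain_id + '\n' + '\n'.join(tw.wrap(seq)) + '\n\n'
# entry starts with the header line '>1abc:A', followed by the wrapped sequence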
Example #4
def min_chain_length(pdb_dir, length):
    '''
    Iterates over PDB files in a directory, checks the length of each chain
    and copies the files in which at least one chain meets the desired
    minimum length.
    Called by: AccessPDB.py:main() (option 8)
    '''
    filtered_dir = 'Over' + str(length)
    os.mkdir(os.path.join(pdb_dir, filtered_dir))
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        try:
            pdb_name, pdb_file, structure, contents = parse_pdb(pdb_dir, pdb)
        except Exception:
            continue
        nchains, seqs, chain_ids = extract_seqs(structure, 0)
        # Copy the file if at least one chain reaches the minimum length
        if any(len(seq) >= int(length) for seq in seqs):
            shutil.copyfile(
                pdb_dir + '/' + pdb_name + '.pdb',
                pdb_dir + '/' + filtered_dir + '/' + pdb_name + '.pdb')
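A hypothetical invocation (note that length arrives as a string, e.g. from menu input, which is why it is concatenated into the directory name and cast with int() for the comparison):

min_chain_length('./my_pdbs', '200')
# copies every PDB with at least one chain of 200+ residues into ./my_pdbs/Over200/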
Example #5
def match(list1, list2, threshold=0.75):
    '''
        Compute the similarity score between strings from list1 and strings
        from list2 based on the Levenshtein distance.

        Parameters
            list1, list2: lists of strings
            threshold: minimum score needed to consider two strings as matched
                (between 0 and 1, where 0 is no match and 1 is perfect match)
        Returns
            matched: list of dict
    '''
    matched = []
    for a, _ in zip(list1, pg(range(len(list1)), widgets=widgets)):
        a_ = alpha_sort(a)
        for b in list2:
            b_ = alpha_sort(b)
            d = 1 - dist.eval(a_, b_) / max(len(a_), len(b_))
            if d > threshold:
                matched.append({'list1': a,
                                'list2': b,
                                'score': round(d, 3)})
    return matched
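The score is a normalized Levenshtein similarity: 1 minus the edit distance divided by the length of the longer string. A worked example, assuming dist.eval is an edit-distance function such as editdistance.eval:

import editdistance as dist

a_, b_ = 'kitten', 'sitting'
lev = dist.eval(a_, b_)              # 3 edits (2 substitutions, 1 insertion)
d = 1 - lev / max(len(a_), len(b_))  # 1 - 3/7 = 0.571...
print(round(d, 3))                   # 0.571, below the default 0.75 threshold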
Example #6
def clean_pdb_files(pdb_dir):
    '''
    Iterates over files in the given directory and uses the clean_pdb
    function to write PDB files containing only amino acids. (Option 5)
    Called by: AccessPDB.py:main()
    '''
    clean_dir = "clean"
    os.mkdir(clean_dir)
    print('\n\nCleaning PDB files...\n')
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(
                ".ent.gz"):
            pdb_name = pdb.split('.')[0].split("/")[-1]
            if pdb.endswith(".ent.gz"):
                pdb_file = gzip.open(pdb_dir + '/' + pdb, 'rt')
            else:
                pdb_file = pdb_dir + '/' + pdb
            try:
                structure = p.get_structure(pdb_name, pdb_file)
            except Exception:
                print("Structure " + pdb_name +
                      " could not be strictly parsed.")
                continue
            clean_pdb(structure, pdb_name, clean_dir)
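clean_pdb itself is defined elsewhere. A simplified sketch, assuming it filters residues through Bio.PDB's Select interface (the real helper also reports failure for chains that are too short, which this sketch omits):

import os
from Bio.PDB import PDBIO, Select
from Bio.PDB.Polypeptide import is_aa

class AminoAcidSelect(Select):
    def accept_residue(self, residue):
        # Keep only standard amino-acid residues
        return is_aa(residue, standard=True)

def clean_pdb(structure, pdb_name, clean_dir):
    io = PDBIO()
    io.set_structure(structure)
    io.save(os.path.join(clean_dir, pdb_name + '.clean.pdb'), AminoAcidSelect())
    return True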
Example #7
def collect_fasta(verbosity):
    '''
    Fetches the fasta files in the pdb_homo_archive and collects them into
    per-category fasta files within a "sequences" folder. For that, it checks
    the identity among the chains in the original fasta and only keeps the
    unique chains, i.e. those with less than 99% identity to previously kept
    chains. These files are later used to make the blast databases.
    Called by: update_databases()
    '''
    fastafiles = [
        os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb_homo_archive)
        for f in filenames if f.endswith(".fasta")
    ]
    seqdir = os.path.join(pdb_homo_archive, 'sequences')
    if not os.path.isdir(seqdir):
        os.mkdir(seqdir)

    # Create (or truncate) the four per-category collected fasta files
    largepdb_collected_fasta = os.path.join(seqdir, 'largepdb_collected.fastas')
    homo_collected_fasta = os.path.join(seqdir, 'homo_collected.fastas')
    mono_collected_fasta = os.path.join(seqdir, 'mono_collected.fastas')
    hetero_collected_fasta = os.path.join(seqdir, 'hetero_collected.fastas')
    for collected_fasta in (largepdb_collected_fasta, homo_collected_fasta,
                            mono_collected_fasta, hetero_collected_fasta):
        open(collected_fasta, 'w+').close()

    for fasta in pg(fastafiles, widgets=widgets):
        pctools.printv('Assessing ' + clrs['y'] + fasta + clrs['n'] + '...',
                       verbosity)
        with open(fasta, 'r') as f:
            contents = f.read()
        contentlines = contents.split('>')
        nchains = str(len(re.findall('>', contents)))
        pctools.printv(
            'With ' + clrs['y'] + nchains + clrs['n'] +
            ' chains to be assessed\n', verbosity)
        uniques = []
        for entry in contentlines:
            if entry:
                splitentry = entry.split('\n', 1)
                pdbch = splitentry[0]
                seq = splitentry[1].replace('\n', '')
                if uniques:
                    percent_ids = []
                    for unique in uniques:
                        alignment = parasail.sg_stats_striped_16(
                            seq, unique[1], 10, 1, parasail.blosum62)
                        if alignment.length == 0:
                            percent_ids.append(0)
                        else:
                            percent_ids.append(
                                (alignment.matches) / alignment.length * 100)
                    if all(percent_id <= 99 for percent_id in percent_ids):
                        uniques.append([pdbch, seq])
                else:
                    uniques.append([pdbch, seq])

        # Append the unique chains to the collected fasta of the right category
        collected_by_category = {
            '/largepdb_sequences/': largepdb_collected_fasta,
            '/mono_sequences/': mono_collected_fasta,
            '/hetero_sequences/': hetero_collected_fasta,
            '/homo_sequences/': homo_collected_fasta
        }
        for marker, collected_fasta in collected_by_category.items():
            if marker in fasta:
                with open(collected_fasta, 'a') as f:
                    for unique in uniques:
                        wrapped_seq = "\n".join(tw.wrap(unique[1]))
                        fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                        f.write(fasta_entry)
                break

    # Build one blast database per collected fasta
    for collected_fasta, dbname in ((largepdb_collected_fasta, 'largedb'),
                                    (mono_collected_fasta, 'monodb'),
                                    (hetero_collected_fasta, 'heterodb'),
                                    (homo_collected_fasta, 'homodb')):
        subprocess.run([
            makeblastdb_exe, '-in', collected_fasta, '-dbtype', 'prot',
            '-out', os.path.join(seqdir, dbname)
        ])
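For reference, the redundancy test above computes percent identity from a semi-global alignment. A small illustration of the parasail call with the same parameters (gap open 10, gap extend 1, BLOSUM62):

import parasail

seq, unique = 'MKTAYIAKQR', 'MKTAYIAKQR'
aln = parasail.sg_stats_striped_16(seq, unique, 10, 1, parasail.blosum62)
identity = aln.matches / aln.length * 100 if aln.length else 0
print(identity)  # 100.0, so this chain would be dropped as redundant (> 99%)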
Example #8
def curate_homoDB(verbosity):
    '''
    Creates a homo-oligomeric database from a local pdb repository.
    The divided scheme adopted by RCSB, in which the subdirectories
    are the two middle characters in the PDB code, is assumed.
    Each database contains three key files: dat, log and fasta.
    * homodb.dat contains only the pdb codes contained in the database.
    * homodb.log contains summarized relevant information about each entry.
    * homodb.fasta contains the sequences of every chain in the database.
    Called by: update_databases()
    '''
    # Create stats folder if does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    if not os.path.isdir(stats_dir):
        os.mkdir(stats_dir)
    # Compare the latest assessment with the new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] +
          ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    if not os.path.isfile(dat_file):
        with open(dat_file, 'w+'):
            pass
    # Write files not to be updated to new dat file
    with open(dat_file, 'a') as f:
        for i in assession_log:
            if i not in new_files:
                f.write(i + " " + assession_log[i] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')

    # Read Chain correspondences
    chain_correspondences_file = os.path.join(stats_dir,
                                              'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}

    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = pdb.split('/')[-1]
        subfolder = pdb.split('/')[-2]
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assessment
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Reject files larger than 2 Mb
        file_size = os.stat(pdb).st_size / 1048576
        pctools.printv(
            'File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb' +
            clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(
                clrs['y'] +
                "Will try to fetch sequences from asymmetric unit." +
                clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(
                    pdb_archive, subfolder,
                    'pdb' + filename.split('.')[0] + '.ent.gz')
                pdb_code, structure, nchains = pctools.parse_pdb_structure(
                    alternative_pdb)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                    structure, 0)
                # Write in fasta file
                pctools.printv(
                    clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                    verbosity)
                record_fasta(pdb_code,
                             seqs,
                             chain_ids,
                             subfolder,
                             type='largepdb')
            except Exception:
                pctools.printv(
                    clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                    verbosity)
            continue

        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv(
                'Number of chains in structure ' + clrs['y'] + pdb_code +
                clrs['n'] + ': ' + str(nchains), verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv(
                    "Number of chains (" + clrs['y'] + str(nchains) +
                    clrs['n'] + ") larger than 60! " + clrs['r'] +
                    "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(
                    clrs['y'] + "Will try to fetch sequences anyway." +
                    clrs['n'], verbosity)
                try:
                    pdb_code, structure, nchains = pctools.parse_pdb_structure(
                        pdb)
                    structure, chain_correspondences[
                        pdb_code] = pctools.split_states(structure)
                    nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                        structure, 0)
                    pctools.printv(
                        clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                        verbosity)
                    # Write in fasta file
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='largepdb')
                except Exception:
                    pctools.printv(
                        clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                        verbosity)
                continue

            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv(
                'Number of chains (' + clrs['c'] + str(nchains) + clrs['n'] +
                ') and file size (' + clrs['c'] + str(file_size) + clrs['n'] +
                ') OK.' + clrs['g'] + ' Proceeding.' + clrs['n'] + '\n',
                verbosity)
            # Try to get info from the canonical pdb header (homonymous with pdb1)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except Exception:
                pctools.printv(
                    clrs['r'] +
                    '\n\n Mismatch between pdb and biounit entries...' +
                    clrs['n'], verbosity)
            author, software = pctools.get_annotated_states(contents)
            pctools.printv(
                'Author determined biological unit = ' + str(author),
                verbosity)
            pctools.printv(
                'Software determined quaternary structure = ' + str(software),
                verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for id in ids:
                    if id[0] >= 90:
                        color = clrs['g']
                    else:
                        color = clrs['r']
                    pctools.printv(
                        'Identity between chains ' + clrs['y'] + str(id[1]) +
                        clrs['n'] + ' and ' + clrs['y'] + str(id[2]) +
                        clrs['n'] + ' is ' + color + str(id[0]) + "%" +
                        clrs['n'] + ".", verbosity)
                # Save records for pure homo-oligomers
                if all(id[0] > 90 for id in ids) and proteinpair is True:
                    pctools.printv(
                        "All identities over 90%. Likely " + clrs['b'] +
                        "h**o-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                   verbosity)
                    # Write file to database
                    newfile = os.path.join(pdb_homo_archive, subfolder,
                                           pdb_code + ".pdb")
                    if not os.path.isdir(
                            os.path.join(pdb_homo_archive, subfolder)):
                        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                    io.set_structure(structure)
                    io.save(newfile)
                    pctools.gzip_pdb(newfile)
                    # Write to log file
                    with open(log_file, 'a') as f:
                        f.write(
                            str(pdb_code) + "," + str(nchains) + "," +
                            '/'.join(author) + "," + '/'.join(software) + "," +
                            str(os.path.getctime(newfile + '.gz')) + '\n')
                    # Write in fasta file
                    pctools.printv(
                        clrs['y'] + "Recording h**o-oligomer sequence." +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='homo')

                # Investigate partial homo-oligomers
                elif any(id[0] > 90 for id in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for id in ids:
                        if id[0] > 90:
                            # Check if similar chains share interfaces
                            if pctools.check_interfaces(
                                    structure, id[1], id[2]):
                                at_least_one_interface = True
                                pctools.printv(
                                    'Contacts found between chains ' +
                                    clrs['g'] + str(id[1]) + clrs['n'] +
                                    ' and ' + clrs['g'] + str(id[2]) +
                                    clrs['n'] + ' sharing ' + clrs['g'] +
                                    str(id[0]) + clrs['n'] + " % identity.",
                                    verbosity)
                                pctools.printv(
                                    "At least one putative " + clrs['b'] +
                                    "h**o-oligomeric " + clrs['n'] +
                                    "interface found.", verbosity)
                                pctools.printv(
                                    clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                    verbosity)
                                # Write file to database
                                newfile = os.path.join(pdb_homo_archive,
                                                       subfolder,
                                                       pdb_code + ".pdb")
                                if not os.path.isdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder)):
                                    os.mkdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder))
                                io.set_structure(structure)
                                io.save(newfile)
                                pctools.gzip_pdb(newfile)
                                # Write to log file
                                with open(log_file, 'a') as f:
                                    f.write(
                                        str(pdb_code) + "," + str(nchains) +
                                        "," + '/'.join(author) + "," +
                                        '/'.join(software) + "," +
                                        str(os.path.getctime(newfile +
                                                             '.gz')) + '\n')
                                # Write in fasta file
                                pctools.printv(
                                    clrs['y'] +
                                    "Recording h**o-oligomer sequence." +
                                    clrs['n'], verbosity)
                                record_fasta(pdb_code,
                                             seqs,
                                             chain_ids,
                                             subfolder,
                                             type='homo')

                                break
                    if at_least_one_interface is False:
                        pctools.printv(
                            "No h**o-oligomeric interface found. Likely " +
                            clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".",
                            verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')

                elif proteinpair is False:
                    pctools.printv(
                        clrs['r'] + "No proteic chain pairs found" +
                        clrs['n'] + ".", verbosity)
                    if any([set(seq[1]) != {'X'} for seq in seqs]):
                        pctools.printv(
                            clrs['y'] + "Protein sequences found though" +
                            clrs['n'], verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')
                    else:
                        pctools.printv(
                            clrs['r'] +
                            "Not even a single protein chain. Disregarding." +
                            clrs['n'], verbosity)

                else:
                    pctools.printv(
                        "No similar chains found. Likely " + clrs['r'] +
                        "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(
                        clrs['y'] + "Recording hetero-oligomer sequence" +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='hetero')

            elif int(nchains) == 1:
                pctools.printv(
                    "Only one chain found. Likely " + clrs['r'] + "monomeric" +
                    clrs['n'] + ".", verbosity)
                pctools.printv(
                    clrs['y'] + "Recording monomer sequence." + clrs['n'],
                    verbosity)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')

        except:  # deliberately broad: errors are logged, KeyboardInterrupt handled below
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(
                clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue) +
                ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'],
                verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            #pctools.printv(clrs['r']+"UNKNOWN FAULT"+clrs['n']+".", verbosity)
            if not os.path.isfile(err_file):
                with open(err_file, 'w+') as f:
                    pass
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue

    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p, protocol=pickle.HIGHEST_PROTOCOL)

    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated successfully.\n')
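record_fasta is defined elsewhere in the module. A hypothetical sketch inferred from the call sites and from the '<type>_sequences' folders that collect_fasta() (Example #7) later walks; the folder layout and the (chain_id, sequence) shape of the seqs entries are assumptions:

def record_fasta(pdb_code, seqs, chain_ids, subfolder, type='homo'):
    seq_dir = os.path.join(pdb_homo_archive, type + '_sequences')
    if not os.path.isdir(seq_dir):
        os.mkdir(seq_dir)
    fasta_file = os.path.join(seq_dir, subfolder + '.fasta')
    with open(fasta_file, 'a') as f:
        for seq, chain_id in zip(seqs, chain_ids):
            wrapped_seq = '\n'.join(tw.wrap(seq[1]))
            f.write('>' + pdb_code + ':' + str(chain_id) + '\n' + wrapped_seq + '\n\n')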
Example #9
def clean_and_sort(pdb_dir):
    '''
    Make clean directory and homo-multimer subdirectories (Option 4)
    Called by: AccessPDB.py:main()
    '''
    clean_dir = "clean/"
    try:
        os.mkdir(clean_dir)
        for i in range(1, 7):
            os.mkdir(clean_dir + '/' + str(i) + 'mers')
    except FileExistsError:
        pass
    '''
    Loop through pdb files to detect homo-oligomeric status
    '''
    for pdb in pg(os.listdir(pdb_dir), widgets=widgets):
        if pdb.endswith(".ent") or pdb.endswith(".pdb") or pdb.endswith(
                ".ent.gz"):
            pdb_name = pdb.split('.')[0].split("/")[-1]
            if pdb.endswith(".ent.gz"):
                pdb_file = gzip.open(pdb_dir + '/' + pdb, 'rt')
                contents = gzip.open(pdb_dir + '/' + pdb, 'rt').read()
            else:
                pdb_file = open(pdb_dir + '/' + pdb)
                contents = open(pdb_dir + '/' + pdb, 'rt').read()
            try:
                structure = p.get_structure(pdb_name, pdb_file)
            except Exception:
                print("Structure " + pdb_name +
                      " could not be strictly parsed.")
                continue
            nchains, seqs, _ = extract_seqs(structure, 0)
            print("\n\nAssessing " + pdb_name + ". This PDB has got " +
                  str(nchains) + " chain(s).")
            if 2 <= nchains <= 6:
                if author_agrees(oligo_dict, contents, nchains):
                    print("Author agrees that " + pdb_name + " is " +
                          oligo_dict[nchains] + " and IDs will be checked.")
                    ids = get_pairwise_ids(seqs, nchains)
                    if all(id > 90 for id in ids):
                        print(
                            "All identities over 90%. Likely h**o-oligomer. Cleaning and sorting.\n\n"
                        )
                        if clean_pdb(structure, pdb_name, clean_dir):
                            os.rename(
                                clean_dir + pdb_name + '.clean.pdb',
                                clean_dir + str(nchains) + 'mers/' + pdb_name +
                                '.clean.pdb')
                        else:
                            print(
                                "Oops! Polypeptide chain too short or inexistent. Skipping.\n\n"
                            )
                    else:
                        print(
                            "Identity under 90%. Likely not a h**o-oligomer. Skipping.\n\n"
                        )
                else:
                    print("Author disagrees. Although PDB has " +
                          str(nchains) + " chains, likely not " +
                          oligo_dict[nchains] + ".\n\n")
            elif nchains == 1:
                if author_agrees(oligo_dict, contents, nchains):
                    print("Author agrees that " + pdb_name + " is " +
                          oligo_dict[nchains] + ". Cleaning and sorting.\n\n")
                    if clean_pdb(structure, pdb_name, clean_dir):
                        os.rename(
                            clean_dir + pdb_name + '.clean.pdb', clean_dir +
                            str(nchains) + 'mers/' + pdb_name + '.clean.pdb')
                    else:
                        print(
                            "Oops! Polypeptide chain too short or inexistent. Skipping.\n\n"
                        )
                else:
                    print("Author disagrees. Although PDB has " +
                          str(nchains) + " chains, likely not " +
                          oligo_dict[nchains] + ".\n\n")
            elif nchains > 6:
                print("Too many chains. Skipping\n\n")
Example #10
def main():
    assert not os.path.isfile(
        'OligoSum.csv'
    ), '\033[1;31;40m \n\n File OligoSum.csv exists. Get rid of it.\n'
    results = open('OligoSum.csv', 'a')
    results.write(
        'PDB ID,Chain length,Was Available,No of templates,Template,Is Same,ID,Gesamt Rank,Model Chains, Orig Chains, RMSD, Aligned (%), TM-Score\n'
    )
    for job in pg(os.listdir(workdir), widgets=widgets):
        if job.endswith('homo.oligo'):
            solution_list = []
            tm_list = []
            ntemplates = 0
            template = 'NA'
            is_same = 'NA'
            id = 'NA'
            gesamt_rank = 'NA'
            RMSD = 'NA'
            alignedp = 'NA'
            nchains_model = 0
            job_id = job.split('_')[0]
            pdb_id = job[3:7]
            original_pdb_file = clean_dir + 'pdb' + pdb_id + '.clean.pdb'
            original_structure = p.get_structure('original', original_pdb_file)
            nchains_orig, nres = strtools.count_chains(original_structure)
            if pdb_id in available_models:
                was_available = 'YES'
            else:
                was_available = 'NO'
            model_list = []
            for model in os.listdir(workdir + job):
                if model.startswith('oligo_model'):
                    model_list.append(model)
            if not model_list:
                tmscore = 'No templates found'
                tm_list.append(tmscore)
                results.write(pdb_id + ',' + str(nres) + ',' + was_available +
                              ',' + str(ntemplates) + ',' + template + ',' +
                              is_same + ',' + str(id) + ',' +
                              str(gesamt_rank) + ',' + str(nchains_model) +
                              ',' + str(nchains_orig) + ',' + str(RMSD) + ',' +
                              str(alignedp) + ',' + str(tmscore) + '\n')
            else:
                for model in model_list:
                    template = model.split('_')[2]
                    if template == pdb_id:
                        is_same = 'YES'
                    else:
                        is_same = 'NO'
                    ntemplates = len(model_list)
                    gesamt_results_file = workdir + job + '/' + job_id + '.pdb_first_ges.res'
                    with open(gesamt_results_file, 'r') as f:
                        for line in f.readlines():
                            if re.search(template, line):
                                gesamt_rank = line.split()[0]
                                id = line.split()[4]
                                break
                    model_pdb_file = workdir + job + '/' + model + '/' + job_id + '.B99990001.pdb'
                    try:
                        model_structure = p.get_structure(
                            'modeled', model_pdb_file)
                        nchains_model = strtools.count_chains(
                            model_structure)[0]
                    except FileNotFoundError:
                        nchains_model = 0
                    if nchains_model == nchains_orig:
                        merged_model_file = strtools.merge_chains(
                            model_pdb_file)
                        merged_original_file = strtools.merge_chains(
                            original_pdb_file)
                        aligned, RMSD, tmscore = strtools.run_tmalign(
                            merged_model_file, merged_original_file)
                        alignedp = (aligned * 100) / (nchains_model * nres)
                        tm_list.append(tmscore)
                    elif nchains_model == 0:
                        tmscore = 'Templates found but models not built'
                        RMSD = 'NA'
                        alignedp = 'NA'
                        tm_list.append(tmscore)
                    else:
                        tmscore = 'Wrong number of chains'
                        RMSD = 'NA'
                        alignedp = 'NA'
                        tm_list.append(tmscore)
                    results.write(pdb_id + ',' + str(nres) + ',' +
                                  was_available + ',' + str(ntemplates) + ',' +
                                  template + ',' + is_same + ',' + str(id) +
                                  ',' + str(gesamt_rank) + ',' +
                                  str(nchains_model) + ',' +
                                  str(nchains_orig) + ',' + str(RMSD) + ',' +
                                  str(alignedp) + ',' + str(tmscore) + '\n')

            # A model counts as a solution only if its TM-score is the best
            # numeric score for this job and reaches the 0.58 cutoff
            numeric_tms = [x for x in tm_list if not isinstance(x, str)]
            best_tm = max(numeric_tms) if numeric_tms else None
            for i in tm_list:
                if isinstance(i, str) or i != best_tm or i < 0.58:
                    solution_list.append('NO')
                else:
                    solution_list.append('YES')
            solutions_list.append(solution_list)
    results.close()
    merged_solutions_list = sum(solutions_list, [])  # flatten per-job lists
    results = pd.read_csv('OligoSum.csv', na_filter=False)
    results.insert(13, 'Solution', merged_solutions_list)
    results.to_csv('OligoSum.csv', index=False)  # avoid writing a spurious index column
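A worked illustration of the Solution rule above: a model counts as a solution only if its TM-score is the best numeric score within its job and reaches the script's 0.58 cutoff (TM-scores above roughly 0.5 usually indicate the same fold):

tm_list = ['Wrong number of chains', 0.42, 0.71]
best = max(x for x in tm_list if not isinstance(x, str))
print(['YES' if not isinstance(i, str) and i == best and i >= 0.58 else 'NO'
       for i in tm_list])
# ['NO', 'NO', 'YES']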