Example No. 1
def list_new_files(pdb1_archive, assession_log, verbosity):
    '''
    Taps into the local pdb1 repository and checks whether it contains new
    files that should be assessed by the curate_homoDB function. It builds a
    list of the files in the pdb1 database that are newer than the ones last
    assessed (as registered in the dat file); to that end it takes the
    previous assession log as input: a dictionary whose keys are filenames
    and whose values are the last assession times.
    Called by: curate_homoDB()
    '''
    new_files = []
    pctools.printv('Assessing files in PDB1 archive...', verbosity)
    assert os.path.isdir(pdb1_archive), clrs[
        'r'] + '\n\n Not able to find PDB archive.\n\n Does "' + pdb1_archive + '" exist?' + clrs[
            'n']
    pdbfiles = [
        os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb1_archive)
        for f in filenames if f.endswith(".pdb1.gz")
    ]
    for f in pdbfiles:
        filename = f.split('/')[-1]
        mod_date = os.path.getctime(f)
        if filename not in assession_log or mod_date > float(
                assession_log[filename]):
            pctools.printv(
                clrs['y'] + f + ' should be assessed' + clrs['n'] + '...\n',
                verbosity)
            new_files.append(f)
    return new_files
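
Note: the assession_log argument is read from the latest dat file in the stats folder, where curate_homoDB (Example No. 6 below) writes one "filename epoch-time" pair per line. A minimal reader sketch under that assumption (the real read_latest_assession lives elsewhere in ProtCHOIR):

import glob
import os

def read_latest_assession_sketch(stats_dir):
    # Pick the most recently modified dat file and parse its
    # "filename epoch_time" lines into a dictionary.
    dat_files = glob.glob(os.path.join(stats_dir, '*-choirdb.dat'))
    if not dat_files:
        return {}
    latest = max(dat_files, key=os.path.getmtime)
    assession_log = {}
    with open(latest) as f:
        for line in f:
            if line.strip():
                filename, assession_time = line.split()
                # Values stay strings; list_new_files compares them via float().
                assession_log[filename] = assession_time
    return assession_log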
Example No. 2
def update_seqres(verbosity):
    '''
    Runs wget to update the local seqres database, decompresses it and runs
    makeblastdb.
    Called by: update_databases()
    '''
    seqres_dir = os.path.join(choirdb, 'seqres')
    if not os.path.isdir(seqres_dir):
        os.mkdir(seqres_dir)
    seqres_txt = os.path.join(seqres_dir, 'pdb_seqres.txt')
    seqres_fasta = os.path.join(seqres_dir, 'seqres.fasta')
    pctools.printv('Fetching pdb_seqres.txt...', verbosity)
    attempt = 0
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=3', '--user=anonymous',
                seqres_ftp, '-P', seqres_dir
            ],
                                              stderr=subprocess.STDOUT)
            break
        except (subprocess.CalledProcessError, OSError):
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            if attempt == 3:
                print(
                    'Failed to download seqres in 3 attempts. Try again later.'
                )
                # Bail out so wgetout is never referenced while undefined.
                return

    # Substring of wget's "-- not retrieving" message, printed when the
    # mirrored copy is already up to date.
    no_wget = 'seqres.txt.gz’ -- not retrieving'

    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(seqres_txt):
        pctools.printv('Decompressing pdb_seqres.txt...', verbosity)

        with gzip.open(seqres_txt + '.gz',
                       'rb') as fin, open(seqres_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(seqres_fasta + '.pal'):
        subprocess.run([
            makeblastdb_exe, '-in', seqres_fasta, '-parse_seqids', '-dbtype',
            'prot', '-blastdb_version', '5', '-out', seqres
        ])
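
These examples rely on module-level configuration (paths, FTP URLs, executables) defined elsewhere in ProtCHOIR. The assignments below are assumptions inferred from usage, with placeholder values:

import os

# Placeholder configuration inferred from usage; the real values are
# defined at module level elsewhere in the package.
choirdb = os.path.expanduser('~/choirdb')
seqres_ftp = 'ftp://ftp.wwpdb.org/pub/pdb/derived_data/pdb_seqres.txt.gz'  # assumed URL
uniref50_ftp = 'ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz'  # assumed URL
makeblastdb_exe = 'makeblastdb'
seqres = os.path.join(choirdb, 'seqres', 'seqres')        # BLAST db basename
uniref50 = os.path.join(choirdb, 'uniref50', 'uniref50')  # BLAST db basename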
Example No. 3
def update_uniref(verbosity):
    '''
    Runs wget to update the local uniref50 database, decompresses it and runs
    makeblastdb.
    Called by: update_databases()
    '''
    uniref50_fasta = os.path.join(choirdb, 'uniref50/uniref50.fasta')
    pctools.printv('Fetching uniref50.fasta...', verbosity)
    attempt = 0
    while attempt < 3:
        try:
            wgetout = subprocess.check_output([
                'wget', '-m', '-r', '-nH', '--cut-dirs=4', '--user=anonymous',
                uniref50_ftp, '-P', choirdb
            ],
                                              stderr=subprocess.STDOUT)
            break
        except (subprocess.CalledProcessError, OSError):
            attempt += 1
            if attempt < 3:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
            if attempt == 3:
                print(
                    'Failed to download UniRef50 in 3 attempts. Try again later.'
                )
                # Bail out so wgetout is never referenced while undefined.
                return

    no_wget = 'uniref50.fasta.gz’ -- not retrieving'

    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(uniref50_fasta):
        pctools.printv('Decompressing uniref50.fasta...', verbosity)

        with gzip.open(uniref50_fasta + '.gz',
                       'rb') as fin, open(uniref50_fasta, 'wb') as fout:
            shutil.copyfileobj(fin, fout)
    if no_wget not in wgetout.decode(
            'UTF-8') or not os.path.isfile(uniref50_fasta + '.pal'):
        subprocess.run([
            makeblastdb_exe, '-in', uniref50_fasta, '-parse_seqids', '-dbtype',
            'prot', '-out', uniref50
        ])
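
update_seqres and update_uniref share the same download-with-retries pattern; a helper that factors it out might look like this (a sketch, not part of the original code):

import subprocess

def wget_with_retries(url, dest_dir, cut_dirs, attempts=3):
    # Mirror the URL into dest_dir, retrying up to 'attempts' times;
    # returns wget's combined output, or None if every attempt failed.
    for attempt in range(1, attempts + 1):
        try:
            return subprocess.check_output(
                ['wget', '-m', '-r', '-nH', '--cut-dirs=' + str(cut_dirs),
                 '--user=anonymous', url, '-P', dest_dir],
                stderr=subprocess.STDOUT)
        except (subprocess.CalledProcessError, OSError):
            if attempt < attempts:
                print('Attempt ' + str(attempt) + ' failed, trying again.')
    print('Failed to download in ' + str(attempts) + ' attempts. Try again later.')
    return None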
Example No. 4
def score_pairwise(seq1, seq2, matrix, gap_s, gap_e):
    score = 0
    gap = False
    ipos = 0
    fpos = 30
    nwindows = -(-len(seq1) // 30)  # ceiling division: number of 30-residue windows
    pctools.printv('Number of 30-residue segments: ' + str(nwindows),
                   g_args.verbosity)
    wscores = []
    for window in range(nwindows):
        wscore = 0
        if fpos > len(seq1):
            fpos = len(seq1)
        pctools.printv(
            str(ipos + 1) + ' ' + seq1[ipos:fpos] + ' ' + str(fpos),
            g_args.verbosity)
        pctools.printv(
            str(ipos + 1) + ' ' + seq2[ipos:fpos] + ' ' + str(fpos),
            g_args.verbosity)
        for i in range(ipos, fpos):
            pair = (seq1[i], seq2[i])
            if not gap:
                if pair == ('-', '-'):
                    score += 4
                    wscore += 4
                elif '-' in pair:
                    gap = True
                    score += gap_s
                    wscore += gap_s
                else:
                    score += score_match(pair, matrix)
                    wscore += score_match(pair, matrix)
            else:
                if '-' not in pair:
                    gap = False
                    score += score_match(pair, matrix)
                    wscore += score_match(pair, matrix)
                else:
                    score += gap_e
                    wscore += gap_e

        ipos += 30
        fpos += 30
        pctools.printv('Segment score: ' + str(wscore), g_args.verbosity)
        wscores.append(wscore)

    return score, wscores
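
score_pairwise depends on a score_match helper that is not shown in these examples. With Biopython's MatrixInfo.blosum62 (used in Example No. 7), each residue pair is stored under a single tuple orientation, so a typical implementation, assuming that matrix layout, is:

def score_match(pair, matrix):
    # MatrixInfo matrices store ('A', 'R') or ('R', 'A') but not both,
    # so fall back to the reversed tuple when the pair is missing.
    if pair in matrix:
        return matrix[pair]
    return matrix[(pair[1], pair[0])]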
Example No. 5
def collect_fasta(verbosity):
    '''
    Fetches the fasta files in the pdb_homo_archive and merges them into a
    single fasta file per category within a "sequences" folder. For that, it
    checks the identity among the chains in the original fasta and keeps only
    the unique chains, i.e. those sharing less than 99% identity with the
    chains already kept. These files are later used to build the BLAST
    databases.
    Called by: update_databases()
    '''
    fastafiles = [
        os.path.join(dp, f) for dp, dn, filenames in os.walk(pdb_homo_archive)
        for f in filenames if f.endswith(".fasta")
    ]
    seqdir = os.path.join(pdb_homo_archive, 'sequences')
    if not os.path.isdir(seqdir):
        os.mkdir(seqdir)

    largepdb_collected_fasta = os.path.join(seqdir,
                                            'largepdb_collected.fastas')
    with open(largepdb_collected_fasta, 'w+'):
        pass

    homo_collected_fasta = os.path.join(seqdir, 'homo_collected.fastas')
    with open(homo_collected_fasta, 'w+'):
        pass

    mono_collected_fasta = os.path.join(seqdir, 'mono_collected.fastas')
    with open(mono_collected_fasta, 'w+'):
        pass

    hetero_collected_fasta = os.path.join(seqdir, 'hetero_collected.fastas')
    with open(hetero_collected_fasta, 'w+'):
        pass

    for fasta in pg(fastafiles, widgets=widgets):
        pctools.printv('Assessing ' + clrs['y'] + fasta + clrs['n'] + '...',
                       verbosity)
        with open(fasta, 'r') as fastafile:
            contents = fastafile.read()
        contentlines = contents.split('>')
        nchains = str(len(re.findall('>', contents)))
        pctools.printv(
            'With ' + clrs['y'] + nchains + clrs['n'] +
            ' chains to be assessed\n', verbosity)
        uniques = []
        for entry in contentlines:
            if entry:
                splitentry = entry.split('\n', 1)
                pdbch = splitentry[0]
                seq = splitentry[1].replace('\n', '')
                if uniques:
                    percent_ids = []
                    for unique in uniques:
                        alignment = parasail.sg_stats_striped_16(
                            seq, unique[1], 10, 1, parasail.blosum62)
                        if alignment.length == 0:
                            percent_ids.append(0)
                        else:
                            percent_ids.append(
                                (alignment.matches) / alignment.length * 100)
                    if all(percent_id <= 99 for percent_id in percent_ids):
                        uniques.append([pdbch, seq])
                else:
                    uniques.append([pdbch, seq])

        if '/largepdb_sequences/' in fasta:
            with open(largepdb_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

        elif '/mono_sequences/' in fasta:
            with open(mono_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

        elif '/hetero_sequences/' in fasta:
            with open(hetero_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

        elif '/homo_sequences/' in fasta:
            with open(homo_collected_fasta, 'a') as f:
                for unique in uniques:
                    wrapped_seq = "\n".join(tw.wrap(unique[1]))
                    fasta_entry = '>' + unique[0] + '\n' + wrapped_seq + '\n\n'
                    f.write(fasta_entry)

    subprocess.run([
        makeblastdb_exe, '-in', largepdb_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'largedb')
    ])

    subprocess.run([
        makeblastdb_exe, '-in', mono_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'monodb')
    ])

    subprocess.run([
        makeblastdb_exe, '-in', hetero_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'heterodb')
    ])

    subprocess.run([
        makeblastdb_exe, '-in', homo_collected_fasta, '-dbtype', 'prot',
        '-out',
        os.path.join(seqdir, 'homodb')
    ])
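
The 99%-identity filter relies on parasail's semi-global alignment with statistics. A standalone illustration of the same call, with made-up sequences:

import parasail

seq_a = 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'
seq_b = 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVA'
# Semi-global alignment, gap open 10, gap extend 1, BLOSUM62, with
# match statistics -- the same parameters used in collect_fasta above.
alignment = parasail.sg_stats_striped_16(seq_a, seq_b, 10, 1, parasail.blosum62)
if alignment.length:
    print('{0:.1f}% identity'.format(alignment.matches / alignment.length * 100))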
Example No. 6
def curate_homoDB(verbosity):
    '''
    Creates the homo-oligomeric database from a local PDB repository.
    The divided scheme adopted by RCSB, in which the subdirectories
    are the two middle characters in the PDB code, is assumed.
    Each database contains three key files: dat, log and fasta.
    * homodb.dat contains only the pdb codes contained in the database.
    * homodb.log contains summarized relevant information about each entry.
    * homodb.fasta contains the sequences of every chain in the database.
    Called by: update_databases()
    '''
    # Create stats folder if does not exist
    stats_dir = os.path.join(pdb_homo_archive, 'stats')
    if not os.path.isdir(stats_dir):
        os.mkdir(stats_dir)
    # Compare latest assession with new files
    assession_log = read_latest_assession(stats_dir)
    new_files = list_new_files(pdb1_archive, assession_log, verbosity)
    print(clrs['g'] + str(len(new_files)) + clrs['n'] +
          ' new structure files were found and will be processed')
    now = str(time.strftime("%d-%m-%Y@%H.%M.%S"))
    dat_file = os.path.join(stats_dir, now + '-choirdb.dat')
    log_file = os.path.join(stats_dir, now + '-choirdb.log')
    err_file = os.path.join(stats_dir, now + '-choirdb.err')
    if not os.path.isfile(dat_file):
        with open(dat_file, 'w+'):
            pass
    # Write files not to be updated to new dat file
    with open(dat_file, 'a') as f:
        for i in assession_log:
            # new_files holds full paths; compare by basename.
            if i not in {os.path.basename(nf) for nf in new_files}:
                f.write(i + " " + assession_log[i] + "\n")
    # Create log file
    if not os.path.isfile(log_file):
        with open(log_file, 'w+') as f:
            f.write('Code, Chains, Author, Software, Date\n')

    # Read Chain correspondences
    chain_correspondences_file = os.path.join(stats_dir,
                                              'chain_correspondences.pickle')
    if os.path.isfile(chain_correspondences_file):
        with open(chain_correspondences_file, 'rb') as p:
            chain_correspondences = pickle.load(p)
    else:
        chain_correspondences = {}

    # Main loop that will populate the ProtCHOIR database
    for pdb in pg(new_files, widgets=widgets):
        filename = pdb.split('/')[-1]
        subfolder = pdb.split('/')[-2]
        # Record assessment in dat file
        with open(dat_file, 'a') as f:
            f.write(filename + " " + str(time.time()) + '\n')
        # Start assession
        pctools.printv('\nAssessing ' + pdb + '...', verbosity)
        # Reject files larger than 2 Mb
        file_size = os.stat(pdb).st_size / 1048576  # bytes -> MiB
        pctools.printv(
            'File size: ' + clrs['c'] + '{0:.1g}'.format(file_size) + ' Mb' +
            clrs['n'], verbosity)
        if file_size > 2:
            pctools.printv(clrs['r'] + "File size too large!" + clrs['n'],
                           verbosity)
            pctools.printv(
                clrs['y'] +
                "Will try to fetch sequences from asymmetric unit." +
                clrs['n'], verbosity)
            try:
                alternative_pdb = os.path.join(
                    pdb_archive, subfolder,
                    'pdb' + filename.split('.')[0] + '.ent.gz')
                pdb_code, structure, nchains = pctools.parse_pdb_structure(
                    alternative_pdb)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                    structure, 0)
                # Write in fasta file
                pctools.printv(
                    clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                    verbosity)
                record_fasta(pdb_code,
                             seqs,
                             chain_ids,
                             subfolder,
                             type='largepdb')
            except:
                pctools.printv(
                    clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                    verbosity)
            continue

        try:
            pdb_code, structure, nchains = pctools.parse_pdb_structure(pdb)
            pctools.printv(
                'Number of chains in structure ' + clrs['y'] + pdb_code +
                clrs['n'] + ': ' + str(nchains), verbosity)
            # Reject structures with more than 60 chains
            if int(nchains) > 60:
                pctools.printv(
                    "Number of chains (" + clrs['y'] + str(nchains) +
                    clrs['n'] + ") larger than 60! " + clrs['r'] +
                    "Too many chains!" + clrs['n'], verbosity)
                pctools.printv(
                    clrs['y'] + "Will try to fetch sequences anyway." +
                    clrs['n'], verbosity)
                try:
                    pdb_code, structure, nchains = pctools.parse_pdb_structure(
                        pdb)
                    structure, chain_correspondences[
                        pdb_code] = pctools.split_states(structure)
                    nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                        structure, 0)
                    pctools.printv(
                        clrs['y'] + "Recording large-pdb sequence" + clrs['n'],
                        verbosity)
                    # Write in fasta file
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='largepdb')
                except:
                    pctools.printv(
                        clrs['r'] + "Failed to fetch sequence!" + clrs['n'],
                        verbosity)
                continue

            structure, chain_correspondences[pdb_code] = pctools.split_states(
                structure)
            nchainspostsplit, seqs, chain_ids = pctools.extract_seqs(
                structure, 0)
            pctools.printv(
                'Number of chains (' + clrs['c'] + str(nchains) + clrs['n'] +
                ') and file size (' + clrs['c'] + str(file_size) + clrs['n'] +
                ') OK.' + clrs['g'] + ' Proceeding.' + clrs['n'] + '\n',
                verbosity)
            # Try to get info from the canonical pdb header (homonymous to the pdb1 file)
            canonpdb = "pdb" + pdb_code + ".ent.gz"
            try:
                contents = pctools.parse_pdb_contents(
                    os.path.join(pdb_archive, subfolder, canonpdb))[1]
            except:
                pctools.printv(
                    clrs['r'] +
                    '\n\n Mismatch between pdb and biounit entries...' +
                    clrs['n'], verbosity)
            author, software = pctools.get_annotated_states(contents)
            pctools.printv(
                'Author determined biological unit = ' + str(author),
                verbosity)
            pctools.printv(
                'Software determined quaternary structure = ' + str(software),
                verbosity)
            # Start assessing sequences and structures (from 2 up to 60 chains)
            if 1 < int(nchains) < 61:
                ids, proteinpair = pctools.get_pairwise_ids(seqs, nchains)
                for id in ids:
                    if id[0] >= 90:
                        color = clrs['g']
                    else:
                        color = clrs['r']
                    pctools.printv(
                        'Identity between chains ' + clrs['y'] + str(id[1]) +
                        clrs['n'] + ' and ' + clrs['y'] + str(id[2]) +
                        clrs['n'] + ' is ' + color + str(id[0]) + "%" +
                        clrs['n'] + ".", verbosity)
                # Save records for pure homo-oligomers
                if all(id[0] > 90 for id in ids) and proteinpair is True:
                    pctools.printv(
                        "All identities over 90%. Likely " + clrs['b'] +
                        "h**o-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                   verbosity)
                    # Write file to database
                    newfile = os.path.join(pdb_homo_archive, subfolder,
                                           pdb_code + ".pdb")
                    if not os.path.isdir(
                            os.path.join(pdb_homo_archive, subfolder)):
                        os.mkdir(os.path.join(pdb_homo_archive, subfolder))
                    io.set_structure(structure)
                    io.save(newfile)
                    pctools.gzip_pdb(newfile)
                    # Write to log file
                    with open(log_file, 'a') as f:
                        f.write(
                            str(pdb_code) + "," + str(nchains) + "," +
                            '/'.join(author) + "," + '/'.join(software) + "," +
                            str(os.path.getctime(newfile + '.gz')) + '\n')
                    # Write in fasta file
                    pctools.printv(
                        clrs['y'] + "Recording h**o-oligomer sequence." +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='homo')

                # Investigate partial homo-oligomers
                elif any(id[0] > 90 for id in ids) and proteinpair is True:
                    at_least_one_interface = False
                    for id in ids:
                        if id[0] > 90:
                            # Check if similar chains share interfaces
                            if pctools.check_interfaces(
                                    structure, id[1], id[2]):
                                at_least_one_interface = True
                                pctools.printv(
                                    'Contacts found between chains ' +
                                    clrs['g'] + str(id[1]) + clrs['n'] +
                                    ' and ' + clrs['g'] + str(id[2]) +
                                    clrs['n'] + ' sharing ' + clrs['g'] +
                                    str(id[0]) + clrs['n'] + " % identity.",
                                    verbosity)
                                pctools.printv(
                                    "At least one putative " + clrs['b'] +
                                    "h**o-oligomeric " + clrs['n'] +
                                    "interface found.", verbosity)
                                pctools.printv(
                                    clrs['y'] + "FETCHING" + clrs['n'] + ".\n",
                                    verbosity)
                                # Write file to database
                                newfile = os.path.join(pdb_homo_archive,
                                                       subfolder,
                                                       pdb_code + ".pdb")
                                if not os.path.isdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder)):
                                    os.mkdir(
                                        os.path.join(pdb_homo_archive,
                                                     subfolder))
                                io.set_structure(structure)
                                io.save(newfile)
                                pctools.gzip_pdb(newfile)
                                # Write to log file
                                with open(log_file, 'a') as f:
                                    f.write(
                                        str(pdb_code) + "," + str(nchains) +
                                        "," + '/'.join(author) + "," +
                                        '/'.join(software) + "," +
                                        str(os.path.getctime(newfile +
                                                             '.gz')) + '\n')
                                # Write in fasta file
                                pctools.printv(
                                    clrs['y'] +
                                    "Recording h**o-oligomer sequence." +
                                    clrs['n'], verbosity)
                                record_fasta(pdb_code,
                                             seqs,
                                             chain_ids,
                                             subfolder,
                                             type='homo')

                                break
                    if at_least_one_interface is False:
                        pctools.printv(
                            "No h**o-oligomeric interface found. Likely " +
                            clrs['r'] + "hetero-oligomeric" + clrs['n'] + ".",
                            verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')

                elif proteinpair is False:
                    pctools.printv(
                        clrs['r'] + "No proteic chain pairs found" +
                        clrs['n'] + ".", verbosity)
                    if any([set(seq[1]) != {'X'} for seq in seqs]):
                        pctools.printv(
                            clrs['y'] + "Protein sequences found though" +
                            clrs['n'], verbosity)
                        pctools.printv(
                            clrs['y'] + "Recording hetero-oligomer sequence" +
                            clrs['n'], verbosity)
                        # Write in fasta file
                        record_fasta(pdb_code,
                                     seqs,
                                     chain_ids,
                                     subfolder,
                                     type='hetero')
                    else:
                        pctools.printv(
                            clrs['r'] +
                            "Not even a single protein chain. Disregarding." +
                            clrs['n'], verbosity)

                else:
                    pctools.printv(
                        "No similar chains found. Likely " + clrs['r'] +
                        "hetero-oligomeric" + clrs['n'] + ".", verbosity)
                    pctools.printv(
                        clrs['y'] + "Recording hetero-oligomer sequence" +
                        clrs['n'], verbosity)
                    record_fasta(pdb_code,
                                 seqs,
                                 chain_ids,
                                 subfolder,
                                 type='hetero')

            elif int(nchains) == 1:
                pctools.printv(
                    "Only one chain found. Likely " + clrs['r'] + "monomeric" +
                    clrs['n'] + ".", verbosity)
                pctools.printv(
                    clrs['y'] + "Recording monomer sequence." + clrs['n'],
                    verbosity)
                structure, chain_correspondences[
                    pdb_code] = pctools.split_states(structure)
                nchains, seqs, chain_ids = pctools.extract_seqs(structure, 0)
                record_fasta(pdb_code, seqs, chain_ids, subfolder, type='mono')

        except:
            errtype, errvalue, errtraceback = sys.exc_info()
            errtypeshort = str(errtype).split('\'')[1]
            pctools.printv(
                clrs['r'] + '*' + str(errtypeshort) + ': ' + str(errvalue) +
                ' l.' + str(errtraceback.tb_lineno) + '*' + clrs['n'],
                verbosity)
            traceback.print_exception(*sys.exc_info())
            if errtypeshort == 'KeyboardInterrupt':
                quit()
            #pctools.printv(clrs['r']+"UNKNOWN FAULT"+clrs['n']+".", verbosity)
            if not os.path.isfile(err_file):
                with open(err_file, 'w+') as f:
                    pass
            with open(err_file, 'a') as f:
                f.write(filename + '\n')
            continue

    with open(chain_correspondences_file, 'wb') as p:
        pickle.dump(chain_correspondences, p, protocol=pickle.HIGHEST_PROTOCOL)

    if not os.path.isfile(err_file):
        with open(err_file, 'w+') as f:
            f.write('\nNo errors. Assessment terminated successfully.\n')
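
Several docstrings above note "Called by: update_databases()". That driver is not among these examples; a plausible orchestration sketch, reflecting only those notes:

def update_databases_sketch(verbosity):
    # Hypothetical driver chaining the update steps shown in these examples.
    update_seqres(verbosity)   # fetch pdb_seqres.txt and build its BLAST db
    update_uniref(verbosity)   # fetch uniref50.fasta and build its BLAST db
    curate_homoDB(verbosity)   # curate the homo-oligomer structure database
    collect_fasta(verbosity)   # merge per-entry fastas and build BLAST dbs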
Example No. 7
def score_alignment(alignment_file):
    print(clrs['b'] + 'SCORING ALIGNMENT' + clrs['n'] + ' in ' + clrs['y'] +
          os.path.basename(alignment_file) + clrs['n'] + '\n')
    sequences = list(SeqIO.parse(alignment_file, "pir"))
    query_chains = str(sequences[0].seq).split('/')
    template_chains = str(sequences[1].seq).split('/')
    trimmed_query_chains = []
    trimmed_template_chains = []
    for query_chain, template_chain in zip(query_chains, template_chains):

        leading_gaps = 0
        for r in query_chain:
            if r == '-':
                leading_gaps += 1
            else:
                break
        trailing_gaps = 0
        for r in query_chain[::-1]:
            if r == '-':
                trailing_gaps += 1
            else:
                break

        if trailing_gaps == 0:
            trimmed_query_chains.append(query_chain[leading_gaps:])
            trimmed_template_chains.append(template_chain[leading_gaps:])
        else:
            trimmed_query_chains.append(
                query_chain[leading_gaps:-trailing_gaps])
            trimmed_template_chains.append(
                template_chain[leading_gaps:-trailing_gaps])

    relative_wscores = []
    relative_scores = []
    for q_chain, t_chain in zip(trimmed_query_chains, trimmed_template_chains):
        pctools.printv(
            '\nCalculating ' + clrs['y'] + 'maximum scores' + clrs['n'] +
            ' for chain segments:', g_args.verbosity)
        max_score, max_wscores = score_pairwise(t_chain, t_chain,
                                                MatrixInfo.blosum62, 0, 0)
        pctools.printv(
            '\nCalculating ' + clrs['y'] + 'actual scores' + clrs['n'] +
            ' for chain segments:', g_args.verbosity)
        score, wscores = score_pairwise(q_chain, t_chain, MatrixInfo.blosum62,
                                        0, 0)
        relative_scores.append(round(score * 100 / max_score, 2))

        for max_wscore, wscore in zip(max_wscores, wscores):
            if max_wscore != 0:
                relative_wscore = round(wscore * 100 / max_wscore, 2)
            else:
                relative_wscore = 100
            relative_wscores.append(relative_wscore)

    relative_score = sum(relative_scores) / len(relative_scores)
    string = ''
    for relative_wscore in relative_wscores:
        if relative_wscore > g_args.similarity_cutoff:
            color = 'g'
        else:
            color = 'r'
        if string == '':
            string += (clrs[color] + str(relative_wscore) + clrs['n'])
        else:
            string += (' ~ ' + clrs[color] + str(relative_wscore) + clrs['n'])
    print('\nRelative score per 30-res segment: ' + string + clrs['n'])
    return relative_score, relative_wscores, len(query_chains)
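
score_alignment expects a two-record PIR alignment in which '/' separates chains and '*' terminates each sequence, the same layout generate_ali writes in Example No. 8. A minimal made-up input, assuming the module globals (g_args, clrs, pctools) are configured:

# Two-chain query/template alignment in PIR format, as read by
# SeqIO.parse(alignment_file, "pir"). Names and sequences are placeholders.
pir_example = """>P1;query
sequence:query:FIRST:A:LAST:B::::
MKTAYIAK/MKTAYIAK*
>P1;template.pdb
structureX:template.pdb:FIRST:A:LAST:B::::
MKTAYIAR/MKTAYIAR*
"""
with open('example_alignment.ali', 'w') as f:
    f.write(pir_example)
relative_score, relative_wscores, nchains = score_alignment('example_alignment.ali')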
Example No. 8
def generate_ali(alignments, best_oligo_template, residue_index_mapping, args):
    best_oligo_template_file = best_oligo_template + "_CHOIR_RenamedChainsTemplate"
    final_alignment = os.path.join(
        workdir,
        input_name + '_' + best_oligo_template + '_CHOIR_Alignment.ali')
    getseq = False
    alignment_dict = {}
    full_residue_mapping = {}
    # Parse individual GESAMT alignments and organize in a per-chain dictionary
    for fasta_alignment in alignments:
        getseq = False
        template = False
        chain = None
        entryseq_dict = {}
        for line in open(fasta_alignment, 'r').readlines():
            # Only record sequence if line above starts with >
            if getseq is True:
                getseq = False
                seq = line.replace('\n', '')
                # If this is the template, count leading and trailing gaps
                if template is True:
                    template = False
                    leading_gaps = 0
                    for r in seq:
                        if r == '-':
                            leading_gaps += 1
                        else:
                            break
                    trailing_gaps = 0
                    for r in seq[::-1]:
                        if r == '-':
                            trailing_gaps += 1
                        else:
                            break
                assert seq is not None, 'Sequence is None'
                assert seq != '', 'Sequence is empty'
                entryseq_dict[entry] = seq.upper()
                del seq
            # If it is an entry line, get details and expect sequence
            if line.startswith('>'):
                entry = line.split('>')[1].split('(')[0].split(
                    '.pdb')[0].replace('\n', '')
                # If entry is template, use chain as reference
                if entry == best_oligo_template_file:
                    chain = line.split('(')[1].split(')')[0]
                    template = True
                getseq = True

        # Remove leading and trailing gaps from the alignment for both template and query
        if trailing_gaps == 0:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[leading_gaps:]
        else:
            for entry, seq in entryseq_dict.items():
                entryseq_dict[entry] = leading_gaps * '-' + seq[
                    leading_gaps:-trailing_gaps] + trailing_gaps * '-'
        if residue_index_mapping is not None:
            full_residue_mapping[chain] = collections.OrderedDict()
            for res, i in residue_index_mapping.items():
                full_residue_mapping[chain][res] = i + leading_gaps
        else:
            full_residue_mapping[chain] = leading_gaps

        alignment_dict[chain] = entryseq_dict
        pctools.printv(
            'Removed ' + clrs['c'] + str(leading_gaps) + clrs['n'] +
            ' leading gaps and ' + clrs['c'] + str(trailing_gaps) + clrs['n'] +
            ' trailing gaps from chain ' + clrs['c'] + chain + clrs['n'] +
            ' alignment.\n', verbosity)

    # If symmetry is desired, reduce all chains to match the size of the smallest
    if args.symmetry:
        max_leading_gaps = 0
        max_trailing_gaps = 0
        for chain, seqs in alignment_dict.items():
            for entry, seq in seqs.items():
                if entry == best_oligo_template_file:
                    leading_gaps = 0
                    for r in seq:
                        if r == '-':
                            leading_gaps += 1
                        else:
                            break
                    if leading_gaps > max_leading_gaps:
                        max_leading_gaps = leading_gaps
                    trailing_gaps = 0
                    for r in seq[::-1]:
                        if r == '-':
                            trailing_gaps += 1
                        else:
                            break
                    if trailing_gaps > max_trailing_gaps:
                        max_trailing_gaps = trailing_gaps
        pctools.printv(
            'To cope with symmetry restraints, the modelled sequence will contain '
            + clrs['c'] + str(max_leading_gaps) + clrs['n'] +
            ' leading gaps and ' + clrs['c'] + str(max_trailing_gaps) +
            clrs['n'] + ' trailing gaps' + clrs['n'] + '.\n', verbosity)
        for chain, seqs in alignment_dict.items():
            # Trim every entry of every chain to the common symmetric window.
            for entry in seqs:
                if max_trailing_gaps == 0:
                    seqs[entry] = (max_leading_gaps * '-' +
                                   seqs[entry][max_leading_gaps:])
                else:
                    seqs[entry] = (max_leading_gaps * '-' +
                                   seqs[entry][max_leading_gaps:-max_trailing_gaps] +
                                   max_trailing_gaps * '-')

    # Find out first and last chains
    first_chain = sorted(alignment_dict)[0]
    last_chain = sorted(alignment_dict)[-1]

    # Create strings to write in alignment file
    alignment_string_dict = {}
    for entry in [input_name, best_oligo_template_file]:
        if entry == input_name:
            alignment_string_dict[
                entry] = ">P1;" + input_name + "\nsequence:" + input_name + ":FIRST:" + first_chain + ":LAST:" + last_chain + "::::\n"
        elif entry == best_oligo_template_file:
            alignment_string_dict[
                entry] = ">P1;" + best_oligo_template_file + ".pdb\nstructureX:" + best_oligo_template_file + ".pdb:FIRST:" + first_chain + ":LAST:" + last_chain + "::::\n"
        for chain, entryseq in sorted(alignment_dict.items()):
            if chain == last_chain:
                alignment_string_dict[entry] += entryseq[entry] + '*\n'
            else:
                alignment_string_dict[entry] += entryseq[entry] + '/\n'

    # Write alignment file
    with open(final_alignment, 'w') as f:
        for entry, entrystring in alignment_string_dict.items():
            pctools.printv(entrystring, verbosity)
            f.write(entrystring)

    print('Modeller Alignment written to ' + clrs['g'] +
          os.path.basename(final_alignment) + clrs['n'] + '\n')
    return final_alignment, full_residue_mapping
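
A hypothetical invocation, for orientation only; the argument values are placeholders, and generate_ali additionally expects module globals such as workdir, input_name and verbosity to be set:

final_alignment, full_residue_mapping = generate_ali(
    alignments=['chain_A_gesamt.fasta', 'chain_B_gesamt.fasta'],  # placeholder GESAMT outputs
    best_oligo_template='1xyz',                                   # placeholder template code
    residue_index_mapping=None,  # then full_residue_mapping[chain] is just the leading-gap count
    args=g_args)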