Beispiel #1
0
def main():
    """Parse the command line and initialise the module-level logger.

    The argument parser is built and run first; afterwards the global
    ``logger`` is (re)bound to an info-level PISACov logger and the welcome
    message is written to it.
    """
    global logger

    # The parsed namespace is not used further in this entry point, but the
    # parse itself is kept because it runs before any logging takes place.
    args = create_argument_parser().parse_args()

    logger = pcl.pisacov_logger(level="info")
    logger.info(pcl.welcome())
Beispiel #2
0
def main():
    """Run the (still unfinished) PISACov pipeline on one sequence/structure pair.

    Stages, in order:

    1. Read the command-line options and derive the input paths, the output
       locations, the UniProt threshold and the HHblits parameters.
    2. Parse the sequence and structure files.
    3. Optionally crop/renumber the inputs (``psys.crops``).
    4. Build the multiple sequence alignment(s) (``psys.msagen``).
    5. Run DeepMetaPSICOV (``psys.dmp``) and PISA (``psys.pisa``).
    6. Write the csv headers, then load the contact predictions and the
       PISA interface files.

    The final output stage is not implemented yet.

    Raises:
        ValueError: if none of ``initialise``, ``skip_conpred`` or
            ``skip_default_conpred`` was supplied.
        Exception: if the parsed sequence set does not contain exactly one
            id holding exactly one entry in its ``imer`` collection.
    """
    starttime = time.time()
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    logger.info(pcl.welcome())

    # READ INPUT ARGUMENTS
    # skipexec[0] gates every step run on the cropped input and skipexec[1]
    # gates every step run on the original (non-cropped) input.
    # NOTE(review): 'scoring' is computed but not consumed yet; presumably
    # it will select which of the two modes get scored - confirm.
    if args.initialise is not None:
        inseq = pio.check_path(args.initialise[0], 'file')
        instr = pio.check_path(args.initialise[1], 'file')
        # The SIFTS/chain database is only needed when cropping is executed,
        # which only happens in this mode.
        indb = pio.check_path(pio.conf.CSV_CHAIN_PATH, 'file')
        skipexec = [False, True] if not args.add_noncropped else [False, False]
        scoring = [True, False] if not args.add_noncropped else [True, True]
    elif args.skip_conpred is not None:
        inseq = pio.check_path(args.skip_conpred[0], 'file')
        instr = pio.check_path(args.skip_conpred[1], 'file')
        skipexec = [True, True]
        scoring = [True, False] if not args.add_noncropped else [True, True]
    elif args.skip_default_conpred is not None:
        inseq = pio.check_path(args.skip_default_conpred[0], 'file')
        instr = pio.check_path(args.skip_default_conpred[1], 'file')
        skipexec = [True, True] if not args.add_noncropped else [True, False]
        scoring = [True, False] if not args.add_noncropped else [True, True]
    else:
        # Without this guard the first use of 'inseq' below would fail with
        # an uninformative UnboundLocalError.
        raise ValueError(
            'No input given: one of initialise, skip_conpred or '
            'skip_default_conpred is required.')

    if args.outdir is None:
        outrootdir = pio.check_path(os.path.dirname(inseq))
    else:
        outrootdir = pio.check_path(os.path.join(args.outdir[0], ''))
    pio.mdir(outrootdir)

    if args.collection_file is None:
        outcsvfile = pio.check_path(
            os.path.join(outrootdir, "pisacov_data.csv"))
    else:
        outcsvfile = pio.check_path(args.collection_file[0])
    # A failed 'file' check is taken to mean the collection csv does not
    # exist yet, in which case its header is written further below.
    try:
        pio.check_path(outcsvfile, 'file')
    except Exception:
        csvexists = False
    else:
        csvexists = True

    if args.uniprot_threshold is not None:
        thuprot, dbuprot = pio.check_uniprot(args.uniprot_threshold[0])
    else:
        thuprot = 0.0
        dbuprot = None

    if args.hhparams is not None:
        hhparameters = pio.check_hhparams(args.hhparams)
    else:
        # Fall back to the 'dmp' preset when the configured parameters are
        # missing or rejected.
        try:
            hhparameters = pio.check_hhparams(pio.conf.HHBLITS_PARAMETERS)
        except Exception:
            hhparameters = pio.check_hhparams('dmp')

    # Define formats used
    # Each value of 'sources' is indexed below as
    # (sub-directory, file suffix, format name passed to ckio.read).
    sources = pio.paths.sources()
    n_sources = len(sources)

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seq = cps.parseseqfile(inseq)
    if len(seq) != 1:
        raise Exception('More than one pdbid in sequence set.')
    for key in seq.keys():
        pdbid = key.lower()
        if len(seq[key].imer) != 1:
            # NOTE(review): this message duplicates the one above although
            # the condition concerns the entries of 'imer'; kept unchanged
            # until the intended wording is confirmed.
            raise Exception('More than one pdbid in sequence set.')
        # Pick the single entry of 'imer'. The original code indexed 'imer'
        # with the loop variable itself before it was bound.
        for key2 in seq[key].imer:
            chid = key2

    logger.info('Parsing structure file...')
    structure = cps.parsestrfile(instr)[0][pdbid]

    #logger.info('Parsing SIFTS database file...')
    #sifts = cps.import_db(indb, pdb_in=pdbid)

    # These paths are needed by every later stage, so they are derived
    # regardless of whether the cropping step itself is executed.
    outpdbdir = os.path.join(outrootdir, pdbid, "")
    inseqc = os.path.join(outpdbdir, os.path.basename(inseq))
    instrc = os.path.join(outpdbdir, os.path.basename(instr))
    # os.path.splitext returns a (root, ext) tuple: take the root and append
    # the crop-map extension.
    cmappath = os.path.splitext(inseqc)[0] + '.cropmap'

    # CROPPING AND RENUMBERING
    if not skipexec[0]:
        logger.info(
            'Cropping and renumbering sequences, structures according to SIFTS database.'
        )
        psys.crops.runcrops(inseq, instr, indb, thuprot, dbuprot, outrootdir)

        pio.mdir(outpdbdir)

        copyfile(inseq, inseqc)
        copyfile(instr, instrc)

    # MSA GENERATOR
    cseqpath = os.path.join(outpdbdir, pdbid + '.crops.to_uniprot.fasta')
    hhdir = os.path.join(outpdbdir, 'hhblits', '')
    pio.mdir(hhdir)
    neff = {}
    if not skipexec[0] or not skipexec[1]:
        if hhparameters == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif hhparameters == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        if os.path.isfile(cmappath) and not skipexec[0]:
            psys.msagen.runhhblits(cseqpath, hhparameters, hhdir)
            cmsaa3mfile = os.path.splitext(
                os.path.basename(cseqpath))[0] + ".msa.a3m"
            cmsaa3mpath = os.path.join(hhdir, cmsaa3mfile)
            neff['cropped'] = psys.msagen.msafilesgen(cmsaa3mpath)
            neff['original'] = None
            if not skipexec[1]:
                logger.info(
                    '    Repeating process for non-default sequence...')
                # NOTE(review): this discards the value computed just above;
                # behaviour kept as-is, but it looks unintended - confirm.
                neff['cropped'] = None
        else:
            logger.info(
                '    No cropped sequence found, using original sequence instead...'
            )

        if not os.path.isfile(cmappath) or not skipexec[1]:
            psys.msagen.runhhblits(inseq, hhparameters, hhdir)
            msaa3mfile = os.path.splitext(
                os.path.basename(inseq))[0] + ".msa.a3m"
            msaa3mpath = os.path.join(hhdir, msaa3mfile)
            neff['original'] = psys.msagen.msafilesgen(msaa3mpath)

    # DEEP META PSICOV RUN
    # The conditions mirror the MSA stage, so the a3m paths used here are
    # always the ones assigned there.
    if not skipexec[0] or not skipexec[1]:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        dmpdir = os.path.join(outpdbdir, 'dmp', '')
        pio.mdir(dmpdir)
        if os.path.isfile(cmappath) and not skipexec[0]:
            psys.dmp.rundmp(cseqpath, cmsaa3mpath, dmpdir)
            if not skipexec[1]:
                logger.info(
                    '    Repeating process for non-default sequence...')
        else:
            logger.info(
                '    No cropped sequence found, using original sequence instead...'
            )

        if not os.path.isfile(cmappath) or not skipexec[1]:
            psys.dmp.rundmp(inseq, msaa3mpath, dmpdir)

    # INTERFACE GENERATION, PISA
    cstrpath = os.path.join(outpdbdir, pdbid + '.oldids.crops.to_uniprot.pdb')
    pisadir = os.path.join(outpdbdir, 'pisa', '')
    # Stays empty when both executions are skipped; readers below must
    # therefore tolerate missing keys.
    n_ifaces = {}
    if not skipexec[0] or not skipexec[1]:
        logger.info('Generating interface files via PISA...')

        if os.path.isfile(cmappath) and not skipexec[0]:
            n_ifaces['cropped'] = psys.pisa.runpisa(cstrpath, pisadir)
            if not skipexec[1]:
                logger.info(
                    '    Repeating process for non-default sequence...')
        else:
            n_ifaces['cropped'] = None
            logger.info(
                '    No cropped sequence found, using original sequence instead...'
            )

        if not os.path.isfile(cmappath) or not skipexec[1]:
            n_ifaces['original'] = psys.pisa.runpisa(instr, pisadir)
        else:
            n_ifaces['original'] = None

    # CONTACT ANALYSIS AND MATCH
    resultdir = os.path.join(outpdbdir, 'results', '')
    logger.info('Opening output csv files...')
    pdbcsvfile = os.path.join(resultdir, pdbid + ".evcovsignal.csv")
    pio.outcsv.csvheader(pdbcsvfile)
    if not csvexists:
        pio.outcsv.csvheader(outcsvfile)

    logger.info('Parsing contact predictions lists...')
    ckseq = ckio.read(inseq, 'fasta')
    # One sub-dictionary per mode must exist before it is filled per source.
    conpred = {'cropped': {}, 'original': {}}
    # The crop map does not depend on source or mode: parse it once, and
    # only if it exists (the pipeline supports runs without cropped data).
    if os.path.isfile(cmappath):
        cropmapping = cps.parsemapfile(cmappath)
    else:
        cropmapping = None
    for source, attribs in sources.items():
        for mode in ['cropped', 'original']:
            seqfile = cseqpath if mode == 'cropped' else inseq
            confile = (os.path.splitext(os.path.basename(seqfile))[0] +
                       attribs[1])
            conpath = os.path.join(outpdbdir, attribs[0], confile)
            if not os.path.isfile(conpath):
                conpred[mode][source] = None
                continue

            conpred[mode][source] = ckio.read(conpath, attribs[2])[0]
            if cropmapping is None:
                continue
            # NOTE(review): as in the original code the back-mapping is
            # applied in both modes; it presumably only makes sense for
            # 'cropped' predictions - confirm.
            for contact in conpred[mode][source]:
                contact.res1_seq = cropmapping[pdbid][chid]['cropbackmap'][
                    contact.res1_seq]
                contact.res2_seq = cropmapping[pdbid][chid]['cropbackmap'][
                    contact.res2_seq]

            # NOT SURE IT IS WORKING WITH SEVERAL CHAINS OF SAME SEQUENCE. CHECK EVERYTHING.

    logger.info('    Parsing PISA interfaces...')
    interfaces = {}

    for mode in ['cropped', 'original']:
        # .get(): the key is absent when the PISA stage was skipped.
        n_mode_ifaces = n_ifaces.get(mode)
        if n_mode_ifaces is None:
            interfaces[mode] = None
            continue

        strfile = cstrpath if mode == 'cropped' else instr
        strroot = os.path.splitext(strfile)[0]
        # Interface files are numbered from 1.
        interfaces[mode] = [
            ckio.read(strroot + ".interface." + str(i + 1) + ".pdb", 'pdb')
            for i in range(int(n_mode_ifaces))
        ]

    # OUTPUT
    # TODO: not implemented yet. When writing the collection csv, repeated
    # rows should be dropped, e.g. read all rows with csv.reader, keep only
    # the first occurrence of each row and rewrite the file with
    # csv.writer.writerows.
Beispiel #3
0
def main():

    starttime=time.time()
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.crops_logger(level="info")
    logger.info(pcl.welcome())

    inseq=check_path(args.input_seqpath[0],'file')
    indb=check_path(args.input_database[0],'file')
    insprot=check_path(args.uniprot_threshold[1]) if args.uniprot_threshold is not None else None

    minlen=float(args.uniprot_threshold[0]) if args.uniprot_threshold is not None else 0.0
    targetlbl=ctg.target_format(indb,terms=args.terminals, th=minlen)
    infixlbl=ctg.infix_gen(indb,terms=args.terminals)

    if args.outdir is None:
        outdir=check_path(os.path.dirname(inseq),'dir')
    else:
        outdir=check_path(os.path.join(args.outdir[0],''),'dir')

    if args.sort is not None:
        if (args.sort[0].lower()!='ncrops' and args.sort[0].lower()!='percent' and
            args.sort[0].lower()!='ncropsin' and args.sort[0].lower()!='percentin'):
            raise ValueError("Arguments for sorting option can only be either 'ncrops' or 'percent'.")
        else:
            sorter=args.sort[0].lower()

#############################################################

    sources=["deepmetapsicov", "psicov"]
    confiledir=["deepmetapsicov", "deepmetapsicov"]
    confilesuffix=["deepmetapsicov.con","psicov"]
    n_sources=len(sources)

# BEGIN: DEBUGGING ONLY

# Skip the execution of PISA, HHBLITS and DeepMetaPSICOV (?) [bool]
#    SKIP_EXEC=[True, True, True]
    SKIP_EXEC=[False, False, False]

# Skip the creation of new dir (?) [bool]
#    SKIP_MKDIR=True
    SKIP_MKDIR=False

# END: DEBUGGING ONLY


# Create output directory
    if not SKIP_MKDIR:
        pmo.mkdirout()
        pmo.printout('Output directory created',extraline=True)

    starttime=pmi2.gettime()
    pmo.printout('Starting Time: '+ starttime[1].strftime("%-d %B %Y, %X") + ' UTC\n\n')

# Check that input values are correct

    hhparameters=pmi1.hhparam('logit')
    dumvar=pmi1.minneigh('logit')
    dumvar=pmi1.scorethreshold("deepmetapsicov",'logit')
    dumvar=pmi1.scorethreshold("psicov",'logit')

#####################################################################
##### MSA generator #################################################

    pmo.printout('*********************************************************')
    pmo.printout('*** MULTIPLE SEQUENCE ALIGNMENT *************************',extraline=True)

    pmo.printout('Importing RCSB PDB sequence from fasta file...')
    fasta_seq=conkit.io.read(pmin.SEQUENCE_PATH,"fasta")[0]
    pmo.printout(fasta_seq)

    if pmin.USE_BIOCHAIN:
        pmo.printout('Obtaining Biological sequence from fasta file...')
        if not SKIP_EXEC[1]:
            biological_seq, seqpath, bio = pms.crop_fasta(fasta_seq,pmo.output_tmpdir("deepmetapsicov"))
            if not bio:
                shutil.copyfile(pmin.SEQUENCE_PATH, seqpath)
        else:
            tmpfile = pmi2.pdbid() + '.bio.fasta'
            seqpath = os.path.join(pmo.output_tmpdir("deepmetapsicov"), tmpfile)
            try:
                biological_seq=conkit.io.read(seqpath,"fasta")[0]
                bio=True
            except:
                bio=False
                tmpfile = pmi2.pdbid() + '.fasta'
                seqpath = os.path.join(pmo.output_tmpdir("deepmetapsicov"), tmpfile)
                biological_seq=conkit.io.read(seqpath,"fasta")[0]
        seq=biological_seq
        tmpfile = pmi2.pdbid() + pms.biofile(bio)+ '.fasta'
        newseqpath = os.path.join(pmo.output_dir(), tmpfile)
        shutil.copyfile(seqpath, newseqpath)
        if bio:
            pmo.printout('Biological sequence:')
            pmo.printout(biological_seq)
        else:
            pmo.printout('WARNING: Biological Sequence not available. Using RCSB PDB fasta sequence for analysis (including cloning artifacts)',errorlog=True, extraline=True) # LOGGING
    else:
        bio=False
        pmo.printout('Using RCSB PDB fasta sequence for analysis (including cloning artifacts)', extraline=True)
        seqpath = os.path.join(pmo.output_dir(), os.path.splitext(os.path.basename(pmin.SEQUENCE_PATH))[0])+ os.path.splitext(os.path.basename(pmin.SEQUENCE_PATH))[1]
        shutil.copyfile(pmin.SEQUENCE_PATH, seqpath)
        seqpath = os.path.join(pmo.output_tmpdir("deepmetapsicov"), os.path.splitext(os.path.basename(pmin.SEQUENCE_PATH))[0])+ os.path.splitext(os.path.basename(pmin.SEQUENCE_PATH))[1]
        shutil.copyfile(pmin.SEQUENCE_PATH, seqpath)
        seq=fasta_seq

    pmo.printout('')
    if pmin.HHBLITS_VIA_DMP:
        pmo.printout("Using DeepMetaPSICOV's execution of HHblits...")
        msapath=''
    else:
        if not SKIP_EXEC[1]:
            pmo.printout('Creating Multiple Sequence Alignment with HHblits...')
            msa, msapath = pmp.runhhblits(bio,param=hhparameters,spath=seqpath)
        else:
            pmo.printout('Reading Multiple Sequence Alignment...')
            msafile = pmi2.pdbid()+pms.biofile(bio)+".msa.aln"
            msapath = os.path.join(pmo.output_dir(), msafile)
            msa = conkit.io.read(msapath,'jones')

    msacovpath=os.path.splitext(msapath)[0]+".coverage.png"
    msaformat='jones'
    sit=0.7

#####################################################################
##### DeepMetaPSICOV: Contact Prediction ############################
    pmo.printout('*********************************************************')
    pmo.printout('*** CONTACT PREDICTION **********************************',extraline=True)

    if not SKIP_EXEC[2]:
        pmo.printout('Running DeepMetaPSICOV for Contact prediction list...')
    else:
        pmo.printout('Skipping DeepMetaPSICOV execution.')

    if pmin.HHBLITS_VIA_DMP:
        msa, msapath = pmp.rundmp(seqpath, msapath,skiphhblits=SKIP_EXEC[1],skipdmp=SKIP_EXEC[2]) # Input Sequence fasta file path, MSA file path
    else:
        if not SKIP_EXEC[2]:
            pmp.rundmp(seqpath, msapath) # Input Sequence fasta file path, MSA file path


#####################################################################
##### PISA : OBTAIN INTERFACE PDB FILES #############################
    pmo.printout('*********************************************************')
    pmo.printout('*** INTERFACE IDENTIFICATION ****************************',extraline=True)

# Obtain Number of interfaces in PDB and produce Interface PDB files (including renumbered)
    if not SKIP_EXEC[0]:
        pmo.printout('Running PISA...')
        n_interfaces = pmp.runpisa(bio)
        for nif in range (n_interfaces):
            pdbintpath = os.path.join(pmo.output_tmpdir("pisa"), pmi2.pdbid()+".interface."+str(nif+1)+".pdb")
            pms.renumberpdbs(pdbintpath, fasta_seq, bio)
    else:
        pmo.printout('Reading PISA xml file...')
        n_interfaces = pmp.n_int_xml()


#####################################################################
##### Interface Confidence Scores ###################################
    pmo.printout('*********************************************************')
    pmo.printout('*** SCORING INTERFACES **********************************',extraline=True)

    pmo.printout('Importing contact prediction file ...',extraline=True)
    conpred=[]
    conpred_id=[]
    conpredpath=[]

    for n in range(n_sources):
        conpredfile=pmi2.pdbid()+pms.biofile(bio)+"."+confilesuffix[n]

        conpredpath.append(os.path.join(pmo.output_tmpdir(confiledir[n]), conpredfile))

        conpred.append(conkit.io.read(conpredpath[n], 'psicov')[0])
        conpred[n].sequence=seq
        conpred[n].set_sequence_register()
        conpred_id.append(conpred[n].id)

    pmo.printout('Remove excluded contact scores ...',extraline=True)

    for n in range(n_sources):
        conpred[n]=pmc.filter_contacts(conpred[n],sources[n])

    cntint=0
    cntlig=0
    cntunk=0
    interfacetype=[0 for i in range(n_interfaces+1)]
    scores = [[[0 for k in range(4)] for j in range(n_sources)] for i in range(n_interfaces)]
    n_contacts_all = [[[0 for k in range(4)] for j in range(n_sources)] for i in range(n_interfaces)]
    ndec=6

    for nif in range(n_interfaces):
        pmo.printout('Importing Interface '+str(nif+1)+' PDB file...')
        maps=pmc.import_interfacepdb(nif,bio)
        conpredInt=[]
        pmo.printout('  Number of maps : ' + str(len(maps)) )
        if (len(maps)==4):
            for n in range(n_sources):
                interfacetype[nif]="MM"
                conpredInt.append(conpred[n].deepcopy())
                if pmin.REMOVE_INTRA_CONTACTS:
                    pmo.printout("  Removing those contacts from interface that also appear as intramolecular contacts ...")
                    for element in [0,3]:
                        conpredInt[n]=pmc.remove_intra_contacts(conpredInt[n],maps[element])

                n_contacts_all[nif][n][0]=conpredInt[n].ncontacts # Total number of contacts predicted for interface
                n_contacts_all[nif][n][1]=maps[1].ncontacts # Total number of contacts from interface PDB file

            #pmo.printout(conpredInt)
            pmo.printout(maps[1])
            cntint += 1

            # Write contact list
            pmo.printout('  Writing contact lists ...' )
            if pmin.REMOVE_INTRA_CONTACTS:
                for element in [0,3]:
                    chainid = str(1) if element == 0 else str(2)
                    confile=pmi2.pdbid()+".pdb.interface."+str(nif+1)+".intrachain"+chainid+".conkit.con"
                    tmppath = os.path.join(pmo.output_tmpdir("pisacov"), confile)
                    conkit.io.write(tmppath, 'psicov', hierarchy=maps[element])
            confile=pmi2.pdbid()+".pdb.interface."+str(nif+1)+".conkit.con"
            tmppath = os.path.join(pmo.output_tmpdir("pisacov"), confile)
            conkit.io.write(tmppath, 'psicov', hierarchy=maps[1])

            pmo.printout('  Matching contact prediction and interface contact lists ...',extraline=True )
            contactpath=os.path.join(pmo.output_tmpdir("pisacov"), confile)
            for n in range(n_sources):
                conpredInt_pdb=pmc.pred_pdb_matchlist(conpredInt[n],contactpath)
                conpredInt_pdb.id="     Matching contacts of " + conpred_id[n]
                conpredInt_pdb.sequence = seq.deepcopy()
                conpredInt_pdb.set_sequence_register()

                if conpredInt_pdb.ncontacts == 0:
                    pmo.printout("  WARNING: No contacts found in the filtered interface "+str(nif)+" pdb file. SKIP this result type.",errorlog=True)
                    for sc in range(4):
                        scores[nif][n][sc]="***"
                    n_contacts_all[nif][n][3] = "***"
                else:
                    map_matched, n_contacts_all[nif][n][2], scores[nif][n][0], scores[nif][n][1], scores[nif][n][2], scores[nif][n][3] = pmc.match_maps(conpredInt_pdb, maps[1])
                    for sc in range(4):
                        scores[nif][n][sc]=round(scores[nif][n][sc],ndec)

                    n_contacts_all[nif][n][3] = n_contacts_all[nif][n][2] / n_contacts_all[nif][n][1]
                    ## Plot matched map
                    pngfile=pmi2.pdbid()+".interface."+str(nif+1)+"."+sources[n]+".con.png"
                    confile=pmi2.pdbid()+"."+sources[n]+".interface."+str(nif+1)+".conkit.con"

                    pngpath = os.path.join(pmo.output_dir(), pngfile)
                    fig = conkit.plot.ContactMapFigure(map_matched, reference=maps[1])
                    fig.savefig(pngpath, overwrite=True)

                    tmppath = os.path.join(pmo.output_dir(), confile)
                    conkit.io.write(tmppath, 'psicov',hierarchy=conpredInt_pdb)
                    plt.close('all')

        elif (len(maps)==2):
            interfacetype[nif]="Unk"
            pmo.printout("  WARNING: Unexpected number of maps (2)", errorlog=True)
            pmo.printout('----> REVISE CONTACT MAPS FOR THIS INTERFACE', errorlog=True,extraline=True)

            cntunk += 1
            cnt=0
            for score in range(4):
                for n in range(n_sources):
                    scores[nif][n][score]="***"
            for mapn in maps:
                confile=pmi2.pdbid()+".pdb."+str(cnt+1)+".interface."+str(nif+1)+".conkit.con"  #### CHECK FILES CREATED WHEN FIXING THIS SECTION.
                tmppath = os.path.join(pmo.output_dir(), confile)
                conkit.io.write(tmppath, 'psicov', hierarchy=maps[1])
                cnt += 1
        elif (len(maps)==1):
            interfacetype[nif]="LM"
            pmo.printout("  ... Ligand-monomer interface detected")
            pmo.printout('      SKIP', extraline=True)

            cntlig += 1
            for score in range(4):
                for n in range(n_sources):
                    scores[nif][n][score]="***"
        else:
            interfacetype[nif]="Unk"
            pmo.printout("  WARNING: Unexpected number of maps", errorlog=True)
            pmo.printout('      SKIP', extraline=True, errorlog=True)

            cntunk += 1
            for score in range(4):
                for n in range(n_sources):
                    scores[nif][n][score]="***"



#####################################################################
##### OUTPUT ########################################################

    pmo.printout('*********************************************************')
    pmo.printout('*** FINAL OUTPUT ****************************************',extraline=True)

    pmo.printout('Printing final data to file...', extraline=True)
    datfile=pmi2.pdbid()+".pisacov.out.dat"
    datpath = os.path.join(pmo.output_dir(), datfile)
    now = pmi2.gettime()
    space=str(ndec+4)
    with open(datpath, 'w') as out:
        if bio:
            out.write( "# PDB id: "+ pmi2.pdbid() + ' - Contact def: 0<d<8 Angstroms - Cloning artifacts removed from fasta file sequence - ' )
        else:
            out.write( "# PDB id: "+ pmi2.pdbid() + ' - Contact def: 0<d<8 Angstroms - Original fasta file sequence used - ' )
        out.write( now[1].strftime("%-d %B %Y, %X") + ' UTC')
        out.write('\n')
        datahead='# IF_id IF_type n_pdb  '
        for n in range(n_sources):
            datahead += 'n_'+sources[n]+'  '
            datahead += 'n_'+sources[n]+'/n_pdb  '
            datahead += 'Av_score_'+sources[n]+'  '
            datahead += 'Acc_score_'+sources[n]+'  '
            datahead += 'P_'+sources[n]+'  '
        out.write(datahead+'\n')
        for nif in range(n_interfaces):
            outformat='{:>4s}{:>4s}{:>4s}'
            outstring = str(outformat.format(str(nif+1),interfacetype[nif],str(n_contacts_all[nif][0][1])))
            for n in range(n_sources):
                #outformat='{:>4s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}'
                outformat='{:>4s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}{:>'+space+'s}'
                outstring += str(outformat.format(str(n_contacts_all[nif][n][2]),str(scores[nif][n][0]),str(scores[nif][n][1]),str(scores[nif][n][2]),str(scores[nif][n][3])))
            out.write(outstring+'\n')
    sumlogfile=pmi2.pdbid()+".pisacov.out.summary.log"
    sumlogpath = os.path.join(pmo.output_dir(), sumlogfile)
    now = pmi2.gettime()

    with open(sumlogpath, 'w') as out:
        out.write('*********************************************************\n')
        out.write('*** S U M M A R Y  **  P I S A C O V ********************\n')
        out.write('*********************************************************\n\n')
        out.write(now[1].strftime("%-d %B %Y, %X") + ' UTC\n\n')
        out.write('Protein PDB ID:  '+pmi2.pdbid()+'\n\n')
        out.write('*********************************************************\n')
        out.write('--- Multiple Sequence Alignment ---\n\n')
        outformat='{:<55s}{:<80s}'
        out.write(outformat.format('   MSA file: ', msapath))
        out.write('\n')
        out.write(outformat.format('   MSA format: ', msaformat))
        out.write('\n')
        out.write(outformat.format('   Sequence Identity Threshold: ', str(sit) ))
        out.write('\n')
        out.write(outformat.format('   Length of the Target Sequence: ',str(msa.top_sequence.seq_len)))
        out.write('\n')
        out.write(outformat.format('   Total number of sequences: ', str(msa.nseq)))
        out.write('\n')
        out.write(outformat.format('   Number of Effective Sequences: ', str(msa.meff)))
        out.write('\n')
        out.write(outformat.format('   Proportion of Effective Sequences: ', str(round(100*msa.meff/msa.nseq,2))+' %'))
        out.write('\n')
        out.write(outformat.format('   Sequence Coverage Plot: ', msacovpath))
        out.write('\n\n')
        out.write(outformat.format('   MSA created with: ',pmin.HHSUITE_PATH))
        out.write('\n')
        out.write(outformat.format('   Reference database name: ', pmin.HHBLITS_DATABASE_NAME))
        out.write('\n')
        out.write(outformat.format('   Reference database path: ', pmin.HHBLITS_DATABASE_DIR))
        out.write('\n')
        out.write(outformat.format('   Number of iterations: ', str(pmi1.hhparam()[0])))
        out.write('\n')
        out.write(outformat.format('   E-value cutoff for inclusion in result alignment: ', str(pmi1.hhparam()[1])))
        out.write('\n')
        out.write(outformat.format('   Non-redundant sequences to keep: ', str(pmi1.hhparam()[2])))
        out.write('\n')
        out.write(outformat.format('   Minimum coverage with master sequence (%): ', str(pmi1.hhparam()[3])))
        out.write('\n')
        out.write(outformat.format('   Maximum pairwise sequence identity: ', str(pmi1.hhparam()[4])))
        out.write('\n\n')
        out.write('*********************************************************\n')
        out.write('--- Contact Prediction ---\n\n')
        outformat='{:<55s}{:<80s}'
        out.write(outformat.format('   Contact prediction list(s) created with: ', pmin.DMP_PATH))
        out.write('\n')
        for n in range(n_sources):
            out.write(outformat.format('   Contact prediction file ('+sources[n]+'): ', conpredpath[n]))
            out.write('\n')
        out.write(outformat.format('   Minimum distance within sequence (neigh. cutoff): ', str(pmi1.minneigh()) ))
        out.write('\n')
        for n in range(n_sources):
            out.write(outformat.format('   Contact prediction score threshold ('+sources[n]+'): ', str(pmi1.scorethreshold(sources[n]))))
            out.write('\n')
        out.write('\n')
        out.write('*********************************************************\n')
        out.write('--- Interfaces ---\n\n')
        outformat='{:<55s}{:<80s}'
        out.write(outformat.format('   Total Number of Interfaces: ', str(n_interfaces) ))
        out.write('\n')
        out.write(outformat.format('   Number of Intramolecular interfaces: ', str(cntint) ))
        out.write('\n')
        out.write(outformat.format('   Number of Ligand-monomer interfaces: ', str(cntlig) ))
        out.write('\n')
        out.write(outformat.format('   Number of Unidentified interfaces: ', str(cntunk) ))
        out.write('\n\n')
        for nif in range(n_interfaces):
            nif1=nif+1
            out.write('   --- Interface '+str(nif1)+' ---\n')
            if (interfacetype[nif] == "MM"):
                out.write(outformat.format('      Interface type:','monomer - monomer (MM)'))
                out.write('\n')
                out.write(outformat.format('      Total number of intermolecular contacts (pdb): ', str(n_contacts_all[nif][0][1])))
                out.write('\n')
                for n in range(n_sources):
                    extral='\n\n' if n==n_sources-1 else '\n'
                    out.write('      + '+sources[n]+' Scores +\n')
                    out.write(outformat.format('        Number of True Positives : ', str(n_contacts_all[nif][n][2]) ))
                    out.write('\n')
                    out.write(outformat.format('        Proportion of True positives: ', str(n_contacts_all[nif][n][3])))
                    out.write('\n')
                    out.write(outformat.format('        Jaccard Index (|PDB ⋂ pred| / |PDB U pred|): ', str(scores[nif][n][0])))
                    out.write('\n')
                    out.write(outformat.format('        Average value of the scores of True Positives: ', str(scores[nif][n][1])))
                    out.write('\n')
                    out.write(outformat.format('        Sum of all the scores of True Positives: ', str(scores[nif][n][2])))
                    out.write('\n')
                    out.write(outformat.format('        Probabilistic score for whole interface: ', str(scores[nif][n][3])))
                    out.write(extral)

            elif (interfacetype[nif] == "LM"):
                out.write(outformat.format('      Interface type:','ligand - monomer (LM)'))
                out.write('\n\n')
            elif (interfacetype[nif] == "Unk"):
                out.write(outformat.format('      Interface type:','unidentified (Unk)'))
                out.write('\n\n')


    pmo.printout('*********************************************************')
    pmo.printout('*** S U M M A R Y ***************************************')
    pmo.printout('*********************************************************', extraline=True)

    pmo.printout('Protein PDB ID: %s' % pmi2.pdbid(), extraline=True)

    pmo.printout('--- Multiple Sequence Alignment ---')
    pmo.printout('   MSA file: ' + msapath)
    pmo.printout('   MSA format: ' + msaformat)
    pmo.printout('   Sequence Identity Threshold: '+ str (sit))
    pmo.printout('   Length of the Target Sequence: ' + str(msa.top_sequence.seq_len))
    pmo.printout('   Total number of sequences: ' + str(msa.nseq))
    pmo.printout('   Number of Effective Sequences: ' + str(msa.meff))
    pmo.printout('   Proportion of Effective Sequences: ' + str( round(100*msa.meff/msa.nseq,2))+' %')
    pmo.printout('   Sequence Coverage Plot: ' + msacovpath, extraline=True)

    pmo.printout('   MSA created with: ' + pmin.HHSUITE_PATH)
    pmo.printout('   Reference database name: ' + pmin.HHBLITS_DATABASE_NAME)
    pmo.printout('   Reference database path: ' + pmin.HHBLITS_DATABASE_DIR)
    pmo.printout('   Number of iterations: ' + str(pmi1.hhparam()[0]))
    pmo.printout('   E-value cutoff for inclusion in result alignment: ' + str(pmi1.hhparam()[1]))
    pmo.printout('   Non-redundant sequences to keep: ' + str(pmi1.hhparam()[2]))
    pmo.printout('   Minimum coverage with master sequence (%): ' + str(pmi1.hhparam()[3]))
    pmo.printout('   Maximum pairwise sequence identity: ' + str(pmi1.hhparam()[4]), extraline=True)

    pmo.printout('--- Contact Prediction ---')
    pmo.printout('   Contact prediction list created with: ' + pmin.DMP_PATH)
    for n in range(n_sources):
        pmo.printout('   Contact prediction file ('+sources[n]+'): ' + conpredpath[n])
    pmo.printout('   Minimum distance within sequence (neigh. cutoff): '+ str(pmi1.minneigh()))
    for n in range(n_sources):
        extral=True if n==n_sources-1 else False
        pmo.printout('   Contact prediction score threshold ('+sources[n]+'): ' + str(pmi1.scorethreshold(sources[n])),extraline=extral)

    pmo.printout('--- Interfaces ---')
    pmo.printout('   Number of Interfaces: ' + str(n_interfaces) + ", of which:")
    pmo.printout('     Number of Intramolecular interfaces: ' + str(cntint))
    pmo.printout('     Number of Ligand-monomer interfaces: ' + str(cntlig))
    pmo.printout('     Number of Unidentified interfaces: ' + str(cntunk),extraline=True)

    for nif in range(n_interfaces):
        nif1=nif+1
        pmo.printout('  --- Interface '+str(nif1)+' ---',extraline=True)

        if (interfacetype[nif] == "MM"):
            pmo.printout('   Interface type: monomer - monomer (MM)')
            pmo.printout('       Total number of intermolecular contacts (pdb): ' + str(n_contacts_all[nif][0][1]))
            for n in range(n_sources):
                extral=True if n==n_sources-1 else False
                pmo.printout('       + Scores ('+sources[n]+'):')
                pmo.printout('         Number of True Positives : ' + str(n_contacts_all[nif][n][2]))
                pmo.printout('         Proportion of True positives: ' + str(n_contacts_all[nif][n][3]) )
                pmo.printout('         Jaccard Index (|PDB ⋂ pred| / |PDB U pred|): ' + str(scores[nif][n][0]) )
                pmo.printout('         Average value of the scores of True Positives: ' + str(scores[nif][n][1] ))
                pmo.printout('         Sum of all the scores of True Positives: ' + str(scores[nif][n][2] ))
                pmo.printout('         Probabilistic score for whole interface: ' + str(scores[nif][n][3] ),extraline=extral)

        elif (interfacetype[nif] == "LM"):
            pmo.printout('   Interface type: ligand - monomer (LM)',extraline=True)
        elif (interfacetype[nif] == "Unk"):
            pmo.printout('   Interface type: unidentified (Unk)',extraline=True)

#####################################################################
##### TIMEIT ########################################################

    pmo.printout('*********************************************************')
    pmo.printout('*** T I M I N G *****************************************')
    pmo.printout('*********************************************************', extraline=True)

    with open(sumlogpath, 'a') as out:
        out.write('*********************************************************\n')
        out.write('*** T I M I N G *****************************************\n')
        out.write('*********************************************************\n\n')

        endtime=pmi2.gettime()
        totaltime=pmi2.readabletime(endtime[0]-starttime[0])

        out.write('Starting Time: '+ starttime[1].strftime("%-d %B %Y, %X") + ' UTC\n\n')
        out.write('Ending Time: '+ endtime[1].strftime("%-d %B %Y, %X") + ' UTC\n\n')
        out.write('Process Wallclock time: '+str(endtime[0]-starttime[0])+' s\n')
        out.write('      or, equivalently: ' + totaltime+'\n\n')

    pmo.printout('Starting Time: '+ starttime[1].strftime("%-d %B %Y, %X")+' UTC', extraline=True)
    pmo.printout('Ending Time: '+ endtime[1].strftime("%-d %B %Y, %X")+' UTC', extraline=True)
    pmo.printout('Process Wallclock time: '+str(endtime[0]-starttime[0])+' s')
    pmo.printout('      or, equivalently: ' + totaltime, extraline=True)
Beispiel #4
0
def main():
    """Create or update the configuration file from command-line options.

    Depending on the parsed arguments this entry point will:

    * download the SIFTS database (``update_sifts_database``) and record
      its location as ``SIFTS_PATH``;
    * print the current configuration file and return
      (``view_configuration``);
    * print the path of the configuration file and return
      (``get_confpath``);
    * otherwise build a new configuration dictionary, either interactively
      (``conf_file`` is True, every keyword is asked for via ``input``) or
      from the individual command-line options, and write it out with
      ``_outconffile``.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    kwlist = pco._default_keys()

    # Values supplied on the command line, keyed by configuration keyword.
    configin = {}

    if args.update_sifts_database is not None:
        outpath = ppaths.check_path(args.update_sifts_database[0], 'either')
        # A directory was given: store the database under a default name.
        if os.path.isdir(outpath) is True:
            outpath = os.path.join(outpath,
                                   ('pdb_chain_uniprot' + os.extsep + 'csv'))
        pol.getsifts(outpath)
        configin['SIFTS_PATH'] = outpath

    if args.view_configuration is True:
        print('** ' + __prog__ + " v." + __version__ +
              ' configuration file **' + os.linesep)
        with open(pconf.__file__) as f:
            for line in f:
                line = _lineformat(line)
                print(line, end='')
        print('** End of configuration file **' + os.linesep)
        return

    if args.get_confpath is True:
        print(pconf.__file__)
        return

    # Start from the existing configuration unless a whole new file is
    # being written interactively.
    if args.conf_file is False:
        newconf = pco._parse_conf()
    else:
        newconf = {}

    if args.reset_hhblits_arguments is True:
        newconf['HHBLITS_PARAMETERS'] = pco._default_values(
            'HHBLITS_PARAMETERS')

    if args.sifts_path is not None:
        configin['SIFTS_PATH'] = args.sifts_path[0]
    if args.pisa_path is not None:
        configin['PISA_PATH'] = args.pisa_path[0]
    if args.hhblits_path is not None:
        configin['HHBLITS_PATH'] = args.hhblits_path[0]
    if args.dmp_path is not None:
        configin['DMP_PATH'] = args.dmp_path[0]
    if args.uniclust_path is not None:
        configin['UNICLUST_FASTA_PATH'] = args.uniclust_path[0]
    if args.hhblits_arguments is not None:
        configin['HHBLITS_PARAMETERS'] = args.hhblits_arguments[0]
    if args.neighbours is not None:
        # An explicit neighbour distance disables intra-contact removal.
        configin['NEIGHBOURS_MINDISTANCE'] = args.neighbours[0]
        configin['REMOVE_INTRA_CONTACTS'] = False
    if isinstance(args.hhblits_location, list) is True:
        configin['HHBLITS_DATABASE_NAME'] = args.hhblits_location[0]
        configin['HHBLITS_DATABASE_DIR'] = args.hhblits_location[1]

    # Prompts shown for each keyword in interactive mode.
    compmsg = {
        'SIFTS_PATH':
        "Please, enter the path to the SIFTS csv file:\n",
        'PISA_PATH':
        "Please, enter the path to the PISA executable:\n",
        'HHBLITS_PATH':
        "Please, enter the path to the HHBLITS executable:\n",
        'HHBLITS_DATABASE_NAME':
        ("Please, enter the name of the HHBLITS database name\n" +
         "as requested by DeepMetaPSICOV (e.g. uniclust30_2018_08):\n"),
        'HHBLITS_DATABASE_DIR':
        "Please, enter the directory of the HHBLITS database:\n",
        'DMP_PATH':
        "Please, enter the path to the DeepMetaPSICOV executable:\n",
        'HHBLITS_PARAMETERS':
        ("Please, enter the 5 HHBLITS parameters.\n" +
         "#iterations, E-value cutoff, Non-redundant seqs to keep, " +
         " MinimumCoverageWithMasterSeq(%), and MaxPairwiseSequenceIdentity.\n"
         + "Leave empty for DMP default (3, 0.001, 'inf', 50, 99).\n"),
        'UNICLUST_FASTA_PATH':
        ("Please, enter the path to the UniClust fasta file.\n"
         "An empty input will deactivate this option.\n"),
        'NEIGHBOURS_MINDISTANCE':
        ("Press ENTER for intramolecular contacts " +
         "to be removed from intermolecular contact lists.\n" +
         "Otherwise, enter the minimum distance to be cosidered " +
         "between neighbours. This will override the removal of" +
         "intramolecular contacts that will be now ignored.\n")
    }

    # The last keyword in kwlist is skipped here; REMOVE_INTRA_CONTACTS is
    # derived from NEIGHBOURS_MINDISTANCE instead of being asked for.
    for keystr in kwlist[:-1]:
        if args.conf_file is True:
            # Interactive mode: keep asking until the value validates.
            while True:
                newval = input(compmsg[keystr])
                try:
                    newconf[keystr] = pco._check_input(newval, keystr)
                    if keystr == 'NEIGHBOURS_MINDISTANCE':
                        # Empty answer means "remove intra contacts".
                        newconf['REMOVE_INTRA_CONTACTS'] = (
                            newval is None or newval == "")
                except Exception:
                    print("Not a valid input. Please, try again:")
                else:
                    break
        elif keystr in configin:
            newconf[keystr] = pco._check_input(configin[keystr], keystr)
            if keystr == 'NEIGHBOURS_MINDISTANCE':
                newconf['REMOVE_INTRA_CONTACTS'] = pco._check_input(
                    configin['REMOVE_INTRA_CONTACTS'],
                    'REMOVE_INTRA_CONTACTS')

    if 'REMOVE_INTRA_CONTACTS' in configin:
        newconf['REMOVE_INTRA_CONTACTS'] = pco._check_input(
            configin['REMOVE_INTRA_CONTACTS'], 'REMOVE_INTRA_CONTACTS')
        if newconf['REMOVE_INTRA_CONTACTS'] is True:
            # Was a no-op comparison (`==`); the value must be assigned.
            newconf['NEIGHBOURS_MINDISTANCE'] = 2

    _outconffile(newconf)

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)
    return
def main():
    """Prepare inputs and run the external programs of the pipeline.

    Steps, in order:

    1. Parse arguments and the configuration (``pco._initialise_inputs``).
    2. Parse the sequence and structure files and determine the single
       ``pdbid`` they refer to.
    3. Unless ``--skip_conpred`` was given, crop/renumber the inputs with
       CROPS, build the MSAs with HHblits, run DeepMetaPSICOV and
       generate interfaces with PISA.

    Raises:
        Exception: if neither the sequence set nor the structure set
            contains exactly one entry, or if no pdbid can be derived
            from them.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()

    invals['INSEQ'] = None
    invals['INSTR'] = None
    invals['ALTDB'] = None
    invals['OUTROOT'] = None
    invals['OUTCSVPATH'] = None
    invals['UPTHRESHOLD'] = None

    # READ INPUT ARGUMENTS
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INSTR'] = ppaths.check_path(args.crystalpath[0], 'file')

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.uniprot_threshold is not None:
        try:
            invals['UPTHRESHOLD'] = float(args.uniprot_threshold[0])
        except ValueError:
            # Best effort: the problem is logged and the threshold stays
            # None.
            logger.critical('Uniprot threshold given not valid.')
        if invals['UNICLUST_FASTA_PATH'] is None:
            invals['UNICLUST_FASTA_PATH'] = pco._uniurl

    if args.skip_conpred is True:
        skipexec = True
        if (args.hhblits_arguments is not None
                or args.uniprot_threshold is not None):
            logger.info(
                'HHblits, UniProt threshold parameters given bypassed by --skip_conpred'
            )
    else:
        skipexec = False
    cropping = args.remove_insertions

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    # OUTCSVPATH holds two entries: [cropped csv, full csv].
    invals['OUTCSVPATH'] = []
    if args.collection_file is None:
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "cropped" +
                              os.extsep + "pisacov" + os.extsep + "csv"))))
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "full" + os.extsep +
                              "pisacov" + os.extsep + "csv"))))
    else:
        if cropping is True:
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))
            invals['OUTCSVPATH'].append(
                ppaths.check_path(
                    os.path.splitext(args.collection_file[0])[0] + os.extsep +
                    'full' + os.extsep +
                    os.path.splitext(args.collection_file[0])[1]))
        else:
            invals['OUTCSVPATH'].append(None)
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))

    # Define formats used
    # NOTE(review): `sources` is not used below in this function; the call
    # is kept in case it has side effects.
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])

    logger.info('Parsing structure file...')
    strs, filestrs = cps.parsestrfile(invals['INSTR'])

    # Determine the pdbid. It starts as None so that a failed lookup is
    # reported explicitly instead of surfacing as a NameError further down.
    pdbid = None
    if len(seqs) == 1 or len(strs) == 1:
        if len(seqs) == 1:
            for key in seqs:
                pdbid = key
        elif len(seqs) > 1 and len(strs) == 1:
            # Several sequences, one structure: pick the sequence id that
            # equals, or is contained in, the structure id.
            for key in strs:
                for key2 in seqs:
                    if key.upper() == key2.upper():
                        pdbid = key.upper()
                    elif key2.upper() in key.upper():
                        pdbid = key2.upper()
    else:
        raise Exception(
            'More than one pdbid in sequence and/or structure set.')

    if pdbid is None:
        raise Exception(
            'Unable to determine the pdbid from the sequence and '
            'structure sets.')

    seq = seqs[pdbid]

    # CROPPING AND RENUMBERING
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    instrc = os.path.join(invals['OUTROOT'], pdbid,
                          os.path.basename(invals['INSTR']))

    fseq = {}
    fmsa = {}
    if skipexec is False:
        if cropping is True:
            logger.info('Cropping and renumbering sequences, ' +
                        'structures according to SIFTS database.')
            logger.info(pcl.running('CROPS-cropstr'))
            itime = datetime.datetime.now()
            psc.runcrops(invals['INSEQ'], invals['INSTR'],
                         invals['SIFTS_PATH'], invals['UPTHRESHOLD'],
                         invals['UNICLUST_FASTA_PATH'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-cropstr', done=itime))
        else:
            logger.info('Renumbering structure ' +
                        'according to position in sequence.')
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'], invals['INSTR'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-renumber', done=itime))

        ppaths.mdir(outpdbdir)
        if cropping is False:
            psc.splitseqs(invals['INSEQ'], outpdbdir)
        copyfile(invals['INSTR'], instrc)

    # Per-sequence fasta and MSA paths; the fasta is only written when the
    # external programs are going to be run.
    for i, iseq in seq.imer.items():
        fiseq = pdbid + '_' + i + '.fasta'
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid, fiseq)
        fiseq = pdbid + '_' + i + '.msa.aln'
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', fiseq)
        if skipexec is False:
            iseq.dump(fseq[i])

    # Parse cropped sequences and maps
    if cropping is True:
        amap = {}
        fcropseq = {}
        fcropmsa = {}
        for i, iseq in seq.imer.items():
            fprefix = pdbid + '_' + i + '.crops.to_uniprot'
            fmap = os.path.join(invals['OUTROOT'], pdbid,
                                fprefix + os.extsep + 'cropmap')
            amap.update(cps.parsemapfile(fmap)[pdbid])
            fcropseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                                       fprefix + os.extsep + 'fasta')
            fcropmsa[i] = os.path.join(
                invals['OUTROOT'], pdbid, 'hhblits',
                (fprefix + os.extsep + 'msa' + os.extsep + 'aln'))
            seq.set_cropmaps(amap, cropmain=True)

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    pisadir = os.path.join(invals['OUTROOT'], pdbid, 'pisa', '')
    fstr = os.path.join(
        invals['OUTROOT'],
        (pdbid + os.extsep + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb'))
    if cropping:
        fcropstr = os.path.join(
            invals['OUTROOT'], pdbid,
            (pdbid + os.extsep + 'crops' + os.extsep + 'oldids' + os.extsep +
             'to_uniprot' + os.path.splitext(invals['INSTR'])[1]))
    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'], hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            if cropping is True:
                iseq.cropmsa = themsa
                if iseq.ncrops() == 0:
                    # Nothing was cropped: the cropped MSA also serves as
                    # the full-sequence MSA.
                    iseq.msa = iseq.cropmsa
                    logger.info('    Cropped sequence ' + iseq.oligomer_id +
                                '_' + iseq.name +
                                ' is identical to original sequence.')
            else:
                iseq.msa = themsa

    # DEEP META PSICOV RUN
    ppaths.mdir(dmpdir)
    if skipexec is False:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            # DMP is run on a copy of the fasta file placed in its own dir.
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # INTERFACE GENERATION, PISA
    ppaths.mdir(pisadir)
    if skipexec is False:
        logger.info('Generating interface files via PISA...')
        sfile = fcropstr if cropping is True else fstr
        logger.info(pcl.running('PISA'))
        itime = datetime.datetime.now()
        iflist = psp.runpisa(sfile, pisadir, sessionid=pdbid)
        logger.info(pcl.running('PISA', done=itime))

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
Beispiel #6
0
def main():
    """Score user-provided dimer interface files against contact predictions.

    Steps, in order:

    1. Parse arguments and the configuration (``pco._initialise_inputs``).
    2. Collect the interface (dimer) files, expanding ``*`` wildcards and
       dropping duplicates while keeping their order.
    3. Unless ``--skip_conpred`` was given, renumber every interface with
       CROPS, build the MSAs with HHblits and run DeepMetaPSICOV.
    4. Read the predicted contact maps and the interface structures, match
       them, and append one result line per analysed interface to both the
       per-pdbid csv file and the collection csv file.

    Raises:
        Exception: if the sequence file does not contain exactly one pdbid.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    # pcl.welcome() supplies the start time later passed to pcl.ok().
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()

    invals['INSEQ'] = None
    invals['INIFS'] = None
    invals['OUTROOT'] = None
    invals['OUTCSVPATH'] = None

    # READ INPUT ARGUMENTS
    # This was a comparison (`==`), which left INSEQ as None.
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')

    invals['INIFS'] = []
    args.remove_insertions = False
    for fp in args.dimers:
        if '*' in fp:
            invals['INIFS'] += ppaths.check_wildcard(fp)
        else:
            invals['INIFS'].append(ppaths.check_path(fp, 'file'))
    # Remove duplicated paths while preserving the order given.
    invals['INIFS'] = list(dict.fromkeys(invals['INIFS']))

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.skip_conpred is True:
        skipexec = True
        if args.hhblits_arguments is not None:
            logger.info('HHblits parameters given bypassed by --skip_conpred')
    else:
        skipexec = False

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    if args.collection_file is None:
        invals['OUTCSVPATH'] = ppaths.check_path(
            os.path.join(invals['OUTROOT'],
                         ("evcovsignal" + os.extsep + "full" + os.extsep +
                          "pisacov" + os.extsep + "csv")))
    else:
        invals['OUTCSVPATH'] = ppaths.check_path(args.collection_file[0])

    # The collection file only gets a header when it is first created.
    if os.path.isfile(invals['OUTCSVPATH']) is False:
        pic.csvheader(invals['OUTCSVPATH'], cropped=False)

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])

    if len(seqs) != 1:
        raise Exception('More than one pdbid in sequence set.')
    pdbid = next(iter(seqs)).lower()

    seq = seqs[pdbid]

    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")

    # RENUMBERING
    fseq = {}
    fmsa = {}

    if skipexec is False:
        # Create the per-pdbid directory before copying files into it.
        ppaths.mdir(outpdbdir)
        if invals['INIFS'] is not None:
            logger.info('Renumbering interfaces provided ' +
                        'according to position in sequence.')
            for path in invals['INIFS']:
                instrc = os.path.join(invals['OUTROOT'], pdbid,
                                      os.path.basename(path))
                logger.info(pcl.running('CROPS-renumber'))
                itime = datetime.datetime.now()
                psc.renumcrops(invals['INSEQ'], path, invals['OUTROOT'])
                copyfile(path, instrc)
                logger.info(pcl.running('CROPS-renumber', done=itime))

    # Per-sequence fasta and MSA paths; the fasta is only written when the
    # external programs are going to be run.
    for i, iseq in seq.imer.items():
        fiseq = pdbid + '_' + i + os.extsep + 'fasta'
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid, fiseq)
        fiseq = pdbid + '_' + i + os.extsep + 'msa' + os.extsep + 'aln'
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', fiseq)
        if skipexec is False:
            iseq.dump(fseq[i])

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    # Paths of the renumbered interface structures, one per input file.
    fstr = []
    for file in invals['INIFS']:
        fstr.append(
            os.path.join(
                invals['OUTROOT'],
                (os.path.splitext(os.path.basename(file))[0] + os.extsep +
                 'crops' + os.extsep + 'seq' + os.extsep + 'pdb')))

    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)

        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            sfile = fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'], hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            iseq.msa = themsa

        # DEEP META PSICOV RUN
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')

        ppaths.mdir(dmpdir)
        for i, iseq in seq.imer.items():
            sfile = fseq[i]
            afile = fmsa[i]
            # DMP is run on a copy of the fasta file placed in its own dir.
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # GENERATE INTERFACE LIST
    iflist = []
    for filepath in fstr:
        ifname = os.path.splitext(os.path.basename(filepath))[0]
        iflist.append(pci.interface(name=ifname))

    # CONTACT ANALYSIS AND MATCH
    logger.info('Opening output csv files...')
    resultdir = os.path.join(invals['OUTROOT'], pdbid, 'pisacov', '')
    ppaths.mdir(resultdir)
    csvfile = os.path.join(
        resultdir, (pdbid + os.extsep + "evcovsignal" + os.extsep + "full" +
                    os.extsep + "pisacov" + os.extsep + "csv"))

    pic.csvheader(csvfile, cropped=False, pisascore=False)

    logger.info('Parsing sequence files...')
    for i, fpath in fseq.items():
        seq.imer[i].seqs['conkit'] = ckio.read(fpath, 'fasta')[0]
        seq.imer[i].biotype = csq.guess_type(seq.imer[i].seqs['mainseq'])

    logger.info('Parsing contact predictions lists...')
    conpred = {}
    # matches[i] is a {source: contact atlas} dict, or None when interface
    # i is skipped.
    matches = []
    for s in seq.imer:
        if s not in conpred:
            conpred[s] = {}
        for source, attribs in sources.items():
            fc = os.path.splitext(os.path.basename(fseq[s]))[0]
            fc += attribs[1]
            confile = os.path.join(invals['OUTROOT'], pdbid, attribs[0], fc)
            conpred[s][source] = ckio.read(confile, attribs[2])[0]

    logger.info('Parsing crystal structure contacts...')
    for i in range(len(iflist)):
        inputmap = ckio.read(fstr[i], 'pdb')
        # Only structure files that yield exactly four maps are analysed.
        if len(inputmap) != 4:
            iflist[i].structure = None
            matches.append(None)
            continue

        chnames = list(iflist[i].chains.keys())
        chtypes = list(iflist[i].chains.values())
        if (seq.whatseq(chnames[0]) != seq.whatseq(chnames[1])
                or (chtypes[0] != 'Protein' or chtypes[1] != 'Protein')):
            if chtypes[0] != "Protein" or chtypes[1] != "Protein":
                logger.info(
                    'Interface ' + str(i) +
                    ' is not a Protein-Protein interface. Ignoring.')
            else:
                logger.info('Interface ' + str(i) +
                            ' is not a homodimer. Ignoring.')
            iflist[i].structure = None
            matches.append(None)
            continue

        s = seq.whatseq(chnames[0])
        try:
            iflist[i].structure = []
            for m in range(len(inputmap)):
                iflist[i].structure.append(inputmap[m].as_contactmap())
                iflist[i].structure[m].id = inputmap[m].id
        except Exception:
            # ConKit LEGACY. Rebuild the list from scratch so a partly
            # converted list is not extended with duplicate entries.
            iflist[i].structure = [
                inputmap[m] for m in range(len(inputmap))
            ]

        matches.append({})
        for source, attribs in sources.items():
            matches[i][source] = pcc.contact_atlas(
                name=pdbid + '_' + str(s),
                conpredmap=conpred[s][source],
                strmap=iflist[i].structure,
                sequence=seq.imer[s],
                removeintra=True)

    logger.info('Computing results and writing them to file...')
    for i in range(len(iflist)):
        if matches[i] is None:
            continue
        results = [pdbid, str(i + 1)]
        results.append(matches[i]['psicov'].chain1)
        results.append(matches[i]['psicov'].chain2)
        sid = seq.whatseq(matches[i]['psicov'].chain1)
        results.append(str(sid))
        results.append(str(seq.imer[sid].length()))
        # NOTE(review): only `iseq.msa` is assigned in this function, yet
        # `cropmsa` is read here -- confirm which attribute is intended.
        results.append(str(seq.imer[sid].cropmsa.meff))
        results.append(str(seq.imer[sid].ncrops()))
        results.append(str(seq.imer[sid].full_length()))
        for source, attribs in sources.items():
            appresults = pcs.list_scores(matches[i][source], tag=source)
            results += appresults

        # Each line goes to the per-pdbid file and to the collection file.
        pic.lineout(results, csvfile)
        pic.lineout(results, invals['OUTCSVPATH'])

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
Beispiel #7
0
def main():
    """Run the pisacov pipeline for one sequence/structure pair.

    The function reads the command-line arguments and the configuration
    values, parses the input sequence and structure files and then, unless
    ``--skip_conpred`` was given, runs in this order: CROPS (cropping or
    renumbering), HHblits (one MSA per sequence), DeepMetaPSICOV (contact
    predictions) and PISA (interfaces). With ``--skip_conpred`` the files
    those programs are expected to have produced are parsed instead.

    Every PISA interface whose two chains are proteins mapping to the same
    sequence is then matched against the predicted contacts of every
    source returned by ``pco._sources()``; the matches are plotted in the
    requested formats and one line of scores per interface is appended to
    the per-structure csv file and to the collection csv file.

    Raises
    ------
    Exception
        If both the sequence set and the structure set hold more than one
        identifier, or if no identifier can be selected from them.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()

    invals['INSEQ'] = None
    invals['INSTR'] = None
    invals['ALTDB'] = None
    invals['OUTROOT'] = None
    invals['OUTCSVPATH'] = None
    invals['UPTHRESHOLD'] = None

    # READ INPUT ARGUMENTS
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INSTR'] = ppaths.check_path(args.crystalpath[0], 'file')

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.uniprot_threshold is not None:
        try:
            invals['UPTHRESHOLD'] = float(args.uniprot_threshold[0])
        except ValueError:
            # NOTE(review): execution continues with UPTHRESHOLD left as
            # None after this message - confirm that is intended.
            logger.critical('Uniprot threshold given not valid.')
        if invals['UNICLUST_FASTA_PATH'] is None:
            invals['UNICLUST_FASTA_PATH'] = pco._uniurl

    if args.skip_conpred is True:
        skipexec = True
        if (args.hhblits_arguments is not None
                or args.uniprot_threshold is not None):
            logger.info(
                'HHblits, UniProt threshold parameters given bypassed by --skip_conpred'
            )
    else:
        skipexec = False
    cropping = args.remove_insertions
    # scoring[0] -> write the "cropped" csv; scoring[1] -> write the "full" csv.
    scoring = [cropping, not cropping]

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    # Collection csv files, same [cropped, full] layout as 'scoring'.
    invals['OUTCSVPATH'] = []
    if args.collection_file is None:
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "cropped" +
                              os.extsep + "pisacov" + os.extsep + "csv"))))
        invals['OUTCSVPATH'].append(
            ppaths.check_path(
                os.path.join(invals['OUTROOT'],
                             ("evcovsignal" + os.extsep + "full" + os.extsep +
                              "pisacov" + os.extsep + "csv"))))
    else:
        if cropping is True:
            collroot, collext = os.path.splitext(args.collection_file[0])
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))
            # os.path.splitext keeps the leading dot in the extension, so it
            # is appended as is (an extra separator would give 'x.full..csv').
            invals['OUTCSVPATH'].append(
                ppaths.check_path(collroot + os.extsep + 'full' + collext))
        else:
            invals['OUTCSVPATH'].append(None)
            invals['OUTCSVPATH'].append(
                ppaths.check_path(args.collection_file[0]))

    if args.plot_formats is None:
        plotformats = {'png'}
    else:
        # Formats other than png/eps/dat are silently dropped.
        plotformats = set()
        for element in args.plot_formats:
            if element.lower() in {'png', 'eps', 'dat'}:
                plotformats.add(element.lower())

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = pio.read(invals['INSEQ'], 'fasta')

    logger.info('Parsing structure file...')
    strs, filestrs = pio.read(invals['INSTR'], 'pdb')

    # Select the identifier to work with: the only sequence id if there is
    # just one, otherwise the sequence id matching (or contained in) the
    # only structure id.
    pdbid = None
    if len(seqs) == 1 or len(strs) == 1:
        if len(seqs) == 1:
            for key in seqs:
                pdbid = key
        elif len(seqs) > 1 and len(strs) == 1:
            for key in strs:
                for key2 in seqs:
                    if key.upper() == key2.upper():
                        pdbid = key.upper()
                    else:
                        if key2.upper() in key.upper():
                            pdbid = key2.upper()
    else:
        raise Exception(
            'More than one pdbid in sequence and/or structure set.')
    if pdbid is None:
        # Previously this case surfaced later as an unbound-variable error.
        raise Exception(
            'No pdbid could be matched between sequence and structure set.')

    seq = seqs[pdbid]

    # CROPPING AND RENUMBERING
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    instrc = os.path.join(invals['OUTROOT'], pdbid,
                          os.path.basename(invals['INSTR']))

    fseq = {}
    fmsa = {}
    if skipexec is False:
        if cropping is True:
            logger.info('Cropping and renumbering sequences, ' +
                        'structures according to SIFTS database.')
            logger.info(pcl.running('CROPS-cropstr'))
            itime = datetime.datetime.now()
            psc.runcrops(invals['INSEQ'], invals['INSTR'],
                         invals['SIFTS_PATH'], invals['UPTHRESHOLD'],
                         invals['UNICLUST_FASTA_PATH'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-cropstr', done=itime))
        else:
            logger.info('Renumbering structure ' +
                        'according to position in sequence.')
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'], invals['INSTR'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-renumber', done=itime))

        ppaths.mdir(outpdbdir)
        copyfile(invals['INSTR'], instrc)

    # Paths of the per-sequence fasta and MSA files (full-length versions).
    for i, iseq in seq.imer.items():
        fiseq = pdbid + '_' + i + '.fasta'
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid, fiseq)
        fiseq = pdbid + '_' + i + '.msa.aln'
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', fiseq)
        if skipexec is False:
            iseq.dump(fseq[i])

    # Parse cropped sequences and maps
    if cropping is True:
        amap = {}
        fcropseq = {}
        fcropmsa = {}
        for i, iseq in seq.imer.items():
            fprefix = pdbid + '_' + i + '.crops.to_uniprot'
            fmap = os.path.join(invals['OUTROOT'], pdbid,
                                fprefix + os.extsep + 'cropmap')
            amap.update(cps.parsemapfile(fmap)[pdbid])
            fcropseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                                       fprefix + os.extsep + 'fasta')
            fcropmsa[i] = os.path.join(
                invals['OUTROOT'], pdbid, 'hhblits',
                (fprefix + os.extsep + 'msa' + os.extsep + 'aln'))
            seq.set_cropmaps(amap, cropmain=True)
            if iseq.ncrops() == 0:
                logger.info('    Cropped sequence ' + iseq.oligomer_id + '_' +
                            iseq.name +
                            ' is identical to the original sequence.')
            else:
                logger.info('    Cropped sequence ' + iseq.oligomer_id + '_' +
                            iseq.name + ' is ' + str(iseq.ncrops()) +
                            ' residues ' +
                            'shorter than the original sequence.')

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    pisadir = os.path.join(invals['OUTROOT'], pdbid, 'pisa', '')
    fstr = os.path.join(
        invals['OUTROOT'],
        (pdbid + os.extsep + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb'))
    if cropping:
        fcropstr = os.path.join(
            invals['OUTROOT'], pdbid,
            (pdbid + os.extsep + 'crops' + os.extsep + 'oldids' + os.extsep +
             'to_uniprot' + os.path.splitext(invals['INSTR'])[1]))
    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'], hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            if cropping is True:
                iseq.cropmsa = themsa
                # With nothing cropped, the cropped MSA is also the full one.
                if iseq.ncrops() == 0:
                    iseq.msa = iseq.cropmsa
            else:
                iseq.msa = themsa

    # DEEP META PSICOV RUN
    ppaths.mdir(dmpdir)
    if skipexec is False:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            # The sequence file is copied into the DMP directory before
            # rundmp is called on the copy.
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # INTERFACE GENERATION, PISA
    ppaths.mdir(pisadir)
    if skipexec is False:
        logger.info('Generating interface files via PISA...')
        sfile = fcropstr if cropping is True else fstr
        logger.info(pcl.running('PISA'))
        itime = datetime.datetime.now()
        iflist = psp.runpisa(sfile, pisadir, sessionid=pdbid)
        logger.info(pcl.running('PISA', done=itime))

    # READ DATA IF SKIPEXEC USED:
    if skipexec is True:
        logger.info('Parsing already generated files...')
        # Bound outside the loop so the xml paths below can always be built.
        sfile = fcropstr if cropping is True else fstr
        for i, iseq in seq.imer.items():
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            if cropping is True:
                iseq.cropmsa = pio.read(afile, 'jones')
                if iseq.ncrops() == 0:
                    # Nothing was cropped: results are valid for the "full"
                    # csv too, and the same alignment serves as full MSA.
                    scoring[1] = True
                    # Read with pio like the other two MSA reads here.
                    iseq.msa = pio.read(afile, 'jones')
            else:
                iseq.msa = pio.read(afile, 'jones')
        ixml = os.path.join(pisadir,
                            (os.path.splitext(os.path.basename(sfile))[0] +
                             os.extsep + 'interface' + os.extsep + 'xml'))
        axml = os.path.join(pisadir,
                            (os.path.splitext(os.path.basename(sfile))[0] +
                             os.extsep + 'assembly' + os.extsep + 'xml'))

        iflist = pci.parse_interface_xml(ixml, axml)

    # CONTACT ANALYSIS AND MATCH
    logger.info('Opening output csv files...')
    resultdir = os.path.join(invals['OUTROOT'], pdbid, 'pisacov', '')
    ppaths.mdir(resultdir)
    # Per-structure csv files, again as [cropped, full].
    csvfile = []
    csvfile.append(
        os.path.join(resultdir,
                     (pdbid + os.extsep + "evcovsignal" + os.extsep +
                      "cropped" + os.extsep + "pisacov" + os.extsep + "csv")))
    csvfile.append(
        os.path.join(resultdir,
                     (pdbid + os.extsep + "evcovsignal" + os.extsep + "full" +
                      os.extsep + "pisacov" + os.extsep + "csv")))

    for n in range(2):
        if scoring[n] is True:
            cpd = True if cropping else False
            pic.csvheader(csvfile[n], cropped=cpd, pisascore=True)
            # The collection file gets a header only when it does not
            # exist yet.
            if invals['OUTCSVPATH'][n] is not None:
                if os.path.isfile(invals['OUTCSVPATH'][n]) is False:
                    pic.csvheader(invals['OUTCSVPATH'][n],
                                  cropped=cpd,
                                  pisascore=True)

    logger.info('Parsing sequence files...')
    for i, fpath in fseq.items():
        seq.imer[i].seqs['conkit'] = pio.read(fpath, 'fasta', ck=True)[0]

    logger.info('Parsing contact predictions lists...')
    conpred = {}
    matches = []
    for s in seq.imer:
        if s not in conpred:
            conpred[s] = {}
        fs = fcropseq[s] if cropping else fseq[s]
        for source, attribs in sources.items():
            # attribs[1] is used as the file extension of the prediction
            # file and attribs[2] as its format name.
            fc = os.path.splitext(os.path.basename(fs))[0]
            fc += os.extsep + attribs[1]
            confile = os.path.join(dmpdir, fc)
            conpred[s][source] = pio.read(confile, attribs[2], ck=True)[0]

    logger.info('Parsing crystal structure contacts...')
    # Exactly one entry is appended to 'matches' per interface, so
    # matches[i] always refers to iflist[i].
    for i in range(len(iflist)):
        logger.info(os.linesep + str(iflist[i]))
        fs = fcropstr if cropping else fstr
        fs = (os.path.splitext(os.path.basename(fs))[0] + os.extsep +
              "interface" + os.extsep + str(i + 1) + os.extsep + "pdb")
        spath = os.path.join(pisadir, fs)
        inputmap = pio.read(spath, 'pdb', ck=True)
        # Only interface files yielding exactly four maps are analysed.
        if len(inputmap) == 4:
            chnames = [
                iflist[i].chains[0].crystal_id, iflist[i].chains[1].crystal_id
            ]
            iflist[i].chains[0].seq_id = seq.whatseq(chnames[0])
            iflist[i].chains[1].seq_id = seq.whatseq(chnames[1])
            chseqs = [iflist[i].chains[0].seq_id, iflist[i].chains[1].seq_id]

            logger.info(iflist[i].chains)
            chtypes = [iflist[i].chains[0].type, iflist[i].chains[1].type]
            if (chseqs[0] != chseqs[1]
                    or (chtypes[0] != 'Protein' or chtypes[1] != 'Protein')):
                # Interface numbers are reported 1-based, as in the csv.
                if chtypes[0] != "Protein" or chtypes[1] != "Protein":
                    logger.info(
                        'Interface ' + str(i + 1) +
                        ' is not a Protein-Protein interface. Ignoring.')
                else:
                    logger.info('Interface ' + str(i + 1) +
                                ' is not a homodimer. Ignoring.')
                iflist[i].structure = None
                matches.append(None)
                continue
            s = chseqs[0]

            iflist[i].structure = []
            try:
                for m in range(len(inputmap)):
                    iflist[i].structure.append(inputmap[m].as_contactmap())
                    iflist[i].structure[m].id = inputmap[m].id
            except Exception:
                logger.warning('Contact Maps obtained from a legacy ConKit ' +
                               'version with no Distograms implemented.')
                # Start again from an empty list so that maps converted
                # before the failure are not kept alongside the raw ones.
                iflist[i].structure = []
                for m in range(len(inputmap)):
                    iflist[i].structure.append(inputmap[m])  # ConKit LEGACY.
            iflist[i].contactmap = iflist[i].structure[1].deepcopy()
            matches.append({})
            for source, attribs in sources.items():
                matches[i][source] = pcc.contact_atlas(
                    name=pdbid + '_' + str(s),
                    dimer_interface=iflist[i],
                    conpredmap=conpred[s][source],
                    conpredtype=source,
                    sequence=seq.imer[s])
                if cropping is True:
                    matches[i][source].set_cropmap()
                matches[i][source].remove_neighbours(mindist=2)
                matches[i][source].set_conpred_seq()
                matches[i][source].remove_intra()
                matches[i][source].make_match(filterout=attribs[3])
                for cmode, cmap in matches[i][source].conkitmatch.items():
                    if (len(cmap) > 0 and len(
                            matches[i][source].interface.structure[1]) > 0):
                        for imtype in plotformats:
                            # The match mode goes into the file name only
                            # when there is more than one mode.
                            if len(matches[i][source].conkitmatch) > 1:
                                pout = (os.path.splitext(fs)[0] + os.extsep +
                                        'match' + os.extsep + cmode +
                                        os.extsep + source + os.extsep +
                                        'con' + os.extsep + imtype)
                            else:
                                pout = (os.path.splitext(fs)[0] + os.extsep +
                                        'match' + os.extsep + source +
                                        os.extsep + 'con' + os.extsep + imtype)
                            plotpath = os.path.join(
                                os.path.dirname(csvfile[0]), pout)
                            matches[i][source].plot_map_alt(plotpath,
                                                            mode=cmode,
                                                            plot_type=imtype)
        else:
            iflist[i].structure = None
            iflist[i].contactmap = None
            matches.append(None)
            continue

    logger.info(os.linesep + 'Computing results and writing them to file...' +
                os.linesep)
    for i in range(len(iflist)):
        logger.info('Generating Interface ' + str(i + 1) + ' data...')
        if matches[i] is None:
            continue
        results = [pdbid, str(i + 1)]
        results.append(iflist[i].chains[0].crystal_id)
        results.append(iflist[i].chains[1].crystal_id)
        sid = iflist[i].chains[0].seq_id
        results.append(str(sid))
        results.append(str(seq.imer[sid].length()))
        if cropping is True:
            results.append(str(seq.imer[sid].cropmsa.meff))
        else:
            results.append(str(seq.imer[sid].msa.meff))
        results.append(str(seq.imer[sid].ncrops()))
        results.append(str(seq.imer[sid].full_length()))
        # NOTE(review): when cropping removed residues, no assignment to
        # .msa is made in this function - confirm it is set elsewhere.
        results.append(str(seq.imer[sid].msa.meff))
        for source, attribs in sources.items():
            appresults = pcs.list_scores(matches[i][source], tag=source)
            results.extend(appresults)
        results.append(str(iflist[i].stable))
        for n in range(2):
            if scoring[n] is True:
                pic.lineout(results, csvfile[n])
                # Same guard as used when writing the headers.
                if invals['OUTCSVPATH'][n] is not None:
                    pic.lineout(results, invals['OUTCSVPATH'][n])

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
Beispiel #8
0
def main():
    """Produce ROC curve data for every score column of a scores csv file.

    The csv file given on the command line is read row by row. A row whose
    first field starts with ``#`` is taken as the list of column names;
    when no matching header is available, generic names ``sc_1``,
    ``sc_2``, ... are used. In a data row the first two fields identify
    the structure and the interface, fields from the 14th up to the one
    before last are read as floating point scores, and the last field is
    the stability flag (``True``/``1`` or ``False``/``0``).

    For each score column, every distinct value observed is used as a
    threshold: interfaces scoring below it are counted as predicted
    negatives, the others as predicted positives. The resulting false and
    true positive rates are written, one ``FPR TPR`` pair per line, to a
    ``*roc.dat`` file in the output directory.

    Raises
    ------
    ValueError
        If the same interface appears twice with different values, or if
        a score field cannot be converted to float.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    csvfile = ppaths.check_path(args.scores[0], 'file')
    outdir = ppaths.check_path(args.outdir)
    ppaths.mdir(outdir)

    # Parsing scores
    scores = {}  # scores[pdbid][interface] = (list of floats, stable flag)
    names = None
    samples = {}  # samples[score name] = list of (score, stable flag)
    with open(csvfile, 'r') as fin:
        scoresin = csv.reader(fin)
        for rawentry in scoresin:
            # csv.reader already splits at commas; only the blanks that
            # follow each comma have to be removed.
            entry = [field.strip() for field in rawentry]
            if not any(entry):
                continue  # blank line
            if entry[0].startswith('#'):
                names = [entry[0][1:].strip()] + entry[1:]
                continue
            if names is None or len(names) != len(entry):
                names = ['sc_' + str(n + 1) for n in range(len(entry))]

            if entry[-1] in ('True', '1'):
                stable = True
            elif entry[-1] in ('False', '0'):
                stable = False
            else:
                # Without a stability flag the row cannot be classified.
                logger.warning('Stability flag not recognised for ' +
                               entry[0] + ', interface ' + entry[1] +
                               '. Ignoring.')
                continue
            values = [float(sc) for sc in entry[13:-1]]

            if entry[0] not in scores:
                scores[entry[0]] = {}
            if entry[1] not in scores[entry[0]]:
                scores[entry[0]][entry[1]] = (values, stable)
                for name, value in zip(names[13:-1], values):
                    samples.setdefault(name, []).append((value, stable))
            elif scores[entry[0]][entry[1]] != (values, stable):
                raise ValueError(
                    'CSV file contains different values for same interface.'
                )

    # Setting thresholds
    thr = {}
    FPR = {}
    TPR = {}
    for key, pairs in samples.items():
        thr[key] = sorted({value for value, _ in pairs})
        FPR[key] = []
        TPR[key] = []
        for t in thr[key]:
            FP = 0
            TP = 0
            FN = 0
            TN = 0
            for value, stable in pairs:
                if value < t:
                    if stable is True:
                        FN += 1
                    else:
                        TN += 1
                else:
                    if stable is True:
                        TP += 1
                    else:
                        FP += 1
            # A rate is undefined when its class has no members at all.
            FPR[key].append(FP / (FP + TN) if (FP + TN) > 0 else float('nan'))
            TPR[key].append(TP / (TP + FN) if (TP + FN) > 0 else float('nan'))
        fnameout = os.path.join(
            outdir,
            (key + os.path.splitext(os.path.basename(csvfile))[0] + 'roc.dat'))
        with open(fnameout, 'w') as fout:
            for fpr, tpr in zip(FPR[key], TPR[key]):
                fout.write(str(fpr) + ' ' + str(tpr) + '\n')

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return