def main():
    """Parse the command line, start the PISACOV logger and log the banner.

    The module-level ``logger`` is (re)bound here so that other functions in
    this module can use it after ``main`` has run.
    """
    global logger

    # The parsed namespace is not read again in this function; the call is
    # kept because argument parsing itself reports usage errors.
    args = create_argument_parser().parse_args()

    logger = pcl.pisacov_logger(level="info")
    banner = pcl.welcome()
    logger.info(banner)
def main():
    """Run the PISACOV pipeline for one sequence/structure pair.

    Steps, in order: read the command-line arguments, parse the sequence and
    structure files, optionally crop/renumber them, generate the multiple
    sequence alignment(s), run DeepMetaPSICOV, run PISA, and finally read back
    the contact predictions and interface files.

    ``skipexec`` is a two-item list of booleans: index 0 refers to the cropped
    ("default") sequence and index 1 to the original one; ``True`` means the
    external programs are *not* run for that variant.

    The function is visibly unfinished: several values (``scoring``,
    ``structure``, ``ckseq``, ``conpred``, ``interfaces``...) are built but
    not yet consumed, and the OUTPUT step is only sketched in a comment.

    Raises:
        Exception: if the sequence file does not describe exactly one PDB
            entry containing exactly one sequence.
    """
    global logger

    starttime = time.time()
    parser = create_argument_parser()
    args = parser.parse_args()

    logger = pcl.pisacov_logger(level="info")
    logger.info(pcl.welcome())

    # READ INPUT ARGUMENTS
    also_original = bool(args.add_noncropped)
    if args.initialise is not None:
        inseq = pio.check_path(args.initialise[0], 'file')
        instr = pio.check_path(args.initialise[1], 'file')
        indb = pio.check_path(pio.conf.CSV_CHAIN_PATH, 'file')
        skipexec = [False, not also_original]
    elif args.skip_conpred is not None:
        inseq = pio.check_path(args.skip_conpred[0], 'file')
        instr = pio.check_path(args.skip_conpred[1], 'file')
        skipexec = [True, True]
    elif args.skip_default_conpred is not None:
        inseq = pio.check_path(args.skip_default_conpred[0], 'file')
        instr = pio.check_path(args.skip_default_conpred[1], 'file')
        skipexec = [True, not also_original]
    else:
        # Without one of the three modes there is no input to work on; fail
        # with a usage message instead of an UnboundLocalError further down.
        parser.error('One of the input modes (initialise, skip_conpred, '
                     'skip_default_conpred) must be provided.')
    scoring = [True, also_original]

    if args.outdir is None:
        outrootdir = pio.check_path(os.path.dirname(inseq))
    else:
        outrootdir = pio.check_path(os.path.join(args.outdir[0], ''))
    pio.mdir(outrootdir)

    if args.collection_file is None:
        outcsvfile = pio.check_path(
            os.path.join(outrootdir, "pisacov_data.csv"))
    else:
        outcsvfile = pio.check_path(args.collection_file[0])

    # The header is only written later on if the collection file is new.
    try:
        pio.check_path(outcsvfile, 'file')
    except Exception:
        csvexists = False
    else:
        csvexists = True

    if args.uniprot_threshold is not None:
        thuprot, dbuprot = pio.check_uniprot(args.uniprot_threshold[0])
    else:
        thuprot = 0.0
        dbuprot = None

    if args.hhparams is not None:
        hhparameters = pio.check_hhparams(args.hhparams)
    else:
        try:
            hhparameters = pio.check_hhparams(pio.conf.HHBLITS_PARAMETERS)
        except Exception:
            # Configuration value missing or invalid: fall back to 'dmp'.
            hhparameters = pio.check_hhparams('dmp')

    # Define formats used
    sources = pio.paths.sources()
    n_sources = len(sources)

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seq = cps.parseseqfile(inseq)

    if len(seq) != 1:
        raise Exception('More than one pdbid in sequence set.')
    key = next(iter(seq))
    pdbid = key.lower()
    if len(seq[key].imer) != 1:
        # NOTE(review): this branch is about the number of entries in
        # ``imer``, not the number of pdbids; message kept as it was.
        raise Exception('More than one pdbid in sequence set.')
    # The only key of ``imer`` is used as the chain identifier.
    chid = next(iter(seq[key].imer))

    logger.info('Parsing structure file...')
    structure = cps.parsestrfile(instr)[0][pdbid]
    # logger.info('Parsing SIFTS database file...')
    # sifts = cps.import_db(indb, pdb_in=pdbid)

    # CROPPING AND RENUMBERING
    if not skipexec[0]:
        logger.info(
            'Cropping and renumbering sequences, structures according to SIFTS database.'
        )
        psys.crops.runcrops(inseq, instr, indb, thuprot, dbuprot, outrootdir)

    outpdbdir = os.path.join(outrootdir, pdbid, "")
    pio.mdir(outpdbdir)
    inseqc = os.path.join(outpdbdir, os.path.basename(inseq))
    instrc = os.path.join(outpdbdir, os.path.basename(instr))
    copyfile(inseq, inseqc)
    copyfile(instr, instrc)
    # splitext() returns a (root, ext) tuple; only the root is wanted here.
    # NOTE(review): assumes the crop map sits next to the copied sequence
    # file with a '.cropmap' extension -- confirm against the cropping step.
    cmappath = os.path.splitext(inseqc)[0] + '.cropmap'

    # Which variants are processed by the external programs below.
    any_exec = not skipexec[0] or not skipexec[1]
    have_cropmap = os.path.isfile(cmappath)
    run_cropped = have_cropmap and not skipexec[0]
    run_original = not have_cropmap or not skipexec[1]

    # MSA GENERATOR
    cseqpath = os.path.join(outpdbdir, pdbid + '.crops.to_uniprot.fasta')
    hhdir = os.path.join(outpdbdir, 'hhblits', '')
    pio.mdir(hhdir)
    neff = {}
    if any_exec:
        if hhparameters == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif hhparameters == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        if run_cropped:
            psys.msagen.runhhblits(cseqpath, hhparameters, hhdir)
            cmsaa3mfile = os.path.splitext(
                os.path.basename(cseqpath))[0] + ".msa.a3m"
            cmsaa3mpath = os.path.join(hhdir, cmsaa3mfile)
            neff['cropped'] = psys.msagen.msafilesgen(cmsaa3mpath)
            neff['original'] = None
            if not skipexec[1]:
                logger.info(
                    ' Repeating process for non-default sequence...')
        else:
            # Only mark the cropped value as missing when it was not
            # computed, mirroring the PISA section below.
            neff['cropped'] = None
            logger.info(
                ' No cropped sequence found, using original sequence instead...'
            )

        if run_original:
            psys.msagen.runhhblits(inseq, hhparameters, hhdir)
            msaa3mfile = os.path.splitext(
                os.path.basename(inseq))[0] + ".msa.a3m"
            msaa3mpath = os.path.join(hhdir, msaa3mfile)
            neff['original'] = psys.msagen.msafilesgen(msaa3mpath)

    # DEEP META PSICOV RUN
    if any_exec:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        dmpdir = os.path.join(outpdbdir, 'dmp', '')
        pio.mdir(dmpdir)
        if run_cropped:
            psys.dmp.rundmp(cseqpath, cmsaa3mpath, dmpdir)
            if not skipexec[1]:
                logger.info(
                    ' Repeating process for non-default sequence...')
        else:
            logger.info(
                ' No cropped sequence found, using original sequence instead...'
            )
        if run_original:
            psys.dmp.rundmp(inseq, msaa3mpath, dmpdir)

    # INTERFACE GENERATION, PISA
    cstrpath = os.path.join(outpdbdir,
                            pdbid + '.oldids.crops.to_uniprot.pdb')
    pisadir = os.path.join(outpdbdir, 'pisa', '')
    # Both keys always exist so the interface parsing below cannot hit a
    # KeyError when PISA was skipped for both variants.
    n_ifaces = {'cropped': None, 'original': None}
    if any_exec:
        logger.info('Generating interface files via PISA...')
        if run_cropped:
            n_ifaces['cropped'] = psys.pisa.runpisa(cstrpath, pisadir)
            if not skipexec[1]:
                logger.info(
                    ' Repeating process for non-default sequence...')
        else:
            logger.info(
                ' No cropped sequence found, using original sequence instead...'
            )
        if run_original:
            n_ifaces['original'] = psys.pisa.runpisa(instr, pisadir)

    # CONTACT ANALYSIS AND MATCH
    resultdir = os.path.join(outpdbdir, 'results', '')
    logger.info('Opening output csv files...')
    pdbcsvfile = os.path.join(resultdir, pdbid + ".evcovsignal.csv")
    pio.outcsv.csvheader(pdbcsvfile)
    if not csvexists:
        pio.outcsv.csvheader(outcsvfile)

    logger.info('Parsing contact predictions lists...')
    ckseq = ckio.read(inseq, 'fasta')
    modes = ('cropped', 'original')
    # One sub-dictionary per mode, created up front.
    conpred = {mode: {} for mode in modes}
    # The crop map does not depend on source or mode: parse it once.
    cropmapping = cps.parsemapfile(cmappath)
    for source, attribs in sources.items():
        for mode in modes:
            seqfile = cseqpath if mode == 'cropped' else inseq
            confile = (os.path.splitext(os.path.basename(seqfile))[0]
                       + attribs[1])
            conpath = os.path.join(outpdbdir, attribs[0], confile)
            if not os.path.isfile(conpath):
                conpred[mode][source] = None
                continue
            conpred[mode][source] = ckio.read(conpath, attribs[2])[0]
            # NOTE(review): the back-mapping is applied to both modes, as
            # before; confirm it is meant for the 'original' mode too.
            backmap = cropmapping[pdbid][chid]['cropbackmap']
            for contact in conpred[mode][source]:
                contact.res1_seq = backmap[contact.res1_seq]
                contact.res2_seq = backmap[contact.res2_seq]

    # NOT SURE IT IS WORKING WITH SEVERAL CHAINS OF SAME SEQUENCE. CHECK EVERYTHING.
    logger.info(' Parsing PISA interfaces...')
    interfaces = {}
    for mode in modes:
        if n_ifaces[mode] is None:
            interfaces[mode] = None
            continue
        strfile = cstrpath if mode == 'cropped' else instr
        stem = os.path.splitext(strfile)[0]
        interfaces[mode] = [
            ckio.read(stem + ".interface." + str(i + 1) + ".pdb", 'pdb')
            for i in range(int(n_ifaces[mode]))
        ]

    # OUTPUT
    # TODO: write the results; when appending to the collection csv, drop
    # repeated rows (read all rows with csv.reader, keep the first occurrence
    # of each, rewrite with csv.writer).

    return
def main():
    """Run the whole analysis for one PDB entry and write the reports.

    Stages, in order:

    1. Parse and validate the command-line arguments.
    2. Obtain the target sequence (optionally the "biological" one) and the
       multiple sequence alignment (HHblits, directly or via DeepMetaPSICOV).
    3. Run DeepMetaPSICOV to obtain the contact prediction lists.
    4. Run PISA (or read its xml output) to obtain the interface PDB files.
    5. Score every interface against every contact prediction source.
    6. Write the ``.pisacov.out.dat`` table and the summary log, and echo
       the same summary and the timing through ``pmo.printout``.

    Per interface and source, ``scores`` holds four values that the summary
    labels as Jaccard index, average score, summed score and probabilistic
    score; ``n_contacts_all`` holds the predicted contact count, the PDB
    contact count, the number of true positives and their proportion.
    Entries that cannot be computed are set to the string ``"***"``.

    Raises:
        ValueError: if the sorting option is not one of the accepted values.
    """
    global logger

    parser = create_argument_parser()
    args = parser.parse_args()

    logger = pcl.crops_logger(level="info")
    logger.info(pcl.welcome())

    # Most of the values below are not read again in this function; the
    # calls are kept because they validate the corresponding arguments.
    inseq = check_path(args.input_seqpath[0], 'file')
    indb = check_path(args.input_database[0], 'file')
    if args.uniprot_threshold is not None:
        insprot = check_path(args.uniprot_threshold[1])
        minlen = float(args.uniprot_threshold[0])
    else:
        insprot = None
        minlen = 0.0
    targetlbl = ctg.target_format(indb, terms=args.terminals, th=minlen)
    infixlbl = ctg.infix_gen(indb, terms=args.terminals)

    if args.outdir is None:
        outdir = check_path(os.path.dirname(inseq), 'dir')
    else:
        outdir = check_path(os.path.join(args.outdir[0], ''), 'dir')

    if args.sort is not None:
        sorter = args.sort[0].lower()
        if sorter not in ('ncrops', 'percent', 'ncropsin', 'percentin'):
            raise ValueError("Arguments for sorting option can only be either 'ncrops' or 'percent'.")

    #############################################################
    sources = ["deepmetapsicov", "psicov"]
    confiledir = ["deepmetapsicov", "deepmetapsicov"]
    confilesuffix = ["deepmetapsicov.con", "psicov"]
    n_sources = len(sources)

    # BEGIN: DEBUGGING ONLY
    # Skip the execution of PISA, HHBLITS and DeepMetaPSICOV (?) [bool]
    # SKIP_EXEC=[True, True, True]
    SKIP_EXEC = [False, False, False]
    # Skip the creation of new dir (?) [bool]
    # SKIP_MKDIR=True
    SKIP_MKDIR = False
    # END: DEBUGGING ONLY

    # Create output directory
    if not SKIP_MKDIR:
        pmo.mkdirout()
        pmo.printout('Output directory created', extraline=True)

    # Used below as (number, datetime): item 0 is subtracted to obtain the
    # elapsed time and item 1 is formatted with strftime.
    # NOTE: "%-d" is a platform-specific strftime directive.
    timefmt = "%-d %B %Y, %X"
    starttime = pmi2.gettime()
    pmo.printout('Starting Time: ' + starttime[1].strftime(timefmt) + ' UTC\n\n')

    # Check that input values are correct
    hhparameters = pmi1.hhparam('logit')
    pmi1.minneigh('logit')
    pmi1.scorethreshold("deepmetapsicov", 'logit')
    pmi1.scorethreshold("psicov", 'logit')

    stars = '*********************************************************'

    #####################################################################
    ##### MSA generator #################################################
    pmo.printout(stars)
    pmo.printout('*** MULTIPLE SEQUENCE ALIGNMENT *************************', extraline=True)

    pmo.printout('Importing RCSB PDB sequence from fasta file...')
    fasta_seq = conkit.io.read(pmin.SEQUENCE_PATH, "fasta")[0]
    pmo.printout(fasta_seq)

    if pmin.USE_BIOCHAIN:
        pmo.printout('Obtaining Biological sequence from fasta file...')
        if not SKIP_EXEC[1]:
            biological_seq, seqpath, bio = pms.crop_fasta(
                fasta_seq, pmo.output_tmpdir("deepmetapsicov"))
            if not bio:
                shutil.copyfile(pmin.SEQUENCE_PATH, seqpath)
        else:
            # Reuse the files of a previous run: try the '.bio.fasta' file
            # first and fall back to the plain '.fasta' one.
            tmpfile = pmi2.pdbid() + '.bio.fasta'
            seqpath = os.path.join(pmo.output_tmpdir("deepmetapsicov"), tmpfile)
            try:
                biological_seq = conkit.io.read(seqpath, "fasta")[0]
                bio = True
            except Exception:
                bio = False
                tmpfile = pmi2.pdbid() + '.fasta'
                seqpath = os.path.join(pmo.output_tmpdir("deepmetapsicov"), tmpfile)
                biological_seq = conkit.io.read(seqpath, "fasta")[0]
        seq = biological_seq
        tmpfile = pmi2.pdbid() + pms.biofile(bio) + '.fasta'
        newseqpath = os.path.join(pmo.output_dir(), tmpfile)
        shutil.copyfile(seqpath, newseqpath)
        if bio:
            pmo.printout('Biological sequence:')
            pmo.printout(biological_seq)
        else:
            pmo.printout('WARNING: Biological Sequence not available. Using RCSB PDB fasta sequence for analysis (including cloning artifacts)', errorlog=True, extraline=True)  # LOGGING
    else:
        bio = False
        pmo.printout('Using RCSB PDB fasta sequence for analysis (including cloning artifacts)', extraline=True)
        # Copy the input fasta, under its own name, to both the output and
        # the DeepMetaPSICOV working directories; the latter copy is used.
        seqname = os.path.basename(pmin.SEQUENCE_PATH)
        shutil.copyfile(pmin.SEQUENCE_PATH,
                        os.path.join(pmo.output_dir(), seqname))
        seqpath = os.path.join(pmo.output_tmpdir("deepmetapsicov"), seqname)
        shutil.copyfile(pmin.SEQUENCE_PATH, seqpath)
        seq = fasta_seq
    pmo.printout('')

    if pmin.HHBLITS_VIA_DMP:
        pmo.printout("Using DeepMetaPSICOV's execution of HHblits...")
        msapath = ''  # placeholder; the real path is returned by rundmp
    elif not SKIP_EXEC[1]:
        pmo.printout('Creating Multiple Sequence Alignment with HHblits...')
        msa, msapath = pmp.runhhblits(bio, param=hhparameters, spath=seqpath)
    else:
        pmo.printout('Reading Multiple Sequence Alignment...')
        msafile = pmi2.pdbid() + pms.biofile(bio) + ".msa.aln"
        msapath = os.path.join(pmo.output_dir(), msafile)
        msa = conkit.io.read(msapath, 'jones')

    msaformat = 'jones'
    sit = 0.7

    #####################################################################
    ##### DeepMetaPSICOV: Contact Prediction ############################
    pmo.printout(stars)
    pmo.printout('*** CONTACT PREDICTION **********************************', extraline=True)

    if not SKIP_EXEC[2]:
        pmo.printout('Running DeepMetaPSICOV for Contact prediction list...')
    else:
        pmo.printout('Skipping DeepMetaPSICOV execution.')

    # Arguments: input sequence fasta file path, MSA file path.
    if pmin.HHBLITS_VIA_DMP:
        msa, msapath = pmp.rundmp(seqpath, msapath,
                                  skiphhblits=SKIP_EXEC[1],
                                  skipdmp=SKIP_EXEC[2])
    elif not SKIP_EXEC[2]:
        pmp.rundmp(seqpath, msapath)

    # Derived only once msapath is final: with HHBLITS_VIA_DMP it is just an
    # empty placeholder until rundmp has returned.
    msacovpath = os.path.splitext(msapath)[0] + ".coverage.png"

    #####################################################################
    ##### PISA : OBTAIN INTERFACE PDB FILES #############################
    pmo.printout(stars)
    pmo.printout('*** INTERFACE IDENTIFICATION ****************************', extraline=True)

    # Obtain Number of interfaces in PDB and produce Interface PDB files
    # (including renumbered)
    if not SKIP_EXEC[0]:
        pmo.printout('Running PISA...')
        n_interfaces = pmp.runpisa(bio)
        for nif in range(n_interfaces):
            pdbintpath = os.path.join(
                pmo.output_tmpdir("pisa"),
                pmi2.pdbid() + ".interface." + str(nif + 1) + ".pdb")
            pms.renumberpdbs(pdbintpath, fasta_seq, bio)
    else:
        pmo.printout('Reading PISA xml file...')
        n_interfaces = pmp.n_int_xml()

    #####################################################################
    ##### Interface Confidence Scores ###################################
    pmo.printout(stars)
    pmo.printout('*** SCORING INTERFACES **********************************', extraline=True)

    pmo.printout('Importing contact prediction file ...', extraline=True)
    conpred = []
    conpred_id = []
    conpredpath = []
    for n in range(n_sources):
        conpredfile = pmi2.pdbid() + pms.biofile(bio) + "." + confilesuffix[n]
        conpredpath.append(os.path.join(pmo.output_tmpdir(confiledir[n]), conpredfile))
        conpred.append(conkit.io.read(conpredpath[n], 'psicov')[0])
        conpred[n].sequence = seq
        conpred[n].set_sequence_register()
        conpred_id.append(conpred[n].id)

    pmo.printout('Remove excluded contact scores ...', extraline=True)
    for n in range(n_sources):
        conpred[n] = pmc.filter_contacts(conpred[n], sources[n])

    cntint = 0
    cntlig = 0
    cntunk = 0
    interfacetype = [0] * (n_interfaces + 1)
    scores = [[[0 for k in range(4)] for j in range(n_sources)]
              for i in range(n_interfaces)]
    n_contacts_all = [[[0 for k in range(4)] for j in range(n_sources)]
                      for i in range(n_interfaces)]
    ndec = 6

    def blank_scores(iface):
        """Mark every score of interface *iface* as not available."""
        for src in range(n_sources):
            scores[iface][src][:] = ["***"] * 4

    for nif in range(n_interfaces):
        pmo.printout('Importing Interface ' + str(nif + 1) + ' PDB file...')
        maps = pmc.import_interfacepdb(nif, bio)
        conpredInt = []
        pmo.printout(' Number of maps : ' + str(len(maps)))
        if len(maps) == 4:
            # maps[1] is used as the interface contact map and maps[0] /
            # maps[3] as the two intrachain maps (see the file names below).
            interfacetype[nif] = "MM"
            for n in range(n_sources):
                conpredInt.append(conpred[n].deepcopy())
                if pmin.REMOVE_INTRA_CONTACTS:
                    pmo.printout(" Removing those contacts from interface that also appear as intramolecular contacts ...")
                    for element in [0, 3]:
                        conpredInt[n] = pmc.remove_intra_contacts(conpredInt[n], maps[element])
                n_contacts_all[nif][n][0] = conpredInt[n].ncontacts  # Total number of contacts predicted for interface
                n_contacts_all[nif][n][1] = maps[1].ncontacts  # Total number of contacts from interface PDB file
            # pmo.printout(conpredInt)
            pmo.printout(maps[1])
            cntint += 1

            # Write contact list
            pmo.printout(' Writing contact lists ...')
            if pmin.REMOVE_INTRA_CONTACTS:
                for element in [0, 3]:
                    chainid = str(1) if element == 0 else str(2)
                    confile = pmi2.pdbid() + ".pdb.interface." + str(nif + 1) + ".intrachain" + chainid + ".conkit.con"
                    tmppath = os.path.join(pmo.output_tmpdir("pisacov"), confile)
                    conkit.io.write(tmppath, 'psicov', hierarchy=maps[element])
            confile = pmi2.pdbid() + ".pdb.interface." + str(nif + 1) + ".conkit.con"
            contactpath = os.path.join(pmo.output_tmpdir("pisacov"), confile)
            conkit.io.write(contactpath, 'psicov', hierarchy=maps[1])

            pmo.printout(' Matching contact prediction and interface contact lists ...', extraline=True)
            for n in range(n_sources):
                conpredInt_pdb = pmc.pred_pdb_matchlist(conpredInt[n], contactpath)
                conpredInt_pdb.id = " Matching contacts of " + conpred_id[n]
                conpredInt_pdb.sequence = seq.deepcopy()
                conpredInt_pdb.set_sequence_register()
                if conpredInt_pdb.ncontacts == 0:
                    # Interfaces are numbered from 1 in every other message
                    # and file name, so report nif + 1 here as well.
                    pmo.printout(" WARNING: No contacts found in the filtered interface " + str(nif + 1) + " pdb file. SKIP this result type.", errorlog=True)
                    scores[nif][n][:] = ["***"] * 4
                    n_contacts_all[nif][n][3] = "***"
                    continue

                (map_matched, n_true,
                 jaccard, avgscore, accscore, probscore) = pmc.match_maps(conpredInt_pdb, maps[1])
                n_contacts_all[nif][n][2] = n_true
                scores[nif][n][:] = [
                    round(value, ndec)
                    for value in (jaccard, avgscore, accscore, probscore)
                ]
                n_contacts_all[nif][n][3] = n_true / n_contacts_all[nif][n][1]

                ## Plot matched map
                pngfile = pmi2.pdbid() + ".interface." + str(nif + 1) + "." + sources[n] + ".con.png"
                confile = pmi2.pdbid() + "." + sources[n] + ".interface." + str(nif + 1) + ".conkit.con"
                pngpath = os.path.join(pmo.output_dir(), pngfile)
                fig = conkit.plot.ContactMapFigure(map_matched, reference=maps[1])
                fig.savefig(pngpath, overwrite=True)
                tmppath = os.path.join(pmo.output_dir(), confile)
                conkit.io.write(tmppath, 'psicov', hierarchy=conpredInt_pdb)
                plt.close('all')
        elif len(maps) == 2:
            interfacetype[nif] = "Unk"
            pmo.printout(" WARNING: Unexpected number of maps (2)", errorlog=True)
            pmo.printout('----> REVISE CONTACT MAPS FOR THIS INTERFACE', errorlog=True, extraline=True)
            cntunk += 1
            blank_scores(nif)
            # Dump every map to its own numbered file for manual revision.
            # Each file gets its own map, not maps[1] repeated under
            # different names.
            #### CHECK FILES CREATED WHEN FIXING THIS SECTION.
            for cnt, mapn in enumerate(maps):
                confile = pmi2.pdbid() + ".pdb." + str(cnt + 1) + ".interface." + str(nif + 1) + ".conkit.con"
                tmppath = os.path.join(pmo.output_dir(), confile)
                conkit.io.write(tmppath, 'psicov', hierarchy=mapn)
        elif len(maps) == 1:
            interfacetype[nif] = "LM"
            pmo.printout(" ... Ligand-monomer interface detected")
            pmo.printout(' SKIP', extraline=True)
            cntlig += 1
            blank_scores(nif)
        else:
            interfacetype[nif] = "Unk"
            pmo.printout(" WARNING: Unexpected number of maps", errorlog=True)
            pmo.printout(' SKIP', extraline=True, errorlog=True)
            cntunk += 1
            blank_scores(nif)

    #####################################################################
    ##### OUTPUT ########################################################
    pmo.printout(stars)
    pmo.printout('*** FINAL OUTPUT ****************************************', extraline=True)

    pmo.printout('Printing final data to file...', extraline=True)

    datfile = pmi2.pdbid() + ".pisacov.out.dat"
    datpath = os.path.join(pmo.output_dir(), datfile)
    now = pmi2.gettime()
    space = str(ndec + 4)
    with open(datpath, 'w') as out:
        if bio:
            out.write("# PDB id: " + pmi2.pdbid() + ' - Contact def: 0<d<8 Angstroms - Cloning artifacts removed from fasta file sequence - ')
        else:
            out.write("# PDB id: " + pmi2.pdbid() + ' - Contact def: 0<d<8 Angstroms - Original fasta file sequence used - ')
        out.write(now[1].strftime(timefmt) + ' UTC')
        out.write('\n')
        datahead = '# IF_id IF_type n_pdb '
        for n in range(n_sources):
            datahead += 'n_' + sources[n] + ' '
            datahead += 'n_' + sources[n] + '/n_pdb '
            datahead += 'Av_score_' + sources[n] + ' '
            datahead += 'Acc_score_' + sources[n] + ' '
            datahead += 'P_' + sources[n] + ' '
        out.write(datahead + '\n')
        headformat = '{:>4s}{:>4s}{:>4s}'
        # One count column plus the four scores for every source.
        srcformat = '{:>4s}' + ('{:>' + space + 's}') * 4
        for nif in range(n_interfaces):
            outstring = headformat.format(str(nif + 1), interfacetype[nif],
                                          str(n_contacts_all[nif][0][1]))
            for n in range(n_sources):
                outstring += srcformat.format(
                    str(n_contacts_all[nif][n][2]),
                    *(str(score) for score in scores[nif][n]))
            out.write(outstring + '\n')

    # Labels shared by the HHblits parameter listings below; the index in the
    # tuple is the index in the list returned by pmi1.hhparam().
    hh_labels = (
        ' Number of iterations: ',
        ' E-value cutoff for inclusion in result alignment: ',
        ' Non-redundant sequences to keep: ',
        ' Minimum coverage with master sequence (%): ',
        ' Maximum pairwise sequence identity: ',
    )
    # Labels of the four values stored in scores[nif][n], in order.
    score_labels = (
        ' Jaccard Index (|PDB \u22c2 pred| / |PDB U pred|): ',
        ' Average value of the scores of True Positives: ',
        ' Sum of all the scores of True Positives: ',
        ' Probabilistic score for whole interface: ',
    )

    sumlogfile = pmi2.pdbid() + ".pisacov.out.summary.log"
    sumlogpath = os.path.join(pmo.output_dir(), sumlogfile)
    now = pmi2.gettime()
    # The Jaccard label contains a non-ASCII character, so the encoding is
    # fixed rather than left to the locale default.
    with open(sumlogpath, 'w', encoding='utf-8') as out:
        outformat = '{:<55s}{:<80s}'

        def row(label, value, end='\n'):
            """Write one 'label value' line of the summary."""
            out.write(outformat.format(label, value) + end)

        out.write(stars + '\n')
        out.write('*** S U M M A R Y ** P I S A C O V ********************\n')
        out.write(stars + '\n\n')
        out.write(now[1].strftime(timefmt) + ' UTC\n\n')
        out.write('Protein PDB ID: ' + pmi2.pdbid() + '\n\n')

        out.write(stars + '\n')
        out.write('--- Multiple Sequence Alignment ---\n\n')
        row(' MSA file: ', msapath)
        row(' MSA format: ', msaformat)
        row(' Sequence Identity Threshold: ', str(sit))
        row(' Length of the Target Sequence: ', str(msa.top_sequence.seq_len))
        row(' Total number of sequences: ', str(msa.nseq))
        row(' Number of Effective Sequences: ', str(msa.meff))
        row(' Proportion of Effective Sequences: ',
            str(round(100 * msa.meff / msa.nseq, 2)) + ' %')
        row(' Sequence Coverage Plot: ', msacovpath, end='\n\n')
        row(' MSA created with: ', pmin.HHSUITE_PATH)
        row(' Reference database name: ', pmin.HHBLITS_DATABASE_NAME)
        row(' Reference database path: ', pmin.HHBLITS_DATABASE_DIR)
        for idx, label in enumerate(hh_labels):
            last = idx == len(hh_labels) - 1
            row(label, str(pmi1.hhparam()[idx]), end='\n\n' if last else '\n')

        out.write(stars + '\n')
        out.write('--- Contact Prediction ---\n\n')
        row(' Contact prediction list(s) created with: ', pmin.DMP_PATH)
        for n in range(n_sources):
            row(' Contact prediction file (' + sources[n] + '): ', conpredpath[n])
        row(' Minimum distance within sequence (neigh. cutoff): ',
            str(pmi1.minneigh()))
        for n in range(n_sources):
            row(' Contact prediction score threshold (' + sources[n] + '): ',
                str(pmi1.scorethreshold(sources[n])))
        out.write('\n')

        out.write(stars + '\n')
        out.write('--- Interfaces ---\n\n')
        row(' Total Number of Interfaces: ', str(n_interfaces))
        row(' Number of Intramolecular interfaces: ', str(cntint))
        row(' Number of Ligand-monomer interfaces: ', str(cntlig))
        row(' Number of Unidentified interfaces: ', str(cntunk), end='\n\n')

        for nif in range(n_interfaces):
            out.write(' --- Interface ' + str(nif + 1) + ' ---\n')
            if interfacetype[nif] == "MM":
                row(' Interface type:', 'monomer - monomer (MM)')
                row(' Total number of intermolecular contacts (pdb): ',
                    str(n_contacts_all[nif][0][1]))
                for n in range(n_sources):
                    extral = '\n\n' if n == n_sources - 1 else '\n'
                    out.write(' + ' + sources[n] + ' Scores +\n')
                    row(' Number of True Positives : ', str(n_contacts_all[nif][n][2]))
                    row(' Proportion of True positives: ', str(n_contacts_all[nif][n][3]))
                    for idx, label in enumerate(score_labels):
                        last = idx == len(score_labels) - 1
                        row(label, str(scores[nif][n][idx]),
                            end=extral if last else '\n')
            elif interfacetype[nif] == "LM":
                row(' Interface type:', 'ligand - monomer (LM)', end='\n\n')
            elif interfacetype[nif] == "Unk":
                row(' Interface type:', 'unidentified (Unk)', end='\n\n')

    # Same summary, echoed through the program's own output channel.
    pmo.printout(stars)
    pmo.printout('*** S U M M A R Y ***************************************')
    pmo.printout(stars, extraline=True)
    pmo.printout('Protein PDB ID: %s' % pmi2.pdbid(), extraline=True)
    pmo.printout('--- Multiple Sequence Alignment ---')
    pmo.printout(' MSA file: ' + msapath)
    pmo.printout(' MSA format: ' + msaformat)
    pmo.printout(' Sequence Identity Threshold: ' + str(sit))
    pmo.printout(' Length of the Target Sequence: ' + str(msa.top_sequence.seq_len))
    pmo.printout(' Total number of sequences: ' + str(msa.nseq))
    pmo.printout(' Number of Effective Sequences: ' + str(msa.meff))
    pmo.printout(' Proportion of Effective Sequences: ' + str(round(100 * msa.meff / msa.nseq, 2)) + ' %')
    pmo.printout(' Sequence Coverage Plot: ' + msacovpath, extraline=True)
    pmo.printout(' MSA created with: ' + pmin.HHSUITE_PATH)
    pmo.printout(' Reference database name: ' + pmin.HHBLITS_DATABASE_NAME)
    pmo.printout(' Reference database path: ' + pmin.HHBLITS_DATABASE_DIR)
    for idx, label in enumerate(hh_labels):
        pmo.printout(label + str(pmi1.hhparam()[idx]),
                     extraline=idx == len(hh_labels) - 1)
    pmo.printout('--- Contact Prediction ---')
    pmo.printout(' Contact prediction list created with: ' + pmin.DMP_PATH)
    for n in range(n_sources):
        pmo.printout(' Contact prediction file (' + sources[n] + '): ' + conpredpath[n])
    pmo.printout(' Minimum distance within sequence (neigh. cutoff): ' + str(pmi1.minneigh()))
    for n in range(n_sources):
        pmo.printout(' Contact prediction score threshold (' + sources[n] + '): ' + str(pmi1.scorethreshold(sources[n])),
                     extraline=n == n_sources - 1)
    pmo.printout('--- Interfaces ---')
    pmo.printout(' Number of Interfaces: ' + str(n_interfaces) + ", of which:")
    pmo.printout(' Number of Intramolecular interfaces: ' + str(cntint))
    pmo.printout(' Number of Ligand-monomer interfaces: ' + str(cntlig))
    pmo.printout(' Number of Unidentified interfaces: ' + str(cntunk), extraline=True)

    for nif in range(n_interfaces):
        pmo.printout(' --- Interface ' + str(nif + 1) + ' ---', extraline=True)
        if interfacetype[nif] == "MM":
            pmo.printout(' Interface type: monomer - monomer (MM)')
            pmo.printout(' Total number of intermolecular contacts (pdb): ' + str(n_contacts_all[nif][0][1]))
            for n in range(n_sources):
                lastsource = n == n_sources - 1
                pmo.printout(' + Scores (' + sources[n] + '):')
                pmo.printout(' Number of True Positives : ' + str(n_contacts_all[nif][n][2]))
                pmo.printout(' Proportion of True positives: ' + str(n_contacts_all[nif][n][3]))
                for idx, label in enumerate(score_labels[:-1]):
                    pmo.printout(label + str(scores[nif][n][idx]))
                pmo.printout(score_labels[-1] + str(scores[nif][n][3]),
                             extraline=lastsource)
        elif interfacetype[nif] == "LM":
            pmo.printout(' Interface type: ligand - monomer (LM)', extraline=True)
        elif interfacetype[nif] == "Unk":
            pmo.printout(' Interface type: unidentified (Unk)', extraline=True)

    #####################################################################
    ##### TIMEIT ########################################################
    pmo.printout(stars)
    pmo.printout('*** T I M I N G *****************************************')
    pmo.printout(stars, extraline=True)

    endtime = pmi2.gettime()
    elapsed = endtime[0] - starttime[0]
    totaltime = pmi2.readabletime(elapsed)
    with open(sumlogpath, 'a', encoding='utf-8') as out:
        out.write(stars + '\n')
        out.write('*** T I M I N G *****************************************\n')
        out.write(stars + '\n\n')
        out.write('Starting Time: ' + starttime[1].strftime(timefmt) + ' UTC\n\n')
        out.write('Ending Time: ' + endtime[1].strftime(timefmt) + ' UTC\n\n')
        out.write('Process Wallclock time: ' + str(elapsed) + ' s\n')
        out.write(' or, equivalently: ' + totaltime + '\n\n')

    pmo.printout('Starting Time: ' + starttime[1].strftime(timefmt) + ' UTC', extraline=True)
    pmo.printout('Ending Time: ' + endtime[1].strftime(timefmt) + ' UTC', extraline=True)
    pmo.printout('Process Wallclock time: ' + str(elapsed) + ' s')
    pmo.printout(' or, equivalently: ' + totaltime, extraline=True)
def main():
    """View or update the PISACOV configuration file.

    Depending on the command-line arguments this either

    * prints the current configuration file, or only its path, and returns;
    * or collects new values -- from individual options, or interactively
      for every key when ``args.conf_file`` is true -- validates them with
      ``pco._check_input`` and writes the result with ``_outconffile``.

    Unless ``args.conf_file`` is true, the values already stored in the
    configuration (``pco._parse_conf()``) are the starting point, so only
    the keys given on the command line change.
    """
    global logger

    parser = create_argument_parser()
    args = parser.parse_args()

    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    kwlist = pco._default_keys()
    configin = {}

    if args.update_sifts_database is not None:
        outpath = ppaths.check_path(args.update_sifts_database[0], 'either')
        if os.path.isdir(outpath) is True:
            # A directory was given: store the database under a fixed name.
            outpath = os.path.join(outpath, ('pdb_chain_uniprot'
                                             + os.extsep + 'csv'))
        pol.getsifts(outpath)
        configin['SIFTS_PATH'] = outpath

    if args.view_configuration is True:
        print('** ' + __prog__ + " v." + __version__
              + ' configuration file **' + os.linesep)
        with open(pconf.__file__) as f:
            for line in f:
                print(_lineformat(line), end='')
        print('** End of configuration file **' + os.linesep)
        return

    if args.get_confpath is True:
        print(pconf.__file__)
        return

    if args.conf_file is False:
        newconf = pco._parse_conf()
    else:
        newconf = {}

    if args.reset_hhblits_arguments is True:
        newconf['HHBLITS_PARAMETERS'] = pco._default_values(
            'HHBLITS_PARAMETERS')

    # Values given through individual options. An explicit --sifts_path
    # takes precedence over the path of a freshly updated SIFTS database.
    if args.sifts_path is not None:
        configin['SIFTS_PATH'] = args.sifts_path[0]
    if args.pisa_path is not None:
        configin['PISA_PATH'] = args.pisa_path[0]
    if args.hhblits_path is not None:
        configin['HHBLITS_PATH'] = args.hhblits_path[0]
    if args.dmp_path is not None:
        configin['DMP_PATH'] = args.dmp_path[0]
    if args.uniclust_path is not None:
        configin['UNICLUST_FASTA_PATH'] = args.uniclust_path[0]
    if args.hhblits_arguments is not None:
        configin['HHBLITS_PARAMETERS'] = args.hhblits_arguments[0]
    if args.neighbours is not None:
        # An explicit neighbour distance switches off the removal of
        # intramolecular contacts (see the NEIGHBOURS_MINDISTANCE prompt).
        configin['NEIGHBOURS_MINDISTANCE'] = args.neighbours[0]
        configin['REMOVE_INTRA_CONTACTS'] = False
    if isinstance(args.hhblits_location, list) is True:
        configin['HHBLITS_DATABASE_NAME'] = args.hhblits_location[0]
        configin['HHBLITS_DATABASE_DIR'] = args.hhblits_location[1]

    # Prompts shown, per key, in interactive mode.
    compmsg = {
        'SIFTS_PATH': "Please, enter the path to the SIFTS csv file:\n",
        'PISA_PATH': "Please, enter the path to the PISA executable:\n",
        'HHBLITS_PATH': "Please, enter the path to the HHBLITS executable:\n",
        'HHBLITS_DATABASE_NAME':
            ("Please, enter the name of the HHBLITS database name\n" +
             "as requested by DeepMetaPSICOV (e.g. uniclust30_2018_08):\n"),
        'HHBLITS_DATABASE_DIR':
            "Please, enter the directory of the HHBLITS database:\n",
        'DMP_PATH':
            "Please, enter the path to the DeepMetaPSICOV executable:\n",
        'HHBLITS_PARAMETERS':
            ("Please, enter the 5 HHBLITS parameters.\n" +
             "#iterations, E-value cutoff, Non-redundant seqs to keep, " +
             " MinimumCoverageWithMasterSeq(%), and MaxPairwiseSequenceIdentity.\n" +
             "Leave empty for DMP default (3, 0.001, 'inf', 50, 99).\n"),
        'UNICLUST_FASTA_PATH':
            ("Please, enter the path to the UniClust fasta file.\n"
             "An empty input will deactivate this option.\n"),
        'NEIGHBOURS_MINDISTANCE':
            ("Press ENTER for intramolecular contacts " +
             "to be removed from intermolecular contact lists.\n" +
             "Otherwise, enter the minimum distance to be cosidered " +
             "between neighbours. This will override the removal of" +
             "intramolecular contacts that will be now ignored.\n")
    }

    # The last key of kwlist is left out of the loop; 'REMOVE_INTRA_CONTACTS'
    # is only ever set as a by-product of 'NEIGHBOURS_MINDISTANCE'.
    for keystr in kwlist[:-1]:
        if args.conf_file is True:
            # Interactive mode: keep asking until the value is accepted.
            while True:
                newval = input(compmsg[keystr])
                try:
                    newconf[keystr] = pco._check_input(newval, keystr)
                    if keystr == 'NEIGHBOURS_MINDISTANCE':
                        # Empty answer: remove intramolecular contacts.
                        newconf['REMOVE_INTRA_CONTACTS'] = (
                            newval is None or newval == "")
                except Exception:
                    print("Not a valid input. Please, try again:")
                else:
                    break
        elif keystr in configin:
            newconf[keystr] = pco._check_input(configin[keystr], keystr)
            if keystr == 'NEIGHBOURS_MINDISTANCE':
                newconf['REMOVE_INTRA_CONTACTS'] = pco._check_input(
                    configin['REMOVE_INTRA_CONTACTS'],
                    'REMOVE_INTRA_CONTACTS')

    if 'REMOVE_INTRA_CONTACTS' in configin:
        newconf['REMOVE_INTRA_CONTACTS'] = pco._check_input(
            configin['REMOVE_INTRA_CONTACTS'], 'REMOVE_INTRA_CONTACTS')
        if newconf['REMOVE_INTRA_CONTACTS'] is True:
            # Assignment, not comparison: '== 2' here would have no effect.
            newconf['NEIGHBOURS_MINDISTANCE'] = 2

    _outconffile(newconf)

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
def main():
    """Prepare the inputs and run the external programs of the pipeline.

    Reads the command-line arguments and the configuration, crops or
    renumbers the input structure with CROPS and, unless ``--skip_conpred``
    was given, runs HHblits, DeepMetaPSICOV and PISA. Everything is written
    under ``<outroot>/<pdbid>/``.

    Raises:
        Exception: if the sequence/structure inputs do not identify a
            single PDB id.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()
    for blank in ('INSEQ', 'INSTR', 'ALTDB', 'OUTROOT', 'OUTCSVPATH',
                  'UPTHRESHOLD'):
        invals[blank] = None

    # READ INPUT ARGUMENTS
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INSTR'] = ppaths.check_path(args.crystalpath[0], 'file')

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.uniprot_threshold is not None:
        try:
            invals['UPTHRESHOLD'] = float(args.uniprot_threshold[0])
        except ValueError:
            # Reported but not fatal: the threshold simply stays None.
            logger.critical('Uniprot threshold given not valid.')
        if invals['UNICLUST_FASTA_PATH'] is None:
            invals['UNICLUST_FASTA_PATH'] = pco._uniurl

    skipexec = args.skip_conpred is True
    if skipexec and (args.hhblits_arguments is not None
                     or args.uniprot_threshold is not None):
        logger.info(
            'HHblits, UniProt threshold parameters given bypassed by --skip_conpred'
        )
    cropping = args.remove_insertions

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    # Collection CSV paths, always stored as [cropped, full].
    collection = []
    if args.collection_file is None:
        for tag in ('cropped', 'full'):
            fname = os.extsep.join(('evcovsignal', tag, 'pisacov', 'csv'))
            collection.append(
                ppaths.check_path(os.path.join(invals['OUTROOT'], fname)))
    elif cropping is True:
        # splitext() keeps the leading dot in the extension, so it must not
        # be prefixed with another separator (was producing 'name.full..csv').
        root, ext = os.path.splitext(args.collection_file[0])
        collection.append(ppaths.check_path(args.collection_file[0]))
        collection.append(ppaths.check_path(root + os.extsep + 'full' + ext))
    else:
        collection.append(None)
        collection.append(ppaths.check_path(args.collection_file[0]))
    invals['OUTCSVPATH'] = collection

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])
    logger.info('Parsing structure file...')
    strs, _ = cps.parsestrfile(invals['INSTR'])

    pdbid = None
    if len(seqs) == 1:
        pdbid = next(iter(seqs))
    elif len(strs) == 1:
        if len(seqs) > 1:
            # Pick the sequence set whose id matches the structure id.
            for key in strs:
                for key2 in seqs:
                    if key.upper() == key2.upper():
                        pdbid = key.upper()
                    elif key2.upper() in key.upper():
                        pdbid = key2.upper()
    else:
        raise Exception(
            'More than one pdbid in sequence and/or structure set.')
    if pdbid is None:
        # Previously surfaced as a NameError a few lines further down.
        raise Exception(
            'Unable to determine the pdbid from sequence and structure set.')

    seq = seqs[pdbid]

    # CROPPING AND RENUMBERING
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    instrc = os.path.join(invals['OUTROOT'], pdbid,
                          os.path.basename(invals['INSTR']))
    fseq = {}
    fmsa = {}
    if skipexec is False:
        if cropping is True:
            logger.info('Cropping and renumbering sequences, '
                        'structures according to SIFTS database.')
            logger.info(pcl.running('CROPS-cropstr'))
            itime = datetime.datetime.now()
            psc.runcrops(invals['INSEQ'], invals['INSTR'],
                         invals['SIFTS_PATH'], invals['UPTHRESHOLD'],
                         invals['UNICLUST_FASTA_PATH'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-cropstr', done=itime))
        else:
            logger.info('Renumbering structure '
                        'according to position in sequence.')
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'], invals['INSTR'],
                           invals['OUTROOT'])
            logger.info(pcl.running('CROPS-renumber', done=itime))
        ppaths.mdir(outpdbdir)
        if cropping is False:
            psc.splitseqs(invals['INSEQ'], outpdbdir)
        copyfile(invals['INSTR'], instrc)

    for i, iseq in seq.imer.items():
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                               pdbid + '_' + i + '.fasta')
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits',
                               pdbid + '_' + i + '.msa.aln')
        if skipexec is False:
            iseq.dump(fseq[i])

    # Parse cropped sequences and maps
    if cropping is True:
        amap = {}
        fcropseq = {}
        fcropmsa = {}
        for i, iseq in seq.imer.items():
            fprefix = pdbid + '_' + i + '.crops.to_uniprot'
            fmap = os.path.join(invals['OUTROOT'], pdbid,
                                fprefix + os.extsep + 'cropmap')
            amap.update(cps.parsemapfile(fmap)[pdbid])
            fcropseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                                       fprefix + os.extsep + 'fasta')
            fcropmsa[i] = os.path.join(
                invals['OUTROOT'], pdbid, 'hhblits',
                fprefix + os.extsep + 'msa' + os.extsep + 'aln')
        seq.set_cropmaps(amap, cropmain=True)

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    pisadir = os.path.join(invals['OUTROOT'], pdbid, 'pisa', '')

    fstr = os.path.join(
        invals['OUTROOT'],
        pdbid + os.extsep + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb')
    if cropping:
        fcropstr = os.path.join(
            invals['OUTROOT'], pdbid,
            (pdbid + os.extsep + 'crops' + os.extsep + 'oldids'
             + os.extsep + 'to_uniprot'
             + os.path.splitext(invals['INSTR'])[1]))

    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'],
                                    hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            if cropping is True:
                iseq.cropmsa = themsa
                if iseq.ncrops() == 0:
                    # Nothing was cropped, so the alignment also serves
                    # as the full-sequence one.
                    iseq.msa = iseq.cropmsa
                    logger.info(' Cropped sequence ' + iseq.oligomer_id
                                + '_' + iseq.name
                                + ' is identical to original sequence.')
            else:
                iseq.msa = themsa

    # DEEP META PSICOV RUN
    ppaths.mdir(dmpdir)
    if skipexec is False:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # INTERFACE GENERATION, PISA
    ppaths.mdir(pisadir)
    if skipexec is False:
        logger.info('Generating interface files via PISA...')
        sfile = fcropstr if cropping is True else fstr
        logger.info(pcl.running('PISA'))
        itime = datetime.datetime.now()
        psp.runpisa(sfile, pisadir, sessionid=pdbid)
        logger.info(pcl.running('PISA', done=itime))

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
def main():
    """Score user-supplied dimer interface files against predicted contacts.

    Reads the command-line arguments, renumbers every interface file given
    in ``args.dimers`` with CROPS and, unless ``--skip_conpred`` was given,
    runs HHblits and DeepMetaPSICOV. Each interface is then matched against
    the predicted contact maps, and one row per scored interface is written
    both to a per-entry CSV and to the collection CSV.

    Raises:
        Exception: if the sequence file does not hold exactly one PDB id.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()
    for blank in ('INSEQ', 'INIFS', 'OUTROOT', 'OUTCSVPATH'):
        invals[blank] = None

    # READ INPUT ARGUMENTS
    # This was a comparison (==), which left INSEQ set to None.
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    args.remove_insertions = False

    interfaces = []
    for fp in args.dimers:
        if '*' in fp:
            interfaces += ppaths.check_wildcard(fp)
        else:
            interfaces.append(ppaths.check_path(fp, 'file'))
    # Drop duplicated paths while keeping the order they were given in.
    invals['INIFS'] = list(dict.fromkeys(interfaces))

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    skipexec = args.skip_conpred is True
    if skipexec and args.hhblits_arguments is not None:
        logger.info('HHblits parameters given bypassed by --skip_conpred')

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    if args.collection_file is None:
        invals['OUTCSVPATH'] = ppaths.check_path(
            os.path.join(
                invals['OUTROOT'],
                os.extsep.join(('evcovsignal', 'full', 'pisacov', 'csv'))))
    else:
        invals['OUTCSVPATH'] = ppaths.check_path(args.collection_file[0])

    if os.path.isfile(invals['OUTCSVPATH']) is False:
        # Same header options as the per-entry CSV below: both files
        # receive exactly the same rows.
        pic.csvheader(invals['OUTCSVPATH'], cropped=False, pisascore=False)

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])

    if len(seqs) != 1:
        raise Exception('More than one pdbid in sequence set.')
    seqkey = next(iter(seqs))
    pdbid = seqkey.lower()
    # Look the entry up with its own key; the lower-cased id is only used
    # for naming (seqs[pdbid] failed for ids that are not lower-case).
    seq = seqs[seqkey]

    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")

    # RENUMBERING
    fseq = {}
    fmsa = {}
    if skipexec is False:
        # The directory has to exist before files are copied into it.
        ppaths.mdir(outpdbdir)
        logger.info('Renumbering interfaces provided '
                    'according to position in sequence.')
        for path in invals['INIFS']:
            instrc = os.path.join(invals['OUTROOT'], pdbid,
                                  os.path.basename(path))
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'], path, invals['OUTROOT'])
            copyfile(path, instrc)
            logger.info(pcl.running('CROPS-renumber', done=itime))

    for i, iseq in seq.imer.items():
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                               pdbid + '_' + i + os.extsep + 'fasta')
        fmsa[i] = os.path.join(
            invals['OUTROOT'], pdbid, 'hhblits',
            pdbid + '_' + i + os.extsep + 'msa' + os.extsep + 'aln')
        if skipexec is False:
            iseq.dump(fseq[i])

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    fstr = [
        os.path.join(
            invals['OUTROOT'],
            (os.path.splitext(os.path.basename(ifile))[0] + os.extsep
             + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb'))
        for ifile in invals['INIFS']
    ]

    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            iseq.msa = psm.runhhblits(fseq[i], invals['HHBLITS_PARAMETERS'],
                                      hhdir)
            logger.info(pcl.running('HHBlits', done=itime))

        # DEEP META PSICOV RUN
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        ppaths.mdir(dmpdir)
        for i, iseq in seq.imer.items():
            sfile = fseq[i]
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, fmsa[i], dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))
    else:
        # Nothing was run, so the alignments (needed for Neff in the
        # results) are read back from the files of a previous run.
        # NOTE(review): assumes they are in 'jones' format - confirm.
        for i, iseq in seq.imer.items():
            iseq.msa = ckio.read(fmsa[i], 'jones')

    # GENERATE INTERFACE LIST
    iflist = [
        pci.interface(name=os.path.splitext(os.path.basename(filepath))[0])
        for filepath in fstr
    ]

    # CONTACT ANALYSIS AND MATCH
    logger.info('Opening output csv files...')
    resultdir = os.path.join(invals['OUTROOT'], pdbid, 'pisacov', '')
    ppaths.mdir(resultdir)
    csvfile = os.path.join(
        resultdir,
        (pdbid + os.extsep + "evcovsignal" + os.extsep + "full" + os.extsep
         + "pisacov" + os.extsep + "csv"))
    pic.csvheader(csvfile, cropped=False, pisascore=False)

    logger.info('Parsing sequence files...')
    for i, fpath in fseq.items():
        seq.imer[i].seqs['conkit'] = ckio.read(fpath, 'fasta')[0]
        seq.imer[i].biotype = csq.guess_type(seq.imer[i].seqs['mainseq'])

    logger.info('Parsing contact predictions lists...')
    conpred = {}
    matches = []
    for s in seq.imer:
        if s not in conpred:
            conpred[s] = {}
        for source, attribs in sources.items():
            fc = os.path.splitext(os.path.basename(fseq[s]))[0]
            fc += attribs[1]
            confile = os.path.join(invals['OUTROOT'], pdbid, attribs[0], fc)
            conpred[s][source] = ckio.read(confile, attribs[2])[0]

    logger.info('Parsing crystal structure contacts...')
    for i, iface in enumerate(iflist):
        inputmap = ckio.read(fstr[i], 'pdb')
        if len(inputmap) != 4:
            iface.structure = None
            matches.append(None)
            continue

        chnames = list(iface.chains.keys())
        chtypes = list(iface.chains.values())
        allprotein = chtypes[0] == 'Protein' and chtypes[1] == 'Protein'
        if (not allprotein
                or seq.whatseq(chnames[0]) != seq.whatseq(chnames[1])):
            # Interfaces are reported 1-based, as in the results file.
            if not allprotein:
                logger.info(
                    'Interface ' + str(i + 1)
                    + ' is not a Protein-Protein interface. Ignoring.')
            else:
                logger.info('Interface ' + str(i + 1)
                            + ' is not a homodimer. Ignoring.')
            iface.structure = None
            matches.append(None)
            continue

        s = seq.whatseq(chnames[0])
        try:
            iface.structure = []
            for m in range(len(inputmap)):
                iface.structure.append(inputmap[m].as_contactmap())
                iface.structure[m].id = inputmap[m].id
        except Exception:
            # ConKit LEGACY. Start again from an empty list so maps
            # converted before the failure are not duplicated.
            iface.structure = [inputmap[m] for m in range(len(inputmap))]

        matches.append({})
        for source, attribs in sources.items():
            matches[i][source] = pcc.contact_atlas(
                name=pdbid + '_' + str(s),
                conpredmap=conpred[s][source],
                strmap=iface.structure,
                sequence=seq.imer[s],
                removeintra=True)

    logger.info('Computing results and writing them to file...')
    for i in range(len(iflist)):
        if matches[i] is None:
            continue
        sid = seq.whatseq(matches[i]['psicov'].chain1)
        results = [
            pdbid,
            str(i + 1),
            matches[i]['psicov'].chain1,
            matches[i]['psicov'].chain2,
            str(sid),
            str(seq.imer[sid].length()),
            # Nothing is cropped here: only .msa is ever assigned, so
            # reading .cropmsa could not work.
            str(seq.imer[sid].msa.meff),
            str(seq.imer[sid].ncrops()),
            str(seq.imer[sid].full_length()),
        ]
        for source, attribs in sources.items():
            results += pcs.list_scores(matches[i][source], tag=source)
        pic.lineout(results, csvfile)
        pic.lineout(results, invals['OUTCSVPATH'])

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
def main():
    """Run the complete pipeline and score every PISA interface.

    Reads the command-line arguments and the configuration, crops or
    renumbers the structure with CROPS, runs HHblits, DeepMetaPSICOV and
    PISA (or, with ``--skip_conpred``, reads back the files of a previous
    run), matches every interface against the predicted contacts, plots
    the matches and appends one row per scored interface to the CSV files.

    Raises:
        Exception: if the sequence/structure inputs do not identify a
            single PDB id.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()
    for blank in ('INSEQ', 'INSTR', 'ALTDB', 'OUTROOT', 'OUTCSVPATH',
                  'UPTHRESHOLD'):
        invals[blank] = None

    # READ INPUT ARGUMENTS
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INSTR'] = ppaths.check_path(args.crystalpath[0], 'file')

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.uniprot_threshold is not None:
        try:
            invals['UPTHRESHOLD'] = float(args.uniprot_threshold[0])
        except ValueError:
            # Reported but not fatal: the threshold simply stays None.
            logger.critical('Uniprot threshold given not valid.')
        if invals['UNICLUST_FASTA_PATH'] is None:
            invals['UNICLUST_FASTA_PATH'] = pco._uniurl

    skipexec = args.skip_conpred is True
    if skipexec and (args.hhblits_arguments is not None
                     or args.uniprot_threshold is not None):
        logger.info(
            'HHblits, UniProt threshold parameters given bypassed by --skip_conpred'
        )
    cropping = args.remove_insertions
    # scoring[0]: write the "cropped" results; scoring[1]: the "full" ones.
    scoring = [cropping, not cropping]

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    # Collection CSV paths, always stored as [cropped, full].
    collection = []
    if args.collection_file is None:
        for tag in ('cropped', 'full'):
            fname = os.extsep.join(('evcovsignal', tag, 'pisacov', 'csv'))
            collection.append(
                ppaths.check_path(os.path.join(invals['OUTROOT'], fname)))
    elif cropping is True:
        # splitext() keeps the leading dot in the extension, so it must not
        # be prefixed with another separator (was producing 'name.full..csv').
        root, ext = os.path.splitext(args.collection_file[0])
        collection.append(ppaths.check_path(args.collection_file[0]))
        collection.append(ppaths.check_path(root + os.extsep + 'full' + ext))
    else:
        collection.append(None)
        collection.append(ppaths.check_path(args.collection_file[0]))
    invals['OUTCSVPATH'] = collection

    if args.plot_formats is None:
        plotformats = {'png'}
    else:
        # Unsupported formats are silently dropped.
        plotformats = {
            element.lower() for element in args.plot_formats
            if element.lower() in {'png', 'eps', 'dat'}
        }

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = pio.read(invals['INSEQ'], 'fasta')
    logger.info('Parsing structure file...')
    strs, _ = pio.read(invals['INSTR'], 'pdb')

    pdbid = None
    if len(seqs) == 1:
        pdbid = next(iter(seqs))
    elif len(strs) == 1:
        if len(seqs) > 1:
            # Pick the sequence set whose id matches the structure id.
            for key in strs:
                for key2 in seqs:
                    if key.upper() == key2.upper():
                        pdbid = key.upper()
                    elif key2.upper() in key.upper():
                        pdbid = key2.upper()
    else:
        raise Exception(
            'More than one pdbid in sequence and/or structure set.')
    if pdbid is None:
        # Previously surfaced as a NameError a few lines further down.
        raise Exception(
            'Unable to determine the pdbid from sequence and structure set.')

    seq = seqs[pdbid]

    # CROPPING AND RENUMBERING
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    instrc = os.path.join(invals['OUTROOT'], pdbid,
                          os.path.basename(invals['INSTR']))
    fseq = {}
    fmsa = {}
    if skipexec is False:
        if cropping is True:
            logger.info('Cropping and renumbering sequences, '
                        'structures according to SIFTS database.')
            logger.info(pcl.running('CROPS-cropstr'))
            itime = datetime.datetime.now()
            psc.runcrops(invals['INSEQ'], invals['INSTR'],
                         invals['SIFTS_PATH'], invals['UPTHRESHOLD'],
                         invals['UNICLUST_FASTA_PATH'], invals['OUTROOT'])
            logger.info(pcl.running('CROPS-cropstr', done=itime))
        else:
            logger.info('Renumbering structure '
                        'according to position in sequence.')
            logger.info(pcl.running('CROPS-renumber'))
            itime = datetime.datetime.now()
            psc.renumcrops(invals['INSEQ'], invals['INSTR'],
                           invals['OUTROOT'])
            logger.info(pcl.running('CROPS-renumber', done=itime))
        ppaths.mdir(outpdbdir)
        copyfile(invals['INSTR'], instrc)

    for i, iseq in seq.imer.items():
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                               pdbid + '_' + i + '.fasta')
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits',
                               pdbid + '_' + i + '.msa.aln')
        if skipexec is False:
            iseq.dump(fseq[i])

    # Parse cropped sequences and maps
    if cropping is True:
        amap = {}
        fcropseq = {}
        fcropmsa = {}
        for i, iseq in seq.imer.items():
            fprefix = pdbid + '_' + i + '.crops.to_uniprot'
            fmap = os.path.join(invals['OUTROOT'], pdbid,
                                fprefix + os.extsep + 'cropmap')
            amap.update(cps.parsemapfile(fmap)[pdbid])
            fcropseq[i] = os.path.join(invals['OUTROOT'], pdbid,
                                       fprefix + os.extsep + 'fasta')
            fcropmsa[i] = os.path.join(
                invals['OUTROOT'], pdbid, 'hhblits',
                fprefix + os.extsep + 'msa' + os.extsep + 'aln')
        # Apply the complete set of maps once, then report on every
        # sequence rather than only on the last one iterated.
        seq.set_cropmaps(amap, cropmain=True)
        for iseq in seq.imer.values():
            if iseq.ncrops() == 0:
                logger.info(' Cropped sequence ' + iseq.oligomer_id + '_'
                            + iseq.name
                            + ' is identical to the original sequence.')
            else:
                logger.info(' Cropped sequence ' + iseq.oligomer_id + '_'
                            + iseq.name + ' is ' + str(iseq.ncrops())
                            + ' residues '
                            + 'shorter than the original sequence.')

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    pisadir = os.path.join(invals['OUTROOT'], pdbid, 'pisa', '')

    fstr = os.path.join(
        invals['OUTROOT'],
        pdbid + os.extsep + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb')
    if cropping:
        fcropstr = os.path.join(
            invals['OUTROOT'], pdbid,
            (pdbid + os.extsep + 'crops' + os.extsep + 'oldids'
             + os.extsep + 'to_uniprot'
             + os.path.splitext(invals['INSTR'])[1]))
    # Structure file handed to PISA; also the stem of its output names.
    strfile = fcropstr if cropping is True else fstr

    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )

        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            themsa = psm.runhhblits(sfile, invals['HHBLITS_PARAMETERS'],
                                    hhdir)
            logger.info(pcl.running('HHBlits', done=itime))
            if cropping is True:
                iseq.cropmsa = themsa
                if iseq.ncrops() == 0:
                    iseq.msa = iseq.cropmsa
            else:
                iseq.msa = themsa

    # DEEP META PSICOV RUN
    ppaths.mdir(dmpdir)
    if skipexec is False:
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        for i, iseq in seq.imer.items():
            sfile = fcropseq[i] if cropping is True else fseq[i]
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # INTERFACE GENERATION, PISA
    ppaths.mdir(pisadir)
    if skipexec is False:
        logger.info('Generating interface files via PISA...')
        logger.info(pcl.running('PISA'))
        itime = datetime.datetime.now()
        iflist = psp.runpisa(strfile, pisadir, sessionid=pdbid)
        logger.info(pcl.running('PISA', done=itime))

    # READ DATA IF SKIPEXEC USED:
    if skipexec is True:
        logger.info('Parsing already generated files...')
        for i, iseq in seq.imer.items():
            afile = fcropmsa[i] if cropping is True else fmsa[i]
            if cropping is True:
                iseq.cropmsa = pio.read(afile, 'jones')
                if iseq.ncrops() == 0:
                    scoring[1] = True
                    # Read through pio like every other file here (a
                    # leftover direct ckio call was used before).
                    iseq.msa = pio.read(afile, 'jones')
            else:
                iseq.msa = pio.read(afile, 'jones')

        strstem = os.path.splitext(os.path.basename(strfile))[0]
        ixml = os.path.join(
            pisadir, strstem + os.extsep + 'interface' + os.extsep + 'xml')
        axml = os.path.join(
            pisadir, strstem + os.extsep + 'assembly' + os.extsep + 'xml')
        iflist = pci.parse_interface_xml(ixml, axml)

    # CONTACT ANALYSIS AND MATCH
    logger.info('Opening output csv files...')
    resultdir = os.path.join(invals['OUTROOT'], pdbid, 'pisacov', '')
    ppaths.mdir(resultdir)
    csvfile = [
        os.path.join(
            resultdir,
            (pdbid + os.extsep + "evcovsignal" + os.extsep + tag
             + os.extsep + "pisacov" + os.extsep + "csv"))
        for tag in ("cropped", "full")
    ]

    cpd = bool(cropping)
    for n in range(2):
        if scoring[n] is True:
            pic.csvheader(csvfile[n], cropped=cpd, pisascore=True)
            collpath = invals['OUTCSVPATH'][n]
            # The collection file is appended to across runs, so its
            # header is only written when the file does not exist yet.
            if collpath is not None and os.path.isfile(collpath) is False:
                pic.csvheader(collpath, cropped=cpd, pisascore=True)

    logger.info('Parsing sequence files...')
    for i, fpath in fseq.items():
        seq.imer[i].seqs['conkit'] = pio.read(fpath, 'fasta', ck=True)[0]

    logger.info('Parsing contact predictions lists...')
    conpred = {}
    matches = []
    for s in seq.imer:
        if s not in conpred:
            conpred[s] = {}
        seqfile = fcropseq[s] if cropping else fseq[s]
        seqstem = os.path.splitext(os.path.basename(seqfile))[0]
        for source, attribs in sources.items():
            confile = os.path.join(dmpdir, seqstem + os.extsep + attribs[1])
            conpred[s][source] = pio.read(confile, attribs[2], ck=True)[0]

    logger.info('Parsing crystal structure contacts...')
    strstem = os.path.splitext(os.path.basename(strfile))[0]
    for i, iface in enumerate(iflist):
        logger.info(os.linesep + str(iface))
        ifname = (strstem + os.extsep + "interface" + os.extsep
                  + str(i + 1) + os.extsep + "pdb")
        inputmap = pio.read(os.path.join(pisadir, ifname), 'pdb', ck=True)
        if len(inputmap) != 4:
            iface.structure = None
            iface.contactmap = None
            matches.append(None)
            continue

        chnames = [iface.chains[0].crystal_id, iface.chains[1].crystal_id]
        iface.chains[0].seq_id = seq.whatseq(chnames[0])
        iface.chains[1].seq_id = seq.whatseq(chnames[1])
        chseqs = [iface.chains[0].seq_id, iface.chains[1].seq_id]
        logger.info(iface.chains)
        chtypes = [iface.chains[0].type, iface.chains[1].type]
        allprotein = chtypes[0] == 'Protein' and chtypes[1] == 'Protein'
        if not allprotein or chseqs[0] != chseqs[1]:
            # Interfaces are reported 1-based, as in the file names.
            if not allprotein:
                logger.info(
                    'Interface ' + str(i + 1)
                    + ' is not a Protein-Protein interface. Ignoring.')
            else:
                logger.info('Interface ' + str(i + 1)
                            + ' is not a homodimer. Ignoring.')
            iface.structure = None
            matches.append(None)
            continue

        s = chseqs[0]
        try:
            iface.structure = []
            for m in range(len(inputmap)):
                iface.structure.append(inputmap[m].as_contactmap())
                iface.structure[m].id = inputmap[m].id
        except Exception:
            logger.warning('Contact Maps obtained from a legacy ConKit '
                           'version with no Distograms implemented.')
            for m in range(len(inputmap)):
                iface.structure.append(inputmap[m])  # ConKit LEGACY.
        iface.contactmap = iface.structure[1].deepcopy()

        matches.append({})
        for source, attribs in sources.items():
            atlas = pcc.contact_atlas(
                name=pdbid + '_' + str(s),
                dimer_interface=iface,
                conpredmap=conpred[s][source],
                conpredtype=source,
                sequence=seq.imer[s])
            matches[i][source] = atlas
            if cropping is True:
                atlas.set_cropmap()
            atlas.remove_neighbours(mindist=2)
            atlas.set_conpred_seq()
            atlas.remove_intra()
            atlas.make_match(filterout=attribs[3])

            # The match mode is only part of the plot name when there is
            # more than one mode.
            manymodes = len(atlas.conkitmatch) > 1
            for cmode, cmap in atlas.conkitmatch.items():
                if not (len(cmap) > 0
                        and len(atlas.interface.structure[1]) > 0):
                    continue
                for imtype in plotformats:
                    pout = os.path.splitext(ifname)[0] + os.extsep + 'match'
                    if manymodes:
                        pout += os.extsep + cmode
                    pout += (os.extsep + source + os.extsep + 'con'
                             + os.extsep + imtype)
                    plotpath = os.path.join(os.path.dirname(csvfile[0]),
                                            pout)
                    atlas.plot_map_alt(plotpath, mode=cmode,
                                       plot_type=imtype)

    logger.info(os.linesep + 'Computing results and writing them to file...'
                + os.linesep)
    for i, iface in enumerate(iflist):
        logger.info('Generating Interface ' + str(i + 1) + ' data...')
        if matches[i] is None:
            continue
        sid = iface.chains[0].seq_id
        monomer = seq.imer[sid]
        usedmsa = monomer.cropmsa if cropping is True else monomer.msa
        results = [
            pdbid,
            str(i + 1),
            iface.chains[0].crystal_id,
            iface.chains[1].crystal_id,
            str(sid),
            str(monomer.length()),
            str(usedmsa.meff),
            str(monomer.ncrops()),
            str(monomer.full_length()),
            str(monomer.msa.meff),
        ]
        for source, attribs in sources.items():
            results.extend(pcs.list_scores(matches[i][source], tag=source))
        results.append(str(iface.stable))

        for n in range(2):
            if scoring[n] is True:
                pic.lineout(results, csvfile[n])
                # Same guard as when the headers were written.
                if invals['OUTCSVPATH'][n] is not None:
                    pic.lineout(results, invals['OUTCSVPATH'][n])

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
def _parse_stability(flag):
    """Convert the textual stability flag of a CSV row into a bool."""
    if flag in ('True', '1'):
        return True
    if flag in ('False', '0'):
        return False
    raise ValueError('Stability flag not recognised: ' + repr(flag))


def _rate(hits, misses):
    """Return hits / (hits + misses), or NaN when the class is empty."""
    total = hits + misses
    return hits / total if total else float('nan')


def main():
    """Build ROC data files, one per score, from a results CSV file.

    Columns 14 to last-but-one of each data row are read as scores and the
    last column as the stability flag ('True'/'1' or 'False'/'0'). A line
    whose first field starts with '#' supplies the column names; without
    one, columns are named ``sc_<n>``. For every score, each distinct value
    is used as a threshold: interfaces scoring at or above it are predicted
    stable. The false and true positive rates are written to one file per
    score in the output directory.

    Raises:
        ValueError: if rows differ in length, a stability flag is not
            recognised, or the same interface appears with different values.
    """
    parser = create_argument_parser()
    args = parser.parse_args()

    global logger
    logger = pcl.pisacov_logger(level="info")
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    csvfile = ppaths.check_path(args.scores[0], 'file')
    outdir = ppaths.check_path(args.outdir)
    ppaths.mdir(outdir)

    # Parsing scores
    scores = {}      # scores[pdbid][interface] = [score values..., stable]
    names = None     # column names, from the header line if there is one
    scorenames = []  # names of the score columns, fixed by first data row
    thraw = {}       # every value seen for each score
    ncols = None
    with open(csvfile, 'r', newline='') as fin:
        # csv.reader yields a list per row; skipinitialspace copes with
        # the ', ' separator.
        for entry in csv.reader(fin, skipinitialspace=True):
            if not entry:
                continue
            entry = [field.strip() for field in entry]
            if entry[0].startswith('#'):
                names = list(entry)
                names[0] = names[0][1:].strip()
                continue

            if ncols is None:
                ncols = len(entry)
                if names is None or len(names) != ncols:
                    names = ['sc_' + str(n + 1) for n in range(ncols)]
                scorenames = names[13:-1]
                thraw = {name: [] for name in scorenames}
            elif len(entry) != ncols:
                raise ValueError(
                    'CSV file contains rows of different lengths.')

            values = [float(sc) for sc in entry[13:-1]]
            values.append(_parse_stability(entry[-1]))

            ifaces = scores.setdefault(entry[0], {})
            if entry[1] not in ifaces:
                ifaces[entry[1]] = values
                for name, value in zip(scorenames, values):
                    thraw[name].append(value)
            elif ifaces[entry[1]] != values:
                raise ValueError(
                    'CSV file contains different values for same interface.'
                )

    # Setting thresholds
    thr = {}
    FPR = {}
    TPR = {}
    for k, key in enumerate(scorenames):
        # sorted() returns the list; list.sort() returns None.
        thr[key] = sorted(set(thraw[key]))
        FPR[key] = []
        TPR[key] = []
        for t in thr[key]:
            FP = TP = FN = TN = 0
            for pdbid in scores:
                for iface in scores[pdbid]:
                    row = scores[pdbid][iface]
                    stable = row[-1]
                    # Only the score under study decides the prediction.
                    if row[k] < t:
                        if stable is True:
                            FN += 1
                        else:
                            TN += 1
                    else:
                        if stable is True:
                            TP += 1
                        else:
                            FP += 1
            FPR[key].append(_rate(FP, TN))
            TPR[key].append(_rate(TP, FN))

        fnameout = os.path.join(
            outdir,
            (key + os.path.splitext(os.path.basename(csvfile))[0]
             + 'roc.dat'))
        with open(fnameout, 'w') as fout:
            for fpr, tpr in zip(FPR[key], TPR[key]):
                fout.write(str(fpr) + ' ' + str(tpr) + '\n')

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return