def test_sequence_guess_biotype_1(self):
    # The biotype guessed by a sequence object must agree with what the
    # module-level ces.guess_type returns for the same raw sequence.
    reference = ces.guess_type(_SEQUENCE_3)
    record = ces.sequence(
        seqid=1,
        oligomer='1IXY',
        chains={'C', 'D'},
        seq=_SEQUENCE_3,
    )
    self.assertEqual(reference, record.guess_biotype())
def main():
    """Run the dimer contact-analysis pipeline from the command line.

    Steps, in order:

    1. Parse the command-line arguments and initialise the module logger.
    2. Resolve the input sequence file, the dimer structure files, the
       output directory and the collection csv path into ``invals``.
    3. Parse the sequence file; it must contain exactly one identifier.
    4. Unless ``--skip_conpred`` was given: renumber the structures with
       CROPS, dump one fasta file per sequence, run HHblits and
       DeepMetaPSICOV.
    5. Read the predicted contacts and the structure contacts, build one
       contact atlas per prediction source for every usable interface,
       and append one score line per interface to both the per-run csv
       file and the collection csv file.

    Raises:
        Exception: if the sequence file does not hold exactly one
            identifier.
    """
    global logger

    parser = create_argument_parser()
    args = parser.parse_args()

    logger = pcl.pisacov_logger(level="info")
    # pcl.welcome supplies the start time later handed to pcl.ok.
    welcomemsg, starttime = pcl.welcome(command=__script__)
    logger.info(welcomemsg)

    # PARSE CONFIGURATION FILE:
    invals = pco._initialise_inputs()
    invals['INSEQ'] = None
    invals['INIFS'] = None
    invals['OUTROOT'] = None
    invals['OUTCSVPATH'] = None

    # READ INPUT ARGUMENTS
    # This must be an assignment: the previous '==' only compared and
    # discarded the result, leaving INSEQ set to None for everything below.
    invals['INSEQ'] = ppaths.check_path(args.seqpath[0], 'file')
    invals['INIFS'] = []
    args.remove_insertions = False
    for fp in args.dimers:
        if '*' in fp:
            invals['INIFS'] += ppaths.check_wildcard(fp)
        else:
            invals['INIFS'].append(ppaths.check_path(fp, 'file'))
    # Drop duplicated paths while keeping first-seen order.
    invals['INIFS'] = list(dict.fromkeys(invals['INIFS']))

    if args.hhblits_arguments is not None:
        invals['HHBLITS_PARAMETERS'] = pco._check_hhparams(
            args.hhblits_arguments)

    if args.skip_conpred is True:
        skipexec = True
        if args.hhblits_arguments is not None:
            logger.info('HHblits parameters given bypassed by --skip_conpred')
    else:
        skipexec = False

    if args.outdir is None:
        invals['OUTROOT'] = ppaths.check_path(
            os.path.dirname(invals['INSEQ']))
    else:
        invals['OUTROOT'] = ppaths.check_path(
            os.path.join(args.outdir[0], ''))
    ppaths.mdir(invals['OUTROOT'])

    if args.collection_file is None:
        invals['OUTCSVPATH'] = ppaths.check_path(
            os.path.join(
                invals['OUTROOT'],
                os.extsep.join(("evcovsignal", "full", "pisacov", "csv"))))
    else:
        invals['OUTCSVPATH'] = ppaths.check_path(args.collection_file[0])

    # Only write the header when the collection file does not exist yet.
    if not os.path.isfile(invals['OUTCSVPATH']):
        pic.csvheader(invals['OUTCSVPATH'], cropped=False)

    # Define formats used
    sources = pco._sources()

    # Parse sequence and structure files
    logger.info('Parsing sequence file...')
    seqs = cps.parseseqfile(invals['INSEQ'])
    if len(seqs) != 1:
        raise Exception('More than one pdbid in sequence set.')
    pdbid = next(iter(seqs)).lower()

    seq = seqs[pdbid]
    outpdbdir = os.path.join(invals['OUTROOT'], pdbid, "")
    # Create the per-entry directory before anything is copied into it
    # (the structure copies below target this directory).
    ppaths.mdir(outpdbdir)

    # RENUMBERING
    fseq = {}
    fmsa = {}
    if skipexec is False:
        if invals['INIFS'] is not None:
            logger.info('Renumbering interfaces provided '
                        'according to position in sequence.')
            for path in invals['INIFS']:
                instrc = os.path.join(invals['OUTROOT'], pdbid,
                                      os.path.basename(path))
                logger.info(pcl.running('CROPS-renumber'))
                itime = datetime.datetime.now()
                psc.renumcrops(invals['INSEQ'], path, invals['OUTROOT'])
                copyfile(path, instrc)
                logger.info(pcl.running('CROPS-renumber', done=itime))

    for i, iseq in seq.imer.items():
        fiseq = pdbid + '_' + i + os.extsep + 'fasta'
        fseq[i] = os.path.join(invals['OUTROOT'], pdbid, fiseq)
        fiseq = pdbid + '_' + i + os.extsep + 'msa' + os.extsep + 'aln'
        fmsa[i] = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', fiseq)
        if skipexec is False:
            iseq.dump(fseq[i])

    # EXECUTION OF EXTERNAL PROGRAMS
    hhdir = os.path.join(invals['OUTROOT'], pdbid, 'hhblits', '')
    dmpdir = os.path.join(invals['OUTROOT'], pdbid, 'dmp', '')
    # Paths of the renumbered structures: <OUTROOT>/<name>.crops.seq.pdb
    fstr = [
        os.path.join(
            invals['OUTROOT'],
            (os.path.splitext(os.path.basename(strpath))[0]
             + os.extsep + 'crops' + os.extsep + 'seq' + os.extsep + 'pdb'))
        for strpath in invals['INIFS']
    ]

    if skipexec is False:
        # MSA GENERATOR
        ppaths.mdir(hhdir)
        if invals['HHBLITS_PARAMETERS'] == ['3', '0.001', 'inf', '50', '99']:
            logger.info(
                'Generating Multiple Sequence Alignment using DeepMetaPSICOV default parameters... [AS RECOMMENDED]'
            )
        elif invals['HHBLITS_PARAMETERS'] == ['2', '0.001', '1000', '0', '90']:
            logger.info(
                'Generating Multiple Sequence Alignment using HHBlits default parameters...'
            )
        else:
            logger.info(
                'Generating Multiple Sequence Alignment using user-custom parameters...'
            )
        for i, iseq in seq.imer.items():
            logger.info(pcl.running('HHBlits'))
            itime = datetime.datetime.now()
            iseq.msa = psm.runhhblits(fseq[i], invals['HHBLITS_PARAMETERS'],
                                      hhdir)
            logger.info(pcl.running('HHBlits', done=itime))

        # DEEP META PSICOV RUN
        logger.info(
            'Generating contact prediction lists via DeepMetaPSICOV...')
        ppaths.mdir(dmpdir)
        for i in seq.imer:
            sfile = fseq[i]
            afile = fmsa[i]
            # DeepMetaPSICOV is run on a copy of the fasta file placed in
            # its own directory.
            nsfile = os.path.join(dmpdir, os.path.basename(sfile))
            if sfile != nsfile:
                copyfile(sfile, nsfile)
            logger.info(pcl.running('DeepMetaPSICOV'))
            itime = datetime.datetime.now()
            psd.rundmp(nsfile, afile, dmpdir)
            logger.info(pcl.running('DeepMetaPSICOV', done=itime))

    # GENERATE INTERFACE LIST
    iflist = [
        pci.interface(name=os.path.splitext(os.path.basename(filepath))[0])
        for filepath in fstr
    ]

    # CONTACT ANALYSIS AND MATCH
    logger.info('Opening output csv files...')
    resultdir = os.path.join(invals['OUTROOT'], pdbid, 'pisacov', '')
    ppaths.mdir(resultdir)
    csvfile = os.path.join(
        resultdir,
        (pdbid + os.extsep + "evcovsignal" + os.extsep + "full"
         + os.extsep + "pisacov" + os.extsep + "csv"))
    pic.csvheader(csvfile, cropped=False, pisascore=False)

    logger.info('Parsing sequence files...')
    for i, fpath in fseq.items():
        seq.imer[i].seqs['conkit'] = ckio.read(fpath, 'fasta')[0]
        seq.imer[i].biotype = csq.guess_type(seq.imer[i].seqs['mainseq'])

    logger.info('Parsing contact predictions lists...')
    conpred = {}
    matches = []
    for s in seq.imer:
        if s not in conpred:
            conpred[s] = {}
        for source, attribs in sources.items():
            # attribs is indexed as (subdirectory, file suffix, format).
            fc = os.path.splitext(os.path.basename(fseq[s]))[0]
            fc += attribs[1]
            confile = os.path.join(invals['OUTROOT'], pdbid, attribs[0], fc)
            conpred[s][source] = ckio.read(confile, attribs[2])[0]

    logger.info('Parsing crystal structure contacts...')
    # 'matches' receives exactly one entry per interface (None when the
    # interface is skipped) so that its indices stay aligned with iflist.
    for i, (iface, strpath) in enumerate(zip(iflist, fstr)):
        inputmap = ckio.read(strpath, 'pdb')
        if len(inputmap) != 4:
            iface.structure = None
            matches.append(None)
            continue

        chnames = list(iface.chains.keys())
        chtypes = list(iface.chains.values())
        not_protein = chtypes[0] != "Protein" or chtypes[1] != "Protein"
        if seq.whatseq(chnames[0]) != seq.whatseq(chnames[1]) or not_protein:
            if not_protein:
                logger.info(
                    'Interface ' + str(i) +
                    ' is not a Protein-Protein interface. Ignoring.')
            else:
                logger.info('Interface ' + str(i) +
                            ' is not a homodimer. Ignoring.')
            iface.structure = None
            matches.append(None)
            continue

        s = seq.whatseq(chnames[0])
        try:
            iface.structure = []
            for m in range(len(inputmap)):
                iface.structure.append(inputmap[m].as_contactmap())
                iface.structure[m].id = inputmap[m].id
        except Exception:
            # ConKit LEGACY: fall back to the raw maps. The list is rebuilt
            # from scratch so that entries appended before the failure are
            # not kept alongside the fallback ones.
            iface.structure = [inputmap[m] for m in range(len(inputmap))]

        atlas = {}
        for source in sources:
            atlas[source] = pcc.contact_atlas(
                name=pdbid + '_' + str(s),
                conpredmap=conpred[s][source],
                strmap=iface.structure,
                sequence=seq.imer[s],
                removeintra=True)
        matches.append(atlas)

    logger.info('Computing results and writing them to file...')
    for i, match in enumerate(matches):
        if match is None:
            continue
        sid = seq.whatseq(match['psicov'].chain1)
        results = [
            pdbid,
            str(i + 1),
            match['psicov'].chain1,
            match['psicov'].chain2,
            str(sid),
            str(seq.imer[sid].length()),
            str(seq.imer[sid].cropmsa.meff),
            str(seq.imer[sid].ncrops()),
            str(seq.imer[sid].full_length()),
        ]
        for source in sources:
            results += pcs.list_scores(match[source], tag=source)
        # Each line goes to both the per-run file and the collection file.
        pic.lineout(results, csvfile)
        pic.lineout(results, invals['OUTCSVPATH'])

    endmsg = pcl.ok(starttime, command=__script__)
    logger.info(endmsg)

    return
def test_guess_type_4(self):
    # ces.guess_type must classify _SEQUENCE_4 as "RNA".
    guessed = ces.guess_type(_SEQUENCE_4)
    self.assertEqual(guessed, "RNA")
def test_guess_type_2(self):
    # ces.guess_type must classify _SEQUENCE_2 as "Protein".
    guessed = ces.guess_type(_SEQUENCE_2)
    self.assertEqual(guessed, "Protein")