def LoadTrainData4Properties(tplFile, angFile):
    """Merge a TPL file and an angle file into one training/validation record.

    Both files are loaded (via LoadTPL and LoadAngleFile), checked for
    consistency, and selected fields from each are combined into a new dict.

    Args:
        tplFile: path handed to LoadTPL.
        angFile: path handed to LoadAngleFile.

    Returns:
        A dict with the keys
        name, sequence, length, ACC, pACC, CNa, CNb (copied from the TPL),
        SS, DISO, CLE, Phi, Psi, Theta, Tau, Omg, Missing (copied from the
        angle file), plus the derived entries PhiPsi, ThetaTau, SS8, SS3 and
        ForTrain (always True).

    Raises:
        AssertionError: if the two files disagree on the sequence, on the
            missing-residue flags, or on the secondary structure of any
            residue that is not flagged as missing.
    """
    tpl = LoadTPL(tplFile)
    Ang = LoadAngleFile(angFile)

    # The two inputs must describe the same protein.
    assert tpl['sequence'] == Ang['sequence']
    # presumably both are array-likes supporting element-wise == ; verify
    assert all(tpl['missing'] == Ang['Missing'])
    # all() is required here: a bare generator expression is always truthy,
    # so asserting the generator itself would never check anything.
    # Residues flagged as missing (m == 1) are excluded from the comparison.
    # NOTE(review): assumes tpl['SS_str'] and Ang['SS'] use the same
    # secondary-structure alphabet -- confirm against the file formats.
    assert all(tSS == aSS
               for tSS, aSS, m in zip(tpl['SS_str'], Ang['SS'], tpl['missing'])
               if m != 1)

    ## merge tpl and ang to generate a new training and validation file
    protein = dict()

    # Fields taken from the TPL file.
    # NEFF, PSFM, PSSM, Ca and Cb from the TPL are not carried over.
    for key in ('name', 'sequence', 'length', 'ACC', 'pACC', 'CNa', 'CNb'):
        protein[key] = tpl[key]

    # Fields taken from the angle file.
    for key in ('SS', 'DISO', 'CLE', 'Phi', 'Psi', 'Theta', 'Tau', 'Omg'):
        protein[key] = Ang[key]

    ## merge Phi and Psi
    # Stack the two angle series and transpose so that each row pairs the
    # values of one position (assumes both are 1-D and equally long -- confirm).
    protein['PhiPsi'] = np.transpose(
        np.array([protein['Phi'], protein['Psi']]))

    ## merge Theta and Tau
    protein['ThetaTau'] = np.transpose(
        np.array([protein['Theta'], protein['Tau']]))

    ## the missing residues have no 3D coordinates and thus, angles and solvent accessibility
    protein['Missing'] = Ang['Missing']

    # SS8 is the secondary structure as read from the angle file; SS3 is its
    # letter-by-letter translation through PropertyUtils.SS8Letter2SS3Letter.
    protein['SS8'] = protein['SS']
    protein['SS3'] = ''.join(
        [PropertyUtils.SS8Letter2SS3Letter[c] for c in protein['SS8']])

    protein['ForTrain'] = True

    return protein
def Usage(): print 'Usage: python CheckIfSequenceHasXs.py inputFile' print ' this script checks if the following types of files have Xs in sequence: .hhm, .hhm.pkl, .tgt, .tgt.pkl, .tpl, .tpl.pkl' print ' if Xs are contained in the sequence, the file name will be printed out as well as the sequence' if len(sys.argv) < 2: Usage() exit(1) infile = sys.argv[1] if infile.endswith('.pkl'): with open(infile, 'rb') as fh: seq = cPickle.load(fh)['sequence'] elif infile.endswith('.hhm'): hhm = LoadHHM(infile) seq = hhm['sequence'] elif infile.endswith('.tgt'): tgt = LoadTGT(infile) seq = tgt['sequence'] elif infile.endswith('.tpl'): tpl = LoadTPL(infile) seq = tpl['sequence'] else: Usage() exit(1) if 'X' in seq: print infile, seq
""" if len(name) != 5: print 'the template name is incorrect. It must be composed of PDB ID and chain letter' exit(-1) chain = name[4] residues = structure[0][chain].get_residues() residueList = [r for r in residues if is_aa(r)] #numResidues = len(residueList) pdbseq = ''.join([three_to_one(r.get_resname()) for r in residueList]) #print pdbseq #from Bio import SeqIO tpl = LoadTPL(tplfile) tplseq = tpl['sequence'] #record = SeqIO.read(seqfile, "fasta") ##print(record.seq) ##align two sequences from Bio import pairwise2 from Bio.SubsMat.MatrixInfo import blosum80 ##alignments = pairwise2.align.localds(pdbseq, tplseq, blosum80, -5, -1) alignments = pairwise2.align.localms(pdbseq, tplseq, 3, -1, -0.5, -0.0) #print '#alignments:', len(alignments) ##find the alignment with the minimum residue number difference
if __name__ == "__main__": if len(sys.argv) < 3: Usage() exit(1) tplfile = sys.argv[1] pdbfile = sys.argv[2] ResDir = os.getcwd() if len(sys.argv) >= 4: ResDir = sys.argv[3] if not os.path.isdir(ResDir): os.mkdir(ResDir) protein = LoadTPL(tplfile) result, pdbseq, numMisMatches, numMatches = PDBUtils.ExtractCoordinatesNDSSPBySeq( protein['sequence'], pdbfile) if numMisMatches > 5: print 'ERROR: too many mismatches between TPL sequence and ATOM record in ', pdbfile exit(1) if numMatches < min(30, 0.5 * len(protein['sequence'])): print 'ERROR: more than half of TPL sequence not covered by ATOM record in ', pdbfile exit(1) coordInfo, dssp = result coordinates, numInvalidAtoms = coordInfo
def ExtractCoordinatesFromTPLPDB(tplfile, pdbfile, atoms=['CB', 'CA', 'N', 'O']): tpl = LoadTPL(tplfile)