Ejemplo n.º 1
0
    def perturb_fragment(self, sample_index, random_fragment):
        """
        Copy the previous protein and write a fragment's torsion angles into the copy.
        ---------
        Params:
            - sample_index: position that receives the first (phi, psi) pair of the fragment
            - random_fragment: iterable of (phi, psi) pairs, written to consecutive positions
        Returns:
            - None; the perturbed copy is stored in self.next_protein
        """
        # Work on a fresh pose so that self.prev_protein is left untouched
        pose_copy = Pose()
        pose_copy.assign(self.prev_protein.pose)
        self.next_protein = Protein(pose=pose_copy)

        def dump_torsions(header, protein):
            # One debug line per position, for positions 1..length of the previous protein
            debug(header, self.iter, 5)
            for residue in range(1, self.prev_protein.length + 1):
                debug('\tPosition: {}\tAngle: {}'.format(residue, protein.get_torsion(residue)), self.iter, 5)

        dump_torsions('Previous Protein:', self.prev_protein)

        # Overwrite the torsions of consecutive positions, starting at sample_index
        debug('random fragment: {}\nPosition: {}'.format(random_fragment, sample_index), self.iter, 5)
        for offset, (phi, psi) in enumerate(random_fragment):
            self.next_protein.set_torsion(sample_index + offset, phi, psi)

        dump_torsions('New Protein:', self.next_protein)
Ejemplo n.º 2
0
    def perturb_fragment(self, protein, position):
        """
        Build a perturbed protein by inserting a candidate fragment not yet tried at this position.
        ---------
        Params:
            - protein = protein object whose pose seeds the perturbed protein
            - position = position (1-indexed) at which the fragment insertion starts
        Returns:
            - the perturbed protein object
        """
        # Fragment indices already tried at this position during the current step
        already_tried = self.sampled_fragments[position]
        untried = list(set(range(self.nfrags)) - already_tried)

        # Draw one untried fragment and record it so that it is not drawn again
        pick = random.choice(untried)
        self.sampled_fragments[position].add(pick)

        # Write the fragment's torsion pairs onto consecutive positions
        torsions = self.candidate_frag_list[position][pick]
        result = Protein(pose=protein.pose)
        for offset, pair in enumerate(torsions):
            result.set_torsion(position + offset, pair[0], pair[1])
        return result
Ejemplo n.º 3
0
 def AddSeq(self,pseq):
     """Add pseq to the database unless it is invalid or already stored.

     Returns True when the sequence was added, False otherwise.
     """
     candidate = Protein(pseq)
     if not candidate.isvalid():
         return False
     # Reject a sequence that is already present in the database
     if any(candidate.seq == stored.seq for stored in self.dseqs.values()):
         return False
     # Keys are 1-based integers
     self.dseqs[len(self.dseqs) + 1] = candidate
     return True
Ejemplo n.º 4
0
 def getBasicProteins(self, splittedLine):
     """Build one Protein per even-indexed entry of splittedLine.

     Each protein receives the accession number extracted from its entry
     and this object as its orthologGroup. Returns the proteins as a list.
     """
     proteins = []
     # Only entries 0, 2, 4, ... are used; the odd ones are skipped
     for entry in splittedLine[::2]:
         protein = Protein()
         protein.accession = Helper.retrieveAccessionNumber(entry)
         protein.orthologGroup = self
         proteins.append(protein)
     return proteins
Ejemplo n.º 5
0
 def getBasicProteins(self, splittedLine):
     """Return a list with one Protein for every second entry of splittedLine.

     The accession number is taken from the entry through
     Helper.retrieveAccessionNumber, and orthologGroup is set to this object.
     """
     result = []
     # Step through entries 0, 2, 4, ...; the entries in between are not read
     for raw_entry in splittedLine[::2]:
         basic = Protein()
         basic.accession = Helper.retrieveAccessionNumber(raw_entry)
         basic.orthologGroup = self
         result.append(basic)
     return result
Ejemplo n.º 6
0
    def __init__(self, params):
        """Load the ligand and receptor structures named in params.

        When params.energy_type is "vdw", the CA/CB atom indexes of both
        molecules are also collected through get_index.
        """
        # Ligand first, then receptor, each read from its PDB file
        self.ligand = ligand = Protein()
        ligand.import_pdb(params.ligand_file_name)

        self.receptor = receptor = Protein()
        receptor.import_pdb(params.receptor_file_name)

        self.cg_atoms = []

        if params.energy_type == "vdw":
            self.index_ligand, self.index_receptor = self.get_index(["CA", "CB"])
Ejemplo n.º 7
0
 def __init__(self,
              sequence,
              steps=10000,
              temp_min=0.15,
              temp_max=1.0,
              temp_delta=0.05,
              save_interval=100):
     """Create the protein for `sequence` and store the run settings.

     The current temperature starts at temp_max; no best result is
     recorded yet (best is None).
     """
     self.protein = Protein(sequence)
     self.steps = steps
     # Temperature settings are stored unchanged
     self.temp_min, self.temp_max, self.temp_delta = temp_min, temp_max, temp_delta
     self.temp = temp_max
     self.save_interval, self.best = save_interval, None
Ejemplo n.º 8
0
 def user_proteins(self):
     """Return a list of Proteins built from the current name/sequence entries.

     Entry i of self._protein_names is paired with entry i of
     self._protein_sequences; both values are read through get().
     """
     return [
         Protein(name_entry.get(), self._protein_sequences[i].get())
         for i, name_entry in enumerate(self._protein_names)
     ]
Ejemplo n.º 9
0
	def test_values(self):
		"""distance() must raise ValueError when the other protein holds an invalid value."""
		reference = Protein("SelfProt", 2, 0, 1, 1, 0, 0, 2, 0.15, 1, 0.076)

		# Invalid second argument: a negative number, a boolean, then a string
		for bad_value in (-3, True, "twenty"):
			other = Protein("IncorrectProt", bad_value, 0, 0, 1, 0, 3, 2, 0.24, 2, 0.016)
			with self.assertRaises(ValueError):
				reference.distance(other)
Ejemplo n.º 10
0
 def LoadFile(self, filename, filetype="text", sep=None):
     """Load protein sequences from a file into self.dseqs.

     Parameters:
         filename: path of the file to read.
         filetype: "fasta" (records introduced by ">" header lines) or
             "text" (one sequence per line or per chunk).
         sep: optional separator; when given, the file content is split
             on it instead of on line boundaries.

     Returns:
         True when filetype is supported, False otherwise.
     """
     # Read everything up front so the file is closed before parsing starts.
     # A list has no split(), so a separator must be applied to the raw text.
     with open(filename, "r") as f:
         lines = f.readlines() if sep is None else f.read().split(sep)

     def store_if_valid(seq):
         # Only valid sequences are kept; they are stored as Protein objects
         # under the next 1-based integer key, like AddSeq does.
         pt = Protein(seq)
         if pt.isvalid():
             self.dseqs[len(self.dseqs)+1] = pt

     if filetype == "fasta":
         seen_header = False
         temp = []
         for line in lines:
             if line.startswith(">"):
                 # Every header after the first one closes the previous record
                 if seen_header:
                     store_if_valid("".join(temp))
                     temp = []
                 seen_header = True
             else:
                 temp.append(line.strip("\n"))
         # Flush the record that follows the last header
         if len(temp) != 0:
             store_if_valid("".join(temp))
         self.history.enqueue(("LoadFile",filename,"fasta",sep))
     elif filetype == "text":
         # NOTE(review): text loads are not recorded in self.history, unlike
         # fasta loads - confirm whether that is intended.
         for line in lines:
             store_if_valid(line.strip("\n"))
     else:
         return False
     return True
Ejemplo n.º 11
0
def readPDBFromStream(stream: Base.IOStream):
    """Read a molecule in PDB format from `stream` and wrap it in a Protein.

    The molecule is passed through sanitize_mol (with
    makeHydrogenComplete=True) before the Protein is built.
    """
    # Both helpers are imported inside the function, not at module level
    from Protein import Protein
    from MoleculeTools import sanitize_mol

    pdb_reader = Biomol.PDBMoleculeReader(stream)
    molecule = Chem.BasicMolecule()
    pdb_reader.read(molecule)
    sanitize_mol(molecule, makeHydrogenComplete=True)
    return Protein(molecule)
Ejemplo n.º 12
0
    def __init__(self,
                 location,
                 inputFile,
                 outputDir=None,
                 cns=False,
                 reject=None,
                 angleOnly=False,
                 ppm=False,
                 progressBar=None,
                 writePgm=True):

        self.input = inputFile
        self.progressBar = progressBar

        print 'DANGLE (version 1.1)'
        print DANGLE_CITE

        # 1. read config file for location of reference information
        self.reference = Reference(os.path.dirname(location))
        self.reference.outDir = outputDir or OUTDIR
        if not os.path.isdir(self.reference.outDir):
            os.makedirs(self.reference.outDir)

        self.reference.cns = cns
        self.reference.ppm = ppm
        self.reference.angleOnly = angleOnly

        if (reject is not None):
            self.reference.rejectThresh = reject

        # 2. read shifts of query protein (input) and calculate secondary shifts
        self.query = Protein(self.reference)
        self.query.readShiftsFromXml(inputFile)

        # 3. compare with DB
        print 'STEP1: Shift search'
        self.topMatches = self.compareWithShiftDB()

        # 4. make preditions from scorograms
        print 'STEP2: GLE generation'
        self.predictor = Predictor(self.query, self.topMatches, self.reference,
                                   writePgm)
        self.predictions = self.predictor.predictPhiPsiFromDatabaseMatches(
            progressBar=self.progressBar)
Ejemplo n.º 13
0
    def step(self):
        """
        Take a single MCMC step:
        1. sample a start position between 1 and (protein length - k)
        2. fetch the N lowest-RMSD candidate fragments for that position
        3. insert a randomly drawn candidate into a copy of the protein and score it
        4. apply the Metropolis criterion
            - if accepted: anneal the temperature, keep the copy, update the best pose, return
            - if rejected: draw another candidate (go to step 3)
        The loop ends without accepting once every candidate index has been drawn at least once.
        """
        # Start position of the insertion
        last_start = int(self.prev_protein.length) - self.k
        sample_index = random.randint(1, last_start)

        debug('\nn value: {}, sample index: {}'.format(self.N, sample_index), self.iter, 5)
        candidate_fragments = self.fragset.get_lowRMS_fragments(sample_index, self.N)
        debug('\ncandidate_fragments: {}\n'.format(candidate_fragments), self.iter, 5)

        # Candidates are drawn with replacement; the set records which indices came up
        n_candidates = len(candidate_fragments)
        drawn = set()
        while len(drawn) < n_candidates:
            choice = random.randint(0, n_candidates - 1)
            drawn.add(choice)

            # Build self.next_protein with the chosen fragment inserted, then score it
            self.perturb_fragment(sample_index, candidate_fragments[choice])
            self.new_energy = self.compute_energy(self.next_protein)

            if not self.metropolis_accept():
                debug('Failed Metropolis...\nProbability: {}'.format(self.prob_accept), self.iter, 5)
                continue

            debug('Passed Metropolis!!\nProbability: {}'.format(self.prob_accept), self.iter, 5)
            self.anneal_temp()

            # The accepted copy becomes the current protein
            accepted_pose = Pose()
            accepted_pose.assign(self.next_protein.pose)
            self.prev_protein = Protein(pose=accepted_pose)
            self.old_energy = self.new_energy

            # Track the lowest-energy pose seen so far
            if self.new_energy < self.best_score:
                self.best_score = self.new_energy
                self.best_pose = Pose()
                self.best_pose.assign(self.next_protein.pose)
            return
Ejemplo n.º 14
0
 def perturb_fragment(
     self,
     pos: int,
     mer: str = "9mers",
     protein: Union[Protein, None] = None
 ) -> Tuple[Protein, int]:  # you may want to add more arguments
     """
     Insert a randomly chosen stored candidate fragment into a new Protein.
     :param pos: position at which the fragment insertion starts
     :param mer: mode of function, "9mers" or "3mers" (any value other than "9mers" inserts 3 residues)
     :param protein: optional source protein; when falsy, self.protein is used
     :return: tuple of (new Protein with updated angles, index returned by findFragIndex for the chosen fragment)
     """
     # Pick the pose to start from
     source = protein if protein else self.protein
     new_protein = Protein(pose=source.pose)

     # Draw one of the candidates stored for this mode and position
     candidates = self.candidate_frag[mer][pos]
     frag_chosen = candidates[random.randint(0, len(candidates) - 1)]
     frag_index = self.fragment_set[mer].findFragIndex(pos, frag_chosen)

     # Overwrite the torsion angles of frag_length consecutive positions
     frag_length = 9 if mer == "9mers" else 3
     for offset in range(frag_length):
         angles = frag_chosen[offset]
         new_protein.set_torsion(pos + offset, angles[0], angles[1])
     return new_protein, frag_index
Ejemplo n.º 15
0
 def test_protein_move_shoul_can_be_reverted(self):
     """A move must change the protein and undo_move must restore it."""
     p = Protein('HHHPPPHHH')
     p_copy = deepcopy(p)
     p.move()
     self.assertNotEqual(p, p_copy)
     p.undo_move()
     # assertEquals is a deprecated alias that was removed in Python 3.12
     self.assertEqual(p, p_copy)
Ejemplo n.º 16
0
 def make_proteins():
     """Append the six predefined proteins to the `proteins` list of the enclosing scope."""
     # Constructor arguments of each protein, in insertion order
     specs = (
         ("food_perception", "ST"),
         ("poison_perception", "AI"),
         ("red_tail+", "QR"),
         ("red_tail-", "RR"),
         ("green_tail+", "PR"),
         ("green_tail-", "LR"),
     )
     for args in specs:
         proteins.append(Protein(*args))
Ejemplo n.º 17
0
class Data:
    """Ligand and receptor structures plus, for "vdw" energy, their CA/CB atom indexes."""

    # Class-level defaults are kept for backward compatibility. Every
    # instance gets its own lists in __init__, so these are never shared
    # (and never mutated) through an instance.
    index_ligand = []
    index_receptor = []
    cg_atoms = []

    def __init__(self, params):
        """Load both structures from the PDB files named in params.

        Reads params.ligand_file_name, params.receptor_file_name and
        params.energy_type.
        """
        self.ligand = Protein()
        self.ligand.import_pdb(params.ligand_file_name)

        self.receptor = Protein()
        self.receptor.import_pdb(params.receptor_file_name)

        # Per-instance containers: without these, instances created with a
        # non-"vdw" energy type would all alias the class-level lists.
        self.cg_atoms = []
        self.index_ligand = []
        self.index_receptor = []

        if params.energy_type == "vdw":
            self.index_ligand, self.index_receptor = self.get_index(["CA", "CB"])

    def get_index(self, atoms=("CA", "CB")):
        """Return [ligand_index, receptor_index] for the given atom names.

        Each list holds the indexes returned by the assembly's atom
        selection, concatenated in the order of `atoms`.
        """
        # generate a dummy assembly and extract the indexes where atoms of interest are located
        assembly = A.Assembly(self.ligand, self.receptor)
        assembly.place_ligand(np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))

        ligand_index = []
        receptor_index = []
        for aname in atoms:
            # indexes of this atom name in the ligand
            _, index = assembly.atomselect_ligand("*", "*", aname, True)
            ligand_index.extend(index)

            # indexes of this atom name in the receptor
            _, index = assembly.atomselect_receptor("*", "*", aname, True)
            receptor_index.extend(index)

        return [ligand_index, receptor_index]
Ejemplo n.º 18
0
 def __init__(self, num, modelPath, partial):
   """Create an empty fragment record numbered `num`, stored under modelPath."""
   # Identification: the base name is "Frag" plus num zero-padded to width 4
   base_name = "Frag{:04n}".format(num)
   self.num = num
   self.basename = base_name
   self.basepath = path.join(modelPath, base_name)

   # Containers and helper objects, all starting out empty
   self.values = FragValues()
   self.protein = Protein()
   self.center = IndexedCoM()
   self.resCenters = []
   self.stat = FragStatistics()
   self.atomCount = 0
   self.residues = []
   self.diffMat = []
   self.sdf = []

   self.partial = partial
Ejemplo n.º 19
0
class Structure:
    """Holds a monomer structure, its PDB source and its flexibility data."""

    def __init__(self):
        # Every field holds the placeholder string "NA" until it is loaded
        for field in ("monomer", "pdb_file_name", "index_CA_monomer",
                      "flexibility", "init_coords"):
            setattr(self, field, "NA")

    def read_pdb(self, pdb):
        """Load the monomer from the PDB file `pdb` and store its coordinates."""
        self.pdb_file_name = pdb
        self.monomer = loaded = Protein()
        loaded.import_pdb(pdb)
        self.init_coords = loaded.get_xyz()

    def compute_PCA(self, topology, trajectory, align, ratio, mode, proj_file):
        """Create a Flexibility_PCA object and compute its eigenvectors from the given inputs."""
        self.flexibility = pca = F.Flexibility_PCA()
        pca.compute_eigenvectors(topology, trajectory, align, ratio, mode, proj_file)

    def setCoords(self):
        """Refresh init_coords from the monomer's current coordinates."""
        self.init_coords = self.monomer.get_xyz()
Ejemplo n.º 20
0
    def __init__(self, data_path, prot_len_file_name, with_overlap,
                 with_redundant, with_gap, interpro_local_format):
        """
        Store the preprocessing options and start with an empty protein list.

        Parameters
        ----------
        data_path : str
            full data path
        prot_len_file_name : str
            name of the file holding the protein length information
        with_overlap : bool
            True to output overlapping domain annotation, False for
            non overlapping domain annotation
        with_redundant : bool
            only relevant when with_overlap is False: True keeps non
            overlapping but possibly redundant domains, False creates non
            overlapping and non redundant domain annotation
        with_gap : bool
            True to add a GAP domain for each protein subsequence of more
            than 30 amino acids without domain hit, False to add none
        interpro_local_format : bool
            True for output produced by a local interproscan run, False for
            the Interpro downloaded protein2ipr format

        Returns
        -------
        None
        """
        # Input location
        self.data_path = data_path
        self.prot_len_file_name = prot_len_file_name

        # Annotation switches, also handed to the initial Protein
        self.with_overlap = with_overlap
        self.with_redundant = with_redundant
        self.with_gap = with_gap
        self.last_protein = Protein(with_overlap, with_redundant, with_gap)

        # Running state
        self.proteins = []
        self.interpro_local_format = interpro_local_format
        self.num_prot_with_no_interpro = 0
Ejemplo n.º 21
0
 def __init__(self,
              parent=None,
              name="",
              id=0,
              atoms=None,
              N=(0, 0, 0),
              C_alpha=(0, 0, 0),
              C_dash=(0, 0, 0),
              coordinates=None,
              SS=""):
     """Store the given fields on the instance.

     parent, atoms and coordinates default to None and are replaced by a
     fresh Protein() / empty list per instance. With the previous defaults
     (Protein(), [] and []) a single object was created at definition time
     and shared by every instance built without those arguments.
     """
     self.parent = Protein() if parent is None else parent
     self.name = name
     self.id = id
     self.atoms = [] if atoms is None else atoms
     self.N = N
     self.C_alpha = C_alpha
     self.C_dash = C_dash
     self.coordinates = [] if coordinates is None else coordinates
     self.SS = SS
Ejemplo n.º 22
0
    def parseFile(self, way):
        """Parse the FASTA-style file at `way` into a list of Protein objects.

        Lines starting with '>' are record names (the '>' is kept in the
        name); all other lines are concatenated into the record's sequence.
        Records with an empty sequence are skipped.

        Returns:
            list of Protein(name, sequence), in file order.
        """
        proteins = []
        # NOTE(review): the handle stays open because it is handed to
        # setFile(); confirm who is responsible for closing it.
        self.setFile(open(way,"r"))
        name = ""
        chunks = []

        for line in self.getFile().readlines():
            # Strip only the line terminator, so a final line without a
            # trailing newline does not lose its last character.
            line = line.rstrip("\r\n")
            if line.startswith('>'):
                sequence = "".join(chunks)
                if sequence != "":
                    proteins.append(Protein(name, sequence))
                chunks = []
                name = line
            else:
                chunks.append(line)

        # The record after the last header has no following '>' to flush it
        sequence = "".join(chunks)
        if sequence != "":
            proteins.append(Protein(name, sequence))
        return proteins
Ejemplo n.º 23
0
	def test_distance(self):
		"""distance() returns the expected value for well-formed proteins."""
		# Constructor signature:
		# Protein(name, C2H2, C2WH2, GATA3, CCHC, ZN2C6, zinc, prot_len, pos, num_chain, hys_cys)
		reference = Protein("SelfProt", 2, 0, 1, 1, 0, 0, 2, 0.15, 1, 0.076)

		cases = (
			# regular second protein
			((1, 0, 0, 1, 0, 3, 2, 0.24, 2, 0.016), 6.15),
			# second protein with every value set to 0
			((0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 7.226),
		)
		for features, expected in cases:
			other = Protein("CorrectProt", *features)
			self.assertAlmostEqual(reference.distance(other), expected)
Ejemplo n.º 24
0
    def __init__(self,params):
        # Builds Structure instances for the rigid monomers and, for
        # "flexible" assemblies, for the trajectory-based monomers as well.
        # Reads params.monomer_file_name, params.assembly_style,
        # params.trajectory, params.topology, params.align, params.ratio,
        # params.mode and params.proj_file.

        self.structure_hash = {}
        self.structure_list = []
        # Keyed by the length of each monomer's get_xyz() result.
        # NOTE(review): two structures with the same length overwrite each other here.
        volume_structure_hash = {} # this is used to get the biggest structure


        # GIORGIO_CODE create structure instances of the rigid monomers
        for nickName,pdb_file in params.monomer_file_name:
            # create instance of the structure class
            s = Structure()
            s.read_pdb (pdb_file)

            volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]


        # create structure instance for the flexible monomers
        if params.assembly_style=="flexible":
            print ">> flexible docking requested for structures, launching PCA..."

            for nickName, traj_file in params.trajectory:
                try:
                    # get the topology file:
                    # NOTE(review): when no nickname matches, top_file keeps the
                    # last topology entry (or is undefined if the list is empty).
                    for nickName2, top_file in params.topology:
                        if nickName2 == nickName:
                            break

                    # create the structure and compute the PCA
                    s = Structure()
                    s.compute_PCA(top_file, traj_file, params.align, params.ratio, params.mode, params.proj_file)
                    # NOTE(review): presumably compute_PCA produces "protein.pdb" - confirm
                    s.read_pdb("protein.pdb")

                    volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]

                # Only ImportError is handled; the process exits with status 1
                except ImportError, e:
                    sys.exit(1)

                # TODO: work on the deform mode, but ask Matteo before
                # NOTE(review): self.ligand is not assigned anywhere in the
                # visible part of this constructor - confirm it exists.
                if params.mode=="deform":
                    self.structure_ligand=Protein()
                    self.structure_ligand.import_pdb("protein.pdb")
                    self.ligand.import_pdb("CA.pdb")
Ejemplo n.º 25
0
 def FindInDB(self,dbID): 
     """Show the entry stored under dbID and interactively offer to update it.

     The user may replace the sequence (the key is kept) and add an
     external database ID. All answers are read with input().

     Returns:
         False when dbID is not in the database; None otherwise.
     """
     if dbID not in self.dseqs:
         print("Invalid ID")
         return False

     print("Sequence size:",len(self.dseqs[dbID]))
     # Slice rather than index: shows up to the first 1000 characters and
     # cannot raise IndexError for sequences shorter than that.
     print("Sequence (First 1k bp)):", self.dseqs[dbID].seq[:1000])
     if input("Update Sequence? [y/n]: ") == "y":
         new_seq = input("New sequence (key number maintained): ")
         pt = Protein(new_seq)
         # isvalid must be called; the bare method object is always truthy
         if pt.isvalid(): self.dseqs[dbID] = pt
         else: print("Invalid sequence")

     # Looked up again because the entry may just have been replaced
     entry = self.dseqs[dbID]
     if len(entry.otherDBs.values()) > 0:
         print(entry.otherDBs)
     else: print("No external DB IDs")
     if input("Update external DB IDs? [y/n]: ") == "y":
         new_db = input("Database: ")
         new_id = input("Database ID: ")
         # Both answers must be longer than two characters
         if len(new_db) > 2 and len(new_id) > 2:
             entry.addotherDBs(new_db,new_id)
         else: print("Invalid input")
Ejemplo n.º 26
0
def relax(pdb, native, scorefxn=scorefxn_fa):
    """
    Relax a structure with the FastRelax protocol and compare it with the native structure.
    --------
    Params
        - pdb (str): path to input structure in PDB format
        - native (str): path to native structure in PDB format
        - scorefxn (ScoreFunction): energy function used for scoring and relaxing, default scorefxn_fa
    Returns
        - Protein object representing relaxed structure
        - RMSD reported by superimpose_rmsd for the relaxed input file and "<native without its last 4 characters>.pdb" (float)
        - Score after minimization (float)
    Side effects
        - writes "<path without its last 4 characters>_fast_relax.pdb" for both the input and the native structure
    """
    pose = pose_from_pdb(pdb)
    print('initial score', score_pose(pose, scorefxn))

    # Switch the pose to the 'fa_standard' residue type set before relaxing
    SwitchResidueTypeSetMover('fa_standard').apply(pose)

    fast_relax = FastRelax()  #ClassicRelax()
    fast_relax.set_scorefxn(scorefxn)
    fast_relax.apply(pose)

    score = score_pose(pose, scorefxn)
    print('final score', score)
    relaxed_path = "%s_fast_relax.pdb" % (pdb[:-4])
    pose.dump_pdb(relaxed_path)

    # The native structure goes through the same relax mover and is dumped too
    native_stem = native[:-4]
    native_pose = pose_from_pdb(native)
    fast_relax.apply(native_pose)
    native_pose.dump_pdb("%s_fast_relax.pdb" % native_stem)

    rmsd = superimpose_rmsd(relaxed_path, "%s.pdb" % native_stem)
    print('RMSD to native', rmsd)

    return Protein(pose=pose), rmsd, score
Ejemplo n.º 27
0
 def storeSim(self, best_pdb: Protein, log: dict,
              sim_index: int) -> Tuple[str, str, int]:
     """
     Write the initial, target and best structures and the log of one simulation to disk.
     :param best_pdb: the structure to store as "best.pdb"
     :param log: log dict handed to self.savelog
     :param sim_index: int of simulation number, used to name the simulation folder
     :return path to sim folder, path to log folder, sim_index
     """
     # Log root: self.logdir when set, otherwise "<protein_name>_log" in the working directory
     if self.logdir:
         log_folder_path = self.logdir
     else:
         log_folder_path = os.path.join(os.getcwd(), self.protein_name + "_log")
     if not os.path.exists(log_folder_path):
         os.mkdir(log_folder_path)

     # One sub-folder per simulation; an existing folder is reused
     sim_folder_name = "sim_" + self.__toStr__(sim_index)
     sim_folder_path = os.path.join(log_folder_path, sim_folder_name)
     if not os.path.exists(sim_folder_path):
         os.mkdir(sim_folder_path)

     def in_sim_folder(filename):
         # Path of a file inside this simulation's folder
         return os.path.join(sim_folder_path, filename)

     # Structures: initial, target (built from self.target_pose) and best
     self.initial_protein.save_pdb(in_sim_folder("initial.pdb"))
     Protein(pose=self.target_pose).save_pdb(in_sim_folder("target.pdb"))
     best_pdb.save_pdb(in_sim_folder("best.pdb"))

     # Log text file, named after the simulation folder
     self.savelog(log, in_sim_folder(sim_folder_name + "_log.txt"))
     return sim_folder_path, log_folder_path, sim_index
Ejemplo n.º 28
0
class MCMCSampler(object):
    """
    Fragment-insertion Monte Carlo sampler with a multiplicatively annealed temperature.

    Each step inserts a candidate fragment into a copy of the current protein,
    scores the copy and accepts or rejects it with the Metropolis criterion.
    """

    def __init__(self, seq_protein, k, T_start, T_end, fragset, N, anneal_rate):
        """
        Store the sampling parameters and score the starting structure.
        The score function is the Rosetta centroid score function ('score3').
        --------
        Params:
            - seq_protein (Protein object): starting structure
            - k: k-mer size; bounds the start positions sampled in step()
            - T_start: initial temperature
            - T_end: simulate() stops once the temperature is below this value
            - fragset: object providing get_lowRMS_fragments(position, N)
            - N: passed to fragset.get_lowRMS_fragments as the second argument
            - anneal_rate: factor the temperature is multiplied by after each accepted move
        """
        self.scorefxn = create_score_function('score3')

        self.prev_protein = seq_protein
        self.best_pose = Pose()
        self.best_pose.assign(self.prev_protein.pose)

        self.fragset = fragset
        self.k = k
        self.N = N
        self.anneal_rate = anneal_rate

        # Score the starting structure once and reuse the value
        initial_energy = self.compute_energy(seq_protein)
        self.best_score = initial_energy
        self.old_energy = initial_energy
        # Defined up front so simulate() can read it even when a step
        # evaluates no candidate fragment at all
        self.new_energy = initial_energy

        self.T = T_start
        self.T_end = T_end

        self.prob_accept = 0
        self.iter = 0

        # For graphing
        self.lst_energy = []

    def compute_energy(self, protein):
        """
        Compute energy of protein with the sampler's score function.
        --------
        Params:
            - protein (Protein object): protein to score
        Return:
            - energy of conformation, as returned by the score function
        """
        return self.scorefxn(protein.pose)

    def perturb_fragment(self, sample_index, random_fragment):
        """
        Copy the previous protein and write a fragment's torsion angles into the copy.
        ---------
        Params:
            - sample_index: position that receives the first (phi, psi) pair of the fragment
            - random_fragment: iterable of (phi, psi) pairs, written to consecutive positions
        Returns:
            - None; the perturbed copy is stored in self.next_protein
        """
        # Work on a fresh pose so that self.prev_protein is left untouched
        new_pose = Pose()
        new_pose.assign(self.prev_protein.pose)
        self.next_protein = Protein(pose=new_pose)

        # Print the Previous Protein
        debug('Previous Protein:', self.iter, 5)
        for pos in range(1, self.prev_protein.length+1):
            debug('\tPosition: {}\tAngle: {}'.format(pos, self.prev_protein.get_torsion(pos)), self.iter, 5)

        # For each residue in fragment, replace the corresponding torsion angles in copy of protein
        debug('random fragment: {}\nPosition: {}'.format(random_fragment, sample_index), self.iter, 5)
        for offset, (phi, psi) in enumerate(random_fragment):
            self.next_protein.set_torsion(sample_index + offset, phi, psi)

        # Print Protein after angle replacement
        debug('New Protein:', self.iter, 5)
        for pos in range(1, self.prev_protein.length+1):
            debug('\tPosition: {}\tAngle: {}'.format(pos, self.next_protein.get_torsion(pos)), self.iter, 5)

    def metropolis_accept(self): # you may want to add more arguments
        """
        Decide whether the proposed move is accepted, based on the Metropolis criterion.
        Compares self.new_energy with self.old_energy at temperature self.T.
        --------
        Returns:
            - True if the move is accepted, False otherwise
            - the acceptance probability is stored in self.prob_accept
        """
        # Compute change in energy
        delta_e = self.new_energy - self.old_energy
        # The random number is drawn on every call, even when it is not needed
        rand_num = np.random.rand()

        # A move that does not raise the energy is always accepted
        if delta_e <= 0:
            self.prob_accept = 1
            return True

        # Otherwise accept with probability exp(-delta_e / T)
        self.prob_accept = np.exp(-delta_e / self.T)
        return bool(self.prob_accept > rand_num)

    def anneal_temp(self):
        """
        Anneal temperature: multiply self.T by self.anneal_rate.
        kT is treated as a single variable (i.e. the Boltzmann constant is ignored).
        --------
        Returns:
            - None; self.T is updated in place
        """
        self.T = self.anneal_rate * self.T

    def step(self):
        """
        Take a single MCMC step:
        1. sample a start position between 1 and (protein length - k)
        2. fetch the candidate fragments for that position from the fragment set
        3. insert a randomly drawn candidate into a copy of the protein and score it
        4. accept or reject based on Metropolis criterion
            - if accept: anneal temperature, keep the copy, update the best pose, return
            - if reject: draw another candidate (go to step 3)
        The loop ends without accepting once every candidate index has been drawn at least once.
        """
        # Sample an eligible position within the original protein
        sample_index = random.randint(1, (int(self.prev_protein.length) - self.k))
        # Candidate fragments with lowest rmsd values
        debug('\nn value: {}, sample index: {}'.format(self.N, sample_index), self.iter, 5)
        candidate_fragments = self.fragset.get_lowRMS_fragments(sample_index, self.N)
        debug('\ncandidate_fragments: {}\n'.format(candidate_fragments), self.iter, 5)

        # Candidates are drawn with replacement; the set records which indices came up
        n_candidates = len(candidate_fragments)
        fragment_indices = set()
        while len(fragment_indices) < n_candidates:
            fragment_index = random.randint(0, n_candidates - 1)
            fragment_indices.add(fragment_index)

            # Build self.next_protein with the chosen fragment inserted, then score it
            self.perturb_fragment(sample_index, candidate_fragments[fragment_index])
            self.new_energy = self.compute_energy(self.next_protein)

            # Metropolis test to see if we should stick with the changed version
            if not self.metropolis_accept():
                debug('Failed Metropolis...\nProbability: {}'.format(self.prob_accept), self.iter, 5)
                continue

            debug('Passed Metropolis!!\nProbability: {}'.format(self.prob_accept), self.iter, 5)
            # Anneal temp
            self.anneal_temp()
            # Accept the protein changes
            new_pose = Pose()
            new_pose.assign(self.next_protein.pose)
            self.prev_protein = Protein(pose=new_pose)
            # Update energy
            self.old_energy = self.new_energy
            # Update best pose and score if energy is better than previous
            if self.new_energy < self.best_score:
                self.best_score = self.new_energy
                self.best_pose = Pose()
                self.best_pose.assign(self.next_protein.pose)
            return

    def simulate(self):
        """
        Run MCMC steps until the temperature drops below T_end.
        The lowest-energy pose is kept in self.best_pose / self.best_score.
        After every step, the iteration number, temperature and current energy
        are passed to outfile for 'kmer_stats.txt', and the energy of the last
        evaluated proposal is appended to self.lst_energy.
        --------
        Returns:
            - None
        """
        outfile('kmer_stats.txt', 'iter: \ttemp: \t\t\t\tenergy:\n')

        # Take as many steps as necessary until we reach the target temp.
        # The temperature only changes when a move is accepted.
        while self.T >= self.T_end:
            self.step()
            outfile('kmer_stats.txt', '{} \t\t{} \t{}\n'.format(self.iter, self.T, self.old_energy))
            self.lst_energy.append(self.new_energy)
            self.iter += 1
Ejemplo n.º 29
0
class Data:
    """Collects the structures taking part in an assembly run.

    The constructor loads every rigid monomer (and, for flexible assemblies,
    every PCA-processed trajectory) into a Structure, orders the structures by
    decreasing number of coordinates and exposes them through
    ``structure_list`` / ``structure_hash``.
    """

    # Class-level lists: shared by all instances unless an instance rebinds them.
    index_ligand=[]
    index_receptor=[]
    cg_atoms=[]

    def __init__(self,params):
        """Build the structure list from the parsed run parameters.

        Attributes read from ``params`` in this constructor: monomer_file_name,
        assembly_style, trajectory, topology, align, ratio, mode, proj_file,
        energy_type, map_dock_OnOff and density_map.
        """

        self.structure_hash = {}   # nickname -> index into structure_list
        self.structure_list = []   # structures, largest first
        volume_structure_hash = {} # this is used to get the biggest structure


        # GIORGIO_CODE create structure instances of the rigid monomers
        for nickName,pdb_file in params.monomer_file_name:
            # create instance of the structure class
            s = Structure()
            s.read_pdb (pdb_file)

            # NOTE(review): the key is the number of coordinates, so two structures of
            # identical size share a key and the later one replaces the earlier -- confirm
            # whether this is intended.
            volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]


        # create structure instance for the flexible monomers
        if params.assembly_style=="flexible":
            print ">> flexible docking requested for structures, launching PCA..."

            for nickName, traj_file in params.trajectory:
                try:
                    # get the topology file:
                    # NOTE(review): when no nickname matches, top_file is left holding the
                    # last entry of params.topology instead of signalling an error.
                    for nickName2, top_file in params.topology:
                        if nickName2 == nickName:
                            break

                    # create the structure and compute the PCA
                    s = Structure()
                    s.compute_PCA(top_file, traj_file, params.align, params.ratio, params.mode, params.proj_file)
                    # "protein.pdb" is a fixed file name; presumably written by compute_PCA -- TODO confirm
                    s.read_pdb("protein.pdb")

                    volume_structure_hash[len(s.monomer.get_xyz())] = [s, nickName]

                except ImportError, e:
                    # a missing module terminates the program with status 1 and no message
                    sys.exit(1)

                # TODO: work on the deform mode, but ask Matteo before
                if params.mode=="deform":
                    self.structure_ligand=Protein()
                    self.structure_ligand.import_pdb("protein.pdb")
                    # NOTE(review): self.ligand is not assigned anywhere in this constructor
                    # (its creation further down is commented out) -- verify before relying
                    # on deform mode.
                    self.ligand.import_pdb("CA.pdb")

        # getting the biggest structure and putting at the beginning so that it is fixed
        sorted_volumes = volume_structure_hash.keys()
        sorted_volumes.sort()
        sorted_volumes.reverse()

        for i in sorted_volumes:

            # insert the elements in a list
            self.structure_list.append( volume_structure_hash[i][0] ) # insert the structure
            # map the nickname to the position the structure just received in the list
            self.structure_hash[volume_structure_hash[i][1]] = self.structure_list.index(volume_structure_hash[i][0])

        # both containers bundled together: [list of structures, nickname -> index]
        self.structure_list_and_name = [self.structure_list, self.structure_hash]
        print self.structure_list_and_name

        #LIGAND STRUCTURE
        #self.ligand = Protein()

#               if params.assembly_style=="flexible":
#                       print ">> flexible docking requested for ligand, launching PCA..."
#                       try:
#                               self.flex_ligand=F.Flexibility_PCA()
#                               self.flex_ligand.compute_eigenvectors(params.ligand_topology,params.ligand_trajectory,params.ligand_align,params.ligand_ratio,params.mode,params.ligand_proj_file)
#                               self.ligand.import_pdb("protein.pdb") # importing the middle structure
#                       except ImportError, e:
#                               sys.exit(1)
#
#                       if params.mode=="deform":
#                               self.structure_ligand=Protein()
#                               self.structure_ligand.import_pdb("protein.pdb")
#                               self.ligand.import_pdb("CA.pdb")

        #else:
            #load monomeric structure (the pdb file)
            #self.ligand.import_pdb(params.ligand_file_name)

        if params.energy_type=="vdw":
            # get_index is not defined in the part of the class visible here
            self.CA_index_of_structures = self.get_index(["CA"])
            #[self.index_ligand,self.index_receptor]=self.get_index(["CA","CB"])

        # if the density map docking is on load the structure into data:
        if params.map_dock_OnOff:
            self.density_map_fileName = params.density_map
Ejemplo n.º 30
0
 def read_pdb (self, pdb):
     """Load the PDB file `pdb` into a fresh Protein and remember its coordinates.

     Stores the file name in pdb_file_name, the Protein in monomer and the
     result of its get_xyz() call in init_coords.
     """
     self.pdb_file_name = pdb
     # bind the new Protein to the instance before importing, then work on the alias
     loaded = self.monomer = Protein()
     loaded.import_pdb(pdb)
     self.init_coords = loaded.get_xyz()
Ejemplo n.º 31
0
    def parse_prot2in(self, file_in_name, batch_num_lines, batch_num_prot):
        """
        Parse protein domain hits to create tabular formatted file relating each protein to its domains

        Hits are read from the gzip-compressed input in batches of lines.
        Consecutive hits carrying the same protein id are merged into a single
        Protein. Buffered proteins are appended to the output file every time
        batch_num_prot of them have been collected and once more after the
        last line has been read, so a protein whose hits straddle two line
        batches is still written complete.

        Parameters
        ----------
        file_in_name : str
            input file name (gzip compressed, relative to data_path)
        batch_num_lines : int
            number of lines to be parsed per batch
        batch_num_prot : int
            number of proteins to be processed per batch

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            if a hit line does not have the number of tab separated fields
            required by the selected input format
        """
        file_out_name = self.create_file_out_name()
        out_path = os.path.join(self.data_path, file_out_name)
        total_out_prot = 0
        line_count = 0

        if self.prot_len_file_name != "":
            prot_file = open(
                os.path.join(self.data_path, self.prot_len_file_name), 'r')
        else:
            # no length file configured: Protein receives an empty string instead
            prot_file = ""

        try:
            # check if output tabular file already exists, if yes then don't add header
            output_exists_already = os.path.isfile(out_path)

            with gzip.open(os.path.join(self.data_path, file_in_name),
                           'rt') as file_in, open(out_path, 'a') as file_out:
                if not output_exists_already:
                    # write the header of the output file
                    file_out.write(
                        "uniprot_id\tinterpro_ids\tevidence_db_ids\n")

                def flush_proteins():
                    """Write the buffered proteins, update the statistics, empty the buffer."""
                    self.update_output(file_out)
                    num_flushed = len(self.proteins)
                    self.update_no_intepro()
                    del self.proteins[:]
                    return num_flushed

                for batch in batch_iterator(file_in, batch_num_lines):
                    for hit_line in batch:
                        hit_line = hit_line.strip()
                        hit_tabs = hit_line.split("\t")
                        if self.interpro_local_format:
                            assert len(hit_tabs) >= 11, (
                                "AssertionError: " + hit_line +
                                " has less than 11 tabs.")
                        else:
                            assert len(hit_tabs) == 6, (
                                "AssertionError: " + hit_line +
                                " does not have exactly 6 tabs.")

                        # an empty id marks the placeholder protein that exists
                        # before the first hit has been seen
                        is_first_protein = self.last_protein.uniprot_id == ""
                        if (not is_first_protein and Protein.get_prot_id(hit_line)
                                == self.last_protein.uniprot_id):
                            # update last created protein
                            self.last_protein.add_domain(hit_line)
                        else:
                            # write to file complete proteins
                            if (not is_first_protein
                                    and len(self.proteins) == batch_num_prot):
                                total_out_prot += flush_proteins()
                            # create new protein and append it to proteins
                            protein = Protein(self.with_overlap,
                                              self.with_redundant,
                                              self.with_gap, hit_line,
                                              prot_file,
                                              self.interpro_local_format)
                            self.last_protein = protein
                            self.proteins.append(protein)
                        line_count += 1

                # save last proteins -- only once every batch has been consumed;
                # flushing after each batch would write out a protein whose
                # remaining hits are still waiting in the next batch
                total_out_prot += flush_proteins()
        finally:
            # release the length file even when parsing fails half-way
            if self.prot_len_file_name != "":
                prot_file.close()

        print("Successfully parsed {} lines.".format(line_count))
        print("Successfully created {} proteins.".format(total_out_prot))
        print("Number of proteins without any interpro annotation: {}.".format(
            self.num_prot_with_no_interpro))
Ejemplo n.º 32
0
    def translate_with_fs(self, frameshifts=None):
        """Translate the coding sequence with the given frameshifts applied.

        The nucleotide sequence from ``self.cds[0]`` onward is rebuilt with each
        frameshift variant spliced in, translated and wrapped in a Protein.
        Non-frameshift variants from ``self.variantsets`` are then re-positioned,
        translated to amino acids and attached as ``protein.variantsets``.
        Finally the frameshifts whose first affected amino-acid position does
        not exceed the protein length are logged under the 'frameshifts'
        metadata key.

        :param frameshifts: iterable of ``((start, stop), variant)`` pairs, or
            None for no frameshifts. NOTE(review): the inline comment below
            calls this a dict, but the loops unpack pairs -- presumably callers
            pass the dict's items; confirm against callers.
        :return: the Protein built from the frameshifted sequence.

        NOTE(review): the body uses ``iteritems`` and relies on ``/`` between
        integers and on ``filter`` returning a list, i.e. it looks like
        Python 2 code -- confirm before running under Python 3.
        """
        # frameshifts is a dict in {pos: Variant} form. NOT VariantSet! We are translating
        # with a particular FS combination and NOT calculating possible combinations here.
        if frameshifts is None:
            frameshifts = []
        else:
            frameshifts = sorted(frameshifts)  # should be already sorted, but...

        # the number of bases gained or lost by each frameshift. Positive: gain, negative: lost
        fs_shifts = [(fpos, fpos[0] - fpos[1] + len(fsvar)) for
            fpos, fsvar in frameshifts]

        def reposition(orig_pos):
            # Map a (start, stop) position on the original sequence to its position
            # on the frameshifted sequence by adding the shift of every frameshift
            # that ends at or before the variant's start.
            start, stop = orig_pos
            new_start, new_stop = start, stop
            for (fs_start, fs_stop), fs_shift in fs_shifts:
                # overlapping a frameshift is only reported, not corrected
                if fs_start <= start < fs_stop or fs_start < stop <= fs_stop:
                    warnings.warn('Watch out, variant inside frameshift! We\'re not ready to handle '
                        'that yet. %s, (%d-%d)' % (self.id, fs_start, fs_stop))
                if start >= fs_stop:  # frameshift happened before variant, so variant shifts
                    new_start += fs_shift
                    new_stop += fs_shift
            return new_start, new_stop

        fs_positions = []
        new_seq = Seq('', generic_nucleotide)
        original_seq = self.sequence[self.cds[0]:]
        next_start = 0
        # stitch together: untouched stretch, frameshift sequence, untouched stretch, ...
        for (fs_start, fs_stop), fs_var in frameshifts:
            new_seq += original_seq[next_start:fs_start]
            fs_positions.append(len(new_seq)/3)  # register first AA position that current FS affects
            new_seq += fs_var.sequence
            next_start = fs_stop
        else:
            # the loop contains no break, so this always runs: append the remainder
            new_seq += original_seq[next_start:]

        protein = Protein(new_seq.translate(), self)

        # now with the new sequence created it's time to translate non-FS variants. Since the frameshifts
        # moved their relative positions around, we have to use their updated locations.

        new_variantsets = {}
        # NOTE(review): variant sets that end up at the same repositioned (start, stop)
        # collapse into one entry of the intermediate dict.
        for (start, stop), vset in {reposition(vpos): vset for vpos, vset in self.variantsets.iteritems()}.iteritems():
            cstart = start - (start % 3)  # codon start
            cstop = (stop + 2) / 3 * 3  # codon stop
            new_vset = VariantSet(vset.genomic_pos, set([]))

            # TODO: this may introduce superfluous AA-s, that is 'Q'->'QP' when a
            # ''->'P' would be enough. Need to look into it. -- 99% SOLVED.
            for v in vset:
                if v.variant_type not in ('FSI', 'FSD'):
                    # translate the whole codon window with the variant sequence substituted in
                    aa_seq = (new_seq[cstart:start] + v.sequence + new_seq[stop:cstop]).translate()
                    translated_variant = Variant(v.genomic_pos, v.variant_type, aa_seq, 'AA', v.sample_id)
                    # TODO: should we carry over metadata? I think we really should!
                    # for now, let's just keep a simple reference to the original variant
                    translated_variant.log_metadata('origin', v)
                    new_vset.add_variant(translated_variant)
                    new_vset.log_metadata('origin', vset)  # TODO: maybe origin should be a first-class attribue not metadata?
            if len(new_vset) > 0:  # frameshift VariantSets would create empty new_vsets, disregard them
                # keys are expressed in amino-acid (codon) units
                new_variantsets[(cstart/3, cstop/3)] = new_vset

        protein.variantsets = new_variantsets
        protein._trim_after_stop()

        # now let's see which frameshifts were actually kept. As induced stop codons may have terminated
        # the translated sequence, there's a chance that later frameshifts are irrelevant.

        # <= instead of < as the stop codon (Biopython '*') is trimmed away and if a FS induces that
        # as its first affected AA position it DID play a role in what the sequence has become
        # although '*' is not part of the protein sequence itself.
        fs_positions = filter(lambda x: x<=len(protein), fs_positions)
        # pair each kept position with its frameshift variant (the first len(fs_positions) of them)
        used_frameshifts = zip(fs_positions, (fs for _, fs in frameshifts[:len(fs_positions)]))

        assert protein.get_metadata('frameshifts') == [], ("Someone has tweaked with the 'frameshift'"
            " field of protein metadata before. May have come from inherited transcript metadata."
            " Use a different field name in your custom functions.")
        protein.log_metadata('frameshifts', used_frameshifts)
        return protein
Ejemplo n.º 33
0
 def rmsd_pdb(pdb_file1, pdb_file2):
     """Read two PDB files into Protein objects and return Analysis.rmsd_proteins of the pair."""
     loaded = []
     # files are read in argument order, one fresh Protein per file
     for pdb_path in (pdb_file1, pdb_file2):
         structure = Protein()
         structure.read_pdb(pdb_path)
         loaded.append(structure)
     first, second = loaded
     return Analysis.rmsd_proteins(first, second)
Ejemplo n.º 34
0
def Main():
    """Run the whole pipeline from the input accession numbers to the tree graphic.

    Steps, in order: configure a Protein object, clear the previous output,
    fetch protein / genome / CDS records, write the sequences to the output
    files, check that the gene and CDS lists have the same length (calling
    repair_script() and exiting otherwise), compute the intron data, then run
    the alignment, build the rooted tree and render its image.
    """
    # The Protein object carries every function used to manipulate and store the protein data
    P = Protein()
    # Name of the desired output file as entered by the user; the graphic lands in the subfolder of completed trees
    P.run_name = nameOfRun()
    # Pause placed between calls to the NCBI servers (0.5 recommended, 0 works outside peak hours)
    P.timer = timer()

    # Wipe the CDS and genomic output left behind by the last run
    clearPreviousOutput(os.path.join('Output'))

    # Resolve protein ID, CDS and genomic data from the input protein accession numbers
    P.Entrez_Protein_ID_Fetch(os.path.join('Input', 'ProteinInput'))

    P.Entrez_Genome_Fetch()
    # One fasta record per genome, kept parallel to the protein list. Output file path: PhyloRewrite/Output/Genome
    for genome_record in P.gene:
        writeOutput('gene', genome_record)
    print('Genomic Sequences written to Output File... ')

    P.Entrez_CDS_Fetch()
    # One fasta record per CDS, kept parallel to the protein list. Output file path: PhyloRewrite/Output/CDS
    for cds_record in P.retrieved_full_cds:
        writeOutput('CDS', cds_record)
    print('CDS Sequences written to Output File... ')

    # The two lists must be parallel; a length mismatch triggers the repair script and ends the run
    print('\n Checking the output for errors... \n')

    lists_are_parallel = len(P.gene) == len(P.retrieved_full_cds)
    if not lists_are_parallel:
        repair_script()
        sys.exit()

    print('No errors found, continuing... \n')

    # Determine the intron phases and the location of the intron/exon boundaries
    P.intronCalculator()

    print('Intron Phases: ' + str(P.intron_phase))
    print('Length of the Exons: ' + str(P.exon_lengths))

    # Remaining stages, in order:
    #  - multiple sequence alignment through the clustal X linux executable (files in execs/tmp)
    #  - unrooted tree from fasttree, rooted through ete2 (files in execs/tmp)
    #  - rendering of the tree graphic
    for stage in (P.multiple_sequencing_alignment,
                  P.rootedTreeConstruction,
                  P.renderingTreeImage):
        stage()
Ejemplo n.º 35
0
	def convertXmlToProtein(self, xml):
		"""Turns raw XML from Uniprot into a proper Protein object.

		The XML is parsed with xmltodict and the children of uniprot/entry are
		copied into attributes of a new Protein, all with source "uniprot".
		Elements that may occur once or several times (comment, gene, gene
		name, organism name) are handled in both forms.

		:param xml: An XML string to be parsed.
		:rtype: A Protein object.
		"""
		def asList(node):
			# xmltodict gives a bare item for a single child and a list for
			# repeated children; normalise both (and a missing child) to a list
			if node is None:
				return []
			return node if isinstance(node, list) else [node]

		# XML to dictionary
		proteinObject = Protein()

		dictionary = xmltodict.parse(xml)
		entry = dictionary["uniprot"]["entry"]

		for element, value in entry.items():
			if element == "@accession":
				proteinObject.addAttribute("id", "uniprot", value)

			elif element == "name":
				proteinObject.addAttribute("proteinShortName", "uniprot", value)

			elif element == "protein":
				fullname = value["recommendedName"]["fullName"]
				proteinObject.addAttribute("proteinFullName", "uniprot", fullname)

			elif element == "@created":
				# dates are split on "-" as year-month-day and handed over as day, month, year
				year, month, day = value.split("-")
				proteinObject.addAttribute("creationDate", "uniprot", self.convertDateToNative(day, month, year))

			elif element == "@modified":
				year, month, day = value.split("-")
				proteinObject.addAttribute("modifiedDate", "uniprot", self.convertDateToNative(day, month, year))

			elif element == "comment":
				# a single <comment> arrives as one dict; iterating it directly
				# would walk over its keys instead of over comments
				for comment in asList(value):
					if isinstance(comment, dict) and "text" in comment:
						text = comment["text"]
						# text carrying XML attributes is a dict holding the string under "#text";
						# checking for dict also covers OrderedDict
						if isinstance(text, dict):
							text = text["#text"]
						proteinObject.addAttribute(comment["@type"], "uniprot", text)

			elif element == "gene":
				genes = []
				# both <gene> and its <name> children may occur once or several times
				for geneEntry in asList(value):
					for gene in asList(geneEntry.get("name")):
						if isinstance(gene, dict) and "#text" in gene:
							genes.append(gene["#text"])

				proteinObject.addAttribute("geneName", "uniprot", genes)

			elif element == "organism":
				names = value["name"]
				if isinstance(names, list):
					# several names: store them all as a list (previously the
					# list was built and then dropped)
					organisms = [organism["#text"] for organism in names]
					proteinObject.addAttribute("organism", "uniprot", organisms)
				else:
					proteinObject.addAttribute("organism", "uniprot", names["#text"])

			elif element == "sequence":
				proteinObject.addAttribute("sequence", "uniprot", value["#text"].replace("\n", ""))
				proteinObject.addAttribute("sequencelength", "uniprot", value["@length"].replace("\n", ""))

		return proteinObject
Ejemplo n.º 36
0
class Sym:
    """Monte Carlo simulation of a Protein over a decreasing temperature schedule.

    For every temperature from temp_max down to temp_min (in steps of
    temp_delta) the protein is moved ``steps`` times; each move is kept or
    undone according to validity and the Metropolis criterion. Statistics and
    structures are written below ``output/<sequence>/``.
    """

    def __init__(self,
                 sequence,
                 steps=10000,
                 temp_min=0.15,
                 temp_max=1.0,
                 temp_delta=0.05,
                 save_interval=100):
        """Create the protein for `sequence` and store the schedule.

        steps         -- number of moves attempted per temperature
        temp_min      -- the run stops once the temperature falls below this
        temp_max      -- starting temperature
        temp_delta    -- amount subtracted from the temperature after each round
        save_interval -- a trajectory frame is written every save_interval moves
        """
        self.protein = Protein(sequence)
        self.steps = steps
        self.temp_min = temp_min
        self.temp_max = temp_max
        self.temp_delta = temp_delta
        self.temp = temp_max
        self.save_interval = save_interval
        self.best = None

    def accept_higher_energy(self):
        """Return True when the Metropolis test accepts the protein's current energy.

        The acceptance ratio exp(-E/kT) / exp(-E_last/kT) is evaluated as the
        single exponential exp(-(E - E_last)/kT): forming the two Boltzmann
        factors separately overflows for strongly negative energies and
        divides by zero once the denominator underflows.
        """
        kb = 1.0
        energy_gain = self.protein.energy - self.protein.last_energy
        # A ratio above 1 always passes because random() < 1, so the exponent
        # is capped at 0; this also keeps exp() from overflowing.
        exponent = min(-energy_gain / (kb * self.temp), 0.0)
        return random() < exp(exponent)

    def run(self):
        """Run the full schedule and write all statistics and structure files."""
        out_dir = 'output/' + self.protein.sequence
        try:
            os.makedirs(out_dir)
        except OSError:
            # an already existing directory is fine, any other failure is not
            if not os.path.isdir(out_dir):
                raise
        self.temp = self.temp_max
        global_steps_done = 0
        # clear file (save_trajectory appends to it)
        open(out_dir + '/trajectory.pdb', 'w').close()
        # the with block closes the statistics files even if a step raises
        with open(out_dir + '/heat.csv', 'w') as heat_stats_file, \
                open(out_dir + '/contacts.csv', 'w') as contacts_stats_file, \
                open(out_dir + '/inertia.csv', 'w') as inertia_stats_file:
            while self.temp >= self.temp_min:
                E2 = 0.0
                E = 0.0
                inertia_sum = 0.0
                best_for_temperature = None
                str_temp = str(self.temp)
                contacts_stats_file.write(str_temp)
                inertia_stats_file.write(str_temp)
                for _ in range(self.steps):
                    global_steps_done += 1
                    self.protein.move()
                    # keep the move only if the conformation is valid and either
                    # not higher in energy or accepted by the Metropolis test
                    keep_move = self.protein.is_valid() and (
                        self.protein.energy <= self.protein.last_energy
                        or self.accept_higher_energy())
                    if not keep_move:
                        self.protein.undo_move()
                    # saving best model
                    if (self.best is None
                            or self.protein.energy < self.best.energy):
                        self.best = deepcopy(self.protein)
                    # best for actual temperature
                    if (best_for_temperature is None or
                            self.protein.energy < best_for_temperature.energy):
                        best_for_temperature = deepcopy(self.protein)
                    # stats
                    contacts_stats_file.write(';' + str(-self.protein.energy))
                    inertia_sum += self.protein.calculate_moment_of_inertia()
                    E2 += self.protein.energy**2
                    E += self.protein.energy
                    self.save_trajectory(global_steps_done)
                self.save_best_for_actual_temp(best_for_temperature)
                inertia_stats_file.write(';' + str(inertia_sum / self.steps) +
                                         '\n')
                E /= self.steps
                E2 /= self.steps
                # energy variance divided by the squared temperature
                heat_stats_file.write(str_temp + ';' +
                                      str((E2 - E**2) / self.temp**2) + '\n')
                contacts_stats_file.write('\n')
                self.temp -= self.temp_delta
        self.save_best()

    def save_best_for_actual_temp(self, best_for_temperature):
        """Write `best_for_temperature` to 'best_for_<temp>.pdb' for the current temperature."""
        temp = str(self.temp)
        # pad three-character temperatures such as '0.9' with a trailing zero
        # NOTE(review): repeated float subtraction can produce long
        # representations of the temperature, which end up in the file name.
        if len(temp) == 3:
            temp += '0'
        with open(
                'output/' + self.protein.sequence + '/best_for_' + temp +
                '.pdb', 'w') as f:
            f.write(best_for_temperature.to_pdb(self.temp))

    def save_best(self):
        """Write the best protein seen so far to 'best.pdb'."""
        with open('output/' + self.protein.sequence + '/best.pdb', 'w') as f:
            f.write(self.best.to_pdb(0))

    def save_trajectory(self, all_steps):
        """Append the current protein to 'trajectory.pdb' every save_interval steps."""
        if all_steps % self.save_interval == 0:
            with open('output/' + self.protein.sequence + '/trajectory.pdb',
                      'a') as f:
                # floor division keeps the frame number an integer
                f.write(self.protein.to_pdb(all_steps // self.save_interval))
Ejemplo n.º 37
0
class Preprocess:
    """
    Class to preprocess file from interpro database, found at:
    https://www.ebi.ac.uk/interpro/beta/download/protein2ipr.dat.gz
    """
    def __init__(self, data_path, prot_len_file_name, with_overlap,
                 with_redundant, with_gap, interpro_local_format):
        """
        Preprocess class init

        Parameters
        ----------
        data_path : str
            full data path
        prot_len_file_name : str
            file name containing protein length information
        with_overlap : bool
            output overlapping domain annotation (True), otherwise not overlapping domain annotation will be created (False)
        with_redundant : bool
            if with_overlap is False then create non overlapping (but possibly redundant) domains (True),
            otherwise create non overlapping and non redundant domain annotation (False)
        with_gap : bool
            add GAP domain for each protein subsequence >30 amino acids without domain hit (True),
            otherwise don't add GAP domain (False)
        interpro_local_format : bool
            preprocess output format produced by local interproscan run (True),
            otherwise preprocess Interpro downloaded protein2ipr format (False)

        Returns
        -------
        None
        """
        self.data_path = data_path
        self.prot_len_file_name = prot_len_file_name
        self.with_overlap = with_overlap
        self.with_redundant = with_redundant
        self.with_gap = with_gap
        # placeholder protein; parse_prot2in treats an empty uniprot_id on it
        # as "no hit parsed yet"
        self.last_protein = Protein(self.with_overlap, self.with_redundant,
                                    self.with_gap)
        self.proteins = []
        self.interpro_local_format = interpro_local_format
        self.num_prot_with_no_interpro = 0

    def update_no_intepro(self):
        """
        Update statistic count for proteins without interpro domain

        Parameters
        ----------

        Returns
        -------
        None
        """
        # a protein counts when none of its domains has an interpro id
        self.num_prot_with_no_interpro += sum(
            1 for protein in self.proteins
            if sum(protein.interpro_exist_all_domains) == 0)

    def update_output(self, file_out):
        """
        Update output tabular file

        Parameters
        ----------
        file_out : file object
            output file, opened for writing; one row per buffered protein is written

        Returns
        -------
        None
        """
        for protein in self.proteins:
            file_out.write(protein.to_tabs())

    def create_file_out_name(self):
        """
        Create output file name based on the type of domain annotation that was selected

        Parameters
        ----------

        Returns
        -------
        str
            created output file name
        """
        file_out_name = "id_domains"

        # NOTE(review): with_redundant False yields "_no_overlap" and anything
        # else "_no_redundant", which reads as the reverse of the parameter
        # description in __init__. Left unchanged because existing files and
        # downstream readers depend on these names -- confirm.
        if self.with_overlap:
            file_out_name = file_out_name + "_overlap"
        elif self.with_redundant is False:
            file_out_name = file_out_name + "_no_overlap"
        else:
            file_out_name = file_out_name + "_no_redundant"
        if self.with_gap:
            file_out_name = file_out_name + "_gap"
        else:
            file_out_name = file_out_name + "_no_gap"
        return file_out_name + ".tab"

    def parse_prot2in(self, file_in_name, batch_num_lines, batch_num_prot):
        """
        Parse protein domain hits to create tabular formatted file relating each protein to its domains

        Hits are read from the gzip-compressed input in batches of lines.
        Consecutive hits carrying the same protein id are merged into a single
        Protein. Buffered proteins are appended to the output file every time
        batch_num_prot of them have been collected and once more after the
        last line has been read, so a protein whose hits straddle two line
        batches is still written complete.

        Parameters
        ----------
        file_in_name : str
            input file name (gzip compressed, relative to data_path)
        batch_num_lines : int
            number of lines to be parsed per batch
        batch_num_prot : int
            number of proteins to be processed per batch

        Returns
        -------
        None

        Raises
        ------
        AssertionError
            if a hit line does not have the number of tab separated fields
            required by the selected input format
        """
        file_out_name = self.create_file_out_name()
        out_path = os.path.join(self.data_path, file_out_name)
        total_out_prot = 0
        line_count = 0

        if self.prot_len_file_name != "":
            prot_file = open(
                os.path.join(self.data_path, self.prot_len_file_name), 'r')
        else:
            # no length file configured: Protein receives an empty string instead
            prot_file = ""

        try:
            # check if output tabular file already exists, if yes then don't add header
            output_exists_already = os.path.isfile(out_path)

            with gzip.open(os.path.join(self.data_path, file_in_name),
                           'rt') as file_in, open(out_path, 'a') as file_out:
                if not output_exists_already:
                    # write the header of the output file
                    file_out.write(
                        "uniprot_id\tinterpro_ids\tevidence_db_ids\n")

                def flush_proteins():
                    """Write the buffered proteins, update the statistics, empty the buffer."""
                    self.update_output(file_out)
                    num_flushed = len(self.proteins)
                    self.update_no_intepro()
                    del self.proteins[:]
                    return num_flushed

                for batch in batch_iterator(file_in, batch_num_lines):
                    for hit_line in batch:
                        hit_line = hit_line.strip()
                        hit_tabs = hit_line.split("\t")
                        if self.interpro_local_format:
                            assert len(hit_tabs) >= 11, (
                                "AssertionError: " + hit_line +
                                " has less than 11 tabs.")
                        else:
                            assert len(hit_tabs) == 6, (
                                "AssertionError: " + hit_line +
                                " does not have exactly 6 tabs.")

                        # an empty id marks the placeholder protein that exists
                        # before the first hit has been seen
                        is_first_protein = self.last_protein.uniprot_id == ""
                        if (not is_first_protein and Protein.get_prot_id(hit_line)
                                == self.last_protein.uniprot_id):
                            # update last created protein
                            self.last_protein.add_domain(hit_line)
                        else:
                            # write to file complete proteins
                            if (not is_first_protein
                                    and len(self.proteins) == batch_num_prot):
                                total_out_prot += flush_proteins()
                            # create new protein and append it to proteins
                            protein = Protein(self.with_overlap,
                                              self.with_redundant,
                                              self.with_gap, hit_line,
                                              prot_file,
                                              self.interpro_local_format)
                            self.last_protein = protein
                            self.proteins.append(protein)
                        line_count += 1

                # save last proteins -- only once every batch has been consumed;
                # flushing after each batch would write out a protein whose
                # remaining hits are still waiting in the next batch
                total_out_prot += flush_proteins()
        finally:
            # release the length file even when parsing fails half-way
            if self.prot_len_file_name != "":
                prot_file.close()

        print("Successfully parsed {} lines.".format(line_count))
        print("Successfully created {} proteins.".format(total_out_prot))
        print("Number of proteins without any interpro annotation: {}.".format(
            self.num_prot_with_no_interpro))

    def create_domains_corpus(self, file_in_name, file_out_name,
                              batch_num_lines):
        """
        Create domain corpus from protein domains tabular file

        The second column of every input row is appended to the output file,
        one row per line; the header row is skipped.

        Parameters
        ----------
        file_in_name : str
            input file name
        file_out_name : str
            output file name
        batch_num_lines : int
            number of lines to be processed per batch

        Returns
        -------
        None
        """
        total_out_lines = 0
        with open(os.path.join(self.data_path, file_in_name),
                  'r') as file_in, open(
                      os.path.join(self.data_path, file_out_name),
                      'a') as file_out:
            for batch in batch_iterator(file_in, batch_num_lines):
                for line in batch:
                    line_tabs = line.split("\t")
                    assert len(
                        line_tabs
                    ) == 3, "AssertionError: line should have only three tabs."
                    protein_domains = line_tabs[1]
                    # the header row carries the column name instead of domains
                    if protein_domains.strip() != "interpro_ids":
                        file_out.write(protein_domains + "\n")
                        total_out_lines = total_out_lines + 1
        print("Successfully written {} proteins in domains representation.".
              format(total_out_lines))

    def fasta2default_domains(self, fasta_name, data_id_format):
        """
        Convert a fasta file containing proteins without any interproscan domain hit
        (mainly for prediction tasks)

        Parameters
        ----------
        fasta_name : str
            input fasta name
        data_id_format : int
            data set contains id format in following types: protein ids (0), protein ids but remove ending ";" (1),
            protein ids can be extracted by splitting at "|" (2)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            if data_id_format is not 0, 1 or 2 (raised when the first record is processed)
        """
        file_out_name = "default_domains.tab"
        with open(os.path.join(self.data_path, fasta_name),
                  "r") as fasta_file, open(
                      os.path.join(self.data_path, file_out_name),
                      "w") as file_out:
            file_out.write("uniprot_id\tinterpro_ids\tevidence_db_ids\n")
            for protein in SeqIO.parse(fasta_file, "fasta"):
                if data_id_format == 0:
                    # DeepLoc
                    annot_id = protein.id
                elif data_id_format == 1:
                    # for targetP remove ending ;
                    annot_id = protein.id.strip(";")
                elif data_id_format == 2:
                    # Toxin
                    annot_id = protein.id.split("|")[1]
                else:
                    # an unknown format used to surface as an unbound variable
                    raise ValueError(
                        "unknown data_id_format: {!r}".format(data_id_format))
                domain_annot = annot_id + "_unk_dom"
                evid_annot = annot_id + "_unk_evid"
                file_out.write(
                    "\t".join([protein.id, domain_annot, evid_annot]) + "\n")
Ejemplo n.º 38
0
'''
Created on Apr 26, 2013

@author: cforker
'''
from itertools import count
from Protein import Protein
from time import time

def leadingZeroes(binstring,length):
    """Drop the first two characters of *binstring* and zero-pad the rest.

    The input is expected to look like the result of ``bin()`` (``'0b101'``);
    the remainder is left-padded with ``'0'`` up to *length* characters. A
    remainder that is already at least *length* long is returned unpadded.
    """
    digits = binstring[2:]
    return digits.rjust(length, '0')

if __name__ == '__main__':
    # Wall-clock start for the timing report printed at the end.
    t0 = time()
    N = 8
    # binary representation of proteins. 1=H,0=P
    # Walk every N-digit binary string (0 .. 2**N - 1).
    # NOTE(review): sprot is never used afterwards, so this loop only adds to
    # the measured time - confirm whether it is still wanted.
    for prot in count():
        if prot == 2**N:
            break
        sprot = leadingZeroes(bin(prot), N)

    # Fold one fixed test protein and dump everything about it.
    ptest = Protein('01011111')
    ptest.setFolding([0,1,1,-1,-1,1])
    ptest.foldingDimensions()
    ptest.buildGrid()
    ptest.countHBonds()
    ptest.printEverything()
    t1 = time()
    # The print statement was a SyntaxError on Python 3. A single formatted
    # argument in parentheses prints the same text on Python 2 and is also a
    # valid call on Python 3.
    print("Execution Time %s ms" % round((t1 - t0)*1000))
Ejemplo n.º 39
0
from Protein import Protein
from Atom import Atom

# Build a Protein from the whitespace-separated atom records in 1l2y.coords
# and print it together with its residue sequence.

prot = Protein('Trp-cage')

# Residue names in order of first appearance.
residue_names = []

# Number of the residue most recently added to the sequence.
countaa = 0

with open("1l2y.coords") as protfile:
    for line in protfile:
        split_line = line.split()
        if not split_line:
            # A blank line used to raise IndexError below.
            continue
        aa = split_line[3]
        aanum = split_line[5]
        # Field 11 plus the three numeric fields 6-8 describe one atom
        # (presumably a label and x, y, z - confirm against Atom).
        at = Atom(split_line[11], float(split_line[6]), float(split_line[7]),
                  float(split_line[8]))
        prot.addatom(at, aa, aanum)  #  Add it to the molecule object

        # Record a residue the first time its number is seen. Remembering the
        # number itself (rather than counting up by one) avoids repeating a
        # residue when the numbering does not start at 1 or has gaps.
        residue_number = int(aanum)
        if residue_number > countaa:
            residue_names.append(aa)
            countaa = residue_number

# Same layout as before: every residue name followed by one space.
seq = "".join(name + " " for name in residue_names)
prot.addsequence(seq)

print(prot)  # Print the molecule object details

print("\nProtein Sequence:")
prot.getsequence()
Ejemplo n.º 40
0
    def run(self):
        """
        Cluster the accepted solutions and write one representative per cluster.

        Steps, in order:
          1. import the module named by ``self.constraint`` as ``constraint``;
          2. filter solutions with ``self.select_solutions(self.params)`` and
             store them in ``self.log`` (returns early when none are left);
          3. repeatedly take the first still-unassigned row as a new centroid
             and attach every unassigned row whose ``self.align`` value against
             it is below ``self.params.cluster_threshold``;
          4. for each centroid, write ``assembly<n>.pdb`` into the ``result``
             directory and append one formatted line to ``solutions.dat`` in
             ``self.params.output_folder``.

        Returns
        -------
        None
        """

        # NOTE(review): Python 2 ``exec`` statement; it imports whatever module
        # name self.constraint holds - confirm that value is trusted.
        exec "import %s as constraint" % (self.constraint)

        # create output directory for generated PDB
        self.OUTPUT_DIRECTORY = "result"
        if os.path.isdir(self.OUTPUT_DIRECTORY) != 1:
            os.mkdir(self.OUTPUT_DIRECTORY)

        # NOTE(review): the PDB files go to "result" while this log goes to
        # self.params.output_folder - confirm the two locations are intended.
        clusters_file = open("%s/solutions.dat" % self.params.output_folder, "w")

        # use superclass method to filter acceptable solutions
        self.log = self.select_solutions(self.params)
        print ">> %s solutions filtered" % len(self.log)
        if len(self.log) == 0:
            # NOTE(review): clusters_file is still open on this early return.
            return

        # generate a dummy multimer and extract the indexes of C alpha
        multimer = A.Assembly(self.data.ligand, self.data.receptor, self.data.cg_atoms)
        multimer.place_ligand(np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0]))
        [m, index] = multimer.atomselect_ligand("*", "*", "CA", True)

        # load the monomeric structure positions
        s = Protein()
        s.import_pdb(self.params.ligand_file_name)
        coords = s.get_xyz()

        print ">> clustering best solutions..."
        # The slice below spans every column of self.log, the last one included.
        P = self.log[:, 0 : len(self.log[0, :])]  # points list
        V = self.log[:, -1]  # values of best hits
        C = []  # centroids array
        P_C = np.zeros(len(P))  # points-to-cluster mapping (0 = not assigned yet)
        C_V = []  # centroids values
        cnt = 0  # centroids counter

        # cluster accepted solutions
        while True:
            # check if new clustering loop is needed
            k = np.nonzero(P_C == 0)[0]
            if len(k) != 0:
                # the first unassigned point becomes the next centroid
                cnt = cnt + 1
                P_C[k[0]] = cnt
                a = P[k[0]]
                C.append(a)
            else:
                break

            # create multimer from the first six values of the centroid row
            pos = np.array(C[cnt - 1])[0:6].astype(float)
            multimer1 = A.Assembly(self.data.ligand, self.data.receptor, self.data.cg_atoms)
            multimer1.place_ligand(pos)

            # write multimer
            multimer1.write_PDB("%s/assembly%s.pdb" % (self.OUTPUT_DIRECTORY, cnt))

            # clustering loop
            # NOTE(review): k was computed before the centroid was assigned, so
            # it still contains the centroid's own index, and cnt2 already
            # starts at 1 - the centroid may be counted twice; confirm.
            m1 = multimer1.get_ligand_xyz()[index]
            cnt2 = 1
            for i in xrange(0, len(k), 1):

                # restore the monomer coordinates before placing the ligand again
                self.data.ligand.set_xyz(coords)
                # multimer2 = A.Assembly(self.data.ligand)
                multimer2 = A.Assembly(self.data.ligand, self.data.receptor, self.data.cg_atoms)
                multimer2.place_ligand(
                    np.array([P[k[i]][0], P[k[i]][1], P[k[i]][2], P[k[i]][3], P[k[i]][4], P[k[i]][5]])
                )
                m2 = multimer2.get_ligand_xyz()[index]

                # presumably an RMSD between the two selections - confirm in self.align
                rmsd = self.align(m1, m2)

                if rmsd < self.params.cluster_threshold:
                    cnt2 += 1
                    P_C[k[i]] = cnt

            print ">>> clustered %s solutions on multimer %s" % (cnt2, cnt)

            # set centroid score with score of closest neighbor in set
            # (q includes the centroid's own row; distances use the full rows)
            q = np.nonzero(P_C == cnt)[0]
            distance = 10000
            targ = 0
            for i in xrange(0, len(q), 1):
                d = np.sqrt(np.dot(C[cnt - 1] - P[q[i]], C[cnt - 1] - P[q[i]]))
                if d < distance:
                    distance = d
                    targ = q[i]
            C_V.append(V[targ])

            # extract constraint values calculated for selected centroid
            measure = constraint.constraint_check(multimer1)

            ###generate output log (prepare data and formatting line, then dump in output file)###
            # l collects the values, f the matching format specifiers
            l = []
            f = []
            # every centroid value except the last column
            for item in C[cnt - 1][0 : len(C[cnt - 1]) - 1]:
                l.append(item)
                f.append("%8.3f ")
            # write constraint values
            f.append("| ")
            for item in measure:
                l.append(item)
                f.append("%8.3f ")
            # write fitness
            f.append("| %8.3f\n")
            l.append(C_V[cnt - 1])

            formatting = "".join(f)

            clusters_file.write(formatting % tuple(l))

        clusters_file.close()

        ####generate output log###
        ##write solution values
        # for item in C[cnt-1]:
        #    clusters_file.write("%s "%item)
        ##write constraint values
        # for item in measure:
        #    clusters_file.write("%s "%item)
        ##write fitness
        # clusters_file.write("%s\n"%C_V[cnt-1])

        return
Ejemplo n.º 41
0
def parse_proteins(directory):
    """Build one ``Protein`` feature record per structure file in *directory*.

    Only files ending in ``.pdb``, ``.ent`` or ``.cif`` are considered. Each
    one is read twice: as plain text to count ``HETNAM`` records whose het ID
    is ``ZN``, and through Biopython's ``pdb-seqres`` parser to obtain the
    per-chain sequences. From the sequences the function derives zinc-finger
    motif counts, a length class and residue-composition ratios.

    Files for which the parser yields no residues are skipped, because the
    per-residue ratios are undefined for them (this used to raise
    ``ZeroDivisionError``).

    Parameters
    ----------
    directory : str
        folder containing the structure files

    Returns
    -------
    list
        one ``Protein`` per usable structure file, in ``os.listdir`` order
    """
    # Compiled once per call instead of once per chain.
    motif_patterns = {
        "C2H2": re.compile("[C].{2,4}[C].{9,13}[H].{3,5}[H]"),
        "C2WH2": re.compile("[C].[W].{1,4}[C].{2,13}[H].{3,5}[H]"),
        "GATA3": re.compile("[Y].[K].[H].{1,3}[R][P]"),
        "CCHC": re.compile("[C]..[C].{3,4}[H].{5,7}[C]"),
        "ZN2C6": re.compile("[C]..[C]...[KR].[KR][C].{5,7}[C]..[C].{5,7}[C]"),
    }

    proteins = []

    for filename in os.listdir(directory):
        if not filename.endswith((".pdb", ".ent", ".cif")):
            continue
        structure_path = os.path.join(directory, filename)

        # Check the Zinc ions.
        # HETNAM records are column-aligned, so splitting on a single space
        # yields empty strings and the het ID was never found. Splitting on
        # runs of whitespace puts the het ID in field 1; on continuation
        # records field 1 is the continuation number, so each compound is
        # counted once.
        zn_num = 0
        with open(structure_path, "r") as pdb:
            for line in pdb:
                if line.startswith("HETNAM"):
                    fields = line.split()
                    if len(fields) > 1 and fields[1] == "ZN":
                        zn_num += 1

        # Biopython parser: one record per chain.
        chains = SeqIO.to_dict(SeqIO.parse(structure_path, "pdb-seqres"))
        chain_num = len(chains)

        motif_hits = dict.fromkeys(motif_patterns, 0)
        length = 0
        hys_cys = 0
        arg_lys_his = 0

        for record in chains.values():
            sequence = str(record.seq)

            for motif, pattern in motif_patterns.items():
                motif_hits[motif] += len(pattern.findall(sequence))

            # Length of total protein
            length += len(record.seq)

            # Number of Histidines + Cysteines
            his = sequence.count("H")
            hys_cys += his + sequence.count("C")

            # Number of positive residues in the protein
            arg_lys_his += his + sequence.count("R") + sequence.count("K")

        if length == 0:
            # No sequence, so the ratios below cannot be computed.
            continue

        # Length class: <200 -> 0, 200..400 -> 1, 401..600 -> 2, else 3.
        if length < 200:
            length_factor = 0
        elif length <= 400:
            length_factor = 1
        elif length <= 600:
            length_factor = 2
        else:
            length_factor = 3

        # NOTE(review): both ratios are int / int; under Python 2 that is
        # floor division - confirm which interpreter this targets.
        proteins.append(
            Protein(name=filename[:-4],  # drop the 4-character extension
                    C2H2=motif_hits["C2H2"],
                    C2WH2=motif_hits["C2WH2"],
                    GATA3=motif_hits["GATA3"],
                    CCHC=motif_hits["CCHC"],
                    ZN2C6=motif_hits["ZN2C6"],
                    zinc=zn_num,
                    prot_len=length_factor,
                    pos=arg_lys_his / length,
                    num_chain=chain_num,
                    hys_cys=hys_cys / length))

    return proteins
Ejemplo n.º 42
0

# Script entry point: ask the user for a menu choice and dispatch on it.
# welcomeStatement() is compared against the strings '1'..'5' below.
if __name__ == '__main__':

    response = welcomeStatement()

    if response == '1':

        #Function that runs the entire program
        Main()

    elif response == '2':

        #Use following code if you have the Protein, CDS, and genomic sequences
        #Code produces the Phylogenetic tree and intron mapping
        P = Protein()
        P.multiple_sequencing_alignment()
        P.intronCalculator()
        P.rootedTreeConstruction()
        P.renderingTreeImage()

    elif response == '3':

        repair_script()

    elif response == '4':

        duplicate_management()

    elif response == '5':
        #Enter experimental code here:
        # NOTE(review): this branch has no statement in the visible source; it
        # needs a body (e.g. ``pass``) before the file can be parsed.
Ejemplo n.º 43
0
 def rmsd_pdb(pdb_file1, pdb_file2):
     """Load two PDB files and return ``Analysis.rmsd_proteins`` for the pair.

     Each path is read into its own fresh ``Protein`` via ``read_pdb``; the
     first file is loaded before the second.
     """
     loaded = []
     for pdb_path in (pdb_file1, pdb_file2):
         structure = Protein()
         structure.read_pdb(pdb_path)
         loaded.append(structure)
     first, second = loaded
     return Analysis.rmsd_proteins(first, second)
Ejemplo n.º 44
0
class Fragment:
  """A numbered fragment of a model together with its helper objects.

  Most methods forward to the helpers created in ``__init__``: a
  ``FragValues``, a ``Protein``, an ``IndexedCoM`` and a ``FragStatistics``.
  The fragment itself keeps the atom count, the list of residue numbers seen
  so far and the file base path ``<modelPath>/Frag<num>``.
  """

  def __init__(self, num, modelPath, partial):
    """Create an empty fragment number *num* stored below *modelPath*."""
    # Identity and file naming (number formatted with "{:04n}").
    self.num = num
    self.partial = partial
    self.basename = "Frag{:04n}".format(num)
    self.basepath = path.join(modelPath, self.basename)

    # Helper objects, constructed in the original order.
    self.values = FragValues()
    self.protein = Protein()
    self.center = IndexedCoM()
    self.stat = FragStatistics()

    # Plain containers and counters, filled while parsing.
    self.atomCount = 0
    self.residues = []
    self.resCenters = []
    self.diffMat = []
    self.sdf = []

  def calcValues(self, viscosity=0.0, HarmMe=0.0, radius=0.0):
    """Forward the three parameters to ``self.values.calcValues``."""
    self.values.calcValues(viscosity=viscosity, HarmMe=HarmMe, radius=radius)

  def addAtom(self, atomName, resNum, resName, point):
    """Register one atom with the centre, the protein and the residue list.

    For residues numbered above 1, the position of an atom named ``N`` or
    ``H`` (after stripping whitespace) is also stored on the protein.
    """
    weight = atomWeight(atomName)
    self.center.addPoint(resNum, weight, point=point)
    self.protein.addResidue(resNum, resName)
    self.atomCount += 1
    if resNum not in self.residues:
      self.residues.append(resNum)
    if resNum > 1:
      bare_name = atomName.strip()
      if bare_name == "N":
        self.protein.setNpos(resNum, point)
      elif bare_name == "H":
        self.protein.setHpos(resNum, point)

  def getWeight(self):
    """Return ``self.protein.getWeight()``."""
    return self.protein.getWeight()

  def getProtons(self):
    """Return ``self.protein.getProtons()``."""
    return self.protein.getProtons()

  def hasResidue(self, num):
    """Return True when residue number *num* has been added, else False."""
    return num in self.residues

  def getCenter(self):
    """Return ``self.center.getCenter()``."""
    return self.center.getCenter()

  def getEta(self):
    """Return ``self.values.getEta()``."""
    return self.values.getEta()

  def getR(self, corr):
    """Return ``self.values.getR(corr)``."""
    return self.values.getR(corr)

  def getHM(self, corr):
    """Return ``self.values.getHM(corr)``."""
    return self.values.getHM(corr)

  def getPDB(self):
    """Return the base path with a ``.pdb`` suffix."""
    return self.basepath + ".pdb"

  def getDat(self):
    """Return the base path with a ``.dat`` suffix."""
    return self.basepath + ".dat"

  def doneParsing(self):  # This function simply exists to free memory
    self.protein.done()