def handle(self, *args, **options):
        """Build Protein, ProteinConformation and Residue records for the
        alpha subunit of every SignprotComplex.

        When ``options['purge']`` is set, existing records whose protein
        entry name ends in ``_a`` and whose family sits two levels below the
        'Alpha' family are deleted first.

        For each complex the alpha chain is read from the stored PDB text,
        every residue that has a CA atom is paired with the reference-protein
        residue of the same sequence number, mismatching pairs are reconciled
        against the SEQADV records of the PDB file and, if any remain, against
        a pairwise sequence alignment. One Residue per structure residue is
        then bulk-created. A failure for one complex is printed and logged,
        and processing continues with the next complex.
        """
        self.options = options
        if self.options['purge']:
            # Remove dependants first: residues, then conformations, then
            # the proteins themselves.
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name=
                'Alpha').delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()

        # Captured groups: 1 = residue name in the structure, 2 = chain id,
        # 3 = residue number in the structure, 4 = database accession field,
        # 5 = database residue name, 6 = database residue number,
        # 7 = trailing annotation. Compiled once, used for every complex.
        seqadv_pattern = re.compile(
            r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)'
            r'[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)')

        # Building protein and protconf objects for g protein structure in complex
        for sc in SignprotComplex.objects.all():
            self.logger.info(
                'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                .format(sc))
            try:
                pdb_code = sc.structure.pdb_code.index
                entry_name = pdb_code.lower() + '_a'

                # Alpha subunit: reuse the protein if it was built before,
                # otherwise derive a new one from the reference protein.
                try:
                    alpha_protein = Protein.objects.get(entry_name=entry_name)
                except Protein.DoesNotExist:
                    alpha_protein = Protein()
                    alpha_protein.entry_name = entry_name
                    alpha_protein.accession = None
                    alpha_protein.name = entry_name
                    alpha_protein.sequence = sc.protein.sequence
                    alpha_protein.family = sc.protein.family
                    alpha_protein.parent = sc.protein
                    alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                    alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                        slug='mod')
                    alpha_protein.source = ProteinSource.objects.get(
                        name='OTHER')
                    alpha_protein.species = sc.protein.species
                    alpha_protein.save()
                try:
                    alpha_protconf = ProteinConformation.objects.get(
                        protein__entry_name=entry_name)
                except ProteinConformation.DoesNotExist:
                    alpha_protconf = ProteinConformation()
                    alpha_protconf.protein = alpha_protein
                    alpha_protconf.state = ProteinState.objects.get(
                        slug='active')
                    alpha_protconf.save()

                pdb_text = sc.structure.pdb_data.pdb
                pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                structure = pdbp.get_structure('struct', StringIO(pdb_text))
                chain = structure[0][sc.alpha]

                # Sequence numbers of the chain residues that have a CA atom.
                nums = []
                for res in chain:
                    try:
                        res['CA']
                    except KeyError:
                        continue
                    nums.append(res.get_id()[1])

                resis = Residue.objects.filter(
                    protein_conformation__protein=sc.protein)
                pdb_num_dict = OrderedDict()
                # Create first alignment based on sequence numbers
                for n in nums:
                    # 6OIJ: structure numbers below 30 are offset by 6
                    # relative to the reference numbering.
                    if pdb_code == '6OIJ' and n < 30:
                        nr = n + 6
                    else:
                        nr = n
                    pdb_num_dict[n] = [chain[n], resis.get(sequence_number=nr)]

                # Find mismatches
                mismatches = [
                    pair for pair in pdb_num_dict.values()
                    if AA[pair[0].get_resname()] != pair[1].amino_acid
                ]

                seqadv = [
                    line for line in pdb_text.split('\n')
                    if line.startswith('SEQADV')
                ]
                mutations, shifted_mutations = OrderedDict(), OrderedDict()
                # Search for annotated engineered mutations in pdb SEQADV
                for line in seqadv:
                    line_search = seqadv_pattern.search(line)
                    if line_search is None:
                        continue
                    if line_search.group(2) != sc.alpha:
                        continue
                    struct_resname = line_search.group(1)
                    struct_num = line_search.group(3)
                    db_accession = line_search.group(4).strip()
                    db_resname = line_search.group(5)
                    db_num = line_search.group(6)
                    if db_accession == sc.protein.accession:
                        if struct_num == db_num:
                            mutations[int(struct_num)] = [
                                struct_resname, db_resname
                            ]
                        else:
                            # Same accession but different numbering: keep
                            # the database number to re-map the residue later.
                            shifted_mutations[int(struct_num)] = [
                                struct_resname, db_resname,
                                int(db_num)
                            ]
                    else:
                        # Records annotated against another accession
                        # (exceptions for 6G79 and 5G53) are always stored as
                        # in-place mutations.
                        mutations[int(struct_num)] = [
                            struct_resname, db_resname
                        ]

                # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                remaining_mismatches = []
                for m in mismatches:
                    num = m[0].get_id()[1]
                    if num in mutations:
                        # NOTE(review): the mismatch is kept only when BOTH
                        # comparisons disagree with SEQADV - confirm that
                        # 'and' (not 'or') is intended.
                        if m[0].get_resname() != mutations[num][0] and m[
                                1].amino_acid != AA[mutations[num][1]]:
                            remaining_mismatches.append(m)
                    else:
                        # Includes residues listed in shifted_mutations; they
                        # are re-mapped in the adjustment loop below.
                        remaining_mismatches.append(m)

                # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                if remaining_mismatches and pdb_code not in [
                        '6OIJ', '6OY9', '6OYA'
                ]:
                    ppb = PPBuilder()
                    seq = ''.join(
                        str(pp.get_sequence())
                        for pp in ppb.build_peptides(chain, aa_only=False))
                    pw2 = pairwise2.align.localms(sc.protein.sequence, seq, 2,
                                                  -1, -.5, -.1)
                    ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                    # Maps structure residues to reference residues, or to
                    # the alignment column index where one side is a gap.
                    pdb_wt_dict = OrderedDict()
                    j, k = 0, 0  # cursors into resis and nums
                    for i, (ref, temp) in enumerate(zip(ref_seq, temp_seq)):
                        if ref != '-' and temp != '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = resis[j]
                            j += 1
                            k += 1
                        elif ref == '-':
                            pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                            k += 1
                        elif temp == '-':
                            pdb_wt_dict[i] = resis[j]
                            j += 1
                    for i, r in enumerate(remaining_mismatches):
                        res_num = r[0].get_id()[1]
                        # NOTE(review): for i == 0 the index i - 1 wraps to
                        # the last mismatch - confirm this is acceptable.
                        prev_num = remaining_mismatches[i - 1][0].get_id()[1]
                        # Adjust for shifted residue when residue is a match
                        if res_num - prev_num > 1:
                            pdb_num_dict[res_num - 1][1] = pdb_wt_dict[
                                chain[res_num - 1]]
                        # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                        if res_num in shifted_mutations:
                            pdb_num_dict[res_num][1] = resis.get(
                                sequence_number=shifted_mutations[res_num][2])
                        # Adjust for shift
                        else:
                            pdb_num_dict[res_num][1] = pdb_wt_dict[r[0]]

                # Number and amino acid come from the structure residue,
                # generic numbers and segment from the paired reference.
                bulked_residues = []
                for struct_res, ref_res in pdb_num_dict.values():
                    res_obj = Residue()
                    res_obj.sequence_number = struct_res.get_id()[1]
                    res_obj.amino_acid = AA[struct_res.get_resname()]
                    res_obj.display_generic_number = ref_res.display_generic_number
                    res_obj.generic_number = ref_res.generic_number
                    res_obj.protein_conformation = alpha_protconf
                    res_obj.protein_segment = ref_res.protein_segment
                    bulked_residues.append(res_obj)
                Residue.objects.bulk_create(bulked_residues)
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                    .format(sc))
            except Exception as msg:
                # Best effort: report the failure and move on to the next
                # complex instead of aborting the whole build.
                print(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                    .format(sc))
                print(msg)
                self.logger.error(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed'
                    .format(sc))
Ejemplo n.º 2
0
    def can_create_arrestins(self, family, residue_numbering_scheme, accession,
                             uniprot):
        """Create a wild-type Protein from a parsed UniProt record together
        with its aliases, genes and structures.

        Args:
            family: value assigned to ``Protein.family``.
            residue_numbering_scheme: value assigned to
                ``Protein.residue_numbering_scheme``.
            accession: accession to store; left unset when falsy.
            uniprot: mapping read for the keys ``source``,
                ``species_latin_name``, ``species_common_name``,
                ``entry_name``, ``names``, ``sequence``, ``genes`` and
                ``structures`` (each structure is indexed as
                ``[PDB code, resolution]``).

        Raises:
            Protein.DoesNotExist: if saving the protein failed and no protein
                with the lower-cased entry name is stored.
        """
        # get/create protein source
        try:
            source, created = ProteinSource.objects.get_or_create(
                name=uniprot['source'], defaults={'name': uniprot['source']})
            if created:
                self.logger.info('Created protein source ' + source.name)
        except IntegrityError:
            source = ProteinSource.objects.get(name=uniprot['source'])

        # get/create species
        try:
            species, created = Species.objects.get_or_create(
                latin_name=uniprot['species_latin_name'],
                defaults={
                    'common_name': uniprot['species_common_name'],
                })
            if created:
                self.logger.info('Created species ' + species.latin_name)
        except IntegrityError:
            species = Species.objects.get(
                latin_name=uniprot['species_latin_name'])

        # get/create protein sequence type
        # Wild-type for all sequences from source file
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(
                slug='wt', defaults={
                    'slug': 'wt',
                    'name': 'Wild-type',
                })
            if created:
                self.logger.info('Created protein sequence type Wild-type')
        except IntegrityError:
            # Same recovery as for source and species: fetch the stored row.
            sequence_type = ProteinSequenceType.objects.get(slug='wt')
        except Exception:
            # Without a sequence type the protein cannot be built, so report
            # and surface the real error instead of failing later on an
            # unbound name.
            self.logger.error(
                'Failed creating protein sequence type Wild-type')
            raise

        entry_name = uniprot['entry_name'].lower()

        # create protein
        p = Protein()
        p.family = family
        p.species = species
        p.source = source
        p.residue_numbering_scheme = residue_numbering_scheme
        p.sequence_type = sequence_type

        if accession:
            p.accession = accession
        p.entry_name = entry_name
        p.name = uniprot['names'][0]
        p.sequence = uniprot['sequence']

        try:
            p.save()
            self.logger.info('Created protein {}'.format(p.entry_name))
        except Exception:
            self.logger.error('Failed creating protein {}'.format(
                p.entry_name))

        # Related rows are attached to the protein as stored in the database,
        # looked up once by entry name (presumably so that an already
        # existing entry is reused when the save above failed).
        pcan = Protein.objects.get(entry_name=entry_name)

        # protein aliases
        for i, alias in enumerate(uniprot['names']):
            a = ProteinAlias()
            a.protein = pcan
            a.name = alias
            a.position = i

            try:
                a.save()
                self.logger.info('Created protein alias ' + a.name +
                                 ' for protein ' + p.name)
            except Exception:
                self.logger.error('Failed creating protein alias ' + a.name +
                                  ' for protein ' + p.name)

        # genes
        # g is defined up front because the structures loop below tests it;
        # it stays falsy when the record lists no genes.
        g = False
        for i, gene in enumerate(uniprot['genes']):
            try:
                g, created = Gene.objects.get_or_create(name=gene,
                                                        species=species,
                                                        position=i)
                if created:
                    self.logger.info('Created gene ' + g.name +
                                     ' for protein ' + p.name)
            except IntegrityError:
                g = Gene.objects.get(name=gene, species=species, position=i)

            if g:
                g.proteins.add(pcan)

        # structures
        for entry in uniprot['structures']:
            pdb_code, res = entry[0], entry[1]
            if res == '-':
                # '-' marks a missing resolution; stored as 0
                res = 0

            try:
                structure, created = SignprotStructure.objects.get_or_create(
                    PDB_code=pdb_code, resolution=res)
                if created:
                    self.logger.info('Created structure ' +
                                     structure.PDB_code + ' for protein ' +
                                     p.name)
            except IntegrityError:
                # No structure object exists in this branch, so log the raw
                # code and skip linking.
                self.logger.error(
                    'Failed creating structure {} for protein {}'.format(
                        pdb_code, p.name))
                continue

            # NOTE(review): linking depends on the last gene processed above;
            # confirm that structures should stay unlinked when the record
            # has no genes.
            if g:
                structure.origin.add(pcan)
                structure.save()
Ejemplo n.º 3
0
    def create_protein(self, name, family, sequence_type,
                       residue_numbering_scheme, accession, uniprot):
        """Create a Protein from a parsed UniProt record, plus its default
        conformation, aliases and genes.

        Args:
            name: display name stored on the protein.
            family: value assigned to ``Protein.family``.
            sequence_type: value assigned to ``Protein.sequence_type``.
            residue_numbering_scheme: value assigned to
                ``Protein.residue_numbering_scheme``.
            accession: accession to store; left unset when falsy.
            uniprot: mapping read for the keys ``source``,
                ``species_latin_name``, ``species_common_name``,
                ``entry_name``, ``sequence``, ``names`` and ``genes``.

        A failed protein or alias save is logged and not re-raised.
        """
        # get/create protein source
        try:
            source, created = ProteinSource.objects.get_or_create(
                name=uniprot['source'], defaults={'name': uniprot['source']})
            if created:
                self.logger.info('Created protein source ' + source.name)
        except IntegrityError:
            source = ProteinSource.objects.get(name=uniprot['source'])

        # get/create species
        try:
            species, created = Species.objects.get_or_create(
                latin_name=uniprot['species_latin_name'],
                defaults={
                    'common_name': uniprot['species_common_name'],
                })
            if created:
                self.logger.info('Created species ' + species.latin_name)
        except IntegrityError:
            species = Species.objects.get(
                latin_name=uniprot['species_latin_name'])

        # create protein
        p = Protein()
        p.family = family
        p.species = species
        p.source = source
        p.residue_numbering_scheme = residue_numbering_scheme
        p.sequence_type = sequence_type
        if accession:
            p.accession = accession
        p.entry_name = uniprot['entry_name']
        p.name = name
        p.sequence = uniprot['sequence']

        try:
            p.save()
            self.logger.info('Created protein {}'.format(p.entry_name))
        except Exception:
            # Logged only; the steps below still run against this instance.
            self.logger.error('Failed creating protein {}'.format(
                p.entry_name))

        # protein conformations
        try:
            ps, created = ProteinState.objects.get_or_create(
                slug=settings.DEFAULT_PROTEIN_STATE,
                defaults={'name': settings.DEFAULT_PROTEIN_STATE.title()})
        except IntegrityError:
            ps = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)

        ProteinConformation.objects.create(protein=p, state=ps)

        # protein aliases
        for i, alias in enumerate(uniprot['names']):
            a = ProteinAlias()
            a.protein = p
            a.name = alias
            a.position = i

            try:
                a.save()
                self.logger.info('Created protein alias ' + a.name +
                                 ' for protein ' + p.name)
            except Exception:
                self.logger.error('Failed creating protein alias ' + a.name +
                                  ' for protein ' + p.name)

        # genes
        for i, gene in enumerate(uniprot['genes']):
            try:
                g, created = Gene.objects.get_or_create(name=gene,
                                                        species=species,
                                                        position=i)
                if created:
                    self.logger.info('Created gene ' + g.name +
                                     ' for protein ' + p.name)
            except IntegrityError:
                g = Gene.objects.get(name=gene, species=species, position=i)

            if g:
                g.proteins.add(p)
Ejemplo n.º 4
0
    def cgn_creat_gproteins(self, family, residue_numbering_scheme, accession, uniprot):
        """Create a wild-type G protein Protein from a parsed UniProt record
        together with its aliases, genes and structures.

        Args:
            family: value assigned to ``Protein.family``.
            residue_numbering_scheme: value assigned to
                ``Protein.residue_numbering_scheme``.
            accession: accession to store; left unset when falsy.
            uniprot: mapping read for the keys ``source``,
                ``species_latin_name``, ``species_common_name``,
                ``entry_name``, ``names``, ``sequence``, ``genes`` and
                ``structures`` (each structure is indexed as
                ``[PDB code, resolution]``).

        Raises:
            IndexError: if the first name does not contain the
                'Guanine nucleotide-binding protein ' prefix.
            Protein.DoesNotExist: if saving the protein failed and no protein
                with the lower-cased entry name is stored.
        """
        # get/create protein source
        try:
            source, created = ProteinSource.objects.get_or_create(name=uniprot['source'],
                defaults={'name': uniprot['source']})
            if created:
                self.logger.info('Created protein source ' + source.name)
        except IntegrityError:
            source = ProteinSource.objects.get(name=uniprot['source'])

        # get/create species
        try:
            species, created = Species.objects.get_or_create(latin_name=uniprot['species_latin_name'],
                defaults={
                'common_name': uniprot['species_common_name'],
                })
            if created:
                self.logger.info('Created species ' + species.latin_name)
        except IntegrityError:
            species = Species.objects.get(latin_name=uniprot['species_latin_name'])

        # get/create protein sequence type
        # Wild-type for all sequences from source file
        try:
            sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='wt',
                defaults={
                'slug': 'wt',
                'name': 'Wild-type',
                })
            if created:
                self.logger.info('Created protein sequence type Wild-type')
        except IntegrityError:
            # Same recovery as for source and species: fetch the stored row.
            sequence_type = ProteinSequenceType.objects.get(slug='wt')
        except Exception:
            # Without a sequence type the protein cannot be built, so report
            # and surface the real error instead of failing later on an
            # unbound name.
            self.logger.error('Failed creating protein sequence type Wild-type')
            raise

        entry_name = uniprot['entry_name'].lower()

        # create protein
        p = Protein()
        p.family = family
        p.species = species
        p.source = source
        p.residue_numbering_scheme = residue_numbering_scheme
        p.sequence_type = sequence_type

        if accession:
            p.accession = accession
        p.entry_name = entry_name
        # Keep only the text following the common prefix of the first name.
        p.name = uniprot['names'][0].split('Guanine nucleotide-binding protein ')[1]
        p.sequence = uniprot['sequence']

        try:
            p.save()
            self.logger.info('Created protein {}'.format(p.entry_name))
        except Exception:
            self.logger.error('Failed creating protein {}'.format(p.entry_name))

        # Related rows are attached to the protein as stored in the database,
        # looked up once by entry name (presumably so that an already
        # existing entry is reused when the save above failed).
        pcgn = Protein.objects.get(entry_name=entry_name)

        # protein aliases
        for i, alias in enumerate(uniprot['names']):
            a = ProteinAlias()
            a.protein = pcgn
            a.name = alias
            a.position = i

            try:
                a.save()
                self.logger.info('Created protein alias ' + a.name + ' for protein ' + p.name)
            except Exception:
                self.logger.error('Failed creating protein alias ' + a.name + ' for protein ' + p.name)

        # genes
        # g is defined up front because the structures loop below tests it;
        # it stays falsy when the record lists no genes.
        g = False
        for i, gene in enumerate(uniprot['genes']):
            try:
                g, created = Gene.objects.get_or_create(name=gene, species=species, position=i)
                if created:
                    self.logger.info('Created gene ' + g.name + ' for protein ' + p.name)
            except IntegrityError:
                g = Gene.objects.get(name=gene, species=species, position=i)

            if g:
                g.proteins.add(pcgn)

        # structures
        for entry in uniprot['structures']:
            pdb_code, res = entry[0], entry[1]
            if res == '-':
                # '-' marks a missing resolution; stored as 0
                res = 0

            try:
                structure, created = SignprotStructure.objects.get_or_create(PDB_code=pdb_code, resolution=res)
                if created:
                    self.logger.info('Created structure ' + structure.PDB_code + ' for protein ' + p.name)
            except IntegrityError:
                # No structure object exists in this branch, so log the raw
                # code and skip linking.
                self.logger.error('Failed creating structure {} for protein {}'.format(pdb_code, p.name))
                continue

            # NOTE(review): linking depends on the last gene processed above;
            # confirm that structures should stay unlinked when the record
            # has no genes.
            if g:
                structure.origin.add(pcgn)
                structure.save()
Ejemplo n.º 5
0
    def create_constructs(self, filenames):
        self.logger.info('CREATING CONSTRUCTS')
        
        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.construct_data_dir)

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)

                    # is a protein specified?
                    if 'protein' not in sd:
                        self.logger.error('Protein not specified for construct, skipping')
                        continue

                    # fetch the parent protein
                    try:
                        ppc = ProteinConformation.objects.select_related('protein__family', 'protein__species',
                            'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        # abort if parent protein is not found
                        self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                            sd['protein'], sd['name']))
                        continue

                    if not Protein.objects.filter(name=sd['name']).exists():
                        # create a protein record
                        p = Protein()
                        p.parent = ppc.protein
                        p.family = ppc.protein.family
                        p.species = ppc.protein.species
                        p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                        p.sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                            defaults={'name': 'Modified'})
                        p.source, created = ProteinSource.objects.get_or_create(name='OTHER')
                        p.entry_name = slugify(strip_tags(sd['name']))
                        p.name = sd['name']
                        p.sequence = ppc.protein.sequence
                        # save protein (construct)
                        try:
                            p.save()
                            self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                                ppc.protein.entry_name))
                        except Exception as e:
                            print(e)
                            self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                                ppc.protein.entry_name))
                            continue
                    else:
                        p = Protein.objects.get(name=sd['name'])

                    if not ProteinConformation.objects.filter(protein=p).exists():
                        # create protein conformation record
                        pc = ProteinConformation()
                        pc.protein = p
                        pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
                        try:
                            pc.save()
                            self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
                        except:
                            self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                                p.entry_name))

                    # # create residue records
                    # deletions = []
                    # deletions_list = []
                    # if 'deletions' in sd and sd['deletions']:
                    #     for t in sd['deletions']:
                    #         deletions += list(range(t[0],t[1]+1))
                    #         deletions_list.append(str(t[0])+'-'+str(t[1])) 
                    # s = ","
                    # deletion_string = s.join(deletions_list)
                         

                    # mutations = {}
                    # if 'mutations' in sd and sd['mutations']:
                    #     for m in sd['mutations']:
                    #         res_num = m[1:-1]
                    #         mutations[res_num] = {
                    #             'wt_res': m[0],
                    #             'mut_res': m[-1],
                    #             'full': m,
                    #         }
                    
                    # # Create construct record
                    # c = Construct()            
                    # c.protein_conformation = pc
                    # c.deletions =  deletion_string
                    # c.save()
                      

                    # Create Auxiliary proteins
#                    if 'auxiliary_proteins' in sd and sd['auxiliary_proteins']:
#                        ap = AuxProtein()
#                        ap.construct = c
#                        apct = AuxProteinType.objects.create()
                       # ap.protein_type = apct 
#                        apct.save()
#                        if 'remarks' in sd['auxiliary_proteins']:
#                            ap.remarks = sd['auxiliary_proteins']['remarks']
#                        ap.save()
 

#                        for step in sd['auxiliary_proteins']:
#                            if 'type' in step and 'name' in step and'sequence' in step:
#                                ap.protein_type = apct
                 #              ap.protein_type, created = AuxProteinType.objects.get_or_create()
#                                ap.name = sd['auxiliary_proteins']['name']
#                                ap.uniprot_id = sd['auxiliary_proteins']['uniprot_id']
#                                ap.sequence = sd['auxiliary_proteins']['sequence']
                                #mutations if any to be included from mutation model along with reason of mutation
#                                ap.position = sd['auxiliary_proteins']['position']
#                                ap.deletions = sd['auxiliary_proteins']['deletions']
                                
#                            else:
#                                self.logger.error('Auxiliary protein step incorrectly defined for {}'.format(p))



                   #   # create expression records
                   #  if 'expression_sys' in sd and sd['expression_sys']:
                   #      ce = ConstructExpression()           
                   #      ce.construct = c
                   #      ce.expression_system, created = ConstructExpressionSystem.objects.get_or_create(expression_method = sd['expression_sys']['expression_method'], host_cell_type = sd['expression_sys']['host_cell_type'], host_cell = sd['expression_sys']['host_cell'])
                   #      if 'remarks' in sd:
                   #          ce.remarks = sd['expression_sys']['remarks']
                   #      ce.save()
 
               
                   #  # create solubilization records
                   #  if 'solubilization' in sd and sd['solubilization'] and 'steps' in sd['solubilization'] and sd['solubilization']['steps']:
                   #      so = ConstructSolubilization()
                   #      so.construct = c
                   #      cl = ChemicalList.objects.create()
                   #      so.chemical_list = cl 

                   #      for step in sd['solubilization']['steps']:
                   #          if 'type' in step and 'item' in step and'concentration' in step:
                   #              chem = Chemical()
                   #              chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 
                   #              chem.name =  step['item']
                   #              chem.save()

                   #              cc = ChemicalConc()
                   #              cc.concentration = step['concentration']
                   #              cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                   #              cc.save()
                   #              cl.chemicals.add(cc)                          
                   #          else:
                   #              self.logger.error('Solubilization step incorrectly defined for {}'.format(p))                                 

                   #      if 'remarks' in sd['solubilization']:
                   #          so.remarks = sd['solubilization']['remarks']
                   #      so.save()



                   #  # create  purification records
                   #  if 'purification' in sd and sd['purification'] and sd['purification']['steps']:
                   #      pu = ConstructPurification()
                   #      pu.construct = c
                   #      if 'remarks' in sd['purification']:
                   #          pu.remarks = sd['purification']['remarks']
                   #      pu.save() 
                   #      for step in sd['purification']['steps']:
                   #          if 'type' in step and 'description' in step:
                   #              pust = PurificationStep()
                   #              pust.description = step['description']
                   #              pust.purification = pu
                   #              pust.purification_type, created = PurificationStepType.objects.get_or_create(name = step['type'] ) # 2 values returned by get_or_create
                   #              if created: 
                   #                  self.logger.info('Created purification step type {}'.format(pust.purification_type))
                   #              pust.save()

                   #          else:
                   #              self.logger.error('Purification step incorrectly defined for {}'.format(p))

                        


                   # # create crystallization records
                   #  if 'crystallization' in sd and sd['crystallization']: 
                   #      cy = ConstructCrystallization()
                   #      cy.construct = c
                   #      cyt = CrystallizationMethodTypes.objects.create()
                   #      cy.crystal_type = cyt
                   #      cy.method = sd['crystallization']['method']
                   #      cy.settings = sd['crystallization']['settings']
                   #      cy.protein_conc = sd['crystallization']['protein_conc']
                   #      cl = ChemicalList.objects.create()
                   #      cy.chemical_list = cl   

                   #      for step in sd['crystallization']['chemicallist']:
                   #          if 'type' in step and 'item' in step and'concentration' in step:
                   #              chem = Chemical()
                   #              chem.chemical_type,  created = ChemicalType.objects.get_or_create(name = step['type']) 
                                
                   #              chem.name =  step['item']
                   #              chem.save()
                   #              cc = ChemicalConc()
                   #              cc.concentration = step['concentration']
                   #              cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                   #              cc.save()
                                
                   #              cl.chemicals.add(cc)                          
                   #          else:
                   #              self.logger.error('Crystallization step incorrectly defined for {}'.format(p))                        

                   #      cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
                   #      cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
                   #      cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
                   #      cy.temp = sd['crystallization']['temperature']
                   #      cy.ph = sd['crystallization']['ph']  


                   #      if 'remarks' in sd['crystallization']:
                   #          cy.remarks = sd['crystallization']['remarks']
                   #      cy.save()

                                     
                   #  # fusion proteins
                   #  split_segments = {}
                   #  if 'fusion_proteins' in sd and sd['fusion_proteins']:
                   #      for fp in sd['fusion_proteins']:
                   #          fp_start = Residue.objects.get(protein_conformation=ppc,
                   #              sequence_number=fp['positions'][0])
                   #          fp_end = Residue.objects.get(protein_conformation=ppc, sequence_number=fp['positions'][1])
                   #          # if the fusion protein is inserted within only one segment (the usual case), split that
                   #          # segment into two segments
                   #          if fp_start and fp_start.protein_segment == fp_end.protein_segment:
                   #              # get/create split protein segments
                   #              segment_before, created = ProteinSegment.objects.get_or_create(
                   #                  slug=fp_start.protein_segment.slug+"_1", defaults={
                   #                  'name': fp_start.protein_segment.name,
                   #                  'category': fp_start.protein_segment.category,
                   #                  'partial': True})
                   #              segment_after, created = ProteinSegment.objects.get_or_create(
                   #                  slug=fp_start.protein_segment.slug+"_2", defaults={
                   #                  'name': fp_start.protein_segment.name,
                   #                  'category': fp_start.protein_segment.category,
                   #                  'partial': True})

                   #              # keep track of  information about split segments
                   #              split_segments[fp_start.protein_segment.slug] = {
                   #                  'start': {
                   #                      'sequence_number': fp['positions'][0],
                   #                      'segment': segment_before,
                   #                  },
                   #                  'end': {
                   #                      'sequence_number': fp['positions'][1],
                   #                      'segment': segment_after,
                   #                  },
                   #              }

                   #          # get/insert fusion protein
                   #          fusion, create = ProteinFusion.objects.get_or_create(name=fp['name'], defaults={
                   #              'sequence': fp['sequence']})

                   #          # create relationship with protein
                   #          ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                   #              segment_before=segment_before, segment_after=segment_after)

                    # prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                    #     'protein_conformation__protein', 'protein_segment', 'generic_number',
                    #     'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
                    # updated_sequence = ''
                    # for pr in prs:
                    #     if pr.sequence_number not in deletions:
                    #         r = Residue()
                    #         r.protein_conformation = pc
                    #         r.generic_number = pr.generic_number
                    #         r.display_generic_number = pr.display_generic_number
                    #         r.sequence_number = pr.sequence_number
                            
                    #         # check for split segments
                    #         if pr.protein_segment.slug in split_segments:
                    #             rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                    #             rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                    #             if r.sequence_number <= rsns:
                    #                 r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                    #             elif r.sequence_number >= rsne:
                    #                 r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                    #         else:
                    #             r.protein_segment = pr.protein_segment

                    #         # amino acid, check for mutations
                    #         if r.sequence_number in mutations:
                    #             if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                    #                 r.amino_acid = mutations[r.sequence_number]['mut_res']
                    #             else:
                    #                 self.logger.error('Mutation {} in construct {} does not match wild-type sequence' \
                    #                     + ' of {}'.format(mutations[r.sequence_number]['full'], pc.protein.name,
                    #                     ppc.protein.entry_name))
                    #         else:
                    #             r.amino_acid = pr.amino_acid

                    #         # save amino acid to updated sequence
                    #         updated_sequence += r.amino_acid

                    #         # save residue before populating M2M relations
                    #         r.save()

                    #         # alternative generic numbers
                    #         agns = pr.alternative_generic_numbers.all()
                    #         for agn in agns:
                    #             r.alternative_generic_numbers.add(agn)
                    
                    # # update sequence
                    # p.sequence = updated_sequence
                    # p.save()

        self.logger.info('COMPLETED CREATING CONSTRUCTS')
Ejemplo n.º 6
0
    def create_protein(self, name, family, sequence_type, residue_numbering_scheme, accession, uniprot):
        """Create a protein together with its conformation, aliases and genes.

        Args:
            name: Value stored as ``Protein.name``.
            family: Value stored as ``Protein.family``.
            sequence_type: Value stored as ``Protein.sequence_type``.
            residue_numbering_scheme: Value stored as
                ``Protein.residue_numbering_scheme``.
            accession: Value stored as ``Protein.accession``; left unset when
                falsy.
            uniprot: Mapping providing the keys ``source``,
                ``species_latin_name``, ``species_common_name``,
                ``entry_name``, ``sequence``, ``names`` and ``genes``.

        Returns:
            None. Failures while saving the protein or one of its aliases are
            logged rather than raised.
        """
        # get/create protein source
        try:
            source, created = ProteinSource.objects.get_or_create(name=uniprot['source'],
                defaults={'name': uniprot['source']})
            if created:
                self.logger.info('Created protein source ' + source.name)
        except IntegrityError:
            # the insert was rejected; fall back to a plain lookup
            source = ProteinSource.objects.get(name=uniprot['source'])

        # get/create species
        try:
            species, created = Species.objects.get_or_create(latin_name=uniprot['species_latin_name'],
                defaults={
                'common_name': uniprot['species_common_name'],
                })
            if created:
                self.logger.info('Created species ' + species.latin_name)
        except IntegrityError:
            # the insert was rejected; fall back to a plain lookup
            species = Species.objects.get(latin_name=uniprot['species_latin_name'])

        # create protein
        p = Protein()
        p.family = family
        p.species = species
        p.source = source
        p.residue_numbering_scheme = residue_numbering_scheme
        p.sequence_type = sequence_type
        if accession:
            p.accession = accession
        p.entry_name = uniprot['entry_name']
        p.name = name
        p.sequence = uniprot['sequence']

        # NOTE(review): a failed save is only logged, and the code below still
        # runs with the unsaved protein -- confirm whether it should return here.
        try:
            p.save()
            self.logger.info('Created protein {}'.format(p.entry_name))
        except Exception as e:
            self.logger.error('Failed creating protein {} {}'.format(p.entry_name, str(e)))

        # protein conformations
        try:
            ps, created = ProteinState.objects.get_or_create(slug=settings.DEFAULT_PROTEIN_STATE,
                defaults={'name': settings.DEFAULT_PROTEIN_STATE.title()})
        except IntegrityError:
            ps = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)

        ProteinConformation.objects.create(protein=p, state=ps)

        # protein aliases, stored in the order given by uniprot['names']
        for i, alias in enumerate(uniprot['names']):
            a = ProteinAlias()
            a.protein = p
            a.name = alias
            a.position = i

            try:
                a.save()
                self.logger.info('Created protein alias ' + a.name + ' for protein ' + p.name)
            except Exception as e:
                # best effort: one bad alias must not abort the remaining ones,
                # but interpreter-level exits (KeyboardInterrupt, SystemExit)
                # are no longer swallowed and the reason is reported
                self.logger.error('Failed creating protein alias ' + a.name + ' for protein ' + p.name
                    + ' ' + str(e))

        # genes, stored in the order given by uniprot['genes']
        for i, gene in enumerate(uniprot['genes']):
            try:
                g, created = Gene.objects.get_or_create(name=gene, species=species, position=i)
                if created:
                    self.logger.info('Created gene ' + g.name + ' for protein ' + p.name)
            except IntegrityError:
                g = Gene.objects.get(name=gene, species=species, position=i)

            g.proteins.add(p)
Ejemplo n.º 7
0
    def create_constructs(self, filenames):
        """Create construct records from YAML files in ``self.construct_data_dir``.

        Each regular, non-hidden file is parsed as YAML. For every document
        that names a known parent protein, a new ``Protein`` and
        ``ProteinConformation`` are created along with a ``Construct`` and the
        optional expression, solubilization, purification, crystallization and
        fusion-protein records. The parent's residues are then copied to the
        new conformation, skipping deleted positions and applying mutations,
        and the resulting sequence is stored on the new protein.

        Top-level YAML keys read here: ``protein``, ``name``, ``deletions``,
        ``mutations``, ``expression_sys``, ``solubilization``,
        ``purification``, ``crystallization`` and ``fusion_proteins``.

        Args:
            filenames: Names of files inside ``self.construct_data_dir`` to
                parse. When empty or ``None``, every entry of that directory
                is considered.

        Returns:
            None.
        """
        self.logger.info('CREATING CONSTRUCTS')

        # what files should be parsed?
        if not filenames:
            filenames = os.listdir(self.construct_data_dir)

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            # skip directories, missing entries and hidden files
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file
            # NOTE(review): yaml.load without an explicit Loader can construct
            # arbitrary Python objects and is rejected by recent PyYAML
            # releases; consider yaml.safe_load if these files never rely on
            # Python-specific tags.
            with open(source_file_path, 'r') as f:
                sd = yaml.load(f)

            # is a protein specified? (an empty file parses to None)
            if not isinstance(sd, dict) or 'protein' not in sd:
                self.logger.error('Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.select_related('protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # skip this construct if the parent protein is not found
                self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                    sd['protein'], sd['name']))
                continue

            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                defaults={'name': 'Modified'})
            p.source, created = ProteinSource.objects.get_or_create(name='OTHER')
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence
            # save protein (construct)
            try:
                p.save()
                self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
            except Exception as e:
                print(e)
                self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
                continue

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
            except Exception:
                self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                    p.entry_name))

            # deleted positions: each entry is an inclusive [start, end] pair
            deletions = set()
            deletions_list = []
            for t in sd.get('deletions') or []:
                deletions.update(range(t[0], t[1] + 1))
                deletions_list.append(str(t[0]) + '-' + str(t[1]))
            deletion_string = ','.join(deletions_list)

            # mutations are written as <wild type><position><mutant>; they are
            # keyed by the integer position because they are looked up with
            # Residue.sequence_number below
            mutations = {}
            for m in sd.get('mutations') or []:
                try:
                    res_num = int(m[1:-1])
                except ValueError:
                    self.logger.error('Mutation {} incorrectly defined for {}'.format(m, p))
                    continue
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

            # Create construct record
            c = Construct()
            c.protein_conformation = pc
            c.deletions = deletion_string
            c.save()

            # Create Auxiliary proteins (disabled)
            # if 'auxiliary_proteins' in sd and sd['auxiliary_proteins']:
            #     ap = AuxProtein()
            #     ap.construct = c
            #     apct = AuxProteinType.objects.create()
            #     # ap.protein_type = apct
            #     apct.save()
            #     if 'remarks' in sd['auxiliary_proteins']:
            #         ap.remarks = sd['auxiliary_proteins']['remarks']
            #     ap.save()
            #
            #     for step in sd['auxiliary_proteins']:
            #         if 'type' in step and 'name' in step and'sequence' in step:
            #             ap.protein_type = apct
            #             # ap.protein_type, created = AuxProteinType.objects.get_or_create()
            #             ap.name = sd['auxiliary_proteins']['name']
            #             ap.uniprot_id = sd['auxiliary_proteins']['uniprot_id']
            #             ap.sequence = sd['auxiliary_proteins']['sequence']
            #             # mutations if any to be included from mutation model along with reason of mutation
            #             ap.position = sd['auxiliary_proteins']['position']
            #             ap.deletions = sd['auxiliary_proteins']['deletions']
            #         else:
            #             self.logger.error('Auxiliary protein step incorrectly defined for {}'.format(p))

            # create expression records
            if 'expression_sys' in sd and sd['expression_sys']:
                ce = ConstructExpression()
                ce.construct = c
                ce.expression_system, created = ConstructExpressionSystem.objects.get_or_create(
                    expression_method=sd['expression_sys']['expression_method'],
                    host_cell_type=sd['expression_sys']['host_cell_type'],
                    host_cell=sd['expression_sys']['host_cell'])
                # the remarks live inside the expression_sys mapping
                if 'remarks' in sd['expression_sys']:
                    ce.remarks = sd['expression_sys']['remarks']
                ce.save()

            # create solubilization records
            if ('solubilization' in sd and sd['solubilization']
                    and 'steps' in sd['solubilization'] and sd['solubilization']['steps']):
                so = ConstructSolubilization()
                so.construct = c
                cl = ChemicalList.objects.create()
                so.chemical_list = cl

                for step in sd['solubilization']['steps']:
                    if 'type' in step and 'item' in step and 'concentration' in step:
                        chem = Chemical()
                        chem.chemical_type, created = ChemicalType.objects.get_or_create(name=step['type'])
                        chem.name = step['item']
                        chem.save()

                        cc = ChemicalConc()
                        cc.concentration = step['concentration']
                        cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                        cc.save()
                        cl.chemicals.add(cc)
                    else:
                        self.logger.error('Solubilization step incorrectly defined for {}'.format(p))

                if 'remarks' in sd['solubilization']:
                    so.remarks = sd['solubilization']['remarks']
                so.save()

            # create purification records (same guard as solubilization, so a
            # purification entry without 'steps' is skipped instead of raising)
            if ('purification' in sd and sd['purification']
                    and 'steps' in sd['purification'] and sd['purification']['steps']):
                pu = ConstructPurification()
                pu.construct = c
                if 'remarks' in sd['purification']:
                    pu.remarks = sd['purification']['remarks']
                pu.save()
                for step in sd['purification']['steps']:
                    if 'type' in step and 'description' in step:
                        pust = PurificationStep()
                        pust.description = step['description']
                        pust.purification = pu
                        pust.purification_type, created = PurificationStepType.objects.get_or_create(
                            name=step['type'])
                        if created:
                            self.logger.info('Created purification step type {}'.format(pust.purification_type))
                        pust.save()
                    else:
                        self.logger.error('Purification step incorrectly defined for {}'.format(p))

            # create crystallization records
            if 'crystallization' in sd and sd['crystallization']:
                cy = ConstructCrystallization()
                cy.construct = c
                cyt = CrystallizationMethodTypes.objects.create()
                cy.crystal_type = cyt
                cy.method = sd['crystallization']['method']
                cy.settings = sd['crystallization']['settings']
                cy.protein_conc = sd['crystallization']['protein_conc']
                cl = ChemicalList.objects.create()
                cy.chemical_list = cl

                for step in sd['crystallization']['chemicallist']:
                    if 'type' in step and 'item' in step and 'concentration' in step:
                        chem = Chemical()
                        chem.chemical_type, created = ChemicalType.objects.get_or_create(name=step['type'])
                        chem.name = step['item']
                        chem.save()

                        cc = ChemicalConc()
                        cc.concentration = step['concentration']
                        cc.chemical = chem    # since ChemicalConc has a ForeignKey to Chemical
                        cc.save()
                        cl.chemicals.add(cc)
                    else:
                        self.logger.error('Crystallization step incorrectly defined for {}'.format(p))

                cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
                cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
                cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
                cy.temp = sd['crystallization']['temperature']
                cy.ph = sd['crystallization']['ph']

                if 'remarks' in sd['crystallization']:
                    cy.remarks = sd['crystallization']['remarks']
                cy.save()

            # fusion proteins
            split_segments = {}
            if 'fusion_proteins' in sd and sd['fusion_proteins']:
                for fp in sd['fusion_proteins']:
                    fp_start = Residue.objects.get(protein_conformation=ppc,
                        sequence_number=fp['positions'][0])
                    fp_end = Residue.objects.get(protein_conformation=ppc, sequence_number=fp['positions'][1])
                    # if the fusion protein is inserted within only one segment (the usual case), split that
                    # segment into two segments
                    if fp_start and fp_start.protein_segment == fp_end.protein_segment:
                        # get/create split protein segments
                        segment_before, created = ProteinSegment.objects.get_or_create(
                            slug=fp_start.protein_segment.slug+"_1", defaults={
                            'name': fp_start.protein_segment.name,
                            'category': fp_start.protein_segment.category,
                            'partial': True})
                        segment_after, created = ProteinSegment.objects.get_or_create(
                            slug=fp_start.protein_segment.slug+"_2", defaults={
                            'name': fp_start.protein_segment.name,
                            'category': fp_start.protein_segment.category,
                            'partial': True})

                        # keep track of information about split segments
                        split_segments[fp_start.protein_segment.slug] = {
                            'start': {
                                'sequence_number': fp['positions'][0],
                                'segment': segment_before,
                            },
                            'end': {
                                'sequence_number': fp['positions'][1],
                                'segment': segment_after,
                            },
                        }

                    # get/insert fusion protein
                    fusion, created = ProteinFusion.objects.get_or_create(name=fp['name'], defaults={
                        'sequence': fp['sequence']})

                    # create relationship with protein
                    # NOTE(review): segment_before/segment_after are only assigned when the
                    # insertion lies within a single segment; otherwise they are undefined
                    # (or left over from a previous iteration) -- confirm the intended handling.
                    ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                        segment_before=segment_before, segment_after=segment_after)

            # copy the parent's residues to the new conformation
            prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                'protein_conformation__protein', 'protein_segment', 'generic_number',
                'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
            updated_sequence = ''
            for pr in prs:
                if pr.sequence_number in deletions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments
                if pr.protein_segment.slug in split_segments:
                    rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                    rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                    if r.sequence_number <= rsns:
                        r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                    elif r.sequence_number >= rsne:
                        r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                else:
                    r.protein_segment = pr.protein_segment

                # amino acid, check for mutations
                if r.sequence_number in mutations:
                    if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                        r.amino_acid = mutations[r.sequence_number]['mut_res']
                    else:
                        # NOTE(review): r.amino_acid is not assigned on this path and keeps
                        # whatever Residue() initialised it to -- confirm this is intended.
                        self.logger.error(
                            'Mutation {} in construct {} does not match wild-type sequence of {}'.format(
                                mutations[r.sequence_number]['full'], pc.protein.name,
                                ppc.protein.entry_name))
                else:
                    r.amino_acid = pr.amino_acid

                # save amino acid to updated sequence
                updated_sequence += r.amino_acid

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                agns = pr.alternative_generic_numbers.all()
                for agn in agns:
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = updated_sequence
            p.save()

        self.logger.info('COMPLETED CREATING CONSTRUCTS')
    def handle(self, *args, **options):
        """Build alpha subunit records for complexes and non-complex SignprotStructures.

        Steps, in order:

        1. If ``options['purge']`` is set, delete the Residue,
           ProteinConformation and Protein rows whose entry name ends in
           ``_a`` and whose family's grandparent is named 'Alpha', plus all
           SignprotStructureExtraProteins and SignprotStructure rows.
        2. Unless ``options['only_signprot_structures']`` is set, for every
           SignprotComplex get or create the ``<pdb code>_a`` Protein and its
           ProteinConformation, align the residues of the alpha chain in the
           PDB data against the residues of ``sc.protein`` and bulk-create
           Residue rows that copy the matched residue's annotation.
        3. For every Protein whose family slug starts with '100_001' and that
           has an accession, build a SignprotStructure for each PDB id
           returned by ``get_pdb_ids`` that is not already a complex structure.

        A failure while processing one complex or one PDB id is logged and
        does not stop the remaining entries.

        Args:
            *args: Unused.
            **options: Command options; the keys ``purge`` and
                ``only_signprot_structures`` are read here.
        """
        self.options = options
        if self.options['purge']:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith='_a',
                protein_conformation__protein__family__parent__parent__name=
                'Alpha').delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith='_a',
                protein__family__parent__parent__name='Alpha').delete()
            Protein.objects.filter(
                entry_name__endswith='_a',
                family__parent__parent__name='Alpha').delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options['only_signprot_structures']:
            # Compiled once: the same pattern is applied to every SEQADV line
            # of every complex. As used below, group 1/3 are the residue name
            # and number in the structure, group 2 the chain, group 4 the
            # database accession, group 5/6 the database residue name and
            # number, group 7 the trailing annotation text.
            seqadv_pattern = re.compile(
                r'SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)'
            )
            # Structures that are never realigned with a pairwise alignment.
            # 6WHA is a mini-Gq/Gi2/Gs chimera that would need a custom
            # alignment, which is not implemented here.
            no_realignment = ['6OIJ', '6OY9', '6OYA', '6LPB', '6WHA']

            # Building protein and protconf objects for g protein structure in complex
            for sc in SignprotComplex.objects.all():
                self.logger.info(
                    'Protein, ProteinConformation and Residue build for alpha subunit of {} is building'
                    .format(sc))
                try:
                    pdb_code = sc.structure.pdb_code.index
                    entry_name = pdb_code.lower() + '_a'

                    # Alpha subunit: reuse the protein if it was built before.
                    # Only "not found" triggers creation; any other lookup
                    # error is reported by the handler at the end of this try.
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=entry_name)
                    except Protein.DoesNotExist:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = entry_name
                        alpha_protein.accession = None
                        alpha_protein.name = entry_name
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug='mod')
                        alpha_protein.source = ProteinSource.objects.get(
                            name='OTHER')
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=entry_name)
                    except ProteinConformation.DoesNotExist:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug='active')
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    structure = pdbp.get_structure(
                        'struct', StringIO(sc.structure.pdb_data.pdb))
                    chain = structure[0][sc.alpha]

                    # Sequence numbers of the chain residues that have a CA atom
                    nums = []
                    for res in chain:
                        try:
                            res['CA']
                        except KeyError:
                            continue
                        nums.append(res.get_id()[1])

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        # 6OIJ: residues numbered below 30 are offset by 6
                        if pdb_code == '6OIJ' and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]

                    # Find mismatches between structure and reference residues
                    mismatches = [
                        pair for pair in pdb_num_dict.values()
                        if AA[pair[0].get_resname()] != pair[1].amino_acid
                    ]

                    seqadv = [
                        line
                        for line in sc.structure.pdb_data.pdb.split('\n')
                        if line.startswith('SEQADV')
                    ]
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for line in seqadv:
                        line_search = seqadv_pattern.search(line)
                        if line_search is None:
                            continue
                        if line_search.group(2) != sc.alpha:
                            continue
                        pdb_resname = line_search.group(1)
                        pdb_number = int(line_search.group(3))
                        db_resname = line_search.group(5)
                        if line_search.group(4).strip() == sc.protein.accession:
                            if line_search.group(3) == line_search.group(6):
                                mutations[pdb_number] = [
                                    pdb_resname, db_resname
                                ]
                            else:
                                shifted_mutations[pdb_number] = [
                                    pdb_resname, db_resname,
                                    int(line_search.group(6))
                                ]
                        else:
                            # Accession differs from the complex protein
                            # (exceptions for 6G79 and 5G53): always recorded
                            # as an unshifted mutation.
                            mutations[pdb_number] = [pdb_resname, db_resname]

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    remaining_mismatches = []
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num not in mutations or (
                                m[0].get_resname() != mutations[num][0]
                                and m[1].amino_acid != AA[mutations[num][1]]):
                            remaining_mismatches.append(m)

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if remaining_mismatches and pdb_code not in no_realignment:
                        seq = ''.join(
                            str(pp.get_sequence())
                            for pp in PPBuilder().build_peptides(
                                chain, aa_only=False))
                        pw2 = pairwise2.align.localms(sc.protein.sequence, seq,
                                                      2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        # Maps a structure residue to the reference residue it
                        # aligns with, or to the alignment column index when
                        # one side is a gap.
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, (ref, temp) in enumerate(zip(ref_seq,
                                                            temp_seq)):
                            if ref != '-' and temp != '-':
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == '-':
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == '-':
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        for i, r in enumerate(remaining_mismatches):
                            num = r[0].get_id()[1]
                            # Adjust for shifted residue when residue is a match
                            # NOTE(review): for i == 0 the index i - 1 wraps to
                            # the last mismatch - confirm this is intended.
                            if num - remaining_mismatches[
                                    i - 1][0].get_id()[1] > 1:
                                pdb_num_dict[num - 1][1] = pdb_wt_dict[chain[
                                    num - 1]]
                            # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                            if num in shifted_mutations:
                                pdb_num_dict[num][1] = resis.get(
                                    sequence_number=shifted_mutations[num][2])
                            # Adjust for shift
                            else:
                                pdb_num_dict[num][1] = pdb_wt_dict[r[0]]

                    bulked_residues = []
                    for val in pdb_num_dict.values():
                        # An int here is an alignment column index, i.e. the
                        # structure residue has no reference residue to copy
                        # annotation from.
                        if isinstance(val[1], int):
                            self.logger.info(
                                'Skipped {} as no annotation was present, while building for alpha subunit of {}'
                                .format(val[1], sc))
                            continue
                        res_obj = Residue()
                        res_obj.sequence_number = val[0].get_id()[1]
                        res_obj.amino_acid = AA[val[0].get_resname()]
                        res_obj.display_generic_number = val[
                            1].display_generic_number
                        res_obj.generic_number = val[1].generic_number
                        res_obj.protein_conformation = alpha_protconf
                        res_obj.protein_segment = val[1].protein_segment
                        bulked_residues.append(res_obj)

                    Residue.objects.bulk_create(bulked_residues)
                    self.logger.info(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} is finished'
                        .format(sc))
                except Exception as msg:
                    # Keep going with the next complex, but record why this
                    # one failed (same format as the SignprotStructure
                    # handler below).
                    self.logger.error(
                        'Protein, ProteinConformation and Residue build for alpha subunit of {} has failed\n{}: {}'
                        .format(sc, type(msg), msg))

        ### Build SignprotStructure objects from non-complex signprots
        g_prot_alphas = Protein.objects.filter(
            family__slug__startswith='100_001',
            accession__isnull=False)  #.filter(entry_name='gnai1_human')
        # Materialised once as a set so each membership test below is O(1)
        complex_structures = set(
            SignprotComplex.objects.all().values_list(
                'structure__pdb_code__index', flat=True))
        for a in g_prot_alphas:
            for pdb in get_pdb_ids(a.accession):
                if pdb in complex_structures:
                    continue
                try:
                    data = self.fetch_gprot_data(pdb, a)
                    if data:
                        self.build_g_prot_struct(a, pdb, data)
                except Exception as msg:
                    self.logger.error(
                        'SignprotStructure of {} {} failed\n{}: {}'.format(
                            a.entry_name, pdb, type(msg), msg))
Ejemplo n.º 9
0
    def main_func(self, positions, iteration):
        """Create construct records from a slice of the YAML construct files.

        Each selected entry of ``self.filenames`` is resolved against
        ``self.construct_data_dir``; entries that are not regular files or
        whose name starts with '.' are skipped. For every remaining file this
        creates, in order: the construct Protein (derived from the parent
        protein named in the file) and its ProteinConformation, the deletion,
        mutation and fusion records, the expression, solubilization,
        purification and crystallization records, and finally one Residue per
        non-deleted parent residue. The construct's sequence is then set to
        the concatenated residue amino acids.

        A file without a ``protein`` key, with an unknown parent protein, or
        whose Protein / ProteinConformation cannot be saved is logged and
        skipped.

        Args:
            positions: Sequence whose first two items select
                ``self.filenames[positions[0]:positions[1]]``; a falsy second
                item means "up to the end of the list".
            iteration: Not used by this method.
        """

        def add_chemicals(chemical_list, steps, stage, construct):
            """Add a ChemicalConc to ``chemical_list`` for every well-formed step."""
            for step in steps:
                if 'type' in step and 'item' in step and 'concentration' in step:
                    chem = Chemical()
                    chem.chemical_type, _ = ChemicalType.objects.get_or_create(name=step['type'])
                    chem.name = step['item']
                    chem.save()

                    cc = ChemicalConc()
                    cc.concentration = step['concentration']
                    cc.chemical = chem  # since ChemicalConc has a ForeignKey to Chemical
                    cc.save()
                    chemical_list.chemicals.add(cc)
                else:
                    self.logger.error('{} step incorrectly defined for {}'.format(stage, construct))

        # filenames
        if positions[1]:
            filenames = self.filenames[positions[0]:positions[1]]
        else:
            filenames = self.filenames[positions[0]:]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file; safe_load builds plain Python data only and
            # works without an explicit Loader argument. The file is closed
            # before any database work starts.
            with open(source_file_path, 'r') as f:
                sd = yaml.safe_load(f)

            # is a protein specified? (an empty file parses to None)
            if not sd or 'protein' not in sd:
                self.logger.error('Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                    state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # abort if parent protein is not found
                self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                    sd['protein'], sd['name']))
                continue

            # sequence type
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                    defaults={'name': 'Modified'})
                if created:
                    self.logger.info('Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
                if created:
                    self.logger.info('Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
                self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
            except Exception:
                self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                    ppc.protein.entry_name))
                continue

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
                self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
            except Exception:
                self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                    p.entry_name))
                # every record below references pc, so nothing more can be
                # built for this construct
                continue

            # process deletions (save in db, and for sequence processing)
            deletions = []
            for t in sd.get('deletions') or []:
                deletions += list(range(t[0], t[1] + 1))
                # objects.create() always inserts a new row, so always log it
                ConstructDeletion.objects.create(construct=pc, start=t[0], end=t[1])
                self.logger.info('Created deletion {}-{} for {}'.format(t[0], t[1],
                    pc.protein.entry_name))

            # process mutations (save in db, and for sequence processing)
            # each entry looks like <wild-type residue><sequence number><mutated residue>
            mutations = {}
            for m in sd.get('mutations') or []:
                res_num = int(m[1:-1])
                mutations[res_num] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }
                ConstructMutation.objects.get_or_create(
                    construct=pc,
                    sequence_number=res_num,
                    wild_type_amino_acid=m[0],
                    mutated_amino_acid=m[-1],
                )

            # insertions
            split_segments = {}
            for ins in sd.get('insertions') or []:
                ins_start = Residue.objects.get(protein_conformation=ppc,
                    sequence_number=ins['positions'][0])
                ins_end = Residue.objects.get(protein_conformation=ppc,
                    sequence_number=ins['positions'][1])
                # if the insertion is within only one segment (the usual case), split that
                # segment into two segments
                if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                    parent_segment = ins_start.protein_segment
                    # get/create split protein segments (<slug>_1 before, <slug>_2 after)
                    halves = []
                    for suffix in ('_1', '_2'):
                        half_slug = parent_segment.slug + suffix
                        try:
                            half, created = ProteinSegment.objects.get_or_create(slug=half_slug,
                                defaults={'name': parent_segment.name,
                                'category': parent_segment.category, 'partial': True})
                            if created:
                                self.logger.info('Created protein segment {}'.format(half))
                        except IntegrityError:
                            half = ProteinSegment.objects.get(slug=half_slug)
                        halves.append(half)
                    segment_before, segment_after = halves

                    # keep track of information about split segments
                    split_segments[parent_segment.slug] = {
                        'start': {
                            'sequence_number': ins['positions'][0],
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': ins['positions'][1],
                            'segment': segment_after,
                        },
                    }
                # if the insertion covers two segments, use those two as the segments before and after
                elif ins_start:
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that range as a deletion
                if ins['positions'][1] > (ins['positions'][0] + 1):
                    deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                    'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                    segment_before=segment_before, segment_after=segment_after)

            # create expression records
            if 'expression_sys' in sd and sd['expression_sys']:
                ce = Expression()
                ce.construct = pc
                ce.expression_system, created = ExpressionSystem.objects.get_or_create(
                    expression_method=sd['expression_sys']['expression_method'],
                    host_cell_type=sd['expression_sys']['host_cell_type'],
                    host_cell=sd['expression_sys']['host_cell'])
                # remarks live inside the expression_sys mapping
                if 'remarks' in sd['expression_sys']:
                    ce.remarks = sd['expression_sys']['remarks']
                ce.save()

            # create solubilization records
            if ('solubilization' in sd and sd['solubilization'] and 'steps' in sd['solubilization']
                    and sd['solubilization']['steps']):
                so = Solubilization()
                so.construct = pc
                cl = ChemicalList.objects.create()
                so.chemical_list = cl

                add_chemicals(cl, sd['solubilization']['steps'], 'Solubilization', p)

                if 'remarks' in sd['solubilization']:
                    so.remarks = sd['solubilization']['remarks']
                so.save()

            # create purification records (skipped when there are no steps)
            if 'purification' in sd and sd['purification'] and sd['purification'].get('steps'):
                pu = Purification()
                pu.construct = pc
                if 'remarks' in sd['purification']:
                    pu.remarks = sd['purification']['remarks']
                pu.save()
                for step in sd['purification']['steps']:
                    if 'type' in step and 'description' in step:
                        pust = PurificationStep()
                        pust.description = step['description']
                        pust.purification = pu
                        pust.purification_type, created = PurificationStepType.objects.get_or_create(
                            name=step['type'])  # 2 values returned by get_or_create
                        if created:
                            self.logger.info('Created purification step type {}'.format(
                                pust.purification_type))
                        pust.save()
                    else:
                        self.logger.error('Purification step incorrectly defined for {}'.format(p))

            # create crystallization records
            if 'crystallization' in sd and sd['crystallization']:
                cy = Crystallization()
                cy.construct = pc
                cyt = CrystallizationMethodTypes.objects.create()
                cy.crystal_type = cyt
                cy.method = sd['crystallization']['method']
                cy.settings = sd['crystallization']['settings']
                cy.protein_conc = sd['crystallization']['protein_conc']
                cl = ChemicalList.objects.create()
                cy.chemical_list = cl

                add_chemicals(cl, sd['crystallization']['chemicallist'], 'Crystallization', p)

                cy.aqueous_solution_lipid_ratio = sd['crystallization']['aqueous_solution_lipid_ratio_LCP']
                cy.lcp_bolus_volume = sd['crystallization']['LCP_bolus_volume']
                cy.precipitant_solution_volume = sd['crystallization']['precipitant_solution_volume']
                cy.temp = sd['crystallization']['temperature']
                cy.ph = sd['crystallization']['ph']

                if 'remarks' in sd['crystallization']:
                    cy.remarks = sd['crystallization']['remarks']
                cy.save()

            # create residues
            prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                'protein_conformation__protein', 'protein_segment', 'generic_number',
                'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
            deleted_positions = set(deletions)  # O(1) lookups in the loop below
            updated_sequence = ''
            for pr in prs:
                if pr.sequence_number in deleted_positions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments
                if pr.protein_segment.slug in split_segments:
                    split = split_segments[pr.protein_segment.slug]
                    if r.sequence_number <= split['start']['sequence_number']:
                        r.protein_segment = split['start']['segment']
                    elif r.sequence_number >= split['end']['sequence_number']:
                        r.protein_segment = split['end']['segment']
                else:
                    r.protein_segment = pr.protein_segment

                # amino acid, check for mutations
                if r.sequence_number in mutations:
                    mutation = mutations[r.sequence_number]
                    if mutation['wt_res'] == pr.amino_acid:
                        r.amino_acid = mutation['mut_res']
                    else:
                        # NOTE(review): r.amino_acid is left at the model's
                        # default in this case - confirm that is intended.
                        self.logger.error(
                            ('Mutation {} in construct {} does not match wild-type sequence'
                             + ' of {}').format(mutation['full'], pc.protein.name,
                                                ppc.protein.entry_name))
                else:
                    r.amino_acid = pr.amino_acid

                # save amino acid to updated sequence
                updated_sequence += r.amino_acid

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                for agn in pr.alternative_generic_numbers.all():
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = updated_sequence
            p.save()
Ejemplo n.º 10
0
    def main_func(self, positions, iteration, count, lock):
        """Create construct Protein and ProteinConformation rows from YAML files.

        Processes ``self.filenames[positions[0]:positions[1]]`` (through the end
        of the list when ``positions[1]`` is falsy). Each entry is resolved
        relative to ``self.construct_data_dir``; entries that are not regular
        files or whose name starts with ``.`` are ignored.

        For every construct file:

        * the parent ProteinConformation is looked up by ``sd['protein']`` in
          the default protein state; the construct is skipped if it is missing,
        * the ``mod`` sequence type and ``OTHER`` protein source are fetched or
          created,
        * a Protein named ``sd['name']`` is created unless one already exists,
        * a ProteinConformation in the default state is created for it unless
          one already exists.

        Failures while saving are logged and do not abort the run.

        ``iteration``, ``count`` and ``lock`` are accepted but not used in this
        method body.
        """
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join(
                [self.construct_data_dir, source_file])
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file; the handle is released before any database
            # work starts
            with open(source_file_path, 'r') as f:
                sd = yaml.load(f, Loader=yaml.FullLoader)

            # is a protein specified? An empty file parses to None, so also
            # make sure a mapping was actually loaded.
            if not isinstance(sd, dict) or 'protein' not in sd:
                self.logger.error(
                    'Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related(
                    'protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(
                        protein__entry_name=sd['protein'],
                        state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # abort this construct if the parent protein is not found
                not_found_msg = (
                    'Parent protein {} for construct {} not found, aborting!'
                    .format(sd['protein'], sd['name']))
                print(not_found_msg)
                self.logger.error(not_found_msg)
                continue

            # sequence type
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(
                    slug='mod', defaults={'name': 'Modified'})
                if created:
                    self.logger.info(
                        'Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                # fall back to a plain lookup when get_or_create hit a
                # constraint violation
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(
                    name='OTHER')
                if created:
                    self.logger.info(
                        'Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            if not Protein.objects.filter(name=sd['name']).exists():
                # create a protein record
                p = Protein()
                p.parent = ppc.protein
                p.family = ppc.protein.family
                p.species = ppc.protein.species
                p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                p.sequence_type = sequence_type
                p.source = protein_source
                p.entry_name = slugify(strip_tags(sd['name']))
                p.name = sd['name']
                p.sequence = ppc.protein.sequence

                # save protein (construct)
                try:
                    p.save()
                    self.logger.info(
                        'Created construct {} with parent protein {}'.
                        format(p.name, ppc.protein.entry_name))
                except Exception:
                    # Exception (not a bare except) so that Ctrl-C and
                    # SystemExit still stop the command.
                    self.logger.error(
                        'Failed creating construct {} with parent protein {}'
                        .format(p.name, ppc.protein.entry_name))
                    continue
            else:
                p = Protein.objects.get(name=sd['name'])

            if not ProteinConformation.objects.filter(protein=p).exists():
                # create protein conformation record
                pc = ProteinConformation()
                pc.protein = p
                pc.state = ProteinState.objects.get(
                    slug=settings.DEFAULT_PROTEIN_STATE)
                try:
                    pc.save()
                    self.logger.info(
                        'Created conformation {} of protein {}'.format(
                            pc.state.name, p.name))
                except Exception:
                    failed_msg = (
                        'Failed creating conformation {} of protein {}'
                        .format(pc.state.name, p.entry_name))
                    print(failed_msg)
                    self.logger.error(failed_msg)
    def handle(self, *args, **options):
        """Build alpha-subunit records for signalling protein complexes.

        Up to three stages run, selected by ``options``:

        * ``purge``: delete the existing ``*_a`` Alpha-family Residue,
          ProteinConformation and Protein rows and all SignprotStructure /
          SignprotStructureExtraProteins rows.
        * unless ``only_signprot_structures``: for every SignprotComplex
          (restricted to the PDB codes listed in ``s`` when given) get or
          create the ``<pdb>_a`` Protein and ProteinConformation, map the
          residues of chain ``sc.alpha`` in the stored PDB text onto the
          residues of ``sc.protein`` and save one Residue plus one Rotamer per
          mapped position.
        * unless ``s``: call ``fetch_gprot_data`` / ``build_g_prot_struct`` for
          every PDB id returned by ``get_pdb_ids`` for proteins whose family
          slug starts with ``100_001`` and that is not already a complex
          structure.

        ``debug`` prints intermediate data and, at the end, the elapsed time.
        An exception raised while processing one complex or one structure is
        logged and the loop moves on to the next item.
        """
        start_time = datetime.datetime.now()
        self.options = options
        if self.options["purge"]:
            Residue.objects.filter(
                protein_conformation__protein__entry_name__endswith="_a",
                protein_conformation__protein__family__parent__parent__name=
                "Alpha").delete()
            ProteinConformation.objects.filter(
                protein__entry_name__endswith="_a",
                protein__family__parent__parent__name="Alpha").delete()
            Protein.objects.filter(
                entry_name__endswith="_a",
                family__parent__parent__name="Alpha").delete()
            SignprotStructureExtraProteins.objects.all().delete()
            SignprotStructure.objects.all().delete()

        if not options["only_signprot_structures"]:
            # Raw string so the regex escapes reach `re` verbatim; compiled once
            # because it is applied to every SEQADV line of every structure.
            seqadv_pattern = re.compile(
                r"SEQADV\s{1}[A-Z\s\d]{4}\s{1}([A-Z]{3})\s{1}([A-Z]{1})\s+(\d+)[\s\S\d]{5}([\s\S\d]{12})([A-Z]{3})\s+(\d+)(\s\S+)"
            )

            # Building protein and protconf objects for g protein structure in complex
            if options["s"]:
                scs = SignprotComplex.objects.filter(
                    structure__pdb_code__index__in=[
                        i.upper() for i in options["s"]
                    ])
            else:
                scs = SignprotComplex.objects.all()
            for sc in scs:
                self.logger.info(
                    "Protein, ProteinConformation and Residue build for alpha subunit of {} is building"
                    .format(sc))
                try:
                    pdb_code = sc.structure.pdb_code.index
                    alpha_entry_name = pdb_code.lower() + "_a"

                    # Alpha subunit: reuse the record when present, otherwise
                    # create it. The handler stays broad so that any lookup
                    # failure falls back to creation as before, but it no
                    # longer traps KeyboardInterrupt/SystemExit.
                    try:
                        alpha_protein = Protein.objects.get(
                            entry_name=alpha_entry_name)
                    except Exception:
                        alpha_protein = Protein()
                        alpha_protein.entry_name = alpha_entry_name
                        alpha_protein.accession = None
                        alpha_protein.name = alpha_entry_name
                        alpha_protein.sequence = sc.protein.sequence
                        alpha_protein.family = sc.protein.family
                        alpha_protein.parent = sc.protein
                        alpha_protein.residue_numbering_scheme = sc.protein.residue_numbering_scheme
                        alpha_protein.sequence_type = ProteinSequenceType.objects.get(
                            slug="mod")
                        alpha_protein.source = ProteinSource.objects.get(
                            name="OTHER")
                        alpha_protein.species = sc.protein.species
                        alpha_protein.save()

                    try:
                        alpha_protconf = ProteinConformation.objects.get(
                            protein__entry_name=alpha_entry_name)
                    except Exception:
                        alpha_protconf = ProteinConformation()
                        alpha_protconf.protein = alpha_protein
                        alpha_protconf.state = ProteinState.objects.get(
                            slug="active")
                        alpha_protconf.save()

                    pdbp = PDBParser(PERMISSIVE=True, QUIET=True)
                    structure = pdbp.get_structure(
                        "struct", StringIO(sc.structure.pdb_data.pdb))
                    chain = structure[0][sc.alpha]
                    # Sequence numbers of non-hetero residues that have a CA atom
                    nums = [
                        res.get_id()[1] for res in chain
                        if "CA" in res and res.id[0] == " "
                    ]

                    resis = Residue.objects.filter(
                        protein_conformation__protein=sc.protein)
                    pdb_num_dict = OrderedDict()
                    # Create first alignment based on sequence numbers
                    for n in nums:
                        # 6OIJ: PDB numbers below 30 are looked up 6 positions
                        # further along the wild-type numbering
                        if pdb_code == "6OIJ" and n < 30:
                            nr = n + 6
                        else:
                            nr = n
                        pdb_num_dict[n] = [
                            chain[n], resis.get(sequence_number=nr)
                        ]
                    # Find mismatches
                    mismatches = [
                        pair for pair in pdb_num_dict.values()
                        if AA[pair[0].get_resname()] != pair[1].amino_acid
                    ]

                    pdb_lines = sc.structure.pdb_data.pdb.split("\n")
                    seqadv = [
                        line for line in pdb_lines
                        if line.startswith("SEQADV")
                    ]
                    mutations, shifted_mutations = OrderedDict(), OrderedDict()
                    # Search for annotated engineered mutations in pdb SEQADV
                    for seqadv_line in seqadv:
                        line_search = seqadv_pattern.search(seqadv_line)
                        if line_search is None or line_search.group(
                                2) != sc.alpha:
                            continue
                        seqnum = int(line_search.group(3))
                        if line_search.group(
                                4).strip() == sc.protein.accession:
                            if line_search.group(3) == line_search.group(6):
                                mutations[seqnum] = [
                                    line_search.group(1),
                                    line_search.group(5)
                                ]
                            else:
                                shifted_mutations[seqnum] = [
                                    line_search.group(1),
                                    line_search.group(5),
                                    int(line_search.group(6))
                                ]
                        else:
                            # Exception for 6G79
                            if line_search.group(3) != line_search.group(
                                    6) and "CONFLICT" in line_search.group(7):
                                mutations[seqnum] = [
                                    line_search.group(1),
                                    line_search.group(5)
                                ]
                            # Exception for 5G53
                            # NOTE(review): this condition is always true in
                            # this branch (it is the negation of the enclosing
                            # test), so every line reaching here is recorded.
                            if line_search.group(
                                    4).strip() != sc.protein.accession:
                                mutations[seqnum] = [
                                    line_search.group(1),
                                    line_search.group(5)
                                ]
                    remaining_mismatches = []

                    # Check and clear mismatches that are registered in pdb SEQADV as engineered mutation
                    for m in mismatches:
                        num = m[0].get_id()[1]
                        if num in mutations:
                            if m[0].get_resname() != mutations[num][0] and m[
                                    1].amino_acid != AA[mutations[num][1]]:
                                remaining_mismatches.append(m)
                        else:
                            # Covers shifted mutations as well as mismatches
                            # without any SEQADV record
                            remaining_mismatches.append(m)

                    if options["debug"]:
                        print(sc)
                        print(mutations)
                        print(shifted_mutations)
                        print(mismatches)
                        print("======")
                        print(remaining_mismatches)
                        pprint.pprint(pdb_num_dict)

                    no_seqnum_shift = [
                        '6OY9', '6OYA', '6LPB', '6WHA', '7D77', '6XOX', '7L1U',
                        '7L1V'
                    ]

                    # Check if HN is mutated to GNAI1 for the scFv16 stabilizer
                    if sc.protein.entry_name != 'gnai1_human' and len(
                            remaining_mismatches) > 0:
                        target_HN = resis.filter(protein_segment__slug='HN')
                        gnai1_HN = Residue.objects.filter(
                            protein_conformation__protein__entry_name=
                            'gnai1_human',
                            protein_segment__slug='HN')
                        # Loop-invariant, so query it once instead of once per
                        # residue. pdb_num_dict is non-empty here because
                        # remaining_mismatches is derived from it.
                        hn_last_seqnum = target_HN.reverse()[0].sequence_number
                        pdb_HN_seq = ''.join(
                            Polypeptide.three_to_one(val[0].get_resname())
                            for num, val in pdb_num_dict.items()
                            if num <= hn_last_seqnum)
                        gnai1_HN_seq = ''.join(
                            gnai1_HN.values_list('amino_acid', flat=True))
                        if options['debug']:
                            print('Checking if HN is gnai1_human')
                            print(pdb_HN_seq)
                            print(gnai1_HN_seq)
                        pw2 = pairwise2.align.localms(gnai1_HN_seq, pdb_HN_seq,
                                                      3, -4, -3, -1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])
                        # Percent identity over the aligned, non-gap positions
                        # of the structure sequence
                        length, match = 0, 0
                        for r, t in zip(ref_seq, temp_seq):
                            if options['debug']:
                                print(r, t)
                            if t != '-':
                                if r == t:
                                    match += 1
                                length += 1
                        identity = match / length * 100
                        if options['debug']:
                            print(identity)
                        if identity > 85:
                            if pdb_code not in ['7DFL']:
                                no_seqnum_shift.append(pdb_code)
                            if options['debug']:
                                print(
                                    'INFO: HN has {}% with gnai1_human HN, skipping seqnum shift correction'
                                    .format(round(identity)))

                    # Mismatches remained possibly to seqnumber shift, making pairwise alignment to try and fix alignment
                    if len(remaining_mismatches
                           ) > 0 and pdb_code not in no_seqnum_shift:
                        ppb = PPBuilder()
                        seq = "".join(
                            str(pp.get_sequence())
                            for pp in ppb.build_peptides(chain, aa_only=False))
                        if pdb_code in ['7JVQ', '7L1U', '7L1V']:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 3, -4, -3, -1)
                        else:
                            pw2 = pairwise2.align.localms(
                                sc.protein.sequence, seq, 2, -1, -.5, -.1)
                        ref_seq, temp_seq = str(pw2[0][0]), str(pw2[0][1])

                        # Custom fix for A->G mutation at pos 18
                        if pdb_code == '7JJO':
                            ref_seq = ref_seq[:18] + ref_seq[19:]
                            temp_seq = temp_seq[:17] + temp_seq[18:]
                        # Custom alignment fixes
                        elif pdb_code == '7DFL':
                            ref_seq = 'MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                            temp_seq = '--------CTLSAEDKAAVERSKMIDRNLREDGEKARRELKLLLLGTGESGKSTFIKQMRIIHG--------------------------------------------------------------------------------------------------------------------------TGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQV----DNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIMYSHLVDYFPEYDGPQRDAQAAREFILKMFVDLNPDSDKILYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV'
                        elif pdb_code == '7JOZ':
                            temp_seq = temp_seq[:67] + (
                                '-' * 14) + 'FNGDS' + temp_seq[86:]
                        elif pdb_code == '7AUE':
                            ref_seq = ref_seq[:31].replace('-',
                                                           '') + ref_seq[31:]
                            temp_seq = (
                                9 *
                                '-') + temp_seq[2:5] + temp_seq[5:54].replace(
                                    '-', '') + temp_seq[54:]
                        # Walk the alignment column by column. j indexes the
                        # wild-type residues, k the structure residues; a gap on
                        # one side is recorded with the integer column index in
                        # place of the missing residue.
                        wt_pdb_dict = OrderedDict()
                        pdb_wt_dict = OrderedDict()
                        j, k = 0, 0
                        for i, (ref, temp) in enumerate(zip(ref_seq,
                                                            temp_seq)):
                            if options["debug"]:
                                print(i, ref, temp)  # alignment check
                            if ref != "-" and temp != "-":
                                wt_pdb_dict[resis[j]] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]]
                                            [0]] = resis[j]
                                j += 1
                                k += 1
                            elif ref == "-":
                                wt_pdb_dict[i] = pdb_num_dict[nums[k]]
                                pdb_wt_dict[pdb_num_dict[nums[k]][0]] = i
                                k += 1
                            elif temp == "-":
                                wt_pdb_dict[resis[j]] = i
                                pdb_wt_dict[i] = resis[j]
                                j += 1
                        # Custom fix for 7JJO isoform difference
                        if pdb_code in ['7JJO', '7JOZ', '7AUE']:
                            # Rebuild the mapping purely from the alignment,
                            # keyed by wild-type sequence number
                            pdb_num_dict = OrderedDict()
                            for wt_res, st_res in wt_pdb_dict.items():
                                if isinstance(st_res, list):
                                    pdb_num_dict[wt_res.sequence_number] = [
                                        st_res[0], wt_res
                                    ]
                        else:
                            for i, r in enumerate(remaining_mismatches):
                                res_num = r[0].get_id()[1]
                                # Adjust for shifted residue when residue is a match
                                # NOTE(review): for i == 0 the index i - 1 wraps
                                # around to the last mismatch.
                                if res_num - remaining_mismatches[
                                        i - 1][0].get_id()[1] > 1:
                                    pdb_num_dict[res_num -
                                                 1][1] = pdb_wt_dict[chain[
                                                     res_num - 1]]
                                # Adjust for shifted residue when residue is mutated and it's logged in SEQADV
                                if res_num in shifted_mutations:
                                    pdb_num_dict[res_num][1] = resis.get(
                                        sequence_number=shifted_mutations[
                                            res_num][2])
                                # Adjust for shift
                                else:
                                    pdb_num_dict[res_num][1] = pdb_wt_dict[
                                        r[0]]
                            if pdb_code == '7JVQ':
                                pdb_num_dict[198][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=346)
                                pdb_num_dict[235][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=383)
                            elif pdb_code == '6PB0':
                                pdb_num_dict[205][1] = Residue.objects.get(
                                    protein_conformation__protein=sc.protein,
                                    sequence_number=205)
                    ### Custom alignment fix for 6WHA mini-Gq/Gi2/Gs chimera
                    elif pdb_code == "6WHA":
                        ref_seq = "MTLESIMACCLSEEAKEARRINDEIERQLRRDKRDARRELKLLLLGTGESGKSTFIKQMRIIHGSGYSDEDKRGFTKLVYQNIFTAMQAMIRAMDTLKIPYKYEHNKAHAQLVREVDVEKVSAFENPYVDAIKSLWNDPGIQECYDRRREYQLSDSTKYYLNDLDRVADPAYLPTQQDVLRVRVPTTGIIEYPFDLQSVIFRMVDVGGQRSERRKWIHCFENVTSIMFLVALSEYDQVLVESDNENRMEESKALFRTIITYPWFQNSSVILFLNKKDLLEEKIM--YSHLVDYFPEYDGP----QRDAQAAREFILKMFVDL---NPDSDKIIYSHFTCATDTENIRFVFAAVKDTILQLNLKEYNLV"
                        temp_seq = "----------VSAEDKAAAERSKMIDKNLREDGEKARRTLRLLLLGADNSGKSTIVK----------------------------------------------------------------------------------------------------------------------------------GIFETKFQVDKVNFHMFDVG-----RRKWIQCFNDVTAIIFVVDSSDYNR----------LQEALNDFKSIWNNRWLRTISVILFLNKQDLLAEKVLAGKSKIEDYFPEFARYTTPDPRVTRAKY-FIRKEFVDISTASGDGRHICYPHFTC-VDTENARRIFNDCKDIILQMNLREYNLV"
                        pdb_num_dict = OrderedDict()
                        temp_i = 0
                        mapped_cgns = []
                        for i, aa in enumerate(temp_seq):
                            if aa != "-":
                                # Wild-type sequence number = 1-based column
                                # index minus the gaps in ref_seq up to here
                                ref_split_on_gaps = ref_seq[:i + 1].split("-")
                                ref_seqnum = i - (len(ref_split_on_gaps) -
                                                  1) + 1
                                res = resis.get(sequence_number=ref_seqnum)
                                # Each generic number may be used only once;
                                # move on to the next free one when taken
                                if res.display_generic_number.label in mapped_cgns:
                                    next_presumed_cgn = self.get_next_presumed_cgn(
                                        res)
                                    if next_presumed_cgn:
                                        res = next_presumed_cgn
                                        while res and res.display_generic_number.label in mapped_cgns:
                                            res = self.get_next_presumed_cgn(
                                                res)
                                    else:
                                        print(
                                            "Error: {} CGN does not exist. Incorrect mapping of {} in {}"
                                            .format(next_presumed_cgn,
                                                    chain[nums[temp_i]],
                                                    sc.structure))
                                mapped_cgns.append(
                                    res.display_generic_number.label)
                                pdb_num_dict[nums[temp_i]] = [
                                    chain[nums[temp_i]], res
                                ]
                                temp_i += 1

                    bulked_rotamers = []
                    for val in pdb_num_dict.values():
                        # An int in the second slot marks a position without a
                        # wild-type counterpart; nothing is stored for it
                        if not isinstance(val[1], int):
                            res_obj = Residue()
                            res_obj.sequence_number = val[0].get_id()[1]
                            res_obj.amino_acid = AA[val[0].get_resname()]
                            res_obj.display_generic_number = val[
                                1].display_generic_number
                            res_obj.generic_number = val[1].generic_number
                            res_obj.protein_conformation = alpha_protconf
                            res_obj.protein_segment = val[1].protein_segment
                            res_obj.save()
                            rot = self.create_structure_rotamer(
                                val[0], res_obj, sc.structure)
                            bulked_rotamers.append(rot)
                        else:
                            self.logger.info(
                                "Skipped {} as no annotation was present, while building for alpha subunit of {}"
                                .format(val[1], sc))
                    if options["debug"]:
                        pprint.pprint(pdb_num_dict)
                    Rotamer.objects.bulk_create(bulked_rotamers)
                    self.logger.info(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} is finished"
                        .format(sc))
                except Exception as msg:
                    if options["debug"]:
                        print("Error: ", sc, msg)
                    # Logged as an error together with the cause, in the same
                    # format as the SignprotStructure failure below
                    self.logger.error(
                        "Protein, ProteinConformation and Residue build for alpha subunit of {} has failed\n{}: {}"
                        .format(sc, type(msg), msg))

        if not options["s"]:
            ### Build SignprotStructure objects from non-complex signprots
            g_prot_alphas = Protein.objects.filter(
                family__slug__startswith="100_001", accession__isnull=False)
            complex_structures = SignprotComplex.objects.all().values_list(
                "structure__pdb_code__index", flat=True)
            for a in g_prot_alphas:
                pdb_list = get_pdb_ids(a.accession)
                for pdb in pdb_list:
                    if pdb not in complex_structures:
                        try:
                            data = self.fetch_gprot_data(pdb, a)
                            if data:
                                self.build_g_prot_struct(a, pdb, data)
                        except Exception as msg:
                            self.logger.error(
                                "SignprotStructure of {} {} failed\n{}: {}".
                                format(a.entry_name, pdb, type(msg), msg))

        if options["debug"]:
            print(datetime.datetime.now() - start_time)
Ejemplo n.º 12
0
    def main_func(self, positions, iteration):
        # filenames
        if not positions[1]:
            filenames = self.filenames[positions[0]:]
        else:
            filenames = self.filenames[positions[0]:positions[1]]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            if os.path.isfile(source_file_path) and source_file[0] != '.':
                self.logger.info('Reading file {}'.format(source_file_path))
                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)

                    # is a protein specified?
                    if 'protein' not in sd:
                        self.logger.error('Protein not specified for construct, skipping')
                        continue

                    # fetch the parent protein
                    try:
                        ppc = ProteinConformation.objects.prefetch_related('protein__family', 'protein__species',
                            'protein__residue_numbering_scheme').get(protein__entry_name=sd['protein'],
                            state__slug=settings.DEFAULT_PROTEIN_STATE)
                    except ProteinConformation.DoesNotExist:
                        # abort if parent protein is not found
                        self.logger.error('Parent protein {} for construct {} not found, aborting!'.format(
                            sd['protein'], sd['name']))
                        continue

                    # sequence type
                    try:
                        sequence_type, created = ProteinSequenceType.objects.get_or_create(slug='mod',
                            defaults={'name': 'Modified'})
                        if created:
                            self.logger.info('Created sequence type {}'.format(sequence_type))
                    except IntegrityError:
                        sequence_type = ProteinSequenceType.objects.get(slug='mod')

                    # protein source
                    try:
                        protein_source, created = ProteinSource.objects.get_or_create(name='OTHER')
                        if created:
                            self.logger.info('Created protein source {}'.format(protein_source))
                    except IntegrityError:
                        protein_source = ProteinSource.objects.get(name='OTHER')

                    # create a protein record
                    p = Protein()
                    p.parent = ppc.protein
                    p.family = ppc.protein.family
                    p.species = ppc.protein.species
                    p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
                    p.sequence_type= sequence_type
                    p.source = protein_source
                    p.entry_name = slugify(strip_tags(sd['name']))
                    p.name = sd['name']
                    p.sequence = ppc.protein.sequence

                    # save protein (construct)
                    try:
                        p.save()
                        self.logger.info('Created construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                    except:
                        self.logger.error('Failed creating construct {} with parent protein {}'.format(p.name,
                            ppc.protein.entry_name))
                        continue

                    # create protein conformation record
                    pc = ProteinConformation()
                    pc.protein = p
                    pc.state = ProteinState.objects.get(slug=settings.DEFAULT_PROTEIN_STATE)
                    try:
                        pc.save()
                        self.logger.info('Created conformation {} of protein {}'.format(pc.state.name, p.name))
                    except:
                        self.logger.error('Failed creating conformation {} of protein {}'.format(pc.state.name,
                            p.entry_name))

                    # create residue records
                    deletions = []
                    if 'deletions' in sd and sd['deletions']:
                        for t in sd['deletions']:
                            deletions += list(range(t[0],t[1]+1))

                    mutations = {}
                    if 'mutations' in sd and sd['mutations']:
                        for m in sd['mutations']:
                            res_num = int(m[1:-1])
                            mutations[res_num] = {
                                'wt_res': m[0],
                                'mut_res': m[-1],
                                'full': m,
                            }

                    # insertions
                    split_segments = {}
                    if 'insertions' in sd and sd['insertions']:
                        for ins in sd['insertions']:
                            ins_start = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=ins['positions'][0])
                            ins_end = Residue.objects.get(protein_conformation=ppc,
                                sequence_number=ins['positions'][1])
                            # if the insertion is within only one segment (the usual case), split that
                            # segment into two segments
                            if ins_start and ins_start.protein_segment == ins_end.protein_segment:
                                # get/create split protein segments
                                slug_1 = ins_start.protein_segment.slug + "_1"
                                try:
                                    segment_before, created = ProteinSegment.objects.get_or_create(slug=slug_1,
                                        defaults={'name': ins_start.protein_segment.name,
                                        'category': ins_start.protein_segment.category, 'partial': True})
                                    if created:
                                        self.logger.info('Created protein segment {}'.format(segment_before))
                                except IntegrityError:
                                    segment_before = ProteinSegment.objects.get(slug=slug_1)

                                slug_2 = ins_start.protein_segment.slug + "_2"
                                try:
                                    segment_after, created = ProteinSegment.objects.get_or_create(slug=slug_2,
                                        defaults={'name': ins_start.protein_segment.name,
                                        'category': ins_start.protein_segment.category, 'partial': True})
                                    if created:
                                        self.logger.info('Created protein segment {}'.format(segment_after))
                                except IntegrityError:
                                    segment_after = ProteinSegment.objects.get(slug=slug_2)

                                # keep track of  information about split segments
                                split_segments[ins_start.protein_segment.slug] = {
                                    'start': {
                                        'sequence_number': ins['positions'][0],
                                        'segment': segment_before,
                                    },
                                    'end': {
                                        'sequence_number': ins['positions'][1],
                                        'segment': segment_after,
                                    },
                                }
                            # if the insertion covers two segments, use those two as the segments before and after
                            elif ins_start:
                                segment_before = ins_start.protein_segment
                                segment_after = ins_end.protein_segment

                            # if the insertion replaces a part of the sequence, add that range as a deletion
                            if ins['positions'][1] > (ins['positions'][0] + 1):
                                deletions += list(range((ins['positions'][0] + 1), ins['positions'][1]))

                            # get/insert fusion protein
                            fusion, create = ProteinFusion.objects.get_or_create(name=ins['name'], defaults={
                                'sequence': ins['sequence']})

                            # create relationship with protein
                            ProteinFusionProtein.objects.create(protein=p, protein_fusion=fusion,
                                segment_before=segment_before, segment_after=segment_after)

                    prs = Residue.objects.filter(protein_conformation=ppc).prefetch_related(
                        'protein_conformation__protein', 'protein_segment', 'generic_number',
                        'display_generic_number__scheme', 'alternative_generic_numbers__scheme')
                    updated_sequence = ''
                    for pr in prs:
                        if pr.sequence_number not in deletions:
                            r = Residue()
                            r.protein_conformation = pc
                            r.generic_number = pr.generic_number
                            r.display_generic_number = pr.display_generic_number
                            r.sequence_number = pr.sequence_number

                            # check for split segments
                            if pr.protein_segment.slug in split_segments:
                                rsns = split_segments[pr.protein_segment.slug]['start']['sequence_number']
                                rsne = split_segments[pr.protein_segment.slug]['end']['sequence_number']
                                if r.sequence_number <= rsns:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['start']['segment']
                                elif r.sequence_number >= rsne:
                                    r.protein_segment = split_segments[pr.protein_segment.slug]['end']['segment']
                            else:
                                r.protein_segment = pr.protein_segment

                            # amino acid, check for mutations
                            if r.sequence_number in mutations:
                                if mutations[r.sequence_number]['wt_res'] == pr.amino_acid:
                                    r.amino_acid = mutations[r.sequence_number]['mut_res']
                                else:
                                    self.logger.error('Mutation {} in construct {} does not match wild-type sequence' \
                                        + ' of {}'.format(mutations[r.sequence_number]['full'], pc.protein.name,
                                        ppc.protein.entry_name))
                            else:
                                r.amino_acid = pr.amino_acid

                            # save amino acid to updated sequence
                            updated_sequence += r.amino_acid

                            # save residue before populating M2M relations
                            r.save()

                            # alternative generic numbers
                            agns = pr.alternative_generic_numbers.all()
                            for agn in agns:
                                r.alternative_generic_numbers.add(agn)

                    # update sequence
                    p.sequence = updated_sequence
                    p.save()
Ejemplo n.º 13
0
    def main_func(self, positions, iteration):
        """Build construct proteins, conformations and residues from YAML files.

        Processes ``self.filenames[positions[0]:positions[1]]`` (up to the end
        of the list when ``positions[1]`` is falsy). Every regular, non-hidden
        file found under ``self.construct_data_dir`` is parsed as YAML and used
        to create a new ``Protein`` derived from the parent protein named in
        the file, one ``ProteinConformation`` in the default state, and one
        ``Residue`` per parent residue that is not deleted. Mutations listed
        in the file replace the parent amino acid; insertions register a
        ``ProteinFusion`` and, when they fall inside a single segment, split
        that segment into ``<slug>_1`` / ``<slug>_2``.

        Files that cannot be processed (no protein given, parent not found,
        protein or conformation could not be saved) are logged and skipped.

        Args:
            positions: Two-item sequence ``(start, stop)`` used to slice
                ``self.filenames``.
            iteration: Not used by this method.
        """
        # filenames: a falsy stop index means "until the end of the list"
        filenames = self.filenames[positions[0]:positions[1] or None]

        # parse files
        for source_file in filenames:
            source_file_path = os.sep.join(
                [self.construct_data_dir, source_file])
            # skip directories, missing files and hidden files
            if not os.path.isfile(source_file_path) or source_file[0] == '.':
                continue

            self.logger.info('Reading file {}'.format(source_file_path))
            # read the yaml file
            with open(source_file_path, 'r') as f:
                # NOTE(review): the original called yaml.load(f) without a
                # Loader, which can construct arbitrary Python objects on old
                # PyYAML and raises TypeError on PyYAML >= 6. SafeLoader only
                # builds plain mappings/lists/scalars, which is all this
                # method reads from the file.
                sd = yaml.load(f, Loader=yaml.SafeLoader)

            # is a protein specified? (an empty file parses to None)
            if not sd or 'protein' not in sd:
                self.logger.error(
                    'Protein not specified for construct, skipping')
                continue

            # fetch the parent protein
            try:
                ppc = ProteinConformation.objects.prefetch_related(
                    'protein__family', 'protein__species',
                    'protein__residue_numbering_scheme').get(
                        protein__entry_name=sd['protein'],
                        state__slug=settings.DEFAULT_PROTEIN_STATE)
            except ProteinConformation.DoesNotExist:
                # abort if parent protein is not found
                self.logger.error(
                    'Parent protein {} for construct {} not found, aborting!'
                    .format(sd['protein'], sd['name']))
                continue

            # sequence type (IntegrityError: another process created the row
            # between the lookup and the insert, so just fetch it)
            try:
                sequence_type, created = ProteinSequenceType.objects.get_or_create(
                    slug='mod', defaults={'name': 'Modified'})
                if created:
                    self.logger.info(
                        'Created sequence type {}'.format(sequence_type))
            except IntegrityError:
                sequence_type = ProteinSequenceType.objects.get(slug='mod')

            # protein source
            try:
                protein_source, created = ProteinSource.objects.get_or_create(
                    name='OTHER')
                if created:
                    self.logger.info(
                        'Created protein source {}'.format(protein_source))
            except IntegrityError:
                protein_source = ProteinSource.objects.get(name='OTHER')

            # create a protein record
            p = Protein()
            p.parent = ppc.protein
            p.family = ppc.protein.family
            p.species = ppc.protein.species
            p.residue_numbering_scheme = ppc.protein.residue_numbering_scheme
            p.sequence_type = sequence_type
            p.source = protein_source
            p.entry_name = slugify(strip_tags(sd['name']))
            p.name = sd['name']
            # placeholder; replaced by the modified sequence at the end
            p.sequence = ppc.protein.sequence

            # save protein (construct)
            try:
                p.save()
            except Exception:
                self.logger.error(
                    'Failed creating construct {} with parent protein {}'
                    .format(p.name, ppc.protein.entry_name))
                continue
            self.logger.info(
                'Created construct {} with parent protein {}'.format(
                    p.name, ppc.protein.entry_name))

            # create protein conformation record
            pc = ProteinConformation()
            pc.protein = p
            pc.state = ProteinState.objects.get(
                slug=settings.DEFAULT_PROTEIN_STATE)
            try:
                pc.save()
            except Exception:
                self.logger.error(
                    'Failed creating conformation {} of protein {}'.format(
                        pc.state.name, p.entry_name))
                # residues cannot be attached to an unsaved conformation
                continue
            self.logger.info(
                'Created conformation {} of protein {}'.format(
                    pc.state.name, p.name))

            # sequence numbers that must not be copied from the parent;
            # a set keeps the per-residue membership test cheap
            deletions = set()
            for t in sd.get('deletions') or []:
                # deletion ranges are inclusive on both ends
                deletions.update(range(t[0], t[1] + 1))

            # mutations are written as <wild type><position><mutant>
            mutations = {}
            for m in sd.get('mutations') or []:
                mutations[int(m[1:-1])] = {
                    'wt_res': m[0],
                    'mut_res': m[-1],
                    'full': m,
                }

            # insertions
            split_segments = {}
            for ins in sd.get('insertions') or []:
                pos_before, pos_after = ins['positions'][0], ins['positions'][1]
                ins_start = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=pos_before)
                ins_end = Residue.objects.get(
                    protein_conformation=ppc, sequence_number=pos_after)

                if ins_start.protein_segment == ins_end.protein_segment:
                    # the insertion is within only one segment (the usual
                    # case): split that segment into two partial segments
                    parent_segment = ins_start.protein_segment
                    segment_defaults = {
                        'name': parent_segment.name,
                        'category': parent_segment.category,
                        'partial': True,
                    }

                    slug_1 = parent_segment.slug + "_1"
                    try:
                        segment_before, created = ProteinSegment.objects.get_or_create(
                            slug=slug_1, defaults=segment_defaults)
                        if created:
                            self.logger.info(
                                'Created protein segment {}'.format(
                                    segment_before))
                    except IntegrityError:
                        segment_before = ProteinSegment.objects.get(
                            slug=slug_1)

                    slug_2 = parent_segment.slug + "_2"
                    try:
                        segment_after, created = ProteinSegment.objects.get_or_create(
                            slug=slug_2, defaults=segment_defaults)
                        if created:
                            self.logger.info(
                                'Created protein segment {}'.format(
                                    segment_after))
                    except IntegrityError:
                        segment_after = ProteinSegment.objects.get(
                            slug=slug_2)

                    # keep track of information about split segments
                    split_segments[parent_segment.slug] = {
                        'start': {
                            'sequence_number': pos_before,
                            'segment': segment_before,
                        },
                        'end': {
                            'sequence_number': pos_after,
                            'segment': segment_after,
                        },
                    }
                else:
                    # the insertion covers two segments: use those two as the
                    # segments before and after
                    segment_before = ins_start.protein_segment
                    segment_after = ins_end.protein_segment

                # if the insertion replaces a part of the sequence, add that
                # range as a deletion (empty when the positions are adjacent)
                deletions.update(range(pos_before + 1, pos_after))

                # get/insert fusion protein
                fusion, _ = ProteinFusion.objects.get_or_create(
                    name=ins['name'],
                    defaults={'sequence': ins['sequence']})

                # create relationship with protein
                ProteinFusionProtein.objects.create(
                    protein=p,
                    protein_fusion=fusion,
                    segment_before=segment_before,
                    segment_after=segment_after)

            # create residue records by copying the parent's residues
            prs = Residue.objects.filter(
                protein_conformation=ppc).prefetch_related(
                    'protein_conformation__protein', 'protein_segment',
                    'generic_number', 'display_generic_number__scheme',
                    'alternative_generic_numbers__scheme')
            sequence_parts = []
            for pr in prs:
                if pr.sequence_number in deletions:
                    continue

                r = Residue()
                r.protein_conformation = pc
                r.generic_number = pr.generic_number
                r.display_generic_number = pr.display_generic_number
                r.sequence_number = pr.sequence_number

                # check for split segments
                split = split_segments.get(pr.protein_segment.slug)
                if split is None:
                    r.protein_segment = pr.protein_segment
                elif r.sequence_number <= split['start']['sequence_number']:
                    r.protein_segment = split['start']['segment']
                elif r.sequence_number >= split['end']['sequence_number']:
                    r.protein_segment = split['end']['segment']

                # amino acid, check for mutations
                mutation = mutations.get(r.sequence_number)
                if mutation is None:
                    r.amino_acid = pr.amino_acid
                elif mutation['wt_res'] == pr.amino_acid:
                    r.amino_acid = mutation['mut_res']
                else:
                    # The original concatenated two strings and formatted
                    # only the second one, so the placeholders were never
                    # filled in; format the whole message instead.
                    self.logger.error(
                        'Mutation {} in construct {} does not match wild-type sequence of {}'
                        .format(mutation['full'], pc.protein.name,
                                ppc.protein.entry_name))
                    # Keep the parent's residue so the record and the stored
                    # sequence do not end up with a missing amino acid.
                    r.amino_acid = pr.amino_acid

                # save amino acid to updated sequence
                sequence_parts.append(r.amino_acid)

                # save residue before populating M2M relations
                r.save()

                # alternative generic numbers
                for agn in pr.alternative_generic_numbers.all():
                    r.alternative_generic_numbers.add(agn)

            # update sequence
            p.sequence = ''.join(sequence_parts)
            p.save()