Esempio n. 1
0
    def __init__(self,
                 pdb_file=None,
                 pdb_filename=None,
                 structure=None,
                 blast_path='blastp',
                 blastdb=os.sep.join([
                     settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb'
                 ])):
        """
        Initialise the parser state and parse the supplied structure.

        The structure source is picked in this order: ``pdb_file`` (a
        name/path or a handle to an open file), then ``pdb_filename``, and
        finally the ready-made ``structure`` object. The first two are read
        with ``PDBParser`` and element ``[0]`` of the result is kept.

        :param pdb_file: name/path or open file handle of a PDB file
        :param pdb_filename: alternative path of a PDB file
        :param structure: already parsed structure, used when no file is given
        :param blast_path: forwarded to ``BlastSearch``
        :param blastdb: forwarded to ``BlastSearch``
        """
        # containers filled while parsing:
        #   residues     - 'MappedResidue' objects with alignment and bw number info
        #   pdb_seq      - per-structure sequence data
        #   prot_id_list - uniprot ids returned from blast
        self.residues = {}
        self.pdb_seq = {}
        self.prot_id_list = []

        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename

        # local blast search setup
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb)

        # pdb_file takes precedence over pdb_filename
        source = self.pdb_file or self.pdb_filename
        if source:
            parser = PDBParser(PERMISSIVE=True, QUIET=True)
            self.pdb_structure = parser.get_structure('ref', source)[0]
        else:
            self.pdb_structure = structure

        self.parse_structure(self.pdb_structure)
    def __init__(
        self,
        pdb_file=None,
        pdb_filename=None,
        structure=None,
        blast_path="blastp",
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], "blast", "protwis_blastdb"]),
    ):
        """
        Prepare the parser and immediately parse the given structure.

        ``pdb_file`` (a name/path or an open file handle) is preferred, then
        ``pdb_filename``; if neither is set the pre-built ``structure`` is
        used unchanged. ``blast_path`` and ``blastdb`` are handed to
        ``BlastSearch``.
        """
        self.pdb_file, self.pdb_filename = pdb_file, pdb_filename

        # 'MappedResidue' objects storing information about alignments and bw numbers
        self.residues = {}
        self.pdb_seq = {}  # Seq('')
        # uniprot ids returned from blast
        self.prot_id_list = []
        # local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb)

        readable_source = pdb_file if pdb_file else pdb_filename
        if not readable_source:
            # nothing to read from disk: fall back to the supplied object
            self.pdb_structure = structure
        else:
            parsed = PDBParser(PERMISSIVE=True, QUIET=True).get_structure("ref", readable_source)
            self.pdb_structure = parsed[0]

        self.parse_structure(self.pdb_structure)
Esempio n. 3
0
    def __init__(self, pdb_file=None, sequence=None, wt_protein_id=None):
        """
        Parse a PDB file (if given) and map it onto the wild-type protein.

        :param pdb_file: path or handle of a PDB file; optional
        :param sequence: either a mapping of position -> residue letter or a
            plain string, which is converted to such a mapping (0-based index)
        :param wt_protein_id: primary key of the wild-type ``Protein``; when
            omitted, the wild type is looked up through the structure id read
            from the SEQRES section of ``pdb_file``

        Fixes relative to the previous version:
          * string sequences are now actually converted (the old check
            compared ``type(sequence)`` with the literal ``"string"`` and
            referenced a misspelled name);
          * ``wt_seq`` is always set, also when the wild type is resolved
            from the structure;
          * ``pdb_struct``, ``seqres`` and ``struct_id`` always exist, and
            ``parse_pdb`` is skipped when there is no structure to parse.
        """
        # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.segments = {}
        self.blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']))
        self.wt_protein_id = wt_protein_id

        # defaults so the attributes exist even without a pdb_file
        self.pdb_struct = None
        self.seqres = None
        self.struct_id = None

        if pdb_file is not None:
            self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]
            # a list of SeqRecord objects retrieved from the pdb SEQRES section;
            # SeqRecord id is a pdb_code:chain
            try:
                self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
                self.struct_id = self.seqres[0].id.split(':')[0]
            except Exception:
                # best effort: a missing/unreadable SEQRES section is tolerated
                self.seqres = None
                self.struct_id = None

        self.sequence = sequence
        if isinstance(sequence, str):
            self.sequence = dict(enumerate(sequence))

        # If not specified, attempt to get wildtype from pdb.
        self.wt = None
        if wt_protein_id:
            self.wt = Protein.objects.get(id=wt_protein_id)
        elif pdb_file is not None:
            try:
                self.wt = Structure.objects.get(pdb_code__index=self.struct_id).protein_conformation.protein.parent
            except Exception:
                # best effort, as before: an unknown structure leaves wt unset
                self.wt = None
        self.wt_seq = str(self.wt.sequence) if self.wt is not None else ''
        self.fusions = []

        if self.pdb_struct is not None:
            self.parse_pdb(self.pdb_struct)
        #if self.seqres:
        #    self.map_seqres()

        self.mark_deletions()
    def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, pdb_code=None, blast_path='blastp',
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1, sequence_parser=False, signprot=False):
        """
        Initialise the parser, either directly or through ``SequenceParser``.

        Without ``sequence_parser`` the structure is read from ``pdb_file``
        (name/path or open handle), else from ``pdb_filename``, else taken
        from ``structure``, and then passed to ``parse_structure``.

        With ``sequence_parser`` a ``SequenceParser`` is built on
        ``pdb_file``; its wild-type id comes from ``signprot.id`` when
        ``signprot`` is set, otherwise from the structure registered under
        ``pdb_code`` (if a code was given). The parser's ``pdb_struct``,
        ``mapping`` and ``wt`` are copied onto this object.
        """
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename
        # optional 4 letter pdb code
        self.pdb_code = pdb_code

        # 'MappedResidue' objects storing information about alignments and bw numbers
        self.residues = {}
        self.pdb_seq = {}
        # uniprot ids returned from blast
        self.prot_id_list = []
        # local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb, top_results=top_results)

        if not sequence_parser:
            source = self.pdb_file or self.pdb_filename
            if source:
                self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', source)[0]
            else:
                self.pdb_structure = structure
            self.parse_structure(self.pdb_structure)
            return

        # delegate to the sequence parser
        parser_kwargs = {'pdb_file': self.pdb_file}
        if pdb_code:
            # looked up even when signprot is given, exactly as before
            struct = Structure.objects.get(pdb_code__index=self.pdb_code)
        if signprot:
            parser_kwargs['wt_protein_id'] = signprot.id
        elif pdb_code:
            parser_kwargs['wt_protein_id'] = struct.protein_conformation.protein.parent.id
        s = SequenceParser(**parser_kwargs)
        self.pdb_structure = s.pdb_struct
        self.mapping = s.mapping
        self.wt = s.wt
Esempio n. 5
0
    def __init__(self, pdb_file, sequence=None, wt_protein_id=None):
        """
        Parse ``pdb_file`` and map it onto the wild-type protein.

        The wild type is the ``Protein`` with id ``wt_protein_id`` when one is
        given; otherwise it is resolved from the structure id found in the
        first SEQRES record of the file. ``sequence`` is accepted but not used
        by this constructor.
        """
        # 'ParsedResidue' objects storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.fusions = []

        blastdb_path = os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb'])
        self.blast = BlastSearch(blastdb=blastdb_path)

        self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]

        # SeqRecord objects retrieved from the pdb SEQRES section;
        # a SeqRecord id has the form pdb_code:chain
        self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
        first_record_id = self.seqres[0].id
        self.struct_id = first_record_id.split(':')[0]

        if wt_protein_id:
            self.wt = Protein.objects.get(id=wt_protein_id)
        else:
            # not specified: take the wild type from the stored structure
            self.wt = Structure.objects.get(pdb_code__index=self.struct_id).protein_conformation.protein.parent
        self.wt_seq = str(self.wt.sequence)

        self.parse_pdb(self.pdb_struct)
Esempio n. 6
0
    def post(self, request, *args, **kwargs):
        """
        Run a BLAST search for the posted sequence and render the hits.

        The posted ``input_seq`` is searched with up to 50 results. When the
        form contains a ``human`` field the human-only BLAST database is
        used, otherwise ``BlastSearch`` is built with its default database.
        The template receives ``results`` (pairs of ``Protein`` and the second
        element of each hit) and ``input`` (the posted sequence).
        """
        search_options = {'top_results': 50}
        if 'human' in request.POST:
            search_options['blastdb'] = os.sep.join(
                [settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb'])

        hits = BlastSearch(**search_options).run(request.POST['input_seq'])

        context = {
            'results': [(Protein.objects.get(pk=hit[0]), hit[1]) for hit in hits],
            'input': request.POST['input_seq'],
        }
        return render(request, self.template_name, context)
    def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, pdb_code=None, blast_path='blastp',
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1, sequence_parser=False, signprot=False):
        """
        Set up the parser state and load the structure.

        Two modes exist. With ``sequence_parser`` set, a ``SequenceParser``
        is created for ``pdb_file`` (wild type taken from ``signprot.id`` if
        ``signprot`` is given, else from the structure stored under
        ``pdb_code`` when a code is supplied) and its ``pdb_struct``,
        ``mapping`` and ``wt`` are adopted. Otherwise the structure is read
        from ``pdb_file``, then ``pdb_filename``, or taken from ``structure``
        and handed to ``parse_structure``.
        """
        # pdb_file can be either a name/path or a handle to an open file
        self.pdb_file, self.pdb_filename = pdb_file, pdb_filename
        # 4 letter pdb code, if specified
        self.pdb_code = pdb_code

        self.residues = {}       # 'MappedResidue' objects (alignments and bw numbers)
        self.pdb_seq = {}        # Seq('')
        self.prot_id_list = []   # uniprot ids returned from blast
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb, top_results=top_results)

        if sequence_parser:
            wt_kwargs = {}
            if pdb_code:
                struct = Structure.objects.get(pdb_code__index=self.pdb_code)
            if signprot:
                wt_kwargs['wt_protein_id'] = signprot.id
            elif pdb_code:
                wt_kwargs['wt_protein_id'] = struct.protein_conformation.protein.parent.id
            s = SequenceParser(pdb_file=self.pdb_file, **wt_kwargs)
            self.pdb_structure, self.mapping, self.wt = s.pdb_struct, s.mapping, s.wt
        else:
            readable = self.pdb_file if self.pdb_file else self.pdb_filename
            if readable:
                reader = PDBParser(PERMISSIVE=True, QUIET=True)
                self.pdb_structure = reader.get_structure('ref', readable)[0]
            else:
                self.pdb_structure = structure

            self.parse_structure(self.pdb_structure)
Esempio n. 8
0
	def handle(self, *args, **options):
		"""
		Optionally build a BLAST database, then run the requested queries.

		Options read:
		  * ``d``: an existing BLAST database to search
		  * ``make_db``: list used to build ``blastp_out.fasta`` when ``d`` is
		    not given: several entry names, the preset ``['xtal']`` (parent
		    proteins of all structures), the preset ``['all']`` (proteins of
		    families named 'Class...'), or a single raw sequence
		  * ``q``: query sequences; the top hit of each is printed

		Changes relative to the previous version:
		  * several entry names in ``make_db`` now produce a FASTA file
		    (before, nothing was written and makeblastdb ran on whatever
		    file happened to exist);
		  * the branch that built ``BlastSearch()`` without a database was
		    unreachable (``blastdb`` is always set) and has been dropped;
		  * FASTA text is assembled with a single join.
		"""
		if options['d']:
			blastdb = options['d'] ### FIXME import/parse blast db
		else:
			blastdb = 'blastp_out.fasta'
			targets = options['make_db']
			if targets:
				if len(targets) > 1:
					proteins = Protein.objects.filter(entry_name__in=targets)
					records = [(p.entry_name, p.sequence) for p in proteins]
				elif targets == ['xtal']:
					### xtal preset: each parent protein once, in structure order
					records = []
					seen = []
					for struct in Structure.objects.all():
						parent = struct.protein_conformation.protein.parent
						if parent not in seen:
							seen.append(parent)
							records.append((parent.entry_name, parent.sequence))
				elif targets == ['all']:
					receptor_fams = ProteinFamily.objects.filter(name__startswith='Class')
					proteins = Protein.objects.filter(accession__isnull=False, family__parent__parent__parent__in=receptor_fams)
					records = [(p.entry_name, p.sequence) for p in proteins]
				else:
					# a single value that is not a preset is taken as a raw sequence
					records = [('single input', targets[0])]

				with open('./blastp_out.fasta', 'w') as f:
					f.write(''.join('>{}\n{}\n'.format(name, seq) for name, seq in records))
				make_db_command = shlex.split('makeblastdb -in blastp_out.fasta -dbtype prot -parse_seqids')
				subprocess.call(make_db_command)

		if options['q']:
			for q in options['q']:
				bs = BlastSearch(blastdb=blastdb, top_results=1)
				for o in bs.run(q):
					print(o[0])
					print(o[1])
    def main_func(self, positions, iteration, count, lock):
        """
        Create database entries for non-human proteins from local UniProt files.

        :param positions: unused; kept for interface compatibility
        :param iteration: 1 processes SWISSPROT entries, 2 processes TREMBL entries
        :param count: shared counter (object with a ``value`` attribute) holding
            the index of the next file to process
        :param lock: context manager guarding ``count``

        For every ``.txt`` file in ``self.local_uniprot_dir`` the entry is
        parsed and skipped when it is from the wrong source for this
        iteration, is human, is not a construct (if ``self.constructs_only``)
        or already exists. A template protein is then searched: a human
        ortholog by gene name, then by entry-name prefix, and finally (for
        constructs only) the first BLAST hit. Entries without a template are
        skipped. Any exception is printed and logged, and ends the run.

        Fixes relative to the previous version:
          * the species check used a garbled literal and never matched, so
            human proteins were not skipped;
          * ``yaml.safe_load`` replaces ``yaml.load`` without a Loader;
          * file names without an extension are skipped instead of raising
            ``IndexError`` and aborting the whole run;
          * the shared counter is tested inside the lock, so a worker can no
            longer read past the end of the file list;
          * an empty BLAST result is reported as "template not found".
        """
        self.logger.info('CREATING OTHER PROTEINS')
        try:
            # go through constructs and finding their entry_names for lookup
            construct_entry_names = []
            self.logger.info('Getting construct accession codes')
            filenames = os.listdir(self.construct_data_dir)
            for source_file in filenames:
                source_file_path = os.sep.join(
                    [self.construct_data_dir, source_file])
                self.logger.info(
                    'Getting protein name from construct file {}'.format(
                        source_file))
                split_filename = source_file.split(".")
                if len(split_filename) < 2 or split_filename[1] != 'yaml':
                    continue

                # read the yaml file
                with open(source_file_path, 'r') as f:
                    sd = yaml.safe_load(f)

                # check whether protein is specified (an empty file loads as None)
                if not sd or 'protein' not in sd:
                    continue

                # append entry_name to lookup list
                construct_entry_names.append(sd['protein'])

            # parse files
            filenames = os.listdir(self.local_uniprot_dir)

            # Keep track of first or second iteration
            reviewed = ['SWISSPROT', 'TREMBL'][iteration - 1]
            skipped_due_to_swissprot = 0
            while True:
                # claim the next file; the bound is checked under the lock so
                # that concurrent workers cannot run past the end of the list
                with lock:
                    if count.value >= len(filenames):
                        break
                    source_file = filenames[count.value]
                    count.value += 1

                split_filename = source_file.split(".")
                if len(split_filename) < 2 or split_filename[1] != 'txt':
                    continue
                accession = split_filename[0]

                up = self.parse_uniprot_file(accession)

                # Skip TREMBL on first loop, and SWISSPROT on second
                if reviewed != up['source']:
                    continue

                # skip human proteins
                if 'species_latin_name' in up and up[
                        'species_latin_name'] == 'Homo sapiens':
                    continue

                # should proteins that are not constructs be skipped?
                if self.constructs_only and up[
                        'entry_name'] not in construct_entry_names:
                    continue

                # is this an ortholog of a human protein?
                ortholog = False

                # is there already an entry for this protein?
                try:
                    Protein.objects.get(entry_name=up['entry_name'])
                    continue
                except Protein.DoesNotExist:
                    p = None

                    # get human ortholog using gene name
                    for gene in up['genes']:
                        try:
                            g = Gene.objects.get(name__iexact=gene,
                                                 species__common_name="Human",
                                                 position=0)
                            ps = g.proteins.all().order_by('id')
                            p = ps[0]
                            ortholog = True
                            self.logger.info("Human ortholog found: {}".format(
                                p.entry_name))
                            break
                        except Gene.DoesNotExist:
                            self.logger.info(
                                "No gene found for {}".format(gene))
                            continue

                    # if gene name not found, try using entry name
                    if not p:
                        split_entry_name = up['entry_name'].split('_')

                        # add _ to the split entry name to avoid e.g. gp1 matching gp139
                        entry_name_query = split_entry_name[0] + '_'
                        try:
                            p = Protein.objects.get(
                                entry_name__startswith=entry_name_query,
                                species__common_name="Human")
                            ortholog = True
                            self.logger.info("Human ortholog found: {}".format(
                                p.entry_name))
                        except Protein.DoesNotExist:
                            self.logger.info("No match found for {}".format(
                                entry_name_query))

                    # check whether the entry name is in the construct list
                    if not p and up['entry_name'] in construct_entry_names:
                        # BLAST sequence to find closest hit (for reference positions)
                        blast = BlastSearch()
                        blast_out = blast.run(up['sequence'])

                        # use first hit from BLAST as template for reference positions
                        try:
                            p = Protein.objects.get(pk=blast_out[0][0])
                        except (Protein.DoesNotExist, IndexError):
                            # IndexError: BLAST returned no hits at all
                            print('Template protein for {} not found'.format(
                                up['entry_name']))
                            self.logger.error(
                                'Template protein for {} not found'.format(
                                    up['entry_name']))

                # skip if no ortholog is found FIXME use a profile to find a good template
                if not p:
                    continue

                # check whether an entry already exists for this protein/species
                # Skips unreviewed genes that have a matching SWISSPROT - Some human orthologues
                # can have several orthologues from same species. Eg: agtra_rat and agtrb_rat for AGTR1_HUMAN
                already_entry_names = list(
                    Protein.objects.filter(
                        family=p.family,
                        species__common_name=up['species_common_name'],
                        source__name="SWISSPROT").exclude(
                            entry_name=up['entry_name']).values_list(
                                'entry_name', flat=True))
                if "SWISSPROT" != up['source'] and len(already_entry_names):
                    skipped_due_to_swissprot += 1
                    continue
                elif len(already_entry_names):
                    self.logger.error(
                        "{} {} swissprot orthologue already there? {}".format(
                            up['entry_name'], accession, already_entry_names))

                # NOTE: automatic generation of reference-position files used
                # to happen here; that step is disabled.

                # create a database entry for the protein
                if ortholog:
                    # for orthologs, use properties from the human protein
                    self.create_protein(p.name, p.family, p.sequence_type,
                                        p.residue_numbering_scheme, accession,
                                        up)
                else:
                    # otherwise, create a new family, and use Uniprot name
                    top_level_parent_family = ProteinFamily.objects.get(
                        slug=p.family.slug.split('_')[0])
                    num_families = ProteinFamily.objects.filter(
                        parent=top_level_parent_family).count()
                    family_slug = top_level_parent_family.slug + "_" + str(
                        num_families + 1).zfill(3)
                    other_family, created = ProteinFamily.objects.get_or_create(
                        parent=top_level_parent_family,
                        name='Other',
                        defaults={'slug': family_slug})
                    if created:
                        self.logger.info(
                            'Created protein family {}'.format(other_family))

                    family_slug += '_001'
                    unclassified_family, created = ProteinFamily.objects.get_or_create(
                        parent=other_family,
                        name='Unclassified',
                        defaults={'slug': family_slug})
                    if created:
                        self.logger.info('Created protein family {}'.format(
                            unclassified_family))

                    num_families = ProteinFamily.objects.filter(
                        parent=unclassified_family).count()
                    family_slug = unclassified_family.slug + "_" + str(
                        num_families + 1).zfill(3)
                    pf, created = ProteinFamily.objects.get_or_create(
                        parent=unclassified_family,
                        name=up['genes'][0],
                        defaults={'slug': family_slug})
                    if created:
                        self.logger.info(
                            'Created protein family {}'.format(pf))

                    self.create_protein(up['genes'][0], pf, p.sequence_type,
                                        p.residue_numbering_scheme, accession,
                                        up)
            self.logger.info('COMPLETED CREATING OTHER PROTEINS')
        except Exception as msg:
            print(msg)
            self.logger.error(msg)
            PrintException()
Esempio n. 10
0
class SequenceParser(object):
    """
    Maps the PDB structure, its SEQRES records and any additionally given sequence onto the wild-type protein, using BLAST against the human sequence database. It produces a report of missing, mutated and inserted residues.
    """

    residue_list = [
        "ARG", "ASP", "GLU", "HIS", "ASN", "GLN", "LYS", "SER", "THR", "HIS",
        "HID", "PHE", "LEU", "ILE", "TYR", "TRP", "VAL", "MET", "PRO", "CYS",
        "ALA", "GLY"
    ]

    def __init__(self, pdb_file=None, sequence=None, wt_protein_id=None):
        """
        Parse a PDB file (if given) and map it onto the wild-type protein.

        :param pdb_file: path or handle of a PDB file; optional
        :param sequence: either a mapping of position -> residue letter or a
            plain string, which is converted to such a mapping (0-based index)
        :param wt_protein_id: primary key of the wild-type ``Protein``; when
            omitted and a ``pdb_file`` is given, the wild type is looked up
            through the structure id read from the SEQRES section

        Fixes relative to the previous version:
          * a missing/unreadable SEQRES section no longer crashes on
            ``self.seqres[0]``; ``struct_id`` is simply left as ``None``;
          * string sequences are actually converted (the old check compared
            ``type(sequence)`` with the literal ``"string"`` and referenced a
            misspelled name);
          * ``pdb_struct``, ``seqres`` and ``struct_id`` always exist, and
            ``parse_pdb`` is skipped when there is no structure to parse.
        """
        # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.segments = {}
        self.blast = BlastSearch(blastdb=os.sep.join(
            [settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']))

        # defaults so the attributes exist even without a pdb_file
        self.pdb_struct = None
        self.seqres = None
        self.struct_id = None

        if pdb_file is not None:
            self.pdb_struct = PDBParser(QUIET=True).get_structure(
                'pdb', pdb_file)[0]
            # a list of SeqRecord objects retrieved from the pdb SEQRES section
            try:
                self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
            except Exception:
                # best effort: an unreadable SEQRES section is tolerated
                self.seqres = None
            # SeqRecord id is a pdb_code:chain
            if self.seqres:
                self.struct_id = self.seqres[0].id.split(':')[0]

        self.sequence = sequence
        if isinstance(sequence, str):
            self.sequence = dict(enumerate(sequence))

        # If not specified, attempt to get wildtype from pdb.
        if not wt_protein_id and pdb_file is not None:
            self.wt = Structure.objects.get(
                pdb_code__index=self.struct_id
            ).protein_conformation.protein.parent
        else:
            self.wt = Protein.objects.get(id=wt_protein_id)
        self.wt_seq = str(self.wt.sequence)
        self.fusions = []

        if self.pdb_struct is not None:
            self.parse_pdb(self.pdb_struct)
        #if self.seqres:
        #    self.map_seqres()

        self.mark_deletions()

    def parse_pdb(self, pdb_struct):
        """
        Collect the residues of every chain and map each chain onto the wild type.

        For each chain a fresh mapping of wild-type sequence number ->
        ``ParsedResidue`` is created, the residues whose name is listed in
        ``residue_list`` ('HID' counts as 'HIS') are stored, and every
        peptide returned by ``get_chain_peptides`` is aligned with
        ``map_to_wt_blast``, starting at the number of its first residue.
        """
        wt_residues = list(
            Residue.objects.filter(protein_conformation__protein=self.wt.id))

        for chain in pdb_struct:
            chain_key = chain.id
            self.residues[chain_key] = []

            # one independent set of ParsedResidue objects per chain
            chain_mapping = {}
            for wt_res in wt_residues:
                generic = None
                if wt_res.display_generic_number:
                    generic = str(wt_res.display_generic_number)
                chain_mapping[wt_res.sequence_number] = ParsedResidue(
                    wt_res.amino_acid, wt_res.sequence_number, generic,
                    wt_res.protein_segment)
            self.mapping[chain_key] = chain_mapping

            # the residue id is a tuple of (hetatm flag, residue number, insertion code)
            for res in chain:
                if res.resname.replace('HID', 'HIS') in self.residue_list:
                    self.residues[chain_key].append(res)

            for peptide in self.get_chain_peptides(chain_key):
                first_number = int(peptide[0].id[1])
                self.map_to_wt_blast(chain_key, peptide, None, first_number)

    def get_segments(self):
        """
        Fill and return ``self.segments`` for the first mapped chain.

        For every ``ProteinSegment`` the residue numbers (``resnum``) mapped
        for the wild-type residues of that segment are collected; segments
        with at least one mapped residue are stored under their slug as
        ``[lowest, highest]``.
        """
        # only the first chain is considered
        first_chain = list(self.mapping.keys())[0]
        chain_map = self.mapping[first_chain]

        for segment in ProteinSegment.objects.all():
            segment_residues = Residue.objects.filter(
                protein_conformation__protein=self.wt.id,
                protein_segment=segment)
            numbers = [
                chain_map[r.sequence_number].resnum
                for r in segment_residues
                if chain_map[r.sequence_number].resnum is not None
            ]
            if numbers:
                self.segments[segment.slug] = [min(numbers), max(numbers)]
        return self.segments

    def get_chain_peptides(self, chain_id, gap_threshold=230):
        """
        Split the residues of a chain into peptides at large numbering gaps.

        Consecutive residues stay in the same peptide unless the next residue
        number is not the direct successor and differs from it by more than
        ``gap_threshold``. The default of 230 is the maximum length of ICL3;
        fusion proteins usually have significantly different numbers, i.e.
        they exceed that gap between TM5 and 6.

        The maximum allowed gap size can be evaluated automatically, but it is fairly costly:
        max([len(Residue.objects.filter(protein_segment=11, protein_conformation__protein=x)) for x in Protein.objects.filter(species=1)])

        Returns a list of lists of residues (empty for a chain without residues).
        """
        chain_residues = self.residues[chain_id]
        numbers = [int(res.id[1]) for res in chain_residues]
        final_index = len(numbers) - 1

        peptides = []
        current = []
        for index, number in enumerate(numbers):
            current.append(chain_residues[index])

            if index == final_index:
                #FIXME: Assuming that very last residue is actually continuation of a chain
                peptides.append(current)
                break

            expected_next = number + 1
            actual_next = numbers[index + 1]
            if actual_next != expected_next and abs(
                    expected_next - actual_next) > gap_threshold:
                # gap too large: close the current peptide
                peptides.append(current)
                current = []
        return peptides

    def get_chain_sequence(self, chain):
        """
        Build the one-letter sequence string of the given chain.

        Only residues whose name is in ``residue_list`` contribute; 'HID' is
        translated as 'HIS'.
        """
        letters = []
        for residue in self.residues[chain]:
            if residue.resname not in self.residue_list:
                continue
            letters.append(
                polypeptide.three_to_one(residue.resname.replace('HID', 'HIS')))
        return "".join(letters)

    def get_peptide_sequence(self, residues):
        """
        Build the one-letter sequence string for a list of Bio.PDB.Residue objects.

        Residues whose name is not in ``residue_list`` are left out; 'HID' is
        translated as 'HIS'.
        """
        standard = (res for res in residues if res.resname in self.residue_list)
        return "".join(
            polypeptide.three_to_one(res.resname.replace('HID', 'HIS'))
            for res in standard)

    def find_nonredundant_chains(self):
        """
        Returns a list of nonidentical chains.

        Chains are visited in the order of ``self.mapping``; a chain is kept
        when its mapping differs from the mapping of every chain kept so far,
        so each distinct mapping is represented by its first chain.

        The previous implementation appended a chain once for every chain it
        differed from (producing duplicates), returned nothing at all when
        every chain was identical, and returned a ``dict_keys`` view instead
        of a list in the single-chain case.

        NOTE(review): equality is whatever ``!=`` means for the stored
        mapping values -- confirm that they compare by content.
        """
        unique_chains = []
        for chain in self.mapping:
            if all(self.mapping[chain] != self.mapping[kept]
                   for kept in unique_chains):
                unique_chains.append(chain)
        return unique_chains

    def map_to_wt_blast(self,
                        chain_id,
                        residues=None,
                        sequence=None,
                        starting_aa=1,
                        seqres=False):
        """
        BLAST a sequence and map the hits for the wild type onto the chain.

        The query is the sequence of ``residues`` if given, else ``sequence``,
        else the full sequence of chain ``chain_id``. For every returned
        alignment:

          * if the first HSP has an expect value above 0.5 and ``residues``
            were given, the residues are recorded as a fusion
            (``AuxProtein``); when they make up the whole chain, the chain's
            mapping is removed;
          * alignments whose id is not the wild type's are ignored;
          * otherwise every HSP is passed to ``map_hsps``.
        """
        if residues:
            query = self.get_peptide_sequence(residues)
        elif sequence:
            query = sequence
        else:
            query = self.get_chain_sequence(chain_id)

        for alignment in self.blast.run(query):
            record = alignment[1]

            if record.hsps[0].expect > .5 and residues:
                self.fusions.append(AuxProtein(residues))
                # The case when auxiliary protein is in a separate chain
                whole_chain = self.get_chain_sequence(chain_id)
                if whole_chain == self.get_peptide_sequence(residues):
                    del self.mapping[chain_id]
                continue

            if int(alignment[0]) == self.wt.id:
                for hsp in record.hsps:
                    self.map_hsps(hsp, chain_id, starting_aa, seqres)

    def map_hsps(self, hsps, chain_id, offset=1, seqres=False):
        """
        Analyzes the High Similarity Protein Segment.

        Walks the aligned subject (``hsps.sbjct``) and query (``hsps.query``)
        in lockstep, starting at ``hsps.sbjct_start`` / ``hsps.query_start``,
        and annotates ``self.mapping[chain_id]``:

          * identical letters: mark SEQRES presence (``seqres``) or store the
            residue number ``offset - 1 + query position``;
          * two different letters: store the residue number and the mutation;
          * gap in the subject: record an insertion on the entry keyed by the
            residue number;
          * gap in the query: record a deletion.

        Both position counters advance by one after every aligned column.
        """
        subject_pos = hsps.sbjct_start
        query_pos = hsps.query_start

        for subject_aa, query_aa in zip(hsps.sbjct, hsps.query):
            if subject_aa == query_aa:
                entry = self.mapping[chain_id][subject_pos]
                if seqres:
                    entry.set_seqres(True)
                else:
                    entry.set_pdb_res_num(offset - 1 + query_pos)
            elif subject_aa != '-' and query_aa != '-':
                entry = self.mapping[chain_id][subject_pos]
                entry.set_pdb_res_num(offset - 1 + query_pos)
                entry.set_mutation(query_aa)
            elif subject_aa == '-':
                # the letters differ, so the query letter is not a gap here
                self.mapping[chain_id][offset - 1 + query_pos].set_insertion(query_aa)
            else:
                # remaining case: subject letter present, query gap
                self.mapping[chain_id][subject_pos].set_deletion()

            subject_pos += 1
            query_pos += 1

    def map_to_wt_pw(self,
                     chain_id,
                     residues=None,
                     sequence=None,
                     starting_aa=1,
                     seqres=False):
        """
        Map a sequence onto the wild type with a local pairwise alignment.

        @param chain_id: key of the chain in ``self.mapping`` to annotate
        @param residues: list of Bio.PDB.Residue objects to align (takes precedence)
        @param sequence: a dictionary of residue number: residue one letter code pairs
            (a plain string is accepted as well)
        @param starting_aa: number added to the running offset when indexing the mapping
        @param seqres: when true, matching positions are flagged as present in SEQRES

        Does nothing when neither ``residues`` nor ``sequence`` is given, or
        when no alignment is found.

        Fixes relative to the previous version: ``seqres`` and ``chain`` were
        undefined names (now a keyword argument and ``chain_id``), the residue
        list was passed to ``get_chain_sequence`` (which expects a chain id),
        a dict view was passed to the aligner instead of a string, and a
        leftover debug ``print`` was removed.

        NOTE(review): ``self.wt_seq_start`` and the ``add_*`` methods used
        below are not defined anywhere in the visible code -- confirm they
        exist before relying on this method.
        """
        if residues:
            seq = self.get_peptide_sequence(residues)
        elif sequence:
            if isinstance(sequence, str):
                seq = sequence
            else:
                seq = "".join(sequence.values())
        else:
            return

        alignments = pairwise2.align.localms(
            self.wt_seq, seq, 2, -4, -4, -.1, one_alignment_only=True)
        if not alignments:
            return
        wt, chain_seq, score, start, end = alignments[0]

        offset = 0
        for w, c in zip(wt, chain_seq):
            if w == c:
                if seqres:
                    self.mapping[chain_id][starting_aa + offset].seqres = True
                r = Residue.objects.get(
                    sequence_number=offset + self.wt_seq_start,
                    protein_conformation__protein=self.wt.id)
                if r.display_generic_number is not None:
                    self.mapping[chain_id][starting_aa + offset].add_gpcrdb(
                        r.display_generic_number)
                offset += 1
            elif c == '-' and w != '-':
                # offset intentionally left unchanged, as in the original
                self.mapping[chain_id][starting_aa + offset].add_deletion()
            elif w != '-' and c != '-' and w != c:
                self.mapping[chain_id][starting_aa + offset].add_mutation(c)
                offset += 1
            elif w == '-' and c != '-':
                self.mapping[chain_id][starting_aa + offset].add_insertion(c)
                offset += 1

    def map_seqres(self):
        """
        Run the BLAST based wild type mapping for every SEQRES record.

        Each record in self.seqres is passed to map_to_wt_blast() with the
        chain taken from its 'chain' annotation, its sequence, and the seqres
        flag switched on.
        """
        for record in self.seqres:
            chain = record.annotations['chain']
            self.map_to_wt_blast(chain, sequence=record.seq, seqres=True)

    def mark_deletions(self):
        """
        Flag every mapped residue that has no residue number as a deletion.

        Goes through all chains in self.mapping and calls set_deletion() on
        each residue whose resnum is None.
        """
        for chain_map in self.mapping.values():
            for residue in chain_map.values():
                if residue.resnum is not None:
                    continue
                residue.set_deletion()

    def get_mapping_dict(self, pdb_keys=False, seqres=False):
        """
        Export self.mapping as plain nested dictionaries.

        The result has the same chain and residue keys as self.mapping; the
        two flags only select the value stored for each residue:

        - pdb_keys and seqres: the residue's seqres attribute
        - seqres only: resnum when the seqres attribute is truthy, else '-'
        - otherwise: resnum

        @param pdb_keys: selects the first value variant together with seqres
        @param seqres: see the list above
        @return: {chain: {residue key: value}}
        """
        if pdb_keys and seqres:
            def pick(res):
                return res.seqres
        elif seqres:
            def pick(res):
                return res.resnum if res.seqres else '-'
        else:
            def pick(res):
                return res.resnum

        exported = {}
        for chain in self.mapping.keys():
            chain_map = self.mapping[chain]
            exported[chain] = {
                num: pick(chain_map[num])
                for num in chain_map.keys()
            }
        return exported

    def get_fusions(self):
        """
        Collect the auxiliary (fusion) proteins stored in self.fusions.

        @return: an empty dict when there are no fusions, otherwise an
        OrderedDict of the form {"auxiliary": {"aux1": ..., "aux2": ...}}
        holding the get_info() result of each fusion under its own numbered
        key, in the order of self.fusions
        """
        if not self.fusions:
            return {}
        fusion_dict = OrderedDict({"auxiliary": {}})
        # number the entries so later fusions do not overwrite earlier ones
        for count, fusion in enumerate(self.fusions, start=1):
            fusion_dict["auxiliary"]["aux{}".format(count)] = fusion.get_info()
        return fusion_dict

    def get_deletions(self):
        """
        Summarise the deleted residues of every non-redundant chain.

        For each chain returned by find_nonredundant_chains() the keys of
        self.mapping[chain] whose residue has a truthy deletion attribute are
        grouped, in mapping order, into runs of neighbouring positions
        (absolute difference of 1 to the previous deleted position). Every run
        becomes one entry; chains without deletions contribute no entry.

        @return: {"deletions": [OrderedDict with "start", "end", "type"
        ("single" or "range") and "chain"], ...}
        """
        deletions_list = []

        for chain in self.find_nonredundant_chains():
            runs = []
            for num, res in self.mapping[chain].items():
                if not res.deletion:
                    continue
                if runs and abs(num - runs[-1][-1]) == 1:
                    # adjacent to the previous deleted position: same run
                    runs[-1].append(num)
                else:
                    runs.append([num])

            for run in runs:
                deletions_list.append(
                    OrderedDict({
                        "start": min(run),
                        "end": max(run),
                        "type": "single" if len(run) == 1 else "range",
                        "chain": chain
                    }))

        return {"deletions": deletions_list}

    def get_mutations(self):
        """
        List the mutated residues of every non-redundant chain.

        @return: {"mutations": [...]} with one OrderedDict per residue whose
        mutation attribute is truthy, holding the residue name ("wt"), the
        mutation ("mut"), the mapping key ("pos (wt)"), the residue's resnum
        ("pos (pdb)") and the chain
        """
        collected = []
        for chain_id in self.find_nonredundant_chains():
            collected.extend(
                OrderedDict({
                    "wt": residue.name,
                    "mut": residue.mutation,
                    "pos (wt)": position,
                    "pos (pdb)": residue.resnum,
                    "chain": chain_id
                })
                for position, residue in self.mapping[chain_id].items()
                if residue.mutation)
        return {"mutations": collected}

    def get_report(self):
        """
        Print the mapping to stdout, chain by chain.

        Chains and, within each chain, residue keys are printed in sorted
        order; each residue is printed through its own string representation.
        """
        for chain_id in sorted(self.mapping.keys()):
            print("Chain {}".format(chain_id))
            chain_map = self.mapping[chain_id]
            for number in sorted(chain_map.keys()):
                print(chain_map[number])

    def save_excel_report(self, file_name):
        """
        Write the mapping to an Excel workbook, one worksheet per chain.

        Worksheets are created in sorted chain order. Row 0 holds the column
        titles; the following rows hold get_param_list() of each residue, in
        sorted residue key order.

        @param file_name: passed to xlsxwriter.Workbook as the output file
        """
        header = [
            "Protein number", "Residue name", "PDB number",
            "Generic number", "Mutation", "SEQRES"
        ]
        workbook = xlsxwriter.Workbook(file_name)

        for chain in sorted(self.mapping.keys()):
            worksheet = workbook.add_worksheet(chain)
            worksheet.write_row(0, 0, header)

            ordered_keys = sorted(self.mapping[chain].keys())
            for row_id, res in enumerate(ordered_keys, start=1):
                entry = self.mapping[chain][res]
                worksheet.write_row(row_id, 0, entry.get_param_list())
        workbook.close()
    def handle(self, *args, **options):
        """
        Compare Ensembl (GRCh37) / GTEx transcripts of human GPCRs with the LMB isoform annotation.

        Inputs read from protein/data: Isoform_annotation_table.txt,
        matched_gtex.json and the FASTA files in LMB_sequences/. Genes,
        transcripts and protein sequences are looked up through
        fetch_from_web_api() and candidate new transcripts are checked with
        BlastSearch.

        Outputs written to protein/data:
        - 20190726_transcripts.fa (rewritten)
        - 20190726_skipped_due_to_gtex.txt (appended)
        - 20190726_new_transcripts_for_consideration.txt (appended)
        - all_isoforms_gtex.json (rewritten after every protein and at the end)

        Progress messages and a summary are printed to stdout.

        @param args: not used
        @param options: not used
        """

        ## Prepare comparison info ##
        # Tab-separated table; the first row is skipped. Field 1 is turned into
        # the "<name>_human" entry name, field 4 is a ", "-separated list of
        # transcript ids.
        filepath = 'protein/data/Isoform_annotation_table.txt'
        lmb_data = OrderedDict()
        total_lmb_isoforms = 0
        all_lmb_isoforms = []
        with open(filepath, "r", encoding='UTF-8') as f:
            for i,row in enumerate(f):
                if i>0:
                    c = row.split("\t")
                    entry_name = "{}_human".format(c[1].lower())
                    transcripts = c[4].split(", ")

                    if not entry_name in lmb_data:
                        lmb_data[entry_name] = []
                    lmb_data[entry_name] += transcripts
                    total_lmb_isoforms += 1
                    all_lmb_isoforms += transcripts

        print('all_lmb_isoforms',len(all_lmb_isoforms),'distinct',len(set(all_lmb_isoforms)))

        ## Get parsed gtex annotation
        with open('protein/data/matched_gtex.json') as json_file:
            gtex_old = json.load(json_file)

        ## Need to rewrite these entries, as ensembl doesn't use the . for transcripts
        # Keys are "<transcript>_<gene>"; everything after the first "." of the
        # transcript part is dropped.
        gtex = {}
        for key, val in gtex_old['transcripts'].items():
            t,g = key.split("_")
            new_key = "{}_{}".format(t.split(".")[0],g)
            gtex[new_key] = val

        ## Url API to map genename to ensemble ID
        cache_dir_genes = ['gtexportal', 'gene_lookup']
        url_gene = 'https://gtexportal.org/rest/v1/reference/gene?geneId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19&pageSize=250&format=json'

        ## Url to lookup ensemble ID to find transcripts
        cache_dir_transcripts_gtex = ['gtexportal', 'transcripts']
        url_transcripts = 'https://gtexportal.org/rest/v1/reference/transcript?gencodeId=$index&gencodeVersion=v19&genomeBuild=GRCh37%2Fhg19'

        cache_dir_transcripts = ['ensembl37', 'transcripts']
        url_ensembl = 'https://grch37.rest.ensembl.org/lookup/id/$index?expand=1;content-type=application/json'

        cache_dir_gtex_expression  = ['gtexportal', 'expression_data']
        url_expression = 'https://gtexportal.org/rest/v1/expression/medianTranscriptExpression?datasetId=gtex_v7&gencodeId=$index&format=json'

        ## Url to lookup sequence of transcript
        cache_dir_seq = ['ensembl37', 'seq_protein']
        url_ensembl_seq = 'https://grch37.rest.ensembl.org/sequence/id/$index?content-type=application/json;type=protein'

        # Get all human GPCRs
        ps = Protein.objects.filter(sequence_type__slug='wt', species__common_name="Human", family__slug__startswith='00').all().prefetch_related('genes').order_by('entry_name')

        isoforms = {}
        total_transcripts = 0
        total_transcript_skipped_no_tissue=0
        total_proteins_with_isoforms = 0
        gene_to_ensembl = {}
        transcripts_ids_total = set()
        transcripts_ids_skipped_total = set()
        total_fetched_transcripts = 0
        canonical_disagreement_count = 0

        total_new_transcripts = []
        total_not_found = []
        total_not_found_due_to_skipped = []
        new_proteins = set()

        lmb_compare_sequences = [0,0,0] # correct, wrong, not exists in lmb

        # sequence -> id of the first kept transcript with that sequence
        sequence_lookup = {}

        ## COMPARE SEQUENCES
        # Collect every sequence of the LMB FASTA files keyed by the id taken
        # from the preceding header line (text after its first two characters).
        filenames = os.listdir("protein/data/LMB_sequences/")
        all_lmb_sequences= {}
        for f in filenames:
            with open ("protein/data/LMB_sequences/"+f, "r") as myfile:
                fasta=myfile.read().splitlines()
                for i,l in enumerate(fasta):
                    if l[0]==">":
                        e_id = l[2:]
                        continue
                    if e_id in all_lmb_sequences:
                        print('already there!',e_id)
                    # sequence lines at index 2 or lower are not stored
                    if i>2:
                        all_lmb_sequences[e_id]=l
        print('all_lmb_sequences',len(all_lmb_sequences))

        missing_sequences = 0
        total_lmb_sequences = 0
        # sequence -> list of [id, entry_name] pairs sharing that sequence
        sequences_lookup = defaultdict(list)
        with open("protein/data/20190726_transcripts.fa", "w") as transcripts_fa:
            for p,ts in lmb_data.items():
                seq = Protein.objects.get(entry_name=p).sequence
                sequences_lookup[seq].append([p,p])
                transcripts_fa.write(">{} GPCRdb sequence reference\n".format(p))
                transcripts_fa.write("{}\n".format(seq))
                seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p)
                lmb_sequences = {}
                try:
                    with open (seq_filename, "r") as myfile:
                        fasta=myfile.read().splitlines()
                        for i,l in enumerate(fasta):
                            if l[0]==">":
                                e_id = l[2:]
                                continue
                            lmb_sequences[e_id]=l
                            if i>2:
                                total_lmb_sequences += 1
                except Exception:
                    # Best effort: a missing or unparsable LMB file counts all
                    # transcripts of this protein as missing.
                    missing_sequences += len(ts)
                for t in ts:
                    if not t in lmb_sequences:
                        missing_sequences += 1

                    seq = fetch_from_web_api(url_ensembl_seq, t,cache_dir_seq)['seq']
                    sequences_lookup[seq].append([t,p])
                    if t in lmb_sequences:
                        if seq!=lmb_sequences[t]:
                            print(t,'different from LBM - length ensembl:',len(seq),"length lmb:",len(lmb_sequences[t]))
                    transcripts_fa.write(">{} ({})\n".format(t,p))
                    transcripts_fa.write("{}\n".format(seq))
        print('total missing sequences',missing_sequences)
        print('total lmb transcript sequences provided',total_lmb_sequences)
        print('total lmb protein',len(lmb_data))
        for seq,ts in sequences_lookup.items():
            if len(ts)>1:
                print('Identical sequence:',ts)

        # Start over: from here on the lookup only holds transcripts found below
        sequences_lookup = defaultdict(list)
        all_transcript_seq = {}
        for p in ps:
            transcripts = []
            transcripts_ids = []
            transcripts_ids_skipped = []
            ensembl_transcripts_count = 0
            genes = list(p.genes.all().values_list('name',flat=True))
            uniprot = p.accession
            canonical = ''
            canon_seq = p.sequence
            grch37_canonical_seq = ''
            uniprot_canonical = ''
            grch37_canonical = ''

            seq_filename = "protein/data/LMB_sequences/{}_nonstrict_transcripts.fa".format(p.entry_name)
            lmb_sequences = {}
            try:
                with open (seq_filename, "r") as myfile:
                    fasta=myfile.read().splitlines()
                    for l in fasta:
                        if l[0]==">":
                            e_id = l[2:]
                            continue
                        lmb_sequences[e_id]=l
            except Exception:
                # Best effort: without a readable LMB file the comparison
                # below simply finds no LMB sequence for this protein.
                pass

            alternative_ids_uniprot = self.find_ensembl_id_by_uniprot(uniprot)
            ensembl_gene_id = []
            for gene in genes:
                if not gene:
                    continue
                gene_lookup = fetch_from_web_api(url_gene, gene, cache_dir_genes)

                same_gene_id = ''
                if gene_lookup['gene']:
                    for gene_info in gene_lookup['gene']:
                        # only exact symbol matches are accepted
                        if gene_info['geneSymbol']==gene:
                            ensembl_gene_id.append(gene_info['gencodeId'])

            if len(ensembl_gene_id)>1:
                print(ensembl_gene_id,'MORE THAN 1 !!!!')

            if len(ensembl_gene_id)==0:
                print('No ID found, using uniprot')
                if alternative_ids_uniprot['genes']:
                    ensembl_gene_id = alternative_ids_uniprot['genes'][0]
                else:
                    print("NO ID FOR THIS RECEPTOR")
                    continue
            else:
                ensembl_gene_id = ensembl_gene_id[0]

            gene_to_ensembl[p.entry_name] = ensembl_gene_id
            ensembl_transcripts = fetch_from_web_api(url_ensembl, ensembl_gene_id, cache_dir_transcripts)
            if (alternative_ids_uniprot['genes'] and ensembl_gene_id.split(".")[0]!=alternative_ids_uniprot['genes'][0]):
                print("##### ensembl gene id changed",ensembl_gene_id,alternative_ids_uniprot['genes'][0])

            same_gene_id = True
            if not ensembl_transcripts:
                print('error',alternative_ids_uniprot,ensembl_gene_id)
                same_gene_id = False
                # Retry with the first gene id from the UniProt lookup, if any
                if alternative_ids_uniprot['genes']:
                    ensembl_transcripts = fetch_from_web_api(url_ensembl, alternative_ids_uniprot['genes'][0], cache_dir_transcripts)
            if not ensembl_transcripts:
                # Nothing to iterate over; indexing the empty result would fail
                print('No Ensembl transcripts found for',p.entry_name)
                continue

            for t in ensembl_transcripts['Transcript']:
                ensembl_transcripts_count += 1
                display_name = t['display_name']
                is_canonical = t['is_canonical']
                biotype = t['biotype']
                t_id = t['id']

                # Only interested in protein_coding
                if biotype=='protein_coding':
                    total_fetched_transcripts += 1

                    key = '{}_{}'.format(t_id,ensembl_gene_id)

                    # Skip transcripts without a GTEx entry
                    if not key in gtex:
                        total_transcript_skipped_no_tissue += 1
                        transcripts_ids_skipped_total.add(t_id)
                        transcripts_ids_skipped.append(t_id)
                        continue

                    # Skip transcripts whose GTEx "count" is below 3
                    if gtex[key]["count"]<3:
                        total_transcript_skipped_no_tissue += 1
                        transcripts_ids_skipped_total.add(t_id)
                        transcripts_ids_skipped.append(t_id)
                        continue

                    length = t['Translation']['length']
                    seq_id = t['Translation']['id']
                    transcript_info = OrderedDict([('display_name',display_name),('t_id',t_id),('length',length), ('seq_id',seq_id), ('expressed',gtex[key])])
                    seq = fetch_from_web_api(url_ensembl_seq, seq_id,cache_dir_seq)

                    if is_canonical:
                        grch37_canonical = t_id
                        transcript_info['grch37_canonical'] = True
                        grch37_canonical_seq = seq['seq']

                    if seq['seq']==canon_seq:
                        uniprot_canonical = t_id
                        transcript_info['uniprot_canonical'] = True
                        # Skip canonical entries
                        continue

                    sequences_lookup[seq['seq']].append([t_id,p.entry_name])
                    all_transcript_seq[t_id] = seq['seq']
                    # Keep only the first transcript of each distinct sequence
                    if seq['seq'] in sequence_lookup:
                        print('SEQUENCE ALREADY SEEN',t_id, sequence_lookup[seq['seq']])
                        continue
                    sequence_lookup[seq['seq']] = t_id

                    transcript_info['seq'] = seq['seq']
                    # lmb_sequences: False = not in LMB, True = identical,
                    # otherwise the differing LMB sequence itself
                    if not t_id in lmb_sequences:
                        transcript_info['lmb_sequences'] = False
                        lmb_compare_sequences[2] += 1
                    else:
                        if lmb_sequences[t_id]==seq['seq']:
                            transcript_info['lmb_sequences'] = True
                            lmb_compare_sequences[0] += 1
                        else:
                            transcript_info['lmb_sequences'] = lmb_sequences[t_id]
                            lmb_compare_sequences[1] += 1

                    if t_id in alternative_ids_uniprot['transcripts']:
                        transcript_info['in_uniprot'] = True
                    else:
                        transcript_info['in_uniprot'] = False

                    if p.entry_name in lmb_data and t_id in lmb_data[p.entry_name]:
                        transcript_info['in_lmb'] = True
                    else:
                        transcript_info['in_lmb'] = False

                    if t_id not in transcripts_ids:
                        transcripts.append(transcript_info)
                        transcripts_ids.append(t_id)
                        transcripts_ids_total.add(t_id)
                    total_transcripts += 1

            # LMB transcripts that were not kept above, split by whether the
            # GTEx filter was the reason
            not_found = []
            not_found_due_to_skipped = []
            if p.entry_name in lmb_data:
                for t in lmb_data[p.entry_name]:
                    if t not in transcripts_ids:
                        if t in transcripts_ids_skipped:
                            not_found_due_to_skipped.append(t)
                            key = '{}_{}'.format(t,ensembl_gene_id)
                            if not key in gtex:
                                reason = 'Not in GTEX'
                            else:
                                reason = 'Subjects low in GTEX - count is {} - subject ids {}'.format(gtex[key]['count'],", ".join(gtex[key]['subjects']))
                            with open("protein/data/20190726_skipped_due_to_gtex.txt", "a") as f:
                                f.write("{}: {}\n".format(t,reason))
                        else:
                            not_found.append(t)

            total_not_found += not_found
            total_not_found_due_to_skipped += not_found_due_to_skipped

            # Kept transcripts that LMB does not list for this protein
            new = []
            for t in transcripts_ids:
                if p.entry_name in lmb_data and t in lmb_data[p.entry_name]:
                    pass
                else:
                    ts_check = sequences_lookup[all_transcript_seq[t]]
                    # NOTE(review): t_check is an [id, entry_name] pair, so this
                    # membership test against transcript ids cannot match, and
                    # the continue only affects this inner loop - confirm the
                    # intended handling of duplicates.
                    for t_check in ts_check:
                        if p.entry_name in lmb_data and t_check in lmb_data[p.entry_name]:
                            print('found via duplicate',t_check,t)
                            continue
                    key = '{}_{}'.format(t,ensembl_gene_id)

                    blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb']), top_results=2)
                    blast_out = blast.run(all_transcript_seq[t])
                    result = [(Protein.objects.get(pk=x[0]).entry_name, x[1].hsps[0].expect) for x in blast_out]
                    if result:
                        # Accept only when the best hit is this very protein
                        # with an expect value below 0.05
                        if result[0][0]==p.entry_name and result[0][1]<0.05:
                            reason = 'GTEX count: {}'.format(gtex[key]['count'])
                            with open("protein/data/20190726_new_transcripts_for_consideration.txt", "a") as f:
                                f.write(">{} ({}): {}\n".format(t,p.entry_name,reason))
                                f.write("{}\n".format(all_transcript_seq[t]))
                            new.append(t)
                            if p.entry_name in lmb_data:
                                new_proteins.add(p.entry_name)
                        else:
                            print('bad blast match',result)
                    else:
                        print('bad blast match',result)

            total_new_transcripts += new

            # The same record is stored whether or not transcripts were kept
            isoforms[p.entry_name] = {
                'ensembl_gene_id':ensembl_gene_id,
                'same_gene_id':same_gene_id,
                'canonical_seq':canon_seq,
                'grch37_canonical_seq':grch37_canonical_seq,
                'isoforms': transcripts,
                'uniprot_lookup': alternative_ids_uniprot,
                'lmb_not_found':not_found,
                'lmb_not_found_due_to_skipped': not_found_due_to_skipped,
                'new_transcripts_than_lmb': new,
                'skipped_due_to_gtex': transcripts_ids_skipped,
                'grch37_canonical':grch37_canonical,
                'uniprot_canonical':uniprot_canonical,
            }
            if len(transcripts):
                if grch37_canonical_seq!=canon_seq:
                    isoforms[p.entry_name]['canonical_disagreement'] = True
                    canonical_disagreement_count += 1
                total_proteins_with_isoforms += 1

            # Rewrite the full result after every protein so partial results
            # are on disk if a later protein fails
            with open('protein/data/all_isoforms_gtex.json', 'w') as f:
                json.dump(isoforms,f, indent=4, separators=(',', ': '))

        for seq,ts in sequences_lookup.items():
            if len(ts)>1:
                print('identical sequence',ts)

        for t in total_not_found:
            # A transcript that was never fetched has no sequence to compare
            t_seq = all_transcript_seq.get(t)
            ts_check = sequences_lookup[t_seq] if t_seq is not None else []
            found = False
            for t_check in ts_check:
                if t_check[0] not in total_not_found:
                    print(t,'found but under another id',t_check[0])
                    found = True

            if not found:
                print('##',t,'in LMB but not in this search')

        # print small summary results
        print('total_proteins_searched',len(ps))
        print('total_proteins_with_isoforms', total_proteins_with_isoforms)
        print('Total transcripts deemed to be isoforms',total_transcripts)
        print('Amount of these not in LMB data',len(total_new_transcripts))
        print(new_proteins)
        print('Amount in LBM found but skipped due to GTEX data',len(total_not_found_due_to_skipped))
        print(total_not_found_due_to_skipped)
        print('Sequence compare to LMB', lmb_compare_sequences)
        print('canonical_disagreement_count',canonical_disagreement_count)
        print(total_not_found)

        # save to file
        with open('protein/data/all_isoforms_gtex.json', 'w') as f:
            json.dump(isoforms,f, indent=4, separators=(',', ': '))
Esempio n. 12
0
    def create_orthologs(self, constructs_only):
        self.logger.info('CREATING OTHER PROTEINS')

        # go through constructs and finding their entry_names for lookup
        construct_entry_names = []
        self.logger.info('Getting construct accession codes')
        filenames = os.listdir(self.construct_data_dir)
        for source_file in filenames:
            source_file_path = os.sep.join(
                [self.construct_data_dir, source_file])
            self.logger.info(
                'Getting protein name from construct file {}'.format(
                    source_file))
            split_filename = source_file.split(".")
            extension = split_filename[1]
            if extension != 'yaml':
                continue

            # read the yaml file
            with open(source_file_path, 'r') as f:
                sd = yaml.load(f)

            # check whether protein is specified
            if 'protein' not in sd:
                continue

            # append entry_name to lookup list
            construct_entry_names.append(sd['protein'])

        # parse files
        filenames = os.listdir(self.local_uniprot_dir)
        for source_file in filenames:
            source_file_name = os.sep.join(
                [self.local_uniprot_dir, source_file])
            split_filename = source_file.split(".")
            accession = split_filename[0]
            extension = split_filename[1]
            if extension != 'txt':
                continue

            up = self.parse_uniprot_file(accession)

            # skip human proteins
            if 'species_latin_name' in up and up[
                    'species_latin_name'] == 'H**o sapiens':
                continue

            # should proteins that are not constructs be skipped?
            if constructs_only and up[
                    'entry_name'] not in construct_entry_names:
                continue

            # is this an ortholog of a human protein?
            ortholog = False

            # is there already an entry for this protein?
            try:
                p = Protein.objects.get(entry_name=up['entry_name'])
                continue
            except Protein.DoesNotExist:
                p = None

                # get human ortholog using gene name
                for gene in up['genes']:
                    try:
                        g = Gene.objects.get(name__iexact=gene,
                                             species__id=1,
                                             position=0)
                        ps = g.proteins.all().order_by('id')
                        p = ps[0]
                        ortholog = True
                        self.logger.info("Human ortholog found: {}".format(
                            p.entry_name))
                        break
                    except Gene.DoesNotExist:
                        self.logger.info("No gene found for {}".format(gene))
                        continue

                # if gene name not found, try using entry name
                if not p:
                    split_entry_name = up['entry_name'].split('_')

                    # add _ to the split entry name to avoid e.g. gp1 matching gp139
                    entry_name_query = split_entry_name[0] + '_'
                    try:
                        p = Protein.objects.get(
                            entry_name__startswith=entry_name_query,
                            species__id=1)
                        ortholog = True
                        self.logger.info("Human ortholog found: {}".format(
                            p.entry_name))
                    except Protein.DoesNotExist:
                        self.logger.info(
                            "No match found for {}".format(entry_name_query))

                # check whether the entry name is in the construct list
                if not p and up['entry_name'] in construct_entry_names:
                    # BLAST sequence to find closest hit (for reference positions)
                    blast = BlastSearch()
                    blast_out = blast.run(up['sequence'])

                    # use first hit from BLAST as template for reference positions
                    try:
                        p = Protein.objects.get(pk=blast_out[0][0])
                    except Protein.DoesNotExist:
                        self.logger.error(
                            'Template protein for {} not found'.format(
                                up['entry_name']))

            # skip if no ortholog is found FIXME use a profile to find a good template
            if not p:
                continue

            # check whether reference positions exist for this protein, and find them if they do not
            ref_position_file_path = os.sep.join(
                [self.ref_position_source_dir, up['entry_name'] + '.yaml'])
            auto_ref_position_file_path = os.sep.join([
                self.auto_ref_position_source_dir, up['entry_name'] + '.yaml'
            ])
            if not os.path.isfile(ref_position_file_path):
                # look for the file in the automatically generated reference file dir
                if not os.path.isfile(auto_ref_position_file_path):
                    # get reference positions of human ortholog
                    template_ref_position_file_path = os.sep.join(
                        [self.ref_position_source_dir, p.entry_name + '.yaml'])
                    if not os.path.isfile(template_ref_position_file_path):
                        # use a non human sequence
                        template_ref_position_file_path = os.sep.join([
                            self.auto_ref_position_source_dir,
                            p.entry_name + '.yaml'
                        ])

                    ref_positions = align_protein_to_reference(
                        up, template_ref_position_file_path, p)

                    # write reference positions to a file
                    with open(auto_ref_position_file_path,
                              "w") as auto_ref_position_file:
                        yaml.dump(ref_positions,
                                  auto_ref_position_file,
                                  default_flow_style=False)

            # create a database entry for the protein
            if ortholog:
                # for orthologs, use properties from the human protein
                self.create_protein(p.name, p.family, p.sequence_type,
                                    p.residue_numbering_scheme, accession, up)
            else:
                # otherwise, create a new family, and use Uniprot name
                top_level_parent_family = ProteinFamily.objects.get(
                    slug=p.family.slug.split('_')[0])
                num_families = ProteinFamily.objects.filter(
                    parent=top_level_parent_family).count()
                family_slug = top_level_parent_family.slug + "_" + str(
                    num_families + 1).zfill(3)
                other_family, created = ProteinFamily.objects.get_or_create(
                    parent=top_level_parent_family,
                    name='Other',
                    defaults={'slug': family_slug})
                if created:
                    self.logger.info(
                        'Created protein family {}'.format(other_family))

                family_slug += '_001'
                unclassified_family, created = ProteinFamily.objects.get_or_create(
                    parent=other_family,
                    name='Unclassified',
                    defaults={'slug': family_slug})
                if created:
                    self.logger.info('Created protein family {}'.format(
                        unclassified_family))

                num_families = ProteinFamily.objects.filter(
                    parent=unclassified_family).count()
                family_slug = unclassified_family.slug + "_" + str(
                    num_families + 1).zfill(3)
                pf, created = ProteinFamily.objects.get_or_create(
                    parent=unclassified_family,
                    name=up['genes'][0],
                    defaults={'slug': family_slug})
                if created:
                    self.logger.info('Created protein family {}'.format(pf))

                self.create_protein(up['genes'][0], pf, p.sequence_type,
                                    p.residue_numbering_scheme, accession, up)

        self.logger.info('COMPLETED CREATING OTHER PROTEINS')
Esempio n. 13
0
class GenericNumbering(object):
    """Annotate the residues of a PDB structure with generic numbers.

    The structure is parsed into per-chain dictionaries of ``MappedResidue``
    objects, every chain sequence is searched with a local BLAST, and the
    segment / BW / GPCRdb annotations of the matching database residues are
    copied onto the identically aligned positions of the structure.
    """

    # residue names that are accepted when the chain sequence is extracted
    residue_list = [
        "ARG", "ASP", "GLU", "HIS", "ASN", "GLN", "LYS", "SER", "THR", "HID",
        "PHE", "LEU", "ILE", "TYR", "TRP", "VAL", "MET", "PRO", "CYS", "ALA",
        "GLY"
    ]

    def __init__(self,
                 pdb_file=None,
                 pdb_filename=None,
                 structure=None,
                 blast_path='blastp',
                 blastdb=os.sep.join([
                     settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb'
                 ])):
        """
        Load (or accept) a structure and extract its residues.

        pdb_file     -- name/path or handle to an open file; takes precedence
        pdb_filename -- path used when pdb_file is not given; also the base
                        name used by save_gn_to_pdb()
        structure    -- used as-is when neither file argument is given; it is
                        iterated chain by chain, like the first model that is
                        taken from a parsed file
        blast_path, blastdb -- forwarded to BlastSearch
        """
        # pdb_file can be either a name/path or a handle to an open file
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename

        # dictionary of 'MappedResidue' object storing information about alignments and bw numbers
        self.residues = {}
        # chain id -> one-letter sequence of the accepted residues
        self.pdb_seq = {}
        # list of uniprot ids returned from blast
        self.prot_id_list = []
        # setup for local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb)

        source = self.pdb_file or self.pdb_filename
        if source:
            # only the first model of the parsed file is used
            parser = PDBParser(PERMISSIVE=True, QUIET=True)
            self.pdb_structure = parser.get_structure('ref', source)[0]
        else:
            self.pdb_structure = structure

        self.parse_structure(self.pdb_structure)

    def parse_structure(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom

        Fills self.residues[chain_id] (residue number -> MappedResidue) and
        self.pdb_seq[chain_id] (one-letter sequence ordered by residue
        number), and stores each residue's 1-based position in that sequence
        as ``pos_in_aln``.
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                if res.resname not in self.residue_list:
                    continue
                # HID is stored as histidine; previously the converted name
                # was computed but the residue itself was silently dropped
                resname = 'HIS' if res.resname == "HID" else res.resname
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                chain_residues[res.id[1]] = MappedResidue(
                    res.id[1], polypeptide.three_to_one(resname))

            ordered_numbers = sorted(chain_residues)
            self.pdb_seq[chain.id] = ''.join(
                chain_residues[number].name for number in ordered_numbers)

            for pos, number in enumerate(ordered_numbers, start=1):
                chain_residues[number].pos_in_aln = pos

    def locate_res_by_pos(self, chain, pos):
        """
        Return the residue number whose sequence position equals ``pos``.

        Returns 0 when no residue of ``chain`` has that position.
        NOTE(review): a residue that is itself numbered 0 cannot be told apart
        from "not found" by callers.
        """
        for number, mapped in self.residues[chain].items():
            if mapped.pos_in_aln == pos:
                return number
        return 0

    def map_blast_seq(self, prot_id, hsps, chain):
        """
        Copy database annotations onto the residues aligned in one BLAST hsp.

        prot_id -- protein the hsp subject belongs to (used to fetch Residue
                   records and remembered in self.prot_id_list)
        hsps    -- hsp object providing query, sbjct, query_start, sbjct_start
        chain   -- id of the chain whose sequence was used as the query

        Only positions where query and subject carry the same letter are
        annotated.
        """
        # find uniprot residue numbers corresponding to those in pdb file
        subj_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        logger.info("{}\n{}".format(hsps.query, hsps.sbjct))
        logger.info("{:d}\t{:d}".format(hsps.query_start, hsps.sbjct_start))

        rs = Residue.objects.prefetch_related(
            'display_generic_number',
            'protein_segment').filter(protein_conformation__protein=prot_id)
        residues = {r.sequence_number: r for r in rs}

        skipped_chars = ('-', 'X', ' ')
        # walk both aligned strings in lockstep instead of popping list heads
        for q_char, s_char in zip(hsps.query, hsps.sbjct):
            # skipping position if there is a gap in either of sequences
            if q_char in skipped_chars:
                subj_counter += 1
                continue
            if s_char in skipped_chars:
                q_counter += 1
                continue

            if s_char == q_char:
                resn = self.locate_res_by_pos(chain, q_counter)
                if resn != 0:
                    if subj_counter in residues:
                        db_res = residues[subj_counter]
                        mapped = self.residues[chain][resn]

                        if db_res.protein_segment:
                            mapped.add_segment(db_res.protein_segment.slug)

                        if db_res.display_generic_number:
                            # label has the form "<bw>x<gpcrdb suffix>"
                            num = db_res.display_generic_number.label
                            bw, gpcrdb = num.split('x')
                            gpcrdb = "{}.{}".format(bw.split('.')[0], gpcrdb)
                            mapped.add_bw_number(bw)
                            mapped.add_gpcrdb_number(gpcrdb)
                            mapped.add_gpcrdb_number_id(
                                db_res.display_generic_number.id)
                            mapped.add_display_number(num)
                            mapped.add_residue_record(db_res)
                    else:
                        logger.warning(
                            "Could not find residue {} {} in the database.".
                            format(resn, subj_counter))

                    if prot_id not in self.prot_id_list:
                        self.prot_id_list.append(prot_id)

            q_counter += 1
            subj_counter += 1

    def get_substructure_mapping_dict(self):
        """
        Return a dictionary mapping each segment to the list of residue
        numbers (over all chains) assigned to it.
        """
        mapping_dict = {}
        for chain_residues in self.residues.values():
            for mapped in chain_residues.values():
                mapping_dict.setdefault(mapped.segment,
                                        []).append(mapped.number)
        return mapping_dict

    def _write_generic_numbers_to_bfactors(self):
        """
        Store the gpcrdb number in the CA atom b-factor and the bw number in
        the N atom b-factor of every mapped residue with a non-zero value.
        """
        for chain in self.pdb_structure:
            for residue in chain:
                chain_residues = self.residues[chain.id]
                if residue.id[1] not in chain_residues:
                    continue
                mapped = chain_residues[residue.id[1]]
                if mapped.gpcrdb != 0.:
                    residue["CA"].set_bfactor(float(mapped.gpcrdb))
                if mapped.bw != 0.:
                    residue["N"].set_bfactor(float(mapped.bw))

    def get_annotated_structure(self):
        """
        Write the generic numbers into the b-factor fields and return the
        (modified in place) structure.
        """
        self._write_generic_numbers_to_bfactors()
        return self.pdb_structure

    def save_gn_to_pdb(self):
        """
        Write the generic numbers into the b-factor fields and save the
        structure next to the input file as "<root>_GPCRDB<ext>".

        Relies on self.pdb_filename for the output name.
        """
        # replace bfactor field of CA atoms with b-w numbers
        self._write_generic_numbers_to_bfactors()
        # get the basename, extension and export the pdb structure with b-w numbers
        root, ext = os.path.splitext(self.pdb_filename)
        io = PDBIO()
        io.set_structure(self.pdb_structure)
        io.save("%s_GPCRDB%s" % (root, ext))

    def assign_generic_numbers(self):
        """
        Run BLAST for every chain, map all returned hsps onto the structure
        and return the annotated structure.
        """
        # blast search goes first, looping through all the chains
        alignments = {
            chain: self.blast.run(sequence)
            for chain, sequence in self.pdb_seq.items()
        }

        # map the results onto pdb sequence for every sequence pair from blast
        for chain, chain_alignments in alignments.items():
            for alignment in chain_alignments:
                if alignment == []:
                    continue
                for hsps in alignment[1].hsps:
                    self.map_blast_seq(alignment[0], hsps, chain)
        return self.get_annotated_structure()
Esempio n. 14
0
    def main_func(self, positions, iteration, count, lock):
        """
        Create database entries for non-human proteins from local UniProt files.

        Files in self.local_uniprot_dir are handed out one at a time through
        the shared counter, so several workers can call this method with the
        same ``count`` and ``lock``.

        positions -- unused, kept for call compatibility
        iteration -- 1 processes entries whose source is SWISSPROT,
                     2 processes entries whose source is TREMBL
        count     -- shared counter object exposing ``.value`` (index of the
                     next file to process)
        lock      -- context manager guarding ``count``

        Any exception is printed, logged and passed to PrintException()
        instead of being re-raised.
        """
        self.logger.info('CREATING OTHER PROTEINS')
        try:
            # go through constructs and finding their entry_names for lookup
            construct_entry_names = []
            self.logger.info('Getting construct accession codes')
            for source_file in os.listdir(self.construct_data_dir):
                source_file_path = os.sep.join([self.construct_data_dir, source_file])
                self.logger.info('Getting protein name from construct file {}'.format(source_file))
                split_filename = source_file.split(".")
                # names without a dot have no extension and are skipped too
                if len(split_filename) < 2 or split_filename[1] != 'yaml':
                    continue

                # read the yaml file
                # NOTE(review): yaml.load without an explicit Loader is unsafe
                # on untrusted input and is rejected by PyYAML >= 6; consider
                # yaml.safe_load if construct files only hold plain data.
                with open(source_file_path, 'r') as f:
                    sd = yaml.load(f)

                # check whether protein is specified (an empty file loads as None)
                if not sd or 'protein' not in sd:
                    continue

                # append entry_name to lookup list
                construct_entry_names.append(sd['protein'])

            # parse files
            filenames = os.listdir(self.local_uniprot_dir)

            # Keep track of first or second iteration
            reviewed = ['SWISSPROT', 'TREMBL'][iteration - 1]
            skipped_due_to_swissprot = 0
            while True:
                # the bounds check must happen under the lock, otherwise
                # another worker can advance the counter past the end between
                # the check and the read
                with lock:
                    if count.value >= len(filenames):
                        break
                    source_file = filenames[count.value]
                    count.value += 1

                split_filename = source_file.split(".")
                if len(split_filename) < 2 or split_filename[1] != 'txt':
                    continue
                accession = split_filename[0]

                up = self.parse_uniprot_file(accession)

                # Skip TREMBL on first loop, and SWISSPROT on second
                if reviewed != up['source']:
                    continue

                # skip human proteins
                if 'species_latin_name' in up and up['species_latin_name'] == 'Homo sapiens':
                    continue

                # should proteins that are not constructs be skipped?
                if self.constructs_only and up['entry_name'] not in construct_entry_names:
                    continue

                # is there already an entry for this protein?
                try:
                    Protein.objects.get(entry_name=up['entry_name'])
                except Protein.DoesNotExist:
                    p = None
                else:
                    continue

                # is this an ortholog of a human protein?
                ortholog = False

                # get human ortholog using gene name
                for gene in up['genes']:
                    try:
                        g = Gene.objects.get(name__iexact=gene, species__common_name="Human", position=0)
                    except Gene.DoesNotExist:
                        self.logger.info("No gene found for {}".format(gene))
                        continue
                    p = g.proteins.all().order_by('id')[0]
                    ortholog = True
                    self.logger.info("Human ortholog found: {}".format(p.entry_name))
                    break

                # if gene name not found, try using entry name
                if not p:
                    # add _ to the split entry name to avoid e.g. gp1 matching gp139
                    entry_name_query = up['entry_name'].split('_')[0] + '_'
                    try:
                        p = Protein.objects.get(entry_name__startswith=entry_name_query, species__common_name="Human")
                        ortholog = True
                        self.logger.info("Human ortholog found: {}".format(p.entry_name))
                    except Protein.DoesNotExist:
                        self.logger.info("No match found for {}".format(entry_name_query))

                # check whether the entry name is in the construct list
                if not p and up['entry_name'] in construct_entry_names:
                    # BLAST sequence to find closest hit (for reference positions)
                    blast = BlastSearch()
                    blast_out = blast.run(up['sequence'])

                    # use first hit from BLAST as template for reference positions
                    try:
                        p = Protein.objects.get(pk=blast_out[0][0])
                    except Protein.DoesNotExist:
                        print('Template protein for {} not found'.format(up['entry_name']))
                        self.logger.error('Template protein for {} not found'.format(up['entry_name']))

                # skip if no ortholog is found FIXME use a profile to find a good template
                if not p:
                    continue

                # check whether an entry already exists for this protein/species
                # Skips unreviewed genes that have a matching SWISSPROT - Some human orthologues
                # can have several orthologues from same species. Eg: agtra_rat and agtrb_rat for AGTR1_HUMAN
                already_entry_names = list(
                    Protein.objects.filter(
                        family=p.family,
                        species__common_name=up['species_common_name'],
                        source__name="SWISSPROT",
                    ).exclude(entry_name=up['entry_name']).values_list('entry_name', flat=True))
                if already_entry_names:
                    if "SWISSPROT" != up['source']:
                        skipped_due_to_swissprot += 1
                        continue
                    self.logger.error("{} {} swissprot orthologue already there? {}".format(up['entry_name'], accession,already_entry_names))

                # create a database entry for the protein
                if ortholog:
                    # for orthologs, use properties from the human protein
                    self.create_protein(p.name, p.family, p.sequence_type, p.residue_numbering_scheme, accession, up)
                else:
                    # otherwise, create a new family, and use Uniprot name
                    top_level_parent_family = ProteinFamily.objects.get(slug=p.family.slug.split('_')[0])
                    num_families = ProteinFamily.objects.filter(parent=top_level_parent_family).count()
                    family_slug = top_level_parent_family.slug + "_" + str(num_families + 1).zfill(3)
                    other_family, created = ProteinFamily.objects.get_or_create(parent=top_level_parent_family,
                        name='Other', defaults={'slug': family_slug})
                    if created:
                        self.logger.info('Created protein family {}'.format(other_family))

                    family_slug += '_001'
                    unclassified_family, created = ProteinFamily.objects.get_or_create(parent=other_family,
                        name='Unclassified', defaults={'slug': family_slug})
                    if created:
                        self.logger.info('Created protein family {}'.format(unclassified_family))

                    num_families = ProteinFamily.objects.filter(parent=unclassified_family).count()
                    family_slug = unclassified_family.slug + "_" + str(num_families + 1).zfill(3)
                    pf, created = ProteinFamily.objects.get_or_create(parent=unclassified_family, name=up['genes'][0],
                        defaults={'slug': family_slug})
                    if created:
                        self.logger.info('Created protein family {}'.format(pf))

                    self.create_protein(up['genes'][0], pf, p.sequence_type, p.residue_numbering_scheme, accession, up)

            self.logger.info('Skipped {} entries because a SWISSPROT orthologue exists'.format(
                skipped_due_to_swissprot))
            self.logger.info('COMPLETED CREATING OTHER PROTEINS')
        except Exception as msg:
            print(msg)
            self.logger.error(msg)
            PrintException()
Esempio n. 15
0
    def create_orthologs(self, constructs_only):
        """
        Create database entries for non-human proteins from local UniProt files.

        For every ``<accession>.txt`` file in self.local_uniprot_dir a
        template protein is looked up (human ortholog by gene name, then by
        entry-name prefix, then the best BLAST hit for construct proteins).
        Missing reference position files are generated from the template, and
        the protein is created either in the template's family (orthologs) or
        in a newly created "Other/Unclassified" family.

        constructs_only -- when true, proteins whose entry name is not listed
                           in a construct file are skipped
        """
        self.logger.info('CREATING OTHER PROTEINS')

        # go through constructs and finding their entry_names for lookup
        construct_entry_names = []
        self.logger.info('Getting construct accession codes')
        for source_file in os.listdir(self.construct_data_dir):
            source_file_path = os.sep.join([self.construct_data_dir, source_file])
            self.logger.info('Getting protein name from construct file {}'.format(source_file))
            split_filename = source_file.split(".")
            # names without a dot have no extension and are skipped too
            if len(split_filename) < 2 or split_filename[1] != 'yaml':
                continue

            # read the yaml file
            # NOTE(review): yaml.load without an explicit Loader is unsafe on
            # untrusted input and is rejected by PyYAML >= 6; consider
            # yaml.safe_load if construct files only hold plain data.
            with open(source_file_path, 'r') as f:
                sd = yaml.load(f)

            # check whether protein is specified (an empty file loads as None)
            if not sd or 'protein' not in sd:
                continue

            # append entry_name to lookup list
            construct_entry_names.append(sd['protein'])

        # parse files
        for source_file in os.listdir(self.local_uniprot_dir):
            split_filename = source_file.split(".")
            if len(split_filename) < 2 or split_filename[1] != 'txt':
                continue
            accession = split_filename[0]

            up = self.parse_uniprot_file(accession)

            # skip human proteins
            if 'species_latin_name' in up and up['species_latin_name'] == 'Homo sapiens':
                continue

            # should proteins that are not constructs be skipped?
            if constructs_only and up['entry_name'] not in construct_entry_names:
                continue

            # is there already an entry for this protein?
            try:
                Protein.objects.get(entry_name=up['entry_name'])
            except Protein.DoesNotExist:
                p = None
            else:
                continue

            # is this an ortholog of a human protein?
            ortholog = False

            # get human ortholog using gene name
            for gene in up['genes']:
                try:
                    g = Gene.objects.get(name__iexact=gene, species__id=1, position=0)
                except Gene.DoesNotExist:
                    self.logger.info("No gene found for {}".format(gene))
                    continue
                p = g.proteins.all().order_by('id')[0]
                ortholog = True
                self.logger.info("Human ortholog found: {}".format(p.entry_name))
                break

            # if gene name not found, try using entry name
            if not p:
                # add _ to the split entry name to avoid e.g. gp1 matching gp139
                entry_name_query = up['entry_name'].split('_')[0] + '_'
                try:
                    p = Protein.objects.get(entry_name__startswith=entry_name_query, species__id=1)
                    ortholog = True
                    self.logger.info("Human ortholog found: {}".format(p.entry_name))
                except Protein.DoesNotExist:
                    self.logger.info("No match found for {}".format(entry_name_query))

            # check whether the entry name is in the construct list
            if not p and up['entry_name'] in construct_entry_names:
                # BLAST sequence to find closest hit (for reference positions)
                blast = BlastSearch()
                blast_out = blast.run(up['sequence'])

                # use first hit from BLAST as template for reference positions
                try:
                    p = Protein.objects.get(pk=blast_out[0][0])
                except Protein.DoesNotExist:
                    self.logger.error('Template protein for {} not found'.format(up['entry_name']))

            # skip if no ortholog is found FIXME use a profile to find a good template
            if not p:
                continue

            # check whether reference positions exist for this protein, and find them if they do not
            ref_position_file_path = os.sep.join([self.ref_position_source_dir, up['entry_name'] + '.yaml'])
            auto_ref_position_file_path = os.sep.join([self.auto_ref_position_source_dir, up['entry_name'] + '.yaml'])
            # look in the curated dir first, then in the automatically generated reference file dir
            if not os.path.isfile(ref_position_file_path) and not os.path.isfile(auto_ref_position_file_path):
                # get reference positions of human ortholog
                template_ref_position_file_path = os.sep.join([self.ref_position_source_dir,
                    p.entry_name + '.yaml'])
                if not os.path.isfile(template_ref_position_file_path):
                    # use a non human sequence
                    template_ref_position_file_path = os.sep.join([self.auto_ref_position_source_dir,
                        p.entry_name + '.yaml'])

                ref_positions = align_protein_to_reference(up, template_ref_position_file_path, p)

                # write reference positions to a file
                with open(auto_ref_position_file_path, "w") as auto_ref_position_file:
                    yaml.dump(ref_positions, auto_ref_position_file, default_flow_style=False)

            # create a database entry for the protein
            if ortholog:
                # for orthologs, use properties from the human protein
                self.create_protein(p.name, p.family, p.sequence_type, p.residue_numbering_scheme, accession, up)
            else:
                # otherwise, create a new family, and use Uniprot name
                top_level_parent_family = ProteinFamily.objects.get(slug=p.family.slug.split('_')[0])
                num_families = ProteinFamily.objects.filter(parent=top_level_parent_family).count()
                family_slug = top_level_parent_family.slug + "_" + str(num_families + 1).zfill(3)
                other_family, created = ProteinFamily.objects.get_or_create(parent=top_level_parent_family,
                    name='Other', defaults={'slug': family_slug})
                if created:
                    self.logger.info('Created protein family {}'.format(other_family))

                family_slug += '_001'
                unclassified_family, created = ProteinFamily.objects.get_or_create(parent=other_family,
                    name='Unclassified', defaults={'slug': family_slug})
                if created:
                    self.logger.info('Created protein family {}'.format(unclassified_family))

                num_families = ProteinFamily.objects.filter(parent=unclassified_family).count()
                family_slug = unclassified_family.slug + "_" + str(num_families + 1).zfill(3)
                pf, created = ProteinFamily.objects.get_or_create(parent=unclassified_family, name=up['genes'][0],
                    defaults={'slug': family_slug})
                if created:
                    self.logger.info('Created protein family {}'.format(pf))

                self.create_protein(up['genes'][0], pf, p.sequence_type, p.residue_numbering_scheme, accession, up)

        self.logger.info('COMPLETED CREATING OTHER PROTEINS')
class GenericNumbering(object):
    """
    Assigns generic residue numbers to the residues of a pdb structure and
    stores them in the B-factor fields of the CA and N atoms.
    """


    # three-letter residue names that are kept when a structure is parsed
    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR","HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]
    # special cases read by assign_cgn_with_sequence_parser, keyed by pdb code:
    # [lowest residue number affected, maximum number of positions marked 'x']
    exceptions = {'6GDG':[255, 10]}

    def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, pdb_code=None, blast_path='blastp',
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1, sequence_parser=False, signprot=False):
        """
        Set up the blast search and load the structure that will be numbered.

        pdb_file        -- name/path of a pdb file or a handle to an open file
        pdb_filename    -- path used when pdb_file is not given
        structure       -- already parsed structure, used when neither file
                           argument is given
        pdb_code        -- pdb 4 letter code; with sequence_parser it selects
                           the Structure record providing the wild type
        blast_path, blastdb, top_results -- passed on to BlastSearch
        sequence_parser -- when true, structure and residue mapping are taken
                           from a SequenceParser instead of parse_structure
        signprot        -- with sequence_parser, its id is used as the wild
                           type protein id
        """
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename
        self.pdb_code = pdb_code

        # chain id -> {residue number: MappedResidue}
        self.residues = {}
        # chain id -> sequence of the chain
        self.pdb_seq = {}
        # list of uniprot ids returned from blast
        self.prot_id_list = []
        # setup for local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb,top_results=top_results)

        if not sequence_parser:
            # plain route: read the structure and build the residue dictionary
            source = self.pdb_file or self.pdb_filename
            if source:
                self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', source)[0]
            else:
                self.pdb_structure = structure
            self.parse_structure(self.pdb_structure)
            return

        # sequence parser route
        if pdb_code:
            struct = Structure.objects.get(pdb_code__index=self.pdb_code)
        if signprot:
            parser = SequenceParser(pdb_file=self.pdb_file, wt_protein_id=signprot.id)
        elif pdb_code:
            parser = SequenceParser(pdb_file=self.pdb_file, wt_protein_id=struct.protein_conformation.protein.parent.id)
        else:
            parser = SequenceParser(pdb_file=self.pdb_file)
        self.pdb_structure = parser.pdb_struct
        self.mapping = parser.mapping
        self.wt = parser.wt


    def parse_structure(self, pdb_struct):
        """
        Extract the sequence of every chain and prepare the dictionary of residues.

        bio.pdb reads pdb in the following cascade: model->chain->residue->atom,
        so pdb_struct is iterated as chains and every chain as residues.

        For each chain this fills
          self.residues[chain.id] -- {residue number: MappedResidue}
          self.pdb_seq[chain.id]  -- one-letter sequence ordered by residue number
        and stores the 1-based position in that sequence in
        MappedResidue.pos_in_aln.
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                # residues outside residue_list (waters, ligands, ...) are skipped
                if res.resname not in self.residue_list:
                    continue
                # HID is stored as histidine; previously its one-letter code was
                # computed but the residue itself was never added
                three_letter = 'HIS' if res.resname == "HID" else res.resname
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                resnum = res.id[1]
                chain_residues[resnum] = MappedResidue(resnum, polypeptide.three_to_one(three_letter))

            ordered = sorted(chain_residues)
            self.pdb_seq[chain.id] = ''.join(chain_residues[x].name for x in ordered)

            for pos, resnum in enumerate(ordered, start=1):
                chain_residues[resnum].pos_in_aln = pos


    def locate_res_by_pos (self, chain, pos):
        """
        Return the residue number in `chain` whose pos_in_aln equals `pos`,
        or 0 when no residue has that position.
        """
        chain_residues = self.residues[chain]
        return next(
            (resnum for resnum, mapped in chain_residues.items() if mapped.pos_in_aln == pos),
            0,
        )


    def map_blast_seq (self, prot_id, hsps, chain):
        """
        Transfer database annotations onto the residues of `chain` using one
        blast alignment.

        prot_id -- protein whose Residue records are looked up
        hsps    -- alignment providing query, sbjct, query_start and sbjct_start
        chain   -- id of the chain in self.residues the query was built from

        Only alignment columns where query and subject carry the same residue
        are annotated: the segment slug and, when present, the generic numbers
        of the database residue are copied to the matching MappedResidue.
        prot_id is appended to self.prot_id_list once a query residue has been
        located.
        """
        logger.info("{}\n{}".format(hsps.query, hsps.sbjct))
        logger.info("{:d}\t{:d}".format(hsps.query_start, hsps.sbjct_start))

        #find uniprot residue numbers corresponding to those in pdb file
        rs = Residue.objects.prefetch_related('display_generic_number', 'protein_segment').filter(
            protein_conformation__protein=prot_id)
        db_residues = {r.sequence_number: r for r in rs}

        skipped = ('-', 'X', ' ')
        subj_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        # walk the alignment column by column; zip replaces the repeated
        # list.pop(0) calls, which were quadratic in the alignment length
        for q_char, s_char in zip(hsps.query, hsps.sbjct):
            #skipping position if there is a gap in either of sequences
            if q_char in skipped:
                subj_counter += 1
                continue
            if s_char in skipped:
                q_counter += 1
                continue

            if s_char == q_char:
                resn = self.locate_res_by_pos(chain, q_counter)
                if resn != 0:
                    if subj_counter in db_residues:
                        db_res = db_residues[subj_counter]
                        mapped = self.residues[chain][resn]

                        if db_res.protein_segment:
                            mapped.add_segment(db_res.protein_segment.slug)

                        if db_res.display_generic_number:
                            num = db_res.display_generic_number.label
                            bw, gpcrdb = num.split('x')
                            # Handle non-numerical GNs - still add segment number.
                            # Strings are immutable, so the first character is
                            # replaced by building a new string.
                            if not bw[:1].isnumeric():
                                bw = "0" + bw[1:]

                            gpcrdb = "{}.{}".format(bw.split('.')[0], gpcrdb)
                            mapped.add_bw_number(bw)
                            mapped.add_gpcrdb_number(gpcrdb)
                            mapped.add_gpcrdb_number_id(db_res.display_generic_number.id)
                            mapped.add_display_number(num)
                            mapped.add_residue_record(db_res)
                    else:
                        logger.warning("Could not find residue {} {} in the database.".format(resn, subj_counter))

                    if prot_id not in self.prot_id_list:
                        self.prot_id_list.append(prot_id)

            # aligned column (match or mismatch): both sequences advance
            q_counter += 1
            subj_counter += 1


    def get_substructure_mapping_dict(self):
        """
        Group residue numbers by segment.

        Returns a dictionary {segment: [residue number, ...]} collected over
        all chains in self.residues.
        """
        mapping_dict = {}
        for chain_residues in self.residues.values():
            for mapped in chain_residues.values():
                mapping_dict.setdefault(mapped.segment, []).append(mapped.number)
        return mapping_dict


    def get_annotated_structure(self):
        """
        Write the generic numbers into the B-factor fields of the structure.

        For every residue present in self.residues the gpcrdb number is stored
        on the CA atom and the bw number on the N atom (values equal to 0 are
        left out).  A residue whose number cannot be converted to float is
        skipped.  Returns self.pdb_structure.
        """
        for chain in self.pdb_structure:
            for residue in chain:
                resnum = residue.id[1]
                chain_map = self.residues[chain.id]
                if resnum not in chain_map:
                    continue
                mapped = chain_map[resnum]
                try:
                    if mapped.gpcrdb != 0.:
                        residue["CA"].set_bfactor(float(mapped.gpcrdb))
                    if mapped.bw != 0.:
                        residue["N"].set_bfactor(float(mapped.bw))
                except ValueError:
                    # number is not a valid float - leave this residue as it is
                    continue

        return self.pdb_structure


    def save_gn_to_pdb(self):
        """
        Save the structure with generic numbers in the B-factor fields.

        The numbers are written by get_annotated_structure (gpcrdb number on
        CA, bw number on N), so both methods treat residues the same way.  The
        output file is named after self.pdb_filename with '_GPCRDB' inserted
        before the extension.  Nothing is returned.
        """
        # one shared annotation routine instead of a second copy of the loop
        self.get_annotated_structure()

        #get the basename, extension and export the pdb structure with b-w numbers
        root, ext = os.path.splitext(self.pdb_filename)
        io=PDBIO()
        io.set_structure(self.pdb_structure)
        io.save("%s_GPCRDB%s" %(root, ext))


    def assign_generic_numbers(self):
        """
        Run blast for every chain, map the hits onto the pdb residues and
        return the annotated structure.
        """
        # blast search goes first, for all the chains
        alignments = {chain: self.blast.run(seq) for chain, seq in self.pdb_seq.items()}

        # map the results onto pdb sequence for every sequence pair from blast
        for chain, hits in alignments.items():
            for alignment in hits:
                if alignment == []:
                    continue
                prot_id, hit = alignment[0], alignment[1]
                for hsps in hit.hsps:
                    self.map_blast_seq(prot_id, hsps, chain)

        return self.get_annotated_structure()

    def assign_generic_numbers_with_sequence_parser(self):
        """
        Write generic numbers from self.mapping into the B-factor fields.

        Only labels of the form '<bw>x<position>' are used.  The CA atom gets
        '<segment>.<position>' and the N atom gets the part of the label before
        the 'x'.  When the position has three digits the CA value is written as
        the negative of '<segment>.<first two digits>'.  Residues without a
        usable label, without CA/N atoms or with non-numeric values are left
        untouched.  Returns self.pdb_structure.
        """
        for chain in self.pdb_structure:
            # the check does not depend on the residue, so do it once per chain
            if chain.id not in self.mapping:
                continue
            chain_mapping = self.mapping[chain.id]

            for residue in chain:
                resnum = residue.id[1]
                if resnum not in chain_mapping:
                    continue

                gpcrdb_num = chain_mapping[resnum].gpcrdb
                # '' and None both mean that there is no generic number
                if not gpcrdb_num:
                    continue
                parts = gpcrdb_num.split('x')
                if len(parts) != 2:
                    continue

                bw, gn = parts
                gn = "{}.{}".format(bw.split('.')[0], gn)
                if len(gn.split('.')[1])==3:
                    gn = '-'+gn[:-1]
                try:
                    residue["CA"].set_bfactor(float(gn))
                    residue["N"].set_bfactor(float(bw))
                except (KeyError, ValueError):
                    # missing atom or non-numeric label - skip this residue
                    continue
        return self.pdb_structure

    def assign_cgn_with_sequence_parser(self, target_chain):
        """
        Collect the atoms of `target_chain` by segment and generic number.

        Returns an OrderedDict {segment: OrderedDict {label: atoms}} with one
        entry per segment in G_PROTEIN_SEGMENTS['Full'].  `label` is the
        gpcrdb attribute of a mapped residue ('<category>.<segment>.<number>')
        and `atoms` is the atom list of the structure residue, or 'x' when the
        residue is not in the structure.

        For pdb codes listed in self.exceptions, residues numbered at or above
        the first exception value are marked 'x' until the second value has
        been reached; every later lookup is shifted back by the number of
        positions marked this way.
        """
        pdb_array = OrderedDict()
        for s in G_PROTEIN_SEGMENTS['Full']:
            pdb_array[s] = OrderedDict()

        special = self.exceptions.get(self.pdb_code)
        shift = 0
        for key, vals in self.mapping[target_chain].items():
            # the label must consist of exactly three dot-separated parts
            _, segment, _ = vals.gpcrdb.split('.')

            if special is not None and shift < special[1]:
                try:
                    resnum = self.pdb_structure[target_chain][key].get_id()[1]
                except KeyError:
                    # residue not present in the structure - handled below
                    resnum = None
                if resnum is not None and resnum >= special[0]:
                    pdb_array[segment][vals.gpcrdb] = 'x'
                    shift += 1
                    continue

            try:
                atoms = self.pdb_structure[target_chain][key-shift].get_list()
            except KeyError:
                atoms = 'x'
            pdb_array[segment][vals.gpcrdb] = atoms
        return pdb_array
Esempio n. 17
0
class SequenceParser(object):
    """
    Class mapping the residues of a pdb file onto a wild type protein using blast.
    It reports missing, mutated and inserted residues.
    """

    # three-letter residue names accepted when reading a pdb ("HIS" is listed twice)
    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR", "HIS", "HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]

    def __init__(self, pdb_file=None, sequence=None, wt_protein_id=None):
        """
        Parse the pdb file (when given) and map it onto the wild type.

        pdb_file      -- pdb file to read; without it no structure is parsed
        sequence      -- optional sequence; a string is stored as a dictionary
                         {index: one-letter code}
        wt_protein_id -- id of the wild type Protein; when missing, the wild
                         type is looked up through the Structure record of
                         the pdb file and left as None if that fails
        """
        # dictionary of 'ParsedResidue' object storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        self.segments = {}
        self.blast = BlastSearch(blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']))
        self.wt_protein_id = wt_protein_id

        # defined up front so the attributes exist even without a pdb file
        self.pdb_struct = None
        self.seqres = None
        self.struct_id = None
        if pdb_file is not None:
            self.pdb_struct = PDBParser(QUIET=True).get_structure('pdb', pdb_file)[0]
            # a list of SeqRecord objects retrieved from the pdb SEQRES section
            try:
                self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))
                # SeqRecord id is a pdb_code:chain
                self.struct_id = self.seqres[0].id.split(':')[0]
            except Exception:
                # no usable SEQRES section
                self.seqres = None
                self.struct_id = None

        self.sequence = sequence
        if isinstance(sequence, str):
            self.sequence = dict(enumerate(sequence))

        # If not specified, attempt to get wildtype from pdb.
        self.wt = None
        self.wt_seq = ''
        if wt_protein_id:
            self.wt = Protein.objects.get(id=wt_protein_id)
            self.wt_seq = str(self.wt.sequence)
        elif pdb_file is not None:
            try:
                self.wt = Structure.objects.get(pdb_code__index=self.struct_id).protein_conformation.protein.parent
            except Exception:
                # best effort: the wild type stays unknown
                self.wt = None
            if self.wt is not None:
                self.wt_seq = str(self.wt.sequence)
        self.fusions = []

        if self.pdb_struct is not None:
            self.parse_pdb(self.pdb_struct)

        self.mark_deletions()


    def parse_pdb(self, pdb_struct):
        """
        extracting sequence and preparing dictionary of residues
        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        """
        
        for chain in pdb_struct:
            self.residues[chain.id] = []
            
            for res in chain:
            #in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                if res.resname.replace('HID', 'HIS') not in self.residue_list:
                    continue
                self.residues[chain.id].append(res)
            poly = self.get_chain_peptides(chain.id)
            for peptide in poly:
                #print("Start: {} Stop: {} Len: {}".format(peptide[0].id[1], peptide[-1].id[1], len(peptide)))
                self.map_to_wt_blast(chain.id, peptide, None, int(peptide[0].id[1]))


    def get_segments(self):
        """
        Determine the pdb residue range of every protein segment.

        Uses the mapping of the first chain.  For each ProteinSegment the pdb
        numbers of the mapped wild type residues are collected and stored in
        self.segments as {segment slug: [lowest, highest]}; segments without
        any mapped residue are left out.  Returns self.segments.
        """
        #get the first chain
        first_chain = list(self.mapping.keys())[0]
        chain_mapping = self.mapping[first_chain]

        for segment in ProteinSegment.objects.all():
            wt_residues = Residue.objects.filter(protein_conformation__protein=self.wt.id, protein_segment=segment)
            pdb_numbers = [chain_mapping[r.sequence_number].resnum for r in wt_residues]
            resi = [number for number in pdb_numbers if number is not None]
            if not resi:
                continue
            self.segments[segment.slug] = [min(resi), max(resi)]
        return self.segments


    def get_chain_peptides(self, chain_id, gap_threshold=230):
        """
        Get peptides of sequential residue numbers (with respect to 230 aa gaps).
        The maximum length of ICL3 is 230 aa, and fusion proteins usualy have significantly different numbers, i.e. exceeding the 230 gap between TM5 and 6.

        The maximum allowed gap size can be evaluated automaticaly, but it is fairly costly:
        max([len(Residue.objects.filter(protein_segment=11, protein_conformation__protein=x)) for x in Protein.objects.filter(species=1)])
        """

        rnumbers = [int(x.id[1]) for x in self.residues[chain_id]]
        last_idx = len(rnumbers)-1
        peptides = []
        tmp = []
        for i, rnum in enumerate(rnumbers):
            if i == last_idx:
                #FIXME: Assuming that very last residue is actualy continuation of a chain
                tmp.append(self.residues[chain_id][i])
                peptides.append(tmp)
                break
            if rnumbers[i+1] != rnum+1 and abs(rnum+1 - rnumbers[i+1]) > gap_threshold:
                tmp.append(self.residues[chain_id][i])
                peptides.append(tmp)
                tmp = []
            else:
                tmp.append(self.residues[chain_id][i])
        return peptides


    def get_chain_sequence(self, chain):
        """
        Returns a sequence string of a given chain.
        """
        return "".join([polypeptide.three_to_one(x.resname.replace('HID', 'HIS')) for x in self.residues[chain] if x.resname in self.residue_list])

    def get_peptide_sequence(self, residues):
        """
        Returns a sequence string of a given list of Bio.PDB.Residue objects.
        """
        return "".join([polypeptide.three_to_one(x.resname.replace('HID', 'HIS')) for x in residues if x.resname in self.residue_list])
    
    def find_nonredundant_chains(self):
        """
        Returns a list of nonidentical chains.
        """
        nrc = []
        if len(self.mapping.keys()) == 1:
            return self.mapping.keys()

        for r_chain in self.mapping.keys():
            for chain in self.mapping.keys():
                if r_chain == chain:
                    continue
                if self.mapping[r_chain] != self.mapping[chain]:
                    nrc.append(r_chain)
        return nrc


    def map_to_wt_blast(self, chain_id, residues = None, sequence=None, starting_aa = 1, seqres = False):

        if residues:
            seq = self.get_peptide_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            seq = self.get_chain_sequence(chain_id)
        alignments = self.blast.run(seq)
        
        if self.wt_protein_id!=None:
            self.wt = Protein.objects.get(id=self.wt_protein_id)
        else:
            self.wt = None
        for alignment in alignments:
            if self.wt==None:
                try:
                    self.wt = Protein.objects.get(entry_name=str(alignment[1].hit_def))
                    wt_resi = list(Residue.objects.filter(protein_conformation__protein=self.wt.id))
                    self.mapping[chain_id] = {x.sequence_number: ParsedResidue(x.amino_acid, x.sequence_number, str(x.display_generic_number) if x.display_generic_number else None, x.protein_segment) for x in wt_resi}
                except:
                    pass
            else:
                wt_resi = list(Residue.objects.filter(protein_conformation__protein=self.wt.id))
                self.mapping[chain_id] = {x.sequence_number: ParsedResidue(x.amino_acid, x.sequence_number, str(x.display_generic_number) if x.display_generic_number else None, x.protein_segment) for x in wt_resi}
            if alignment[1].hsps[0].expect > .5 and residues:
                # self.fusions.append(AuxProtein(residues))
                #The case when auxiliary protein is in a separate chain
                if self.get_chain_sequence(chain_id) == self.get_peptide_sequence(residues) and chain_id in self.mapping:
                    del self.mapping[chain_id]
                continue
            if self.wt.id != int(alignment[0]):
                continue
            for hsps in alignment[1].hsps:
                self.map_hsps(hsps, chain_id, starting_aa, seqres)
    

    def map_hsps(self, hsps, chain_id, offset = 1, seqres = False):
        """
        Analyzes the High Similarity Protein Segment.
        """
        q = hsps.query
        sbjct = hsps.sbjct
        sbjct_counter = hsps.sbjct_start	
        q_counter = hsps.query_start
        
        for s, q in zip(sbjct, q):
            
            if s == q:
                if seqres:
                    self.mapping[chain_id][sbjct_counter].set_seqres(True)
                else:
                    self.mapping[chain_id][sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                sbjct_counter += 1
                q_counter += 1
            elif s != '-' and q != '-':
                self.mapping[chain_id][sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                self.mapping[chain_id][sbjct_counter].set_mutation(q)
                sbjct_counter += 1
                q_counter += 1
            elif s == '-' and q != '-':
                self.mapping[chain_id][offset - 1 + q_counter].set_insertion(q)
                sbjct_counter += 1
                q_counter += 1
            elif s != '-' and q == '-':
                self.mapping[chain_id][sbjct_counter].set_deletion()
                sbjct_counter += 1
                q_counter += 1

    def map_to_wt_pw(self, chain_id, residues = None, sequence=None, starting_aa = 1):

        """
        @param sequence: a dictionary of residue number: residue one letter code pairs
        """

        if residues:
            seq = self.get_chain_sequence(residues)
        elif sequence:
            seq = sequence.values()
        else:
            return

        wt, chain_seq, score, start, end = pairwise2.align.localms(self.wt_seq, seq, 2, -4, -4, -.1, one_alignment_only=True)[0]

        offset = 0
        for w, c in zip(wt, chain_seq):
            if w == c:
                if seqres:
                    self.mapping[chain.id][starting_aa + offset].seqres=True
                r = Residue.objects.get(sequence_number=offset+self.wt_seq_start, protein_conformation__protein=self.wt.id)
                if r.display_generic_number is not None:
                    self.mapping[chain_id][starting_aa + offset].add_gpcrdb(r.display_generic_number)
                offset += 1
            elif c == '-' and w != '-':
                self.mapping[chain_id][starting_aa + offset].add_deletion()
            elif w != '-' and c != '-' and w != c:
                self.mapping[chain_id][starting_aa + offset].add_mutation(c)
                offset += 1
            elif w == '-' and c != '-':
                self.mapping[chain_id][starting_aa + offset].add_insertion(c)
                offset += 1


    def map_seqres(self):

        for sr in self.seqres:
            self.map_to_wt_blast(sr.annotations['chain'], sequence=sr.seq, seqres=True)

    def mark_deletions(self):
        for chain in self.mapping.keys():
            for num, res in self.mapping[chain].items():
                if res.resnum is None:
                    res.set_deletion()

    def get_mapping_dict(self, pdb_keys=False, seqres=False):

        if pdb_keys:
            return {x: {y: self.mapping[x][y].seqres if seqres else self.mapping[x][y].resnum for y in self.mapping[x].keys()} for x in self.mapping.keys()}
        else:
            if seqres:
                return {x: {y: self.mapping[x][y].resnum if self.mapping[x][y].seqres else '-' for y in self.mapping[x].keys()} for x in self.mapping.keys()}
            else:
                return {x: {y: self.mapping[x][y].resnum for y in self.mapping[x].keys()} for x in self.mapping.keys()}

    def get_fusions(self):

        if self.fusions == []:
            return {}
        fusion_dict = OrderedDict({"auxiliary": {}})
        count = 1
        for fusion in self.fusions:
            fusion_dict["auxiliary"]["aux{}".format(count)] = fusion.get_info()
        return fusion_dict

    def get_deletions(self):

        deletions_list = []

        for chain in self.find_nonredundant_chains():
            deletions = [x for x,y in self.mapping[chain].items() if y.deletion]
            deletion = deletions.reverse()
            tmp = []
            #for num, res in self.mapping[chain].items():
            #    if res.deletion:
            #        tmp.append(num)
            first = 0
            prev = 0
            while deletions != []:
                x = deletions.pop()
                #print("{}\t{}\t{}".format(x, first, prev))
                if first == 0:
                    tmp.append(x)
                    first = x
                    continue
                if prev == 0:
                    tmp.append(x)
                    prev = x
                    continue
                if abs(x - prev) == 1:
                    tmp.append(x)
                    prev = x
                else:
                    deletions_list.append(OrderedDict({
                        "start" : min(tmp),
                        "end" : max(tmp),
                        "type" : "single" if len(tmp) == 1 else "range",
                        "chain" : chain
                        }))
                    tmp = [x]
                    first = x
                    prev = x
            deletions_list.append(OrderedDict({
                        "start" : min(tmp),
                        "end" : max(tmp),
                        "type" : "single" if len(tmp) == 1 else "range",
                        "chain" : chain
                        }))

        return {"deletions" : deletions_list}

    def get_mutations(self):

        mutations_list = []
        for chain in self.find_nonredundant_chains():
            for num, res in self.mapping[chain].items():
                if res.mutation:
                    mutations_list.append(OrderedDict({
                        "wt" : res.name,
                        "mut" : res.mutation,
                        "pos (wt)" : num,
                        "pos (pdb)" : res.resnum,
                        "chain" : chain
                        }))
        return {"mutations" : mutations_list }


    def get_report(self):
        """
        Print the mapping: a 'Chain <id>' line per chain followed by one line
        per mapped residue, both in sorted order.
        """
        for chain in sorted(self.mapping):
            print("Chain {}".format(chain))
            chain_mapping = self.mapping[chain]
            for number in sorted(chain_mapping):
                print(chain_mapping[number])

    def save_excel_report(self, file_name):
        """
        Write the mapping to an excel workbook with one worksheet per chain.

        Every worksheet starts with a header row, followed by one row per
        mapped residue (sorted) filled with its get_param_list() values.
        """
        workbook = xlsxwriter.Workbook(file_name)

        for chain in sorted(self.mapping):
            chain_mapping = self.mapping[chain]
            worksheet = workbook.add_worksheet(chain)
            worksheet.write_row(0,0,["Protein number", "Residue name", "PDB number", "Generic number", "Mutation", "SEQRES"])

            # data rows start right below the header
            for row_id, number in enumerate(sorted(chain_mapping), start=1):
                worksheet.write_row(row_id, 0, chain_mapping[number].get_param_list())
        workbook.close()
class GenericNumbering(object):
    """
    Assigns generic residue numbers to the residues of a pdb structure and
    stores them in the B-factor fields of the CA and N atoms.
    """
    
    
    # three-letter residue names that are kept when a structure is parsed
    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR","HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]
  
    def __init__ (self, pdb_file=None, pdb_filename=None, structure=None, blast_path='blastp',
        blastdb=os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_blastdb']),top_results=1):
        """
        Set up the blast search, load the structure and parse its residues.

        pdb_file     -- name/path of a pdb file or a handle to an open file
        pdb_filename -- path used when pdb_file is not given
        structure    -- already parsed structure, used when neither file
                        argument is given
        blast_path, blastdb, top_results -- passed on to BlastSearch
        """
        self.pdb_file = pdb_file
        self.pdb_filename = pdb_filename

        # chain id -> {residue number: MappedResidue}
        self.residues = {}
        # chain id -> sequence of the chain
        self.pdb_seq = {}
        # list of uniprot ids returned from blast
        self.prot_id_list = []
        # setup for local blast search
        self.blast = BlastSearch(blast_path=blast_path, blastdb=blastdb,top_results=top_results)

        source = self.pdb_file or self.pdb_filename
        if source:
            self.pdb_structure = PDBParser(PERMISSIVE=True, QUIET=True).get_structure('ref', source)[0]
        else:
            self.pdb_structure = structure

        self.parse_structure(self.pdb_structure)


    def parse_structure(self, pdb_struct):
        """
        Extract the sequence of every chain and prepare the dictionary of residues.

        bio.pdb reads pdb in the following cascade: model->chain->residue->atom,
        so pdb_struct is iterated as chains and every chain as residues.

        For each chain this fills
          self.residues[chain.id] -- {residue number: MappedResidue}
          self.pdb_seq[chain.id]  -- one-letter sequence ordered by residue number
        and stores the 1-based position in that sequence in
        MappedResidue.pos_in_aln.
        """
        for chain in pdb_struct:
            chain_residues = {}
            self.residues[chain.id] = chain_residues

            for res in chain:
                # residues outside residue_list (waters, ligands, ...) are skipped
                if res.resname not in self.residue_list:
                    continue
                # HID is stored as histidine; previously its one-letter code was
                # computed but the residue itself was never added
                three_letter = 'HIS' if res.resname == "HID" else res.resname
                # in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
                resnum = res.id[1]
                chain_residues[resnum] = MappedResidue(resnum, polypeptide.three_to_one(three_letter))

            ordered = sorted(chain_residues)
            self.pdb_seq[chain.id] = ''.join(chain_residues[x].name for x in ordered)

            for pos, resnum in enumerate(ordered, start=1):
                chain_residues[resnum].pos_in_aln = pos


    def locate_res_by_pos (self, chain, pos):
        """
        Return the residue number in `chain` whose pos_in_aln equals `pos`,
        or 0 when no residue has that position.
        """
        chain_residues = self.residues[chain]
        return next(
            (resnum for resnum, mapped in chain_residues.items() if mapped.pos_in_aln == pos),
            0,
        )


    def map_blast_seq (self, prot_id, hsps, chain):
        """
        Transfer database annotations onto the residues of `chain` using one
        blast alignment.

        prot_id -- protein whose Residue records are looked up
        hsps    -- alignment providing query, sbjct, query_start and sbjct_start
        chain   -- id of the chain in self.residues the query was built from

        Only alignment columns where query and subject carry the same residue
        are annotated: the segment slug and, when present, the generic numbers
        of the database residue are copied to the matching MappedResidue.
        prot_id is appended to self.prot_id_list once a query residue has been
        located.
        """
        logger.info("{}\n{}".format(hsps.query, hsps.sbjct))
        logger.info("{:d}\t{:d}".format(hsps.query_start, hsps.sbjct_start))

        #find uniprot residue numbers corresponding to those in pdb file
        rs = Residue.objects.prefetch_related('display_generic_number', 'protein_segment').filter(
            protein_conformation__protein=prot_id)
        db_residues = {r.sequence_number: r for r in rs}

        skipped = ('-', 'X', ' ')
        subj_counter = hsps.sbjct_start
        q_counter = hsps.query_start

        # walk the alignment column by column; zip replaces the repeated
        # list.pop(0) calls, which were quadratic in the alignment length
        for q_char, s_char in zip(hsps.query, hsps.sbjct):
            #skipping position if there is a gap in either of sequences
            if q_char in skipped:
                subj_counter += 1
                continue
            if s_char in skipped:
                q_counter += 1
                continue

            if s_char == q_char:
                resn = self.locate_res_by_pos(chain, q_counter)
                if resn != 0:
                    if subj_counter in db_residues:
                        db_res = db_residues[subj_counter]
                        mapped = self.residues[chain][resn]

                        if db_res.protein_segment:
                            mapped.add_segment(db_res.protein_segment.slug)

                        if db_res.display_generic_number:
                            num = db_res.display_generic_number.label
                            bw, gpcrdb = num.split('x')
                            gpcrdb = "{}.{}".format(bw.split('.')[0], gpcrdb)
                            mapped.add_bw_number(bw)
                            mapped.add_gpcrdb_number(gpcrdb)
                            mapped.add_gpcrdb_number_id(db_res.display_generic_number.id)
                            mapped.add_display_number(num)
                            mapped.add_residue_record(db_res)
                    else:
                        logger.warning("Could not find residue {} {} in the database.".format(resn, subj_counter))

                    if prot_id not in self.prot_id_list:
                        self.prot_id_list.append(prot_id)

            # aligned column (match or mismatch): both sequences advance
            q_counter += 1
            subj_counter += 1
    
                    
    def get_substructure_mapping_dict(self):
        """
        Group residue numbers by segment.

        Returns a dictionary {segment: [residue number, ...]} collected over
        all chains in self.residues.
        """
        mapping_dict = {}
        for chain_residues in self.residues.values():
            for mapped in chain_residues.values():
                mapping_dict.setdefault(mapped.segment, []).append(mapped.number)
        return mapping_dict


    def get_annotated_structure(self):
        """
        Write the generic numbers into the B-factor fields of the structure.

        For every residue present in self.residues the gpcrdb number is stored
        on the CA atom and the bw number on the N atom (values equal to 0 are
        left out).  Returns self.pdb_structure.
        """
        for chain in self.pdb_structure:
            for residue in chain:
                resnum = residue.id[1]
                chain_map = self.residues[chain.id]
                if resnum not in chain_map:
                    continue
                mapped = chain_map[resnum]
                if mapped.gpcrdb != 0.:
                    residue["CA"].set_bfactor(float(mapped.gpcrdb))
                if mapped.bw != 0.:
                    residue["N"].set_bfactor(float(mapped.bw))

        return self.pdb_structure
  
  
    def save_gn_to_pdb(self):
        """
        Save the structure with generic numbers in the B-factor fields.

        The numbers are written by get_annotated_structure (gpcrdb number on
        CA, bw number on N).  The output file is named after
        self.pdb_filename with '_GPCRDB' inserted before the extension.
        Nothing is returned.
        """
        # one shared annotation routine instead of a second copy of the loop
        self.get_annotated_structure()

        #get the basename, extension and export the pdb structure with b-w numbers
        root, ext = os.path.splitext(self.pdb_filename)
        io=PDBIO()
        io.set_structure(self.pdb_structure)
        io.save("%s_GPCRDB%s" %(root, ext))
           
    
    def assign_generic_numbers(self):
        """
        Run blast for every chain, map the hits onto the pdb residues and
        return the annotated structure.
        """
        # blast search goes first, for all the chains
        alignments = {chain: self.blast.run(seq) for chain, seq in self.pdb_seq.items()}

        # map the results onto pdb sequence for every sequence pair from blast
        for chain, hits in alignments.items():
            for alignment in hits:
                if alignment == []:
                    continue
                prot_id, hit = alignment[0], alignment[1]
                for hsps in hit.hsps:
                    self.map_blast_seq(prot_id, hsps, chain)
        return self.get_annotated_structure()
Esempio n. 19
0
class SequenceParser(object):
    """
    Class mapping the pdb, pdb_seqres, wildtype and any given sequence onto wt using blast with human sequences database. It produces a report with missing, mutated and inserted residues.
    """

    residue_list = ["ARG","ASP","GLU","HIS","ASN","GLN","LYS","SER","THR", "HIS", "HID","PHE","LEU","ILE","TYR","TRP","VAL","MET","PRO","CYS","ALA","GLY"]

    def __init__(self, pdb_file, sequence=None, wt_protein_id=None):
        """
        Parse the structure and its SEQRES records, resolve the wild type protein and index the chains.

        pdb_file is passed both to PDBParser.get_structure and to SeqIO.parse.
        sequence is accepted but not used by the constructor.
        wt_protein_id is the id of the wild type Protein; when it is falsy the wild type is taken
        from the Structure whose pdb code matches the id of the first SEQRES record.
        """
        # per-chain dictionaries of 'ParsedResidue' objects storing information about alignments and bw numbers
        self.mapping = {}
        self.residues = {}
        human_blastdb = os.sep.join([settings.STATICFILES_DIRS[0], 'blast', 'protwis_human_blastdb'])
        self.blast = BlastSearch(blastdb=human_blastdb)

        # only the first model of the parsed structure is kept
        parser = PDBParser(QUIET=True)
        self.pdb_struct = parser.get_structure('pdb', pdb_file)[0]
        # a list of SeqRecord objects retrieved from the pdb SEQRES section
        self.seqres = list(SeqIO.parse(pdb_file, 'pdb-seqres'))

        # SeqRecord id is a pdb_code:chain
        first_record_id = self.seqres[0].id
        self.struct_id = first_record_id.split(':')[0]

        if wt_protein_id:
            self.wt = Protein.objects.get(id=wt_protein_id)
        else:
            # no explicit wild type given, derive it from the structure entry
            structure_entry = Structure.objects.get(pdb_code__index=self.struct_id)
            self.wt = structure_entry.protein_conformation.protein.parent
        self.wt_seq = str(self.wt.sequence)
        self.fusions = []

        self.parse_pdb(self.pdb_struct)



    def parse_pdb(self, pdb_struct):
        """
        Collect the amino acid residues of every chain and prepare the per-chain mapping.

        bio.pdb reads pdb in the following cascade: model->chain->residue->atom
        For each chain self.residues[chain.id] receives the residues whose name (with HID read as
        HIS) is listed in residue_list, and self.mapping[chain.id] receives one ParsedResidue per
        wild type residue, keyed by its sequence number.
        """
        wt_residues = list(Residue.objects.filter(protein_conformation__protein=self.wt.id))
        for chain in pdb_struct:
            kept = []
            self.residues[chain.id] = kept

            chain_map = {}
            for wt_res in wt_residues:
                generic = str(wt_res.display_generic_number) if wt_res.display_generic_number else None
                chain_map[wt_res.sequence_number] = ParsedResidue(wt_res.amino_acid, wt_res.sequence_number, generic)
            self.mapping[chain.id] = chain_map

            #in bio.pdb the residue's id is a tuple of (hetatm flag, residue number, insertion code)
            for res in chain:
                if res.resname.replace('HID', 'HIS') in self.residue_list:
                    kept.append(res)


    def get_chain_peptides(self, chain_id, gap_threshold=230):
        """
        Get peptides of sequential residue numbers (with respect to 230 aa gaps).
        The maximum length of ICL3 is 230 aa, and fusion proteins usualy have significantly different numbers, i.e. exceeding the 230 gap between TM5 and 6.

        The maximum allowed gap size can be evaluated automaticaly, but it is fairly costly:
        max([len(Residue.objects.filter(protein_segment=11, protein_conformation__protein=x)) for x in Protein.objects.filter(species=1)])
        """

        rnumbers = [int(x.id[1]) for x in self.residues[chain_id]]
        last_idx = len(rnumbers)-1
        peptides = []
        tmp = []
        for i, rnum in enumerate(rnumbers):
            if i == last_idx:
                #FIXME: Assuming that very last residue is actualy continuation of a chain
                tmp.append(self.residues[chain_id][i])
                peptides.append(tmp)
                break
            if rnumbers[i+1] != rnum+1 and abs(rnum+1 - rnumbers[i+1]) > gap_threshold:
                tmp.append(self.residues[chain_id][i])
                peptides.append(tmp)
                tmp = []
            else:
                tmp.append(self.residues[chain_id][i])
        return peptides


    def get_chain_sequence(self, chain):
        return "".join([polypeptide.three_to_one(x.resname.replace('HID', 'HIS')) for x in chain if x.resname in self.residue_list])
    

    def map_to_wt_blast(self, chain_id, residues = None, sequence=None, starting_aa = 1, seqres = False):

        if residues:
            seq = self.get_chain_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            return

        alignments = self.blast.run(seq)

        for alignment in alignments:
            if alignment[1].hsps[0].expect > 1. and residues:
                self.fusions.append(residues)
                #for res in residues:
                #    self.mapping[chain_id][res.id[1]].set_fusion()
            if self.wt.id != int(alignment[0]):
                continue
            for hsps in alignment[1].hsps:
                self.map_hsps(hsps, chain_id, starting_aa, seqres)
    

    def map_hsps(self, hsps, chain_id, offset = 1, seqres = False):
        """
        Analyzes the High Similarity Protein Segment.
        """
        q = hsps.query
        sbjct = hsps.sbjct
        sbjct_counter = hsps.sbjct_start	
        q_counter = hsps.query_start

        for s, q in zip(sbjct, q):
            if s == q:
                #r = Residue.objects.get(sequence_number=sbjct_counter, protein_conformation__protein=self.wt.id)
                #if r.display_generic_number is not None:
                #    self.mapping[chain_id][offset + q_counter].set_gpcrdb(r.display_generic_number)
                
                #self.mapping[chain_id][offset - 1 + q_counter].set_wt_number(sbjct_counter)
                if seqres:
                    self.mapping[chain_id][sbjct_counter].set_seqres(True)
                else:
                    self.mapping[chain_id][sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                sbjct_counter += 1
                q_counter += 1
            elif s != '-' and q != '-':
                #print(s)
                #self.mapping[chain_id][offset - 1 + q_counter].set_mutation(s)
                #self.mapping[chain_id][offset - 1 + q_counter].set_wt_number(sbjct_counter)
                self.mapping[chain_id][sbjct_counter].set_pdb_res_num(offset - 1 + q_counter)
                self.mapping[chain_id][sbjct_counter].set_mutation(q)
                sbjct_counter += 1
                q_counter += 1
            elif s == '-' and q != '-':
                self.mapping[chain_id][offset - 1 + q_counter].set_insertion(q)
                q_counter += 1


    def map_to_wt_pw(self, chain_id, residues = None, sequence=None, starting_aa = 1):

        if residues:
            seq = self.get_chain_sequence(residues)
        elif sequence:
            seq = sequence
        else:
            return

        wt, chain_seq, score, start, end = pairwise2.align.localms(self.wt_seq, seq, 2, -4, -4, -.1, one_alignment_only=True)[0]

        offset = 0
        for w, c in zip(wt, chain_seq):
            if w == c:
                if seqres:
                    self.mapping[chain.id][starting_aa + offset].seqres=True
                r = Residue.objects.get(sequence_number=offset+self.wt_seq_start, protein_conformation__protein=self.wt.id)
                if r.display_generic_number is not None:
                    self.mapping[chain_id][starting_aa + offset].add_gpcrdb(r.display_generic_number)
                offset += 1
            elif c == '-' and w != '-':
                print(offset)
                self.mapping[chain_id][starting_aa + offset].add_deletion()
            elif w != '-' and c != '-' and w != c:
                self.mapping[chain_id][starting_aa + offset].add_mutation(c)
                offset += 1
            elif w == '-' and c != '-':
                self.mapping[chain_id][starting_aa + offset].add_insertion(c)
                offset += 1


    def map_seqres(self):

        for sr in self.seqres:
            self.map_to_wt_blast(sr.annotations['chain'], sequence=sr.seq, seqres=True)


    def get_report(self):

        for chain in sorted(self.mapping.keys()):
            print("Chain {}".format(chain))
            for res in sorted(self.mapping[chain].keys()):
                print(self.mapping[chain][res])

    def save_excel_report(self, file_name):
        """
        Write the mapping to an xlsx workbook with one worksheet per chain.

        Every worksheet is named after its chain, starts with a header row and then holds one row
        per entry (in sorted key order) filled with the list returned by get_param_list().
        """
        header = ["Protein number", "Residue name", "PDB number", "Generic number", "Mutation", "SEQRES"]
        workbook = xlsxwriter.Workbook(file_name)

        for chain_id in sorted(self.mapping):
            sheet = workbook.add_worksheet(chain_id)
            sheet.write_row(0,0,header)

            chain_map = self.mapping[chain_id]
            # row 0 holds the header, the entries start at row 1
            for row_idx, number in enumerate(sorted(chain_map), start=1):
                sheet.write_row(row_idx, 0, chain_map[number].get_param_list())
        workbook.close()