Esempio n. 1
0
    def get_pathway_protein_names(self, pathway):
        """Return the rows of the protein table attached to a pathway.

        The pathway page, such as
        http://www.biocarta.com/pathfiles/m_actinYPathway.asp
        is parsed to figure out the URL that would pop up if the protein
        list button were pressed, for instance
        http://www.biocarta.com/pathfiles/PathwayProteinList.asp?showPFID=175

        That second page is parsed in turn: the table whose first ``th``
        reads "Gene Name" is located and scanned for ``tr`` and ``td`` tags.
        This relies on the HTML layout of the site and is therefore not
        very robust.

        :param pathway: name of the pathway page; it is appended to
            ``self._url + "/pathfiles/"``.
        :return: a list of lists, one per non-empty table row, holding the
            text of each cell (expected to be gene name, locusId and
            accession, the latter often empty). ``None`` is returned when
            the pathway page has no protein-list link or when the protein
            page has no "Gene Name" table.
        """
        url = self._url + "/pathfiles/" + pathway
        self.logging.info("Reading " + url)
        page = readXML(url)

        # Look the attribute up with get(): on a BeautifulSoup tag (which is
        # presumably what readXML returns) the ``in`` operator tests the
        # tag's children, not its attributes, and anchors without an href
        # yield None.
        protein_urls = []
        for anchor in page.findAll("a"):
            href = anchor.get("href")
            if href and "Protein" in href:
                protein_urls.append(href)
        if not protein_urls:
            return None

        # Keep only what follows "/pathfiles/" and drop the trailing quote
        # and parenthesis left over from the surrounding link text.
        link = protein_urls[0].split("/pathfiles/")[-1]
        link = str(link)  # get rid of unicode ?
        link = link.strip("')")
        url = self._url + "/pathfiles/" + link
        self.logging.info("Reading " + url)
        page = readXML(url)

        # findAll("th") searches recursively, so every table enclosing the
        # real one matches as well (three matches were observed on the
        # site). Matches come in document order, hence the innermost table
        # is the last one; this equals the former hard-coded third match
        # whenever exactly three tables match and no longer fails when
        # there are fewer.
        candidates = []
        for table in page.findAll("table"):
            headers = table.findAll("th")
            if headers and headers[0].getText() == "Gene Name":
                candidates.append(table)
        if not candidates:
            self.logging.info("No 'Gene Name' table found in " + url)
            return None
        table = candidates[-1]

        # now get the genename, locus and accession
        rows = [[cell.getText() for cell in row.findAll("td")]
                for row in table.findAll("tr")]
        # Rows without any td (e.g. the header row made of th) are dropped.
        return [row for row in rows if row]
Esempio n. 2
0
    def get_pathway_protein_names(self, pathway):
        """Return the rows of the protein table attached to a pathway.

        The pathway page, such as
        http://www.biocarta.com/pathfiles/m_actinYPathway.asp
        is parsed to figure out the URL that would pop up if the protein
        list button were pressed, for instance
        http://www.biocarta.com/pathfiles/PathwayProteinList.asp?showPFID=175

        That second page is parsed in turn: the table whose first ``th``
        reads "Gene Name" is located and scanned for ``tr`` and ``td`` tags.
        This relies on the HTML layout of the site and is therefore not
        very robust.

        :param pathway: name of the pathway page; it is appended to
            ``self._url + "/pathfiles/"``.
        :return: a list of lists, one per non-empty table row, holding the
            text of each cell (expected to be gene name, locusId and
            accession, the latter often empty). ``None`` is returned when
            the pathway page has no protein-list link or when the protein
            page has no "Gene Name" table.
        """
        url = self._url + "/pathfiles/" + pathway
        self.logging.info("Reading " + url)
        page = readXML(url)

        # Look the attribute up with get(): on a BeautifulSoup tag (which is
        # presumably what readXML returns) the ``in`` operator tests the
        # tag's children, not its attributes, and anchors without an href
        # yield None.
        protein_urls = []
        for anchor in page.findAll("a"):
            href = anchor.get("href")
            if href and "Protein" in href:
                protein_urls.append(href)
        if not protein_urls:
            return None

        # Keep only what follows "/pathfiles/" and drop the trailing quote
        # and parenthesis left over from the surrounding link text.
        link = protein_urls[0].split("/pathfiles/")[-1]
        link = str(link)  # get rid of unicode ?
        link = link.strip("')")
        url = self._url + "/pathfiles/" + link
        self.logging.info("Reading " + url)
        page = readXML(url)

        # findAll("th") searches recursively, so every table enclosing the
        # real one matches as well (three matches were observed on the
        # site). Matches come in document order, hence the innermost table
        # is the last one; this equals the former hard-coded third match
        # whenever exactly three tables match and no longer fails when
        # there are fewer.
        candidates = []
        for table in page.findAll("table"):
            headers = table.findAll("th")
            if headers and headers[0].getText() == "Gene Name":
                candidates.append(table)
        if not candidates:
            self.logging.info("No 'Gene Name' table found in " + url)
            return None
        table = candidates[-1]

        # now get the genename, locus and accession
        rows = [[cell.getText() for cell in row.findAll("td")]
                for row in table.findAll("tr")]
        # Rows without any td (e.g. the header row made of th) are dropped.
        return [row for row in rows if row]
Esempio n. 3
0
    def get_pathway_names(self, startswith=""):
        """Return the sorted pathway names found on the BioCarta listing.

        The page at ``self._allPathwaysURL`` (all human and mouse pathways,
        according to the original note) is parsed and every link whose
        ``href`` contains "pathfiles" contributes the last component of its
        path. Duplicates are removed.

        :param startswith: only names starting with this prefix are kept;
            the default empty string keeps every name.
        :return: alphabetically sorted list of pathway names (``str``).
        """
        page = readXML(self._allPathwaysURL)
        names = set()
        for anchor in page.findAll("a"):
            href = anchor.get("href")
            # Anchors without an href yield None, which would make the
            # substring test raise TypeError; skip them.
            if href and "pathfiles" in href:
                # keep only the file name, dropping the leading path
                names.add(str(href.split("/")[-1]))
        return [name for name in sorted(names) if name.startswith(startswith)]
Esempio n. 4
0
 def get_pathway_names(self, startswith=""):
     """Return pathway names from BioCarta (currently disabled).

     The method raises immediately, so nothing is fetched or returned.
     The statements after the ``raise`` are the retained implementation:
     they collect the last path component of every link containing
     "pathfiles" on the page at ``self._allPathwaysURL`` (all human and
     mouse, per the original note), de-duplicate and sort them, and keep
     those starting with ``startswith``.

     :param startswith: prefix used to perform a selection on the names.
     :raises NotImplementedError: always.
     """
     raise NotImplementedError
     # Unreachable while the raise above is in place; kept for reference.
     page = readXML(self._allPathwaysURL)
     names = set()
     for anchor in page.findAll("a"):
         if "pathfiles" in anchor.get("href"):
             # keep only the file name, dropping the leading path
             names.add(str(anchor.get("href").split("/")[-1]))
     return [name for name in sorted(names) if name.startswith(startswith)]