Example #1
0
def main():
    """Build the HGNC download URL and load the parsed table into ``genes``.

    The available column identifiers are read from the HGNC download page,
    a query requesting every one of them is assembled, and the parsed
    contents of the resulting ``hgnc.txt`` file are added to the
    module-level ``genes`` collection, which is then saved and has its
    mappings built.
    """
    os.chdir(path)

    # Inspect the download page to retrieve all available attributes.
    page = HTML(url="http://www.genenames.org/cgi-bin/hgnc_downloads.cgi")
    attributes = page.find_between("</td> <td>", "</td>", '"', all=True)
    print("Number of attributes: %s" % len(attributes))

    # The query is a fixed prefix, one ";col=<attribute>" entry per
    # attribute (e.g. ";col=gd_hgnc_id;col=gd_app_sym;col=gd_app_name"),
    # and a fixed suffix selecting statuses, ordering and output format.
    query_prefix = "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=Core+Data"
    query_columns = ";col=" + ";col=".join(attributes)
    query_suffix = ";status=Approved;status=Approved+Non-Human;status=Entry+Withdrawn;status_opt=3;=on;where=;order_by=gd_app_sym_sort;limit=;format=text;submit=submit;.cgifields=;.cgifields=status;.cgifields=chr"
    download_url = "".join([query_prefix, query_columns, query_suffix])

    hgnc_file = File(name="hgnc.txt", url=download_url, path=path)
    records = hgnc_file.parse(printing=False, header=True)

    # Label the collection as human HGNC data (taxid 9606) before filling it.
    genes.name = "HGNC"
    genes.key = "hgnc"
    genes.taxid = 9606
    genes.addData(records)
    genes.save()
    genes.buildMappings()
Example #2
0
def main():
    """Load the HGNC gene table into the module-level ``genes`` collection.

    Steps: read the list of available column attributes from the HGNC
    download page, build a query URL that requests all of them, parse the
    ``hgnc.txt`` file created from that URL, then add the rows to ``genes``,
    save it and build its mappings.
    """
    os.chdir(path)

    # Check the download page's html to retrieve all available attributes.
    downloads_page = HTML(
        url="http://www.genenames.org/cgi-bin/hgnc_downloads.cgi")
    attributes = downloads_page.find_between(
        "</td> <td>", "</td>", '"', all=True)
    attribute_count = len(attributes)
    print("Number of attributes: %s" % attribute_count)

    # Building url: prefix + ";col=<attribute>" for each attribute + suffix,
    # e.g. the middle part looks like ";col=gd_hgnc_id;col=gd_app_sym;...".
    url_parts = (
        "http://www.genenames.org/cgi-bin/hgnc_downloads.cgi?title=Core+Data",
        ";col=" + ";col=".join(attributes),
        ";status=Approved;status=Approved+Non-Human;status=Entry+Withdrawn;status_opt=3;=on;where=;order_by=gd_app_sym_sort;limit=;format=text;submit=submit;.cgifields=;.cgifields=status;.cgifields=chr",
    )
    full_url = url_parts[0] + url_parts[1] + url_parts[2]

    table = File(name="hgnc.txt", url=full_url, path=path)
    rows = table.parse(printing=False, header=True)

    genes.name = "HGNC"
    genes.key = "hgnc"
    genes.taxid = 9606  # NCBI taxonomy id used for these (human) records
    genes.addData(rows)
    genes.save()
    genes.buildMappings()
Example #3
0
def main(interactions=False,
         download=True,
         parse=True,
         withdrawn=True,
         cleanup=True):
    """Performs the download of interaction and annotation files from MGI.

    Builds a gene annotation file and mapping tables in the module-level
    ``genes`` collection.

    Args:
        interactions: If true, download the MGI protein-interaction-data
            FTP directory (and remove it again when ``cleanup`` is set).
        download: If true, fetch the MGI report files listed below and
            record their names in ``folder.downloads``.
        parse: If true, parse the report files found in ``path`` and add
            their rows to ``genes``.
        withdrawn: Selects which marker list is parsed: ``MRK_List1.rpt``
            when true, ``MRK_List2.rpt`` otherwise.
            NOTE(review): presumably List1 includes withdrawn markers --
            confirm against the MGI report documentation.
        cleanup: If true, remove every file recorded in
            ``folder.downloads`` once processing is done.

    TODO:
    - Inspect and eventually use interaction file, else discard from this
      module.
    - Also check whether other information from MGI is worth integrating,
      such as homology or phenotypes.
    """
    os.chdir(path)
    genes.name = 'MGI'
    genes.key = 'mgi'
    folder = Folder(path)

    if interactions:
        ftp = FTP(
            url='ftp://ftp.informatics.jax.org/pub/protein-interaction-data/',
            path=path)
        ftp.download(path)

    if download:
        url = "ftp://ftp.informatics.jax.org/pub/reports/"
        files = [
            "MRK_List1.rpt", "MRK_List2.rpt", "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt", "MRK_SwissProt_TrEMBL.rpt", "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt", "MGI_EntrezGene.rpt"
        ]
        # Not fetched yet: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        for report_name in files:
            # Constructing the File automatically does the download.
            report = File(url=url + report_name)
            # The parsed rows are not needed here; the call is kept because
            # the original code always parsed each file right after fetching.
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()
        filename = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        data = folder[filename].parse(header=True, printing=False)
        genes.addData(data, key='mgi', taxid=10090)

        # Coordinate rows need their keys renamed before they can be added.
        data = folder["MGI_Coordinate.rpt"].parse(header=True, printing=False)
        for entry in data:
            entry = change_keys(entry)
            entry['taxid'] = 10090
            genes.add(entry)

        data = folder['MRK_Sequence.rpt'].parse(header=True, printing=False)
        genes.addData(data, key='mgi', taxid=10090)

        # Explicit column names are supplied for this report instead of
        # reading them from the file.
        header = ("mgi symbol status name cm_position chromosome type "
                  "secondary_accession_ids id synonyms feature_types start "
                  "stop strand biotypes").split()
        data = folder["MGI_EntrezGene.rpt"].parse(header=header,
                                                  printing=False)
        genes.addData(data, key="mgi", taxid=10090)
        # Function-call form prints the same on Python 2 and is valid on 3.
        print(len(genes))

    if cleanup:
        # ``ftp`` only exists when the interaction data was requested.
        if interactions:
            ftp.remove(confirm=False)
        for downloaded in folder.downloads:
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()
Example #4
0
def main(interactions=False, download=True, parse=True, withdrawn=True, cleanup=True):
    """Performs the download of interaction and annotation files from MGI.

    Builds a gene annotation file and mapping tables in the module-level
    ``genes`` collection.

    Args:
        interactions: If true, download the MGI protein-interaction-data
            FTP directory (and remove it again when ``cleanup`` is set).
        download: If true, fetch the MGI report files listed below and
            record their names in ``folder.downloads``.
        parse: If true, parse the report files found in ``path`` and add
            their rows to ``genes``.
        withdrawn: Selects which marker list is parsed: ``MRK_List1.rpt``
            when true, ``MRK_List2.rpt`` otherwise.
            NOTE(review): presumably List1 includes withdrawn markers --
            confirm against the MGI report documentation.
        cleanup: If true, remove every file recorded in
            ``folder.downloads`` once processing is done.

    TODO:
    - Inspect and eventually use interaction file, else discard from this
      module.
    - Also check whether other information from MGI is worth integrating,
      such as homology or phenotypes.
    """
    os.chdir(path)
    genes.name = "MGI"
    genes.key = "mgi"
    folder = Folder(path)

    if interactions:
        ftp = FTP(url="ftp://ftp.informatics.jax.org/pub/protein-interaction-data/", path=path)
        ftp.download(path)

    if download:
        url = "ftp://ftp.informatics.jax.org/pub/reports/"
        files = [
            "MRK_List1.rpt",
            "MRK_List2.rpt",
            "MGI_Coordinate.rpt",
            "MRK_Sequence.rpt",
            "MRK_SwissProt_TrEMBL.rpt",
            "MRK_VEGA.rpt",
            "MRK_ENSEMBL.rpt",
            "MGI_EntrezGene.rpt",
        ]
        # Not fetched yet: MPheno_OBO.ontology, VOC_MammalianPhenotype.rpt,
        # MGI_PhenotypicAllele.rpt, HMD_HumanPhenotype.rpt
        for report_name in files:
            # Constructing the File automatically does the download.
            report = File(url=url + report_name)
            # The parsed rows are not needed here; the call is kept because
            # the original code always parsed each file right after fetching.
            report.parse(header=True, printing=False)
            folder.downloads.append(report.name)

    if parse:
        folder.update()
        filename = "MRK_List1.rpt" if withdrawn else "MRK_List2.rpt"
        data = folder[filename].parse(header=True, printing=False)
        genes.addData(data, key="mgi", taxid=10090)

        # Coordinate rows need their keys renamed before they can be added.
        data = folder["MGI_Coordinate.rpt"].parse(header=True, printing=False)
        for entry in data:
            entry = change_keys(entry)
            entry["taxid"] = 10090
            genes.add(entry)

        data = folder["MRK_Sequence.rpt"].parse(header=True, printing=False)
        genes.addData(data, key="mgi", taxid=10090)

        # Explicit column names are supplied for this report instead of
        # reading them from the file.
        header = (
            "mgi symbol status name cm_position chromosome type "
            "secondary_accession_ids id synonyms feature_types start "
            "stop strand biotypes"
        ).split()
        data = folder["MGI_EntrezGene.rpt"].parse(header=header, printing=False)
        genes.addData(data, key="mgi", taxid=10090)
        # Function-call form prints the same on Python 2 and is valid on 3.
        print(len(genes))

    if cleanup:
        # ``ftp`` only exists when the interaction data was requested.
        if interactions:
            ftp.remove(confirm=False)
        for downloaded in folder.downloads:
            folder.remove(downloaded)

    genes.keep("category", "Gene")
    genes.remove("name", "withdrawn")
    genes.save()
    genes.buildMappings()