Esempio n. 1
0
def test_read_xml_dmi_1():
    """Check parsing of an Apache index page that lists only a sub-directory.

    The fixture contains a "Parent Directory" link and a single
    ``v20150224/`` folder link.  The parsed page is expected to expose no
    datasets and exactly one reference, and that reference must start with
    the base URL passed to ``spider.read_xml``.
    """
    base_url = "http://nowhere.org/cordex/"
    listing = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
  <head>
    <title>Index of /data/CORDEX/AFR-44/KNMI/MOHC-HadGEM2-ES/rcp45/r1i1p1/KNMI-RACMO22T/v1/mon/tasmax</title>
  </head>
  <body>
    <h1>Index of /data/CORDEX/AFR-44/KNMI/MOHC-HadGEM2-ES/rcp45/r1i1p1/KNMI-RACMO22T/v1/mon/tasmax</h1>
    <pre>
      <img src="/icons/blank.gif" alt="Icon "> 
      <a href="?C=N;O=D">Name</a>                                                                        
      <a href="?C=M;O=A">Last modified</a>      
      <a href="?C=S;O=A">Size</a>  
      <a href="?C=D;O=A">Description</a>
      <hr>
      <img src="/icons/back.gif" alt="[PARENTDIR]"> 
      <a href="/data/CORDEX/AFR-44/KNMI/MOHC-HadGEM2-ES/rcp45/r1i1p1/KNMI-RACMO22T/v1/mon/">Parent Directory</a>
      <img src="/icons/folder.gif" alt="[DIR]"> 
      <a href="v20150224/">v20150224/</a>  
      2015-02-24 14:10    -   
      <hr>
    </pre>
    <address>Apache/2.4.7 (Ubuntu) Server at ensemblesrt3.dmi.dk Port 80</address>
  </body>
</html>
    """
    parsed = spider.read_xml(listing, baseurl=base_url)

    # A directory-only listing: nothing to download, one link to follow.
    assert len(parsed.datasets) == 0
    assert len(parsed.references) == 1

    # The single reference must have been resolved against the base URL.
    only_reference = parsed.references[0]
    assert only_reference.startswith(base_url)
Esempio n. 2
0
def test_read_xml_dmi_2():
    """Check parsing of an Apache index page that lists two NetCDF files.

    The fixture contains a "Parent Directory" link and two ``.nc`` file
    links.  The parsed page is expected to expose two datasets and no
    references; every dataset URL must start with the base URL passed to
    ``spider.read_xml`` and contain ``tasmax``.  The first dataset's ID,
    name, timestamp, size and URLs are checked against fixed values.
    """
    base_url = "http://nowhere.org/cordex/"
    listing = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
  <head>
    <title>Index of /data/CORDEX/AFR-44/KNMI/MOHC-HadGEM2-ES/rcp45/r1i1p1/KNMI-RACMO22T/v1/mon/tasmax/v20150224</title>
  </head>
  <body>
    <h1>Index of /data/CORDEX/AFR-44/KNMI/MOHC-HadGEM2-ES/rcp45/r1i1p1/KNMI-RACMO22T/v1/mon/tasmax/v20150224</h1>
    <pre>
      <img src="/icons/blank.gif" alt="Icon "> <a href="?C=N;O=D">Name</a> <a href="?C=M;O=A">Last modified</a> <a href="?C=S;O=A">Size</a> <a href="?C=D;O=A">Description</a>
      <hr>
      <img src="/icons/back.gif" alt="[PARENTDIR]"> 
      <a href="/data/CORDEX/AFR-44/KNMI/MOHC-HadGEM2-ES/rcp45/r1i1p1/KNMI-RACMO22T/v1/mon/tasmax/">Parent Directory</a> 
      -  
      <img src="/icons/unknown.gif" alt="[   ]"> <a href="tasmax_AFR-44_MOHC-HadGEM2-ES_rcp45_r1i1p1_KNMI-RACMO22T_v1_mon_200601-201012.nc">tasmax_AFR-44_MOHC-HadGEM2-ES_rcp45_r1i1p1_KNMI-RACMO22T_v1_mon_200601-201012.nc</a> 2015-02-12 14:37  5.0M  
      <img src="/icons/unknown.gif" alt="[   ]"> <a href="tasmax_AFR-44_MOHC-HadGEM2-ES_rcp45_r1i1p1_KNMI-RACMO22T_v1_mon_201101-202012.nc">tasmax_AFR-44_MOHC-HadGEM2-ES_rcp45_r1i1p1_KNMI-RACMO22T_v1_mon_201101-202012.nc</a> 2015-02-12 14:37  9.9M
      <hr>
    </pre>
    <address>Apache/2.4.7 (Ubuntu) Server at ensemblesrt3.dmi.dk Port 80</address>
  </body>
</html>
    """
    parsed = spider.read_xml(listing, baseurl=base_url)

    # A file-only listing: two downloadable entries, nothing to follow.
    assert len(parsed.datasets) == 2
    assert len(parsed.references) == 0

    # Every dataset URL is resolved against the base URL.
    for entry in parsed.datasets:
        assert entry.url.startswith(base_url)
        assert "tasmax" in entry.url

    # Detailed field checks on the first listed file.
    expected_name = "tasmax_AFR-44_MOHC-HadGEM2-ES_rcp45_r1i1p1_KNMI-RACMO22T_v1_mon_200601-201012.nc"
    expected_url = "http://nowhere.org/cordex/tasmax_AFR-44_MOHC-HadGEM2-ES_rcp45_r1i1p1_KNMI-RACMO22T_v1_mon_200601-201012.nc"
    first = parsed.datasets[0]
    assert first.ID == expected_name
    assert first.name == expected_name
    assert first.last_modified == "2015-02-12T14:37:00Z"
    assert first.size == "5.0M"
    assert first.url == expected_url
    assert first.download_url == expected_url