Code Example #1
File: test_download.py  Project: egofer/CatAtom2Osm
 def test_get_response_bad(self, mock_requests):
     mock_response = mock.MagicMock()
     mock_response.status_code = 404
     mock_requests.codes.ok = 200
     mock_requests.get.return_value = mock_response
     get_response('foo', 'bar')
     self.assertEqual(mock_requests.get.call_count, 3)
     mock_response.raise_for_status.assert_called_once_with()
Code Example #2
 def test_get_response_ok(self, mock_requests):
     mock_response = mock.MagicMock()
     mock_response.status_code = 200
     mock_requests.codes.ok = 200
     mock_requests.get.return_value = mock_response
     r = get_response('foo', 'bar')
     self.assertEqual(r, mock_response)
     mock_requests.get.assert_called_once_with('foo', stream='bar', timeout=30)
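Read together, these two tests pin down the contract of get_response: a successful call returns the response object, while a failing one is retried (three GET calls in the 404 test) before raise_for_status is invoked. A minimal sketch consistent with the tests might look as follows; the retry count and the 30-second timeout are read off the assertions, not taken from the project's actual implementation:

import requests

def get_response(url, stream=False, attempts=3, timeout=30):
    # Sketch only: retry the GET until the status is OK, then return it
    for _ in range(attempts):
        response = requests.get(url, stream=stream, timeout=timeout)
        if response.status_code == requests.codes.ok:
            return response
    # All attempts failed: surface the HTTP error for the last response
    response.raise_for_status()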
Code Example #3
def run():
    for prov_code in setup.valid_provinces:
        url = setup.prov_url['BU'].format(code=prov_code)
        response = download.get_response(url)
        root = etree.fromstring(response.content)
        # Each ATOM entry is one municipality; its title embeds code and name
        for entry in root.findall("atom:entry[atom:title]", namespaces=ns):
            title = entry.find('atom:title', ns).text
            zip_code = title[1:6]
            mun = title.replace('buildings', '').strip()[6:]
            url = u"{0}{1}/{2}-{3}/A.ES.SDGC.BU.{2}.zip".format(
                baseurl, prov_code, zip_code, mun)
            download.wget(url, 'temp')
            zf = zipfile.ZipFile('temp')
            # Read the metadata XML shipped inside the zip
            root = etree.parse(
                zf.open('A.ES.SDGC.BU.MD.{}.xml'.format(zip_code))).getroot()
            gml_bbox = root.find('.//gmd:EX_GeographicBoundingBox', ns)
            gml_bbox_l = gml_bbox.find('gmd:westBoundLongitude/gco:Decimal',
                                       ns)
            gml_bbox_r = gml_bbox.find('gmd:eastBoundLongitude/gco:Decimal',
                                       ns)
            gml_bbox_b = gml_bbox.find('gmd:southBoundLatitude/gco:Decimal',
                                       ns)
            gml_bbox_t = gml_bbox.find('gmd:northBoundLatitude/gco:Decimal',
                                       ns)
            # Overpass expects the bbox as south,west,north,east
            bbox = ','.join([
                gml_bbox_b.text, gml_bbox_l.text, gml_bbox_t.text,
                gml_bbox_r.text
            ])
            # Query OSM for admin_level=8 relations (municipal boundaries)
            query = overpass.Query(bbox, 'json', False, False)
            query.add('rel["admin_level"="8"]')
            response = download.get_response(query.get_url())
            sys.stdout.write(' ' * 70 + '\r')  # clear the current console line
            data = response.json()
            # Fuzzy-match the cadastral name against the OSM relation names
            matching = hgwnames.dsmatch(mun, data['elements'],
                                        lambda e: e['tags']['name'])
            match = matching['tags']['name'] if matching else ''
            ok = mun == match.upper().translate(trans)
            color = {False: '\033[0;31m', True: '\033[0m'}[ok]
            print u'{}{}\t{}\t{}\t{}'.format(color, zip_code, mun, match, ok)
            fh.write(u'{}\t{}\t{}\t{}\n'.format(zip_code, mun, match, ok))
    print '\033[0m'
    if os.path.exists('temp'):
        os.remove('temp')
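hgwnames.dsmatch is not shown here; from the call site it takes a name, a list of elements, and a key function, and returns the best-matching element or a falsy value. A rough stand-in built on difflib, illustrative only and not the project's algorithm:

import difflib

def dsmatch_sketch(name, elements, key, cutoff=0.6):
    # Return the element whose key is closest to name, or None
    candidates = {key(e): e for e in elements}
    best = difflib.get_close_matches(name, list(candidates), n=1, cutoff=cutoff)
    return candidates[best[0]] if best else None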
Code Example #4
 def get_metadata(self, md_path):
     if os.path.exists(md_path):
         with open(md_path, 'r') as fo:
             self.src_date = fo.read()
     else:
         response = download.get_response(meta_url)
         s = re.search('fecha de referencia.*([0-9]{1,2} de .+ de [0-9]{4})', response.text)
         try:
             self.src_date = datetime.strptime(s.group(1), '%d de %B de %Y').strftime('%Y-%m-%d')
         except (AttributeError, ValueError):
             # s is None when the pattern is missing; strptime fails on a bad date
             raise IOError(_("Could not read metadata from '%s'") % 'CDAU')
         with open(md_path, 'w') as fo:
             fo.write(self.src_date)
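A caveat with this snippet: strptime's %B is locale-dependent, so parsing Spanish month names such as 'enero' only succeeds when a Spanish locale is active. A locale-independent alternative, assuming lowercase Spanish month names as they appear on the page, is a simple lookup table:

SPANISH_MONTHS = {
    'enero': 1, 'febrero': 2, 'marzo': 3, 'abril': 4, 'mayo': 5, 'junio': 6,
    'julio': 7, 'agosto': 8, 'septiembre': 9, 'octubre': 10,
    'noviembre': 11, 'diciembre': 12,
}

def parse_spanish_date(text):
    # Parse 'D de <month> de YYYY' without touching the process locale
    day, month_name, year = text.split(' de ')
    return '{:04d}-{:02d}-{:02d}'.format(
        int(year), SPANISH_MONTHS[month_name.lower()], int(day))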
Code Example #5
def list_municipalities(prov_code):
    """Get from the ATOM services a list of municipalities for a given province"""
    if prov_code not in setup.valid_provinces:
        raise ValueError(_("Province code '%s' not valid") % prov_code)
    url = setup.prov_url['BU'].format(code=prov_code)
    response = download.get_response(url)
    root = etree.fromstring(response.content)
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    office = root.find('atom:title', ns).text.split('Office ')[1]
    title = _("Territorial office %s") % office
    print title
    print "=" * len(title)
    for entry in root.findall('atom:entry', namespaces=ns):
        row = entry.find('atom:title', ns).text.replace('buildings', '')
        print row.encode(setup.encoding)
Code Example #6
 def get_atom_file(self, url):
     """
     Given the url of a Cadastre ATOM service, tries to download the ZIP
     file for self.zip_code
     """
     s = re.search(r'INSPIRE/(\w+)/', url)
     log.debug(_("Searching the url for the '%s' layer of '%s'..."),
               s.group(1), self.zip_code)
     response = download.get_response(url)
     s = re.search('http.+/%s.+zip' % self.zip_code, response.text)
     if not s:
          raise ValueError(_("Zip code '%s' doesn't exist") % self.zip_code)
     url = s.group(0)
     filename = url.split('/')[-1]
     out_path = os.path.join(self.path, filename)
     log.info(_("Downloading '%s'"), out_path)
     download.wget(url, out_path)
Code Example #7
def run():
    qgs = catatom2osm.QgsSingleton()  # initialize QGIS before creating layers
    for prov_code in setup.valid_provinces:
        url = setup.prov_url['BU'].format(code=prov_code)
        response = download.get_response(url)
        root = etree.fromstring(response.content)
        for entry in root.findall("atom:entry[atom:title]", namespaces=ns):
            title = entry.find('atom:title', ns).text
            zip_code = title[1:6]
            mun = title.replace('buildings', '').strip()[6:]
            url = u"{0}{1}/{2}-{3}/A.ES.SDGC.BU.{2}.zip".format(baseurl, prov_code, zip_code, mun)
            gml_fn = ".".join((setup.fn_prefix, 'BU', zip_code, 'building.gml'))
            download.wget(url, 'temp.zip')
        # Read the GML inside the zip through GDAL's /vsizip/ virtual filesystem
        gml = layer.BaseLayer('/vsizip/temp.zip/'+gml_fn, 'temp', 'ogr')
            sys.stdout.write(' '*70+'\r')
            c = gml.featureCount()
            print zip_code, mun, c
            fh.write(u'{}\t{}\t{}\n'.format(zip_code, mun, c))
    if os.path.exists('temp.zip'):
        os.remove('temp.zip')
Code Example #8
File: parse.py  Project: AotY/crawl_xjtu_news
def parse_page(title, page_url):
    response = get_response(page_url)
    soup = BeautifulSoup(response.content, 'lxml')

    # Check whether the report relates to the school (电信学院)
    title_tokens = list(jieba.cut(title))  # jieba.cut yields a generator; materialize it for the two checks below
    is_school_relative = is_relative(SCHOOL_NAMES, title_tokens)
    is_keyword_relative = is_relative(KEYWORDS, title_tokens)

    # Check whether the title and body contain the keywords
    content_div = soup.find('div', {'id': 'lmz_content'})
    content = content_div.text

    # If the title gives no match, check the body text
    if not is_school_relative or not is_keyword_relative:
        content_tokens = list(jieba.cut(content))  # materialize for reuse
        is_school_relative = is_relative(SCHOOL_NAMES, content_tokens)
        is_keyword_relative = is_relative(KEYWORDS, content_tokens)

    # If still no match, skip this page
    if not is_school_relative or not is_keyword_relative:
        return None

    # Collect image URLs, resolved against the page URL
    imgs_url = []
    imgs = content_div.findAll('img')  # findAll returns a (possibly empty) list
    for img in imgs:
        img_url = urljoin(response.url, img['src'])
        imgs_url.append(img_url)

    # Author: concatenate the text of the byline divs
    author = ''
    author_div_list = soup.findAll('div', {'class': 'd_write2'})
    for div in author_div_list:
        author += div.text + '\n'

    print('---- author ---- : {}'.format(author))

    return content, imgs_url, author
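is_relative itself is not part of the snippet; judging by the call sites it takes a collection of target terms and an iterable of jieba tokens and reports whether any token matches. A minimal sketch under that assumption:

def is_relative(terms, tokens):
    # True if any token appears among the target terms
    terms = set(terms)
    return any(token in terms for token in tokens)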
Code Example #9
 def read(self):
     """Returns query result"""
     response = download.get_response(self.get_url())
     return response.text.encode(response.apparent_encoding)
Code Example #10
    print('------ home_url: {}'.format(home_url))

    # One UrlManager per category
    url_manager = UrlManager()

    # Seed the manager with the category's home page
    url_manager.add_url(home_url)

    # Crawl until the queue is empty
    while not url_manager.is_empty():
        cur_url = url_manager.get_url()
        print('cur_url: {}'.format(cur_url))

        if url_manager.is_viewed(cur_url):
            continue

        # Fetch the news list for the current page (a listing page)
        response = get_response(cur_url)

        # Parse and save: content_list_result holds the content of every article page on this listing page
        content_list_result, next_url = parse_content_list(response, url_manager)

        print('next_url: {}'.format(next_url))
        if next_url is not None:
            url_manager.add_url(next_url)

        # Save
        save_2_file(content_list_result)

        # Mark the current URL as visited
        url_manager.add_viewed(cur_url)
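UrlManager is assumed rather than shown; the loop above only needs add_url, is_empty, get_url, is_viewed, and add_viewed. A minimal sketch that satisfies this interface with a FIFO queue and a visited set:

from collections import deque

class UrlManager(object):
    # Sketch of the interface used above: pending queue plus visited set
    def __init__(self):
        self._queue = deque()
        self._viewed = set()

    def add_url(self, url):
        if url not in self._viewed and url not in self._queue:
            self._queue.append(url)

    def is_empty(self):
        return not self._queue

    def get_url(self):
        return self._queue.popleft()

    def is_viewed(self, url):
        return url in self._viewed

    def add_viewed(self, url):
        self._viewed.add(url)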