def test_get_response_bad(self, mock_requests):
    # Simulate a 404: get_response should retry requests.get three
    # times and finally signal the error via raise_for_status().
    mock_response = mock.MagicMock()
    mock_response.status_code = 404
    mock_requests.codes.ok = 200
    mock_requests.get.return_value = mock_response
    get_response('foo', 'bar')
    self.assertEqual(mock_requests.get.call_count, 3)
    mock_response.raise_for_status.assert_called_once_with()
def test_get_response_ok(self, mock_requests):
    # Simulate a 200: get_response should return the response after a
    # single requests.get call with the expected arguments.
    mock_response = mock.MagicMock()
    mock_response.status_code = 200
    mock_requests.codes.ok = 200
    mock_requests.get.return_value = mock_response
    r = get_response('foo', 'bar')
    self.assertEqual(r, mock_response)
    mock_requests.get.assert_called_once_with('foo', stream='bar', timeout=30)
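# The two tests above pin down the observable behavior of get_response:
# on a bad status it retries requests.get three times and calls
# raise_for_status(); on a good status it returns the response from a
# single call, passing stream and timeout=30 through. A minimal sketch
# consistent with those tests (the signature, retry count, and default
# timeout are assumptions inferred from the tests, not the project's
# actual implementation):
import requests

def get_response(url, stream, retries=3, timeout=30):
    """Request url, retrying on a bad status code (sketch)."""
    for attempt in range(retries):
        response = requests.get(url, stream=stream, timeout=timeout)
        if response.status_code == requests.codes.ok:
            return response
    # After the last failed attempt, raise the HTTP error.
    response.raise_for_status()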
def run():
    for prov_code in setup.valid_provinces:
        url = setup.prov_url['BU'].format(code=prov_code)
        response = download.get_response(url)
        root = etree.fromstring(response.content)
        for entry in root.findall("atom:entry[atom:title]", namespaces=ns):
            title = entry.find('atom:title', ns).text
            zip_code = title[1:6]
            mun = title.replace('buildings', '').strip()[6:]
            url = u"{0}{1}/{2}-{3}/A.ES.SDGC.BU.{2}.zip".format(
                baseurl, prov_code, zip_code, mun)
            download.wget(url, 'temp')
            zf = zipfile.ZipFile('temp')
            root = etree.parse(
                zf.open('A.ES.SDGC.BU.MD.{}.xml'.format(zip_code))).getroot()
            # Read the municipality bounding box from the Cadastre
            # metadata file inside the downloaded ZIP.
            gml_bbox = root.find('.//gmd:EX_GeographicBoundingBox', ns)
            gml_bbox_l = gml_bbox.find('gmd:westBoundLongitude/gco:Decimal', ns)
            gml_bbox_r = gml_bbox.find('gmd:eastBoundLongitude/gco:Decimal', ns)
            gml_bbox_b = gml_bbox.find('gmd:southBoundLatitude/gco:Decimal', ns)
            gml_bbox_t = gml_bbox.find('gmd:northBoundLatitude/gco:Decimal', ns)
            bbox = ','.join([
                gml_bbox_b.text, gml_bbox_l.text,
                gml_bbox_t.text, gml_bbox_r.text
            ])
            # Query Overpass for admin_level=8 boundaries inside the
            # bbox and fuzzy match the municipality name against them.
            query = overpass.Query(bbox, 'json', False, False)
            query.add('rel["admin_level"="8"]')
            response = download.get_response(query.get_url())
            sys.stdout.write(' ' * 70 + '\r')
            data = response.json()
            matching = hgwnames.dsmatch(mun, data['elements'],
                                        lambda e: e['tags']['name'])
            match = matching['tags']['name'] if matching else ''
            ok = mun == match.upper().translate(trans)
            color = {False: '\033[0;31m', True: '\033[0m'}[ok]
            print u'{}{}\t{}\t{}\t{}'.format(color, zip_code, mun, match, ok)
            fh.write(u'{}\t{}\t{}\t{}\n'.format(zip_code, mun, match, ok))
    print '\033[0m'
    if os.path.exists('temp'):
        os.remove('temp')
def get_metadata(self, md_path):
    if os.path.exists(md_path):
        with open(md_path, 'r') as fi:
            self.src_date = fi.read()
    else:
        response = download.get_response(meta_url)
        s = re.search('fecha de referencia.*([0-9]{1,2} de .+ de [0-9]{4})',
                      response.text)
        try:
            # s is None if the pattern is missing; strptime fails if
            # the date is malformed. Both mean unreadable metadata.
            self.src_date = datetime.strptime(
                s.group(1), '%d de %B de %Y').strftime('%Y-%m-%d')
        except (AttributeError, ValueError):
            raise IOError(_("Could not read metadata from '%s'") % 'CDAU')
        with open(md_path, 'w') as fo:
            fo.write(self.src_date)
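# Note that strptime's '%B' matches month names in the current locale,
# so dates like '1 de enero de 2017' only parse while a Spanish locale
# is active. A hedged sketch of the required setup (the exact locale
# name is an assumption and depends on the host system):
import locale

# Assumption: an es_ES locale is installed on the host.
locale.setlocale(locale.LC_TIME, 'es_ES.UTF-8')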
def list_municipalities(prov_code):
    """Get from the ATOM services a list of municipalities for a given
    province."""
    if prov_code not in setup.valid_provinces:
        raise ValueError(_("Province code '%s' not valid") % prov_code)
    url = setup.prov_url['BU'].format(code=prov_code)
    response = download.get_response(url)
    root = etree.fromstring(response.content)
    ns = {'atom': 'http://www.w3.org/2005/Atom'}
    office = root.find('atom:title', ns).text.split('Office ')[1]
    title = _("Territorial office %s") % office
    print title
    print "=" * len(title)
    for entry in root.findall('atom:entry', namespaces=ns):
        row = entry.find('atom:title', ns).text.replace('buildings', '')
        print row.encode(setup.encoding)
def get_atom_file(self, url):
    """
    Given the url of a Cadastre ATOM service, tries to download the ZIP
    file for self.zip_code
    """
    s = re.search(r'INSPIRE/(\w+)/', url)
    log.debug(_("Searching the url for the '%s' layer of '%s'..."),
              s.group(1), self.zip_code)
    response = download.get_response(url)
    s = re.search('http.+/%s.+zip' % self.zip_code, response.text)
    if not s:
        raise ValueError(_("Zip code '%s' does not exist") % self.zip_code)
    url = s.group(0)
    filename = url.split('/')[-1]
    out_path = os.path.join(self.path, filename)
    log.info(_("Downloading '%s'"), out_path)
    download.wget(url, out_path)
def run():
    qgs = catatom2osm.QgsSingleton()
    for prov_code in setup.valid_provinces:
        url = setup.prov_url['BU'].format(code=prov_code)
        response = download.get_response(url)
        root = etree.fromstring(response.content)
        for entry in root.findall("atom:entry[atom:title]", namespaces=ns):
            title = entry.find('atom:title', ns).text
            zip_code = title[1:6]
            mun = title.replace('buildings', '').strip()[6:]
            url = u"{0}{1}/{2}-{3}/A.ES.SDGC.BU.{2}.zip".format(
                baseurl, prov_code, zip_code, mun)
            gml_fn = ".".join((setup.fn_prefix, 'BU', zip_code, 'building.gml'))
            download.wget(url, 'temp.zip')
            # Read the GML straight from the archive through GDAL's
            # /vsizip/ virtual file system and count its features.
            gml = layer.BaseLayer('/vsizip/temp.zip/' + gml_fn, 'temp', 'ogr')
            sys.stdout.write(' ' * 70 + '\r')
            c = gml.featureCount()
            print zip_code, mun, c
            fh.write(u'{}\t{}\t{}\n'.format(zip_code, mun, c))
    if os.path.exists('temp.zip'):
        os.remove('temp.zip')
def parse_page(title, page_url):
    response = get_response(page_url)
    soup = BeautifulSoup(response.content, 'lxml')
    # Check whether this report is related to the school of
    # telecommunications. jieba.cut returns a generator, so materialize
    # it to scan it once per keyword set.
    title_tokens = list(jieba.cut(title))
    is_school_relative = is_relative(SCHOOL_NAMES, title_tokens)
    is_keyword_relative = is_relative(KEYWORDS, title_tokens)
    # Check whether the title and body contain the keywords.
    content = soup.find('div', {'id': 'lmz_content'}).text
    # If the title is not relevant, check the body text.
    if not is_school_relative or not is_keyword_relative:
        content_tokens = list(jieba.cut(content))
        is_school_relative = is_relative(SCHOOL_NAMES, content_tokens)
        is_keyword_relative = is_relative(KEYWORDS, content_tokens)
        # If still not relevant, skip this page.
        if not is_school_relative or not is_keyword_relative:
            return None
    # Images: findAll returns an empty list when nothing matches.
    imgs_url = []
    imgs = soup.find('div', {'id': 'lmz_content'}).findAll('img')
    for img in imgs:
        img_url = urljoin(response.url, img['src'])
        imgs_url.append(img_url)
    # Author
    author = ''
    author_div_list = soup.findAll('div', {'class': 'd_write2'})
    for div in author_div_list:
        author += div.text + '\n'
    print('---- author ---- : {}'.format(author))
    return content, imgs_url, author
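# parse_page relies on an is_relative helper that is not shown here. A
# hypothetical sketch matching how it is called above, i.e. reporting
# whether any segmented token appears in a collection of names or
# keywords (the real implementation may differ):
def is_relative(names, tokens):
    # True if any token from jieba's segmentation is one of the names.
    return any(token in names for token in tokens)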
def read(self):
    """Returns query result"""
    response = download.get_response(self.get_url())
    return response.text.encode(response.apparent_encoding)
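# For reference, a minimal usage sketch of the Query/read pair, reusing
# the constructor arguments seen in run() above; the bbox value below
# ('south,west,north,east') is an assumption for illustration:
query = overpass.Query('40.0,-4.0,41.0,-3.0', 'json', False, False)
query.add('rel["admin_level"="8"]')
print query.read()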
print('------ home_url: {}'.format(home_url))
# Each category uses its own url manager.
url_manager = UrlManager()
# Seed it with the category home page.
url_manager.add_url(home_url)
# Crawl until there are no more urls to visit.
while not url_manager.is_empty():
    cur_url = url_manager.get_url()
    print('cur_url: {}'.format(cur_url))
    if url_manager.is_viewed(cur_url):
        continue
    # Fetch the current news list (an index page).
    response = get_response(cur_url)
    # Parse it; content_list_result holds the contents of every page
    # linked from the index page.
    content_list_result, next_url = parse_content_list(response, url_manager)
    print('next_url: {}'.format(next_url))
    if next_url is not None:
        url_manager.add_url(next_url)
    # Save the results.
    save_2_file(content_list_result)
    # Mark the current url as visited.
    url_manager.add_viewed(cur_url)
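# The crawl loop above relies on five UrlManager methods. A minimal
# sketch of a class satisfying that interface, assuming a FIFO queue
# plus a visited set (the real implementation may differ):
from collections import deque

class UrlManager(object):
    """FIFO url queue with a visited set (sketch)."""

    def __init__(self):
        self.queue = deque()
        self.viewed = set()

    def add_url(self, url):
        # Queue a url unless it was already visited.
        if url not in self.viewed:
            self.queue.append(url)

    def is_empty(self):
        return len(self.queue) == 0

    def get_url(self):
        return self.queue.popleft()

    def is_viewed(self, url):
        return url in self.viewed

    def add_viewed(self, url):
        self.viewed.add(url)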