def test_gp_quadratic_kernel(self):
    """Check GP predictions made with a quadratic kernel.

    Trains a ``GaussianProcess`` on the data returned by ``get_data()`` with
    hyperparameter optimization and data scaling switched on, then asserts
    that one prediction is produced per test fingerprint and prints the
    average validation RMSE.
    """
    train_x, train_y, test_x, test_y = get_data()

    # Number of bound pairs: second dimension of the training features + 1.
    n_bounds = np.shape(train_x)[1] + 1
    quadratic = {
        'type': 'quadratic',
        'slope': 1.,
        'degree': 1.,
        'scaling': 1.,
        'bounds': ((1e-5, None), ) * n_bounds,
        'scaling_bounds': ((0., None), ),
    }

    model = GaussianProcess(
        train_fp=train_x,
        train_target=train_y,
        kernel_dict={'k1': quadratic},
        regularization=1e-3,
        optimize_hyperparameters=True,
        scale_data=True)
    result = model.predict(
        test_fp=test_x,
        test_target=test_y,
        get_validation_error=True,
        get_training_error=True)

    self.assertEqual(len(result['prediction']), len(test_x))
    print('quadratic prediction:',
          result['validation_error']['rmse_average'])
def fetch_stores(data):
    """Request ``data['url']`` through ``cm.get_data`` with the ``m`` parameter.

    :param data: mapping providing ``'url'`` and ``'m'``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url']
    try:
        body = cm.get_data(url, {'m': data['m']})
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()
def fetch_cities(data): url = data['home_url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt') dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] pat = re.compile(ur"if \(currlan == 'ZHT'\)\s*\{.+?\}", re.S) body = re.sub(pat, '', body) m = re.search(ur'dsy.add\("0",\[(.+?)\]', body) if m is None: cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt') return [] province_list = [m1 for m1 in re.findall(ur'"(.+?)"', m.group(1))] city_list = [] for m in re.findall(ur'dsy.add\("0_(\d+)",\[(.+?)\]', body): for m1 in re.findall(ur'"(.+?)"', m[1]): c = data.copy() c['province'] = province_list[string.atoi(m[0])] c['city'] = m1 city_list.append(c)
def test_pareto(self):
    """Smoke-test ``GeneticAlgorithm`` with ``fitness_parameters=2``.

    Runs the same scenario for ``minimize_error_descriptors`` and
    ``minimize_error_time``: build a population of 10 over the first 20
    feature columns, search for 50 steps, and check that population and
    fitness keep 10 entries.
    """
    train_features, train_targets, _, _ = get_data()
    train_features = train_features[:, :20]

    for fit_func in (minimize_error_descriptors, minimize_error_time):
        ga = GeneticAlgorithm(population_size=10,
                              fit_func=fit_func,
                              features=train_features,
                              targets=train_targets,
                              population=None,
                              fitness_parameters=2)
        self.assertEqual(np.shape(ga.population), (10, 20))

        ga.search(50)
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(len(ga.population), 10)
        self.assertEqual(len(ga.fitness), 10)
def fetch_continents(data): url = data['store_url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] start = html.find(u'<select id="continent" name="continent"') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select') continent_list = [] for m in re.findall(ur'<option value="(.+?)">.+?</option>', sub): d = data.copy() d['continent'] = m continent_list.append(d)
def fetch_stores(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?' ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S): entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) lat, lng = map(string.atof, [m[1], m[2]]) cm.update_entry(entry, {cm.lat: lat, cm.lng: lng}) sub = m[0].strip() m1 = re.search(ur'<b>(.+?)</b>', sub) if m1 is None: continue entry[cm.name_c] = m1.group(1) sub = sub.replace(m1.group(0), '') m1=re.search(ur'聯系電話(?::|:)(.+?)<', sub) if m1 is not None: entry[cm.tel]=m1.group(1) sub=sub.replace(m1.group(0), '<') sub = re.sub(ur'<img\b.*?/>', '', sub) entry[cm.addr_c] = cm.reformat_addr(sub) print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]) store_list.append(entry) db.insert_record(entry, 'stores')
def fetch_continents(data):
    """Request ``data['url']`` through ``cm.get_data``.

    :param data: mapping providing ``'url'``.
    :return: ``[]`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching continents: %s' % url, log_name)
        return []
def fetch_contact_info(data, s, store_id):
    """Request the ``<shop_url>/<store_id>/detail`` page through ``cm.get_data``.

    :param data: mapping providing ``'shop_url'``.
    :param s: not read by the visible code.
    :param store_id: identifier interpolated into the detail URL.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = "%s/%s/detail" % (data["shop_url"], store_id)
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump("Error in fetching stores: %s" % url, log_name)
        return ()
def get_store_list(data): """ 返回店铺列表,其中店铺包含国家信息。 :rtype : [{'name':'store name', 'url':'http://...', 'city':'NEW YORK', 'country:':'AUSTRALIA'}, ...] :param data: """ url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<ul>\s+?<h3 class="country-name">(.+?)</h3>', html, re.S): sub, start, end = cm.extract_closure(html[m.start():], ur'<ul>', ur'</ul>') if end == 0: continue # 得到不同国家的分割 splits = [[m1.start(), m1.group(1)] for m1 in re.finditer(ur'<h3 class="country-name">(.+?)</h3>', sub)] splits.append([-1, '']) for i in xrange(len(splits) - 1): # 在同一个国家下寻找 sub1 = sub[splits[i][0]:splits[i + 1][0]] country = splits[i][1].upper() for m1 in re.findall(ur'<li>\s*?<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+?)">' ur'(.+?)</a>,(.+?)</li>', sub1): store_list.append({'name': m1[1].strip(), 'url': m1[0], 'city': m1[2].strip().upper(), 'country': country})
def fetch_store_list(data):
    """Request ``data['url']`` through ``cm.get_data``.

    :param data: mapping providing ``'url'``.
    :return: ``[]`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        # The message used to say "countries", which misattributed failures
        # of this function in the log.
        cm.dump('Error in fetching store list: %s' % url, log_name)
        return []
def fetch_countries(data):
    """Return one record per country linked under "Choose a country".

    An "Object moved" page is followed by storing the new location in
    ``data['url']`` (the caller's mapping is modified) and starting over.

    :param data: mapping providing ``'url'``, ``'host'`` and ``'brand_id'``.
    :return: list of copies of ``data`` with ``'country_e'``,
        ``'province_e'`` (empty) and ``'url'`` set; ``[]`` when the fetch
        raises or the country list cannot be located.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print('Error occured: %s' % url)
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # Handle the redirect page.
    moved = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if moved is not None:
        data['url'] = data['host'] + moved.group(1)
        return fetch_countries(data)

    heading = re.search('<span class="country">Choose a country</span>', html)
    if heading is None:
        return []
    sub, start, end = cm.extract_closure(html[heading.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for href, label in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        record = data.copy()
        name = cm.html2plain(label).strip().upper()
        # Use the name from gs.look_up when the lookup returns a result.
        known = gs.look_up(name, 1)
        if known is not None:
            name = known['name_e']
        record['country_e'] = name
        record['province_e'] = ''
        record['url'] = data['host'] + href
        country_list.append(record)
    return country_list
def tv(identifier):
    """Redirect (302) to the link obtained for a television channel.

    The channel is looked up in ``television.channels_by_id`` under the
    UTF-8 decoded string form of ``identifier``.  When the data fetched for
    its link has an empty ``cmd``, a fixed fallback address is used instead.
    """
    channel_key = str(identifier).decode('UTF-8')
    channel = television.channels_by_id[channel_key]
    response = get_data(television.create_link(channel[u'cmd']))
    target = response[u'cmd'] or 'http://149.13.0.80/nrj128.m3u'
    return redirect(target, code=302)
def fetch_store_details(data):
    """Request ``data["url"]`` through ``cm.get_data`` with an empty
    ``X-Requested-With`` header.

    :param data: mapping providing ``"url"``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data["url"]
    try:
        body = cm.get_data(url, hdr={"X-Requested-With": ""})
    except Exception:
        cm.dump("Error in fetching store details: %s" % url, log_name)
        return ()
def fetch_details(data, detail_url, entry):
    """Request ``detail_url`` through ``cm.get_data`` for a copy of ``entry``.

    :param data: not read by the visible code.
    :param detail_url: address to request.
    :param entry: mapping that is copied first, so the caller's object is
        not rebound by this function.
    :return: the unmodified copy of ``entry`` when the request raises.
        NOTE(review): on success the visible code falls off the end
        (returns ``None``) without using ``body`` -- the parsing step
        appears to be missing here; confirm.
    """
    entry = entry.copy()
    try:
        body = cm.get_data(detail_url)
    except Exception:
        # The message used to say "countries", which misattributed failures
        # of this function in the log.
        cm.dump('Error in fetching details: %s' % detail_url, log_name)
        return entry
def fetch_stores_au(data):
    """Request the fixed hushpuppies.com.au stockist search URL.

    :param data: not read by the visible code.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = 'http://www.hushpuppies.com.au/stockists/index/search/'
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()
def fetch_cities(data):
    """Fetch the JSON ``places`` entry for ``data['country_id']``.

    ``data['url']`` is a %-format template that receives the country id.

    :param data: mapping providing ``'url'`` and ``'country_id'``.
    :return: ``()`` when fetching, JSON decoding or the ``'places'`` lookup
        raises (the failure is reported via ``cm.dump``).  NOTE(review): on
        success the visible code falls off the end (returns ``None``)
        without using ``raw`` -- the remaining steps appear to be missing
        here; confirm.
    """
    url = data['url'] % data['country_id']
    try:
        raw = json.loads(cm.get_data(url))['places']
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return ()
def fetch_store_details(data):
    """Request ``data['url']`` through ``cm.get_data``.

    :param data: mapping providing ``'url'``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return ()
def fetch_continents(data):
    """Fetch the JSON ``places`` entry for id ``0``.

    ``data['url']`` is a %-format template; it is filled with ``0``.

    :param data: mapping providing ``'url'``.
    :return: ``()`` when fetching, JSON decoding or the ``'places'`` lookup
        raises (the failure is reported via ``cm.dump``).  NOTE(review): on
        success the visible code falls off the end (returns ``None``)
        without using ``raw`` -- the remaining steps appear to be missing
        here; confirm.
    """
    url = data['url'] % 0
    try:
        raw = json.loads(cm.get_data(url))['places']
    except Exception:
        cm.dump('Error in fetching continents: %s' % url, log_name)
        return ()
def fetch_cities(data):
    """Request ``data['host'] + data['url']`` through ``cm.get_data``.

    :param data: mapping providing ``'host'`` and ``'url'``.
    :return: ``[]`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['host'] + data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return []
def fetch_stores(data):
    """Fetch ``data['url']`` and decode the response as JSON.

    :param data: mapping providing ``'url'``.
    :return: ``()`` when fetching or JSON decoding raises (the failure is
        reported via ``cm.dump``).  NOTE(review): on success the visible
        code falls off the end (returns ``None``) without using ``raw`` --
        the remaining steps appear to be missing here; confirm.
    """
    url = data['url']
    try:
        raw = json.loads(cm.get_data(url))
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()
def fetch_countries(data): url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = html.find('<select name="country" id="inp-country"') if start == -1: return [] sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>') if end == 0: return [] country_list = [] for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub): d = data.copy() d['country_code'] = m[0] d[cm.country_c] = m[1].strip() for key in [cm.country_e, cm.continent_e, cm.continent_c]: d[key] = '' ret = gs.look_up(d['country_code'], 1) if ret is not None: d[cm.country_e] = ret['name_e'] d[cm.country_c] = ret['name_c'] d[cm.continent_c] = ret['continent']['name_c'] d[cm.continent_e] = ret['continent']['name_e'] country_list.append(d)
def fetch_cities(data): url = data['host'] + '/ajax/esiajaxProxy.asp' try: body = cm.get_data( url, { 'c': 'FF_StoreLocator2', 'm': 'getCountiesAjax', 'ws': 'ch-ch', 'pid': 178, 'cid': data['country_code'], 'CT': 0 }) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] results = [] for m in re.findall(ur'<li><a href="" data-value="(.+?)">', body): d = data.copy() d['city'] = m results.append(d)
def currency_update(param_dict):
    """
    Update the exchange-rate information of every currency.

    For each ``(iso_code, currency)`` row of ``region_info`` a CSV quote is
    requested and the row's ``rate`` and ``update_time`` are rewritten.  All
    updates run in one transaction, which is rolled back if anything raises.

    @param param_dict: not read by this function.
    """
    db = RoseVisionDb()
    db.conn(gs.DATABASE['DB_SPEC'])
    rows = db.query_match(['iso_code', 'currency'], 'region_info').fetch_row(maxrows=0)

    db.start_transaction()
    try:
        for code, currency in rows:
            print('Fetching for currency data for {0}...'.format(currency))
            quote_url = ('http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                         '&f=sl1d1t1ba&e=.json').format(currency)
            data = cm.get_data(url=quote_url)

            # Only the first CSV row is used: fields 1..3 are rate, date, time.
            line_data = list(csv.reader(StringIO(data['body'])))[0]
            quoted_at = datetime.datetime.strptime(
                '{0} {1}'.format(line_data[2], line_data[3]), '%m/%d/%Y %I:%M%p')

            values = {'rate': line_data[1], 'update_time': quoted_at.strftime('%Y-%m-%d %H:%M:%S')}
            db.update(values, 'region_info', 'iso_code="{0}"'.format(code))
        db.commit()
    except:
        # Undo the partial update, then let the caller see the failure.
        db.rollback()
        raise
def func(data, level):
    """
    Expand one node of the level hierarchy into its child nodes.

    :param data: mapping for the current node; ``'url'`` is read at level 1
        and ``'content'`` at every other level.
    :param level: 1..3 produce child nodes, 4 produces store details.
    :return: siblings -- a list of ``{'func': ..., 'data': ...}`` dicts;
        ``'func'`` is ``None`` for store nodes.
    """
    if level == 4:
        # Deepest level: get store details.
        return [{'func': None, 'data': store}
                for store in get_store_details(data['content'], data)]

    content = cm.get_data(data['url']) if level == 1 else data['content']
    entries = get_entries(content, pattern[level - 1])

    def siblings_data(name):
        # Each level adds one field to a copy of ``data`` and replaces the
        # 'content' field with the entry's own content.
        child = dict(data)
        if level == 1:
            child[cm.continent_e] = name
        elif level == 2:
            child[cm.country_e] = name
        elif level == 3:
            child[cm.city_e] = name
        child['content'] = entries[name]
        return child

    def descend(data):
        return func(data, level + 1)

    return [{'func': descend, 'data': siblings_data(name)} for name in entries]
def fetch_countries(data):
    """Request ``data['country_url']`` with ``display_country`` set to ``CN``.

    :param data: mapping providing ``'country_url'``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['country_url']
    try:
        body = cm.get_data(url, {'display_country': 'CN'})
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return ()
def run(cls, logger=None, **kwargs):
    """
    Update the exchange-rate information of every currency.

    For each currency in ``currency_info`` a CSV quote is requested and the
    row's ``rate`` and ``update_time`` are rewritten.  Currencies whose
    quote raises ``ValueError`` or ``IOError`` are skipped; any other
    exception propagates.

    @param logger: logger to report progress to; when omitted, the one
        returned by ``get_logger()`` is used.
    @param kwargs: accepted but not read by this function.
    """
    # ``logger`` is a named parameter, so it can never show up in ``kwargs``;
    # testing ``'logger' in kwargs`` discarded every logger passed in.
    if logger is None:
        logger = get_logger()

    logger.info('Update currency STARTED!!!!')
    with RoseVisionDb(getattr(gs, 'DATABASE')['DB_SPEC']) as db:
        for currency in db.query_match('currency', 'currency_info').fetch_row(maxrows=0):
            currency = currency[0]
            try:
                logger.debug(str.format('Fetching for currency data for {0}...', currency))
                data = cm.get_data(url=str.format('http://download.finance.yahoo.com/d/quotes.csv?s={0}CNY=X'
                                                  '&f=sl1d1t1ba&e=.json', currency))
                # First CSV row only: fields 1..3 are rate, date, time.
                rate, d, t = [val for val in csv.reader(StringIO(data['body']))][0][1:4]
                rate = float(rate)
                timestamp = datetime.strptime(' '.join((d, t)), '%m/%d/%Y %I:%M%p').strftime('%Y-%m-%d %H:%M:%S')
                db.update({'rate': rate, 'update_time': timestamp}, 'currency_info',
                          str.format('currency="{0}"', currency))
            except (ValueError, IOError):
                # Best effort: leave this currency untouched and move on.
                continue
    logger.info('Update currency ENDED!!!!')
def test_gp_linear_kernel(self):
    """Check GP predictions made with a linear plus constant kernel.

    Trains a ``GaussianProcess`` on the data returned by ``get_data()`` with
    hyperparameter optimization and data scaling switched on, then asserts
    that one prediction is produced per test fingerprint and prints the
    average validation RMSE.
    """
    train_x, train_y, test_x, test_y = get_data()

    kernels = {
        'k1': {
            'type': 'linear',
            'scaling': 1.,
            'scaling_bounds': ((0., None), ),
        },
        'c1': {
            'type': 'constant',
            'const': 1.,
        },
    }

    model = GaussianProcess(
        train_fp=train_x,
        train_target=train_y,
        kernel_dict=kernels,
        regularization=1e-3,
        optimize_hyperparameters=True,
        scale_data=True)
    result = model.predict(
        test_fp=test_x,
        test_target=test_y,
        get_validation_error=True,
        get_training_error=True)

    self.assertEqual(len(result['prediction']), len(test_x))
    print('linear prediction:',
          result['validation_error']['rmse_average'])
def _validate(**kwargs):
    """
    Check each feed type and keep valid results.

    Returns a list of ``(record_type, version, datasource, valid, errors,
    removed)`` tuples, one per record type whose data source is non-empty.
    A data source holding more than one version, or one rejected with
    ``UnexpectedVersionError``, is reported with empty ``valid``/``removed``
    lists and the error in ``errors``.
    """
    results = []
    version = kwargs["version"]

    for record_type in (mds.STATUS_CHANGES, mds.TRIPS):
        datasource = common.get_data(record_type, **kwargs)
        if len(datasource) == 0:
            continue

        seen_versions = {d["version"] for d in datasource}
        if len(seen_versions) > 1:
            # Mixed versions: report two of them and skip validation.
            expected = mds.Version(seen_versions.pop())
            unexpected = mds.Version(seen_versions.pop())
            error = mds.versions.UnexpectedVersionError(expected, unexpected)
            results.append((record_type, expected, datasource, [], [error], []))
            continue

        # NOTE(review): ``version`` is rebound here, so a version detected
        # for the first record type is reused for the next one -- confirm
        # that this is intended.
        version = mds.Version(version or seen_versions.pop())
        try:
            valid, errors, removed = validate(record_type, datasource, version)
        except mds.versions.UnexpectedVersionError as unexpected_version:
            results.append((record_type, version, datasource, [], [unexpected_version], []))
        else:
            results.append((record_type, version, datasource, valid, errors, removed))

    return results
def query():
    """Classify the documents listed in ``FLAGS.query_file`` and print the result.

    The model is chosen by ``FLAGS.model`` (``'perceptron'``, ``'mlp'`` or
    ``'rnn'``); any other value raises ``ValueError``.
    """
    tf.logging.set_verbosity(FLAGS.verbosity)

    classes = get_data(FLAGS.data_dir, classes_only=True)
    FLAGS.output_dim = len(classes)

    # One query per line of the file.
    queries = np.loadtxt(FLAGS.query_file, dtype=str, delimiter='\n')
    processed = process_vocabulary(
        None, queries, FLAGS, reuse=True,
        sequence_lengths=FLAGS.model == 'rnn')
    _, x_query, _, query_lengths, _, _ = processed

    models_by_name = {
        'perceptron': bag_of_words_perceptron_model,
        'mlp': bag_of_words_MLP_model,
        'rnn': rnn_model,
    }
    if FLAGS.model not in models_by_name:
        raise ValueError('unknown model')
    model = models_by_name[FLAGS.model]

    classifications = predict(x_query, query_lengths, model, FLAGS)
    for index, text in enumerate(queries):
        label = classes['class'][classifications[index]]
        print('The model classifies "{}" as a member of the class {}.'.format(
            text, label))
def fetch_stores(data): # <h2 property="dc:title" url = data[cm.url] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] store_list = [] for m in re.finditer(ur'<h2 property="dc:title"', html): end = html.find('</header>', m.start()) if end == -1: continue sub = html[m.start():end] m1 = re.search(ur'<a href="(.+?)">(.+?)</a></h2>', sub) if m1 is None: print 'Error: no more details for %s' % url continue d = data.copy() d[cm.url] = data['host'] + m1.group(1) d[cm.name_e] = cm.html2plain(m1.group(2)).strip() store_list.append(d)
def fetch_cities(data): url = data['home_url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching cities: %s' % url, 'canali_log.txt') dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']} cm.dump(dump_data) return [] start = body.find(u'<nav class="countrySelector">') if start == -1: cm.dump('Error occured in fetching country list: %s' % url, 'canali_log.txt') body = cm.extract_closure(body[start:], ur'<nav\b', ur'</nav>')[0] results = [] for m in re.finditer(ur'<li><a href=".+?">(.+?)</a>', body): country = m.group(1).strip().upper() sub = cm.extract_closure(body[m.end():], ur'<ul\b', ur'</ul>')[0] for m1 in re.findall(ur'<li><a class=".+?" href="(.+?)">(.+?)</a></li>', sub): d = data.copy() d['country'] = country d['url'] = data['host'] + m1[0] d['city'] = m1[1].strip().upper() results.append(d)
def fetch_stores(data): url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching stores: %s' % url, log_name) return [] store_list = [] for m in re.finditer(ur'<item id="\d+">', body): sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0] entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c']) m1 = re.search(ur'<country>([^<>]+)</country>', sub) if m1 is not None: tmp = m1.group(1).split('/') for v in tmp: ret = gs.look_up(v.strip().upper(), 1) if ret is not None: entry[cm.country_e] = ret['name_e'] break m1 = re.search(ur'<city>([^<>]+)</city>', sub) if m1 is not None: val = cm.reformat_addr(m1.group(1)) if entry[cm.country_e] == 'UNITED STATES': tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(',')) if len(tmp_list) == 2: if re.search('[A-Z]{2}', tmp_list[1]): entry[cm.province_e] = tmp_list[1] entry[cm.city_e] = cm.extract_city(m1.group(1))[0] m1 = re.search(ur'<brands>([^<>]+)</brands>', sub) if m1 is not None: tmp = m1.group(1).split('/') brand_list = [] for v in tmp: if v.strip() != '': brand_list.append(v) entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list) m1 = re.search(ur'<name>([^<>]+)</name>', sub) if m1 is not None: entry[cm.name_e] = m1.group(1).strip() m1 = re.search(ur'<address>([^<>]+)</address>', sub) if m1 is not None: entry[cm.addr_e] = cm.reformat_addr(m1.group(1)) m1 = re.search(ur'<tel>([^<>]+)</tel>', sub) if m1 is not None: entry[cm.tel] = m1.group(1).strip() m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub) if m1 is not None: entry[cm.lat] = string.atof(m1.group(1)) entry[cm.lng] = string.atof(m1.group(2)) gs.field_sense(entry) ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e]) if ret[1] is not None: entry[cm.province_e] = ret[1] gs.field_sense(entry) cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'], entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e], 
entry[cm.continent_e]), log_name) db.insert_record(entry, 'stores') store_list.append(entry)
def fetch_states(data): global national_added url = data['url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching states: %s' % url, log_name) return [] national_added = False m = re.search(ur'Choose a (state|region|province)', body) if m is None: d = data.copy() d['state'] = '' return [d] body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0] results = [] for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body): d = data.copy() d['url'] = data['host'] + m[0] d['state'] = cm.html2plain(m[1]).strip().upper() results.append(d)
def test_load(self):
    """Check that models read back from disk reproduce the original errors.

    After ``self.make_test`` has run, the ``test-model`` file is read in
    both ``pkl`` and ``hdf5`` form; each loaded model must give the same
    ``rmse_all`` validation errors as ``self.original``.  Both files are
    removed at the end.
    """
    train_features, train_targets, test_features, test_targets = get_data()
    self.make_test(train_features, train_targets,
                   test_features, test_targets)

    for extension in ('pkl', 'hdf5'):
        loaded = io.read(filename='test-model', ext=extension)
        pred = loaded.predict(test_fp=test_features,
                              test_target=test_targets,
                              get_validation_error=True,
                              get_training_error=True)
        matches = np.allclose(
            pred['validation_error']['rmse_all'],
            self.original['validation_error']['rmse_all'])
        self.assertTrue(matches)

    os.remove('{}/test-model.pkl'.format(wkdir))
    os.remove('{}/test-model.hdf5'.format(wkdir))
def fetch_countries(data):
    """Request ``data['host'] + 'boutique'`` through ``cm.get_data``.

    :param data: mapping providing ``'host'``.
    :return: ``[]`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['host'] + 'boutique'
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
def fetch_store_details(url, data): """ 获得门店的详细信息(url下可能有多个门店) :rtype : [{}] :param url: :param data: """ try: html = cm.get_data(url) except Exception: print 'Error occured: %s / %s' % (str(data), url) dump_data = { 'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id } cm.dump(dump_data) return [] # 可能有多个门店,拆分 sub_html = [] for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>', html): start = m.start() + len(m.group()) end = html.find('</li>', start) sub_html.append(html[start:end])
def fetch_cities(data): url = data['home_url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt') dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] pat = re.compile(ur"if \(currlan == 'ZHT'\)\s*\{.+?\}", re.S) body = re.sub(pat, '', body) m = re.search(ur'dsy.add\("0",\[(.+?)\]', body) if m is None: cm.dump('Error in fetching geo info: %s' % url, 'samsonite_log.txt') return [] province_list = [m1 for m1 in re.findall(ur'"(.+?)"', m.group(1))] city_list = [] for m in re.findall(ur'dsy.add\("0_(\d+)",\[(.+?)\]', body): for m1 in re.findall(ur'"(.+?)"', m[1]): c = data.copy() c['province'] = province_list[string.atoi(m[0])] c['city'] = m1 city_list.append(c)
def get_continents(data): """ 返回洲列表 :rtype : [{'name':u'欧洲', 'url':'http://....'}, ...] :param data: :return: """ url = data['url'] try: html = cm.get_data(url) except Exception: print 'Error occured: %s' % url dump_data = { 'level': 1, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': brand_id } cm.dump(dump_data) return [] return [{ 'name': m[1], 'url': m[0] } for m in re.findall( ur'<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+)">(.+?)</a>', html)]
def fetch_stores(data):
    """Request ``data['url']`` through ``cm.get_data`` with ``data['cookie']``.

    :param data: mapping providing ``'url'`` and ``'cookie'``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url']
    try:
        body = cm.get_data(url, cookie=data['cookie'])
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()
def fetch(level=1, data=None, user='******', passwd=''):
    """Re-crawl the stores of one brand, starting from the root page.

    Connects the module-level ``db``, deletes the brand's existing rows from
    ``stores``, fetches the root page into ``data['html']`` and passes
    ``data`` to ``fetch_countries``.  The connection is closed on every exit
    path.

    :param level: not read by this function.
    :param data: root node description; defaults to the MIDO entry below.
        The mapping passed in is modified (``'html'`` is added).
    :param user: database user passed to ``connect_db``.
    :param passwd: database password passed to ``connect_db``.
    :return: the result of ``fetch_countries``; ``[]`` when the root page
        cannot be fetched.
    """
    global db

    # Walk from the root node, where level == 1.
    if data is None:
        data = {'url': 'http://www.mido.cn/zh/retailer_li/POS',
                'brand_id': 10260, 'brandname_e': u'MIDO', 'brandname_c': u'美度'}

    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', data['brand_id']))

        url = data['url']
        try:
            data['html'] = cm.get_data(url)
        except Exception:
            print('Error occured: %s' % url)
            dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
            cm.dump(dump_data)
            return []

        store_list = fetch_countries(data)
    finally:
        # Release the connection on the early return and on exceptions too.
        db.disconnect_db()
    return store_list
def fetch_countries(data):
    """Request ``data["url"]`` through ``cm.get_data``.

    :param data: mapping providing ``"url"``.
    :return: ``[]`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data["url"]
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump("Error in fetching countries: %s" % url, log_name)
        return []
def fetch_cities_beauty(data):
    """Request ``<city_url>/<country_code>`` through ``cm.get_data``.

    :param data: mapping providing ``'city_url'`` and ``'country_code'``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = '%s/%s' % (data['city_url'], data['country_code'])
    try:
        body = cm.get_data(url)
    except Exception:
        # The message used to say "countries", which misattributed failures
        # of this function in the log.
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return ()
def test_read_write(self):
    """Function to test reading and writing GA files.

    A first search writes its population to ``gaWrite.json``; a second
    ``GeneticAlgorithm`` seeded with the population read back from that
    file must start from the same population and keep 10 candidates after
    searching.
    """
    train_features, train_targets, _, _ = get_data()
    train_features = train_features[:, :20]

    ga1 = GeneticAlgorithm(population_size=10,
                           fit_func=minimize_error,
                           features=train_features,
                           targets=train_targets,
                           population=None)
    self.assertEqual(np.shape(ga1.population), (10, 20))

    ga1.search(2, writefile='gaWrite.json')
    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(len(ga1.population), 10)
    self.assertEqual(len(ga1.fitness), 10)

    old_pop, _ = read_data('gaWrite.json')

    ga2 = GeneticAlgorithm(population_size=10,
                           fit_func=minimize_error,
                           features=train_features,
                           targets=train_targets,
                           population=old_pop)
    self.assertTrue(np.allclose(ga2.population, ga1.population))

    ga2.search(50)
    self.assertEqual(len(ga2.population), 10)
    self.assertEqual(len(ga2.fitness), 10)
def fetch_countries(data):
    """Return one record per country linked under "Choose a country".

    An "Object moved" page is followed by storing the new location in
    ``data['url']`` (the caller's mapping is modified) and starting over.

    :param data: mapping providing ``'url'``, ``'host'`` and ``'brand_id'``.
    :return: list of copies of ``data`` with ``'country_e'``,
        ``'province_e'`` (empty) and ``'url'`` set; ``[]`` when the fetch
        raises or the country list cannot be located.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print('Error occured: %s' % url)
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # Handle the redirect page.
    moved = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if moved is not None:
        data['url'] = data['host'] + moved.group(1)
        return fetch_countries(data)

    heading = re.search('<span class="country">Choose a country</span>', html)
    if heading is None:
        return []
    sub, start, end = cm.extract_closure(html[heading.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for href, label in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        record = data.copy()
        name = cm.html2plain(label).strip().upper()
        # Use the name from gs.look_up when the lookup returns a result.
        known = gs.look_up(name, 1)
        if known is not None:
            name = known['name_e']
        record['country_e'] = name
        record['province_e'] = ''
        record['url'] = data['host'] + href
        country_list.append(record)
    return country_list
def get_frag_countries(url): # 获得国家代码 """ 获得国家的名字和代码 :rtype : [{'id':**, 'country':**}, ...] :param url: :return: """ try: html = common.get_data(url) except Exception: print 'Error occured: %s' % url_fragrance dump_data = {'level': 1, 'time': common.format_time(), 'data': {'url': url_fragrance}, 'brand_id': brand_id} common.dump(dump_data) return [], False start = html.find('<select name="country" id="id_country">') if start == -1: return [], False sub, s, e = common.extract_closure(html[start:], ur'<select\b', ur'</select>') if e == 0: return [], False return [{'id': string.atoi(m[0]), 'country': m[1].strip().upper()} for m in re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub)]
def parse_data2(self):
    """Collect the baidu.com headline links of this section into ``self.news``.

    For the '互联网' section the content is first re-fetched from the widget
    endpoint and items are ``div`` elements whose class contains "item"; for
    every other section the existing ``self.content`` is parsed and items
    are ``li.item`` elements.  The section name and collected news are
    logged at the end.
    """
    if self.name == '互联网':
        self.content = get_data(
            url='http://news.baidu.com/widget?',
            params={
                'id': 'AllOtherData',
                'channel': self.__dict[self.name],
                # Integer part of the current time in milliseconds.
                't': str(time.time() * 1000).split('.')[0],
            },
            plate=self.name)
        item_xpath = '//div[contains(@class,"item")]'
    else:
        item_xpath = '//li[@class="item"]'

    items = etree.HTML(self.content).xpath(item_xpath)
    for item in items:
        for anchor in item.xpath('h3//a'):
            href = anchor.xpath('@href')[0]
            title = anchor.xpath('text()')[0]
            if 'baidu.com' in href:
                self.news.append({'标题': title, '链接': href})

    logger.info(self.name + ':' + str(self.news))
    return
def fetch_cities(data): url = data['home_url'] try: body = cm.get_data(url) except Exception: cm.dump('Error in fetching cities: %s' % url, 'unode50_log.txt') dump_data = { 'level': 0, 'time': cm.format_time(), 'data': { 'url': url }, 'brand_id': data['brand_id'] } cm.dump(dump_data) return [] m = re.search(ur'countries\s*=\s*\{', body) if m is None: cm.dump('Error in fetching cities: %s' % url, 'unode50_log.txt') return [] body = cm.extract_closure(body[m.start():], ur'\{', ur'\}')[0] raw = json.loads(body) results = [] for key in raw: d = data.copy() d['country'] = raw[key]['name'].strip().upper() d['country_id'] = key results.append(d) return results
def fetch_countries(data):
    """Request ``data['url']`` through ``cm.get_data``.

    :param data: mapping providing ``'url'``.
    :return: ``[]`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
def get_frag_countries(url): # 获得国家代码 """ 获得国家的名字和代码 :rtype : [{'id':**, 'country':**}, ...] :param url: :return: """ try: html = common.get_data(url) except Exception: print 'Error occured: %s' % url_fragrance dump_data = { 'level': 1, 'time': common.format_time(), 'data': { 'url': url_fragrance }, 'brand_id': brand_id } common.dump(dump_data) return [], False start = html.find('<select name="country" id="id_country">') if start == -1: return [], False sub, s, e = common.extract_closure(html[start:], ur'<select\b', ur'</select>') if e == 0: return [], False return [{ 'id': string.atoi(m[0]), 'country': m[1].strip().upper() } for m in re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub)]
def fetch_countries_eu(data):
    """Request ``data['url_eu']`` through ``cm.get_data``.

    :param data: mapping providing ``'url_eu'``.
    :return: ``()`` when the request raises (the failure is reported via
        ``cm.dump``).  NOTE(review): on success the visible code falls off
        the end (returns ``None``) without using ``body`` -- the parsing
        step appears to be missing here; confirm.
    """
    url = data['url_eu']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching EU countries: %s' % url, log_name)
        return ()
def test_gp_addative_kernel(self):
    """Check GP predictions made with an additive kernel.

    The kernel combines a linear term on features 0-1, a gaussian term on
    features 2-3 and a constant.  After training with hyperparameter
    optimization and data scaling, one prediction must be produced per test
    fingerprint; the average validation RMSE is printed.
    """
    train_x, train_y, test_x, test_y = get_data()

    linear_part = {'type': 'linear', 'features': [0, 1], 'scaling': 1.}
    gaussian_part = {
        'type': 'gaussian',
        'features': [2, 3],
        'width': 1.,
        'scaling': 1.,
    }
    constant_part = {'type': 'constant', 'const': 1.}
    kernels = {'k1': linear_part, 'k2': gaussian_part, 'c1': constant_part}

    model = GaussianProcess(
        train_fp=train_x,
        train_target=train_y,
        kernel_dict=kernels,
        regularization=1e-3,
        optimize_hyperparameters=True,
        scale_data=True)
    result = model.predict(
        test_fp=test_x,
        test_target=test_y,
        get_validation_error=True,
        get_training_error=True)

    self.assertEqual(len(result['prediction']), len(test_x))
    print('addition prediction:',
          result['validation_error']['rmse_average'])
def fetch_stores(data):
    # Request the stores around one city from ``data['store_url']``, page by
    # page, and decode each reply as JSON.
    #
    # Reads 'store_url', 'type_key', 'country_id', 'city_lat' and 'city_lng'
    # from ``data``; increments the module-level ``tot_processed`` counter.
    # Returns [] as soon as a request or the JSON decoding raises.
    #
    # NOTE(review): the visible loop body never changes ``next_val``, never
    # uses ``raw`` and never fills or returns ``store_list`` -- the rest of
    # the loop appears to be missing here; confirm before relying on it.
    store_list = []
    global tot_processed
    tot_processed += 1
    cm.dump('Processint city #%d' % tot_processed, log_name)
    next_val = 0
    while True:
        # -1 is the stop marker for the paging value.
        if next_val == -1:
            break
        url = data['store_url']
        param = {'location_form[services]': data['type_key'], 'location_form[countries]': data['country_id'], 'startNextValues': next_val, 'location_form[latitude]': data['city_lat'], 'location_form[longitude]': data['city_lng']}
        try:
            body = cm.get_data(url, param)
        except Exception, e:
            cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            return []
        try:
            raw = json.loads(body)
        except Exception, e:
            cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            return []
def func(data, level):
    """
    Expand one node of the level hierarchy into its child nodes.

    :param data: mapping for the current node; ``'url'`` is read at level 1
        and ``'content'`` at every other level.
    :param level: 1..3 produce child nodes, 4 produces store details.
    :return: siblings -- a list of ``{'func': ..., 'data': ...}`` dicts;
        ``'func'`` is ``None`` for store nodes.
    """
    if level == 4:
        # Deepest level: get store details.
        return [{'func': None, 'data': store}
                for store in get_store_details(data['content'], data)]

    content = cm.get_data(data['url']) if level == 1 else data['content']
    entries = get_entries(content, pattern[level - 1])

    def siblings_data(name):
        # Each level adds one field to a copy of ``data`` and replaces the
        # 'content' field with the entry's own content.
        child = dict(data)
        if level == 1:
            child[cm.continent_e] = name
        elif level == 2:
            child[cm.country_e] = name
        elif level == 3:
            child[cm.city_e] = name
        child['content'] = entries[name]
        return child

    def descend(data):
        return func(data, level + 1)

    return [{'func': descend, 'data': siblings_data(name)} for name in entries]