class Stats(object):
    """
    This class is responsible for creating and updating a database entry (document in Elasticsearch DB)
    There are two usage options:
    1. without arguments - as a base class of TestStatsMixin - for saving test statistics
    2. with arguments - as a separate object to update an existing document
    """
    def __init__(self, *args, **kwargs):
        self._test_index = kwargs.get('test_index', None)
        self._test_id = kwargs.get('test_id', None)
        self._es_doc_type = "test_stats"
        self.es = ES()
        self._stats = {}
        if not self._test_id:
            super(Stats, self).__init__(*args, **kwargs)

    def create(self):
        self.es.create_doc(index=self._test_index,
                           doc_type=self._es_doc_type,
                           doc_id=self._test_id,
                           body=self._stats)

    def update(self, data):
        """
        Update document
        :param data: data dictionary
        """
        try:
            self.es.update_doc(index=self._test_index,
                               doc_type=self._es_doc_type,
                               doc_id=self._test_id,
                               body=data)
        except Exception as ex:
            logger.error('Failed to update test stats: test_id: %s, error: %s',
                         self._test_id, ex)
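# Hedged usage sketch (not from the original source) of the two modes the
# Stats docstring describes; the index/id values and the stats payload below
# are made up for illustration.
# Mode 2: standalone object that updates an existing document.
stats = Stats(test_index='performance_tests', test_id='20200101-abc123')   # hypothetical values
stats.update({'results': {'op_rate': 95000}})

# Mode 1: as a base class; a TestStatsMixin-style subclass fills self._stats, then calls create().
class TestStatsMixin(Stats):
    def init_stats(self):
        self._stats = {'setup_details': {}, 'results': {}}   # hypothetical payload
        self.create()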
def removeArticleTag(request):
    try:
        article_id = request.GET['article_id']
        label = request.GET['label']
        tag = request.GET['tag']
        #label = labelMap[label] ####
        print "removeArticleTag"
        articleDAO = ArticleDAO('articles_testN')
        flag, update = articleDAO.removeTag(article_id, tag)
        url = "http://localhost:9200/news_spider_db/articles_testN/" + str(
            article_id) + "/_update"
        print url
        es = ES()
        doc = {"doc": update}
        # print doc
        # update the document
        result = es.post(url, doc)
        #print "remove article result:", result
        message = "success"
        return HttpResponse(json.dumps(message),
                            content_type="application/json")
    except BaseException, e:
        logging.error(e)
        print e
        print traceback.print_exc()
        return HttpResponse(json.dumps("failed"),
                            content_type="application/json")
def __init__(self, sc, data_path, tmdb_key):
    self.sc = sc
    self.sqlContext = SQLContext(self.sc)
    self.data_path = data_path
    self.es = ES(self.sc, self.sqlContext)
    tmdb.API_KEY = tmdb_key
    self.tmdb_key = tmdb_key
def es(game, render, config, generations, sigma, seed, random_noise_size,
       classic_es, activation, gain, optimize, mutate, num_parents, no_videos,
       big_net, novelty):
    timestamp = datetime.datetime.now()
    optimize = opt_modes[optimize]
    activation = act_modes[activation]
    if config == "default":
        config = "configurations/default_atari_config.json"
        with open(config, 'r') as f:
            config = json.loads(f.read())
        game = game[0].capitalize() + game[1:]
        i = short_names.index(game)
        config['env_id'] = envs[i]
        config['env_short'] = short_names[i]
    else:
        with open(config, 'r') as f:
            config = json.loads(f.read())
    path = "save/{}-{}_{}".format(config["env_short"], str(timestamp.date()),
                                  str(timestamp.time()))
    txt = ("Log {}.log\n\nWith parameters: \ngame={} ({}) \nconfig={} \ngenerations={} "
           "\nsigma={} \nseed={}\nrandom_noise_size={} \nclassic_es={} \n(xavier) gain={} "
           "\nactivation={} \noptimize={}\nmutate={} parameters\nnum_parents={} workers"
           "\nbig_net={}\nnovelty={}\n").format(
               path, config['env_short'], config['env_id'], config, generations,
               sigma, seed, random_noise_size, classic_es, gain, activation,
               optimize, "all" if mutate == 1 else "1/{} of".format(mutate),
               "all" if num_parents == 1 else "1/{} of".format(num_parents),
               big_net, novelty)
    worker = ES(config, rand_num_table_size=random_noise_size, sigma=sigma,
                seed=seed, render=render, verbose=True, log_path=path,
                initial_text=txt, classic_es=classic_es, gain=gain,
                activation=activation, optimize=optimize, mutate=mutate,
                no_videos=no_videos, big_net=big_net, novelty=novelty,
                num_parents=num_parents)
    worker(generations)
    worker.save(path + '.es')
def __init__(self, *args, **kwargs):
    self._test_index = kwargs.get('test_index', None)
    self._test_id = kwargs.get('test_id', None)
    self._es_doc_type = "test_stats"
    self.es = ES()
    self._stats = {}
    if not self._test_id:
        super(Stats, self).__init__(*args, **kwargs)
def test_fetch_domain(monkeypatch):
    es = ES('ip', 'domain')
    monkeypatch.setattr(es.es, 'describe_elasticsearch_domain', fake_es_describe)
    config = es.fetch_config()
    assert isinstance(config, dict)
def test_extract_access_list(monkeypatch):
    es = ES('192.168.0.2/32', 'domain')
    data = {
        "DomainStatus": {
            "AccessPolicies":
                '{"Statement":[{"Condition":{"IpAddress":{"aws:SourceIp":["192.168.0.1/32"]}}}]}'
        }
    }
    config, ips = es.append_ip_acl(data)
    assert ips == ["192.168.0.1/32", "192.168.0.2/32"]
    assert isinstance(config, dict)
def handler(event, context):
    '''
    :param event:
    :param context:
    :return:
    '''
    event = json.loads(event['Records'][0]['Sns']['Message'])
    feed = event['feed']
    iocfp = IOCFeedParser(event)
    elasticsearch = ES(es_host=event['es_endpoint'])
    feed_data = iocfp.get_feed(feed)
    parsed_feed = iocfp.proccess_feed_data(feed_data, feed)
    iocfp.save_to_s3(feed_data, feed)
    elasticsearch.bulk_es_index_dataframe(es_index='iocs', df=parsed_feed)
def changeLabel(request):
    try:
        article_id = request.GET['article_id'].strip()
        label = request.GET['label']
        reverseMap = {"0": "1", "1": "0"}
        rLabel = reverseMap[label]
        logging.info("[changeLabel] article_id=" + article_id + " label=" +
                     label + " rLabel=" + rLabel)
        #label = labelMap[label]
        #reverseLabel = labelMap[reverseLabel]
        articleDAO = ArticleDAO('articles_testN')
        article = articleDAO.show_article(article_id)
        article.pop("_id")
        article.pop("id")
        article['article_label'] = int(rLabel)
        user = request.session.get('user', default=None)
        if user['role'] == "0":
            article['article_label_state'] = 2
            article['update_admin'] = user['username']
        elif user['role'] == "1":
            article['article_label_state'] = 1
            article['update_student'] = user['username']
        #article['article_label_state'] = 0
        result = articleDAO.update_article(article_id, article)
        url = "http://localhost:9200/news_spider_db/articles_testN/" + str(
            article_id) + "/_update"
        es = ES()
        doc = {"doc": {"article_label": int(rLabel)}}
        # update the document
        es.post(url, doc)
        logging.info("[changeLabel] result=" + str(result))
        if result:
            return HttpResponse(
                json.dumps('{"label":"' + str(rLabel) + '","article_id":"' +
                           str(article_id) + '"}'),
                content_type="application/json")
        else:
            return HttpResponse(json.dumps('{"label":"failed"}'),
                                content_type="application/json")
    except BaseException, e:
        logging.error(e)
        return HttpResponse(json.dumps('{"label":"failed"}'),
                            content_type="application/json")
def __init__(self, action_space):
    """Initialize a new agent."""
    self.action_space = action_space
    self.actions = []
    actions_vec = np.load("./saved_files/top1000_actions.npz")["actions"]
    for i in range(actions_vec.shape[0]):
        act = action_space.from_vect(actions_vec[i])
        self.actions.append(act)
    self.actions = self.actions[:1000]
    self.act_num = len(self.actions)
    self.sub_ids = np.load('./saved_files/sub_id_info.npz')['sub_ids']
    self.do_nothing_action = action_space({})
    self.origin_ids = range(len(self.actions))
    offset = action_space.n_line
    self.action_to_sub_topo = {}
    for sub_id, sub_elem_num in enumerate(action_space.sub_info):
        self.action_to_sub_topo[sub_id] = (offset, offset + sub_elem_num)
        offset += sub_elem_num
    self.step = 0
    model = PowerNetModel()
    algorithm = ES(model)
    self.es_agent = ESAgent(algorithm)
    self.es_agent.restore(save_path='./saved_files', filename='model.ckpt')
    self.to_print_data = []
    self.last_disconnect_step = -100
    self.last_diconnect_line = None
    self.simulation_times = 0
def __init__(self, config):
    self.config = config
    env = gym.make(self.config['env_name'])
    self.config['obs_dim'] = env.observation_space.shape[0]
    self.config['act_dim'] = env.action_space.shape[0]
    self.obs_filter = MeanStdFilter(self.config['obs_dim'])
    self.noise = SharedNoiseTable(self.config['noise_size'])
    model = MujocoModel(self.config['act_dim'])
    algorithm = ES(model)
    self.agent = MujocoAgent(algorithm, self.config)
    self.latest_flat_weights = self.agent.get_flat_weights()
    self.latest_obs_filter = self.obs_filter.as_serializable()
    self.sample_total_episodes = 0
    self.sample_total_steps = 0
    self.actors_signal_input_queues = []
    self.actors_output_queues = []
    self.create_actors()
    self.eval_rewards_stat = WindowStat(self.config['report_window_size'])
    self.eval_lengths_stat = WindowStat(self.config['report_window_size'])
def __init__(self, es_index, es_doc_type, send_email=False,
             email_recipients=(), email_template_fp="", query_limit=1000,
             logger=None):
    self._es = ES()
    self._conf = self._es._conf
    self._es_index = es_index
    self._es_doc_type = es_doc_type
    self._limit = query_limit
    self._send_email = send_email
    self._email_recipients = email_recipients
    self._email_template_fp = email_template_fp
    self.log = logger if logger else log
def approval(request):
    ## review / approve the article label
    try:
        article_id = request.GET['article_id'].strip()
        label = request.GET['label']
        label = labelMap[label]
        articleDAO = ArticleDAO('articles_testN')
        article = articleDAO.show_article(article_id)
        article.pop("id")
        article.pop("_id")
        user = request.session.get('user', default=None)
        if user['role'] == "0":
            article['article_label_state'] = 2
            article['update_admin'] = user['username']
        elif user['role'] == "1":
            article['article_label_state'] = 1
            article['update_student'] = user['username']
        #article['article_label_state'] = 0
        result = articleDAO.update_article(article_id, article)
        url = "http://localhost:9200/news_spider_db/articles_testN/" + str(
            article_id) + "/_update"
        es = ES()
        doc = {
            "doc": {
                "article_label_state": article['article_label_state'],
                "update_admin": article['update_admin'],
                "update_student": article['update_student']
            }
        }
        # update the document
        es.post(url, doc)
        if result:
            return HttpResponse(json.dumps("success"),
                                content_type="application/json")
        else:
            return HttpResponse(json.dumps("failed"),
                                content_type="application/json")
    except BaseException, e:
        logging.error(e)
        return HttpResponse(json.dumps("failed"),
                            content_type="application/json")
def test_add_create_index_with_mapping():
    es = ES(None, 'datagouvfr_test-index-mapping', mapping)
    assert not elastic.indices.exists(index='datagouvfr_test-index-mapping-2020-01-01')
    es.add({"day": "2020-01-01"})
    es.commit()
    assert elastic.indices.exists(index='datagouvfr_test-index-mapping-2020-01-01')
    assert mapping['mappings'] == elastic.indices.get(
        index='datagouvfr_test-index-mapping-2020-01-01'
    )['datagouvfr_test-index-mapping-2020-01-01']['mappings']
def __init__(self, config):
    self.config = config
    self.env = gym.make(self.config['env_name'])
    self.config['obs_dim'] = self.env.observation_space.shape[0]
    self.config['act_dim'] = self.env.action_space.shape[0]
    self.obs_filter = MeanStdFilter(self.config['obs_dim'])
    self.noise = SharedNoiseTable(self.config['noise_size'])
    model = MujocoModel(self.config['act_dim'])
    algorithm = ES(model)
    self.agent = MujocoAgent(algorithm, self.config)
def get_results(results_path, update_db):
    bad_chars = " "
    os.chdir(os.path.join(results_path, "perf_fast_forward_output"))
    db = ES()
    results = {}
    for dirname in os.listdir(os.getcwd()):
        logger.info(dirname)
        for filename in os.listdir(dirname):
            new_filename = "".join(c for c in filename if c not in bad_chars)
            test_type = dirname + "_" + os.path.splitext(new_filename)[0]
            json_path = os.path.join(dirname, filename)
            with open(json_path, 'r') as f:
                logger.info("Reading: %s", json_path)
                datastore = json.load(f)
            datastore.update({'hostname': HOSTNAME})
            if update_db:
                db.create(index=ES_INDEX, doc_type=test_type,
                          doc_id=TEST_ID, body=datastore)
            results[test_type] = datastore
    return results
def search():
    # accepts json payload with "query" key and "files_and_folders"
    # query is mandatory, files_and_folders default value is 1
    es = ES()
    content = request.get_data()
    post_input = json.loads(content)
    try:
        query = post_input['query']
    except KeyError:
        return 'No search query!'
    try:
        files_and_folders = int(post_input['files_and_folders'])
    except (KeyError, ValueError):
        files_and_folders = 1
    if files_and_folders:
        output = es.query_all(query)
    else:
        output = es.query_files(query)
    return output
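# Hedged client-side sketch (not part of the original source) showing the JSON
# payload the search() endpoint above expects; the URL/port and the use of the
# requests library are assumptions.
import json
import requests

payload = {"query": "quarterly report", "files_and_folders": 0}  # 0 -> search files only
resp = requests.post("http://localhost:5000/search", data=json.dumps(payload))  # hypothetical URL
print(resp.text)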
class StdOutListener(StreamListener):
    counter = 0
    total_docs_to_be_indexed = 1000

    def __init__(self, *args, **kwargs):
        super(StdOutListener, self).__init__(*args, **kwargs)
        self.es = ES().getES()

    def on_data(self, data):
        print data
        while self.total_docs_to_be_indexed > self.counter:
            tweet = json.loads(data)
            self.index_tweet(tweet)
            self.counter += 1
            return True
        return False

    def index_tweet(self, tweet):
        self.es.index(index='twitter', doc_type='tweets',
                      id=tweet['id_str'], body=tweet)

    def on_error(self, status):
        print "the status is: " + str(status)
        pass
def addArticleTag(request):
    try:
        article_id = request.GET['article_id']
        label = request.GET['label']
        tag = request.GET['tag']
        #label = labelMap[label] ####
        articleDAO = ArticleDAO('articles_testN')
        flag, update = articleDAO.addTag(article_id, tag)
        message = "success"
        url = "http://localhost:9200/news_spider_db/articles_testN/" + str(
            article_id) + "/_update"
        es = ES()
        doc = {"doc": update}
        # update the document
        result = es.post(url, doc)
        print result
        return HttpResponse(json.dumps(message),
                            content_type="application/json")
    except BaseException, e:
        logging.error(e)
        return HttpResponse(json.dumps("failed"),
                            content_type="application/json")
def test_es_add():
    es = ES(None, 'datagouvfr_test-add', mapping)
    assert not elastic.indices.exists(index='datagouvfr_test-add-2020-01-01')
    es.add({"hello": "world", "day": "2020-01-01"})
    es.commit()
    assert elastic.indices.exists(index='datagouvfr_test-add-2020-01-01')
    time.sleep(1)
    data = elastic.search(index='datagouvfr_test-add-2020-01-01')
    assert len(data['hits']['hits']) == 1
def __init__(self, *args, **kwargs):
    super(StdOutListener, self).__init__(*args, **kwargs)
    self.es = ES().getES()
                  parsimony_pressure_w1=parsimonyW1,
                  parsimony_pressure_w2=parsimonyW2,
                  tourn_size=tournsize,
                  min_height=min_height,
                  max_height=max_height,
                  mut_min_height=mut_min_height,
                  mut_max_height=mut_max_height)

last_pop, log = gpsolver.solve(n_generations=n_generations)

best_fitness = log.chapters['fitness'].select('min')
avg_size = log.chapters['size'].select('avg')
np.savetxt('best_fits.txt', best_fitness)
np.savetxt('avg_size', avg_size)

######## saving best solution found #########
best = max(last_pop, key=operator.attrgetter("fitness"))
logging.info('best individual\'s fitness: ' + str(best.fitness))
draw(best, 'out/best_ind.pdf')

######### using evolution strategy ##########
def my_cost(constants):
    return gpsolver.simpleFitness(best, constants)[0]

logging.info('current constants ' + str(constants))
mes = ES(len(constants), my_cost)
mes.set_ans(constants)
mes.evolve(iterations=1000)
logging.info('best found constants ' + str(mes.ans))
def run_asebo(params):
    env = gym.make(params['env_name'])
    params['ob_dim'] = env.observation_space.shape[0]
    params['ac_dim'] = env.action_space.shape[0]
    m = 0
    v = 0
    params['k'] += -1
    params['alpha'] = 1
    params['zeros'] = False
    master = get_policy(params)
    if params['log']:
        params['num_sensings'] = 4 + int(3 * np.log(master.N))
    if params['k'] > master.N:
        params['k'] = master.N
    n_eps = 0
    n_iter = 1
    ts_cumulative = 0
    ts = []
    rollouts = []
    rewards = []
    samples = []
    alphas = []
    G = []
    while n_iter < params['max_iter']:
        params['n_iter'] = n_iter
        gradient, n_samples, timesteps = ES(params, master, G)
        ts_cumulative += timesteps
        ts.append(ts_cumulative)
        alphas.append(params['alpha'])
        if n_iter == 1:
            G = np.array(gradient)
        else:
            G *= params['decay']
            G = np.vstack([G, gradient])
        n_eps += 2 * n_samples
        rollouts.append(n_eps)
        gradient /= (np.linalg.norm(gradient) / master.N + 1e-8)
        update, m, v = Adam(gradient, m, v, params['learning_rate'], n_iter)
        master.update(update)
        test_policy = worker(params, master, np.zeros([1, master.N]), 0)
        reward = test_policy.rollout(train=False)
        rewards.append(reward)
        samples.append(n_samples)
        print('Iteration: %s, Rollouts: %s, Reward: %s, Alpha: %s, Samples: %s'
              % (n_iter, n_eps, reward, params['alpha'], n_samples))
        n_iter += 1
    out = pd.DataFrame({'Rollouts': rollouts, 'Reward': rewards,
                        'Samples': samples, 'Timesteps': ts, 'Alpha': alphas})
    out.to_csv('Seed%s.csv' % (params['seed']), index=False)
if "Lat" in record and "Long_" in record and len(record["Lat"]) > 0 and len(record["Long_"]) > 0: new["location_point"] = f"{record['Lat']},{record['Long_']}" for source, destination in MAP_FIELDS.items(): if source in record: if (len(record[source]) > 0): new[destination] = record[source] return new def read_csv(file): file_base = os.path.basename(file) print(f"reading file '{file}'") if match := re.match('^(\d\d?)-(\d\d?)-(\d{4})', file_base): index_fragment = f"{match.group(3)}-{match.group(1)}-{match.group(2)}" else: index_fragment = f"file={file_base}" es = ES(index_fragment) with open(file) as csvfile: reader = csv.DictReader(csvfile) for row in reader: timestamp = datetime.datetime.strptime(row['Last_Update'], '%Y-%m-%d %H:%M:%S') row['Last_Update'] = timestamp # es.post_record(convert_record(row)) es.bulk_add(convert_record(row)) es.bulk_write()
def __init__(self, action_space):
    BaseAgent.__init__(self, action_space=action_space)
    self.simulate_times = 0
    unitary_action_model = UnitaryActionModel()
    algorithm = ES(unitary_action_model)
    self.unitary_es_agent = UnitaryESAgent(algorithm)
    combined_actions_model_1 = CombinedActionsModel()
    combined_actions_model_2 = CombinedActionsModel()
    ensemble_algorithm = EnsembleES(combined_actions_model_1,
                                    combined_actions_model_2)
    self.combine_es_agent = CombineESAgent(ensemble_algorithm)
    self.unitary_es_agent.restore('./saved_files', 'unitary_action_model.ckpt')
    self.combine_es_agent.restore('./saved_files', 'combined_actions_model.ckpt')
    unitary_actions_vec = np.load(
        "./saved_files/v6_top500_unitary_actions.npz")["actions"]
    self.unitary_actions = []
    for i in range(unitary_actions_vec.shape[0]):
        action = action_space.from_vect(unitary_actions_vec[i])
        self.unitary_actions.append(action)
    redispatch_actions_vec = np.load(
        "./saved_files/redispatch_actions.npz")["actions"]
    self.redispatch_actions = []
    for i in range(redispatch_actions_vec.shape[0]):
        action = action_space.from_vect(redispatch_actions_vec[i])
        self.redispatch_actions.append(action)
    with open("./saved_files/action_to_sub_id.pickle", "rb") as f:
        self.action_to_sub_id = pickle.load(f)
    self.after_line56_or_line45_disconnect_actions = []
    self.three_sub_action_to_sub_ids = {}
    actions_vec = np.load(
        "./saved_files/v10_merge_three_sub_actions.npz")["actions"]
    for i in range(actions_vec.shape[0]):
        action = action_space.from_vect(actions_vec[i])
        self.after_line56_or_line45_disconnect_actions.append(action)
    with open("saved_files/three_sub_action_to_sub_ids.pickle", "rb") as f:
        self.three_sub_action_to_sub_ids = pickle.load(f)
    self.used_combine_actions = False
    self.redispatch_cnt = 0
    self.max_redispatch_cnt = 3
    self.serial_actions = []
    self.do_nothing_action = action_space({})
    self.action_space = action_space
    offset = 59
    self.action_to_sub_topo = {}
    for sub_id, sub_elem_num in enumerate(action_space.sub_info):
        self.action_to_sub_topo[sub_id] = (offset, offset + sub_elem_num)
        offset += sub_elem_num
    self.observation = None
    self.redispatch_months = set([3])
from es import ES
from config import *

es = ES().getES()

if es.indices.exists(index=index_name):
    script = {
        'script': 'ctx._source.category=\"Programming\"'
    }
    es.update(index=index_name, doc_type=doc_type, body=script,
              id='123', ignore=404)

    # script = {"script": "ctx._source.category+=tag",
    #           "params": {
    #               "tag": "Python"
    #           }}
    # es.update(index=index_name, doc_type=doc_type, body=script, id='1', ignore=404)
def main():
    ## create index
    es = ES()
    if FLAGS_FIRST_RUN:
        if es.check_existing_index(index_name=FLAGS_CORPUS_NAME,
                                   delete_existing=False):
            es.create_skipgram2eid_index(index_name=FLAGS_CORPUS_NAME,
                                         type_name="skipgram2eid")
            es.create_eid2skipgram_index(index_name=FLAGS_CORPUS_NAME,
                                         type_name="eid2skipgram")
            es.create_eid2eid_index(index_name=FLAGS_CORPUS_NAME,
                                    type_name="eid2eid")
        start = time.time()
        skipgram2id, skipgram2eidcounts, eid2skipgramcounts = util.load_skipgram2eidcounts(
            eidSkipgramFilePath)
        end = time.time()
        print("[INFO] Loading data using time %s (seconds)" % (end - start))
        start = time.time()
        eid2eid_w_strength = util.calculateEidSimilarity(skipgram2eidcounts)
        end = time.time()
        print("[INFO] Calculating eid-eid similarity using time %s (seconds)" %
              (end - start))
        es.index_skipgram2eid(index_name=FLAGS_CORPUS_NAME,
                              type_name="skipgram2eid",
                              skipgram2id=skipgram2id,
                              skipgram2eidcounts=skipgram2eidcounts)
        es.index_eid2skipgram(index_name=FLAGS_CORPUS_NAME,
                              type_name="eid2skipgram",
                              eid2skipgramcounts=eid2skipgramcounts)
        es.index_eid2eid(index_name=FLAGS_CORPUS_NAME,
                         type_name="eid2eid",
                         eid2eid_w_strength=eid2eid_w_strength)
        es.match_all(index_name=FLAGS_CORPUS_NAME, type_name="skipgram2eid")
        es.match_all(index_name=FLAGS_CORPUS_NAME, type_name="eid2skipgram")
        es.match_all(index_name=FLAGS_CORPUS_NAME, type_name="eid2eid")

    eid2ename, ename2eid = util.loadEidToEntityMap(eidEnameFilePath)
    eid2types = util.loadEidToTypeMap(eidTypeFilePath, ename2eid=ename2eid)

    # userInput = ["NBA", "NCAA", "NFL"]  # sports league, good performance
    userInput = ["BBC", "HBO", "CNN", "Fox", "Channel 4"]  # TV Channel, good performance
    # userInput = ["Twitter", "Microsoft", "Lenovo", "Toyota", "Qualcomm"]  # company, good performance
    # userInput = ["Toyota", "Hyundai", "Mazda", "Chrysler", "Ford"]  # car company (top-30, avg.rank=10), good performance
    # userInput = ["Google", "Facebook", "Microsoft", "Amazon", "Twitter"]  # high tech company, good performance
    # userInput = ["United States", "China", "Japan", "germany", "England", "Russia", "India"]  # country, using dist.sim
    # userInput = ["Illinois", "Texas", "California", "Ohio", "Maryland"]  # state, using dist.sim

    seedEidsWithConfidence = [(ename2eid[ele.lower()], 0.0) for ele in userInput]
    negativeSeedEids = set()
    params = SetExpanParams(index_name=FLAGS_CORPUS_NAME, max_iter=10,
                            ensemble_batch=10, num_of_top_skipgrams=150,
                            num_of_top_candidate_eids=50,
                            feature_subset_size_ratio=0.8, average_rank=10,
                            skipgramDistLower=3, skipgramDistUpper=30,
                            use_type=False)
    start = time.time()
    (expanded_eids, stop_iter) = setExpan(es, seedEidsWithConfidence,
                                          negativeSeedEids, eid2ename,
                                          eid2types, params, FLAGS_DEBUG=False)
    end = time.time()
    print("[INFO!!!] Finish SetExpan++ in %s seconds" % (end - start))
    for ele in expanded_eids:
        print(ele[0], eid2ename[ele[0]], ele[1])
from config import *
from es import ES
import pprint

prt = pprint.PrettyPrinter(indent=1)
es = ES().getES()

query = {
    "query": {
        "match_all": {}
    }
}
response = es.search(index=index_name, doc_type=doc_type, body=query,
                     size=10, request_timeout=10)
prt.pprint(response)
""" Run and plot the ES results for Ackley """ from es import ES import helper total_data = None for h in range(30): print(h) es = ES(limits=[15.0]*30) # search 30 times and get med data = es.search(1000, 1e-5) if not total_data: total_data = [[], [],[],[], []] total_data[0] = data[0] total_data[1] = data[1] total_data[2] = data[2] total_data[3] = data[3] total_data[4] = data[4] else: for i in range(len(total_data[0])): total_data[1][i] += data[1][i] total_data[2][i] += data[2][i] total_data[3][i] += data[3][i] total_data[4][i] += data[4][i] for i in range(len(total_data[0])): total_data[1][i] /= 30
class Engine:
    def __init__(self, sc, data_path, tmdb_key):
        self.sc = sc
        self.sqlContext = SQLContext(self.sc)
        self.data_path = data_path
        self.es = ES(self.sc, self.sqlContext)
        tmdb.API_KEY = tmdb_key
        self.tmdb_key = tmdb_key

    def load_data_from_file(self):
        ratings_file_path = os.path.join(self.data_path, 'ratings.csv')
        ratings_raw_RDD = self.sc.textFile(ratings_file_path)
        ratings_header = ratings_raw_RDD.take(1)[0]
        rating_schema = StructType(
            [StructField('userId', IntegerType(), True),
             StructField('movieId', IntegerType(), True),
             StructField('rating', FloatType(), True)])
        ratings_RDD = ratings_raw_RDD.filter(
            lambda line: line != ratings_header).map(
                lambda line: line.split(",")).map(
                    lambda x: (int(x[0]), int(x[1]), float(x[2]))).cache()
        self.ratings_RDD = ratings_RDD
        rating_df = self.sqlContext.createDataFrame(ratings_RDD, rating_schema)
        self.rating_df = rating_df
        ratings_dict_RDD = rating_df.rdd.map(lambda item: (item['movieId'], {
            'userId': item['userId'],
            'movieId': item['movieId'],
            'rating': item['rating']
        }))
        #logger.info(ratings_dict_RDD.take(10))
        self.ratings_dict_RDD = ratings_dict_RDD

        movie_schema = StructType(
            [StructField('movieId', IntegerType(), True),
             StructField('title', StringType(), True),
             StructField('genres', StringType(), True)])
        movies_file_path = os.path.join(self.data_path, 'movies.csv')
        movies_raw_RDD = self.sc.textFile(movies_file_path)
        movies_header = movies_raw_RDD.take(1)[0]
        movies_header_list = movies_header.split(",")
        self.movies_RDD = movies_raw_RDD.filter(
            lambda line: line != movies_header).map(lambda line: line.split(
                ",")).map(lambda x: (int(x[0]), x[1], x[2])).cache()
        movies_df = self.sqlContext.createDataFrame(self.movies_RDD, movie_schema)
        self.movies_df = movies_df
        movies_dict_RDD = movies_df.rdd.map(lambda item: (item['movieId'], {
            'movieId': item['movieId'],
            'title': item['title'],
            'genres': item['genres']
        }))
        self.movies_dict_RDD = movies_dict_RDD

        self.rank = 10
        self.iterations = 10
        self.train()

        #TODO Get the image by using tmdb API and links.csv
        links_schema = StructType(
            [StructField('movieId', IntegerType(), True),
             StructField('imdbId', IntegerType(), True),
             StructField('tmdbId', IntegerType(), True)])
        links_file_path = os.path.join(self.data_path, 'links.csv')
        links_raw_RDD = self.sc.textFile(links_file_path)
        links_header = links_raw_RDD.take(1)[0]
        links_header_list = links_header.split(",")
        self.links_RDD = links_raw_RDD.filter(
            lambda line: line != links_header).map(lambda line: line.split(
                ",")).map(lambda x: (int(x[0]), x[1], x[2])).cache()
        links_df = self.sqlContext.createDataFrame(self.links_RDD, links_schema)
        self.links_df = links_df
        links_dict_RDD = links_df.rdd.map(lambda item: (item['movieId'], {
            'movieId': item['movieId'],
            'imdbId': item['imdbId'],
            'tmdbId': item['tmdbId']
        }))

    def get_predicted_rating(self, userId, movieId):
        predicted_rating_RDD = self.model.predict(userId, movieId)
        logger.info(predicted_rating_RDD)
        return predicted_rating_RDD

    def get_predicted_rating_from_file(self, file_name):
        data = self.sc.textFile(file_name)
        ratings = data.map(lambda l: l.split(',')).map(
            lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
        testdata = ratings.map(lambda p: (p[0], p[1]))
        predictions = self.model.predictAll(testdata).map(
            lambda r: ((r[0], r[1]), r[2]))
        ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(
            predictions)
        RMSE = math.sqrt(
            ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
        logger.info("RMSE = " + str(RMSE))
        #logger.info(predictions.collect())
        return predictions.collect()

    def get_es_ratingRDD(self):
        ratings = self.es.get_ratingRDD()
        return ratings

    def get_es_ratingRDD_by_userId(self, userId):
        ratings = self.es.get_ratingRDD_by_userId(userId)
        return ratings

    def get_es_ratingRDD_by_movieId(self, movieId):
        ratings = self.es.get_ratingRDD_by_movieId(movieId)
        return ratings

    def get_es_ratingRDD_by_user_movie(self, userId, movieId):
        ratings = self.es.get_ratingRDD_by_user_movie(userId, movieId)
        return ratings

    def search_movie_tmdb(self, movie_name):
        search = tmdb.Search()
        response = search.movie(query=movie_name)
        #logger.info(response)
        data_list = []
        for s in search.results:
            data = {
                'title': s['title'],
                'date': s['date'],
                'popularity': s['popularity'],
                'id': s['id']
            }
            #logger.info(data)
            data_list.append(data)
        result = {'response': response, 'data': data_list}
        return result

    def create_es_index(self):
        self.es.create_index("movielens")

    def save_to_es(self):
        self.rating_df.write.format("es").save("movielens/ratings")
        self.movies_df.write.format("es").save("movielens/movies")
        self.links_df.write.format("es").save("movielens/links")

    def save_to_es_hadoop(self):
        es_write_conf = {
            "es.nodes": 'localhost',
            "es.port": '9200',
            "es.resource": 'movielens/ratings',
            #"es.input.json" : "yes"
            "es.mapping.id": "movieId"
        }
        self.ratings_dict_RDD.saveAsNewAPIHadoopFile(
            path='-',
            outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
            keyClass="org.apache.hadoop.io.NullWritable",
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=es_write_conf)
        return True

    def train(self):
        self.model = ALS.train(self.ratings_RDD, self.rank, self.iterations, 0.01)
        logger.info("ALS model")

    def topN_ratings_unrated_movies(self, user_id, count):
        unrated = self.ratings_RDD.filter(lambda x: not x[0] == user_id).map(
            lambda x: (user_id, x[1])).distinct()
        predicted_RDD = self.model.predictAll(unrated)
        total_RDD = self.ratings_RDD.union(predicted_RDD)
        list_predict_movie = predicted_RDD.map(
            lambda x: x.product).distinct().collect()
        predicted_movie_RDD = total_RDD.filter(
            lambda x: x[1] in list_predict_movie)
        predicted_groupby_product_rating_RDD = predicted_movie_RDD.map(
            lambda x: (x[1], x[2])).groupByKey()
        product_avgRating_count_RDD = predicted_groupby_product_rating_RDD.map(
            get_product_avgRating_count)
        filtered_RDD = product_avgRating_count_RDD.filter(
            lambda x: x[1][0] > 3 and x[1][1] > 30)
        ratings_list = filtered_RDD.takeOrdered(count, key=lambda x: -x[1][0])
        ratings_movie_id = [x[0] for x in ratings_list]
        movie_title_dict = self.es.get_movieTitleByMovieId(ratings_movie_id)
        result_list = [(x[0], movie_title_dict[x[0]], x[1][0], x[1][1])
                       for x in ratings_list]
        result = []
        for x in result_list:
            y = {"movieId": x[0], "title": x[1], "rating": x[2], "count": x[3]}
            result.append(y)
        return result
class BaseResultsAnalyzer(object):
    def __init__(self, es_index, es_doc_type, send_email=False,
                 email_recipients=(), email_template_fp="", query_limit=1000,
                 logger=None):
        self._es = ES()
        self._conf = self._es._conf
        self._es_index = es_index
        self._es_doc_type = es_doc_type
        self._limit = query_limit
        self._send_email = send_email
        self._email_recipients = email_recipients
        self._email_template_fp = email_template_fp
        self.log = logger if logger else log

    def get_all(self):
        """
        Get all the test results in json format
        """
        return self._es.search(index=self._es_index, size=self._limit)

    def get_test_by_id(self, test_id):
        """
        Get test results by test id
        :param test_id: test id created by performance test
        :return: test results in json format
        """
        if not self._es.exists(index=self._es_index,
                               doc_type=self._es_doc_type,
                               id=test_id):
            self.log.error('Test results not found: {}'.format(test_id))
            return None
        return self._es.get(index=self._es_index,
                            doc_type=self._es_doc_type,
                            id=test_id)

    def _test_version(self, test_doc):
        if test_doc['_source'].get('versions'):
            for v in ('scylla-server', 'scylla-enterprise-server'):
                k = test_doc['_source']['versions'].get(v)
                if k:
                    return k
        self.log.error('Scylla version is not found for test %s',
                       test_doc['_id'])
        return None

    def render_to_html(self, results, html_file_path=""):
        """
        Render analysis results to html template
        :param results: results dictionary
        :param html_file_path: path to save the rendered html on disk (empty string to skip saving)
        :return: html string
        """
        self.log.info("Rendering results to html using '%s' template...",
                      self._email_template_fp)
        loader = jinja2.FileSystemLoader(
            os.path.dirname(os.path.abspath(__file__)))
        env = jinja2.Environment(loader=loader, autoescape=True)
        template = env.get_template(self._email_template_fp)
        html = template.render(results)
        if html_file_path:
            with open(html_file_path, "w") as f:
                f.write(html)
            self.log.info("HTML report saved to '%s'.", html_file_path)
        return html

    def send_email(self, subject, content, html=True, files=()):
        if self._send_email and self._email_recipients:
            self.log.debug('Send email to {}'.format(self._email_recipients))
            em = Email()
            em.send(subject, content, html=html,
                    recipients=self._email_recipients, files=files)
        else:
            self.log.warning(
                "Won't send email (send_email: %s, recipients: %s)",
                self._send_email, self._email_recipients)

    def gen_kibana_dashboard_url(self, dashboard_path=""):
        return "%s/%s" % (self._conf.get('kibana_url'), dashboard_path)

    def check_regression(self):
        return NotImplementedError("check_regression should be implemented!")
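# Hedged sketch (not from the original source) of a concrete analyzer built on
# BaseResultsAnalyzer; the document field and threshold below are illustrative
# assumptions, not fields the original project is known to use.
class LatencyResultsAnalyzer(BaseResultsAnalyzer):
    def check_regression(self, test_id, max_latency_ms=10.0):   # hypothetical threshold
        doc = self.get_test_by_id(test_id)
        if not doc:
            return False
        latency = doc['_source'].get('results', {}).get('latency_99th_ms')   # hypothetical field
        if latency is None:
            self.log.error('No latency result found for test %s', test_id)
            return False
        return latency > max_latency_ms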
def search(request):
    try:
        logging.info("search(request)")
        #pdb.set_trace()
        # current page
        if 'current_page' in request.POST:
            current_page = int(request.POST['current_page'])
        else:
            current_page = 0
        # websites the articles were crawled from
        webMap = {
            "xlw": "新浪网",
            "xhs": "新华社",
            "fhw": "凤凰网"
        }  #"rmw":"人民网","zhw":"中华网",
        if 'webs' in request.POST:
            str_webs = request.POST['webs']
            str_webs = str_webs.split(",")
            webs = []
            for i in range(len(str_webs)):
                key = str_webs[i]
                webs.append(webMap[key])
        else:
            webs = ["新浪网", "新华网", "凤凰网"]  #"人民网", "中华网",
            str_webs = ["xlw", "xhs", "fhw"]  #"rmw","zhw",
        # tags
        if "tags" in request.POST:
            str_tags = request.POST['tags']
            print str_tags
            tags = str_tags.split(",")
        else:
            tags = []
        if '' in tags:
            tags.remove('')
        print "tags:", tags
        # number of articles per page
        if "page_size" in request.POST:
            page_size = request.POST['page_size']
            page_size = int(page_size)
        else:
            page_size = 20
        if "timerange" in request.POST:
            timerange = request.POST['timerange']
            timerange = timerange.split(" - ")
            startTime = timerange[0]
            endTime = timerange[1]
        else:
            startTime = '2017-01-11'
            endTime = '2017-01-15'
            timerange = startTime + " - " + endTime
        # whether to deduplicate
        if "article_db" in request.POST:
            article_db = request.POST['article_db']
            article_db = int(article_db)
        else:
            article_db = 0
        if "label_states" in request.POST:
            label_states = request.POST['label_states']
            str_label_states = label_states.split(",")
            label_state = []
            for i in range(len(str_label_states)):
                label_state.append(int(str_label_states[i]))
        else:
            label_state = [0, 1, 2]
        print "label_state:", label_state
        if "timerange_check" in request.POST:
            timerange_check = request.POST["timerange_check"]
            timerange_check = int(timerange_check)
        else:
            timerange_check = 0
        print "timerange_check", timerange_check
        if "label" in request.POST:
            label = request.POST['label']
            str_labels = label.split(",")
            label = []
            for i in range(len(str_labels)):
                label.append(int(str_labels[i]))
        else:
            label = [0, 1]
        print "label", label
        if "search_key" in request.POST:
            search_key = request.POST['search_key'].strip()
        else:
            search_key = None
        if "search_type" in request.POST:
            search_type = request.POST['search_type'].strip()
        else:
            search_type = "simple_search"
        print "search_key:", search_key
        print "search_type", search_type
        #label = labelMap[label]
        user = request.session.get('user', default=None)
        if 1 in label_state and user['role'] == "1":
            condition = {
                "article_source": webs,
                "article_db": article_db,
                "article_label_state": label_state,
                "startTime": startTime,
                "endTime": endTime,
                "current_page": current_page,
                "page_size": page_size,
                "update_student": user["username"],
                "article_label": label,
                "tags": tags,
                "timerange_check": timerange_check,
                "search_type": search_type
            }
        else:
            condition = {
                "article_source": webs,
                "article_db": article_db,
                "article_label_state": label_state,
                "startTime": startTime,
                "endTime": endTime,
                "current_page": current_page,
                "page_size": page_size,
                "article_label": label,
                "tags": tags,
                "timerange_check": timerange_check,
                "search_type": search_type
            }
        print "condition:", condition
        logging.info("[search] condition=" + str(condition))
        system_setting = SystemSetting()
        # databases = system_setting.get("databases", "mongodb")
        # print "databases:", databases
        #pdb.set_trace()
        if search_key == None or search_key.strip() == "":
            print "mongodb"
            articleDAO = ArticleDAO('articles_testN')
            articleList = articleDAO.article_search_list(condition)
        else:
            print "elastic search"
            es = ES()
            if search_type == "simple_search":
                articleList = es.article_simple_search(condition, search_key)
            else:
                articleList = es.article_search_list(condition, search_key)
        logging.info("[search] len(result)=" + str(len(articleList)))
    except BaseException, e:
        logging.error(e)
        print e
        print traceback.print_exc()
        articleList = []
from es import ES
from config import *
import time

es = ES().getES()

if not es.indices.exists(index=index_name):
    print 'index does not exist, creating a new index'
    es.indices.create(index=index_name, body=body)
    time.sleep(2)
    print 'index created successfully'
else:
    print 'An index with this name already exists'

doc1 = {
    'name': 'Erlang',
    'category': ['Distribute', 'OTP', 'Erlang', 'Elixir', 'Elm', 'Actor'],
    'Publication': 'Ericsson',
    'Publishing Date': '1970-01-01'
}
es.index(index=index_name, doc_type=doc_type, body=doc1, id='123')
response = es.get(index=index_name, doc_type=doc_type, id='123', ignore=404)
print response
from crawler import Crawler
from es import ES

# given seed URLs, our topic is "Catholic Church"
seed_urls = [
    "http://en.wikipedia.org/wiki/Catholic_Church",
    "http://en.wikipedia.org/wiki/Christianity",
    "http://en.wikipedia.org/wiki/Ten_Commandments_in_Catholic_theology"
]

# crawler
crawler = Crawler()
crawler.initialize(seed_urls)
crawler.crawl_control()

# merge indexes
my_es = ES()
my_es.initialize()
my_es.es_control()
model = Model()
model.push(XnorDense(INPUT_SIZE, NUM_UNITS))
model.push(XnorDense(NUM_UNITS, OUTPUT_SIZE))

normal_model = Model()
normal_model.push(Dense(INPUT_SIZE, NUM_UNITS))
normal_model.push(Dense(NUM_UNITS, NUM_UNITS))
normal_model.push(Dense(NUM_UNITS, OUTPUT_SIZE, activation='softmax'))

# opt = GA(pop_size=POP_SIZE, num_parents=NUM_PARENTS,
#          fitness_func=log_loss, rand_func=normal_rand, mutation_func=normal_mutation)
opt = ES(pop_size=POP_SIZE, fitness_func=log_loss, rand_func=sparse_rand)

ini_idx = 0
end_idx = BATCH_SIZE
while ini_idx < y_train.shape[0]:
    batch_xs = x_train[ini_idx:end_idx]
    batch_ys = y_train[ini_idx:end_idx]
    opt.fit(normal_model, batch_xs, batch_ys)
    normal_model.set_params(opt.best)
    pred = normal_model.forward(x_test)
    pred = np.argmax(pred, axis=1)