def main():
    raw_list = csv_to_list(csv_file)[:100]
    total_len = len(raw_list)
    counter = 0
    result_dict = dict()
    print "Commencing Web Scraping..."
    start_time = time.time()
    for raw_link in raw_list:
        try:
            raw_link = raw_link[0]
            whois_link = "http://www.whois.com/whois/" + raw_link
            ipaddress_link = "http://" + raw_link + ".ipaddress.com/"
            whois_soup = link_to_lxmlsoup(whois_link)
            ipaddress_soup = link_to_lxmlsoup(ipaddress_link)
            result_dict.setdefault('Raw Link', []).append(str(raw_link))
            result_dict = whois_parser(whois_soup, result_dict)
            result_dict = ipaddress_parser(ipaddress_soup, result_dict)
            counter, total_len = print_counter(counter, total_len)
            if counter % 400 == 0:
                print "Commencing 30 Second Sleep after 400 iterations"
                time.sleep(30)
            time_elapsed = time.time() - start_time
            print_progress(time_elapsed, counter, total_len)
        except:
            dict_to_json(result_dict, 'output.json')
            dict_to_csv(result_dict, 'output.csv')
            print "Unexpected Error", sys.exc_info()[0]
            raise
    dict_to_json(result_dict, 'output.json')
    dict_to_csv(result_dict, 'output.csv')
def get_json(self):
    d = {
        "name": self.name.title(),
        "weight": self.weight,
        "primarytype": self.primarytype,
        "secondarytype": self.secondarytype,
        "typemods": [{"type": k, "value": v}
                     for k, v in sorted(self.typeprofile.type_modifiers.items(),
                                        key=lambda x: x[1])
                     if v != 1],
        "hp": self.hp,
        "defense": self.defense,
        "attack": self.attack,
        "spattack": self.spattack,
        "spdefense": self.spdefense,
        "speed": self.speed,
    }
    return dict_to_json(d)
def for_user_model():
    U = 4
    Nu = [10, 21, 14, 32]
    uids = [20000, 40000, 10000, 30000]
    sport = "Running"
    randomState = np.random.RandomState(12345)
    f = gzip.open("synth_user_model.gz", "w")
    theta = randomState.rand(U + 3)
    alpha_all = theta[-3]
    theta_0 = theta[-2]
    theta_1 = theta[-1]
    data = []
    for u in xrange(0, U):
        nu = Nu[u]
        alpha = theta[u]
        uid = uids[u]
        for i in xrange(0, nu):
            d = randomState.randint(low = 1, high = 10)
            t = (alpha + alpha_all) * (theta_0 + theta_1 * d) * 3600.0
            w = {"Distance" : d, "Duration" : t, "user_id" : str(uid), "sport" : sport}
            w_json = utils.dict_to_json(w)
            f.write(w_json + "\n")
    print theta
    f.close()
def get(self):
    increment_hit_counter(datastore_key_hits_streams_json)
    # Get database
    db = memcache.get(memcache_key_database)
    if db is None:
        logger.warn(
            'memcache failed on key: {}'.format(memcache_key_database))
        db_json = ndb_get_entity(JsonDatabase, datastore_key_database).value
        db = utils.json_to_dict(db_json)
        memcache.set(memcache_key_database, db)
    # Get last update time
    last_update_time = memcache.get(memcache_key_last_update)
    if last_update_time is None:
        logger.warn(
            'memcache failed on key: {}'.format(memcache_key_last_update))
        last_update_time = ndb_get_entity(Time, datastore_key_last_update).value
        memcache.set(memcache_key_last_update, last_update_time)
    json_obj = {'streams': db, 'last_update': last_update_time}
    json_str = utils.dict_to_json(json_obj)
    self.response.headers['Content-Type'] = 'application/json'
    self.response.out.write(json_str)
def dump(self):
    """ Dump the output to json. """
    report_as_json_string = utils.dict_to_json(self.report)
    if self.out_file:
        utils.string_to_file(self.out_file, report_as_json_string)
    else:
        print report_as_json_string
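# These snippets call utils.dict_to_json / utils.json_to_dict / utils.string_to_file
# without showing them. A minimal sketch of what such helpers might look like,
# assuming they simply wrap the standard json module (the real utils modules may differ;
# the pretty_print keyword mirrors the dict_to_json(data, pretty_print=True) call below):
import json

def dict_to_json(d, pretty_print=False):
    # Serialize a dict to a JSON string; pretty_print adds indentation.
    if pretty_print:
        return json.dumps(d, indent=4)
    return json.dumps(d)

def json_to_dict(s):
    # Parse a JSON string back into a dict.
    return json.loads(s)

def string_to_file(path, s):
    # Write a string to the file at the given path.
    with open(path, 'w') as f:
        f.write(s)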
def backup_database(backup_key):
    db = memcache.get(memcache_key_database)
    if db is None:
        logger.warn('memcache failed on key: {}'.format(memcache_key_database))
        db_json = ndb_get_entity(JsonDatabase, datastore_key_database).value
        db = utils.json_to_dict(db_json)
        memcache.set(memcache_key_database, db)
    db_json = utils.dict_to_json(db)
    logger.info('Backup database to key: {}'.format(backup_key))
    ndb_set_value(JsonDatabase, backup_key, db_json)
def update_database():
    db_json = ndb_get_entity(JsonDatabase, datastore_key_database).value
    db = utils.json_to_dict(db_json)
    current_streams = streams.get_current_streams()
    updated_db = streams.update_database(db, current_streams)
    updated_db_json = utils.dict_to_json(updated_db)
    ndb_set_value(JsonDatabase, datastore_key_database, updated_db_json)
    memcache.delete(memcache_key_database)
    memcache.set(memcache_key_database, updated_db)
def books(data, arr):
    if request.json:
        body = dict_to_json(request.json)
        if body['type'] == 'date':
            return books_by_published_date(body, arr)
        elif body['type'] == 'title':
            return books_by_title(body, arr)
        else:
            return 'The type of the request is not recognised', 400
    else:
        return json.dumps(data, indent=4), 201
def for_evolving_user_model():
    U = 4
    E = 3
    Nu = [10, 21, 14, 32]
    uids = sorted([20000, 40000, 10000, 30000])
    print uids
    sport = "Running"
    sigma = np.zeros((U, max(Nu)))
    randomState = np.random.RandomState(12345)
    f = gzip.open("synth_evolving_user_model.gz", "w")
    #theta = randomState.rand(U * E + E + 2)
    theta_0 = randomState.rand()
    theta_1 = randomState.rand()
    alpha = np.array(sorted(list(randomState.rand(E)), reverse = True))  # alpha for each experience level
    data = []
    theta = []
    for u in xrange(0, U):
        nu = Nu[u]
        #alpha_u = np.sort(randomState.rand(E))  # per user alpha for each experience
        alpha_u = np.array(sorted(list(randomState.rand(E)), reverse = True))  # per user alpha for each experience
        #print alpha_u
        theta = theta + list(alpha_u)
        uid = uids[u]
        dts = np.sort(randomState.randint(low = 1000000000, high = 1100000000, size = (nu)))
        exp_levels = np.sort(randomState.randint(low = 0, high = E, size = (nu)))
        for i in xrange(0, nu):
            d = randomState.randint(low = 1, high = 10)
            e = exp_levels[i]
            sigma[u, i] = e
            a_e = alpha[e]
            a_ue = alpha_u[e]
            print "u = %d, e = %d, alpha_ue = %f" % (u, e, a_ue)
            t = (a_e + a_ue) * (theta_0 + theta_1 * d) * 3600.0
            #print "a term = " + str((a_e + a_ue)) + " theta term = " + str(theta_0 + theta_1 * d)
            dt = datetime.fromtimestamp(dts[i]).strftime('%b %d, %Y %I:%M %p')
            w = {"Distance" : d, "Duration" : t, "user_id" : str(uid), "sport" : sport, "date-time" : dt, "experience" : e}
            w_json = utils.dict_to_json(w)
            f.write(w_json + "\n")
            data.append([u, uid, d, t, dts[i]])
    f.close()
    data = np.matrix(data)
    sigma_list = [list(sigma[i, :]) for i in range(0, U)]
    theta += list(alpha)
    theta += [theta_0]
    theta += [theta_1]
    np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
    print "Actual theta : ", theta
def condense_and_clean_data(infile, outfile):
    """
    infile must be a .gz file generated by the sql_to_json_parser.py
    condense_and_clean_data will do the following:
    - replace trace data by averages
    - convert strings like '2.35 mi' to 2.35
    """
    t1 = time.time()
    fo = gzip.open(outfile, "w")
    fi = gzip.open(infile)
    precision = 6  # 6 digits after decimal
    param_formatter = ParamFormatter(precision = precision)
    n = 0
    n_params_ignored = 0
    n_values_ignored = 0
    ignored_params = set()
    ignored_values = set()
    for line in fi:
        d = {}
        w = utils.json_to_dict(line.strip())
        for k, v in w.items():
            if (isinstance(v, list)):
                # replace trace data by averages
                v = round(numpy.mean(utils.remove_null_values_single(v)), precision)
                k = k + "(avg)"
                d[k] = v
            else:
                # convert and replace units - for example, convert '2.35 mi' to 2.35
                try:
                    v = param_formatter.to_number(k, v)
                    d[k] = v
                except InvalidValueException as e:
                    n_values_ignored += 1
                    ignored_values.add(e.value)
                except InvalidParamException as e:
                    n_params_ignored += 1
                    ignored_params.add(e.param)
        w_str = utils.dict_to_json(d)
        fo.write(w_str + "\n")
        n += 1
        if (n % 10000 == 0):
            print "Written %d workouts.." % (n)
    fi.close()
    fo.close()
    t2 = time.time()
    print "Time taken = " + str(t2 - t1) + " seconds"
    print "%d params ignored" % (n_params_ignored)
    print "List of ignored parameters : " + str(ignored_params)
    print "%d values ignored" % (n_values_ignored)
    print "List of ignored values : " + str(ignored_values)
    print "Total %d workouts written" % (n)
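# condense_and_clean_data above relies on ParamFormatter.to_number to turn values like
# '2.35 mi' into 2.35. A hedged sketch of that conversion under the assumption that it
# strips a trailing unit and rounds to the configured precision; the real ParamFormatter
# and InvalidParamException (the per-parameter whitelist path) are not shown here:
class InvalidValueException(Exception):
    def __init__(self, value):
        self.value = value

class ParamFormatterSketch(object):
    def __init__(self, precision=6):
        self.precision = precision

    def to_number(self, param, value):
        # Numbers pass through; strings like '2.35 mi' lose the unit suffix.
        if isinstance(value, (int, float)):
            return round(value, self.precision)
        try:
            number = float(str(value).split()[0].replace(',', ''))
        except (ValueError, IndexError):
            raise InvalidValueException(value)
        return round(number, self.precision)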
def dump(self):
    """ Dump the output to json. """
    if self.options.role is not None:
        role = "roles/" + self.options.role
        if role not in self.report["roles"]:
            print("Role does not exist: %s" % self.options.role)
            sys.exit(2)
        data = self.report["roles"][role]
    else:
        data = self.report
    report_as_json_string = utils.dict_to_json(data, pretty_print=True)
    if self.out_file:
        utils.string_to_file(self.out_file, report_as_json_string)
    else:
        print report_as_json_string
def get(self):
    logger.info('Database init started')
    afreeca_json = streams.afreeca_init_db_json
    init_db = streams.get_initial_database(afreeca_json)
    init_db_json = utils.dict_to_json(init_db)
    try:
        ndb_set_value(JsonDatabase, datastore_key_database, init_db_json, True)
        ndb_set_value(Time, datastore_key_last_update, datetime.datetime.utcnow())
    except KeyError:
        pass
    try:
        ndb_set_value(HitCounter, datastore_key_hits_streams_json, 0, True)
    except KeyError:
        pass
    logger.info('Database init finished')
def for_linear_model():
    theta = np.matrix([[0.1, 0.2]]).T
    X = np.matrix(np.random.rand(1000, 1))
    X = add_offset_feature(X)
    print X.shape
    noise = np.matrix(np.random.normal(0, 0.1, 1000)).T
    Y = X.dot(theta) + noise
    print Y.shape
    np.save("X_synthetic_direct.npy", X)
    np.save("Y_synthetic_direct.npy", Y)
    with gzip.open("synthetic_workouts.gz", "w") as f:
        for i in range(0, 1000):
            d = {}
            d["Distance"] = 100 * X[i, 1]
            d["Duration"] = 100 * Y[i, 0]
            s = utils.dict_to_json(d)
            f.write(s + "\n")
def for_baseline_model():
    U = 4
    Nu = [10, 21, 4, 32]
    uids = [20000, 40000, 10000, 30000]
    sport = "Running"
    randomState = np.random.RandomState(12345)
    f = gzip.open("synth_baseline_model.gz", "w")
    theta = randomState.randint(low = 100, high = 10000, size = U)
    print theta
    data = []
    for u in xrange(0, U):
        nu = Nu[u]
        v = theta[u]
        uid = uids[u]
        for i in xrange(0, nu):
            d = randomState.randint(low = 1, high = 10)
            t = v * d
            w = {"Distance" : d, "Duration" : t, "user_id" : str(uid), "sport" : sport}
            w_json = utils.dict_to_json(w)
            f.write(w_json + "\n")
    f.close()
def test_set_database(db, out_name):
    json_str = utils.dict_to_json(db)
    with open(out_name, 'w') as f:
        f.write(json_str)
def run(config_file, fold=0, device_id=0):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)
    config = load_config(config_file)
    if '_fold' not in config.work_dir:
        config.work_dir = config.work_dir + '_fold{}'.format(fold)
    validloader = make_loader(
        data_dir=config.data.train_dir,
        df_path=config.data.train_df_path,
        features=config.data.features,
        phase='valid',
        img_size=(config.data.height, config.data.width),
        batch_size=config.test.batch_size,
        num_workers=config.num_workers,
        idx_fold=fold,
        transforms=get_transforms(config.transforms.test),
        model_scale=config.data.model_scale,
        return_fnames=True,
    )
    # load model
    checkpoint_path = config.work_dir + '/checkpoints/best.pth'
    model = load_model(config_file, checkpoint_path)
    folds = pd.read_csv('data/folds.csv')
    predictions = []
    targets = []
    image_ids = []
    z_pos = config.data.z_pos[0]
    with torch.no_grad():
        for i, (batch_images, batch_mask_regr, batch_image_ids) in enumerate(tqdm(validloader)):
            batch_preds = model(batch_images.to(config.device))
            batch_preds[:, 0] = torch.sigmoid(batch_preds[:, 0])
            batch_preds[:, z_pos] = depth_transform(batch_preds[:, z_pos])
            batch_preds = batch_preds.data.cpu().numpy()
            batch_mask_regr = batch_mask_regr.data.cpu().numpy()
            image_ids.extend(batch_image_ids)
            for preds, mask_regr, image_id in zip(batch_preds, batch_mask_regr, batch_image_ids):
                coords = extract_coords(
                    preds,
                    features=config.data.features,
                    img_size=(config.data.height, config.data.width),
                    confidence_threshold=config.test.confidence_threshold,
                    distance_threshold=config.test.distance_threshold,
                )
                predictions.append(coords)
                s = folds.loc[folds.ImageId == image_id.split('.jpg')[0], 'PredictionString'].values[0]
                true_coords = str2coords(
                    s, names=['id', 'yaw', 'pitch', 'roll', 'x', 'y', 'z'])
                targets.append(true_coords)
    with open(config.work_dir + '/predictions.pkl', 'wb') as f:
        pickle.dump(predictions, f)
    with open(config.work_dir + '/targets.pkl', 'wb') as f:
        pickle.dump(targets, f)
    rows = []
    for p, i in zip(predictions, image_ids):
        rows.append({'ImageId': i, 'PredictionString': coords2str(p)})
    pred_df = pd.DataFrame(rows)
    pred_df.to_csv(config.work_dir + '/val_pred.csv', index=False)
    all_result, result = calc_map_score(targets, predictions)
    result['confidence_threshold'] = config.test.confidence_threshold
    result['distance_threshold'] = config.test.distance_threshold
    dict_to_json(
        all_result,
        config.work_dir + '/all_result_th{}.json'.format(config.test.distance_threshold))
    dict_to_json(
        result,
        config.work_dir + '/result_th{}.json'.format(config.test.distance_threshold))
    for k in sorted(result.keys()):
        print(k, result[k])
# Fragment from the body of crawl_articles: `divs` and `news_data` are set up
# earlier in that function, which is called by fcs_crawler below.
    for div in divs:
        date = div.find("div", {"class": "meta"}).text
        picture = div.find("img")["src"]
        description = div.find("p").text
        title = div.find("h2").text
        tip = div.find("div", {"class": "img_desc"}).text
        url = div.find("a")["href"]
        content = scrap_content(url)
        news_data.append({
            "title": title,
            "dateInfo": date,
            "picture": picture,
            "description": description,
            "type": tip,
            "url": url,
            "content": content
        })
    return news_data


def fcs_crawler():
    return crawl_articles(FCS_URL)


if __name__ == '__main__':
    from utils import dict_to_json
    data = fcs_crawler()
    dict_to_json("JSON_IGNORE.json", data)
model = stock_rnn_model.model
callback = EarlyStopping(monitor="loss", patience=10, verbose=1, mode="auto")
training_history = model.fit(X_train, Y_train,
                             epochs=1000,
                             batch_size=128,
                             validation_data=(X_val, Y_val),
                             callbacks=[callback])
model.save(model_type + "_stock_" + input_output_type + "_inference.h5")
training_history_dict = training_history.history
utils.dict_to_json(
    training_history_dict,
    "./" + model_type + "_" + input_output_type + "_training_history")
plot_loss(training_history_dict)
training_score, validation_score = model_score(model, X_train, Y_train, X_val, Y_val)
info("=================================")
info("Train MSE: %.5f%%" % training_score)
info("Validation MSE: %.5f%%" % validation_score)
info("=================================")
predicted_close = model.predict(final_test_for_all)
#print(predicted_close)
#print(predicted_close.shape)
predicted_close = stock_rnn_model.get_avereage_predicted_close(
    predicted_close, "mean")
#print(predicted_close)
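# In the last few snippets (run, the fcs_crawler __main__ block, and the training script
# above), dict_to_json takes a file path and writes JSON to disk instead of returning a
# string; the argument order also differs between projects (path first vs. dict first).
# A rough sketch of that file-writing variant, assuming it wraps json.dump
# (the real helpers in those projects may differ):
import json

def dict_to_json(d, path):
    # Serialize a dict straight to a JSON file on disk.
    with open(path, 'w') as f:
        json.dump(d, f, indent=4)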