def match_police_code_borough(police_obj):
    borough = ''
    borough_total = []
    metropolitan_codes = police_obj.get_neighbourhood_codes()
    for pair in metropolitan_codes:
        boundaries = police_obj.get_neighbourhood_boundaries(pair['id'])
        i = 0
        while i < len(boundaries) - 1:
            lat = boundaries[i]['latitude']
            long = boundaries[i]['longitude']
            borough = get_borough(lat, long)
            if borough == "CAN'T FIND BOROUGH":
                i += 1
            else:
                break
        borough_total.append({
            'id': pair['id'],
            'name': pair['name'],
            'borough': borough
        })
        print(pair['id'] + ', ' + pair['name'] + ', ' + borough)
    write_to_json('borough.json', borough_total)
def create_sequences(self):
    sequences = self.df.groupby(['DEVICE_ID'])
    sequences = sequences.groups
    train_data = {}
    target = {}
    for p, key in enumerate(sequences):
        print "sequence " + str(p)
        # if p < 69369:
        #     continue
        if p == 5000:
            break
        data_point = None
        seq_target = []
        sorted_indexs = self.df.ix[sequences[key].values].sort('EVENT_TIME').index.values
        # city = randint(1, 45)
        for idx in sorted_indexs:
            # print self.df.ix[idx]['PROGRAM_GENRE']
            genre = self.genres[self.df.ix[idx]['PROGRAM_GENRE']]
            # print genre
            seq_target.append(genre)
            temp = self.df.ix[idx].drop(['EVENT_TIME', 'PROGRAM_GENRE', 'HOUSEHOLD_ID'])
            # temp = self.df.ix[idx]
            if data_point is None:
                data_point = temp.as_matrix()
            else:
                data_point = np.concatenate((data_point, temp))
        # data.df = data.df.drop(sorted_indexs)
        train_data[key] = list(data_point)
        target[key] = seq_target
        print len(train_data[key])
    utils.write_to_json('trainA.json', train_data)
    utils.write_to_json('trainA_target.json', target)
def get_initial_company_info():
    """Gets the initial information for each company"""
    company_dict = utils.open_json(MONITOR)
    for company in company_dict:
        # Gets symbol for company
        if company_dict[company]["Symbol"] == "unknown":
            try:
                with urllib.request.urlopen(
                        f'https://finance.yahoo.com/_finance_doubledown/'
                        f'api/resource/searchassist;searchTerm={company}'
                ) as response:
                    html = response.read().decode()
                    d = json.loads(html)
                    company_dict[company]["Symbol"] = d['items'][0]['symbol']
            except urllib.error.HTTPError as error:
                utils.write_to_log(f'Error opening URL: {error}')

        # Gets initial share price
        if company_dict[company]["Initial-share-price"] == 1:
            yahoo = Share(company_dict[company]["Symbol"])
            share = yahoo.get_price()
            company_dict[company]["Initial-share-price"] = float(share)
            company_dict[company]["Current-share-price"] = float(share)

    utils.write_to_json(MONITOR, company_dict)
def _get_historic_tweets(api, keyword, json_file_name, num_of_tweets):
    """Get previous arg.num_of_tweets related to arg.keyword."""
    tweet_list = []
    print("Getting previous %s tweets..." % str(num_of_tweets))
    try:
        for tweet in tweepy.Cursor(api.search, q=keyword).items(num_of_tweets):
            entry = {
                'Screen-Name': str(tweet.user.screen_name),
                'Username': (tweet.user.name),
                'Created-At': str(tweet.created_at),
                'Text': str(tweet.text),
                'User-Location': str(tweet.user.location),
                'Coordinates': str(tweet.coordinates),
                'Device-Type': str(tweet.source),
                'Hashtags': str(tweet.entities.get('hashtags')),
                'Quote-Status': str(tweet.is_quote_status),
                'Retweeted': str(tweet.retweeted),
                'Retweet-Count': str(tweet.retweet_count),
                'Favorited': str(tweet.favorited),
                'Favorite-Count': str(tweet.favorite_count),
                'Replied': str(tweet.in_reply_to_status_id_str)
            }
            tweet_list.append(entry)
        print("...tweets fetched")
        utils.write_to_json(json_file_name, tweet_list)
    except tweepy.TweepError as e:
        raise HistoricTweetException(str(e))
def main():
    players = read_json(ALL_PLAYERS_FILE_PATH)
    stat_list = []
    for player in players:
        print("Fetching player {}".format(player['name']['display']))
        stats = fetch_player_stats(int(player['id']))
        item_dict = {'player': player, 'stats': stats}
        stat_list.append(item_dict)
    write_to_json(stat_list, PLAYER_STATS_FILE_PATH)
def save_spotify_responses(csv_file, output_file):
    with open(csv_file, 'r') as csv_f:
        charts = csv.DictReader(csv_f)
        responses = []
        time = arrow.now().isoformat()
        for song in charts:
            track_id = song['URL'].split('/')[-1]
            url = full_url(track_id)
            r = requests.get(url).json()
            responses.append(r)
    output = {}
    output['data'] = responses
    output['date-retrieved'] = time
    write_to_json(output, output_file)
def gen_vocab(df, whichdata):
    if whichdata == "test":
        outfname = config.MSVD_VID_CAPS_TEST_PATH
        dictsize = config.TEST_VIDS
    elif whichdata == "val":
        outfname = config.MSVD_VID_CAPS_VAL_PATH
        dictsize = config.VAL_VIDS
    else:
        outfname = config.MSVD_VID_CAPS_TRAIN_PATH
        dictsize = config.TRAIN_VIDS
    vocab = set()
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")
    vid_caps_dict = {}
    for index, row in df.iterrows():
        vid_id = str(row["VideoID"]) + "_" + str(row["Start"]) + "_" + str(row["End"])
        tokens, _ = tokenize(row["Description"], punct_dict, translator)
        if (vid_id in vid_caps_dict):
            vid_caps_dict[vid_id].append(tokens)
        else:
            vid_caps_dict[vid_id] = [tokens]
        if whichdata == "train":
            vocab |= set(tokens)
    utils.write_to_json(vid_caps_dict, outfname)
    print("Size of " + whichdata + " vid caps dict: " + str(len(vid_caps_dict)))
    assert len(vid_caps_dict) == dictsize
    if whichdata == "train":
        vocab_list = list(vocab)
        vocab_list.sort()
        vocab_dict = {
            vocab_list[index]: index + 2
            for index in range(len(vocab_list))
        }
        # vocab_dict['<bos>'] = 0
        vocab_dict['<eos>'] = 0
        vocab_dict['UNK'] = 1
        vocab_rev_dict = {
            index + 2: vocab_list[index]
            for index in range(len(vocab_list))
        }
        # vocab_rev_dict[0] = '<bos>'
        vocab_rev_dict[0] = '<eos>'
        vocab_rev_dict[1] = 'UNK'
        utils.write_to_json(vocab_dict, config.MSVD_VOCAB_PATH)
        utils.write_to_pickle(vocab_rev_dict, config.MSVD_REVERSE_VOCAB_PATH)
        print("Size of Vocabulary: " + str(len(vocab)))
    return vocab, vid_caps_dict
def train_util(params):
    save_dir = params['save_dir']
    print('current save dir : ' + save_dir)
    utils.create_dir_if_not_exist(save_dir)
    reload_model = params['reload_model']
    if reload_model:
        print 'preparing reload'
        save_dir_backup = params['save_dir']
        from_dir_backup = params['from_dir']
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print 'save dir ', save_dir_backup
        print 'from_dir ', from_dir_backup
        print 'setting current model config with the old one'
        model_config_old = utils.read_from_json(from_dir_backup + 'model_config.json')
        model_config_old['reload_model'] = True
        model_config_old['save_dir'] = params['save_dir']
        model_config_old['from_dir'] = params['from_dir']
        model_config_old['max_epochs'] = params['max_epochs']
        model_config_old['dispFreq'] = params['dispFreq']
        model_config_old['sampleFreq'] = params['sampleFreq']
        model_config_old['validFreq'] = params['validFreq']
        model_config_old['debug'] = params['debug']
        params = model_config_old
        feats_dir = params['feats_dir']
    elif params['cnn_name'] != "MURALI":
        feats_dir = params['feats_dir'] + params['cnn_name'] + "_kmeans3/"
    else:
        feats_dir = params['feats_dir']
    print('feats dir : ' + feats_dir)
    params['feats_dir'] = feats_dir
    config_save_path = save_dir + "model_config.json"
    print('saving model config into %s' % config_save_path)
    utils.write_to_json(params, config_save_path)
    t0 = time.time()
    print('training an attention model')
    train(params, **params)
    print('training time in total %.4f sec' % (time.time() - t0))
def get_current_shares():
    """Gets current shares, compares it to initial, finds difference.
    Returns for output to handle"""
    company_dict = utils.open_json(MONITOR)
    for company in company_dict:
        try:
            yahoo = Share(company_dict[company]["Symbol"])
            yahoo.refresh()
            share = yahoo.get_price()
            company_dict[company]["Current-share-price"] = float(share)
            company_dict[company]["Share-price-list"].append(float(share))
        except ValueError:
            # yahoo.get_price() will return None if an error occurs
            print("Could not add to the Current share/Share price list")
    utils.write_to_json(MONITOR, company_dict)
def minus_days():
    """Takes away a day from the "Days-Left", removes from monitor.json if == 0"""
    company_dict = utils.open_json(MONITOR)
    remove = []
    for company in company_dict:
        if company_dict[company]["Days-left"] > 0:
            company_dict[company]["Days-left"] -= 1
        elif company_dict[company]["Days-left"] == 0:
            remove.append(company)
    for company in remove:
        # Do I want to keep a record of all the companies that have been
        # mentioned and their prices??? Goes here
        del company_dict[company]
    utils.write_to_json(MONITOR, company_dict)
def get_polygon_police_code(police_obj):
    borough_total = []
    data = load_json_local('borough.json')
    for area in data:
        poly_string = ''
        boundaries = police_obj.get_neighbourhood_boundaries(area['id'])
        indexes = get_linear_spaced_indexes(length=len(boundaries), spacing=100)
        for index in indexes:
            poly_string += str(boundaries[index]['latitude']) + ',' + str(
                boundaries[index]['longitude']) + ':'
        print(poly_string[:-1])
        borough_total.append({
            'id': area['id'],
            'name': area['name'],
            'borough': area['borough'],
            'polygon': poly_string[:-1]
        })
    write_to_json('boroughs_info.json', borough_total)
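# The helper get_linear_spaced_indexes() used above is not shown in this snippet.
# A minimal sketch of what it might look like, assuming it simply returns every
# `spacing`-th index into a sequence of the given length so that long boundary
# lists are thinned before being joined into the polygon string. The name and
# exact behaviour here are assumptions, not the project's confirmed implementation.
def get_linear_spaced_indexes(length, spacing):
    """Hypothetical helper: indexes 0, spacing, 2*spacing, ... below `length`."""
    return list(range(0, length, spacing))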
def check_for_companies(self):
    """Checks list of companies with Trump's tweet seeing if any companies
    are listed in his tweet. Inputs matches into monitor.json"""
    matches = []
    punc = ("!", ",", ".", ":", ";", "@", "?", "(", ")")
    self.tweet = ''.join(
        [letter for letter in self.tweet if letter not in punc]).lower()
    with open(COMPANIES) as f:
        companies = [line.strip() for line in f]
    for word in self.tweet.split():
        # Binary search for word
        if utils.find(companies, word):
            matches.append(word)
    company_dict = utils.open_json(MONITOR)
    comp_d = {}
    # Information that is needed by get_initial/current
    for company in matches:
        comp_d[company] = {}
        comp_d[company]["Date-mentioned"] = "{:%d-%m-%Y %H:%M:%S}".format(
            datetime.datetime.now())
        comp_d[company]["Mentioned by"] = self.handle
        comp_d[company]["Tweet"] = self.original_tweet
        comp_d[company]["Days-left"] = 7
        comp_d[company]["Symbol"] = "unknown"
        comp_d[company]["Initial-share-price"] = 1
        comp_d[company]["Current-share-price"] = 1
        comp_d[company]["Share-price-list"] = []
    company_dict.update(comp_d)
    utils.write_to_json(MONITOR, company_dict)
    return matches
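# utils.find() is only referenced above ("Binary search for word"). A minimal
# sketch of such a helper, assuming the companies file is kept sorted so that
# bisect gives an O(log n) membership test. The name and signature mirror the
# call site but are otherwise assumptions, not the project's actual code.
import bisect


def find(sorted_items, target):
    """Hypothetical binary-search membership check over a sorted list."""
    i = bisect.bisect_left(sorted_items, target)
    return i < len(sorted_items) and sorted_items[i] == target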
def company_result():
    title = 'マッチ度ランキング | Workers'
    # Get the individual's personality data
    target_dic = analyze_personality()
    # Save the individual's data
    write_to_json(target_dic, 'json/target.json')
    # Get the recommended companies
    companies = get_recommended_companies(target_dic)
    # Create an image overlaying the individual's and each company's parameters
    for company in companies:
        image_path = 'static/images/{0}.png'.format(company['id'])
        com_param_dic = company['params']
        make_big_five_graph(
            [target_dic, com_param_dic, dummy_dic0, dummy_dic1], image_path)
    return render_template('company_result.html', title=title, companies=companies)
def analyze_personality():
    """Query Personality Insights with the individual's text and retrieve
    their personality parameters.

    Returns:
        dict: Big Five parameters, keyed by {'ope', 'con', 'ext', 'agr', 'emo'}.
    """
    with open('apikey.txt', 'r') as f:
        api_key = f.read()
    authenticator = IAMAuthenticator(api_key)
    personality_insights = PersonalityInsightsV3(version='2017-10-13',
                                                 authenticator=authenticator)
    personality_insights.set_service_url(
        'https://gateway.watsonplatform.net/personality-insights/api')
    with open('sample.txt', 'r') as profile_text:
        profile = personality_insights.profile(
            profile_text.read(),
            'application/json',
            consumption_preferences=True,
            content_language='ja',
            accept_language='ja',
        ).get_result()
    json_obj = profile
    write_to_json(json_obj, 'json/result.json')
    ope = json_obj["personality"][0]["percentile"]
    con = json_obj["personality"][1]["percentile"]
    ext = json_obj["personality"][2]["percentile"]
    agr = json_obj["personality"][3]["percentile"]
    emo = json_obj["personality"][4]["percentile"]
    dic = {'ope': ope, 'con': con, 'ext': ext, 'agr': agr, 'emo': emo}
    return dic
def map_url_with_ids(videos):
    vid_ids = []
    vid_urls = []
    url_ids_map = OrderedDict()
    for vid in videos:
        vid_url = vid['url']
        vid_id = vid['video_id']
        if vid_url in url_ids_map:
            url_ids_map[vid_url].append(vid_id)
        else:
            url_ids_map[vid_url] = [vid_id]
        vid_ids.append(vid_id)
        vid_urls.append(vid_url)
    assert len(set(vid_ids)) == MSRVTT_TOTAL_VIDS
    print "urls#:", len(set(vid_urls)), '/', len(vid_urls)
    url_ydl_map = OrderedDict()
    count = 0
    success = 0
    fail = 0
    for url in url_ids_map:
        ydl_url, status = get_youtube_url(url)
        url_ydl_map[url] = {
            "ydl_url": ydl_url,
            "status": status
        }
        if status == "Success":
            success += 1
        else:
            fail += 1
        count = count + 1
        print success, "/", count, " ", fail, "/", count
    url_ydl_map["#success"] = success
    url_ydl_map["#fail"] = fail
    url_ydl_map["#count"] = count
    utils.write_to_json(url_ids_map, MSRVTT_DIR + "urls_vidids_map.json")
    utils.write_to_json(url_ydl_map, MSRVTT_DIR + "urls_ydl_map.json")
import os
from pathlib import Path

import utils

vasp_folder = Path(
    '/home/khalkhal/Simulations/VASP/Millerite/Machine_Learning/new-training-builder/VASP_folder'
)
counter = 0
for folder in os.listdir(vasp_folder):
    counter += 1
    if counter % 10 == 0:
        print("number of structures read: %d" % counter, end="\n")
    poscar = vasp_folder / folder / "POSCAR"
    atoms, cell = utils.read_poscar(poscar)
    atoms = utils.CN(atoms, cell)
    atom_file = vasp_folder / folder / "atoms.json"
    utils.write_to_json(atom_file, atoms)
for file in os.listdir(vasp_folder):
    if file.endswith(".xsd"):
        print("Processing ", file, "...")
        atoms = None
        cell = None
        filename = xsd_folder / xsd_dir / file
        atoms, cell = utils.read_xsd(filename)
        atoms = utils.CN(atoms, cell)
        old_struct_num = len(struct_list)
        struct_list, atoms, wrongs, bcodes = utils.neutralizer(
            vasp_folder, atoms, cell, struct_list, file, folder, wrongs, bcodes)
        new_struct_num = len(struct_list)
        if old_struct_num != new_struct_num:
            path = vasp_folder / str(new_struct_num) / 'atoms.json'
            utils.write_to_json(path, atoms)
            print(new_struct_num - old_struct_num, "new structures were made...")
            print("Current total number of structures: ", new_struct_num)

new_struct_file = Path(
    '/home/khalkhal/Simulations/VASP/Millerite/Machine_Learning/new-training-builder/struct_list.csv')
utils.write_struct_list(new_struct_file, struct_list)
print(wrongs)
codes = sorted(bcodes)
for i in range(len(codes)):
    if codes[i] == 0:
        print(bcodes[i])
# codes = list(Counter(bcodes).keys())   # equals to list(set(words))
# freqs = list(Counter(bcodes).values()) # counts the elements' frequency
class GithubApiQuery(object):

    basic_rate_limit = 5000
    search_rate_limit = 30

    def __init__(self, token):
        self.token = token
        self._manager = Github(login_or_token=self.token)

    @property
    def rate_limit(self):
        return self._manager.rate_limiting[0]

    @property
    def resetime(self):
        """Return the number of seconds to wait until the next reset of
        5000 calls (truncated to seconds)."""
        return self._manager.rate_limiting_resettime - int(time.time())

    def get_user(self, name):
        """Return the github.NamedUser.NamedUser instance for a user name like 'acreux'."""
        return self._manager.get_user(name)


if __name__ == "__main__":
    key = get_my_key()
    g = GithubApiQuery(key)
    guido = g.get_user("gvanrossum")
    write_to_json(guido.raw_data, "Guido.json")
def get_pure_arbitrage(min_margin, max_item_purchase_price, min_potential_revenue,
                       min_system_sec_rating, single_cargo=True, cargo_capacity=0,
                       get_routes=True, get_new_orders=False, get_new_lookups=False,
                       safe_regions=True):
    if single_cargo and cargo_capacity == 0:
        print("Please provide a cargo capacity")
        return

    # can't force another download of the lookups. Assumes they've been saved in get_and_save_orders()
    lookups = get_name_lookups(force=get_new_lookups)
    region_name_by_region = lookups["regions"]
    system_name_by_system = lookups["systems"]
    type_name_by_type = lookups["types"]

    if get_new_orders:
        get_and_save_orders(force=True, force_lookups=False, safe_regions=safe_regions)
        df = pd.read_csv("./data/orders/orders.csv", quotechar="|")
        print("\nGetting interim dictionary. This may take a minute..")
        df_grouped = df.groupby("type_name")
        df_dict = df_grouped.apply(
            lambda group: {
                "buy": {col: group[group["is_buy_order"] == True][col].tolist() for col in group.columns},
                "sell": {col: group[group["is_buy_order"] == False][col].tolist() for col in group.columns}
            }
        ).to_dict()
        u.write_to_json(df_dict, "./data/orders/orders_by_item.csv")
    else:
        print("\nLoading saved order dictionary at: ./data/orders/orders_by_item.csv")
        df_dict = u.load_data("./data/orders/orders_by_item.csv")
        if len(df_dict) == 0:
            print("No order data saved, but 'get_new_orders' parameter was set to False. Downloading anyway")
            get_and_save_orders(force=True, force_lookups=False, safe_regions=safe_regions)
            df = pd.read_csv("./data/orders/orders.csv", quotechar="|")
            print("\nGetting interim dictionary. This may take a minute..")
            df_grouped = df.groupby("type_name")
            df_dict = df_grouped.apply(
                lambda group: {
                    "buy": {col: group[group["is_buy_order"] == True][col].tolist() for col in group.columns},
                    "sell": {col: group[group["is_buy_order"] == False][col].tolist() for col in group.columns}
                }
            ).to_dict()
            u.write_to_json(df_dict, "./data/orders/orders_by_item.csv")

    rows = []
    item_count = 0
    arbitrage_count = 0
    system_details = get_system_details(system_name_by_system)
    for item in df_dict.keys():
        item_count += 1
        u.overwrite_print("Processing item: " + str(item_count) + "/" + str(len(df_dict.keys())) +
                          ". " + str(arbitrage_count) + " opportunities found so far")
        for i in range(len(df_dict[item]["buy"]["price"])):
            if system_details[df_dict[item]["buy"]["system_name"][i]]["security_status"] < min_system_sec_rating:
                continue
            for j in range(len(df_dict[item]["sell"]["price"])):
                if df_dict[item]["sell"]["price"][j] > max_item_purchase_price or \
                        system_details[df_dict[item]["sell"]["system_name"][j]]["security_status"] < min_system_sec_rating:
                    continue
                if df_dict[item]["buy"]["price"][i] > df_dict[item]["sell"]["price"][j]:
                    margin = ((df_dict[item]["buy"]["price"][i] / df_dict[item]["sell"]["price"][j]) - 1) * 100
                    if margin >= min_margin:
                        max_items_could_be_transacted = min(df_dict[item]["sell"]["volume_remain"][j],
                                                            df_dict[item]["buy"]["volume_remain"][i])
                        potential_revenue = (max_items_could_be_transacted * df_dict[item]["buy"]["price"][i]) - \
                                            (max_items_could_be_transacted * df_dict[item]["sell"]["price"][j])
                        if potential_revenue >= min_potential_revenue:
                            arbitrage_count += 1
                            row = {
                                "item_id": df_dict[item]["sell"]["type_id"][j],
                                "item": item.replace(",", "-"),
                                "buy_in_region": df_dict[item]["sell"]["region_name"][j],
                                "buy_in_system_name": df_dict[item]["sell"]["system_name"][j],
                                "buy_in_location_id": df_dict[item]["sell"]["location_id"][j],
                                "sell_in_region": df_dict[item]["buy"]["region_name"][i],
                                "sell_in_system_name": df_dict[item]["buy"]["system_name"][i],
                                "sell_in_location_id": df_dict[item]["buy"]["location_id"][i],
                                "buy_price": df_dict[item]["sell"]["price"][j],
                                "sell_price": df_dict[item]["buy"]["price"][i],
                                "buy_min_volume": df_dict[item]["sell"]["min_volume"][j],
                                "sell_min_volume": df_dict[item]["buy"]["min_volume"][i],
                                "amount_available_to_buy": df_dict[item]["sell"]["volume_remain"][j],
                                "amount_able_to_be_sold": df_dict[item]["buy"]["volume_remain"][i],
                                "margin": margin,
                                "potential_revenue": potential_revenue,
                                "_buy_system": df_dict[item]["sell"]["system_id"][j],
                                "_sell_system": df_dict[item]["buy"]["system_id"][i],
                                "buy_system_sec": system_details[df_dict[item]["sell"]["system_name"][j]]["security_status"],
                                "sell_system_sec": system_details[df_dict[item]["buy"]["system_name"][i]]["security_status"]
                            }
                            rows.append(row)

    # Get type details only for items with arbitrage opportunities. Saves pulling down 35k items 1 by 1
    type_ids = list(set([str(row["item_id"]) for row in rows]))
    type_details = get_type_details(type_name_by_type, type_ids)
    header = [
        "item_id", "item", "buy_in_region", "buy_in_system_name", "buy_in_location_id",
        "sell_in_region", "sell_in_system_name", "sell_in_location_id", "buy_price",
        "sell_price", "buy_min_volume", "sell_min_volume", "amount_available_to_buy",
        "amount_able_to_be_sold", "margin", "potential_revenue", "_buy_system",
        "_sell_system", "buy_system_sec", "sell_system_sec", "item_volume"
    ]
    for row in rows:
        row["item_volume"] = type_details[row["item"]]["packaged_volume"]

    single_cargo_rows = []
    if single_cargo:
        print("\nFiltering to opportunities making > " + str(min_potential_revenue) +
              " per cargo of " + str(cargo_capacity) + "m3")
        header.append("potential_revenue_per_cargo")
        for i, row in enumerate(rows):
            items_per_single_cargo = cargo_capacity / row["item_volume"]
            potential_revenue_per_item = row["sell_price"] - row["buy_price"]
            potential_revenue_per_cargo = min(
                potential_revenue_per_item * items_per_single_cargo,
                row["potential_revenue"]
            )
            row["potential_revenue_per_cargo"] = potential_revenue_per_cargo
            if potential_revenue_per_cargo > min_potential_revenue:
                single_cargo_rows.append(row)
        rows = single_cargo_rows
        print("\nFiltered to " + str(len(rows)) + " opportunities")

    if get_routes:
        header.append("route")
        header.append("route_jumps")
        od_pairs = list(set([(row["_buy_system"], row["_sell_system"],
                              row["buy_in_system_name"], row["sell_in_system_name"]) for row in rows]))
        route_by_od_pair = get_routes_by_od_pairs(od_pairs)
        for row in rows:
            route = route_by_od_pair[(row["_buy_system"], row["_sell_system"],
                                      row["buy_in_system_name"], row["sell_in_system_name"])]
            row["route"] = '-'.join([str(i) for i in route])
            row["route_jumps"] = len(route)

    u.write_to_csv(header, rows, "./output/pure_arbitrage.csv")
def _record_failure(self):
    utils.write_to_json(self.path_err, self.err_list)
def train(model_options,
          dataset_name='MSVD',
          cnn_name='ResNet50',
          train_data_ids_path=config.MSVD_DATA_IDS_TRAIN_PATH,
          val_data_ids_path=config.MSVD_DATA_IDS_VAL_PATH,
          test_data_ids_path=config.MSVD_DATA_IDS_TEST_PATH,
          vocab_path=config.MSVD_VOCAB_PATH,
          reverse_vocab_path=config.MSVD_REVERSE_VOCAB_PATH,
          mb_size_train=64,
          mb_size_test=128,
          train_caps_path=config.MSVD_VID_CAPS_TRAIN_PATH,
          val_caps_path=config.MSVD_VID_CAPS_VAL_PATH,
          test_caps_path=config.MSVD_VID_CAPS_TEST_PATH,
          feats_dir=config.MSVD_FEATS_DIR,
          save_dir=config.SAVE_DIR_PATH,
          word_dim=512,           # word embeddings size
          ctx_dim=2048,           # video cnn feature dimension
          lstm_dim=512,           # lstm unit size
          patience=20,
          max_epochs=500,
          decay_c=1e-4,
          alpha_entropy_r=0.,
          alpha_c=0.70602,
          clip_c=10.,
          lrate=0.0001,
          vocab_size=20000,       # n_words
          maxlen_caption=30,      # max length of the description
          optimizer='adadelta',
          batch_size=64,          # for trees use 25
          metric='everything',    # set to perplexity on DVS; blue, meteor, or both
          use_dropout=True,
          selector=True,
          ctx2out=True,
          prev2out=True,
          dispFreq=10,
          validFreq=2000,
          saveFreq=-1,            # save the parameters after every saveFreq updates
          sampleFreq=100,         # generate some samples after every sampleFreq updates
          verbose=True,
          debug=False,
          reload_model=False,
          from_dir='',
          ctx_frames=28,          # 26 when compare
          random_seed=1234,
          beam_search=True):
    tf.set_random_seed(random_seed)
    model = Model()
    print 'loading data'
    engine = data_engine.Movie2Caption(dataset_name, cnn_name, train_data_ids_path,
                                       val_data_ids_path, test_data_ids_path,
                                       vocab_path, reverse_vocab_path,
                                       mb_size_train, mb_size_test, maxlen_caption,
                                       train_caps_path, val_caps_path, test_caps_path,
                                       feats_dir)
    model_options['ctx_dim'] = engine.ctx_dim
    ctx_dim = engine.ctx_dim
    model_options['vocab_size'] = engine.vocab_size
    vocab_size = engine.vocab_size
    print 'n_words:', model_options['vocab_size']
    print 'ctx_dim:', model_options['ctx_dim']
    utils.write_to_json(model_options, '%smodel_options.json' % save_dir)

    # set test values, for debugging
    idx = engine.kf_train[0]
    x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_pca_tv = data_engine.prepare_data(
        engine, [engine.train_data_ids[index] for index in idx], mode="train")

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)
    k_centers = 3

    # description string: #words x #samples
    X = tf.placeholder(tf.int32, shape=(None, None), name='word_seq_x')      # word seq input (t,m)
    MASK = tf.placeholder(tf.float32, shape=(None, None), name='word_seq_mask')  # (t,m)
    # context: #samples x #annotations x dim
    CTX = tf.placeholder(tf.float32, shape=(None, ctx_frames, ctx_dim), name='ctx')
    CTX_MASK = tf.placeholder(tf.float32, shape=(None, ctx_frames), name='ctx_mask')
    CTX_PCA = tf.placeholder(tf.float32, shape=(None, k_centers, ctx_dim), name='ctx_pca')
    CTX_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames, ctx_dim), name='ctx_sampler')
    CTX_MASK_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames), name='ctx_mask_sampler')
    CTX_PCA_SAMPLER = tf.placeholder(tf.float32, shape=(k_centers, ctx_dim), name='ctx_pca_sampler')
    X_SAMPLER = tf.placeholder(tf.int32, shape=(None,), name='x_sampler')    # DOUBT 1 or None ?
    BO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None, lstm_dim), name='bo_init_state_sampler')
    TO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None, lstm_dim), name='to_init_state_sampler')
    BO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None, lstm_dim), name='bo_init_memory_sampler')
    TO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None, lstm_dim), name='to_init_memory_sampler')

    # create tensorflow variables
    print 'building model'
    tfparams = utils.init_tfparams(params)
    use_noise, COST, extra = model.build_model(tfparams, model_options, X, MASK, CTX, CTX_MASK, CTX_PCA)
    ALPHAS = extra[1]  # (t,64,28)
    BETAS = extra[2]   # (t,64)

    print 'building sampler'
    f_init, f_next = model.build_sampler(tfparams, model_options, use_noise,
                                         CTX_SAMPLER, CTX_MASK_SAMPLER, CTX_PCA_SAMPLER,
                                         X_SAMPLER, BO_INIT_STATE_SAMPLER, TO_INIT_STATE_SAMPLER,
                                         BO_INIT_MEMORY_SAMPLER, TO_INIT_MEMORY_SAMPLER)

    print 'building f_log_probs'
    f_log_probs = -COST

    print 'check trainables'
    wrt = utils.itemlist(tfparams, model_options)
    trainables = tf.trainable_variables()
    print len(wrt), len(trainables)
    # assert len(wrt) == len(trainables)

    COST = tf.reduce_mean(COST, name="LOSS")
    if decay_c > 0.:
        decay_c = tf.Variable(np.float32(decay_c), trainable=False, name='decay_c')
        weight_decay = 0.
        for vv in wrt:
            weight_decay += tf.reduce_sum(vv ** 2)
        weight_decay *= decay_c
        COST += weight_decay
    if alpha_c > 0.:
        alpha_c = tf.Variable(np.float32(alpha_c), trainable=False, name='alpha_c')
        alpha_reg = alpha_c * tf.reduce_mean(
            tf.reduce_sum(((1. - tf.reduce_sum(ALPHAS, axis=0)) ** 2), axis=-1))
        COST += alpha_reg
    if alpha_entropy_r > 0:
        alpha_entropy_r = tf.Variable(np.float32(alpha_entropy_r), name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * tf.reduce_mean(
            tf.reduce_sum((-tf.add(ALPHAS * tf.log(ALPHAS + 1e-8), axis=-1)), axis=-1))
        COST += alpha_reg_2
    else:
        alpha_reg_2 = tf.zeros_like(COST)

    print 'building f_alpha'
    f_alpha = [ALPHAS, BETAS]

    print 'build train fns'
    UPDATE_OPS = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(UPDATE_OPS):
        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06).minimize(loss=COST, var_list=wrt)
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06)
        # optimizer = tf.train.AdamOptimizer()
        gradients, variables = zip(*optimizer.compute_gradients(loss=COST, var_list=wrt))
        gradients, _ = tf.clip_by_global_norm(gradients, clip_c)
        capped_grads_and_vars = zip(gradients, variables)
        TRAIN_OP = optimizer.apply_gradients(capped_grads_and_vars)

    # Initialize all variables
    var_init = tf.global_variables_initializer()
    # Ops to save and restore all the variables.
    saver = tf.train.Saver()
    print 'compilation took %.4f sec' % (time.time() - t0)

    print 'Optimization'
    history_errs = []
    # reload history
    if reload_model:
        print 'loading history error...'
        history_errs = np.load(from_dir + 'model_best_so_far.npz')['history_errs'].tolist()
    bad_counter = 0
    processes = None
    queue = None
    rqueue = None
    shared_params = None
    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    # best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []
    train_err = -1
    train_perp = -1
    valid_err = -1
    valid_perp = -1
    test_err = -1
    test_perp = -1

    # Launch the graph
    with tf.Session() as sess:
        sess.run(var_init)
        if reload_model:
            print 'restoring model...'
            saver.restore(sess, from_dir + "model_best_so_far.ckpt")
        for eidx in xrange(max_epochs):
            n_samples = 0
            train_costs = []
            grads_record = []
            for idx in engine.kf_train:
                tags = [engine.train_data_ids[index] for index in idx]
                n_samples += len(tags)
                uidx += 1
                sess.run(tf.assign(use_noise, True))
                pd_start = time.time()
                x, mask, ctx, ctx_mask, ctx_pca = data_engine.prepare_data(engine, tags, mode="train")
                pd_duration = time.time() - pd_start
                if x is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    continue
                # writer = tf.summary.FileWriter("graph_cost", sess.graph)
                cost, alphas, betas = sess.run([COST, ALPHAS, BETAS],
                                               feed_dict={X: x, MASK: mask, CTX: ctx,
                                                          CTX_PCA: ctx_pca, CTX_MASK: ctx_mask})
                ud_start = time.time()
                sess.run(TRAIN_OP, feed_dict={X: x, MASK: mask, CTX: ctx,
                                              CTX_PCA: ctx_pca, CTX_MASK: ctx_mask})
                ud_duration = time.time() - ud_start
                # writer.close()
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected in cost'
                    import pdb; pdb.set_trace()
                if eidx == 0:
                    train_error = cost
                else:
                    train_error = train_error * 0.95 + cost * 0.05
                train_costs.append(cost)

                if np.mod(uidx, dispFreq) == 0:
                    print 'Epoch: ', eidx, \
                        ', Update: ', uidx, \
                        ', train cost mean so far: ', train_error, \
                        ', fetching data time spent (sec): ', pd_duration, \
                        ', update time spent (sec): ', ud_duration, \
                        ', save_dir: ', save_dir, '\n'
                    alphas, betas = sess.run(f_alpha, feed_dict={X: x, MASK: mask, CTX: ctx,
                                                                 CTX_PCA: ctx_pca, CTX_MASK: ctx_mask})
                    counts = mask.sum(0)
                    betas_mean = (betas * mask).sum(0) / counts
                    betas_mean = betas_mean.mean()
                    print 'alpha ratio %.3f, betas mean %.3f\n' % (
                        alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean)
                    l = 0
                    for vv in x[:, 0]:
                        if vv == 0:  # eos
                            break
                        if vv in engine.reverse_vocab:
                            print '(', np.round(betas[l, 0], 3), ')', engine.reverse_vocab[vv],
                        else:
                            print '(', np.round(betas[l, 0], 3), ')', 'UNK',
                        print ",",
                        l += 1
                    print '(', np.round(betas[l, 0], 3), ')\n'

                if np.mod(uidx, saveFreq) == 0:
                    pass

                if np.mod(uidx, sampleFreq) == 0:
                    sess.run(tf.assign(use_noise, False))
                    print '------------- sampling from train ----------'
                    x_s = x                 # (t,m)
                    mask_s = mask           # (t,m)
                    ctx_s = ctx             # (m,28,2048)
                    ctx_mask_s = ctx_mask   # (m,28)
                    ctx_pca_s = ctx_pca
                    model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next,
                                         x_s, ctx_s, ctx_mask_s, ctx_pca_s)
                    # print '------------- sampling from valid ----------'
                    # idx = engine.kf_val[np.random.randint(1, len(engine.kf_val) - 1)]
                    # tags = [engine.val_data_ids[index] for index in idx]
                    # x_s, mask_s, ctx_s, mask_ctx_s, ctx_pca_s = data_engine.prepare_data(engine, tags, "val")
                    # model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s)
                    # print ""

                if validFreq != -1 and np.mod(uidx, validFreq) == 0:
                    t0_valid = time.time()
                    alphas, _ = sess.run(f_alpha, feed_dict={X: x, MASK: mask, CTX: ctx,
                                                             CTX_PCA: ctx_pca, CTX_MASK: ctx_mask})
                    ratio = alphas.min(-1).mean() / (alphas.max(-1)).mean()
                    alphas_ratio.append(ratio)
                    np.savetxt(save_dir + 'alpha_ratio.txt', alphas_ratio)
                    np.savez(save_dir + 'model_current.npz', history_errs=history_errs)
                    saver.save(sess, save_dir + 'model_current.ckpt')
                    sess.run(tf.assign(use_noise, False))
                    train_err = -1
                    train_perp = -1
                    valid_err = -1
                    valid_perp = -1
                    test_err = -1
                    test_perp = -1
                    if not debug:
                        # first compute train cost
                        if 0:
                            print 'computing cost on trainset'
                            train_err, train_perp = model.pred_probs(
                                sess, engine, 'train', f_log_probs, verbose=model_options['verbose'])
                        else:
                            train_err = 0.
                            train_perp = 0.
                        if 1:
                            print 'validating...'
                            valid_err, valid_perp = model.pred_probs(
                                sess, engine, 'val', f_log_probs, verbose=model_options['verbose'])
                        else:
                            valid_err = 0.
                            valid_perp = 0.
                        if 0:
                            print 'testing...'
                            test_err, test_perp = model.pred_probs(
                                sess, engine, 'test', f_log_probs, verbose=model_options['verbose'])
                        else:
                            test_err = 0.
                            test_perp = 0.
                    mean_ranking = 0
                    blue_t0 = time.time()
                    scores, processes, queue, rqueue, shared_params = \
                        metrics.compute_score(sess=sess,
                                              model_type='attention',
                                              model_archive=None,
                                              options=model_options,
                                              engine=engine,
                                              save_dir=save_dir,
                                              beam=5,
                                              n_process=5,
                                              whichset='both',
                                              on_cpu=False,
                                              processes=processes,
                                              queue=queue,
                                              rqueue=rqueue,
                                              shared_params=shared_params,
                                              metric=metric,
                                              one_time=False,
                                              f_init=f_init,
                                              f_next=f_next,
                                              model=model)
                    '''
                    {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                     'alternative_valid': {'Bleu_3': 0.40702270203174923,
                                           'Bleu_4': 0.29276570520368456,
                                           'CIDEr': 0.25247168210607884,
                                           'Bleu_2': 0.529069629270047,
                                           'Bleu_1': 0.6804308797115253,
                                           'ROUGE_L': 0.51083584331688392},
                     'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                    '''
                    valid_B1 = scores['valid']['Bleu_1']
                    valid_B2 = scores['valid']['Bleu_2']
                    valid_B3 = scores['valid']['Bleu_3']
                    valid_B4 = scores['valid']['Bleu_4']
                    valid_Rouge = scores['valid']['ROUGE_L']
                    valid_Cider = scores['valid']['CIDEr']
                    valid_meteor = scores['valid']['METEOR']
                    test_B1 = scores['test']['Bleu_1']
                    test_B2 = scores['test']['Bleu_2']
                    test_B3 = scores['test']['Bleu_3']
                    test_B4 = scores['test']['Bleu_4']
                    test_Rouge = scores['test']['ROUGE_L']
                    test_Cider = scores['test']['CIDEr']
                    test_meteor = scores['test']['METEOR']
                    print 'computing meteor/blue score used %.4f sec, '\
                          'blue score: %.1f, meteor score: %.1f' % (
                              time.time() - blue_t0, valid_B4, valid_meteor)
                    history_errs.append([eidx, uidx, train_err, train_perp, valid_perp, test_perp,
                                         valid_err, test_err,
                                         valid_B1, valid_B2, valid_B3, valid_B4, valid_meteor,
                                         valid_Rouge, valid_Cider,
                                         test_B1, test_B2, test_B3, test_B4, test_meteor,
                                         test_Rouge, test_Cider])
                    np.savetxt(save_dir + 'train_valid_test.txt', history_errs, fmt='%.3f')
                    print 'save validation results to %s' % save_dir

                    # save best model according to the best blue or meteor
                    if len(history_errs) > 1 and \
                            valid_B4 > np.array(history_errs)[:-1, 11].max():
                        print 'Saving to %s...' % save_dir,
                        np.savez(save_dir + 'model_best_blue_or_meteor.npz',
                                 history_errs=history_errs)
                        saver.save(sess, save_dir + 'model_best_blue_or_meteor.ckpt')  # DOUBT
                    if len(history_errs) > 1 and \
                            valid_err < np.array(history_errs)[:-1, 6].min():
                        # best_p = utils.unzip(tparams)  # DOUBT
                        bad_counter = 0
                        best_valid_err = valid_err
                        uidx_best_valid_err = uidx
                        print 'Saving to %s...' % save_dir,
                        np.savez(save_dir + 'model_best_so_far.npz', history_errs=history_errs)
                        saver.save(sess, save_dir + 'model_best_so_far.ckpt')
                        utils.write_to_json(model_options, '%smodel_options.json' % save_dir)
                        print 'Done'
                    elif len(history_errs) > 1 and \
                            valid_err >= np.array(history_errs)[:-1, 6].min():
                        bad_counter += 1
                        print 'history best ', np.array(history_errs)[:, 6].min()
                        print 'bad_counter ', bad_counter
                        print 'patience ', patience
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break
                    if test_B4 > 0.52 and test_meteor > 0.32:
                        print 'Saving to %s...' % save_dir,
                        np.savez(save_dir + 'model_' + str(uidx) + '.npz',
                                 history_errs=history_errs)
                        saver.save(sess, save_dir + 'model_' + str(uidx) + '.ckpt')
                    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                        'best valid err so far', best_valid_err
                    print 'valid took %.2f sec' % (time.time() - t0_valid)
                    # end of validation
                if debug:
                    break
            if estop:
                break
            if debug:
                break
            # end for loop over minibatches
            print 'This epoch has seen %d samples, train cost %.2f' % (
                n_samples, np.mean(train_costs))
        # end for loop over epochs
        print 'Optimization ended.'
        print 'stopped at epoch %d, minibatch %d, '\
              'current Train %.2f, current Valid %.2f, current Test %.2f ' % (
                  eidx, uidx, np.mean(train_err), np.mean(valid_err), np.mean(test_err))
        if history_errs != []:
            history = np.asarray(history_errs)
            best_valid_idx = history[:, 6].argmin()
            np.savetxt(save_dir + 'train_valid_test.txt', history, fmt='%.4f')
            print 'final best exp ', history[best_valid_idx]
        np.savez(save_dir + 'model_train_end.npz', history_errs=history_errs)
        saver.save(sess, save_dir + 'model_train_end.ckpt')
    return
def gen_vocab(df, whichdata):
    if whichdata == "test":
        outfname = config.MURALI_MSVD_VID_CAPS_TEST_PATH
        dictsize = config.MURALI_TEST_VIDS
        capspath = config.MURALI_MSVD_CAPTIONS_TEST_PATH
    elif whichdata == "val":
        outfname = config.MURALI_MSVD_VID_CAPS_VAL_PATH
        dictsize = config.MURALI_VAL_VIDS
        capspath = None
        raise NotImplementedError()
    else:
        outfname = config.MURALI_MSVD_VID_CAPS_TRAIN_PATH
        dictsize = config.MURALI_TRAIN_VIDS
        capspath = config.MURALI_MSVD_CAPTIONS_TRAIN_PATH
    vocab = set()
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")
    vid_caps_dict = {}
    omitted_caps = []
    for index in range(dictsize):
        vid_id = whichdata + "_" + str(index)
        descriptions = utils.read_file_to_list(capspath + str(index) + ".txt")[0].split("|")
        vid_caps = []
        for desc in descriptions:
            try:
                cap = desc.strip().encode('UTF-8')
                if len(cap) > 0:
                    vid_caps.append(cap)
            except Exception as e:
                # print vid_id, " : ", desc.strip()
                omitted_caps.append(vid_id + " : " + desc.strip())
        for vid_cap in vid_caps:
            tokens, _ = tokenize(vid_cap, punct_dict, translator)
            if (vid_id in vid_caps_dict):
                vid_caps_dict[vid_id].append(tokens)
            else:
                vid_caps_dict[vid_id] = [tokens]
            if whichdata == "train":
                vocab |= set(tokens)
    print("Non-ASCII captions omitted :" + str(len(omitted_caps)))
    utils.write_to_json(vid_caps_dict, outfname)
    print("Size of " + whichdata + " vid caps dict: " + str(len(vid_caps_dict)))
    assert len(vid_caps_dict) == dictsize
    if whichdata == "train":
        vocab_list = list(vocab)
        vocab_list.sort()
        vocab_dict = {
            vocab_list[index]: index + 2
            for index in range(len(vocab_list))
        }
        # vocab_dict['<bos>'] = 0
        vocab_dict['<eos>'] = 0
        vocab_dict['UNK'] = 1
        vocab_rev_dict = {
            index + 2: vocab_list[index]
            for index in range(len(vocab_list))
        }
        # vocab_rev_dict[0] = '<bos>'
        vocab_rev_dict[0] = '<eos>'
        vocab_rev_dict[1] = 'UNK'
        utils.write_to_json(vocab_dict, config.MURALI_MSVD_VOCAB_PATH)
        utils.write_to_pickle(vocab_rev_dict, config.MURALI_MSVD_REVERSE_VOCAB_PATH)
        print("Size of Vocabulary: " + str(len(vocab)))
    return vocab, vid_caps_dict, omitted_caps
def _save(self):
    '''save final data: data_lst, scrape_err_lst, parse_err_lst'''
    write_to_json(self.data_path, self.data_lst)
    write_to_json(self.scrape_err_path, self.scrape_err_lst)
    write_to_json(self.parse_err_path, self.parse_err_lst)
def cityDic(places):
    geolocator = Nominatim(user_agent="specify_your_app_name_here")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    place_dicts = []
    for place in places:
        place_dict = {"text": place, "address": "", "latitude": "", "longtitude": ""}
        location = geocode(place)
        if location:
            place_dict["address"] = location.address
            point = tuple(location.point)
            place_dict["latitude"] = point[0]
            place_dict["longtitude"] = point[1]
        place_dicts.append(place_dict)
    return place_dicts


if __name__ == '__main__':
    args = get_args()
    data = load_from_json(args.data)
    place_tags = []
    # TODO: Process only sentences with label 1
    for sentence in data["sentences"]:
        places = geograpy.get_place_context(text=sentence)
        place_dicts = cityDic(places.cities)  # Only cities ???
        place_tags.append(place_dicts)
    data["place_tags"] = place_tags
    write_to_json(data, data["id"], extension="json", out_dir=args.out_dir)
variables_dict['ASY3'].extend(
    (SY.shift().rolling(window=3).mean())[25:-25])
variables_dict['ASY4'].extend(
    (SY.shift().rolling(window=4).mean())[25:-25])
variables_dict['ASY5'].extend(
    (SY.shift().rolling(window=5).mean())[25:-25])
variables_dict['ASY6'].extend(
    (SY.shift().rolling(window=6).mean())[25:-25])
variables_dict['ASY7'].extend(
    (SY.shift().rolling(window=7).mean())[25:-25])
variables_dict['ASY8'].extend(
    (SY.shift().rolling(window=8).mean())[25:-25])
variables_dict['ASY9'].extend(
    (SY.shift().rolling(window=9).mean())[25:-25])
variables_dict['ASY10'].extend(
    (SY.shift().rolling(window=10).mean())[25:-25])
variables_dict['ASY15'].extend(
    (SY.shift().rolling(window=15).mean())[25:-25])
variables_dict['ASY20'].extend(
    (SY.shift().rolling(window=20).mean())[25:-25])
variables_dict['ASY25'].extend(
    (SY.shift().rolling(window=25).mean())[25:-25])

print('Finished Downloading Data')

# Clean and save data
idx_to_remove = check_for_nan_elements(variables_dict, predictor_names, verbose=True)
remove_nan_elements(variables_dict, idx_to_remove, verbose=True)
scale_and_save(variables_dict, predictor_names, verbose=True)
write_to_json(variables_dict, verbose=True)
# Be cautious when running this file!!!
from utils import read_from_json, write_to_json

jp = read_from_json("data/jp.json")
uk = read_from_json("data/uk.json")
us = read_from_json("data/us.json")

data = jp + uk + us
write_to_json(data, "data/data.json")
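# The snippets above all rely on small JSON helpers from a utils module that is
# not shown here, and they call write_to_json with both argument orders
# (write_to_json(path, data) and write_to_json(data, path)). A minimal sketch of
# what such helpers might look like, assuming plain json.dump / json.load
# behaviour; the exact signatures in each project are assumptions, not confirmed
# by the source.
import json


def write_to_json(path, data):
    """Hypothetical helper: serialize `data` as JSON at `path`."""
    with open(path, 'w') as f:
        json.dump(data, f, indent=2)


def read_from_json(path):
    """Hypothetical helper: load and return the JSON document at `path`."""
    with open(path, 'r') as f:
        return json.load(f)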