Example #1
def match_police_code_borough(police_obj):
    borough = ''
    borough_total = []

    metropolitan_codes = police_obj.get_neighbourhood_codes()
    for pair in metropolitan_codes:
        boundaries = police_obj.get_neighbourhood_boundaries(pair['id'])

        i = 0
        while i < len(boundaries) - 1:
            lat = boundaries[i]['latitude']
            long = boundaries[i]['longitude']
            borough = get_borough(lat, long)
            if borough == "CAN'T FIND BOROUGH":
                i += 1
            else:
                break

        borough_total.append({
            'id': pair['id'],
            'name': pair['name'],
            'borough': borough
        })
        print(pair['id'] + ', ' + pair['name'] + ', ' + borough)
    write_to_json('borough.json', borough_total)
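Every example on this page funnels its results through a small write_to_json helper from the project's own utils module. None of the implementations are shown, and the argument order even differs between projects (Example #1 passes the path first, Example #6 passes the data first). A minimal sketch of the path-first variant, offered purely as an assumption about what such a helper does:

import json

def write_to_json(filename, data):
    # Hypothetical helper: serialize `data` to `filename` as UTF-8 JSON.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)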
Example #2
    def create_sequences(self):
        sequences = self.df.groupby(['DEVICE_ID'])
        sequences = sequences.groups
        train_data = {}
        target = {}
        for p, key in enumerate(sequences):
            print "sequence " + str(p)
            # if p < 69369:
            #     continue
            if p == 5000:
                break
            data_point = None
            seq_target = []
            sorted_indexs = self.df.ix[sequences[key].values].sort('EVENT_TIME').index.values
            # city = randint(1,45)
            for idx in sorted_indexs:
                # print self.df.ix[idx]['PROGRAM_GENRE']
                genre = self.genres[self.df.ix[idx]['PROGRAM_GENRE']]
                # print genre
                seq_target.append(genre)
                temp = self.df.ix[idx].drop(['EVENT_TIME', 'PROGRAM_GENRE', 'HOUSEHOLD_ID'])
                # temp = self.df.ix[idx]
                if data_point is None:
                    data_point = temp.as_matrix()
                else:
                    data_point = np.concatenate((data_point, temp))
            # data.df = data.df.drop(sorted_indexs)
            train_data[key] = list(data_point)
            target[key] = seq_target
            print len(train_data[key])
        utils.write_to_json('trainA.json', train_data)
        utils.write_to_json('trainA_target.json', target)
Example #3
    def get_initial_company_info():
        """Gets the initial information for each company"""

        company_dict = utils.open_json(MONITOR)

        for company in company_dict:
            # Gets symbol for company
            if company_dict[company]["Symbol"] == "unknown":
                try:
                    with urllib.request.urlopen(
                            f'https://finance.yahoo.com/_finance_doubledown/'
                            f'api/resource/searchassist;searchTerm={company}'
                    ) as response:

                        html = response.read().decode()
                        d = json.loads(html)

                        company_dict[company]["Symbol"] = d['items'][0][
                            'symbol']

                except urllib.error.HTTPError as error:
                    utils.write_to_log(f'Error opening URL: {error}')

            # Gets initial share price
            if company_dict[company]["Initial-share-price"] == 1:
                yahoo = Share(company_dict[company]["Symbol"])
                share = yahoo.get_price()
                company_dict[company]["Initial-share-price"] = float(share)
                company_dict[company]["Current-share-price"] = float(share)

        utils.write_to_json(MONITOR, company_dict)
Example #4
def _get_historic_tweets(api, keyword, json_file_name, num_of_tweets):
    """Get previous arg.num_of_tweets related to arg.keyword."""
    tweet_list = []
    print("Getting previous %s tweets..." % str(num_of_tweets))
    try:
        for tweet in tweepy.Cursor(api.search, q=keyword).items(num_of_tweets):
            entry = {
                'Screen-Name': str(tweet.user.screen_name),
                'Username': (tweet.user.name),
                'Created-At': str(tweet.created_at),
                'Text': str(tweet.text),
                'User-Location': str(tweet.user.location),
                'Coordinates': str(tweet.coordinates),
                'Device-Type': str(tweet.source),
                'Hashtags': str(tweet.entities.get('hashtags')),
                'Quote-Status': str(tweet.is_quote_status),
                'Retweeted': str(tweet.retweeted),
                'Retweet-Count': str(tweet.retweet_count),
                'Favorited': str(tweet.favorited),
                'Favorite-Count': str(tweet.favorite_count),
                'Replied': str(tweet.in_reply_to_status_id_str)
            }
            tweet_list.append(entry)
        print("...tweets fetched")
        utils.write_to_json(json_file_name, tweet_list)
    except tweepy.TweepError as e:
        raise HistoricTweetException(str(e))
Example #5
def main():
    players = read_json(ALL_PLAYERS_FILE_PATH)
    stat_list = []
    for player in players:
        print("Fetching player {}".format(player['name']['display']))
        stats = fetch_player_stats(int(player['id']))
        item_dict = {'player': player, 'stats': stats}
        stat_list.append(item_dict)
    write_to_json(stat_list, PLAYER_STATS_FILE_PATH)
Example #6
def save_spotify_responses(csv_file, output_file):
    with open(csv_file, 'r') as csv_f:
        charts = csv.DictReader(csv_f)
        responses = []
        time = arrow.now().isoformat()
        for song in charts:
            track_id = song['URL'].split('/')[-1]
            url = full_url(track_id)
            r = requests.get(url).json()
            responses.append(r)
    output = {}
    output['data'] = responses
    output['date-retrieved'] = time
    write_to_json(output, output_file)
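full_url is not shown in this snippet; it presumably turns a chart entry's track id into the Spotify Web API track endpoint. A hypothetical stand-in (the real project may build the URL differently, and current Spotify endpoints also require an OAuth token):

def full_url(track_id):
    # Assumed helper: build the Spotify Web API URL for a single track.
    return 'https://api.spotify.com/v1/tracks/{}'.format(track_id)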
Example #7
def gen_vocab(df, whichdata):
    if whichdata == "test":
        outfname = config.MSVD_VID_CAPS_TEST_PATH
        dictsize = config.TEST_VIDS
    elif whichdata == "val":
        outfname = config.MSVD_VID_CAPS_VAL_PATH
        dictsize = config.VAL_VIDS
    else:
        outfname = config.MSVD_VID_CAPS_TRAIN_PATH
        dictsize = config.TRAIN_VIDS
    vocab = set()
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")
    vid_caps_dict = {}
    for index, row in df.iterrows():
        vid_id = str(row["VideoID"]) + "_" + str(row["Start"]) + "_" + str(
            row["End"])
        tokens, _ = tokenize(row["Description"], punct_dict, translator)
        if (vid_id in vid_caps_dict):
            vid_caps_dict[vid_id].append(tokens)
        else:
            vid_caps_dict[vid_id] = [tokens]
        if whichdata == "train":
            vocab |= set(tokens)
    utils.write_to_json(vid_caps_dict, outfname)
    print("Size of " + whichdata + " vid caps dict: " +
          str(len(vid_caps_dict)))
    assert len(vid_caps_dict) == dictsize
    if whichdata == "train":
        vocab_list = list(vocab)
        vocab_list.sort()
        vocab_dict = {
            vocab_list[index]: index + 2
            for index in range(len(vocab_list))
        }
        # vocab_dict['<bos>'] = 0
        vocab_dict['<eos>'] = 0
        vocab_dict['UNK'] = 1
        vocab_rev_dict = {
            index + 2: vocab_list[index]
            for index in range(len(vocab_list))
        }
        # vocab_rev_dict[0] = '<bos>'
        vocab_rev_dict[0] = '<eos>'
        vocab_rev_dict[1] = 'UNK'
        utils.write_to_json(vocab_dict, config.MSVD_VOCAB_PATH)
        utils.write_to_pickle(vocab_rev_dict, config.MSVD_REVERSE_VOCAB_PATH)
        print("Size of Vocabulary: " + str(len(vocab)))
    return vocab, vid_caps_dict
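The vocabulary built above reserves id 0 for '<eos>' and id 1 for 'UNK', with real tokens starting at 2. A short illustration of how such a dictionary would typically be applied to a tokenized caption (encode_caption is not part of the original code):

def encode_caption(tokens, vocab_dict):
    # Map each token to its id, fall back to UNK (1), and terminate with <eos> (0).
    return [vocab_dict.get(tok, vocab_dict['UNK']) for tok in tokens] + [vocab_dict['<eos>']]

# e.g. encode_caption(['a', 'man', 'is', 'cooking'], vocab_dict) yields the token ids followed by 0.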
Example #8
def train_util(params):
    save_dir = params['save_dir']
    print('current save dir : ' + save_dir)
    utils.create_dir_if_not_exist(save_dir)

    reload_model = params['reload_model']
    if reload_model:
        print 'preparing reload'
        save_dir_backup = params['save_dir']
        from_dir_backup = params['from_dir']
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print 'save dir ', save_dir_backup
        print 'from_dir ', from_dir_backup
        print 'setting current model config with the old one'
        model_config_old = utils.read_from_json(from_dir_backup +
                                                'model_config.json')
        model_config_old['reload_model'] = True
        model_config_old['save_dir'] = params['save_dir']
        model_config_old['from_dir'] = params['from_dir']
        model_config_old['max_epochs'] = params['max_epochs']
        model_config_old['dispFreq'] = params['dispFreq']
        model_config_old['sampleFreq'] = params['sampleFreq']
        model_config_old['validFreq'] = params['validFreq']
        model_config_old['debug'] = params['debug']
        params = model_config_old
        feats_dir = params['feats_dir']
    elif params['cnn_name'] != "MURALI":
        feats_dir = params['feats_dir'] + params['cnn_name'] + "_kmeans3/"
    else:
        feats_dir = params['feats_dir']
    print('feats dir : ' + feats_dir)
    params['feats_dir'] = feats_dir

    config_save_path = save_dir + "model_config.json"
    print('saving model config into %s' % config_save_path)
    utils.write_to_json(params, config_save_path)

    t0 = time.time()
    print('training an attention model')
    train(params, **params)
    print('training time in total %.4f sec' % (time.time() - t0))
Example #9
    def get_current_shares():
        """Gets current shares, compares it to initial, finds difference.
           Returns for output to handle"""

        company_dict = utils.open_json(MONITOR)

        for company in company_dict:
            try:
                yahoo = Share(company_dict[company]["Symbol"])
                yahoo.refresh()
                share = yahoo.get_price()

                company_dict[company]["Current-share-price"] = float(share)
                company_dict[company]["Share-price-list"].append(float(share))

            except ValueError:
                # yahoo.get_price() will return None if an error occurs
                print("Could not add to the Current share/Share price list")

        utils.write_to_json(MONITOR, company_dict)
Example #10
    def minus_days():
        """Takes away a day from the "Days-Left",
           removes from monitor.json if == 0"""

        company_dict = utils.open_json(MONITOR)
        remove = []

        for company in company_dict:
            if company_dict[company]["Days-left"] > 0:
                company_dict[company]["Days-left"] -= 1

            elif company_dict[company]["Days-left"] == 0:
                remove.append(company)

        for company in remove:
            # Do I want to keep a record of all the companies that have been mentioned and their prices???
            # Goes here
            del company_dict[company]

        utils.write_to_json(MONITOR, company_dict)
Example #11
def get_polygon_police_code(police_obj):
    borough_total = []

    data = load_json_local('borough.json')
    for area in data:
        poly_string = ''
        boundaries = police_obj.get_neighbourhood_boundaries(area['id'])
        indexes = get_linear_spaced_indexes(length=len(boundaries),
                                            spacing=100)
        for index in indexes:
            poly_string += str(boundaries[index]['latitude']) + ',' + str(
                boundaries[index]['longitude']) + ':'
        print(poly_string[:-1])
        borough_total.append({
            'id': area['id'],
            'name': area['name'],
            'borough': area['borough'],
            'polygon': poly_string[:-1]
        })
    write_to_json('boroughs_info.json', borough_total)
Example #12
    def check_for_companies(self):
        """Checks list of companies with Trump's tweet
           seeing if any companies are listed in his tweet.
           Inputs matches into monitor.json"""

        matches = []
        punc = ("!", ",", ".", ":", ";", "@", "?", "(", ")")

        self.tweet = ''.join(
            [letter for letter in self.tweet if letter not in punc]).lower()

        with open(COMPANIES) as f:
            companies = [line.strip() for line in f]

        for word in self.tweet.split():
            # Binary search for word
            if utils.find(companies, word):
                matches.append(word)

        company_dict = utils.open_json(MONITOR)
        comp_d = {}

        # Information that is needed by get_initial/current
        for company in matches:
            comp_d[company] = {}
            comp_d[company]["Date-mentioned"] = "{:%d-%m-%Y %H:%M:%S}".format(
                datetime.datetime.now())
            comp_d[company]["Mentioned by"] = self.handle
            comp_d[company]["Tweet"] = self.original_tweet
            comp_d[company]["Days-left"] = 7
            comp_d[company]["Symbol"] = "unknown"
            comp_d[company]["Initial-share-price"] = 1
            comp_d[company]["Current-share-price"] = 1
            comp_d[company]["Share-price-list"] = []

        company_dict.update(comp_d)
        utils.write_to_json(MONITOR, company_dict)

        return matches
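utils.find above is described only by its comment as a binary search over the company list, which therefore has to be sorted. A minimal sketch of such a helper under that assumption (not the project's actual implementation):

import bisect

def find(sorted_companies, word):
    # Binary search: True if `word` appears in the sorted list of company names.
    i = bisect.bisect_left(sorted_companies, word)
    return i < len(sorted_companies) and sorted_companies[i] == word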
Example #13
def company_result():
    title = 'マッチ度ランキング | Workers'

    # Get the individual's personality data
    target_dic = analyze_personality()

    # Save the individual's data
    write_to_json(target_dic, 'json/target.json')

    # Get the recommended companies
    companies = get_recommended_companies(target_dic)

    # Create an image overlaying the individual's parameters on each company's parameters
    for company in companies:
        image_path = 'static/images/{0}.png'.format(company['id'])
        com_param_dic = company['params']
        make_big_five_graph(
            [target_dic, com_param_dic, dummy_dic0, dummy_dic1], image_path)

    return render_template('company_result.html',
                           title=title,
                           companies=companies)
Example #14
def analyze_personality():
    """Use the individual's text as the query and retrieve personality parameters from Personality Insights.

    Returns:
        dict: Big Five parameters. Keys are {'ope', 'con', 'ext', 'agr', 'emo'}
    """
    with open('apikey.txt', 'r') as f:
        api_key = f.read()

    authenticator = IAMAuthenticator(api_key)
    personality_insights = PersonalityInsightsV3(version='2017-10-13',
                                                 authenticator=authenticator)

    personality_insights.set_service_url(
        'https://gateway.watsonplatform.net/personality-insights/api')

    with open('sample.txt', 'r') as profile_text:
        profile = personality_insights.profile(
            profile_text.read(),
            'application/json',
            consumption_preferences=True,
            content_language='ja',
            accept_language='ja',
        ).get_result()

    json_obj = profile
    write_to_json(json_obj, 'json/result.json')

    ope = json_obj["personality"][0]["percentile"]
    con = json_obj["personality"][1]["percentile"]
    ext = json_obj["personality"][2]["percentile"]
    agr = json_obj["personality"][3]["percentile"]
    emo = json_obj["personality"][4]["percentile"]

    dic = {'ope': ope, 'con': con, 'ext': ext, 'agr': agr, 'emo': emo}

    return dic
Example #15
def map_url_with_ids(videos):
	vid_ids = []
	vid_urls = []
	url_ids_map = OrderedDict()
	for vid in videos:
		vid_url = vid['url']
		vid_id = vid['video_id']
		if vid_url in url_ids_map:
			url_ids_map[vid_url].append(vid_id)
		else:
			url_ids_map[vid_url] = [vid_id]
		vid_ids.append(vid_id)
		vid_urls.append(vid_url)
	assert len(set(vid_ids))==MSRVTT_TOTAL_VIDS
	print "urls#:",len(set(vid_urls)),'/',len(vid_urls)
	url_ydl_map = OrderedDict()
	count = 0
	success = 0
	fail = 0
	for url in url_ids_map:
		ydl_url, status = get_youtube_url(url)
		url_ydl_map[url] = {
			"ydl_url": ydl_url,
			"status": status
		}
		if status=="Success":
			success += 1
		else:
			fail += 1
		count = count + 1
	print success,"/",count," ",fail,"/",count
	url_ydl_map["#success"] = success
	url_ydl_map["#fail"] = fail
	url_ydl_map["#count"] = count
	utils.write_to_json(url_ids_map, MSRVTT_DIR+"urls_vidids_map.json")
	utils.write_to_json(url_ydl_map, MSRVTT_DIR+"urls_ydl_map.json")
Example #16
import os
from pathlib import Path
import utils

vasp_folder = Path(
    '/home/khalkhal/Simulations/VASP/Millerite/Machine_Learning/new-training-builder/VASP_folder'
)
counter = 0
for folder in os.listdir(vasp_folder):
    counter += 1
    if counter % 10 == 0:
        print("number of structures read: %d" % counter, end="\n")
    poscar = vasp_folder / folder / "POSCAR"
    atoms, cell = utils.read_poscar(poscar)
    atoms = utils.CN(atoms, cell)
    atom_file = vasp_folder / folder / "atoms.json"
    utils.write_to_json(atom_file, atoms)
Example #17
for file in os.listdir(vasp_folder):

    if file.endswith(".xsd"):
        print("Processing ", file, "...")
        atoms = None
        cell = None
        filename = xsd_folder / xsd_dir / file
        atoms, cell = utils.read_xsd(filename)
        atoms = utils.CN(atoms, cell)
        old_struct_num = len(struct_list)
        struct_list, atoms, wrongs, bcodes = utils.neutralizer(vasp_folder, atoms, cell, struct_list, file,
                                                               folder, wrongs, bcodes)
        new_struct_num = len(struct_list)
        if old_struct_num != new_struct_num:
            path = vasp_folder / str(new_struct_num) / 'atoms.json'
            utils.write_to_json(path, atoms)
        print(new_struct_num - old_struct_num, "new structures were made...")
        print("Current total number of structures: ", new_struct_num)

new_struct_file = Path('/home/khalkhal/Simulations/VASP/Millerite/Machine_Learning/new-training-builder/struct_list.csv')
utils.write_struct_list(new_struct_file, struct_list)

print(wrongs)
codes = sorted(bcodes)

for i in range(len(codes)):
    if codes[i] == 0:
        print(bcodes[i])

# codes = list(Counter(bcodes).keys()) # equals to list(set(words))
# freqs = list(Counter(bcodes).values()) # counts the elements' frequency
Example #18
class GithubApiQuery(object):
    basic_rate_limit = 5000
    search_rate_limit = 30

    def __init__(self,token):
        self.token = token
        self._manager = Github(login_or_token=self.token)


    @property
    def rate_limit(self):
        return self._manager.rate_limiting[0]

    @property
    def resetime(self):
        """ Return the nb seconds to wait for next reset of 5000 calls (truncated to seconds)"""
        return self._manager.rate_limiting_resettime - int(time.time())


    def get_user(self, name):
        """
        Return github.NamedUser.NamedUser class for user with name like 'acreux'
         """
        return self._manager.get_user(name)

if __name__ == "__main__":
    key = get_my_key()
    g = GithubApiQuery(key)
    guido = g.get_user("gvanrossum")
    write_to_json(guido.raw_data, "Guido.json")
Example #19
def get_pure_arbitrage(min_margin, max_item_purchase_price, min_potential_revenue, min_system_sec_rating, single_cargo=True, cargo_capacity=0, get_routes=True, get_new_orders=False, get_new_lookups=False, safe_regions=True):

    if single_cargo and cargo_capacity == 0:
        print("Please provide a cargo capacity")
        return

    # can't force another download of the lookups. Assumes they've been saved in get_and_save_orders()
    lookups = get_name_lookups(force=get_new_lookups)
    region_name_by_region = lookups["regions"]
    system_name_by_system = lookups["systems"]
    type_name_by_type = lookups["types"]

    if get_new_orders:
        get_and_save_orders(force=True, force_lookups=False, safe_regions=safe_regions)
        df = pd.read_csv("./data/orders/orders.csv", quotechar="|")
        print("\nGetting interim dictionary. This may take a minute..")
        df_grouped = df.groupby("type_name")
        df_dict = df_grouped.apply(
            lambda group: {
                "buy": {col: group[group["is_buy_order"] == True][col].tolist() for col in group.columns},
                "sell": {col: group[group["is_buy_order"] == False][col].tolist() for col in group.columns}
            }
        ).to_dict()
        u.write_to_json(df_dict, "./data/orders/orders_by_item.csv")
    else:
        print("\nLoading saved order dictionary at: ./data/orders/orders_by_item.csv")
        df_dict = u.load_data("./data/orders/orders_by_item.csv")
        if len(df_dict) == 0:
            print("No order data saved, but 'get_new_orders' parameter was set to False. Downloading anyway")
            get_and_save_orders(force=True, force_lookups=False, safe_regions=safe_regions)
            df = pd.read_csv("./data/orders/orders.csv", quotechar="|")
            print("\nGetting interim dictionary. This may take a minute..")
            df_grouped = df.groupby("type_name")
            df_dict = df_grouped.apply(
                lambda group: {
                    "buy": {col: group[group["is_buy_order"] == True][col].tolist() for col in group.columns},
                    "sell": {col: group[group["is_buy_order"] == False][col].tolist() for col in group.columns}
                }
            ).to_dict()
            u.write_to_json(df_dict, "./data/orders/orders_by_item.csv")

    rows = []
    item_count = 0
    arbitrage_count = 0
    system_details = get_system_details(system_name_by_system)
    for item in df_dict.keys():
        item_count += 1
        u.overwrite_print("Processing item: " + str(item_count) + "/" + str(len(df_dict.keys())) + ". " + str(arbitrage_count) + " opportunities found so far")
        for i in range(len(df_dict[item]["buy"]["price"])):
            if system_details[df_dict[item]["buy"]["system_name"][i]]["security_status"] < min_system_sec_rating:
                continue
            for j in range(len(df_dict[item]["sell"]["price"])):
                if df_dict[item]["sell"]["price"][j] > max_item_purchase_price or system_details[df_dict[item]["sell"]["system_name"][j]]["security_status"] < min_system_sec_rating:
                    continue
                if df_dict[item]["buy"]["price"][i] > df_dict[item]["sell"]["price"][j]:
                    margin = ((df_dict[item]["buy"]["price"][i] / df_dict[item]["sell"]["price"][j]) - 1)*100
                    if margin >= min_margin:
                        max_items_could_be_transacted = min(df_dict[item]["sell"]["volume_remain"][j], df_dict[item]["buy"]["volume_remain"][i])
                        potential_revenue = (max_items_could_be_transacted*df_dict[item]["buy"]["price"][i]) - (max_items_could_be_transacted*df_dict[item]["sell"]["price"][j])
                        if potential_revenue >= min_potential_revenue:
                            arbitrage_count += 1
                            row = {
                                "item_id": df_dict[item]["sell"]["type_id"][j],
                                "item": item.replace(",", "-"),
                                "buy_in_region": df_dict[item]["sell"]["region_name"][j],
                                "buy_in_system_name": df_dict[item]["sell"]["system_name"][j],
                                "buy_in_location_id": df_dict[item]["sell"]["location_id"][j],
                                "sell_in_region": df_dict[item]["buy"]["region_name"][i],
                                "sell_in_system_name": df_dict[item]["buy"]["system_name"][i],
                                "sell_in_location_id": df_dict[item]["buy"]["location_id"][i],
                                "buy_price": df_dict[item]["sell"]["price"][j],
                                "sell_price": df_dict[item]["buy"]["price"][i],
                                "buy_min_volume": df_dict[item]["sell"]["min_volume"][j],
                                "sell_min_volume": df_dict[item]["buy"]["min_volume"][i],
                                "amount_available_to_buy": df_dict[item]["sell"]["volume_remain"][j],
                                "amount_able_to_be_sold": df_dict[item]["buy"]["volume_remain"][i],
                                "margin": margin,
                                "potential_revenue": potential_revenue,
                                "_buy_system": df_dict[item]["sell"]["system_id"][j],
                                "_sell_system": df_dict[item]["buy"]["system_id"][i],
                                "buy_system_sec": system_details[df_dict[item]["sell"]["system_name"][j]]["security_status"],
                                "sell_system_sec": system_details[df_dict[item]["buy"]["system_name"][i]]["security_status"]
                            }
                            rows.append(row)

    # Get type details only for items with arbitrage opportunities. Saves pulling down 35k items 1 by 1
    type_ids = list(set([str(row["item_id"]) for row in rows]))
    type_details = get_type_details(type_name_by_type, type_ids)
    header = [
        "item_id", "item", "buy_in_region", "buy_in_system_name", "buy_in_location_id",
        "sell_in_region", "sell_in_system_name", "sell_in_location_id",
        "buy_price", "sell_price", "buy_min_volume", "sell_min_volume", "amount_available_to_buy",
        "amount_able_to_be_sold", "margin", "potential_revenue", "_buy_system", "_sell_system",
        "buy_system_sec", "sell_system_sec", "item_volume"
    ]
    for row in rows:
        row["item_volume"] = type_details[row["item"]]["packaged_volume"]

    single_cargo_rows = []
    if single_cargo:
        print("\nFiltering to opportunities making > " + str(min_potential_revenue) + " per cargo of " + str(cargo_capacity) + "m3")
        header.append("potential_revenue_per_cargo")
        for i, row in enumerate(rows):
            items_per_single_cargo = cargo_capacity / row["item_volume"]
            potential_revenue_per_item = row["sell_price"] - row["buy_price"]
            potential_revenue_per_cargo = min(
                potential_revenue_per_item*items_per_single_cargo,
                row["potential_revenue"]
            )
            row["potential_revenue_per_cargo"] = potential_revenue_per_cargo
            if potential_revenue_per_cargo > min_potential_revenue:
                single_cargo_rows.append(row)

        rows = single_cargo_rows
        print("\nFiltered to " + str(len(rows)) + " opportunities")


    if get_routes:
        header.append("route")
        header.append("route_jumps")
        od_pairs = list(set([(row["_buy_system"], row["_sell_system"], row["buy_in_system_name"], row["sell_in_system_name"]) for row in rows]))
        route_by_od_pair = get_routes_by_od_pairs(od_pairs)
        for row in rows:
            route = route_by_od_pair[(row["_buy_system"], row["_sell_system"], row["buy_in_system_name"], row["sell_in_system_name"])]
            row["route"] = '-'.join([str(i) for i in route])
            row["route_jumps"] = len(route)

    u.write_to_csv(header,rows,"./output/pure_arbitrage.csv")
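To make the filters above concrete with made-up numbers: buying an item from a sell order at 100 and selling it to a buy order at 130 gives margin = (130 / 100 - 1) * 100 = 30%, and if 50 units are available on both sides the potential revenue is 50 * 130 - 50 * 100 = 1,500. The same arithmetic as a standalone check:

buy_order_price = 130.0    # price the buy order pays us
sell_order_price = 100.0   # price the sell order charges us
volume = 50                # min of the two remaining volumes

margin = ((buy_order_price / sell_order_price) - 1) * 100                  # 30.0
potential_revenue = volume * buy_order_price - volume * sell_order_price   # 1500.0
print(margin, potential_revenue)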
Example #20
    def _record_failure(self):
        utils.write_to_json(self.path_err, self.err_list)
Example #21
def train(model_options,
        dataset_name = 'MSVD',
        cnn_name = 'ResNet50',
        train_data_ids_path = config.MSVD_DATA_IDS_TRAIN_PATH,
        val_data_ids_path = config.MSVD_DATA_IDS_VAL_PATH,
        test_data_ids_path = config.MSVD_DATA_IDS_TEST_PATH,
        vocab_path = config.MSVD_VOCAB_PATH,
        reverse_vocab_path = config.MSVD_REVERSE_VOCAB_PATH,
        mb_size_train = 64,
        mb_size_test = 128,
        train_caps_path = config.MSVD_VID_CAPS_TRAIN_PATH,
        val_caps_path = config.MSVD_VID_CAPS_VAL_PATH,
        test_caps_path = config.MSVD_VID_CAPS_TEST_PATH,
        feats_dir = config.MSVD_FEATS_DIR,
        save_dir = config.SAVE_DIR_PATH,
        word_dim = 512,   # word embeddings size
        ctx_dim = 2048,   # video cnn feature dimension
        lstm_dim = 512,   # lstm unit size
        patience = 20,
        max_epochs = 500,
        decay_c = 1e-4,
        alpha_entropy_r = 0.,
        alpha_c = 0.70602,
        clip_c = 10.,
        lrate = 0.0001,
        vocab_size = 20000, # n_words
        maxlen_caption = 30,  # max length of the description
        optimizer = 'adadelta',
        batch_size = 64,  # for trees use 25
        metric = 'everything',    # set to perplexity on DVS # blue, meteor, or both
        use_dropout = True,
        selector = True,
        ctx2out = True,
        prev2out = True,
        dispFreq = 10,
        validFreq = 2000,
        saveFreq = -1, # save the parameters after every saveFreq updates
        sampleFreq = 100, # generate some samples after every sampleFreq updates
        verbose = True,
        debug = False,
        reload_model = False,
        from_dir = '',
        ctx_frames = 28, # 26 when compare
        random_seed = 1234,
        beam_search = True
        ):

    tf.set_random_seed(random_seed)

    model = Model()

    print 'loading data'
    engine = data_engine.Movie2Caption(dataset_name,cnn_name,train_data_ids_path, val_data_ids_path, test_data_ids_path,
                vocab_path, reverse_vocab_path, mb_size_train, mb_size_test, maxlen_caption,
                train_caps_path, val_caps_path, test_caps_path, feats_dir)

    model_options['ctx_dim'] = engine.ctx_dim
    ctx_dim = engine.ctx_dim
    model_options['vocab_size'] = engine.vocab_size
    vocab_size = engine.vocab_size
    print 'n_words:', model_options['vocab_size']
    print 'ctx_dim:', model_options['ctx_dim']

    utils.write_to_json(model_options, '%smodel_options.json'%save_dir)

    # set test values, for debugging
    idx = engine.kf_train[0]
    x_tv, mask_tv, ctx_tv, ctx_mask_tv, ctx_pca_tv = data_engine.prepare_data(engine, [engine.train_data_ids[index] for index in idx], mode="train")

    print 'init params'
    t0 = time.time()
    params = model.init_params(model_options)

    k_centers = 3

    # description string: #words x #samples
    X = tf.placeholder(tf.int32, shape=(None, None), name='word_seq_x')  # word seq input (t,m)
    MASK = tf.placeholder(tf.float32, shape=(None, None), name='word_seq_mask')   # (t,m)
    # context: #samples x #annotations x dim
    CTX = tf.placeholder(tf.float32, shape=(None, ctx_frames, ctx_dim), name='ctx')
    CTX_MASK = tf.placeholder(tf.float32, shape=(None, ctx_frames), name='ctx_mask')
    CTX_PCA = tf.placeholder(tf.float32, shape=(None, k_centers, ctx_dim), name='ctx_pca')

    CTX_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames, ctx_dim), name='ctx_sampler')
    CTX_MASK_SAMPLER = tf.placeholder(tf.float32, shape=(ctx_frames), name='ctx_mask_sampler')
    CTX_PCA_SAMPLER = tf.placeholder(tf.float32, shape=(k_centers, ctx_dim), name='ctx_pca_sampler')
    X_SAMPLER = tf.placeholder(tf.int32, shape=(None,), name='x_sampler')   # DOUBT 1 or None ?
    BO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='bo_init_state_sampler')
    TO_INIT_STATE_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='to_init_state_sampler')
    BO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='bo_init_memory_sampler')
    TO_INIT_MEMORY_SAMPLER = tf.placeholder(tf.float32, shape=(None,lstm_dim), name='to_init_memory_sampler')

    # create tensorflow variables
    print 'building model'
    tfparams = utils.init_tfparams(params)

    use_noise, COST, extra = model.build_model(tfparams, model_options, X, MASK, CTX, CTX_MASK, CTX_PCA)
    ALPHAS = extra[1]   # (t,64,28)
    BETAS = extra[2]    # (t,64)

    print 'building sampler'
    f_init, f_next = model.build_sampler(tfparams, model_options, use_noise,
                                CTX_SAMPLER, CTX_MASK_SAMPLER, CTX_PCA_SAMPLER, X_SAMPLER, BO_INIT_STATE_SAMPLER,
                                TO_INIT_STATE_SAMPLER, BO_INIT_MEMORY_SAMPLER, TO_INIT_MEMORY_SAMPLER)

    print 'building f_log_probs'
    f_log_probs = -COST

    print 'check trainables'
    wrt = utils.itemlist(tfparams, model_options)
    trainables = tf.trainable_variables()
    print len(wrt),len(trainables)
    # assert len(wrt)==len(trainables)

    COST = tf.reduce_mean(COST, name="LOSS")
    if decay_c > 0.:
        decay_c = tf.Variable(np.float32(decay_c), trainable=False, name='decay_c')
        weight_decay = 0.
        for vv in wrt:
            weight_decay += tf.reduce_sum(vv ** 2)
        weight_decay *= decay_c
        COST += weight_decay

    if alpha_c > 0.:
        alpha_c = tf.Variable(np.float32(alpha_c), trainable=False, name='alpha_c')
        alpha_reg = alpha_c * tf.reduce_mean(tf.reduce_sum(((1.-tf.reduce_sum(ALPHAS, axis=0))**2), axis=-1))
        COST += alpha_reg

    if alpha_entropy_r > 0:
        alpha_entropy_r = tf.Variable(np.float32(alpha_entropy_r),
                                        name='alpha_entropy_r')
        alpha_reg_2 = alpha_entropy_r * tf.reduce_mean(tf.reduce_sum(
            -tf.reduce_sum(ALPHAS * tf.log(ALPHAS + 1e-8), axis=-1), axis=-1))
        COST += alpha_reg_2
    else:
        alpha_reg_2 = tf.zeros_like(COST)

    print 'building f_alpha'
    f_alpha = [ALPHAS, BETAS]

    print 'build train fns'
    UPDATE_OPS = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(UPDATE_OPS):
        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06).minimize(loss=COST, var_list=wrt)
        optimizer = tf.train.AdadeltaOptimizer(learning_rate=1.0, rho=0.95, epsilon=1e-06)
        # optimizer = tf.train.AdamOptimizer()
        gradients, variables = zip(*optimizer.compute_gradients(loss=COST, var_list=wrt))
        gradients, _ = tf.clip_by_global_norm(gradients, clip_c)
        capped_grads_and_vars = zip(gradients, variables)
        TRAIN_OP = optimizer.apply_gradients(capped_grads_and_vars)


    # Initialize all variables
    var_init = tf.global_variables_initializer()
    # Ops to save and restore all the variables.
    saver = tf.train.Saver()

    print 'compilation took %.4f sec'%(time.time()-t0)
    print 'Optimization'

    history_errs = []
    # reload history
    if reload_model:
        print 'loading history error...'
        history_errs = np.load(from_dir+'model_best_so_far.npz')['history_errs'].tolist()

    bad_counter = 0

    processes = None
    queue = None
    rqueue = None
    shared_params = None

    uidx = 0
    uidx_best_blue = 0
    uidx_best_valid_err = 0
    estop = False
    # best_p = utils.unzip(tparams)
    best_blue_valid = 0
    best_valid_err = 999
    alphas_ratio = []

    train_err = -1
    train_perp = -1
    valid_err = -1
    valid_perp = -1
    test_err = -1
    test_perp = -1

    # Launch the graph
    with tf.Session() as sess:
        sess.run(var_init)
        if reload_model:
            print 'restoring model...'
            saver.restore(sess, from_dir+"model_best_so_far.ckpt")
        for eidx in xrange(max_epochs):
            n_samples = 0
            train_costs = []
            grads_record = []
            for idx in engine.kf_train:
                tags = [engine.train_data_ids[index] for index in idx]
                n_samples += len(tags)
                uidx += 1
                
                sess.run(tf.assign(use_noise, True))

                pd_start = time.time()
                x, mask, ctx, ctx_mask, ctx_pca = data_engine.prepare_data(engine, tags, mode="train")
                pd_duration = time.time() - pd_start
                if x is None:
                    print 'Minibatch with zero sample under length ', maxlen_caption
                    continue

                # writer = tf.summary.FileWriter("graph_cost", sess.graph)
                cost, alphas, betas = sess.run([COST,ALPHAS,BETAS], feed_dict={
                                        X: x,
                                        MASK: mask,
                                        CTX: ctx,
                                        CTX_PCA: ctx_pca,
                                        CTX_MASK: ctx_mask})

                ud_start = time.time()
                sess.run(TRAIN_OP, feed_dict={
                                        X: x,
                                        MASK: mask,
                                        CTX: ctx,
                                        CTX_PCA: ctx_pca,
                                        CTX_MASK: ctx_mask})
                ud_duration = time.time() - ud_start

                # writer.close()
                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected in cost'
                    import pdb; pdb.set_trace()
                
                if eidx == 0:
                    train_error = cost
                else:
                    train_error = train_error * 0.95 + cost * 0.05
                train_costs.append(cost)
                
                if np.mod(uidx, dispFreq) == 0:
                    print 'Epoch: ', eidx, \
                        ', Update: ', uidx, \
                        ', train cost mean so far: ', train_error, \
                        ', fetching data time spent (sec): ', pd_duration, \
                        ', update time spent (sec): ', ud_duration, \
                        ', save_dir: ', save_dir, '\n'
                    
                    alphas, betas = sess.run(f_alpha, feed_dict={
                                            X: x,
                                            MASK: mask,
                                            CTX: ctx,
                                            CTX_PCA: ctx_pca,
                                            CTX_MASK: ctx_mask})
                    counts = mask.sum(0)
                    betas_mean = (betas * mask).sum(0) / counts
                    betas_mean = betas_mean.mean()
                    print 'alpha ratio %.3f, betas mean %.3f\n'%(
                        alphas.min(-1).mean() / (alphas.max(-1)).mean(), betas_mean)
                    l = 0
                    for vv in x[:, 0]:
                        if vv == 0: # eos
                            break
                        if vv in engine.reverse_vocab:
                            print '(', np.round(betas[l, 0], 3), ')', engine.reverse_vocab[vv],
                        else:
                            print '(', np.round(betas[l, 0], 3), ')', 'UNK',
                        print ",",
                        l += 1
                    print '(', np.round(betas[l, 0], 3), ')\n'

                if np.mod(uidx, saveFreq) == 0:
                    pass

                if np.mod(uidx, sampleFreq) == 0:
                    sess.run(tf.assign(use_noise, False))
                    print '------------- sampling from train ----------'
                    x_s = x     # (t,m)
                    mask_s = mask   # (t,m)
                    ctx_s = ctx     # (m,28,2048)
                    ctx_mask_s = ctx_mask   # (m,28)
                    ctx_pca_s = ctx_pca
                    model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s)
                    # print '------------- sampling from valid ----------'
                    # idx = engine.kf_val[np.random.randint(1, len(engine.kf_val) - 1)]
                    # tags = [engine.val_data_ids[index] for index in idx]
                    # x_s, mask_s, ctx_s, mask_ctx_s, ctx_pca_s = data_engine.prepare_data(engine, tags,"val")
                    # model.sample_execute(sess, engine, model_options, tfparams, f_init, f_next, x_s, ctx_s, ctx_mask_s, ctx_pca_s)
                    # print ""

                if validFreq != -1 and np.mod(uidx, validFreq) == 0:
                    t0_valid = time.time()
                    alphas, _ = sess.run(f_alpha, feed_dict={
                                            X: x,
                                            MASK: mask,
                                            CTX: ctx,
                                            CTX_PCA: ctx_pca,
                                            CTX_MASK: ctx_mask})
                    ratio = alphas.min(-1).mean()/(alphas.max(-1)).mean()
                    alphas_ratio.append(ratio)
                    np.savetxt(save_dir+'alpha_ratio.txt',alphas_ratio)

                    np.savez(save_dir+'model_current.npz', history_errs=history_errs)
                    saver.save(sess, save_dir+'model_current.ckpt')

                    sess.run(tf.assign(use_noise, False))

                    train_err = -1
                    train_perp = -1
                    valid_err = -1
                    valid_perp = -1
                    test_err = -1
                    test_perp = -1
                    if not debug:
                        # first compute train cost
                        if 0:
                            print 'computing cost on trainset'
                            train_err, train_perp = model.pred_probs(sess, engine, 'train', 
                                    f_log_probs, verbose=model_options['verbose'])
                        else:
                            train_err = 0.
                            train_perp = 0.
                        if 1:
                            print 'validating...'
                            valid_err, valid_perp = model.pred_probs(sess, engine, 'val',
                                    f_log_probs, verbose=model_options['verbose'])
                        else:
                            valid_err = 0.
                            valid_perp = 0.
                        if 0:
                            print 'testing...'
                            test_err, test_perp = model.pred_probs(sess, engine, 'test',
                                    f_log_probs, verbose=model_options['verbose'])
                        else:
                            test_err = 0.
                            test_perp = 0.
                    
                    mean_ranking = 0
                    blue_t0 = time.time()
                    scores, processes, queue, rqueue, shared_params = \
                        metrics.compute_score(sess=sess,
                        model_type='attention',
                        model_archive=None,
                        options=model_options,
                        engine=engine,
                        save_dir=save_dir,
                        beam=5, n_process=5,
                        whichset='both',
                        on_cpu=False,   
                        processes=processes, queue=queue, rqueue=rqueue,
                        shared_params=shared_params, metric=metric,
                        one_time=False,
                        f_init=f_init, f_next=f_next, model=model
                        )
                    '''
                     {'blue': {'test': [-1], 'valid': [77.7, 60.5, 48.7, 38.5, 38.3]},
                     'alternative_valid': {'Bleu_3': 0.40702270203174923,
                     'Bleu_4': 0.29276570520368456,
                     'CIDEr': 0.25247168210607884,
                     'Bleu_2': 0.529069629270047,
                     'Bleu_1': 0.6804308797115253,
                     'ROUGE_L': 0.51083584331688392},
                     'meteor': {'test': [-1], 'valid': [0.282787550236724]}}
                    '''
                    valid_B1 = scores['valid']['Bleu_1']
                    valid_B2 = scores['valid']['Bleu_2']
                    valid_B3 = scores['valid']['Bleu_3']
                    valid_B4 = scores['valid']['Bleu_4']
                    valid_Rouge = scores['valid']['ROUGE_L']
                    valid_Cider = scores['valid']['CIDEr']
                    valid_meteor = scores['valid']['METEOR']
                    test_B1 = scores['test']['Bleu_1']
                    test_B2 = scores['test']['Bleu_2']
                    test_B3 = scores['test']['Bleu_3']
                    test_B4 = scores['test']['Bleu_4']
                    test_Rouge = scores['test']['ROUGE_L']
                    test_Cider = scores['test']['CIDEr']
                    test_meteor = scores['test']['METEOR']
                    print 'computing meteor/blue score used %.4f sec, '\
                      'blue score: %.1f, meteor score: %.1f'%(
                    time.time()-blue_t0, valid_B4, valid_meteor)
                    history_errs.append([eidx, uidx, train_err, train_perp,
                                         valid_perp, test_perp,
                                         valid_err, test_err,
                                         valid_B1, valid_B2, valid_B3,
                                         valid_B4, valid_meteor, valid_Rouge, valid_Cider,
                                         test_B1, test_B2, test_B3,
                                         test_B4, test_meteor, test_Rouge, test_Cider])
                    np.savetxt(save_dir+'train_valid_test.txt',
                                  history_errs, fmt='%.3f')
                    print 'save validation results to %s'%save_dir
                    # save best model according to the best blue or meteor
                    if len(history_errs) > 1 and \
                      valid_B4 > np.array(history_errs)[:-1,11].max():
                        print 'Saving to %s...'%save_dir,
                        np.savez(
                            save_dir+'model_best_blue_or_meteor.npz',
                            history_errs=history_errs)
                        saver.save(sess, save_dir+'model_best_blue_or_meteor.ckpt') # DOUBT
                    if len(history_errs) > 1 and \
                      valid_err < np.array(history_errs)[:-1,6].min():
                        # best_p = utils.unzip(tparams) # DOUBT
                        bad_counter = 0
                        best_valid_err = valid_err
                        uidx_best_valid_err = uidx

                        print 'Saving to %s...'%save_dir,
                        np.savez(save_dir+'model_best_so_far.npz',
                                history_errs=history_errs)
                        saver.save(sess, save_dir+'model_best_so_far.ckpt')
                        utils.write_to_json(model_options, '%smodel_options.json'%save_dir)
                        print 'Done'
                    elif len(history_errs) > 1 and \
                        valid_err >= np.array(history_errs)[:-1,6].min():
                        bad_counter += 1
                        print 'history best ',np.array(history_errs)[:,6].min()
                        print 'bad_counter ',bad_counter
                        print 'patience ',patience
                        if bad_counter > patience:
                            print 'Early Stop!'
                            estop = True
                            break

                    if test_B4>0.52 and test_meteor>0.32:
                        print 'Saving to %s...'%save_dir,
                        np.savez(
                            save_dir+'model_'+str(uidx)+'.npz',
                            history_errs=history_errs)
                        saver.save(sess, save_dir+'model_'+str(uidx)+'.ckpt')

                    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err, \
                      'best valid err so far',best_valid_err
                    print 'valid took %.2f sec'%(time.time() - t0_valid)
                    # end of validation
                if debug:
                    break

            if estop:
                break
            if debug:
                break

            # end for loop over minibatches
            print 'This epoch has seen %d samples, train cost %.2f'%(
                n_samples, np.mean(train_costs))

        # end for loop over epochs
        print 'Optimization ended.'
        
        print 'stopped at epoch %d, minibatch %d, '\
          'current Train %.2f, current Valid %.2f, current Test %.2f '%(
              eidx,uidx,np.mean(train_err),np.mean(valid_err),np.mean(test_err))
        
        if history_errs != []:
            history = np.asarray(history_errs)
            best_valid_idx = history[:,6].argmin()
            np.savetxt(save_dir+'train_valid_test.txt', history, fmt='%.4f')
            print 'final best exp ', history[best_valid_idx]

        np.savez(
            save_dir+'model_train_end.npz',
            history_errs=history_errs)
        saver.save(sess, save_dir+'model_train_end.ckpt')
    return
Example #22
def gen_vocab(df, whichdata):
    if whichdata == "test":
        outfname = config.MURALI_MSVD_VID_CAPS_TEST_PATH
        dictsize = config.MURALI_TEST_VIDS
        capspath = config.MURALI_MSVD_CAPTIONS_TEST_PATH
    elif whichdata == "val":
        outfname = config.MURALI_MSVD_VID_CAPS_VAL_PATH
        dictsize = config.MURALI_VAL_VIDS
        capspath = None
        raise NotImplementedError()
    else:
        outfname = config.MURALI_MSVD_VID_CAPS_TRAIN_PATH
        dictsize = config.MURALI_TRAIN_VIDS
        capspath = config.MURALI_MSVD_CAPTIONS_TRAIN_PATH
    vocab = set()
    punct_dict = get_punctuations()
    translator = string.maketrans("", "")
    vid_caps_dict = {}
    omitted_caps = []
    for index in range(dictsize):
        vid_id = whichdata + "_" + str(index)
        descriptions = utils.read_file_to_list(capspath + str(index) +
                                               ".txt")[0].split("|")
        vid_caps = []
        for desc in descriptions:
            try:
                cap = desc.strip().encode('UTF-8')
                if len(cap) > 0:
                    vid_caps.append(cap)
            except Exception as e:
                # print vid_id, " : ", desc.strip()
                omitted_caps.append(vid_id + " : " + desc.strip())
        for vid_cap in vid_caps:
            tokens, _ = tokenize(vid_cap, punct_dict, translator)
            if (vid_id in vid_caps_dict):
                vid_caps_dict[vid_id].append(tokens)
            else:
                vid_caps_dict[vid_id] = [tokens]
            if whichdata == "train":
                vocab |= set(tokens)
    print("Non-ASCII captions omitted :" + str(len(omitted_caps)))
    utils.write_to_json(vid_caps_dict, outfname)
    print("Size of " + whichdata + " vid caps dict: " +
          str(len(vid_caps_dict)))
    assert len(vid_caps_dict) == dictsize
    if whichdata == "train":
        vocab_list = list(vocab)
        vocab_list.sort()
        vocab_dict = {
            vocab_list[index]: index + 2
            for index in range(len(vocab_list))
        }
        # vocab_dict['<bos>'] = 0
        vocab_dict['<eos>'] = 0
        vocab_dict['UNK'] = 1
        vocab_rev_dict = {
            index + 2: vocab_list[index]
            for index in range(len(vocab_list))
        }
        # vocab_rev_dict[0] = '<bos>'
        vocab_rev_dict[0] = '<eos>'
        vocab_rev_dict[1] = 'UNK'
        utils.write_to_json(vocab_dict, config.MURALI_MSVD_VOCAB_PATH)
        utils.write_to_pickle(vocab_rev_dict,
                              config.MURALI_MSVD_REVERSE_VOCAB_PATH)
        print("Size of Vocabulary: " + str(len(vocab)))
    return vocab, vid_caps_dict, omitted_caps
Example #23
    def _save(self):
        ''' save final data. data_lst, scrape_err_lst, parse_err_lst '''
        write_to_json(self.data_path, self.data_lst)
        write_to_json(self.scrape_err_path, self.scrape_err_lst)
        write_to_json(self.parse_err_path, self.parse_err_lst)
Example #24
def cityDic(places):
    geolocator = Nominatim(user_agent="specify_your_app_name_here")
    geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)
    place_dicts = []
    for place in places:
        place_dict = {"text":place, "address":"", "latitude":"", "longtitude":""}
        location = geocode(place)
        if location:
            place_dict["address"] = location.address
            point = tuple(location.point)
            place_dict["latitude"] = point[0]
            place_dict["longtitude"] = point[1]

        place_dicts.append(place_dict)
    return place_dicts

if __name__ == '__main__':
    args = get_args()
    data = load_from_json(args.data)

    place_tags = []
    # TODO : Process only sentences with label 1
    for sentence in data["sentences"]:
        places = geograpy.get_place_context(text=sentence)
        place_dicts = cityDic(places.cities) # Only cities ???
        place_tags.append(place_dicts)

    data["place_tags"] = place_tags
    write_to_json(data, data["id"], extension="json", out_dir=args.out_dir)
Example #25
            (SY.shift().rolling(window=3).mean())[25:-25])
        variables_dict['ASY4'].extend(
            (SY.shift().rolling(window=4).mean())[25:-25])
        variables_dict['ASY5'].extend(
            (SY.shift().rolling(window=5).mean())[25:-25])
        variables_dict['ASY6'].extend(
            (SY.shift().rolling(window=6).mean())[25:-25])
        variables_dict['ASY7'].extend(
            (SY.shift().rolling(window=7).mean())[25:-25])
        variables_dict['ASY8'].extend(
            (SY.shift().rolling(window=8).mean())[25:-25])
        variables_dict['ASY9'].extend(
            (SY.shift().rolling(window=9).mean())[25:-25])
        variables_dict['ASY10'].extend(
            (SY.shift().rolling(window=10).mean())[25:-25])
        variables_dict['ASY15'].extend(
            (SY.shift().rolling(window=15).mean())[25:-25])
        variables_dict['ASY20'].extend(
            (SY.shift().rolling(window=20).mean())[25:-25])
        variables_dict['ASY25'].extend(
            (SY.shift().rolling(window=25).mean())[25:-25])
print('Finished Downloading Data')

# Clean and save data
idx_to_remove = check_for_nan_elements(variables_dict,
                                       predictor_names,
                                       verbose=True)
remove_nan_elements(variables_dict, idx_to_remove, verbose=True)
scale_and_save(variables_dict, predictor_names, verbose=True)
write_to_json(variables_dict, verbose=True)
Example #26
# Be cautious when running this file!

from utils import read_from_json, write_to_json

jp = read_from_json("data/jp.json")
uk = read_from_json("data/uk.json")
us = read_from_json("data/us.json")

data = jp + uk + us

write_to_json(data, "data/data.json")