def load_data(self):
    print('loading {}-{} features'.format(self.dataset_name, self.cnn_name))
    self.train_data_ids = utils.read_file_to_list(self.train_data_ids_path)
    self.val_data_ids = utils.read_file_to_list(self.val_data_ids_path)
    self.test_data_ids = utils.read_file_to_list(self.test_data_ids_path)
    utils.shuffle_array(self.train_data_ids)
    utils.shuffle_array(self.val_data_ids)
    utils.shuffle_array(self.test_data_ids)
    self.train_data_ids = self.train_data_ids[:1]  # ONLY FOR DEBUG - REMOVE
    self.val_data_ids = self.val_data_ids[:1]
    self.test_data_ids = self.test_data_ids[:1]
    self.train_caps = utils.read_from_json(self.train_caps_path)
    self.val_caps = utils.read_from_json(self.val_caps_path)
    self.test_caps = utils.read_from_json(self.test_caps_path)
    self.vocab = utils.read_from_json(self.vocab_path)
    self.reverse_vocab = utils.read_from_pickle(self.reverse_vocab_path)
    self.vocab_size = len(self.vocab)
    if self.cnn_name in ['ResNet50', 'ResNet152', 'InceptionV3']:
        self.ctx_dim = 2048
    elif self.cnn_name in ['MURALI']:
        self.ctx_dim = 1024
    elif self.cnn_name in ['VGG19']:
        self.ctx_dim = 512
    else:
        raise NotImplementedError()
    self.train_ids = self.get_vid_ids(self.train_data_ids)
    self.val_ids = self.get_vid_ids(self.val_data_ids)
    self.test_ids = self.get_vid_ids(self.test_data_ids)
    self.kf_train = utils.generate_minibatch_idx(len(self.train_data_ids), self.mb_size_train)
    self.kf_val = utils.generate_minibatch_idx(len(self.val_data_ids), self.mb_size_test)  # TODO - verify test or val
    self.kf_test = utils.generate_minibatch_idx(len(self.test_data_ids), self.mb_size_test)
def main():
    events = utils.read_from_json("resources/events.json")
    languages_with_events = utils.read_from_json(
        "resources/languages_with_events.json"
    )  # get_languages_with_events(events)
    for language in languages:
        language = language.strip("\n")
        if language != "ar":
            continue
        if language not in language_links:
            continue
        event_representations = {}
        for event_type in events:
            for event in events[event_type]:
                if event not in languages_with_events:
                    continue
                representation, y = make_representation(
                    event_type, language, languages_with_events[event])
                event_rep = {'representation': representation, 'y': y}
                event_representations[event] = event_rep
                if y == 1:
                    print(language, event_representations[event])
        utils.save2json(event_representations, "data/data_%s.json" % language)
def main():
    argc = len(sys.argv)
    if argc == 1:
        print("Please specify regions")
        print("US, UK, JP")
        sys.exit(0)
    else:
        region = sys.argv[1]
        if region.lower() == 'us':
            data = read_from_json(
                "spotify-responses/us_spotify_responses.json")
            songs = data['data']
            write_duration_to_csv(songs, region.lower())
        elif region.lower() == 'uk':
            data = read_from_json(
                "spotify-responses/uk_spotify_responses.json")
            songs = data['data']
            write_duration_to_csv(songs, region.lower())
        elif region.lower() == "jp":
            data = read_from_json(
                "spotify-responses/jp_spotify_responses.json")
            songs = data['data']
            write_duration_to_csv(songs, region.lower())
        else:
            raise NotImplementedError
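# Usage sketch (script name is an assumption; the entry-point filename is not
# shown in this snippet): invoking the CLI above as
#     python durations.py us
# loads "spotify-responses/us_spotify_responses.json" and writes the US track
# durations to a region-specific CSV via write_duration_to_csv.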
def start(self):
    material_info_list = utils.read_from_json(Config.material_list_path)
    mf = MaterialFactory()
    for i in material_info_list:
        m = mf.get_material(i)
        print("create material: ", m)
        self.material_list.append(m)
    reaction_info_list = utils.read_from_json(Config.reaction_list_path)
    rf = ReactionFactory()
    for i in reaction_info_list:
        r = rf.get_reaction(i)
        print("create reaction: ", r)
        self.reaction_list.append(r)
def download_us_mp3():
    data = read_from_json('us_spotify_responses.json')
    songs = data['data']
    n = 100
    has_no_preview_url = 0
    for i, song in enumerate(songs[:n]):
        preview_url = song['preview_url']
        if preview_url is None:
            print(i + 1, song['name'], "by", song['artists'])
            has_no_preview_url += 1
            continue
        else:
            file_name = str(i + 1) + "-" + '-'.join(
                song['name'].split(' ')) + '.mp3'
            file_path = "tracks/us/"
            full_path = file_path + file_name
            r = requests.get(preview_url, stream=True)
            # Taken from
            # http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
            with open(full_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        # f.flush() commented by recommendation from J.F.Sebastian
    print("Total songs that have no preview_url:", has_no_preview_url)
def main():
    for filename in sorted(glob.glob(directory)):
        language = filename.split("/")[-1].split(".")[0]
        if args.language:
            if language != args.language:
                continue
        print(f"\nLanguage:\t{language}")
        if args.check_os == "y":
            if os.path.isfile(f"data/covid19/processed/{language}.json"):
                print(f"{language} has already been analyzed, moving on...")
                continue
        if language not in set(countries_per_language.keys()):
            continue
        input_data = utils.read_from_json(filename)
        output_data = {}
        days = sorted(input_data.keys())
        nr_days = len(days)
        if nr_days < 10:
            continue
        previous_links = []
        previous_links_locations = {}
        previous_references = []
        previous_references_locations = {}
        references_origins = Counter()
        links_origins = Counter()
        for n, day in enumerate(days):
            print("Processing day %s of %s:\t%s" % (n, nr_days, day))
            timestamps_output = {"links": {}, "references": {}}
            links = sorted(input_data[day]["links"])
            references = sorted(input_data[day]["references"])
            links_countries = get_links_locations(
                links, previous_links, previous_links_locations, language)  # dict
            timestamps_output["links"] = links_countries
            previous_links = links
            previous_links_locations = links_countries
            references_countries = get_reference_locations(
                references, previous_references, previous_references_locations)
            timestamps_output["references"] = references_countries
            previous_references = references
            previous_references_locations = references_countries
            #print("Completed day %s of %s" % (n, nr_days))
            #print(timestamps_output, "\n\n")
            output_data[day] = timestamps_output
        utils.save_to_json(language, "processed", output_data)
def prepare_data_ids(vid_caps_path, ids_save_path):
    vid_caps_dict = utils.read_from_json(vid_caps_path)
    data_ids = []
    for vid_caps in vid_caps_dict.items():
        vid_id = vid_caps[0]
        if vid_id[-4:] == ".avi":
            vid_id = vid_id[:-4]
        for seq_id in range(len(vid_caps[1])):
            data_id = vid_id + "|" + str(seq_id)
            data_ids.append(data_id)
    utils.write_list_to_file(ids_save_path, data_ids)
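# Illustrative sketch (paths hypothetical): for a captions file that maps
# "vid1.avi" to two captions, prepare_data_ids strips the ".avi" suffix and
# writes one id per caption index, i.e. the lines "vid1|0" and "vid1|1".
# prepare_data_ids("data/train_caps.json", "data/train_ids.txt")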
def load_backup_file(path):
    cluster = u.read_from_json(u.join_path(path, "cluster.json"))
    ct = u.read_from_json(u.join_path(path, "clusterTemplate.json"))
    ngs = u.read_from_json(u.join_path(path, "node_groups.json"))
    flavors = u.read_from_json(u.join_path(path, "flavors.json"))
    key_pair = u.read_from_json(u.join_path(path, "keypair.json"))
    sgs = u.read_from_json(u.join_path(path, "security_groups.json"))
    images = u.read_from_json(u.join_path(path, "images.json"))
    return (cluster, ct, ngs['node_groups'], flavors['flavors'], key_pair,
            sgs['security_groups'], images['images'])
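# Minimal usage sketch (backup directory is hypothetical): the function returns
# the raw cluster and template dicts plus the unwrapped lists for node groups,
# flavors, security groups and images, in that order.
# cluster, ct, node_groups, flavors, key_pair, security_groups, images = \
#     load_backup_file("/backups/cluster-01")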
def write_zero_crossing_rate_to_json(tracks, region):
    json_file = {
        "us": "data/us.json",
        "uk": "data/uk.json",
        "jp": "data/jp.json"
    }
    selected_json = json_file[region]
    input_json = read_from_json(selected_json)
    files_zero_crossing_rate = get_files_zero_crossing_rate(tracks)
    for obj in input_json:
        position = obj['position']
        obj['zero_crossing_rate'] = files_zero_crossing_rate[position].tolist()
    # write_to_json(input_json, selected_json)
    json.dump(input_json,
              codecs.open(selected_json, 'w', encoding='utf-8'),
              separators=(',', ':'),
              sort_keys=True,
              indent=2)  # this saves the array in .json format
def train_util(params):
    save_dir = params['save_dir']
    print('current save dir : ' + save_dir)
    utils.create_dir_if_not_exist(save_dir)
    reload_model = params['reload_model']
    if reload_model:
        print('preparing reload')
        save_dir_backup = params['save_dir']
        from_dir_backup = params['from_dir']
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print('save dir ', save_dir_backup)
        print('from_dir ', from_dir_backup)
        print('setting current model config with the old one')
        model_config_old = utils.read_from_json(from_dir_backup + 'model_config.json')
        model_config_old['reload_model'] = True
        model_config_old['save_dir'] = params['save_dir']
        model_config_old['from_dir'] = params['from_dir']
        model_config_old['max_epochs'] = params['max_epochs']
        model_config_old['dispFreq'] = params['dispFreq']
        model_config_old['sampleFreq'] = params['sampleFreq']
        model_config_old['validFreq'] = params['validFreq']
        model_config_old['debug'] = params['debug']
        params = model_config_old
        feats_dir = params['feats_dir']
    elif params['cnn_name'] != "MURALI":
        feats_dir = params['feats_dir'] + params['cnn_name'] + "_kmeans3/"
    else:
        feats_dir = params['feats_dir']
    print('feats dir : ' + feats_dir)
    params['feats_dir'] = feats_dir
    config_save_path = save_dir + "model_config.json"
    print('saving model config into %s' % config_save_path)
    utils.write_to_json(params, config_save_path)
    t0 = time.time()
    print('training an attention model')
    train(params, **params)
    print('training time in total %.4f sec' % (time.time() - t0))
def main():
    for filename in sorted(glob.glob(input_directory)):
        language = filename.split("/")[-1].split(".")[0]
        if args.language:
            if language != args.language:
                continue
        print("\nLanguage:\t", language)
        if args.check_os == "y":
            if os.path.isfile(f"data/weekly/{language}.png"):
                print(f"{language} has already been processed, moving on...")
                continue
        input_data = utils.read_from_json(filename)
        day_data = get_day_data(input_data)
        week_data = get_week_data(day_data)
        utils.save_to_json(language, "weekly", week_data)
    print("done")
def shuffle_from_file(dir):
    res = list(utils.read_from_json(dir))
    random.shuffle(res)
    return res
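# Usage sketch (path hypothetical): read a JSON list of ids and return a copy
# in random order, e.g. when building a shuffled training split.
# train_ids = shuffle_from_file("data/train_ids.json")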
parser.add_argument("--check_os", default="y")
args = parser.parse_args()

directory = "data/covid19/weekly/*.json"
#geolocator = Nominatim(user_agent="LocalGlobal")
inferrer = gpinfer.LogisticInferrer()


def make_countries_list():
    countries = set()
    for list_of_countries in list(countries_per_language.values()):
        for country in list_of_countries:
            countries.add(country)
    return countries


countries_per_language = utils.read_from_json("resources/countries_per_language.json")
countries = make_countries_list()

country_nicknames = {
    "people's republic of china": "china",
    "kingdom of denmark": "denmark",
    "kingdom of the netherlands": "the netherlands",
    "united states of america": "united states",
    "usa": "united states",
    "uk": "united kingdom",
    "great britain": "united kingdom"
}

links_locations_holder = {}
references_locations_holder = {}
        vid_urls.append(vid_url)
    assert len(set(vid_ids)) == MSRVTT_TOTAL_VIDS
    print("urls#:", len(set(vid_urls)), '/', len(vid_urls))
    url_ydl_map = OrderedDict()
    count = 0
    success = 0
    fail = 0
    for url in url_ids_map:
        ydl_url, status = get_youtube_url(url)
        url_ydl_map[url] = {"ydl_url": ydl_url, "status": status}
        if status == "Success":
            success += 1
        else:
            fail += 1
        count = count + 1
        print(success, "/", count, " ", fail, "/", count)
    url_ydl_map["#success"] = success
    url_ydl_map["#fail"] = fail
    url_ydl_map["#count"] = count
    utils.write_to_json(url_ids_map, MSRVTT_DIR + "urls_vidids_map.json")
    utils.write_to_json(url_ydl_map, MSRVTT_DIR + "urls_ydl_map.json")


if __name__ == '__main__':
    print('loading json data...')
    data = utils.read_from_json(MSRVTT_JSON_DATA_PATH)
    videos = data['videos']
    assert len(videos) == MSRVTT_TOTAL_VIDS
    map_url_with_ids(videos)
# Be cautious when running this file!!!
from utils import read_from_json, write_to_json

jp = read_from_json("data/jp.json")
uk = read_from_json("data/uk.json")
us = read_from_json("data/us.json")

data = jp + uk + us
write_to_json(data, "data/data.json")
import sys

import utils

if __name__ == "__main__":
    big_data = set()
    for file in sys.argv[1:]:
        print(len(utils.read_from_json(file)))
        big_data |= set(utils.read_from_json(file))
    print(len(big_data))
    utils.write_messeges_to_json("big_data.json", big_data)
""" import utils import numpy as np import matplotlib.pyplot as plt from sklearn import svm, metrics, model_selection languages = sorted( ["ar"] ) #"fr", "da", "sv", "nb", "nl", "de", "is"]) #sorted(open("resources/wikipedia_LVs.txt").readlines()) for language in languages: f1s = [] accuracies = [] precisions = [] recalls = [] data = utils.read_from_json("data/data_%s.json" % language) X = [data[dp]["representation"] for dp in data] y = [data[dp]["y"] for dp in data] true_pos = 0 false_pos = 0 true_neg = 0 false_neg = 0 cross_val = 10 for i in range(cross_val): X_train, X_test, y_train, y_test = model_selection.train_test_split( X, y, test_size=0.20) weight = 2
import pickle
from utils import save_as_json, read_from_json
import pdb

out_dir = 'data/output/blocs/'
out_filename = 'dictionary_blocs.json'
blocs = ['investment_blocs_2020.json']

dict_blocs = {}
for bloc in blocs:
    bloc_name = bloc.replace('.json', '')
    this_bloc_dict = read_from_json(out_dir, bloc)
    dict_blocs[bloc_name] = this_bloc_dict

save_as_json(dict_blocs, out_dir, out_filename)

# check
parsed_dict_blocs = read_from_json(out_dir, out_filename)
pdb.set_trace()
for event in events[type]:
    # get all languages that have that event
    event_languages = get_event_languages(event)
    for language in languages:
        # page already exists
        if language in event_languages:
            continue
        event_probability = calculate_event_probability(lang, event, type, event_languages)
        if event_probability > threshold:
            recommended_pages[lang].append(event)
"""

languages = sorted(open("resources/wikipedia_LVs.txt").readlines())  # sorted(["fr", "da", "sv", "nb", "nl", "de", "is"])
#
event_type_distributions = utils.read_from_json(
    "resources/event_distributions.json")
language_links = utils.read_from_json("resources/language_links.json")


def get_events_from_type():
    events_query = """
    SELECT ?subtype ?subtypeLabel ?type ?typeLabel  #(COUNT(?x) AS ?cnt)
    WHERE
    {
        ?type wdt:P279 wd:Q1656682 .
        ?subtype wdt:P31 ?type .
        SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
    }
    """