Example #1
    def load_data(self):
        print('loading {}-{} features'.format(self.dataset_name, self.cnn_name))
        self.train_data_ids = utils.read_file_to_list(self.train_data_ids_path)
        self.val_data_ids = utils.read_file_to_list(self.val_data_ids_path)
        self.test_data_ids = utils.read_file_to_list(self.test_data_ids_path)
        utils.shuffle_array(self.train_data_ids)
        utils.shuffle_array(self.val_data_ids)
        utils.shuffle_array(self.test_data_ids)
        self.train_data_ids = self.train_data_ids[:1]   # ONLY FOR DEBUG - REMOVE
        self.val_data_ids = self.val_data_ids[:1]
        self.test_data_ids = self.test_data_ids[:1]
        self.train_caps = utils.read_from_json(self.train_caps_path)
        self.val_caps = utils.read_from_json(self.val_caps_path)
        self.test_caps = utils.read_from_json(self.test_caps_path)
        self.vocab = utils.read_from_json(self.vocab_path)
        self.reverse_vocab = utils.read_from_pickle(self.reverse_vocab_path)
        self.vocab_size = len(self.vocab)
        if self.cnn_name in ['ResNet50', 'ResNet152', 'InceptionV3']:
            self.ctx_dim = 2048
        elif self.cnn_name in ['MURALI']:
            self.ctx_dim = 1024
        elif self.cnn_name in ['VGG19']:
            self.ctx_dim = 512
        else:
            raise NotImplementedError()
        self.train_ids = self.get_vid_ids(self.train_data_ids)
        self.val_ids = self.get_vid_ids(self.val_data_ids)
        self.test_ids = self.get_vid_ids(self.test_data_ids)
        self.kf_train = utils.generate_minibatch_idx(len(self.train_data_ids), self.mb_size_train)
        self.kf_val = utils.generate_minibatch_idx(len(self.val_data_ids), self.mb_size_test)   # TODO - verify test or val
        self.kf_test = utils.generate_minibatch_idx(len(self.test_data_ids), self.mb_size_test)
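Every example on this page calls a read_from_json helper from the respective project's utils module instead of using the json standard library directly. The implementations are not shown and differ between projects (one later example passes a directory and a filename separately), but a minimal sketch of the single-path form, assuming it simply wraps json.load, looks like this:

import json

def read_from_json(path):
    # Assumed behavior: open the file and return the parsed JSON object.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)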
Example #2
def main():

    events = utils.read_from_json("resources/events.json")
    languages_with_events = utils.read_from_json(
        "resources/languages_with_events.json"
    )  #get_languages_with_events(events)

    for language in languages:
        language = language.strip("\n")
        if language != "ar": continue
        if language not in language_links: continue

        event_representations = {}

        for event_type in events:
            for event in events[event_type]:
                if event not in languages_with_events:
                    continue

                representation, y = make_representation(
                    event_type, language, languages_with_events[event])
                event_rep = {'representation': representation, 'y': y}
                event_representations[event] = event_rep
                if y == 1:
                    print(language, event_representations[event])

        utils.save2json(event_representations, "data/data_%s.json" % language)
Example #3
def main():
    argc = len(sys.argv)
    if argc == 1:
        print("Please specify regions")
        print("US, UK, JP")
        sys.exit(0)
    else:
        region = sys.argv[1]
        if region.lower() == 'us':
            data = read_from_json(
                "spotify-responses/us_spotify_responses.json")
            songs = data['data']
            write_duration_to_csv(songs, region.lower())
        elif region.lower() == 'uk':
            data = read_from_json(
                "spotify-responses/uk_spotify_responses.json")
            songs = data['data']
            write_duration_to_csv(songs, region.lower())
        elif region.lower() == "jp":
            data = read_from_json(
                "spotify-responses/jp_spotify_responses.json")
            songs = data['data']
            write_duration_to_csv(songs, region.lower())
        else:
            raise NotImplementedError
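The three branches above differ only in the input filename. A table-driven sketch of the same logic (reusing read_from_json and write_duration_to_csv from the example; this is an alternative, not the source's code):

import sys

# Map each supported region to its response file and dispatch once.
RESPONSE_FILES = {
    "us": "spotify-responses/us_spotify_responses.json",
    "uk": "spotify-responses/uk_spotify_responses.json",
    "jp": "spotify-responses/jp_spotify_responses.json",
}

def main():
    if len(sys.argv) == 1:
        print("Please specify regions")
        print("US, UK, JP")
        sys.exit(0)
    region = sys.argv[1].lower()
    if region not in RESPONSE_FILES:
        raise NotImplementedError(region)
    songs = read_from_json(RESPONSE_FILES[region])['data']
    write_duration_to_csv(songs, region)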
Example #4
    def start(self):
        material_info_list = utils.read_from_json(Config.material_list_path)
        mf = MaterialFactory()
        for i in material_info_list:
            m = mf.get_material(i)
            print("create material: ", m)
            self.material_list.append(m)

        reaction_info_list = utils.read_from_json(Config.reaction_list_path)
        rf = ReactionFactory()
        for i in reaction_info_list:
            r = rf.get_reaction(i)
            print("create reaction: ", r)
            self.reaction_list.append(r)
Example #5
def download_us_mp3():
    data = read_from_json('us_spotify_responses.json')
    songs = data['data']
    n = 100
    has_no_preview_url = 0
    for i, song in enumerate(songs[:n]):
        preview_url = song['preview_url']
        if preview_url is None:
            print(i + 1, song['name'], "by", song['artists'])
            has_no_preview_url += 1
            continue
        else:
            file_name = str(i + 1) + "-" + '-'.join(
                song['name'].split(' ')) + '.mp3'
            file_path = "tracks/us/"
            full_path = file_path + file_name
            r = requests.get(preview_url, stream=True)
            # Taken from
            # http://stackoverflow.com/questions/16694907/how-to-download-large-file-in-python-with-requests-py
            with open(full_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        # f.flush() commented by recommendation from J.F.Sebastian
    print("Total songs that has no preview_url:", has_no_preview_url)
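A note on the download loop: the response from requests.get is never closed. requests responses can be used as context managers, which releases the connection even if writing fails; a sketch of the same download step with that change and an explicit status check (download_preview is a hypothetical helper name, and the timeout value is an arbitrary choice):

import requests

def download_preview(preview_url, full_path):
    # Stream the preview to disk; raise_for_status surfaces HTTP errors early.
    with requests.get(preview_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(full_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive chunks
                    f.write(chunk)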
Example #6
def main():

	for filename in sorted(glob.glob(directory)):

		language = filename.split("/")[-1].split(".")[0]
		if args.language:
			if language != args.language: continue

		print(f"\nLanguage:\t{language}")

		if args.check_os == "y":
			if os.path.isfile(f"data/covid19/processed/{language}.json"):
				print(f"{language} has already been analyzed, moving on...")
				continue
	
		if language not in set(countries_per_language.keys()): 
			continue

		input_data = utils.read_from_json(filename)
		output_data = {}
		days = sorted(input_data.keys())
		nr_days = len(days)
		if nr_days < 10: continue

		previous_links = []
		previous_links_locations = {}
		previous_references = []
		previous_references_locations = {}

		references_origins = Counter()
		links_origins = Counter()

		for n, day in enumerate(days):
			print("Processing day %s of %s:\t%s" % (n, nr_days, day))

			timestamps_output = {
				"links": {},
				"references": {}
			}

			links = sorted(input_data[day]["links"])
			references = sorted(input_data[day]["references"])

			links_countries = get_links_locations(links, previous_links, previous_links_locations, language) # dict
			timestamps_output["links"] = links_countries
			previous_links = links
			previous_links_locations = links_countries

			references_countries = get_reference_locations(references, previous_references, previous_references_locations)
			timestamps_output["references"] = references_countries
			previous_references = references
			previous_references_locations = references_countries
	
			#print("Completed day %s of %s" % (n, nr_days))
			#print(timestamps_output, "\n\n")
			output_data[day] = timestamps_output
		utils.save_to_json(language, "processed", output_data)
Example #7
def prepare_data_ids(vid_caps_path, ids_save_path):
    vid_caps_dict = utils.read_from_json(vid_caps_path)
    data_ids = []
    for vid_id, caps in vid_caps_dict.items():
        if vid_id.endswith(".avi"):
            vid_id = vid_id[:-4]
        for seq_id in range(len(caps)):
            data_id = vid_id + "|" + str(seq_id)
            data_ids.append(data_id)
    utils.write_list_to_file(ids_save_path, data_ids)
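Each id written above has the form "<video id>|<caption index>". A hypothetical consumer (not part of the source) can recover both parts with a single split:

vid_id, seq_id = "vid1547|3".split("|")
seq_id = int(seq_id)  # caption index as an integer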
Example #8
def load_backup_file(path):
    cluster = u.read_from_json(u.join_path(path, "cluster.json"))
    ct = u.read_from_json(u.join_path(path, "clusterTemplate.json"))
    ngs = u.read_from_json(u.join_path(path, "node_groups.json"))
    flavors = u.read_from_json(u.join_path(path, "flavors.json"))
    key_pair = u.read_from_json(u.join_path(path, "keypair.json"))
    sgs = u.read_from_json(u.join_path(path, "security_groups.json"))
    images = u.read_from_json(u.join_path(path, "images.json"))
    return (cluster, ct, ngs['node_groups'], flavors['flavors'], key_pair,
            sgs['security_groups'], images['images'])
Example #9
def write_zero_crossing_rate_to_json(tracks, region):
    json_file = {
        "us": "data/us.json",
        "uk": "data/uk.json",
        "jp": "data/jp.json"
    }
    selected_json = json_file[region]
    input_json = read_from_json(selected_json)
    files_zero_crossing_rate = get_files_zero_crossing_rate(tracks)
    for obj in input_json:
        position = obj['position']
        obj['zero_crossing_rate'] = files_zero_crossing_rate[position].tolist()
    # write_to_json(input_json, selected_json)
    json.dump(input_json,
              codecs.open(selected_json, 'w', encoding='utf-8'),
              separators=(',', ':'),
              sort_keys=True,
              indent=2)  # this saves the array in .json format
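The handle returned by codecs.open above is never closed. A sketch of the same write using a plain context manager, with identical json.dump arguments (selected_json and input_json as defined in the example):

with open(selected_json, 'w', encoding='utf-8') as f:
    json.dump(input_json, f,
              separators=(',', ':'),
              sort_keys=True,
              indent=2)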
Example #10
def train_util(params):
    save_dir = params['save_dir']
    print('current save dir : ' + save_dir)
    utils.create_dir_if_not_exist(save_dir)

    reload_model = params['reload_model']
    if reload_model:
        print('preparing reload')
        save_dir_backup = params['save_dir']
        from_dir_backup = params['from_dir']
        # never start retrain in the same folder
        assert save_dir_backup != from_dir_backup
        print('save dir ', save_dir_backup)
        print('from_dir ', from_dir_backup)
        print('setting current model config with the old one')
        model_config_old = utils.read_from_json(from_dir_backup +
                                                'model_config.json')
        model_config_old['reload_model'] = True
        model_config_old['save_dir'] = params['save_dir']
        model_config_old['from_dir'] = params['from_dir']
        model_config_old['max_epochs'] = params['max_epochs']
        model_config_old['dispFreq'] = params['dispFreq']
        model_config_old['sampleFreq'] = params['sampleFreq']
        model_config_old['validFreq'] = params['validFreq']
        model_config_old['debug'] = params['debug']
        params = model_config_old
        feats_dir = params['feats_dir']
    elif params['cnn_name'] != "MURALI":
        feats_dir = params['feats_dir'] + params['cnn_name'] + "_kmeans3/"
    else:
        feats_dir = params['feats_dir']
    print('feats dir : ' + feats_dir)
    params['feats_dir'] = feats_dir

    config_save_path = save_dir + "model_config.json"
    print('saving model config into %s' % config_save_path)
    utils.write_to_json(params, config_save_path)

    t0 = time.time()
    print('training an attention model')
    train(params, **params)
    print('training time in total %.4f sec' % (time.time() - t0))
Example #11
def main():
    for filename in sorted(glob.glob(input_directory)):

        language = filename.split("/")[-1].split(".")[0]
        if args.language:
            if language != args.language: continue

        print("\nLanguage:\t", language)

        if args.check_os == "y":
            if os.path.isfile(f"data/weekly/{language}.png"):
                print(f"{language} has already been processed, moving on...")
                continue

        input_data = utils.read_from_json(filename)

        day_data = get_day_data(input_data)
        week_data = get_week_data(day_data)
        utils.save_to_json(language, "weekly", week_data)
        print("done")
Example #12
def shuffle_from_file(dir):
    res = list(utils.read_from_json(dir))
    random.shuffle(res)
    return res
Example #13
parser.add_argument("--check_os", default="y")

args = parser.parse_args()

directory = "data/covid19/weekly/*.json"
#geolocator = Nominatim(user_agent="LocalGlobal")
inferrer = gpinfer.LogisticInferrer()

def make_countries_list():
	countries = set()
	for list_of_countries in list(countries_per_language.values()):
		for country in list_of_countries:
			countries.add(country)
	return countries

countries_per_language = utils.read_from_json("resources/countries_per_language.json")
countries = make_countries_list()

country_nicknames = {
	"people's republic of china": "china",
	"kingdom of denmark": "denmark",
	"kingdom of the netherlands": "the netherlands",
	"united states of america": "united states",
	"usa": "united states",
	"uk": "united kingdom",
	"great britain": "united kingdom"
}

links_locations_holder = {}
references_locations_holder = {}
Example #14
		vid_urls.append(vid_url)
	assert len(set(vid_ids))==MSRVTT_TOTAL_VIDS
	print("urls#:", len(set(vid_urls)), '/', len(vid_urls))
	url_ydl_map = OrderedDict()
	count = 0
	success = 0
	fail = 0
	for url in url_ids_map:
		ydl_url, status = get_youtube_url(url)
		url_ydl_map[url] = {
			"ydl_url": ydl_url,
			"status": status
		}
		if status=="Success":
			success += 1
		else:
			fail += 1
		count = count + 1
	print(success, "/", count, " ", fail, "/", count)
	url_ydl_map["#success"] = success
	url_ydl_map["#fail"] = fail
	url_ydl_map["#count"] = count
	utils.write_to_json(url_ids_map, MSRVTT_DIR+"urls_vidids_map.json")
	utils.write_to_json(url_ydl_map, MSRVTT_DIR+"urls_ydl_map.json")

if __name__ == '__main__':
	print('loading json data...')
	data = utils.read_from_json(MSRVTT_JSON_DATA_PATH)
	videos = data['videos']
	assert len(videos)==MSRVTT_TOTAL_VIDS
	map_url_with_ids(videos)
Example #15
# Be cautious when running this file!!!

from utils import read_from_json, write_to_json

jp = read_from_json("data/jp.json")
uk = read_from_json("data/uk.json")
us = read_from_json("data/us.json")

data = jp + uk + us

write_to_json(data, "data/data.json")
Example #16
import sys
import utils

if __name__ == "__main__":
    big_data = set()
    for file in sys.argv[1:]:
        print(len(utils.read_from_json(file)))
        big_data |= set(utils.read_from_json(file))
        print(len(big_data))
    utils.write_messeges_to_json("big_data.json", big_data)
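big_data is a Python set, which json cannot serialize directly, so write_messeges_to_json presumably converts it first. A minimal sketch of such a helper, assuming it sorts the set into a list before dumping; the name is taken from the call above but the body is an assumption:

import json

def write_messeges_to_json(path, messages):
    # Assumption: turn the set into a sorted list so json can serialize it.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(sorted(messages), f, ensure_ascii=False, indent=2)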
Example #17
"""
import utils
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, metrics, model_selection

languages = sorted(
    ["ar"]
)  #"fr", "da", "sv", "nb", "nl", "de", "is"]) #sorted(open("resources/wikipedia_LVs.txt").readlines())

for language in languages:
    f1s = []
    accuracies = []
    precisions = []
    recalls = []
    data = utils.read_from_json("data/data_%s.json" % language)

    X = [data[dp]["representation"] for dp in data]
    y = [data[dp]["y"] for dp in data]

    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0

    cross_val = 10
    for i in range(cross_val):
        X_train, X_test, y_train, y_test = model_selection.train_test_split(
            X, y, test_size=0.20)

        weight = 2
import pickle
from utils import save_as_json, read_from_json
import pdb
out_dir = 'data/output/blocs/'
out_filename = 'dictionary_blocs.json'
blocs = ['investment_blocs_2020.json']

dict_blocs = {}
for bloc in blocs:
    bloc_name = bloc.replace('.json', '')
    this_bloc_dict = read_from_json(out_dir, bloc)
    dict_blocs[bloc_name] = this_bloc_dict

save_as_json(dict_blocs, out_dir, out_filename)

# check
parsed_dict_blocs = read_from_json(out_dir, out_filename)
pdb.set_trace()
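This snippet's read_from_json takes a directory and a filename as separate arguments (read_from_json(out_dir, bloc)). A minimal sketch of that variant, assuming it simply joins the two and parses the file; the project's actual helper may differ:

import json
import os

def read_from_json(directory, filename):
    # Assumed behavior: join directory and filename, then parse the JSON file.
    with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
        return json.load(f)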
	for event in events[type]:
		# get all languages that have that event
		event_languages = get_event_languages(event)

		for language in languages:
			# page already exists
			if language in event_languages: continue
			event_probability = calculate_event_probability(lang, event, type, event_languages)
			if event_probability > threshold:
				recommended_pages[lang].append(event)

"""

languages = sorted(open("resources/wikipedia_LVs.txt").readlines()
                   )  #sorted(["fr", "da", "sv", "nb", "nl", "de", "is"]) #
event_type_distributions = utils.read_from_json(
    "resources/event_distributions.json")
language_links = utils.read_from_json("resources/language_links.json")


def get_events_from_type():
    events_query = """
		SELECT ?subtype ?subtypeLabel ?type ?typeLabel #(COUNT(?x) AS ?cnt)
		WHERE
		{
			?type wdt:P279 wd:Q1656682 .
			?subtype wdt:P31 ?type .
			SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
		} 

		"""