def run_apply_weighting_command(self):
    """Load JSON from the file-path field and apply weighting to the transform.

    Returns False (after a warning) when the file yields no usable data;
    otherwise applies the weighting and returns None.
    """
    payload = methods.load_json(self.file_path_field.getText())
    if not payload:
        self.warning('Data is invalid')
        return False
    methods.apply_weighting(self.transform, data=payload)
def run_apply_weighting_command(self):
    """Apply weighting to the transform using JSON read from the path field.

    Warns and returns False if the loaded data is empty/invalid.
    """
    loaded = methods.load_json(self.file_path_field.getText())
    if loaded:
        methods.apply_weighting(self.transform, data=loaded)
    else:
        self.warning('Data is invalid')
        return False
def annotate_data(source_file, annotations_file, tag=False, compounds=False):
    """Tokenize (and optionally tag / extract compounds from) COCO captions.

    Loads entries from source_file, adds a 'tokenized' field to each entry
    (plus 'tagged' and/or 'compounds' when requested), saves the result to
    annotations_file, and returns the annotated list.
    """
    entries = load_json(source_file)
    for item in entries:
        doc = nlp.tokenizer(item['caption'])
        item['tokenized'] = [token.orth_ for token in doc]
        if tag:
            # Run the tagger in place so each token carries a POS tag.
            nlp.tagger(doc)
            item['tagged'] = [(token.orth_, token.tag_) for token in doc]
        if compounds:
            item['compounds'] = compounds_from_doc(doc)
    save_json(entries, annotations_file)
    return entries
def coverage(name, target):
    """
    Compute coverage for a specific system.

    Agnostic to whether `target` is the entire Val vocabulary or only the
    set of learnable types.
    """
    stats = load_json('./Data/Systems/' + name + '/Val/stats.json')
    generated = set(stats['types'])
    hits = generated & target
    return {
        "recalled": hits,
        "score": len(hits) / len(target),
        "not_in_val": generated - target
    }
def run_all(args):
    """Run all metrics on the data and save JSON files with the results.

    Writes four outputs: overall stats (args.stats_file), global recall
    (args.global_coverage_file), local recall (args.local_coverage_file)
    and noun/PP data (args.noun_pp_file).
    """
    # Annotate generated data.
    annotated = annotate_data(args.source_file,
                              args.annotations_file,
                              tag=True,
                              compounds=True)
    # Load training data. (For computing novelty.)
    train_data = load_json('./Data/COCO/Processed/tokenized_train2014.json')
    train_descriptions = [
        entry['caption'] for entry in train_data['annotations']
    ]
    # Load annotated data.
    sentences = sentences_from_file(args.annotations_file)
    # Analyze the data.
    stats = system_stats(sentences)
    # Get raw descriptions.
    gen_descriptions = [
        entry['caption'] for entry in load_json(args.source_file)
    ]
    extra_stats = sentence_stats(train_descriptions, gen_descriptions)
    stats.update(extra_stats)
    # Save statistics data.
    save_json(stats, args.stats_file)

    ################################
    # Global recall
    train_stats = load_json('./Data/COCO/Processed/train_stats.json')
    val_stats = load_json('./Data/COCO/Processed/val_stats.json')
    train = set(train_stats['types'])
    val = set(val_stats['types'])
    learnable = train & val
    gen = set(stats['types'])
    recalled = gen & val
    # NOTE: renamed from `coverage` so the local dict no longer shadows the
    # module-level coverage() helper.
    global_coverage = {
        "recalled": recalled,
        "score": len(recalled) / len(learnable),
        "not_in_val": gen - learnable
    }
    global_coverage['omissions'] = most_frequent_omissions(
        global_coverage['recalled'],
        val_stats,  # Use validation set as reference.
        n=None)
    val_count_list = get_count_list(val_stats)
    global_coverage['percentiles'] = percentiles(val_count_list, recalled)
    save_json(global_coverage, args.global_coverage_file)

    ####################################
    # Local recall
    val_index = index_from_file('./Data/COCO/Processed/tagged_val2014.json',
                                tagged=True,
                                lower=True)
    generated = {entry['image_id']: entry['tokenized'] for entry in annotated}
    local_recall_res = dict(scores=local_recall_scores(generated, val_index),
                            counts=local_recall_counts(generated, val_index))
    save_json(local_recall_res, args.local_coverage_file)

    ##################################
    # Nouns pps
    npdata = {
        'pp_data': pp_stats(annotated),
        'compound_data': compound_stats(annotated)
    }
    save_json(npdata, args.noun_pp_file)
if __name__ == "__main__":
    # Folder name -> display label for each captioning system under comparison.
    system2label = {
        'Dai-et-al-2017': 'Dai et al. 2017',
        'Liu-et-al-2017': 'Liu et al. 2017',
        'Mun-et-al-2017': 'Mun et al. 2017',
        'Shetty-et-al-2016': 'Shetty et al. 2016',
        'Shetty-et-al-2017': 'Shetty et al. 2017',
        'Tavakoli-et-al-2017': 'Tavakoli et al. 2017',
        'Vinyals-et-al-2017': 'Vinyals et al. 2017',
        'Wu-et-al-2016': 'Wu et al. 2016',
        'Zhou-et-al-2017': 'Zhou et al. 2017'
    }
    # Deterministic color per system: sorted keys zipped against the palette.
    # NOTE(review): assumes my_palette holds at least len(system2label) colors
    # — confirm where my_palette is defined.
    system2color = dict(zip(sorted(system2label), my_palette))

    train_stats = load_json('./Data/COCO/Processed/train_stats.json')
    val_stats = load_json('./Data/COCO/Processed/val_stats.json')
    train = set(train_stats['types'])
    val = set(val_stats['types'])
    # Only word types also seen in training can possibly be generated.
    learnable = train & val
    # Upper bound on the fraction of Val vocabulary any system can recall.
    limit = len(learnable) / len(val)
    size_limit = len(val) - len(learnable)
    print(
        f'The limit is: {limit}. This means {size_limit} words in Val cannot be learned.'
    )

    ################################################################################
    # Run the script.
def load_system_stats(name):
    """Return the parsed Val stats JSON for the given system name."""
    return load_json('./Data/Systems/' + name + '/Val/stats.json')
    return get_values(data, keys)


# Folder name -> display label for every system in the comparison table.
systems = {
    'Dai-et-al-2017': "Dai et al. 2017",
    'Liu-et-al-2017': "Liu et al. 2017",
    'Mun-et-al-2017': "Mun et al. 2017",
    'Shetty-et-al-2016': 'Shetty et al. 2016',
    'Shetty-et-al-2017': 'Shetty et al. 2017',
    'Tavakoli-et-al-2017': 'Tavakoli et al. 2017',
    'Vinyals-et-al-2017': 'Vinyals et al. 2017',
    'Wu-et-al-2016': 'Wu et al. 2016',
    'Zhou-et-al-2017': 'Zhou et al. 2017'
}

# Load corpus stats, per-system stats, and the precomputed result files.
train_stats = load_json('./Data/COCO/Processed/train_stats.json')
val_stats = load_json('./Data/COCO/Processed/val_stats.json')
system_stats = {sys_name: load_system_stats(sys_name) for sys_name in systems}
bleu_meteor = load_json('./Data/Systems/bleu_meteor.json')
global_recall = load_json('./Data/Output/global_recall.json')
local_recall = load_json('./Data/Output/local_recall.json')

# Column headers for the results table.
headers = [
    'System', 'BLEU', 'Meteor', "ASL", "SDSL", "Types", "TTR1", 'TTR2',
    'Novel', 'Cov', 'Loc5'
]
# Keys read from each system's stats dict (one value per table column).
system_keys = [
    "average_sentence_length", 'std_sentence_length', "num_types",
    "type_token_ratio", 'bittr', 'percentage_novel'
]
corpus_keys = [
def name_to_stats_path(name):
    "Load the Val stats JSON for a system (returns parsed data, not a path)."
    # NOTE(review): despite the name, this returns load_json(path) — the
    # loaded stats dict. Callers index the result (e.g. data['types']), so
    # they depend on this behavior; a rename would be clearer but is a
    # breaking change.
    base = './Data/Systems/'
    path = base + name + '/Val/stats.json'
    return load_json(path)
        if word in not_learned}
    # Convert to counter.
    omissions = Counter(omissions)
    # Clean the data.
    # Drop punctuation-only "words"; they are tokenization noise, not
    # genuine omissions.
    del omissions['..']
    for char in punctuation + ' \n':
        del omissions[char]
    # Return most common omissions.
    top_n = omissions.most_common(n)
    return list_from_counts(top_n)


# Words no system produced: start from the learnable vocabulary
# (train ∩ val) and subtract each system's generated types.
train_stats = load_json('./Data/COCO/Processed/train_stats.json')
val_stats = load_json('./Data/COCO/Processed/val_stats.json')
train = set(train_stats['types'])
val = set(val_stats['types'])
not_learned = train & val
for name in systems:
    # NOTE(review): name_to_stats_path returns the loaded stats dict,
    # not a path string.
    data = name_to_stats_path(name)
    not_learned -= set(data['types'])

# Rank the never-produced words by their frequency in train and in val.
# `systems` and `ranking_length` are defined elsewhere in this file.
global_train_ranking = get_top_n_omitted(train_stats,
                                         not_learned,
                                         n=ranking_length)
global_val_ranking = get_top_n_omitted(val_stats,
                                       not_learned,
                                       n=ranking_length)

################################################################################
def load_system_data(name):
    """Return the annotated Val data for the given system name."""
    return load_json('./Data/Systems/' + name + '/Val/annotated.json')
base = './Data/Systems/' path = base + name + '/Val/annotated.json' return load_json(path) def get_keys(d, keys): return [d[key] for key in keys] ################################################################################ # Compute stats. ########################### # Val val_tagged = load_json('./Data/COCO/Processed/tagged_val2014.json') parallel_entries = parallel_entries(val_tagged) parallel_results = [ depth_including_compounds(entries) for entries in parallel_entries ] val_result = average_dicts(parallel_results) parallel_histos = [ get_depths_histogram(entries) for entries in parallel_entries ] type_histos = [d['type_histogram'] for d in parallel_histos] token_histos = [d['token_histogram'] for d in parallel_histos] val_histo = dict(type_histogram=average_dicts(type_histos), token_histogram=average_dicts(token_histos)) ###########################
from methods import sentences_from_file, system_stats, load_json, save_json, sentence_stats

# Training captions serve as the reference corpus for novelty statistics.
train_data = load_json('./Data/COCO/Processed/tokenized_train2014.json')
train_descriptions = [entry['caption'] for entry in train_data['annotations']]

system_folders = [
    'Dai-et-al-2017', 'Liu-et-al-2017', 'Mun-et-al-2017', 'Shetty-et-al-2016',
    'Shetty-et-al-2017', 'Tavakoli-et-al-2017', 'Vinyals-et-al-2017',
    'Wu-et-al-2016', 'Zhou-et-al-2017'
]

# Compute and save per-system statistics for every system folder.
for folder in system_folders:
    print('Processing:', folder)
    # Define source and target paths for this system.
    base = './Data/Systems/'
    source = base + folder + '/Val/annotated.json'
    target = base + folder + '/Val/stats.json'
    # Load and analyze the annotated sentences.
    sentences = sentences_from_file(source)
    stats = system_stats(sentences)
    # Add sentence-level stats computed from the raw captions.
    gen_descriptions = [entry['caption'] for entry in load_json(source)]
    stats.update(sentence_stats(train_descriptions, gen_descriptions))
    # Save data.
    save_json(stats, target)
from methods import parallel_sentences_from_file, parallel_stats, load_json, save_json, sentence_stats

# Lowercased, untagged sentence lists for both splits.
train = parallel_sentences_from_file('./Data/COCO/Processed/tokenized_train2014.json',
                                     tagged=False,
                                     lower=True)
val = parallel_sentences_from_file('./Data/COCO/Processed/tagged_val2014.json',
                                   tagged=False,
                                   lower=True)

# Compute stats for train and val data.
train_stats = parallel_stats(train)
val_stats = parallel_stats(val)

# Extra sentence-level stats for Val, computed from the raw captions
# (train captions act as the reference corpus).
train_data = load_json('./Data/COCO/Processed/tokenized_train2014.json')
val_data = load_json('./Data/COCO/Processed/tagged_val2014.json')
train_descriptions = [entry['caption'] for entry in train_data['annotations']]
val_descriptions = [entry['caption'] for entry in val_data['annotations']]
val_stats.update(sentence_stats(train_descriptions, val_descriptions))

# Save data to file.
save_json(train_stats, './Data/COCO/Processed/train_stats.json')
save_json(val_stats, './Data/COCO/Processed/val_stats.json')
def load_system_stats(name):
    """Return the parsed Val stats JSON for the named system."""
    return load_json('./Data/Systems/' + name + '/Val/stats.json')


systems = [
    'Dai-et-al-2017', 'Liu-et-al-2017', 'Mun-et-al-2017', 'Shetty-et-al-2016',
    'Shetty-et-al-2017', 'Tavakoli-et-al-2017', 'Vinyals-et-al-2017',
    'Wu-et-al-2016', 'Zhou-et-al-2017'
]

# Load per-system stats plus the precomputed recall result files.
system_stats = {sys_name: load_system_stats(sys_name) for sys_name in systems}
global_recall = load_json('./Data/Output/global_recall.json')
local_recall = load_json('./Data/Output/local_recall.json')

# Values to be correlated.
system_keys = [
    "average_sentence_length", 'std_sentence_length', "num_types",
    "type_token_ratio", 'bittr', 'percentage_novel'
]

# Index all scores by system — easiest to inspect; efficiency is
# irrelevant at this scale.
result_rows = {
    system: [system_stats[system][key] for key in system_keys]
    for system in systems
}
# Add local and global recall scores from their separate files.