def find_ambiguous_examples2():
    # Variant of find_ambiguous_examples() that reads the flat farsnetWSDDataset.json
    # list and writes to the "*2" resource files; progress is checkpointed in
    # parsed_synsets2.json so the function can be re-run to continue where it stopped.
    raw = read_file('resources/farsnetWSDDataset.json')
    parsed_json = json.loads(raw)
    output = json.loads(read_file('resources/FNWords2.json'))
    ambigs = json.loads(read_file('resources/ambigs2.json'))
    parsed = read_file('parsed_synsets2.json')
    parsed = json.loads(parsed)
    for item in parsed_json[:len(parsed) + 2000]:
        id = int(item['id'])
        if id in parsed:
            continue
        example = item['example']
        for sentence in example.split('*'):
            if len(sentence) < 30:
                continue
            entry = item.copy()
            all_syns, candid_syns = extract_synsets(sentence)
            entry['example'] = sentence
            entry['words'] = candid_syns
            for key in candid_syns:
                if id in candid_syns[key] and len(candid_syns[key]) > 1:
                    entry['ambig_word'] = key
                    ambigs.append(entry)
            output.append(entry)
        parsed.append(id)
        if (len(output) % 10) == 0:
            print("Parsed {} examples, {} ambigs found".format(
                len(output), len(ambigs)))
    write_file('resources/FNWords2.json', json.dumps(output, ensure_ascii=False))
    write_file('resources/ambigs2.json', json.dumps(ambigs, ensure_ascii=False))
    write_file('parsed_synsets2.json', json.dumps(parsed, ensure_ascii=False))
def find_gloss_relations(name):
    # Links a synset to every synset that is the only candidate for a word
    # appearing in its gloss; progress is checkpointed in gloss_parsed.json.
    base_path = 'resources/' + name + '/'
    words_path = base_path + 'all_synsets.json'
    relations_path = base_path + 'gloss_relations.json'
    processed_path = base_path + 'gloss_parsed.json'
    parsed_synsets = json.loads(read_file(words_path))
    wordnet = prepare_wordnet(name)
    relations = json.loads(read_file(relations_path))
    processed_syns = json.loads(read_file(processed_path))
    i = 0
    j = len(processed_syns)
    for syn_id in parsed_synsets:
        j += 1
        if syn_id in processed_syns:
            continue
        processed_syns.append(syn_id)
        i += 1
        syn_data = parsed_synsets[syn_id]
        temp, words = wordnet.extract_synsets(syn_data['gloss'])
        for word_tag in words:
            if len(words[word_tag]) == 1:
                relations.append((syn_id, str(words[word_tag][0])))
        if i % 10 == 0:
            print('{} relations found so far (iteration: {})'.format(
                len(relations), j))
        if i > 150:
            break
    write_file(relations_path, json.dumps(relations))
    write_file(processed_path, json.dumps(processed_syns))
    print("{} gloss relations found and written to {} folder".format(
        len(relations), name))
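# Driver sketch (not part of the original code; the function name and pass count
# below are assumptions for illustration). find_gloss_relations() stops after
# ~150 newly processed synsets and resumes from gloss_parsed.json, so covering a
# whole wordnet just means calling it repeatedly.
def build_all_gloss_relations(name, passes=20):
    for _ in range(passes):
        # Each call skips synsets already listed in gloss_parsed.json.
        find_gloss_relations(name)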
def find_ambiguous_examples(name):
    base_path = 'resources/' + name + '/'
    raw = read_file(base_path + 'all_synsets.json')
    dataset_synsets = json.loads(raw)
    words_path = base_path + 'examples_words.json'
    output = json.loads(read_file(words_path))
    ambigs_path = base_path + 'ambigs.json'
    ambigs = json.loads(read_file(ambigs_path))
    processed_path = base_path + 'parsed_synsets.json'
    parsed = json.loads(read_file(processed_path))
    wordnet = prepare_wordnet(name)
    i = 0
    j = len(parsed)
    for syn_id in dataset_synsets:
        j += 1
        item = dataset_synsets[syn_id]
        if syn_id in parsed or 'example' not in item:
            continue
        i += 1
        example = item['example']
        for sentence in example.split('*'):
            entry = item.copy()
            all_syns, candid_syns = wordnet.extract_synsets(sentence)
            entry['example'] = sentence
            entry['words'] = candid_syns
            # @todo @bug: if an example contains "*", only its last sentence is kept here
            output[syn_id] = entry
            if len(sentence) < 30:
                continue
            # If a long enough example has more than one candidate synset for the word
            # that represents syn_id (its senses_snapshot), it is a good WSD item:
            for key in candid_syns:
                if int(syn_id) in candid_syns[key] and len(
                        candid_syns[key]) > 1:
                    entry['ambig_word'] = key
                    # @todo @bug: same limitation, only the last sentence survives here too
                    ambigs[syn_id] = entry
                    # Anchor word found; do not iterate over the other words of the example
                    break
        parsed.append(syn_id)
        if (i % 10) == 0:
            print("Parsed {} examples, {} ambigs found".format(j, len(ambigs)))
        if i > 200:
            break
    write_file(words_path, json.dumps(output, ensure_ascii=False))
    write_file(ambigs_path, json.dumps(ambigs, ensure_ascii=False))
    write_file(processed_path, json.dumps(parsed, ensure_ascii=False))
def find_almost_certain_examples(name):
    """Tries to find examples in which all words have only one synset, except the main word."""
    ambigs = json.loads(read_file('resources/' + name + '/ambigs.json'))
    certain_sens = 0
    num_certains = []
    certain_examples = []
    for syn_id in ambigs:
        item = ambigs[syn_id]
        certain_words = 0
        for key in item['words']:
            if len(item['words'][key]) == 1:
                certain_words += 1
        num_certains.append(certain_words)
        if certain_words == len(item['words']) - 1:
            certain_sens += 1
            certain_examples.append(item)
    import numpy as np
    import matplotlib.pyplot as plt
    print(np.histogram(num_certains))
    plt.hist(num_certains, bins=np.arange(12) - .5)
    plt.ylabel('Number of sentences')
    plt.xlabel('Number of certain words')
    plt.show()
    print(len(ambigs), certain_sens, sum(num_certains))
    write_file('resources/' + name + '/certains.json',
               json.dumps(certain_examples, ensure_ascii=False))
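# Pipeline sketch (an assumption about how these steps are chained, not taken from
# the original code): first collect ambiguous examples in batches, then filter them
# down to the "almost certain" ones. 'Farsnet' matches the resource folder used
# elsewhere in this repo; the function name and batch count are illustrative.
def build_wsd_dataset(name='Farsnet', batches=10):
    for _ in range(batches):
        # Resumable: each call skips synsets already listed in parsed_synsets.json
        # and stops after ~200 new ones.
        find_ambiguous_examples(name)
    # Writes resources/<name>/certains.json and plots a histogram of how many
    # unambiguous words each sentence contains.
    find_almost_certain_examples(name)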
def senses_snapshot(self, syn_id):
    if self._corpus is None:
        self._corpus = json.loads(
            read_file('resources/Farsnet/all_synsets.json'))
    if syn_id in self._corpus:
        return self._corpus[syn_id]['senses_snapshot']
    return ''
def fetch_definition(self, syn_id):
    if self._synsets == {}:
        self._synsets = json.loads(
            read_file('resources/Wordnet/all_synsets.json'))
    return self._synsets[str(syn_id)]
def find_ambig_by_id(self, syn_id):
    if self._ambigs == {}:
        self._ambigs = json.loads(
            read_file('resources/Wordnet/ambigs.json'))
    return self._ambigs[syn_id]
def find_ambiguous_examples():
    raw = read_file('resources/farsnetWSDDataset.json')
    parsed_json = json.loads(raw)
    output = json.loads(read_file('resources/FNWords.json'))
    ambigs = json.loads(read_file('resources/ambigs.json'))
    parsed = read_file('parsed_synsets.json')
    parsed = json.loads(parsed)
    for item in parsed_json[:len(parsed) + 200]:
        id = int(item['id'])
        if id in parsed:
            continue
        example = normalizer.normalize(item['example'])
        # Remove punctuation but keep the half-space (ZWNJ):
        example = re.sub(r'[^\w\s\u200c]', '', example)
        words = word_tokenize(example)
        words = remove_stop_words(words)
        tags = tagger.tag(words)
        synsets = {}
        for (w, tag) in tags:
            syns = fetch_synsets(w, tag)
            if len(syns) == 0:
                # Fall back to the stem when the surface form has no synsets:
                root = stemmer.stem(w)
                if root != "" and root != w:
                    syns = fetch_synsets(root, tag)
            # Only keep words that have at least one candidate synset:
            if len(syns) > 0:
                synsets[w + '_' + tag] = syns
                if id in syns:
                    # item is appended by reference, so the 'words' key set below
                    # will also appear in this ambigs entry.
                    ambigs.append(item)
        item['words'] = synsets
        output.append(item)
        parsed.append(id)
        if (len(output) % 10) == 0:
            print("Parsed {} examples, {} ambigs found".format(
                len(output), len(ambigs)))
    write_file('resources/FNWords.json',
               json.dumps(output, ensure_ascii=False))
    write_file('resources/ambigs.json', json.dumps(ambigs, ensure_ascii=False))
    write_file('parsed_synsets.json', json.dumps(parsed, ensure_ascii=False))
def graph(self):
    if self._graph is None:
        import networkx as nx
        self._graph = nx.Graph()
        # self._graph = importFromPaj("resources/Farsnet/synset_relation.paj")
        # self._graph = importFromPaj("resources/Farsnet/synset_related_to.paj")
        # self._graph = importFromPaj("resources/Farsnet/synset_hypernyms.paj")
        gloss_relations = json.loads(
            read_file('resources/Farsnet/gloss_relations.json'))
        for (src, dst) in gloss_relations:
            self._graph.add_node(src, Value=self.senses_snapshot(src) + ';' + src)
            self._graph.add_node(dst, Value=self.senses_snapshot(dst) + ';' + dst)
            self._graph.add_edge(src, dst, Relation='gloss')
    return self._graph
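# Inspection sketch (not from the original code; the function name is hypothetical).
# It rebuilds the same gloss graph directly from gloss_relations.json using plain
# networkx calls, just to check how connected the gloss links leave the synsets.
def describe_gloss_graph():
    import networkx as nx
    relations = json.loads(read_file('resources/Farsnet/gloss_relations.json'))
    g = nx.Graph()
    for (src, dst) in relations:
        g.add_edge(src, dst, Relation='gloss')
    print('{} nodes, {} edges, {} connected components'.format(
        g.number_of_nodes(), g.number_of_edges(),
        nx.number_connected_components(g)))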
def find_ambig_by_id(self, syn_id):
    if self._ambigs is None:
        self._ambigs = json.loads(
            read_file('resources/Farsnet/examples_words.json'))
    return self._ambigs[syn_id]
import re
from Utility import read_file

stop_words = read_file("resources/stop-words.txt").split()

# A stricter variant of the filter below also required:
#   re.sub(r"\s|\u200c", "", word).isalnum()


def remove_stop_words(words):
    return [word for word in words if word not in stop_words]
def find_ambig_by_id(syn_id):
    samples = json.loads(read_file('resources/FNWords2.json'))
    for item in samples:
        if item['id'] == str(syn_id):
            return item
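# Lookup sketch (not part of the original code; the helper name is hypothetical).
# find_ambig_by_id() re-reads and scans FNWords2.json on every call, which is fine
# for occasional debugging but slow inside a loop; for bulk lookups, build an index:
def index_samples_by_id():
    samples = json.loads(read_file('resources/FNWords2.json'))
    return {item['id']: item for item in samples}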
from Utility import read_file, write_file
import json
from DoWSD import run_wsd
import numpy as np

test_set_size = 100
ambs = json.loads(read_file('resources/ambigs2.json'))
# ambs = json.loads(read_file('resources/certains.json'))
test_set = np.random.choice(ambs, test_set_size, False)
# test_set = ambs[:test_set_size]
# write_file('debug/last_test_set2.json', json.dumps(list(test_set), ensure_ascii=False))
# test_set = json.loads(read_file('debug/last_test_set2.json'))
total_true = 0
i = 0
baseline = 0  # expected accuracy of picking the sense of the ambiguous word uniformly at random
fatals = 0
isolated_syns = 0
zero_edges = {'positive': 0, 'negative': 0}
sum_edges = 0
sum_nodes = 0
for item in test_set:
    i += 1
    all_syns = [str(j) for k in item['words'] for j in item['words'][k]]
    answer, report = run_wsd((all_syns, item['words']), True, item['id'])
    ranks = report['ranks']
    ambig_key = item['ambig_word']
    baseline += 1 / len(item['words'][ambig_key])
    if ambig_key not in answer: