Example #1
def find_ambiguous_examples2():
    raw = read_file('resources/farsnetWSDDataset.json')
    parsed_json = json.loads(raw)
    output = json.loads(read_file('resources/FNWords2.json'))
    ambigs = json.loads(read_file('resources/ambigs2.json'))
    parsed = json.loads(read_file('parsed_synsets2.json'))
    # Process the next batch of 2000 examples beyond what is already parsed.
    for item in parsed_json[:len(parsed) + 2000]:
        syn_id = int(item['id'])
        if syn_id in parsed:
            continue
        example = item['example']
        # Examples may contain several sentences separated by '*':
        for sentence in example.split('*'):
            # Skip sentences that are too short to be useful.
            if len(sentence) < 30:
                continue
            entry = item.copy()
            all_syns, candid_syns = extract_synsets(sentence)
            entry['example'] = sentence
            entry['words'] = candid_syns
            for key in candid_syns:
                # A word is a usable anchor when this synset is among its
                # candidates and it has more than one candidate sense.
                if syn_id in candid_syns[key] and len(candid_syns[key]) > 1:
                    entry['ambig_word'] = key
                    ambigs.append(entry)
            output.append(entry)
        parsed.append(syn_id)
        if (len(output) % 10) == 0:
            print("Parsed {} examples, {} ambigs found".format(
                len(output), len(ambigs)))
    write_file('resources/FNWords2.json', json.dumps(output,
                                                     ensure_ascii=False))
    write_file('resources/ambigs2.json', json.dumps(ambigs,
                                                    ensure_ascii=False))
    write_file('parsed_synsets2.json', json.dumps(parsed, ensure_ascii=False))
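
All of these snippets rely on read_file and write_file from the project's Utility module (imported explicitly in Examples #11 and #13). A minimal sketch of what those helpers presumably look like, assuming plain UTF-8 text files; the real module may differ, e.g. in error handling:

# Sketch of the assumed Utility helpers (assumption, not the real module).
def read_file(path):
    with open(path, encoding='utf-8') as fp:
        return fp.read()


def write_file(path, content):
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write(content)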
Example #2
def find_gloss_relations(name):
    base_path = 'resources/' + name + '/'
    words_path = base_path + 'all_synsets.json'
    relations_path = base_path + 'gloss_relations.json'
    processed_path = base_path + 'gloss_parsed.json'
    parsed_synsets = json.loads(read_file(words_path))
    wordnet = prepare_wordnet(name)
    relations = json.loads(read_file(relations_path))
    processed_syns = json.loads(read_file(processed_path))
    i = 0  # synsets processed in this run
    j = len(processed_syns)  # overall progress counter
    for syn_id in parsed_synsets:
        j += 1
        if syn_id in processed_syns:
            continue
        processed_syns.append(syn_id)
        i += 1
        syn_data = parsed_synsets[syn_id]
        _, words = wordnet.extract_synsets(syn_data['gloss'])
        for word_tag in words:
            # Unambiguous gloss words (exactly one candidate synset) give
            # a reliable gloss relation.
            if len(words[word_tag]) == 1:
                relations.append((syn_id, str(words[word_tag][0])))
        if i % 10 == 0:
            print('{} relations found so far (iteration: {})'.format(
                len(relations), j))
        # Process at most ~150 new synsets per run; progress is
        # checkpointed below so the function can be re-invoked.
        if i > 150:
            break
    write_file(relations_path, json.dumps(relations))
    write_file(processed_path, json.dumps(processed_syns))
    print("{} gloss relations found and written to {} folder".format(
        len(relations), name))
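
Since find_gloss_relations handles at most ~150 new synsets per call and checkpoints its progress to gloss_parsed.json, it can simply be re-invoked until the checkpoint covers the whole synset file. A hypothetical driver loop (the function name and batch cap are assumptions):

def run_all_gloss_batches(name, max_batches=100):
    # Re-invoke the batched extractor until gloss_parsed.json covers
    # every synset in all_synsets.json.
    base_path = 'resources/' + name + '/'
    for _ in range(max_batches):
        done = len(json.loads(read_file(base_path + 'gloss_parsed.json')))
        total = len(json.loads(read_file(base_path + 'all_synsets.json')))
        if done >= total:
            break
        find_gloss_relations(name)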
Example #3
def find_ambiguous_examples(name):
    base_path = 'resources/' + name + '/'
    raw = read_file(base_path + 'all_synsets.json')
    dataset_synsets = json.loads(raw)
    words_path = base_path + 'examples_words.json'
    output = json.loads(read_file(words_path))
    ambigs_path = base_path + 'ambigs.json'
    ambigs = json.loads(read_file(ambigs_path))
    processed_path = base_path + 'parsed_synsets.json'
    parsed = json.loads(read_file(processed_path))
    wordnet = prepare_wordnet(name)
    i = 0  # synsets processed in this run
    j = len(parsed)  # overall progress counter
    for syn_id in dataset_synsets:
        j += 1
        item = dataset_synsets[syn_id]
        if syn_id in parsed or 'example' not in item:
            continue
        i += 1
        example = item['example']
        for sentence in example.split('*'):
            entry = item.copy()
            all_syns, candid_syns = wordnet.extract_synsets(sentence)
            entry['example'] = sentence
            entry['words'] = candid_syns
            #@todo: @bug: If an example contains "*", current code only keeps the last sentence
            output[syn_id] = entry
            if len(sentence) < 30:
                continue
            # If a long enough example has more than one candidate meaning for
            # the word that represents syn_id (its sense_snapshot), it is a
            # good item for the WSD dataset:
            for key in candid_syns:
                if int(syn_id) in candid_syns[key] and len(
                        candid_syns[key]) > 1:
                    entry['ambig_word'] = key
                    #@todo: @bug: If an example contains "*", current code only keeps the last sentence
                    ambigs[syn_id] = entry
                    # We found the anchor word; do not iterate over the
                    # example's other words.
                    break

        parsed.append(syn_id)
        if (i % 10) == 0:
            print("Parsed {} examples, {} ambigs found".format(j, len(ambigs)))
        # Process at most ~200 new synsets per run; progress is
        # checkpointed below so the function can be re-invoked.
        if i > 200:
            break
    write_file(words_path, json.dumps(output, ensure_ascii=False))
    write_file(ambigs_path, json.dumps(ambigs, ensure_ascii=False))
    write_file(processed_path, json.dumps(parsed, ensure_ascii=False))
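
The @todo/@bug comments above flag that output[syn_id] and ambigs[syn_id] are overwritten on every iteration, so only the last '*'-separated sentence of an example survives. A sketch of one possible fix, keying each sentence separately (the key scheme and function name are hypothetical):

def split_example_entries(syn_id, item, wordnet):
    # One entry per sentence, keyed "<syn_id>_<index>", so examples
    # containing '*' keep every sentence instead of only the last one.
    entries = {}
    for idx, sentence in enumerate(item['example'].split('*')):
        entry = item.copy()
        _, candid_syns = wordnet.extract_synsets(sentence)
        entry['example'] = sentence
        entry['words'] = candid_syns
        entries['{}_{}'.format(syn_id, idx)] = entry
    return entries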
Example #4
def find_almost_certain_examples(name):
    """ Tries to find examples in which all words have only one synsets except the main word"""
    ambigs = json.loads(read_file('resources/' + name + '/ambigs.json'))
    certain_sens = 0
    num_certains = []
    certain_examples = []
    for syn_id in ambigs:
        item = ambigs[syn_id]
        certain_words = 0
        for key in item['words']:
            if len(item['words'][key]) == 1:
                certain_words += 1
        num_certains.append(certain_words)
        # All words except the ambiguous target word are certain:
        if certain_words == len(item['words']) - 1:
            certain_sens += 1
            certain_examples.append(item)
    import numpy as np
    import matplotlib.pyplot as plt
    print(np.histogram(num_certains))
    plt.hist(num_certains, bins=np.arange(12) - .5)
    plt.ylabel('Number of sentences')
    plt.xlabel('Number of certain words')
    plt.show()
    print(len(ambigs), certain_sens, sum(num_certains))
    write_file('resources/' + name + '/certains.json',
               json.dumps(certain_examples, ensure_ascii=False))
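
The bins=np.arange(12) - .5 trick above places bin edges at -0.5, 0.5, ..., 10.5, so each integer count from 0 to 10 gets its own centered bar. A standalone illustration with made-up counts:

import numpy as np
import matplotlib.pyplot as plt

counts = [0, 1, 1, 2, 3, 3, 3, 5]  # hypothetical certain-word counts
plt.hist(counts, bins=np.arange(12) - .5)  # one bar per integer 0..10
plt.xticks(range(11))
plt.xlabel('Number of certain words')
plt.ylabel('Number of sentences')
plt.show()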
Example #5
 def senses_snapshot(self, syn_id):
     if self._corpus is None:
         self._corpus = json.loads(
             read_file('resources/Farsnet/all_synsets.json'))
     if syn_id in self._corpus:
         return self._corpus[syn_id]['senses_snapshot']
     return ''
Example #6
 def fetch_definition(self, syn_id):
     if self._synsets == {}:
         self._synsets = json.loads(
             read_file('resources/Wordnet/all_synsets.json'))
     return self._synsets[str(syn_id)]
Example #7
 def find_ambig_by_id(self, syn_id):
     if self._ambigs == {}:
         self._ambigs = json.loads(
             read_file('resources/Wordnet/ambigs.json'))
     return self._ambigs[syn_id]
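
Examples #5, #6, #7, and #10 repeat the same lazy-load idiom: parse a JSON resource on first use and cache it on the instance. A hypothetical helper method that factors the idiom out (assumes json and read_file are in scope, as in the examples):

def _cached_json(self, attr, path):
    # Parse `path` once and memoize the result on attribute `attr`;
    # both None and {} count as "not loaded yet", matching the checks
    # in Examples #5-#7.
    if getattr(self, attr) in (None, {}):
        setattr(self, attr, json.loads(read_file(path)))
    return getattr(self, attr)

With that helper, fetch_definition reduces to return self._cached_json('_synsets', 'resources/Wordnet/all_synsets.json')[str(syn_id)].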
Example #8
def find_ambiguous_examples():
    raw = read_file('resources/farsnetWSDDataset.json')
    parsed_json = json.loads(raw)
    output = json.loads(read_file('resources/FNWords.json'))
    ambigs = json.loads(read_file('resources/ambigs.json'))
    parsed = json.loads(read_file('parsed_synsets.json'))
    # Process the next batch of 200 examples beyond what is already parsed.
    for item in parsed_json[:len(parsed) + 200]:
        syn_id = int(item['id'])
        if syn_id in parsed:
            continue
        example = normalizer.normalize(item['example'])
        # Remove punctuation, keep the halfspace (U+200C):
        example = re.sub(r'[^\w\s\u200c]', '', example)
        words = word_tokenize(example)
        words = remove_stop_words(words)
        tags = tagger.tag(words)
        synsets = {}
        for (w, tag) in tags:
            syns = fetch_synsets(w, tag)
            if len(syns) == 0:
                # Fall back to the stem when the surface form has no synsets.
                root = stemmer.stem(w)
                if root != "" and root != w:
                    syns = fetch_synsets(root, tag)
            # Only save words that have at least one candidate synset:
            if len(syns) > 0:
                synsets[w + '_' + tag] = syns
                if syn_id in syns:
                    # Appending the shared item dict means the 'words' key
                    # (set below) shows up in this ambig entry too.
                    ambigs.append(item)
        item['words'] = synsets
        output.append(item)
        parsed.append(syn_id)
        if (len(output) % 10) == 0:
            print("Parsed {} examples, {} ambigs found".format(
                len(output), len(ambigs)))
    write_file('resources/FNWords.json',
               json.dumps(output, ensure_ascii=False))
    write_file('resources/ambigs.json', json.dumps(ambigs, ensure_ascii=False))
    write_file('parsed_synsets.json', json.dumps(parsed, ensure_ascii=False))
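
Example #8 uses module-level normalizer, tagger, stemmer, and word_tokenize objects that are not shown. The names match the Hazm toolkit for Persian NLP, so the setup presumably looks like the sketch below; the tagger model path is an assumption, and fetch_synsets is the project's own lookup, not sketched here:

# Assumed setup for Example #8, based on the Hazm toolkit for Persian.
from hazm import Normalizer, POSTagger, Stemmer, word_tokenize

normalizer = Normalizer()
tagger = POSTagger(model='resources/postagger.model')  # path is an assumption
stemmer = Stemmer()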
Example #9
 def graph(self):
     if self._graph is None:
         import networkx as nx
         self._graph = nx.Graph()
         #self._graph = importFromPaj("resources/Farsnet/synset_relation.paj")
         #self._graph = importFromPaj("resources/Farsnet/synset_related_to.paj")
         #self._graph = importFromPaj("resources/Farsnet/synset_hypernyms.paj")
         gloss_relations = json.loads(
             read_file('resources/Farsnet/gloss_relations.json'))
         for (src, dst) in gloss_relations:
             # Label nodes with their senses snapshot so exported or
             # visualized graphs stay readable.
             self._graph.add_node(
                 src, Value=self.senses_snapshot(src) + ';' + src)
             self._graph.add_node(
                 dst, Value=self.senses_snapshot(dst) + ';' + dst)
             self._graph.add_edge(src, dst, Relation='gloss')
     return self._graph
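
Once built, the gloss graph works with standard networkx algorithms. A hypothetical relatedness measure over it, using shortest-path distance between two synset ids (the function name is an assumption; assumes graph() is callable as defined above):

import networkx as nx

def gloss_distance(wordnet, a, b):
    # Graph distance between two synset ids over gloss relations;
    # None when either node is missing or no path exists.
    g = wordnet.graph()
    try:
        return nx.shortest_path_length(g, a, b)
    except (nx.NodeNotFound, nx.NetworkXNoPath):
        return None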
Example #10
 def find_ambig_by_id(self, syn_id):
     if self._ambigs is None:
         self._ambigs = json.loads(
             read_file('resources/Farsnet/examples_words.json'))
     return self._ambigs[syn_id]
Example #11
import re
from Utility import read_file

stop_words = read_file("resources/stop-words.txt").split()

# Leftover stricter filter: and re.sub(r"\s|\u200c", "", word).isalnum()
def remove_stop_words(words):
    return [word for word in words if word not in stop_words]
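
The leftover snippet kept as a comment above hints at a stricter variant that also drops tokens that are not alphanumeric once whitespace and the zero-width non-joiner (U+200C) are stripped. A sketch of that variant (the function name is hypothetical):

def remove_stop_words_strict(words):
    # Drop stop words and any token that is not alphanumeric after
    # removing whitespace and U+200C (the Persian halfspace).
    return [
        word for word in words
        if word not in stop_words
        and re.sub(r"\s|\u200c", "", word).isalnum()
    ]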
Example #12
def find_ambig_by_id(syn_id):
    # Linear scan over the whole file; fine for one-off lookups.
    samples = json.loads(read_file('resources/FNWords2.json'))
    for item in samples:
        if item['id'] == str(syn_id):
            return item
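
find_ambig_by_id re-reads and linearly scans FNWords2.json on every call. For repeated lookups, a cached id index is cheaper; a sketch (the module-level cache and function name are hypothetical):

_samples_by_id = None

def find_ambig_by_id_cached(syn_id):
    # Build the id -> item index once, then answer lookups in O(1).
    global _samples_by_id
    if _samples_by_id is None:
        samples = json.loads(read_file('resources/FNWords2.json'))
        _samples_by_id = {item['id']: item for item in samples}
    return _samples_by_id.get(str(syn_id))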
Example #13
from Utility import read_file, write_file
import json
from DoWSD import run_wsd
import numpy as np

test_set_size = 100
ambs = json.loads(read_file('resources/ambigs2.json'))
#ambs = json.loads(read_file('resources/certains.json'))
test_set = np.random.choice(ambs, test_set_size, replace=False)
#test_set = ambs[:test_set_size]

#write_file('debug/last_test_set2.json', json.dumps(list(test_set), ensure_ascii=False))
#test_set = json.loads(read_file('debug/last_test_set2.json'))

total_true = 0
i = 0
baseline = 0
fatals = 0
isolated_syns = 0
zero_edges = {'positive': 0, 'negative': 0}
sum_edges = 0
sum_nodes = 0
for item in test_set:
    i += 1
    all_syns = [str(j) for k in item['words'] for j in item['words'][k]]
    answer, report = run_wsd((all_syns, item['words']), True, item['id'])
    ranks = report['ranks']
    ambig_key = item['ambig_word']
    # Random-guess baseline: chance of picking the right sense at random.
    baseline += 1 / len(item['words'][ambig_key])

    if ambig_key not in answer: