Example No. 1
import os

def get_ic(ic_str):
    def into_corpus(dirstr):
        import pandas as pd
        from nltk.tokenize import word_tokenize

        def files(s):
            for root, dirs, fnames in os.walk(s):
                for f in fnames:
                    yield os.path.join(root, f)

        def chunks(fp):
            if fp.endswith(".txt"):
                print("reading in text file as document: " + fp)
                with open(fp) as f:
                    return [f.read()]
            elif fp.endswith(".csv.gz"):
                print("reading in file as tweet documents: " + fp)
                return pd.read_csv(fp, compression="gzip").message.values
            return []  # skip files that are neither .txt nor .csv.gz

        class TokenCorpus:
            # wn.ic() calls corpus.words(), so wrap the token list in a minimal corpus-like object
            def __init__(self, tokens):
                self._tokens = tokens

            def words(self):
                return self._tokens

        return TokenCorpus(
            [t for f in files(dirstr) for c in chunks(f) for t in word_tokenize(c)])

    if os.path.exists(ic_str):
        if os.path.isdir(ic_str):
            print("Assuming path leads to EITHER txt or twitter csv.gz files")
            return wn.ic(into_corpus(ic_str), False, 0.)
        elif ic_str.endswith(".dat"):  # assume this is a wordnet corpus.
            return wn.WordNetICCorpusReader(ic_str)
        else:
            raise NotImplementedError
    elif isinstance(ic_str, str):
        if ic_str == "brown":
            try:
                from nltk.corpus import brown
            except LookupError:
                import nltk
                nltk.download('brown')
                from nltk.corpus import brown
            return wn.ic(brown, False, 0.)
        elif ic_str == "web":
            try:
                from nltk.corpus import webtext
            except LookupError:
                import nltk
                nltk.download('webtext')
                from nltk.corpus import webtext
            return wn.ic(webtext, False, 0.)
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
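A minimal usage sketch for get_ic (the corpus name and paths below are placeholders, not from the original snippet):

ic = get_ic("brown")                   # information content computed from the Brown corpus
ic = get_ic("/path/to/ic-brown.dat")   # hypothetical path to a precomputed WordNet IC file
ic = get_ic("/path/to/tweet_archive")  # hypothetical directory of .txt / .csv.gz files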
Example No. 2
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.corpus import genesis

def print_semantic_similarity(word1, word2):
    # Compare the first synset of each word.
    s1 = wn.synsets(word1)[0]
    s2 = wn.synsets(word2)[0]
    print('Printing path similarity')
    print(str(s1.path_similarity(s2)))
    print('Printing Leacock-Chodorow similarity')
    print(str(s1.lch_similarity(s2)))
    print('Printing Wu-Palmer similarity')
    print(str(s1.wup_similarity(s2)))
    # The following measures require an information-content (IC) dictionary.
    brown_ic = wordnet_ic.ic('ic-brown.dat')
    semcor_ic = wordnet_ic.ic('ic-semcor.dat')
    genesis_ic = wn.ic(genesis, False, 0.0)
    print('Printing Resnik similarity')
    print('Brown information content: ' + str(s1.res_similarity(s2, brown_ic)))
    print('Genesis information content: ' + str(s1.res_similarity(s2, genesis_ic)))
    print('Printing Jiang-Conrath similarity')
    print('Brown information content: ' + str(s1.jcn_similarity(s2, brown_ic)))
    print('Genesis information content: ' + str(s1.jcn_similarity(s2, genesis_ic)))
    print('Printing Lin similarity')
    print('Semcor information content: ' + str(s1.lin_similarity(s2, semcor_ic)))
Example No. 3
	def run(self):
		## start operations
		stime = time.time()
		self.printer.mainTitle("Thesauto - automated creation of a thesaurus using WordNet")
		
		## read file
		self.printer.stage(1, 4, "Extracting words from input file")
		self.printer.info("File: " + basename(self.inputFile))
		words = self.extractwords()
		self.printer.lines(words, max=20, 
			title="-- Extracted " + str(len(words)) + " words --")
		
		## prepare WordNet IC
		self.printer.stage(2, 4, "Preparing WordNet IC (Information Content)")
		if not self.database:
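			# Compute WordNet information content from the Brown corpus (smoothing value 1.0)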
			IC = wordnet.ic(brown, False, 1.0)
		else:
			IC = None
			self.printer.info("Using base WordNet thesaurus as a database instead. Skipped.")
			self.printer.info("Base: " + basename(self.database))
			
		## create a thesaurus for each set of words
		self.printer.stage(3, 4, "Building thesaurus")
		thesaurus = self.buildWordnetThesaurus(words, IC)
		self.printer.lines(thesaurus, max=10, line_max=75,
			title="-- Built thesaurus --")
			
		## save the final thesaurus
		self.printer.stage(4, 4, "Saving full thesaurus")
		with open(self.outputFile, 'w') as out:
			out.write(''.join([self.setToString(s) for s in thesaurus]))
		self.printer.info(self.outputFile + " written.")
		
		etime = time.time()
		self.printer.info("Execution took " + str(etime-stime) + " seconds\n")
Example No. 4
import numpy as np
from itertools import product
from nltk.corpus import wordnet as wn

def get_sim(word1, word2, similarity='path', combine='max'):
    s1 = wn.synsets(word1)
    s2 = wn.synsets(word2)
    if similarity == 'path':
        vals = np.array([x.path_similarity(y) for x, y in product(s1, s2)],
                        dtype=float)
    elif similarity == 'lch':
        vals = np.array([x.lch_similarity(y) for x, y in product(s1, s2)],
                        dtype=float)
    elif similarity == 'wup':
        vals = np.array([x.wup_similarity(y) for x, y in product(s1, s2)],
                        dtype=float)
    elif similarity == 'res':
        from nltk.corpus import reuters
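        # Information content computed on the fly from the Reuters corpus (needs the 'reuters' NLTK data)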
        ic = wn.ic(reuters, False, 0.0)
        vals = np.array([x.res_similarity(y, ic) for x, y in product(s1, s2)],
                        dtype=float)
    elif similarity == 'jcn':
        from nltk.corpus import reuters
        ic = wn.ic(reuters, False, 0.0)
        vals = np.array([x.jcn_similarity(y, ic) for x, y in product(s1, s2)],
                        dtype=float)
    elif similarity == 'lin':
        from nltk.corpus import reuters
        ic = wn.ic(reuters, False, 0.0)
        vals = np.array([x.lin_similarity(y, ic) for x, y in product(s1, s2)],
                        dtype=float)

    if combine == 'max':
        return np.nanmax(vals)
    elif combine == 'mean':
        return np.nanmean(vals)
    elif combine == 'min':
        return np.nanmin(vals)

    return 0
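A brief usage sketch for get_sim (the word pair is chosen arbitrarily):

score = get_sim('dog', 'cat', similarity='wup', combine='mean')  # mean Wu-Palmer score over all synset pairs
print(score)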
Example No. 5
import os
from nltk.corpus import wordnet as wn
from nltk.corpus.reader import PlaintextCorpusReader

def get_corpus_and_ic(set_of_strings, file_corpus_output):

    if os.path.exists(file_corpus_output):
        os.remove(file_corpus_output)

    with open(file_corpus_output, "w+", encoding='utf8') as file_corpus:
        for token in set_of_strings:
            #file_corpus.write(token.replace('\n', '') + '\n')
            file_corpus.write(token + '\n')

    corpus = PlaintextCorpusReader(
        os.path.dirname(file_corpus_output) or '.',
        os.path.basename(file_corpus_output))
    corpus_ic = wn.ic(corpus, False, 0.0)

    return corpus, corpus_ic
Example No. 6
from __future__ import print_function
import sys
from composes.utils import io_utils, scoring_utils
from composes.similarity.cos import CosSimilarity
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
from nltk.corpus import wordnet_ic
from itertools import combinations
print("Loading brown IC")
brown_ic = wn.ic(brown, False, 0.0)
print("done")

def getss(word):
    return wn.synsets(word,'n')[0]
def wn_sim(ss1,ss2):
    print("comparing",ss1,"with",ss2)
    return wn.path_similarity(ss1,ss2)
def jcn_sim(ss1,ss2):
    return ss1.jcn_similarity(ss2, brown_ic)
def lin_sim(ss1,ss2):
    return ss1.lin_similarity(ss2, brown_ic)
def res_sim(ss1,ss2):
    return ss1.res_similarity(ss2, brown_ic)
def wup_sim(ss1,ss2):
    # Wu-Palmer and Leacock-Chodorow are path-based measures and take no IC argument
    return ss1.wup_similarity(ss2)
def lch_sim(ss1,ss2):
    return ss1.lch_similarity(ss2)
def mean(seq):
    print(sum(seq) / len(seq))
    return sum(seq) / len(seq)
def is_better(ingredients, result, other):
Example No. 7
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import brown
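# Information-content dictionary built from the Brown corpus (wn.ic with default arguments)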

_brown_ic = wn.ic(brown)


def nn_suggestion(phrase):
    """Suggest the name in a phrase
    """
    if " " not in phrase:
        return

    names = [n for (n, t) in nltk.pos_tag(phrase.split()) if t == 'NN']
    if len(names) > 1:
        print('Warning: "{}" has multiple NN to choose from'.format(phrase))
    for nn in names:
        yield nn


def name_suggestions(phrase):
    """Go through the name suggestions
    """
    yield phrase
    if ' ' in phrase:
        yield '_'.join(phrase.split())  # Wordnet substitutes space with underscore
    yield phrase.replace(" ", "")
    for nn in nn_suggestion(phrase):
        yield nn
Example No. 8
#Imports
import sys
import re
import numpy as np
import pandas as pd
import time
import csv

from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
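# 'ic-brown.dat' and 'ic-semcor.dat' are precomputed IC files from the NLTK 'wordnet_ic' data package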
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')

from nltk.corpus import genesis
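# Information content computed directly from the Genesis corpus, with no smoothing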
genesis_ic = wn.ic(genesis, False, 0.0)


# Creates the keywords list from the CSV file produced by sysReview.py
def create_list(keywords_file):
    with open(keywords_file, 'r', newline='') as csvfile:
        data_CSV = csv.reader(csvfile, delimiter=',')
        words, path, definition = [], [], []
        for row in data_CSV:
            words.append(row[0])
            path.append(row[1])
            definition.append(row[2])
            count = len(row) - 2
        words = list(filter(None, words))  # Removes empty items from the list
        path = list(filter(None, path))
        definition = list(filter(None, definition))
    return (words, count)
Example No. 9
for doc in corpus:
    corpus_dict.append(dict(doc))
dictlen = len(dictionary)

tc = WordNetEvaluator()

tc_means = []
tc_medians = []
words_list = []

ofilemean = open(dname + "/"+tcmethod+"_mean_rand_"+str(word_count)+".txt", "w")
ofilemedian = open(dname + "/"+tcmethod+"_median_rand_"+str(word_count)+".txt", "w")

if ic:
    if dname == "reuters_LDA":
        src_ic = wn.ic(reuters, False, 0.0)
    else:
        src_ic = wn.ic(brown, False, 0.0)



for i in range(sample_times):
    random_words = []
    # generate random numbers
    for n in range(word_count):
        word = random.randint(1, dictlen-1)
        while word in random_words:
            word = random.randint(0, dictlen-1)
        random_words.append(word)

    keylist = []
Example No. 10
import nltk
from nltk.corpus import wordnet
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
from nltk.corpus import genesis
genesis_ic = wn.ic(genesis, False, 0.0)
lion = wn.synset('lion.n.01')
cat = wn.synset('cat.n.01')
print(lion.res_similarity(cat, brown_ic))
print(lion.res_similarity(cat, genesis_ic))
print(lion.jcn_similarity(cat, brown_ic))
print(lion.jcn_similarity(cat, genesis_ic))
print(lion.lin_similarity(cat, semcor_ic))


Example No. 11
print(bad.name() == 'bad')
print(bad.synset().definition() == 'having undesirable or negative qualities')

print('=====================================')
print('Calculating WordNet Synset Similarity')
print('=====================================')

lion = wordnet.synset('lion.n.01')

print(lion.path_similarity(cat))
# print(lion.lch_similarity(cat))
print(lion.wup_similarity(cat))

brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
genesis_ic = wordnet.ic(genesis, False, 0.0)
print(lion.res_similarity(cat, brown_ic))
print(lion.res_similarity(cat, genesis_ic))
print(lion.jcn_similarity(cat, brown_ic))
print(lion.jcn_similarity(cat, genesis_ic))
print(lion.lin_similarity(cat, semcor_ic))

cb = wordnet.synset('cookbook.n.01')
ib = wordnet.synset('instruction_book.n.01')
print(cb.wup_similarity(ib) == 0.91666666666666663)

ref = cb.hypernyms()[0]
print(cb.shortest_path_distance(ref) == 1)
print(ib.shortest_path_distance(ref) == 1)
print(cb.shortest_path_distance(ib) == 2)
Example No. 12
    def __init__(self):
        self.prep = Preprocessor()
        self.genesis_ic = wn.ic(genesis, False, 0.0)
Example No. 13
import sqlite3
import sys
import random
import numpy as np
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk.corpus import genesis
from scipy import stats

# information content references
BROWN_IC = wordnet_ic.ic('ic-brown.dat')
SEMCOR_IC = wordnet_ic.ic('ic-semcor.dat')
GENESIS_IC = wn.ic(genesis, False, 0.0)

class Similarity_Test:

    def __init__(self, sample_size, ic):
        self.sample_size = sample_size
        self.ic = ic
        self.overlap = []
        self.path = []
        self.wup = []
        # Don't use the following similarity metrics because they only support comparing words with the same POS
        # self.lch = []
        # self.res = []
        # self.jcn = []
        # self.lin = []
        self._scores_over_sample()


    def _load_sdr(self, word):
Example No. 14
#jcn_similarity test
from nltk.corpus import wordnet
from nltk.corpus import wordnet_ic
from nltk.corpus import genesis

brown_ic = wordnet_ic.ic('ic-brown.dat')
genesis_ic = wordnet.ic(genesis,False,0.0)
semcor_ic = wordnet_ic.ic('ic-semcor.dat')



list1 = ['flight']
list2 = ['trip']
list3 = []
for word1 in list1:
    for word2 in list2:
        wordFromList1 = wordnet.synsets(word1,pos=wordnet.NOUN)
        wordFromList2 = wordnet.synsets(word2,pos=wordnet.NOUN)
        print(wordFromList1, "\n", wordFromList2)
        if wordFromList1 and wordFromList2:
            for item1 in wordFromList1:
                for item2 in wordFromList2:
                    s=item1.jcn_similarity(item2,brown_ic)
                    print(item1, item2, s)
                    list3.append(s)

print(max(list3))

print(list3)
print(hit.lch_similarity(slap))  # doctest: +ELLIPSIS
print(wn.lch_similarity(hit, slap))  # doctest: +ELLIPSIS
print(hit.lch_similarity(slap, simulate_root=False))
print(wn.lch_similarity(hit, slap, simulate_root=False))
print(dog.wup_similarity(cat))  # doctest: +ELLIPSIS
print(hit.wup_similarity(slap))
print(wn.wup_similarity(hit, slap))
print(hit.wup_similarity(slap, simulate_root=False))
print(wn.wup_similarity(hit, slap, simulate_root=False))
# import nltk
# nltk.download('wordnet_ic')
brown_ic = wordnet_ic.ic('ic-brown.dat')
semcor_ic = wordnet_ic.ic('ic-semcor.dat')
# import nltk
# nltk.download('genesis')
genesis_ic = wn.ic(genesis, False, 0.0)
print(dog.res_similarity(cat, brown_ic))  # doctest: +ELLIPSIS
print(dog.res_similarity(cat, genesis_ic))  # doctest: +ELLIPSIS
print(dog.jcn_similarity(cat, brown_ic))  # doctest: +ELLIPSIS
print(dog.jcn_similarity(cat, genesis_ic))  # doctest: +ELLIPSIS
print(dog.lin_similarity(cat, semcor_ic))  # doctest: +ELLIPSIS
# access to all synsets
for synset in list(wn.all_synsets('n'))[:10]:
    print(synset)
print(wn.synsets('dog'))  # doctest: +ELLIPSIS
print(wn.synsets('dog', pos='v'))
for synset in islice(wn.all_synsets('n'), 5):
    print(synset, synset.hypernyms())
# morphy
print(wn.morphy('denied', wn.NOUN))
print(wn.morphy('denied', wn.VERB))