Example #1
    def load(self, pos):
        wn = self.wn

        if pos == 'n':
            roots = wn.synsets('entity')
        else:
            roots = [s for s in wn.all_synsets(pos) if len(s.hypernyms()) == 0]

        self.root = WordNetTreeNode('root')

        for synset in roots:
            self.__append_synset(synset, self.root)

        # unfortunately, the block above is not guaranteed to build
        # the entire WordNet tree. The reason is that it starts at the root,
        # adding the descendants retrieved from synset.hyponyms(). For some
        # odd reason that method does not always return all hyponyms. For
        # example, portugal.n.01 is not retrieved as a hyponym of
        # european_country.n.01, but if we call
        #   wn.synsets('portugal')[0].hypernym_paths()
        # european_country.n.01 appears as its ancestor.

        # check for synsets that were not found
        index = self.hashtable()
        for synset in wn.all_synsets(pos):
            if synset.name() not in index:
                for path in synset.hypernym_paths():
                    keys = [s.name() for s in path]
                    self.__extend(keys,
                        is_internal = len(path[-1].hyponyms()) > 0)
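
The gap described in the comment inside load() above typically comes from instance hyponyms: synsets such as portugal.n.01 hang off european_country.n.01 via instance_hypernyms(), which hyponyms() does not return, while hypernym_paths() does follow them. A minimal check of that explanation (a sketch; it assumes NLTK with the WordNet 3.0 data installed):

from nltk.corpus import wordnet as wn

portugal = wn.synset('portugal.n.01')
country = wn.synset('european_country.n.01')

print(portugal in country.hyponyms())             # False: not a plain hyponym
print(portugal in country.instance_hyponyms())    # True: it is an instance hyponym
print(any(country in path                         # True: hypernym_paths() also
          for path in portugal.hypernym_paths())) # walks instance hypernyms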
	def prepare(self):
		for verb in wn.all_synsets('v'):
			for lemma in verb.lemmas():
				if 1 in lemma.frame_ids():
					for lemma in verb.lemmas():
						#print lemma.name()
						#print (lemma, lemma.frame_ids(), "|".join(lemma.frame_strings()))
						#print verb.frame_strings()
						verbs.append(str(lemma.name()).replace('_', ' '))
		#print verbs

		for noun in wn.all_synsets('n'):
			#print noun
			for lemma in noun.lemmas():
				#print lemma.name()
				nouns.append(self.plural(str(lemma.name()).replace('_', ' ')))
		#print nouns

		for adj in wn.all_synsets('a'):
			#print adj
			for lemma in adj.lemmas():
				#print lemma.name()
				adjectives.append(str(lemma.name()).replace('_', ' '))

		for adv in wn.all_synsets('r'):
			#print adv
			for lemma in adv.lemmas():
				#print lemma.name()
				adverbs.append(str(lemma.name()).replace('_', ' '))
Example #3
def list_nouns():

    global NOUNS
    print "[+] Creating list of nouns... (This only has to be done once)"

    if WIKI_LANGUAGE == 'en':
        ## Make list of nouns from wordnet
        NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
        ## TODO CREATE A SEPARATE LIST FOR NOUNS ENDING IN S
    elif WIKI_LANGUAGE == 'es':
        ## Make list of nouns from cess_esp
        list = nltk.corpus.cess_esp.tagged_words()
        sust = []
        for elem in list:
            if elem[1][0] == 'n':
                sust.append(elem[0])
        NOUNS = set(sust)
    # TODO german language support
    # elif WIKI_LANGUAGE == 'de':
    else:
        print "[!] Language not recognised, using English."
        ## Make list of nouns from wordnet
        NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}


    print "    Done!"
Example #4
def populateBars():

    connection = mdb.connect('localhost', 'user', 'pass', 'barlytics')

    current = connection.cursor()

    nounsList = []
    adjectiveList = []
    cityList = ['San Francisco', 'Chicago', 'New York', 'Austin', 'Seattle']

    print "here"
    count = 0
    for synset in list(wn.all_synsets('n')):
        nounsList.append(str(synset.name).split('.')[0])
        count = count + 1
        if count >= 50000:
            break

    count= 0
    print "here"
    for synset in list(wn.all_synsets('a')):
        adjectiveList.append(str(synset.name).split('.')[0])
        count = count + 1
        if count >= 50000:
            break
    print "here"
    finalList = []
    for i in range(10000):
        string = ''
        string = "The " + adjectiveList[randint(0, len(adjectiveList) - 1)].capitalize()

        string = string + " " + nounsList[randint(0, len(nounsList) - 1)].capitalize()
        finalList.append(string)

        name = string
        license = str(randint(1000000, 9000000))
        city = str(address.city())
        phone = str(phone_number.phone_number_format(0))
        addr = str(randint(1, 255)) + " " + address.street_name()


        query = 'insert into bars values("' + name + '", "' + license + '", "' + city + '", "' + phone + '", "' + addr + '"); '
        print query
        try:
            current.execute(query)
        except mdb.IntegrityError:
            print "integrity error:"
    print 'commit'
    connection.commit()
Example #5
    def _run_extract(self):
        #extract all 2 word AN and NN compounds from WN and write to file
        print "Extracting noun compounds from WN"


        discards=[]
        allsynsets=list(wn.all_synsets(self.parameters['pos']))
        if not self.parameters['testing']:
            self.n=len(allsynsets)
        for synset in list(wn.all_synsets(self.parameters['pos']))[:self.n]:
            for lemma in synset.lemmas:  #walk over all lemmas for all synsets
                #print lemma.name
                words=lemma.name.split('_')
                if len(words)==2:#check 2 words
                    poslist=[]
                    for word in words:
                        poslist.append(PairGenerator.getpos(word))#generate a PosList List for this pair of words
                    #print words,poslist
                    headpos=poslist.pop()
                    if 'N' in headpos:#is 'N' a possible part of speech for the head word (last word in the list)
                        phrase=words.pop()+'/N'
                        modpos=poslist.pop()
                        mod=words.pop()
                        if 'N' in modpos: #is 'N' a poss part of speech for mod
                            NNphrase=phrase+":nn-DEP:"+mod+'/N'
                            self.NNs.append(NNphrase)
                        if 'J' in modpos:#is 'J' a poss part of speech for mod
                            ANphrase=phrase+":amod-DEP:"+mod+'/J'
                            self.ANs.append(ANphrase)

                        if len(modpos)==0:#only considering J and N for mod
                            #print "Discarding "+lemma.name
                            discards.append(lemma.name)
                    else:#only considering N for head
                        #print "Discarding "+lemma.name
                        discards.append(lemma.name)

        print len(self.NNs),self.NNs
        print len(self.ANs),self.ANs
        print len(discards),discards
        #write lists to file
        with open(self.ANpath,'w') as outstream:
            for AN in self.ANs:
                outstream.write(AN+'\n')
        with open(self.NNpath,'w') as outstream:
            for NN in self.NNs:
                outstream.write(NN+'\n')
        return
def exercise3():
    print
    print "Exercise - 3"
    ss = [w for w in wn.all_synsets('v')]
    result = sum([len(ss[i].hypernyms()) for i in range(len(ss))])
    print "Total number of hypernyms of 'v' is: %d" %result
    print "Average number of hypernyms is: %f" %(result/float(len(ss)))
Example #7
def getAllGlossLinks(useTagger=False, useverbs=False, reflexive=False, n=10000):
    links = {}
    print "Gathering synsets"
    synsets = [ss for ss in wordnet.all_synsets()]
    done = 0  # progress counter (kept separate from the n parameter used below)
    for ss in synsets:
        print "%.3f"%(float(done)/float(len(synsets)))
        done += 1
        ssname = ss.name
        defn = wordboundary.split(ss.definition.strip())
        if useTagger:
            defn = [(form, wdnettags[tag[0]]) for form, tag in useTagger.tag(defn) if not form == "" and tag[0] in wdnettags]
        if not ssname in links:
            links[ssname] = {}
        for w in defn[:n]:
            if isinstance(w, str):
                wsynsets = wordnet.synsets(w)
            else:
                wsynsets = wordnet.synsets(w[0], w[1])
            for s in wsynsets:
                sname = s.name
                links[ssname][sname] = True
                if reflexive:
                    if not sname in links:
                        links[sname] = {}
                    links[sname][ssname] = True
        if not ssname in links:
            print ssname, defn
    for l in links:
        ll = links[l]
        for d in ll:
            links[l][d] = 1.0/float(len(ll))
    return links
Example #8
    def __init__(self):
        t0 = time()
        print 'initializing random word generator'

        self.s_articles = ['A', 'The']
        self.o_articles = ['a','the']
        self.prepositions = ['of','in','to','for','with','on','at','from','by',
        'about','as','into','like','through','after','over','out','around']

        self.nouns = list(wn.all_synsets(wn.NOUN))
        self.verbs = list(wn.all_synsets(wn.VERB))
        self.adjectives = list(wn.all_synsets(wn.ADJ))
        self.adverbs = list(wn.all_synsets(wn.ADV))
        t1 = time()
        runTime = t1-t0
        print 'word list initialized in ' + str(runTime) + ' seconds'
def main(argv):
  huang_vocab = LoadHuang()
  manaal_vocab = LoadManaal()
  brown_vocab = LoadBrown()

  all_lemmas = {x.lower() for x in wn.all_lemma_names(pos=wn.ADJ)}
  all_alpha_lemmas = {x for x in all_lemmas if x.isalpha()}
  all_synsets = set(wn.all_synsets(pos=wn.ADJ))
  all_alpha_synsets = {x for x in all_synsets if IsAlphaSS(x)}
  all_lemmas_with_single_synset = {x for x in all_lemmas if IsSingleSynset(x)}
  all_lemmas_ambig_synset = {x for x in all_lemmas if not IsSingleSynset(x)}
  all_lemmas_with_single_synset_alpha = {x for x in all_lemmas_with_single_synset if x.isalpha()}
  all_lemmas_ambig_synset_alpha = {x for x in all_lemmas_ambig_synset if x.isalpha()}
  all_alpha_lemmas_has_noun = {x for x in all_alpha_lemmas if LemmaHasNoun(x)}
  all_alpha_lemmas_has_noun_single_lexname = {x for x in all_alpha_lemmas_has_noun if IsNounSingleLexName(x) }
  print "all_lemmas:", len(all_lemmas)
  print "all_alpha_lemmas:", len(all_alpha_lemmas)
  print "all_synsets:", len(all_synsets)
  print "all_alpha_synsets:", len(all_alpha_synsets)
  print "all_lemmas_with_single_synset:", len(all_lemmas_with_single_synset)
  print "all_lemmas_ambig_synset:", len(all_lemmas_ambig_synset)
  print "all_lemmas_with_single_synset_alpha", len(all_lemmas_with_single_synset_alpha)
  print "all_lemmas_ambig_synset_alpha", len(all_lemmas_ambig_synset_alpha)
  print "all_alpha_lemmas_has_noun", len(all_alpha_lemmas_has_noun)
  print "all_alpha_lemmas_has_noun_single_lexname", len(all_alpha_lemmas_has_noun_single_lexname)
  print "huang.intersect(all_alpha_lemmas)", len(huang_vocab.intersection(all_alpha_lemmas))
  print "manaal.intersect(all_alpha_lemmas)", len(manaal_vocab.intersection(all_alpha_lemmas))
  print "brown.intersect(all_alpha_lemmas)", len(brown_vocab.intersection(all_alpha_lemmas))
  print "huang*manaal*brown*all_alpha_lemmas", len(huang_vocab.intersection(all_alpha_lemmas, manaal_vocab, brown_vocab))
  print "huang.intersect(all_lemmas_with_single_synset_alpha)", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "manaal.intersect(all_lemmas_with_single_synset_alpha)", len(manaal_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "brown.intersect(all_lemmas_with_single_synset_alpha)", len(brown_vocab.intersection(all_lemmas_with_single_synset_alpha))
  print "huang*manaal*brown*all_lemmas_with_single_synset_alpha", len(huang_vocab.intersection(all_lemmas_with_single_synset_alpha, manaal_vocab, brown_vocab))
Example #10
    def convert_all_to_basic(reviews):
        print("Process Started")
        print("Gettin all nouns....")
        words=[s for s in wn.all_synsets(wn.NOUN) if  (s.name().find('-')==-1) and (s.name().find('_')==-1) and len(s.name().split('.')[0])<12]

        print("Processing basic logic probability...")
        words2 = []
        filter_basic_logic(words,words2)

        print("Removing redundancy...")
        a = list(set(words2))
        a.sort()
        remove_unwanted(a)
        newReviews = []
        for review in reviews:
            tempReview = ""
            tokens = word_tokenize(review)
            for token in tokens:
                tempword = check_basic(token,a)
                if tempword:
                    tempReview = tempReview + " " + tempword
                else:
                    tempReview = tempReview + " " +  token
            newReviews.append(tempReview)
        return newReviews
Example #11
def wn_pos_dist():
    """Count the Synsets in each WordNet POS category."""
    # One-dimensional count dict with 0 as the default value:
    cats = defaultdict(int)
    # The counting loop:
    for synset in wn.all_synsets():
        cats[synset.pos] += 1
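
In current NLTK releases pos is a method rather than an attribute, so the same distribution can be collected with a Counter (a sketch; the exact counts depend on the installed WordNet data):

from collections import Counter
from nltk.corpus import wordnet as wn

# Count synsets per part-of-speech tag ('n', 'v', 'a', 's', 'r').
pos_counts = Counter(ss.pos() for ss in wn.all_synsets())
print(pos_counts)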
    def load_corpora( self ):

        print "Loading corpora..."

        pth = os.path.realpath( os.path.dirname(__file__) )
        nltk.data.path.append( os.path.join( pth, "nltk_data" ) )
        from nltk.corpus import wordnet as wn

        self._adjectives = list(wn.all_synsets('a'))
        self._nouns = list(wn.all_synsets('n'))

        with open( os.path.join( pth, "firstnames.txt") ) as fh:
            self._firstnames = fh.readlines()

        with open( os.path.join( pth, "surnames.txt") ) as fh:
            self._surnames = fh.readlines()
Example #13
def populate_cache():
    adjectives, nouns = (set(), set())

    for wordset, kind in [
        (adjectives, wordnet.ADJ),
        (nouns, wordnet.NOUN),
    ]:
        for synset in wordnet.all_synsets(kind):
            for lemma in filter(
                lambda l: all((
                    not re.search(r'\d', l.name()),
                    l.name() not in BLACKLIST,
                    not l.name().endswith('_to'),
                    l.count() > 0,
                )), synset.lemmas()
            ):
                wordset.add(lemma.name().replace('_', ' '))

    os.mkdir(CACHE_PATH)

    for words, filename in [
        (adjectives, 'adjectives'),
        (nouns, 'nouns'),
    ]:
        with open(os.path.join(CACHE_PATH, filename), 'w') as f:
            f.writelines((u'{}\n'.format(w) for w in words))
Example #14
def list_nouns():
    ## TODO CREATE A SEPARATE LIST FOR NOUNS ENDING IN S
    global NOUNS
    print "[+] Creating list of nouns... (This only has to be done once)"
    ## Make list of nouns in wordnet
    NOUNS = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
    print "    Done!"
Example #15
def ex26_branchingfactor():
  from nltk.corpus import wordnet as wn
  num_synsets = 0
  num_hyponyms = 0
  for noun_synset in wn.all_synsets("n"):
    (num_hyponyms, num_synsets) = \
      branchingfactor_r(noun_synset, num_synsets, num_hyponyms)
  print "branching factor=", (num_hyponyms / num_synsets)
Example #16
def print_all_synset_categories():
    """
    Prints all domains and
    categories for research purposes
    """
    categories = []
    for synset in list(wordnet.all_synsets('n')):
        categories.append(synset)
    return categories
Example #17
def getTaggedHyps():
    allsynsets = [ss for ss in wordnet.all_synsets()]
    alltags = {}
    for ss in allsynsets:
        alltags[ss.pos] = True
    taggedhyps = {}
    for tag in alltags:
        taggedhyps[tag] = getAllHyps(allsynsets, tag=tag)
    return taggedhyps
Example #18
def getUpDownLinks():
    links = {}
    for ss in wordnet.all_synsets():
        for s in ss.hypernyms()+ss.hyponyms():
            try:
                links[ss.name][s.name] = True
            except:
                links[ss.name] = {s.name:True}
    return links
Example #19
def synonyms(word, lch_threshold=2.26):
  for net1 in wn.synsets(word):
    for net2 in wn.all_synsets():
      try:
        lch = net1.lch_similarity(net2)
      except:
        continue
      if lch >= lch_threshold:
        yield (net1, net2, lch)
Example #20
def print_all_synset_categories():
    """Print all domains and categories for research purposes.

    :rtype categories (list): A list of all wordnet synsets.
    """
    categories = []
    for synset in list(wordnet.all_synsets('n')):
        categories.append(synset)
    return categories
Example #21
def wordlistGen(type_="a"):
    words = list(wordnet.all_synsets(type_))
    word_strs = []
    random.shuffle(words)
    for word in words[:1000]:
        word = str(word.lemma_names()[0])
        print word
        word_strs.append(str(word))
    pickle.dump(word_strs, open(os.getcwd() + "/" + type_ + ".p", "wb"))
Example #22
def ex13():
  from nltk.corpus import wordnet as wn
  num_synsets = 0
  num_synsets_wo_hyponyms = 0
  for noun_synset in wn.all_synsets("n"):
    if len(noun_synset.hyponyms()) == 0:
      num_synsets_wo_hyponyms = num_synsets_wo_hyponyms + 1
    num_synsets = num_synsets + 1
  print num_synsets_wo_hyponyms * 100 / num_synsets
Example #23
 def load(self, pos):
     if pos == 'n':
         roots = wn.synsets('entity')
     else:
         roots = [s for s in wn.all_synsets(pos) if len(s.hypernyms()) == 0]
     
     self.root = WordNetTreeNode('root')
     
     for synset in roots:
         self.__append_synset(synset, self.root)
Example #24
def export_wn_lexicon(output_fpath):
    with codecs.open(output_fpath, "w", "utf-8") as output:
        num_lemmas = 0
        for i, synset in enumerate(wordnet.all_synsets()):
            for lemma in synset.lemmas():
                print >> output, lemma.name()
                num_lemmas += 1

        print "Wordnet vocabulary:", output_fpath
        print "# lemmas:", num_lemmas
Example #25
    def __init__(self):
        words=[s for s in wn.all_synsets(wn.NOUN) if  (s.name().find('-')==-1) and (s.name().find('_')==-1) and len(s.name().split('.')[0])<12]

        words2 = self.filter_basic_logic(words)
        # print(words2)

        a = list(set(words2))
        a  = self.remove_unwanted(a)

        self.basic_word_list = a
Example #26
def _get_wordnet_words():
    """Get the set of all words known by WordNet.

    This is the set of all lemma names for all synonym sets in WordNet.

    """

    return set(word.lower()
               for synset in wordnet.all_synsets()
               for word in synset.lemma_names())
Example #27
def ex27_polysemy():
  from nltk.corpus import wordnet as wn
  for pos in ["n", "v", "a"]:
    synsets = wn.all_synsets(pos)
    num_synsets = 0
    num_senses = 0
    for synset in synsets:
      num_synsets = num_synsets + 1
      num_senses = num_senses + len(synset.lemmas)
    print "polysemy(" + pos + ")=", (num_senses / num_synsets)
Example #28
def branching_factor():
	all_syn = wn.all_synsets('n')
	total_ratio = 0
	all = 0
	for syn in all_syn:
		leaves = len(syn.hyponyms())
		if leaves > 0:
			ratio = float(leaves)
			total_ratio = total_ratio + ratio
			all += 1
	return total_ratio / float(all)
Example #29
def polysemy(pos):
	p = 0
	lemmas = []
	syns = list(wn.all_synsets(pos))
	for synset in syns:
		lemmas.extend(synset.lemma_names)
	for lemma in lemmas:
		new = len(wn.synsets(lemma, pos))
		p = p + new
	length = len(syns)
	return p/length
def build_dictionary():
	dictionary = dict()
	from nltk.corpus import wordnet
	for synset in wordnet.all_synsets():
		for lemma in synset.lemmas:
			if lemma.name.lower() not in dictionary:
				dictionary[lemma.name.lower()] = [synset.definition]
			else:
				dictionary[lemma.name.lower()].append(synset.definition)

	json.dump(dictionary, gzip.open('dictionary.json.gzip', 'w'))
Example #31
		self.off_to_description = {}
		self.description_to_off = {}
		with open(file, 'r') as f:
			for line in f:
				index = line.index(',')
				self.off_to_description[int(line[0:index])] = line[index + 1:].strip()
		for key in self.off_to_description:
			self.description_to_off[self.off_to_description[key]] = key
		print "synset reader successfully initialized"
		return self.off_to_description, self.description_to_off

	def get_description(self, offset):
		try:
			if isinstance(offset, int):
				return self.off_to_description[offset]
			if isinstance(offset, str): # ss_...
				return self.off_to_description[int(offset[3:])]
		except KeyError:
			print "synset offset ", offset, "not found"

if __name__=="__main__":	
	from nltk.corpus import wordnet as wn
	import sys

	print "writing synsets to {}".format(sys.argv[1])

	with open(sys.argv[1], 'w') as f:
		for ss in list(wn.all_synsets()):
			f.write('{},{}\n'.format(ss.offset(), ss.definition()))

Example #32
import discord
import asyncio
from discord.ext.commands import Bot
from discord.ext import commands
import json
from nltk.corpus import wordnet as wn
import random

nouns = list(wn.all_synsets(wn.NOUN))
adjectives = list(wn.all_synsets(wn.ADJ))

client = commands.Bot(description="Art Bot", command_prefix="!")


@client.event
async def on_ready():
    print('Logged in as ' + client.user.name + ' (ID:' + client.user.id +
          ') | Connected to ' + str(len(client.servers)) +
          ' servers | Connected to ' +
          str(len(set(client.get_all_members()))) + ' users')
    print('--------')


@client.event
async def on_message(message):
    msg = message.content
    if msg.lower() == 'go' and is_admin(message.author.id):
        await client.send_message(client.get_channel('545331566881144833'),
                                  get_random_options(nouns, adjectives))

from difflib import get_close_matches

try:
    from nltk.corpus import wordnet as wn
    raise_lookuperror_if_wordnet_data_absent = wn.synsets("python")
except LookupError:
    import nltk
    nltk.download("wordnet")
from unipath import Path
import inflect

ALL_WORDNET_WORDS = set()
for synset in list(wn.all_synsets()):
    for lemma in synset.lemmas():
        ALL_WORDNET_WORDS.add(lemma.name())

verbs_fh =  open(Path(__file__).ancestor(1).child("en-verbs.txt"))
lines = verbs_fh.readlines()
verbs_fh.close()
CONJUGATED_VERB_LIST = []
for line in lines:
    if line[0] != ";":
        CONJUGATED_VERB_LIST.append(
            [string for string in line.strip().split(",") if string != ""])

ADJECTIVE_TO_ADVERB = {"good" : "well", "fast" : "fast", "hard" : "hard",
                       "late" : "late", "early" : "early", "daily" : "daily",
                       "straight" : "straight"}
for ss in wn.all_synsets(pos = "r"):
    for lemma in ss.lemmas():
        word = lemma.name()
Example #34
    set([word for (word, tag) in brown_rel_tagged if tag == "NOUN" and word.isalpha() and not word[0].isupper()])
)

romn_clean = sorted(
    set([word for (word, tag) in brown_rom_tagged if tag == "NOUN" and word.isalpha() and not word[0].isupper()])
)


# %%
from nltk.corpus import wordnet as wn


type = "n"

# %%
synsets = wn.all_synsets(type)


# %%
def find_polysemy(text):

    count = 0

    for w in text:
        count += len(wn.synsets(w))

    return count / len(text)


n_rel = find_polysemy(rel_clean)
print(n_rel)
Example #35
from nltk.corpus import wordnet as wn
noun_count = 0
total_noun_count = 0
for synset in wn.all_synsets('n'):
    #print(synset.name()[:-5])
    total_noun_count += 1
    noun_count += len(wn.synsets(synset.name()[:-5], 'n'))
print(noun_count, total_noun_count, noun_count / total_noun_count)
Example #36
import numpy as np
from nltk.corpus import wordnet as wn

from vectorspace import VSM


def syn2sks(synset):
    return list(set([lemma.key() for lemma in synset.lemmas()]))


sks_vecs_path = sys.argv[1]
syns_vecs_path = sys.argv[2]

print('Loading sensekey vecs ...')
sks_vsm = VSM()
sks_vsm.load_txt(sks_vecs_path)

print('Aggregating synset vecs ...')
syn_vecs = defaultdict(list)
for syn in wn.all_synsets():
    for sk in syn2sks(syn):
        if sk in sks_vsm.labels_set:
            syn_vecs[syn.name()].append(sks_vsm.get_vec(sk))

print('Writing synset vecs ...')
with open(syns_vecs_path, 'w') as syns_vecs_f:
    for syn, syn_vecs in syn_vecs.items():
        syn_vec = np.array(syn_vecs).mean(axis=0)
        syn_vec_str = ' '.join([str(round(v, 6)) for v in syn_vec.tolist()])
        syns_vecs_f.write('%s %s\n' % (syn, syn_vec_str))
# https://medium.com/snips-ai/an-introduction-to-snips-nlu-the-open-source-library-behind-snips-embedded-voice-platform-b12b1a60a41a
#
# * To be very fair on our benchmarks and results, we used the same train and test set used by the other benchmarks and no cross validation or stratified splits were used. The test data was not used in any way to improve the results. The dataset used can be found here:
#
# https://github.com/Botfuel/benchmark-nlp-2018/tree/master/results
#
#
# Spacy english dataset with vectors needs to be present. It can be downloaded using the following command:
#
# python -m spacy download en_core_web_lg

# !python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')
print('Running')

nouns = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('n')}
verbs = {x.name().split('.', 1)[0] for x in wordnet.all_synsets('v')}


def get_synonyms(word, number=3):
    synonyms = []
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonyms.append(l.name().lower().replace("_", " "))
    synonyms = list(OrderedDict.fromkeys(synonyms))
    return synonyms[:number]


#Hyperparameters
benchmark_dataset = ''  # Choose from 'AskUbuntu', 'Chatbot' or 'WebApplication'
oversample = False  # Whether to oversample small classes or not. True in the paper
split = []
ids = []
synsetid = []
modelid = []
ids_dict = {}

with open('all.csv', 'rb') as f:
    reader = csv.reader(f)  #csv read object file
    next(reader)  # skip the headers
    for row in reader:
        ids.append(row[0])
        synsetid.append(row[1])
        modelid.append(row[3])
        split.append(row[4])
#pdb.set_trace()

syns = list(wordnet.all_synsets())
offsets_list = [(s.offset(), s) for s in syns]
offsets_dict = dict(offsets_list)

class_ids = set(synsetid)
class_ids = list(class_ids)
class_dict = {}

for id in class_ids:
    #pdb.set_trace()
    key = int(id)
    class_name = offsets_dict[key]
    class_name = str(class_name)
    value = class_name.split('.')[0][8:]
    class_dict[key] = value
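
As an aside to the offset lookup built above: newer NLTK versions expose wn.synset_from_pos_and_offset(), which resolves a (pos, offset) pair directly and avoids building the full offsets_dict (a sketch, assuming the WordNet 3.0 data):

from nltk.corpus import wordnet as wn

# Round-trip a synset through its (pos, offset) pair.
dog = wn.synset('dog.n.01')
same = wn.synset_from_pos_and_offset(dog.pos(), dog.offset())
print(dog == same)  # True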
Example #39
    # text = [stemmer.stem(w) for w in text]
    text = [lemmatizer.lemmatize(w) for w in text]
    # now = time.time()
    # print('finished lemmatize in:', int(now-start))
    return ' '.join(text)

def norm(vector):
    val = vector.dot(vector.transpose())
    if vector.__class__ not in [np.matrix, np.ndarray]:
        val = val.toarray()
    return math.sqrt(val)

# So I will need lemma_names. Want to see whether they cover the definitions
lemmas = []
defs = []
for s in wn.all_synsets():
    defs.append(lemmatize(s.definition()))
    lemmas.append(lemmatize(' '.join(s.lemma_names())))
len(lemmas)
len(defs)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(defs)
vec2 = CountVectorizer()
Y = vec2.fit_transform(lemmas)

voc1 = vectorizer.get_feature_names()
voc2 = vec2.get_feature_names()

inter = list(set(voc1) & set(voc2))
print('voc1: {}; voc2: {}; inter: {}'.format(len(voc1), len(voc2), len(inter)))
Example #40
 def load_sk2syn(self):
     for synset in wn.all_synsets():
         for lemma in synset.lemmas():
             self.map_sk2syn[lemma.key()] = synset
Example #41
 def get_all_syns(self):
     return list(wn.all_synsets())
Example #42
data_test_500_rand1_unseen = json.load(
    open(os.path.join(data_path, 'data_test_500_rand1_unseen.json')))
data_desc_c = json.load(open(os.path.join(data_path, 'data_desc_c.json')))

word_list = []
for data in data_test_500_rand1_seen:
    word_list.append(data['word'])  # word
for data in data_test_500_rand1_unseen:
    word_list.append(data['word'])  # word

#lines = open(os.path.join(data_path, 'concept_words.txt')).readlines()
#concept_words = [line.strip() for line in lines]
concept_words = [value['word'] for value in data_desc_c]
word_list = word_list + concept_words

all_synsets = list(wn.all_synsets())
word_synset = {}
for synset in all_synsets:
    # filter all multi-word phrases indicated by _
    lemmas = [lemma for lemma in synset.lemmas() if "_" not in lemma.name()]
    if len(lemmas) == 0:
        continue
    for lemma in lemmas:
        wd = lemma.name().lower()
        if wd in word_list:
            if wd not in word_synset:
                word_synset[wd] = []
            tmp = [le.name().lower() for le in lemmas]
            word_synset[wd].extend(tmp)
word_syn = []
for wd in word_list:
# Compute the average polysemy of nouns, verbs, adjectives and adverbs with WordNet

from nltk.corpus import wordnet as wn
type = 'n' # set the POS type to noun (n-noun, v-verb, r-adverb, a-adjective)

synsets = wn.all_synsets(type) # all synsets of the noun type 'n'

# merge into a single list of lemmas
lemmas = []
for synset in synsets:
    for lemma in synset.lemmas():
        lemmas.append(lemma.name())

# remove duplicates and count the distinct lemmas
lemmas = set(lemmas) # converting the list to a set removes duplicates

count = 0
for lemma in lemmas:
    count = count + len(wn.synsets(lemma, type))

print('Number of distinct lemmas: ', len(lemmas))
print('Total senses: ', count)
print(type, '(noun) average polysemy: ', count/len(lemmas))
import numpy as np
from nltk.corpus import wordnet as wn

for head in wn.all_synsets():
    for rel in [
            'hyponyms', 'hypernyms', 'part_meronyms', 'substance_meronyms',
            'part_holonyms', 'substance_holonyms', 'entailments'
    ]:
        for tail in getattr(head, rel)():
            print('\t'.join([head.name(), rel, tail.name()]))
from nltk.corpus import wordnet as wn
from textstat.textstat import textstat
"""
Run with `python collect_naked.py > unfiltered_naked.txt`
"""

adjectives = list(wn.all_synsets('a')) + list(wn.all_synsets('s'))
nakeds = []

# Collect all two-syllable adjectives
for item in adjectives:
    for adj in item.lemmas():
        syllables = round(textstat.syllable_count(adj.name()))
        if syllables == 2.0:
            naked = adj.name().replace("_", " ")
            nakeds.append(naked)

# Uniques only
nakeds = set(nakeds)

for naked in nakeds:
    print naked.encode('utf8')
Example #46
def get_concept_set():

    for concept in nlwn.all_synsets(pos='n'):
        yield concept.name()
'''
Extracts all nouns from WordNet
and writes them into a new txt file
'''

import nltk
from nltk.corpus import wordnet as wn
my_file=open('English_nouns.txt','a')
for synset in list(wn.all_synsets('n')):
    my_file.write(synset.name()[:-5]+"\n")
my_file.close()
# url = "https://api.weather.gov/gridpoints/LWX/89,65/forecast"
# html = request.urlopen(url).read().decode('utf8')
# print(html[2100:2200])

# ch3 ex22
# import re
# response = request.urlopen('http://news.bbc.co.uk/')
# raw = response.read().decode('utf8')
# print(re.sub(r'(<.*?>|<\/.*?>)(?s)', '', raw))

# ch2 ex27
from nltk.corpus import wordnet as wn

print("average polysemy of: ")
# nouns
synsets = wn.all_synsets("n")
lemmas = set()
for synset in synsets:
    for lemma in synset.lemmas():
        lemmas.add(lemma.name())
count = 0
for lemma in lemmas:
    count = count + len(wn.synsets(lemma, "n"))
print("nouns: %s" % (count / len(lemmas)))
# verbs
synsets = wn.all_synsets("v")
lemmas = set()
for synset in synsets:
    for lemma in synset.lemmas():
        lemmas.add(lemma.name())
count = 0
Example #49
#12.
count_distinct = 0
dublettes = []
prev = ''
for entry in nltk.corpus.cmudict.entries():
    if ((entry[0] == prev) and (entry[0] not in dublettes)):
        dublettes.append(entry[0])
    else: 
        count_distinct = count_distinct + 1
        prev = entry[0]
print count_distinct
print (len(dublettes) / count_distinct) * 100

#13.
all_syns = list(wn.all_synsets('n'))
no_hyponyms = [s for s in all_syns if len(s.hyponyms()) == 0]
print (len(no_hyponyms) / len(all_syns)) * 100

#14.
def supergloss(s):
    gloss = 'definition: ' + s.definition() + '\n\n'
    gloss = gloss + 'Hypernyms:\n'
    for hypernym in s.hypernyms():
        gloss = gloss + hypernym.name() + ': ' + hypernym.definition() + '\n'
    gloss = gloss + '\nHyponyms:\n'
    for hyponym in s.hyponyms():
        gloss = gloss + hyponym.name() + ': ' + hyponym.definition() + '\n'
    return gloss

print supergloss(wn.synset('bicycle.n.01'))
Example #50
def offset_pos_from_emb_key(str):
    return str[PREF:PREF + 8], str[PREF + 9:PREF + 10]


def offstr(off):
    return '{0:0>8}'.format(off)


def pos_of_set(set_name):
    return set_name.split('.')[-2]


if __name__ == '__main__':
    # map synset names to offsets
    sset_offs = {s.name(): s.offset() for s in wn.all_synsets()}

    # load expected synset names in order (3.0)
    with open(MATRICES_DATA_W_SETS) as matrix_file:
        set_names = pickle.load(matrix_file)[1]  # it's the same
        # as wn.all_synsets(), but let's be really careful and extensible.
    print('finished loading {} 3.0 synsets and offset map'.format(
        len(set_names)))

    # load embeddings
    syn_embs = {'n': {}, 'v': {}, 'r': {}, 'a': {}}
    with open(SYNSET_EMBEDDINGS_FILE) as embs_file:
        header_line = True
        for l in embs_file.readlines():
            if header_line:  # skip
                header_line = False
Example #51
        else:
            if len(def_words) <= 4 or ';' in definition:
                return definition.replace(',', '').split(';')
            else:
                return []
    else:
        return []


# the whole process might take a long time for all POS; it is better to run the processing of each POS on separate machines
# for noun synsets, we only deal with those that do not have example sentences
for pos in ['n', 'v', 'a', 'r']:
    non_retreive = list()
    type2pos = {1: 'n', 2: 'v', 3: 'a', 4: 'r', 5: 'a'}
    all_synsets = [
        i.name() for i in wn.all_synsets(pos) if len(
            wn.synsets(i.name().split('.')[0], type2pos[int(
                i.lemmas()[0].key().split('%')[1][0])])) > 0
    ]
    if os.path.exists('./sentence_dict_%s_new' % pos):
        sentence_dict = {
            i: j
            for i, j in pickle.load(open('./sentence_dict_%s_new' %
                                         pos, 'rb')).items()
        }
        non_retreive = [i for i in sentence_dict.keys()]
    else:
        sentence_dict = defaultdict(list)
        non_retreive = all_synsets

    loop_bool = True
Example #52
 def get_all_synsets(self):
     return wn.all_synsets('n')
Example #53
motorcar = wn.synsets('motorcar')
print('synsets that motorcar belongs to: ' + repr(motorcar))
cars = wn.synset('car.n.01')
print('synset of car sense 1: ' + str(cars))

print('car sense 1 lemma names: ' + repr(cars.lemma_names()))
print('car sense 1 definition: ' + cars.definition())
print('car sense 1 example sentences: ' + repr(cars.examples()))
car_lemmas = cars.lemmas()
print('car sense 1 lemmas: ' + repr(car_lemmas))

automobile = wn.lemma('car.n.01.automobile')
print('synset of automobile (car sense 1): ' + str(automobile.synset()))
print('name of the automobile lemma: ' + automobile.name())

all_noun_synsets = wn.all_synsets('n')
print('number of noun synsets: ' + str(len(list(all_noun_synsets))))

car_synsets = wn.synsets('car')
print('synsets that car belongs to: ' + repr(car_synsets))
for synset in car_synsets:
	print(str(synset) + ' ' + repr(synset.lemma_names()))

print('synsets in which car is a lemma: ' +  repr(wn.lemmas('car')))

motorcar = wn.synset('car.n.01')
types_of_motorcar = motorcar.hyponyms()
print('types of motorcars: ' + repr(types_of_motorcar))
print('types of motorcars (all words): ' + repr(sorted([lemma.name() for synset in types_of_motorcar for lemma in synset.lemmas()])))

print('motorcar hypernyms: ' + repr(motorcar.hypernyms()))
#6. What percentage of noun synsets have no hyponyms? You can get all noun synsets using wn.all_synsets('n')
from nltk.corpus import wordnet as wn

nounSynsets = set(wn.all_synsets('n'))

zeroCount = 0
total = 0
for synset in nounSynsets:
    total += 1
    if len(synset.hyponyms()) == 0:
        zeroCount += 1

percentage = round((zeroCount / total) * 100, 2)
print(percentage, "%")
Example #55
def init_gloss_data(depth=2):
    gloss_data = {}

    for synset in tqdm.tqdm(list(wn.all_synsets())):
        gloss_data[synset.name()] = {}

        related_synsets = [synset]
        for d in range(depth):
            gloss_words = []
            example_words = []
            for s in related_synsets:
                gloss_words += synset_gloss[s.name()]
                example_words += synset_example[s.name()]

            for w in gloss_words:
                if w not in gloss_data[synset.name()]:
                    gloss_data[synset.name()][w] = {
                        "freq": 1,
                        "graph_distance": d,
                    }
                else:
                    gloss_data[synset.name()][w]["freq"] += 1

            for w in example_words:
                if w not in gloss_data[synset.name()]:
                    gloss_data[synset.name()][w] = {
                        "freq": 1,
                        "graph_distance": d + 1,
                    }
                else:
                    gloss_data[synset.name()][w]["freq"] += 1

            new_related_synset = []
            for s in related_synsets:
                ns = s.also_sees() \
                     + s.attributes() \
                     + s.causes() \
                     + s.entailments() \
                     + s.hyponyms() \
                     + s.hypernyms() \
                     + s.instance_hypernyms() \
                     + s.instance_hyponyms() \
                     + s.member_meronyms() \
                     + s.member_holonyms() \
                     + s.part_holonyms() \
                     + s.part_meronyms() \
                     + s.region_domains() \
                     + s.substance_meronyms() \
                     + s.substance_holonyms() \
                     + s.topic_domains() \
                     + s.usage_domains() \
                     + s.verb_groups() \
                     + s.similar_tos()

                for l in s.lemmas():
                    ns += [
                        x.synset() for x in l.derivationally_related_forms()
                    ]
                    ns += [x.synset() for x in l.pertainyms()]

                if use_glossdisambiguated:
                    ns += synset_gloss_relation[s.name()]

                new_related_synset += ns

            related_synsets = list(set(new_related_synset))

    return gloss_data
Example #56
import django

django.setup()

# WordNet30 Populating
from ws_web.models import WordNet30

from nltk.corpus import wordnet as wn


def populate(x):
    # data is a list of lists
    d, created = WordNet30.objects.get_or_create(word=x.name().split('.')[0],
                                                 pos=x.pos(),
                                                 offset=x.offset(),
                                                 definition=x.definition(),
                                                 examples=x.examples(),
                                                 lemma_names=x.lemma_names(),
                                                 name=x.name())
    print(d, created)


if __name__ == "__main__":
    y = wn.all_synsets()
    i = 0
    for synset in iter(y):
        i += 1
        populate(synset)
        print(i)
for entry in test:
	for ingredient in entry["ingredients"]:
		words = ingredient.lower().replace("-","").split(" ")
		for i, word in enumerate(words):
			if word in adjDict and i<len(words)-1:
				adjDict[word] += 1

wordList = []
for key, value in adjDict.iteritems():
	if value>5:
		wordList.append(key)
			



nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')}
# adjs = {x.name().split('.', 1)[0] for x in wn.all_synsets(wn.ADJ)}

finalList = []
seen = set()
for word in list(wordList):  # iterate over a copy so wordList.remove() below does not skip items
	if word in nouns:
		wordList.remove(word)
	else:
		finalList.append(word)
		seen.add(word)

print finalList
with open('excludeList.json', 'w') as outfile:
	json.dump(finalList, outfile)
Example #58
files                = sorted( glob.glob( input_search_string ) )
print "Found {0} input images in {1}".format( len(files), input_search_string )

if len(files) > 0:

    #Only load names if there is something to name
    if generate_memorable_names:
        print 'Loading words for memorable name generation.'
        import nltk
        import random
        from nltk.corpus import wordnet
        
        # Seed based on input path so that names will be the same for multiple volumes
        random.seed( sbdm_string_hash( original_input_ids_path ) )

        nouns, verbs, adjectives, adverbs = [list(wordnet.all_synsets(pos=POS)) for POS in [wordnet.NOUN, wordnet.VERB, wordnet.ADJ, wordnet.ADV]]
        nouns_verbs = nouns + verbs
        adjectives_adverbs = adjectives + adverbs

        nouns_verbs = [x for x in nouns_verbs if not ( '_' in x.lemmas[0].name or '-' in x.lemmas[0].name )]
        adjectives_adverbs = [x for x in adjectives_adverbs if not ( '_' in x.lemmas[0].name or '-' in x.lemmas[0].name )]
        
        def make_memorable_name():

            while True:
                word1 = random.choice(random.choice(adjectives_adverbs).lemmas).name
                #ignore hyphenated words
                if not ('_' in word1 or '-' in word1):
                    break

            while True:
Example #59
#!/usr/bin/env python

import nltk
from nltk.corpus import wordnet as wn

ss = wn.all_synsets(pos=wn.NOUN)

for s in ss:
    ws = [l.name() for l in s.lemmas()]
    for n, t in nltk.pos_tag(ws):
        if t == 'NNP' and len(n) > 3:
            print(n.replace('_', ' '))
Example #60
def add_topic_features(data, test=False):
    print("Adding topic features")
    if not test:
        print("Train set registered, computing sentiment and topics")
        adjectives = set([
            synset.name().split('.')[0]
            for synset in list(wn.all_synsets(wn.ADJ))
        ])

        dataset = data
        documents = dataset['reviewText']
        new_df = pd.DataFrame({'document': documents})
        tokenized_doc = new_df['document']
        tokenized_doc = tokenized_doc.apply(
            lambda x: [item for item in x if item in adjectives])

        # de-tokenization
        detokenized_doc = []
        for i in range(len(new_df)):
            t = ' '.join(tokenized_doc[i])
            detokenized_doc.append(t)

        new_df['document'] = detokenized_doc
        vectorizer = TfidfVectorizer(
            stop_words='english',
            max_features=100,  # keep top 1000 terms
            max_df=0.9,
            smooth_idf=True)

        X = vectorizer.fit_transform(new_df['document'])
        # SVD represent documents and terms in vectors
        svd_model = TruncatedSVD(n_components=25,
                                 algorithm='randomized',
                                 n_iter=100,
                                 random_state=122)

        svd_model.fit(X)
        terms = vectorizer.get_feature_names()
        topics = []
        for i, comp in enumerate(svd_model.components_):
            terms_comp = zip(terms, comp)
            sorted_terms = sorted(terms_comp, key=lambda x: x[1],
                                  reverse=True)[:7]
            topics.append(sorted_terms[0][0])

        cleaned_topics = list(set(topics))
        print("Writing topics")
        print(cleaned_topics)
        with open('topics.txt', 'w') as topic_file:
            for item in cleaned_topics:
                topic_file.write(item + '\n')
        #add features to df
        df = data
        df = pd.concat([df, pd.DataFrame(columns=cleaned_topics)], sort=False)
        df = df.fillna(int(0))
        print("Adding topics to train")
        print(cleaned_topics)
        for i, row in df.iterrows():
            intersect = set(row['reviewText']) & set(cleaned_topics)
            for word in intersect:
                df.at[i, word] = 1
        return df

    else:
        print("Test registered, writing topics to dataframe")
        cleaned_topics = []
        print("Opening topic file")
        with open('topics.txt', 'r') as topic_file:
            for line in topic_file.readlines():
                cleaned_topics.append(line.strip("\n"))
        #print(data.head(2))
        print("Adding test topics")
        print(cleaned_topics)

        df = data
        df = pd.concat([df, pd.DataFrame(columns=cleaned_topics)], sort=False)
        df = df.fillna(int(0))
        for i, row in df.iterrows():
            intersect = set(row['reviewText']) & set(cleaned_topics)
            for word in intersect:
                df.at[i, word] = 1
        return df