Example #1
def resnik(word1, word2, ic_data):
    """
    Calculate Resnik similarity between word1 and word2 using information content data.
    Resnik similarity = max of [ic(s) for s in subsumers(word1, word2)]
    :param word1: first word (the probe)
    :param word2: second word (the noun)
    :param ic_data: information content dictionary passed to information_content
    :return: (resnik_sim, best_sense) - highest subsumer IC and the word1 sense that achieved it
    """
    resnik_sim = 0
    best_sense = ''
    for syn_word1 in wordnet.synsets(word1):      # senses of word1 (the probe)
        for syn_word2 in wordnet.synsets(word2):  # senses of word2 (the noun)
            subsumers = syn_word1.common_hypernyms(syn_word2)  # all common subsumers
            ic = 0
            if len(subsumers) > 0:
                # IC of the most informative common subsumer of this sense pair
                ic = max(information_content(s, ic_data) for s in subsumers)
            if ic > resnik_sim:
                # Keep the best-scoring sense pair seen so far
                resnik_sim = ic
                best_sense = syn_word1
    return resnik_sim, best_sense
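A quick usage sketch for the function above; it assumes wordnet, wordnet_ic and information_content come from NLTK as in the other examples, and the word pair is purely illustrative:

from nltk.corpus import wordnet, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

ic_data = wordnet_ic.ic('ic-brown.dat')           # Brown-corpus IC counts bundled with NLTK
score, sense = resnik('nickel', 'coin', ic_data)  # IC of the most informative common subsumer
print(score, sense)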
Example #2
def generate_cv_entry(synset, fout):
	#ml = monosemous_lemmas(synset)
	ml = []
	# if synset.name() == 'living_thing.n.01':
	# 	pdb.set_trace()
	intersectable_lemmas = find_intersectable_child_lemmas(synset)

	synset_ic = information_content(synset, ic)

	hypernym_names = []
	for hypernym in synset.hypernyms() + synset.instance_hypernyms():
		hyper_ic = information_content(hypernym, ic)
		if synset_ic == 0 or hyper_ic == 0:
			if hypernym.name() != 'entity.n.01':
				shared_ic_ratio = 1.0
			else:
				shared_ic_ratio = 0.0
		else:
			shared_ic_ratio = hyper_ic / synset_ic

		hypernym_names.append((hypernym.name(), shared_ic_ratio))

	fout.write('! %s\n' % synset.name())
	fout.write('WV ')
	for m in ml:
		fout.write('%s ' % m)
	fout.write('\nWVI ')
	for w1, w2 in intersectable_lemmas:
		fout.write('%s %s ; ' % (w1, w2))
	fout.write('\nP ')
	for p, weight in hypernym_names:
		fout.write('%s %f ; ' % (p, weight))
	fout.write('\n\n')
Example #3
def ic1_ic2_lcs_ic(synset1, synset2, ic):
    ic1 = nltk_reader.information_content(synset1, ic)
    ic2 = nltk_reader.information_content(synset2, ic)
    # get the common ancestor (subsumer) nodes
    subsumers = synset1.common_hypernyms(synset2)
    if len(subsumers) == 0:
        subsumer_ic = 0
    else:
        subsumer_ic = max(nltk_reader.information_content(s, ic) for s in subsumers)

    return ic1, ic2, subsumer_ic
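The three IC values returned by ic1_ic2_lcs_ic are the usual ingredients of the IC-based similarity measures. A rough sketch of how they are typically combined (the helper names below are illustrative, not from the source project):

# Resnik similarity is subsumer_ic itself; Lin and Jiang-Conrath combine all three values.
def lin_from_ics(ic1, ic2, subsumer_ic):
    # Lin similarity: 2 * IC(lcs) / (IC(s1) + IC(s2))
    if ic1 + ic2 == 0:
        return 0.0
    return 2.0 * subsumer_ic / (ic1 + ic2)

def jcn_from_ics(ic1, ic2, subsumer_ic):
    # Jiang-Conrath distance: IC(s1) + IC(s2) - 2 * IC(lcs)
    return ic1 + ic2 - 2.0 * subsumer_ic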
def getInfoContent(wordList):
  tokens = []
  for word in wordList:
    tokens += nltk.word_tokenize(word)
  filtered_tokens = [w for w in tokens
                     if w not in stopwords.words('english')]
  uniquetokens = list(set(filtered_tokens))
  icArray = []
  for token in uniquetokens:
    tempNum = 0
    synsets = wn.synsets(token)
    if len(synsets) > 0:
      for synset in synsets:
        # take the first noun/verb sense; skip adjective ('a'/'s') and adverb ('r') senses
        if synset.pos() not in ('a', 's', 'r'):
          synsetItem = synset
          tempNum = 1
          break
      if tempNum == 1:
        infoContent = information_content(synsetItem, brown_ic)
        icArray.append([token, infoContent])
      else:
        icArray.append([token, 0.0])
    else:
      icArray.append([token, 0.0])
  return icArray
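A rough setup and usage sketch for getInfoContent; the module-level names it relies on (nltk, stopwords, wn, information_content, brown_ic) are assumed to be created as shown, and the input list is illustrative:

import nltk  # requires the 'punkt', 'stopwords', 'wordnet' and 'wordnet_ic' data packages
from nltk.corpus import stopwords, wordnet as wn, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')

# Returns [token, IC] pairs, with 0.0 for tokens whose senses are all adjectives/adverbs
# or that are missing from WordNet.
print(getInfoContent(["information content", "similarity"]))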
Example #5
def resnik_similarity(word1, word2, ic):
    # initializing
    _ic = wordnet_ic.ic(ic)
    max_subsumer_ic = 0
    most_similar_syn_w1 = wn.synsets(word1)[0]
    most_similar_syn_w2 = None  # stays None if no sense pair shares a subsumer with IC > 0

    # if the second word isn't in wordnet, ignore it (the loops below won't run)
    if len(wn.synsets(word2)) == 0:
        most_similar_syn_w2 = 0
    # double for loop to pick the most similar sense of each of the two words
    for w1_synset in wn.synsets(word1):
        for w2_synset in wn.synsets(word2):
            subsumers = w1_synset.common_hypernyms(w2_synset)

            if len(subsumers) == 0:
                subsumer_ic = 0
            else:
                # get the information content value for the most specific/informative subsumer
                subsumer_ic = max(
                    information_content(sub, _ic) for sub in subsumers)
            # make sure we keep the most similar sense pair
            if max_subsumer_ic < subsumer_ic:
                max_subsumer_ic = subsumer_ic
                most_similar_syn_w1 = w1_synset
                most_similar_syn_w2 = w2_synset

    return max_subsumer_ic, most_similar_syn_w1, most_similar_syn_w2
 def ic_corpus(self, word):
     c = self.word2synset(word)
     if len(c) == 0 and " " in word:
         return self.ic_corpus(word.split(" ")[0])
     elif len(c) == 0:
         return 0.0
     return sum([information_content(ci, self._ic_corpus)
                 for ci in c]) / len(c)
Example #7
    def ic(self, x):
        if not self._ic:
            wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'),
                                         r'.*\.dat')
            self._ic = wnic.ic('ic-bnc-resnik-add1.dat')

        val = information_content(x, self._ic)

        return val
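For the stock IC files shipped with NLTK, the same lazy initialisation can usually be done through the wordnet_ic corpus loader instead of constructing the reader by hand; a small sketch using the same add-1-smoothed BNC file as above:

from nltk.corpus import wordnet_ic

# nltk.corpus.wordnet_ic wraps WordNetICCorpusReader over corpora/wordnet_ic,
# so this is roughly equivalent to the manual construction in the example above.
_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')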
Example #8
def resnik_similarity(word, context, wnic):

    probe_senses = wordnet.synsets(word)
    context_senses = wordnet.synsets(context)

    if not context_senses:
        return None

    top_subs = set()

    for sense in probe_senses:
        for context_sense in context_senses:  # renamed to avoid shadowing the 'context' argument
            common_hypernyms = sense.common_hypernyms(context_sense)
            if common_hypernyms:
                mark1 = max(common_hypernyms, key=lambda x: information_content(x, wnic))
                mark2 = information_content(mark1, wnic)
                top_subs.add((mark1, mark2))

    if not top_subs:
        return None

    mis = max(top_subs, key=lambda x: x[1])

    return mis
Example #9
def ResnikSimilarity(probe_word, context_word, ic):
    probe_senses = wn.synsets(probe_word)
    context_senses = wn.synsets(context_word)

    if not context_senses:
        return None

    most_informative_subsumers = set()

    for p in probe_senses:
        for c in context_senses:
            common_hypernyms = p.common_hypernyms(c)
            if common_hypernyms:
                temp = max(common_hypernyms,
                           key=lambda x: information_content(x, ic))
                score = information_content(temp, ic)

                most_informative_subsumers.add((temp, score))

    if not most_informative_subsumers:
        return None

    mis = max(most_informative_subsumers, key=lambda x: x[1])

    return mis
Example #10
def resnik_similarity(word, context, wnic):

    probe_senses = wordnet.synsets(word)
    context_senses = wordnet.synsets(context)

    if not context_senses:
        return None

    top_subs = set()

    for sense in probe_senses:
        for context_sense in context_senses:  # renamed to avoid shadowing the 'context' argument
            common_hypernyms = sense.common_hypernyms(context_sense)
            if common_hypernyms:
                mark1 = max(common_hypernyms,
                            key=lambda x: information_content(x, wnic))
                mark2 = information_content(mark1, wnic)
                top_subs.add((mark1, mark2))

    if not top_subs:
        return None

    mis = max(top_subs, key=lambda x: x[1])

    return mis
 def sim(self, c1, c2):
     """Find the most informative subsumer of these 2 concepts."""
     hypernyms = c1.common_hypernyms(c2)
     most_information = None
     most_informative_hypernym = None
     information = []
     for hypernym in hypernyms:
         hypernym_ic = information_content(hypernym, self.brown_ic)
         information.append(hypernym_ic)
         if most_information is None or hypernym_ic > most_information:
             most_information = hypernym_ic
             most_informative_hypernym = hypernym
     return (most_information, most_informative_hypernym)
Example #12
def resnik(word1, word2, ic):
    word1_senses = wordnet.synsets(word1)
    word2_senses = wordnet.synsets(word2)
    sim = {}
    if not word1_senses or not word2_senses:
        return (('NOSENSE', 'NOSENSE'), 0)
    for syn1 in word1_senses:
        for syn2 in word2_senses:
            common = syn1.common_hypernyms(syn2)
            if len(common) > 0:
                sim[(syn1, syn2)] = max(information_content(s, ic) for s in common)
            else:
                sim[(syn1, syn2)] = 0
    # best-scoring sense pair: ((sense_of_word1, sense_of_word2), score)
    return max(sim.items(), key=lambda x: x[1])
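NLTK also exposes Resnik similarity as a synset method, which the sense-pair loops above essentially reproduce; a short comparison sketch (the word and sense choices are illustrative):

from nltk.corpus import wordnet, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')

# Hand-rolled version from the example above: best ((sense1, sense2), score) pair
(best_s1, best_s2), score = resnik('dog', 'cat', brown_ic)

# Built-in equivalent for one fixed sense pair
builtin_score = wordnet.synset('dog.n.01').res_similarity(wordnet.synset('cat.n.01'), brown_ic)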
Example #13
def getMetadata(textData):
    ic_freq_obj = {}
    textArray = json.dumps(textData).split("\\n")
    parsedTextArray = [x.split(';') for x in textArray]
    sentenceList = [x[3] for x in parsedTextArray if len(x) == 4]
    filteredSentenceList = []
    filteredWords = []
    nestedWordList = []
    tokens = []
    specialString = "!@#$%^&*()[]{};:,./<>?\\|`~=_+-"
    for sentence in sentenceList:
        sentLower = sentence.lower()
        sentence_filt = sentLower.translate \
                         ({ord(c): " "+c+" " for c in specialString})
        # wordTokens = nltk.word_tokenize(sentence_filt)
        wordTokens = sentence_filt.split()
        nestedWordList.append(wordTokens)
        tokens.extend(wordTokens)
        filteredSentenceList.append(sentence_filt)

    #### COMPUTE POS AND NER TAGS FOR EACH LINE ####
    posTupleList = st.tag_sents(nestedWordList)
    NERTaggedList = NER.tag_sents(nestedWordList)
    combinedTagList = []
    for i, posTuples in enumerate(posTupleList):
        NERTuples = NERTaggedList[i]
        # posTuples and NERTuples are each a list of tags for the same
        # sentence.
        # These will be combined into one list of tags for that sentence.
        sentTagsList = []
        for j, posTuple in enumerate(posTuples):
            NERTuple = NERTuples[j]
            word = posTuple[0]

            # posTuple and NERTuple represent the same word and its
            # POS/NER tag.
            # These will now be combined to the form:
            # {word: {"POS": <postag>, "NER": <nertag>}}
            combinedTagObj = {}
            tags = {}
            tags["NER"] = NERTuple[1]
            tags["POS"] = posTuple[1]
            combinedTagObj[word] = tags
            sentTagsList.append(combinedTagObj)
        combinedTagList.append(sentTagsList)
        filteredWords.extend(sentTagsList)

    # write the combined tags to a file, can come in handy later.
    with open('./public/pythonscripts/combinedTags.txt', 'w') as fObj:
        fObj.write(str(combinedTagList))

    with open('./public/pythonscripts/filteredWords.txt', 'w') as fObj:
        fObj.write(str(filteredWords))

    #### COMPUTE FREQUENCIES AND INFORMATION CONTENTS FOR EACH WORD ####

    # remove all special characters from each word by replacing them with
    # spaces, then drop whatever follows the space, so that words like
    # "there's" become "there".
    punctuationLeftovers = ["s", "re", 'na', "m", "em", "d"]
    completeStopwords = stopwords.words("english") +\
                        punctuationLeftovers
    filtered_tokens = [
        w.lower() for w in tokens if w.lower() not in set(completeStopwords)
    ]
    frequencyDict = Counter(filtered_tokens)
    uniquetokens = list(set(filtered_tokens))
    icArray = []
    tagCountObj = {}
    outLiers = []
    maxInfoContent = 0
    for token in uniquetokens:
        tempNum = 0
        synsets = wn.synsets(token)
        if len(synsets) > 0:
            for synset in synsets:
                # take the first noun/verb sense; skip adjective ("a"/"s") and adverb ("r") senses
                if synset.pos() not in ("a", "s", "r"):
                    synsetItem = synset
                    tempNum = 1
                    break
            if tempNum == 1:
                infoContentValue = information_content(synsetItem,
                                                       ic_bnc_plus1)
                # NLTK returns ~1e300 (effectively infinite IC) for synsets with zero
                # corpus counts; treat those as outliers and cap them at the max IC later
                if infoContentValue >= 1e+300:
                    outLiers.append((token, infoContentValue))
                else:
                    icArray.append((token, infoContentValue))
                    if maxInfoContent < infoContentValue:
                        maxInfoContent = infoContentValue
            else:
                icArray.append((token, 0.0))
        ####
        POSList = []
        NERList = []
        allTags = {}
        for fw in filteredWords:
            # fw = {"word": {"POS": "xxx", "NER: "yyy"}}
            if token in fw:
                tokenPOS = fw[token]["POS"]
                tokenNER = fw[token]["NER"]
                POSList.append(tokenPOS)
                NERList.append(tokenNER)
        POSList = list(set(POSList))
        NERList = list(set(NERList))
        allTags = {}
        allTags["POSList"] = POSList
        allTags["NERList"] = NERList
        tagCountObj[token] = allTags

    for outLier, ic in outLiers:
        icArray.append((outLier, maxInfoContent))
    for word, ic in icArray:
        metric = {}
        metric["infoContent"] = ic
        metric["frequency"] = frequencyDict[word]
        metric["POSList"] = tagCountObj[word]["POSList"]
        metric["NERList"] = tagCountObj[word]["NERList"]
        ic_freq_obj[word] = metric

    # Finally, perform topic modeling if required, or just include a
    # pre-calculated list of topics (better for consistency in user
    # studies)
    loadTopics = 0  # change this to 1 if you want to read from file.
    if loadTopics == 0:
        topicsObj = genTopicModels(parsedTextArray, 3, 10)
    else:
        with open('./public/pythonscripts/topics.json') as tObj:
            topicsObj = json.load(tObj)
    nlpOutputObj = {}
    nlpOutputObj["metadata"] = ic_freq_obj
    nlpOutputObj["sentencetags"] = combinedTagList
    nlpOutputObj["topicmodels"] = topicsObj
    # nlpOutputStr = str(nlpOutputObj)
    nlpOutputStr = json.dumps(nlpOutputObj)
    # JSON with single quotes gets vomited on at the client end, so
    # let's change all of those.
    # nlpOutput = nlpOutputStr.replace("'", '"')
    with open('./public/pythonscripts/outfile.txt', 'w') as fObj:
        fObj.write(nlpOutputStr)
    return nlpOutputStr
Example #14
def neg_logP(c, ic_corpus):
    ic = float(information_content(c, ic_corpus))
    return ic
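The name neg_logP matches what NLTK computes: information_content already returns the negative log probability of the concept estimated from the IC corpus counts, i.e. IC(c) = -log P(c), so the wrapper only adds a float cast. A minimal usage sketch (the synset choice is illustrative):

from nltk.corpus import wordnet, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')
print(neg_logP(wordnet.synset('dog.n.01'), brown_ic))  # -log P(dog.n.01) under Brown counts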
Example #15
 def synset_ic(self, c):
     return information_content(c, self._ic_corpus)
 def ic_corpus_synset(self, syn):
     return information_content(syn, self._ic_corpus)
 def wordnet_lcs_ic(self, syn1, syn2):
     return information_content(
         syn1.lowest_common_hypernyms(syn2)[0], self._ic_corpus)
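These helpers mirror the pieces that NLTK's built-in IC measures are assembled from; a sketch of the correspondence, assuming self._ic_corpus is a loaded IC dictionary such as wordnet_ic.ic('ic-brown.dat') and using illustrative synsets:

from nltk.corpus import wordnet, wordnet_ic

ic_corpus = wordnet_ic.ic('ic-brown.dat')
s1, s2 = wordnet.synset('dog.n.01'), wordnet.synset('cat.n.01')

# res_similarity is the IC of the most informative common subsumer, which is roughly
# what wordnet_lcs_ic returns for the first lowest common hypernym.
print(s1.res_similarity(s2, ic_corpus))
print(s1.lin_similarity(s2, ic_corpus))  # 2*IC(lcs) / (IC(s1) + IC(s2))
print(s1.jcn_similarity(s2, ic_corpus))  # 1 / (IC(s1) + IC(s2) - 2*IC(lcs))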
Example #18
    def compress_isa_graph(self, verbose=True):
        """
        This function is used to compress the extracted graph from WordNet by removing some of the nodes.
        The compression strategy follows paper 'Nearly-Automated Metadata Hierarchy Creation'

        :param verbose: whether to show compression steps for debugging
        :return:
        """
        print("\n\nCompressing WordNet object hierarchy...")

        graph1 = copy.deepcopy(self.graph)

        # Rule 1 - Remove all nodes with low information content
        brown = wnic.ic('ic-brown.dat')
        for node in list(self.graph.nodes()):
            if self.graph.nodes[node]["type"] != "object_id" and self.graph.nodes[node]["type"] != "wordnet_synset":
                if rwn.information_content(wn.synset(node), brown) < 3.0:
                    self.graph.remove_node(node)
        if verbose:
            diff = set(graph1.nodes()) - set(self.graph.nodes())
            print("Nodes removed by compression rule 1: {}".format(list(diff)))

        # Rule 2 - Remove all nodes with only a single child except the root
        if verbose:
            graph2 = copy.deepcopy(self.graph)
        # starting from leaf nodes
        nodes_sort = [node for node in self.graph if len(
            list(self.graph.predecessors(node))) == 0]
        while len(nodes_sort) > 0:
            node = nodes_sort.pop(0)
            if node not in self.graph:
                continue

            parents = list(self.graph.successors(node))
            children = list(self.graph.predecessors(node))
            for parent in parents:
                nodes_sort.append(parent)

            if len(children) == 1 and len(
                    parents) != 0 and self.graph.nodes[node]["type"] != "object_id" and self.graph.nodes[node]["type"] != "wordnet_synset":
                self.graph.remove_node(node)
                for parent in parents:
                    for child in children:
                        self.graph.add_edge(child, parent, relation='IsA')
        if verbose:
            diff = set(graph2.nodes()) - set(self.graph.nodes())
            print("Nodes removed by compression rule 2: {}".format(list(diff)))

        # Rule 3 - Remove all nodes whose name contains the name of the parent
        # (except seed)
        if verbose:
            graph3 = copy.deepcopy(self.graph)
        for node in list(self.graph.nodes()):
            if len(list(self.graph.predecessors(node))) == 0:
                continue
            if self.graph.nodes[node]["type"] == "object_id" or self.graph.nodes[node]["type"] == "wordnet_synset":
                continue
            parents = list(self.graph.successors(node))
            children = list(self.graph.predecessors(node))
            should_remove = len(parents) > 0
            for parent in parents:
                pname = parent.split('.')[0]
                cname = node.split('.')[0]
                if pname not in cname:
                    should_remove = False
                    break
            if should_remove:
                self.graph.remove_node(node)
                for child in children:
                    for parent in parents:
                        self.graph.add_edge(child, parent, relation='IsA')
        if verbose:
            diff = set(graph3.nodes()) - set(self.graph.nodes())
            print("Nodes removed by compression rule 3: {}".format(list(diff)))

        # sanity check: make sure no initial object or synset nodes were removed
        for n in list(graph1.nodes()):
            if graph1.nodes[n]["type"] == "wordnet_synset" or graph1.nodes[n]["type"] == "object_id":
                assert n in self.graph.nodes

        # add a common parent to combine the isolated graphs created by
        # compression
        root_nodes = [
            (node,
             "entity.n.01") for node in self.graph if len(
                list(
                    self.graph.successors(node))) == 0]
        self.graph.add_node(
            "entity.n.01",
            color="orange",
            type="extracted_wordnet_synset")
        self.graph.add_edges_from(root_nodes, relation="IsA")
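A note on the edge convention the compression rules above rely on: IsA edges point from child to parent, so graph.predecessors(node) yields children and graph.successors(node) yields parents. A minimal illustration with synthetic nodes (the node names are illustrative only):

import networkx as nx

g = nx.DiGraph()
g.add_edge("dog.n.01", "canine.n.02", relation="IsA")
g.add_edge("canine.n.02", "carnivore.n.01", relation="IsA")
print(list(g.predecessors("canine.n.02")))  # ['dog.n.01']        -> children
print(list(g.successors("canine.n.02")))    # ['carnivore.n.01']  -> parents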
Example #19
def neg_logP(c, ic_corpus):
    ic = float(information_content(c, ic_corpus))
    return ic
def getMetadata(textData):
    ic_freq_obj = {}
    textArray = json.dumps(textData).split("\\n")
    parsedTextArray = [x.split(';') for x in textArray]
    sentenceList = [x[3] for x in parsedTextArray if len(x) == 4]
    filteredSentenceList = []
    filteredWords = []
    nestedWordList = []
    tokens = []
    specialString = "!@#$%^&*()[]{};:,./<>?\\|`~=_+-"
    for sentence in sentenceList:
        sentLower = sentence.lower()
        sentence_filt = sentLower.translate \
                         ({ord(c): " "+c+" " for c in specialString})
        # wordTokens = nltk.word_tokenize(sentence_filt)
        wordTokens = sentence_filt.split()
        nestedWordList.append(wordTokens)
        tokens.extend(wordTokens)
        filteredSentenceList.append(sentence_filt)

    #### COMPUTE POS AND NER TAGS FOR EACH LINE ####
    posTupleList = st.tag_sents(nestedWordList)
    NERTaggedList = NER.tag_sents(nestedWordList)
    combinedTagList = []
    for i, posTuples in enumerate(posTupleList):
        NERTuples = NERTaggedList[i]
        # posTuples and NERTuples are each a list of tags for the same
        # sentence.
        # These will be combined into one list of tags for that sentence.
        sentTagsList = []
        for j, posTuple in enumerate(posTuples):
            NERTuple = NERTuples[j]
            word = posTuple[0]
            
            # posTuple and NERTuple represent the same word and its
            # POS/NER tag.
            # These will now be combined to the form:
            # {word: {"POS": <postag>, "NER": <nertag>}}
            combinedTagObj = {}
            tags = {}
            tags["NER"] = NERTuple[1]
            tags["POS"] = posTuple[1]
            combinedTagObj[word] = tags
            sentTagsList.append(combinedTagObj)
        combinedTagList.append(sentTagsList)
        filteredWords.extend(sentTagsList)

    # write the combined tags to a file, can come in handy later.
    with open('./public/pythonscripts/combinedTags.txt', 'w') as fObj:
        fObj.write(str(combinedTagList))

    with open('./public/pythonscripts/filteredWords.txt', 'w') as fObj:
        fObj.write(str(filteredWords))

    #### COMPUTE FREQUENCIES AND INFORMATION CONTENTS FOR EACH WORD ####

    # remove all special characters from each word by replacing them with
    # spaces, then drop whatever follows the space, so that words like
    # "there's" become "there".
    punctuationLeftovers = ["s", "re", 'na', "m", "em", "d"]
    completeStopwords = stopwords.words("english") +\
                        punctuationLeftovers
    filtered_tokens = [w.lower() for w in tokens
                       if w.lower() not in set(completeStopwords)]
    frequencyDict = Counter(filtered_tokens)
    uniquetokens = list(set(filtered_tokens))
    icArray = []
    tagCountObj = {}
    outLiers = []
    maxInfoContent = 0
    for token in uniquetokens:
        tempNum = 0
        synsets = wn.synsets(token)
        if len(synsets) > 0:
            for synset in synsets:
                # take the first noun/verb sense; skip adjective ("a"/"s") and adverb ("r") senses
                if synset.pos() not in ("a", "s", "r"):
                    synsetItem = synset
                    tempNum = 1
                    break
            if tempNum == 1:
                infoContentValue = information_content(synsetItem,
                                                       ic_bnc_plus1)
                # NLTK returns ~1e300 (effectively infinite IC) for synsets with zero
                # corpus counts; treat those as outliers and cap them at the max IC later
                if infoContentValue >= 1e+300:
                    outLiers.append((token, infoContentValue))
                else:
                    icArray.append((token, infoContentValue))
                    if maxInfoContent < infoContentValue:
                        maxInfoContent = infoContentValue
            else:
                icArray.append((token, 0.0))
        ####
        POSList = []
        NERList = []
        allTags = {}
        for fw in filteredWords:
            # fw = {"word": {"POS": "xxx", "NER: "yyy"}}
            if token in fw:
                tokenPOS = fw[token]["POS"]
                tokenNER = fw[token]["NER"]
                POSList.append(tokenPOS)
                NERList.append(tokenNER)
        POSList = list(set(POSList))
        NERList = list(set(NERList))
        allTags = {}
        allTags["POSList"] = POSList
        allTags["NERList"] = NERList
        tagCountObj[token] = allTags

    for outLier, ic in outLiers:
        icArray.append((outLier, maxInfoContent))
    for word, ic in icArray:
        metric = {}
        metric["infoContent"] = ic
        metric["frequency"] = frequencyDict[word]
        metric["POSList"] = tagCountObj[word]["POSList"]
        metric["NERList"] = tagCountObj[word]["NERList"]
        ic_freq_obj[word] = metric

    # Finally, perform topic modeling if required, or just include a
    # pre-calculated list of topics (better for consistency in user
    # studies)
    loadTopics = 1  # change this to 0 to regenerate the topic models instead of reading from file.
    if loadTopics == 0:
        topicsObj = genTopicModels(parsedTextArray, 3, 10)
    else:
        with open('./public/pythonscripts/topics.json') as tObj:
            topicsObj = json.load(tObj)
    nlpOutputObj = {}
    nlpOutputObj["metadata"] = ic_freq_obj
    nlpOutputObj["sentencetags"] = combinedTagList
    nlpOutputObj["topicmodels"] = topicsObj
    # nlpOutputStr = str(nlpOutputObj)
    nlpOutputStr = json.dumps(nlpOutputObj)
    # JSON with single quotes gets vomited on at the client end, so
    # let's change all of those.
    # nlpOutput = nlpOutputStr.replace("'", '"')
    with open('./public/pythonscripts/outfile.txt', 'w') as fObj:
        fObj.write(nlpOutputStr)
    return nlpOutputStr
Example #21
 def synset_ic(self, c):
     return information_content(c, self._ic_corpus)
Example #22
for a in preA:
    row = []
    for b in preB:
        wdsim = wns.word_similarity(a, b, 'wup')
        row.append(wdsim)
    data.append(row)
data = numpy.matrix(data)
#max values in rows
Amax = data.max(1)
icA = []
for i in range(len(preA)):
    try:
        if lesk(preA, preA[i]) is None:
            # preA[i] is not in WordNet
            icA.append(1)
        elif information_content(lesk(preA, preA[i]), brown_ic) >= 1e+300:
            # NLTK returns ~1e300 (effectively infinite IC) for zero-count synsets
            icA.append(1)
        else:
            icA.append(information_content(lesk(preA, preA[i]), brown_ic))
    except Exception:
        icA.append(1)
icA = numpy.matrix(icA)
#max values in columns
Bmax = data.max(0)
icB = []
for i in range(len(preB)):
    try:
        if lesk(preB, preB[i]) is None:
            # preB[i] is not in WordNet
            icB.append(1)
        elif information_content(lesk(preB, preB[i]), brown_ic) >= 1e+300: