def resnik(word1, word2, ic_data):
    """
    Calculate Resnik similarity between word1 and word2 using information-content data.
    Resnik similarity = max of [ic(s) for s in subsumers(word1, word2)]
    :param word1: first word
    :param word2: second word
    :param ic_data: an NLTK information-content dictionary
    :return: (similarity score, best-matching sense of word1)
    """
    resnik_sim = 0
    best_sense = ''
    for syn_sense_probe in wordnet.synsets(word1):
        for syn_sense_noun in wordnet.synsets(word2):
            syn_word1 = wordnet.synset(syn_sense_probe.name())  # get synset for word1
            syn_word2 = wordnet.synset(syn_sense_noun.name())  # get synset for word2
            subsumers = syn_word1.common_hypernyms(syn_word2)  # get all subsumers
            ic = 0
            if len(subsumers) > 0:  # if there exist subsumers for probe and noun
                for s in subsumers:
                    ic = max(ic, information_content(s, ic_data))
            if ic > resnik_sim:  # if the current information content is better
                resnik_sim = ic
                best_sense = syn_word1
    return resnik_sim, best_sense
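A minimal usage sketch for the function above, assuming the NLTK imports it relies on (`wordnet`, `information_content`) are in the same module; the words and the IC file below are placeholders, not part of the original code.

# Hypothetical usage of resnik(); requires the NLTK wordnet and wordnet_ic data packages.
from nltk.corpus import wordnet
from nltk.corpus import wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')  # Brown-corpus information-content counts
score, sense = resnik('dog', 'cat', brown_ic)
print(score, sense)  # a float IC value and the best-matching synset of 'dog'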
def generate_cv_entry(synset, fout):
    # ml = monosemous_lemmas(synset)
    ml = []
    # if synset.name() == 'living_thing.n.01':
    #     pdb.set_trace()
    intersectable_lemmas = find_intersectable_child_lemmas(synset)
    synset_ic = information_content(synset, ic)
    hypernym_names = []
    for hypernym in synset.hypernyms() + synset.instance_hypernyms():
        hyper_ic = information_content(hypernym, ic)
        if synset_ic == 0 or hyper_ic == 0:
            if hypernym.name() != 'entity.n.01':  # name() must be called; comparing the method itself is always True
                shared_ic_ratio = 1.0
            else:
                shared_ic_ratio = 0.0
        else:
            shared_ic_ratio = hyper_ic / synset_ic
        hypernym_names.append((hypernym.name(), shared_ic_ratio))
    fout.write('! %s\n' % synset.name())
    fout.write('WV ')
    for m in ml:
        fout.write('%s ' % m)
    fout.write('\nWVI ')
    for w1, w2 in intersectable_lemmas:
        fout.write('%s %s ; ' % (w1, w2))
    fout.write('\nP ')
    for p, weight in hypernym_names:
        fout.write('%s %f ; ' % (p, weight))
    fout.write('\n\n')
def ic1_ic2_lcs_ic(synset1, synset2, ic):
    ic1 = nltk_reader.information_content(synset1, ic)
    ic2 = nltk_reader.information_content(synset2, ic)
    # get the common ancestor (subsumer) nodes
    subsumers = synset1.common_hypernyms(synset2)
    if len(subsumers) == 0:
        subsumer_ic = 0
    else:
        subsumer_ic = max(nltk_reader.information_content(s, ic) for s in subsumers)
    return ic1, ic2, subsumer_ic
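The three IC values returned above are exactly what Lin-style similarity needs. A minimal sketch, assuming `ic1_ic2_lcs_ic` and an NLTK IC dictionary are in scope; `lin_similarity` is a hypothetical helper, not part of the original code.

def lin_similarity(synset1, synset2, ic):
    # Lin (1998): sim = 2 * IC(lcs) / (IC(c1) + IC(c2))
    ic1, ic2, lcs_ic = ic1_ic2_lcs_ic(synset1, synset2, ic)
    if ic1 + ic2 == 0:
        return 0.0
    return 2.0 * lcs_ic / (ic1 + ic2)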
def getInfoContent(wordList):
    tokens = []
    for word in wordList:
        tokens += nltk.word_tokenize(word)
    filtered_tokens = [w for w in tokens if w not in stopwords.words('english')]
    uniquetokens = list(set(filtered_tokens))
    icArray = []
    for token in uniquetokens:
        tempNum = 0
        synsets = wn.synsets(token)
        if len(synsets) > 0:
            for synset in synsets:
                # skip adjective ('a'), satellite-adjective ('s') and adverb ('r') senses
                if not set([synset.pos()]).intersection(set(['a', 's', 'r'])):
                    synsetItem = synset
                    tempNum = 1
                    break
            if tempNum == 1:
                infoContent = information_content(synsetItem, brown_ic)
                icArray.append([token, infoContent])
            else:
                icArray.append([token, 0.0])
        else:
            icArray.append([token, 0.0])
    return icArray
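A usage sketch for the function above; it reads `brown_ic`, `wn`, `stopwords`, and `information_content` as module-level names, so the example defines them. The input phrases are placeholders.

# Hypothetical usage of getInfoContent(); requires the punkt, stopwords, wordnet and wordnet_ic data packages.
import nltk
from nltk.corpus import stopwords, wordnet as wn, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')  # the IC dictionary the function reads as a global
for token, ic_value in getInfoContent(["the quick brown fox", "jumps over the lazy dog"]):
    print(token, ic_value)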
def resnik_similarity(word1, word2, ic):
    # initializing
    _ic = wordnet_ic.ic(ic)
    max_subsumer_ic = 0
    most_similar_syn_w1 = wn.synsets(word1)[0]
    most_similar_syn_w2 = None  # initialized so the return never references an unbound name
    # if the word isn't in WordNet, ignore it
    if len(wn.synsets(word2)) == 0:
        most_similar_syn_w2 = 0
    # double for loop to pick the most similar sense of each of the two words
    for w1_synset in wn.synsets(word1):
        for w2_synset in wn.synsets(word2):
            subsumers = w1_synset.common_hypernyms(w2_synset)
            if len(subsumers) == 0:
                subsumer_ic = 0
            else:
                # get the information content value for the most specific/informative subsumer
                subsumer_ic = max(information_content(sub, _ic) for sub in subsumers)
            # make sure we keep the most similar sense pair
            if max_subsumer_ic < subsumer_ic:
                max_subsumer_ic = subsumer_ic
                most_similar_syn_w1 = w1_synset
                most_similar_syn_w2 = w2_synset
    return max_subsumer_ic, most_similar_syn_w1, most_similar_syn_w2
def ic_corpus(self, word):
    c = self.word2synset(word)
    if len(c) == 0 and " " in word:
        # fall back to the head token of a multiword expression
        return self.ic_corpus(word.split(" ")[0])
    elif len(c) == 0:
        return 0.0
    # average the information content over all senses of the word
    return sum([information_content(ci, self._ic_corpus) for ci in c]) / len(c)
def ic(self, x):
    if not self._ic:
        wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'), r'.*\.dat')
        self._ic = wnic.ic('ic-bnc-resnik-add1.dat')
    val = information_content(x, self._ic)
    return val
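For reference, the same IC file can usually be loaded through NLTK's bundled `wordnet_ic` corpus loader instead of constructing a `WordNetICCorpusReader` by hand. A minimal sketch, assuming the `wordnet_ic` data package is installed:

from nltk.corpus import wordnet_ic
bnc_ic = wordnet_ic.ic('ic-bnc-resnik-add1.dat')  # same reader, located via nltk.data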
def resnik_similarity(word, context, wnic):
    probe_senses = wordnet.synsets(word)
    context_senses = wordnet.synsets(context)
    if not context_senses:
        return None
    top_subs = set()
    for sense in probe_senses:
        for context_sense in context_senses:  # renamed so the loop variable no longer shadows the parameter
            common_hypernyms = sense.common_hypernyms(context_sense)
            if common_hypernyms:
                mark1 = max(common_hypernyms, key=lambda x: information_content(x, wnic))
                mark2 = information_content(mark1, wnic)
                top_subs.add((mark1, mark2))
    if not top_subs:  # no sense pair shares a hypernym
        return None
    mis = max(top_subs, key=lambda x: x[1])
    return mis
def ResnikSimilarity(probe_word, context_word, ic):
    probe_senses = wn.synsets(probe_word)
    context_senses = wn.synsets(context_word)
    if not context_senses:
        return None
    most_informative_subsumers = set()
    for p in probe_senses:
        for c in context_senses:
            common_hypernyms = p.common_hypernyms(c)
            if common_hypernyms:
                temp = max(common_hypernyms, key=lambda x: information_content(x, ic))
                score = information_content(temp, ic)
                most_informative_subsumers.add((temp, score))
    if not most_informative_subsumers:  # no sense pair shares a hypernym
        return None
    mis = max(most_informative_subsumers, key=lambda x: x[1])
    return mis
def sim(self, c1, c2):
    """Find the most informative subsumer of these two concepts."""
    hypernyms = c1.common_hypernyms(c2)
    most_information = None
    most_informative_hypernym = None
    information = []
    for hypernym in hypernyms:
        hypernym_ic = information_content(hypernym, self.brown_ic)
        information.append(hypernym_ic)
        if most_information is None or hypernym_ic > most_information:
            most_information = hypernym_ic
            most_informative_hypernym = hypernym
    return (most_information, most_informative_hypernym)
def resnik(word1, word2, ic):
    word1 = wordnet.synsets(word1)
    word2 = wordnet.synsets(word2)
    sim = {}
    if word1 == [] or word2 == []:  # also guard word1, otherwise indexing an empty result below fails
        return (('NOSENSE', 'NOSENSE'), 0)
    for syn1 in word1:
        for syn2 in word2:
            common = syn1.common_hypernyms(syn2)
            if len(common) > 0:
                sim[(syn1, syn2)] = max(information_content(s, ic) for s in common)
            else:
                sim[(syn1, syn2)] = 0
    return sorted(sim.items(), key=lambda x: -x[1])[0]
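A usage sketch showing the return shape of the dictionary-based resnik() above (a ((sense1, sense2), score) pair); the words and IC file are placeholders.

from nltk.corpus import wordnet, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')
(best_syn1, best_syn2), score = resnik('car', 'bicycle', brown_ic)
print(best_syn1, best_syn2, score)  # best sense pair and the IC of their most informative subsumer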
def getMetadata(textData):
    ic_freq_obj = {}
    textArray = json.dumps(textData).split("\\n")
    parsedTextArray = [x.split(';') for x in textArray]
    sentenceList = [x[3] for x in parsedTextArray if len(x) == 4]
    filteredSentenceList = []
    filteredWords = []
    nestedWordList = []
    tokens = []
    specialString = "!@#$%^&*()[]{};:,./<>?\\|`~=_+-"
    for sentence in sentenceList:
        sentLower = sentence.lower()
        sentence_filt = sentLower.translate(
            {ord(c): " " + c + " " for c in specialString})
        # wordTokens = nltk.word_tokenize(sentence_filt)
        wordTokens = sentence_filt.split()
        nestedWordList.append(wordTokens)
        tokens.extend(wordTokens)
        filteredSentenceList.append(sentence_filt)

    #### COMPUTE POS AND NER TAGS FOR EACH LINE ####
    posTupleList = st.tag_sents(nestedWordList)
    NERTaggedList = NER.tag_sents(nestedWordList)
    combinedTagList = []
    for i, posTuples in enumerate(posTupleList):
        NERTuples = NERTaggedList[i]
        # posTuples and NERTuples are each a list of tags for the same sentence.
        # These will be combined into one list of tags for that sentence.
        sentTagsList = []
        for j, posTuple in enumerate(posTuples):
            NERTuple = NERTuples[j]
            word = posTuple[0]
            # posTuple and NERTuple represent the same word and its POS/NER tag.
            # These will now be combined to the form:
            # {word: {"POS": <postag>, "NER": <nertag>}}
            combinedTagObj = {}
            tags = {}
            tags["NER"] = NERTuple[1]
            tags["POS"] = posTuple[1]
            combinedTagObj[word] = tags
            sentTagsList.append(combinedTagObj)
        combinedTagList.append(sentTagsList)
        filteredWords.extend(sentTagsList)

    # write the combined tags to a file, can come in handy later.
    with open('./public/pythonscripts/combinedTags.txt', 'w') as fObj:
        fObj.write(str(combinedTagList))
    with open('./public/pythonscripts/filteredWords.txt', 'w') as fObj:
        fObj.write(str(filteredWords))

    #### COMPUTE FREQUENCIES AND INFORMATION CONTENTS FOR EACH WORD ####
    # remove all special characters for each word, replace them with spaces.
    # Then get rid of whatever follows the space. So words like "there's"
    # become "there", and words like
    punctuationLeftovers = ["s", "re", 'na', "m", "em", "d"]
    completeStopwords = stopwords.words("english") + punctuationLeftovers
    filtered_tokens = [
        w.lower() for w in tokens
        if not w.lower() in set(completeStopwords)
    ]
    frequencyDict = Counter(filtered_tokens)
    uniquetokens = list(set(filtered_tokens))
    icArray = []
    tagCountObj = {}
    outLiers = []
    maxInfoContent = 0
    for token in uniquetokens:
        tempNum = 0
        synsets = wn.synsets(token)
        if len(synsets) > 0:
            for synset in synsets:
                if not set([synset.pos()]).intersection(set(["a", "s", "r"])):
                    synsetItem = synset
                    tempNum = 1
                    break
            if tempNum == 1:
                infoContentValue = information_content(synsetItem, ic_bnc_plus1)
                if infoContentValue >= 1e+300:
                    outLiers.append((token, infoContentValue))
                else:
                    icArray.append((token, infoContentValue))
                    if maxInfoContent < infoContentValue:
                        maxInfoContent = infoContentValue
            else:
                icArray.append((token, 0.0))
        else:
            icArray.append((token, 0.0))

        ####
        POSList = []
        NERList = []
        allTags = {}
        for fw in filteredWords:
            # fw = {"word": {"POS": "xxx", "NER": "yyy"}}
            if token in fw:
                tokenPOS = fw[token]["POS"]
                tokenNER = fw[token]["NER"]
                POSList.append(tokenPOS)
                NERList.append(tokenNER)
        POSList = list(set(POSList))
        NERList = list(set(NERList))
        allTags = {}
        allTags["POSList"] = POSList
        allTags["NERList"] = NERList
        tagCountObj[token] = allTags

    for outLier, ic in outLiers:
        icArray.append((outLier, maxInfoContent))
    for word, ic in icArray:
        metric = {}
        metric["infoContent"] = ic
        metric["frequency"] = frequencyDict[word]
        metric["POSList"] = tagCountObj[word]["POSList"]
        metric["NERList"] = tagCountObj[word]["NERList"]
        ic_freq_obj[word] = metric

    # Finally, perform topic modeling if required, or just include a
    # pre-calculated list of topics (better for consistency in user studies)
    loadTopics = 0  # change this to 1 if you want to read from file.
    if loadTopics == 0:
        topicsObj = genTopicModels(parsedTextArray, 3, 10)
    else:
        with open('./public/pythonscripts/topics.json') as tObj:
            topicsObj = json.load(tObj)

    nlpOutputObj = {}
    nlpOutputObj["metadata"] = ic_freq_obj
    nlpOutputObj["sentencetags"] = combinedTagList
    nlpOutputObj["topicmodels"] = topicsObj
    # nlpOutputStr = str(nlpOutputObj)
    nlpOutputStr = json.dumps(nlpOutputObj)
    # JSON with single quotes gets vomited on at the client end, so
    # let's change all of those.
    # nlpOutput = nlpOutputStr.replace("'", '"')
    with open('./public/pythonscripts/outfile.txt', 'w') as fObj:
        fObj.write(nlpOutputStr)
    return nlpOutputStr
def neg_logP(c, ic_corpus):
    ic = float(information_content(c, ic_corpus))
    return ic
def synset_ic(self, c):
    return information_content(c, self._ic_corpus)
def ic_corpus_synset(self, syn):
    return information_content(syn, self._ic_corpus)
def wordnet_lcs_ic(self, syn1, syn2):
    return information_content(
        syn1.lowest_common_hypernyms(syn2)[0], self._ic_corpus)
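The method above assumes the two synsets share at least one lowest common hypernym; `lowest_common_hypernyms` can return an empty list (for example across parts of speech), in which case the `[0]` index raises IndexError. A small defensive variant, as a sketch only; the guard and the 0.0 fallback are assumptions, not part of the original class.

def wordnet_lcs_ic_safe(self, syn1, syn2):
    # Same idea as wordnet_lcs_ic, but returns 0.0 when no common hypernym exists.
    lcs = syn1.lowest_common_hypernyms(syn2)
    if not lcs:
        return 0.0
    return information_content(lcs[0], self._ic_corpus)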
def compress_isa_graph(self, verbose=True):
    """
    Compress the graph extracted from WordNet by removing some of the nodes.
    The compression strategy follows the paper 'Nearly-Automated Metadata Hierarchy Creation'.
    :param verbose: whether to show compression steps for debugging
    :return:
    """
    print("\n\nCompressing WordNet object hierarchy...")
    graph1 = copy.deepcopy(self.graph)

    # Rule 1 - Remove all nodes with low information content
    brown = wnic.ic('ic-brown.dat')
    for node in list(self.graph.nodes()):
        if self.graph.nodes[node]["type"] != "object_id" and self.graph.nodes[node]["type"] != "wordnet_synset":
            if rwn.information_content(wn.synset(node), brown) < 3.0:
                self.graph.remove_node(node)
    if verbose:
        diff = set(graph1.nodes()) - set(self.graph.nodes())
        print("Nodes removed by compression rule 1: {}".format(list(diff)))

    # Rule 2 - Remove all nodes with only a single child, except the root
    if verbose:
        graph2 = copy.deepcopy(self.graph)
    # starting from leaf nodes
    nodes_sort = [node for node in self.graph if len(list(self.graph.predecessors(node))) == 0]
    while len(nodes_sort) > 0:
        node = nodes_sort.pop(0)
        if node not in self.graph:
            continue
        parents = list(self.graph.successors(node))
        children = list(self.graph.predecessors(node))
        for parent in parents:
            nodes_sort.append(parent)
        if len(children) == 1 and len(parents) != 0 and \
                self.graph.nodes[node]["type"] != "object_id" and \
                self.graph.nodes[node]["type"] != "wordnet_synset":
            self.graph.remove_node(node)
            for parent in parents:
                for child in children:
                    self.graph.add_edge(child, parent, relation='IsA')
    if verbose:
        diff = set(graph2.nodes()) - set(self.graph.nodes())
        print("Nodes removed by compression rule 2: {}".format(list(diff)))

    # Rule 3 - Remove all nodes whose name contains the name of the parent (except seed)
    if verbose:
        graph3 = copy.deepcopy(self.graph)
    for node in list(self.graph.nodes()):
        if len(list(self.graph.predecessors(node))) == 0:
            continue
        if self.graph.nodes[node]["type"] == "object_id" or self.graph.nodes[node]["type"] == "wordnet_synset":
            continue
        parents = list(self.graph.successors(node))
        children = list(self.graph.predecessors(node))
        should_remove = True if len(parents) > 0 else False
        for parent in parents:
            pname = parent.split('.')[0]
            cname = node.split('.')[0]
            if pname not in cname:
                should_remove = False
                break
        if should_remove:
            self.graph.remove_node(node)
            for child in children:
                for parent in parents:
                    self.graph.add_edge(child, parent, relation='IsA')
    if verbose:
        diff = set(graph3.nodes()) - set(self.graph.nodes())
        print("Nodes removed by compression rule 3: {}".format(list(diff)))

    # sanity check: make sure no initial object nodes are removed
    current_seeds = []
    for n in list(graph1.nodes()):
        if graph1.nodes[n]["type"] == "wordnet_synset" or graph1.nodes[n]["type"] == "object_id":
            assert n in self.graph.nodes

    # add a common parent to combine the isolated graphs created by compression
    root_nodes = [(node, "entity.n.01") for node in self.graph
                  if len(list(self.graph.successors(node))) == 0]
    self.graph.add_node("entity.n.01", color="orange", type="extracted_wordnet_synset")
    self.graph.add_edges_from(root_nodes, relation="IsA")
def neg_logP(c, ic_corpus):
    ic = float(information_content(c, ic_corpus))
    return ic
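Both neg_logP helpers above simply wrap NLTK's information_content, which already returns the negative log probability of a synset, IC(c) = -log p(c), estimated from the chosen corpus counts. A minimal sketch of calling it directly; the synsets and IC file are placeholders.

from nltk.corpus import wordnet as wn, wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')
# A frequent, general synset has low IC; a more specific one has higher IC.
print(information_content(wn.synset('entity.n.01'), brown_ic))
print(information_content(wn.synset('dog.n.01'), brown_ic))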
def getMetadata(textData):
    ic_freq_obj = {}
    textArray = json.dumps(textData).split("\\n")
    parsedTextArray = [x.split(';') for x in textArray]
    sentenceList = [x[3] for x in parsedTextArray if len(x) == 4]
    filteredSentenceList = []
    filteredWords = []
    nestedWordList = []
    tokens = []
    specialString = "!@#$%^&*()[]{};:,./<>?\\|`~=_+-"
    for sentence in sentenceList:
        sentLower = sentence.lower()
        sentence_filt = sentLower.translate(
            {ord(c): " " + c + " " for c in specialString})
        # wordTokens = nltk.word_tokenize(sentence_filt)
        wordTokens = sentence_filt.split()
        nestedWordList.append(wordTokens)
        tokens.extend(wordTokens)
        filteredSentenceList.append(sentence_filt)

    #### COMPUTE POS AND NER TAGS FOR EACH LINE ####
    posTupleList = st.tag_sents(nestedWordList)
    NERTaggedList = NER.tag_sents(nestedWordList)
    combinedTagList = []
    for i, posTuples in enumerate(posTupleList):
        NERTuples = NERTaggedList[i]
        # posTuples and NERTuples are each a list of tags for the same sentence.
        # These will be combined into one list of tags for that sentence.
        sentTagsList = []
        for j, posTuple in enumerate(posTuples):
            NERTuple = NERTuples[j]
            word = posTuple[0]
            # posTuple and NERTuple represent the same word and its POS/NER tag.
            # These will now be combined to the form:
            # {word: {"POS": <postag>, "NER": <nertag>}}
            combinedTagObj = {}
            tags = {}
            tags["NER"] = NERTuple[1]
            tags["POS"] = posTuple[1]
            combinedTagObj[word] = tags
            sentTagsList.append(combinedTagObj)
        combinedTagList.append(sentTagsList)
        filteredWords.extend(sentTagsList)

    # write the combined tags to a file, can come in handy later.
    with open('./public/pythonscripts/combinedTags.txt', 'w') as fObj:
        fObj.write(str(combinedTagList))
    with open('./public/pythonscripts/filteredWords.txt', 'w') as fObj:
        fObj.write(str(filteredWords))

    #### COMPUTE FREQUENCIES AND INFORMATION CONTENTS FOR EACH WORD ####
    # remove all special characters for each word, replace them with spaces.
    # Then get rid of whatever follows the space. So words like "there's"
    # become "there", and words like
    punctuationLeftovers = ["s", "re", 'na', "m", "em", "d"]
    completeStopwords = stopwords.words("english") + punctuationLeftovers
    filtered_tokens = [
        w.lower() for w in tokens
        if not w.lower() in set(completeStopwords)
    ]
    frequencyDict = Counter(filtered_tokens)
    uniquetokens = list(set(filtered_tokens))
    icArray = []
    tagCountObj = {}
    outLiers = []
    maxInfoContent = 0
    for token in uniquetokens:
        tempNum = 0
        synsets = wn.synsets(token)
        if len(synsets) > 0:
            for synset in synsets:
                if not set([synset.pos()]).intersection(set(["a", "s", "r"])):
                    synsetItem = synset
                    tempNum = 1
                    break
            if tempNum == 1:
                infoContentValue = information_content(synsetItem, ic_bnc_plus1)
                if infoContentValue >= 1e+300:
                    outLiers.append((token, infoContentValue))
                else:
                    icArray.append((token, infoContentValue))
                    if maxInfoContent < infoContentValue:
                        maxInfoContent = infoContentValue
            else:
                icArray.append((token, 0.0))
        else:
            icArray.append((token, 0.0))

        ####
        POSList = []
        NERList = []
        allTags = {}
        for fw in filteredWords:
            # fw = {"word": {"POS": "xxx", "NER": "yyy"}}
            if token in fw:
                tokenPOS = fw[token]["POS"]
                tokenNER = fw[token]["NER"]
                POSList.append(tokenPOS)
                NERList.append(tokenNER)
        POSList = list(set(POSList))
        NERList = list(set(NERList))
        allTags = {}
        allTags["POSList"] = POSList
        allTags["NERList"] = NERList
        tagCountObj[token] = allTags

    for outLier, ic in outLiers:
        icArray.append((outLier, maxInfoContent))
    for word, ic in icArray:
        metric = {}
        metric["infoContent"] = ic
        metric["frequency"] = frequencyDict[word]
        metric["POSList"] = tagCountObj[word]["POSList"]
        metric["NERList"] = tagCountObj[word]["NERList"]
        ic_freq_obj[word] = metric

    # Finally, perform topic modeling if required, or just include a
    # pre-calculated list of topics (better for consistency in user studies)
    loadTopics = 1  # 1: read topics from file; 0: regenerate with genTopicModels
    if loadTopics == 0:
        topicsObj = genTopicModels(parsedTextArray, 3, 10)
    else:
        with open('./public/pythonscripts/topics.json') as tObj:
            topicsObj = json.load(tObj)

    nlpOutputObj = {}
    nlpOutputObj["metadata"] = ic_freq_obj
    nlpOutputObj["sentencetags"] = combinedTagList
    nlpOutputObj["topicmodels"] = topicsObj
    # nlpOutputStr = str(nlpOutputObj)
    nlpOutputStr = json.dumps(nlpOutputObj)
    # JSON with single quotes gets vomited on at the client end, so
    # let's change all of those.
    # nlpOutput = nlpOutputStr.replace("'", '"')
    with open('./public/pythonscripts/outfile.txt', 'w') as fObj:
        fObj.write(nlpOutputStr)
    return nlpOutputStr
for a in preA:
    row = []
    for b in preB:
        wdsim = wns.word_similarity(a, b, 'wup')
        row.append(wdsim)
    data.append(row)
data = numpy.matrix(data)
# max values in rows
Amax = data.max(1)
icA = []
for i in range(len(preA)):
    try:
        if lesk(preA, preA[i]) is None:
            # preA[i] is not in WordNet
            icA.append(1)
        elif information_content(lesk(preA, preA[i]), brown_ic) == float('inf'):
            # information_content returns a float, so compare against float('inf'), not the string 'inf'
            icA.append(1)
        else:
            icA.append(information_content(lesk(preA, preA[i]), brown_ic))
    except Exception:
        icA.append(1)
icA = numpy.matrix(icA)
# max values in columns
Bmax = data.max(0)
icB = []
for i in range(len(preB)):
    try:
        if lesk(preB, preB[i]) is None:
            # preB[i] is not in WordNet
            icB.append(1)
        elif information_content(lesk(preB, preB[i]), brown_ic) == float('inf'):