def test_word_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using the Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using the Li method
    assert wns.word_similarity('dog', 'cat', 'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using the Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using the Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using the Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using the Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using the WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None  # 0.593666388463
def map_subjects(subjects: list, filter_dis=0.2):
    # Map the subjects and keep the (i, j) pairs whose similarity exceeds filter_dis,
    # e.g. subjects = [['中国人', '安乐死'], ['太阳', '很好']]
    wns = WordNetSimilarity()
    # enumerate pairings and compute similarities
    pair = []
    # the index pairings to return
    pair_idxs = []
    for index, value in enumerate(subjects):
        i = index + 1
        while i < len(subjects):
            # compare the current list with the next list
            com_value = subjects[i]
            for v in value:
                for cv in com_value:
                    pair_distance = wns.monol_word_similarity(v, cv, 'cmn', 'wup')
                    # print(f'{v} -> {cv}: {pair_distance}')
                    if pair_distance > filter_dis:
                        pair.append(pair_distance)
                        # pairing index: (row, column)
                        pair_idxs.append(([index, value.index(v)], [i, com_value.index(cv)]))
            i += 1
    return pair_idxs
def controlledSetWordNetSimilarity(self, word, similarWords):
    wns = WordNetSimilarity()
    for similarWord in similarWords.copy():
        # threshold controls the accuracy of the controlled set
        if wns.word_similarity(word, similarWord, 'li') < 0.9996:
            similarWords.discard(similarWord)
    return similarWords
def __init__(self, wsd_method='maxsim', sim_name='wpath'):
    '''
    wsd_methods = ['random_sense', 'first', 'frequent', 'maxsim', 'graph', 'lesk', 'naive']
    sim_name = ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
    '''
    self._method = wsd_method
    self._sim_name = sim_name
    self._wn_sim = WordNetSimilarity()
def test_language():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    # check the supported languages
    assert wns.languages() is not None
    # find the language code
    assert wns.languages('English') is not None
    assert wns.languages('chinese_simplified') is not None
    assert wns.languages('spanish') is not None
def test_classification_evaluation():
    from sematch.evaluation import AspectEvaluation
    from sematch.application import SimClassifier, SimSVMClassifier
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = AspectEvaluation()
    X, y = evaluation.load_dataset()
    wns = WordNetSimilarity()
    word_sim = lambda x, y: wns.word_similarity(x, y)
    simclassifier = SimClassifier.train(zip(X, y), word_sim)
    evaluation.evaluate(X, y, simclassifier)
    simSVMclassifier = SimSVMClassifier.train(X, y, word_sim)
    evaluation.evaluate(X, y, simSVMclassifier)
def __init__(self):
    self.out = {}
    self.keras = keras_similar()
    self.classifier = Qclassifier()
    self.spell = Spelling()
    self.wn = WordNetSimilarity()
    self.en_nlp = spacy.load("en_core_web_md")
    self.stopwords_en = []
    with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                           'utils', 'stopwords_en.txt')) as f:
        self.stopwords_en = f.read().splitlines()
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    wordsim_eval = WordSimEvaluation()
    wns = WordNetSimilarity()
    # define similarity metrics
    lin = lambda x, y: wns.word_similarity(x, y, 'lin')
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    assert wordsim_eval.evaluate_multiple_metrics({'lin': lin, 'wpath': wpath}, 'noun_simlex') is not None
    # perform Steiger's Z significance test
    assert wordsim_eval.statistical_test('wpath', 'lin', 'noun_simlex') is not None
def semantic_matching(trend_one, trend_two):
    threshold = 0.3
    trend_one_processed = text_processing(trend_one, keep_spaces=True)
    trend_two_processed = text_processing(trend_two, keep_spaces=True)
    # The options are WordNet, YAGO and DBpedia (only the first seems usable)
    wns = WordNetSimilarity()
    matches = list({
        x['original']
        for x in trend_one_processed
        for y in trend_two_processed
        if wns.word_similarity(x['processed'], y['processed'], 'li') > threshold
    })
    if len(matches) == 0:
        return 'No matches'
    return matches
def test_wordsim_evaluation():
    from sematch.evaluation import WordSimEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    evaluation = WordSimEvaluation()
    print(evaluation.dataset_names())
    wns = WordNetSimilarity()
    # define similarity metrics
    wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.8)
    # evaluate similarity metrics
    print(evaluation.evaluate_metric('wpath', wpath, 'noun_simlex'))
    # perform Steiger's Z significance test
    print(evaluation.statistical_test('wpath', 'path', 'noun_simlex'))
    wpath_es = lambda x, y: wns.monol_word_similarity(x, y, 'spa', 'path')
    wpath_en_es = lambda x, y: wns.crossl_word_similarity(x, y, 'eng', 'spa', 'wpath')
    print(evaluation.evaluate_metric('wpath_es', wpath_es, 'rg65_spanish'))
    print(evaluation.evaluate_metric('wpath_en_es', wpath_en_es, 'rg65_EN-ES'))
def test_query_ned():
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    # print(wns.word_similarity('cooling', 'air_conditioner', 'li'))
    # similarity = lambda x, y: rel.text_similarity(x, y, model='lsa')
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    query = [q for q in query if extractor.context_features(q['query'])]
    print(len(query))
    import warnings
    warnings.filterwarnings("ignore")
    metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']
    for m in metrics:
        print(m)
        similarity = lambda x, y: wns.word_similarity(x, y, m)
        for k in range(1, 21):
            gold = []
            predict = []
            for q in query:
                gold.append(q['gold'])
                # e = ned.text_disambiguate(q['query'], q['candidate'], similarity)
                e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k)
                predict.append(e)
            from sklearn.metrics import precision_recall_fscore_support
            # from sklearn.metrics import classification_report
            # print(classification_report(gold, predict))
            print(precision_recall_fscore_support(gold, predict, average='weighted')[2])
def __init__(self, corpus, feature_num=10, model='onehot', wn_method='path',
             vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
    """
    :param corpus: use a corpus to train a vector representation
    :param feature_num: number of dimensions
    :param model: onehot or wordnet or word2vec or both
    """
    self._model = model
    self._wn_method = wn_method
    self._features = self.extract_features(corpus, feature_num)
    self._wns = WordNetSimilarity() if model == 'wordnet' or model == 'both' else None
    self._wvs = WordVecSimilarity(vec_file, binary) if model == 'word2vec' or model == 'both' else None
def test_simcat_classifier():
    from sematch.classification import SimCatClassifier
    from sematch.evaluation import ABSAEvaluation
    from sematch.semantic.similarity import WordNetSimilarity
    # defining similarity metrics
    wns = WordNetSimilarity()
    sim_metric_jcn = lambda x, y: wns.word_similarity(x, y, 'jcn')
    sim_metric_wpath = lambda x, y: wns.word_similarity_wpath(x, y, 0.9)
    # loading the dataset
    absa_eval = ABSAEvaluation()
    X_train_16, y_train_16 = absa_eval.load_dataset('eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml')
    X_test_16, y_test_16 = absa_eval.load_dataset('eval/aspect/ABSA16_Restaurants_Train_SB1_v2.xml')
    # train the classifiers
    sim_jcn_classifier = SimCatClassifier.train(zip(X_train_16, y_train_16), sim_metric_jcn)
    sim_wpath_classifier = SimCatClassifier.train(zip(X_train_16, y_train_16), sim_metric_wpath)
    # evaluate the classifiers
    # absa_eval.evaluate(X_test_16, y_test_16, sim_jcn_classifier)
    # absa_eval.evaluate(X_test_16, y_test_16, sim_wpath_classifier)
    assert sim_jcn_classifier is not None
    assert sim_wpath_classifier is not None
def test_wordnet_similarity():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    dog = wns.word2synset('dog')
    cat = wns.word2synset('cat')
    # Measuring semantic similarity between concepts using the Path method
    assert wns.similarity(dog[0], cat[0], 'path') is not None  # 0.2
    # Computing English word similarity using the Li method
    assert wns.word_similarity('dog', 'cat', 'li') is not None  # 0.449327301063
    # Computing Spanish word similarity using the Lin method
    assert wns.monol_word_similarity('perro', 'gato', 'spa', 'lin') is not None  # 0.876800984373
    # Computing Chinese word similarity using the Wu & Palmer method
    assert wns.monol_word_similarity('狗', '猫', 'cmn', 'wup') is not None  # 0.857142857143
    # Computing Spanish and English word similarity using the Resnik method
    assert wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res') is not None  # 7.91166650904
    # Computing Spanish and Chinese word similarity using the Jiang & Conrath method
    assert wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn') is not None  # 0.31023804699
    # Computing Chinese and English word similarity using the WPath method
    assert wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath') is not None  # 0.593666388463
def test_sim_graph():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    madrid = EntityFeatures().features('http://dbpedia.org/resource/Tom_Cruise')
    words = Extraction().extract_words_sent(madrid['abstract'])
    words = list(set(lemmatization(words)))
    wns = WordNetSimilarity()
    word_graph = SimGraph(words, wns.word_similarity)
    word_scores = word_graph.page_rank()
    words, scores = zip(*Counter(word_scores).most_common(10))
    assert words is not None
from sematch.semantic.similarity import WordNetSimilarity

wns = WordNetSimilarity()

# Computing English word similarity using the Li method
wns.word_similarity('dog', 'cat', 'li')  # 0.449327301063
# Computing Spanish word similarity using the Lin method
wns.monol_word_similarity('perro', 'gato', 'spa', 'lin')  # 0.876800984373
# Computing Chinese word similarity using the Wu & Palmer method
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')  # 0.857142857143
# Computing Spanish and English word similarity using the Resnik method
wns.crossl_word_similarity('perro', 'cat', 'spa', 'eng', 'res')  # 7.91166650904
# Computing Spanish and Chinese word similarity using the Jiang & Conrath method
wns.crossl_word_similarity('perro', '猫', 'spa', 'cmn', 'jcn')  # 0.31023804699
# Computing Chinese and English word similarity using the WPath method
wns.crossl_word_similarity('狗', 'cat', 'cmn', 'eng', 'wpath')  # 0.593666388463
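For a quick side-by-side view of the measures shown above, here is a minimal sketch (not part of the original snippets) that loops over the method names used throughout this page; note the scores are not all on the same scale, since Resnik returns an unnormalized information-content value.

from sematch.semantic.similarity import WordNetSimilarity

wns = WordNetSimilarity()
# Method names as used in the examples above; 'res' is information content
# rather than a score in [0, 1].
for method in ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']:
    print(method, wns.word_similarity('dog', 'cat', method))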
def test_synset_expand():
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()
    cat = wns.word2synset('cat')[0]
    assert wns.synset_expand(cat) is not None
def yhmh_nlp(url, trigger_words):
    text, triggers = parse_my_url(url, trigger_words)
    print("triggers2: %s" % (triggers))
    if text == "" or len(triggers) == 0:
        return ""
    client = language.LanguageServiceClient()
    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')
    # Instantiates a plain text document.
    document = types.Document(content=text, type=enums.Document.Type.PLAIN_TEXT)
    # Detects entities in the document. You can also analyze HTML with:
    # document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities
    verbose = True
    counter = 0
    counter2 = 0
    text_output_array = pd.DataFrame(np.zeros((len(entities), 3)))
    for entity in entities:
        entity_type = enums.Entity.Type(entity.type)
        if len(entity.name) < 25 and '.' not in entity.name:
            text_output_array.iloc[counter, 0] = entity.name
            text_output_array.iloc[counter, 1] = entity_type.name
            text_output_array.iloc[counter, 2] = entity.salience
            counter += 1
        else:
            counter2 += 1
    celebrity_status = 0
    if len(entities) > 0:
        if entities[0].metadata.get('wikipedia_url', '-') != '-' and text_output_array.iloc[0, 1] == 'PERSON':
            celebrity_status = 1
        elif entities[1].metadata.get('wikipedia_url', '-') and text_output_array.iloc[1, 1] == 'PERSON':
            celebrity_status = 1
        else:
            celebrity_status = 0
    text_output_array = text_output_array.iloc[0:len(entities) - counter2, :]
    # Detects the sentiment of the text
    # sentiment = client.analyze_sentiment(document=document).document_sentiment
    wns = WordNetSimilarity()
    keywords_target = pd.Series.to_list(text_output_array[0])
    # keywords_target = list(set(keywords_target))
    # seen = set(keywords_target)
    # keywords_target = []
    # for x in keywords_target:
    #     if x not in seen:
    #         keywords_target.append(x)
    #         seen.add(x)
    # keywords_target = seen
    forbidden_keywords = [
        'medicine', 'drug', 'fun', 'hospital', 'suicide', 'death', 'mental',
        'health', 'illness', 'insta', ',man', 'woman', 'family', 'people',
        'many', 'place', 'same', 'others', 'brain', 'all', 'end', 'statement',
        'lot', 'condolences'
    ]
    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, keywords_target))
    res = list(set(keywords_target) - set(selected_files))
    regex = re.compile(r'^@')
    selected_files = list(filter(regex.search, res))
    res = list(set(keywords_target) - set(selected_files))
    regex = re.compile(r"\b[A-Z][A-Z]+\b")
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    regex = re.compile(r'([A-Z]([a-z])+)')
    selected_files = list(filter(regex.search, res))
    res = list(set(res) - set(selected_files))
    for key in range(len(res)):
        if ' ' in res[key]:
            res[key] = res[key].split(' ')[0]
    for x in range(len(res)):
        for y in range(len(forbidden_keywords)):
            if res[x] == forbidden_keywords[y]:
                res[x] = []
    res = list(filter(None, res))
    res_dictionary = Counter(res)
    res_output = res_dictionary.most_common(10)
    res_output = dict(res_output)
    res_output = list(res_output.keys())
    print(res_output)
    res = res_output[0:num_keywords]
    database = pd.read_csv(CURATED_LIST)  # ('/Users/vmutai/Projects/HMH/admin/microblog/app/yhmh_curated_articles.csv')
    if celebrity_status == 1:
        database = database[database.celebrity == 1]
    elif celebrity_status == 0:
        database = database[database.celebrity == 0]
    similarity_ranks = pd.DataFrame(np.zeros(database.shape[0]))
    for z in range(database.shape[0]):
        newlist = []
        N_rows = len(res)
        keywords_source = database.iloc[z, 4:4 + num_keywords]
        keywords_source = pd.Series.tolist(keywords_source)
        N_cols = len(keywords_source)
        # similarity_list = pd.DataFrame(np.zeros((N_rows, N_cols)))
        foo = [1]
        for x in range(len(res)):
            for y in range(len(keywords_source)):
                value = wns.word_similarity(res[x], keywords_source[y], 'lin')
                # similarity_matrix.at[x, y] = value
                foo.append(value)
        matrix_average = sum(foo) / np.count_nonzero(foo)
        similarity_ranks.at[z, 0] = matrix_average
    maximum = pd.DataFrame.idxmax(similarity_ranks)
    url_to_return = pd.Series.tolist(database.iloc[maximum, 0])
    print(url_to_return)
    title = pd.Series.tolist(database.iloc[maximum, 1])

    def output(title, res_output, url_to_return):
        a = {
            'header': title[0],
            'keywords_list': res_output,
            'url_recommendation': url_to_return[0]
        }
        print("JSON DUMP")
        print(a)
        try:
            return json.dumps(a)
        except:
            return "awesome2!"

    json_output = output(title, res_output, url_to_return)
    print(json_output)
    return json_output
ResultTemplateFlask = os.path.join(ResultPathContent, 'Trizifiier').replace('\\', '/')

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

if not os.path.exists(ResultTemplateFlask):  # create the templates and DataFormat folders
    os.mkdir(ResultTemplateFlask)
if not os.path.exists(ResultTemplateFlask + '/templates'):  # create the templates folder
    os.mkdir(ResultTemplateFlask + '/templates')
if not os.path.exists(ResultTemplateFlask + '/DataFormat'):  # create the DataFormat folder
    os.mkdir(ResultTemplateFlask + '/DataFormat')

# add the temporary dir here
temporar = configFile.temporPath

wns = WordNetSimilarity()

i = 0

# build file list
# direct = os.path.normpath(ResultBiblioPath)
# direct = os.path.normpath(ResultClaimsPath)
direct = os.path.normpath(ResultAbstractPath)

# list the URL of each txt document in the folder of the submitted query;
# En holds all the URLs of the English folder, etc.
Fr, En, Unk = GenereListeFichiers(direct)


def convert_tag(tag):
    tag_dict = {'N': 'n', 'J': 'a', 'R': 'r', 'V': 'v'}
    try:
        return tag_dict[tag[0]]
    except KeyError:
        return None
class WSD:

    def __init__(self, wsd_method='maxsim', sim_name='wpath'):
        '''
        wsd_methods = ['random_sense', 'first', 'frequent', 'maxsim', 'graph', 'lesk', 'naive']
        sim_name = ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath']
        '''
        self._method = wsd_method
        self._sim_name = sim_name
        self._wn_sim = WordNetSimilarity()

    def disambiguate_graph(self, sentence):
        words_origin = word_tokenize(sentence)
        # extract words that have a synset in WordNet, currently support NOUN.
        words = [w for w in words_origin if self._wn_sim.word2synset(w)]
        # map words to synsets
        words_synsets = {w: self._wn_sim.word2synset(w) for w in words}
        # construct synsets list
        synsets = list(itertools.chain.from_iterable([words_synsets[w] for w in words]))
        # remove duplicate synsets
        synsets = list(set(synsets))
        # define semantic similarity metric
        sim_metric = lambda x, y: self._wn_sim.similarity(x, y, self._sim_name)
        # construct similarity graphs
        sim_graph = SimGraph(synsets, sim_metric)
        # get pagerank scores of synsets
        rank_scores = sim_graph.page_rank()
        results = []
        for w in words_origin:
            if w in words:
                candidate_scores = {s: rank_scores[s] for s in words_synsets[w]}
                results.append((w, Counter(candidate_scores).most_common(1)[0][0]))
            else:
                results.append((w, None))
        return results

    def classify(self, featureset):
        context = featureset['context']
        senses = featureset['senses']
        return self.max_senses(context, senses)

    def context2words(self, sent):
        words = word_tokenize(sent.lower())
        words = [w for w in words if len(w) > 2]
        return lemmatization(words)

    def random_sense(self, word):
        senses = self._wn_sim.word2synset(word)
        return random.choice(senses)

    def first_sense(self, word):
        senses = self._wn_sim.word2synset(word)
        return senses[0]

    def word_sense_similarity(self, word, sense):
        word_senses = self._wn_sim.word2synset(word)
        scorer = lambda x: self._wn_sim.similarity(x, sense, self._sim_name)
        sim_scores = map(scorer, word_senses) + [0.0]
        return max(sim_scores)

    def max_senses(self, context, senses):
        if len(senses) == 1:
            return senses[0]
        context_words = self.context2words(context)
        result = {}
        for ss in senses:
            scorer = lambda x: self.word_sense_similarity(x, ss)
            sim_score = sum(map(scorer, context_words))
            result[ss] = sim_score
        return Counter(result).most_common(1)[0][0]

    def max_sim(self, context, word):
        senses = self._wn_sim.word2synset(word)
        return self.max_senses(context, senses)

    def lesk(self, context, word):
        from nltk.wsd import lesk as nltk_lesk
        context_words = self.context2words(context)
        return nltk_lesk(context_words, word, 'n')
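A minimal usage sketch for the WSD class above; the context sentence is invented, and 'maxsim'/'wpath' are simply the defaults from __init__.

# Illustrative only: max_sim(context, word) scores each sense of the word
# against the context words and returns the best-scoring synset.
wsd = WSD(wsd_method='maxsim', sim_name='wpath')
print(wsd.max_sim('I deposited money at the bank', 'bank'))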
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transform input text into feature representation
    """

    def __init__(self, corpus, feature_num=10, model='onehot', wn_method='path',
                 vec_file='models/GoogleNews-vectors-negative300.bin', binary=True):
        """
        :param corpus: use a corpus to train a vector representation
        :param feature_num: number of dimensions
        :param model: onehot or wordnet or word2vec or both
        """
        self._model = model
        self._wn_method = wn_method
        self._features = self.extract_features(corpus, feature_num)
        self._wns = WordNetSimilarity() if model == 'wordnet' or model == 'both' else None
        self._wvs = WordVecSimilarity(vec_file, binary) if model == 'word2vec' or model == 'both' else None

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return X

    def extract_features(self, corpus, feature_num=10):
        cat_word = {}
        for sent, cat in corpus:
            cat_word.setdefault(cat, []).extend(lemmatization(word_tokenize(sent)))
        features = {cat: Counter(cat_word[cat]) for cat in cat_word}
        feature_words = []
        for c, f in features.iteritems():
            words, counts = zip(*f.most_common(feature_num))
            feature_words.extend(list(words))
        feature_words = set(feature_words)
        return feature_words

    def similarity(self, tokens, feature, method='wordnet'):
        if method == 'wordnet':
            sim = lambda x: self._wns.word_similarity(feature, x, self._wn_method)
        else:
            sim = lambda x: self._wvs.word_similarity(feature, x)
        return max(map(sim, tokens) + [0.0])

    def unigram_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['contains({})'.format(f)] = (f in words)
        return features

    def wordnet_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
        return features

    def word2vec_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['w2v({})'.format(f)] = self.similarity(words, f, method='word2vec')
        return features

    def semantic_features(self, tokens):
        words = set(tokens)
        features = {}
        for f in self._features:
            features['wns({})'.format(f)] = self.similarity(words, f)
            features['w2v({})'.format(f)] = self.similarity(words, f, method='word2vec')
        return features

    def transform(self, X):
        tokenize = lambda x: lemmatization(word_tokenize(x))
        X_tokens = map(tokenize, X)
        if self._model == 'onehot':
            return map(self.unigram_features, X_tokens)
        elif self._model == 'wordnet':
            return map(self.wordnet_features, X_tokens)
        elif self._model == 'word2vec':
            return map(self.word2vec_features, X_tokens)
        elif self._model == 'both':
            return map(self.semantic_features, X_tokens)
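A minimal usage sketch for the TextPreprocessor above; the toy corpus and test sentence are invented, and model='onehot' avoids loading WordNet or word vectors.

# Illustrative only: the corpus is a list of (sentence, category) pairs,
# exactly the shape consumed by extract_features() above.
corpus = [('the soup was delicious', 'food'),
          ('the waiter was very friendly', 'service')]
prep = TextPreprocessor(corpus, feature_num=5, model='onehot')
print(list(prep.transform(['delicious soup and friendly staff'])))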
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sematch.semantic.similarity import WordNetSimilarity

WNS = WordNetSimilarity()

# NOTE: For reference see:
# https://pdfs.semanticscholar.org/1374/617e135eaa772e52c9a2e8253f49483676d6.pdf


def random_sentences(num_rand_sentences, df_main):
    """Select num_rand_sentences at random from the DataFrame

    Args:
        num_rand_sentences (int): the number of sentences to select at random

    Return:
        list: list of sentences
    """
    size = num_rand_sentences
    indices = np.random.randint(0, df_main.shape[0], size)
    tokenized_subset = df_main['tokenized_sentence'].dropna()
    sentence_subset = df_main['sentence'].dropna()
    lecture_subset = df_main['lecture'].dropna()
    start_time_subset = df_main['start_time'].dropna()
    end_time_subset = df_main['end_time'].dropna()
    random_tokenized_sentences = map(lambda x: tokenized_subset[x], indices)
    random_normal_sentences = map(lambda x: sentence_subset[x], indices)
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Predicate semantic similarity in Python 2

import numpy
import json
import sys

from sematch.semantic.similarity import WordNetSimilarity
from nltk.wsd import lesk
from nltk.corpus import wordnet_ic
from nltk.corpus.reader.wordnet import information_content

brown_ic = wordnet_ic.ic('ic-brown.dat')
wns = WordNetSimilarity()

# arg1 and arg2: predicates represented in strings separated by underscores,
# e.g. cast_member or star
preA = sys.argv[1].split("_")
preB = sys.argv[2].split("_")

# arg3: pairwise similarity matrix in which rows are separated by underscores,
# e.g. 0.6_0.5, or 0.6,0.7_0.3,0.4
data = []
for a in preA:
    row = []
    for b in preB:
        wdsim = wns.word_similarity(a, b, 'wup')
        row.append(wdsim)
    data.append(row)
data = numpy.matrix(data)
# max values in rows
        t = re.sub(" ?(f|ht)(tp)(s?)(://)(.*)[.|/](.*)", ' ', t)
        t = re.sub("@\w+ ?", ' ', t)
        t = re.sub("[^\w\s]|[\d]", ' ', t)
        t = re.sub(stop, ' ', t)
        t = re.sub("\s+", ' ', t)
        t = t.split()
        t = [w for w in t if w.isalpha()]
        t = [wordnet_lemmatizer.lemmatize(w) for w in t]
        clean.append(t)
    return clean


cleanCleanCat1 = cleanTexts(categoryList1)
cleanCleanCat2 = cleanTexts(categoryList2)

wns = WordNetSimilarity()
similarCategories = []
for cat in cleanCleanCat1:
    sims = []
    for t in cleanCleanCat2:
        TextSim = []
        for w in cat:
            # wdsSim = [1 if w == wr else wns.word_similarity(w, wr, 'li') for wr in t]
            wdsSim = [wns.word_similarity(w, wr, 'li') for wr in t]
            TextSim.extend(wdsSim)
        sims.append((cleanCleanCat2.index(t), sum(TextSim)))
    if max(sims, key=lambda x: x[1])[1] > 0:
        similarCategories.append(
            (max(sims, key=lambda x: x[1])[0], max(sims, key=lambda x: x[1])[1]))
    else:
# pip install sematch
# nltk.download('wordnet_ic')
# You also need to edit one of the sematch library files (sparql) in case you
# are using Python 3: you need to change the print statement.
from sematch.semantic.similarity import WordNetSimilarity
import pandas as pd

wns = WordNetSimilarity()

words = ['artist', 'musician', 'scientist', 'physicist', 'actor', 'movie']
sim_matrix = [[wns.word_similarity(w1, w2, 'wpath') for w1 in words] for w2 in words]
df = pd.DataFrame(sim_matrix, index=words, columns=words)
print(df)

print(wns.word_similarity("Dog", "Cat"))
from sematch.semantic.similarity import WordNetSimilarity

L1 = []
L2 = []
L3 = []
wns = WordNetSimilarity()

# Computing English word similarity using the Li method.
# word_similarity() takes two words plus the method name ('li'), so the three
# terms are compared pairwise before being grouped together.
x = min(wns.word_similarity('programmer', 'coder', 'li'),
        wns.word_similarity('coder', 'software engineer', 'li'))
if x > 0.7:
    L1.append('programmer')
    L1.append('coder')
    L1.append('software engineer')

# Computing English word similarity using the Li method
x = min(wns.word_similarity('software program', 'computer software', 'li'),
        wns.word_similarity('computer software', 'software system', 'li'))
if x > 0.7:
    L1.append('software program')
    L1.append('computer software')
    L1.append('software system')
from sematch.semantic.similarity import WordNetSimilarity
import codecs

wns = WordNetSimilarity()
poems = codecs.open('generatedpoems.txt', 'r', encoding='utf-8')
data = open('data.txt', 'a')
for x in poems:
    temp_words = x.split(" ")
    total = 0
    count = 0
    for y in range(len(temp_words) - 1):
        total += wns.word_similarity(temp_words[y], temp_words[y + 1], 'li')
        count += 1
    total /= count
    data.write(str(total) + '\n')
data.close()
poems.close()
# print wns.word_similarity(w1, w2, 'li')
class fmodel(object):

    def __init__(self):
        self.out = {}
        self.keras = keras_similar()
        self.classifier = Qclassifier()
        self.spell = Spelling()
        self.wn = WordNetSimilarity()
        self.en_nlp = spacy.load("en_core_web_md")
        self.stopwords_en = []
        with open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               'utils', 'stopwords_en.txt')) as f:
            self.stopwords_en = f.read().splitlines()

    def ent_nltk(self, sentence):
        ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
        iob_tagged = tree2conlltags(ne_tree)
        ents = [[0, 0, 10]]
        for i in range(len(iob_tagged)):
            each = iob_tagged[i]
            if each[2] != 'O':
                if ents[-1][2] == (i - 1):
                    ents[-1][0] += " " + each[0]
                    ents[-1][2] = i
                else:
                    ents.append([each[0], each[2][2:], i])
        if len(ents) > 1:
            ents = ents[1:]
            ents = [ent[0] for ent in ents]
        else:
            ents = []
        return ents

    def mini_similar(self, q1, q2):
        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0], "keywords": [[""], [""]],
                    "numbers": [[], []], "entities": [[], []],
                    "max_keywords": 0, "keywords_sim": 0}
        regex = re.compile('[^a-zA-Z0-9]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)
        if q1 == q2:
            self.out['sim'] = 1
            self.out['sim_per'] = 100
            return self.out
        else:
            s1 = self.wn.word_similarity(q1, q2, 'lin')
            print(s1)
            if s1 > 0.9:
                self.out['sim'] = 1
                self.out['sim_per'] = 100
                return self.out
            elif s1 > 0.8:
                self.out['sim'] = 1
                self.out['sim_per'] = s1  # max([s1, s2, s3])
                return self.out
        return self.out

    def is_one_word(self, q1, q2):
        l1 = q1
        l2 = q2
        flag1 = False
        flag2 = False
        stop = True
        word1 = ""
        word2 = ""
        if len(l1) != len(l2):
            return False
        else:
            for i in range(len(l1)):
                if l1[i].text != l2[i].text or l1[i].lemma_ != l2[i].lemma_:
                    if flag2:
                        return False
                    elif l1[i].text in self.stopwords_en and l2[i].text in self.stopwords_en:
                        word1 = l1[i].text
                        word2 = l2[i].text
                        flag1 = True
                    else:
                        word1 = l1[i].lemma_
                        word2 = l2[i].lemma_
                        flag1 = True
                        flag2 = True
            if flag1:
                self.out = self.mini_similar(word1, word2)
                return True

    def similar(self, text, challenge):
        if not isinstance(text, str) or not isinstance(challenge, str):
            q1 = text
            q2 = challenge
        else:
            q1 = normalizr.normalize(text, normalizations)
            q2 = normalizr.normalize(challenge, normalizations)
        q1 = self.spell.correct_str(q1, True)
        q2 = self.spell.correct_str(q2, True)
        if (len(q1.split()) == 1 and len(q2.split()) == 1) or (q1 == q2):
            return self.mini_similar(q1, q2)
        regex = re.compile(u'/')  # [^a-zA-Z]')
        q1 = regex.sub('', q1)
        q2 = regex.sub('', q2)
        self.out = {'sim': 0, 'sim_per': 0.0, 'keras': 0.0, 'class': ["", ""],
                    'f_class': 0, "sentiment": [0, 0, 0], "keywords": [[""], [""]],
                    "numbers": [[], []], "entities": [[], []],
                    "max_keywords": 0, "keywords_sim": 0.0}
        q1_neg_list = list(set(mark_negation(q1.split())[0]))
        q2_neg_list = list(set(mark_negation(q2.split())[0]))
        if q1 == "" or q2 == "":
            return self.out
        sq1 = self.en_nlp(q1)
        sq2 = self.en_nlp(q2)
        if self.is_one_word(sq1, sq2):
            return self.out
        count = 0
        start_time = time.time()
        entsq1 = self.ent_nltk(q1)
        entsq2 = self.ent_nltk(q2)
        self.out['entities'][1] = entsq2
        self.out['entities'][0] = entsq1
        for ent in sq1.ents:
            if ent.text not in entsq1:
                # self.out['entities'][0].append([ent.label_, ent.text])
                self.out['entities'][0].append(ent.text)
        for ent in sq2.ents:
            if ent.text not in entsq2:
                # self.out['entities'][1].append((ent.label_, ent.text))
                self.out['entities'][1].append(ent.text)
        if self.out['entities'][0]:
            if self.out['entities'][1]:
                if len(self.out['entities'][0]) != len(self.out['entities'][1]):
                    return self.out
                self.out['max_keywords'] += len(
                    set(self.out['entities'][0] + self.out['entities'][1]))
                for each in self.out['entities'][0]:
                    if each in self.out['entities'][1]:
                        count += 1
                    else:
                        return self.out
            else:
                return self.out
        elif self.out['entities'][1]:
            return self.out
        elapsed_time = time.time() - start_time
        self.out['keras'] = self.keras.similar(q1, q2)
        self.out['sentiment'][0] = get_sentiment_values(q1)[1]['compound']
        self.out['sentiment'][1] = get_sentiment_values(q2)[1]['compound']
        self.out['sentiment'][2] = abs(self.out['sentiment'][0] - self.out['sentiment'][1])
        if abs(self.out['sentiment'][0]) > 0.3 and abs(self.out['sentiment'][1]) > 0.3:
            if self.out['sentiment'][2] >= 0.6:
                return self.out
        start_time = time.time()
        self.out['class'][0] = self.classifier.classify_question(sq1)
        self.out['class'][1] = self.classifier.classify_question(sq2)
        self.out['f_class'] = (self.out['class'][0] == self.out['class'][1])
        self.out['keywords'][0], self.out['numbers'][0] = extract_features(sq1)
        self.out['keywords'][1], self.out['numbers'][1] = extract_features(sq2)
        self.out['max_keywords'] += len(
            set(self.out['keywords'][0] + self.out['keywords'][1]))
        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1
        for each in self.out['keywords'][0]:
            if each in self.out['keywords'][1]:
                if (each in q1_neg_list and each not in q2_neg_list) or (
                        each in q2_neg_list and each not in q1_neg_list):
                    self.out['max_keywords'] += 1
                else:
                    if each in self.stopwords_en:
                        count += 0.30
                        # self.out['max_keywords'] -= 1
                    else:
                        count += 1
        if self.out['numbers'][0]:
            self.out['max_keywords'] += 1
            if self.out['numbers'][1]:
                self.out['max_keywords'] += 1
                if self.out['numbers'][1] != self.out['numbers'][0]:
                    return self.out
        elif self.out['numbers'][1]:
            self.out['max_keywords'] += 1
        if self.out['class'][0] > 0 and self.out['class'][1] > 0:
            self.out['max_keywords'] += 1
        if self.out['f_class']:
            if self.out['max_keywords'] > 1:
                count += 1
            else:
                count += 0.35
        # keywords_s1 = [x for x in keywords_s1 if x not in keywords_s2]
        # keywords_s3 = [x for x in keywords_s2 if x not in keywords_s1]
        if self.out['max_keywords'] < 1:
            self.out['keywords_sim'] = 0
        else:
            self.out['keywords_sim'] = (count / self.out['max_keywords']) * 100
        self.out['sim_per'] = (self.out['keywords_sim'] + self.out['keras']) / 2.0
        # print(self.out['keywords_sim'], count, self.out['max_keywords'])
        '''
        k_value = []
        s_value = []
        k = 100.0
        s = 30.0
        k_step = 10.0
        s_step = 4.0
        self.out["sim_per"] = (self.out['keywords_sim'] + self.out['keras']) / 2
        for i in range(7):
            k -= k_step
            s += s_step
            k_value.append(k)
            s_value.append(s)
        '''
        s_value = [34.0, 40.0, 50.0, 55.0, 60.0, 60.0, 60.0]
        k_value = [90.0, 85.0, 80.0, 75.0, 70.0, 60.0, 30.0]
        if self.out['keras'] >= k_value[0]:
            if self.out['keywords_sim'] >= s_value[0]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[1]:
            if self.out['keywords_sim'] >= s_value[1]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[2]:
            if self.out['keywords_sim'] >= s_value[2]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[3]:
            if self.out['keywords_sim'] >= s_value[3]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[4]:
            if self.out['keywords_sim'] >= s_value[4]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[5]:
            if self.out['keywords_sim'] >= s_value[5]:
                self.out['sim'] = 1
                return self.out
        elif self.out['keras'] > k_value[6]:
            if self.out['keywords_sim'] >= s_value[6]:
                self.out['sim'] = 1
                return self.out
        return self.out

    def similarr(self, text, questions=list()):
        answer, max_similarity = None, 0
        if not text or len(questions) == 0:
            return answer, max_similarity
        for question in questions:
            try:
                result = self.similar(text.lower(), question.get('question').lower())
            except:
                result = self.similar(text, question.get('question'))
            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if max_similarity <= confidence <= 100:
                    max_similarity = confidence
                    answer = question.get('id')
                    # print("round stop\n")
                    if max_similarity >= 95:
                        break
        # print('[Stop]')
        return answer, max_similarity

    def get_suggestions(self, text=None, texts=list()):
        res = []
        s = []
        min_confidence = 45
        for each in texts:
            result = self.similar(text, each.get('question').lower())
            if result.get('sim') == 1:
                confidence = result.get('sim_per')
                if 100 >= confidence > min_confidence:
                    if each.get('rich_text'):
                        response = each.get('rich_text')
                    else:
                        flow = int(each.get('response').replace('flow-', ''))
                        flow = Flow.objects.filter(id=flow).values('id', 'name', 'category__name')
                        if flow.exists():
                            response = [{'flow': flow}]
                        else:
                            response = None
                    if response:
                        res.append((confidence, each.get('id'), response, each.get('question')))
        s = sorted(res, key=operator.itemgetter(0), reverse=True)[:3]
        suggestions = []
        for e in s:
            if e[2]:
                messages = []
                for m in e[2]:
                    messages.append({'message': format_message(m)})
                suggestions.append({'confidence': e[0], 'id': e[1], 'message': messages})
        return suggestions
from sematch.semantic.similarity import WordNetSimilarity
# import jieba
# import synonyms
# import jieba.posseg as pseg

wns = WordNetSimilarity()
wns.monol_word_similarity('狗', '猫', 'cmn', 'wup')

# print(wns.word_similarity('dog', 'cat', 'li'))
# print(wns.monol_word_similarity('忧患', '安乐', 'cmn', 'wup'))
print(wns.monol_word_similarity('狗', '猫', 'cmn', 'wup'))
print(wns.monol_word_similarity('猫', '狗', 'cmn', 'wup'))
# print(wns.monol_word_similarity('电脑', '键盘', 'cmn', 'wup'))
# print(wns.monol_word_similarity('电脑', '电脑', 'cmn', 'wup'))
# print(wns.monol_word_similarity('国家', '国家', 'cmn', 'wup'))

# def parse_token(data):
#     words = []
#     for d in data:
#         # jieba.enable_paddle()
#         seg_data = pseg.cut(data, use_paddle=True)  # default
#         # per_word = [str(word) for word in seg_data if not str(word) in jieba_sp_words]
#         # for word, flag in seg_data:
#         #     print(f'{word}, {flag}')
#         # words.append(seg_data)
#     return seg_data
#
#
# def word_flag(sentence: list):
#     for word, flag in sentence:
#         return word, flag
#
from sematch.semantic.similarity import WordNetSimilarity

wn_sim = WordNetSimilarity()

w1 = 'gil'
lang1 = 'pol'
w2 = "sowa"
lang2 = 'pol'

result = []
# for sim_type in ['path', 'lch', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath', 'zhou']:
for sim_type in ['path', 'wup', 'li', 'res', 'lin', 'jcn', 'wpath', 'zhou']:
    sim = wn_sim.crossl_word_similarity(w1, w2, lang1, lang2, sim_type)
    tmp = {'name': sim_type, 'sim': sim}
    result.append(tmp)
    print(tmp)

# result[3] is 'res', which is unbounded information content, hence divided by 10
avg = (result[0]['sim'] + result[1]['sim'] + result[2]['sim'] +
       result[3]['sim'] / 10 + result[4]['sim'] + result[5]['sim'] +
       result[6]['sim']) / 7
print("average from other methods: " + str(avg))
from datetime import datetime
from csv import DictReader
from math import exp, log, sqrt
from random import random, shuffle
import pickle
import sys
import string
import numpy as np

from sematch.semantic.similarity import WordNetSimilarity
from config import path

wns = WordNetSimilarity()

import string

# Note: str.__add__ returns a new string, so these calls do not actually
# modify string.punctuation.
string.punctuation.__add__('!!')
string.punctuation.__add__('(')
string.punctuation.__add__(')')
string.punctuation.__add__('?')
string.punctuation.__add__('.')
string.punctuation.__add__(',')

# from gensim.models import Word2Vec
# model = Word2Vec.load_word2vec_format(path + 'GoogleNews-vectors-negative300.bin', binary=True)  # C binary format
# print model.vocab
model = None


def remove_punctuation(x):
    new_line = [w for w in list(x) if w not in string.punctuation]
    new_line = ''.join(new_line)
    return new_line
from flask import Flask, json, request, render_template as template
from sematch.application import Matcher
from sematch.semantic.similarity import ConceptSimilarity, WordNetSimilarity
from sematch.semantic.similarity import YagoTypeSimilarity, EntitySimilarity
from sematch.semantic.graph import DBpediaDataTransform, Taxonomy
import os

DEBUG = True
SECRET_KEY = 'Secret_development_key'
DATA_FILE = 'data/data.txt'

app = Flask(__name__)
app.config.from_object(__name__)

wn_sim = WordNetSimilarity()
yago_sim = YagoTypeSimilarity()
matcher = Matcher()
dbpedia_sim = ConceptSimilarity(Taxonomy(DBpediaDataTransform()), 'models/dbpedia_type_ic.txt')
entity = EntitySimilarity()

from search import text_lsa, text_tfidf, data


@app.route('/api/text_search')
def text_search():
    query = request.args.get('query')
    result = text_tfidf.search(query)
    result_data = []