def test_evaluation(): from sematch.utility import FileIO query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt') question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt') tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt') print len(query) print len(question) print len(tweet)
def test_entity_feature(): from sematch.utility import FileIO from sematch.nlp import EntityFeature query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt') question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt') tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt') import itertools candidates = list(itertools.chain.from_iterable(map(lambda x:x['candidate'], question))) set_candidates = list(set(candidates)) print len(set_candidates) EntityFeature.candidate_features(set_candidates, export_file='models/question_features.json')
def graph_ic_reader(self, filename):
    """Load previously saved IC values from a JSON file.

    :param filename: file whose records carry 'concept' and 'ic' keys
    :return: a dictionary mapping each concept to its IC as a float
    """
    ic_map = {}
    for record in FileIO.read_json_file(filename):
        ic_map[record['concept']] = float(record['ic'])
    return ic_map
def load(cls, feature_dict_file='models/entity_features.json'):
    """Build an instance from a saved entity-feature JSON file.

    Each record holds a DBpedia resource ('dbr'), a description ('desc')
    and a category list ('cat'); the instance is constructed from a
    dbr -> (desc, cat) mapping.
    """
    from sematch.utility import FileIO
    records = FileIO.read_json_file(feature_dict_file)
    lookup = dict((r['dbr'], (r['desc'], r['cat'])) for r in records)
    return cls(lookup)
def __init__(self, graph_ic='models/yago_type_ic.txt', mappings="models/type-linkings.txt"):
    """Initialise with graph IC values and YAGO/DBpedia type mappings.

    :param graph_ic: file of precomputed IC values for graph concepts
    :param mappings: JSON file of type-linking records
    """
    WordNetSimilarity.__init__(self)
    self._graph_ic = GraphIC(graph_ic)
    self._mappings = FileIO.read_json_file(mappings)
    # Index the mapping records in one pass: by WordNet offset, and from
    # YAGO/DBpedia type back to offset.
    id_index = {}
    yago_index = {}
    for entry in self._mappings:
        id_index[entry['offset']] = entry
        yago_index[entry['yago_dbpedia']] = entry['offset']
    self._id2mappings = id_index
    self._yago2id = yago_index
def test_feature_extractor(): from sematch.nlp import FeatureExtractor from sematch.nlp import EntityFeature from sematch.nlp import SpaCyNLP from sematch.utility import FileIO import itertools sy = SpaCyNLP() w_extractor = FeatureExtractor(sy.pos_tag) features = EntityFeature.load(feature_dict_file='models/query_features.json') query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt') candidates = list(itertools.chain.from_iterable(map(lambda x: x['candidate'], query))) set_candidates = list(set(candidates)) for can in set_candidates[:10]: print w_extractor.entity_word_features([can], features)
def candidate_features(cls, candidates, export_file='models/candidate_features.json', feature_dict_file='models/entity_features.json'): from sematch.utility import FileIO entity_features = FileIO.read_json_file(feature_dict_file) entity_features = {e['dbr']: (e['desc'], e['cat']) for e in entity_features} features = [] for i, can in enumerate(candidates): print i, " ", can data = {} data['dbr'] = can data['desc'] = entity_features[can][0] if can in entity_features else None data['cat'] = entity_features[can][1] if can in entity_features else [] features.append(data) FileIO.save_json_file(export_file, features) return features
def test_query_ned(): from sematch.nlp import FeatureExtractor from sematch.nlp import EntityFeature from sematch.nlp import SpaCyNLP from sematch.utility import FileIO from sematch.semantic.relatedness import TextRelatedness from sematch.nel import EntityDisambiguation import itertools sy = SpaCyNLP() features = EntityFeature.load( feature_dict_file='models/query_features.json') extractor = FeatureExtractor(features, sy.pos_tag) ned = EntityDisambiguation(extractor) rel = TextRelatedness() from sematch.semantic.similarity import WordNetSimilarity wns = WordNetSimilarity() #print wns.word_similarity('cooling', 'air_conditioner', 'li') #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa') query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt') query = [q for q in query if extractor.context_features(q['query'])] print len(query) import warnings warnings.filterwarnings("ignore") metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath'] for m in metrics: print m similarity = lambda x, y: wns.word_similarity(x, y, m) for k in range(1, 21): gold = [] predict = [] for q in query: gold.append(q['gold']) #e = ned.text_disambiguate(q['query'], q['candidate'], similarity) e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k) predict.append(e) from sklearn.metrics import precision_recall_fscore_support #from sklearn.metrics import classification_report #print classification_report(gold, predict) print precision_recall_fscore_support(gold, predict, average='weighted')[2]
def test_query_ned(): from sematch.nlp import FeatureExtractor from sematch.nlp import EntityFeature from sematch.nlp import SpaCyNLP from sematch.utility import FileIO from sematch.semantic.relatedness import TextRelatedness from sematch.nel import EntityDisambiguation import itertools sy = SpaCyNLP() features = EntityFeature.load(feature_dict_file='models/query_features.json') extractor = FeatureExtractor(features, sy.pos_tag) ned = EntityDisambiguation(extractor) rel = TextRelatedness() from sematch.semantic.similarity import WordNetSimilarity wns = WordNetSimilarity() #print wns.word_similarity('cooling', 'air_conditioner', 'li') #similarity = lambda x,y : rel.text_similarity(x,y, model='lsa') query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt') query = [q for q in query if extractor.context_features(q['query'])] print len(query) import warnings warnings.filterwarnings("ignore") metrics = ['path', 'wup', 'res', 'lin', 'jcn', 'wpath'] for m in metrics: print m similarity = lambda x, y: wns.word_similarity(x, y, m) for k in range(1, 21): gold = [] predict = [] for q in query: gold.append(q['gold']) #e = ned.text_disambiguate(q['query'], q['candidate'], similarity) e = ned.word_disambiguate(q['query'], q['candidate'], similarity, K=k) predict.append(e) from sklearn.metrics import precision_recall_fscore_support #from sklearn.metrics import classification_report #print classification_report(gold, predict) print precision_recall_fscore_support(gold, predict, average='weighted')[2]
def candidate_features(cls, candidates, export_file='models/candidate_features.json', feature_dict_file='models/entity_features.json'): from sematch.utility import FileIO entity_features = FileIO.read_json_file(feature_dict_file) entity_features = { e['dbr']: (e['desc'], e['cat']) for e in entity_features } features = [] for i, can in enumerate(candidates): print i, " ", can data = {} data['dbr'] = can data['desc'] = entity_features[can][ 0] if can in entity_features else None data['cat'] = entity_features[can][ 1] if can in entity_features else [] features.append(data) FileIO.save_json_file(export_file, features) return features
def load_dataset(self):
    """Load the aspect dataset and return it as parallel (texts, labels).

    :return: two tuples — the texts and their corresponding labels
    """
    records = FileIO.read_json_file('dataset/aspect/data.txt')
    X, y = zip(*((record['text'], record['label']) for record in records))
    return X, y
def __init__(self, graph_ic='models/yago_type_ic.txt', mappings="models/type-linkings.txt"):
    """Initialise with graph IC values and YAGO/DBpedia type mappings.

    :param graph_ic: file of precomputed IC values for graph concepts
    :param mappings: JSON file of type-linking records
    """
    WordNetSimilarity.__init__(self)
    self._graph_ic = GraphIC(graph_ic)
    self._mappings = FileIO.read_json_file(mappings)
    # Index the mapping records by WordNet offset and by YAGO/DBpedia type.
    self._id2mappings = dict((m['offset'], m) for m in self._mappings)
    self._yago2id = dict((m['yago_dbpedia'], m['offset']) for m in self._mappings)
def load(cls, name_dict_file='models/name.dict'):
    """Create an instance from a saved name dictionary file.

    Each record maps a surface name ('name') to its concepts ('concepts').
    """
    from sematch.utility import FileIO
    entries = FileIO.read_json_file(name_dict_file)
    mapping = dict((entry['name'], entry['concepts']) for entry in entries)
    return cls(mapping)
def load(cls, feature_dict_file='models/entity_features.json'):
    """Load entity features (description and categories keyed by DBpedia
    resource) from *feature_dict_file* and build an instance from them.
    """
    from sematch.utility import FileIO
    feature_map = {}
    for item in FileIO.read_json_file(feature_dict_file):
        feature_map[item['dbr']] = (item['desc'], item['cat'])
    return cls(feature_map)