Ejemplo n.º 1
0
def test_evaluation():
    """Load the three cleaned NED datasets and print how many records each holds."""
    from sematch.utility import FileIO
    dataset_paths = ['dataset/ned/query_ned_cleaned.txt',
                     'dataset/ned/question_ned_cleaned.txt',
                     'dataset/ned/tweet_ned_cleaned.txt']
    for path in dataset_paths:
        records = FileIO.read_json_file(path)
        print(len(records))
Ejemplo n.º 2
0
def test_entity_feature():
    """Collect the unique candidate entities of the question dataset and export their features."""
    from sematch.utility import FileIO
    from sematch.nlp import EntityFeature
    import itertools
    query = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    question = FileIO.read_json_file('dataset/ned/question_ned_cleaned.txt')
    tweet = FileIO.read_json_file('dataset/ned/tweet_ned_cleaned.txt')
    # Flatten every question's candidate list, then deduplicate.
    all_candidates = itertools.chain.from_iterable(q['candidate'] for q in question)
    unique_candidates = list(set(all_candidates))
    print(len(unique_candidates))
    EntityFeature.candidate_features(unique_candidates, export_file='models/question_features.json')
Ejemplo n.º 3
0
 def graph_ic_reader(self, filename):
     """
     Read previously exported IC values from disk.

     :param filename: JSON file holding records with 'concept' and 'ic' keys
     :return: a dictionary mapping each concept to its IC value as a float
     """
     ic_table = {}
     for record in FileIO.read_json_file(filename):
         ic_table[record['concept']] = float(record['ic'])
     return ic_table
Ejemplo n.º 4
0
 def load(cls, feature_dict_file='models/entity_features.json'):
     """Build an instance from a saved entity-feature JSON file.

     :param feature_dict_file: path to the exported feature dictionary
     :return: new instance wrapping {dbr: (description, categories)}
     """
     from sematch.utility import FileIO
     raw = FileIO.read_json_file(feature_dict_file)
     lookup = dict((item['dbr'], (item['desc'], item['cat'])) for item in raw)
     return cls(lookup)
Ejemplo n.º 5
0
 def __init__(self,
              graph_ic='models/yago_type_ic.txt',
              mappings="models/type-linkings.txt"):
     """Initialize WordNet similarity plus YAGO type mappings and graph IC scores.

     :param graph_ic: file with precomputed IC values for the type graph
     :param mappings: JSON file linking WordNet offsets to YAGO/DBpedia types
     """
     WordNetSimilarity.__init__(self)
     self._graph_ic = GraphIC(graph_ic)
     self._mappings = FileIO.read_json_file(mappings)
     # Build both lookup tables in a single pass over the mappings.
     offset_index = {}
     yago_index = {}
     for entry in self._mappings:
         offset_index[entry['offset']] = entry
         yago_index[entry['yago_dbpedia']] = entry['offset']
     self._id2mappings = offset_index
     self._yago2id = yago_index
Ejemplo n.º 6
0
def test_feature_extractor():
    """Print word features for the first ten unique query candidates."""
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    import itertools
    nlp = SpaCyNLP()
    w_extractor = FeatureExtractor(nlp.pos_tag)
    entity_feats = EntityFeature.load(feature_dict_file='models/query_features.json')
    queries = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    flattened = itertools.chain.from_iterable(q['candidate'] for q in queries)
    distinct_candidates = list(set(flattened))
    for candidate in distinct_candidates[:10]:
        print(w_extractor.entity_word_features([candidate], entity_feats))
Ejemplo n.º 7
0
 def candidate_features(cls, candidates, export_file='models/candidate_features.json',
                        feature_dict_file='models/entity_features.json'):
     from sematch.utility import FileIO
     entity_features = FileIO.read_json_file(feature_dict_file)
     entity_features = {e['dbr']: (e['desc'], e['cat']) for e in entity_features}
     features = []
     for i, can in enumerate(candidates):
         print i, " ", can
         data = {}
         data['dbr'] = can
         data['desc'] = entity_features[can][0] if can in entity_features else None
         data['cat'] = entity_features[can][1] if can in entity_features else []
         features.append(data)
     FileIO.save_json_file(export_file, features)
     return features
Ejemplo n.º 8
0
def test_query_ned():
    """Evaluate word-level entity disambiguation on the query dataset.

    Sweeps six WordNet similarity metrics and context sizes K=1..20,
    printing the weighted F1 score for every combination.
    """
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    sy = SpaCyNLP()
    features = EntityFeature.load(
        feature_dict_file='models/query_features.json')
    extractor = FeatureExtractor(features, sy.pos_tag)
    ned = EntityDisambiguation(extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()

    # Keep only queries for which context features can be extracted.
    dataset = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    dataset = [item for item in dataset
               if extractor.context_features(item['query'])]
    print(len(dataset))
    import warnings
    warnings.filterwarnings("ignore")
    for metric in ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']:
        print(metric)
        sim = lambda w1, w2: wns.word_similarity(w1, w2, metric)
        for window in range(1, 21):
            labels = []
            guesses = []
            for item in dataset:
                labels.append(item['gold'])
                guesses.append(ned.word_disambiguate(item['query'],
                                                     item['candidate'],
                                                     sim,
                                                     K=window))
            from sklearn.metrics import precision_recall_fscore_support
            score = precision_recall_fscore_support(labels, guesses,
                                                    average='weighted')[2]
            print(score)
Ejemplo n.º 9
0
def test_query_ned():
    """Benchmark entity disambiguation across WordNet similarity metrics.

    For each of six metrics and each context size K from 1 to 20, runs
    word-based disambiguation on the query dataset and prints weighted F1.
    """
    from sematch.nlp import FeatureExtractor
    from sematch.nlp import EntityFeature
    from sematch.nlp import SpaCyNLP
    from sematch.utility import FileIO
    from sematch.semantic.relatedness import TextRelatedness
    from sematch.nel import EntityDisambiguation
    import itertools
    tagger = SpaCyNLP()
    entity_features = EntityFeature.load(feature_dict_file='models/query_features.json')
    feat_extractor = FeatureExtractor(entity_features, tagger.pos_tag)
    disambiguator = EntityDisambiguation(feat_extractor)
    rel = TextRelatedness()
    from sematch.semantic.similarity import WordNetSimilarity
    wns = WordNetSimilarity()

    queries = FileIO.read_json_file('dataset/ned/query_ned_cleaned.txt')
    # Drop queries whose text yields no usable context features.
    queries = [q for q in queries if feat_extractor.context_features(q['query'])]
    print(len(queries))
    import warnings
    warnings.filterwarnings("ignore")
    for name in ['path', 'wup', 'res', 'lin', 'jcn', 'wpath']:
        print(name)
        measure = lambda a, b: wns.word_similarity(a, b, name)
        for K in range(1, 21):
            gold = [q['gold'] for q in queries]
            predict = [disambiguator.word_disambiguate(q['query'], q['candidate'], measure, K=K)
                       for q in queries]
            from sklearn.metrics import precision_recall_fscore_support
            print(precision_recall_fscore_support(gold, predict, average='weighted')[2])
Ejemplo n.º 10
0
 def candidate_features(cls,
                        candidates,
                        export_file='models/candidate_features.json',
                        feature_dict_file='models/entity_features.json'):
     from sematch.utility import FileIO
     entity_features = FileIO.read_json_file(feature_dict_file)
     entity_features = {
         e['dbr']: (e['desc'], e['cat'])
         for e in entity_features
     }
     features = []
     for i, can in enumerate(candidates):
         print i, " ", can
         data = {}
         data['dbr'] = can
         data['desc'] = entity_features[can][
             0] if can in entity_features else None
         data['cat'] = entity_features[can][
             1] if can in entity_features else []
         features.append(data)
     FileIO.save_json_file(export_file, features)
     return features
Ejemplo n.º 11
0
 def load_dataset(self):
     """Read the aspect dataset and split it into texts and labels.

     :return: tuple (X, y) of parallel tuples — texts and their labels
     """
     records = FileIO.read_json_file('dataset/aspect/data.txt')
     pairs = [(record['text'], record['label']) for record in records]
     texts, labels = zip(*pairs)
     return texts, labels
Ejemplo n.º 12
0
 def load_dataset(self):
     """Load aspect-classification data as parallel text/label sequences.

     :return: (X, y) where X holds the texts and y the matching labels
     """
     data = FileIO.read_json_file('dataset/aspect/data.txt')
     X, y = zip(*((d['text'], d['label']) for d in data))
     return X, y
Ejemplo n.º 13
0
 def __init__(self, graph_ic='models/yago_type_ic.txt', mappings="models/type-linkings.txt"):
     """Set up WordNet similarity plus YAGO type mappings and graph IC scores.

     :param graph_ic: file with precomputed IC values for the YAGO type graph
     :param mappings: JSON file linking WordNet offsets to YAGO/DBpedia types
     """
     WordNetSimilarity.__init__(self)
     self._graph_ic = GraphIC(graph_ic)
     self._mappings = FileIO.read_json_file(mappings)
     self._id2mappings = dict((entry['offset'], entry)
                              for entry in self._mappings)
     self._yago2id = dict((entry['yago_dbpedia'], entry['offset'])
                          for entry in self._mappings)
Ejemplo n.º 14
0
 def load(cls, name_dict_file='models/name.dict'):
     """Create an instance from a saved name-to-concepts dictionary.

     :param name_dict_file: JSON file of records with 'name' and 'concepts' keys
     :return: new instance wrapping the {name: concepts} mapping
     """
     from sematch.utility import FileIO
     records = FileIO.read_json_file(name_dict_file)
     mapping = {}
     for record in records:
         mapping[record['name']] = record['concepts']
     return cls(mapping)
Ejemplo n.º 15
0
 def load(cls, feature_dict_file='models/entity_features.json'):
     """Construct an instance from precomputed entity features on disk.

     :param feature_dict_file: JSON file of entity feature records
     :return: instance over {dbr: (description, categories)}
     """
     from sematch.utility import FileIO
     table = {}
     for item in FileIO.read_json_file(feature_dict_file):
         table[item['dbr']] = (item['desc'], item['cat'])
     return cls(table)
Ejemplo n.º 16
0
 def load(cls, name_dict_file='models/name.dict'):
     """Alternate constructor reading the name dictionary from disk.

     :param name_dict_file: path to the exported name dictionary
     :return: instance built over {name: concepts}
     """
     from sematch.utility import FileIO
     entries = FileIO.read_json_file(name_dict_file)
     return cls(dict((e['name'], e['concepts']) for e in entries))