def save_merged_ground_truth_data(
        full_selection,
        filename=util.resource(default_ground_truth_data_file)):
    # Merge the two sets of topic labels ('ls' and 'dm') and save the result.
    # NOTE: the full_selection argument is currently unused.
    data_processor = TrainingDataProcessing()
    data_ls = data_processor.read_topic_labels(
        util.resource('labeled-topics-1000-ls.txt'))
    data_dm = data_processor.read_topic_labels(
        util.resource('labeled-topics-1000-dm.txt'))
    data = merge_ground_truth(data_ls, data_dm)
    data_processor.save_topic_labels(data.keys(), data.values(), filename)
def __init__(
        self,
        subcat_index_file=util.resource('wikipedia/uri-to-subcats'),
        supercat_index_file=util.resource('wikipedia/uri-to-supercats')):
    # Look up category relations through the Wikipedia graph index and
    # cache them in insertion-ordered child/parent maps.
    self._wiki_graph = WikipediaGraphIndex(
        subcat_index_file=subcat_index_file,
        supercat_index_file=supercat_index_file)
    self._children = collections.OrderedDict()
    self._parents = collections.OrderedDict()
def depth_based_selection(root=default_root, max_depth=default_max_depth):
    # Select all categories reachable from the root within max_depth steps.
    relation_cache = CategoryRelationCache(
        subcat_index_file=util.resource('wikipedia/uri-to-subcats'),
        supercat_index_file=util.resource('wikipedia/uri-to-supercats'))
    full_selection = CategorySelection(root, max_depth=max_depth,
                                       relation_cache=relation_cache)
    full_selection.run()
    return full_selection
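# Usage sketch: run the selection from the default root, then save the merged
# two-annotator labels. This assumes save_merged_ground_truth_data (defined
# above, possibly in another module) is importable here; as noted there, it
# does not yet use the selection it is given.
full_selection = depth_based_selection()
save_merged_ground_truth_data(full_selection)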
def __init__(self,
             acm_concept_file=util.resource("acm-concepts.txt"),
             acm_rels_file=util.resource("acm-relations.txt"),
             acm_mapping_file=util.resource("acm-wiki-mapping.txt")):
    # Load the ACM concepts, the ACM-to-Wikipedia URI mapping (kept in both
    # directions), and the parent-to-children relations.
    self._concepts = self._read_concepts(acm_concept_file)
    acm_ids, wiki_uris = self._read_mapping(acm_mapping_file)
    self._wiki2acm = dict(zip(wiki_uris, acm_ids))
    self._acm2wiki = {v: k for k, v in self._wiki2acm.items()}
    self._children = self._read_relations(acm_rels_file)
def read_ground_truth_data(
        filename=util.resource('labeled-relations-new-1000-dm.txt')):
    data = read_relation_type_labels(filename)
    is_class = topic_type.topic_type_prediction(remember_gt=True).is_class

    def node_to_type_char(title):
        uri = title_to_uri(title, category=True)
        return 'Class' if is_class(uri) else 'Individual'

    data['parent_type'] = data['parent'].apply(node_to_type_char)
    data['child_type'] = data['child'].apply(node_to_type_char)
    # Manually correct the nodes that the classifier gets wrong.
    # (.loc replaces the long-deprecated .ix indexer.)
    data.loc[data['parent'] == 'Computational linguistics',
             'parent_type'] = 'Individual'
    data.loc[data['child'] == 'Twitter', 'child_type'] = 'Individual'
    data.loc[data['child'] == 'Populous', 'child_type'] = 'Individual'
    data.loc[data['child'] == 'Canary Islands', 'child_type'] = 'Individual'
    data.loc[data['parent'] == 'Algorithms and data structures',
             'parent_type'] = 'Individual'
    data.loc[data['child'] == 'Computational statistics',
             'child_type'] = 'Individual'
    data.loc[data['child'] == 'Digital electronics',
             'child_type'] = 'Individual'
    data.loc[data['parent'] == 'Computer storage media',
             'parent_type'] = 'Class'
    data.loc[data['parent'] == 'IEC 61131', 'parent_type'] = 'Class'
    data.loc[data['child'] == 'IEC 61131', 'child_type'] = 'Class'
    data.loc[data['parent'] == 'Digital cameras', 'parent_type'] = 'Class'
    data.loc[data['child'] == 'Sony cameras', 'child_type'] = 'Class'
    data.loc[data['parent'] == 'Computer companies', 'parent_type'] = 'Class'
    data.loc[data['parent'] == 'Mathematics of computing',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == '1970s in computer science',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == '1980s in computer science',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == 'Health informatics',
             'parent_type'] = 'Individual'
    data.loc[data['parent'] == '1990s in video gaming',
             'parent_type'] = 'Individual'
    data.loc[data['child'] == 'Anime based on video games',
             'child_type'] = 'Class'
    data.loc[data['child'] == 'Android cameras with optical zoom',
             'child_type'] = 'Class'
    data.loc[data['parent'] == 'Algorithms', 'parent_type'] = 'Class'
    data.loc[data['child'] == 'Algorithms', 'child_type'] = 'Class'
    return data
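# Usage sketch: load the labeled relations and inspect how parent/child node
# types are distributed after the manual corrections (gt is a pandas
# DataFrame with parent, child, parent_type and child_type columns).
gt = read_ground_truth_data()
print(gt.groupby(['parent_type', 'child_type']).size())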
def topic_type_prediction(
        topic_uris=None,
        classes=None,
        ground_truth_file=util.resource('labeled-topic-types-1000-dm.txt'),
        n_folds=10,
        param_grid=_PARAM_GRID,
        tuned_clf=LinearSVC(loss='hinge'),  # 'hinge' replaces the old 'l1' name
        scoring='f1',
        random_state=0,
        remember_gt=False):
    # Fall back to the bundled ground-truth labels when no explicit
    # topic URIs are given.
    if ground_truth_file and not topic_uris:
        topic_uris, classes = read_ground_truth(ground_truth_file)
    return TopicTypePrediction(topic_uris, classes,
                               n_folds=n_folds,
                               param_grid=param_grid,
                               tuned_clf=tuned_clf,
                               scoring=scoring,
                               random_state=random_state,
                               remember_gt=remember_gt)
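# Usage sketch: build a predictor from the bundled labels; remember_gt=True
# makes ground-truth URIs bypass the trained model, which is how
# read_ground_truth_data above uses the predictor's is_class attribute.
predictor = topic_type_prediction(remember_gt=True)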
def generate_topic_classifier():
    # Train a cross-validated topic-type classifier on the ground-truth
    # labels and wrap it in a closure that prefers the known labels.
    topic_uris, labels = read_node_type_labels(
        util.resource('labeled-topic-types-1000-dm.txt'))
    classes = [label_to_class(label) for label in labels]
    features = generate_default_features(topic_uris)
    cv_clf = train_cv_clf(topic_uris, classes, features)
    best_clf = cv_clf.best_estimator_
    best_clf.fit(to_features(features, topic_uris), classes)
    ground_truth_class = dict(zip(topic_uris, classes))

    def is_class(uri):
        # Ground-truth URIs short-circuit the classifier.
        if uri in ground_truth_class:
            return ground_truth_class[uri]
        return best_clf.predict(to_features(features, [uri]))[0]

    return is_class
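# Usage sketch: query the returned closure with any category URI. This
# assumes title_to_uri from dswont.dbpedia is imported; the category name
# below is only illustrative.
is_class = generate_topic_classifier()
print(is_class(title_to_uri('Algorithms', category=True)))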
def __init__(
        self,
        subcat_index_file=util.resource('wikipedia/uri-to-subcats'),
        supercat_index_file=util.resource('wikipedia/uri-to-supercats')):
    self._subcat_index_file = subcat_index_file
    self._supercat_index_file = supercat_index_file
def read_ground_truth_data(
        filename=util.resource(default_ground_truth_data_file)):
    return TrainingDataProcessing().read_topic_labels(filename)
def read_ground_truth_topic_labels():
    data_processing = topics.TrainingDataProcessing()
    return data_processing.read_topic_labels(
        util.resource('labeled-topics-music-1000-dm.txt'))
import logging

logging.basicConfig(level=logging.WARN)
# Silence the verbose urllib logger.
logging.getLogger('requests.packages.urllib3.connectionpool').setLevel(
    logging.WARN)

from dswont import topics
from dswont import util
from dswont import dbpedia

# <codecell>

ROOT_CATEGORY_MUSIC = 'http://dbpedia.org/resource/Category:Music'
DEFAULT_SELECTION_DEPTH = 9
DEFAULT_RELATION_CACHE = topics.CategoryRelationCache(
    subcat_index_file=util.resource('wikipedia/uri-to-subcats-music'),
    supercat_index_file=util.resource('wikipedia/uri-to-supercats-music'))


def music_category_selection(**params):
    updated_params = {
        'root': ROOT_CATEGORY_MUSIC,
        'relation_cache': DEFAULT_RELATION_CACHE
    }
    updated_params.update(params)
    selection = topics.CategorySelection(**updated_params)
    selection.run()
    return selection


def precompute_full_selection(precomputed_data={}):
    # The body was truncated in the original; the completion below is an
    # assumed reconstruction that memoizes the full selection in the mutable
    # default argument so it is computed only once.
    if 'selection' not in precomputed_data:
        precomputed_data['selection'] = music_category_selection(
            max_depth=DEFAULT_SELECTION_DEPTH)
    return precomputed_data['selection']
from dswont.dbpedia import title_to_uri, uri_to_title, is_category_uri
from dswont import util

import pandas as pd
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

WTX_NODE_TYPE_FILE = util.resource('wikitaxonomy/node-types.txt')
WTX_NODE_REL_FILE = util.resource('wikitaxonomy/rel-types.txt')


def nodes_data(file):
    data = pd.read_csv(file, sep=' ', names=['node', 'type'])
    data['node'] = data['node'].str.replace('_', ' ')
    data = data.set_index('node')
    data['is_class'] = data['type'].apply(lambda x: 'class' == x)
    return data


def rel_data(file):
    # The separator is a regex (a raw string, so '\s' is not a bad escape);
    # engine='python' avoids the parser warning for regex separators.
    data = pd.read_csv(file, sep=r' -> |\s', names=['parent', 'child'],
                       engine='python')
    data['parent'] = data['parent'].str.replace('_', ' ')
    data['child'] = data['child'].str.replace('_', ' ')
    data = data.set_index(['parent', 'child'])
    return data


def to_title(title_or_uri):
    if is_category_uri(title_or_uri):
        return uri_to_title(title_or_uri)
    # Assumed completion of the truncated original: plain titles pass through.
    return title_or_uri
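# Usage sketch: load the wikitaxonomy node and relation labels and report the
# fraction of nodes labeled as classes alongside the number of relations.
nodes = nodes_data(WTX_NODE_TYPE_FILE)
rels = rel_data(WTX_NODE_REL_FILE)
print(nodes['is_class'].mean(), len(rels))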
def print_common_suffixes():
    gt_file = util.resource('labeled-topic-types-1000-dm.txt')
    topic_uris, _ = read_ground_truth(gt_file)
    all_suffixes = generate_all_suffices(topic_uris)
    print("Most common suffixes:",
          generate_common_suffixes(all_suffixes)[:20])