def _create_filtered_index(self,
                           source=dir_path + '../data/character_index.csv',
                           destination=dir_path + '../data/character_index_filtered.csv'):
    """Write the rows of ``source`` whose entity is a subject of a relation.

    First, every subject yielded by the relations file at
    ``self.path_relations`` is collected. Afterwards ``source`` is read row
    by row (two columns: entity name and character offset) and only rows
    whose entity was collected are written to ``destination``.

    :param source: path of the delimited index that is filtered.
    :param destination: path of the filtered index; opened for writing.
    """
    with io.open(source, 'rb') as index_file, \
            io.open(destination, 'w', encoding='utf8') as filtered_file:
        relation_line_count = line_counting.cached_counter.count_lines(
            self.path_relations)
        self.logger.print_info('Collecting important entities...')
        relations_reader = NTReader(self.path_relations)
        # Only the subject of each relation is of interest here.
        wanted_subjects = {
            subject
            for subject, _predicate, _object in tqdm(
                relations_reader.yield_cleaned_entry_names(),
                total=relation_line_count)
        }

        index_line_count = line_counting.cached_counter.count_lines(source)
        self.logger.print_info('Filtering important entities...')
        rows = csv.reader(index_file,
                          delimiter=self.delimiter,
                          encoding='utf-8',
                          quoting=csv.QUOTE_NONE)
        for entity, offset in tqdm(rows, total=index_line_count):
            if entity not in wanted_subjects:
                continue
            filtered_row = entity + self.delimiter + offset + '\n'
            filtered_file.write(filtered_row)
def __init__(self,
             articles_limit,
             use_dump=False,
             randomize=False,
             match_threshold=0.005,
             type_matching=True,
             allow_unknown_entity_types=True,
             print_interim_results=True,
             threads=4,
             resources_path=dir_path + '../data/mappingbased_objects_en.ttl',
             patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
             facts_output_path=dir_path + '../results/extracted_facts.nt',
             extended_facts_output_path=dir_path + '../results/extracted_facts_extended.txt'):
    """Set up the extractor and collect the resources to extract facts from.

    :param articles_limit: stored as ``self.articles_limit``.
    :param use_dump: forwarded to ``WikipediaConnector``.
    :param randomize: forwarded to ``NTReader`` together with
        ``resources_path``.
    :param match_threshold: stored as ``self.match_threshold``.
    :param type_matching: stored as ``self.type_matching``.
    :param allow_unknown_entity_types: stored under the same name.
    :param print_interim_results: stored under the same name.
    :param threads: stored as ``self.threads``.
    :param resources_path: file handed to ``NTReader``.
    :param patterns_input_path: handed to the parent constructor.
    :param facts_output_path: file handed to ``NTWriter``.
    :param extended_facts_output_path: stored under the same name.
    """
    super(FactExtractor, self).__init__(patterns_input_path)

    # Plain settings.
    self.articles_limit = articles_limit
    self.use_dump = use_dump
    self.threads = threads
    self.match_threshold = match_threshold
    self.type_matching = type_matching
    self.allow_unknown_entity_types = allow_unknown_entity_types
    self.print_interim_results = print_interim_results
    self.extended_facts_output_path = extended_facts_output_path

    # Mutable state that is filled while working.
    self.discovery_resources = set()
    self.extracted_facts = []

    # Collaborators, constructed in the same order as before.
    self.nt_reader = NTReader(resources_path, randomize)
    self.wikipedia_connector = WikipediaConnector(use_dump)
    self.pattern_extractor = PatternExtractor()
    self.pattern_matcher = PatternMatcher()
    self.nt_writer = NTWriter(facts_output_path)

    # Disabled step: self._make_pattern_types_transitive()
    self._load_discovery_resources()
def __init__(self,
             dbpedia_facts_path=dir_path + '../data/mappingbased_objects_en.ttl',
             facts_input_path=dir_path + '../results/extracted_facts.nt',
             facts_output_path=dir_path + '../results/new_facts.nt'):
    """Create the two readers and the writer used by this object.

    :param dbpedia_facts_path: file handed to the DBpedia ``NTReader``.
    :param facts_input_path: file handed to the extracted-facts
        ``NTReader``.
    :param facts_output_path: file handed to the ``NTWriter``.
    """
    dbpedia_reader = NTReader(dbpedia_facts_path)
    extracted_reader = NTReader(facts_input_path)
    writer = NTWriter(facts_output_path)

    self.dbpedia_nt_reader = dbpedia_reader
    self.extracted_facts_nt_reader = extracted_reader
    self.nt_writer = writer
def __init__(self,
             resources_path=dir_path + '../data/mappingbased_objects_en.ttl',
             facts_limit=100000):
    """Prepare the type lookup, the facts reader and the logger.

    :param resources_path: facts file; stored and handed to ``NTReader``
        (with ``False`` as second argument).
    :param facts_limit: stored as ``self.facts_limit``.
    """
    # An alternative, explicitly parameterized EntityTypes setup was used
    # here before:
    # EntityTypes(types_paths=["../data/types_en.csv"], types_index=False,
    #             types_indexed_file=False)
    self.instance_types = EntityTypes()
    self.nt_reader = NTReader(resources_path, False)
    self.logger = Logger.from_config_file()

    self.resources_path = resources_path
    self.facts_limit = facts_limit
    self.delimiter = '#'
    self.predicates = {}
def __init__(self,
             facts_path=dir_path + '../data/mappingbased_objects_en.ttl',
             output_path=dir_path + '../data/type_patterns_raw.pkl',
             facts_limit=False):
    """Store the paths, normalize the facts limit and create empty state.

    :param facts_path: facts file; stored and handed to ``NTReader``.
    :param output_path: stored and forwarded to the parent constructor.
    :param facts_limit: maximum number of facts; any value that is not
        greater than zero (including the default ``False``) is replaced by
        ``sys.maxint``.
    """
    super(TypeLearner, self).__init__(None, output_path)
    self.facts_path = facts_path
    self.output_path = output_path

    if facts_limit > 0:
        self.facts_limit = facts_limit
    else:
        self.facts_limit = sys.maxint

    self.nt_reader = NTReader(facts_path)
    self.instance_types = EntityTypes()

    # Filled later on.
    self.subjects = {}
    self.objects = {}
    self.type_patterns = {}
def __init__(self,
             facts_limit,
             randomize=False,
             ground_truth_path=dir_path + '../pattern_testing/ground_truth.ttl'):
    """Create the ground-truth reader, the logger and the empty counters.

    :param facts_limit: stored as ``self.facts_limit``.
    :param randomize: stored and forwarded to ``NTReader``.
    :param ground_truth_path: file handed to ``NTReader``.
    """
    self.facts_limit = facts_limit
    self.randomize = randomize
    self.nt_reader = NTReader(ground_truth_path, randomize)
    self.logger = Logger.from_config_file()

    self.fact_extractor = None
    self.results = {}

    # One counter per outcome, each keyed by relation type.
    self.known_facts_counter = Counter()
    self.right_facts_counter = Counter()
    self.wrong_facts_counter = Counter()
class FactCleaner(object):
    """Writes the extracted facts that do not occur in the DBpedia facts."""

    def __init__(self,
                 dbpedia_facts_path=dir_path + '../data/mappingbased_objects_en.ttl',
                 facts_input_path=dir_path + '../results/extracted_facts.nt',
                 facts_output_path=dir_path + '../results/new_facts.nt'):
        """Create the two readers and the writer.

        :param dbpedia_facts_path: file handed to the DBpedia ``NTReader``.
        :param facts_input_path: file handed to the extracted-facts
            ``NTReader``.
        :param facts_output_path: file handed to the ``NTWriter``.
        """
        dbpedia_reader = NTReader(dbpedia_facts_path)
        extracted_reader = NTReader(facts_input_path)
        writer = NTWriter(facts_output_path)

        self.dbpedia_nt_reader = dbpedia_reader
        self.extracted_facts_nt_reader = extracted_reader
        self.nt_writer = writer

    def clean_facts(self):
        """Write all extracted triples that are not DBpedia triples.

        Both inputs are read completely into sets of
        ``(subject, predicate, object)`` tuples; the set difference is passed
        to ``self.nt_writer.write_nt``.
        """
        known_triples = {
            (subject, predicate, object)
            for subject, predicate, object
            in self.dbpedia_nt_reader.yield_entries()
        }
        candidate_triples = {
            (subject, predicate, object)
            for subject, predicate, object
            in self.extracted_facts_nt_reader.yield_entries()
        }
        self.nt_writer.write_nt(candidate_triples.difference(known_triples))
def __init__(self,
             relation_types_limit,
             facts_limit,
             resources_path=dir_path + '../data/mappingbased_objects_en.ttl',
             relation_types=None,
             use_dump=False,
             randomize=False,
             perform_tests=False,
             type_learning=True,
             replace_redirects=False,
             patterns_output_path=dir_path + '../data/patterns_raw.pkl',
             threads=4):
    """Store the settings and create the collaborating objects.

    :param relation_types_limit: used as ``self.relation_types_limit`` only
        when no explicit ``relation_types`` are given.
    :param facts_limit: stored as ``self.facts_limit``.
    :param resources_path: file handed to ``NTReader``.
    :param relation_types: optional collection of relation names; every
        non-empty name is prefixed with ``http://dbpedia.org/ontology/``.
        ``None`` or an empty collection means any relation may be learned.
    :param use_dump: stored and forwarded to ``WikipediaConnector``.
    :param randomize: forwarded to ``NTReader``.
    :param perform_tests: stored as ``self.perform_tests``.
    :param type_learning: stored as ``self.type_learning``.
    :param replace_redirects: forwarded to ``WikipediaConnector`` as
        ``redirect``.
    :param patterns_output_path: forwarded to the parent constructor.
    :param threads: stored as ``self.num_of_threads``.
    """
    super(WikipediaPatternExtractor, self).__init__(None,
                                                    patterns_output_path)
    self.use_dump = use_dump
    self.facts_limit = facts_limit
    self.perform_tests = perform_tests
    self.type_learning = type_learning
    self.num_of_threads = threads

    self.wikipedia_connector = WikipediaConnector(use_dump=use_dump,
                                                  redirect=replace_redirects)
    self.pattern_extractor = PatternExtractor()
    self.nt_reader = NTReader(resources_path, randomize)
    self.logger = Logger.from_config_file()

    if relation_types is None or len(relation_types) == 0:
        # No restriction: any relation may be learned.
        self.relation_types = None
        self.relation_types_limit = relation_types_limit
    else:
        prefix = 'http://dbpedia.org/ontology/'
        self.relation_types = [
            prefix + name for name in relation_types if name
        ]
        # An explicit list overrides the configured limit.
        self.relation_types_limit = len(self.relation_types)

    self.dbpedia = {}
    self.matches = []
class FactExtractor(PatternTool):
    """Extracts facts from Wikipedia articles by matching learned patterns.

    Facts collected in ``self.extracted_facts`` are tuples of the form
    ``(subject, predicate, object, score, nl_sentence)``.
    """

    def __init__(self,
                 articles_limit,
                 use_dump=False,
                 randomize=False,
                 match_threshold=0.005,
                 type_matching=True,
                 allow_unknown_entity_types=True,
                 print_interim_results=True,
                 threads=4,
                 resources_path=dir_path + '../data/mappingbased_objects_en.ttl',
                 patterns_input_path=dir_path + '../data/patterns_cleaned.pkl',
                 facts_output_path=dir_path + '../results/extracted_facts.nt',
                 extended_facts_output_path=dir_path + '../results/extracted_facts_extended.txt'):
        """Set up the extractor and collect the discovery resources.

        :param articles_limit: maximum number of subjects collected by
            ``_load_discovery_resources``.
        :param use_dump: forwarded to ``WikipediaConnector``.
        :param randomize: forwarded to ``NTReader``.
        :param match_threshold: minimum match score a relation needs to be
            reported for a pattern.
        :param type_matching: whether entity types restrict the relations
            that are considered.
        :param allow_unknown_entity_types: whether entities without any
            known type are accepted.
        :param print_interim_results: whether facts are printed as soon as
            they were extracted from an article.
        :param threads: number of worker threads used by ``extract_facts``.
        :param resources_path: file handed to ``NTReader``.
        :param patterns_input_path: handed to the parent constructor.
        :param facts_output_path: file handed to ``NTWriter``.
        :param extended_facts_output_path: file the full fact tuples are
            written to by ``save_extracted_facts``.
        """
        super(FactExtractor, self).__init__(patterns_input_path)
        self.articles_limit = articles_limit
        self.use_dump = use_dump
        self.allow_unknown_entity_types = allow_unknown_entity_types
        self.match_threshold = match_threshold
        self.type_matching = type_matching
        self.nt_reader = NTReader(resources_path, randomize)
        self.wikipedia_connector = WikipediaConnector(self.use_dump)
        self.pattern_extractor = PatternExtractor()
        self.pattern_matcher = PatternMatcher()
        self.print_interim_results = print_interim_results
        self.discovery_resources = set()
        self.extracted_facts = []
        self.threads = threads
        self.nt_writer = NTWriter(facts_output_path)
        self.extended_facts_output_path = extended_facts_output_path

        # Disabled step: self._make_pattern_types_transitive()
        self._load_discovery_resources()

    @classmethod
    def from_config_file(cls):
        """Create an instance from the ``general`` and ``fact_extractor``
        sections of the configuration file."""
        config_parser = cls.get_config_parser()
        use_dump = config_parser.getboolean('general', 'use_dump')
        randomize = config_parser.getboolean('fact_extractor', 'randomize')
        articles_limit = config_parser.getint('fact_extractor',
                                              'articles_limit')
        match_threshold = config_parser.getfloat('fact_extractor',
                                                 'match_threshold')
        type_matching = config_parser.getboolean('fact_extractor',
                                                 'type_matching')
        allow_unknown_entity_types = config_parser.getboolean(
            'fact_extractor', 'allow_unknown_entity_types')
        num_of_threads = config_parser.getint('fact_extractor', 'threads')
        return cls(articles_limit,
                   use_dump,
                   randomize,
                   match_threshold,
                   type_matching,
                   allow_unknown_entity_types,
                   threads=num_of_threads)

    def _make_pattern_types_transitive(self):
        """Replace the type frequencies of every relation pattern by the
        result of ``PatternExtractor.get_transitive_types``."""
        for relation, pattern in self.relation_type_patterns.iteritems():
            pattern.subject_type_frequencies = self.pattern_extractor \
                .get_transitive_types(pattern.subject_type_frequencies)
            pattern.object_type_frequencies = self.pattern_extractor \
                .get_transitive_types(pattern.object_type_frequencies)

    @staticmethod
    def flat_map(list_of_lists):
        """Return the items of all inner iterables as one flat list."""
        return [item for sublist in list_of_lists for item in sublist]

    def _load_discovery_resources(self):
        """Fill ``self.discovery_resources`` with at most
        ``self.articles_limit`` subjects.

        Training resources are skipped. A subject is accepted when it shares
        a type with the subject types of any relation pattern, or when it
        has no types and unknown entity types are allowed.
        """
        article_counter = 0
        valid_types = set(
            FactExtractor.flat_map(
                self._get_specific_type_frequencies('subject').values()))

        self.logger.print_info('Collecting entities for fact extraction...')
        for subject, predicate, object in self.nt_reader.yield_entries():
            if article_counter == self.articles_limit:
                break
            if subject in self.training_resources \
                    or subject in self.discovery_resources:
                continue
            subject_types = set(
                self.pattern_extractor.get_entity_types(subject).keys())
            if (self.allow_unknown_entity_types and len(subject_types) == 0) \
                    or len(subject_types & valid_types) > 0:
                self.discovery_resources.add(subject)
                article_counter += 1

        self.logger.print_done(
            'Collecting entities for fact extraction completed: ' +
            str(len(self.discovery_resources)) + ' articles')

    def _match_pattern_against_relation_type_patterns(self, pattern,
                                                      reasonable_relations):
        """Return ``(relation, score)`` for every given relation whose
        match score reaches ``self.match_threshold``."""
        matching_relations = []
        for relation in reasonable_relations:
            relation_pattern = self.relation_type_patterns[relation]
            match_score = self.pattern_matcher.match_patterns(
                relation, relation_pattern, pattern, self.type_matching,
                self.allow_unknown_entity_types)
            if match_score >= self.match_threshold:
                matching_relations.append((relation, match_score))
        return matching_relations

    def _filter_reasonable_relations(self, entity, types_of_relations):
        """Return the set of relations whose types overlap the entity types.

        :param entity: entity whose types are looked up.
        :param types_of_relations: mapping of relation to its learned types.
        :return: all relations if the entity has no types and unknown types
            are allowed, otherwise the relations sharing at least one type.
        """
        reasonable_relations = set()
        entity_types = self.pattern_extractor.get_entity_types(entity)
        if self.allow_unknown_entity_types and len(entity_types) == 0:
            reasonable_relations = set(types_of_relations.keys())
        else:
            for relation, types in types_of_relations.iteritems():
                # Otherwise types were not learned in the training step.
                # In this case you probably have to adjust the config file
                # and rerun the training step.
                assert types is not None
                if len(entity_types & types) > 0:
                    reasonable_relations.add(relation)
        return reasonable_relations

    def _get_specific_type_frequencies(self, subject_or_object):
        """Map every relation to its subject or object type frequencies.

        :param subject_or_object: either ``'subject'`` or ``'object'``.
        :raises AssertionError: for any other value.
        """
        if subject_or_object == 'subject':
            return {
                relation: pattern.subject_type_frequencies
                for relation, pattern in
                self.relation_type_patterns.iteritems()
            }
        if subject_or_object == 'object':
            return {
                relation: pattern.object_type_frequencies
                for relation, pattern in
                self.relation_type_patterns.iteritems()
            }
        # Raised explicitly so the check also holds when asserts are
        # stripped (python -O); the exception type is unchanged.
        raise AssertionError(
            'expected "subject" or "object", got %r' % (subject_or_object,))

    def _extract_facts_from_sentences(self, sentences, subject_entity=None):
        """Return ``(predicate, object_link, score, nl_sentence)`` tuples
        for all links in the given sentences that match a relation pattern.

        Sentences with more than 50 tokens are skipped.
        """
        facts = []
        if self.type_matching:
            reasonable_relations_for_subject = \
                self._filter_reasonable_relations(
                    subject_entity,
                    self._get_specific_type_frequencies('subject'))
            # Identical for every link, hence built once up front.
            object_type_frequencies = \
                self._get_specific_type_frequencies('object')

        for sentence in sentences:
            if sentence.number_of_tokens() > 50:
                continue  # probably too long for stanford tokenizer
            relative_position = sentence.relative_pos
            nl_sentence = sentence.as_string()
            object_addresses_of_links = sentence.addresses_of_dbpedia_links()
            for object_link, object_addresses in \
                    object_addresses_of_links.iteritems():
                object_entity = uri_rewriting.strip_name(object_link)
                if self.type_matching:
                    reasonable_relations_for_object = \
                        self._filter_reasonable_relations(
                            object_entity, object_type_frequencies)
                    reasonable_relations = reasonable_relations_for_subject \
                        & reasonable_relations_for_object
                else:
                    reasonable_relations = self.relation_type_patterns

                if not len(reasonable_relations):
                    continue

                pattern = self.pattern_extractor.extract_pattern(
                    nl_sentence, object_addresses, relative_position,
                    self.type_matching, subject_entity, object_entity)
                if pattern is None:
                    continue

                matching_relations = \
                    self._match_pattern_against_relation_type_patterns(
                        pattern, reasonable_relations)
                facts.extend(
                    (predicate, object_link, score, nl_sentence)
                    for (predicate, score) in matching_relations)
        return facts

    def extract_facts_from_html(self, html, resource):
        """Extract the facts of one article.

        :param html: HTML of the article.
        :param resource: resource the article belongs to; it becomes the
            subject of every returned fact.
        :return: list of ``(resource, predicate, object, score,
            nl_sentence)`` tuples.
        """
        tagged_sentences = TaggedSentence.from_html(html)
        referenced_sentences = [
            sentence for sentence in tagged_sentences
            if sentence.contains_any_link()
        ]
        if self.type_matching:
            subject_entity = uri_rewriting.strip_name(resource)
        else:
            subject_entity = None
        facts = self._extract_facts_from_sentences(referenced_sentences,
                                                   subject_entity)
        facts = [(resource, predicate, object, score, nl_sentence)
                 for (predicate, object, score, nl_sentence) in facts]
        if self.print_interim_results:
            for fact in facts:
                print(fact)
        return facts

    def _extract_facts_from_resource(self, chunk=None):
        """Thread target: extract the facts of every resource in ``chunk``
        and append them to ``self.extracted_facts``."""
        self.logger.print_info('--- start fact extraction thread ----')
        if chunk is None:
            chunk = set()
        facts = []
        for resource in chunk:
            wikipedia_resource = \
                uri_rewriting.convert_to_wikipedia_resource_uri(resource)
            self.logger.print_info('--- ' + wikipedia_resource + ' ----')
            html = self.wikipedia_connector.get_wikipedia_article_html(
                resource)
            facts.extend(self.extract_facts_from_html(html, resource))
        # One extend per thread on the shared list.
        self.extracted_facts.extend(facts)

    @staticmethod
    def _chunks(data, size=10000):
        """Yield successive slices of ``data`` with at most ``size`` items.

        ``size`` must not be zero.
        """
        for i in range(0, len(data), size):
            yield data[i:i + size]

    def _remove_dead_objects(self):
        """Drop facts whose object contains ``redlink=1``."""
        self.extracted_facts = [
            (subject, predicate, object, score, nl_sentence)
            for (subject, predicate, object, score, nl_sentence)
            in self.extracted_facts
            if 'redlink=1' not in object
        ]

    def extract_facts(self):
        """Extract facts from all discovery resources using worker threads.

        The resources are split into at most ``self.threads`` chunks, one
        thread per chunk. Afterwards facts with dead objects are removed and
        the facts are sorted by score, highest first.
        """
        self.logger.print_info('Fact extraction...')
        resources = list(self.discovery_resources)
        threads = []
        if resources:
            # Float division: integer division would truncate before ceil,
            # which can give a chunk size of 0 (crash in range()) or more
            # chunks than configured threads.
            worker_count = max(1, self.threads)
            chunk_size = int(ceil(len(resources) / float(worker_count)))
            # gather resources for each thread
            for chunk in self._chunks(resources, chunk_size):
                t = Thread(target=self._extract_facts_from_resource,
                           kwargs={'chunk': chunk})
                threads.append(t)
        # start all threads
        for t in threads:
            t.start()
        # wait for all threads to finish
        for t in threads:
            t.join()
        self._remove_dead_objects()
        # The score is the fourth element of a fact tuple.
        self.extracted_facts.sort(key=lambda fact: fact[3], reverse=True)
        self.logger.print_done('Fact extraction completed')

    def save_extracted_facts(self):
        """Write the triples via ``self.nt_writer`` and the full fact tuples
        (one ``str(fact)`` per line) to ``self.extended_facts_output_path``.
        """
        short_facts = [(subject, predicate, object)
                       for (subject, predicate, object, score, nl_sentence)
                       in self.extracted_facts]
        self.nt_writer.write_nt(short_facts)

        with codecs.open(self.extended_facts_output_path, 'wb',
                         'utf-8') as fout:
            self.logger.print_info('\n\nSaving extended facts to "' +
                                   self.extended_facts_output_path + '"...')
            for fact in tqdm(self.extracted_facts):
                fout.write(str(fact) + '\n')

    @property
    def training_relation_types(self):
        """Relations for which a pattern is available."""
        return self.relation_type_patterns.keys()

    def set_print_interim_results(self, boolean):
        """Switch printing of facts during extraction on or off."""
        self.print_interim_results = boolean
class PatternTester(ConfigInitializer):
    """Runs the fact extractor on ground-truth facts and reports precision,
    recall and F-measure for every relation type."""

    def __init__(self,
                 facts_limit,
                 randomize=False,
                 ground_truth_path=dir_path + '../pattern_testing/ground_truth.ttl'):
        """Create the ground-truth reader, the logger and empty counters.

        :param facts_limit: maximum number of known facts per relation type.
        :param randomize: stored and forwarded to ``NTReader``.
        :param ground_truth_path: file handed to ``NTReader``.
        """
        self.facts_limit = facts_limit
        self.randomize = randomize
        self.nt_reader = NTReader(ground_truth_path, randomize)
        self.logger = Logger.from_config_file()

        self.fact_extractor = None
        self.results = {}

        # One counter per outcome, each keyed by relation type.
        self.known_facts_counter = Counter()
        self.right_facts_counter = Counter()
        self.wrong_facts_counter = Counter()

    @classmethod
    def from_config_file(cls):
        """Create an instance from the ``pattern_testing`` config section."""
        section = 'pattern_testing'
        parser = cls.get_config_parser()
        limit = parser.getint(section, 'facts_limit')
        shuffle = parser.getboolean(section, 'randomize')
        return cls(limit, shuffle)

    def _collect_testing_facts(self):
        """Collect the ground-truth facts used for testing.

        Creates the fact extractor on first use. Subjects that were used for
        training, predicates without a trained pattern and predicates that
        already reached ``facts_limit`` are skipped.

        :return: dict mapping each subject to a list of
            ``(predicate, object)`` tuples.
        """
        if self.fact_extractor is None:
            self.fact_extractor = FactExtractor.from_config_file()
        extractor = self.fact_extractor
        extractor.set_print_interim_results(False)
        training_resources = extractor.training_resources
        training_relations = extractor.training_relation_types

        facts_by_subject = {}
        collected = 0
        self.logger.print_info('Collecting facts for testing...')
        for subject, predicate, object in self.nt_reader.yield_entries():
            if collected == self.facts_limit * len(training_relations):
                break
            if subject in training_resources:
                self.logger.print_error(
                    'Resource: "' + subject +
                    '" was already used for training and thus won\'t be used for testing'
                )
                continue
            if predicate not in training_relations:
                continue
            if self.known_facts_counter[predicate] == self.facts_limit:
                continue

            # Each subject keeps a list of its (relation, target) pairs.
            if subject not in facts_by_subject:
                facts_by_subject[subject] = []
            facts_by_subject[subject].append((predicate, object))
            self.known_facts_counter[predicate] += 1
            collected += 1
        return facts_by_subject

    def get_testing_resources(self):
        """Return the set of all subjects in the ground-truth file."""
        return {
            subject
            for subject, predicate, object in self.nt_reader.yield_entries()
        }

    def test_patterns(self):
        """Extract facts for the test entities and count, per predicate,
        how many extracted facts are part of the ground truth."""
        test_entities = self._collect_testing_facts()
        extractor = self.fact_extractor
        extractor.discovery_resources = test_entities
        extractor.extract_facts()

        for fact in extractor.extracted_facts:
            print(fact)
            subject, predicate, object, score, nl_sentence = fact
            if (predicate, object) in test_entities[subject]:
                outcome_counter, verdict = self.right_facts_counter, 'Match'
            else:
                outcome_counter, verdict = self.wrong_facts_counter, 'No match'
            outcome_counter[predicate] += 1
            print(verdict)
            print('')

    @staticmethod
    def _calculate_f_measure(precision, recall):
        """Return the harmonic mean of precision and recall, or ``None`` if
        either is ``None`` or their sum is zero."""
        if precision is None or recall is None:
            return None
        if precision + recall == 0:
            return None
        return 2 * (precision * recall) / (precision + recall)

    @staticmethod
    def _soft_division(dividend, divisor):
        """Return ``dividend / divisor`` as float division, or ``None`` when
        the division raises ``ZeroDivisionError``."""
        try:
            quotient = dividend / float(divisor)
        except ZeroDivisionError:
            return None
        return quotient

    @staticmethod
    def _calculate_precision_recall_and_f_measure(total, right, wrong):
        """Return ``(precision, recall, f_measure)`` for the given counts;
        an entry is ``None`` where it is undefined."""
        divide = PatternTester._soft_division
        precision = divide(right, right + wrong)
        recall = divide(right, total)
        return (precision, recall,
                PatternTester._calculate_f_measure(precision, recall))

    def print_results(self):
        """Print one result line for every trained relation type."""
        for relation_type in self.fact_extractor.training_relation_types:
            total = self.known_facts_counter[relation_type]
            right = self.right_facts_counter[relation_type]
            wrong = self.wrong_facts_counter[relation_type]
            precision, recall, f_measure = \
                PatternTester._calculate_precision_recall_and_f_measure(
                    total, right, wrong)
            labelled_values = [
                (' Known facts:', total),
                (' Right:', right),
                (' Wrong:', wrong),
                (' Precision:', precision),
                (' Recall:', recall),
                (' F-Measure:', f_measure),
            ]
            line = relation_type
            for label, value in labelled_values:
                line = line + label + str(value)
            print(line)
class StatisticGenerator(object):
    """Computes statistics about the entity types of subjects and objects of
    the facts read from a facts file.

    ``collect_predicates`` has to be called before any statistic method; it
    fills ``self.predicates`` as ``{predicate: {subject: [object, ...]}}``.
    """

    def __init__(self,
                 resources_path=dir_path + '../data/mappingbased_objects_en.ttl',
                 facts_limit=100000):
        """Prepare the type lookup, the facts reader and the logger.

        :param resources_path: facts file; stored and handed to ``NTReader``.
        :param facts_limit: maximum number of facts to read.
        """
        # An alternative, explicitly parameterized EntityTypes setup was
        # used here before:
        # EntityTypes(types_paths=["../data/types_en.csv"],
        #             types_index=False, types_indexed_file=False)
        self.instance_types = EntityTypes()
        self.resources_path = resources_path
        self.nt_reader = NTReader(resources_path, False)
        self.logger = Logger.from_config_file()
        self.delimiter = '#'
        self.predicates = dict()
        self.facts_limit = facts_limit

    def collect_predicates(self, facts_limit=100000):
        """Read at most ``facts_limit`` facts into ``self.predicates``.

        Overwrites ``self.facts_limit`` and any previously collected facts.
        Subject and object names are passed through
        ``uri_rewriting.strip_cleaned_name`` before they are stored.
        """
        self.facts_limit = facts_limit
        self.predicates = dict()
        total_count = 0
        total_lines = min(
            line_counting.cached_counter.count_lines(self.resources_path),
            self.facts_limit)

        self.logger.print_info('Collecting facts for each predicate...')
        for subject, predicate, object in tqdm(self.nt_reader.yield_entries(),
                                               total=total_lines):
            total_count += 1
            if total_count > self.facts_limit:
                break

            subject = uri_rewriting.strip_cleaned_name(subject)
            object = uri_rewriting.strip_cleaned_name(object)
            self.predicates.setdefault(predicate, {}) \
                .setdefault(subject, []).append(object)

    def count_types(self):
        """Log how many facts have typed subjects and/or objects and the
        mean and standard deviation of the number of types.

        Type counts of ``outlier_threshold`` (100) or more are left out of
        the mean and standard deviation.
        """
        subject_counts = []
        object_counts = []
        has_both = 0
        has_exact_one = 0
        has_nothing = 0
        facts = 0
        outlier_threshold = 100

        for predicate in tqdm(self.predicates, total=len(self.predicates)):
            for subject in self.predicates[predicate]:
                subject_types = self.instance_types.get_types(subject)
                for object in self.predicates[predicate][subject]:
                    object_types = self.instance_types.get_types(object)
                    facts += 1
                    if subject_types:
                        if len(subject_types) < outlier_threshold:
                            subject_counts.append(len(subject_types))
                    if object_types:
                        if len(object_types) < outlier_threshold:
                            object_counts.append(len(object_types))
                    if subject_types and object_types:
                        has_both += 1
                    if not subject_types and not object_types:
                        has_nothing += 1
                    if (len(subject_types) > 0) ^ (len(object_types) > 0):
                        has_exact_one += 1

        # Histograms can be produced with `<series>.plot.hist(bins=100)`.
        subject_counts = pd.Series(subject_counts)
        object_counts = pd.Series(object_counts)

        self.logger.print_info('Facts: ' + str(facts))
        self.logger.print_info('With subject type: ' +
                               str(subject_counts.count()))
        self.logger.print_info('Mean subject type count: ' +
                               str(subject_counts.mean()))
        self.logger.print_info('Standard deviation subject type count: ' +
                               str(subject_counts.std()))
        self.logger.print_info('With object type: ' +
                               str(object_counts.count()))
        self.logger.print_info('Mean object type count: ' +
                               str(object_counts.mean()))
        self.logger.print_info('Standard deviation object type count: ' +
                               str(object_counts.std()))
        self.logger.print_info('Both with type(s): ' + str(has_both))
        self.logger.print_info('Exact one with type(s): ' +
                               str(has_exact_one))
        self.logger.print_info('None with type(s): ' + str(has_nothing))

    def test_types_independence(self, expectation_threshold=10):
        """Score, per predicate, how far subject and object types deviate
        from independence and write the scores to a CSV file.

        The file ``types_independence_<unix time>.csv`` starts with a header
        row (threshold, facts limit, fact-weighted average score) followed
        by one ``predicate, score`` row per predicate, sorted by score.
        Predicates without a type combination reaching
        ``expectation_threshold`` are left out. If no predicate is left, the
        average is written as ``None`` instead of dividing by zero.

        :param expectation_threshold: minimum expected count a type
            combination needs to be included in the score.
        """
        variances = {}
        total_included_count = 0
        sum_avg_variance = 0
        empty_token = '#empty'

        self.logger.print_info(
            'Collecting subject and object types for each predicate and calculating independence score...'
        )
        for predicate in tqdm(self.predicates, total=len(self.predicates)):
            predicate_count = 0
            predicate_subject_types = Counter()
            predicate_object_types = Counter()
            combinations = Counter()
            for subject in self.predicates[predicate]:
                # Work on a copy: `list.append` returns None (so its result
                # must not be assigned) and the list owned by the type
                # lookup must not be modified.
                subject_types = list(self.instance_types.get_types(subject))
                subject_types.append(empty_token)
                for object in self.predicates[predicate][subject]:
                    # TODO: check for occurrence in Wikipedia article
                    # TODO: exclude double underscores
                    predicate_count += 1
                    object_types = self.instance_types.get_types(object)
                    predicate_subject_types.update(subject_types)
                    predicate_object_types.update(object_types)
                    cross_product = [(s, o) for s in subject_types
                                     for o in object_types]
                    combinations.update(cross_product)

            variance = StatisticGenerator.calculate_independence_score(
                predicate_count, predicate_subject_types,
                predicate_object_types, combinations, expectation_threshold)
            if variance is None:
                continue
            variances[predicate] = variance
            sum_avg_variance += float(predicate_count) * variance
            total_included_count += predicate_count

        if total_included_count:
            total_avg_variance = sum_avg_variance / total_included_count
        else:
            # No predicate reached the threshold; there is no average.
            total_avg_variance = None

        with open("types_independence_" + str(int(time.time())) + ".csv",
                  'wb') as csv_file:
            writer = unicodecsv.writer(csv_file, delimiter=self.delimiter)
            writer.writerow([
                "Threshold", expectation_threshold, "Facts count",
                self.facts_limit, "Avg variance", total_avg_variance
            ])
            for predicate, variance in sorted(variances.items(),
                                              key=operator.itemgetter(1)):
                writer.writerow([predicate, variance])
                print(predicate, " ", variance)

    def measure_type_diversity(self, threshold=2):
        """Print ``predicate;subject specificity;object specificity`` for
        every predicate; a missing value is printed as ``-1``.

        :param threshold: currently unused; kept for compatibility.
        """
        subject_types_count = Counter()
        object_types_count = Counter()
        relation_subject_types = {}
        relation_object_types = {}
        facts = Counter()

        for predicate in tqdm(self.predicates, total=len(self.predicates)):
            relation_subject_types[predicate] = Counter()
            relation_object_types[predicate] = Counter()
            for subject in self.predicates[predicate]:
                subject_types = self.instance_types.get_types(subject)
                for object in self.predicates[predicate][subject]:
                    object_types = self.instance_types.get_types(object)
                    facts[predicate] += 1
                    for subject_type in subject_types:
                        relation_subject_types[predicate][subject_type] += 1
                        subject_types_count[subject_type] += 1
                    for object_type in object_types:
                        relation_object_types[predicate][object_type] += 1
                        object_types_count[object_type] += 1

        subject_specs = StatisticGenerator.calculate_specifity(
            facts, subject_types_count, relation_subject_types)
        object_specs = StatisticGenerator.calculate_specifity(
            facts, object_types_count, relation_object_types)

        both_specs = {}
        for predicate in subject_specs:
            both_specs[predicate] = {"subject": subject_specs[predicate]}
        for predicate in object_specs:
            both_specs.setdefault(predicate, {})
            both_specs[predicate]["object"] = object_specs[predicate]

        for predicate in both_specs:
            print(';'.join([
                predicate,
                str(both_specs[predicate].get("subject", -1)),
                str(both_specs[predicate].get("object", -1))
            ]))

    @staticmethod
    def calculate_specifity(facts, types, relation_types):
        """Return a specificity value for every predicate that has types.

        For each type of a predicate, the relative frequency of the type
        within the predicate's facts is compared with the frequency of the
        type outside of the predicate relative to all facts. The result is
        the mean absolute difference, weighted by the type's count within
        the predicate.

        :param facts: Counter of facts per predicate.
        :param types: Counter of type occurrences over all predicates.
        :param relation_types: dict mapping each predicate to a Counter of
            its type occurrences.
        :return: dict mapping predicate to its specificity.
        """
        total_facts = sum(facts.values())
        specifities = {}
        for predicate in relation_types:
            if len(set(relation_types[predicate])) == 0:
                continue
            deviations = 0
            for name, predicate_type_frequency in \
                    relation_types[predicate].most_common():
                predicate_relative_frequency = float(
                    predicate_type_frequency) / facts[predicate]
                total_frequency = float(
                    types[name] - predicate_type_frequency) / total_facts
                assert abs(predicate_relative_frequency -
                           total_frequency) <= 1
                deviations += predicate_type_frequency * abs(
                    predicate_relative_frequency - total_frequency)
            specifities[predicate] = float(deviations) / sum(
                relation_types[predicate].values())
        return specifities

    @staticmethod
    def calculate_independence_score(facts_count, subject_types,
                                     object_types, combinations,
                                     expectation_threshold):
        """Return the observation-weighted mean relative deviation of the
        observed type combinations from their expected counts.

        The expected count of a combination is
        ``subject_types[s] * object_types[o] / facts_count``; combinations
        whose expected count is below ``expectation_threshold`` are ignored.

        :return: the score, or ``None`` if no combination was included.
        """
        sum_rel_variance = 0
        included_combination_count = 0
        for combination, observed_count in combinations.most_common():
            subject_type, object_type = combination
            expected_count = float(
                subject_types[subject_type] *
                object_types[object_type]) / facts_count
            if expected_count < expectation_threshold:
                continue
            included_combination_count += observed_count
            rel_variance = (float(abs(observed_count - expected_count)) /
                            expected_count)
            sum_rel_variance += observed_count * rel_variance
        if included_combination_count == 0:
            return None
        return sum_rel_variance / included_combination_count
class TypeLearner(TypeTool):
    """Learns, for every predicate, how often each entity type occurs among
    the subjects and the objects of its facts."""

    def __init__(self,
                 facts_path=dir_path + '../data/mappingbased_objects_en.ttl',
                 output_path=dir_path + '../data/type_patterns_raw.pkl',
                 facts_limit=False):
        """Store the paths, normalize the facts limit and create empty state.

        :param facts_path: facts file; stored and handed to ``NTReader``.
        :param output_path: stored and forwarded to the parent constructor.
        :param facts_limit: maximum number of facts; any value that is not
            greater than zero (including the default ``False``) is replaced
            by ``sys.maxint``.
        """
        super(TypeLearner, self).__init__(None, output_path)
        self.facts_path = facts_path
        self.output_path = output_path

        if facts_limit > 0:
            self.facts_limit = facts_limit
        else:
            self.facts_limit = sys.maxint

        self.nt_reader = NTReader(facts_path)
        self.instance_types = EntityTypes()

        # Filled by learn_types().
        self.subjects = {}
        self.objects = {}
        self.type_patterns = {}

    @classmethod
    def from_config_file(cls):
        """Create an instance from the ``type_learner`` config section."""
        parser = cls.get_config_parser()
        return cls(facts_limit=parser.getint('type_learner', 'facts_limit'))

    @staticmethod
    def _update_entity_counter(entities, entity, predicate):
        """Count one more occurrence of ``predicate`` for ``entity``.

        :param entities: dict mapping stripped entity names to Counters of
            predicates; modified in place.
        """
        name = uri_rewriting.strip_cleaned_name(entity)
        if name not in entities:
            entities[name] = Counter()
        entities[name][predicate] += 1

    def _count_predicates(self):
        """Read at most ``self.facts_limit`` facts and count the predicates
        per subject, per object and per type pattern."""
        expected_lines = min(
            line_counting.cached_counter.count_lines(self.facts_path),
            self.facts_limit)

        self.logger.print_info(
            'Counting relations for subjects and objects...')
        progress = tqdm(self.nt_reader.yield_entries(), total=expected_lines)
        for position, (subject, predicate, object) in enumerate(progress, 1):
            if position > self.facts_limit:
                break
            self._update_entity_counter(self.subjects, subject, predicate)
            self._update_entity_counter(self.objects, object, predicate)
            pattern = self.type_patterns.setdefault(predicate, TypePattern())
            pattern.facts += 1

    def _get_types(self, entities):
        """Return ``{predicate: Counter(type -> quantity)}`` for the given
        entities.

        Every type of an entity is counted as often as the entity occurred
        with the predicate.

        :param entities: dict mapping entity names to Counters of predicates.
        """
        types_by_predicate = {}
        for entity in tqdm(entities, total=len(entities)):
            entity_types = self.instance_types.get_types(entity)
            for predicate, quantity in entities[entity].iteritems():
                # The predicate gets an entry even if the entity is untyped.
                type_counter = types_by_predicate.setdefault(
                    predicate, Counter())
                for entity_type in entity_types:
                    type_counter[entity_type] += quantity
        return types_by_predicate

    def _count_types(self):
        """Add the subject and object type counts to the type patterns."""
        log = self.logger.print_info

        log('Retrieving types for subjects...')
        per_predicate = self._get_types(self.subjects)
        log('Cumulating subject types for relations...')
        for predicate in tqdm(per_predicate, total=len(per_predicate)):
            self.type_patterns[predicate].subject_types += \
                per_predicate[predicate]

        log('Retrieving types for objects...')
        per_predicate = self._get_types(self.objects)
        log('Cumulating object types for relations...')
        for predicate in tqdm(per_predicate, total=len(per_predicate)):
            self.type_patterns[predicate].object_types += \
                per_predicate[predicate]

    def learn_types(self):
        """Run the whole type learning: count predicates, then types."""
        self.logger.print_info('Type learning...')
        self._count_predicates()
        self._count_types()
        self.logger.print_done('Type learning completed.')
class WikipediaPatternExtractor(PatternTool):
    """Extract patterns for DBpedia relations from Wikipedia sentences.

    Workflow: ``discover_patterns`` collects facts and the Wikipedia
    sentences mentioning their objects, ``extract_patterns`` turns those
    sentences into patterns, ``merge_patterns`` merges them per relation
    and ``save_patterns`` stores the result.
    """

    def __init__(self,
                 relation_types_limit,
                 facts_limit,
                 resources_path=dir_path +
                 '../data/mappingbased_objects_en.ttl',
                 relation_types=None,
                 use_dump=False,
                 randomize=False,
                 perform_tests=False,
                 type_learning=True,
                 replace_redirects=False,
                 patterns_output_path=dir_path + '../data/patterns_raw.pkl',
                 threads=4):
        """
        :param relation_types_limit: maximum number of distinct relation
                                     types to learn (ignored when
                                     ``relation_types`` names any)
        :param facts_limit:          maximum number of facts per relation
        :param resources_path:       file the facts are read from
        :param relation_types:       optional iterable of relation names
                                     (without ontology prefix) to restrict
                                     learning to
        :param use_dump:             passed to the ``WikipediaConnector``
        :param randomize:            passed to the ``NTReader``
        :param perform_tests:        passed to ``Pattern._merge``
        :param type_learning:        passed to the pattern extractor
        :param replace_redirects:    passed to the ``WikipediaConnector``
        :param patterns_output_path: passed to the base class
        :param threads:              number of worker threads
        """
        super(WikipediaPatternExtractor, self).__init__(
            None, patterns_output_path)
        self.use_dump = use_dump
        self.facts_limit = facts_limit
        self.perform_tests = perform_tests
        self.type_learning = type_learning
        self.wikipedia_connector = WikipediaConnector(
            use_dump=self.use_dump, redirect=replace_redirects)
        self.pattern_extractor = PatternExtractor()
        self.num_of_threads = threads
        self.nt_reader = NTReader(resources_path, randomize)
        self.logger = Logger.from_config_file()

        # Drop empty names *before* deciding whether a restriction applies:
        # otherwise an all-empty list yields a relation limit of 0 and
        # fact collection stops before the first fact.
        named_relation_types = [r for r in (relation_types or []) if r]
        if named_relation_types:
            self.relation_types = [
                'http://dbpedia.org/ontology/' + r
                for r in named_relation_types
            ]
            self.relation_types_limit = len(self.relation_types)
        else:
            self.relation_types = None  # means any relation may be learned
            self.relation_types_limit = relation_types_limit

        self.dbpedia = {}
        self.matches = []

    @classmethod
    def from_config_file(cls):
        """Build an instance from the project's config file."""
        config_parser = cls.get_config_parser()
        use_dump = config_parser.getboolean('general', 'use_dump')

        section = 'wikipedia_pattern_extractor'
        randomize = config_parser.getboolean(section, 'randomize')
        # This flag used to be read from the 'randomize' option (copy-paste
        # slip). Read the dedicated option when the config provides it and
        # keep the former value for configs that do not.
        # NOTE(review): assumes get_config_parser() returns a ConfigParser
        # (has_option) -- confirm.
        if config_parser.has_option(section, 'perform_tests'):
            perform_tests = config_parser.getboolean(section,
                                                     'perform_tests')
        else:
            perform_tests = randomize
        relation_types_limit = config_parser.getint(section,
                                                    'relation_types_limit')
        facts_limit = config_parser.getint(section, 'facts_limit')
        replace_redirects = config_parser.getboolean(section,
                                                     'replace_redirects')
        type_learning = config_parser.getboolean(section, 'type_learning')
        threads = config_parser.getint(section, 'threads')
        relation_types = config_parser.get(section, 'relation_types')
        relation_types = WikipediaPatternExtractor.split_string_list(
            relation_types)
        # filter empty entries and comments; a real list (not a lazy
        # filter object) so it can be measured and iterated repeatedly
        relation_types = [
            rt for rt in relation_types if rt != '' and ';' not in rt
        ]
        return cls(
            relation_types_limit,
            facts_limit,
            relation_types=relation_types,
            use_dump=use_dump,
            randomize=randomize,
            threads=threads,
            perform_tests=perform_tests,
            replace_redirects=replace_redirects,
            type_learning=type_learning)

    @staticmethod
    def split_string_list(string):
        """Split a comma separated string into its parts."""
        return string.split(',')

    # -------------------------------------------------------------------------------------------------
    #                               Data Preprocessing
    # -------------------------------------------------------------------------------------------------

    def parse_dbpedia_data(self):
        """
        Reads (subject, predicate, object) facts from the NT reader and
        returns a dictionary with subjects as keys and their relations as
        values, more precisely
        {subj: { predicate1: [obj1, obj2, ...], predicate2: [obj1, ...] } }

        At most ``relation_types_limit`` distinct predicates with at most
        ``facts_limit`` facts each are collected; subjects that are part of
        the testing resources are skipped.
        """
        entities = dict()
        relation_types_counter = Counter()
        fact_counter = 0
        testing_resources = PatternTester.from_config_file(
        ).get_testing_resources()
        # set for constant-time membership tests inside the loop
        allowed_relation_types = (None if self.relation_types is None else
                                  set(self.relation_types))

        self.logger.print_info('Collecting facts for training...')
        # `obj` rather than `object` to avoid shadowing the builtin
        for subject, predicate, obj in self.nt_reader.yield_entries():
            if fact_counter == self.facts_limit * self.relation_types_limit:
                break
            if (len(relation_types_counter) == self.relation_types_limit
                    and predicate not in relation_types_counter):
                continue
            if relation_types_counter[predicate] == self.facts_limit:
                continue
            if (allowed_relation_types is not None
                    and predicate not in allowed_relation_types):
                continue
            if subject in testing_resources:
                continue

            # maintain a dict for each entity with given relations as key
            # and their target values as list
            entities.setdefault(subject, {}).setdefault(predicate,
                                                        []).append(obj)
            relation_types_counter[predicate] += 1
            fact_counter += 1

        self.logger.print_done('Collecting facts for training completed')

        self.logger.print_info('Relation types:')
        ranking = enumerate(relation_types_counter.most_common(), 1)
        for rank, (relation_type, frequency) in ranking:
            # expandtabs() must be applied to the string, not to the result
            # of print(), so this works with print as statement or function
            print(('\t' + str(rank) + ':\t' + str(frequency) + ' x\t' +
                   relation_type).expandtabs(10))

        return entities

    @staticmethod
    def _chunks(data, size=10000):
        """
        Helper generator dividing the dict ``data`` into consecutive dicts
        of at most ``size`` items each. ``size`` must be positive.
        """
        it = iter(data)
        for _ in range(0, len(data), size):
            yield {k: data[k] for k in islice(it, size)}

    def _run_threaded(self, worker):
        """
        Splits ``self.dbpedia`` into roughly ``num_of_threads`` chunks, runs
        ``worker(chunk=...)`` on each chunk in its own thread and waits for
        all threads to finish.
        """
        # Float division, so ceil() sees the true ratio even where `/`
        # truncates integers; at least 1, so _chunks never gets a zero step
        # (fewer entities than threads, or no entities at all).
        chunk_size = max(
            1, int(ceil(len(self.dbpedia) / float(self.num_of_threads))))

        # gather all arguments for each thread
        threads = [
            Thread(target=worker, kwargs={'chunk': chunk})
            for chunk in WikipediaPatternExtractor._chunks(
                self.dbpedia, chunk_size)
        ]

        # start all threads
        for thread in threads:
            thread.start()

        # wait for all threads to finish
        for thread in threads:
            thread.join()

    def tag_sentences(self, chunk=None):
        """
        Replaces, for every entity and relation in ``chunk``, the list of
        target resources by a dict holding the converted resources, the
        Wikipedia sentences of the entity's article that were filtered for
        these resources, and an (initially empty) list of patterns.
        """
        if chunk is None:
            chunk = {}

        for entity, values in chunk.items():
            # for each relationship filter sentences that contain
            # target resources of entity's relationship
            for rel, resources in values.items():
                # a list, because it is stored and iterated more than once
                wikipedia_target_resources = [
                    uri_rewriting.convert_to_internal_wikipedia_link(res)
                    for res in resources
                ]
                # retrieve tokenized wikipedia sentences that include
                # DBpedia resources that we are looking for
                tagged_sentences = self.wikipedia_connector.get_filtered_wikipedia_article(
                    entity, wikipedia_target_resources)
                values[rel] = {
                    'resources': wikipedia_target_resources,
                    'sentences': tagged_sentences,
                    'patterns': []
                }

    def discover_patterns(self):
        """
        Preprocesses data (initializing main data structure)
        1. Collect relevant DBpedia facts into an in-memory dictionary
           (self.dbpedia) where all processing takes place
        2. Fetch the Wikipedia sentences relevant for each fact, in
           parallel, and store them in self.dbpedia as well
        """
        # parse dbpedia information
        self.dbpedia = self.parse_dbpedia_data()

        self.logger.print_info('Sentence Extraction...')
        self._run_threaded(self.tag_sentences)

    def extract_entity_patterns(self, chunk=None):
        """
        Extracts a pattern from every sentence in ``chunk`` that links to a
        target resource. Patterns are appended to the relation's
        'patterns' list; one record per (resource, sentence) pair is added
        to ``self.matches``.
        """
        # None instead of a shared mutable default argument
        if chunk is None:
            chunk = {}

        colors_to_tags = {
            'magenta': ['NN', 'NNS'],
            'green': ['NNP', 'NNPS'],
            'cyan': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
            'yellow': ['JJ', 'JJR', 'JJS']
        }
        # reverse color mapping: POS tag -> color
        color_mapping = {
            tag: color
            for color, tags in colors_to_tags.items() for tag in tags
        }

        for entity, relations in chunk.items():
            cleaned_subject_entity_name = uri_rewriting.strip_cleaned_name(
                entity)
            subject_entity = uri_rewriting.strip_name(entity)
            for rel_ontology, values in relations.items():
                target_resources = values['resources']
                sentences = values['sentences']
                relation_name = rel_ontology.split('/')[-1]
                data = [{
                    'entity': cleaned_subject_entity_name,
                    'relation': relation_name,
                    'resource': res,
                    'sentence': sent
                } for res in target_resources for sent in sentences
                        if sent.contains_any_link([res]) and res != entity]

                # POS tag sentences
                for entry in data:
                    sentence = entry['sentence']
                    if sentence.number_of_tokens() > 50:
                        continue  # probably too long for stanford tokenizer
                    resource = entry['resource']
                    nl_sentence = sentence.as_string()
                    relative_position = sentence.relative_pos
                    entry['nl sentence'] = nl_sentence
                    tokenized_sentences = [word_tokenize(nl_sentence)]
                    pos_tagged_sentences = pos_tag_sents(
                        tokenized_sentences).pop()

                    object_addresses = sentence.addresses_of_link(resource)
                    object_entity = uri_rewriting.strip_name(resource)
                    pattern = self.pattern_extractor.extract_pattern(
                        nl_sentence, object_addresses, relative_position,
                        self.type_learning, subject_entity, object_entity)

                    if pattern is not None:
                        values['patterns'].append(pattern)
                        entry['pattern'] = pattern

                    # color sentence parts according to POS tag
                    colored_sentence = ' '.join(
                        colored(word, color_mapping.get(pos, 'white'))
                        for word, pos in pos_tagged_sentences)
                    colored_sentence = re.sub(
                        r' (.\[\d+m),', ',',
                        colored_sentence)  # remove space before commas
                    entry['colored_sentence'] = colored_sentence

                self.matches.extend(data)

    def extract_patterns(self):
        """
        Runs ``extract_entity_patterns`` over ``self.dbpedia`` in parallel
        and removes duplicate records from ``self.matches`` afterwards.
        """
        self.logger.print_info('Pattern extraction...')
        self._run_threaded(self.extract_entity_patterns)

        # drop duplicates
        # NOTE(review): sorting a list of dicts relies on dicts being
        # orderable, which only holds on Python 2.
        self.matches.sort()
        self.matches = [x for x, _ in itertools.groupby(self.matches)]

        self.logger.print_done('Pattern extraction completed')

    # ---------------------------------------------------------------------------------------------
    #                               Statistics and Visualizations
    # ---------------------------------------------------------------------------------------------

    def print_occurrences(self):
        """
        Prints each occurrence of a given DBpedia fact with its
        corresponding matched sentence. Words of the sentence are printed
        in colour according to their POS tag (nouns, verbs, adjectives).
        """
        label_attrs = {'concealed', 'bold'}

        for entry in self.matches:
            if not entry.get('colored_sentence', None):
                continue

            rows = [
                ('[DBP Entitity] \t', colored(entry['entity'], 'white')),
                ('[DBP Ontology] \t', colored(entry['relation'], 'white')),
                ('[DBP Resource] \t',
                 colored(
                     uri_rewriting.strip_cleaned_name(entry['resource']),
                     'white')),
                ('[Wiki Occurence] \t', entry['colored_sentence']),
            ]
            for label, text in rows:
                # expandtabs() belongs to the string, not to print()'s result
                print((colored(label, 'red', attrs=label_attrs) +
                       text).expandtabs(20))
            print('')

        print(('[POS KEY]\t' + colored('NORMAL NOUN\t', 'magenta') +
               colored('PROPER NOUN\t', 'green') + colored('VERB\t', 'cyan') +
               colored('ADJ\t', 'yellow')).expandtabs(20))

    def count_matches(self):
        """Returns {relation: covered_sentences} for all learned patterns."""
        return {
            relation: pattern.covered_sentences
            for relation, pattern in self.relation_type_patterns.items()
        }

    def calculate_text_coverage(self):
        """
        Prints CLI stats about percentage of matched dbpedia facts in wiki
        raw text.
        """
        matched_count = self.count_matches()
        total_count = {}
        for entity, relation_types in self.dbpedia.items():
            for relation, values in relation_types.items():
                target_resources = values.get('resources', [])
                total_count.setdefault(relation, 0)
                total_count[relation] += len(target_resources)

        # there might be more occurrences of a fact in an article, which
        # would result in a coverage above 100% - hence matched is capped
        occurrence_count = {
            relation: {
                'total': total,
                'matched': min(total, matched_count.get(relation, 0))
            }
            for relation, total in total_count.items()
        }

        # print bar chart
        data = []
        for rel, vals in occurrence_count.items():
            label = ('% ' + str(vals['matched']) + '/' + str(vals['total']) +
                     ' ' + rel.split('/')[-1])
            # float division (integer division would give only 0 or 100)
            # and no division by zero for relations without resources
            if vals['total']:
                percentage = float(vals['matched']) / vals['total'] * 100
            else:
                percentage = 0.0
            data.append((label, percentage))

        graph = Pyasciigraph()
        for line in graph.graph('occurred facts in percentage', data):
            print(line)

    def merge_patterns(self):
        """
        Merges all patterns found for a relation into a single pattern per
        relation, stored in ``self.relation_type_patterns``.
        """
        self.logger.print_info('Pattern merging...')
        for entity, relations in tqdm(self.dbpedia.items()):
            for rel, values in relations.items():
                for pattern in values['patterns']:
                    if rel in self.relation_type_patterns:
                        self.relation_type_patterns[rel] = Pattern._merge(
                            self.relation_type_patterns[rel], pattern,
                            self.perform_tests)
                    else:
                        self.relation_type_patterns[rel] = pattern
        self.logger.print_done('Pattern merging completed.')

    def save_patterns(self):
        """
        Records the subjects used for training and lets the base class
        save the patterns.
        """
        self.training_resources = set(self.dbpedia.keys())
        super(WikipediaPatternExtractor, self).save_patterns()