Example #1
 def __init__(self, dataset='WN18'):
     self.log = Logger.get_log_cate('learning.txt', 'Learning')
     self.cfg = Config.load_learning_config(dataset)
     self.log.info(
         '****************************start new section*************************************'
     )
     self.log.info('initialize learning {}'.format(current_milli_time()))
     self.triple_set = TripleSet()
     self.triple_set.read_triples(self.cfg['path_training'])
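
All of these snippets time their work with a current_milli_time() helper defined elsewhere in the project. A minimal sketch of what it presumably looks like (an assumption; the helper is not shown in any snippet):

import time

def current_milli_time():
    # wall-clock time in integer milliseconds, matching the logging
    # and snapshot timers used throughout these examples
    return int(time.time() * 1000)
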
Example #2
 def __init__(self, datasets='WN18'):
     Rule.set_application_mode()
     self.config = Config.load_eval_config(datasets)
     self.training_set = TripleSet()
     self.validation_set = TripleSet()
     self.test_set = TripleSet()
     self.training_set.read_triples(self.config['path_training'])
     self.validation_set.read_triples(self.config['path_valid'])
     self.test_set.read_triples(self.config['path_test'])
     self.result_set = ResultSet(self.config['path_prediction'],
                                 self.config['path_prediction'], True, 10)
Example #3
class Evaluation(object):
    def __init__(self, datasets='WN18'):
        Rule.set_application_mode()
        self.config = Config.load_eval_config(datasets)
        self.training_set = TripleSet()
        self.validation_set = TripleSet()
        self.test_set = TripleSet()
        self.training_set.read_triples(self.config['path_training'])
        self.validation_set.read_triples(self.config['path_valid'])
        self.test_set.read_triples(self.config['path_test'])
        self.result_set = ResultSet(self.config['path_prediction'],
                                    self.config['path_prediction'], True, 10)

    def eval(self, is_test_set=True, path_extend=False):
        # print('result_set {}'.format(len(result_set.results)))
        if path_extend:
            self.result_set = ResultSet(self.config['path_prediction_ext'],
                                        self.config['path_prediction_ext'],
                                        True, 10)
        elif not is_test_set:
            self.result_set = ResultSet(self.config['path_eval_predict'],
                                        self.config['path_eval_predict'], True,
                                        10)
        hitsAtK = HitsAtK()
        hitsAtK.filter_sets.append(self.training_set)
        hitsAtK.filter_sets.append(self.validation_set)
        hitsAtK.filter_sets.append(self.test_set)
        score_set = self.test_set if is_test_set else self.validation_set
        if path_extend:
            score_set = self.validation_set
        self.__compute_scores(self.result_set, score_set, hitsAtK)
        print('hits@1    hits@3    hits@10')
        denom = hitsAtK.counter_head + hitsAtK.counter_tail
        h1 = (hitsAtK.hits_adn_head_filtered[0] +
              hitsAtK.hits_adn_tail_filtered[0]) / denom
        h3 = (hitsAtK.hits_adn_head_filtered[2] +
              hitsAtK.hits_adn_tail_filtered[2]) / denom
        h10 = (hitsAtK.hits_adn_head_filtered[9] +
               hitsAtK.hits_adn_tail_filtered[9]) / denom
        print('{:.4f}\t  {:.4f}    {:.4f}'.format(h1, h3, h10))

    def __compute_scores(self, result_set, gold, hitsAtK):
        for triple in gold.triples:
            cand1 = result_set.get_head_candidates(str(triple))
            hitsAtK.evaluate_head(cand1, triple)
            cand2 = result_set.get_tail_candidates(str(triple))
            hitsAtK.evaluate_tail(cand2, triple)
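
Putting Example #3 together: the class loads the train/valid/test splits plus a prediction file, then reports filtered hits@k for both head and tail completion. A hypothetical driver, assuming the 'WN18' eval config and the prediction file it points to exist:

# hypothetical usage; all paths come from Config.load_eval_config('WN18')
evaluation = Evaluation(datasets='WN18')
evaluation.eval(is_test_set=True)  # prints filtered hits@1, hits@3, hits@10
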
Example #4
  def prediction(self, use_valid_set=False, extend=False):
    training_set, test_set, valid_set = TripleSet(), TripleSet(), TripleSet()
    training_set.read_triples(self.cfg['path_training'])
    test_set.read_triples(self.cfg['path_test'])
    valid_set.read_triples(self.cfg['path_valid'])

    path_rules_used = self.cfg['path_rules']
    #for path_rules_used in self.cfg['path_rules']:
    start_time = current_milli_time()
    tmp_path = path_rules_used.split('/')
    path_output_used = 'predictions/{}/{}'.format(self.datasets, tmp_path[2].replace('rule', 'predict'))
    self.log.info('rules learning: {}'.format(path_rules_used))
    self.log.info('output learning: {}'.format(path_output_used))
    rules = RuleReader(path_rules_used).read()
    if extend:
      rules_exd = RuleReader(self.cfg['path_rules_ext']).read()
      rules.extend(rules_exd)
      path_output_used = 'predictions/{}/ext_{}'.format(self.datasets, tmp_path[2].replace('rule', 'predict'))
      test_set, valid_set = valid_set, test_set
    elif use_valid_set:
      path_output_used = 'predictions/{}/predict_valid_1000.txt'.format(self.datasets)
      test_set, valid_set = valid_set, test_set

    rules_size = len(rules)
    print('*** read rules {} from file {}'.format(rules_size, path_rules_used))
    rule_engine = RuleEngine(path_output_used, self.cfg['unseen_nagative_examples'])
    rule_engine.apply_rules_arx(rules, training_set, test_set, valid_set, self.cfg['top_k_output'])
    print('* evaluated {} rules to propose candidates for {} * 2 completion tasks'.format(rules_size, len(test_set.triples)))
    print('* finished in {} ms.'.format(current_milli_time() - start_time))
    self.log.info('finished in {} s.'.format((current_milli_time() - start_time) // 1000))
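
The output path in Example #4 is derived from the rules path purely by string convention: the third path segment has 'rule' swapped for 'predict'. A quick illustration (the file name rule_1000.txt is a made-up example):

tmp_path = 'learning_rules/WN18/rule_1000.txt'.split('/')
print('predictions/{}/{}'.format('WN18', tmp_path[2].replace('rule', 'predict')))
# -> predictions/WN18/predict_1000.txt
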
Example #5
class Learning(object):
    def __init__(self, dataset='WN18'):
        self.log = Logger.get_log_cate('learning.txt', 'Learning')
        self.cfg = Config.load_learning_config(dataset)
        self.log.info(
            '****************************start new section*************************************'
        )
        self.log.info('initialize learning {}'.format(current_milli_time()))
        self.triple_set = TripleSet()
        self.triple_set.read_triples(self.cfg['path_training'])

    def train(self):
        triple_set = self.triple_set
        index_start_time = current_milli_time()
        self.log.info('training with config {}'.format(self.cfg))
        path_sampler = PathSampler(triple_set)
        path_counter, batch_counter = 0, 0
        mine_cyclic_not_acyclic = False
        all_useful_rules = [set()]
        snapshot_index, rule_size_cyclic, rule_size_acyclic = 0, 0, 0
        last_cyclic_coverage, last_acyclic_coverage = 0.0, 0.0
        self.log.info('indexing dataset: {}'.format(self.cfg['path_training']))
        self.log.info('time elapsed: {} ms'.format(current_milli_time() -
                                                   index_start_time))
        snapshots_at = self.cfg['snapshots_at']
        dataset = self.cfg['dataset']
        start_time = current_milli_time()
        while True:
            batch_previously_found_rules, batch_new_useful_rules, batch_rules = 0, 0, 0
            rule_size = rule_size_cyclic if mine_cyclic_not_acyclic else rule_size_acyclic
            useful_rules = all_useful_rules[rule_size]
            elapsed_seconds = (current_milli_time() - start_time) // 1000
            # snapshot rules after the configured number of seconds while learning
            if elapsed_seconds > snapshots_at[snapshot_index]:
                total_rule = 0
                for _rules in all_useful_rules:
                    total_rule += len(_rules)
                snapshot_file = 'learning_rules/{}/rule_{}.txt'.format(
                    dataset, snapshots_at[snapshot_index])
                snapshot_index += 1
                self.log.info('snapshot_rules: {} in file {}'.format(
                    total_rule, snapshot_file))
                snapshot_rules = copy.deepcopy(all_useful_rules)
                thread_snapshot = threading.Thread(
                    target=self.process_snapshot_rule,
                    args=(
                        snapshot_rules,
                        snapshot_file,
                    ))
                thread_snapshot.start()
                print('created snapshot {} after {} seconds'.format(
                    snapshot_index, elapsed_seconds))
                if snapshot_index == len(snapshots_at):
                    print(
                        '*************************done learning*********************************'
                    )
                    thread_snapshot.join()
                    return 0
            # batch learning
            batch_start_time = current_milli_time()
            while True:
                if current_milli_time() - batch_start_time > self.cfg['batch_time']:
                    break
                path_counter += 1
                path = path_sampler.sample_path(rule_size + 2,
                                                mine_cyclic_not_acyclic)
                if path is not None and path.is_valid():
                    rule = Rule()
                    rule.init_from_path(path)
                    gen_rules = rule.get_generalizations(
                        mine_cyclic_not_acyclic)
                    for r in gen_rules:
                        if r.is_trivial():
                            continue
                        batch_rules += 1
                        if r not in useful_rules:
                            r.compute_scores(triple_set)
                        if r.confidence >= self.cfg[
                                'threshold_confidence'] and r.correctly_predicted >= self.cfg[
                                    'threshold_correct_predictions']:
                            batch_new_useful_rules += 1
                            useful_rules.add(r)
                        else:
                            batch_previously_found_rules += 1

            batch_counter += 1
            str_type = 'CYCLIC' if mine_cyclic_not_acyclic else 'ACYCLIC'
            print('=====> batch [{} {}] {} (sampled {} paths) *****'.format(
                str_type, rule_size + 1, batch_counter, path_counter))
            if batch_new_useful_rules + batch_previously_found_rules != 0:
                current_coverage = batch_previously_found_rules / (
                    batch_new_useful_rules + batch_previously_found_rules)
            else:
                current_coverage = 0
            print(
                '=====> fraction of previously seen rules within useful rules in this batch: {} num of new rule = {} num of previously rule = {} num of all batch rules = {}'
                .format(current_coverage, batch_new_useful_rules,
                        batch_previously_found_rules, batch_rules))
            print('=====> stored rules: {}'.format(len(useful_rules)))
            if mine_cyclic_not_acyclic:
                last_cyclic_coverage = current_coverage
            else:
                last_acyclic_coverage = current_coverage

            if current_coverage > self.cfg[
                    'saturation'] and batch_previously_found_rules > 1:
                rule_size += 1
                if mine_cyclic_not_acyclic:
                    rule_size_cyclic = rule_size
                if not mine_cyclic_not_acyclic:
                    rule_size_acyclic = rule_size
                print(
                    '========================================================='
                )
                print('=====> increasing rule size of {} rule to {}'.format(
                    str_type, rule_size + 1))
                self.log.info(
                    'increasing rule size of {} rules to {}  after {} s'.
                    format(str_type, rule_size + 1,
                           (current_milli_time() - start_time) // 1000))
                all_useful_rules.append(set())

            mine_cyclic_not_acyclic = not mine_cyclic_not_acyclic
            if mine_cyclic_not_acyclic and rule_size_cyclic + 1 > self.cfg[
                    'max_length_cylic']:
                mine_cyclic_not_acyclic = False

    def process_snapshot_rule(self, rules, file):
        if path.exists(file):
            remove(file)
        with open(file, 'w') as output_stream:
            for set_rule in rules:
                for rule in set_rule:
                    print(rule, file=output_stream)

    def process_snapshot_rule_exis_file(self, rules, file):
        with open(file, 'a+') as output_stream:
            for set_rule in rules:
                for rule in set_rule:
                    print(rule, file=output_stream)

    def train_with_batch(self, batch_triple, batch_time=100):
        is_connected, new_triple = self.triple_set.add_batch_triple(
            batch_triple)
        if is_connected:
            triple_set = self.triple_set
            path_sampler = PathSampler(triple_set)
            index_start_time = current_milli_time()
            self.log.info(
                'train_with_batch triple_set: {}, new_triple: {}'.format(
                    len(triple_set.triples), len(new_triple.triples)))
            path_counter, batch_counter = 0, 0
            mine_cyclic_not_acyclic = False
            all_useful_rules = [set()]
            snapshot_index, rule_size_cyclic, rule_size_acyclic = 0, 0, 0
            last_cyclic_coverage, last_acyclic_coverage = 0.0, 0.0
            self.log.info('indexing dataset: {}'.format(
                self.cfg['path_training']))
            self.log.info('time elapsed: {} ms'.format(current_milli_time() -
                                                       index_start_time))
            dataset = self.cfg['dataset']
            start_time = current_milli_time()
            while True:
                batch_previously_found_rules, batch_new_useful_rules, batch_rules = 0, 0, 0
                rule_size = rule_size_cyclic if mine_cyclic_not_acyclic else rule_size_acyclic
                useful_rules = all_useful_rules[rule_size]
                elapsed_seconds = (current_milli_time() - start_time) // 1000
                if elapsed_seconds > batch_time:
                    total_rule = 0
                    for _rules in all_useful_rules:
                        total_rule += len(_rules)
                    snapshot_file = 'learning_rules/{}/rule_extend_{}.txt'.format(
                        dataset, 800)
                    self.log.info(
                        '***************************************************************'
                    )
                    self.log.info('**snapshot_rules: {} in file {}'.format(
                        total_rule, snapshot_file))
                    self.log.info(
                        '***************************************************************'
                    )
                    snapshot_rules = copy.deepcopy(all_useful_rules)
                    thread_snapshot = threading.Thread(
                        target=self.process_snapshot_rule,
                        args=(
                            snapshot_rules,
                            snapshot_file,
                        ))
                    thread_snapshot.start()
                    print('created snapshot {} after {} seconds'.format(
                        total_rule, elapsed_seconds))
                    print(
                        '*************************done learning*********************************'
                    )
                    thread_snapshot.join()
                    return 0
                batch_start_time = current_milli_time()
                while True:
                    if current_milli_time() - batch_start_time > self.cfg['batch_time']:
                        break
                    path_counter += 1
                    path = path_sampler.sample_batch_path(
                        rule_size + 2, new_triple, mine_cyclic_not_acyclic)
                    if path is not None and path.is_valid():
                        rule = Rule()
                        rule.init_from_path(path)
                        gen_rules = rule.get_generalizations(
                            mine_cyclic_not_acyclic)
                        for r in gen_rules:
                            if r.is_trivial():
                                continue
                            batch_rules += 1
                            if r not in useful_rules:
                                r.compute_scores(triple_set)
                            if r.confidence >= self.cfg[
                                    'threshold_confidence'] and r.correctly_predicted >= self.cfg[
                                        'threshold_correct_predictions']:
                                batch_new_useful_rules += 1
                                useful_rules.add(r)
                            else:
                                batch_previously_found_rules += 1
                batch_counter += 1
                str_type = 'CYCLIC' if mine_cyclic_not_acyclic else 'ACYCLIC'
                print(
                    '=====> batch [{} {}] {} (sampled {} paths) *****'.format(
                        str_type, rule_size + 1, batch_counter, path_counter))
                if batch_new_useful_rules + batch_previously_found_rules != 0:
                    current_coverage = batch_previously_found_rules / (
                        batch_new_useful_rules + batch_previously_found_rules)
                else:
                    current_coverage = 0
                print(
                    '=====> fraction of previously seen rules within useful rules in this batch: {} num of new rule = {} num of previously rule = {} num of all batch rules = {}'
                    .format(current_coverage, batch_new_useful_rules,
                            batch_previously_found_rules, batch_rules))
                print('=====> stored rules: {}'.format(len(useful_rules)))
                if mine_cyclic_not_acyclic:
                    last_cyclic_coverage = current_coverage
                else:
                    last_acyclic_coverage = current_coverage

                if current_coverage > self.cfg[
                        'saturation'] and batch_previously_found_rules > 1:
                    rule_size += 1
                    if mine_cyclic_not_acyclic:
                        rule_size_cyclic = rule_size
                    if not mine_cyclic_not_acyclic:
                        rule_size_acyclic = rule_size
                    print(
                        '========================================================='
                    )
                    print(
                        '=====> increasing rule size of {} rule to {}'.format(
                            str_type, rule_size + 1))
                    self.log.info(
                        'increasing rule size of {} rules to {}  after {} s'.
                        format(str_type, rule_size + 1,
                               (current_milli_time() - start_time) // 1000))
                    all_useful_rules.append(set())

                mine_cyclic_not_acyclic = not mine_cyclic_not_acyclic
                if mine_cyclic_not_acyclic and rule_size_cyclic + 1 > self.cfg[
                        'max_length_cylic']:
                    mine_cyclic_not_acyclic = False

    def train_with_edge(self, triple):
        is_connected, new_triple = self.triple_set.add_edge_triple(triple)
        if is_connected:
            triple_set = self.triple_set
            path_sampler = PathSampler(triple_set)
            index_start_time = current_milli_time()
            self.log.info(
                'train_with_batch triple_set: {}, new_triple: {}'.format(
                    len(triple_set.triples), new_triple))
            path_counter, batch_counter = 0, 0
            mine_cyclic_not_acyclic = False
            all_useful_rules = [set()]
            snapshot_index, rule_size_cyclic, rule_size_acyclic = 0, 0, 0
            last_cyclic_coverage, last_acyclic_coverage = 0.0, 0.0
            self.log.info('indexing dataset: {}'.format(
                self.cfg['path_training']))
            self.log.info('time elapsed: {} ms'.format(current_milli_time() -
                                                       index_start_time))
            dataset = self.cfg['dataset']
            start_time = current_milli_time()
            while True:
                batch_previously_found_rules, batch_new_useful_rules, batch_rules = 0, 0, 0
                rule_size = rule_size_cyclic if mine_cyclic_not_acyclic else rule_size_acyclic
                useful_rules = all_useful_rules[rule_size]
                elapsed_seconds = (current_milli_time() - start_time) // 1000
                if elapsed_seconds > 1:
                    total_rule = 0
                    for _rules in all_useful_rules:
                        total_rule += len(_rules)
                    snapshot_file = 'learning_rules/{}/rule_extend_{}.txt'.format(
                        dataset, 20)
                    self.log.info(
                        '***************************************************************'
                    )
                    self.log.info('**snapshot_rules: {} in file {}'.format(
                        total_rule, snapshot_file))
                    self.log.info(
                        '***************************************************************'
                    )
                    snapshot_rules = copy.deepcopy(all_useful_rules)
                    thread_snapshot = threading.Thread(
                        target=self.process_snapshot_rule_exis_file,
                        args=(
                            snapshot_rules,
                            snapshot_file,
                        ))
                    thread_snapshot.start()
                    print('created snapshot {} after {} seconds'.format(
                        total_rule, elapsed_seconds))
                    print(
                        '*************************done learning*********************************'
                    )
                    thread_snapshot.join()
                    return 0
                batch_start_time = current_milli_time()
                while True:
                    if current_milli_time() - batch_start_time > self.cfg['batch_time']:
                        break
                    path_counter += 1
                    path = path_sampler.sample_triple(rule_size + 2,
                                                      new_triple,
                                                      mine_cyclic_not_acyclic)
                    if path is not None and path.is_valid():
                        rule = Rule()
                        rule.init_from_path(path)
                        gen_rules = rule.get_generalizations(
                            mine_cyclic_not_acyclic)
                        for r in gen_rules:
                            if r.is_trivial():
                                continue
                            batch_rules += 1
                            if r not in useful_rules:
                                r.compute_scores(triple_set)
                            # 0.45 hardcoded here in place of self.cfg['threshold_confidence']
                            if r.confidence >= 0.45 and r.correctly_predicted >= self.cfg[
                                    'threshold_correct_predictions']:
                                batch_new_useful_rules += 1
                                useful_rules.add(r)
                            else:
                                batch_previously_found_rules += 1
                batch_counter += 1
                str_type = 'CYCLIC' if mine_cyclic_not_acyclic else 'ACYCLIC'
                print(
                    '=====> batch [{} {}] {} (sampled {} paths) *****'.format(
                        str_type, rule_size + 1, batch_counter, path_counter))
                if batch_new_useful_rules + batch_previously_found_rules != 0:
                    current_coverage = batch_previously_found_rules / (
                        batch_new_useful_rules + batch_previously_found_rules)
                else:
                    current_coverage = 0
                print(
                    '=====> fraction of previously seen rules within useful rules in this batch: {} num of new rule = {} num of previously rule = {} num of all batch rules = {}'
                    .format(current_coverage, batch_new_useful_rules,
                            batch_previously_found_rules, batch_rules))
                print('=====> stored rules: {}'.format(len(useful_rules)))
                if mine_cyclic_not_acyclic:
                    last_cyclic_coverage = current_coverage
                else:
                    last_acyclic_coverage = current_coverage

                if current_coverage > self.cfg[
                        'saturation'] and batch_previously_found_rules > 1:
                    rule_size += 1
                    if mine_cyclic_not_acyclic:
                        rule_size_cyclic = rule_size
                    if not mine_cyclic_not_acyclic:
                        rule_size_acyclic = rule_size
                    print(
                        '========================================================='
                    )
                    print(
                        '=====> increasing rule size of {} rule to {}'.format(
                            str_type, rule_size + 1))
                    self.log.info(
                        'increasing rule size of {} rules to {}  after {} s'.
                        format(str_type, rule_size + 1,
                               (current_milli_time() - start_time) // 1000))
                    all_useful_rules.append(set())

                mine_cyclic_not_acyclic = not mine_cyclic_not_acyclic
                if mine_cyclic_not_acyclic and rule_size_cyclic + 1 > self.cfg[
                        'max_length_cylic']:
                    mine_cyclic_not_acyclic = False
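
train(), train_with_batch() and train_with_edge() all snapshot rules the same way: deep-copy the live rule sets, then let a worker thread serialize the frozen copy so mining is never blocked on disk I/O. A self-contained sketch of that pattern (function and variable names are illustrative):

import copy
import threading

def snapshot_rules_async(all_useful_rules, out_path):
    # freeze the current state first; the miner keeps mutating the
    # live sets while the worker serializes the copy
    frozen = copy.deepcopy(all_useful_rules)

    def write():
        with open(out_path, 'w') as output_stream:
            for rule_set in frozen:
                for rule in rule_set:
                    print(rule, file=output_stream)

    worker = threading.Thread(target=write)
    worker.start()
    return worker  # caller should join() before exiting
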
Example #6
    def apply_rules_arx(self, rules, training_set, test_set, validation_set,
                        k):
        print('* applying rules')
        relation_to_rules = self.create_ordered_rule_index(rules)
        print(
            '* set up index structure covering rules for {} different relations'
            .format(len(relation_to_rules)))
        filter_set = TripleSet()
        filter_set.add_triple_set(training_set)
        filter_set.add_triple_set(test_set)
        filter_set.add_triple_set(validation_set)
        print('* constructed filter set with {} triples'.format(
            len(filter_set.triples)))
        if len(filter_set.triples) == 0:
            print('WARNING: using empty filter set!')
        # prepare the data structures used as a cache for questions that are recurring
        # start iterating over the test cases
        counter, current_time, start_time = 0, 0, current_milli_time()
        writers = []

        ScoreTree.set_lower_bound(k)
        ScoreTree.set_upper_bound(ScoreTree.lower_bound)
        ScoreTree.set_epsilon(0.0001)

        for triple in test_set.triples:
            if counter % 100 == 0:
                print('* (# {} ) trying to guess the tail/head of {}'.format(
                    counter, triple))
                current_time = current_milli_time()
                print('Elapsed (s) = {}'.format(
                    (current_time - start_time) // 1000))
                start_time = current_milli_time()
            relation = triple.relation
            head = triple.head
            tail = triple.tail
            tail_question, head_question = (relation, head), (relation, tail)
            k_tail_tree = ScoreTree()
            k_head_tree = ScoreTree()

            if relation in relation_to_rules:
                relevant_rules = relation_to_rules.get(relation)
                for rule in relevant_rules:
                    if not k_tail_tree.fine():
                        tail_candidates = rule.compute_tail_results(
                            head, training_set)
                        f_tail_candidates = self.__get_filtered_entities(
                            filter_set, test_set, triple, tail_candidates,
                            True)
                        k_tail_tree.add_values(rule.get_applied_confidence(),
                                               f_tail_candidates)
                    else:
                        break
                for rule in relevant_rules:
                    if not k_head_tree.fine():
                        head_candidates = rule.compute_head_results(
                            tail, training_set)
                        f_head_candidates = self.__get_filtered_entities(
                            filter_set, test_set, triple, head_candidates,
                            False)
                        k_head_tree.add_values(rule.get_applied_confidence(),
                                               f_head_candidates)
                    else:
                        break

            k_tail_candidates, k_head_candidates = {}, {}
            k_tail_tree.get_as_linked_map(k_tail_candidates)
            k_head_tree.get_as_linked_map(k_head_candidates)
            top_k_tail_candidates = self.__sort_by_value(k_tail_candidates, k)
            top_k_head_candidates = self.__sort_by_value(k_head_candidates, k)
            counter += 1
            writer = threading.Thread(
                target=self.__process_write_top_k_candidates,
                args=(
                    triple,
                    test_set,
                    top_k_tail_candidates,
                    top_k_head_candidates,
                ))
            writer.start()
            writers.append(writer)
        # join every writer thread so all prediction output is flushed
        for writer in writers:
            writer.join()
        print('* done with rule application')
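
The filtering step behind __get_filtered_entities follows the standard filtered-ranking protocol: candidates that already form a known triple in train, validation, or test are dropped before ranking, except for the gold answer itself. A minimal sketch of the idea, assuming known triples are stored as a set of (head, relation, tail) tuples:

def filter_tail_candidates(candidates, known_triples, head, relation, true_tail):
    # keep the gold answer; drop every other candidate that already
    # forms a known triple (illustrative helper, not the project API)
    return [
        tail for tail in candidates
        if tail == true_tail or (head, relation, tail) not in known_triples
    ]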