    def form_knowledge_base(self, id_vecs=True, unitary=False):
        """Build the HRR knowledge base from the corpus.

        If id_vecs is True, each item gets an independent random ID-vector;
        otherwise the semantic pointers double as ID-vectors, which requires
        ordering items by the dependencies of their definitions. If unitary
        is True, the relation type vectors are made unitary.
        """
        # Check existence of corpus
        if self.corpus_dict is None:
            raise Exception("Attempted to form the knowledge "
                            "base without a corpus.")

        print "Number of items in knowledge base:", len(self.corpus_dict)

        if not id_vecs:
            print "Processing Corpus"
            self.processCorpus()

        print "Generating relation type vectors"
        print "Using relation types: ", self.relation_symbols

        self.relation_type_vectors = {symbol: HRR(self.dimension)
                                      for symbol in self.relation_symbols}
        if unitary:
            # Unitary vectors are exactly invertible under circular
            # convolution, which reduces noise when unbinding relations.
            for k, h in self.relation_type_vectors.iteritems():
                h.make_unitary()

        if id_vecs:
            key_order = self.corpus_dict.keys()
        else:
            # Order words by the dependencies of their definitions
            # Only have to do this if we're not using ID-vectors
            key_order = []
            resolved = set(self.relation_symbols)

            dependencies = {}
            for key in self.corpus_dict.keys():
                dependencies[key] = set(
                    [tag[1] for tag in self.corpus_dict[key]
                     if tag[0] in self.relation_symbols])

            while len(key_order) < (len(self.corpus_dict)
                                    + len(self.relation_symbols)):
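                # Each pass peels off the keys whose dependencies are all
                # resolved (a simple topological sort). key_order never
                # receives the relation symbols, so the while condition stays
                # true and the loop always exits through the break below.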

                resolvable = set()

                for key in dependencies:
                    if dependencies[key].issubset(resolved):
                        resolvable.add(key)

                # add the resolvable keys to the order list and resolved set
                key_order.extend(resolvable)
                resolved = resolved.union(resolvable)

                # remove resolved tags from the dependency dictionary
                for r in resolvable:
                    del dependencies[r]

                # if nothing new was resolved, we're either finished or
                # stuck on a dependency cycle
                if len(resolvable) == 0:
                    break

            del resolved
            del resolvable
            if len(key_order) < len(self.corpus_dict):
                raise Exception("Dependency resolution failed: corpus "
                                "contains circular or missing definitions.")

        self.semantic_pointers = collections.OrderedDict()

        print "Generating ID-vectors"

        if id_vecs:
            self.id_vectors = collections.OrderedDict()

            for key in key_order:
                self.id_vectors[key] = HRR(self.dimension)
        else:
            # Without separate ID-vectors the semantic pointers themselves
            # act as identifiers; key_order guarantees that a definition
            # only references pointers that have already been built.
            self.id_vectors = self.semantic_pointers

        print "Generating HRR vectors"
        for key in key_order:
            relations = [tag for tag in self.corpus_dict[key]
                         if tag[0] in self.relation_symbols]

            if len(relations) == 0:
                self.semantic_pointers[key] = HRR(self.dimension)
                continue

            semantic_pointer = HRR(data=np.zeros(self.dimension))

            # Optionally superimpose random noise vectors on the pointer.
            for n in range(self.sp_noise):
                semantic_pointer += HRR(self.dimension)

            # Bind each relation type to the ID-vector of the related item
            # and superimpose the resulting pairs.
            for relation in relations:
                id_vector = self.id_vectors[relation[1]]
                relation_type_vector = self.relation_type_vectors[relation[0]]
                semantic_pointer += id_vector * relation_type_vector

            if self.normalize:
                semantic_pointer.normalize()

            self.semantic_pointers[key] = semantic_pointer

        # convert all vectors from HRRs to numpy ndarrays
        for k in key_order:
            h = self.semantic_pointers[k]
            self.semantic_pointers[k] = h.v

        if id_vecs:
            for k in key_order:
                h = self.id_vectors[k]
                self.id_vectors[k] = h.v

        for k in self.relation_type_vectors:
            h = self.relation_type_vectors[k]
            self.relation_type_vectors[k] = h.v
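
# ---------------------------------------------------------------------------
# The methods above and below assume an HRR (Holographic Reduced
# Representation) class with roughly the interface sketched here. This is a
# minimal, hypothetical illustration rather than the codebase's actual class
# (assumes numpy is imported as np): * is binding via circular convolution,
# + is superposition, and .v exposes the underlying numpy array.
# ---------------------------------------------------------------------------
class HRR(object):
    def __init__(self, dimension=None, data=None):
        if data is not None:
            # Wrap an existing vector.
            self.v = np.array(data, dtype=float)
        else:
            # Random vector with elements ~ N(0, 1/d), so the expected
            # length is 1.
            self.v = np.random.normal(0.0, 1.0 / np.sqrt(dimension),
                                      dimension)

    def __mul__(self, other):
        # Binding: circular convolution, computed in the Fourier domain.
        return HRR(data=np.real(np.fft.ifft(np.fft.fft(self.v) *
                                            np.fft.fft(other.v))))

    def __add__(self, other):
        # Superposition: elementwise addition.
        return HRR(data=self.v + other.v)

    def normalize(self):
        # Scale to unit length, in place.
        norm = np.linalg.norm(self.v)
        if norm > 0:
            self.v /= norm

    def make_unitary(self):
        # Set every Fourier coefficient to magnitude 1, making binding with
        # this vector norm-preserving and exactly invertible.
        fft = np.fft.fft(self.v)
        self.v = np.real(np.fft.ifft(fft / np.abs(fft)))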

    def run(self):
        """Run the sentence test and record per-depth accuracy."""
        self.dimension = len(self.id_vectors.values()[0])

        self.role_hrrs = self.create_role_hrrs()
        self.pos_map = self.create_pos_map()
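
        # create_role_hrrs presumably yields one HRR per role symbol; the
        # compound roles used for deep sentences are bound from these below.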

        score = defaultdict(float)

        for i in range(self.num_trials):
            title = "New Sentence Test"
            if self.deep:
                title += " - Deep"

            tools.print_header(self.output_file, title)

            sentence = self.generate_sentence()

            if self.deep:
                # Replace one randomly chosen role with an embedded sentence;
                # its fillers are keyed by concatenating the outer role with
                # their own role symbols.
                embed = self.rng.sample(sentence.keys(), 1)[0]
                embedded_sentence = self.generate_sentence()

                del sentence[embed]
                for role in embedded_sentence.keys():
                    sentence[embed + role] = embedded_sentence[role]

            tag_vectors = {}
            sentence_hrr = HRR(data=np.zeros(self.dimension))

            # Pick role-fillers and create HRR representing the sentence
            # Also store the hrr to use as the query to extract each synset
            # included in the sentence.
            for role in sentence:
                # A compound role is a sequence of role symbols; bind the
                # corresponding role HRRs together to form the query tag.
                tag_hrr = [self.role_hrrs[x] for x in role]
                tag_hrr = reduce(lambda x, y: x * y, tag_hrr)

                synset = sentence[role]

                sentence_hrr += tag_hrr * HRR(data=self.id_vectors[synset])

                tag_vectors[role] = tag_hrr.v

            sentence_hrr.normalize()

            sentence_vector = sentence_hrr.v

            print >> self.output_file, "Roles in sentence:"
            print >> self.output_file, sentence

            # ask about parts of the sentence
            sentence_score = defaultdict(float)
            sentence_length = defaultdict(float)
            for role in sentence.keys():

                answer = sentence[role]

                self.current_start_key = None
                self.current_target_keys = [answer]
                self.current_num_relations = len(sentence)

                print >> self.output_file, "\nTesting ", role

                result, correct, valid, exact = self.test_link(
                    tag_vectors[role],
                    sentence_vector,
                    None,
                    answer,
                    output_file=self.output_file,
                    return_vec=False,
                    num_relations=len(sentence),
                    answers=[answer])

                # depth = the number of role symbols in the compound role
                depth = len(role)
                if correct:
                    sentence_score[depth] += 1
                    print >> self.output_file, "Correct."
                else:
                    print >> self.output_file, "Incorrect."

                sentence_length[depth] += 1

                if self.short:
                    break

            # Aggregate per-depth accuracy for this sentence.
            for d in sentence_length:
                sentence_percent = sentence_score[d] / sentence_length[d]

                print >> self.output_file, \
                    "Percent correct for current sentence at depth %d: %f" \
                    % (d, sentence_percent)

                score[d] = score[d] + sentence_percent

        for d in score:
            print "Sentence test score at depth %d: %f out of %d" \
                % (d, score[d], self.num_trials)

            percent = score[d] / self.num_trials

            title = "Sentence Test Summary - Depth = %d" % d
            tools.print_header(self.output_file, title)
            print >> self.output_file, "Correct: ", score[d]
            print >> self.output_file, "Total: ", self.num_trials
            print >> self.output_file, "Percent: ", percent
            tools.print_footer(self.output_file, title)

            self.add_data("sentence_score_%d" % d, percent)
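
# ---------------------------------------------------------------------------
# For reference, this is roughly how a filler can be decoded from the
# sentence vector built above: unbind with the approximate inverse of the
# role vector, then clean up against the ID-vectors. A hypothetical sketch
# (decode_filler is not part of this codebase; assumes numpy as np), not the
# actual test_link implementation.
# ---------------------------------------------------------------------------
def decode_filler(sentence_vector, role_vector, id_vectors):
    # The approximate inverse of an HRR is its involution:
    # (v[0], v[d-1], v[d-2], ..., v[1]).
    role_inv = np.concatenate(([role_vector[0]], role_vector[1:][::-1]))

    # Unbind: circularly convolve the sentence vector with the inverse role.
    noisy = np.real(np.fft.ifft(np.fft.fft(sentence_vector) *
                                np.fft.fft(role_inv)))

    # Clean-up memory: return the key of the most similar ID-vector.
    return max(id_vectors, key=lambda k: np.dot(id_vectors[k], noisy))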