def extract_patterns(self, config):
    """Build the context vectors (BEF/BET/AFT) for this relationship instance.

    The between-entities (BET) context is represented by a ReVerb pattern
    vector when a pattern can be extracted, otherwise by a plain
    bag-of-words vector.  The before/after contexts always use word vectors.
    """
    # Extract a ReVerb pattern from the words between the two entities and
    # record whether it is in the passive voice.
    patterns_bet_tags = Reverb.extract_reverb_patterns_ptb(self.bet_words)

    if not patterns_bet_tags:
        # No pattern found: fall back to a vector over the raw BET words.
        self.bet_vector = self.construct_words_vectors(self.bet_words, config)
    else:
        self.passive_voice = self.config.reverb.detect_passive_voice(
            patterns_bet_tags)
        # HACK: "'s" is always POS-tagged as VBZ, i.e. (u"'s", 'VBZ'), which
        # makes ReVerb report a spurious pattern.  When the extracted pattern
        # starts with "'s", discard it and use the raw words instead.
        if patterns_bet_tags[0][0] == "'s":
            self.bet_vector = self.construct_words_vectors(
                self.bet_words, config)
        else:
            self.bet_vector = self.construct_pattern_vector(
                patterns_bet_tags, config)

    # Context windows: words before the first entity / after the second one.
    if self.bef_words:
        self.bef_vector = self.construct_words_vectors(self.bef_words, config)

    if self.aft_words:
        self.aft_vector = self.construct_words_vectors(self.aft_words, config)
# Esempio n. 2 ("Example no. 2") — web-scraping separator; the "0" below is a
# vote count from the source page, not code.
# 0
        def extract_patterns(self, config):

            # extract ReVerb pattern and detect the presence of the passive voice
            patterns_bet_tags = Reverb.extract_reverb_patterns_ptb(self.bet_words)
            if len(patterns_bet_tags) > 0:
                self.passive_voice = self.config.reverb.detect_passive_voice(patterns_bet_tags)
                # forced hack since _'s_ is always tagged as VBZ, (u"'s", 'VBZ') and causes ReVerb to identify
                # a pattern which is wrong, if this happens, ignore that a pattern was extracted
                if patterns_bet_tags[0][0] == "'s":
                    self.bet_vector = self.construct_words_vectors(self.bet_words, config)
                else:
                    self.bet_vector = self.construct_pattern_vector(patterns_bet_tags, config)
            else:
                self.bet_vector = self.construct_words_vectors(self.bet_words, config)

            # extract two words before the first entity, and two words after the second entity
            if len(self.bef_words) > 0:
                self.bef_vector = self.construct_words_vectors(self.bef_words, config)

            if len(self.aft_words) > 0:
                self.aft_vector = self.construct_words_vectors(self.aft_words, config)
# Esempio n. 3 ("Example no. 3") — web-scraping separator; the "0" below is a
# vote count from the source page, not code.
# 0
    def __init__(self, config_file, seeds_file, negative_seeds, sentences_file,
                 similarity, confidance):
        """Initialise the bootstrapping configuration.

        Reads ``key = value`` parameters from `config_file` (lines starting
        with '#' and blank lines are ignored), loads positive and negative
        seed instances, and loads — or builds and caches to ``vsm.pkl`` —
        the tf-idf vector space model over `sentences_file`.

        :param config_file: path to the parameters file
        :param seeds_file: path to the positive seed instances
        :param negative_seeds: path to the negative seed instances
        :param sentences_file: corpus used to build the tf-idf model
        :param similarity: similarity threshold between instances/patterns
        :param confidance: minimum confidence to accept a new instance
        """
        self.seed_tuples = set()
        self.negative_seed_tuples = set()
        self.e1_type = None
        self.e2_type = None
        self.stopwords = stopwords.words('english')
        self.threshold_similarity = similarity
        self.instance_confidance = confidance
        self.reverb = Reverb()

        for line in fileinput.input(config_file):
            # Skip comments and blank lines (a blank line is just "\n").
            if line.startswith("#") or len(line) == 1:
                continue

            if line.startswith("wUpdt"):
                self.wUpdt = float(line.split("=")[1])
            elif line.startswith("wUnk"):
                self.wUnk = float(line.split("=")[1])
            elif line.startswith("wNeg"):
                self.wNeg = float(line.split("=")[1])
            elif line.startswith("number_iterations"):
                self.number_iterations = int(line.split("=")[1])
            elif line.startswith("use_RlogF"):
                # BUG FIX: bool(<non-empty string>) is always True, so the
                # original parsed "use_RlogF = False" as True.  Compare the
                # text of the value instead.
                self.use_RlogF = \
                    line.split("=")[1].strip().lower() in ("true", "1", "yes")
            elif line.startswith("min_pattern_support"):
                self.min_pattern_support = int(line.split("=")[1])
            elif line.startswith("max_tokens_away"):
                self.max_tokens_away = int(line.split("=")[1])
            elif line.startswith("min_tokens_away"):
                self.min_tokens_away = int(line.split("=")[1])
            elif line.startswith("context_window_size"):
                self.context_window_size = int(line.split("=")[1])
            elif line.startswith("use_reverb"):
                self.use_reverb = line.split("=")[1].strip()
            elif line.startswith("alpha"):
                self.alpha = float(line.split("=")[1])
            elif line.startswith("beta"):
                self.beta = float(line.split("=")[1])
            elif line.startswith("gamma"):
                self.gamma = float(line.split("=")[1])

        # The three vector weights must form a convex combination.
        assert self.alpha + self.beta + self.gamma == 1

        self.read_seeds(seeds_file)
        self.read_negative_seeds(negative_seeds)
        fileinput.close()

        print("\nConfiguration parameters")
        print("========================")
        print("Relationship Representation")
        print("e1 type              :", self.e1_type)
        print("e2 type              :", self.e2_type)
        print("context window       :", self.context_window_size)
        print("max tokens away      :", self.max_tokens_away)
        print("min tokens away      :", self.min_tokens_away)
        print("use ReVerb           :", self.use_reverb)

        print("\nVectors")
        print("alpha                :", self.alpha)
        print("beta                 :", self.beta)
        print("gamma                :", self.gamma)

        print("\nSeeds:")
        print("positive seeds       :", len(self.seed_tuples))
        print("negative seeds       :", len(self.negative_seed_tuples))
        print("negative seeds wNeg  :", self.wNeg)
        print("unknown seeds wUnk   :", self.wUnk)

        print("\nParameters and Thresholds")
        print("threshold_similarity :", self.threshold_similarity)
        print("instance confidence  :", self.instance_confidance)
        print("min_pattern_support  :", self.min_pattern_support)
        print("iterations           :", self.number_iterations)
        print("iteration wUpdt      :", self.wUpdt)
        print("\n")

        # Load a cached tf-idf model if one exists; otherwise build it from
        # the sentences file and cache it for the next run.
        try:
            # BUG FIX: pickles must be read in binary mode.  The original
            # opened "vsm.pkl" with mode "r" (and discarded the result of a
            # pointless os.path.isfile call), which breaks pickle.load under
            # Python 3.  `with` also guarantees the handle is closed.
            with open("vsm.pkl", "rb") as f:
                print("\nLoading tf-idf model from disk...")
                self.vsm = pickle.load(f)

        except IOError:
            print("\nGenerating tf-idf model from sentences...")
            self.vsm = VectorSpaceModel.VectorSpaceModel(
                sentences_file, self.stopwords)
            print("\nWriting generated model to disk...")
            with open("vsm.pkl", "wb") as f:
                pickle.dump(self.vsm, f)