コード例 #1
0
class DomainSimilarity:
    """Decide whether a sentence is similar to a domain-specific corpus.

    Every line of every file in ``input_dir`` is split into words; each
    non-empty word list is stored and used to train a TFIDF model and an
    NGramPerplexity model.  ``accepts_sentence`` accepts a candidate when at
    least one of three metrics passes its threshold: n-gram perplexity,
    TF-IDF cosine similarity, or normalized Levenshtein distance.
    """

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram,
                 threshold_edit_distance):
        """Store the thresholds, reset the statistics and train the models.

        Args:
            input_dir: Directory whose files hold the domain corpus.
            threshold_tfidf: Stored as ``1 - threshold_tfidf`` and used as
                the minimum cosine similarity a candidate must reach.
            threshold_perplexity_ngram: Maximum accepted perplexity.
            threshold_edit_distance: Maximum accepted normalized edit
                distance.

        Raises:
            NotADirectoryError: If ``input_dir`` is not an existing
                directory.
        """
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []

        # Running statistics reported by print_progress().
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

        if not os.path.isdir(input_dir):
            raise NotADirectoryError("The provided dir " + str(input_dir) +
                                     " does not exist")
        self.__train_models()

    def __train_models(self):
        """Load all domain sentences and train the TFIDF and n-gram models."""
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file_name in os.listdir(self.input_dir):
            path = os.path.join(self.input_dir, file_name)
            # Sub-directories and other non-regular entries cannot be read as
            # corpus files; opening them would raise.
            if not os.path.isfile(path):
                continue
            print("Training models from specific corpora: " + file_name)
            with open(path, encoding="utf-8") as corpus:
                for line in corpus:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or None if nothing was counted."""
        if not denominator:
            return None
        return numerator / denominator

    def print_progress(self):
        """Print the average of each metric and its acceptance rate.

        A value is printed as ``n/a`` while its counter is still zero (for
        example before the first sentence, or when no pairwise comparison has
        been made yet) instead of dividing by zero.
        """
        not_available = "n/a"
        avg_similarity = self._ratio(self.sum_tfidf, self.queries_asked)
        avg_perplexity = self._ratio(self.sum_ngp, self.sentences_asked)
        avg_edit = self._ratio(self.sum_edit, self.queries_asked)

        # The tf-idf average is reported as 1 - similarity, mirroring how the
        # tf-idf threshold is converted in __init__.
        print("Average tfidf: " + (not_available if avg_similarity is None
                                   else str(1 - avg_similarity)))
        print("Average ngram-perplexity: " +
              (not_available if avg_perplexity is None
               else str(avg_perplexity)))
        print("Average edit-distance: " +
              (not_available if avg_edit is None else str(avg_edit)))

        accepted_counts = (
            ("tfidf", self.accepted_by_tfidf),
            ("ngram-perplexity", self.accepted_by_ngp),
            ("edit-distance", self.accepted_by_edit_distance),
        )
        for label, accepted in accepted_counts:
            share = self._ratio(accepted, self.sentences_asked)
            print("Accept percent by " + label + " extractor: " +
                  (not_available if share is None
                   else Formatter.percent(share)))

    def accepts_sentence(self, words_general):
        """Return True if the candidate is close enough to the domain corpus.

        The candidate is accepted when at least one of these holds:
          * its n-gram perplexity is <= the perplexity threshold;
          * its TF-IDF cosine similarity to some stored sentence is >= the
            (converted) tf-idf threshold;
          * its normalized Levenshtein distance to some stored sentence is
            <= the edit-distance threshold.

        In TURBO mode the method returns as soon as the first metric accepts,
        without updating the ``accepted_by_*`` counters.

        Args:
            words_general: The candidate's words, in the same form as the
                word lists stored in ``self.sentences``.

        Returns:
            True if any metric accepted the candidate, False otherwise.
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            # Both pairwise metrics already accepted: nothing left to compute,
            # so stop before counting another query.
            if accept_tfidf and accept_edit_distance:
                break
            self.queries_asked += 1
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(
                    words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(
                    words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        return accept_tfidf or accept_ngp or accept_edit_distance
コード例 #2
0
class DomainSimilarity:
    """Decide whether a sentence is similar to a domain-specific corpus.

    Every line of every file in ``input_dir`` is split into words; each non-empty word list is
    stored and used to train a TFIDF model and an NGramPerplexity model. ``accepts_sentence``
    accepts a candidate when at least one of three metrics passes its threshold: n-gram
    perplexity, TF-IDF cosine similarity, or normalized Levenshtein distance.
    """

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram, threshold_edit_distance):
        """Store the thresholds, reset the statistics and train the models.

        Args:
            input_dir: Directory whose files hold the domain corpus.
            threshold_tfidf: Stored as ``1 - threshold_tfidf`` and used as the minimum cosine
                similarity a candidate must reach.
            threshold_perplexity_ngram: Maximum accepted perplexity.
            threshold_edit_distance: Maximum accepted normalized edit distance.

        Raises:
            NotADirectoryError: If ``input_dir`` is not an existing directory.
        """
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []

        # Running statistics reported by print_progress().
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

        if not os.path.isdir(input_dir):
            raise NotADirectoryError("The provided dir " + str(input_dir) + " does not exist")
        self.__train_models()

    def __train_models(self):
        """Load all domain sentences and train the TFIDF and n-gram models."""
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file_name in os.listdir(self.input_dir):
            path = os.path.join(self.input_dir, file_name)
            # Sub-directories and other non-regular entries cannot be read as corpus files;
            # opening them would raise.
            if not os.path.isfile(path):
                continue
            print("Training models from specific corpora: " + file_name)
            with open(path, encoding="utf-8") as corpus:
                for line in corpus:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or None if nothing was counted."""
        if not denominator:
            return None
        return numerator / denominator

    def print_progress(self):
        """Print the average of each metric and its acceptance rate.

        A value is printed as ``n/a`` while its counter is still zero (for example before the
        first sentence, or when no pairwise comparison has been made yet) instead of dividing
        by zero.
        """
        not_available = "n/a"
        avg_similarity = self._ratio(self.sum_tfidf, self.queries_asked)
        avg_perplexity = self._ratio(self.sum_ngp, self.sentences_asked)
        avg_edit = self._ratio(self.sum_edit, self.queries_asked)

        # The tf-idf average is reported as 1 - similarity, mirroring how the tf-idf threshold
        # is converted in __init__.
        print("Average tfidf: " + (not_available if avg_similarity is None else str(1 - avg_similarity)))
        print("Average ngram-perplexity: " + (not_available if avg_perplexity is None else str(avg_perplexity)))
        print("Average edit-distance: " + (not_available if avg_edit is None else str(avg_edit)))

        accepted_counts = (
            ("tfidf", self.accepted_by_tfidf),
            ("ngram-perplexity", self.accepted_by_ngp),
            ("edit-distance", self.accepted_by_edit_distance),
        )
        for label, accepted in accepted_counts:
            share = self._ratio(accepted, self.sentences_asked)
            shown = not_available if share is None else Formatter.percent(share)
            print("Accept percent by " + label + " extractor: " + shown)

    def accepts_sentence(self, words_general):
        """Return True if the candidate is close enough to the domain corpus.

        The candidate is accepted when at least one of these holds:
          * its n-gram perplexity is <= the perplexity threshold;
          * its TF-IDF cosine similarity to some stored sentence is >= the (converted) tf-idf
            threshold;
          * its normalized Levenshtein distance to some stored sentence is <= the edit-distance
            threshold.

        In TURBO mode the method returns as soon as the first metric accepts, without updating
        the ``accepted_by_*`` counters.

        Args:
            words_general: The candidate's words, in the same form as the word lists stored in
                ``self.sentences``.

        Returns:
            True if any metric accepted the candidate, False otherwise.
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            # Both pairwise metrics already accepted: nothing left to compute, so stop before
            # counting another query.
            if accept_tfidf and accept_edit_distance:
                break
            self.queries_asked += 1
            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True
            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        return accept_tfidf or accept_ngp or accept_edit_distance
コード例 #3
0
]

# Sentences used as the "different" group in the checks below; the empty
# word list is included as well.
different_sentences = [
    [],
    WordExtractor.get_words(
        "I play guitar in a band, but I better like to play piano"),
    WordExtractor.get_words(
        "The Jaguar is a very dangerous animal, that can run very fast"),
    WordExtractor.get_words(
        "Dinosaurs are extinct and we will probably never see them alive.")
]

# A very similar sentence must be more similar (lower perplexity) than a
# slightly similar sentence.
for very_similar_sentence in very_similar_sentences:
    # pp1 does not depend on the inner loop, so compute it once per sentence.
    pp1 = ngp.calc_perplexity(very_similar_sentence)
    for slightly_similar_sentence in slightly_similar_sentences:
        pp2 = ngp.calc_perplexity(slightly_similar_sentence)
        print(str(pp1) + " < " + str(pp2))
        assert pp1 < pp2

# A slightly similar sentence must be more similar (lower perplexity) than a
# different sentence.
for slightly_similar_sentence in slightly_similar_sentences:
    # Compare the *slightly* similar sentence here; using the variable left
    # over from the previous loop would only re-test the last very similar
    # sentence and never exercise this group.
    pp1 = ngp.calc_perplexity(slightly_similar_sentence)
    for different_sentence in different_sentences:
        pp2 = ngp.calc_perplexity(different_sentence)
        print(str(pp1) + " < " + str(pp2))
        assert pp1 < pp2

#A very similar sentence must be more similar than a different sentence
for very_similar_sentence in very_similar_sentences:
    for different_sentence in different_sentences:
コード例 #4
0
                          WordExtractor.get_words("I think they would like an ice cream to enjoy at the beach.")]

# Sentences used as the "slightly similar" group in the checks below.
slightly_similar_sentences = [WordExtractor.get_words("Great day today, I want to run a long distance"),
                              WordExtractor.get_words("It is expensive to buy too many things"),
                              WordExtractor.get_words("These people are new to me, I have never seen them before.")]

# Sentences used as the "different" group; the empty word list is included as well.
different_sentences = [[],
                       WordExtractor.get_words("I play guitar in a band, but I better like to play piano"),
                       WordExtractor.get_words("The Jaguar is a very dangerous animal, that can run very fast"),
                       WordExtractor.get_words("Dinosaurs are extinct and we will probably never see them alive.")]


# A very similar sentence must be more similar (lower perplexity) than a slightly similar sentence
for very_similar_sentence in very_similar_sentences:
    # pp1 does not depend on the inner loop, so compute it once per sentence.
    pp1 = ngp.calc_perplexity(very_similar_sentence)
    for slightly_similar_sentence in slightly_similar_sentences:
        pp2 = ngp.calc_perplexity(slightly_similar_sentence)
        print(str(pp1) + " < " + str(pp2))
        assert pp1 < pp2


# A slightly similar sentence must be more similar (lower perplexity) than a different sentence
for slightly_similar_sentence in slightly_similar_sentences:
    # Compare the *slightly* similar sentence here; using the variable left over from the
    # previous loop would only re-test the last very similar sentence.
    pp1 = ngp.calc_perplexity(slightly_similar_sentence)
    for different_sentence in different_sentences:
        pp2 = ngp.calc_perplexity(different_sentence)
        print(str(pp1) + " < " + str(pp2))
        assert pp1 < pp2


#A very similar sentence must be more similar than a different sentence