class DomainSimilarity:
    """Decide whether a sentence is similar to a domain-specific corpus.

    Every non-empty line of every file in ``input_dir`` is converted to a
    word list with ``WordExtractor.get_words``; the word lists are kept in
    ``self.sentences`` and fed to an ``NGramPerplexity`` and a ``TFIDF``
    model.  ``accepts_sentence`` then checks a candidate against three
    criteria (n-gram perplexity, TF-IDF cosine similarity, normalized
    Levenshtein distance) and keeps running statistics that
    ``print_progress`` reports.
    """

    # Printed in place of a statistic whose denominator is still zero.
    _NOT_AVAILABLE = "n/a"

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram,
                 threshold_edit_distance):
        """Load the corpus found in ``input_dir`` and train both models.

        Args:
            input_dir: Directory whose files make up the specific corpus.
            threshold_tfidf: Stored as ``1 - threshold_tfidf``; a sentence is
                accepted when its cosine similarity is >= that stored value.
            threshold_perplexity_ngram: A sentence is accepted when its
                perplexity is <= this value.
            threshold_edit_distance: A sentence is accepted when its
                normalized edit distance is <= this value.

        Raises:
            Exception: If ``input_dir`` is not an existing directory.
        """
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) + " does not exist")
        self.__train_models()

        # Running statistics, reported by print_progress().
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        """Read every file in ``self.input_dir`` and train the two models."""
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file_name in os.listdir(self.input_dir):
            print("Training models from specific corpora: " + file_name)
            corpus_path = os.path.join(self.input_dir, file_name)
            with open(corpus_path, encoding="utf-8") as corpus_file:
                for line in corpus_file:
                    words = WordExtractor.get_words(line)
                    if not words:
                        # Lines that yield no words carry nothing to train on.
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    def print_progress(self):
        """Print the average scores and acceptance rates gathered so far.

        A statistic whose denominator is still zero (no sentence evaluated
        yet, or no pairwise comparison performed) is printed as ``n/a``
        instead of raising ``ZeroDivisionError``.
        """
        queries = self.queries_asked
        sentences = self.sentences_asked
        not_available = self._NOT_AVAILABLE

        if queries:
            avg_tfidf = str(1 - self.sum_tfidf / queries)
            avg_edit = str(self.sum_edit / queries)
        else:
            avg_tfidf = avg_edit = not_available

        if sentences:
            avg_ngp = str(self.sum_ngp / sentences)
            pct_tfidf = Formatter.percent(self.accepted_by_tfidf / sentences)
            pct_ngp = Formatter.percent(self.accepted_by_ngp / sentences)
            pct_edit = Formatter.percent(self.accepted_by_edit_distance / sentences)
        else:
            avg_ngp = pct_tfidf = pct_ngp = pct_edit = not_available

        print("Average tfidf: " + avg_tfidf)
        print("Average ngram-perplexity: " + avg_ngp)
        print("Average edit-distance: " + avg_edit)
        print("Accept percent by tfidf extractor: " + pct_tfidf)
        print("Accept percent by ngram-perplexity extractor: " + pct_ngp)
        print("Accept percent by edit-distance extractor: " + pct_edit)

    def accepts_sentence(self, words_general):
        """Return True if ``words_general`` passes at least one criterion.

        Args:
            words_general: The candidate sentence as a word list (the same
                form that is stored in ``self.sentences``).

        Returns:
            True when any of the following holds, otherwise False:
              * perplexity under the n-gram model is
                <= ``threshold_perplexity_ngram``;
              * TF-IDF cosine similarity to some stored sentence is
                >= the stored ``threshold_tfidf``;
              * normalized Levenshtein distance to some stored sentence is
                <= ``threshold_edit_distance``.

        In TURBO mode the method returns on the first satisfied criterion
        without updating the ``accepted_by_*`` counters.  Otherwise it keeps
        scanning until both pairwise criteria have accepted (or the corpus
        is exhausted) so that every counter gets its chance to be updated.
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            # NOTE(review): this counter also advances on the iteration that
            # returns immediately below and on iterations where only one of
            # the two metrics is still computed, so the averages derived from
            # it are approximate.  Left unchanged to keep reported numbers
            # comparable with earlier runs.
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True

            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True

            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        return accept_tfidf or accept_ngp or accept_edit_distance
class DomainSimilarity:
    """Decide whether a sentence is similar to a domain-specific corpus.

    Every non-empty line of every file in ``input_dir`` is converted to a
    word list with ``WordExtractor.get_words``; the word lists are kept in
    ``self.sentences`` and fed to an ``NGramPerplexity`` and a ``TFIDF``
    model.  ``accepts_sentence`` then checks a candidate against three
    criteria (n-gram perplexity, TF-IDF cosine similarity, normalized
    Levenshtein distance) and keeps running statistics that
    ``print_progress`` reports.
    """

    # Printed in place of a statistic whose denominator is still zero.
    _NOT_AVAILABLE = "n/a"

    def __init__(self, input_dir, threshold_tfidf, threshold_perplexity_ngram,
                 threshold_edit_distance):
        """Load the corpus found in ``input_dir`` and train both models.

        Args:
            input_dir: Directory whose files make up the specific corpus.
            threshold_tfidf: Stored as ``1 - threshold_tfidf``; a sentence is
                accepted when its cosine similarity is >= that stored value.
            threshold_perplexity_ngram: A sentence is accepted when its
                perplexity is <= this value.
            threshold_edit_distance: A sentence is accepted when its
                normalized edit distance is <= this value.

        Raises:
            Exception: If ``input_dir`` is not an existing directory.
        """
        self.threshold_tfidf = 1 - threshold_tfidf
        self.threshold_perplexity_ngram = threshold_perplexity_ngram
        self.threshold_edit_distance = threshold_edit_distance
        self.input_dir = input_dir
        self.sentences = []
        if not os.path.isdir(input_dir):
            raise Exception("The provided dir " + str(input_dir) + " does not exist")
        self.__train_models()

        # Running statistics, reported by print_progress().
        self.queries_asked = 0
        self.sentences_asked = 0
        self.accepted_by_tfidf = 0
        self.accepted_by_ngp = 0
        self.accepted_by_edit_distance = 0
        self.sum_tfidf = 0
        self.sum_ngp = 0
        self.sum_edit = 0

    def __train_models(self):
        """Read every file in ``self.input_dir`` and train the two models."""
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file_name in os.listdir(self.input_dir):
            print("Training models from specific corpora: " + file_name)
            corpus_path = os.path.join(self.input_dir, file_name)
            with open(corpus_path, encoding="utf-8") as corpus_file:
                for line in corpus_file:
                    words = WordExtractor.get_words(line)
                    if not words:
                        # Lines that yield no words carry nothing to train on.
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)

    def print_progress(self):
        """Print the average scores and acceptance rates gathered so far.

        A statistic whose denominator is still zero (no sentence evaluated
        yet, or no pairwise comparison performed) is printed as ``n/a``
        instead of raising ``ZeroDivisionError``.
        """
        queries = self.queries_asked
        sentences = self.sentences_asked
        not_available = self._NOT_AVAILABLE

        if queries:
            avg_tfidf = str(1 - self.sum_tfidf / queries)
            avg_edit = str(self.sum_edit / queries)
        else:
            avg_tfidf = avg_edit = not_available

        if sentences:
            avg_ngp = str(self.sum_ngp / sentences)
            pct_tfidf = Formatter.percent(self.accepted_by_tfidf / sentences)
            pct_ngp = Formatter.percent(self.accepted_by_ngp / sentences)
            pct_edit = Formatter.percent(self.accepted_by_edit_distance / sentences)
        else:
            avg_ngp = pct_tfidf = pct_ngp = pct_edit = not_available

        print("Average tfidf: " + avg_tfidf)
        print("Average ngram-perplexity: " + avg_ngp)
        print("Average edit-distance: " + avg_edit)
        print("Accept percent by tfidf extractor: " + pct_tfidf)
        print("Accept percent by ngram-perplexity extractor: " + pct_ngp)
        print("Accept percent by edit-distance extractor: " + pct_edit)

    def accepts_sentence(self, words_general):
        """Return True if ``words_general`` passes at least one criterion.

        Args:
            words_general: The candidate sentence as a word list (the same
                form that is stored in ``self.sentences``).

        Returns:
            True when any of the following holds, otherwise False:
              * perplexity under the n-gram model is
                <= ``threshold_perplexity_ngram``;
              * TF-IDF cosine similarity to some stored sentence is
                >= the stored ``threshold_tfidf``;
              * normalized Levenshtein distance to some stored sentence is
                <= ``threshold_edit_distance``.

        In TURBO mode the method returns on the first satisfied criterion
        without updating the ``accepted_by_*`` counters.  Otherwise it keeps
        scanning until both pairwise criteria have accepted (or the corpus
        is exhausted) so that every counter gets its chance to be updated.
        """
        self.sentences_asked += 1
        accept_ngp = False
        accept_tfidf = False
        accept_edit_distance = False

        perplexity = self.ngp.calc_perplexity(words_general)
        self.sum_ngp += perplexity
        if perplexity <= self.threshold_perplexity_ngram:
            if RUN_CONFIGURATION.mode == MODE.TURBO:
                return True
            self.accepted_by_ngp += 1
            accept_ngp = True

        for words_specific in self.sentences:
            # NOTE(review): this counter also advances on the iteration that
            # returns immediately below and on iterations where only one of
            # the two metrics is still computed, so the averages derived from
            # it are approximate.  Left unchanged to keep reported numbers
            # comparable with earlier runs.
            self.queries_asked += 1
            if accept_tfidf and accept_edit_distance:
                return True

            if not accept_tfidf:
                sim = self.tfidf.calc_cosine_similarity(words_general, words_specific)
                self.sum_tfidf += sim
                if sim >= self.threshold_tfidf:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_tfidf += 1
                    accept_tfidf = True

            if not accept_edit_distance:
                edit_distance = Levenshtein.normalized_distance(words_general, words_specific)
                self.sum_edit += edit_distance
                if edit_distance <= self.threshold_edit_distance:
                    if RUN_CONFIGURATION.mode == MODE.TURBO:
                        return True
                    self.accepted_by_edit_distance += 1
                    accept_edit_distance = True

        return accept_tfidf or accept_ngp or accept_edit_distance
] different_sentences = [ [], WordExtractor.get_words( "I play guitar in a band, but I better like to play piano"), WordExtractor.get_words( "The Jaguar is a very dangerous animal, that can run very fast"), WordExtractor.get_words( "Dinosaurs are extinct and we will probably never see them alive.") ] #A very similar sentence must be more similar than a slightly similar sentence for very_similar_sentence in very_similar_sentences: for slightly_similar_sentence in slightly_similar_sentences: pp1 = ngp.calc_perplexity(very_similar_sentence) pp2 = ngp.calc_perplexity(slightly_similar_sentence) print(str(pp1) + " < " + str(pp2)) assert (pp1 < pp2) #A slightly similar sentence must be more similar than a different sentence for slightly_similar_sentence in slightly_similar_sentences: for different_sentence in different_sentences: pp1 = ngp.calc_perplexity(very_similar_sentence) pp2 = ngp.calc_perplexity(different_sentence) print(str(pp1) + " < " + str(pp2)) assert (pp1 < pp2) #A very similar sentence must be more similar than a different sentence for very_similar_sentence in very_similar_sentences: for different_sentence in different_sentences:
WordExtractor.get_words("I think they would like an ice cream to enjoy at the beach.")] slightly_similar_sentences = [WordExtractor.get_words("Great day today, I want to run a long distance"), WordExtractor.get_words("It is expensive to buy too many things"), WordExtractor.get_words("These people are new to me, I have never seen them before.")] different_sentences = [[], WordExtractor.get_words("I play guitar in a band, but I better like to play piano"), WordExtractor.get_words("The Jaguar is a very dangerous animal, that can run very fast"), WordExtractor.get_words("Dinosaurs are extinct and we will probably never see them alive.")] #A very similar sentence must be more similar than a slightly similar sentence for very_similar_sentence in very_similar_sentences: for slightly_similar_sentence in slightly_similar_sentences: pp1 = ngp.calc_perplexity(very_similar_sentence) pp2 = ngp.calc_perplexity(slightly_similar_sentence) print(str(pp1) + " < " + str(pp2)) assert(pp1 < pp2) #A slightly similar sentence must be more similar than a different sentence for slightly_similar_sentence in slightly_similar_sentences: for different_sentence in different_sentences: pp1 = ngp.calc_perplexity(very_similar_sentence) pp2 = ngp.calc_perplexity(different_sentence) print(str(pp1) + " < " + str(pp2)) assert(pp1 < pp2) #A very similar sentence must be more similar than a different sentence