def test_word_extractor(self):
    """get_words_from_text keeps alphabetic words, splitting on punctuation,
    whitespace and hyphens, and drops symbol-only junk entirely."""
    sample = "What? Wait! Stop now. We-are-the-champs. & $%^&*()!@"
    tokens = WordExtractor().get_words_from_text(sample)
    self.assertListEqual(
        tokens,
        ["What", "Wait", "Stop", "now", "We", "are", "the", "champs"],
    )
def calc_most_similar(tfidf, query, docs):
    """Return the document in `docs` whose word vector is most cosine-similar
    to `query` under the given trained `tfidf` model.

    Returns "" when `docs` is empty or no document scores above 0 similarity.
    """
    # FIX: the query tokenization is loop-invariant — the original re-ran
    # WordExtractor.get_words(query) once per document.  Hoist it.
    query_words = WordExtractor.get_words(query)
    best_doc = ""
    max_sim = 0
    for doc in docs:
        sim = tfidf.calc_cosine_similarity(query_words,
                                           WordExtractor.get_words(doc))
        if sim > max_sim:
            max_sim = sim
            best_doc = doc
    return best_doc
def analyze_and_output_bilingual(self, general_en, general_pl, output_en, output_pl): words_general_en = WordExtractor.get_words(general_en) # skip if not assigned to worker if self.scanned_lines % RUN_CONFIGURATION.workers != RUN_CONFIGURATION.worker_id - 1: return self.assigned_lines += 1 if len(words_general_en) == 0: self.unusable_lines += 1 return words_general_pl = WordExtractor.get_words(general_pl) if len(words_general_pl) == 0: self.unusable_lines += 1 return # FOR BILINGUAL if RUN_CONFIGURATION.mode == MODE.STATISTICS: # Always run both languages in HELP mode to gather useful statistics en_accepts = self.domain_similarity_en.accepts_sentence( words_general_en) pl_accepts = self.domain_similarity_pl.accepts_sentence( words_general_pl) if RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_EITHER: if not en_accepts and not pl_accepts: return elif RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_BOTH: if not en_accepts or not pl_accepts: return else: #In TURBO mode, only run PL analysis if necessary en_accepts = self.domain_similarity_en.accepts_sentence( words_general_en) if RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_EITHER: if not en_accepts: pl_accepts = self.domain_similarity_pl.accepts_sentence( words_general_pl) if not pl_accepts: return elif RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_BOTH: if not en_accepts: return pl_accepts = self.domain_similarity_pl.accepts_sentence( words_general_pl) if not pl_accepts: return # Write to output since all conditions are satisfied self.extracted_lines += 1 output_en.write(general_en) output_pl.write(general_pl)
def analyze_and_output_bilingual(self, general_en, general_pl, output_en, output_pl):
    """Decide whether one parallel EN/PL sentence pair belongs to the target
    domain and, if so, write it to both output streams.

    BILINGUAL_EITHER requires at least one language to be accepted,
    BILINGUAL_BOTH requires both; any other criteria value accepts the pair
    unconditionally.  Counters on self are updated as side effects.
    """
    words_general_en = WordExtractor.get_words(general_en)
    # Round-robin sharding: only handle lines assigned to this worker id.
    if self.scanned_lines % RUN_CONFIGURATION.workers != RUN_CONFIGURATION.worker_id - 1:
        return
    self.assigned_lines += 1
    if not words_general_en:
        self.unusable_lines += 1
        return
    words_general_pl = WordExtractor.get_words(general_pl)
    if not words_general_pl:
        self.unusable_lines += 1
        return

    criteria = RUN_CONFIGURATION.accept_criteria
    wants_either = criteria == ACCEPT_CRITERIA.BILINGUAL_EITHER
    wants_both = criteria == ACCEPT_CRITERIA.BILINGUAL_BOTH

    if RUN_CONFIGURATION.mode == MODE.STATISTICS:
        # Statistics mode evaluates both languages unconditionally so both
        # models gather counts.
        en_ok = self.domain_similarity_en.accepts_sentence(words_general_en)
        pl_ok = self.domain_similarity_pl.accepts_sentence(words_general_pl)
        if wants_either and not (en_ok or pl_ok):
            return
        if wants_both and not (en_ok and pl_ok):
            return
    else:
        # Fast path: only evaluate PL when the EN result alone cannot decide.
        en_ok = self.domain_similarity_en.accepts_sentence(words_general_en)
        if wants_either:
            if not en_ok and not self.domain_similarity_pl.accepts_sentence(words_general_pl):
                return
        elif wants_both:
            if not en_ok:
                return
            if not self.domain_similarity_pl.accepts_sentence(words_general_pl):
                return

    # All filters passed: emit the pair.
    self.extracted_lines += 1
    output_en.write(general_en)
    output_pl.write(general_pl)
def analyze_and_output_mono(self, general_line, output):
    """Run the monolingual domain filter over one input line and write the
    line to `output` when the domain-similarity model accepts it.

    Updates the assigned_lines / unusable_lines / extracted_lines counters.
    """
    # Round-robin sharding: skip lines owned by other workers.
    my_slot = RUN_CONFIGURATION.worker_id - 1
    if self.scanned_lines % RUN_CONFIGURATION.workers != my_slot:
        return
    self.assigned_lines += 1
    words = WordExtractor.get_words(general_line)
    if not words:
        self.unusable_lines += 1
        return
    if self.domain_similarity.accepts_sentence(words):
        self.extracted_lines += 1
        output.write(general_line)
def __train_models(self):
    """Train the TFIDF and NGramPerplexity models on every file found in the
    specific-domain corpus directory, collecting tokenized sentences in
    self.sentences along the way."""
    self.ngp = NGramPerplexity()
    self.tfidf = TFIDF()
    print("Training models from specific corpora")
    for file in os.listdir(self.input_dir):
        print("Training models from specific corpora: " + file)
        with open(self.input_dir + "/" + file, encoding="utf-8") as corpus:
            for raw_line in corpus:
                words = WordExtractor.get_words(raw_line)
                # Lines with no extractable words contribute nothing.
                if not words:
                    continue
                self.sentences.append(words)
                self.ngp.train_from_text(words)
                self.tfidf.train_from_text(words)
            # (tail of calc_most_similar — the function header is outside this
            # excerpt; indentation reconstructed from the sibling definition)
            best_doc = doc
    return best_doc


# Smoke test: train a tiny TFIDF corpus, then check that three queries map to
# the expected nearest documents.
tfidf = TFIDF()
d1 = "It is a great day today"
d2 = "The weather is absolutely great"
d3 = "It is so warm today, almost too hot"
d4 = "We're very happy with the weather today"
d5 = "It is a great day to be at the beach!"
d6 = "We should get out and enjoy the weather right now :)"
d7 = "I've bought a radio I plan to bring to the beach today"
d8 = "The beach is a bit crowded"
d9 = "There's many kids at the beach today"
d10 = "If it starts to rain at the beach, I will go home"
documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
for doc in documents:
    words = WordExtractor.get_words(doc)
    tfidf.train_from_text(words)
q1 = "We have not had rain for a long time!"
q2 = "Where can I buy a radio?"
q3 = "The kids are happy to be at the beach."
# Each query should be most similar to the document sharing its rare terms.
assert(calc_most_similar(tfidf, q1, documents) == d10)
assert(calc_most_similar(tfidf, q2, documents) == d7)
assert(calc_most_similar(tfidf, q3, documents) == d5)
print("Success")
from ngramperplexity import NGramPerplexity
from wordextractor import WordExtractor

# Use trigrams for the perplexity model.
NGramPerplexity.ngram_size = 3
ngp = NGramPerplexity()

# Beach-domain training corpus: tokenize each sentence and feed it to the
# model, in order.
training_corpus = [
    "There are so many people at the beach",
    "The beach is so crowded with all these people, I wish they would just go to another beach",
    "It is summer and a great day to go to the beach.",
    "Let's go to the beach and enjoy the great weather we've got today.",
    "I think the first thing I will do at the beach is to buy an ice cream.",
    "There's many people at the beach today, I think they are enjoying their holidays.",
    "I think something is going on at the beach right now, there are literally people everywhere.",
]
for sentence in training_corpus:
    ngp.train_from_text(WordExtractor.get_words(sentence))

#sentences with great similarity
from database import MySqlDataSouce
from wordextractor import WordExtractor
import time

ENGLISH = "EN"
DEUTSCH = "DE"
NOUN = "NOUN"
SUCCESS = "SUCCESS"
ERROR_RATE_LIMIT = "ERROR RATE LIMIT"
ERROR_CLIENT_POOL = "ERROR CLIENT POOL"
# NOTE(review): spelling kept as-is — other modules may compare against this
# exact value.
ERROR_UKNOWN = "ERROR UKNOWN"

THROTTLE_SECONDS = 10
EXCEPTION_SLEEP_SECONDS = 60
ERROR_COUNT_SLEEP_SECONDS = 500

extractor = WordExtractor()
fetcher = LeoFetcher()
dao = MySqlDataSouce()
parser = LeoParser()

# Load the word list once and tokenize it.
with open('resources/words.txt', 'r') as hall:
    data = hall.read()
words = extractor.get_words_from_text(data)

error_count = 0
for word in words:
    # Back off after repeated failures before continuing with the next words.
    if error_count > 5:
        # BUG FIX: the original concatenated a str with the int constant
        # ("..." + ERROR_COUNT_SLEEP_SECONDS), which raises TypeError at
        # runtime; convert explicitly.
        print("Error count reached sleeping " + str(ERROR_COUNT_SLEEP_SECONDS) + " seconds")
        error_count = 0
        time.sleep(ERROR_COUNT_SLEEP_SECONDS)
from ngramperplexity import NGramPerplexity
from wordextractor import WordExtractor

# Train a trigram perplexity model on a small beach-themed corpus.
NGramPerplexity.ngram_size = 3
ngp = NGramPerplexity()
ngp.train_from_text(WordExtractor.get_words("There are so many people at the beach"))
ngp.train_from_text(WordExtractor.get_words("The beach is so crowded with all these people, I wish they would just go to another beach"))
ngp.train_from_text(WordExtractor.get_words("It is summer and a great day to go to the beach."))
ngp.train_from_text(WordExtractor.get_words("Let's go to the beach and enjoy the great weather we've got today."))
ngp.train_from_text(WordExtractor.get_words("I think the first thing I will do at the beach is to buy an ice cream."))
ngp.train_from_text(WordExtractor.get_words("There's many people at the beach today, I think they are enjoying their holidays."))
ngp.train_from_text(WordExtractor.get_words("I think something is going on at the beach right now, there are literally people everywhere."))
#sentences with great similarity
very_similar_sentences = [WordExtractor.get_words("The weather is great at the beach!"),
                          WordExtractor.get_words("something is probably going on at the beach right now."),
                          WordExtractor.get_words("I think they would like an ice cream to enjoy at the beach.")]
# Sentences sharing only some vocabulary with the training corpus.
slightly_similar_sentences = [WordExtractor.get_words("Great day today, I want to run a long distance"),
                              WordExtractor.get_words("It is expensive to buy too many things"),
                              WordExtractor.get_words("These people are new to me, I have never seen them before.")]
# Unrelated sentences (first entry is intentionally empty).
different_sentences = [[],
                       WordExtractor.get_words("I play guitar in a band, but I better like to play piano"),
                       WordExtractor.get_words("The Jaguar is a very dangerous animal, that can run very fast"),
                       WordExtractor.get_words("Dinosaurs are extinct and we will probably never see them alive.")]
#A very similar sentence must be more similar than a slightly similar sentence
# NOTE(review): this loop header is truncated at the end of the excerpt — its
# body continues beyond the visible source.
for very_similar_sentence in very_similar_sentences:
    for slightly_similar_sentence in slightly_similar_sentences: