Example no. 1
    def test_word_extractor(self):
        extractor = WordExtractor()
        rawText = "What? Wait! Stop now. We-are-the-champs. & $%^&*()!@"
        expected = [
            "What", "Wait", "Stop", "now", "We", "are", "the", "champs"
        ]
        actual = extractor.get_words_from_text(rawText)
        self.assertListEqual(actual, expected)
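The test above implies that get_words_from_text keeps only runs of word characters and discards punctuation, symbols, and hyphen separators. A minimal sketch consistent with that behaviour (the real implementation is not shown here, so this is only an assumption):

import re

class WordExtractor:
    # Hypothetical sketch inferred from the test above: keep runs of word
    # characters, drop punctuation, symbols, and hyphen separators.
    def get_words_from_text(self, text):
        return re.findall(r"\w+", text)
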
def calc_most_similar(tfidf, query, docs):
    # Return the document whose words have the highest TFIDF cosine similarity to the query's words.
    best_doc = ""
    max_sim = 0
    for doc in docs:
        sim = tfidf.calc_cosine_similarity(WordExtractor.get_words(query), WordExtractor.get_words(doc))
        if sim > max_sim:
            max_sim = sim
            best_doc = doc
    return best_doc
Example no. 3
    def analyze_and_output_bilingual(self, general_en, general_pl, output_en,
                                     output_pl):
        words_general_en = WordExtractor.get_words(general_en)

        # skip if not assigned to worker
        if self.scanned_lines % RUN_CONFIGURATION.workers != RUN_CONFIGURATION.worker_id - 1:
            return
        self.assigned_lines += 1

        if len(words_general_en) == 0:
            self.unusable_lines += 1
            return
        words_general_pl = WordExtractor.get_words(general_pl)
        if len(words_general_pl) == 0:
            self.unusable_lines += 1
            return

        # FOR BILINGUAL
        if RUN_CONFIGURATION.mode == MODE.STATISTICS:
            # Always run both languages in STATISTICS mode to gather useful statistics
            en_accepts = self.domain_similarity_en.accepts_sentence(
                words_general_en)
            pl_accepts = self.domain_similarity_pl.accepts_sentence(
                words_general_pl)
            if RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_EITHER:
                if not en_accepts and not pl_accepts:
                    return
            elif RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_BOTH:
                if not en_accepts or not pl_accepts:
                    return
        else:
            # In TURBO mode, only run PL analysis if necessary
            en_accepts = self.domain_similarity_en.accepts_sentence(
                words_general_en)
            if RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_EITHER:
                if not en_accepts:
                    pl_accepts = self.domain_similarity_pl.accepts_sentence(
                        words_general_pl)
                    if not pl_accepts:
                        return
            elif RUN_CONFIGURATION.accept_criteria == ACCEPT_CRITERIA.BILINGUAL_BOTH:
                if not en_accepts:
                    return
                pl_accepts = self.domain_similarity_pl.accepts_sentence(
                    words_general_pl)
                if not pl_accepts:
                    return

        # Write to output since all conditions are satisfied
        self.extracted_lines += 1
        output_en.write(general_en)
        output_pl.write(general_pl)
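
The method above reads module-level RUN_CONFIGURATION, MODE, and ACCEPT_CRITERIA objects that are not part of this snippet. A minimal stand-in, useful only for following the control flow (the names and fields below are assumptions, not the project's actual definitions):

from enum import Enum
from types import SimpleNamespace

class MODE(Enum):
    STATISTICS = 1
    TURBO = 2

class ACCEPT_CRITERIA(Enum):
    BILINGUAL_EITHER = 1
    BILINGUAL_BOTH = 2

# Hypothetical configuration object exposing only the fields the method reads.
RUN_CONFIGURATION = SimpleNamespace(
    mode=MODE.TURBO,
    accept_criteria=ACCEPT_CRITERIA.BILINGUAL_BOTH,
    workers=4,
    worker_id=1,
)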
Example no. 5
    def analyze_and_output_mono(self, general_line, output):
        # skip if not assigned to this worker
        if self.scanned_lines % RUN_CONFIGURATION.workers != RUN_CONFIGURATION.worker_id - 1:
            return
        self.assigned_lines += 1

        words_general = WordExtractor.get_words(general_line)
        if len(words_general) == 0:
            self.unusable_lines += 1
            return

        if self.domain_similarity.accepts_sentence(words_general):
            self.extracted_lines += 1
            output.write(general_line)
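
The modulo check at the top of analyze_and_output_mono assumes the caller advances self.scanned_lines once for every input line. A driver along these lines would keep the worker sharding consistent (the method below is hypothetical and not part of the original class):

    def scan_file_mono(self, input_path, output):
        # Hypothetical driver: feed every line to analyze_and_output_mono and
        # keep scanned_lines in sync so the worker-sharding check above works.
        with open(input_path, encoding="utf-8") as general:
            for line in general:
                self.scanned_lines += 1
                self.analyze_and_output_mono(line, output)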
    def __train_models(self):
        # Load all sentences from the specific domain and train the TFIDF and NGramPerplexity models.
        self.ngp = NGramPerplexity()
        self.tfidf = TFIDF()
        print("Training models from specific corpora")
        for file in os.listdir(self.input_dir):
            print("Training models from specific corpora: " + file)
            with open(self.input_dir + "/" + file, encoding="utf-8") as input_file:
                for line in input_file:
                    words = WordExtractor.get_words(line)
                    if len(words) == 0:
                        continue
                    self.sentences.append(words)
                    self.ngp.train_from_text(words)
                    self.tfidf.train_from_text(words)
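__train_models stores each tokenized sentence in self.sentences while training both models. As a hedged sketch, the trained TFIDF instance could then rank those sentences against a query by reusing the calc_cosine_similarity call shown elsewhere in these examples (the helper below is hypothetical):

    def rank_by_similarity(self, query, top_n=3):
        # Hypothetical helper: score every stored sentence against the query
        # with TFIDF cosine similarity and return the best matches first.
        query_words = WordExtractor.get_words(query)
        scored = [
            (self.tfidf.calc_cosine_similarity(query_words, sentence), sentence)
            for sentence in self.sentences
        ]
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return scored[:top_n]
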
def calc_most_similar(tfidf, query, docs):
    best_doc = ""
    max_sim = 0
    for doc in docs:
        sim = tfidf.calc_cosine_similarity(WordExtractor.get_words(query), WordExtractor.get_words(doc))
        if sim > max_sim:
            max_sim = sim
            best_doc = doc
    return best_doc

tfidf = TFIDF()
d1 = "It is a great day today"
d2 = "The weather is absolutely great"
d3 = "It is so warm today, almost too hot"
d4 = "We're very happy with the weather today"
d5 = "It is a great day to be at the beach!"
d6 = "We should get out and enjoy the weather right now :)"
d7 = "I've bought a radio I plan to bring to the beach today"
d8 = "The beach is a bit crowded"
d9 = "There's many kids at the beach today"
d10 = "If it starts to rain at the beach, I will go home"

documents = [d1, d2, d3, d4, d5, d6, d7, d8, d9, d10]
for doc in documents:
    words = WordExtractor.get_words(doc)
    tfidf.train_from_text(words)

q1 = "We have not had rain for a long time!"
q2 = "Where can I buy a radio?"
q3 = "The kids are happy to be at the beach."

assert(calc_most_similar(tfidf, q1, documents) == d10)
assert(calc_most_similar(tfidf, q2, documents) == d7)
assert(calc_most_similar(tfidf, q3, documents) == d5)

print("Success")

Example no. 11
from database import MySqlDataSouce
from wordextractor import WordExtractor
import time

ENGLISH = "EN"
DEUTSCH = "DE"
NOUN = "NOUN"
SUCCESS = "SUCCESS"
ERROR_RATE_LIMIT = "ERROR RATE LIMIT"
ERROR_CLIENT_POOL = "ERROR CLIENT POOL"
ERROR_UNKNOWN = "ERROR UNKNOWN"
THROTTLE_SECONDS = 10
EXCEPTION_SLEEP_SECONDS = 60
ERROR_COUNT_SLEEP_SECONDS = 500

extractor = WordExtractor()
fetcher = LeoFetcher()
dao = MySqlDataSouce()
parser = LeoParser()

with open('resources/words.txt', 'r') as hall:
    data = hall.read()

words = extractor.get_words_from_text(data)
error_count = 0
for word in words:

    if error_count > 5:
        print("Error count reached, sleeping " + str(ERROR_COUNT_SLEEP_SECONDS) + " seconds")
        error_count = 0
        time.sleep(ERROR_COUNT_SLEEP_SECONDS)
from ngramperplexity import NGramPerplexity
from wordextractor import WordExtractor

NGramPerplexity.ngram_size = 3
ngp = NGramPerplexity()
ngp.train_from_text(WordExtractor.get_words("There are so many people at the beach"))
ngp.train_from_text(WordExtractor.get_words("The beach is so crowded with all these people, I wish they would just go to another beach"))
ngp.train_from_text(WordExtractor.get_words("It is summer and a great day to go to the beach."))
ngp.train_from_text(WordExtractor.get_words("Let's go to the beach and enjoy the great weather we've got today."))
ngp.train_from_text(WordExtractor.get_words("I think the first thing I will do at the beach is to buy an ice cream."))
ngp.train_from_text(WordExtractor.get_words("There's many people at the beach today, I think they are enjoying their holidays."))
ngp.train_from_text(WordExtractor.get_words("I think something is going on at the beach right now, there are literally people everywhere."))

# Sentences with high similarity to the training data
very_similar_sentences = [WordExtractor.get_words("The weather is great at the beach!"),
                          WordExtractor.get_words("something is probably going on at the beach right now."),
                          WordExtractor.get_words("I think they would like an ice cream to enjoy at the beach.")]

slightly_similar_sentences = [WordExtractor.get_words("Great day today, I want to run a long distance"),
                            WordExtractor.get_words("It is expensive to buy too many things"),
                            WordExtractor.get_words("These people are new to me, I have never seen them before.")]

different_sentences = [[],
                       WordExtractor.get_words("I play guitar in a band, but I better like to play piano"),
                       WordExtractor.get_words("The Jaguar is a very dangerous animal, that can run very fast"),
                       WordExtractor.get_words("Dinosaurs are extinct and we will probably never see them alive.")]


# A very similar sentence must score higher than a slightly similar one
for very_similar_sentence in very_similar_sentences:
    for slightly_similar_sentence in slightly_similar_sentences: