Example no. 1
def get_lastest_reading():
    global phrase
    while get_lastest_reading_thread.is_alive():
        global lastest_reading
        label = lastest_reading
        if label != "":
            if label == "dot":
                if phrase != "":
                    spell = Speller()
                    phrase = spell(phrase)
                    if lang != "en":
                        translate_client = translate.Client()
                        translated_text = translate_client.translate(phrase, target_language=lang)["translatedText"]
                        voice = gTTS(text=translated_text, lang=lang, tld=top_domain[output_language], slow=False)
                    else:
                        voice = gTTS(text=phrase, lang=lang, tld=top_domain[output_language], slow=False)
                    voice_file = "voice-{}.mp3".format(str(randint(1, 9999999)))         
                    voice.save(voice_file)
                    play_audio_thread = threading.Thread(target=play_audio, args=[voice_file])
                    play_audio_thread.daemon = True
                    play_audio_thread.start()
                    print (TGREEN + "audio:" + phrase, ENDC)
                    phrase = ""                    
            elif label == "space":
                phrase += " "
                spell = Speller()
                phrase = spell(phrase)
                print (TGREEN + phrase, ENDC) 
            else:
                phrase += label
                print (TGREEN + label, ENDC)
            time.sleep(detection_speed)
            lastest_reading = ""
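The snippet above builds a fresh Speller both for every finished phrase and on every "space" label. As a minimal sketch of the core pattern shared by the examples on this page (assuming only the autocorrect package is installed), the speller is usually created once and then called on a string; the lang and fast arguments that appear in later examples are optional.

# Minimal usage sketch of autocorrect.Speller; building the instance is the slow part,
# so it is worth creating it once and reusing it.
from autocorrect import Speller

spell = Speller()
corrected = spell("some mispelled sentense")  # the instance is callable on whole strings
print(corrected)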
Example no. 2
 def correctSpelling(self):
     speller = Speller()
     word_tokens = word_tokenize(self.text)
     lengthening = [self.reduce_lengthening(word) for word in word_tokens]
     spelled = [speller.autocorrect_word(word) for word in lengthening]
     self.text = " ".join(spelled)
     return self
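correctSpelling above relies on a reduce_lengthening helper that the snippet does not show. A common way to write such a helper (hypothetical here, not the author's code) collapses characters repeated three or more times, much like replace_repetition in a later example, before the word is handed to speller.autocorrect_word.

import re

def reduce_lengthening(word):
    # Hypothetical helper: collapse any character repeated 3+ times down to two,
    # so exaggerated spellings like "coooool" become something the speller can handle.
    return re.sub(r"(.)\1{2,}", r"\1\1", word)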
Example no. 3
def loading_data(filename, basepath, extension):
    # Assumptions: the processed file is saved in the same folder using the pattern filename+'P',
    # and a column named "Comment" exists on which the data processing is performed.
    # The function reads a csv file (which must exist) and returns a pandas dataframe.
    # It first checks whether the processed file already exists: spell correcting is time demanding,
    # so it is better to save the result once the comments are processed and load it on later runs.
    processed_filename = filename + 'P'
    processed_path = os.path.join(basepath,
                                  processed_filename + '.' + extension)
    # Check whether the processed file exists; if it does, load it.
    if os.path.exists(processed_path):
        data = pd.read_csv(processed_path)
    else:
        start_time = time.time()
        # Load the speller
        spell = Speller(lang='es')
        # Load the raw data
        data_init = pd.read_csv(
            os.path.join(basepath, filename + '.' + extension))
        # Perform spell correction and lowercase the comment column
        data = spell_correction(data_init, 'Comment', 'processed comment',
                                spell)
        # Eliminate stopwords.
        data = stopwords_correction(data, 'processed comment')
        # Save the result using the processed-file naming pattern
        try:
            data.to_csv(processed_path)
        except Exception:
            print("Oops!", sys.exc_info()[0], "occurred.")
            raise Exception("Couldn't save the dataframe")
        print(time.time() - start_time)
    return data
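A possible call of loading_data, assuming a raw file data/comments.csv with a "Comment" column (the file and folder names here are hypothetical):

# First run: reads data/comments.csv, spell-corrects it and writes data/commentsP.csv.
# Later runs: load data/commentsP.csv directly and skip the slow correction step.
data = loading_data('comments', 'data', 'csv')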
Example no. 4
def preprocess_text(doc):

    # Lowercase
    doc = doc.lower()

    # Remove Numbers
    doc = re.sub(r'\d+', '', doc)

    # Transliterate non-ASCII characters to plain ASCII with unidecode
    doc = unidecode.unidecode(doc)

    # Remove special characters
    doc = re.sub(r'[^\w\s]', '', doc)

    # Spelling Check
    spell = Speller(lang='en')
    doc = spell(doc)

    # Remove stopwords and lemmatize
    doc = [lemmer.lemmatize(w) for w in doc.split() if w not in stop_word]

    # Remove noise tokens, e.g. lemmatization artifacts like 'wa' (from 'was') and 'ha' (from 'has')
    doc1 = []
    common_and_rare_words = ['wa', 'ha', 'ive', 'im', 'youd', 'names']
    for wd in doc:
        if wd not in common_and_rare_words:
            doc1.append(wd)

    return ' '.join(doc1)
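preprocess_text assumes a few module-level objects (lemmer, stop_word) that the snippet does not define. A sketch of the setup it seems to expect, assuming the nltk corpora are already downloaded:

import re
import unidecode
from autocorrect import Speller
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmer = WordNetLemmatizer()
stop_word = set(stopwords.words('english'))

cleaned = preprocess_text("The 2 furrry cats were running FAST!!")
print(cleaned)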
Example no. 5
    def my_tokenizer(self, df, col_name):
        """
        Creates a new column "tokens" in the dataframe, from column 'col_name'.
        Creates a list of lowercase words.
        Removes stopwords.
        Removes strings that contain no letters."""

        # tokenize
        df['tokens'] = df[col_name].apply(lambda x: nltk.word_tokenize(x))
        
        # lowercase
        df['tokens'] = df.tokens.apply(lambda l: [w.lower() for w in l])

        def real_word_filter():
            # load the stopword set and filter stopwords out
            stops = set(stopwords.words('english'))
            df['tokens'] = df.tokens.apply(lambda l: [w for w in l if w not in stops])

            # keep only words with at least two letters, and keep those with dash (like 'fast-growing')
            #pattern = r"[a-z]+"
            pattern = r"^[a-z]+[-]?[a-z]+$"
            df['tokens'] = df.tokens.apply(lambda l: [w for w in l if bool(re.match(pattern, w))])

        # remove stopwords and apply the pattern match
        real_word_filter()

        # spell checker
        spell = Speller()
        df['tokens'] = df.tokens.apply(lambda l: [spell(w) for w in l])

        # remove stopwords and apply the pattern match a second time, after the spell checker
        real_word_filter()
Example no. 6
 def processChunk(self, list, output, procId):
     objs = {
         'speller': Speller(),
         'lemmatizer': WordNetLemmatizer(),
         'stemmer': nltk.stem.SnowballStemmer('english'),
         'mapper': Word2VecMapper(),
         'stop': set(stopwords.words('english'))
     }
     csv = pd.DataFrame(columns=["id", "embedding", "polarity"])
     flags = copy.copy(self.flags)
     for idx, value in enumerate(list):
         if (idx % 10 == 0):
             LOGGER.debug('{}, done {}/{}'.format(
                 procId, idx + 1,
                 int(len(self.data_set) / self.numberOfProcesses)))
         csv = self.processSingleDataSetValue(value[0], value[1], value[2],
                                              output, objs, flags, csv)
     LOGGER.debug("{}, finished processing".format(procId))
     # output.cancel_join_thread()
     if ("-csv" in sys.argv):
         path = "processed_data_set" if self.set else "processed_test_set"
         csv.to_csv(self.config.readValue(path).split(".")[0] + "_" +
                    str(procId) + ".csv",
                    sep=";",
                    index=False)
Example no. 7
def text_preprocessing(data, text_cols):

    lemmatizer = WordNetLemmatizer()
    combined = pd.concat([data['train'], data['test']], axis=0)

    spell = Speller(fast=True)
    for col in text_cols:
        combined[col] = combined[col].apply(lambda x: x.lower()
                                            if isinstance(x, str) else x)

    stop_words = set(stopwords.words('english'))

    for col in text_cols:
        preprocessed_text = []
        for words in combined[col]:
            if words is not np.nan:
                words = word_tokenize(words)
                words = [word for word in words if word.isalpha()]
                words = [word for word in words if word not in stop_words]
                words = [spell(word) for word in words]
                words = [lemmatizer.lemmatize(word) for word in words]

                preprocessed_text.append(' '.join(words))

            else:
                preprocessed_text.append(np.nan)

        combined[col] = preprocessed_text
    data['train'] = combined.iloc[:len(data['train'])]
    data['test'] = combined.iloc[len(data['train']):]
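Example no. 7 builds its speller with fast=True. As far as the autocorrect package documents it, fast mode only considers corrections that are a single edit away, trading some accuracy for speed on large text columns. A small sketch of the two variants:

from autocorrect import Speller

spell_default = Speller()        # more thorough, noticeably slower on long columns
spell_fast = Speller(fast=True)  # the variant used in the example above

print(spell_fast("customer respones were possitive"))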
Example no. 8
 def __init__(self,
              numberOfProcesses=mp.cpu_count() - 1,
              optional_length=None):
     if "--log" in sys.argv:
         logging.basicConfig(level=logging.DEBUG)
     if "-embedding" in sys.argv:
         self.EMBEDDING = True
     else:
         self.EMBEDDING = False
     self.EMBEDDING_LENGTH = 300
     self.SEQUENCE_LENGTH = 2665
     self.TIMEOUT = 15
     self.OVERHEAD_TIMEOUT = 45
     self.explorer = DataExplorer()
     self.resultsProcessor = ResultsProcessor()
     self.englishStopWords = set(stopwords.words('english'))
     self.text = ''
     self.config = Config()
     self.flags = {
         'spelling': False,
         'stopWords': False,
         'lemmatize': False,
         'stem': False,
     }
     self.speller = Speller()
     self.lemmatizer = WordNetLemmatizer()
     self.stemmer = nltk.stem.SnowballStemmer('english')
     self.numberOfProcesses = numberOfProcesses
     self.mapper = Word2VecMapper()
     self.optional_length = optional_length
     self.config = Config()
Example no. 9
def predict(image_name):
    output_image_path = os.path.join("api", "temp")
    input_image_path = os.path.join(output_image_path, image_name)

    tokenizer = Tokenizer()

    model = MyModel(vocab_size=tokenizer.vocab_size,
                    beam_width=20,
                    stop_tolerance=15,
                    reduce_tolerance=10)
    model.compile(learning_rate=0.001)
    model.load_checkpoint(target=target_path)

    imgproc.__execute__(input_image_path, output_image_path)

    text = []
    confidence = []

    image_lines = sorted(
        glob(
            os.path.join(output_image_path,
                         image_name.split('.')[0], "lines", "*.png")))

    for img in image_lines:
        img = pp.preprocess_image(img, target_image_size, predict=True)
        img = pp.normalization([img])

        predicts, probabilities = model.predict(img, ctc_decode=True)

        predicts = tokenizer.sequences_to_texts(predicts)
        confidence.append(f"{predicts[0]} ==> {probabilities[0]}")
        text.append(Speller("en").autocorrect_sentence(predicts[0][0]))

    return "\n".join(text)
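Example no. 9 constructs a new Speller("en") inside the loop, once per recognized line. Since building a Speller loads its word-frequency data, a small refactor (a sketch only, otherwise the same logic) creates it once before the loop:

spell = Speller("en")  # build once and reuse for every recognized line

for img in image_lines:
    img = pp.preprocess_image(img, target_image_size, predict=True)
    img = pp.normalization([img])

    predicts, probabilities = model.predict(img, ctc_decode=True)

    predicts = tokenizer.sequences_to_texts(predicts)
    confidence.append(f"{predicts[0]} ==> {probabilities[0]}")
    text.append(spell.autocorrect_sentence(predicts[0][0]))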
Example no. 10
class AutoCorrectSpellingChecker(SpellingChecker):
    def __init__(self, spelling_config=None):
        SpellingChecker.__init__(self, spelling_config)
        self._speller = Speller()

    def correct(self, phrase):
        return self._speller.autocorrect_sentence(phrase).upper()
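Hypothetical usage of the wrapper above, assuming the SpellingChecker base class only stores the configuration:

checker = AutoCorrectSpellingChecker()
print(checker.correct("helo wrold"))  # corrected by autocorrect_sentence, then upper-cased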
Example no. 11
    def _create_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        spell = Speller(lang='en')
        for (i, line) in enumerate(lines):
            if i == 0 or line == []:
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = line[3]  # generate_misspelling(line[3])
            #try:
            #    text_a = spell(text_a)
            #except:
            #    pass

            text_b = line[4]

            #pos = text_b.find(text_a)
            #text_a = text_b[:pos] + " <b> " + text_b[pos:pos + len(text_a)] + " </b> " + text_b[pos + len(text_a):]
            #text_b = None

            if len(line) < 6 or line[5] == '?':
                label = self.get_labels()[0]
            else:
                label = line[5]
            examples.append(
                InputExample(guid=guid,
                             text_a=text_a,
                             text_b=text_b,
                             label=label))
        return examples
Example no. 12
    def process(self, message, **kwargs):

        from autocorrect import Speller
        spell = Speller(lang='en')
        mesg = message.text  # get the original message
        text = spell(mesg)  # correct the message with autocorrect
        message.text = text  # set the corrected message for the next components to process
Example no. 13
def replace_repetition(text, spell=True):
    text = re.sub(r"((\w)\2{2,})", r'\2', text)
    if spell:
        spell = Speller(lang='en')
        text = spell(text)

    return text
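The regular expression in replace_repetition collapses any word character repeated three or more times down to a single occurrence before the optional spell pass. A quick, hypothetical check of both modes:

sample = "this is sooooo coool"
print(replace_repetition(sample, spell=False))  # only the repeated letters are collapsed
print(replace_repetition(sample))               # collapsed first, then run through Speller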
Example no. 14
def preprocess(data, colname, applyList=[1, 2, 3]):
    def docPreprocess(badText):
        funcDict = {}
        funcDict[1] = lambda x: [spell(word) for word in x]
        funcDict[2] = lambda x: [
            nltk.stem.SnowballStemmer('english').stem(word) for word in x
        ]
        funcDict[3] = lambda x: [
            nltk.stem.WordNetLemmatizer().lemmatize(word)
            for word in x
        ]
        betterText = badText.split()
        betterText = [word for word in betterText if word not in stopWordsSet]
        betterText = [word for word in betterText if word not in freq]
        for i in applyList:
            betterText = funcDict[i](betterText)
        return betterText

    stopWordsSet = set(nltk.corpus.stopwords.words('english'))
    spell = Speller(lang='en')
    #
    data = data.copy()
    data[colname] = (data[colname].str.replace(r"[0-9]", "", regex=True)
                     .str.replace(r"[^\s\w]", "", regex=True)
                     .str.lower()
                     .str.encode('ascii', 'ignore')
                     .str.decode('ascii'))
    freq = pd.Series(' '.join(data[colname]).split()).value_counts()[:10]
    data[colname] = data[colname].apply(docPreprocess)
    data = data.dropna()
    return data
Example no. 15
 def removeStopWords(self, text):
     spell = Speller(lang='en')
     clean_word_list = spell(text)
     clean_word_list = [
         word for word in clean_word_list.split() if word not in stoplist
     ]
     return clean_word_list
Example no. 16
def func(txt):
    tokenized = txt
    stop_words = set(stopwords.words('english'))
    wordslist = nltk.word_tokenize(tokenized)
    tagged = nltk.pos_tag(wordslist)
    spell = Speller(lang='en')
    words = [(spell(w), tag) for w, tag in tagged if w not in stop_words]
    return words
Example no. 17
def remove_spelling_errors(words):
    '''Correct spelling errors via the Speller module.'''
    spell = Speller()
    new_list = []
    for word in words:
        word_new = spell(word)
        new_list.append(word_new)
    return new_list
Example no. 18
def auto_correct(text):
    spell = Speller(lang='en')
    tokens = word_tokenize(text)
    filtered_text = []
    for i in tokens:
        filtered_text.append(spell(i))
    filtered = ' '.join(filtered_text)
    return filtered
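A possible call of auto_correct (nltk's punkt tokenizer data must be available for word_tokenize):

print(auto_correct("thiss sentense has a few typoes"))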
Example no. 19
def stem_words(tokenized_data):
    stemmer = PorterStemmer()
    spell = Speller(lang='en')

    for i in range(len(tokenized_data)):
        tokenized_data[i] = stemmer.stem(tokenized_data[i])

    return tokenized_data
Example no. 20
def itt_OCR(image,
            config='--psm 4 --oem 1'):  # oem 1, 2 -- psm 6, 11, 4, 1, 3, 12
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
    text = pytesseract.image_to_string(image, lang='eng', config=config)
    spell = Speller()
    text = spell(text)

    print("text is: \n" + text)
    return text
Example no. 21
 def process_not_found_node(self, w):
     w = self.no_accent_vietnamese(w)
     spell = Speller(lang='en')
     predicted = spell(w)
     return {
         'prediction': '' if predicted == w else predicted,
         'm_eng': {},
         'm_vn': {}
     }
Example no. 22
def main(args):
    vocab = build_vocab(json=args.caption_path,
                        threshold=args.threshold,
                        spell=Speller())
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))
Example no. 23
 def __init__(self, corpus_size):
     self.vector_size = 300
     self.speller_obj = Speller(lang='en')
     self.stop_words = many_stop_words.get_stop_words("en")
     self.spacy_obj = spacy.load('en_core_web_sm')
     self.tokenizer_obj = Tokenizer(num_words=corpus_size,
                                    oov_token="<OOV>")
     with open("normalize_mapping.json") as normalize_file_obj:
         self.normalize_mapping = json.load(normalize_file_obj)
Example no. 24
def correct_spelling(data):
    spell = Speller(fast=True)
    tweets = {}
    for key, value in data:
        no_sym_emojis = re.sub(r' \:\) | \:\( | \:D | \:P | \:o | xD | \:\/ ',
                               ' ', value)
        corrected = spell(no_sym_emojis)
        tweets[key] = " ".join(corrected.split())
    return tweets
Example no. 25
def correct_sentence(line):
    spell = Speller()
    lines = line.strip().split(' ')
    new_line = ""
    similar_word = {}
    for l in lines:
        new_line += spell(l) + " "
    # similar_word[l]=spell.candidates(l)
    return new_line
Example no. 26
def correct_word(word):
    spell = Speller(lang='en')
    words = word.strip().split(' ')
    new_word = ""
    similar_word = {}
    for l in words:
        new_word += spell(l) + " "
    # similar_word[l]=spell.candidates(l)
    return new_word
Example no. 27
def _main_():
    processed_file_datasatisfaction = 'satisfaction_ratingP'
    processed_file_npsresponse = 'NPS_responsesP'
    if os.path.exists(
            os.path.join(os.getcwd(), 'data',
                         processed_file_datasatisfaction + '.csv')):
        data_satisfaction = loading_data(
            os.path.join(os.getcwd(), 'data',
                         processed_file_datasatisfaction + '.csv'))
    else:
        start_time = time.time()
        spell = Speller(lang='es')
        data_satisfaction = loading_data('data/satisfaction_Ratings.csv')
        data_satisfaction = spell_correction(data_satisfaction, 'Comment',
                                             'processed comment', spell)
        save_dfdata(data_satisfaction, processed_file_datasatisfaction)
        print(time.time() - start_time)

    if os.path.exists(
            os.path.join(os.getcwd(), 'data',
                         processed_file_npsresponse + '.csv')):
        data_responses = loading_data(
            os.path.join(os.getcwd(), 'data',
                         processed_file_npsresponse + '.csv'))
    else:
        start_time = time.time()
        spell = Speller(lang='es')
        data_responses = loading_data('data/NPS_Responses.csv')
        data_responses = spell_correction(data_responses, 'Comment',
                                          'processed comment', spell)
        save_dfdata(data_responses, processed_file_npsresponse)
        print(time.time() - start_time)

    print('there are ' + str(len(data_satisfaction['Ticket Id'].unique())) +
          ' unique tickets')
    print('there are ' + str(len(data_satisfaction)) + ' tickets')
    duplicate_tickets = data_satisfaction.groupby('Ticket Id').size(
    ).sort_values(ascending=False).reset_index(name='tickets count')
    duplicate_example = data_satisfaction[data_satisfaction['Ticket Id'] ==
                                          duplicate_tickets['Ticket Id'][3]]

    satisfaction_wordcount = word_count(data_satisfaction, 'processed comment')
    print('finished...')
Example no. 28
def get_bot_response():
    spell = Speller()
    userText = request.args.get('msg')
    response = str(chat.chatbot_response(userText))
    if response in [
            "Sorry, can't understand you", "Please give me more info",
            "Not sure I understand"
    ]:
        userText = spell(userText)
        response = str(chat.chatbot_response(userText))
    return response
Example no. 29
 def correct_sentence(self, text):
     '''Correct the typos in a piece of text, word by word.'''
     words = text.split()
     corrected_string = []
     spell = Speller(lang='en')
     for w in words:
         lw = w.lower()
         lw = spell(lw)
         corrected_string.append(lw)
     return " ".join(corrected_string)
Example no. 30
def check_spelling(search_word):
    spell = Speller(lang='en')
    search = search_word.split()
    corrected = []

    for word in search:
        corrected.append(spell(word))

    corrected = ' '.join(corrected)

    return corrected
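Finally, a hypothetical call of the search-query helper above:

corrected_query = check_spelling("pyhton tutorail")
print(corrected_query)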