def get_lastest_reading():
    # Poll the most recently detected label and accumulate it into a phrase;
    # when a "dot" label arrives, spell-correct the phrase, optionally translate it,
    # and speak it with gTTS on a background thread.
    global phrase, lastest_reading
    while get_lastest_reading_thread.is_alive():
        label = lastest_reading
        if label != "":
            if label == "dot":
                if phrase != "":
                    spell = Speller()
                    phrase = spell(phrase)
                    if lang != "en":
                        translate_client = translate.Client()
                        translated_text = translate_client.translate(
                            phrase, target_language=lang)["translatedText"]
                        voice = gTTS(
                            text=translated_text,
                            lang=lang,
                            tld=top_domain[output_language],
                            slow=False)
                    else:
                        voice = gTTS(
                            text=phrase,
                            lang=lang,
                            tld=top_domain[output_language],
                            slow=False)
                    voice_file = "voice-{}.mp3".format(str(randint(1, 9999999)))
                    voice.save(voice_file)
                    play_audio_thread = threading.Thread(target=play_audio, args=[voice_file])
                    play_audio_thread.daemon = True
                    play_audio_thread.start()
                    print(TGREEN + "audio:" + phrase, ENDC)
                    phrase = ""
            elif label == "space":
                phrase += " "
                spell = Speller()
                phrase = spell(phrase)
                print(TGREEN + phrase, ENDC)
            else:
                phrase += label
                print(TGREEN + label, ENDC)
            time.sleep(detection_speed)
            lastest_reading = ""

def correctSpelling(self):
    speller = Speller()
    word_tokens = word_tokenize(self.text)
    lengthened = [self.reduce_lengthening(word) for word in word_tokens]
    spelled = [speller.autocorrect_word(word) for word in lengthened]
    self.text = " ".join(spelled)
    return self

def loading_data(filename, basepath, extension):
    # Assumptions: the processed file is saved in the same folder under the pattern filename + 'P',
    # and a column named "Comment" exists, on which the text processing is performed.
    # The function reads the csv file and returns a pandas dataframe.
    # Spell correction is time demanding, so the processed result is saved once and
    # loaded on later calls instead of being recomputed.
    processed_filename = filename + 'P'
    processed_path = os.path.join(basepath, processed_filename + '.' + extension)
    # If the processed file already exists, load it.
    if os.path.exists(processed_path):
        data = pd.read_csv(processed_path)
    else:
        start_time = time.time()
        # Load the speller
        spell = Speller(lang='es')
        # Load the raw data
        data_init = pd.read_csv(os.path.join(basepath, filename + '.' + extension))
        # Perform spell correction and lowercase the comment column
        data = spell_correction(data_init, 'Comment', 'processed comment', spell)
        # Eliminate stopwords
        data = stopwords_correction(data, 'processed comment')
        # Save the file in the processed-file format
        try:
            data.to_csv(processed_path)
        except Exception:
            print("Oops!", sys.exc_info()[0], "occurred.")
            raise Exception("Couldn't save the dataframe")
        print(time.time() - start_time)
    return data

def preprocess_text(doc):
    # Lowercase
    doc = doc.lower()
    # Remove numbers
    doc = re.sub(r'\d+', '', doc)
    # Transliterate unicode characters to ASCII
    doc = unidecode.unidecode(doc)
    # Remove special characters
    doc = re.sub(r'[^\w\s]', '', doc)
    # Spelling check
    spell = Speller(lang='en')
    doc = spell(doc)
    # Remove stopwords and lemmatize
    doc = [lemmer.lemmatize(w) for w in doc.split() if w not in stop_word]
    # Remove rare words generated by lemmatization
    doc1 = []
    common_and_rare_words = ['wa', 'ha', 'ive', 'im', 'youd', 'names']
    for wd in doc:
        if wd not in common_and_rare_words:
            doc1.append(wd)
    return ' '.join(doc1)

def my_tokenizer(self, df, col_name):
    """Creates a new column "tokens" in the dataframe from column 'col_name'.
    Creates a list of lowercase words, removes stopwords and removes strings containing no letters."""
    # tokenize
    df['tokens'] = df[col_name].apply(lambda x: nltk.word_tokenize(x))
    # lowercase
    df['tokens'] = df.tokens.apply(lambda l: [w.lower() for w in l])

    def real_word_filter():
        # load and remove the set of stopwords
        stops = set(stopwords.words('english'))
        df['tokens'] = df.tokens.apply(lambda l: [w for w in l if w not in stops])
        # keep only words with at least two letters, allowing a dash (like 'fast-growing')
        # pattern = r"[a-z]+"
        pattern = r"^[a-z]+[-]?[a-z]+$"
        df['tokens'] = df.tokens.apply(lambda l: [w for w in l if bool(re.match(pattern, w))])

    # remove stopwords and apply the pattern match
    real_word_filter()
    # spell checker
    spell = Speller()
    df['tokens'] = df.tokens.apply(lambda l: [spell(w) for w in l])
    # remove stopwords and apply the pattern match a second time, after the spell checker
    real_word_filter()

def processChunk(self, list, output, procId):
    objs = {
        'speller': Speller(),
        'lemmatizer': WordNetLemmatizer(),
        'stemmer': nltk.stem.SnowballStemmer('english'),
        'mapper': Word2VecMapper(),
        'stop': set(stopwords.words('english'))
    }
    csv = pd.DataFrame(columns=["id", "embedding", "polarity"])
    flags = copy.copy(self.flags)
    for idx, value in enumerate(list):
        if idx % 10 == 0:
            LOGGER.debug('{}, done {}/{}'.format(
                procId, idx + 1, int(len(self.data_set) / self.numberOfProcesses)))
        csv = self.processSingleDataSetValue(value[0], value[1], value[2],
                                             output, objs, flags, csv)
    LOGGER.debug("{}, finished processing".format(procId))
    # output.cancel_join_thread()
    if "-csv" in sys.argv:
        path = "processed_data_set" if self.set else "processed_test_set"
        csv.to_csv(self.config.readValue(path).split(".")[0] + "_" + str(procId) + ".csv",
                   sep=";", index=False)

def text_preprocessing(data, text_cols):
    lemmatizer = WordNetLemmatizer()
    combined = pd.concat([data['train'], data['test']], axis=0)
    spell = Speller(fast=True)
    for col in text_cols:
        combined[col] = combined[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
    stop_words = set(stopwords.words('english'))
    for col in text_cols:
        preprocessed_text = []
        for words in combined[col]:
            if words is not np.nan:
                words = word_tokenize(words)
                words = [word for word in words if word.isalpha()]
                words = [word for word in words if word not in stop_words]
                words = [spell(word) for word in words]
                words = [lemmatizer.lemmatize(word) for word in words]
                preprocessed_text.append(' '.join(words))
            else:
                preprocessed_text.append(np.nan)
        combined[col] = preprocessed_text
    data['train'] = combined.iloc[:len(data['train'])]
    data['test'] = combined.iloc[len(data['train']):]

def __init__(self, numberOfProcesses=mp.cpu_count() - 1, optional_length=None):
    if "--log" in sys.argv:
        logging.basicConfig(level=logging.DEBUG)
    self.EMBEDDING = "-embedding" in sys.argv
    self.EMBEDDING_LENGTH = 300
    self.SEQUENCE_LENGTH = 2665
    self.TIMEOUT = 15
    self.OVERHEAD_TIMEOUT = 45
    self.explorer = DataExplorer()
    self.resultsProcessor = ResultsProcessor()
    self.englishStopWords = set(stopwords.words('english'))
    self.text = ''
    self.config = Config()
    self.flags = {
        'spelling': False,
        'stopWords': False,
        'lemmatize': False,
        'stem': False,
    }
    self.speller = Speller()
    self.lemmatizer = WordNetLemmatizer()
    self.stemmer = nltk.stem.SnowballStemmer('english')
    self.numberOfProcesses = numberOfProcesses
    self.mapper = Word2VecMapper()
    self.optional_length = optional_length

def predict(image_name):
    output_image_path = os.path.join("api", "temp")
    input_image_path = os.path.join(output_image_path, image_name)
    tokenizer = Tokenizer()
    model = MyModel(vocab_size=tokenizer.vocab_size,
                    beam_width=20,
                    stop_tolerance=15,
                    reduce_tolerance=10)
    model.compile(learning_rate=0.001)
    model.load_checkpoint(target=target_path)
    imgproc.__execute__(input_image_path, output_image_path)
    text = []
    confidence = []
    image_lines = sorted(
        glob(
            os.path.join(output_image_path, image_name.split('.')[0], "lines", "*.png")))
    for img in image_lines:
        img = pp.preprocess_image(img, target_image_size, predict=True)
        img = pp.normalization([img])
        predicts, probabilities = model.predict(img, ctc_decode=True)
        predicts = tokenizer.sequences_to_texts(predicts)
        confidence.append(f"{predicts[0]} ==> {probabilities[0]}")
        text.append(Speller("en").autocorrect_sentence(predicts[0][0]))
    return "\n".join(text)

class AutoCorrectSpellingChecker(SpellingChecker):
    def __init__(self, spelling_config=None):
        SpellingChecker.__init__(self, spelling_config)
        self._speller = Speller()

    def correct(self, phrase):
        return self._speller.autocorrect_sentence(phrase).upper()

def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    spell = Speller(lang='en')
    for (i, line) in enumerate(lines):
        if i == 0 or line == []:
            continue
        guid = "%s-%s" % (set_type, i)
        text_a = line[3]  # generate_misspelling(line[3])
        # try:
        #     text_a = spell(text_a)
        # except:
        #     pass
        text_b = line[4]
        # pos = text_b.find(text_a)
        # text_a = text_b[:pos] + " <b> " + text_b[pos:pos + len(text_a)] + " </b> " + text_b[pos + len(text_a):]
        # text_b = None
        if len(line) < 6 or line[5] == '?':
            label = self.get_labels()[0]
        else:
            label = line[5]
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

def process(self, message, **kwargs):
    from autocorrect import Speller
    spell = Speller(lang='en')
    mesg = message.text  # get the original message
    text = spell(mesg)  # correct the message with autocorrect
    message.text = text  # set the corrected message for the next components to process

def replace_repetition(text, spell=True):
    # collapse characters repeated three or more times into a single occurrence
    text = re.sub(r"((\w)\2{2,})", r'\2', text)
    if spell:
        spell = Speller(lang='en')
        text = spell(text)
    return text

def preprocess(data, colname, applyList=[1, 2, 3]):
    def docPreprocess(badText):
        funcDict = {}
        funcDict[1] = lambda x: [spell(word) for word in x]
        funcDict[2] = lambda x: [
            nltk.stem.SnowballStemmer('english').stem(word) for word in x
        ]
        funcDict[3] = lambda x: [
            nltk.stem.WordNetLemmatizer().lemmatize(word) for word in x
        ]
        betterText = badText.split()
        betterText = [word for word in betterText if word not in stopWordsSet]
        betterText = [word for word in betterText if word not in freq]
        for i in applyList:
            betterText = funcDict[i](betterText)
        return betterText

    stopWordsSet = set(nltk.corpus.stopwords.words('english'))
    spell = Speller(lang='en')
    # data = data.copy()
    data[colname] = data[colname].str.replace(r"[0-9]", "", regex=True).str.replace(
        r"[^\s\w]", "", regex=True).str.lower().str.encode('ascii', 'ignore').str.decode('ascii')
    freq = pd.Series(' '.join(data[colname]).split()).value_counts()[:10]
    data[colname] = data[colname].apply(docPreprocess)
    data = data.dropna()
    return data

def removeStopWords(self, text):
    spell = Speller(lang='en')
    clean_word_list = spell(text)
    clean_word_list = [
        word for word in clean_word_list.split() if word not in stoplist
    ]
    return clean_word_list

def func(txt):
    tokenized = txt
    stop_words = set(stopwords.words('english'))
    wordslist = nltk.word_tokenize(tokenized)
    tagged = nltk.pos_tag(wordslist)
    spell = Speller(lang='en')
    words = [(spell(w), tag) for w, tag in tagged if w not in stop_words]
    return words

def remove_spelling_errors(list):
    '''remove spelling errors via the speller module'''
    spell = Speller()
    new_list = []
    for word in list:
        word_new = spell(word)
        new_list.append(word_new)
    return new_list

def auto_correct(text):
    spell = Speller(lang='en')
    tokens = word_tokenize(text)
    filtered_text = []
    for i in tokens:
        filtered_text.append(spell(i))
    filtered = ' '.join(filtered_text)
    return filtered

def stem_words(tokenized_data):
    stemmer = PorterStemmer()
    spell = Speller(lang='en')
    for i in range(len(tokenized_data)):
        tokenized_data[i] = stemmer.stem(tokenized_data[i])
    return tokenized_data

def itt_OCR(image, config='--psm 4 --oem 1'):
    # oem 1, 2 -- psm 6, 11, 4, 1, 3, 12
    pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
    text = pytesseract.image_to_string(image, lang='eng', config=config)
    spell = Speller()
    text = spell(text)
    print("text is: \n" + text)
    return text

def process_not_found_node(self, w):
    w = self.no_accent_vietnamese(w)
    spell = Speller(lang='en')
    predicted = spell(w)
    return {
        'prediction': '' if predicted == w else predicted,
        'm_eng': {},
        'm_vn': {}
    }

def main(args):
    vocab = build_vocab(json=args.caption_path, threshold=args.threshold, spell=Speller())
    vocab_path = args.vocab_path
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)
    print("Total vocabulary size: {}".format(len(vocab)))
    print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

def __init__(self, corpus_size):
    self.vector_size = 300
    self.speller_obj = Speller(lang='en')
    self.stop_words = many_stop_words.get_stop_words("en")
    self.spacy_obj = spacy.load('en_core_web_sm')
    self.tokenizer_obj = Tokenizer(num_words=corpus_size, oov_token="<OOV>")
    with open("normalize_mapping.json") as normalize_file_obj:
        self.normalize_mapping = json.load(normalize_file_obj)

def correct_spelling(data):
    spell = Speller(fast=True)
    tweets = {}
    for key, value in data:
        # strip common text emoticons before spell correction
        no_sym_emojis = re.sub(r' \:\) | \:\( | \:D | \:P | \:o | xD | \:\/ ', ' ', value)
        corrected = spell(no_sym_emojis)
        tweets[key] = " ".join(corrected.split())
    return tweets

def correct_sentence(line):
    spell = Speller()
    lines = line.strip().split(' ')
    new_line = ""
    similar_word = {}
    for l in lines:
        new_line += spell(l) + " "
        # similar_word[l] = spell.candidates(l)
    return new_line

def correct_word(word):
    spell = Speller(lang='en')
    words = word.strip().split(' ')
    new_word = ""
    similar_word = {}
    for l in words:
        new_word += spell(l) + " "
        # similar_word[l] = spell.candidates(l)
    return new_word

def _main_():
    processed_file_datasatisfaction = 'satisfaction_ratingP'
    processed_file_npsresponse = 'NPS_responsesP'
    if os.path.exists(
            os.path.join(os.getcwd(), 'data', processed_file_datasatisfaction + '.csv')):
        data_satisfaction = loading_data(
            os.path.join(os.getcwd(), 'data', processed_file_datasatisfaction + '.csv'))
    else:
        start_time = time.time()
        spell = Speller(lang='es')
        data_satisfaction = loading_data('data/satisfaction_Ratings.csv')
        data_satisfaction = spell_correction(data_satisfaction, 'Comment',
                                             'processed comment', spell)
        save_dfdata(data_satisfaction, processed_file_datasatisfaction)
        print(time.time() - start_time)
    if os.path.exists(
            os.path.join(os.getcwd(), 'data', processed_file_npsresponse + '.csv')):
        data_responses = loading_data(
            os.path.join(os.getcwd(), 'data', processed_file_npsresponse + '.csv'))
    else:
        start_time = time.time()
        spell = Speller(lang='es')
        data_responses = loading_data('data/NPS_Responses.csv')
        data_responses = spell_correction(data_responses, 'Comment',
                                          'processed comment', spell)
        save_dfdata(data_responses, processed_file_npsresponse)
        print(time.time() - start_time)
    print('there are ' + str(len(data_satisfaction['Ticket Id'].unique())) + ' unique tickets')
    print('there are ' + str(len(data_satisfaction)) + ' tickets')
    duplicate_tickets = data_satisfaction.groupby('Ticket Id').size(
    ).sort_values(ascending=False).reset_index(name='tickets count')
    duplicate_example = data_satisfaction[data_satisfaction['Ticket Id'] ==
                                          duplicate_tickets['Ticket Id'][3]]
    satisfaction_wordcount = word_count(data_satisfaction, 'processed comment')
    print('finished...')

def get_bot_response():
    spell = Speller()
    userText = request.args.get('msg')
    response = str(chat.chatbot_response(userText))
    if response in [
            "Sorry, can't understand you", "Please give me more info",
            "Not sure I understand"
    ]:
        # retry with a spell-corrected version of the user's message
        userText = spell(userText)
        response = str(chat.chatbot_response(userText))
    return response

def correct_sentence(self, str):
    '''Corrects typos in the given string.'''
    words = str.split()
    corrected_string = []
    spell = Speller(lang='en')
    for w in words:
        lw = w.lower()
        lw = spell(lw)
        corrected_string.append(lw)
    return " ".join(corrected_string)

def check_spelling(search_word):
    spell = Speller(lang='en')
    search = search_word.split()
    corrected = []
    for word in search:
        corrected.append(spell(word))
    corrected = ' '.join(corrected)
    return corrected

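# All of the snippets above go through the same small surface of the autocorrect
# package: a Speller object that is callable on raw text, its autocorrect_word and
# autocorrect_sentence helpers, and the lang / fast constructor options. The sketch
# below is a minimal standalone illustration of those calls, assuming autocorrect is
# installed; the input strings are invented placeholders and the exact corrections
# returned are not guaranteed.
from autocorrect import Speller

# Default English speller; calling it spell-corrects every word in the string.
spell = Speller(lang='en')
print(spell("ther is a problm with this sentnce"))

# Word- and sentence-level helpers used in several of the snippets above.
print(spell.autocorrect_word("mussage"))
print(spell.autocorrect_sentence("please read this mussage"))

# fast=True trades some accuracy for speed, as in the tweet-processing examples.
fast_spell = Speller(fast=True)
print(fast_spell("ther is a problm"))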