import io
import os
import re

import nltk
import pandas as pd
from nltk.tokenize.toktok import ToktokTokenizer

# Assumption: TRAINING_FOLDER_PATH is configured by the project; the value
# below is only a placeholder.
TRAINING_FOLDER_PATH = "training_texts"


def write_in_file():
    """Tokenize and lemmatize the first training files, writing the
    cumulative cleaned text of files 0..i to training_text_file/<i+1>yo.txt.
    """
    # Get the files
    list_of_files = os.listdir(TRAINING_FOLDER_PATH)
    number_of_file = len(list_of_files)

    # Initialise the lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()

    # Initialise the tokenizer. By default toktok escapes '& ' to '&amp; ';
    # swap that substitution for an identity one so ampersands survive.
    # (AMPERCENT is NLTK's own spelling of the attribute.)
    tokenizer = ToktokTokenizer()
    tokenizer.AMPERCENT = re.compile('& '), '& '
    tokenizer.TOKTOK_REGEXES = [
        (regex, sub) if sub != '&amp; ' else (re.compile('& '), '& ')
        for (regex, sub) in ToktokTokenizer.TOKTOK_REGEXES
    ]
    toktok = tokenizer.tokenize

    total_text = pd.Series([], dtype=object)

    # Loop over the first 11 files (guarding against a smaller folder)
    for i in range(min(11, number_of_file)):
        file_name = list_of_files[i]
        print(i)

        # Open and tokenize the file
        with open(os.path.join(TRAINING_FOLDER_PATH, file_name), 'r',
                  encoding="utf8") as text:
            string_text = text.read()
        splitted = toktok(string_text)

        # Lemmatize
        lemmatized = [lemmatizer.lemmatize(t) for t in splitted]
        tokens = pd.Series(lemmatized)

        # Group all the numbers under the same name
        tokens.loc[tokens.apply(lambda x: x.isnumeric())] = "NUMBER"

        # Series.append was removed in pandas 2.0; concat is the replacement
        total_text = pd.concat([total_text, tokens], ignore_index=True)

        # Write the cumulative text in a file
        txtfilename = "training_text_file/" + str(i + 1) + "yo.txt"
        with io.open(txtfilename, "w", encoding="utf-8") as f:
            for item in total_text:
                f.write("%s " % item)
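# Both functions here rely on the WordNet corpus being available on disk for
# nltk.WordNetLemmatizer. A minimal one-time setup sketch using NLTK's
# standard downloader (the helper name download_wordnet is ours, for
# illustration); safe to re-run, since NLTK caches downloaded data.
def download_wordnet():
    nltk.download("wordnet")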
def lemmatize_10tranches(path, new_name):
    """Clean one of the 10-tranche texts: lowercase it, lemmatize the
    tokens, and regroup the numbers under a single NUMBER token, then
    write the result to new_name.
    """
    # Initialise the lemmatizer
    lemmatizer = nltk.WordNetLemmatizer()

    # Initialise the tokenizer, with the same ampersand fix as above
    tokenizer = ToktokTokenizer()
    tokenizer.AMPERCENT = re.compile('& '), '& '
    tokenizer.TOKTOK_REGEXES = [
        (regex, sub) if sub != '&amp; ' else (re.compile('& '), '& ')
        for (regex, sub) in ToktokTokenizer.TOKTOK_REGEXES
    ]
    toktok = tokenizer.tokenize

    # Open, lowercase and tokenize the file
    with open(path, 'r', encoding="utf8") as text:
        string_text = text.read().lower()
    splitted = toktok(string_text)

    # Lemmatize
    lemmatized = [lemmatizer.lemmatize(t) for t in splitted]
    tokens = pd.Series(lemmatized)

    # Group all the numbers under the same name
    tokens.loc[tokens.apply(lambda x: x.isnumeric())] = "NUMBER"

    # Write the cleaned tokens in a file
    with io.open(new_name, "w", encoding="utf-8") as f:
        for item in tokens:
            f.write("%s " % item)
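# Usage sketch. The tranche file names below are placeholders for
# illustration, not paths from the original project; write_in_file() also
# expects TRAINING_FOLDER_PATH and the training_text_file/ output folder
# to exist.
if __name__ == "__main__":
    download_wordnet()
    write_in_file()
    lemmatize_10tranches("tranche_1.txt", "tranche_1_clean.txt")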