def from_file(self, filename):
    """Convert the file to sentence tokens.

    This is dangerous if the file is very large. If memory were a
    bottleneck, we'd need to be trickier about reading in chunks,
    checking for valid sentences, and resetting the file pointer to
    the last found sentence.
    """
    with open(filename, 'r') as data:
        return self.from_text(data.read())
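# The docstring above sketches a chunked-reading strategy for very large files.
# Below is a minimal, hypothetical illustration of that idea: it buffers chunks
# and only tokenizes up to the last terminal punctuation mark, instead of
# seeking the file pointer. The method name, chunk_size, and the
# sentence-boundary heuristic are assumptions; only from_text comes from the
# original class, and it is assumed to return a list of sentence tokens.
def from_file_chunked(self, filename, chunk_size=65536):
    """Convert a large file to sentence tokens without reading it all at once."""
    tokens = []
    buffer = ""
    with open(filename, 'r') as data:
        while True:
            chunk = data.read(chunk_size)
            if not chunk:
                break
            buffer += chunk
            # Tokenize everything up to the last complete sentence seen so far
            cut = max(buffer.rfind('.'), buffer.rfind('!'), buffer.rfind('?'))
            if cut != -1:
                tokens.extend(self.from_text(buffer[:cut + 1]))
                buffer = buffer[cut + 1:]
    # Whatever remains has no terminal punctuation; tokenize it as-is
    if buffer.strip():
        tokens.extend(self.from_text(buffer))
    return tokens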
def pre_process_single_file(original_file, discourse_project_input_dir):
    # Check whether the output directory exists; if not, create it
    if not os.path.isdir("output"):
        os.mkdir("output")
    # Extract just the file name from the path
    filename = original_file.split(os.path.sep)[-1]
    with open(original_file, mode='r', encoding='utf-8') as data:
        originalData = data.read()
    restructured_text = re_structure_text(originalData)
    create_txt_file(restructured_text, filename)
    xml_filename = createXmlDocument_v2(restructured_text, filename,
                                        path=discourse_project_input_dir)
    read_xml_file(xml_filename, filename)
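# Hypothetical usage of the preprocessing pipeline above. Both paths are
# placeholders, and re_structure_text, create_txt_file, createXmlDocument_v2,
# and read_xml_file are assumed to be defined elsewhere in the project.
if __name__ == "__main__":
    pre_process_single_file("data/sample_article.txt", "discourse_project/input")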
import string

import nltk.data
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def read_file(path, stop_words):
    translator = str.maketrans('', '', string.punctuation)
    with open(path) as data:
        file = data.read()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    text = sent_detector.tokenize(file.strip())
    sentences = []
    for sentence in text:
        new_sentence = sentence.translate(translator)
        sentences.append(word_tokenize(new_sentence.lower()))
    # Drop the last tokenized sentence
    sentences.pop()
    new_file = file.translate(translator)
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(new_file.lower())
    word_list = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return text, sentences, word_list
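# Example call, assuming NLTK's English stop-word list; 'sample.txt' is a
# placeholder file name, not one used by the original code.
from nltk.corpus import stopwords

text, sentences, word_list = read_file('sample.txt', set(stopwords.words('english')))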
import csv
import os

import nltk.data

# Load the English Punkt sentence tokenizer from NLTK
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

DATA_DIR = "../data/OANC-GrAF/data/written_1/journal/verbatim_txt/"
CSV_FILE_NAME = 'verbatim.csv'

csv_file = open(CSV_FILE_NAME, 'w', newline='')
csv_writer = csv.writer(csv_file, delimiter=',')
csv_writer.writerow(
    ['file_id', 'num_sent', 'word_count',
     'count_the', 'count_this', 'count_that', 'count_these', 'count_those',
     'count_a', 'count_an', 'count_one',
     'freq_the', 'freq_this', 'freq_that', 'freq_these', 'freq_those',
     'freq_a', 'freq_an', 'freq_one'])

with os.scandir(DATA_DIR) as all_files:
    for entry in all_files:
        data = open(os.path.join(DATA_DIR, entry.name))
        tokenized_sentences = sent_detector.tokenize(
            data.read().replace("\n", " ").replace("\t", ""))
        # print(tokenized_sentences)
        num_sent = len(tokenized_sentences)
        word_count = {"the": 0, "this": 0, "that": 0, "these": 0,
                      "those": 0, "a": 0, "an": 0, "one": 0}
        all_word_count = 0
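        # The original loop body is truncated above. The lines below are a
        # minimal sketch of how the per-file counts and frequencies implied by
        # the CSV header might be produced; splitting on whitespace and
        # defining each freq_* as count / total word count are assumptions,
        # not taken from the original script.
        for sentence in tokenized_sentences:
            for token in sentence.lower().split():
                all_word_count += 1
                if token in word_count:
                    word_count[token] += 1
        determiners = ["the", "this", "that", "these", "those", "a", "an", "one"]
        counts = [word_count[w] for w in determiners]
        freqs = [c / all_word_count if all_word_count else 0 for c in counts]
        csv_writer.writerow([entry.name, num_sent, all_word_count] + counts + freqs)
        data.close()

csv_file.close()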
Original source: https://gist.github.com/Ghost---Shadow/c361f2d6b4501f40648b#file-plag-py

@author: Quan
"""
from nltk.corpus import wordnet
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import re
from nltk.tokenize import word_tokenize
from random import randint
import nltk.data

from preprocessing.tokenizer_rinehart import tokenizer

# Load a text file if required from ./data folder
path = './data/Doyle.txt'
data = open(path, "r+")
text = data.read()
# text = "Pete ate a large cake. Sam has a big mouth."

# Tokenize the text
tokenized = tokenizer(text)

# Get the list of words from the entire text
words = word_tokenize(text)

# Identify the parts of speech
tagged = nltk.pos_tag(words)


def replace(words, tagged):
    output = ""
    for i in range(0, len(words)):
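        # The loop body is truncated in the original. The following is a
        # minimal sketch of one plausible completion: content words (nouns,
        # verbs, adjectives, adverbs, judging by the POS tags computed above)
        # are swapped for a randomly chosen WordNet synonym, everything else
        # is kept unchanged. This is an assumption about the intent, not the
        # gist's exact logic.
        word = words[i]
        tag = tagged[i][1]
        replacement = word
        if tag.startswith(('NN', 'VB', 'JJ', 'RB')):
            synonyms = [lemma.name() for syn in wordnet.synsets(word)
                        for lemma in syn.lemmas()
                        if lemma.name().lower() != word.lower()]
            if synonyms:
                replacement = synonyms[randint(0, len(synonyms) - 1)].replace('_', ' ')
        output += replacement + " "
    return output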