def process(self, filename_read, filename_save, header_string, footer_string, sentence_max_len=15):
    """Read a raw text file, clean and tokenize it into sentences, and write
    a padded token stream to *filename_save*.

    Pipeline: drop header, stop at footer -> lowercase + char translation ->
    whitespace split -> stopword removal -> number conversion -> stemming +
    contraction expansion -> sentence segmentation with SOS/EOS markers ->
    pad every sentence to the longest one and write space-separated tokens.

    Parameters
    ----------
    filename_read : str
        Path of the raw input file (read via ``self.readfile``).
    filename_save : str
        Output path; ``voc.save_data`` writes here first, then the padded
        token stream is written to the same path.
    header_string : str
        Stored on ``self.header_string``; presumably consumed by
        ``self.dropheader`` — confirm against that helper.
    footer_string : str
        Reading stops at the first row containing this substring.
    sentence_max_len : int, optional
        Soft sentence-length bound: a "conditional" sentence end only splits
        once more than this many words have accumulated. Default 15.
    """
    self.header_string = header_string
    rows = self.readfile(filename_read)
    rows = self.dropheader(rows)

    # Concatenate all rows (lowercased, char-translated) into one string
    # held in a shape-(1,) array. NOTE: ``np.str`` was removed in NumPy
    # 1.24 — it was only an alias for the builtin str, used directly here.
    result = np.asarray([' '], dtype=str)
    for row in rows:
        # The footer marks end-of-content: stop reading.
        if row.find(footer_string) != -1:
            break
        row = row.lower().translate(self.translation)
        result = np.char.add(result, np.asarray(row, dtype=str))

    # Whitespace-split into words, then drop stopwords. The explicit
    # ``== False`` is deliberate (not ``not ...``): it preserves the
    # original's handling of any non-bool return from is_stopword.
    self.text = np.char.split(result).tolist()[0]
    self.text = [w for w in self.text if self.is_stopword(w) == False]  # noqa: E712

    # NOTE(review): convert2number receives the *index* x, not the word
    # self.text[x] — looks intentional, but confirm against its signature.
    # The length guard is kept because convert2number may shrink self.text
    # in place (range(len(...)) is evaluated only once).
    for x in range(len(self.text)):
        if x >= len(self.text):
            break
        self.text[x] = self.convert2number(x, 0)

    # Stem each word (unknown words pass through unchanged) and expand
    # contractions. Accumulate in a plain list — O(n) overall instead of
    # the quadratic repeated np.append; the result is only iterated below.
    text_lem = [' ']
    for item in np.asarray(self.text):
        try:
            y = self.stem_words[item]
        except Exception:  # narrowed from bare except; keeps best-effort fallback
            y = item
        text_lem.extend(np.char.split(self.expand_contractions(y)).tolist())

    voc = Vocabulary('vocabulary')
    sos = voc.to_token(self.SOS_token)
    eos = voc.to_token(self.EOS_token)

    # Segment into sentences delimited by SOS/EOS tokens. "and" tokens are
    # excluded from the output stream and stripped from vocabulary chunks.
    sentence = []
    text_out = [sos]
    sen_length = 0
    for word in text_lem:
        if word == " ":
            continue
        word_cleaned = word.translate(self.trans_punctuation)
        sentence.append(word_cleaned)
        sen_length += 1
        if self.is_sentence_end(word) \
                or (self.is_sentence_conditional_end(word, word_cleaned)
                    and sen_length > sentence_max_len):
            if word_cleaned != "and":
                text_out.append(word_cleaned)
            voc.add_chunk(' '.join(sentence).replace(" and ", ""))
            text_out.append(eos)
            text_out.append(sos)
            sentence.clear()
            sen_length = 0
        elif word_cleaned != "and":
            text_out.append(word_cleaned)

    # The stream always ends with a dangling SOS; replace it with a newline.
    text_out[-1] = "\n"

    # NOTE(review): save_data writes to the same path that is immediately
    # reopened and truncated below — confirm this overwrite is intended.
    voc.save_data(filename_save)

    # Write the token stream; when an EOS is reached, first left-pad the
    # current sentence with PAD tokens up to the longest-sentence length.
    # The context manager guarantees the file is closed even on error.
    pad = voc.to_token(self.PAD_token)
    x = 0
    with open(filename_save, "w") as file_proc:
        for token in text_out:
            if token == eos:
                for _ in range(voc.count_longest_sentence() - x):
                    file_proc.write(pad + " ")
                x = 0
            file_proc.write(token + " ")
            x += 1