def extract_sentences(self, files, inputpath):
    """Read each document in *files* from *inputpath* and return every
    extracted sentence as one flat list.

    files: iterable of file names; inputpath: directory prefix joined by
    plain string concatenation (assumed to end with a separator — verify
    against callers).

    Returns the sentences gathered so far (possibly partial) if a file
    cannot be read.
    """
    dataset_reader = FileReader()
    sentence_extractor = SentenceExtractor()
    sentence_list = []
    try:
        for doc in files:
            # FileReader opens the file itself; the original wrapped this
            # in a redundant `with open(...)` that double-opened every
            # document and never used the handle.
            inputdataset = dataset_reader.read(inputpath + doc)
            sentence_list.extend(sentence_extractor.extract_sentences(inputdataset))
    except IOError:
        # Best-effort: report the failure and return what was collected.
        # print() form works on both Python 2 and 3 (original used the
        # Python-2-only print statement).
        print("IOError")
    return sentence_list
def preprocess(self, files, input_path):
    """Read each document in *files* from *input_path* and return one token
    list per document: lowercased, stop-word filtered, regularised, stemmed.

    files: iterable of file names; input_path: directory prefix joined by
    plain string concatenation (assumed to end with a separator — verify
    against callers).

    Returns the (possibly partial) per-document lists if a file cannot
    be read.
    """
    dataset_reader = FileReader()
    # Hoist the stop-word lookup out of the loops: the original called
    # self.stop_words() once per token. Assumes stop_words() is stable
    # across the call — TODO confirm.
    stop_words = self.stop_words()
    preprocessed_list = []
    try:
        for doc in files:
            # FileReader opens the file itself; the original wrapped this
            # in a redundant `with open(...)` that never used the handle.
            inputdataset = dataset_reader.read(input_path + doc)
            preprocessed_data = []
            for word in inputdataset.split():
                word = word.lower()
                if word not in stop_words:
                    filter1 = self.regularise_expression(str(word))
                    filter2 = self.stem_word(filter1)
                    preprocessed_data.append(str(filter2))
            preprocessed_list.append(preprocessed_data)
    except IOError:
        # Best-effort: report the failure and return what was collected.
        # print() form works on both Python 2 and 3.
        print("IOError")
    return preprocessed_list
def get_data_and_label(self):
    """Collect the word features and matching label for every known file.

    Refreshes self.files via self.get_files(), then for each file reads
    its text, extracts word features through NLP, and pairs them with the
    label configured for the file's title.

    Returns a (contents, labels) tuple of parallel lists.
    """
    self.get_files()
    nlp = NLP(None)
    nlp.set_stop_words()
    reader = FileReader(None)
    # Resolve each title's configured label once, up front.
    title_labels = {title: con.get_setting('labels', str(title))
                    for title in self.files}
    contents = []
    labels = []
    for title in self.files:
        for path in self.files[title]:
            reader.file_path = path
            nlp.text = reader.read()
            contents.append(nlp.get_words_feature())
            labels.append(title_labels[title])
    return (contents, labels)