Example #1
1
 def extract_sentences(self, files, inputpath):
     """Read every document in *files* under *inputpath* and return all
     extracted sentences as one flat list.

     Args:
         files: iterable of file names, joined onto inputpath.
         inputpath: directory prefix (expected to end with a separator,
             since the two are concatenated directly — TODO confirm).

     Returns:
         list: sentences from all documents read so far; if an IOError
         occurs part-way through, the partial list is returned.
     """
     dataset_Reader = FileReader()
     sentence_extractor = SentenceExtractor()
     sentence_list = []
     try:
         for doc in files:
             # FileReader.read opens the file itself; the previous
             # redundant `with open(...)` of the same path (which opened
             # the file a second time just to discard the handle) is gone.
             inputdataset = dataset_Reader.read(inputpath + doc)
             sentence_list.extend(sentence_extractor.extract_sentences(inputdataset))
     except IOError as err:
         # Report which file failed instead of a bare "IOError" marker.
         print("IOError: %s" % err)

     return sentence_list
Example #2
0
 def preprocess(self, files, input_path):
     """Read each document in *files* and return a per-document list of
     lowercased, stop-word-filtered, regularised and stemmed tokens.

     Args:
         files: iterable of file names, joined onto input_path.
         input_path: directory prefix (concatenated directly — TODO
             confirm it carries a trailing separator).

     Returns:
         list[list[str]]: one token list per document; if an IOError
         occurs part-way through, the partial result is returned.
     """
     dataset_Reader = FileReader()
     preprocessed_list = []
     try:
         # Hoist the stop-word lookup out of the per-word loop: the
         # original called self.stop_words() once for EVERY word. A set
         # also gives O(1) membership tests.
         stop_words = set(self.stop_words())
         for doc in files:
             # FileReader.read opens the file itself; the redundant
             # `with open(...)` that opened the same path twice is gone.
             inputdataset = dataset_Reader.read(input_path + doc)
             preprocessed_data = []
             for word in inputdataset.split():
                 word = word.lower()
                 if word not in stop_words:
                     filter1 = self.regularise_expression(str(word))
                     filter2 = self.stem_word(filter1)
                     preprocessed_data.append(str(filter2))

             preprocessed_list.append(preprocessed_data)
     except IOError as err:
         # Report which file failed instead of a bare "IOError" marker.
         print("IOError: %s" % err)

     return preprocessed_list
Example #3
0
    def get_data_and_label(self):
        """Collect word features and labels for every known file.

        Refreshes self.files via get_files(), then for each title reads
        each of its files, extracts word features through NLP, and pairs
        the features with the label configured for that title.

        Returns:
            tuple: (contents, labels) — two parallel lists.
        """
        self.get_files()
        nlp = NLP(None)
        nlp.set_stop_words()
        reader = FileReader(None)

        # Resolve the configured label for every title up front.
        title_labels = {t: con.get_setting('labels', str(t)) for t in self.files}

        contents = []
        labels = []
        for title, file_paths in self.files.items():
            for path in file_paths:
                reader.file_path = path
                nlp.text = reader.read()
                contents.append(nlp.get_words_feature())
                labels.append(title_labels[title])

        return (contents, labels)