import csv
import logging
import sys

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import RegexpTokenizer, word_tokenize


class SentenceParser:
    def __init__(self):
        self.file = File()  # project-specific file helper (defined elsewhere in the repo)
        self.stemmer = SnowballStemmer("english")
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    def extract_feature_from_doc(self, line):
        """Build (feature_set, answer) pairs from parsed CSV rows."""
        features = []
        for (text, category, answer) in line:
            sent_features = self.extract_feature(text)
            features.append((sent_features, answer))
            print('Features of Answer: ', (sent_features, answer))
        return features

    def extract_feature(self, text):
        """Preprocess, POS-tag, keyword-filter, and stem a question string."""
        print("\nQUESTION: ", text)
        words = self.preprocess(text)
        print("After Preprocess: ", words)
        # POS-tag the preprocessed tokens (wrapped in a list so
        # extract_keys can iterate over tagged sentences)
        tagged_words = [nltk.pos_tag(word_tokenize(words))]
        # Keep only the keyword-bearing parts of speech
        keys = self.extract_keys(tagged_words)
        # Reduce each keyword to its stem
        stemmed_words = [self.stemmer.stem(key) for key in keys]
        return self.get_feature_set(stemmed_words)

    def preprocess(self, sentence):
        """Lowercase, strip punctuation, and remove English stop words."""
        sentence = sentence.lower()
        # Tokenize on word characters, which also drops punctuation
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(sentence)
        # Filter out stop words
        set_of_stopwords = set(stopwords.words('english'))
        filtered_words = [word for word in tokens if word not in set_of_stopwords]
        # Rejoin the surviving tokens into a single string
        return " ".join(filtered_words)

    def extract_keys(self, sentences):
        """Keep nouns, verbs, adverbs, pronouns, and adjectives from each tagged sentence."""
        keep_tags = {'NN', 'NNS', 'VBN', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ'}
        sent_keys = []
        for sent in sentences:
            keys = [word for (word, tag) in sent if tag in keep_tags]
            if len(keys) == 0:
                # No keyword-bearing tag found: fall back to every token
                keys = [word for (word, tag) in sent]
            sent_keys.extend(keys)
        return sent_keys

    def get_feature_set(self, sent_keys):
        return {'keywords': ' '.join(sent_keys)}

    def get_content(self, fileName):
        """Read a '|'-delimited file and keep only well-formed 3-column rows."""
        with self.file.read(get_resource(fileName)) as content_file:
            lines = csv.reader(content_file, delimiter='|')
            return [x for x in lines if len(x) == 3]
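
# A minimal usage sketch, not part of the original code. It assumes the NLTK
# data packages "punkt", "stopwords", and "averaged_perceptron_tagger" have
# already been fetched via nltk.download(), and that the project-specific
# File helper used in __init__ is importable. The question string below is
# a hypothetical example.
if __name__ == '__main__':
    parser = SentenceParser()
    feature_set = parser.extract_feature("How do I reset my account password?")
    print(feature_set)  # e.g. {'keywords': 'reset account password'}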