def predict_prob(self, source='training',result_file='answers.csv'): # source must be 'training' or 'validation' data_set = aux.preprocessData('../Data/'+source+'_set.tsv') replies = ['A', 'B', 'C', 'D'] probs=[] for index, row in data_set.iterrows(): prob=[] r = -1 row['question'] = row['question'].replace('. ', ' . ').replace('-', ' - ').replace('.', ' .').replace('_', ' ').lower() if (row['question'][-1]=='.'): row['question']=row['question'][:-1]+' . ' for word in nt.word_tokenize(row['question']): try: self.model[word] except: row['question'] = row['question'].replace(word, '') for reply in replies: r_current = -1 ans = row['answer' + reply].replace('-', ' - ').replace('. ', ' . ').lower() if (ans[-1]=='.'): ans=ans[:-1]+' . ' for word in nt.word_tokenize(ans): try: self.model[word] except: ans = ans.replace(word, '') try: ans = nt.word_tokenize(ans) if ans==[]: continue if row['type'] != 1: quest = nt.word_tokenize(row['question']) r_current = np.exp(self.model.n_similarity(quest, ans)) else: ind = row['question'].find(' ') if row['question'][ind + 1:].split() == []: r_current = np.exp(self.model.n_similarity(nt.word_tokenize(row['question']), ans)) elif row['question'][0:ind].split() == []: r_current = np.exp(self.model.n_similarity(nt.word_tokenize(row['question'][ind:]), ans)) else: r_current = np.exp(self.model.n_similarity(nt.word_tokenize(row['question'][0:ind]), ans)) * np.exp(self.model.n_similarity(nt.word_tokenize(row['question'][ind:]), ans)) except: () prob.append(r_current) if r_current > r: r = r_current # c ptot = sum(prob) prob= [ p/ptot for p in prob] probs.append(prob) return probs
def predict_answer(self, source='training',result_file='answers.csv'): # source must be 'training' or 'validation' res = np.zeros([2, 3]).astype('int') data_set = aux.preprocessData('../Data/'+source+'_set.tsv') replies = ['A', 'B', 'C', 'D'] goodAnswersCount = 0 unknowWordsCount = 0 unknowWords=[] if (source=='validation'): answer = open('../Results/'+result_file, 'w') answer.write('id,correctAnswer\n') for index, row in data_set.iterrows(): r = -1 good = replies[0] res[0, row['type']] += 1 row['question'] = row['question'].replace('. ', ' . ').replace('-', ' - ').replace('.', ' .').replace('_', ' ').lower() if (row['question'][-1]=='.'): row['question']=row['question'][:-1]+' . ' for word in nt.word_tokenize(row['question']): try: self.model[word] except: unknowWordsCount += 1 if not word.lower() in unknowWords: unknowWords.append(word.lower()) row['question'] = row['question'].replace(word, '') for reply in replies: r_current = -1 ans = row['answer' + reply].replace('-', ' - ').replace('. ', ' . ').lower() if (ans[-1]=='.'): ans=ans[:-1]+' . ' for word in nt.word_tokenize(ans): try: self.model[word] except: unknowWordsCount += 1 if not word in unknowWords: unknowWords.append(word) ans = ans.replace(word, '') try: ans = nt.word_tokenize(ans) if ans==[]: continue if row['type'] != 1: quest = nt.word_tokenize(row['question']) r_current = self.model.n_similarity(quest, ans) else: ind = row['question'].find(' ') if row['question'][ind + 1:].split() == []: r_current = self.model.n_similarity(nt.word_tokenize(row['question']), ans) elif row['question'][0:ind].split() == []: r_current = self.model.n_similarity(nt.word_tokenize(row['question'][ind:]), ans) else: r_current = np.exp(self.model.n_similarity(nt.word_tokenize(row['question'][0:ind]), ans)) * np.exp(self.model.n_similarity(nt.word_tokenize(row['question'][ind:]), ans)) except: unknowWordsCount += 1 if r_current > r: r = r_current good = reply # c if (source=='validation'): answer.write(str(row['id']) + ',' + good + '\n') elif (source=='training'): if (good==row['correctAnswer']): goodAnswersCount+=1 res[1,row['type']]+=1 return [goodAnswersCount,unknowWordsCount,unknowWords,res]