class CheckDataset(object):
    def __init__(self):
        self.nlp = NLPHelper()
        self.ut = Utility()
        self.fex = FeatureExtractor()

    def checkData(self):
        path = "scenario2_fullidn_pickle/"
        filelist = os.listdir(path)
        data = pd.DataFrame()
        for idx, file in enumerate(filelist):
            # open the pickle file holding the NER, coref, and POS data of a news text
            pkl_dict = self.ut.loadPickle(os.path.join(path, file))
            # print(pkl_dict['ner'])
            # entities = self.fex.extractBefEntity(pkl_dict['ner'])
            filename = pkl_dict['filename']
            df = self.countTermFrequency(pkl_dict['text'])
            df['filename'] = filename
            data = data.append(df)
            # df['entities'] = entities
        self.ut.convertToExcel("TF.xlsx", data, 'Sheet1')

    def countTermFrequency(self, text):
        import nltk
        words = nltk.word_tokenize(text)
        fdist = nltk.FreqDist(words)
        df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
        df.columns = ['term', 'frequency']
        # for word, frequency in fdist.most_common(50):
        #     print(u'{}:{}'.format(word, frequency))
        return df
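
# A minimal usage sketch (assumes NLTK's 'punkt' tokenizer data is installed and
# that NLPHelper/Utility/FeatureExtractor can be constructed; the sentence is a
# toy input, not part of any dataset):
def _demo_term_frequency():
    checker = CheckDataset()
    df = checker.countTermFrequency(
        "the quick brown fox jumps over the lazy dog the end the")
    print(df.sort_values('frequency', ascending=False).head())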

class ModelTrainer(object):
    def __init__(self):
        self.ut = Utility()

    def train(self, dataset, drop_element):
        # classifier: n_estimators = number of trees; random_state is fixed on
        # purpose so repeated runs give the same result (worth tuning)
        clf = RandomForestClassifier(n_estimators=3, random_state=2)
        # extract the features needed, dropping the entity columns
        dataset = dataset.drop(['entity', 'id_text', drop_element], axis=1)
        # convert categorical types to numeric
        dataset = self.ut.convertToNumeric(dataset)
        # dataset = self.ut.oneHotEncoding(dataset)
        # determine which columns are features and which is the label
        # X holds the features
        X = dataset.iloc[:, :-1]  # every row, all columns except the last
        # y holds the label
        y = dataset.iloc[:, -1]  # every row, the last column only
        # get the training score using cross validation
        result = self.nFoldCrossValidation(X, y, clf, nfold=10)
        if drop_element == 'who':
            # train and save the WHERE model as a pickle
            # scenario 1
            # joblib.dump(clf, 'model/scen1_train_where_halfidn.pkl')
            # joblib.dump(clf, 'model/HO_scen1_train_where_halfidn.pkl')
            # joblib.dump(clf, 'model/3_scen1_train_where_halfidn.pkl')
            # joblib.dump(clf, 'model/HO2_scen1_train_where_halfidn.pkl')
            # scenario 2
            # joblib.dump(clf, 'model/scen2_train_where_fullidn.pkl')
            # joblib.dump(clf, 'model/HO_scen2_train_where_fullidn.pkl')
            # joblib.dump(clf, 'model/3_scen2_train_where_fullidn.pkl')
            # joblib.dump(clf, 'model/HO2_scen2_train_where_fullidn.pkl')
            # scenario 3
            # joblib.dump(clf, 'model/scen3_train_where_default.pkl')
            # joblib.dump(clf, 'model/HO_scen3_train_where_default.pkl')
            joblib.dump(clf, 'model/3_scen3_train_where_default.pkl')
            # joblib.dump(clf, 'model/HO2_scen3_train_where_default.pkl')
            # testing
            # joblib.dump(clf, 'model/s2_testing_where.pkl')
            print("Model for WHERE has been saved")
            # scenario 1
            # self.ut.convertToExcel("./result/scenario1_halfidn_WHERE_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO_scenario1_halfidn_WHERE_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario1_halfidn_WHERE_10fold.xlsx", result, "Sheet1")
            # scenario 2
            # self.ut.convertToExcel("./result/3_scenario2_fullidn_WHERE_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario2_fullidn_WHERE_10fold.xlsx", result, "Sheet1")
            # scenario 3
            self.ut.convertToExcel(
                "./result/3_scenario3_default_WHERE_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario3_default_WHERE_10fold.xlsx", result, "Sheet1")
            # scenario testing
            # self.ut.convertToExcel("./result/s2_testing_WHERE_10fold.xlsx", result, "Sheet1")
            print("Cross Validation for WHERE model has been saved to excel file!")
        elif drop_element == 'where':
            # train and save the WHO model as a pickle
            # scenario 1
            # joblib.dump(clf, 'model/scen1_train_who_halfidn.pkl')
            # joblib.dump(clf, 'model/HO_scen1_train_who_halfidn.pkl')
            # joblib.dump(clf, 'model/3_scen1_train_who_halfidn.pkl')
            # joblib.dump(clf, 'model/HO2_scen1_train_who_halfidn.pkl')
            # scenario 2
            # joblib.dump(clf, 'model/scen2_train_who_fullidn.pkl')
            # joblib.dump(clf, 'model/HO_scen2_train_who_fullidn.pkl')
            # joblib.dump(clf, 'model/3_scen2_train_who_fullidn.pkl')
            # joblib.dump(clf, 'model/HO2_scen2_train_who_fullidn.pkl')
            # scenario 3
            # joblib.dump(clf, 'model/scen3_train_who_default.pkl')
            # joblib.dump(clf, 'model/HO_scen3_train_who_default.pkl')
            joblib.dump(clf, 'model/3_scen3_train_who_default.pkl')
            # joblib.dump(clf, 'model/HO2_scen3_train_who_default.pkl')
            # testing
            # joblib.dump(clf, 'model/s2_testing_who.pkl')
            print("Model for WHO has been saved")
            # scenario 1
            # self.ut.convertToExcel("./result/scenario1_halfidn_WHO_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO_scenario1_halfidn_WHO_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario1_halfidn_WHO_10fold.xlsx", result, "Sheet1")
            # scenario 2
            # self.ut.convertToExcel("./result/3_scenario2_fullidn_WHO_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario2_fullidn_WHO_10fold.xlsx", result, "Sheet1")
            # scenario 3
            self.ut.convertToExcel(
                "./result/3_scenario3_default_WHO_10fold.xlsx", result, "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario3_default_WHO_10fold.xlsx", result, "Sheet1")
            # scenario testing
            # self.ut.convertToExcel("./result/s2_testing_WHO_10fold.xlsx", result, "Sheet1")
            print("Cross Validation for WHO model has been saved to excel file!")

    # classic hold-out evaluation
    def getEvaluationScore(self, X_test, y_test, model):
        y_pred = model.predict(X_test)
        print(y_pred)
        # if the scores keep coming out as 0, the data itself may need reworking,
        # e.g. undersampling or oversampling
        print("Accuracy: ", (accuracy_score(y_test, y_pred) * 100).round(4))
        print("Precision: ", (precision_score(y_test, y_pred) * 100).round(4))
        print("Recall: ", (recall_score(y_test, y_pred) * 100).round(4))
        print("F-measure: ", (f1_score(y_test, y_pred) * 100).round(4))
        print("Confusion matrix:")
        print(confusion_matrix(y_test, y_pred))

    # cross validation
    def nFoldCrossValidation(self, X, y, model, nfold):
        # count the number of classes
        class_count = y.groupby(y).count().shape[0]
        # shuffle randomizes the data up front; a fixed random_state keeps
        # repeated runs identical
        k_fold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=7)
        # initiate the score lists
        precision_list = []
        recall_list = []
        fscore_list = []
        train_score = []
        test_score = []
        # fold counter, starting from 1
        fold_count = 1
        print("Confusion matrix of " + model.__class__.__name__ + ":\n")
        # k_fold.split returns arrays of indices marking the train and test rows
        for train_indices, test_indices in k_fold.split(X, y):  # split per fold
            # separate the training data from the testing data
            X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]  # iloc = locate rows by index
            y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
            # fit = train the model
            model.fit(X_train, y_train)
            # predict the training data
            predictions = model.predict(X_train)
            # accuracy compares the predictions with the actual training labels
            # (y_train) one by one; rounded to 4 decimal places
            train_score.append(accuracy_score(y_train, predictions).round(4))
            # predict the testing data
            predictions = model.predict(X_test)
            # accuracy compares the predictions with the actual testing labels (y_test)
            test_score.append(accuracy_score(y_test, predictions).round(4))
            # compute precision, recall, and f-score by comparing the actual
            # labels (y_test) with the predictions
            precision_list.append(precision_score(y_test, predictions).round(4))
            recall_list.append(recall_score(y_test, predictions).round(4))
            fscore_list.append(f1_score(y_test, predictions).round(4))
            # show which fold this is and what its confusion matrix looks like
            print("Fold " + str(fold_count) + ":")
            # sklearn confusion matrix layout: [[TN, FP], [FN, TP]]
            # (top-left true negatives, bottom-right true positives)
            print(confusion_matrix(y_test, predictions))
            print()
            fold_count += 1
        # average the accuracy, precision, recall, and f-score values
        acc_train = (sum(train_score) / len(train_score)).round(4)
        acc_test = (sum(test_score) / len(test_score)).round(4)
        precision = (sum(precision_list) / len(precision_list)).round(4)
        recall = (sum(recall_list) / len(recall_list)).round(4)
        f_score = (sum(fscore_list) / len(fscore_list)).round(4)
        print("Evaluation using " + model.__class__.__name__ + ":\n")
        # store the per-fold accuracy/precision/recall/f-score results in a DataFrame
        fold_index = [str(i + 1) for i in range(nfold)]  # create the fold index
        fold_data = [
            fold_index, train_score, test_score, precision_list, recall_list,
            fscore_list
        ]
        fold_column = [
            'fold', 'acc_train', 'acc_test', 'precision', 'recall', 'f_score'
        ]  # create the column names
        df_fold = pd.DataFrame(np.column_stack(fold_data), columns=fold_column)
        df_fold = df_fold.set_index('fold')  # use the fold number as the index
        # print the results
        print(df_fold)
        print("=" * 50 + "\n")
        print('Total data classified:', len(X))
        # compare the training and testing accuracy to spot overfitting; if the
        # two are close, the model is probably sound
        print('Accuracy on Train:', acc_train)
        print('Accuracy on Test:', acc_test)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F-Score:', f_score)
        return df_fold
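
# A minimal sketch of how train() is meant to be driven. The feature file name
# is hypothetical; the real feature Excel files are produced elsewhere in this
# project. Note the naming: drop_element='who' trains/saves the WHERE classifier
# and drop_element='where' the WHO classifier (see the branches above). Copies
# are passed because train() drops columns from its input.
def _demo_train_models(feature_xlsx="scen3_default_feature.xlsx"):  # hypothetical file
    dataset = pd.read_excel(feature_xlsx, sheet_name='Sheet1')
    trainer = ModelTrainer()
    trainer.train(dataset.copy(), drop_element='who')    # WHERE model
    trainer.train(dataset.copy(), drop_element='where')  # WHO model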

class EvaluateNews(object):
    def __init__(self):
        self.ut = Utility()
        self.fwe = FiveWExtractor()
        self.fex = FeatureExtractor()
        self.nlp = NLPHelper()
        self.tr = ModelTrainer()

    def evaluateGoldenDatasetNews(self, file_range=None):
        # e.g. file_range = (0, 10)
        # extract the 5W from every text and save the results to Excel
        path = "./datasets/"
        filelist = os.listdir(path)
        data = pd.DataFrame()
        if file_range:
            filelist = filelist[file_range[0]:file_range[1]]
        for idx, file in enumerate(filelist):
            print(file)
            # open the JSON file holding the text and title of a news item
            file_temp = self.ut.loadJSON(os.path.join(path, file))
            # extract the 5W from the JSON file
            try:
                temp = self.fwe.extract5w(file_temp["text"], file_temp["title"])
                temp["file"] = file
                data = data.append(temp, ignore_index=True)
            except Exception:
                temp = []
                print("5W extraction failed for " + file)
        self.ut.convertToExcel("idnhalf_goldendata_evaluate_089.xlsx", data,
                               'Sheet1')
        print("Evaluating golden data is done!")

    def extract5wLocalNewsForEval(self, filename):
        data = self.ut.loadCSV(filename, ',', "ISO-8859-1")
        data['extracted'] = data.apply(
            lambda x: self.fwe.extract5wLocalNews(x['text'], x['title']),
            axis=1)
        temp = pd.DataFrame()
        temp['title'] = data['extracted'].apply(lambda x: x['title'])
        temp['text'] = data['extracted'].apply(lambda x: x['text'])
        temp['who'] = data['extracted'].apply(lambda x: x['who'])
        temp['where'] = data['extracted'].apply(lambda x: x['where'])
        temp['what'] = data['extracted'].apply(lambda x: x['what'])
        temp['when'] = data['extracted'].apply(lambda x: x['when'])
        temp['why'] = data['extracted'].apply(lambda x: x['why'])
        # scenario 1
        # self.ut.convertToExcel("3_scen1_halfidn_evallocalnews.xlsx", temp, 'Sheet1')
        # self.ut.convertToExcel("HO_scen1_halfidn_evallocalnews.xlsx", temp, 'Sheet1')
        # scenario 2
        # self.ut.convertToExcel("3_scen2_fullidn_evallocalnews.xlsx", temp, 'Sheet1')
        # self.ut.convertToExcel("HO_scen2_fullidn_evallocalnews.xlsx", temp, 'Sheet1')
        # scenario 3
        self.ut.convertToExcel("3_scen3_default_evallocalnews.xlsx", temp,
                               'Sheet1')
        # self.ut.convertToExcel("HO_scen3_default_evallocalnews.xlsx", temp, 'Sheet1')
        print("Evaluating local news is done!")

    def extractFeatureFromLocalNews(self, filename):
        data = self.ut.loadCSV(filename, ',', "ISO-8859-1")
        data['ner'] = data['text'].apply(lambda x: self.nlp.getNER(x))
        data['coref'] = data['text'].apply(lambda x: self.nlp.getCoref(x))
        feature = pd.DataFrame()
        for i in range(data.shape[0]):
            feature = feature.append(
                self.fex.extractFeaturesDirectFromText(data.iloc[i]),
                ignore_index=True)
        # scenario 1
        # self.ut.convertToExcel("scen1_halfidn_localfeature.xlsx", feature, 'Sheet1')
        # scenario 2
        # self.ut.convertToExcel("scen2_fullidn_localfeature.xlsx", feature, 'Sheet1')
        # scenario 3
        # self.ut.convertToExcel("scen3_default_localfeature.xlsx", feature, 'Sheet1')

    def evaluateLocalWhoWhere(self, drop_element):
        # scenario 1
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/3_scen1_train_where_halfidn.pkl')
        # model_who = joblib.load('model/3_scen1_train_who_halfidn.pkl')
        # scenario 2
        # dataset = pd.read_excel('scen2_fullidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/3_scen2_train_where_fullidn.pkl')
        # model_who = joblib.load('model/3_scen2_train_who_fullidn.pkl')
        # scenario 3
        dataset = pd.read_excel('scen3_default_localfeature.xlsx',
                                sheet_name='Sheet1')
        model_where = joblib.load('model/3_scen3_train_where_default.pkl')
        model_who = joblib.load('model/3_scen3_train_who_default.pkl')

        # ----------------- HO scenarios -----------------
        # scenario 1
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen1_train_where_halfidn.pkl')
        # model_who = joblib.load('model/HO2_scen1_train_who_halfidn.pkl')
        # scenario 2
        # dataset = pd.read_excel('scen2_fullidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen2_train_where_fullidn.pkl')
        # model_who = joblib.load('model/HO2_scen2_train_who_fullidn.pkl')
        # scenario 3
        # dataset = pd.read_excel('scen3_default_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen3_train_where_default.pkl')
        # model_who = joblib.load('model/HO2_scen3_train_who_default.pkl')

        # scenario test
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/s2_testing_where.pkl')
        # model_who = joblib.load('model/s2_testing_who.pkl')

        if drop_element == 'who':
            self.evaluateModelLocal(dataset, 'who', model_where)
            print("Evaluation for WHERE's local classifier is done!")
        elif drop_element == 'where':
            self.evaluateModelLocal(dataset, 'where', model_who)
            print("Evaluation for WHO's local classifier is done!")

    def evaluateModelLocal(self, dataset, drop_element, model):
        dataset = self.ut.convertToNumeric(dataset)
        dataset = dataset.drop(['entity', drop_element], axis=1)
        # !!! FOR ONE HOT ENCODING !!!
        # dataset = self.ut.oneHotEncoding(dataset)
        # determine which columns are features and which is the label
        # X holds the features
        X = dataset.iloc[:, :-1]  # every row, all columns except the last
        # y holds the label
        y = dataset.iloc[:, -1]  # every row, the last column only
        # evaluate the pre-trained model against the local features
        # result = self.nFoldCrossValidation(X, y, clf, nfold=10)
        result = self.tr.getEvaluationScore(X, y, model)
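
# A minimal sketch of the local-news evaluation flow (hypothetical CSV name; the
# CSV is expected to provide 'text' and 'title' columns, per the loaders above;
# re-enable the convertToExcel line for your scenario in
# extractFeatureFromLocalNews so the feature file actually exists on disk):
def _demo_evaluate_local(csv_file="local_news.csv"):  # hypothetical file
    ev = EvaluateNews()
    ev.extractFeatureFromLocalNews(csv_file)
    ev.evaluateLocalWhoWhere('who')    # scores the WHERE classifier
    ev.evaluateLocalWhoWhere('where')  # scores the WHO classifier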

class FiveWExtractor(object):
    def __init__(self):
        self.pre = Preprocess()
        self.nlp = NLPHelper()
        self.fex = FeatureExtractor()
        self.ut = Utility()

    def extractNerCorefFromTxt(self, text, title):
        ner = self.nlp.getNER(text)
        print("NER extraction completed")
        coref = self.nlp.getCoref(text)
        print("Coref extraction completed")
        nlp_dict = {
            'title': title,
            'text': text,
            'ner': ner,
            'coref': coref,
        }
        print(nlp_dict)
        return nlp_dict

    def extractINANerAndCoref(self, text, title):
        ner = self.nlp.getIdnNER(text)
        print("NER extraction completed")
        coref = self.nlp.getCoref(text)
        print("Coref extraction completed")
        nlp_dict = {
            'title': title,
            'text': text,
            'ner': ner,
            'coref': coref,
        }
        return nlp_dict

    def extractWhoOrWhere(self, text, title, ml, ner_coref):
        # load the machine learning model
        model = self.ut.loadPickle(ml)
        # extract the features and convert them to numeric types
        features = self.fex.extractFeaturesDirectFromText(ner_coref)
        # print(features)
        features = self.convertToNumeric(features)
        # features = self.ut.oneHotEncoding(features)
        # features = features.drop('entity', axis=1)
        print(features)
        # predict who or where from the features, dropping the unused column
        predict_candidate = model.predict(features.drop('entity', axis=1))
        print("candidates: ", predict_candidate)
        candidate = []
        for i in range(len(predict_candidate)):
            if predict_candidate[i] == 1:
                # insert the candidate into the list
                # candidate.append(features.iloc[i, 5])  # !! FOR ONE HOT ENCODING ONLY
                candidate.append(features.iloc[i, 1])
        return candidate

    def convertToNumeric(self, dataset):
        # convert categorical features to numeric
        dataset['type'] = dataset['type'].map({
            'PERSON': 1,
            'LOCATION': 2,
            'ORGANIZATION': 3,
            'NP': 4,
            'DATE': 5,
            'TIME': 6
        }).astype(int)
        dataset['occ_title'] = dataset['occ_title'].map({
            False: 0,
            True: 1
        }).astype(int)
        return dataset

    def getWhenCandidatefromNER(self, ner_list):
        print("Getting date and time entities in text with NER...")
        # collect WHEN candidates (dates and times) from the extracted NER
        list_date = []
        list_time = []
        when_candidates = []
        date = []
        time = []
        for ner in ner_list:
            if ner[1] == 'DATE':
                date.append(ner[0])
            elif ner[1] == 'TIME':
                time.append(ner[0])
            else:
                if date != []:
                    list_date.append(self.pre.joinText(date))
                    date = []
                if time != []:
                    list_time.append(self.pre.joinText(time))
                    time = []
        list_date = self.pre.sieveSubstring(list_date)
        list_time = self.pre.sieveSubstring(list_time)
        when_candidates = list_date + list_time
        if when_candidates:
            return when_candidates
        else:
            return None

    def extractWhenFromText(self, text, ner):
        print()
        print("Extracting WHEN...")
        when_candidates = self.getWhenCandidatefromNER(ner)
        if when_candidates:
            when = None
            when_score = None
            for candidate in when_candidates:
                candidate_score = self.scoreWhenCandidate(candidate, text)
                if not when_score or candidate_score > when_score:
                    when = candidate
                    when_score = candidate_score
            return when
        else:
            return None

    def findPositioninText(self, candidate, sent_list):
        for i in range(len(sent_list)):
            pos = i + 1
            # re.escape keeps regex metacharacters in the candidate from
            # breaking the search; only give up after every sentence is checked
            match = re.search(re.escape(candidate.lower()), sent_list[i].lower())
            if match:
                return pos
        return None

    def scoreWhenCandidate(self, candidate, text):
        # w0..w3 = weights
        # d = the document length measured in sentences
        # pc / p(c) = the position, measured in sentences, of candidate c
        #             within the document
        # e.g. a parseable date in sentence 2 of a 10-sentence document scores
        # 10 * (10 - 2) / 10 + 1 * 1 + 1 * 0 + 5 * 1 = 14.0
        print("Scoring WHEN candidate: " + candidate)
        w0 = 10
        w1 = w2 = 1
        w3 = 5
        sent_list = sent_tokenize(text)
        d = len(sent_list)
        pc = self.findPositioninText(candidate, sent_list)
        if pc:
            score = (w0 * ((d - pc) / d) + w1 * self.isDate(candidate) +
                     w2 * self.isTime(candidate) +
                     w3 * self.isDateTime(candidate))
        else:
            score = 0
        return score

    def isDate(self, candidate):
        # check whether the candidate parses as a date; otherwise return 0
        print("Checking if " + candidate + " can be parsed to a Date object...")
        parser.parser.parse = parse_date
        try:
            parsed_candidate = parser.parser().parse(candidate, None)
            # if it contains a date
            if parsed_candidate[0].day or parsed_candidate[0].month \
                    or parsed_candidate[0].year or parsed_candidate[0].weekday:
                return 1
            # if it contains neither time nor date
            else:
                return 0
        except (ValueError, AttributeError):
            return 0

    def isDateTime(self, candidate):
        # check whether the candidate is parseable to a datetime
        print("Checking if " + candidate +
              " can be parsed to a DateTime object...")
        try:
            parsed_candidate = parse(candidate)
            return 1
        except (ValueError, AttributeError):
            return 0

    def isTime(self, candidate):
        # check whether the WHEN candidate contains date+time, time only, or neither
        print("Checking if " + candidate + " can be parsed to a Time object...")
        parser.parser.parse = parse_date
        try:
            parsed_time = parser.parser().parse(candidate, None)
            # if it contains a time
            if parsed_time[0].hour or parsed_time[0].minute \
                    or parsed_time[0].second or parsed_time[0].microsecond:
                # if it also contains a date
                if parsed_time[0].day or parsed_time[0].month \
                        or parsed_time[0].year or parsed_time[0].weekday:
                    return 0.8
                # time only
                else:
                    return 1
            # if it contains neither time nor date
            else:
                return 0
        except (ValueError, AttributeError):
            return 0

    def extractWhatFromText(self, who_candidates, title, text):
        print()
        print("Extracting WHAT...")
        what = []
        if who_candidates:
            print(who_candidates)
            for who in who_candidates:
                # If a WHO candidate occurs in the title, we take its
                # subsequent verb phrase
                if who in title:
                    print("getting subsequent Verb Phrase from title...")
                    anno = list(self.nlp.getConstituencyParsing(title))
                    # print(anno)
                    # return the verb phrase from the title
                    for sub_tree in anno[0].subtrees(
                            lambda t: t.label() == 'VP'):
                        what.append(' '.join(sub_tree.leaves()))
                # If there is no WHO in the headline, we search the text for the
                # first occurrence of our highest ranked WHO and take its
                # subsequent verb phrase as the WHAT
                else:
                    sent_list = sent_tokenize(text)
                    anno = None
                    for sent in sent_list:
                        # find the first occurrence of who in a sentence
                        match = re.findall(
                            r'\b' + re.escape(who.lower()) + r'\b',
                            sent.lower())
                        if match:
                            print("getting subsequent Verb Phrase from sentence...")
                            # get the verb phrase
                            anno = list(self.nlp.getConstituencyParsing(sent))
                            # print(anno)
                            break
                    # return the verb phrase from the text, guarding against
                    # the case where no sentence matched
                    if anno:
                        for sub_tree in anno[0].subtrees(
                                lambda t: t.label() == 'VP'):
                            what.append(' '.join(sub_tree.leaves()))
            what = self.pre.sieveSubstring(what)
            return what
        else:
            return None

    def extractWhyFromText(self, what_candidates, text):
        print()
        print("Extracting WHY...")
        regexWhy = [('since', 0.2), ('cause', 0.3), ('because', 0.3),
                    ('hence', 0.2), ('therefore', 0.3), ('why', 0.3),
                    ('result', 0.4), ('reason', 0.3), ('provide', 0.1),
                    ('s behind', 0.1), ('Due to', 0.2)]
        # container for the reason candidates found in the text(s)
        why_candidates = []
        # split the text into sentences
        sentence_list = sent_tokenize(text)
        for sent in sentence_list:
            matched_key = []
            # why = {}
            for reg in regexWhy:
                # check every word in the sentence against the keyword
                match = re.findall(r'\b' + re.escape(reg[0].lower()) + r'\b',
                                   sent.lower())
                if match:
                    matched_key.append(reg)
            if what_candidates:
                # check whether a WHAT occurs in the sentence
                # (assume each sentence carries at most one WHAT)
                for what in what_candidates:
                    # match = re.findall(r'\b' + what.lower() + r'\b', sent.lower())
                    # if match:
                    if what.lower() in sent.lower():
                        # check against the WHAT(.*)to/TO(.*)/VB rule
                        print("getting Part of Speech tag...")
                        pos = self.nlp.getPOS(sent)
                        # stop one short of the end so pos[i + 1] stays in range
                        for i in range(len(pos) - 1):
                            if pos[i][1] == 'TO' and pos[i + 1][1] == 'VB':
                                print("getting VERB in text...")
                                rule = ('(WHAT(.*)to/TO(.*)/VB)', 0.5)
                                matched_key.append(rule)
                        # check against the (WHAT(.*)will) rule
                        checked = re.findall(
                            r'\b' + re.escape('will') + r'\b', sent.lower())
                        if checked:
                            rule = ('(WHAT(.*)will)', 0.5)
                            matched_key.append(rule)
            # store every reason found in this text in the container
            if matched_key != []:
                why = sent
                # why['sentence'] = sent
                # why['keys'] = list(set(matched_key))
                # why['total_confidence'] = sum([value[1] for value in why['keys']])
                why_candidates.append(why)
        return why_candidates

    def extract5w(self, text, title):
        # pick the ML models for classifying WHO and WHERE
        # scenario 1:
        # who_model = "./model/scen1_train_who_halfidn.pkl"
        # where_model = "./model/scen1_train_where_halfidn.pkl"
        # scenario 2:
        # who_model = "./model/scen2_train_who_fullidn.pkl"
        # where_model = "./model/scen2_train_where_fullidn.pkl"
        # scenario 3:
        who_model = "./model/scen3_train_who_default.pkl"
        where_model = "./model/scen3_train_where_default.pkl"
        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")
        # get the NER and coref of the text
        ner_coref = self.extractNerCorefFromTxt(text, title)
        # extract the 5W
        print("Extracting WHO...")
        who = self.extractWhoOrWhere(text, title, who_model, ner_coref)
        print("\nExtracting WHERE...")
        where = self.extractWhoOrWhere(text, title, where_model, ner_coref)
        when = self.extractWhenFromText(text, ner_coref['ner'])
        if who:
            what = self.extractWhatFromText(who, title, text)
        else:
            what = None
        why = self.extractWhyFromText(what, text)
        result_dict = {
            'title': title,
            'text': text,
            'who': who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }
        return result_dict

    def extract5wLocalNews(self, text, title):
        # pick the ML models for classifying WHO and WHERE
        # scenario 1:
        # who_model = "./model/scen1_train_who_halfidn.pkl"
        # where_model = "./model/scen1_train_where_halfidn.pkl"
        # who_model = "./model/3_scen1_train_who_halfidn.pkl"
        # where_model = "./model/3_scen1_train_where_halfidn.pkl"
        # scenario 2:
        who_model = "./model/scen2_train_who_fullidn.pkl"
        where_model = "./model/scen2_train_where_fullidn.pkl"
        # who_model = "./model/3_scen2_train_who_fullidn.pkl"
        # where_model = "./model/3_scen2_train_where_fullidn.pkl"
        # scenario 3:
        # who_model = "./model/scen3_train_who_default.pkl"
        # where_model = "./model/scen3_train_where_default.pkl"
        # who_model = "./model/3_scen3_train_who_default.pkl"
        # where_model = "./model/3_scen3_train_where_default.pkl"
        # ------ HO --------
        # scenario 1:
        # who_model = "./model/HO_scen1_train_who_halfidn.pkl"
        # where_model = "./model/HO_scen1_train_where_halfidn.pkl"
        # scenario 2:
        # who_model = "./model/HO_scen2_train_who_fullidn.pkl"
        # where_model = "./model/HO_scen2_train_where_fullidn.pkl"
        # scenario 3:
        # who_model = "./model/HO_scen3_train_who_default.pkl"
        # where_model = "./model/HO_scen3_train_where_default.pkl"
        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")
        # get the NER and coref of the text
        ner_coref = self.extractINANerAndCoref(text, title)
        # extract the 5W
        who = self.extractWhoOrWhere(text, title, who_model, ner_coref)
        where = self.extractWhoOrWhere(text, title, where_model, ner_coref)
        when = self.extractWhenFromText(text, ner_coref['ner'])
        if who:
            what = self.extractWhatFromText(who, title, text)
        else:
            what = None
        why = self.extractWhyFromText(what, text)
        result_dict = {
            'title': title,
            'text': text,
            'who': who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }
        return result_dict

    def prettyPrint5w(self, result):
        # print("\nExtracted 5W from: " + result['title'])
        print()
        if result['who']:
            print("WHO is involved in the news?: ", result['who'])
        else:
            print("Sorry, cannot detect the WHO in the news")
        if result['where']:
            print("WHERE does the news take place?: ", result['where'])
        else:
            print("Sorry, cannot detect the WHERE in the news")
        if result['when']:
            print("WHEN did the event in the news happen: ", result['when'])
        else:
            print("Sorry, cannot detect the WHEN in the news")
        if not result['who']:
            print("WHAT in the news is not detected, because the WHO element "
                  "in the news was not detected")
        else:
            print("WHAT's happening in the news: ", result['what'])
        if not result['why']:
            if not result['what']:
                print("WHY in the news is not detected, because the WHAT "
                      "element in the news was not detected")
            else:
                print("Sorry, cannot detect the WHY in the news")
        else:
            print("WHY did the event in the news happen: ", result['why'])
print("Extracting feature: distribution ") entities = self.findDistribution(data["text"], entities) # append text index for entity in entities: entity['id_text'] = data["filename"] feature = pd.DataFrame(entities) return feature def extractFeaturesDirectFromText(self, data): print("Extracting features from text...") print("Extracting feature: entity types") entities = self.extractEntity(data["ner"]) entities = self.pre.removeDuplicateListDict( self.findNounPhraseFromTitle(data["title"], entities)) print("Extracting feature: occurences in text ") entities = self.countCfOccurencesInText(entities, data["coref"], data["title"]) print("Extracting feature: occurences in title ") entities = self.findOccurencesInTitle(data["title"], entities) print("Extracting feature: distribution ") entities = self.findDistribution(data["text"], entities) feature = pd.DataFrame(entities) return feature e = FeatureExtractor() ut = Utility()