class CheckDataset(object):
    def __init__(self):
        self.nlp = NLPHelper()
        self.ut = Utility()
        self.fex = FeatureExtractor()

    def checkData(self):
        path = "scenario2_fullidn_pickle/"
        filelist = os.listdir(path)
        data = pd.DataFrame()
        for file in filelist:
            # open the pickle file containing the NER, coref, and POS data of a news text
            pkl_dict = self.ut.loadPickle(os.path.join(path, file))
            # print(pkl_dict['ner'])
            # entities = self.fex.extractBefEntity(pkl_dict['ner'])
            filename = pkl_dict['filename']
            df = self.countTermFrequency(pkl_dict['text'])
            df['filename'] = filename
            # df['entities'] = entities
            # accumulate the per-file term frequencies (DataFrame.append is removed in recent pandas)
            data = pd.concat([data, df], ignore_index=True)
        self.ut.convertToExcel("TF.xlsx", data, 'Sheet1')

    def countTermFrequency(self, text):
        import nltk
        words = nltk.word_tokenize(text)
        fdist = nltk.FreqDist(words)
        df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
        df.columns = ['term', 'frequency']
        # for word, frequency in fdist.most_common(50):
        #     print(u'{}: {}'.format(word, frequency))
        return df
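
# Illustrative sketch (not part of the original pipeline): a minimal way to exercise
# countTermFrequency on a single text without going through the pickle directory.
# The function name _demo_term_frequency and the sample sentence are hypothetical;
# it assumes the NLTK "punkt" tokenizer models are available (nltk.download('punkt'))
# and that the helper classes built in __init__ can be constructed in your environment.
def _demo_term_frequency():
    checker = CheckDataset()
    sample_text = ("The flood hit the city on Monday. "
                   "Hundreds of residents were displaced by the flood.")
    tf = checker.countTermFrequency(sample_text)
    # show the most frequent terms first
    print(tf.sort_values('frequency', ascending=False).head(10))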
class FiveWExtractor(object):
    def __init__(self):
        self.pre = Preprocess()
        self.nlp = NLPHelper()
        self.fex = FeatureExtractor()
        self.ut = Utility()

    def extractNerCorefFromTxt(self, text, title):
        ner = self.nlp.getNER(text)
        print("NER extraction completed")
        coref = self.nlp.getCoref(text)
        print("Coref extraction completed")
        nlp_dict = {
            'title': title,
            'text': text,
            'ner': ner,
            'coref': coref,
        }
        print(nlp_dict)
        return nlp_dict

    def extractINANerAndCoref(self, text, title):
        ner = self.nlp.getIdnNER(text)
        print("NER extraction completed")
        coref = self.nlp.getCoref(text)
        print("Coref extraction completed")
        nlp_dict = {
            'title': title,
            'text': text,
            'ner': ner,
            'coref': coref,
        }
        return nlp_dict

    def extractWhoOrWhere(self, text, title, ml, ner_coref):
        # load the machine learning model
        model = self.ut.loadPickle(ml)
        # extract the features and convert them to numeric types
        features = self.fex.extractFeaturesDirectFromText(ner_coref)
        # print(features)
        features = self.convertToNumeric(features)
        # features = self.ut.oneHotEncoding(features)
        # features = features.drop('entity', axis=1)
        print(features)
        # predict WHO or WHERE from the features, dropping the unused column
        predict_candidate = model.predict(features.drop('entity', axis=1))
        print("candidates: ", predict_candidate)
        candidate = []
        for i in range(len(predict_candidate)):
            if predict_candidate[i] == 1:
                # insert the candidate into the list
                # candidate.append(features.iloc[i, 5])  # !! FOR ONE HOT ENCODING ONLY
                candidate.append(features.iloc[i, 1])
        return candidate

    def convertToNumeric(self, dataset):
        # convert the categorical features to numeric codes
        dataset['type'] = dataset['type'].map({
            'PERSON': 1,
            'LOCATION': 2,
            'ORGANIZATION': 3,
            'NP': 4,
            'DATE': 5,
            'TIME': 6
        }).astype(int)
        dataset['occ_title'] = dataset['occ_title'].map({
            False: 0,
            True: 1
        }).astype(int)
        return dataset

    def getWhenCandidatefromNER(self, ner_list):
        print("Getting date and time entities in text with NER...")
        # collect WHEN candidates (date and time) from the extracted NER tags
        list_date = []
        list_time = []
        when_candidates = []
        date = []
        time = []
        for ner in ner_list:
            if ner[1] == 'DATE':
                date.append(ner[0])
            elif ner[1] == 'TIME':
                time.append(ner[0])
            else:
                if date != []:
                    list_date.append(self.pre.joinText(date))
                    date = []
                if time != []:
                    list_time.append(self.pre.joinText(time))
                    time = []
        list_date = self.pre.sieveSubstring(list_date)
        list_time = self.pre.sieveSubstring(list_time)
        when_candidates = list_date + list_time
        if when_candidates:
            return when_candidates
        else:
            return None

    def extractWhenFromText(self, text, ner):
        print()
        print("Extracting WHEN...")
        when_candidates = self.getWhenCandidatefromNER(ner)
        if when_candidates:
            when = None
            when_score = None
            for candidate in when_candidates:
                candidate_score = self.scoreWhenCandidate(candidate, text)
                if not when_score or candidate_score > when_score:
                    when = candidate
                    when_score = candidate_score
            return when
        else:
            return None

    def findPositioninText(self, candidate, sent_list):
        # return the 1-based index of the first sentence containing the candidate
        for i in range(len(sent_list)):
            pos = i + 1
            match = re.search(re.escape(candidate.lower()), sent_list[i].lower())
            if match:
                return pos
        return None

    def scoreWhenCandidate(self, candidate, text):
        # w0, w1, w2, w3 = weights of the scoring terms
        # d = the document length measured in sentences
        # pc || p(c) = the position, measured in sentences, of candidate c within the document
        print("Scoring WHEN candidate: " + candidate)
        w0 = 10
        w1 = w2 = 1
        w3 = 5
        sent_list = sent_tokenize(text)
        d = len(sent_list)
        pc = self.findPositioninText(candidate, sent_list)
        if pc:
            score = (w0 * ((d - pc) / d) + w1 * self.isDate(candidate) +
                     w2 * self.isTime(candidate) + w3 * self.isDateTime(candidate))
        else:
            score = 0
        return score

    def isDate(self, candidate):
        # return 1 if the candidate can be parsed as a date, else 0
        print("Checking if " + candidate + " can be parsed to a Date object...")
        # use the custom parse_date helper as the parser's parse method
        parser.parser.parse = parse_date
        try:
            parsed_candidate = parser.parser().parse(candidate, None)
            # if it contains a date component
            if parsed_candidate[0].day or parsed_candidate[0].month \
                    or parsed_candidate[0].year or parsed_candidate[0].weekday:
                return 1
            # if it does not contain a date
            else:
                return 0
        except (ValueError, AttributeError):
            return 0

    def isDateTime(self, candidate):
        # check whether the candidate is parseable to a datetime
        print("Checking if " + candidate + " can be parsed to a DateTime object...")
        try:
            parsed_candidate = parse(candidate)
            return 1
        except (ValueError, AttributeError):
            return 0

    def isTime(self, candidate):
        # check whether the WHEN candidate contains date+time, time only, or neither
        print("Checking if " + candidate + " can be parsed to a Time object...")
        # use the custom parse_date helper as the parser's parse method
        parser.parser.parse = parse_date
        try:
            parsed_time = parser.parser().parse(candidate, None)
            # if it contains a time component
            if parsed_time[0].hour or parsed_time[0].minute \
                    or parsed_time[0].second or parsed_time[0].microsecond:
                # if it contains a date as well
                if parsed_time[0].day or parsed_time[0].month \
                        or parsed_time[0].year or parsed_time[0].weekday:
                    return 0.8
                # if it is time only
                else:
                    return 1
            # if it does not contain a time
            else:
                return 0
        except (ValueError, AttributeError):
            return 0

    def extractWhatFromText(self, who_candidates, title, text):
        print()
        print("Extracting WHAT...")
        what = []
        if who_candidates:
            print(who_candidates)
            for who in who_candidates:
                # If one of the WHO candidates occurs in the title,
                # take its subsequent verb phrase as WHAT
                if who in title:
                    print("getting subsequent Verb Phrase from title...")
                    anno = list(self.nlp.getConstituencyParsing(title))
                    # print(anno)
                    # return the verb phrases from the title
                    for sub_tree in anno[0].subtrees(lambda t: t.label() == 'VP'):
                        what.append(' '.join(sub_tree.leaves()))
                # If the WHO is not in the headline, search the text for its first
                # occurrence and take the subsequent verb phrase as WHAT
                else:
                    sent_list = sent_tokenize(text)
                    anno = None
                    for sent in sent_list:
                        # find the first occurrence of the WHO in a sentence
                        match = re.findall(r'\b' + re.escape(who.lower()) + r'\b',
                                           sent.lower())
                        if match:
                            print("getting subsequent Verb Phrase from sentence...")
                            # getting the verb phrase
                            anno = list(self.nlp.getConstituencyParsing(sent))
                            # print(anno)
                            break
                    # skip this candidate if it never occurs in the text
                    if anno is None:
                        continue
                    # return the verb phrases from the text
                    for sub_tree in anno[0].subtrees(lambda t: t.label() == 'VP'):
                        what.append(' '.join(sub_tree.leaves()))
            what = self.pre.sieveSubstring(what)
            return what
        else:
            return None

    def extractWhyFromText(self, what_candidates, text):
        print()
        print("Extracting WHY...")
        regexWhy = [('since', 0.2), ('cause', 0.3), ('because', 0.3),
                    ('hence', 0.2), ('therefore', 0.3), ('why', 0.3),
                    ('result', 0.4), ('reason', 0.3), ('provide', 0.1),
                    ('s behind', 0.1), ('Due to', 0.2)]

        # reason candidates collected from the input text
        why_candidates = []

        # split the text into sentences
        sentence_list = sent_tokenize(text)
        for sent in sentence_list:
            matched_key = []
            # why = {}
            for reg in regexWhy:
                # check whether the sentence contains the keyword
                match = re.findall(r'\b' + re.escape(reg[0].lower()) + r'\b',
                                   sent.lower())
                if match:
                    matched_key.append(reg)
            if what_candidates:
                # check whether a WHAT candidate is in the sentence;
                # assume each sentence has at most one WHAT
                for what in what_candidates:
                    # match = re.findall(r'\b' + what.lower() + r'\b', sent.lower())
                    # if match:
                    if what.lower() in sent.lower():
                        # check against the WHAT(.*)to/TO(.*)/VB rule
                        print("getting Part of Speech tag...")
                        pos = self.nlp.getPOS(sent)
                        for i in range(len(pos)):
                            # a TO immediately followed by a VB
                            # (guard against TO being the last token)
                            if pos[i][1] == 'TO' and i + 1 < len(pos) \
                                    and pos[i + 1][1] == 'VB':
                                print("getting VERB in text...")
                                rule = ('(WHAT(.*)to/TO(.*)/VB)', 0.5)
                                matched_key.append(rule)
                        # check against the (WHAT(.*)will) rule
                        checked = re.findall(r'\b' + re.escape('will') + r'\b',
                                             sent.lower())
                        if checked:
                            rule = ('(WHAT(.*)will)', 0.5)
                            matched_key.append(rule)
            # store every reason sentence found in the text
            if matched_key != []:
                why = sent
                # why['sentence'] = sent
                # why['keys'] = list(set(matched_key))
                # why['total_confidence'] = sum([value[1] for value in why['keys']])
                why_candidates.append(why)
        return why_candidates

    def extract5w(self, text, title):
        # select the ML models for classifying WHO and WHERE
        # scenario 1:
        # who_model = "./model/scen1_train_who_halfidn.pkl"
        # where_model = "./model/scen1_train_where_halfidn.pkl"
        #
        # scenario 2:
        # who_model = "./model/scen2_train_who_fullidn.pkl"
        # where_model = "./model/scen2_train_where_fullidn.pkl"
        #
        # scenario 3:
        who_model = "./model/scen3_train_who_default.pkl"
        where_model = "./model/scen3_train_where_default.pkl"

        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")

        # get the NER and coreference of the text
        ner_coref = self.extractNerCorefFromTxt(text, title)

        # extract the 5W
        print("Extracting WHO...")
        who = self.extractWhoOrWhere(text, title, who_model, ner_coref)
        print("\nExtracting WHERE...")
        where = self.extractWhoOrWhere(text, title, where_model, ner_coref)
        when = self.extractWhenFromText(text, ner_coref['ner'])
        if who:
            what = self.extractWhatFromText(who, title, text)
        else:
            what = None
        why = self.extractWhyFromText(what, text)

        result_dict = {
            'title': title,
            'text': text,
            'who': who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }
        return result_dict

    def extract5wLocalNews(self, text, title):
        # select the ML models for classifying WHO and WHERE
        # scenario 1:
        # who_model = "./model/scen1_train_who_halfidn.pkl"
        # where_model = "./model/scen1_train_where_halfidn.pkl"
        # who_model = "./model/3_scen1_train__whewho_halfidn.pkl"
        # where_model = "./model/3_scen1_trainre_halfidn.pkl"
        #
        # scenario 2:
        who_model = "./model/scen2_train_who_fullidn.pkl"
        where_model = "./model/scen2_train_where_fullidn.pkl"
        # who_model = "./model/3_scen2_train_who_fullidn.pkl"
        # where_model = "./model/3_scen2_train_where_fullidn.pkl"
        #
        # scenario 3:
        # who_model = "./model/scen3_train_who_default.pkl"
        # where_model = "./model/scen3_train_where_default.pkl"
        # who_model = "./model/3_scen3_train_who_default.pkl"
        # where_model = "./model/3_scen3_train_where_default.pkl"

        # ------ HO --------
        # scenario 1:
        # who_model = "./model/HO_scen1_train_who_halfidn.pkl"
        # where_model = "./model/HO_scen1_train_where_halfidn.pkl"
        #
        # scenario 2:
        # who_model = "./model/HO_scen2_train_who_fullidn.pkl"
        # where_model = "./model/HO_scen2_train_where_fullidn.pkl"
        #
        # scenario 3:
        # who_model = "./model/HO_scen3_train_who_default.pkl"
        # where_model = "./model/HO_scen3_train_where_default.pkl"

        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")

        # get the NER (Indonesian) and coreference of the text
        ner_coref = self.extractINANerAndCoref(text, title)

        # extract the 5W
        who = self.extractWhoOrWhere(text, title, who_model, ner_coref)
        where = self.extractWhoOrWhere(text, title, where_model, ner_coref)
        when = self.extractWhenFromText(text, ner_coref['ner'])
        if who:
            what = self.extractWhatFromText(who, title, text)
        else:
            what = None
        why = self.extractWhyFromText(what, text)

        result_dict = {
            'title': title,
            'text': text,
            'who': who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }
        return result_dict

    def prettyPrint5w(self, result):
        # print("\nExtracted 5W from: " + result['title'])
        print()
        if result['who']:
            print("WHO is involved in the news?: ", result['who'])
        else:
            print("Sorry, can not detect the WHO in the news")

        if result['where']:
            print("WHERE does the news take place?: ", result['where'])
        else:
            print("Sorry, can not detect the WHERE in the news")

        if result['when']:
            print("WHEN did the event in the news happen: ", result['when'])
        else:
            print("Sorry, can not detect the WHEN in the news")

        if not result['who']:
            print("WHAT in the news is not detected, because the WHO element "
                  "in the news was not detected")
        else:
            print("WHAT's happening in the news: ", result['what'])

        if not result['why']:
            if not result['what']:
                print("WHY in the news is not detected, because the WHAT element "
                      "in the news was not detected")
            else:
                print("Sorry, can not detect the WHY in the news")
        else:
            print("WHY did the event in the news happen: ", result['why'])