Example #1
class CheckDataset(object):
    def __init__(self):
        self.nlp = NLPHelper()
        self.ut = Utility()
        self.fex = FeatureExtractor()

    def checkData(self):
        path = "scenario2_fullidn_pickle/"
        filelist = os.listdir(path)
        data = pd.DataFrame()

        for idx, file in enumerate(filelist):

            # open the pickle file containing the NER, coref, and POS data of a news article
            pkl_dict = self.ut.loadPickle(os.path.join(path, file))
            # print(pkl_dict['ner'])
            # entities = self.fex.extractBefEntity(pkl_dict['ner'])
            filename = pkl_dict['filename']

            df = self.countTermFrequency(pkl_dict['text'])
            df['filename'] = filename
            # DataFrame.append was removed in pandas 2.0; concat is the modern equivalent
            data = pd.concat([data, df])
            # df['entities'] = entities
        self.ut.convertToExcel("TF.xlsx", data, 'Sheet1')

    def countTermFrequency(self, text):
        import nltk

        words = nltk.word_tokenize(text)
        fdist = nltk.FreqDist(words)

        df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
        df.columns = ['term', 'frequency']
        # for word, frequency in fdist.most_common(50):
        #     print(u'{}:{}'.format(word, frequency))
        return df
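
A minimal standalone sketch of the same term-frequency step, assuming only nltk and pandas are installed (NLPHelper, Utility, and FeatureExtractor are this project's own helpers and are not needed for the counting itself; the sample text is illustrative):

import nltk
import pandas as pd

nltk.download('punkt', quiet=True)  # tokenizer data used by word_tokenize

text = "the quick brown fox jumps over the lazy dog and the fox"
words = nltk.word_tokenize(text)
fdist = nltk.FreqDist(words)

# same conversion as countTermFrequency above
df = pd.DataFrame.from_dict(fdist, orient='index').reset_index()
df.columns = ['term', 'frequency']
print(df.sort_values('frequency', ascending=False).head())
# 'the' occurs 3 times, 'fox' twice, every other token once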
Example #2
 def __init__(self):
     self.ut = Utility()
Example #3
class ModelTrainer(object):
    def __init__(self):
        self.ut = Utility()

    def train(self, dataset, drop_element):
        # classifier algorithm; n_estimators = number of trees, random_state = any
        # fixed number, set on purpose so repeated runs give the same result
        clf = RandomForestClassifier(n_estimators=3,
                                     random_state=2)  # try tweaking these

        # keep only the needed features; drop the entity columns
        dataset = dataset.drop(['entity', 'id_text', drop_element], axis=1)
        # convert type to numeric
        dataset = self.ut.convertToNumeric(dataset)
        # dataset = self.ut.oneHotEncoding(dataset)

        # determine which columns are features and which is the label
        # X holds the features
        X = dataset.iloc[:, :-1]  # every row, all columns except the last
        # y holds the label
        y = dataset.iloc[:, -1]  # every row, the last column only

        # get training score using cross validation
        result = self.nFoldCrossValidation(X, y, clf, nfold=10)

        if drop_element == 'who':

            # training and save into pickle
            # scenario 1
            # joblib.dump(clf,'model/scen1_train_where_halfidn.pkl')
            # joblib.dump(clf,'model/HO_scen1_train_where_halfidn.pkl')
            # joblib.dump(clf,'model/3_scen1_train_where_halfidn.pkl')
            # joblib.dump(clf,'model/HO2_scen1_train_where_halfidn.pkl')
            # # scenario 2
            # joblib.dump(clf,'model/scen2_train_where_fullidn.pkl')
            # joblib.dump(clf,'model/HO_scen2_train_where_fullidn.pkl')
            # joblib.dump(clf,'model/3_scen2_train_where_fullidn.pkl')
            # joblib.dump(clf,'model/HO2_scen2_train_where_fullidn.pkl')
            # # scenario 3
            # joblib.dump(clf,'model/scen3_train_where_default.pkl')
            # joblib.dump(clf,'model/HO_scen3_train_where_default.pkl')
            joblib.dump(clf, 'model/3_scen3_train_where_default.pkl')
            # joblib.dump(clf,'model/HO2_scen3_train_where_default.pkl')
            # testing
            # joblib.dump(clf,'model/s2_testing_where.pkl')
            print("Model for WHERE has been saved")

            # scenario 1
            # self.ut.convertToExcel("./result/scenario1_halfidn_WHERE_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO_scenario1_halfidn_WHERE_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario1_halfidn_WHERE_10fold.xlsx",result,"Sheet1")
            # scenario 2
            # self.ut.convertToExcel("./result/3_scenario2_fullidn_WHERE_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario2_fullidn_WHERE_10fold.xlsx",result,"Sheet1")
            # scenario 3
            self.ut.convertToExcel(
                "./result/3_scenario3_default_WHERE_10fold.xlsx", result,
                "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario3_default_WHERE_10fold.xlsx",result,"Sheet1")

            # scenario testing
            # self.ut.convertToExcel("./result/s2_testing_WHERE_10fold.xlsx",result,"Sheet1")
            print(
                "Cross Validation for WHERE model has been saved to excel file!"
            )

        elif drop_element == 'where':
            # training and save into pickle
            # scenario 1
            # joblib.dump(clf,'model/scen1_train_who_halfidn.pkl')
            # joblib.dump(clf,'model/HO_scen1_train_who_halfidn.pkl')
            # joblib.dump(clf,'model/3_scen1_train_who_halfidn.pkl')
            # joblib.dump(clf,'model/HO2_scen1_train_who_halfidn.pkl')
            # # scenario 2
            # joblib.dump(clf,'model/scen2_train_who_fullidn.pkl')
            # joblib.dump(clf,'model/HO_scen2_train_who_fullidn.pkl')
            # joblib.dump(clf,'model/3_scen2_train_who_fullidn.pkl')
            # joblib.dump(clf,'model/HO2_scen2_train_who_fullidn.pkl')
            # # scenario 3
            # joblib.dump(clf,'model/scen3_train_who_default.pkl')
            # joblib.dump(clf,'model/HO_scen3_train_who_default.pkl')
            joblib.dump(clf, 'model/3_scen3_train_who_default.pkl')
            # joblib.dump(clf,'model/HO2_scen3_train_who_default.pkl')
            # testing
            # joblib.dump(clf,'model/s2_testing_who.pkl')
            print("Model for WHO has been saved")

            # scenario 1
            # self.ut.convertToExcel("./result/scenario1_halfidn_WHO_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO_scenario1_halfidn_WHO_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario1_halfidn_WHO_10fold.xlsx",result,"Sheet1")
            # scenario 2
            # self.ut.convertToExcel("./result/3_scenario2_fullidn_WHO_10fold.xlsx",result,"Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario2_fullidn_WHO_10fold.xlsx",result,"Sheet1")
            # scenario 3
            self.ut.convertToExcel(
                "./result/3_scenario3_default_WHO_10fold.xlsx", result,
                "Sheet1")
            # self.ut.convertToExcel("./result/HO2_scenario3_default_WHO_10fold.xlsx",result,"Sheet1")

            # scenario testing
            # self.ut.convertToExcel("./result/s2_testing_WHO_10fold.xlsx",result,"Sheet1")
            print(
                "Cross Validation for WHO model has been saved to excel file!")

    # classic method
    def getEvaluationScore(self, X_test, y_test, model):
        y_pred = model.predict(X_test)
        print(y_pred)
        # experiment with the data so the scores stop coming out as 0; maybe undersampling or oversampling?
        print("Accuracy: ", (accuracy_score(y_test, y_pred) * 100).round(4))
        print("Precision: ", (precision_score(y_test, y_pred) * 100).round(4))
        print("Recall: ", (recall_score(y_test, y_pred) * 100).round(4))
        print("F-measure: ", (f1_score(y_test, y_pred) * 100).round(4))
        print("Confusion matrix:")
        print(confusion_matrix(y_test, y_pred))

    # cross validation
    def nFoldCrossValidation(self, X, y, model, nfold):
        # count the number of classes
        class_count = y.groupby(y).count().shape[0]
        # shuffle randomizes the data up front; random_state keeps repeated runs identical
        k_fold = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=7)
        # initiate score lists
        precision_list = []
        recall_list = []
        fscore_list = []
        train_score = []
        test_score = []

        # fold counter, starting at 1
        fold_count = 1
        print("Confusion matrix of " + model.__class__.__name__ + ":\n")

        # train_indices and test_indices are arrays of indices marking the train and test rows
        for train_indices, test_indices in k_fold.split(X, y):  # split per fold
            # separate the training data from the testing data
            X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]  # iloc = locate rows by index
            y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

            # fit = run the training
            model.fit(X_train, y_train)

            # predict on the training data
            predictions = model.predict(X_train)
            # compute accuracy by comparing each prediction with its true value (y_train)
            train_score.append(accuracy_score(y_train, predictions).round(4))  # rounded to 4 decimal places

            # predict on the testing data
            predictions = model.predict(X_test)
            # compute accuracy by comparing the predictions with the true values (y_test)
            test_score.append(accuracy_score(y_test, predictions).round(4))

            # compute precision, recall, and f_score by comparing the true values (y_test) with the predictions
            precision_list.append(precision_score(y_test, predictions).round(4))
            recall_list.append(recall_score(y_test, predictions).round(4))
            fscore_list.append(f1_score(y_test, predictions).round(4))

            # show the current fold number and its confusion matrix
            print("Fold " + str(fold_count) + ":")
            # confusion matrix layout: top-left true negatives, top-right false positives,
            # bottom-left false negatives, bottom-right true positives
            print(confusion_matrix(y_test, predictions))
            print()
            fold_count += 1

        # average the accuracy, precision, recall, and f_score over all folds
        acc_train = (sum(train_score) / len(train_score)).round(4)
        acc_test = (sum(test_score) / len(test_score)).round(4)
        precision = (sum(precision_list) / len(precision_list)).round(4)
        recall = (sum(recall_list) / len(recall_list)).round(4)
        f_score = (sum(fscore_list) / len(fscore_list)).round(4)

        print("Evaluation using " + model.__class__.__name__ + ":\n")

        # store the accuracy, precision, recall, and f_score results in a DataFrame
        fold_index = [str(i + 1) for i in range(nfold)]  # create fold index
        fold_data = [
            fold_index, train_score, test_score, precision_list, recall_list,
            fscore_list
        ]
        fold_column = [
            'fold', 'acc_train', 'acc_test', 'precision', 'recall', 'f_score'
        ]  # create the column names
        df_fold = pd.DataFrame(np.column_stack(fold_data),
                               columns=fold_column)  # build the DataFrame
        df_fold = df_fold.set_index('fold')  # use the fold column as index

        # print the results
        print(df_fold)
        print("=" * 50 + "\n")

        print('Total data classified:', len(X))
        # compare training and testing accuracy to check for overfitting; if they are close, the model is probably sound
        print('Accuracy on Train:', acc_train)
        print('Accuracy on Test:', acc_test)
        print('Precision:', precision)
        print('Recall:', recall)
        print('F-Score:', f_score)

        return df_fold
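
A minimal sketch of the same stratified k-fold loop on a toy dataset, assuming scikit-learn and numpy are available (the generated dataset and the names below are illustrative, not the project's actual features):

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=200, n_features=5, random_state=7)
clf = RandomForestClassifier(n_estimators=3, random_state=2)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)

test_scores = []
for train_idx, test_idx in k_fold.split(X, y):
    # fit on the training fold, score on the held-out fold
    clf.fit(X[train_idx], y[train_idx])
    test_scores.append(accuracy_score(y[test_idx], clf.predict(X[test_idx])))

print("Mean test accuracy:", round(float(np.mean(test_scores)), 4))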
Example #4
 def __init__(self):
     self.ut = Utility()
     self.fwe = FiveWExtractor()
     self.fex = FeatureExtractor()
     self.nlp = NLPHelper()
     self.tr = ModelTrainer()
Example #5
class EvaluateNews(object):
    def __init__(self):
        self.ut = Utility()
        self.fwe = FiveWExtractor()
        self.fex = FeatureExtractor()
        self.nlp = NLPHelper()
        self.tr = ModelTrainer()

    def evaluateGoldenDatasetNews(self, file_range=None):
        # example: file_range = (0, 10)
        # extract the 5Ws from each dataset file and save the results to Excel
        path = "./datasets/"
        filelist = os.listdir(path)
        data = pd.DataFrame()

        if file_range:
            filelist = filelist[file_range[0]:file_range[1]]

        for idx, file in enumerate(filelist):
            print(file)
            # load the JSON file containing the title and text of a news article
            file_temp = self.ut.loadJSON(os.path.join(path, file))
            # extract the 5Ws from the JSON file
            try:
                temp = self.fwe.extract5w(file_temp["text"],
                                          file_temp["title"])
                temp["file"] = file
                # DataFrame.append was removed in pandas 2.0; concat is the modern equivalent
                data = pd.concat([data, pd.DataFrame([temp])],
                                 ignore_index=True)
            except Exception as e:
                print("5W extraction failed for " + file + ": " + str(e))

        self.ut.convertToExcel("idnhalf_goldendata_evaluate_089.xlsx", data,
                               'Sheet1')

        print("Evaluating golden data is done!")

    def extract5wLocalNewsForEval(self, filename):

        data = self.ut.loadCSV(filename, ',', "ISO-8859-1")

        data['extracted'] = data.apply(
            lambda x: self.fwe.extract5wLocalNews(x['text'], x['title']),
            axis=1)
        temp = pd.DataFrame()
        temp['title'] = data['extracted'].apply(lambda x: x['title'])
        temp['text'] = data['extracted'].apply(lambda x: x['text'])
        temp['who'] = data['extracted'].apply(lambda x: x['who'])
        temp['where'] = data['extracted'].apply(lambda x: x['where'])
        temp['what'] = data['extracted'].apply(lambda x: x['what'])
        temp['when'] = data['extracted'].apply(lambda x: x['when'])
        temp['why'] = data['extracted'].apply(lambda x: x['why'])

        # scenario 1
        # self.ut.convertToExcel("3_scen1_halfidn_evallocalnews.xlsx",temp,'Sheet1')
        # self.ut.convertToExcel("HO_scen1_halfidn_evallocalnews.xlsx",temp,'Sheet1')
        # scenario 2
        # self.ut.convertToExcel("3_scen2_fullidn_evallocalnews.xlsx",temp,'Sheet1')
        # self.ut.convertToExcel("HO_scen2_fullidn_evallocalnews.xlsx",temp,'Sheet1')
        # scenario 3
        self.ut.convertToExcel("3_scen3_default_evallocalnews.xlsx", temp,
                               'Sheet1')
        # self.ut.convertToExcel("HO_scen3_default_evallocalnews.xlsx",temp,'Sheet1')

        print("Evaluating local news is done!")

    def extractFeatureFromLocalNews(self, filename):
        data = self.ut.loadCSV(filename, ',', "ISO-8859-1")

        data['ner'] = data['text'].apply(lambda x: self.nlp.getNER(x))
        data['coref'] = data['text'].apply(lambda x: self.nlp.getCoref(x))

        feature = pd.DataFrame()
        for i in range(data.shape[0]):
            # concat replaces the removed DataFrame.append
            feature = pd.concat(
                [feature,
                 self.fex.extractFeaturesDirectFromText(data.iloc[i])],
                ignore_index=True)

        # scenario 1
        # self.ut.convertToExcel("scen1_halfidn_localfeature.xlsx",feature,'Sheet1')
        # scenario 2
        # self.ut.convertToExcel("scen2_fullidn_localfeature.xlsx",feature,'Sheet1')
        # scenario 3
        # self.ut.convertToExcel("scen3_default_localfeature.xlsx",feature,'Sheet1')

    def evaluateLocalWhoWhere(self, drop_element):

        # # scenario 1
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/3_scen1_train_where_halfidn.pkl')
        # model_who = joblib.load('model/3_scen1_train_who_halfidn.pkl')

        # scenario 2
        # dataset = pd.read_excel('scen2_fullidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/3_scen2_train_where_fullidn.pkl')
        # model_who = joblib.load('model/3_scen2_train_who_fullidn.pkl')

        # # scenario 3
        dataset = pd.read_excel('scen3_default_localfeature.xlsx',
                                sheet_name='Sheet1')
        model_where = joblib.load('model/3_scen3_train_where_default.pkl')
        model_who = joblib.load('model/3_scen3_train_who_default.pkl')

        # scenario HO -------------------------------
        # # # scenario 1
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen1_train_where_halfidn.pkl')
        # model_who = joblib.load('model/HO2_scen1_train_who_halfidn.pkl')

        # scenario 2
        # dataset = pd.read_excel('scen2_fullidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen2_train_where_fullidn.pkl')
        # model_who = joblib.load('model/HO2_scen2_train_who_fullidn.pkl')

        # # # scenario 3
        # dataset = pd.read_excel('scen3_default_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/HO2_scen3_train_where_default.pkl')
        # model_who = joblib.load('model/HO2_scen3_train_who_default.pkl')

        # scenario test
        # dataset = pd.read_excel('scen1_halfidn_localfeature.xlsx', sheet_name='Sheet1')
        # model_where = joblib.load('model/s2_testing_where.pkl')
        # model_who = joblib.load('model/s2_testing_who.pkl')

        if drop_element == 'who':
            self.evaluateModelLocal(dataset, 'who', model_where)
            print("Evaluation for WHERE's local classifier is done!")
        elif drop_element == 'where':
            self.evaluateModelLocal(dataset, 'where', model_who)
            print("Evaluation for WHO's local classifier is done!")

    def evaluateModelLocal(self, dataset, drop_element, model):
        dataset = self.ut.convertToNumeric(dataset)

        dataset = dataset.drop(['entity', drop_element], axis=1)
        # !!! FOR ONE HOT ENCODING !!!
        # dataset = self.ut.oneHotEncoding(dataset)

        # determine which columns are features and which is the label
        # X holds the features
        X = dataset.iloc[:, :-1]  # every row, all columns except the last
        # y holds the label
        y = dataset.iloc[:, -1]  # every row, the last column only

        # get training score using cross validation
        # result = self.nFoldCrossValidation(X, y, clf, nfold=10)
        result = self.tr.getEvaluationScore(X, y, model)
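
A minimal sketch of the joblib dump/load round trip that evaluateLocalWhoWhere relies on, assuming scikit-learn and joblib are installed (the file name and toy data are illustrative, not one of the scenario paths above):

import joblib
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=50, n_features=4, random_state=0)
clf = RandomForestClassifier(n_estimators=3, random_state=2).fit(X, y)

joblib.dump(clf, 'example_model.pkl')        # persist the fitted model
restored = joblib.load('example_model.pkl')  # reload it later for evaluation
print(restored.predict(X[:5]))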
Example #6
 def __init__(self):
     self.pre = Preprocess()
     self.nlp = NLPHelper()
     self.fex = FeatureExtractor()
     self.ut = Utility()
     self.mt = ModelTrainer()
Example #7
class FiveWExtractor(object):
    def __init__(self):
        self.pre = Preprocess()
        self.nlp = NLPHelper()
        self.fex = FeatureExtractor()
        self.ut = Utility()

    def extractNerCorefFromTxt(self, text, title):

        ner = self.nlp.getNER(text)
        print("NER extraction completed")
        coref = self.nlp.getCoref(text)
        print("Coref extraction completed")
        nlp_dict = {
            'title': title,
            'text': text,
            'ner': ner,
            'coref': coref,
        }
        print(nlp_dict)

        return nlp_dict

    def extractINANerAndCoref(self, text, title):
        ner = self.nlp.getIdnNER(text)
        print("NER extraction completed")
        coref = self.nlp.getCoref(text)
        print("Coref extraction completed")
        nlp_dict = {
            'title': title,
            'text': text,
            'ner': ner,
            'coref': coref,
        }

        return nlp_dict

    def extractWhoOrWhere(self, text, title, ml, ner_coref):

        # load machine learning model
        model = self.ut.loadPickle(ml)

        # extract features and convert them to numeric types
        features = self.fex.extractFeaturesDirectFromText(ner_coref)
        # print(features)
        features = self.convertToNumeric(features)
        # features = self.ut.oneHotEncoding(features)
        # features = features.drop('entity', axis=1)

        print(features)

        # predict who or where from its features, dropping the unused column
        predict_candidate = model.predict(features.drop('entity', axis=1))
        print("candidates: ", predict_candidate)
        candidate = []

        for i in range(len(predict_candidate)):
            if predict_candidate[i] == 1:
                # insert candidate to list
                # candidate.append(features.iloc[i,5]) # !! FOR ONE HOT ENCODING ONLY
                candidate.append(features.iloc[i, 1])

        return candidate

    def convertToNumeric(self, dataset):
        # convert categorical feature to numeric
        dataset['type'] = dataset['type'].map({
            'PERSON': 1,
            'LOCATION': 2,
            'ORGANIZATION': 3,
            'NP': 4,
            'DATE': 5,
            'TIME': 6
        }).astype(int)
        dataset['occ_title'] = dataset['occ_title'].map({
            False: 0,
            True: 1
        }).astype(int)
        return dataset

    def getWhenCandidatefromNER(self, ner_list):
        print("Getting date and time entities in text with NER...")

        # get the WHEN candidates (dates and times) from the extracted NER

        list_date = []
        list_time = []
        when_candidates = []

        date = []
        time = []

        for ner in ner_list:
            if ner[1] == 'DATE':
                date.append(ner[0])
            elif ner[1] == 'TIME':
                time.append(ner[0])
            else:
                if date != []:
                    list_date.append(self.pre.joinText(date))
                    date = []
                if time != []:
                    list_time.append(self.pre.joinText(time))
                    time = []

        list_date = self.pre.sieveSubstring(list_date)
        list_time = self.pre.sieveSubstring(list_time)
        when_candidates = list_date + list_time

        if when_candidates:
            return when_candidates
        else:
            return None
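
    # Illustrative trace of the loop above (assuming joinText concatenates the
    # buffered tokens and sieveSubstring drops substring duplicates): for
    # ner_list = [("Monday", "DATE"), ("morning", "TIME"), ("bomb", "O")],
    # the DATE and TIME tokens are buffered and flushed when a non-DATE,
    # non-TIME tag appears, yielding list_date = ["Monday"],
    # list_time = ["morning"], and when_candidates = ["Monday", "morning"].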

    def extractWhenFromText(self, text, ner):
        print()
        print("Extracting WHEN...")
        when_candidates = self.getWhenCandidatefromNER(ner)

        if when_candidates:
            when = None
            when_score = None

            for candidate in when_candidates:
                candidate_score = self.scoreWhenCandidate(candidate, text)
                if not when_score or candidate_score > when_score:
                    when = candidate
                    when_score = candidate_score

            return when
        else:
            return None

    def findPositioninText(self, candidate, sent_list):
        # return the 1-based sentence position of the candidate, or None if it never occurs
        # (the original returned None after checking only the first sentence)
        for i in range(len(sent_list)):
            # re.escape guards against regex metacharacters in the candidate
            match = re.search(re.escape(candidate.lower()), sent_list[i].lower())
            if match:
                return i + 1
        return None

    def scoreWhenCandidate(self, candidate, text):
        # w0, w1, w2, w3 = weight of value
        # d = the document length measured in sentences
        # pc || p(c) = the position measured in sentences of candidate c within the document
        print("Scoring WHEN candidate: " + candidate)
        w0 = 10
        w1 = w2 = 1
        w3 = 5

        sent_list = sent_tokenize(text)
        d = len(sent_list)
        pc = self.findPositioninText(candidate, sent_list)

        if pc:
            score = (w0 * ((d - pc) / d) + w1 * self.isDate(candidate) +
                     w2 * self.isTime(candidate) +
                     w3 * self.isDateTime(candidate))
        else:
            score = 0

        return score
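
    # Worked example of the score above (illustrative numbers): for a document
    # of d = 10 sentences and a candidate found at sentence pc = 2 for which
    # isDate returns 1 while isTime and isDateTime return 0:
    #   score = 10 * ((10 - 2) / 10) + 1 * 1 + 1 * 0 + 5 * 0 = 9.0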

    def isDate(self, candidate):
        # check if candidate is date instance, else return 0
        print("Checking if " + candidate +
              " can be parsed to a Date object...")
        parser.parser.parse = parse_date
        try:
            parsed_candidate = parser.parser().parse(candidate, None)
            # if it contains a date component
            if (parsed_candidate[0].day or parsed_candidate[0].month
                    or parsed_candidate[0].year or parsed_candidate[0].weekday):
                return 1
            # if it contains no date component
            else:
                return 0

        except (ValueError, AttributeError) as e:
            return 0

    def isDateTime(self, candidate):
        # check if it's parseable to datetime type
        print("Checking if " + candidate +
              " can be parsed to a DateTime object...")
        try:
            parsed_candidate = parse(candidate)
            return 1
        except (ValueError, AttributeError) as e:
            return 0

    def isTime(self, candidate):
        # check if when candidate contains date+time, time only, or neither
        print("Checking if " + candidate +
              " can be parsed to a Time object...")
        parser.parser.parse = parse_date
        try:
            parsed_time = parser.parser().parse(candidate, None)

            # if it contains a time component
            if (parsed_time[0].hour or parsed_time[0].minute
                    or parsed_time[0].second or parsed_time[0].microsecond):
                # if it also contains a date
                if (parsed_time[0].day or parsed_time[0].month
                        or parsed_time[0].year or parsed_time[0].weekday):
                    return 0.8
                # if time only
                else:
                    return 1
            # if it contains neither time nor date
            else:
                return 0

        except (ValueError, AttributeError) as e:
            return 0

    def extractWhatFromText(self, who_candidates, title, text):
        print()
        print("Extracting WHAT...")
        what = []
        if who_candidates:
            print(who_candidates)
            for who in who_candidates:
                # If one of our WHO candidates occurs in the title, we look for its subsequent verb phrase
                if who in title:
                    print("getting subsequent Verb Phrase from title...")
                    anno = list(self.nlp.getConstituencyParsing(title))
                    # print(anno)
                    # returning verb phrase from title
                    for sub_tree in anno[0].subtrees(
                            lambda t: t.label() == 'VP'):
                        what.append(' '.join(sub_tree.leaves()))
                # If there is no WHO in the headline, we search the text for the first
                # occurrence of our highest-ranked WHO and likewise take the subsequent
                # verb phrase as WHAT
                else:
                    anno = None  # guards against no sentence matching below
                    sent_list = sent_tokenize(text)
                    for sent in sent_list:
                        # find the first occurrence of who in a sentence
                        match = re.findall(
                            r'\b' + re.escape(who.lower()) + r'\b',
                            sent.lower())
                        if match:
                            print(
                                "getting subsequent Verb Phrase from sentence..."
                            )
                            # getting the verb phrase
                            anno = list(self.nlp.getConstituencyParsing(sent))
                            break
                    # returning the verb phrase from the text, if any sentence matched
                    if anno:
                        for sub_tree in anno[0].subtrees(
                                lambda t: t.label() == 'VP'):
                            what.append(' '.join(sub_tree.leaves()))

            what = self.pre.sieveSubstring(what)

            return what
        else:
            return None

    def extractWhyFromText(self, what_candidates, text):
        print()
        print("Extracting WHY...")
        regexWhy = [('since', 0.2), ('cause', 0.3), ('because', 0.3),
                    ('hence', 0.2), ('therefore', 0.3), ('why', 0.3),
                    ('result', 0.4), ('reason', 0.3), ('provide', 0.1),
                    ('s behind', 0.1), ('Due to', 0.2)]

        # container for the reason candidates found in the input text
        why_candidates = []

        # split the text into sentences
        sentence_list = sent_tokenize(text)

        for sent in sentence_list:
            matched_key = []
            # why = {}
            for reg in regexWhy:
                # check whether the sentence contains this keyword
                match = re.findall(r'\b' + re.escape(reg[0].lower()) + r'\b',
                                   sent.lower())
                if match:
                    matched_key.append(reg)

            if what_candidates:
                # check whether a WHAT occurs in the sentence
                # assume each sentence contains only one WHAT
                for what in what_candidates:
                    # match = re.findall(r'\b' + what.lower() + r'\b',sent.lower())
                    # if match:
                    if what.lower() in sent.lower():
                        # check with WHAT(.*)to/TO(.*)/VB rule

                        print("getting Part of Speech tag...")
                        pos = self.nlp.getPOS(sent)
                        # range stops at len(pos) - 1 to guard the i + 1 lookahead
                        for i in range(len(pos) - 1):
                            if pos[i][1] == 'TO' and pos[i + 1][1] == 'VB':
                                print("getting VERB in text...")
                                rule = ('(WHAT(.*)to/TO(.*)/VB)', 0.5)
                                matched_key.append(rule)
                        # check with (WHAT(.*)will) rule
                        checked = re.findall(r'\b' + re.escape('will') + r'\b',
                                             sent.lower())
                        if checked:
                            rule = ('(WHAT(.*)will)', 0.5)
                            matched_key.append(rule)

            # store every reason sentence found in this text in the container
            if matched_key:
                why = sent
                # why['sentence'] = sent
                # why['keys'] = list(set(matched_key))
                # why['total_confidence'] = sum([value[1] for value in why['keys']])
                why_candidates.append(why)

        return why_candidates

    def extract5w(self, text, title):

        # getting ML model for classifying who and where
        # scenario 1:
        # who_model = "./model/scen1_train_who_halfidn.pkl"
        # where_model = "./model/scen1_train_where_halfidn.pkl"

        # # # scenario 2:
        # who_model = "./model/scen2_train_who_fullidn.pkl"
        # where_model = "./model/scen2_train_where_fullidn.pkl"

        # # scenario 3:
        who_model = "./model/scen3_train_who_default.pkl"
        where_model = "./model/scen3_train_where_default.pkl"

        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")

        # getting NER and Coref of the text
        ner_coref = self.extractNerCorefFromTxt(text, title)
        # extracting 5w
        print("Extracting WHO...")
        who = self.extractWhoOrWhere(text, title, who_model, ner_coref)
        print("\nExtracting WHERE...")
        where = self.extractWhoOrWhere(text, title, where_model, ner_coref)
        when = self.extractWhenFromText(text, ner_coref['ner'])
        if who:
            what = self.extractWhatFromText(who, title, text)
        else:
            what = None
        why = self.extractWhyFromText(what, text)

        result_dict = {
            'title': title,
            'text': text,
            "who": who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }
        return result_dict

    def extract5wLocalNews(self, text, title):

        # getting ML model for classifying who and where
        # scenario 1:
        # who_model = "./model/scen1_train_who_halfidn.pkl"
        # where_model = "./model/scen1_train_where_halfidn.pkl"
        # who_model = "./model/3_scen1_train__whewho_halfidn.pkl"
        # where_model = "./model/3_scen1_trainre_halfidn.pkl"

        # # # scenario 2:
        who_model = "./model/scen2_train_who_fullidn.pkl"
        where_model = "./model/scen2_train_where_fullidn.pkl"
        # who_model = "./model/3_scen2_train_who_fullidn.pkl"
        # where_model = "./model/3_scen2_train_where_fullidn.pkl"

        # # scenario 3:
        # who_model = "./model/scen3_train_who_default.pkl"
        # where_model = "./model/scen3_train_where_default.pkl"
        # who_model = "./model/3_scen3_train_who_default.pkl"
        # where_model = "./model/3_scen3_train_where_default.pkl"

        # ------ HO --------
        # scenario 1:
        # who_model = "./model/HO_scen1_train_who_halfidn.pkl"
        # where_model = "./model/HO_scen1_train_where_halfidn.pkl"

        # # # scenario 2:
        # who_model = "./model/HO_scen2_train_who_fullidn.pkl"
        # where_model = "./model/HO_scen2_train_where_fullidn.pkl"

        # # scenario 3:
        # who_model = "./model/HO_scen3_train_who_default.pkl"
        # where_model = "./model/HO_scen3_train_where_default.pkl"

        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")

        # getting NER and Coref of the text
        ner_coref = self.extractINANerAndCoref(text, title)
        # extracting 5w
        who = self.extractWhoOrWhere(text, title, who_model, ner_coref)
        where = self.extractWhoOrWhere(text, title, where_model, ner_coref)
        when = self.extractWhenFromText(text, ner_coref['ner'])
        if who:
            what = self.extractWhatFromText(who, title, text)
        else:
            what = None
        why = self.extractWhyFromText(what, text)

        result_dict = {
            'title': title,
            'text': text,
            "who": who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }
        return result_dict

    def prettyPrint5w(self, result):
        # print("\nExtracted 5W from: "+result['title'])
        print()
        if result['who']:
            print("WHO is involved in the news?: ", result['who'])
        else:
            print("Sorry, cannot detect the WHO in the news")

        if result['where']:
            print("WHERE does the news take place?: ", result['where'])
        else:
            print("Sorry, cannot detect the WHERE in the news")

        if result['when']:
            print("WHEN did the event in the news happen?: ", result['when'])
        else:
            print("Sorry, cannot detect the WHEN in the news")

        if not result['who']:
            print(
                "WHAT in the news was not detected, because the WHO element was not detected"
            )
        else:
            print("WHAT's happening in the news: ", result['what'])

        if not result['why']:
            if not result['what']:
                print(
                    "WHY in the news was not detected, because the WHAT element was not detected"
                )
            else:
                print("Sorry, cannot detect the WHY in the news")
        else:
            print("WHY did the event in the news happen?: ", result['why'])
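
A minimal usage sketch for the class above, assuming the scenario-3 model files hard-coded in extract5w exist on disk; the sample title and text are illustrative:

fwe = FiveWExtractor()
result = fwe.extract5w(
    "A bomb exploded in downtown Jakarta on Monday morning...",
    "Explosion rocks Jakarta")
fwe.prettyPrint5w(result)
# result is a dict with the keys: title, text, who, where, what, when, why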
Example #8
        print("Extracting feature: distribution ")
        entities = self.findDistribution(data["text"], entities)

        # append text index
        for entity in entities:
            entity['id_text'] = data["filename"]

        feature = pd.DataFrame(entities)
        return feature

    def extractFeaturesDirectFromText(self, data):
        print("Extracting features from text...")
        print("Extracting feature: entity types")
        entities = self.extractEntity(data["ner"])
        entities = self.pre.removeDuplicateListDict(
            self.findNounPhraseFromTitle(data["title"], entities))
        print("Extracting feature: occurences in text ")
        entities = self.countCfOccurencesInText(entities, data["coref"],
                                                data["title"])
        print("Extracting feature: occurences in title ")
        entities = self.findOccurencesInTitle(data["title"], entities)
        print("Extracting feature: distribution ")
        entities = self.findDistribution(data["text"], entities)

        feature = pd.DataFrame(entities)
        return feature


e = FeatureExtractor()
ut = Utility()
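
A minimal usage sketch for the instances created above, assuming a dict carrying the keys the method reads (title, text, ner, coref); the values shown are illustrative, and the exact coref shape depends on NLPHelper.getCoref:

data = {
    'title': "Explosion rocks Jakarta",
    'text': "A bomb exploded in downtown Jakarta on Monday morning.",
    'ner': [("Jakarta", "LOCATION"), ("Monday", "DATE")],
    'coref': {},
}
feature = e.extractFeaturesDirectFromText(data)
print(feature.head())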
Example #9
 def __init__(self):
     self.nlp = NLPHelper()
     self.ut = Utility()
     self.fex = FeatureExtractor()