# Example #1
# 0
class CheckDataset(object):
    def __init__(self):
        """Create the helper objects this checker holds on to."""
        # NLPHelper, Utility and FeatureExtractor are defined outside this
        # view. Of the three, only `self.ut` is used by the methods of this
        # class that are visible here.
        self.nlp = NLPHelper()
        self.ut = Utility()
        self.fex = FeatureExtractor()

    def checkData(self, path="scenario2_fullidn_pickle/", output="TF.xlsx"):
        """Build a per-file term-frequency table and hand it to the Excel export.

        Every entry of ``path`` is loaded with ``self.ut.loadPickle``; the
        loaded object must provide the keys ``'filename'`` and ``'text'``.
        The text is counted with ``countTermFrequency`` and the resulting
        rows are tagged with the file name.

        Args:
            path: Directory holding the pickled news records.
            output: Workbook name passed to ``self.ut.convertToExcel``.

        Returns:
            None. The combined table is passed to ``self.ut.convertToExcel``
            together with the sheet name ``'Sheet1'``.
        """
        # NOTE(review): loadPickle presumably unpickles the file, which can
        # execute arbitrary code -- only point `path` at trusted data.
        frames = []
        # os.listdir returns entries in arbitrary order; sorting makes the
        # exported table reproducible between runs and machines.
        for name in sorted(os.listdir(path)):
            # Each pickle holds the NER, coref and POS data of one news text.
            record = self.ut.loadPickle(os.path.join(path, name))
            counts = self.countTermFrequency(record['text'])
            counts['filename'] = record['filename']
            frames.append(counts)

        # DataFrame.append was removed in pandas 2.0 and copied the whole
        # table on every call; concatenate once instead. pd.concat rejects an
        # empty list, so an empty directory still exports an empty table.
        data = pd.concat(frames) if frames else pd.DataFrame()
        self.ut.convertToExcel(output, data, 'Sheet1')

    def countTermFrequency(self, text):
        """Count how often each token occurs in ``text``.

        Args:
            text: Raw text; it is tokenised with ``nltk.word_tokenize``.

        Returns:
            A DataFrame with the columns ``'term'`` and ``'frequency'`` and
            one row per distinct token. An empty text yields an empty frame
            that still has both columns.
        """
        import nltk

        fdist = nltk.FreqDist(nltk.word_tokenize(text))

        # Build the frame from (term, count) pairs with explicit columns.
        # The earlier from_dict(..., orient='index').reset_index() produced a
        # single column when no token was found, so assigning the two column
        # names raised ValueError for empty input.
        return pd.DataFrame(list(fdist.items()), columns=['term', 'frequency'])
# Example #2
# 0
class FiveWExtractor(object):
    def __init__(self):
        """Instantiate the collaborators the extraction methods rely on."""
        # Preprocess: provides joinText and sieveSubstring.
        self.pre = Preprocess()
        # NLPHelper: provides the NER, coreference, constituency-parsing and
        # POS calls used throughout this class.
        self.nlp = NLPHelper()
        # FeatureExtractor: builds the feature table for the classifiers.
        self.fex = FeatureExtractor()
        # Utility: loads the pickled classifier models.
        self.ut = Utility()

    def extractNerCorefFromTxt(self, text, title):
        """Run NER and coreference extraction on ``text`` and bundle the output.

        Args:
            text: Body of the article, passed to ``self.nlp.getNER`` and
                ``self.nlp.getCoref``.
            title: Headline; stored in the result unchanged.

        Returns:
            A dict with the keys ``'title'``, ``'text'``, ``'ner'`` and
            ``'coref'`` (in that order). The dict is also printed.
        """
        entities = self.nlp.getNER(text)
        print("NER extraction completed")

        chains = self.nlp.getCoref(text)
        print("Coref extraction completed")

        bundle = dict(title=title, text=text, ner=entities, coref=chains)
        print(bundle)
        return bundle

    def extractINANerAndCoref(self, text, title):
        """Bundle the ``getIdnNER`` and ``getCoref`` output for ``text``.

        Args:
            text: Body of the article, passed to ``self.nlp.getIdnNER`` and
                ``self.nlp.getCoref``.
            title: Headline; stored in the result unchanged.

        Returns:
            A dict with the keys ``'title'``, ``'text'``, ``'ner'`` and
            ``'coref'`` (in that order).
        """
        # NOTE(review): getIdnNER is presumably the Indonesian NER model --
        # confirm against NLPHelper.
        entities = self.nlp.getIdnNER(text)
        print("NER extraction completed")

        chains = self.nlp.getCoref(text)
        print("Coref extraction completed")

        return dict(title=title, text=text, ner=entities, coref=chains)

    def extractWhoOrWhere(self, text, title, ml, ner_coref):
        """Classify the extracted entities and return those labelled 1.

        Args:
            text: Not used by this method.
            title: Not used by this method.
            ml: Path of the pickled classifier, loaded via
                ``self.ut.loadPickle``.
            ner_coref: NER/coref bundle handed to
                ``self.fex.extractFeaturesDirectFromText``.

        Returns:
            A list with the value of the second feature column for every row
            the classifier labels ``1``.
        """
        classifier = self.ut.loadPickle(ml)

        # Build the feature table and turn its categorical columns into
        # integers (convertToNumeric edits the table in place).
        table = self.convertToNumeric(
            self.fex.extractFeaturesDirectFromText(ner_coref))
        print(table)

        # The 'entity' column is not a model input, so it is dropped before
        # predicting.
        labels = classifier.predict(table.drop('entity', axis=1))
        print("candidates: ", labels)

        # NOTE(review): column position 1 is presumably 'entity' -- confirm
        # against FeatureExtractor; a positional lookup breaks silently if
        # the column order ever changes.
        return [
            table.iloc[row, 1]
            for row, label in enumerate(labels)
            if label == 1
        ]

    def convertToNumeric(self, dataset):
        """Replace the categorical 'type' and 'occ_title' columns by int codes.

        The frame is modified in place and also returned.

        Args:
            dataset: DataFrame with a ``'type'`` column of entity labels and
                an ``'occ_title'`` column of booleans.

        Returns:
            The same ``dataset`` object with both columns converted to int.

        Raises:
            ValueError: If a column holds a value that has no numeric code.
                The message names the column and the offending values.
        """
        encodings = (
            ('type', {
                'PERSON': 1,
                'LOCATION': 2,
                'ORGANIZATION': 3,
                'NP': 4,
                'DATE': 5,
                'TIME': 6,
            }),
            ('occ_title', {
                False: 0,
                True: 1,
            }),
        )

        for column, codes in encodings:
            encoded = dataset[column].map(codes)
            # Series.map leaves NaN for keys missing from the dict. Casting
            # that to int fails with a message that says nothing about the
            # cause, so report the unknown labels explicitly instead.
            unmapped = encoded.isna()
            if unmapped.any():
                unknown = sorted({str(value) for value in dataset[column][unmapped]})
                raise ValueError(
                    "Cannot encode column '{}': no numeric code for {}".format(
                        column, unknown))
            dataset[column] = encoded.astype(int)

        return dataset

    def getWhenCandidatefromNER(self, ner_list):
        """Collect DATE and TIME phrases from NER output as WHEN candidates.

        DATE tokens and TIME tokens are gathered in two separate buffers.
        A token with any other label closes both buffers: each non-empty one
        is joined with ``self.pre.joinText`` and stored as one phrase.

        Args:
            ner_list: Iterable of entries whose item 0 is the token text and
                item 1 the NER label.

        Returns:
            The date phrases followed by the time phrases, each list filtered
            through ``self.pre.sieveSubstring``; ``None`` when there are none.
        """
        print("Getting date and time entities in text with NER...")

        list_date = []
        list_time = []
        date = []
        time = []

        for ner in ner_list:
            label = ner[1]
            if label == 'DATE':
                date.append(ner[0])
            elif label == 'TIME':
                time.append(ner[0])
            else:
                if date:
                    list_date.append(self.pre.joinText(date))
                    date = []
                if time:
                    list_time.append(self.pre.joinText(time))
                    time = []

        # A run that reaches the end of the input is never followed by a
        # closing token, so flush the buffers once more. Without this, the
        # last DATE/TIME phrase of the text was silently dropped.
        if date:
            list_date.append(self.pre.joinText(date))
        if time:
            list_time.append(self.pre.joinText(time))

        list_date = self.pre.sieveSubstring(list_date)
        list_time = self.pre.sieveSubstring(list_time)

        return (list_date + list_time) or None

    def extractWhenFromText(self, text, ner):
        """Pick the best-scoring WHEN candidate found in the NER output.

        Args:
            text: Body of the article; passed to ``scoreWhenCandidate``.
            ner: NER output; passed to ``getWhenCandidatefromNER``.

        Returns:
            The candidate with the highest score, or ``None`` when there is
            no candidate. On equal scores the earliest candidate is kept.
        """
        print()
        print("Extracting WHEN...")
        when_candidates = self.getWhenCandidatefromNER(ner)
        if not when_candidates:
            return None

        # max() returns the first of several equally scored items. The former
        # hand-written loop tested `not when_score`, which treated a score of
        # 0 as "no score yet", so among zero-score ties the last candidate
        # won while positive ties kept the first.
        return max(
            when_candidates,
            key=lambda candidate: self.scoreWhenCandidate(candidate, text))

    def findPositioninText(self, candidate, sent_list):
        """Return the 1-based index of the first sentence containing ``candidate``.

        The comparison is case-insensitive and treats ``candidate`` as plain
        text.

        Args:
            candidate: Phrase to look for.
            sent_list: Sentences of the document, in reading order.

        Returns:
            The position (starting at 1) of the first matching sentence, or
            ``None`` when no sentence contains the phrase.
        """
        needle = candidate.lower()
        # A plain substring test is used on purpose: the candidate is text
        # taken from the article, and feeding it to re.search unescaped made
        # '.' a wildcard and raised re.error on characters such as '('.
        for pos, sentence in enumerate(sent_list, start=1):
            if needle in sentence.lower():
                return pos
        # Only give up after every sentence was checked; returning from
        # inside the loop meant only the first sentence was ever examined.
        return None

    def scoreWhenCandidate(self, candidate, text):
        """Score a WHEN candidate by its position and by how well it parses.

        The score is::

            10 * (d - p) / d + 1 * isDate + 1 * isTime + 5 * isDateTime

        where ``d`` is the number of sentences in ``text`` and ``p`` is the
        value returned by ``findPositioninText`` for the candidate.

        Args:
            candidate: Date/time phrase to score.
            text: Body of the article.

        Returns:
            The score, or ``0`` when ``findPositioninText`` reports no
            position for the candidate.
        """
        print("Scoring WHEN candidate: " + candidate)
        position_weight, date_weight, time_weight, datetime_weight = 10, 1, 1, 5

        sentences = sent_tokenize(text)
        total = len(sentences)
        position = self.findPositioninText(candidate, sentences)
        if not position:
            return 0

        # Earlier sentences earn a larger positional share; the three parse
        # checks are evaluated in the order date, time, datetime.
        return (position_weight * ((total - position) / total)
                + date_weight * self.isDate(candidate)
                + time_weight * self.isTime(candidate)
                + datetime_weight * self.isDateTime(candidate))

    def isDate(self, candidate):
        """Return 1 if the parse result of ``candidate`` has a date field set.

        Args:
            candidate: Phrase to check.

        Returns:
            ``1`` when any of ``day``, ``month``, ``year`` or ``weekday`` on
            the first element of the parse result is truthy, otherwise ``0``
            (also when parsing raises ValueError or AttributeError).
        """
        print("Checking if " + candidate +
              " can be parsed to a Date object...")
        # Rebinds `parse` on the `parser.parser` class to `parse_date` (both
        # come from outside this block) on every call; this affects every
        # user of that class, not only this method.
        parser.parser.parse = parse_date
        try:
            fields = parser.parser().parse(candidate, None)[0]
            # NOTE(review): the check is by truthiness, so a field that is
            # set to 0 counts as absent -- confirm that is intended.
            has_date = any(
                getattr(fields, name)
                for name in ('day', 'month', 'year', 'weekday'))
        except (ValueError, AttributeError):
            return 0
        return 1 if has_date else 0

    def isDateTime(self, candidate):
        """Return 1 if ``parse(candidate)`` succeeds, otherwise 0.

        Args:
            candidate: Phrase to check.

        Returns:
            ``1`` when ``parse`` returns without raising, ``0`` when it
            raises ValueError or AttributeError. The parsed value itself is
            not used.
        """
        print("Checking if " + candidate +
              " can be parsed to a DateTime object...")
        # `parse` comes from outside this block; only success or failure of
        # the call matters here.
        try:
            parse(candidate)
        except (ValueError, AttributeError):
            return 0
        else:
            return 1

    def isTime(self, candidate):
        """Rate whether ``candidate`` carries a time of day.

        Args:
            candidate: Phrase to check.

        Returns:
            ``1`` when a time field is set and no date field is,
            ``0.8`` when both a time field and a date field are set,
            ``0`` when no time field is set or parsing raises ValueError or
            AttributeError.
        """
        print("Checking if " + candidate +
              " can be parsed to a Time object...")
        # Rebinds `parse` on the `parser.parser` class to `parse_date` (both
        # come from outside this block) on every call.
        parser.parser.parse = parse_date
        time_fields = ('hour', 'minute', 'second', 'microsecond')
        date_fields = ('day', 'month', 'year', 'weekday')
        try:
            fields = parser.parser().parse(candidate, None)[0]

            # NOTE(review): fields are tested by truthiness, so a value of 0
            # counts as "not set" -- confirm that is intended.
            if not any(getattr(fields, name) for name in time_fields):
                return 0
            if any(getattr(fields, name) for name in date_fields):
                # Time together with a date scores slightly lower.
                return 0.8
            return 1
        except (ValueError, AttributeError):
            return 0

    def extractWhatFromText(self, who_candidates, title, text):
        """Collect verb phrases that describe what the WHO candidates did.

        For every WHO candidate:

        * if it occurs in ``title``, all verb phrases of the title are taken;
        * otherwise the first sentence of ``text`` that mentions it (whole
          word, case-insensitive) is parsed and its verb phrases are taken;
        * if no sentence mentions it, the candidate contributes nothing.

        Args:
            who_candidates: WHO strings; may be empty or ``None``.
            title: Headline of the article.
            text: Body of the article.

        Returns:
            The collected phrases after ``self.pre.sieveSubstring``, or
            ``None`` when ``who_candidates`` is empty.
        """
        print()
        print("Extracting WHAT...")
        if not who_candidates:
            return None

        print(who_candidates)
        what = []
        sentences = None  # tokenised lazily, at most once for all candidates

        for who in who_candidates:
            if who in title:
                print("getting subsequent Verb Phrase from title...")
                what.extend(self._verbPhrases(title))
                continue

            if sentences is None:
                sentences = sent_tokenize(text)

            mention = re.compile(r'\b' + re.escape(who.lower()) + r'\b')
            for sent in sentences:
                if mention.search(sent.lower()):
                    print("getting subsequent Verb Phrase from sentence...")
                    what.extend(self._verbPhrases(sent))
                    break
            # When no sentence mentions this candidate, nothing is added.
            # Previously the parse of an earlier candidate was reused here,
            # or an UnboundLocalError was raised if there was none.

        return self.pre.sieveSubstring(what)

    def _verbPhrases(self, sentence):
        """Return the text of every VP subtree in the first parse of ``sentence``."""
        parses = list(self.nlp.getConstituencyParsing(sentence))
        if not parses:
            # No parse available: contribute nothing rather than fail on [0].
            return []
        return [
            ' '.join(phrase.leaves())
            for phrase in parses[0].subtrees(lambda t: t.label() == 'VP')
        ]

    def extractWhyFromText(self, what_candidates, text):
        """Return the sentences of ``text`` that look like a reason.

        A sentence qualifies when at least one of these holds:

        * it contains one of the causal keywords (whole word,
          case-insensitive);
        * it contains a WHAT candidate and a ``TO`` tag directly followed by
          a ``VB`` tag in its POS sequence;
        * it contains a WHAT candidate and the word "will".

        Args:
            what_candidates: WHAT phrases; may be empty or ``None``.
            text: Body of the article.

        Returns:
            The qualifying sentences in document order (possibly empty).
        """
        print()
        print("Extracting WHY...")
        # (keyword, weight) pairs. The weights are not used for ranking yet;
        # a sentence qualifies as soon as any rule matches.
        regexWhy = [('since', 0.2), ('cause', 0.3), ('because', 0.3),
                    ('hence', 0.2), ('therefore', 0.3), ('why', 0.3),
                    ('result', 0.4), ('reason', 0.3), ('provide', 0.1),
                    ('s behind', 0.1), ('Due to', 0.2)]

        # Compile every pattern once instead of once per sentence.
        keyword_patterns = [
            re.compile(r'\b' + re.escape(keyword.lower()) + r'\b')
            for keyword, _weight in regexWhy
        ]
        will_pattern = re.compile(r'\b' + re.escape('will') + r'\b')

        why_candidates = []

        for sent in sent_tokenize(text):
            lowered = sent.lower()
            matched_key = [
                reg for reg, pattern in zip(regexWhy, keyword_patterns)
                if pattern.search(lowered)
            ]

            # Assumes one sentence has only one WHAT.
            for what in what_candidates or ():
                if what.lower() not in lowered:
                    continue

                # Rule WHAT(.*)to/TO(.*)/VB: a TO tag directly followed by VB.
                print("getting Part of Speech tag...")
                pos = self.nlp.getPOS(sent)
                # Pairing each tag with its successor stops one item early,
                # so a trailing TO no longer indexes past the end of the list.
                for current, following in zip(pos, pos[1:]):
                    if current[1] == 'TO' and following[1] == 'VB':
                        print("getting VERB in text...")
                        matched_key.append(('(WHAT(.*)to/TO(.*)/VB)', 0.5))

                # Rule WHAT(.*)will.
                if will_pattern.search(lowered):
                    matched_key.append(('(WHAT(.*)will)', 0.5))

            if matched_key:
                why_candidates.append(sent)

        return why_candidates

    def extract5w(self, text, title):
        """Extract WHO, WHERE, WHEN, WHAT and WHY using the scenario-3 models.

        NER/coref comes from ``extractNerCorefFromTxt``.

        Args:
            text: Body of the article.
            title: Headline of the article.

        Returns:
            A dict with the keys ``'title'``, ``'text'``, ``'who'``,
            ``'where'``, ``'what'``, ``'when'`` and ``'why'``. ``'what'`` is
            ``None`` when no WHO was found.
        """
        # Classifier pickles for WHO and WHERE. Alternatives kept for
        # reference:
        #   scenario 1: ./model/scen1_train_{who,where}_halfidn.pkl
        #   scenario 2: ./model/scen2_train_{who,where}_fullidn.pkl
        who_model = "./model/scen3_train_who_default.pkl"
        where_model = "./model/scen3_train_where_default.pkl"

        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")

        # NER and coreference are computed once and shared by both classifiers.
        annotations = self.extractNerCorefFromTxt(text, title)

        print("Extracting WHO...")
        who = self.extractWhoOrWhere(text, title, who_model, annotations)
        print("\nExtracting WHERE...")
        where = self.extractWhoOrWhere(text, title, where_model, annotations)
        when = self.extractWhenFromText(text, annotations['ner'])
        # WHAT is derived from WHO, so it is skipped when no WHO was found.
        what = self.extractWhatFromText(who, title, text) if who else None
        why = self.extractWhyFromText(what, text)

        return {
            'title': title,
            'text': text,
            "who": who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }

    def extract5wLocalNews(self, text, title):
        """Extract WHO, WHERE, WHEN, WHAT and WHY using the scenario-2 models.

        NER/coref comes from ``extractINANerAndCoref``.

        Args:
            text: Body of the article.
            title: Headline of the article.

        Returns:
            A dict with the keys ``'title'``, ``'text'``, ``'who'``,
            ``'where'``, ``'what'``, ``'when'`` and ``'why'``. ``'what'`` is
            ``None`` when no WHO was found.
        """
        # Classifier pickles for WHO and WHERE. Alternatives kept for
        # reference, each as a who/where pair under ./model/:
        #   scen1_train_{who,where}_halfidn.pkl
        #   scen2_train_{who,where}_fullidn.pkl   (active)
        #   scen3_train_{who,where}_default.pkl
        # Each scenario also exists with a "3_" prefix and with an "HO_"
        # prefix, e.g. 3_scen2_train_who_fullidn.pkl and
        # HO_scen2_train_who_fullidn.pkl.
        who_model = "./model/scen2_train_who_fullidn.pkl"
        where_model = "./model/scen2_train_where_fullidn.pkl"

        print("Using " + who_model + " as WHO classifier and " + where_model +
              " as WHERE classifier\n")

        # NER and coreference are computed once and shared by both classifiers.
        annotations = self.extractINANerAndCoref(text, title)

        who = self.extractWhoOrWhere(text, title, who_model, annotations)
        where = self.extractWhoOrWhere(text, title, where_model, annotations)
        when = self.extractWhenFromText(text, annotations['ner'])
        # WHAT is derived from WHO, so it is skipped when no WHO was found.
        what = self.extractWhatFromText(who, title, text) if who else None
        why = self.extractWhyFromText(what, text)

        return {
            'title': title,
            'text': text,
            "who": who,
            'where': where,
            'what': what,
            'when': when,
            'why': why
        }

    def prettyPrint5w(self, result):
        """Print a human-readable summary of an extraction result.

        Args:
            result: Mapping with the keys ``'who'``, ``'where'``, ``'when'``,
                ``'what'`` and ``'why'``.

        Returns:
            None. One line is printed per element, after a leading blank line.
        """
        print()

        # WHO, WHERE and WHEN follow the same pattern: show the value when
        # there is one, otherwise apologise.
        simple_rows = (
            ('who', "WHO is involved in the news?: ",
             "Sorry, can not detect the WHO in the news"),
            ('where', "WHERE does the news take place?: ",
             "Sorry, can not detect the WHERE in the news"),
            ('when', "WHEN did the event in the news happen: ",
             "Sorry, can not detect the WHEN in the news"),
        )
        for key, label, apology in simple_rows:
            if result[key]:
                print(label, result[key])
            else:
                print(apology)

        # WHAT depends on WHO, so its message is keyed on WHO being present.
        if result['who']:
            print("WHAT's happening in the news: ", result['what'])
        else:
            print(
                "WHAT in the news is not detected, because the WHO element in the news was not detected"
            )

        # WHY depends on WHAT: distinguish "nothing found" from "could not
        # even look".
        if result['why']:
            print("WHY did the event in the news happen: ", result['why'])
        elif result['what']:
            print("Sorry, can not detect the WHY in the news")
        else:
            print(
                "WHY in the news is not detected, because the WHAT element in the news was not detected"
            )