Ejemplo n.º 1
0
    def testrun(self):
        """Standalone smoke test of the whole pipeline.

        Pulls a handful of rows from a local MySQL table, preprocesses each
        tweet in place, builds a vector-space model, trains Naive Bayes and
        classifies one hard-coded sentence.
        """
        connection = pymysql.connect(host='localhost', user='******', passwd='', database='dataset_twitter',charset='utf8')
        frame = pd.read_sql('SELECT * FROM tweet_2 order by id asc limit 0,9', con=connection)
        preprocessor = Preprocessing()

        for idx, record in frame.iterrows():
            cleaned = preprocessor.process(record['text'])
            print("Preprocessed : ", cleaned, " -> ", record['clas'])
            frame.at[idx, 'text'] = cleaned

        print('\n\n\n\n\n\n\n')

        excluded = ['clas','id']
        sample = "Dikejar waktu biar bisa buat Pencitraan #prihatin"
        space = Vsm()
        vectors = space.vsm(frame, exceptional_feature=excluded)
        classifier = NaiveBayes()
        classifier.builtmodel(vectors)
        classifier.classify(sample)
Ejemplo n.º 2
0
    def run(self):
        """Train on the first rows of the configured training table and
        classify one hard-coded sentence.

        The SQL LIMIT clause restricts the run to a small sample for
        debugging; remove it when deploying.
        """
        frame = pd.read_sql('SELECT * FROM '+self.training_table+' order by id asc limit 0,9', con=self.con)

        for idx, record in frame.iterrows():
            cleaned = self.p.process(record[self.text_col])
            print("Preprocessed : ", cleaned, " -> ", record[self.class_col])
            frame.at[idx, 'text'] = cleaned

        print('\n\n\n\n\n\n\n')

        space = Vsm()
        vectors = space.vsm(frame, exceptional_feature=self.exceptional_feature)
        vectors = self.feature_selection.run(vectors, take_feature=10, exceptional_feature=self.exceptional_feature)
        classifier = NaiveBayes()
        classifier.builtmodel(vectors)

        sample = "kecewa tolong hati jalan kalimant tidak rubah musim hujan parah"
        classifier.classify(sample)
Ejemplo n.º 3
0
class App():
    """GUI-facing controller for a Naive Bayes tweet classifier.

    Ties together the project's ``Database`` wrapper, ``Preprocessing``,
    ``Vsm`` (vector-space model) builder, ``InfoGain`` feature selection and
    the ``NaiveBayes`` classifier.  Long-running methods accept optional Qt
    objects (``qc`` whose ``processEvents`` is pumped, ``progress`` bars,
    ``logdis`` plain-text log panes, ``label`` status labels) so the UI
    stays responsive during training/evaluation.
    """
    def __init__(self):
        self.db = ""                   # database name (set externally; not used directly here)
        self.training_table = ""       # table holding the labelled training rows
        self.exceptional_feature = []  # columns excluded from the VSM features
        self.class_col = 'clas'        # name of the class-label column
        self.text_col = 'text'         # name of the text column

        self.con = None                # Database connection, set by connectDb()
        self.classificator = None      # trained NaiveBayes, set by trainingClassificator()
        self.dataTraining = None       # pandas DataFrame of the training data

    def checkConnection(self):
        """Return True when a database connection is currently held."""
        if self.con != None:
            return True
        return False

    def connectTo(self, host, user, password,
                  db):  # connect to different db and return the connection
        """Open a new Database connection; return it on success, else None.

        Unlike connectDb() this does NOT store the connection on self.
        """
        tryConnect = Database()
        tryConnectStat = tryConnect.connect(host, user, password, db)
        if tryConnectStat['success'] == True:
            return tryConnect

        return None

    def connectDb(self, host, user, password, db):
        """Connect to *db* and remember the connection on ``self.con``.

        Returns a dict with keys 'success', 'msg' and 'tables' (the table
        list on success, None on failure).  On failure ``self.con`` is reset
        to None.
        """
        tables = None
        tryConnect = Database()
        tryConnectStat = tryConnect.connect(host, user, password, db)
        if tryConnectStat['success'] == True:
            self.con = tryConnect
            tables = self.con.tables(db)
        else:
            self.con = None

        ret = {}

        ret['success'] = tryConnectStat['success']
        ret['msg'] = tryConnectStat['msg']
        ret['tables'] = tables
        return ret

    def setTrainingTable(self, table):
        """Set the table name used as the training data source."""
        self.training_table = table

    def setExceptionalFeature(self, ex):
        """Set the list of columns to exclude from VSM features."""
        self.exceptional_feature = ex

    def setClassCol(self, col):
        """Set the name of the class-label column."""
        self.class_col = col

    def setTextCol(self, col):
        """Set the name of the text column."""
        self.text_col = col

    def removesymbol(self, text):
        """Lower-case *text* and strip mentions, hashtags, non-letter
        characters and leading numbers, collapsing whitespace runs."""
        cleantext = text
        cleantext = cleantext.lower()
        url_pattern = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
        # symbol_pattern = '[\[\]\(\)!@#$%^&*-+=_`~\{\}\\\/;:\'\"<>,.?]'
        # allowonlyletternumber_pattern = "(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
        allowonlyletternumber_pattern = "(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)|([^A-Za-z \t])|(\w+:\/\/\S+)"
        # allowonlyletter_pattern = "([^A-Za-z \t])|(\w+:\/\/\S+)"
        cleantext = ' '.join(
            re.sub(allowonlyletternumber_pattern, " ", cleantext).split())
        # NOTE(review): the pattern above already removed the ':' and '/'
        # characters a URL needs, so the two substitutions below will rarely
        # match anything -- confirm whether they should run first.
        cleantext = re.sub(url_pattern, '', cleantext)
        cleantext = re.sub(r'\b\d+(?:\.\d+)?\s+', '', cleantext)
        # cleantext = re.sub(' +',' ',text) # remove extra whitespace
        return cleantext

    def isRootWordDB(self, text):
        """Return False when *text* is absent from the root_word table.

        Returns True when the word exists or when no connection is available
        (a warning is printed in that case).
        """
        # NOTE(review): SQL built via string formatting -- acceptable for
        # trusted input, but a parameterized query would be safer.
        sql = "SELECT id_rootword from root_word where rootword='{0}'"
        if self.con is not None:
            row = self.con.queryWithRowCount(sql.format(text))
            if row is not None and row['count'] < 1:
                return False
        else:
            print("No connection![isRootWordDB]")
        return True

    def isWordExistDB(self, text):
        """Return False when *text* is absent from the fix_word table.

        Returns True when the word exists or when no connection is available
        (a warning is printed in that case).
        """
        # NOTE(review): same string-formatted SQL caveat as isRootWordDB.
        sql = "SELECT id from fix_word where word='{0}'"
        if self.con is not None:
            row = self.con.queryWithRowCount(sql.format(text))
            if row is not None and row['count'] < 1:
                return False
        else:
            print("No connection![isWordExistDB]")
        return True

    def checkWord(self, text):
        """Insert *text* into the fix_word table if it is not already there."""
        abbreviation_word = text
        # if len(self.unfixedWords) > 0: # assume that word have no vocal character -> and re.search(r'^[^aeiou]+$',abbreviation_word)
        if not self.isWordExistDB(abbreviation_word):
            sqlinsert = "INSERT into fix_word set word='{0}'"
            if self.con is not None:
                if self.con.queryInsert(sqlinsert.format(abbreviation_word)):
                    print("Insert : Success")
                else:
                    print("Insert : Failed")
            else:
                print("No connection![checkWord]")

    def chekForUnidenChar(self, qc=None):
        """Scan the training table for tokens flagged by the stemmer's
        isSixChar check that are not known root words, and record each via
        checkWord().

        qc, when given, is pumped (processEvents) to keep the UI alive.
        """
        stemmer = ECSP()
        self.dataTraining = self.con.getDataAsDF(self.training_table)
        if self.dataTraining is not None:
            for index, row in self.dataTraining.iterrows():
                if qc:
                    qc.processEvents()
                text = row[self.text_col]
                text = self.removesymbol(text)
                textToken = text.split(" ")
                for newtext in textToken:
                    if stemmer.isSixChar(
                            newtext) and not self.isRootWordDB(newtext):
                        self.checkWord(newtext)
                    if qc:
                        qc.processEvents()
                if qc:
                    qc.processEvents()

    def builtVSM(self,
                 doFeatureSelection,
                 take_feature,
                 threshold,
                 dataTraining=None,
                 qc=None,
                 logdis=None,
                 label=None):
        """Build the vector-space model for *dataTraining* (defaults to
        ``self.dataTraining``), optionally reducing features via InfoGain.

        take_feature and threshold are forwarded to InfoGain.run().
        qc/logdis/label are optional Qt hooks for responsiveness and status
        reporting.  Returns the (possibly reduced) VSM structure.
        """
        if dataTraining is None:
            dataTraining = self.dataTraining

        if logdis != None:
            logdis.appendPlainText("\nBuilding VSM ...")
        if label != None:
            label.setText("Building VSM ...")
        v = Vsm()
        vsm = v.vsm(dataTraining,
                    exceptional_feature=self.exceptional_feature,
                    coltext=self.text_col,
                    colclass=self.class_col,
                    qc=qc,
                    logdis=logdis)
        if doFeatureSelection:
            if label != None:
                label.setText("Doing feature selection ...")
            if logdis != None:
                logdis.appendPlainText("\nDoing feature selection ...")
                logdis.appendPlainText(
                    str(take_feature) + " - " + str(threshold))
            f = InfoGain()
            vsm = f.run(vsm,
                        take_feature=take_feature,
                        threshold=threshold,
                        exceptional_feature=self.exceptional_feature,
                        colclas=self.class_col,
                        qc=qc,
                        logdis=logdis)
            if logdis != None:
                logdis.appendPlainText("Feature before : " +
                                       str(vsm['featurebeforelen']))
                logdis.appendPlainText("Feature after : " +
                                       str(vsm['columnlen']) + "\n")

        return vsm

    def preprocessing(self,
                      doPreprocessing,
                      doFeatureSelection,
                      take_feature,
                      threshold,
                      progress,
                      qc,
                      label=None):
        """Load the training table, preprocess every row in place, then
        build the VSM (with optional feature selection).

        Returns a dict with 'featurebefore' (unique feature count before
        selection), 'vsm' and 'oritext' (original text of the last row when
        doPreprocessing is set), or None when there is no connection/table.
        The progress bar is driven through fixed milestones (10/70/80/90/100).
        """
        features = None
        if self.con != None:
            if self.training_table:
                if label != None:
                    label.setText("Getting data training ...")
                self.dataTraining = self.con.getDataAsDF(self.training_table)
                progress.setValue(10)
                if self.dataTraining is not None:
                    p = Preprocessing(con=self.con)
                    oritext = None
                    uniqFeature = []
                    features = {}
                    originalFeatureCount = 0
                    progressP = 10
                    # Spread the 10..70 progress range over all rows.
                    progressS = (70 - progressP) / len(self.dataTraining.index)
                    if label != None:
                        label.setText("Preprocessing data training ...")
                    for index, row in self.dataTraining.iterrows():
                        text = row[self.text_col]

                        if doPreprocessing:
                            pretext = p.process(text)
                            oritext = pretext['oritext']
                            pretext = pretext['stemmed_text']
                        else:
                            pretext = p.processNoPre(text)

                        t = p.processNoPre(pretext).split(
                            " ")  # bad performance
                        uniqFeature.extend(t)  # bad performance
                        # Overwrite the row's text with the preprocessed form.
                        # print("Ori : ",text)
                        # print("Preprocessed : ",pretext," -> ",row[self.class_col])
                        self.dataTraining.at[index, self.text_col] = pretext
                        progressP += progressS
                        progress.setValue(progressP)
                        # time.sleep(0.5)
                        qc.processEvents()
                    progress.setValue(70)
                    qc.processEvents()
                    uniqFeature = set(uniqFeature)  # bad performance
                    qc.processEvents()
                    features['featurebefore'] = len(
                        uniqFeature)  # bad performance
                    qc.processEvents()
                    progress.setValue(80)

                    features['vsm'] = self.builtVSM(doFeatureSelection,
                                                    take_feature,
                                                    threshold,
                                                    qc=qc,
                                                    label=label)
                    features['oritext'] = oritext
                    progress.setValue(90)
            else:
                print("No training table!")
        progress.setValue(100)

        return features

    def preprocessingText(self, doPreprocessing, progress, qc):
        """Reload the training table and preprocess each row's text in place
        (no VSM is built).  Drives *progress* from 1 to 100 and pumps *qc*.
        """
        if self.con != None:
            if self.training_table:
                self.dataTraining = self.con.getDataAsDF(self.training_table)
                progress.setValue(1)
                if self.dataTraining is not None:
                    p = Preprocessing(con=self.con)
                    progressP = 1
                    progressS = (99 - progressP) / len(self.dataTraining.index)
                    for index, row in self.dataTraining.iterrows():
                        text = row[self.text_col]

                        if doPreprocessing:
                            pretext = p.process(text)
                            pretext = pretext['stemmed_text']
                        else:
                            pretext = p.processNoPre(text)

                        self.dataTraining.at[index, self.text_col] = pretext
                        progressP += progressS
                        progress.setValue(progressP)
                        qc.processEvents()
                    qc.processEvents()
                    progress.setValue(99)
            else:
                print("No training table!")
        progress.setValue(100)

    def trainingClassificator(self, vsm, qc=None):
        """Train the app-level classifier on vsm['vsm'] and return the model.

        Stores the NaiveBayes instance on ``self.classificator`` for later
        use by evalSentence().
        """
        self.classificator = NaiveBayes()
        vsm = vsm['vsm']
        model = self.classificator.builtmodel(vsm, qc=qc)
        return model

    def trainingClassificatorEval(self, vsm, qc=None, logdis=None):
        """Train and return a throwaway NaiveBayes classifier on *vsm*.

        Used by evalKFoldCV(); does not touch ``self.classificator``.
        """
        if logdis != None:
            logdis.appendPlainText("\nTraining model ...")
        classificator = NaiveBayes()
        classificator.builtmodel(vsm, qc=qc, logdis=logdis)
        return classificator

    def getDataTrainingProperty(self, clas):
        """Return a dict with the total row count and a formatted per-class
        count string for the classes listed in *clas*; False when no
        training data is loaded.
        """
        ret = {}
        if self.dataTraining is not None:
            ret['totaltrainingdata'] = len(self.dataTraining.index)
            t = self.dataTraining[self.class_col].value_counts()
            tdpc = ""
            num = 0
            tlen = len(t)
            for c in clas:
                tdpc += c + " : " + str(t[c])
                num += 1
                if num < tlen:
                    tdpc += ", "
            ret['totaltrainingdataperclas'] = tdpc
            return ret
        return False

    def getDataTrainingCount(self):
        """Return the number of training rows, or False when none loaded."""
        if self.dataTraining is not None:
            return len(self.dataTraining.index)
        return False

    def evalSentence(self, model, sentence):
        """Classify *sentence* with *model* using the stored classificator.

        Requires trainingClassificator() to have been called first so that
        ``self.classificator`` is set.
        """
        return self.classificator.classifyWithModel(
            model, sentence, Preprocessing(con=self.con))

    def evalKFoldCV(self,
                    totaltrainingdata,
                    folds,
                    doFeatureSelection,
                    numFeatureToRetain,
                    thresholdFeatureIgnore,
                    qc=None,
                    stat=None,
                    logdis=None):
        """Run k-fold cross validation over the loaded training data.

        Splits the first *totaltrainingdata* row indices into *folds* folds
        (leftover rows are distributed one each to the first folds), then for
        each fold trains on the remainder and evaluates on the fold.  Returns
        a dict with averaged 'accuration' (percentage), 'precision' and
        'recall', or False when no training data is loaded.

        stat/logdis are Qt widgets used for status text and logging; qc is
        pumped to keep the UI responsive.
        """
        if self.dataTraining is not None:
            dataLeft = totaltrainingdata % folds
            dataPerFold = floor(totaltrainingdata / folds)
            dataIndexFolds = []
            foldPos = 1
            dataPos = 1
            stat.setText("Counting folds ...")
            if qc:
                qc.processEvents()
            # Fill each fold with dataPerFold consecutive row indices.
            for i in range(totaltrainingdata):
                logdis.appendPlainText("Adding data index to fold " +
                                       str(foldPos) + " ...")
                if dataPos == 1:
                    newFold = []
                if dataPos <= dataPerFold:
                    newFold.append(i)
                    dataPos += 1
                    if dataPos > dataPerFold:
                        dataPos = 1
                if dataPos == 1:
                    dataIndexFolds.append(newFold)
                    foldPos += 1
                if foldPos > folds:
                    break
                if qc:
                    qc.processEvents()

            # Distribute the remainder rows (when folds doesn't divide the
            # total evenly) one each onto the first folds.
            if dataLeft > 0:
                logdis.appendPlainText("\nAdding data left to folds ...\n")
                indexToAdd = 0
                for i in range((totaltrainingdata - dataLeft),
                               totaltrainingdata):
                    dataIndexFolds[indexToAdd].append(i)
                    indexToAdd += 1
                    if qc:
                        qc.processEvents()

            evalResults = []
            for i in range(folds):
                stat.setText("Processing fold " + str(i) + " ...")
                logdis.appendPlainText("\n\n\nProcessing k=" + str(i) +
                                       " ...\n")
                logdis.appendPlainText(
                    "Getting data training and data testing ...")
                # Fold i is the test set; all remaining folds are training.
                dataIndexFolds_copy = list(dataIndexFolds)
                testData = self.dataTraining.iloc[dataIndexFolds_copy[i]]
                del dataIndexFolds_copy[i]
                trainingDataList = []
                for j in dataIndexFolds_copy:
                    trainingDataList.extend(j)
                    if qc:
                        qc.processEvents()
                trainingData = self.dataTraining.iloc[trainingDataList]
                vsmTraining = self.builtVSM(doFeatureSelection,
                                            numFeatureToRetain,
                                            thresholdFeatureIgnore,
                                            dataTraining=trainingData,
                                            qc=qc,
                                            logdis=logdis)
                model = self.trainingClassificatorEval(vsmTraining,
                                                       qc=qc,
                                                       logdis=logdis)
                evalResult = model.testclassificationDataframe(testData,
                                                               self.text_col,
                                                               self.class_col,
                                                               qc=qc,
                                                               logdis=logdis)
                evalResults.append(evalResult)
                if qc:
                    qc.processEvents()

            # Average the per-fold metrics.
            avg_accuration = 0
            avg_precision = 0
            avg_recall = 0
            for i in evalResults:
                avg_accuration += i['accuration']
                avg_precision += i['precision']
                avg_recall += i['recall']
                if qc:
                    qc.processEvents()
            if len(evalResults) > 0:
                er = len(evalResults)
                avg_accuration = avg_accuration / er
                avg_precision = avg_precision / er
                avg_recall = avg_recall / er
            ret = {}
            # NOTE(review): the accuracy is rounded to 2 decimals BEFORE the
            # *100 scaling, which loses precision -- confirm this is intended.
            ret['accuration'] = float(format(avg_accuration, '.2f')) * 100
            ret['precision'] = float(format(avg_precision, '.2f'))
            ret['recall'] = float(format(avg_recall, '.2f'))

            logdis.appendPlainText(
                "\n ############################################### \n\n")

            # print(dataIndexFolds)
            return ret

        else:
            print("No data training found!")
        return False
Ejemplo n.º 4
0
 def trainingClassificatorEval(self, vsm, qc=None, logdis=None):
     """Train and return a fresh NaiveBayes classifier on *vsm*.

     The model is NOT stored on self; this variant is intended for
     evaluation runs.  logdis, when given, receives a status message.
     """
     if logdis is not None:
         logdis.appendPlainText("\nTraining model ...")
     evaluator = NaiveBayes()
     evaluator.builtmodel(vsm, qc=qc, logdis=logdis)
     return evaluator
Ejemplo n.º 5
0
 def trainingClassificator(self, vsm, qc=None):
     """Train the app-level classifier on vsm['vsm'] and return the model.

     The NaiveBayes instance is kept on self.classificator for later use.
     """
     self.classificator = NaiveBayes()
     matrix = vsm['vsm']
     return self.classificator.builtmodel(matrix, qc=qc)
Ejemplo n.º 6
0
 def trainingClassificatorEval(self, vsm, qc=None):
     """Return a freshly trained NaiveBayes classifier for evaluation runs."""
     evaluator = NaiveBayes()
     evaluator.builtmodel(vsm, qc=qc)
     return evaluator
Ejemplo n.º 7
0
class App():
    """Simpler (non-Qt) variant of the classifier controller.

    Wires the project's Database wrapper, Preprocessing, Vsm builder,
    InfoGain feature selection and NaiveBayes classifier together without
    any progress/log widgets.
    """
    def __init__(self):
        self.db = ""                   # database name (set externally)
        self.training_table = ""       # table holding the labelled training rows
        self.exceptional_feature = []  # columns excluded from the VSM features
        self.class_col = 'clas'        # name of the class-label column
        self.text_col = 'text'         # name of the text column

        self.con = None                # Database connection, set by connectDb()
        self.classificator = None      # trained NaiveBayes
        self.dataTraining = None       # pandas DataFrame of the training data

    def checkConnection(self):
        """Return True when a database connection is currently held."""
        if self.con != None:
            return True
        return False

    def connectTo(self,host,user,password,db): # connect to different db and return the connection
        """Open a new Database connection; return it on success, else None."""
        tryConnect = Database()
        tryConnectStat = tryConnect.connect(host,user,password,db)
        if tryConnectStat['success'] == True:
            return tryConnect

        return None

    def connectDb(self,host,user,password,db):
        """Connect to *db*, remember the connection on self.con, and return
        a dict with 'success', 'msg' and 'tables' (None on failure)."""
        tables = None
        tryConnect = Database()
        tryConnectStat = tryConnect.connect(host,user,password,db)
        if tryConnectStat['success'] == True:
            self.con = tryConnect
            tables = self.con.tables(db)
        else:
            self.con = None

        ret = {}

        ret['success'] = tryConnectStat['success']
        ret['msg'] = tryConnectStat['msg']
        ret['tables'] = tables
        return ret

    def setTrainingTable(self,table):
        """Set the table name used as the training data source."""
        self.training_table = table
    def setExceptionalFeature(self,ex):
        """Set the list of columns to exclude from VSM features."""
        self.exceptional_feature = ex
    def setClassCol(self,col):
        """Set the name of the class-label column."""
        self.class_col = col
    def setTextCol(self,col):
        """Set the name of the text column."""
        self.text_col = col

    def preprocessing(self,doPreprocessing,doFeatureSelection,take_feature,threshold):
        """Load the training table, preprocess each row in place and build
        the VSM (optionally reduced with InfoGain).

        Returns a dict with 'featurebefore' (unique token count before
        selection) and 'vsm', or None when there is no connection/table.
        """
        features = None
        if self.con != None:
            if self.training_table:
                self.dataTraining = self.con.getDataAsDF(self.training_table)
                if self.dataTraining is not None:
                    p = Preprocessing()
                    uniqFeature = []
                    features = {}
                    originalFeatureCount = 0
                    for index,row in self.dataTraining.iterrows():
                        text = row[self.text_col]
                        t = p.processNoPre(text).split(" ") # bad performance
                        uniqFeature.extend(t) # bad performance

                        if doPreprocessing:
                            pretext = p.process(text)
                        else:
                            pretext = p.processNoPre(text)
                        # print("Ori : ",text)
                        # print("Preprocessed : ",pretext," -> ",row[self.class_col])
                        self.dataTraining.at[index,self.text_col] = pretext
                    uniqFeature = set(uniqFeature) # bad performance

                    v = Vsm()
                    vsm = v.vsm(self.dataTraining,exceptional_feature=self.exceptional_feature,coltext=self.text_col,colclass=self.class_col)
                    features['featurebefore'] = len(uniqFeature) # bad performance

                    if doFeatureSelection:
                        f = InfoGain()
                        vsm = f.run(vsm,take_feature=take_feature,threshold=threshold,exceptional_feature=self.exceptional_feature,colclas=self.class_col)
                    features['vsm'] = vsm

            else:
                print("No training table!")

        return features

    def trainingClassificator(self,vsm):
        """Train the app-level classifier on vsm['vsm'] and return the model."""
        self.classificator = NaiveBayes()
        vsm = vsm['vsm']
        model = self.classificator.builtmodel(vsm)
        return model

    def getDataTrainingProperty(self,clas):
        """Return a dict with the total row count and a formatted per-class
        count string for the classes in *clas*; False when no data loaded."""
        ret = {}
        if self.dataTraining is not None:
            ret['totaltrainingdata'] = len(self.dataTraining.index)
            t = self.dataTraining[self.class_col].value_counts()
            tdpc = ""
            num = 0
            tlen = len(t)
            for c in clas:
                tdpc+=c+" : "+str(t[c])
                num+=1
                if num < tlen:
                    tdpc+=", "
            ret['totaltrainingdataperclas'] = tdpc
            return ret
        return False

    def run(self):
        """Train on the first rows of the training table and classify one
        hard-coded sentence.

        NOTE(review): relies on self.p and self.feature_selection, which
        __init__ does not define here -- presumably set elsewhere; verify.
        """
        df = pd.read_sql('SELECT * FROM '+self.training_table+' order by id asc limit 0,9', con=self.con)
        # df = pd.read_sql('SELECT * FROM '+self.training_table+' order by id asc', con=self.con) # activate if deploying

        for index,row in df.iterrows():
            tweet = row[self.text_col]
            pretext = self.p.process(tweet)
            # print("Ori : ",tweet)
            print("Preprocessed : ",pretext," -> ",row[self.class_col])
            df.at[index,'text'] = pretext

        print('\n\n\n\n\n\n\n')

        v = Vsm()
        vsm = v.vsm(df,exceptional_feature=self.exceptional_feature)
        vsm = self.feature_selection.run(vsm,take_feature=10,exceptional_feature=self.exceptional_feature)
        nb = NaiveBayes()
        model = nb.builtmodel(vsm)

        testdata = "kecewa tolong hati jalan kalimant tidak rubah musim hujan parah"
        nb.classify(testdata)

    def run2(self):
        """Preprocess a sample of the training table and build a VSM with
        feature selection (no classification step).

        NOTE(review): like run(), depends on self.p and
        self.feature_selection being set elsewhere -- verify.
        """
        df = pd.read_sql('SELECT * FROM '+self.training_table+' order by id asc limit 0,9', con=self.con)

        for index,row in df.iterrows():
            tweet = row['text']
            pretext = self.p.process(tweet)
            # print("Ori : ",tweet)
            # print("Preprocessed : ",pretext," -> ",row['clas'])
            df.at[index,'text'] = pretext

        print('\n\n\n\n\n\n\n')

        # testdata = "Dikejar waktu biar bisa buat Pencitraan #prihatin"
        v = Vsm()
        vsm = v.vsm(df,exceptional_feature=self.exceptional_feature)
        vsm = self.feature_selection.run(vsm,exceptional_feature=self.exceptional_feature)

    def testrun(self):
        """Standalone smoke test against a hard-coded local MySQL database:
        preprocess a few tweets, build a VSM, train Naive Bayes and classify
        one hard-coded sentence."""
        db_connection = pymysql.connect(host='localhost', user='******', passwd='', database='dataset_twitter',charset='utf8')
        df = pd.read_sql('SELECT * FROM tweet_2 order by id asc limit 0,9', con=db_connection)
        p = Preprocessing()

        for index,row in df.iterrows():
            tweet = row['text']
            pretext = p.process(tweet)
            # print("Ori : ",tweet)
            print("Preprocessed : ",pretext," -> ",row['clas'])
            df.at[index,'text'] = pretext

        print('\n\n\n\n\n\n\n')

        exceptional_feature = ['clas','id']
        testdata = "Dikejar waktu biar bisa buat Pencitraan #prihatin"
        v = Vsm()
        vsm = v.vsm(df,exceptional_feature=exceptional_feature)
        nb = NaiveBayes()
        model = nb.builtmodel(vsm)
        nb.classify(testdata)