Example #1
# assumed imports for this snippet: stdlib, NLTK, and Sastrawi
import re
import string

from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory


def preprocessing(dataset):
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()
    for row in dataset:
        # case folding, then strip digits and punctuation
        row['message'] = row.get('message').casefold()
        row['message'] = re.sub(r"[0-9]", "", row.get('message'))
        row['message'] = re.sub('[' + re.escape(string.punctuation) + ']', "",
                                row.get('message'))
        row['message_stopwords'] = stopwords.remove(row['message'])
        row['message_stemmed'] = stemmer.stem(row['message_stopwords'])
        row['message_tokenized'] = word_tokenize(row['message_stemmed'])
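
A minimal usage sketch for this helper, assuming the imports shown above; the sample record below is hypothetical:

sample = [{'message': 'Saya sedang belajar NLP 123!'}]
preprocessing(sample)
print(sample[0]['message_tokenized'])
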
Example #2
    def train(self):
        """
        NOTE: Implement your training procedure in this method.
        """
        # read data.csv using pandas and drop nan
        data = pd.read_csv("data.csv").dropna()
        # get article_content and transform to list
        contents = data["article_content"].values.tolist()
        # get article_topic and transform to list
        topics = data["article_topic"].values.tolist()
        # regex tokenizer that keeps word characters (drops punctuation)
        tokenizer = RegexpTokenizer(r'\w+')
        # stopword removal for bahasa indonesia
        stopword = StopWordRemoverFactory().create_stop_word_remover()
        # list to save clean contents
        clean_contents = list()
        # looping the contents, and preprocess for each content
        for content in contents:
            # case folding: lowercase the sentence
            lowcase_word = content.lower()
            # remove stopword from the content
            stop_word = stopword.remove(lowcase_word)
            # tokenize the content
            sentence_token = tokenizer.tokenize(stop_word)
            # initialize a list for clean token
            clean_tokens = list()
            for token in sentence_token:
                # collect the cleaned token (content is already lowercased above)
                clean_tokens.append(token)
            # join the tokens back into a sentence
            sentence = " ".join(clean_tokens)
            # append clean sentence
            clean_contents.append(sentence)

        # count vectorizer
        X_train_counts = self.count_vect.fit_transform(clean_contents)
        # create tfidf from count vectorizer
        X_train_tfidf = self.tfidf_transformer.fit_transform(X_train_counts)
        # split data to train and test set > test 10%, train 90%
        X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf,
                                                            topics,
                                                            test_size=0.1)
        # train a model
        self.svm_clf.fit(X_train, y_train)

        # prediction for x_test
        prediction = self.svm_clf.predict(X_test)
        # model accuracy for x_test
        accuracy = accuracy_score(y_test, prediction)
        # print accuracy
        print(accuracy)
Example #3
    def __init__(self, input, file_location):
        data = self.dataFromFile(file_location)
        stopword = StopWordRemoverFactory().create_stop_word_remover()
        stemmer = StemmerFactory().create_stemmer()
        input = stopword.remove(input.lower())
        input = stemmer.stem(input)
        valid = 0
        for i in range(len(data)):
            kal = stopword.remove(data[i][0].lower())
            kal = stemmer.stem(kal)
            if self.bm(input.lower(), kal.lower()) != -1:
                if valid == 0:
                    percent = len(input) * 100 / len(kal)
                    # print("Confidence1 : " + str(percent))
                    if percent > 80:
                        self.answere = data[i][1]
                    valid = 1
            else:
                if valid == 0:
                    if self.bm2(input.lower(), kal.lower()) >= 80:
                        # print("Confidence2 : " + str(bm2(input.lower(), kal.lower())))
                        self.answere = data[i][1]
                        valid = 1
Example #4
    def clean_text(self, data):
      stopword = StopWordRemoverFactory().create_stop_word_remover()
      stemmer = StemmerFactory().create_stemmer()

      # keep letters only and lowercase
      data = re.sub('[^a-zA-Z]', ' ', str(data).lower())
      # normalize common Indonesian slang (raw strings so \b is a word boundary)
      data = re.sub(r'\byok\b|\byuk\b', 'ayo', data)
      data = re.sub(r'\bmager\b', 'males', data)
      data = re.sub(r'\bmalas\b', 'males', data)
      data = re.sub(r'\bmls\b', 'males', data)
      data = re.sub(r'\bkuy\b', 'yuk', data)
      data = re.sub(r'\borg\b', 'orang', data)
      data = re.sub(r'\bjg\b', 'juga', data)
      data = re.sub(r'\budh\b', 'sudah', data)
      data = re.sub(r'\bmangat\b', 'semangat', data)
      data = re.sub(r'\bcemungut\b', 'semangat', data)
      data = re.sub(r'\bgas\b', 'yuk', data)
      data = re.sub(r'\benakeun\b', 'enak', data)
      data = re.sub(r'\bnaek\b', 'naik', data)
      data = re.sub(r'\bmmg\b', 'memang', data)
      data = re.sub(r'\bga\b', 'engga', data)
      data = re.sub(r'\bengga\b', 'tidak', data)
      data = re.sub(r'\bttg\b', 'tentang', data)
      data = re.sub(r'\brush hour\b', 'jam sibuk', data)
      data = re.sub(r'\bku\b', 'aku', data)
      data = re.sub(r'\bgak\b', 'tidak', data)
      data = re.sub(r'\bdgn\b', 'dengan', data)
      data = re.sub(r'\bbailk\b', 'pulang', data)
      data = re.sub(r'\bgatau\b', 'tidak tahu', data)
      data = re.sub(r'\bbat\b', 'banget', data)
      data = re.sub(r'\bampe\b', 'sampai', data)
      data = re.sub(r'\blg\b', 'sedang', data)
      data = re.sub(r'\banjay\b', 'asik', data)
      data = re.sub(r'\banjg\b', 'anjing', data)
      data = re.sub(r'\banjiing\b', 'anjing', data)
      data = re.sub(r'\bantum\b', 'kamu', data)
      data = re.sub(r'\basiq\b|\basyique\b|\basik\b', 'asyik', data)
      data = re.sub(r'\bbgt\b|\bbanget\b|\bbanged\b', 'sangat', data)
      data = re.sub(r'\bribet\b', 'repot', data)

      data = data.split()
      data = ' '.join(data)

      # after the replacements, remove stop words and affixes below
      # Sastrawi stop word removal
      data = stopword.remove(data)  # the stop word list is provided by Sastrawi
      # Sastrawi stemming
      data = stemmer.stem(data)

      return data
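
A hypothetical call of this method, assuming it lives on a preprocessing object (the object name is illustrative):

# cleaned = preprocessor.clean_text("yok kuy, udh mager bgt hari ini")
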
Example #5
def preprocess_text(input):
  #lowercase all character in the text
  text = input[0]
  text = text.lower()
  #remove punctuation
  text = text.translate(str.maketrans("","",string.punctuation))
  #remove leading and trailing whitespace
  text = text.strip()
  #remove StopWord
  stopword = StopWordRemoverFactory().create_stop_word_remover()
  text = stopword.remove(text)
  #stemming
  stemmer = StemmerFactory().create_stemmer()
  text = stemmer.stem(text)
  return text
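
A quick sketch of how preprocess_text might be called, assuming the same Sastrawi and stdlib imports as Example #1; note that it reads only the first element of its argument:

row = ['Pemerintah sedang membangun jalan baru di Jakarta!']
print(preprocess_text(row))
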
Example #6
class Preprocess:
    def __init__(self):
        self.stemmer = StemmerFactory().create_stemmer()
        self.remover = StopWordRemoverFactory().create_stop_word_remover()

    def preprocess(self, text):
        # 1. stemming
        text_stem = self.stemmer.stem(text)

        # 2. remove stop words
        text_clean = self.remover.remove(text_stem)

        # 3. tokenization
        # 3.1 lowercase, strip punctuation, then split into tokens
        lowercase = text_clean.lower()
        preprocessed_text = lowercase.translate(
            str.maketrans('', '', string.punctuation)).split()

        return preprocessed_text
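
A minimal usage sketch, assuming the Sastrawi factories and the string module are imported:

pre = Preprocess()
tokens = pre.preprocess('Saya sedang belajar pemrosesan bahasa alami.')
print(tokens)
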
Example #7
def respond(strg):
    levenshtein = Levenshtein()
    stemmer = StemmerFactory().create_stemmer()
    stopwords = StopWordRemoverFactory().create_stop_word_remover()

    # take the single predicted category for this message
    kategori = model.predict([strg])[0]

    txt = stopwords.remove(strg)
    txt = stemmer.stem(txt)

    best = 1000
    res = None

    for words in dataset:
        if (words['category'] == kategori):
            distance = levenshtein.distance(txt, words['message_stemmed'])

            if (distance < best):
                best = distance
                res = words
    return res['respond'] if res else None
Example #8
def index(hashs, terms):
    for word in terms:
        if word in hashs:
            hashs[word] += 1
        else:
            hashs[word] = 1    

print('Indexing ...')
for path in sorted(IN_DIR.glob('*/*.html')):
    with open(path.resolve(), 'r', encoding='utf-8') as file:
        df[path.name] = dict()

        content = get_text(['title', 'top', 'middle', 'bottom'], file.read())
        content = content.translate(str.maketrans('','', punctuation))
        content = stopword.remove(content)
        terms = stemmer.stem(content.lower()).split()

        index(df[path.name], terms)
        index(tf, terms)
print('Indexing done!\n')

print('Calculating idf for terms...')
for term, freq in tf.items():
    df_i = 0
    for doc, tf_doc in df.items():
        df_i += 1 if term in tf_doc else 0
    idf[term] = (1 + math.log2(len(df)/df_i)) if df_i != 0 else 1
print('Calculated!\n')

with open(BASE_DIR / 'words_score.txt', 'w', encoding='utf-8') as file:
Example #9
class TextSummarizer:
    def __init__(self, title: str, plot: str, human_synopsis: str):
        self.title = title
        self.plot = plot
        self.human_synopsis = human_synopsis
        self.stopwords = StopWordRemoverFactory().create_stop_word_remover()
        self.stemmer = StemmerFactory().create_stemmer()

    def __text_to_sentences(self, text: str) -> List[str]:
        regex = re.compile(r'\.\n\n|\.\n|\. |\.$')
        sentences = regex.split(text)
        return sentences

    def __stem_sentence(self, sentence: str) -> str:
        return self.stemmer.stem(sentence)

    def __stop_word_removal(self, words: List[str]) -> List[str]:
        temp_words = []
        for word in words:
            if word.lower() in self.title.lower():
                temp_words.append(word)
            else:
                temp = self.stopwords.remove(word)
                if temp:
                    temp_words.append(temp)

        return temp_words

    def __preprocess_text(self, text: str) -> tuple:
        temp_sentences = self.__text_to_sentences(text)
        sentences = []
        preprocessed_sentences = []
        for sentence in temp_sentences:
            if len(sentence) < 2:
                continue

            stemmed_sentence = self.__stem_sentence(sentence.lower())
            tokenized_sentence = nltk.tokenize.word_tokenize(stemmed_sentence)
            removed_stop_word_sentence = self.__stop_word_removal(
                tokenized_sentence)

            if len(removed_stop_word_sentence) < 2:
                continue

            sentences.append(sentence)
            preprocessed_sentences.append(removed_stop_word_sentence)

        return sentences, preprocessed_sentences

    def __sentence_similarity(self, sent1, sent2):
        """
        calculate the similarity between sentence!
        return distance between sentences
        """
        sent1 = [w.lower() for w in sent1]
        sent2 = [w.lower() for w in sent2]

        all_words = list(set(sent1 + sent2))

        vector1 = [0] * len(all_words)
        vector2 = [0] * len(all_words)

        # build the vector for the first sentence
        for w in sent1:
            vector1[all_words.index(w)] += 1

        # build the vector for the second sentence
        for w in sent2:
            vector2[all_words.index(w)] += 1

        return 1 - cosine_distance(vector1, vector2)

    def __build_similarity_matrix(self, sentences):
        """
        make a matrix to plot the similarity between sentences in a file
        return matrix
        """
        # Create an empty similarity matrix
        similarity_matrix = np.zeros((len(sentences), len(sentences)))

        for idx1 in range(len(sentences)):
            for idx2 in range(len(sentences)):
                if idx1 == idx2:  # ignore if both are same sentences
                    continue
                similarity_matrix[idx1][idx2] = self.__sentence_similarity(
                    sentences[idx1], sentences[idx2])

        return similarity_matrix

    def summarize(self, top_n=5):
        summarize_text = []

        # Step 1 - text preprocessing
        plot_sentences, plot_pre_sentences = self.__preprocess_text(self.plot)

        # Step 2 - Generate Similarity Matrix across sentences
        sentence_similarity_martix = self.__build_similarity_matrix(
            plot_pre_sentences)

        print(sentence_similarity_martix)
        # Step 3 - Rank sentences in the similarity matrix
        sentence_similarity_graph = nx.from_numpy_array(
            sentence_similarity_martix)
        plot_scores = nx.pagerank(sentence_similarity_graph)

        # Step 4 - Sort the rank and pick top sentences
        ranked_sentence = []
        for i in range(len(plot_scores)):
            ranked_sentence.append([plot_scores[i], plot_sentences[i], i])

        ranked_sentence.sort(key=lambda x: x[0], reverse=True)
        top_n = min(top_n, len(plot_sentences))
        summary = ranked_sentence[0:top_n]
        summary.sort(key=lambda x: x[2])
        summary = [i[1] for i in summary]
        summarize_text = ""
        for i in range(top_n):
            summarize_text += "".join(summary[i]) + ". "

        # Step 5 - Output the summarized text
        return summarize_text

    @staticmethod
    def generate_from_file(title, plotfilepath, synopsisfilepath):
        plot = ""
        synopsis = ""
        with open(plotfilepath, "r") as plot_file:
            plot = plot_file.read()
        with open(synopsisfilepath, "r") as synopsis_file:
            synopsis = synopsis_file.read()

        ts = TextSummarizer(title, plot, synopsis)
        return ts.summarize()
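
A hypothetical invocation via the static helper; the title and file paths are placeholders:

# summary = TextSummarizer.generate_from_file('Judul Film', 'plot.txt', 'synopsis.txt')
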
Example #10
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from string_matching_algorithm import *
import re as regex

# factory = StopWordRemoverFactory()
newStopFactory = StopWordRemoverFactory().get_stop_words()
newStopFactory.remove("sampai")
newStopFactory.remove("dan")
newStopFactory.append("deadline")
newStopFactory.append("mengenai")
newStopFactory.append("tanggal")
stopword = StopWordRemover(ArrayDictionary(newStopFactory))

# Regexes for month names
JANUARI_REGEX = '[Jj]an(?:uari)?'
FEBRUARI_REGEX = '[Ff]eb(?:ruari)?'
MARET_REGEX = '[Mm]ar(?:et)?'
APRIL_REGEX = '[Aa]pr(?:il)?'
MEI_REGEX = '[Mm]ei'
JUNI_REGEX = '[Jj]uni?'
JULI_REGEX = '[Jj]uli?'
AGUSTUS_REGEX = '[Aa]gu(?:stus)?'
SEPTEMBER_REGEX = '[Ss]ep(?:tember)?'
OKTOBER_REGEX = '[Oo]kt(?:ober)?'
NOVEMBER_REGEX = '[Nn]ov(?:ember)?'
DESEMBER_REGEX = '[Dd]es(?:ember)?'

# Regexes for complete dates
ANYTHING = '.*'
DAY_REGEX = '(0[1-9]|[1-2][0-9]|3[0-1])'
Example #11
def rem_stop_words(text):
    sw_rem = StopWordRemoverFactory().create_stop_word_remover()
    return sw_rem.remove(text)
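
A one-line usage sketch, assuming the Sastrawi StopWordRemoverFactory import is in scope:

print(rem_stop_words('dia yang pergi ke pasar itu'))  # common stop words such as 'yang' and 'ke' are dropped
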
Example #12
class SpamClassifier(object):
    def __init__(self, tweets, labels):
        self.tweets, self.labels = tweets, labels
        self.clean_tweets = []
        self.conv_tweets = []
        self.stem_tweets = []
        self.processed_tweets = []

        self.spam_tweets, self.ham_tweets = (labels == 1).sum(), (
            labels == 0).sum()
        self.total_tweets = len(self.tweets)

        self.testdata = []
        self.testdata_terproses = []

        self.vocab1 = list()
        self.vocab2 = list()
        self.vocab3 = list()

        self.prior_spam = 0.0
        self.prior_ham = 0.0

        self.tf_spam1 = dict()
        self.tf_ham1 = dict()
        self.tf_spam2 = dict()
        self.tf_ham2 = dict()
        self.tf_spam3 = dict()
        self.tf_ham3 = dict()

        self.dfw1 = dict()
        self.dfw2 = dict()
        self.dfw3 = dict()

        self.pwtfidf_spam1 = dict()
        self.pwtfidf_ham1 = dict()
        self.pwtfidf_spam2 = dict()
        self.pwtfidf_ham2 = dict()
        self.pwtfidf_spam3 = dict()
        self.pwtfidf_ham3 = dict()

        self.stemmer = StemmerFactory().create_stemmer()
        self.stop = StopWordRemoverFactory().create_stop_word_remover()
        self.stop.dictionary.add('lasturladdr')
        self.stop.dictionary.add('rt')

    def praproses(self):
        for i in range(len(self.tweets)):
            self.clean_tweets.append(clean_text(self.tweets[i]))
            self.conv_tweets.append(konversi(self.clean_tweets[i], pengganti))
            self.stem_tweets.append(self.stemmer.stem(self.conv_tweets[i]))
            self.processed_tweets.append(self.stop.remove(self.stem_tweets[i]))

    def praprosestext(self, teks):
        cteks = clean_text(teks)
        konv_teks = konversi(cteks, pengganti)
        stemteks = self.stemmer.stem(konv_teks)
        nosw_teks = self.stop.remove(stemteks)
        return nosw_teks

    def hitungTFDF(self):
        for i in range(self.total_tweets):

            dfw = {}
            tfunigram = createToken(self.processed_tweets[i], gram=1)
            for word in tfunigram:

                if dfw.get(word, 0) == 0:  # count documents containing the word
                    dfw[word] = 1
                    self.dfw1[word] = self.dfw1.get(word, 0) + 1

                if self.labels[i]:
                    self.tf_spam1[word] = self.tf_spam1.get(word, 0) + 1

                else:
                    self.tf_ham1[word] = self.tf_ham1.get(word, 0) + 1

            dfw = {}
            tfbigram = createToken(self.processed_tweets[i], gram=2)
            for word in tfbigram:

                if dfw.get(word, 0) == 0:  # count documents containing the word
                    dfw[word] = 1
                    self.dfw2[word] = self.dfw2.get(word, 0) + 1

                if self.labels[i]:
                    self.tf_spam2[word] = self.tf_spam2.get(word, 0) + 1

                else:
                    self.tf_ham2[word] = self.tf_ham2.get(word, 0) + 1

            dfw = {}
            tftrigram = createToken(self.processed_tweets[i], gram=3)
            for word in tftrigram:

                if dfw.get(word, 0) == 0:  # count documents containing the word
                    dfw[word] = 1
                    self.dfw3[word] = self.dfw3.get(word, 0) + 1

                if self.labels[i]:
                    self.tf_spam3[word] = self.tf_spam3.get(word, 0) + 1

                else:
                    self.tf_ham3[word] = self.tf_ham3.get(word, 0) + 1

        self.vocab1 = list(dict(self.tf_spam1, **self.tf_ham1).keys())
        self.vocab2 = list(dict(self.tf_spam2, **self.tf_ham2).keys())
        self.vocab3 = list(dict(self.tf_spam3, **self.tf_ham3).keys())

    def train(self):
        self.praproses()
        self.hitungTFDF()

        self.prior_spam = self.spam_tweets / self.total_tweets
        self.prior_ham = self.ham_tweets / self.total_tweets

        # Compute TF-IDF weights for 1-grams and 2-grams

        #***** 1-gram****
        for word in self.tf_spam1:
            self.pwtfidf_spam1[word] = self.tf_spam1[word] \
                   * log10(len(self.tweets) / self.dfw1[word])

        for word in self.tf_ham1:
            self.pwtfidf_ham1[word] = self.tf_ham1[word] \
                   * log10(len(self.tweets) / self.dfw1[word])

        #===2 gram===
        for word in self.tf_spam2:
            self.pwtfidf_spam2[word] = self.tf_spam2[word] \
                    * log10(len(self.tweets) / (self.dfw2[word]))

        for word in self.tf_ham2:
            self.pwtfidf_ham2[word] = self.tf_ham2[word] \
                    * log10(len(self.tweets) / self.dfw2[word])

    def classify1(self, text, metode):

        self.metode = metode + '1gr'
        proses_text = self.praprosestext(text)
        self.testdata_terproses.append(proses_text)
        token = createToken(proses_text, gram=1)

        pSpam = log10(self.prior_spam)
        pHam = log10(self.prior_ham)

        for word in token:

            # compute the spam probability
            if metode == 'tfidf':
                pSpam += log10(self.pwtfidf_spam1.get(word, 1) + 1)
                pSpam -= log10(
                    sum(self.pwtfidf_spam1.values()) + len(self.tf_spam1))
            if metode == 'bow':
                pSpam += log10(
                    (self.tf_spam1.get(word, 0) + 1) /
                    (sum(self.tf_spam1.values()) + len(self.vocab1)))

            # compute the ham probability
            if metode == 'tfidf':
                pHam += log10(self.pwtfidf_ham1.get(word, 1) + 1)
                pHam -= log10(
                    sum(self.pwtfidf_ham1.values()) + len(self.tf_ham1))
            if metode == 'bow':
                pHam += log10((self.tf_ham1.get(word, 0) + 1) /
                              (sum(self.tf_ham1.values()) + len(self.vocab1)))

        #print("pSpam: ",pSpam," pHam: ",pHam)
        return pSpam >= pHam

    def classify2(self, text, metode):

        self.metode = metode + '2gr'
        proses_text = self.praprosestext(text)
        self.testdata_terproses.append(proses_text)
        token = createToken(proses_text, gram=2)

        pSpam = log10(self.prior_spam)
        pHam = log10(self.prior_ham)

        for word in token:

            # compute the spam probability
            if metode == 'tfidf':
                pSpam += log10(self.pwtfidf_spam2.get(word, 1) + 1)
                pSpam -= log10(
                    sum(self.pwtfidf_spam2.values()) + len(self.tf_spam2))
            else:
                pSpam += log10(
                    (self.tf_spam2.get(word, 0) + 1) /
                    (sum(self.tf_spam2.values()) + len(self.vocab2)))

            # compute the ham probability
            if metode == 'tfidf':
                pHam += log10(self.pwtfidf_ham2.get(word, 1) + 1)
                pHam -= log10(
                    sum(self.pwtfidf_ham2.values()) + len(self.tf_ham2))
            else:
                pHam += log10((self.tf_ham2.get(word, 0) + 1) /
                              (sum(self.tf_ham2.values()) + len(self.vocab2)))

        #print('pSpam: ',pSpam,' pHam: ',pHam)
        return pSpam >= pHam

    def sbclassify(self, text):

        self.metode = 'stupidbackoff'
        proses_text = self.praprosestext(text)
        self.testdata_terproses.append(proses_text)
        hamscore = 0.0
        spamscore = 0.0

        words = createToken(proses_text, gram=2)

        for word in words:
            wordtoken = word.split()
            tokenprev = wordtoken[0]
            tokennext = wordtoken[1]
            if word in self.tf_ham2:
                bicount = self.tf_ham2[word]
                bi_unicount = self.tf_ham1[tokenprev]
                hamscore += log10(bicount)
                hamscore -= log10(bi_unicount)
            else:
                if tokennext in self.tf_ham1:
                    unicount = self.tf_ham1[tokennext]
                else:
                    unicount = 0.4
                hamscore += log10(0.4)
                hamscore += log10(unicount)
                hamscore -= log10(
                    sum(self.tf_ham1.values()) + len(self.vocab1))

            if word in self.tf_spam2:
                bicount2 = self.tf_spam2[word]
                bi_unicount2 = self.tf_spam1[tokenprev]
                spamscore += log10(bicount2)
                spamscore -= log10(bi_unicount2)
            else:
                if tokennext in self.tf_spam1:
                    unicount2 = self.tf_spam1[tokennext]
                else:
                    unicount2 = 0.4
                spamscore += log10(0.4)
                spamscore += log10(unicount2)
                spamscore -= log10(
                    sum(self.tf_spam1.values()) + len(self.vocab1))
            #spamscore += log10(self.prior_spam)
            #hamscore += log10(self.prior_ham)
        return spamscore >= hamscore

    def predict(self, test_data, metode, gram):
        '''metode: 'stbo'  = stupid backoff
                   'bow'   = bag of words
                   'tfidf' = TF-IDF weighting
        '''
        self.testdata = []
        self.testdata_terproses = []
        result = dict()
        if metode == 'stbo':
            for (i, tweet) in enumerate(test_data):
                result[i] = int(self.sbclassify(tweet))
        else:
            if gram == 1:
                for (i, tweet) in enumerate(test_data):
                    result[i] = int(self.classify1(tweet, metode))
            if gram == 2:
                for (i, tweet) in enumerate(test_data):
                    result[i] = int(self.classify2(tweet, metode))
        return result

    def metrics(self, labels, predictions, tweets):
        etext = []
        eptext = []
        elabel = []
        true_pos, true_neg, false_pos, false_neg = 0, 0, 0, 0
        for i in range(len(labels)):
            true_pos += int((labels[i] == 1) and (predictions[i] == 1))
            true_neg += int(labels[i] == 0 and predictions[i] == 0)
            if (labels[i] == 0 and predictions[i] == 1):
                false_pos += 1
                etext.append(tweets[i])
                eptext.append(self.praprosestext(tweets[i]))
                elabel.append('fp')
            if (labels[i] == 1 and predictions[i] == 0):
                false_neg += 1
                etext.append(tweets[i])
                eptext.append(self.praprosestext(tweets[i]))
                elabel.append('fn')
        edf = pd.DataFrame(list(zip(etext, eptext, elabel)),
                           columns=['text', 'stemmedtext', 'label'])
        filename = 'data/false_' + self.metode + '.xlsx'
        writer = pd.ExcelWriter(filename, engine='xlsxwriter')
        edf.to_excel(writer, sheet_name='Sheet1')
        writer.save()
        precision = true_pos / (true_pos + false_pos)
        recall = true_pos / (true_pos + false_neg)
        Fscore = 2 * precision * recall / (precision + recall)
        accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos +
                                            false_neg)
        print("Precision: ", precision)
        print("Recall: ", recall)
        print("F-score: ", Fscore)
        print("Accuracy: ", accuracy)
        print('\n==Confusion Matrix===')
        print("True Positiv: ", true_pos)
        print("False Positiv: ", false_pos)
        print("True Negativ: ", true_neg)
        print("False Negativ: ", false_neg)
Example #13
            "Halo, namaku Alice.<br>Selamat datang di ChatBot Alice.<br>Tanyakan apapun kepadaku dan aku akan mencoba menjawabnya...<br>...dengan KMP, BM, dan Regex."
        )
    else:
        stopword = StopWordRemoverFactory().create_stop_word_remover()
        pertanyaan = []
        purePertanyaan = []
        jawaban = []
        for line in open('pertanyaan.txt').readlines():
            i = 0
            j = 0
            while (line[j] != ' '):
                j += 1
            while (line[i] != '?'):
                i += 1
            purePertanyaan.append((line[j + 1:i + 1]))
            pertanyaan.append(stopword.remove((line[j + 1:i].lower())))
            jawaban.append(line[i + 2:len(line) - 1])

        query = re.sub('[%s]' % re.escape(string.punctuation), '',
                       sys.argv[1].lower())
        query = stopword.remove(query)
        querylist = query.split(' ')
        synonymList = []
        for queryWord in querylist:
            synonymList.append(getSinonim(queryWord))
        # combine the synonyms into candidate sentences
        sentenceList = [[]]
        for word in synonymList:
            newList = []
            for synonym in word:
                for sentence in sentenceList: