Example #1
def result_blog():
    score = []
    rank = []

    query = request.form["query"]
    # ***PRE PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    # TF Query
    listTf = []
    blob = tb(_query)
    uniqWord = list(set(blob.words))
    for word in uniqWord:
        _n = blob.words.count(word)
        listTf.append(Tf("query", word, _n))

    # *** SCORING PROCESS ***
    # TF
    # Desc : get blog titles whose content contains a query word
    listTitleBlog = []
    for word in uniqWord:
        try:
            blogList = BlogRepository().getByWord(word)
            for t in blogList:
                listTitleBlog.append(t["title"])
        except Exception:
            print(word, "not available")

    if (len(listTitleBlog) == 0):
        return render_template("result.html", rank=rank, query=query)

    listTitleBlog = list(set(listTitleBlog))  #Unique Blog Title

    listBlog = []
    for l in listTitleBlog:
        listBlog.append(BlogRepository().getByTitle(l))

    # IDF
    blobList = []
    for blog in listBlog:
        content = blog["content"]
        blobList.append(tb(content))

    idfList = []
    for word in uniqWord:
        idfList.append(idf(word, blobList))

    # Scoring
    for title in listTitleBlog:
        result = 0
        for i, word in enumerate(uniqWord):
            try:
                # if the word is available in this blog, count its occurrences
                blogData = BlogRepository().getByTitle(title)
                _content = blogData["tf"]
                blob = tb(_content)
                _n = blob.words.count(word)
                result = result + (idfList[i] * _n)
            except Exception:
                print(word, "not available")
        score.append(result)
    lenScore = len(score)
    while (lenScore > 0):
        bestIndex = score.index(max(score))
        rank.append(listBlog[bestIndex])
        del score[bestIndex]
        del listBlog[bestIndex]
        lenScore = len(score)
    # print(rank)
    return render_template("result.html", rank=rank, query=query)
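The idf helper and the Tf/BlogRepository classes used above are not part of this snippet. A minimal sketch of an idf function that matches how it is called here, assuming tb is textblob.TextBlob and using the common log(N / (1 + n_containing)) smoothing; the original project may define it differently:

import math

def n_containing(word, bloblist):
    # number of retrieved blog contents whose tokens include the word
    return sum(1 for blob in bloblist if word in blob.words)

def idf(word, bloblist):
    # inverse document frequency over the retrieved blog contents
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))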
Example #2
            skip_gs = p_bm.match_skip()
            # take the larger of the two shift values
            shift = max(shift, skip_gs)
        # advance the search position by the computed shift
        i += shift
    # return the matching results
    return occurrences


app = Flask(__name__)
# fix CORS errors
cors = CORS(app)
app.config['CORS_HEADERS'] = 'Content-Type'

# stopword list provided by Sastrawi
stop_factory = StopWordRemoverFactory().get_stop_words()

# extra stopwords can be added here
more_stopword = []

# merge the Sastrawi stopwords with any custom additions
data = stop_factory + more_stopword

dictionary = ArrayDictionary(data)

stopword = StopWordRemover(dictionary)
# end stopword


@app.route("/")
def index():
Example #3
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import string

factory_stopwords = StopWordRemoverFactory()
stopwords = factory_stopwords.get_stop_words()

factory_stemmer = StemmerFactory()
stemmer = factory_stemmer.create_stemmer()


def clean_text(text):

    # removing punctuation
    for c in string.punctuation:
        text = text.replace(c, "")

    # removing excessive whitespace
    text = " ".join(text.split())

    # split the text into a list of words
    words = text.split()

    # removing stopwords
    words = [word for word in words if word not in stopwords]

    # stemming word in query
    words = [stemmer.stem(word) for word in words]

    return words
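A quick usage sketch for clean_text; the sentence is an arbitrary example and the exact output depends on Sastrawi's stopword list and stemmer:

if __name__ == "__main__":
    sample = "Pemerintah sedang membangun jembatan baru di beberapa daerah terpencil."
    print(clean_text(sample))  # list of stemmed tokens with punctuation and stopwords removed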
Example #4
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory  # formula library
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import os
import pandas as pd
import re

stemmer = StemmerFactory().create_stemmer()  # stemmer object
remover = StopWordRemoverFactory().create_stop_word_remover()  # stopword object


class DiceDistance(object):
    """
        Create custom libraries
    """
    def __init__(self, dataset):
        self.dataset = dataset
        self.RELEVANT = "Relevant"
        self.IRRELEVANT = "Irrelevant"
        self.LIST_REPLACE_TEXT = [
            '/', 'gram', 'ml', 'cc', 'buah', 'sendok teh', 'sendok makan',
            'sendok takar', 'butir', 'cangkir', 'siung', 'batang'
        ]
        self.scores = list()

    def replaceMultiple(self, mainString, toBeReplaces, newString):
        # Iterate over the strings to be replaced
        for elem in toBeReplaces:
            # Check if string is in the main string
            if elem in mainString:
                # Replace the string
                mainString = mainString.replace(elem, newString)
Example #5
        clean_file.write('\n')
    print('finish')
    clean_file.close()


def visualize():
    file = open('result_' + filename, 'r')
    data = json.loads(file.read())
    print(json.dumps(data, indent=1))


# visualize()
# clean_data()
# clean_data_from_twitterscaper()
# exit()
indonesian_stopwords = StopWordRemoverFactory().get_stop_words()
punctuation = list(string.punctuation)
stop = indonesian_stopwords + stopwords.words('english') + punctuation + \
    ['jokowi', 'yg', 'ada', 'pak', 'ini', 'juga',
        'dan', 'rt', '…', 'mau', 'jangan', 'tanya', 'dlm', 'sering', 'jadi', 'lu', 'kalian']

word_count = 0
hashtag_all = []
source_all = []
print('Processing')
with open(clean_filename, 'r') as f:
    count_all = Counter()
    hashtag_counter = Counter()
    source_counter = Counter()
    # data_to_read = 20
    i = 0
Example #6
df_status_1 = df_status[df["Status"] == 1]
message1_train, message1_test, status1_train, status1_test = train_test_split(df_message_1, df_status_1, test_size=0.2, random_state=4)

df_message_2 = df_message[df["Status"] == 2]
df_status_2 = df_status[df["Status"] == 2]
message2_train, message2_test, status2_train, status2_test = train_test_split(df_message_2, df_status_2, test_size=0.2, random_state=4)

# combine the per-class 80% train and 20% test message/status splits
df_message_train = pd.concat([message0_train, message1_train, message2_train])
df_status_train = pd.concat([status0_train, status1_train, status2_train])

df_message_test = pd.concat([message0_test, message1_test, message2_test])
df_status_test = pd.concat([status0_test, status1_test, status2_test])

#StopWord
idn_stopWord = StopWordRemoverFactory().get_stop_words()

cv = TfidfVectorizer(stop_words = idn_stopWord)

message_train_cv = cv.fit_transform(df_message_train)
getFitur = cv.get_feature_names()
a = message_train_cv.toarray()

# classification
def cosineDistance(testData, trainingData):
    distance = 0
    penyebutCosineTest = 0
    penyebutCosineTrain = 0
    pembilangCosine = 0
    for i in range(len(getFitur)):
        pembilangCosine += (testData.item(i) * trainingData.item(i))
Example #7
File: sen.py Project: fadholifh/skrps
def stopw(data):
    wf = StopWordRemoverFactory()
    more_stopword = ['hehe', 'wkwk']
    stop = wf.create_stop_word_remover()
    sword = stop.remove(data)
    return sword
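Note that more_stopword above is defined but never passed to the remover. A sketch of one way to actually include the custom stopwords, following the ArrayDictionary pattern used in Examples #2 and #27 (module paths assumed from PySastrawi's layout):

from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary

def stopw_custom(data, more_stopword=('hehe', 'wkwk')):
    # merge Sastrawi's default stopwords with the custom additions
    words = StopWordRemoverFactory().get_stop_words() + list(more_stopword)
    remover = StopWordRemover(ArrayDictionary(words))
    return remover.remove(data)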
Example #8
def index():
    if request.method == 'POST':
        file = Documents.query.order_by(Documents.sim).all()
        WordInAllDocument = []
        inputQuery = request.form['textquery']
        lowercaseQuery = inputQuery.lower()

        # create stemmer
        stemfactory = StemmerFactory()
        stemmer = stemfactory.create_stemmer()

        # Create stopwordsremover
        stopfactory = StopWordRemoverFactory()
        stopword = stopfactory.create_stop_word_remover()

        # Stemming Query with Sastrawi
        stemmedQuery = stemmer.stem(lowercaseQuery)

        # Remove Stopword from Query with Sastrawi
        removedStopQuery = stopword.remove(stemmedQuery)

        queryWordList = re.sub(r"[^\w]", " ", removedStopQuery).split()

        # Fill set of unique words from query
        for word in queryWordList:
            if word not in WordInAllDocument:
                WordInAllDocument.append(word)

        for doc in file:
            Similarity = 0
            if doc.url:
                filename = re.sub(r"[^\w]", "", doc.name) + '.txt'
            else:
                filename = doc.name
            fd = open("./static/" + filename, "r")
            fileContents = fd.read().lower()

            # Stemming File Contents with Sastrawi
            stemmedFileContents = stemmer.stem(fileContents)

            # Remove Stopword from File Contents with Sastrawi
            removedStopFileContents = stopword.remove(stemmedFileContents)

            fileContentsWordList = re.sub(
                r"[^\w]", " ", removedStopFileContents).split(
                )  # replace punctuation with spaces and split

            # Fill set of unique words from file
            for word in fileContentsWordList:
                if word not in WordInAllDocument:
                    WordInAllDocument.append(word)

            # Count word frequency in file and query
            queryVector = []
            fileContentsVector = []
            for word in WordInAllDocument:
                queryVector.append(queryWordList.count(word))
                fileContentsVector.append(fileContentsWordList.count(word))

            # Find dot product and magnitude of the vectors
            dotProduct = 0
            queryVectorLength = 0
            fileContentsVectorLength = 0
            for i in range(len(queryVector)):
                dotProduct += queryVector[i] * fileContentsVector[i]
                queryVectorLength += queryVector[i]**2
                fileContentsVectorLength += fileContentsVector[i]**2
            queryVectorLength = math.sqrt(queryVectorLength)
            fileContentsVectorLength = math.sqrt(fileContentsVectorLength)

            # Calculate similarity
            if queryVectorLength * fileContentsVectorLength != 0:
                Similarity = (float)(
                    dotProduct /
                    (queryVectorLength * fileContentsVectorLength)) * 100
                doc.sim = Similarity
            else:
                doc.sim = 0

        orderedFiles = Documents.query.order_by(Documents.sim.desc()).all()
        dcount = len(orderedFiles) + 1
        wcount = [[0 for j in range(dcount)]
                  for i in range(len(WordInAllDocument))]

        # Fill term table
        for i in range(len(WordInAllDocument)):
            wcount[i][0] = queryWordList.count(WordInAllDocument[i])

        j = 1
        for doc in orderedFiles:
            if doc.url:
                filename = re.sub(r"[^\w]", "", doc.name) + '.txt'
            else:
                filename = doc.name
            fd = open("./static/" + filename, "r")
            fileContents = fd.read().lower()

            # Stemming File Contents with Sastrawi
            stemmedFileContents = stemmer.stem(fileContents)

            # Remove Stopword from File Contents with Sastrawi
            removedStopFileContents = stopword.remove(stemmedFileContents)

            fileContentsWordList = re.sub(
                r"[^\w]", " ", removedStopFileContents).split(
                )  # replace punctuation with spaces and split

            i = 0
            for word in WordInAllDocument:
                # wcount[i][j] = fileContentsWordList.count(word)
                for word2 in fileContentsWordList:
                    if word == word2:
                        wcount[i][j] = wcount[i][j] + 1
                i = i + 1
            j = j + 1
        return render_template('index.html',
                               queryCnt=queryWordList,
                               success='Query success',
                               documents=orderedFiles,
                               arr=WordInAllDocument,
                               arr2=wcount,
                               dcount=dcount,
                               i=0,
                               input=inputQuery)
    else:
        documents = Documents.query.order_by(Documents.date_created).all()
        return render_template('index.html', documents=documents)
Example #9
def prepro():
    data = pan.read_sql('SELECT * FROM dataset', con=database)

    data['comment'] = data['comment'].apply(
        lambda x: " ".join(x.lower() for x in x.split()))
    data['comment'] = data['comment'].str.replace("[^a-zA-Z]", " ")

    # tokenization
    tokenisasi = []
    for row in data['comment']:
        token = word_tokenize(row)
        tokenisasi.append(token)

    table = {}
    with open('spelling_word.txt', 'r') as syn:
        for row in syn:
            match = re.match(r'(\w+)\s+=\s+(.+)', row)
            if match:
                primary, synonyms = match.groups()
                synonyms = [synonym.lower() for synonym in synonyms.split()]
                for synonym in synonyms:
                    table[synonym] = primary.lower()

    spelling = []
    for idx, value in enumerate(tokenisasi):
        temp = []
        for idy, value1 in enumerate(value):
            temp.append(''.join(
                table.get(word.lower(), word)
                for word in re.findall(r'(\W+|\w+)', value1)))
        spelling.append(temp)

    # stopword removal
    stop_factory = StopWordRemoverFactory()
    data_stopword = stop_factory.get_stop_words()
    stopword = stop_factory.create_stop_word_remover()
    stopword_removal = []
    for idx, value in enumerate(spelling):
        temp = []
        for idy, value1 in enumerate(value):
            temp.append(stopword.remove(value1))
        stopword_removal.append(temp)

    #stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stemming = []
    for idx, value in enumerate(stopword_removal):
        temp = []
        for idy, value1 in enumerate(value):
            temp.append(stemmer.stem(value1))
        stemming.append(temp)

    hasil_prepro = []
    for idx, value in enumerate(stemming):
        punctuations = ''' '''
        no_punct = ""
        for idy, value1 in enumerate(value):
            if value1 not in punctuations:
                no_punct = no_punct + value1 + ' '
            k = no_punct
        hasil_prepro.append(k)

    data['comment'] = pan.Series(hasil_prepro)
    data['label_comment'] = data['label'].factorize()[0]
    label_comment = data[['label', 'label_comment'
                          ]].drop_duplicates().sort_values('label_comment')
    label1 = dict(label_comment.values)
    label2 = dict(label_comment[['label_comment', 'label']].values)

    db_username = '******'
    db_password = ''
    db_ip = 'localhost'
    db_name = 'sara'
    db_connection = sqlalchemy.create_engine(
        'mysql+mysqlconnector://{0}:{1}@{2}/{3}'.format(
            db_username, db_password, db_ip, db_name))

    data.to_sql(con=db_connection,
                name='data_latih',
                if_exists='replace',
                index=False)

    return redirect(url_for('data_latih'))
Example #10
def text_to_tagReadyDF(input, isCSV=True, more_stopwords=None):
    """
	This function is used to convert raw text of PUU (either CSV file or pandas Series) into tag-ready dataframe.

	Args:
	- input (pd.Series variable): either CSV file (enter its file location) or pandas Series. If you want to use pandas Series, set 'isCSV' arg to False.
	- isCSV (Boolean): if True, CSV input used. If False, pd.Series input used.
	- more_stopwords (list): add more stopwords if you'd like.

	Return:
	- result dataframe
	"""
    # in case Sastrawi not detected
    try:
        from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
        stopwords = StopWordRemoverFactory().get_stop_words()
    except ModuleNotFoundError:
        print(
            "No module named 'Sastrawi' in your machine. Bypassing 'Sastrawi' dependency, but the number of stopwords will decrease."
        )
        stopwords = []

    # check input
    if isCSV:
        # add csv file that contains raw PUU texts
        text = pd.read_csv(input, delimiter='|')
        if text.iloc[:, 0].dtype == 'O':
            text = text.iloc[:, 0]
        else:
            raise ValueError(
                "As 'isCSV' set to True, the 1st column of your CSV file should be the texts you'd like to process."
            )

    else:
        # if pd.Series expected
        if isinstance(input, pd.Series):  # check if data type is suitable
            text = input
        else:
            raise TypeError(
                "As 'isCSV' set to False, 'input' should be a pandas Series.")

    # define punctuation
    punctAndSpace = string.punctuation + ' '
    # we need to keep the characters '(', ')', and '.',
    # because they appear in clause and number references
    punctAndSpace = punctAndSpace.replace('(', '')
    punctAndSpace = punctAndSpace.replace(')', '')
    punctAndSpace = punctAndSpace.replace('.', '')

    # add extra stopwords passed in via the argument
    if more_stopwords is not None:
        assert isinstance(more_stopwords,
                          list), "'more_stopwords' arg should be list type."
        stopwords += more_stopwords

    stopwords = sorted(set(stopwords))

    # convert the raw PUU text into tokens stored in a dataframe column,
    # then automatically tag tokens outside the annotations of interest with 'O'
    dfList = []
    for idx, t in tqdm(enumerate(text)):
        # tokenization
        tokens = [[word_tokenize(w), ' '] for w in t.split()]
        tokens = list(itertools.chain(*list(itertools.chain(*tokens))))
        tokens = tokens[:-1]

        split_res = []
        for t in tokens:
            # the if-else below handles tokens of the form
            # 'Jakarta-Bogor-Ciawi'
            if re.match(r'\w+\-\w+.*', t):
                line = t.split('-')
                for i, j in enumerate(line):
                    split_res.append(j)
                    if i < len(line) - 1:
                        split_res.append('-')
            else:
                split_res.append(t)

        # tag tokens that we consider outside the annotation list with 'O'
        blank = [
            '' if i.lower() not in list(punctAndSpace) + stopwords else 'O'
            for i in split_res
        ]

        # build a dataframe
        dfTemp = pd.DataFrame([split_res, blank]).T
        # name the columns after the loop index
        dfTemp.columns = ['token_' + str(idx), 'BIO_tag_' + str(idx)]
        dfList.append(dfTemp)
    # concatenate all the dataframes
    df = pd.concat(dfList, axis=1)

    # # save to a CSV file that is ready for manual tagging
    # df.to_csv(output_loc)
    # print('CSV output file successfully written.')

    return df
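A usage sketch for text_to_tagReadyDF with a pandas Series input; the sample texts and the extra stopword are illustrative only:

if __name__ == "__main__":
    sample = pd.Series([
        "Pasal 5 ayat (1) mengatur tentang kewenangan daerah",
        "Peraturan ini mulai berlaku pada tanggal diundangkan",
    ])
    tag_ready = text_to_tagReadyDF(sample, isCSV=False, more_stopwords=["tentang"])
    print(tag_ready.head())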
Example #11
def removeStopWords(text):
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    return stopword.remove(text)
Example #12
 def remove_stop_word(self, X):
     factory = StopWordRemoverFactory()
     stopword = factory.create_stop_word_remover()
     for i in range(X.shape[0]):
         X[i] = stopword.remove(X[i])
     return X
Example #13
 def stopword(self, berita):
     factory = StopWordRemoverFactory()
     stopword = factory.create_stop_word_remover()
     hasil = stopword.remove(berita)
     return hasil
Example #14
def cosine():
    rank = []
    score = []
    query = request.form["query"]
    # ***PRE PROCESSING***
    # Stopword
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    _query = stopword.remove(query)

    # Stemming
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    _query = stemmer.stem(_query)

    # TF Query
    listTf = []
    blob = tb(_query)
    uniqWord = list(set(blob.words))
    # Desc : get blog titles whose content contains a query word
    listTitleBlog = []
    for word in uniqWord:
        try:
            blogList = BlogRepository().getByWord(word)
            for t in blogList:
                listTitleBlog.append(t["title"])
        except Exception:
            print(word, "not available")

    if (len(listTitleBlog) == 0):
        return render_template("result.html", rank=rank, query=query)

    listTitleBlog = list(set(listTitleBlog))  #Unique Blog Title

    blogAll = []
    for l in listTitleBlog:
        blogAll.append(BlogRepository().getByTitle(l))

    # *** COSINE SIMILARITY ***
    # Scoring
    for blog in blogAll:
        # Get Set of Article and Query
        combined = _query + blog["tf"]
        blob = tb(combined)
        uniqWord = list(set(blob.words))
        # count how many times each unique word occurs in the query and in the blog
        bQuery = tb(_query)
        bBlog = tb(blog["tf"])
        cQuery = []
        cBlog = []
        for word in uniqWord:
            _nQ = bQuery.words.count(word)
            cQuery.append(_nQ)
            _nB = bBlog.words.count(word)
            cBlog.append(_nB)
        # print(blog["title"])
        # print(cQuery)
        # print(cBlog)
        result = 1 - spatial.distance.cosine(cQuery, cBlog)
        score.append(result)
    lenScore = len(score)
    while (lenScore > 0):
        bestIndex = score.index(max(score))
        rank.append(blogAll[bestIndex])
        del score[bestIndex]
        del blogAll[bestIndex]
        lenScore = len(score)

    return render_template("result.html", rank=rank, query=query)
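The score computed above is plain cosine similarity between the query and blog term-count vectors. A small hand-rolled equivalent on toy vectors, just to make the formula explicit (values are illustrative):

import math

def cosine_sim(u, v):
    dot = sum(a * b for a, b in zip(u, v))
    norm_u = math.sqrt(sum(a * a for a in u))
    norm_v = math.sqrt(sum(b * b for b in v))
    return dot / (norm_u * norm_v) if norm_u and norm_v else 0.0

# matches 1 - spatial.distance.cosine([1, 0, 2], [1, 1, 1]) for non-zero vectors
print(cosine_sim([1, 0, 2], [1, 1, 1]))  # ~0.7746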
Example #15
def fulltfidf(dateList, semesta):
    from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
    from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
    remover = StopWordRemoverFactory()
    stoper = remover.create_stop_word_remover()
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    tfidf = []
    arti, singkat = artisingkat()
    stopword = aksesstopword()
    for v in dateList:
        naskah = []
        for t in semesta:
            if t[1] == v:
                kecil = []
                twit = []
                twit = t[2].split(' ')
                for tw in twit:
                    tw = re.sub('[^A-Za-z ]+', '', tw)
                    tw = tw.lower()
                    if tw in singkat:
                        tw = arti[tw]
                    kecil.append(tw)
                kec = ' '.join(kecil)
                stemm = stoper.remove(kec)
                stemm = stemmer.stem(stemm)
                stop = stemm.split(' ')
                asli = []
                for k in stop:
                    if k in stopword:
                        asli.append(k)

                naskah.append([t[0], t[1], ' '.join(asli), asli, t[3], t[2]])

        wordset = set(naskah[0][3]).union(set(naskah[1][3]))
        for wor in naskah:
            if (wor[3] == naskah[0][3]) or (wor[3] == naskah[1][3]):
                pass
            else:
                wordset = set(wordset).union(set(wor[3]))
        wordDict = []
        for a in naskah:
            wordDict.append(dict.fromkeys(wordset, 0))

        tfbows = []
        for document in zip(naskah, wordDict):
            for word in document[0][3]:
                document[1][word] += 1
            tfbows.append(computeTF(document[1], document[0][3]))

        idfs = computeIDF(wordDict)

        for tfbow in zip(tfbows, naskah):
            hasil_tfidf = computeTFIDF(tfbow[0], idfs)
            stop_semen = []
            for stops in stopword:
                if stops not in hasil_tfidf:
                    stop_semen.append(stops)
            hasil_tfidf.update(dict.fromkeys(stop_semen, 0.0))
            datatfidf = pd.DataFrame.from_dict(hasil_tfidf)
            tfidf.append([
                tfbow[1][0], tfbow[1][1], tfbow[1][2], datatfidf, tfbow[1][4],
                tfbow[1][5]
            ])

    return tfidf
Example #16
def simpan_input():
    if 'username' in session:
        if request.method == 'POST':
            file = request.files['file']

            if 'file' not in request.files:
                return render_template('member/aduan_konten.html')

            if file.filename == '':
                return render_template('member/aduan_konten.html')

            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))

                url = request.form['input']

                html = urlopen(url)
                soup = BeautifulSoup(html, 'lxml')
                nama = soup.find('span', class_="FullNameGroup").text
                username_tweet = soup.find(
                    'span', class_="username u-dir u-textTruncate").text
                tweet = soup.find(
                    'p',
                    class_=
                    "TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text"
                ).text
                waktu = soup.find('span', class_="metadata").text
                retweet = soup.find('a', class_="request-retweeted-popup")
                if retweet is None:
                    retweet1 = 'null'
                else:
                    retweet1 = retweet.text

                like = soup.find('a', class_="request-favorited-popup")
                if like is None:
                    like1 = 'null'
                else:
                    like1 = like.text

                link = str(url)
                comment = str(tweet)
                input_comment = re.sub(r"[^a-zA-Z]+", " ", comment)
                input_comment = re.sub(r" +", " ", input_comment)
                input_comment = input_comment.strip('[123.!? \n\t]')
                inputan = input_comment.lower()

                tokens = word_tokenize(inputan)
                separator = ' '
                token = separator.join(tokens)

                table = {}
                with open('spelling_word.txt', 'r') as syn:
                    for row in syn:
                        match = re.match(r'(\w+)\s+=\s+(.+)', row)
                        if match:
                            primary, synonyms = match.groups()
                            synonyms = [
                                synonym.lower()
                                for synonym in synonyms.split()
                            ]
                            for synonym in synonyms:
                                table[synonym] = primary.lower()

                spelling = []
                for idx, value in enumerate(tokens):
                    if value in table:
                        spelling.append(table[value])
                    else:
                        spelling.append(value)
                spell = ' '.join(spelling)

                stop_factory = StopWordRemoverFactory()
                stopword_fac = stop_factory.create_stop_word_remover()
                stopword = stopword_fac.remove(spell)

                factory = StemmerFactory()
                stemmer = factory.create_stemmer()
                stemming = (stemmer.stem(stopword))
                hasil = [stemming]

                data_pre = pan.read_sql('SELECT * FROM data_latih',
                                        con=database)

                tf = CountVectorizer()
                ft = CountVectorizer()
                vec = TfidfVectorizer(smooth_idf=False, norm=None)

                train_matrix = vec.fit_transform(
                    data_pre['comment'].values.astype('U')).toarray()
                test_matrix = vec.transform(hasil)
                transformed_data_latih = tf.fit_transform(
                    (data_pre['comment']))
                transformed_data_uji = tf.transform(hasil)
                ft_fitur = ft.fit_transform(hasil)

                fitur = vec.get_feature_names()
                tf_fitur = tf.get_feature_names()
                fitur_uji = ft.get_feature_names()

                freq_uji = np.ravel(test_matrix.sum(axis=0))
                freq_uji1 = np.ravel(ft_fitur.sum(axis=0))
                freq_latih = np.ravel(transformed_data_latih.sum(axis=0))
                data_latih = list(zip(tf_fitur, freq_latih))

                data_uji1 = list(zip(fitur_uji, freq_uji1))
                data_uji = list(zip(fitur, freq_uji))

                y_test = test_matrix
                joblib_file = "C:/Users/Tulenesia/sara/randomforest.sav"
                rf = joblib.load(joblib_file)
                aa = rf.predict(y_test)
                hasil_akhir = str(aa)

                hasiltext = ''
                label = ''
                if aa == [0]:
                    hasiltext = '"Pesan TIDAK Termasuk Kategori SARA (NON_SARA)"'
                    label = 'NON_SARA'
                elif aa == [1]:
                    hasiltext = '"Pesan Termasuk Kategori SARA"'
                    label = 'SARA'

                current = datetime.now()
                tahun = current.year
                bulan = current.month
                hari = current.day
                tgl = date(int(tahun), int(bulan), int(hari))

                data = {}
                user = session['username']
                data['username'] = user
                data['input'] = comment
                data['output'] = hasiltext
                data['random'] = rf
                data['prepro'] = hasil
                data['case'] = inputan
                data['tokens'] = tokens
                data['spelling'] = spell
                data['stopword'] = stopword
                data['stemming'] = stemming
                data['tf_idf'] = data_uji
                data['fitur'] = data_uji1
                data['label'] = label

                cur = database.cursor()
                cur.execute(
                    "INSERT INTO data_uji (username_member, url, nama, username_tweet, tweet, tgl_tweet, jml_retweet, jml_like, label, tgl_input) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                    (
                        user,
                        link,
                        nama,
                        username_tweet,
                        comment,
                        waktu,
                        retweet1,
                        like1,
                        label,
                        tgl,
                    ))
                database.commit()

                return render_template('member/simpan_input.html', masuk=data)
Example #17
File: app.py Project: skuzky/TuguBot
def _get_data(tanya):
    factory = StopWordRemoverFactory()
    stopword = factory.create_stop_word_remover()
    # text = "Jam Buka Gembiraloka adalah 08 - 5 sore . Jam Buka Malioboro yaitu dari jam 9 - 10 .Lokasi Malioboro berada di Jalan Malioboro, Sosrorumedan .  Biaya tiket masuk malioboro adalah 15 ribu"
    with open('db/resonse.txt', 'r') as file:
        dataresp = file.read()

    with open("db/katadasar.txt", "r") as f:
        contents = f.read().splitlines()

    with open("db/daftar_kota.txt", "r") as f:
        kota = f.read().splitlines()

    suggestion = []
    list_wisata = ['situs warungboto']

    # define db teks
    text = dataresp
    txt_lower = text.lower()
    # define pattern
    pertanyaan = tanya.lower()

    # rewrite the question pattern
    if 'alamat' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('alamat', 'lokasi')
    elif 'waktu operasional' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('waktu operasional', 'jam buka')
    elif 'jam operasional' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('jam operasional', 'jam buka')
    elif 'waktu' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('waktu', 'jam')
    elif 'berapa biaya tiket masuk' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('berapa biaya tiket masuk',
                                            'harga tiket')
    elif 'berapa biaya' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('berapa biaya', 'harga')
    elif 'jam berapa museum' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('jam berapa museum', 'museum')
    elif 'jam berapa' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('jam berapa', '')
    elif 'masuk' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('masuk', '')
    elif 'letak' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('letak', 'lokasi')
    elif 'berapa' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('berapa', '')
    elif 'kapan' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('kapan', '')
    elif 'biaya' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('biaya', 'harga')
    elif 'titik 0 kilometer' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('0', 'nol')
    elif 'titik 0 km' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('0 km', 'nol kilometer')
    elif 'lokasi malioboro' in pertanyaan:
        new_pertanyaan = pertanyaan.replace('lokasi malioboro',
                                            'lokasi jalan malioboro')
    else:
        new_pertanyaan = pertanyaan

    stop_pattern = stopword.remove(new_pertanyaan)
    new_pattern = stop_pattern

    for kata in new_pattern.casefold().split():
        if kata not in contents:
            suggestion = difflib.get_close_matches(kata, contents)

    if not suggestion:
        result = BMSearch(txt_lower, new_pattern)
        kotas = [
            "kebun plasma nuftah", "situs warungboto", "museum vredeburg",
            "jalan malioboro", "de mata museum", "masjid gedhe kauman",
            "tugu jogja", "museum sonobudoyo", "plengkung gading",
            "museum bahari", "alun alun kidul", "alun alun utara",
            "taman pelangi", "taman pintar", "titik nol kilometer",
            "monumen jogja kembali", "museum kereta keraton", "taman sari",
            "monumen serangan umum 1 maret", "museum sasmitaloka",
            "museum dewantara kirti griya", "museum sasana wiratama",
            "gembiraloka"
        ]
        question = [
            "dimana", "berapa", "harga", "deskripsi", "dimana lokasi",
            "berapa harga", "berapa harga tiket", "kapan", "jam buka",
            "jam berapa"
        ]
        belum_ada = ["fasilitas", "sejarah", "video profil"]
        if any(new_pattern in kotas[x] for x in range(len(kotas))):
            new_text = "Silahkan gunakan pertanyaan yang lebih lengkap :)"
        elif any(new_pattern in question[x] for x in range(len(question))):
            new_text = "Mungkin kamu harus memberikan pertanyaan yang lebih lengkap"
        elif (result == -1) or (len(new_pattern) < 5):
            new_text = "Mohon maaf aku tidak mengerti maksud kamu :("
            if any(new_pattern in belum_ada[x] for x in range(len(belum_ada))):
                slit = new_pattern.split(' ')
                for x in range(len(slit)):
                    for a in range(len(belum_ada)):
                        if (belum_ada[a] == slit[x]):
                            gaada = slit[x]
                            new_text = "Info tentang {} belum terdaftar di dalam database".format(
                                gaada)
            else:
                new_text = "Mohon maaf saku tidak mengerti maksud kamu :("
        else:
            if new_pattern is not None:
                new_text = [
                    i for i in text.split('.') if new_pattern in i.lower()
                ][0]
                if 'deskripsi' in new_text:
                    new_text = new_text.replace('deskripsi', '')
            # new_text = "Mohon maaf aku tidak mengerti maksud kamu :("
            # print(new_text)

        # return render_template("home.html", pertanyaan=new_text)
        return jsonify({
            'data':
            render_template('response.html',
                            jawaban=new_text,
                            pertanyaan=pertanyaan,
                            apa=tanya)
        })
    else:
        # build a "did you mean ...?" message from the close matches found above;
        # without this assignment the return below would reference an undefined new_text
        new_text = f'mungkin maksud anda {", ".join(str(x) for x in suggestion)} ?'
        return jsonify(
            {'data': render_template('response.html', jawaban=new_text)})
Example #18
from gensim.corpora import Dictionary
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
import math
import numpy as np
import re
import json

# code NLP
model = Word2Vec.load("idwiki_word2vec_200.model")

dataset = []
with open('output.json') as json_file:
    dataset = json.load(json_file)

sw_remover = StopWordRemoverFactory().create_stop_word_remover()
stemmer = StemmerFactory().create_stemmer()


def preprocess(document):
    document = sw_remover.remove(document)
    document_stem = stemmer.stem(document).split(" ")
    document_token = [w for w in document_stem if w.isalpha()]
    return document_token


def predict_decease(input_document, docsim_index, dictionary):
    query = preprocess(input_document)
    sims = docsim_index[dictionary.doc2bow(query)]
    predict_result = sims[0]
Example #19
from tqdm import tqdm
import os
import sys
import math


def index(hashs, lists):
    for i in lists:
        if i in hashs:
            hashs[i] += 1
        else:
            hashs[i] = 1


# get indonesian stopword
get_stopword = StopWordRemoverFactory()
stopwords = get_stopword.create_stop_word_remover()
# get indonesian stemming
get_stemmer = StemmerFactory()
stemmer = get_stemmer.create_stemmer()

# make hash
df, tf, idf, mains, titles = dict(), dict(), dict(), dict(), dict()

if os.path.exists('../data/clean'):
    print(f'Directory : ../data/clean')
    for f in tqdm(Path('../data/clean').glob("*.txt")):
        name = str(f).split('/')
        df[name[3]], mains[name[3]], titles[name[3]] = dict(), dict(), dict()

        File = open(f, 'r').read()
Example #20
File: test.py Project: Incerious/Plugin
    def a(self):
        fileName = askopenfilename(filetypes=(("Notepad", "*.txt"),
                                              ("All files", "*.*")))
        ref = open(fileName, "r")
        r1 = ref.read().lower()
        # apply stopword removal
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        kalimat = r1
        stop = stopword.remove(kalimat)
        ref = nltk.tokenize.word_tokenize(stop)

        print(ref)
        self.txt.insert(END, "Ref : {}".format(' '.join(ref)) + "\n")

        #fileName = askopenfilename(filetypes = ( ("Notepad", "*.txt"),("All files", "*.*") ))
        f = open("hypo.txt", 'r')
        #f2 = open(fileName, "r")
        f1 = f.read().lower()
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()
        kalimat = f1
        stop = stopword.remove(kalimat)
        hyp = nltk.tokenize.word_tokenize(stop)
        print(hyp)
        # print the contents of the loaded file into the label
        self.txt.insert(END, "Hyp : {}".format(''.join(hyp)), "\n")

        def wer(r, h, debug=False):
            # costs will hold the costs, as in the Levenshtein distance algorithm
            costs = [[0 for inner in range(len(h) + 1)]
                     for outer in range(len(r) + 1)]
            # backtrace will hold the operations we've done.
            # so we could later backtrace, like the WER algorithm requires us to.
            backtrace = [[0 for inner in range(len(h) + 1)]
                         for outer in range(len(r) + 1)]

            OP_OK = 0
            OP_SUB = 1
            OP_INS = 2
            OP_DEL = 3
            DEL_PENALTY = 1  # Tact
            INS_PENALTY = 1  # Tact
            SUB_PENALTY = 1  # Tact
            # First column represents the case where we achieve zero
            # hypothesis words by deleting all reference words.
            for i in range(1, len(r) + 1):
                costs[i][0] = DEL_PENALTY * i
                backtrace[i][0] = OP_DEL

            # First row represents the case where we achieve the hypothesis
            # by inserting all hypothesis words into a zero-length reference.
            for j in range(1, len(h) + 1):
                costs[0][j] = INS_PENALTY * j
                backtrace[0][j] = OP_INS

            # computation
            for i in range(1, len(r) + 1):
                for j in range(1, len(h) + 1):
                    if r[i - 1] == h[j - 1]:
                        costs[i][j] = costs[i - 1][j - 1]
                        backtrace[i][j] = OP_OK
                    else:
                        substitutionCost = costs[i - 1][
                            j - 1] + SUB_PENALTY  # penalty is always 1
                        insertionCost = costs[i][
                            j - 1] + INS_PENALTY  # penalty is always 1
                        deletionCost = costs[
                            i - 1][j] + DEL_PENALTY  # penalty is always 1

                        costs[i][j] = min(substitutionCost, insertionCost,
                                          deletionCost)
                        if costs[i][j] == substitutionCost:
                            backtrace[i][j] = OP_SUB
                        elif costs[i][j] == insertionCost:
                            backtrace[i][j] = OP_INS
                        else:
                            backtrace[i][j] = OP_DEL

            # back trace though the best route:
            i = len(r)
            j = len(h)
            numSub = 0
            numDel = 0
            numIns = 0
            numCor = 0
            if debug:
                print("OP\tREF\tHYP")
                lines = []
            while i > 0 or j > 0:
                if backtrace[i][j] == OP_OK:
                    numCor += 1
                    i -= 1
                    j -= 1
                    if debug:
                        lines.append("OK\t" + r[i] + "\t" + h[j])
                elif backtrace[i][j] == OP_SUB:
                    numSub += 1
                    i -= 1
                    j -= 1
                    if debug:
                        lines.append("SUB\t" + r[i] + "\t" + h[j])
                elif backtrace[i][j] == OP_INS:
                    numIns += 1
                    j -= 1
                    if debug:
                        lines.append("INS\t" + "****" + "\t" + h[j])
                elif backtrace[i][j] == OP_DEL:
                    numDel += 1
                    i -= 1
                    if debug:
                        lines.append("DEL\t" + r[i] + "\t" + "****")
            if debug:
                lines = reversed(lines)
                for line in lines:
                    print(line)
                print("Ncor " + str(numCor))
                print("Nsub " + str(numSub))
                print("Ndel " + str(numDel))
                print("Nins " + str(numIns))
            # word error rate = (S + D + I) / N
            wer_result = round((numSub + numDel + numIns) / float(len(r)), 3)
            return {
                'WER': wer_result,
                'Cor': numCor,
                'Sub': numSub,
                'Ins': numIns,
                'Del': numDel
            }

        # call the wer function
        z = wer(ref, hyp, debug=True)
Example #21
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier 
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
 
# Update stopwords database
nltk.download('stopwords')
stopwords_factory = StopWordRemoverFactory()
stopwords_id = stopwords_factory.create_stop_word_remover()
stemmer_factory = StemmerFactory()
stemmer_id = stemmer_factory.create_stemmer()

def preprocess_data(titles, regex):
    ps = PorterStemmer()
    data = []
    for item in titles:
        
        title = item
        """
        # Remove all the spaces between numbers and keyterm
        title = re.sub('(?<=(\d)) (?=(g ))', '', title) 
        title = re.sub('(?<=(\d)) (?=(gb|mb))', '', title)
        title = re.sub('(?<=(\d)) (?=(mp))', '', title)   
Example #22
from collections import Counter
import math
import random

pd.set_option('display.max_colwidth', None)
SAVED_FILES = 'indobert_files'
BASE_PATH = "financeReport"
CHARACTER_THRESHOLD = 350
FILE_PATH = os.path.join(BASE_PATH + "/" + "laporan-keuangan-2018.pdf")
ct = CRFTagger()
ct.set_model_file('all_indo_man_tag_corpus_model.crf.tagger')

tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
default_stopwords = StopWordRemoverFactory().get_stop_words()
additional_stopwords = [
    "(", ")", "senin", "selasa", "rabu", "kamis", "jumat", "sabtu", "minggu"
]
dictionary = ArrayDictionary(default_stopwords + additional_stopwords)
id_stopword = StopWordRemover(dictionary)
en_stopword = set(stopwords.words('english'))
en_stemmer = PorterStemmer()


def remove_numbers(text):
    words = tokenizer.tokenize(text)
    return " ".join(words)


def remove_punctuation(text):
Example #23
def Tampil_data():
    if request.method == 'POST':

        select1 = 1
        select2 = 10
        selectcolom = "4"
        namacolom = "judul"

        wb = load_workbook(filename='app/upload_data/penelitian.xlsx')
        sheet_ranges = wb['DANA UAD']
        data = pd.DataFrame(sheet_ranges.values)

        row1 = int(select1)
        row2 = int(select2)

        cols = selectcolom.split(
            ",")  # split the selected-column input on commas
        cols = list(map(int, cols))  # convert to int
        xname = namacolom.split(
            ",")  # split the column-name input on commas
        data = data[row1:row2][
            cols]  # select data by the chosen rows and columns
        data.columns = [xname]

        # -----------stopword-------------------------------
        factory = StopWordRemoverFactory()
        stopword = factory.create_stop_word_remover()

        a = []
        a.append(data['judul'].values.tolist())

        list_sentence = []
        for reviews in a:
            for review in reviews:
                data_clean = review.lower()
                list_sentence.append(stopword.remove(data_clean))
# -----------end of stopword-----------------------

# -----------stemming------------------------------
        list_stem = []
        for reviews in list_sentence:
            # strip non-ASCII characters; decode back to str so the stemmer receives text, not bytes
            data_stem = reviews.encode("ascii", "ignore").decode("ascii")
            list_stem.append(stemmer.stem(data_stem))


# -----------end of steming------------------------

        variable = data_stem

        pd.options.display.max_colwidth = 999
        data = pd.DataFrame(list_stem)
        head_filter = []
        for index in data.columns:
            custom_head = "Judul"
            head_filter.append(custom_head)
        data.columns = head_filter
    return render_template(
        'tampil_data.html',
        tables=[
            data.to_html(
                classes='table table-striped table-bordered table-hover')
        ])
Example #24
 def setUp(self):
     self.factory = StopWordRemoverFactory()
     return super(Test_StopWordRemoverFactoryTest, self).setUp()
Example #25
@author: Nadir Basalamah
"""

# coding: utf-8

# In[73]:

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
import pandas as pd

# In[74]:

stemm = StemmerFactory()
stemmer = stemm.create_stemmer()
stop = StopWordRemoverFactory()
stopwords = stop.get_stop_words()

# In[75]:

data = pd.read_csv(r"D:\dataset_textmining\dataset3.csv", encoding="ISO-8859-1")
dataset_uji = pd.read_csv(r"D:\dataset_textmining\datauji3.csv",
                          encoding="ISO-8859-1")

# ### Get komentar

# In[76]:

desc = data.loc[:, 'Komentar']
dataset = data.loc[:, ["Komentar", "Hasil Akhir"]]
data_uji = dataset_uji.loc[:, 'Komentar']
Example #26
class Search:
    stemmer = StemmerFactory().create_stemmer()
    stopword = StopWordRemoverFactory().create_stop_word_remover()

    dictionary = {}

    def __init__(self):
        filescore = open('words_score.txt', 'r')
        lines = filescore.readlines()

        for line in lines:
            part = line.split(' :: ')
            term = part[0]
            self.dictionary[term] = {}
            weight_terms = part[1].split()
            for weight in weight_terms:
                doc, w = weight.split(':')
                self.dictionary[term][doc] = float(w)

    def get_title(self, text):
        return re.search('<title>(.*?)</title>', text).group(1)

    def get_url(self, text):
        return re.search('<url>(.*?)</url>', text).group(1)

    def get_content(self, tags, text):
        result = ''
        for tag in tags:
            try:
                result += re.search(f'<{tag}>(.*?)</{tag}>',
                                    text).group(1) + ' '
            except AttributeError:
                result += str(re.search(f'<{tag}>(.*?)</{tag}>', text)) + ' '
        return result

    def search(self, query, total):
        result = {}
        article = []
        article_with_seconds = {}

        query = query.translate(str.maketrans('', '', punctuation))
        query = self.stopword.remove(query)
        query_terms = self.stemmer.stem(query.lower()).split()

        start = time.time()

        for term in query_terms:
            if term in self.dictionary.keys():
                for doc in self.dictionary[term].keys():
                    if doc not in result.keys():
                        result[doc] = float(self.dictionary[term][doc])
                    else:
                        result[doc] += float(self.dictionary[term][doc])

        sorted_result = sorted(result.items(),
                               key=lambda x: x[1],
                               reverse=True)

        for doc, w in sorted_result[:total]:
            if doc.startswith('detik'):
                dir_path = 'Clean/Scrap detik/' + doc
            elif doc.startswith('kompas'):
                dir_path = 'Clean/Scrap kompas/' + doc
            else:
                dir_path = 'Clean/Scrap liputan6/' + doc

            document = open(dir_path, 'r', encoding='utf-8').read()

            article.append({
                'title':
                self.get_title(document),
                'url':
                self.get_url(document),
                'content':
                self.get_content(['top', 'middle', 'bottom'],
                                 document)[:200].lstrip() + '...'
            })

        finish = time.time()
        process_time = round((finish - start), 5)
        article_with_seconds[process_time] = {}
        article_with_seconds[process_time] = article

        return article_with_seconds
Example #27
def createStopword(more_stopword=[]):
    stop_factory = StopWordRemoverFactory().get_stop_words()
    new_stop_word = stop_factory + more_stopword
    dictionary = ArrayDictionary(new_stop_word)
    stopword = StopWordRemover(dictionary)
    return stopword
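A usage sketch for createStopword, assuming PySastrawi's StopWordRemoverFactory, ArrayDictionary, and StopWordRemover are imported as in the earlier examples; the extra words are arbitrary:

custom_remover = createStopword(more_stopword=['wkwk', 'hehe'])
print(custom_remover.remove('wkwk ini hanya contoh kalimat hehe'))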
Example #28
def preprocess(text):
    """
    Fungsi untuk melakukan preproses (tokenizing, stopword removal, stemming)
    :param text : data artikel yang berisi judul, kata kunci, dan isi artikel yang telah dibentuk dalam satu baris
    :return row : list yang berisi kata artikel yang sudah dilakukan preprocessing
    """

    # Word tokenizer ------------------------------
    # text masukan dipisah berdasar spasi
    raw = text.split(' ')

    # membuat variabel cleaner untuk mengambil huruf saja dari text (simbol/selain huruf akan dihapus)
    cleaner = re.compile('[^a-zA-Z-]')

    # list penyimpan hasil tokenizing
    cleaned = []

    # untuk setiap kata dalam text
    for i in raw:

        # bersihkan kata tersebut dengan variabel cleaner, rubah semua ke lowercase, dan simpan ke list cleaned
        cleaned.append(cleaner.sub('', i).lower())

    # bersihkan list cleaned dari elemen kosong (sisa penghapusan) dan simpan ke variabel row bertipe list
    row = filter(None,cleaned)

    # Stopword removal & stemmer ------------------
    # factory_stemmer untuk pembangkit variabel stemmer
    factory_stemmer = StemmerFactory()

    # factory_stopword untuk pembangkit variabel stopword removal
    factory_stopword = StopWordRemoverFactory()

    # bangkitkan variabel stemmer dari factory_stemmer
    stemmer = factory_stemmer.create_stemmer()

    # bangkitkan variabel stopword removal dari factory_stemmer (uncomment baris dibawah jika ingin melakukan Stopword Removal dengan library Sastrawi
    # stopwords_removal = factory_stopword.create_stop_word_remover()

    # stopword removal; masukkan kata ke row jika kata tersebut bukan merupakan stopword yang berada pada library nltk
    # lokasi data stopword pada nltk (bisa ditambah manual dengan text editor) : [lokasi library nltk]\nltk_data\corpora\stopwords
    row = [word for word in row if word not in stopwords.words('indonesian')]

    # stemming
    new_row = []

    # untuk setiap kata dalam row
    for i in row:

        # stem kata tersebut dan simpan pada variabel new_i
        new_i = stemmer.stem(str(i))

        # jika new_i memiliki panjang lebih dari 2 (untuk menghapus hasil stemming tidak berarti yang kurang dari 2 digit)
        if len(new_i) > 2:
            # masukkan new_i ke new_row
            new_row.append(new_i)

    row = new_row

    # kembalikan row
    return row
Example #29
negatif = inSetLexicon['negatif']
positif = inSetLexicon['positif']

# tokenization
from nltk.tokenize import sent_tokenize, word_tokenize

# stemming
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

#stopword removal
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# -------------global variable-------------
# dataStatis = 'buang air kecil karena kurang pikiran, semua pergi dilakukan untuk mencari pasangan hidup, serta bersuka-sukaan.'
# dataStatis = 'aku mencium telapak kaki ayah, sangat ingin makan bawang agar sehat.'
dataStatis = 'aku jalan bebas hambat, tarik napas habis, membuat orak senyum di jalan.'

# dataStatis = 'Jangan terlalu sering bercanda pak, walaupun bercanda itu perlu'


# -------------import excel dataset-------------
def importExcelDataSet():
    hasil = []
    for i in range(2, 7):
        hasil.append(sheet1.cell(row=i, column=7).value)
Example #30
class Search():
    """
    Class for searching algorithm
    """
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stopword = StopWordRemoverFactory()
    stopword = stopword.create_stop_word_remover()
    ARTICLE_DIR = './Clean/'

    def __init__(self, inverted_index):
        self.inverted_file = open(inverted_index, 'r')
        self.doc_vector, self.tfidf = self.read_inverted()

    def read_inverted(self):
        """
        Read tf-idf from provided file
        """
        tfidf = {}
        inverted_index = {}
        doc_vector = {}
        for line in self.inverted_file.read().split('\n'):
            splitter = line.split(' | ')
            term = splitter[0].split('-')
            try:
                tfidf[term[0]] = {'idf': float(term[1].split(':')[1])}
            except IndexError:
                pass

            for docs_tf in splitter[1:]:
                docs_tf = docs_tf.split(':')

                tfidf[term[0]][docs_tf[0]] = float(docs_tf[1])
                if docs_tf[0] not in doc_vector:
                    doc_vector[docs_tf[0]] = float(docs_tf[1])**2
                else:
                    doc_vector[docs_tf[0]] += float(docs_tf[1])**2
        return doc_vector, tfidf

    def text_cleaner(self, text):
        """
        Remove punctuation and stopwords, then stem the text using the
        PySastrawi module
        """
        content = text.translate(str.maketrans('', '', punctuation))
        content = self.stopword.remove(content)
        text_cleaned = self.stemmer.stem(content.lower())

        query = []

        for token in text_cleaned.split(' '):
            if token not in self.tfidf:
                continue
            else:
                query.append(token)
        return query

    def query_vectorizer(self, query_token):
        """
        Vectorize the query and compute the tf-idf weight for each
        query term
        """
        query_tf = {}
        query_tfidf = {}
        for token in query_token:
            if token not in self.tfidf:
                query_tfidf[token] = 0

            if token not in query_tf:
                query_tf[token] = 1
            else:
                query_tf[token] += 1
        total_sum = sum(query_tf.values())
        for token in query_token:
            if token not in self.tfidf:
                continue
            query_tfidf[token] = float(query_tf[token]) / \
                total_sum * self.tfidf[token]['idf']

        return query_tfidf

    def document_in_query_token(self, query_token):
        """
        return only document related to the query
        """
        union_docs = []
        for token in query_token:
            union_docs.extend(list(self.tfidf[token].keys()))
        if 'idf' in union_docs:
            union_docs.remove('idf')
        union_docs = set(union_docs)

        return union_docs

    def norm_from_vector(self, vector: dict):
        """
        return norm of given vector
        """
        norm = 0
        for key in vector.keys():
            norm += vector[key]**2
        return math.sqrt(norm)

    def search_query(self, query):
        """
        return document sorted by the cosine measure
        for each document
        """
        start = datetime.now()
        query_token = self.text_cleaner(query)
        query_tfidf = self.query_vectorizer(query_token)
        union_docs = self.document_in_query_token(query_token)

        cosine_measure = {}
        for token in query_token:
            for document in union_docs:
                if document not in self.tfidf[token]:
                    cosine_value = 0
                else:
                    cosine_value = self.tfidf[token][document] * \
                        query_tfidf[token]

                if document not in cosine_measure:
                    cosine_measure[document] = cosine_value
                else:
                    cosine_measure[document] += cosine_value
        if 'idf' in cosine_measure:
            cosine_measure.pop('idf')

        for key in cosine_measure.keys():
            cosine_measure[key] /= self.norm_from_vector(
                query_tfidf) * math.sqrt(self.doc_vector[key])
        cosine_measure = dict(
            sorted(cosine_measure.items(), key=lambda item: item[1], reverse=True))

        end = datetime.now()-start
        cosine_measure['process_time'] = end.total_seconds()
        return cosine_measure

    def get_article(self, docs_id):
        """
        return all of article information to display
        on website
        """
        file = open(self.ARTICLE_DIR + docs_id).read()

        article = {}
        url = re.search(r'(?<=<url>).*?(?=</url>)', file)
        title = re.search(r'(?<=<title>).*?(?=</title>)', file)
        top = re.search(r'(?<=<top>).*?(?=</top>)', file)
        middle = re.search(r'(?<=<middle>).*?(?=</middle>)', file)
        bottom = re.search(r'(?<=<bottom>).*?(?=</bottom>)', file)

        article['url'] = url[0]
        article['title'] = title[0]
        isi = top[0] + middle[0] + bottom[0]
        if len(isi) > 200:
            article['text'] = isi[:200] + '...'
        else:
            article['text'] = isi

        return article