Code example #1
from gbk.gbk import GBK as Model
from collections import Counter  # https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# https://docs.python.org/2/library/collections.html
from documents import document
model = Model()

topics = {}
keys = {}
# tags must be unique and must have a space before and after (to distinguish them from other words)
topics['model'] = ['sport', 'notsport']
keys['sport'] = ['sp']  #,'sports','game','ball','match']
keys['notsport'] = [' np']  #,'np','death']
# topics['model2'] = ['np','election','close']
# topics['model3'] = ['close']
model.init(topics, keys)  #.MinKey(2)

document1 = 'A great game sports  occur sp '
document2 = 'The election was over death np '
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]

# def countPhrase(key,content):
#     i=0
#     while key in content:
#         i+=1
#         content = content.replace(key,"",1)
#     return i

# print(countPhrase("have a nice","i,have a nice comma,have a yes but I also have a nice way to live"))
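
Example #1 initializes the model but never trains it; a minimal sketch of feeding doclist in, assuming the same build() call that example #2 uses:

for doc in doclist:
    model.build(doc)  # assumed API, mirrored from example #2
# print(model.model)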
Code example #2
from gbk.gbk import GBK as Model
from gbk.gbk import Merger

topics = {}
topics2 = {}
keys = {}
topics['model'] = ['sport', 'notsport']
topics2['model'] = ['sport', 'notsport', 'election']
keys['sport'] = ['sp']
keys['notsport'] = ['np']
keys['election'] = ['elc', 'election']

model1 = Model()
model2 = Model()
model1.init(topics, keys)
model2.init(topics2, keys)  # model2 uses the three-topic config, including 'election'

document1 = 'A great game sports  occur sp '
document2 = 'The election was over death np elc'
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np elc'
document6 = 'The new president Trump won the election elc'
doclist1 = [document1, document2, document3]
doclist2 = [document4, document5, document6]

for doc in doclist1:
    # print(doc)
    model1.build(doc)

# print(model1.model)
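
Example #2 never trains model2; a sketch of the parallel build over doclist2 follows. The Merger import suggests the two models get combined afterwards, but its API is not shown in this listing, so the merge call is left as a hypothetical placeholder:

for doc in doclist2:
    model2.build(doc)

# Hypothetical placeholder; Merger's actual API is not shown here:
# merged = Merger().merge(model1, model2)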
Code example #3
from gbk.gbk import GBK as Model
from topics import topics
import requests

headers = {'Content-Type': 'application/json'}
# apidomain = 'http://127.0.0.1:8082/'
apidomain = 'http://127.0.0.1:8085/api/'
model = Model()

# https://flask-restless.readthedocs.io/en/stable/requestformat.html#clientpagination
article = requests.get(apidomain + 'article', headers=headers).json()
document1 = 'A great game Sports'
document2 = 'The election was over np '
document3 = 'Very clean match, go ball sport'
document4 = 'A clean but forgettable game sports'
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]
print(len(article["objects"]))
# ?page=1
numberofpages = article["total_pages"]
nextpage = 1
while nextpage <= numberofpages:
    for obj in article["objects"]:
        print(obj["id"])
        doclist.append(obj["CONTENT"])
    nextpage += 1
    if nextpage <= numberofpages:  # don't request a page past the last one
        article = requests.get(apidomain + 'article?page=' + str(nextpage),
                               headers=headers).json()

# print(obj["CONTENT"])
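
Example #3 collects the fetched articles into doclist but never trains the model it creates; a minimal sketch, assuming the init()/build() API from the earlier examples (keys is only a stub here, since this example imports topics alone):

keys = {}  # stub: this example defines no keyword lists
model.init(topics, keys)
for doc in doclist:
    model.build(doc)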
Code example #4
    return words


def applynounfilter(thedata):
    # Keep only the nouns: tokenize, POS-tag, and keep tokens whose
    # Penn Treebank tag starts with 'NN' (NN, NNS, NNP, NNPS).
    tokenized = nltk.word_tokenize(thedata)
    is_noun = lambda pos: pos[:2] == 'NN'
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    # nouns = remove_stop_words(nouns)
    thedata = ' '.join(nouns)
    # thedata = re.sub("[-()\"#/@;:<>{}`+=~*\\|.!?,]", "", thedata)

    # print(thedata)
    return thedata
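
A quick usage check for applynounfilter; NLTK's tokenizer and tagger data must be downloaded once beforehand:

import nltk  # also required by applynounfilter itself
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
print(applynounfilter('A clean but forgettable game sports'))
# keeps only the noun tokens, e.g. 'game sports'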


model = Model()
topics = {}
# topics['model'] = [
#     'ar1t','sch1ool','cr1ime',
#     # 'di1saster', 'ec1onomy',
#     # 'en1vironment','he1alth',
#     # 'aw1ard','labor','po1litics',
#     # 're1ligion','so1ciety','sp1ort'
# ]
topics['model'] = [
    'art', 'school', 'crime', 'disaster', 'economy', 'environment', 'health',
    'award', 'labor', 'politics', 'religion', 'society', 'sport'
]
keys = {}
keys['art'] = [
    'art', 'culture', 'entertainment', 'music', 'history', 'film', 'media',