Code Example #1
from gbk.gbk import GBK as Model
from collections import Counter  #https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# https://docs.python.org/2/library/collections.html
from documents import document
model = Model()

topics = {}
keys = {}
# tags must be unique and must have a space before and after (to distinguish them from other words)
topics['model'] = ['sport', 'notsport']
keys['sport'] = ['sp']  #,'sports','game','ball','match']
keys['notsport'] = ['np']  #,'death']
# topics['model2'] = ['np','election','close']
# topics['model3'] = ['close']
model.init(topics, keys)  #.MinKey(2)

document1 = 'A great game sports  occur sp '
document2 = 'The election was over death np '
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]

# def countPhrase(key,content):
#     i=0
#     while key in content:
#         i+=1
#         content = content.replace(key,"",1)
#     return i

# print(countPhrase("have a nice","i,have a nice comma,have a yes but I also have a nice way to live"))
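
The example above stops after assembling doclist; a minimal training sketch, assuming the build/setweights pattern from Code Examples #6, #9 and #11 applies to this model as well (the saved filename is hypothetical):

# Feed each tagged document to the model, then compute the key weights.
for doc in doclist:
    model.build(doc)
model.setweights()
# model.tojson("sportmodel")  # hypothetical filename; tojson() appears in Code Example #11
print(model.predict('model', 'Very clean game of ball').getTopic())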
Code Example #2
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')  # one-time downloads used by word_tokenize/pos_tag
from gbk.gbk import GBK as Model


def applynounfilter(thedata):
    thedata = thedata.lower()
    tokenized = nltk.word_tokenize(thedata)
    # Penn Treebank noun tags all start with 'NN' (NN, NNS, NNP, NNPS)
    is_noun = lambda pos: pos[:2] == 'NN'
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    thedata = ' '.join(nouns)
    # print(thedata)
    return thedata


model = Model()
model.load("articlemodel.json")

# document ='''
# FIVE people, including a woman, have been shot dead overnight in four separate incidents in Tunapuna, Santa Cruz, La Horquetta and Laventille. Three of the people have so far been identified while police are yet to reveal the identities of the other two. In the first incident, police said Khelon "Kokey" McLeod, was shot dead at the savannah at Achong Trace, Tunapuna. Around the same time, Fabian Thomas was gunned down in Santa Cruz. A man and a woman, both of whom were standing along Morgan Lane, off Pashley Street, Laventille, were shot and killed around midnight. Their identities are yet to be released. And sometime around 2 a.m. Patrick Aaron was shot dead at Phase 4 LaHorquetta. Meanwhile, one man who was listed in critical condition at hospital after being shot together with three other people at the corner of Duncan and Charlotte Streets, Port of Spain, on Saturday night succumbed to his injuries yesterday evening. He has been identified as Israel Cox, 23. In that incident, Jevon Assing, 35, of Duncan Street, Port of Spain, Akeem Grant, 26 and Aria Haynes, 29, were also shot. Assing was pronounced dead at the scene. The latest incidents have taken the country's murder toll for the year so far to 157.
# '''
document = '''
Evidence is growing that Cunupia doubles man Barry Choon killed his family before he committed suicide in Toco more than a week ago.
Police found the weapon in the vehicle used to slit the throats of his pregnant wife, Shalini Sookdeo-Choon, and his daughter Sarah, before he likely turned it on himself.
The weapon – a box cutter – was found on Choon’s lap where he sat in the driver’s seat of the family’s car, police said.
Police said that an autopsy found that Shalini Sookdeo-Choon was five months pregnant with her third child at the time of her killing on April 12.
 Barry Choon
Sookdeo-Choon would have given birth in August.
She had the couple’s second child last year – a boy – Jacob, seven months old.
The infant was next to her in the back seat when the bodies found in the car parked at Hambug Trace.
Choon, his wife and seven year old daughter Sarah, pathologist found, died by cut throat injuries.
'''
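
A minimal usage sketch for the noun filter above; the topic name and result handling are borrowed from Code Example #13, which loads the same articlemodel.json, and are assumed to apply here:

# Strip the article down to nouns, then classify the filtered text.
filtered = applynounfilter(document)
results = model.predict('topicmodel', filtered).copy()
print(results)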
Code Example #3
from gbk.gbk import GBK as Model
from topics import topics
import requests

headers = {'Content-Type': 'application/json'}
# apidomain = 'http://127.0.0.1:8082/'
apidomain = 'http://127.0.0.1:8085/api/'
model = Model()

# https://flask-restless.readthedocs.io/en/stable/requestformat.html#clientpagination
article = requests.get(apidomain + 'article', headers=headers).json()
document1 = 'A great game Sports'
document2 = 'The election was over np '
document3 = 'Very clean match, go ball sport'
document4 = 'A clean but forgettable game sports'
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]
print(len(article["objects"]))
# ?page=1
numberofpages = article["total_pages"]
nextpage = 1
while nextpage <= numberofpages:
    for i in range(0, len(article["objects"])):
        print(article["objects"][i]["id"])
        doclist.append(article["objects"][i]["CONTENT"])
    nextpage += 1
    if nextpage <= numberofpages:  # don't request a page past the last one
        article = requests.get(apidomain + 'article?page=' + str(nextpage),
                               headers=headers).json()

# print(article["objects"][i]["CONTENT"])
Code Example #4
from gbk.gbk import GBK as Model

model = Model()
# model.load("merged.json")
model.load("sportsmodelTFIDF.json")

document1 = "At  election  time the game of politics is played"
document2 = "A  very close sport game was played "

result1 = model.predict('model', document1).getTopics()
result2 = model.predict('model', document2).getTopics()

print("Result1:{}\nResult2:{}".format(result1, result2))
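
getTopics() returns the full tag-to-score map; the getTopic() accessor used in Code Examples #9 and #11 picks out just the winning pair, as in this short follow-up sketch:

# getTopic() returns the best (tag, weight) pair for the document.
tag1, weight1 = model.predict('model', document1).getTopic()
print("Top tag for document1: {} {}".format(tag1, weight1))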
Code Example #5
def get_average(topic, num):  # hypothetical name; the def line is cut off in the excerpt
    arr = []
    for k, v in topic.items():
        arr.append(v)
    avg = Average(arr, num)
    return avg


def get_total(cat):
    featuresum = 0
    for k, v in cat.items():
        featuresum += v
        # print(v)
    # print(featuresum)
    return featuresum


model = Model()
model.load("articlemodel.json")

# document ='''
# FIVE people, including a woman, have been shot dead overnight in four separate incidents in Tunapuna, Santa Cruz, La Horquetta and Laventille. Three of the people have so far been identified while police are yet to reveal the identities of the other two. In the first incident, police said Khelon "Kokey" McLeod, was shot dead at the savannah at Achong Trace, Tunapuna. Around the same time, Fabian Thomas was gunned down in Santa Cruz. A man and a woman, both of whom were standing along Morgan Lane, off Pashley Street, Laventille, were shot and killed around midnight. Their identities are yet to be released. And sometime around 2 a.m. Patrick Aaron was shot dead at Phase 4 LaHorquetta. Meanwhile, one man who was listed in critical condition at hospital after being shot together with three other people at the corner of Duncan and Charlotte Streets, Port of Spain, on Saturday night succumbed to his injuries yesterday evening. He has been identified as Israel Cox, 23. In that incident, Jevon Assing, 35, of Duncan Street, Port of Spain, Akeem Grant, 26 and Aria Haynes, 29, were also shot. Assing was pronounced dead at the scene. The latest incidents have taken the country's murder toll for the year so far to 157.
# '''
document = '''
Evidence is growing that Cunupia doubles man Barry Choon killed his family before he committed suicide in Toco more than a week ago.
Police found the weapon in the vehicle used to slit the throats of his pregnant wife, Shalini Sookdeo-Choon, and his daughter Sarah, before he likely turned it on himself.
The weapon – a box cutter – was found on Choon’s lap where he sat in the driver’s seat of the family’s car, police said.
Police said that an autopsy found that Shalini Sookdeo-Choon was five months pregnant with her third child at the time of her killing on April 12.
 Barry Choon
Sookdeo-Choon would have given birth in August.
She had the couple’s second child last year – a boy – Jacob, seven months old.
The infant was next to her in the back seat when the bodies found in the car parked at Hambug Trace.
Choon, his wife and seven year old daughter Sarah, pathologist found, died by cut throat injuries.
'''
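
A small sketch of how get_total might be used, assuming the prediction result maps tags to numeric scores as in Code Example #13 (the normalization itself is illustrative, not part of the excerpt):

# Turn raw category scores into proportions of the total.
scores = model.predict('topicmodel', document).copy()
total = get_total(scores)
if total:
    print({k: v / float(total) for k, v in scores.items()})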
Code Example #6
from gbk.gbk import GBK as Model
from gbk.gbk import Merger

topics = {}
topics2 = {}
keys = {}
topics['model'] = ['sport', 'notsport']
topics2['model'] = ['sport', 'notsport', 'election']
keys['sport'] = ['sp']
keys['notsport'] = ['np']
keys['election'] = ['elc', 'election']

model1 = Model()
model2 = Model()
model1.init(topics, keys)
model2.init(topics2, keys)

document1 = 'A great game sports  occur sp '
document2 = 'The election was over death np elc'
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np elc'
document6 = 'The new president Trump won the election elc'
doclist1 = [document1, document2, document3]
doclist2 = [document4, document5]

for i in range(0, len(doclist1)):
    # print(doclist1[i])
    model1.build(doclist1[i])

# print(model1.model)
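
The excerpt trains only model1; a parallel sketch for model2 follows. The unused Merger import suggests the two models are combined afterwards, but that API is not shown here, so only the build step is sketched:

# Train the second model on its own document list, mirroring the loop above.
for i in range(0, len(doclist2)):
    model2.build(doclist2[i])
# print(model2.model)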
Code Example #7
from gbk.gbk import GBK as Model
from collections import Counter  #https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# https://docs.python.org/2/library/collections.html
from documents import document
model = Model()

topics = {}
keys = {}
# tags must be unique and must have a space before and after (to distinguish them from other words)
topics['model'] = ['sport', 'notsport']
keys['sport'] = ['sp']  #,'sports','game','ball','match']
keys['notsport'] = ['np']  #,'np','death']
# topics['model2'] = ['np','election','close']
# topics['model3'] = ['close']
model.init(topics, keys)  #.MinKey(2)

document1 = 'A great game sports  occur sp '
document2 = 'The election was over death np '
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]

# def countPhrase(key,content):
#     i=0
#     while key in content:
#         i+=1
#         content = content.replace(key,"",1)
#     return i

# print(countPhrase("have a nice","i,have a nice comma,have a yes but I also have a nice way to live"))
Code Example #8
def getTopicCustom(topic, num):
    # Collect all scores, compute a custom average via Average(arr, num),
    # then return the last tag whose score is at or below that average.
    largest = 0
    key = ""
    arr = []
    for k, v in topic.items():
        arr.append(v)
    avg = Average(arr, num)
    for k, v in topic.items():
        if v <= avg:
            largest = v
            key = k

    return key, largest


model = Model()
model.load("articlemodel.json")

# model.setpenaltyborder(1)
# print(model.penaltyborder)

# result = (model.predict('Conflicts and War and Peace',document))
# print (result)
# key,value = getTopic(result,"")
# print("Conflicts and War and Peace ==>TAG_RETURNED: {} {}\n".format(key,value))


def getTopic(topic, category):
    # Return the highest-scoring (tag, weight) pair from the score map.
    largest = 0
    key = ""
    for k, v in topic.items():
        if v > largest:
            largest = v
            key = k
    return key, largest
Code Example #9
# =================================================================================================================================
print(
    "==========Group BY Key================================================================================================================"
)
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from gbk.gbk import GBK as Model

topics = {}
keys = {}
topics['model'] = ['spam', 'ham']
keys['spam'] = ['spamx']
keys['ham'] = ['hamx']
model = Model()
model.init(topics, keys)

# print(len(x_test))
# print(len(y_test))

for i in range(0, len(xx_train)):
    # append "x" to the label so it matches the tag keys above ('spamx'/'hamx')
    model.build((xx_train.iloc[i]) + " " + yy_train.iloc[i] + "x ")
model.setweights()
gbkcount = 0
y_true = []
y_pred = []
for i in range(0, len(xx_test)):
    predictedtag, weight = model.predict('model', (xx_test.iloc[i])).getTopic()
    y_true.append(yy_test.iloc[i])
    y_pred.append(predictedtag)
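
With y_true and y_pred collected, the confusion matrix from the scikit-learn page linked above can be computed directly; a minimal sketch:

from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predictions, in the order given by `labels`.
print(confusion_matrix(y_true, y_pred, labels=['spam', 'ham']))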
Code Example #10
    return words


def applynounfilter(thedata):
    tokenized = nltk.word_tokenize(thedata)
    is_noun = lambda pos: pos[:2] == 'NN'
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    # nouns = remove_stop_words(nouns)
    thedata = ' '.join(nouns)
    # thedata = re.sub("[-()\"#/@;:<>{}`+=~*\\|.!?,]", "", thedata)

    # print(thedata)
    return thedata


model = Model()
topics = {}
# topics['model'] = [
#     'ar1t','sch1ool','cr1ime',
#     # 'di1saster', 'ec1onomy',
#     # 'en1vironment','he1alth',
#     # 'aw1ard','labor','po1litics',
#     # 're1ligion','so1ciety','sp1ort'
# ]
topics['model'] = [
    'art', 'school', 'crime', 'disaster', 'economy', 'environment', 'health',
    'award', 'labor', 'politics', 'religion', 'society', 'sport'
]
keys = {}
keys['art'] = [
    'art', 'culture', 'entertainment', 'music', 'history', 'film', 'media',
Code Example #11
x_train, x_test, y_train, y_test = train_test_split(df_x,  # df_x: feature column, assumed; the call head is cut off in the excerpt
                                                    df_y,
                                                    test_size=0.2,
                                                    random_state=4)

# print(len(x_train))
# print(len(y_train))
# print(x_train)
# for row in x_train:
#     print(row[0])
# https://github.com/shreyans29/thesemicolon/blob/master/Text%20Analytics%20tfidf.ipynb
topics = {}
keys = {}
topics['model'] = ['spam', 'ham']
keys['spam'] = ['spam']
keys['ham'] = ['ham']
model = Model()
model.init(topics, keys)

for i in range(0, len(x_train)):  #applynounfilter
    # print(y_train.iloc[i])
    model.build((x_train.iloc[i]) + " " + y_train.iloc[i] + " ")
    # print(df["Status"][i])
model.setweights()
model.tojson("spammodel")
test = "CLAIRE   havin borin time  now alone  wanna cum over 2nite? Chat now 09099725823 hope    Luv CLAIRE  Calls£1/minmoremobsEMSPOBox45PO139WA"

numFalse = numTrue = 0
for i in range(0, len(x_test)):
    result = model.predict('model', (x_test.iloc[i])).getTopic()
    predictedtag, weight = result
    if predictedtag == y_test.iloc[i]:
        numTrue += 1
    else:
        numFalse += 1
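
A short follow-up sketch (assumed; the excerpt ends inside the counting loop) reporting accuracy on the held-out split:

# Fraction of test documents whose predicted tag matched the true label.
print("Accuracy: {:.3f}".format(numTrue / float(numTrue + numFalse)))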
Code Example #12
# =================================================================================================================================
print(
    "==========Group BY Key================================================================================================================"
)
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from gbk.gbk import GBK as Model

topics = {}
keys = {}
topics['model'] = ['spam', 'ham']
topics['model2'] = ['spam']
topics['model3'] = ['ham']
keys['spam'] = ['spam']
keys['ham'] = ['ham']
model = Model()
model.init(topics)

# print(len(x_test))
# print(len(y_test))

for i in range(0, len(xx_train)):
    model.build(topics, keys,
                (xx_train.iloc[i]) + " " + yy_train.iloc[i] + " ")


def computeTF(wordDict, bow):
    # Term frequency: raw count normalized by the document's word count.
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict
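
A companion sketch for the IDF side of the TF-IDF walkthrough linked in Code Example #11; the function name and input shape (a list of word-count dicts over a shared vocabulary) are assumptions, not part of the excerpt:

import math

def computeIDF(docList):
    # Document frequency: in how many word-count dicts each term appears.
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    N = len(docList)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                idfDict[word] += 1
    # idf(t) = log(N / df(t))
    for word, df in idfDict.items():
        idfDict[word] = math.log(N / float(df))
    return idfDict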
Code Example #13
The infant was next to her in the back seat when the bodies found in the car parked at Hambug Trace.

Choon, his wife and seven year old daughter Sarah, pathologist found, died by cut throat injuries.

Jacob had been smothered to death.





'''

document = " death of boy"

model = Model()
model.load("articlemodel.json")

# document = "opera"
results = {}

# for topic,catigories in topics.items():
#     results[topic] = (model.predict(topic,document).copy())
#     print("{}: {}".format(topic,results[topic]))

results["topicmodel"] = (model.predict("topicmodel", document).copy())
# print("{}: {}".format("topicmodel",results["topicmodel"]))

key, value = getTopic(results["topicmodel"])
print("\nTAG_RETURNED: {} {}".format(key, value))
Code Example #14
from gbk.gbk import GBK as Model
from topics import topics
import requests

headers = {'Content-Type': 'application/json'}
# apidomain = 'http://127.0.0.1:8082/'
apidomain = 'http://127.0.0.1:8085/api/'
model = Model()

# https://flask-restless.readthedocs.io/en/stable/requestformat.html#clientpagination
article = requests.get(apidomain + 'article', headers=headers).json()
# document1 = 'A great game Sports'
# document2 = 'The election was over np '
# document3 = 'Very clean match, go ball sport'
# document4 = 'A clean but forgettable game sports'
# document5 = 'It was a close election np'
# doclist = [document1, document2, document3, document4, document5]
doclist = []
print("Number of Articles: {}".format(len(article["objects"])))
# ?page=1
numberofpages = article["total_pages"]
nextpage = 1
while nextpage <= numberofpages:
    for i in range(0, len(article["objects"])):
        # print("ArticleID: {}".format(article["objects"][i]["id"]))
        doclist.append(article["objects"][i]["CONTENT"])
    nextpage += 1
    if nextpage <= numberofpages:  # don't request a page past the last one
        article = requests.get(apidomain + 'article?page=' + str(nextpage),
                               headers=headers).json()

# print(article["objects"][i]["CONTENT"])