from gbk.gbk import GBK as Model
from collections import Counter
# https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# https://docs.python.org/2/library/collections.html
from documents import document

# Build a two-category (sport / notsport) GBK model from a handful of
# hand-tagged sentences.  Tags must be unique and must have a space before
# and after (to distinguish them from ordinary words).
model = Model()

topics = {'model': ['sport', 'notsport']}
# topics['model2'] = ['np','election','close']
# topics['model3'] = ['close']

keys = {
    'sport': ['sp'],      # ,'sports','game','ball','match']
    'notsport': [' np'],  # ,'np','death']
}

model.init(topics, keys)  # .MinKey(2)

# Tiny labelled corpus: each document carries its tag word ("sp"/"np") inline.
document1 = 'A great game sports occur sp '
document2 = 'The election was over death np '
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]

# def countPhrase(key, content):
#     i = 0
#     while key in content:
#         i += 1
#         content = content.replace(key, "", 1)
#     return i
# print(countPhrase("have a nice","i,have a nice comma,have a yes but I also have a nice way to live"))
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


def applynounfilter(thedata):
    # Lower-case the text, tokenize it, POS-tag the tokens, and keep only
    # the nouns (tags starting with 'NN'), returning them re-joined as one
    # space-separated string.
    thedata = thedata.lower()
    tokenized = nltk.word_tokenize(thedata)
    is_noun = lambda pos: pos[:2] == 'NN'
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    thedata = ' '.join(nouns)
    # print(thedata)
    return thedata


# NOTE(review): Model is imported elsewhere in this file (chunk boundary).
model = Model()
model.load("articlemodel.json")

# document ='''
# FIVE people, including a woman, have been shot dead overnight in four separate incidents in Tunapuna, Santa Cruz, La Horquetta and Laventille. Three of the people have so far been identified while police are yet to reveal the identities of the other two. In the first incident, police said Khelon "Kokey" McLeod, was shot dead at the savannah at Achong Trace, Tunapuna. Around the same time, Fabian Thomas was gunned down in Santa Cruz. A man and a woman, both of whom were standing along Morgan Lane, off Pashley Street, Laventille, were shot and killed around midnight. Their identities are yet to be released. And sometime around 2 a.m. Patrick Aaron was shot dead at Phase 4 LaHorquetta. Meanwhile, one man who was listed in critical condition at hospital after being shot together with three other people at the corner of Duncan and Charlotte Streets, Port of Spain, on Saturday night succumbed to his injuries yesterday evening. He has been identified as Israel Cox, 23. In that incident, Jevon Assing, 35, of Duncan Street, Port of Spain, Akeem Grant, 26 and Aria Haynes, 29, were also shot. Assing was pronounced dead at the scene. The latest incidents have taken the country's murder toll for the year so far to 157.
# '''

# NOTE(review): this triple-quoted article text continues past this chunk —
# no closing ''' is visible here.
document = ''' Evidence is growing that Cunupia doubles man Barry Choon killed his family before he committed suicide in Toco more than a week ago. Police found the weapon in the vehicle used to slit the throats of his pregnant wife, Shalini Sookdeo-Choon, and his daughter Sarah, before he likely turned it on himself. 
The weapon – a box cutter – was found on Choon’s lap where he sat in the driver’s seat of the family’s car, police said. Police said that an autopsy found that Shalini Sookdeo-Choon was five months pregnant with her third child at the time of her killing on April 12. Barry Choon Sookdeo-Choon would have given birth in August. She had the couple’s second child last year – a boy – Jacob, seven months old. The infant was next to her in the back seat when the bodies found in the car parked at Hambug Trace. Choon, his wife and seven year old daughter Sarah, pathologist found, died by cut throat injuries.
from gbk.gbk import GBK as Model
from topics import topics
import requests

headers = {'Content-Type': 'application/json'}
# apidomain = 'http://127.0.0.1:8082/'
apidomain = 'http://127.0.0.1:8085/api/'

model = Model()

# Fetch every article page from the local Flask-Restless API and collect
# the article bodies into doclist (seeded with a few hand-made samples).
# https://flask-restless.readthedocs.io/en/stable/requestformat.html#clientpagination
article = requests.get(apidomain + 'article', headers=headers).json()

document1 = 'A great game Sports'
document2 = 'The election was over np '
document3 = 'Very clean match, go ball sport'
document4 = 'A clean but forgettable game sports'
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]

print(len(article["objects"]))

# ?page=1
numberofpages = article["total_pages"]
nextpage = 1
while nextpage <= numberofpages:
    for obj in article["objects"]:
        print(obj["id"])
        doclist.append(obj["CONTENT"])
    nextpage += 1
    # FIX: only request the next page while one actually exists.  The
    # original fetched page total_pages + 1 after the last iteration,
    # issuing one extra (out-of-range) HTTP request.
    if nextpage <= numberofpages:
        article = requests.get(apidomain + 'article?page=' + str(nextpage),
                               headers=headers).json()
    # print(obj["CONTENT"])
from gbk.gbk import GBK as Model

# Load a previously trained TF-IDF sports model and classify two sentences.
model = Model()
# model.load("merged.json")
model.load("sportsmodelTFIDF.json")

document1 = "At election time the game of politics is played"
document2 = "A very close sport game was played "

prediction1 = model.predict('model', document1)
prediction2 = model.predict('model', document2)
result1 = prediction1.getTopics()
result2 = prediction2.getTopics()

print("Result1:{}\nResult2:{}".format(result1, result2))
    # NOTE(review): this chunk begins mid-function — the enclosing "def"
    # (and the Average helper it calls) are defined outside this view.
    for k, v in topic.items():
        arr.append(v)
    avg = Average(arr, num)
    return avg


def get_total(cat):
    # Sum the (numeric) values of one category dict and return the total.
    featuresum = 0
    for k, v in cat.items():
        featuresum += v
        # print(v)
    # print(featuresum)
    return featuresum


# NOTE(review): Model is imported elsewhere in this file (chunk boundary).
model = Model()
model.load("articlemodel.json")

# document ='''
# FIVE people, including a woman, have been shot dead overnight in four separate incidents in Tunapuna, Santa Cruz, La Horquetta and Laventille. Three of the people have so far been identified while police are yet to reveal the identities of the other two. In the first incident, police said Khelon "Kokey" McLeod, was shot dead at the savannah at Achong Trace, Tunapuna. Around the same time, Fabian Thomas was gunned down in Santa Cruz. A man and a woman, both of whom were standing along Morgan Lane, off Pashley Street, Laventille, were shot and killed around midnight. Their identities are yet to be released. And sometime around 2 a.m. Patrick Aaron was shot dead at Phase 4 LaHorquetta. Meanwhile, one man who was listed in critical condition at hospital after being shot together with three other people at the corner of Duncan and Charlotte Streets, Port of Spain, on Saturday night succumbed to his injuries yesterday evening. He has been identified as Israel Cox, 23. In that incident, Jevon Assing, 35, of Duncan Street, Port of Spain, Akeem Grant, 26 and Aria Haynes, 29, were also shot. Assing was pronounced dead at the scene. The latest incidents have taken the country's murder toll for the year so far to 157.
# '''

# NOTE(review): this triple-quoted article text continues past this chunk —
# no closing ''' is visible here.
document = ''' Evidence is growing that Cunupia doubles man Barry Choon killed his family before he committed suicide in Toco more than a week ago. Police found the weapon in the vehicle used to slit the throats of his pregnant wife, Shalini Sookdeo-Choon, and his daughter Sarah, before he likely turned it on himself. The weapon – a box cutter – was found on Choon’s lap where he sat in the driver’s seat of the family’s car, police said. 
Police said that an autopsy found that Shalini Sookdeo-Choon was five months pregnant with her third child at the time of her killing on April 12. Barry Choon Sookdeo-Choon would have given birth in August. She had the couple’s second child last year – a boy – Jacob, seven months old. The infant was next to her in the back seat when the bodies found in the car parked at Hambug Trace. Choon, his wife and seven year old daughter Sarah, pathologist found, died by cut throat injuries.
from gbk.gbk import GBK as Model
from gbk.gbk import Merger

# Two models intended for a merge test: model1 covers sport/notsport,
# model2 additionally covers "election" (see topics2 / keys['election']).
topics = {}
topics2 = {}
keys = {}
topics['model'] = ['sport', 'notsport']
topics2['model'] = ['sport', 'notsport', 'election']
keys['sport'] = ['sp']
keys['notsport'] = ['np']
keys['election'] = ['elc', 'election']

model1 = Model()
model2 = Model()
model1.init(topics, keys)
# FIX: model2 was initialised with `topics` even though `topics2` (which
# adds the 'election' category matching keys['election'] and the 'elc'
# tags in the documents below) was built for it.
model2.init(topics2, keys)

document1 = 'A great game sports occur sp '
document2 = 'The election was over death np elc'
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np elc'
document6 = 'The new president Trump won the election elc'

doclist1 = [document1, document2, document3]
doclist2 = [document4, document5]

# Train model1 on its corpus; model2/doclist2 are consumed further down.
for doc in doclist1:
    # print(doc)
    model1.build(doc)
# print(model1.model)
from gbk.gbk import GBK as Model
from collections import Counter
# https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# https://docs.python.org/2/library/collections.html
from documents import document

# Build a two-category (sport / notsport) GBK model from a handful of
# hand-tagged sentences.
model = Model()
topics = {}
keys = {}
# tags must be unique and must have a space before and after (to distinguish from other words)
topics['model'] = ['sport', 'notsport']
keys['sport'] = ['sp']  # ,'sports','game','ball','match']
keys['notsport'] = ['np']  # ,'np','death']
# topics['model2'] = ['np','election','close']
# topics['model3'] = ['close']

# FIX: `keys` was built above but never passed to init(); every sibling
# script in this project calls init(topics, keys) — without it the tag
# words defined above are never registered with the model.
model.init(topics, keys)  # .MinKey(2)

document1 = 'A great game sports occur sp '
document2 = 'The election was over death np '
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np'
doclist = [document1, document2, document3, document4, document5]

# def countPhrase(key, content):
#     i = 0
#     while key in content:
#         i += 1
#         content = content.replace(key, "", 1)
#     return i
# print(countPhrase("have a nice","i,have a nice comma,have a yes but I also have a nice way to live"))
def getTopicCustom(topic, num):
    # Given a {category: score} dict, compute the Average of all scores
    # (Average is defined elsewhere in this file) and return the
    # (key, score) of the LAST entry whose score is <= that average.
    # NOTE(review): the variable is named "largest" but the test keeps
    # scores at or below the average — confirm this is intended.
    largest = 0
    key = ""
    arr = []
    for k, v in topic.items():
        arr.append(v)
    avg = Average(arr, num)
    for k, v in topic.items():
        if v <= avg:
            largest = v
            key = k
    return key, largest


# NOTE(review): Model is imported elsewhere in this file (chunk boundary).
model = Model()
model.load("articlemodel.json")
# model.setpenaltyborder(1)
# print(model.penaltyborder)/
# result = (model.predict('Conflicts and War and Peace',document))
# print (result)
# key,value = getTopic(result,"")
# print("Conflicts and War and Peace ==>TAG_RETURNED: {} {}\n".format(key,value))


def getTopic(topic, category):
    # NOTE(review): this definition is truncated by the chunk boundary;
    # the remainder of its body continues outside this view.
    largest = 0
    key = ""
    for k, v in topic.items():
# ==========================================================================================================================
# =================================================================================================================================
print(
    "==========Group BY Key================================================================================================================"
)
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
from gbk.gbk import GBK as Model

# Train a GBK spam/ham model on the xx_/yy_ train split prepared earlier in
# this file, then collect true vs. predicted labels for a confusion matrix.
topics = {'model': ['spam', 'ham']}
keys = {'spam': ['spamx'], 'ham': ['hamx']}

model = Model()
model.init(topics, keys)

# print(len(x_test))
# print(len(y_test))

# Each training document is labelled by appending its tag word
# ("spam" + "x " -> "spamx ", "ham" + "x " -> "hamx ").
for idx in range(0, len(xx_train)):
    model.build((xx_train.iloc[idx]) + " " + yy_train.iloc[idx] + "x ")
model.setweights()

gbkcount = 0
y_true = []
y_pred = []
for idx in range(0, len(xx_test)):
    predictedtag, weight = model.predict('model', (xx_test.iloc[idx])).getTopic()
    y_true.append(yy_test.iloc[idx])
    y_pred.append(predictedtag)
    # NOTE(review): this chunk starts mid-function — the enclosing def
    # (a stop-word remover, judging by the commented call below) is
    # outside this view.
    return words


def applynounfilter(thedata):
    # Tokenize the text, POS-tag it, and keep only the nouns (tags
    # starting with 'NN'), rejoined into one space-separated string.
    tokenized = nltk.word_tokenize(thedata)
    is_noun = lambda pos: pos[:2] == 'NN'
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    # nouns = remove_stop_words(nouns)
    thedata = ' '.join(nouns)
    # thedata = re.sub("[-()\"#/@;:<>{}`+=~*\\|.!?,]", "", thedata)
    # print(thedata)
    return thedata


# NOTE(review): Model is imported elsewhere in this file (chunk boundary).
model = Model()
topics = {}
# topics['model'] = [
#     'ar1t','sch1ool','cr1ime',
#     # 'di1saster', 'ec1onomy',
#     # 'en1vironment','he1alth',
#     # 'aw1ard','labor','po1litics',
#     # 're1ligion','so1ciety','sp1ort'
# ]
topics['model'] = [
    'art', 'school', 'crime', 'disaster', 'economy', 'environment', 'health',
    'award', 'labor', 'politics', 'religion', 'society', 'sport'
]
keys = {}
# NOTE(review): this keyword list is truncated by the chunk boundary.
keys['art'] = [
    'art', 'culture', 'entertainment', 'music', 'history', 'film', 'media',
    # NOTE(review): this chunk begins mid-statement — these are the trailing
    # arguments of a train_test_split(...) call whose start is outside this
    # view (80/20 split, fixed random_state for reproducibility).
    df_y, test_size=0.2, random_state=4)

# print(len(x_train))
# print(len(y_train))
# print(x_train)
# for row in x_train:
#     print(row[0])
# https://github.com/shreyans29/thesemicolon/blob/master/Text%20Analytics%20tfidf.ipynb

topics = {}
keys = {}
topics['model'] = ['spam', 'ham']
keys['spam'] = ['spam']
keys['ham'] = ['ham']
model = Model()
model.init(topics, keys)

# Train: each document is labelled by appending its tag ("spam "/"ham ").
for i in range(0, len(x_train)):  # applynounfilter
    # print(y_train.iloc[i])
    model.build((x_train.iloc[i]) + " " + y_train.iloc[i] + " ")
    # print(df["Status"][i])
model.setweights()
model.tojson("spammodel")

test = "CLAIRE havin borin time now alone wanna cum over 2nite? Chat now 09099725823 hope Luv CLAIRE Calls£1/minmoremobsEMSPOBox45PO139WA"

# Evaluate on the held-out split.
numFalse = numTrue = 0
for i in range(0, len(x_test)):
    result = model.predict('model', (x_test.iloc[i])).getTopic()
    predictedtag, weight = result
    # NOTE(review): truncated at the chunk boundary — the if-body continues
    # outside this view.
    if predictedtag == y_test.iloc[i]:
# ================================================================================================================================= print( "==========Group BY Key================================================================================================================" ) # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html from gbk.gbk import GBK as Model topics = {} keys = {} topics['model'] = ['spam', 'ham'] topics['model2'] = ['spam'] topics['model3'] = ['ham'] keys['spam'] = ['spam'] keys['ham'] = ['ham'] model = Model() model.init(topics) # print(len(x_test)) # print(len(y_test)) for i in range(0, len(xx_train)): model.build(topics, keys, (xx_train.iloc[i]) + " " + yy_train.iloc[i] + " ") def computeTF(wordDict, bow): tfDict = {} bowCount = len(bow) for word, count in wordDict.items(): tfDict[word] = count / float(bowCount)
The infant was next to her in the back seat when the bodies found in the car parked at Hambug Trace. Choon, his wife and seven year old daughter Sarah, pathologist found, died by cut throat injuries. Jacob had been smothered to death. '''
# NOTE(review): the text above closes a triple-quoted article string whose
# opening ''' is outside this chunk.

document = " death of boy"
# NOTE(review): Model and getTopic are defined elsewhere in this file
# (chunk boundary); getTopic is called here with one argument, unlike the
# two-argument getTopic(topic, category) seen in another fragment — confirm
# which version applies.
model = Model()
model.load("articlemodel.json")
# document = "opera"

results = {}
# for topic,catigories in topics.items():
#     results[topic] = (model.predict(topic,document).copy())
#     print("{}: {}".format(topic,results[topic]))
results["topicmodel"] = (model.predict("topicmodel", document).copy())
# print("{}: {}".format("topicmodel",results["topicmodel"]))
key, value = getTopic(results["topicmodel"])
print("\nTAG_RETURNED: {} {}".format(key, value))
from gbk.gbk import GBK as Model
from topics import topics
import requests

headers = {'Content-Type': 'application/json'}
# apidomain = 'http://127.0.0.1:8082/'
apidomain = 'http://127.0.0.1:8085/api/'

model = Model()

# Fetch every article page from the local Flask-Restless API and collect
# the article bodies into doclist.
# https://flask-restless.readthedocs.io/en/stable/requestformat.html#clientpagination
article = requests.get(apidomain + 'article', headers=headers).json()

# document1 = 'A great game Sports'
# document2 = 'The election was over np '
# document3 = 'Very clean match, go ball sport'
# document4 = 'A clean but forgettable game sports'
# document5 = 'It was a close election np'
# doclist = [document1, document2, document3, document4, document5]
doclist = []

print("Number of Articles: {}".format(len(article["objects"])))

# ?page=1
numberofpages = article["total_pages"]
nextpage = 1
while nextpage <= numberofpages:
    for obj in article["objects"]:
        # print("ArticleID: {}".format(obj["id"]))
        doclist.append(obj["CONTENT"])
    nextpage += 1
    # FIX: only request the next page while one actually exists.  The
    # original fetched page total_pages + 1 after the last iteration,
    # issuing one extra (out-of-range) HTTP request.
    if nextpage <= numberofpages:
        article = requests.get(apidomain + 'article?page=' + str(nextpage),
                               headers=headers).json()
    # print(obj["CONTENT"])