from gbk.gbk import GBK as Model
from collections import Counter
# https://www.geeksforgeeks.org/find-k-frequent-words-data-set-python/
# https://docs.python.org/2/library/collections.html
from documents import document

model = Model()

topics = {}
keys = {}
# Tags must be unique and must have a space before and after them
# (to distinguish them from other words).
topics['model'] = ['sport', 'notsport']
keys['sport'] = ['sp']  # ,'sports','game','ball','match']
keys['notsport'] = [' np']  # ,'np','death']
# topics['model2'] = ['np','election','close']
# topics['model3'] = ['close']
model.init(topics, keys)  # .MinKey(2)

document1 = 'A great game sports occur sp '
document2 = 'The election was over death np '
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np'

doclist = [document1, document2, document3, document4, document5]


# def countPhrase(key, content):
#     i = 0
#     while key in content:
#         i += 1
#         content = content.replace(key, "", 1)
#     return i
# print(countPhrase("have a nice", "i,have a nice comma,have a yes but I also have a nice way to live"))
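
# Counter is imported above but never used; a minimal sketch of the
# "k most frequent words" idea from the GeeksforGeeks link above, applied
# to doclist. The names word_counts and k are illustrative only, not part
# of the GBK API.
word_counts = Counter(word for doc in doclist for word in doc.split())
k = 3
print(word_counts.most_common(k))  # the k most frequent tokens across doclist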
from gbk.gbk import GBK as Model
from gbk.gbk import Merger

topics = {}
topics2 = {}
keys = {}
topics['model'] = ['sport', 'notsport']
topics2['model'] = ['sport', 'notsport', 'election']
keys['sport'] = ['sp']
keys['notsport'] = ['np']
keys['election'] = ['elc', 'election']

model1 = Model()
model2 = Model()
model1.init(topics, keys)
model2.init(topics2, keys)  # topics2 adds the 'election' topic; initializing with plain topics would leave it and keys['election'] unused

document1 = 'A great game sports occur sp '
document2 = 'The election was over death np elc'
document3 = 'Very clean match, go ball sport sp '
document4 = 'A clean but forgettable game sports sp '
document5 = 'It was a close election np elc'
document6 = 'The new president Trump won the election elc'

doclist1 = [document1, document2, document3]
doclist2 = [document4, document5]

for i in range(0, len(doclist1)):
    # print(doclist1[i])
    model1.build(doclist1[i])
# print(model1.model)
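
# doclist2, model2, and Merger are wired up above but never exercised; a
# sketch of the presumed second build. The Merger call is an assumption:
# its real API isn't shown in this file, so the merge(...) signature below
# is hypothetical and left commented out.
for doc in doclist2:
    model2.build(doc)
# merged = Merger().merge(model1, model2)  # hypothetical signature
# print(merged.model)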
from gbk.gbk import GBK as Model
from topics import topics
import requests

headers = {'Content-Type': 'application/json'}
# apidomain = 'http://127.0.0.1:8082/'
apidomain = 'http://127.0.0.1:8085/api/'
model = Model()

# https://flask-restless.readthedocs.io/en/stable/requestformat.html#clientpagination
article = requests.get(apidomain + 'article', headers=headers).json()

document1 = 'A great game Sports'
document2 = 'The election was over np '
document3 = 'Very clean match, go ball sport'
document4 = 'A clean but forgettable game sports'
document5 = 'It was a close election np'

doclist = [document1, document2, document3, document4, document5]

print(len(article["objects"]))
numberofpages = article["total_pages"]  # the first request (above) returned page 1
nextpage = 1
while True:
    for obj in article["objects"]:
        print(obj["id"])
        doclist.append(obj["CONTENT"])
        # print(obj["CONTENT"])
    nextpage += 1
    if nextpage > numberofpages:
        break  # stop before requesting a page past total_pages
    article = requests.get(apidomain + 'article?page=' + str(nextpage),
                           headers=headers).json()
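
# The paging loop above can be factored into a reusable generator; a minimal
# sketch, assuming every Flask-Restless collection response carries the same
# "objects" and "total_pages" keys used above. fetch_all is a hypothetical
# helper, not part of GBK or Flask-Restless.
def fetch_all(endpoint):
    page = 1
    while True:
        resp = requests.get(apidomain + endpoint + '?page=' + str(page),
                            headers=headers).json()
        for obj in resp["objects"]:
            yield obj
        if page >= resp["total_pages"]:
            break
        page += 1

# e.g. doclist.extend(obj["CONTENT"] for obj in fetch_all('article'))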
    return words


def applynounfilter(thedata):
    # Keep only the nouns (POS tags starting with 'NN') in thedata.
    tokenized = nltk.word_tokenize(thedata)
    is_noun = lambda pos: pos[:2] == 'NN'
    nouns = [word for (word, pos) in nltk.pos_tag(tokenized) if is_noun(pos)]
    # nouns = remove_stop_words(nouns)
    thedata = ' '.join(nouns)
    # thedata = re.sub("[-()\"#/@;:<>{}`+=~*\\|.!?,]", "", thedata)
    # print(thedata)
    return thedata


model = Model()

topics = {}
# topics['model'] = [
#     'ar1t', 'sch1ool', 'cr1ime',
#     # 'di1saster', 'ec1onomy',
#     # 'en1vironment', 'he1alth',
#     # 'aw1ard', 'labor', 'po1litics',
#     # 're1ligion', 'so1ciety', 'sp1ort'
# ]
topics['model'] = [
    'art', 'school', 'crime',
    'disaster', 'economy',
    'environment', 'health',
    'award', 'labor', 'politics',
    'religion', 'society', 'sport'
]

keys = {}
keys['art'] = [
    'art', 'culture', 'entertainment', 'music', 'history', 'film', 'media',