def fetch(self, quantity=5000, format_str=", "):
    mmseg.dict_load_defaults()
    items = Item.query.from_statement(
        'select id, name from items order by id desc limit ' + str(quantity))
    f = open('data.basket', 'a')
    o = open('original.txt', 'a')
    banlist = ['、', '(', ')', '★', '【', '】', '!', ':']
    for i in items:
        seg = Segment.query.filter_by(id=i.id).first()
        if seg is None:
            seg = Segment(item_id=i.id)
            #session.commit()
        text = i.name.encode("utf-8")
        o.write(text + "\n")
        algor = mmseg.Algorithm(text)
        sep = "|"
        for tok in algor:
            if tok.text in banlist:
                continue
            sep += tok.text.decode('utf-8') + "|"
        seg.content = sep
        session.commit()
        f.write(self.format(sep).encode('utf-8') + "\n")
    f.close()
    o.close()
def HY_pymmseg(file1, file2):
    if os.path.isfile(file1):
        mmseg.dict_load_defaults()
        Dict = {}
        f1 = open(file1, 'r')
        f2 = open(file2, 'w')
        for item in f1.readlines():
            alg = mmseg.Algorithm(item)
            wordlist = []
            for tok in alg:
                wordlist.append(tok.text + "//")
                print "tok.text", tok.text
                if tok.text not in Dict:
                    Dict[tok.text] = 1
                else:
                    Dict[tok.text] += 1
            f2.writelines(wordlist)
        f1.close()
        f2.close()
        print "HY_pymmseg FINISHED"
        """
        for item in Dict:
            print "DICT"
            print item
            print Dict[item]
        """
    else:
        print "ERROR: HY_pymmseg: %s is not a file" % file1
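# Hypothetical usage sketch for HY_pymmseg (the file names are assumptions, not from the
# source): read raw text from raw_corpus.txt line by line and write the tokens, delimited
# by "//", to segmented_corpus.txt, counting token frequencies in Dict along the way.
#
#   HY_pymmseg('raw_corpus.txt', 'segmented_corpus.txt')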
def __init__(self):
    self.wordSegFlag = False
    self.idfMethod = 'userIndependent'
    self.segInMemo = False
    self.invInMemo = False
    self.segLst = dict()
    self.invLst = dict()
    self.TFIDFLst = dict()
    #self.mmseg = mmseg
    mmseg.dict_load_defaults()
def SplitKeyword(req):
    mmseg.dict_load_defaults()
    com_list = company.objects.filter()
    for com in com_list:
        words = com.Company_Name.encode("utf-8")
        algor = mmseg.Algorithm(words)
        for tok in algor:
            word = tok.text.decode("utf-8")
            print word
            keytable = keyword(word=word, mycom=com)
            keytable.save()
    return HttpResponse("split end")
def fileToDict(file):
    if os.path.isfile(file):
        mmseg.dict_load_defaults()
        Dict = {}
        f = open(file, 'r')
        for item in f.readlines():
            alg = mmseg.Algorithm(item)
            for tok in alg:
                if tok.text not in Dict:
                    Dict[tok.text] = 1
                else:
                    Dict[tok.text] += 1
        f.close()
        return Dict
    else:
        print "Error: <segment.py->fileToDict()> File not found"
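# Hypothetical usage sketch (the corpus file name is an assumption): build the token
# frequency dict and print the ten most common tokens.
if __name__ == '__main__':
    freq = fileToDict('corpus.txt')
    if freq:  # fileToDict returns None when the file is missing
        for word, count in sorted(freq.iteritems(), key=lambda kv: kv[1], reverse=True)[:10]:
            print word, count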
def __init__(self, *args, **kwargs):
    # argument check: positional args come in (name, value) pairs
    if args:
        if len(args) % 2 != 0:
            raise ParameterError("Config requires an equal number of values and scores")
    # dynamically initialize instance attributes
    for i in range(len(args) / 2):
        setattr(self, args[i * 2], args[i * 2 + 1])
    for key in kwargs:
        setattr(self, key, kwargs[key])
    # redis
    pool = redis.ConnectionPool(host=self.config.redis['host'],
                                port=self.config.redis['port'],
                                db=self.config.redis['db'])
    self.r = redis.Redis(connection_pool=pool)
    # self.r = redis.StrictRedis(host=self.config.redis['host'], port=self.config.redis['port'], db=self.config.redis['db'])
    self.pipeline = self.r.pipeline()
    # load the segmentation dictionaries
    mmseg.dict_load_defaults()
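# Hypothetical usage sketch (the class name Config is an assumption): positional arguments
# are consumed as (name, value) pairs and keyword arguments are set directly, so
#
#   c = Config('retries', 3, timeout=10)
#
# would set c.retries == 3 and c.timeout == 10. Note the constructor also expects a
# self.config.redis mapping with 'host', 'port' and 'db' keys when it builds the pool.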
from pymmseg import mmseg
import thread
import md5
import os
import socket
import time, datetime

from beaker.middleware import SessionMiddleware

from func import SendEmailThread, html

# link the database
db = DBModel("g_azure")
db.link_database()

# rsa token
TOKEN = 2113

# load dict
mmseg.dict_load_defaults()

# the states of a mission
WAITING = 0
COMPILING = 1
RUNNING = 2
COMPLETED = 3


def user_auth(func):
    """user authentication

    This is a decorator for all URLs that need user authentication.
    Before dealing with the URL, we need to get the identity of the visitor.

    Args:
        is_login: a bool indicating whether the user has logged in
# -*- coding: utf8 -*-
from pymmseg import mmseg

mmseg.dict_load_defaults()

text = '工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作'
algor = mmseg.Algorithm(text)
for tok in algor:
    print '%s [%d..%d]' % (tok.text.decode('utf8'), tok.start, tok.end)
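# The example above passes a utf-8 byte-string literal straight to mmseg.Algorithm. A
# minimal added sketch (not part of the original) of the same call when the text arrives
# as a unicode object, as in the Django snippets elsewhere in this section: encode first.
utext = u'工信处女干事每月经过下属科室'.encode('utf-8')  # mmseg requires utf-8 bytes
for tok in mmseg.Algorithm(utext):
    print '%s [%d..%d]' % (tok.text.decode('utf8'), tok.start, tok.end)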
def handle(self, *args, **options):
    self.stdout.write('Start training videos\n')
    mmseg.dict_load_defaults()

    # Build feature_dict, which stores the index of each feature name in the
    # features list; used for building the feature vector of each document
    feature_dict = {}
    index = 0
    for f in features:
        feature_dict[f] = index
        index += 1

    # Build the con_1 dictionary, which is used to construct con_2,
    # and initialize the prior dictionary
    con_1 = {}
    prior = {}
    channels = Channel.objects.filter(type=1)
    id_name_mapping = {}
    for c in channels:
        con_1[c.id] = {}  # will be filled with: {word: (has_count, total_count), ...}
        id_name_mapping[c.id] = c.name
        prior[c.id] = 0

    # Update con_1 by processing all training data
    # ..Warning..: assumes all videos in the database are already classified.
    # share_count=1 marks training data; share_count=2 means test data
    items = Item.objects.raw("select * from main_item where type=1 and channels<>'' and share_count=1")
    self.stdout.write(str(len(list(items))) + ' items to train\n')
    for item in items:
        feature_vector = [0 for f in features]  # entry changes to 1 if the feature is present
        item_channels = [int(c) for c in item.channels.split(',')]
        # update prior
        for c in item_channels:
            prior[c] += 1
        algor = mmseg.Algorithm((item.name + item.snippet).encode('utf-8'))  # mmseg requires utf-8
        for tok in algor:
            try:
                token_text = tok.text.decode('utf-8')  # the text comes out as utf-8; turn it into unicode
            except:
                continue
            token_text = token_text.lower()
            if token_text in feature_dict:
                feature_vector[feature_dict[token_text]] = 1
        # Now update con_1
        for i in range(len(feature_vector)):
            if feature_vector[i] != 0:
                for c in item_channels:
                    if features[i] in con_1[c]:
                        has_count, total_count = con_1[c][features[i]]
                    else:
                        has_count, total_count = (0, 0)
                    has_count += feature_vector[i]
                    total_count += feature_vector[i]
                    con_1[c][features[i]] = (has_count, total_count)
            else:
                for c in item_channels:
                    if features[i] in con_1[c]:
                        has_count, total_count = con_1[c][features[i]]
                    else:
                        has_count, total_count = (0, 0)
                    total_count += 1
                    con_1[c][features[i]] = (has_count, total_count)

    # Build con_2, pickle it and store it
    self.stdout.write('Start building con_2\n')
    con_2 = []
    for channel_id, d in con_1.items():
        d_2 = {}
        for word, stats in d.items():
            if stats[0] == 0:
                d_2[word] = 0.001  # normalize the difference in sample size between channels
            elif stats[0] == stats[1]:
                d_2[word] = 0.999  # so the opposite won't be 0
            else:
                d_2[word] = (stats[0] + 1) * 1.0 / (stats[1] + 1)
            # Boost the features that are representative of the channel
            # ..Warning..: some values will exceed 1, so the premise is that we won't use 1-P in classification
            if word in boosting_features[channel_id]:
                d_2[word] *= boosting_features[channel_id][word]
        con_2.append((channel_id, id_name_mapping[channel_id], d_2))
    with open(settings.PROJECT_ROOT + 'dataset/video_train_conditional.pkl', 'wb') as out:
        pickle.dump(con_2, out, -1)

    # Calculate the prior dict, pickle it and store it
    self.stdout.write('Start building prior\n')
    num_items = len(list(items))
    for k, v in prior.items():
        prior[k] = v * 1.0 / num_items
    with open(settings.PROJECT_ROOT + 'dataset/video_train_prior.pkl', 'wb') as out:
        pickle.dump(prior, out, -1)
    self.stdout.write('Finished building training data for videos\n')
def classify_video_channel(self):
    con = pickle.load(open(settings.PROJECT_ROOT + 'dataset/video_train_conditional.pkl', 'rb'))
    prior = pickle.load(open(settings.PROJECT_ROOT + 'dataset/video_train_prior.pkl', 'rb'))
    mmseg.dict_load_defaults()
    feature_dict = {}  # the same as that in train_video_channel
    index = 0
    for f in features:
        feature_dict[f] = index
        index += 1
    days_ago = datetime.now() - timedelta(days=2)
    #items = Item.objects.filter(type=1, channels='', share_count=2)  # in dev, share_count=2 means test data
    items = Item.objects.filter(type=1, channels='', create_date__gt=days_ago)
    item_count = len(list(items))
    print(str(item_count) + ' items to classify\n')
    # if settings.DEBUG:
    #     count = 0
    #     correct_count = 0
    #     no_class_count = 0
    #     false_list = []
    for item in items:
        feature_vector = [0 for f in features]
        algor = mmseg.Algorithm((item.name + item.snippet).encode('utf-8'))  # mmseg requires utf-8
        for tok in algor:
            try:
                token_text = tok.text.decode('utf-8')  # the text comes out as utf-8; turn it into unicode
            except:
                continue
            token_text = token_text.lower()
            if token_text in feature_dict:
                feature_vector[feature_dict[token_text]] = 1
        max_score = 0.0
        max_channel = 0
        for c in con:
            score = 1.0
            has_count = 0
            for i in range(len(feature_vector)):
                if feature_vector[i] != 0:  # do not multiply 1 - c[2][features[i]] otherwise
                    score *= c[2][features[i]]
                    has_count += 1
            if has_count == 0 or has_count == 1:  # no confidence
                score = 0.0
            score *= prior[c[0]]
            if score > max_score:
                max_score = score
                max_channel = c[0]
        if settings.PRODUCTION and item.channels:
            channels = item.channels
            c_list = channels.split(',')
            if not str(max_channel) in c_list:
                c_list.append(str(max_channel))
                item.channels = ','.join(c_list)
        else:
            if max_score != 0.0:  # do not use the default channel if we have no confidence
                item.channels = str(max_channel)
        item.save()
        # if settings.DEBUG:  # need to change the previous setting of channels to channels_predict
        #     if count % 10 == 0:
        #         print count
        #     count += 1
        #     if not item.channels:
        #         no_class_count += 1
        #     elif item.channels_predict in item.channels:
        #         correct_count += 1
        #     else:
        #         false_list.append(item.id)
    # if settings.DEBUG:
    #     print('Correct (not false) ratio: ' + str((correct_count + no_class_count) * 1.0 / item_count) + '\n')
    #     print('Number of items that could not be classified: ' + str(no_class_count) + '\n')
    #     #print(str(false_list))
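# A minimal sketch, distilled from the two snippets above (an addition, not code from the
# source), of the naive Bayes rule they implement: handle() estimates P(word|channel) and
# P(channel), and classify_video_channel() picks the channel maximizing the product of the
# prior and the per-word conditionals.
def naive_bayes_channel(present_words, con_2, prior):
    best_score, best_channel = 0.0, 0
    for channel_id, channel_name, conditionals in con_2:
        score = prior[channel_id]
        for w in present_words:
            score *= conditionals[w]  # con_2 keeps an entry for every feature word
        if score > best_score:
            best_score, best_channel = score, channel_id
    return best_channel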
def search(req):
    if req.method == 'POST':
        words = req.POST.get('keywords', '')
        #print words
        com_count = {}
        cont = {}
        # word segmentation
        mmseg.dict_load_defaults()
        #print words.encode("utf-8")
        algor = mmseg.Algorithm(words.encode("utf-8"))
        for tok in algor:
            word = tok.text.decode("utf-8")
            print word
            res = keyword.objects.filter(word=word)
            for item in res:
                print "get............."
                try:
                    com_count[str(item.mycom.id)] += 1
                except KeyError:
                    com_count[str(item.mycom.id)] = 1
                cont[str(item.mycom.id)] = item.mycom
        sortres = sorted(com_count.iteritems(), key=lambda asd: asd[1], reverse=True)
        resultlist = []
        for each in sortres:
            eachres = {}
            eachres['id'] = str(each[0])
            eachres['count'] = str(each[1])
            eachres['Company_Name'] = cont[str(each[0])].Company_Name
            eachres['company_id'] = cont[str(each[0])].id
            resultlist.append(cont[str(each[0])])
        a = {}
        if not resultlist:
            resultlist = company.objects.filter()
        test = []
        favor_com = favor.objects.filter(who_id=req.user.id)
        for item in favor_com:
            test.append(item.which_id)
        for item in resultlist:
            if item.id in test:
                item.is_in_attention = True
            else:
                item.is_in_attention = False
        a["result"] = resultlist
        if req.user.is_authenticated():
            req.user.is_authenticated = True
            a['user'] = req.user
        a["keyword"] = words
        return render_to_response("search.html", a)
    else:
        a = {}
        resultlist = company.objects.filter()
        a["result"] = resultlist
        favor_com = favor.objects.filter(who_id=req.user.id)
        test = []
        for item in favor_com:
            test.append(item.which_id)
        for item in resultlist:
            if item.id in test:
                item.is_in_attention = True
            else:
                item.is_in_attention = False
        if req.user.is_authenticated():
            req.user.is_authenticated = True
            a['user'] = req.user
        return render_to_response("search.html", a)
def go():
    while 1:
        focus_id = get_all_fid()
        config = load_config()  # reload the config on every cycle
        for fid in focus_id:
            scrapy_content(config, fid)
            split_word(config)
        print 'sleep %s seconds' % config.get('scy_stop')
        time.sleep(int(config.get('scy_stop')))
        #thread.start_new_thread(scrapy_content, (config,))
        #thread.start_new_thread(split_word, (config,))


if __name__ == "__main__":
    dbname = "webpy"
    dbuser = "******"
    dbpawd = "1234"
    conn = psycopg2.connect(database=dbname, user=dbuser, password=dbpawd,
                            host='localhost', port=5432)
    cur = conn.cursor()
    mmseg.dict_load_defaults()  # load the dictionaries for chinese word segmentation
    go()
    conn.commit()
    cur.close()
    conn.close()
def test(text):
    mmseg.dict_load_defaults()
    algor = mmseg.Algorithm(text)
    for tok in algor:
        print '%s [%d..%d]' % (tok.text, tok.start, tok.end)
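# Hypothetical usage sketch (assumes a utf-8 source file, as in the standalone example above):
#
#   test('研究生命起源')   # prints each token with its [start..end] offsets into the input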
def classify_influence(inf_id, other=False):
    """
    TODO:
    1. Calculate the most popular tags, so that we can assign idf scores
    2. Accumulate more meaningful tags
    """
    inf_id = inf_id.decode('gbk')
    print inf_id.encode('gbk')
    try:
        inf_id = int(inf_id)
        inf = Influence.objects.get(pk=inf_id)
    except:
        inf, created = Influence.objects.get_or_create(screen_name=inf_id)
        if created:
            auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
            auth.setToken('128021658f2bfdae185d89bdffb3cede', '1713185d5c8208e8f1ef27a1f484ebc9')
            api = API(auth)
            user = api.get_user(screen_name=inf_id)
            inf.sina_account = getAtt(user, 'id')
            inf.verified = getAtt(user, 'verified')
            inf.screen_name = getAtt(user, 'screen_name')
            inf.description = getAtt(user, 'description')
            inf.follower_count = getAtt(user, 'followers_count')
            inf.following_count = getAtt(user, 'friends_count')
            inf.status_count = getAtt(user, 'statuses_count')
            inf.favourites_count = getAtt(user, 'favourites_count')
            inf.create_date = getAtt(user, 'created_at')
            inf.save()
    auth = OAuthHandler(settings.SINA_CONSUMER_KEY, settings.SINA_CONSUMER_SECRET)
    if other:
        auth.setToken('128021658f2bfdae185d89bdffb3cede', '1713185d5c8208e8f1ef27a1f484ebc9')
    else:
        auth.setToken(inf.sina_key, inf.sina_secret)
    api = API(auth)
    mmseg.dict_load_defaults()

    """Put this in db first"""
    candidate_tags = KeyValue.objects.get(key='CANDIDATE_TAGS')
    area_dict = {}
    # id_list = api.followers_ids(user_id=inf.sina_account, count=100)  # defaults to 500, maximum is 5000; this consumes a lot of the api limit
    # ids = id_list[0].ids  # weird that getAtt won't work
    # for id in ids:
    #     tags = api.tags(user_id=id)  # user_id is required!
    #     tag_list = []
    #     for tag in tags:
    #         tag_list.append(getAtt(tag, 'value').lower().encode('utf-8'))
    #     mmseg_text = mmseg.Algorithm(' '.join(tag_list))
    #     for token in mmseg_text:
    #         try:
    #             term = token.text.decode('utf-8').lower()
    #             #next_term = mmseg_text[i+1].text.decode('utf-8') if i < len_list - 1 else ''
    #         except:
    #             continue
    #         train_value = area_train_data.get(term, None)
    #         #if not train_value:
    #         #    train_value = area_train_data.get(term + next_term, None)
    #         if train_value:
    #             print 'in dict'
    #             for index in train_value:
    #                 if index in area_dict:
    #                     area_dict[index] += 1
    #                 else:
    #                     area_dict[index] = 1
    #         else:
    #             candidate_tags.value += ' ' + term
    candidate_tags.save()
    area_distr_dict = {}
    mid_list = []
    ids_list = []
    tweet_list = []  # stores the text of each tweet and retweet
    rt_count_list = []
    tried_count = 0
    while True:
        timeline = api.user_timeline(user_id=inf.sina_account, count=200)
        if len(timeline) == 0 and inf.status_count > 0:
            tried_count += 1
            print 'try again in getting timeline'
        else:
            break
        if tried_count > 3:
            raise Exception('weibo api error. No timeline got')
    for line in timeline:
        text = getAtt(line, 'text')
        retweet = getAtt(line, 'retweeted_status')
        retweet_text = getAtt(retweet, 'text')
        if retweet_text:
            text += retweet_text
        tweet_list.append(text)
        mid_list.append(str(getAtt(line, "id")))
        if len(mid_list) == 20:
            ids_list.append(','.join(mid_list))
            mid_list = []
    if mid_list:  # append the remaining ids
        ids_list.append(','.join(mid_list))
    if inf.status_count > 0 and not ids_list:
        raise Exception('weibo api fails')
    tweet_list_correct = []
    correct_index = 20
    for ids in ids_list:
        counts = api.counts(ids=ids)
        if len(counts) == 0:
            print 'error in counts!'
            correct_index += 20
            continue
        for obj in counts:
            rt_count_list.append(getAtt(obj, 'rt'))
        tweet_list_correct.extend(tweet_list[correct_index - 20:correct_index])
        correct_index += 20
    if len(tweet_list_correct) == 0 or len(tweet_list_correct) != len(rt_count_list):
        raise Exception('weibo api fails')
    print 'length of tweet list and rt_count list', len(tweet_list_correct), len(rt_count_list)
    # Remedy for users who have posted fewer than 200 statuses
    amplify_ratio = 1.0 if len(tweet_list_correct) == 200 else 200.0 / len(tweet_list_correct)
    for i in range(len(tweet_list_correct)):
        print i
        # The number 100 should be replaced by avg_follower_count.
        # Use math.sqrt to boost tweets that have not been retweeted,
        # and to smooth the effect of famous people tweeting about things not related to them
        added_count = (rt_count_list[i] * 100 + math.sqrt(inf.follower_count)) * amplify_ratio
        assigned_area = {}
        try:
            # In a Unix environment
            from signal import signal, SIGALRM, alarm  #@UnresolvedImport

            def handler(signum, frame):
                #print 'Signal handler called with signal', signum
                raise Exception("This code block runs for too long a time!")

            signal(SIGALRM, handler)
            alarm(3)
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
            alarm(0)  # cancel the alarm after finishing
        except ImportError:
            # On Windows, SIGALRM and alarm are not available in the signal module
            mmseg_text = mmseg.Algorithm(tweet_list_correct[i].encode('utf-8'))
        except:
            # mmseg halted for too long; process the next tweet
            continue
        for token in mmseg_text:
            try:
                term = token.text.decode('utf-8').lower()
            except:
                continue
            train_value = area_train_data.get(term, None)
            if train_value:
                print 'in dict'
                for index in train_value:
                    if index not in assigned_area:  # this tweet has not yet been assigned to this area
                        if index in area_dict:
                            area_dict[index] += added_count
                        else:
                            area_dict[index] = added_count
                        assigned_area[index] = True
                        if index in area_distr_dict:
                            area_distr_dict[index] += 1
                        else:
                            area_distr_dict[index] = 1
                    else:
                        area_distr_dict[index] += 1
    candidate_tags.save()
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    if inf.follower_count > 100000:
        for i in range(1, len(sorted_tuple)):  # only normalize secondary influence areas and below
            index = sorted_tuple[i][0]
            model_follower_count = areas[index][4]
            if inf.follower_count > model_follower_count:
                area_dict[index] = area_dict[index] * 1.0 / inf.follower_count * model_follower_count
    num_areas = len(area_distr_dict)
    total_keyword_count = 0
    for index in area_distr_dict:
        total_keyword_count += area_distr_dict[index]
    for k in area_dict:
        area_distr_ratio = num_areas * area_distr_dict[k] * 1.0 / total_keyword_count
        print k, area_distr_ratio, area_distr_dict[k]
        area_dict[k] = 100.0 / math.log(areas[k][3]) * math.log(area_dict[k] * area_distr_ratio)
        if area_dict[k] > 100:
            area_dict[k] = 100.0
    sorted_tuple = sorted(area_dict.iteritems(), key=operator.itemgetter(1), reverse=True)
    for st in sorted_tuple:
        print areas[st[0]][1].encode('gbk'), st[1]