def statistics(labels, feature_file_name, threshold, collection=None):
    """Compute per-label feature distributions and a per-feature score.

    Args:
        labels: dict mapping user _id -> integer label (0..max).
        feature_file_name: file passed to get_features() to select features.
        threshold: minimum raw occurrence count; rarer features are dropped.
        collection: Mongo collection of users; defaults to
            Connection().jd.train_users, resolved lazily here instead of in
            the def line so no DB connection is opened at import time.

    Returns:
        (score, feature_distribute): score maps feature -> abs_score of its
        normalized label distribution; feature_distribute maps feature ->
        per-label probabilities.
    """
    if collection is None:
        collection = Connection().jd.train_users
    label_dimention = max(labels.values()) + 1
    label_counts = Counter(labels.values())
    # Dense per-label user counts; a label index may be absent (count 0).
    label_distribute = [label_counts.get(i, 0) for i in xrange(label_dimention)]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict((f, [0.0] * label_dimention) for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except KeyError:  # unlabeled user -> skip (was a bare except:)
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)
    # Snapshot the keys: we delete entries while iterating (also Py3-safe).
    for f in list(feature_distribute.keys()):
        s = 1.0 * sum(feature_distribute[f])
        if s == 0 or s < threshold:
            del feature_distribute[f]
            continue
        for i in xrange(label_dimention):
            # A label with zero users contributes zero counts, so skipping
            # the division both avoids ZeroDivisionError and keeps the 0.0.
            if label_distribute[i]:
                feature_distribute[f][i] /= label_distribute[i]
    # Normalize each surviving feature's vector into a distribution.
    for f in feature_distribute:
        s = 1.0 * sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= s
    score = dict()
    for f, v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f] = abs_score(v)
    return score, feature_distribute
def statistics(labels, feature_file_name, threshold, collection=None):
    """Compute per-label feature distributions and a per-feature score.

    NOTE(review): this definition is a byte-identical duplicate of the
    preceding `statistics` and shadows it; consider deleting one copy.

    Args:
        labels: dict mapping user _id -> integer label (0..max).
        feature_file_name: file passed to get_features() to select features.
        threshold: minimum raw occurrence count; rarer features are dropped.
        collection: Mongo collection of users; defaults to
            Connection().jd.train_users, resolved lazily here instead of in
            the def line so no DB connection is opened at import time.

    Returns:
        (score, feature_distribute): score maps feature -> abs_score of its
        normalized label distribution; feature_distribute maps feature ->
        per-label probabilities.
    """
    if collection is None:
        collection = Connection().jd.train_users
    label_dimention = max(labels.values()) + 1
    label_counts = Counter(labels.values())
    # Dense per-label user counts; a label index may be absent (count 0).
    label_distribute = [label_counts.get(i, 0) for i in xrange(label_dimention)]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict((f, [0.0] * label_dimention) for f in all_features)
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except KeyError:  # unlabeled user -> skip (was a bare except:)
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)
    # Snapshot the keys: we delete entries while iterating (also Py3-safe).
    for f in list(feature_distribute.keys()):
        s = 1.0 * sum(feature_distribute[f])
        if s == 0 or s < threshold:
            del feature_distribute[f]
            continue
        for i in xrange(label_dimention):
            # A label with zero users contributes zero counts, so skipping
            # the division both avoids ZeroDivisionError and keeps the 0.0.
            if label_distribute[i]:
                feature_distribute[f][i] /= label_distribute[i]
    # Normalize each surviving feature's vector into a distribution.
    for f in feature_distribute:
        s = 1.0 * sum(feature_distribute[f])
        for i in xrange(label_dimention):
            feature_distribute[f][i] /= s
    score = dict()
    for f, v in feature_distribute.items():
        #score[f]=eta_score(v)
        score[f] = abs_score(v)
    return score, feature_distribute
def update_all_fund(taskobj):
    """Fetch all funds in parallel, aggregate holdings/yields, mail a report.

    Args:
        taskobj: opaque task context, only forwarded to log.Info.

    Side effects: logs progress and sends an HTML mail via the global "mail"
    object. Returns None.
    """
    fund_list = fund_api.fund_all()
    log.Info("update_all_fund", taskobj, len(fund_list))
    thread_num = 5
    args_list = split_args(thread_num, fund_list)
    result = thread_api.start_args(thread_spike_fund, args_list)
    all_ok = 0
    all_error = 0
    stock_total = {}
    top_fund = []
    for data in result:
        # TODO(review): all_ok / all_error are accumulated but never reported;
        # either log them or drop the accumulation.
        all_ok += data["ok"]
        all_error += data["error"]
        tools.combine_dict(stock_total, data["data"]["stock"])
        # NOTE(review): "yeild" is the key spelling used by the worker's
        # result schema -- do not "fix" the typo here alone.
        for code, v in data["data"]["yeild"].items():
            top_fund.append((code, v))
    # Stocks ranked by how many funds hold them; funds ranked by today's yield.
    top_stock = sorted(stock_total.items(), key=lambda kv: tofloat(kv[1]), reverse=True)
    top_fund = sorted(top_fund, key=lambda d: tofloat(d[1]["now"]), reverse=True)
    # The original called this top20stock while slicing 40 rows; renamed.
    top_stocks = top_stock[:40]
    top20fund = top_fund[:20]
    tail20fund = list(reversed(top_fund[len(top_fund) - 20:]))

    def make_fund(funds):
        # Flatten (code, data) pairs into table rows for the HTML report.
        rows = []
        for code, data in funds:
            d = data["history"]
            rows.append([
                data["name"], code, data["now"],
                d.get("month1", 0), d.get("month3", 0),
                d.get("month6", 0), d.get("year1", 0),
            ])
        return rows

    htmobj = html.CHtml("韭菜排行:")
    if len(top_stocks) > 0:
        htmobj.AddLine("基金持仓top20股票")
        htmobj.AddTable(top_stocks, head=["股票名", "基金持有数"])
    head = ["基金名", "代码", "今日收益", "近1月收益", "近3月收益", "近6月收益", "近1年收益"]
    if len(top20fund) > 0:
        htmobj.AddLine("收益top20")
        htmobj.AddTable(make_fund(top20fund), head=head)
    if len(tail20fund) > 0:
        htmobj.AddLine("亏损top20")
        htmobj.AddTable(make_fund(tail20fund), head=head)
    html_text = htmobj.GetHtml()
    mailobj = global_obj.get_obj("mail")
    message = mailobj.HtmlMailMessage()
    if message.SendMessage("韭菜排行榜", html_text):
        log.Info("send jiucai mail done")
def construct_train_set(attribute, training_count):
    """Build labeled and unlabeled training files for one attribute.

    Args:
        attribute: attribute name used to locate the constraint file and to
            name the output files.
        training_count: target total size of the labeled set (split evenly
            between label 0 and label 1).

    Writes four files under RAW_DATA_DIR + 'iterate_label2trainset/':
    <attribute>_train.data, _train_uids.data, _train_unlabel.data and
    _train_unlabel_uids.data. Returns None.
    """
    product_features = get_features(feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(feature_file=base_dir + '/features/mention.feature',
                                    existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir + '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature',
                                      existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir + '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    test_uids = get_test_uids()
    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # BUG FIX: the original passed Counter('products') -- character counts
        # of the literal string -- instead of the user's product list; the
        # intended call survived in a comment in the original source.
        features = combine_dict(user['mentions_0'], Counter(user['products']))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []
        for f, v in Counter(user['products']).items():
            if f in product_features:
                x.append((product_features[f], v))
        for f, v in user['mentions_0'].items():
            if f in mention_features:
                x.append((mention_features[f], v))
        for f, v in Counter(user['review']).items():
            if f in review_featuers:
                x.append((review_featuers[f], v))
        # NOTE(review): mentions_1 / mentions_2 are emptied right before their
        # loops, so those features are disabled -- presumably a deliberate
        # ablation toggle; behavior kept, confirm before re-enabling.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f in mention_features_1:
                x.append((mention_features_1[f], v))
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f in mention_features_2:
                x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join('%s:%f' % pair for pair in x)
        guess.append((user['_id'], label, abs(confidence), str_x,
                      sum(user['mentions'].values())))
        bar.draw(index + 1)
    # Split by arbitrated label: 0/1 ranked by confidence, -1 ("unlabeled")
    # ranked by total mention count.
    data0 = sorted([d for d in guess if d[1] == 0], key=lambda d: d[2], reverse=True)
    data1 = sorted([d for d in guess if d[1] == 1], key=lambda d: d[2], reverse=True)
    data2 = sorted([d for d in guess if d[1] == -1], key=lambda d: d[4], reverse=True)
    # '//' keeps the original Py2 integer-division semantics under Py3 too.
    dimention = min(len(data0), len(data1), training_count // 2)
    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]
    out_dir = RAW_DATA_DIR + 'iterate_label2trainset/'
    # 'with' blocks fix the original's leaked file handles (fout/uid_output
    # were reassigned before being closed and never closed at all).
    with open(out_dir + '%s_train.data' % attribute, 'w') as fout, \
         open(out_dir + '%s_train_uids.data' % attribute, 'w') as uid_output:
        for d in data0 + data1:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])
    with open(out_dir + '%s_train_unlabel.data' % attribute, 'w') as fout, \
         open(out_dir + '%s_train_unlabel_uids.data' % attribute, 'w') as uid_output:
        for d in data2:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])
def construct_train_set(attribute, training_count):
    """Build labeled and unlabeled training files for one attribute.

    NOTE(review): this definition duplicates the earlier
    `construct_train_set` and shadows it; consider deleting one copy.

    Args:
        attribute: attribute name used to locate the constraint file and to
            name the output files.
        training_count: target total size of the labeled set (split evenly
            between label 0 and label 1).

    Writes four files under RAW_DATA_DIR + 'iterate_label2trainset/':
    <attribute>_train.data, _train_uids.data, _train_unlabel.data and
    _train_unlabel_uids.data. Returns None.
    """
    product_features = get_features(feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(feature_file=base_dir + '/features/mention.feature',
                                    existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir + '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir + '/features/mention_1.feature',
                                      existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir + '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    test_uids = get_test_uids()
    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # BUG FIX: the original passed Counter('products') -- character counts
        # of the literal string -- instead of the user's product list; the
        # intended call survived in a comment in the original source.
        features = combine_dict(user['mentions_0'], Counter(user['products']))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []
        for f, v in Counter(user['products']).items():
            if f in product_features:
                x.append((product_features[f], v))
        for f, v in user['mentions_0'].items():
            if f in mention_features:
                x.append((mention_features[f], v))
        for f, v in Counter(user['review']).items():
            if f in review_featuers:
                x.append((review_featuers[f], v))
        # NOTE(review): mentions_1 / mentions_2 are emptied right before their
        # loops, so those features are disabled -- presumably a deliberate
        # ablation toggle; behavior kept, confirm before re-enabling.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f in mention_features_1:
                x.append((mention_features_1[f], v))
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f in mention_features_2:
                x.append((mention_features_2[f], v))
        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join('%s:%f' % pair for pair in x)
        guess.append((user['_id'], label, abs(confidence), str_x,
                      sum(user['mentions'].values())))
        bar.draw(index + 1)
    # Split by arbitrated label: 0/1 ranked by confidence, -1 ("unlabeled")
    # ranked by total mention count.
    data0 = sorted([d for d in guess if d[1] == 0], key=lambda d: d[2], reverse=True)
    data1 = sorted([d for d in guess if d[1] == 1], key=lambda d: d[2], reverse=True)
    data2 = sorted([d for d in guess if d[1] == -1], key=lambda d: d[4], reverse=True)
    # '//' keeps the original Py2 integer-division semantics under Py3 too.
    dimention = min(len(data0), len(data1), training_count // 2)
    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]
    out_dir = RAW_DATA_DIR + 'iterate_label2trainset/'
    # 'with' blocks fix the original's leaked file handles (fout/uid_output
    # were reassigned before being closed and never closed at all).
    with open(out_dir + '%s_train.data' % attribute, 'w') as fout, \
         open(out_dir + '%s_train_uids.data' % attribute, 'w') as uid_output:
        for d in data0 + data1:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])
    with open(out_dir + '%s_train_unlabel.data' % attribute, 'w') as fout, \
         open(out_dir + '%s_train_unlabel_uids.data' % attribute, 'w') as uid_output:
        for d in data2:
            fout.write('%d %s\n' % (d[1], d[3]))
            uid_output.write('%s\n' % d[0])