コード例 #1
0
def statistics(labels,
               feature_file_name,
               threshold,
               collection=None):
    """Score every known feature by how it is distributed over the labels.

    For each user in ``collection`` whose ``_id`` has an entry in ``labels``,
    every feature yielded by ``combine_dict(user['mentions'],
    Counter(user['products']))`` that also appears in the feature file is
    counted once under that user's label.  Features whose total count is
    zero or below ``threshold`` are dropped.  The remaining per-label counts
    are divided by the number of entries in ``labels`` that carry that label
    and then rescaled so that each feature's values sum to 1.

    Args:
        labels: mapping from user ``_id`` to an integer label.  Labels are
            used as list indices, so they are assumed to be non-negative
            ints -- TODO confirm against callers.
        feature_file_name: passed to ``get_features``; iterating its result
            must yield the feature names.
        threshold: minimum total count a feature needs in order to be kept.
        collection: object providing ``count()`` and ``find()``.  When
            omitted, ``Connection().jd.train_users`` is used; it is resolved
            per call so that defining this function opens no connection.

    Returns:
        A ``(score, feature_distribute)`` tuple.  ``feature_distribute`` maps
        each surviving feature to its list of normalised per-label values;
        ``score`` maps the same features to ``abs_score`` of that list.
    """
    if collection is None:
        collection = Connection().jd.train_users
    label_dimention = max(labels.values()) + 1
    label_counts = Counter(labels.values())
    # Counter yields 0 for labels that never occur, so gaps in the label
    # range become explicit zero entries.
    label_distribute = [label_counts[i] for i in range(label_dimention)]
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = {f: [0.] * label_dimention for f in all_features}
    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except KeyError:
            # Users without a label contribute nothing (and, as before, do
            # not advance the progress bar).
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)

    # Iterate over a snapshot of the keys because entries are removed inside
    # the loop.
    for f in list(feature_distribute):
        counts = feature_distribute[f]
        total = sum(counts)
        if total == 0 or total < threshold:
            del feature_distribute[f]
            continue
        for i, label_size in enumerate(label_distribute):
            # A label with no samples would otherwise divide by zero; its
            # share is defined as 0.
            counts[i] = counts[i] / label_size if label_size else 0.0

    for counts in feature_distribute.values():
        total = sum(counts)
        for i in range(label_dimention):
            counts[i] /= total

    # The original kept a commented-out alternative that called eta_score
    # instead of abs_score here.
    score = {f: abs_score(v) for f, v in feature_distribute.items()}
    return score, feature_distribute
コード例 #2
0
def statistics(labels, feature_file_name, threshold, collection=None):
    """Compute per-feature label distributions and a score for each feature.

    Users from ``collection`` are matched to ``labels`` by ``_id``; users
    without an entry are skipped.  For every matched user, each feature
    yielded by ``combine_dict(user['mentions'], Counter(user['products']))``
    that is listed in the feature file gets one count under the user's
    label.  Features with a total count of zero or below ``threshold`` are
    discarded.  Surviving counts are divided by how many entries of
    ``labels`` have the corresponding label, then normalised to sum to 1
    per feature.

    Args:
        labels: mapping from user ``_id`` to an integer label.  Labels index
            into lists here, so non-negative ints are assumed -- TODO
            confirm against callers.
        feature_file_name: forwarded to ``get_features``; iterating the
            result must yield feature names.
        threshold: minimum total count required to keep a feature.
        collection: object with ``count()`` and ``find()``.  Defaults to
            ``Connection().jd.train_users``, looked up at call time rather
            than when the function is defined.

    Returns:
        ``(score, feature_distribute)`` where ``feature_distribute`` maps
        feature -> list of normalised per-label values and ``score`` maps
        feature -> ``abs_score`` of that list.
    """
    if collection is None:
        collection = Connection().jd.train_users

    label_dimention = max(labels.values()) + 1
    occurrences = Counter(labels.values())
    # Missing labels read as 0 from the Counter.
    label_distribute = [occurrences[i] for i in range(label_dimention)]

    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    feature_distribute = dict(
        (f, [0.] * label_dimention) for f in all_features)

    for index, user in enumerate(collection.find()):
        try:
            label = labels[user['_id']]
        except KeyError:
            # Unlabelled user: skipped before the progress bar is redrawn,
            # exactly as in the original control flow.
            continue
        features = combine_dict(user['mentions'], Counter(user['products']))
        for f in features:
            if f in feature_distribute:
                feature_distribute[f][label] += 1.0
        bar.draw(index)

    # Snapshot the keys: the dict shrinks while we walk it.
    for f in list(feature_distribute.keys()):
        row = feature_distribute[f]
        s = sum(row)
        if s == 0 or s < threshold:
            feature_distribute.pop(f)
            continue
        for i in range(label_dimention):
            size = label_distribute[i]
            # Guard against labels that have no samples at all, which
            # previously raised ZeroDivisionError.
            row[i] = row[i] / size if size else 0.0

    for f in feature_distribute:
        row = feature_distribute[f]
        s = sum(row)
        for i in range(label_dimention):
            row[i] /= s

    score = dict()
    for f, v in feature_distribute.items():
        # A commented-out alternative in the original used eta_score(v).
        score[f] = abs_score(v)
    return score, feature_distribute
コード例 #3
0
def update_all_fund(taskobj):
    """Collect fund data, rank stocks and funds, and mail an HTML report.

    The fund list from ``fund_api.fund_all()`` is split into argument
    batches for 5 workers and handed to ``thread_api.start_args`` together
    with ``thread_spike_fund``.  Every result's ``data["stock"]`` is merged
    into one dict via ``tools.combine_dict`` and every ``data["yeild"]``
    entry is collected as a ``(code, info)`` pair.  Stocks are ranked by
    ``tofloat`` of their merged value, funds by ``tofloat(info["now"])``,
    both descending.  The report is sent through the ``"mail"`` object
    obtained from ``global_obj``.

    Args:
        taskobj: only passed to ``log.Info`` here.
    """
    fund_list = fund_api.fund_all()
    log.Info("update_all_fund", taskobj, len(fund_list))

    worker_count = 5
    batches = split_args(worker_count, fund_list)
    results = thread_api.start_args(thread_spike_fund, batches)

    # The ok/error totals are accumulated but not used further in this
    # function.
    ok_total = 0
    error_total = 0
    stock_total = {}
    fund_entries = []
    for part in results:
        ok_total += part["ok"]
        error_total += part["error"]
        payload = part["data"]
        tools.combine_dict(stock_total, payload["stock"])
        fund_entries.extend(payload["yeild"].items())

    ranked_stocks = sorted(stock_total.items(),
                           key=lambda pair: tofloat(pair[1]),
                           reverse=True)
    ranked_funds = sorted(fund_entries,
                          key=lambda pair: tofloat(pair[1]["now"]),
                          reverse=True)

    def write_stock_list():
        # Dumps the full stock ranking to a text file.  Not called anywhere
        # in this function; the original had the call commented out.
        with open("stock_list.txt", "w") as fp:
            for name, held in ranked_stocks:
                fp.write("%s    %s\n" % (name, held))

    # NOTE(review): the variable name and the heading below say "top20" but
    # 40 stocks are taken -- confirm which is intended.
    top20stock = ranked_stocks[:40]
    top20fund = ranked_funds[:20]
    # Last 20 funds of the ranking, worst first.
    tail20fund = ranked_funds[-20:][::-1]

    history_keys = ("month1", "month3", "month6", "year1")

    def make_fund(entries):
        # One table row per fund: name, code, current value, then the
        # history figures (0 where a period is missing).
        rows = []
        for code, info in entries:
            history = info["history"]
            row = [info["name"], code, info["now"]]
            row.extend(history.get(key, 0) for key in history_keys)
            rows.append(row)
        return rows

    htmobj = html.CHtml("韭菜排行:")
    if top20stock:
        htmobj.AddLine("基金持仓top20股票")
        htmobj.AddTable(top20stock, head=["股票名", "基金持有数"])

    head = ["基金名", "代码", "今日收益", "近1月收益", "近3月收益", "近6月收益", "近1年收益"]
    if top20fund:
        htmobj.AddLine("收益top20")
        htmobj.AddTable(make_fund(top20fund), head=head)
    if tail20fund:
        htmobj.AddLine("亏损top20")
        htmobj.AddTable(make_fund(tail20fund), head=head)

    html_text = htmobj.GetHtml()
    message = global_obj.get_obj("mail").HtmlMailMessage()
    sent = message.SendMessage("韭菜排行榜", html_text)
    if sent:
        log.Info("send jiucai mail done")
コード例 #4
0
def construct_train_set(attribute, training_count):
    """Write an automatically labelled training set for ``attribute``.

    Every user in ``Connection().jd.train_users`` that is not in
    ``get_test_uids()`` gets a ``(label, confidence)`` from
    ``LabelArbiter.arbitrate_label`` and a feature vector rendered as
    space-separated ``index:value`` pairs sorted by index.  Users labelled
    0 and 1 are ranked by ``abs(confidence)``, users labelled -1 by the sum
    of ``user['mentions']`` values, all descending.  From each of the three
    groups the first ``min(len(label0), len(label1), training_count // 2)``
    users are kept.

    Output (under ``RAW_DATA_DIR + 'iterate_label2trainset/'``):
        ``<attribute>_train.data`` / ``<attribute>_train_uids.data``:
            rows and user ids for labels 0 and 1.
        ``<attribute>_train_unlabel.data`` /
        ``<attribute>_train_unlabel_uids.data``:
            rows and user ids for label -1.

    Args:
        attribute: name used to pick the constraints file and to name the
            output files.
        training_count: upper bound on labelled rows; half of it caps each
            label group.  Assumed to be an int -- TODO confirm.
    """
    # Each feature file is loaded with the previous mapping passed as
    # existent_features -- presumably so indices continue across files;
    # confirm in get_features.
    product_features = get_features(
        feature_file=base_dir + '/features/product.feature')
    mention_features = get_features(
        feature_file=base_dir + '/features/mention.feature',
        existent_features=product_features)
    review_featuers = get_features(
        feature_file=base_dir + '/features/review.feature',
        existent_features=mention_features)
    mention_features_1 = get_features(
        feature_file=base_dir + '/features/mention_1.feature',
        existent_features=review_featuers)
    mention_features_2 = get_features(
        feature_file=base_dir + '/features/mention_2.feature',
        existent_features=mention_features_1)
    test_uids = get_test_uids()

    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # NOTE(review): Counter('products') counts the characters of the
        # literal string, not user['products'].  The original keeps a
        # commented-out variant using Counter(user['products']); left as-is
        # because this may be a deliberate switch -- confirm.
        features = combine_dict(user['mentions_0'], Counter('products'))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []

        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f in product_features:
                x.append((product_features[f], v))

        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f in mention_features:
                x.append((mention_features[f], v))

        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f in review_featuers:
                x.append((review_featuers[f], v))

        # mentions_1 is blanked right before use, so this loop currently
        # adds nothing.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f in mention_features_1:
                x.append((mention_features_1[f], v))

        # mentions_2 is blanked the same way.
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f in mention_features_2:
                x.append((mention_features_2[f], v))

        x.sort(key=lambda d: d[0])
        str_x = ' '.join('%s:%f' % pair for pair in x)
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    # List comprehensions instead of filter() so len() and slicing work
    # regardless of whether filter returns a list or an iterator.
    data0 = sorted([d for d in guess if d[1] == 0],
                   key=lambda d: d[2], reverse=True)
    data1 = sorted([d for d in guess if d[1] == 1],
                   key=lambda d: d[2], reverse=True)
    data2 = sorted([d for d in guess if d[1] == -1],
                   key=lambda d: d[4], reverse=True)

    # Floor division keeps the slice bound an integer.
    dimention = min(len(data0), len(data1), training_count // 2)

    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]

    def write_rows(rows, data_path, uid_path):
        # Writes "<label> <features>" lines and the matching uid lines, and
        # closes both files even if a write fails.
        with open(data_path, 'w') as fout:
            with open(uid_path, 'w') as uid_output:
                for d in rows:
                    fout.write('%d %s\n' % (d[1], d[3]))
                    uid_output.write('%s\n' % d[0])

    write_rows(
        data0 + data1,
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute,
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute)
    write_rows(
        data2,
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel.data' % attribute,
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute)
コード例 #5
0
def construct_train_set(attribute, training_count):
    """Build and write the training files for ``attribute``.

    Users from ``Connection().jd.train_users`` that are not listed in
    ``get_test_uids()`` are labelled with
    ``LabelArbiter.arbitrate_label`` and encoded as a line of
    ``index:value`` pairs sorted by feature index.  Label 0 and label 1
    users are ordered by ``abs(confidence)`` and label -1 users by the sum
    of their ``user['mentions']`` values (all descending); each group is
    then cut to ``min(len(label0), len(label1), training_count // 2)``
    users.

    Files written under ``RAW_DATA_DIR + 'iterate_label2trainset/'``:
        ``<attribute>_train.data`` and ``<attribute>_train_uids.data`` for
        labels 0 and 1; ``<attribute>_train_unlabel.data`` and
        ``<attribute>_train_unlabel_uids.data`` for label -1.

    Args:
        attribute: selects the constraints file and names the outputs.
        training_count: cap on labelled rows; each label group gets at most
            half of it.  Assumed to be an int -- TODO confirm.
    """
    # The mappings are chained through existent_features -- presumably to
    # keep feature indices unique across files; verify in get_features.
    product_features = get_features(feature_file=base_dir +
                                    '/features/product.feature')
    mention_features = get_features(feature_file=base_dir +
                                    '/features/mention.feature',
                                    existent_features=product_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=mention_features)
    mention_features_1 = get_features(feature_file=base_dir +
                                      '/features/mention_1.feature',
                                      existent_features=review_featuers)
    mention_features_2 = get_features(feature_file=base_dir +
                                      '/features/mention_2.feature',
                                      existent_features=mention_features_1)
    test_uids = get_test_uids()

    labeled_feature_file = '%s/review_constraint_%s.constraints' % (
        labeled_feature_file_dir, attribute)
    label_arbiter = LabelArbiter(labeled_feature_file=labeled_feature_file)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    guess = []
    for index, user in enumerate(collection.find()):
        if user['_id'] in test_uids:
            continue
        # NOTE(review): this counts the letters of the string 'products'
        # rather than the entries of user['products'] (the commented-out
        # original form).  Kept unchanged in case it is an intentional
        # switch -- confirm.
        #features=combine_dict(user['mentions_0'],Counter(user['products']))
        features = combine_dict(user['mentions_0'], Counter('products'))
        label, confidence = label_arbiter.arbitrate_label(features)
        x = []

        #user['products']=[]
        for f, v in Counter(user['products']).items():
            if f not in product_features:
                continue
            x.append((product_features[f], v))

        #user['mentions_0']={}
        for f, v in user['mentions_0'].items():
            if f not in mention_features:
                continue
            x.append((mention_features[f], v))

        #user['review']=[]
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            x.append((review_featuers[f], v))

        # The field is overwritten with an empty dict just before the loop,
        # so no mentions_1 features are emitted at present.
        user['mentions_1'] = {}
        for f, v in user['mentions_1'].items():
            f = f + '_1'
            if f not in mention_features_1:
                continue
            x.append((mention_features_1[f], v))

        # Same for mentions_2.
        user['mentions_2'] = {}
        for f, v in user['mentions_2'].items():
            if f not in mention_features_2:
                continue
            x.append((mention_features_2[f], v))

        x = sorted(x, key=lambda d: d[0])
        str_x = ' '.join(['%s:%f' % item for item in x])
        guess.append((
            user['_id'],
            label,
            abs(confidence),
            str_x,
            sum(user['mentions'].values()),
        ))
        bar.draw(index + 1)

    # Materialise each group as a list: len() and slicing below need it.
    data0 = [d for d in guess if d[1] == 0]
    data0 = sorted(data0, key=lambda d: d[2], reverse=True)
    data1 = [d for d in guess if d[1] == 1]
    data1 = sorted(data1, key=lambda d: d[2], reverse=True)
    data2 = [d for d in guess if d[1] == -1]
    data2 = sorted(data2, key=lambda d: d[4], reverse=True)

    # Floor division so the result is usable as a slice bound.
    dimention = min(len(data0), len(data1), training_count // 2)

    data0 = data0[:dimention]
    data1 = data1[:dimention]
    data2 = data2[:dimention]

    # The files are opened via "with" so they are flushed and closed even
    # when a write raises.
    train_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train.data' % attribute)
    train_uid_path = (
        RAW_DATA_DIR + 'iterate_label2trainset/%s_train_uids.data' % attribute)
    with open(train_path, 'w') as fout:
        with open(train_uid_path, 'w') as uid_output:
            for d in data0 + data1:
                fout.write('%d %s\n' % (d[1], d[3]))
                uid_output.write('%s\n' % d[0])

    unlabel_path = (
        RAW_DATA_DIR +
        'iterate_label2trainset/%s_train_unlabel.data' % attribute)
    unlabel_uid_path = (
        RAW_DATA_DIR +
        'iterate_label2trainset/%s_train_unlabel_uids.data' % attribute)
    with open(unlabel_path, 'w') as fout:
        with open(unlabel_uid_path, 'w') as uid_output:
            for d in data2:
                fout.write('%d %s\n' % (d[1], d[3]))
                uid_output.write('%s\n' % d[0])