Esempio n. 1
0
def doc2vec_embedding(file_name):
    """Train a Doc2Vec model over JD users' review text and save the vectors.

    Each user with ``got_review=True`` becomes one LabeledSentence tagged
    'USER_<id>'; the trained vectors are written in text format to
    /mnt/data1/adoni/jd_data/vectors/<file_name>.data.

    :param file_name: basename (no extension) for the saved vector file
    :return: the trained gensim Doc2Vec model
    """
    import sys
    import gensim
    from pymongo import Connection

    users = Connection().jd.jd_users
    # Training hyper-parameters, kept in one place and actually passed to the
    # model below (they used to be shadowed by inline literals).
    dimensionality_size = 200
    window_size = 7
    workers = 20
    min_count = 3

    # load sentences
    finish_count = 0
    total_count = users.find({'got_review': True}).count()
    sentences = []
    print(total_count)
    old_review = ''
    for user in users.find({'got_review': True}):
        if finish_count % 10000 == 0:
            # lightweight in-place progress indicator
            sys.stdout.write("\r%f" % (finish_count * 1.0 / total_count))
            sys.stdout.flush()
        finish_count += 1
        content = []
        for behavior in user['behaviors']:
            review = ' '.join(behavior['review']['parsed_review_general'])
            # Skip consecutive duplicate reviews.
            if review == old_review:
                continue
            # BUG FIX: this used to be ``old_review == review`` (a no-op
            # comparison), so the duplicate filter never saw the new value.
            old_review = review
            content += review.split()
        # Strip whitespace-only tokens.  NOTE(review): '\u3000' is NOT a
        # unicode escape in a Python-2 byte string; if the CJK ideographic
        # space was intended this should be u'\u3000' -- confirm encoding.
        for ch in [' ', '\n', '\r', '\u3000']:
            while 1:
                try:
                    content.remove(ch)
                except ValueError:
                    break
        if len(content) < 10:
            # too little text to learn a meaningful embedding
            continue
        sentence = gensim.models.doc2vec.LabeledSentence(
            words=content, labels=['USER_%d' % user['_id']])
        sentences.append(sentence)

    print('load corpus completed...')

    # train doc2vec
    model = gensim.models.Doc2Vec(sentences,
                                  size=dimensionality_size,
                                  window=window_size,
                                  workers=workers,
                                  min_count=min_count,
                                  sample=1e-3)
    model.save_word2vec_format('/mnt/data1/adoni/jd_data/vectors/' +
                               file_name + '.data',
                               binary=False)
    print('embedding done')
    return model
Esempio n. 2
0
def insert_LINE_vector(file_name=RAW_DATA_DIR + 'normalize2.data'):
    """Load LINE embeddings from *file_name* and store them on users.

    The file's first line is '<count> <dimension>'; each following line is
    '<id> <v1> ... <vd>'.  Every train user gets a
    'user_product_vector_from_line' field (a zero vector when the id is
    missing from the file); test users are updated only when present.
    """
    vectors = dict()
    fin = open(file_name)
    line = fin.readline().strip().split(' ')
    count, dimention = int(line[0]), int(line[1])
    bar = progress_bar(count)
    for index in xrange(count):
        line = fin.readline().strip().split(' ')
        vectors[line[0]] = [float(d) for d in line[1:]]
        bar.draw(index + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            # BUG FIX: the zero default used to be followed by ``continue``,
            # so it was computed but never written to the database.
            vectors[user['_id']] = [0.] * dimention
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
    collection = Connection().jd.test_users
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        if user['_id'] not in vectors:
            continue
        collection.update(
            {'_id': user['_id']},
            {'$set': {
                'user_product_vector_from_line': vectors[user['_id']]
            }})
        bar.draw(index + 1)
Esempio n. 3
0
File: app.py Progetto: bbqsrc/vapour
class Application(tornado.web.Application):
    """Tornado app wrapping the 'vapour.urls' Mongo collection."""

    def __init__(self, handlers, **settings):
        tornado.web.Application.__init__(self, handlers, **settings)
        self.collection = Connection().vapour.urls
        self.templates = TemplateLookup(directories=["templates"])

    def get_link_by_id(self, id):
        # Ids are stored as UUID objects, so parse the incoming string first.
        return fix_id(self.collection.find_one({'_id': uuid.UUID(id)}))

    def get_links_by_tag(self, tag):
        # Case-insensitive substring match on the tags field.
        matches = self.collection.find({'tags': re.compile(tag, re.I)})
        return fix_ids(matches)

    def get_links_by_url(self, url):
        matches = self.collection.find({'url': re.compile(url, re.I)})
        return fix_ids(matches)

    def insert_link(self, url, desc, tags):
        doc = {
            '_id': uuid.uuid4(),
            'url': url,
            'desc': desc,
            'tags': tags,
            'added': datetime.datetime.utcnow(),
        }
        return self.collection.insert(doc)
Esempio n. 4
0
class Food(object):
    """Tiny wrapper around the Mongo 'food.choices' collection.

    Each document has the shape {'name': <lowercased food name>}.
    """

    def __init__(self):
        self.db = Connection()["food"]["choices"]

    def add(self, name):
        """Insert *name* lower-cased; upsert keeps entries unique."""
        name = str(name).lower()
        self.db.update({'name': name}, {'name': name}, upsert=True)

    def remove(self, name):
        """Delete every entry matching *name* exactly."""
        self.db.remove({'name': name})

    def get_all(self):
        """Return all stored names, best-effort.

        On a mid-query failure, whatever was collected so far is returned
        (same behaviour as before, but the bare ``except:`` is narrowed so
        KeyboardInterrupt/SystemExit are no longer swallowed).
        """
        names = []
        try:
            for doc in self.db.find():
                names.append(doc['name'])
        except Exception:
            pass
        return names

    def choose(self):
        """Return a uniformly random stored name, or 'unknown' on any
        failure (including the empty-collection case)."""
        try:
            return random.choice([doc['name'] for doc in self.db.find()])
        except Exception:
            return "unknown"
Esempio n. 5
0
def doc2vec_embedding(file_name):
    """Train a Doc2Vec model over JD users' review text and save the vectors.

    Each user with ``got_review=True`` becomes one LabeledSentence tagged
    'USER_<id>'; the trained vectors are written in text format to
    /mnt/data1/adoni/jd_data/vectors/<file_name>.data.

    :param file_name: basename (no extension) for the saved vector file
    :return: the trained gensim Doc2Vec model
    """
    import sys
    import gensim
    from pymongo import Connection

    users = Connection().jd.jd_users
    # Training hyper-parameters, kept in one place and actually passed to the
    # model below (they used to be shadowed by inline literals).
    dimensionality_size = 200
    window_size = 7
    workers = 20
    min_count = 3

    # load sentences
    finish_count = 0
    total_count = users.find({'got_review': True}).count()
    sentences = []
    print(total_count)
    old_review = ''
    for user in users.find({'got_review': True}):
        if finish_count % 10000 == 0:
            # lightweight in-place progress indicator
            sys.stdout.write("\r%f" % (finish_count * 1.0 / total_count))
            sys.stdout.flush()
        finish_count += 1
        content = []
        for behavior in user['behaviors']:
            review = ' '.join(behavior['review']['parsed_review_general'])
            # Skip consecutive duplicate reviews.
            if review == old_review:
                continue
            # BUG FIX: this used to be ``old_review == review`` (a no-op
            # comparison), so the duplicate filter never saw the new value.
            old_review = review
            content += review.split()
        # Strip whitespace-only tokens.  NOTE(review): '\u3000' is NOT a
        # unicode escape in a Python-2 byte string; if the CJK ideographic
        # space was intended this should be u'\u3000' -- confirm encoding.
        for ch in [' ', '\n', '\r', '\u3000']:
            while 1:
                try:
                    content.remove(ch)
                except ValueError:
                    break
        if len(content) < 10:
            # too little text to learn a meaningful embedding
            continue
        sentence = gensim.models.doc2vec.LabeledSentence(
            words=content, labels=['USER_%d' % user['_id']])
        sentences.append(sentence)

    print('load corpus completed...')

    # train doc2vec
    model = gensim.models.Doc2Vec(sentences,
                                  size=dimensionality_size,
                                  window=window_size,
                                  workers=workers,
                                  min_count=min_count,
                                  sample=1e-3)
    model.save_word2vec_format('/mnt/data1/adoni/jd_data/vectors/' +
                               file_name + '.data',
                               binary=False)
    print('embedding done')
    return model
Esempio n. 6
0
def update_min_max_sum(entity):
    '''
    Used to insert min max and sum
    '''
    # Per-mention statistics accumulated over every train user, then written
    # to the entity's mention collection as a 'distribute' quadruple.
    collection = Connection().jd['train_%s_mentions' % entity]
    collection_user = Connection().jd['train_%ss' % entity]
    mentions = [line[:-1].decode('utf8')
                for line in open('../features/mention.feature')]
    min_d = dict((m, float('inf')) for m in mentions)
    max_d = dict((m, -1) for m in mentions)
    sum_d = dict((m, 0) for m in mentions)
    sum_u_d = dict((m, 0) for m in mentions)
    for user in collection_user.find():
        for m, v in user['mentions'].items():
            min_d[m] = min(min_d[m], v)
            max_d[m] = max(max_d[m], v)
            sum_d[m] += v
            sum_u_d[m] += 1  # number of users mentioning m
    for m in mentions:
        collection.insert({'_id': m,
                           'distribute': [min_d[m], max_d[m],
                                          sum_d[m], sum_u_d[m]]})
Esempio n. 7
0
def generate_name_feature():
    """Collect given-name parts (surname stripped) from user screen names
    and feed them to feature_selection_df."""
    from pymongo import Connection
    # Known surnames, one per line.
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    finish_count = 0
    y = []
    for user in users.find():
        name = user['screen_name']
        # Keep the LAST element whose first character is a known surname;
        # when nothing matches, normal_name stays '' and the user is skipped.
        normal_name = ''
        for n in name:
            if n[0] in lastnames:
                normal_name = n
        if len(normal_name) < 2:
            # also covers the no-match ('') case
            continue
        corpus.append(normal_name[1:])
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    feature_selection_df(corpus)
Esempio n. 8
0
class DaoSunPosition():
    """DAO for the 'rdam.sunpos' collection with simple write batching."""

    def __init__(self):
        self.col = Connection()['rdam']['sunpos']
        self.bulk = []  # positions buffered until flush()

    def create_datetime_index(self):
        self.col.create_index('datetime')

    def persist(self, sunpos):
        # Buffer only; nothing hits the database until flush().
        self.bulk.append(sunpos)

    def flush(self):
        # Write the whole buffer in a single insert, then reset it.
        self.col.insert(self.bulk)
        self.bulk = []

    def find_within_time(self, start_date, end_date):
        """Return SunPosition objects strictly between the two datetimes."""
        query = {'$and': [{'datetime': {'$gt': start_date}},
                          {'datetime': {'$lt': end_date}}]}
        return [SunPosition(doc['az'], doc['el'], doc['datetime'])
                for doc in self.col.find(query)]
Esempio n. 9
0
def construct_train_data():
    """Write mallet-format train rows [uid, random 0/1 label, features] for
    every train user not reserved for testing."""
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    data = []
    test_uids = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in test_uids:
            continue  # held out for testing
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        x = {}
        for name, value in features.items():
            if name in all_features:
                x[all_features[name]] = value
        for name, value in Counter(user['review']).items():
            if name in review_features:
                x[review_features[name]] = value
        # Unlabeled corpus: mallet still needs a label column, so fake one.
        y = random.randint(0, 1)
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', data)
Esempio n. 10
0
def construct_test_data(attribute):
    """Write mallet-format test rows [uid, label, features] for *attribute*;
    users without a positive profile entry for it are skipped."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    data = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        features = combine_features(user['mentions'],
                                    Counter(user['products']))
        try:
            y = user['profile'][attribute].index(1)
        except:
            continue  # no positive label for this attribute
        x = {}
        for name, value in features.items():
            if name in all_features:
                x[all_features[name]] = value
        for name, value in Counter(user['review']).items():
            if name in review_features:
                x[review_features[name]] = value
        data.append([uid, y, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, data)
Esempio n. 11
0
def construct_test_data(attribute):
    """Emit mallet-format test rows [uid, label, feature dict] for the given
    profile *attribute*; unlabeled users are skipped."""
    collection = Connection().jd.test_users
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    rows = []
    bar = progress_bar(collection.count())
    for index, user in enumerate(collection.find()):
        merged = combine_features(user['mentions'],
                                  Counter(user['products']))
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue  # attribute has no positive entry
        x = dict()
        for f, v in merged.items():
            if f in all_features:
                x[all_features[f]] = v
        for f, v in Counter(user['review']).items():
            if f in review_features:
                x[review_features[f]] = v
        rows.append([user['_id'], label, x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_test_%s.data' % attribute, rows)
Esempio n. 12
0
def construct_train_data():
    """Write mallet-format train rows [uid, random 0/1 label, features],
    skipping the uids reserved for testing."""
    import random
    all_features = get_features(feature_file_name=feature_file_name)
    review_features = get_features(
        feature_file_name=base_dir + '/features/review.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    rows = []
    held_out = get_test_uids()
    for index, user in enumerate(collection.find()):
        uid = user['_id']
        if uid in held_out:
            continue
        merged = combine_features(user['mentions'],
                                  Counter(user['products']))
        x = dict()
        for f, v in merged.items():
            if f in all_features:
                x[all_features[f]] = v
        for f, v in Counter(user['review']).items():
            if f in review_features:
                x[review_features[f]] = v
        # Dummy label -- the train corpus is unlabeled.
        rows.append([uid, random.randint(0, 1), x])
        bar.draw(index + 1)
    output(RAW_DATA_DIR + 'mallet/mallet_train.data', rows)
def output_graph_matrix():
    """Collect graph-embedding vectors and binary gender labels (0=female)
    for users that were assigned an integer graph id, then dump them."""
    from pymongo import Connection
    users = Connection().user_profilling.users
    graph = Connection().user_profilling.graph_embedding
    print(graph.count())
    bar = get_progressive_bar(users.count())
    x = []
    y = []
    finish_count = 0
    uids = []
    query = {'int_id': {'$exists': True}}
    projection = {'information': 1, 'int_id': 1}
    for user in users.find(query, projection):
        finish_count += 1
        print(finish_count)
        user_embedding = graph.find_one({'_id': user['int_id']})
        if user_embedding is None:
            print(user_embedding)
            continue
        y.append(0 if user['information']['gender'] == 'f' else 1)
        x.append(user_embedding['embedding'])
        uids.append(user['information']['uid'])
    dump_user_vector(x, y, uids, 'user_graph_vector.data')
Esempio n. 14
0
def update_user_id():
    """Map weibo uids to integer graph ids from id_map.txt and store each
    match on the user document as 'int_id'; print overlap statistics."""
    GRAPH_DATA_DIR = '/mnt/data1/weibo_graph/'
    id_map_file = open(GRAPH_DATA_DIR + 'id_map.txt')
    uids = dict()
    total_count = 107628903  # known line count of id_map.txt (progress only)
    finish_count = 0
    for line in id_map_file:
        parts = line.replace('\n', '').split(' ')
        uids[parts[0]] = parts[1]
        finish_count += 1
    from pymongo import Connection
    users = Connection().user_profilling.users
    count = 0
    finish_count = 0
    seen = set()
    for user in users.find({}, {'uid': True}):
        finish_count += 1
        uid = user['uid']
        seen.add(uid)
        if uid not in uids:
            continue  # user not present in the graph id map
        users.update({'_id': user['_id']},
                     {'$set': {'int_id': uids[uid]}})
    # Report how many uids appear both in the graph file and in the DB.
    mapped = set(uids.keys())
    print(len(mapped & seen))
    print(len(mapped))
    print(len(seen))
Esempio n. 15
0
def output_review_star_matrix(feature_length=1000):
    """Build a one-hot review-star (0..5) count vector per weibo user and
    dump the vectors together with their uids.

    :param feature_length: unused; kept for call compatibility.
    """
    from pymongo import Connection
    # The categories are the star values themselves: '0'..'5'.
    feature_map = {}
    for i in range(0, 6):
        feature_map[str(i)] = i
    all_x = []
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        features = [str(behavior['review']['review_stars'])
                    for behavior in user['behaviors']]
        if features == []:
            continue
        vector = get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue  # no recognised star values
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_review_star')
    # BUG FIX: removed the unreachable ``return dump_train_valid_test(...)``
    # that followed the bare return -- it also referenced undefined ``all_y``.
Esempio n. 16
0
def output_user_user_propagate_vectors(order):
    """Dump the order-*order* user-user graph propagation vector for every
    weibo user that has a non-zero one."""
    from pymongo import Connection
    all_x = []
    users = Connection().jd.weibo_users
    vectors = load_user_user_graph_propagate_vector(order)
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        try:
            vector = vectors[int(user['_id'])]
        except Exception:
            continue  # user absent from the propagation table
        if not vector.any():
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_user_user_propagate' + str(order))
    # BUG FIX: dropped the unreachable ``return dump_train_valid_test(...)``
    # line after the bare return; it referenced an undefined ``all_y``.
Esempio n. 17
0
def output_sentence_embedding_matrix(file_name1, file_name2):
    """Collect each weibo user's doc2vec vector (tag 'USER_<jd_id>') and
    dump the resulting matrix under *file_name2*."""
    from pymongo import Connection
    embedding = doc2vec_embedding(file_name1)
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    all_x = []
    uids = []
    finish_count = 0
    for user in users.find():
        try:
            vector = embedding['USER_%d' % user['jd_id']]
        except:
            continue  # no embedding was trained for this user
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    dump_user_vector(numpy.array(all_x), uids, file_name2)
Esempio n. 18
0
def output_goods_class_matrix(order=0):
    """One-hot item-class vectors (class level *order*) per weibo user."""
    from pymongo import Connection
    # Feature vocabulary: first token of each line of the order-specific file,
    # mapped to its line position.
    class_file = open('./features/item_class_order_%d.feature' % order)
    feature_map = {}
    for position, line in enumerate(class_file.readlines()):
        feature_map[line.decode('utf8').split(' ')[0]] = position
    all_x = []
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        # NOTE(review): item_class is indexed with order-1, i.e. the LAST
        # level when order==0 -- confirm this offset is intentional.
        features = [behavior['item_class'][order - 1]
                    for behavior in user['behaviors']]
        vector = get_one_hot_vector(features, feature_map)
        if not vector.any():
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_item_class_order_' + str(order))
Esempio n. 19
0
def output_simple_matrix(feature_length=10000):
    """Light one-hot product vectors per weibo user.

    Only the first *feature_length* products from the feature file are used
    (all of them when feature_length is None).  Returns (all_x, uids).
    """
    from pymongo import Connection
    from collections import Counter
    feature_map = {}
    product_lines = open('./features/product.feature').readlines()
    for position, line in enumerate(product_lines):
        if feature_length is not None and position >= feature_length:
            break
        feature_map[line.decode('utf8').split(' ')[0]] = position
    all_x = []
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        features = [str(int(behavior['item']))
                    for behavior in user['behaviors']]
        vector = get_one_hot_light_vector(features, feature_map)
        if len(vector) == 0:
            continue
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_user_simple',
                     dimention=len(feature_map))
    return all_x, uids
Esempio n. 20
0
def output_shopping_tf_matrix(feature_length=3):
    """Per-user vector of the *feature_length* highest per-timestamp
    purchase counts; users with fewer distinct timestamps are skipped.

    BUG FIX: the function previously ended with
    ``dump_train_valid_test(all_x, all_y, ...)`` where ``all_y`` was never
    defined, so it always crashed with NameError after doing all the work.
    It now dumps (vector, uid) pairs like the sibling output_* functions.
    """
    from pymongo import Connection
    all_x = []
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    for user in users.find():
        vector = numpy.zeros((feature_length))
        # Count behaviors per timestamp.
        tf = dict()
        for behavior in user['behaviors']:
            tf[behavior['timestamp']] = tf.get(behavior['timestamp'], 0) + 1
        if len(tf) < feature_length:
            continue  # not enough distinct timestamps
        ranked = sorted(tf.iteritems(), key=lambda d: d[1], reverse=True)
        for i in range(0, feature_length):
            vector[i] = ranked[i][1]
        all_x.append(vector)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    all_x = numpy.array(all_x)
    dump_user_vector(all_x, uids, 'jd_shopping_tf')
Esempio n. 21
0
def age_distribute():
    from small_utils.progress_bar import progress_bar
    from pymongo import Connection
    from collections import Counter
    collection=Connection().jd.test_users
    weibo_collection=Connection().jd.weibo_users
    linked_jd_ids=dict()
    ages=[]
    for line in open('/mnt/data1/adoni/data/linked_uids.data'):
        linked_jd_ids[line[:-1].split(' ')[1]]=line.split(' ')[0]
    bar=progress_bar(collection.count())
    for index,user in enumerate(collection.find()):
        if sum(user['profile']['age'])==0:
            continue
        weibo_id=linked_jd_ids[user['_id']]
        weibo_user=weibo_collection.find_one({'_id':weibo_id})
        if weibo_user==None:
            continue
        age=2015-int(weibo_user['birthday'].split(u'年')[0])
        if age>50 or age<10:
            continue
        ages.append(age)
        if age<30:
            user['profile']['age']=[1,0]
        else:
            user['profile']['age']=[0,1]
        collection.update({'_id':user['_id']},{'$set':{'profile':user['profile']}})
        bar.draw(index)
    s=sum(Counter(ages).values())
    ages=sorted(Counter(ages).items(),key=lambda d:d[0])
    ss=0.
    for age in ages:
        ss+=age[1]
        print age[0],(ss)/s
def output_name_matrix_of_two_words():
    """Build gender training pairs from 3-character names
    ('<surname><w1><w2>'): x = per-character female-count ratios from the
    tf table, y = 1 for male.

    BUG FIX: the length guard used to be ``len(n)>3 and len(n)<3``, which is
    always False and filtered nothing; wrong-length names were only skipped
    when indexing n[2] happened to raise inside the try.  The guard now
    requires exactly 3 characters, matching the n[1]/n[2] accesses.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break  # cap the sample size
        for n in name:
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                # tf[w] counts [female, male] occurrences; take female ratio.
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                continue  # character unseen in tf, or zero total
            y.append(1 if user['information']['gender'] == 'm' else 0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
def output_graph_matrix():
    """Collect graph-embedding vectors and binary gender labels (0=female)
    for users that have an integer graph id, then dump them."""
    from pymongo import Connection
    users = Connection().user_profilling.users
    graph = Connection().user_profilling.graph_embedding
    print(graph.count())
    bar = get_progressive_bar(users.count())
    x, y, uids = [], [], []
    finish_count = 0
    for user in users.find({'int_id': {'$exists': True}},
                           {'information': 1, 'int_id': 1}):
        finish_count += 1
        print(finish_count)
        user_embedding = graph.find_one({'_id': user['int_id']})
        if user_embedding is None:
            print(user_embedding)
            continue
        y.append(0 if user['information']['gender'] == 'f' else 1)
        x.append(user_embedding['embedding'])
        uids.append(user['information']['uid'])
    dump_user_vector(x, y, uids, 'user_graph_vector.data')
Esempio n. 24
0
def get_tf():
    """Count, per name character, occurrences in female vs male user names.

    Returns {character: [female_count, male_count]}.

    BUG FIX: the length guard was ``len(n)>3 and len(n)<2`` -- an impossible
    conjunction that never filtered anything.  The evident intent (names of
    2-3 characters: surname plus 1-2 given-name characters, per the n[1:]
    access) is now expressed as ``len(n) > 3 or len(n) < 2``.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [name.replace('\n', '').decode('utf8')
                 for name in open('./lastname')]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            gender = 1 if user['information']['gender'] == 'm' else 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
Esempio n. 25
0
def update_all():
    tags = Connection()["reddit"]["tags"]
    index = Connection()["reddit"]["inverted_index"]

    invalid = ['.', '$']
    for tag in tags.find():
        for key in tag.keys():
            if key != "_id":
                word_list = tag[key]
                for w in word_list:
                    for i in invalid:
                        if i in w:
                            w = w.replace(i,'')
                    row = index.find_one({"key" : w})
                    if not row:
                        index.insert({"key": w, "ids" : [key]})
                    else:
                        print "Updating", w
                        print row, row["ids"]
                        lst = list(row["ids"])
                        print lst, key
                        lst.append(key)
                        new_row = {"key":w, "ids": lst}
                        print new_row
                        index.update({"key":w}, new_row)
Esempio n. 26
0
def output_review_embedding_matrix():
    """Per-user review vector: sum of mention-word embeddings weighted by
    how often each mention appears in the user's review text."""
    from helper import get_mentions
    from pymongo import Connection
    from my_vector_reader import read_vectors
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    finish_count = 0
    uids = []
    all_x = []
    review_vocab, review_embedding = read_vectors(
        '../myword2vec/word_vectors.data', 'utf8')
    # Keep only mentions present in the word-vector vocabulary.
    mentions = [m for m in get_mentions() if m in review_vocab]
    mention_embedding = [review_embedding[review_vocab.index(m)]
                         for m in mentions]
    vector_size = len(mention_embedding[0])
    for user in users.find():
        x = numpy.zeros(vector_size)
        review = ' '.join([b['review']['review_general']
                           for b in user['behaviors']])
        for position, mention in enumerate(mentions):
            x += review.count(mention) * mention_embedding[position]
        if not x.any():
            continue  # none of the mentions occur in this user's reviews
        all_x.append(x)
        uids.append(user['_id'])
        finish_count += 1
        bar.draw(value=finish_count)
    dump_user_vector(numpy.array(all_x), uids, 'user_review_embedding')
def output_description_matrix():
    """Bag-of-words matrix over user descriptions with gender labels
    (1 = male), dumped as train/valid/test splits."""
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    vectorizer = CountVectorizer(min_df=1)
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    y = []
    finish_count = 0
    for user in users.find():
        info = user['information']
        if 'descriptions' not in info:
            continue  # user never filled in a description
        corpus.append(get_str_description(info['descriptions']))
        finish_count += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
        y.append(1 if info['gender'] == 'm' else 0)
    counts = vectorizer.fit_transform(corpus)
    dump_train_valid_test(counts.toarray(), numpy.array(y),
                          'gender_description.data')
def construct_all_data():
    '''
    The format of labeled_feature_file is as the same as mallet
    '''
    # Feature ids: base mention features first, then the *_1 mention features
    # shifted past the largest base id.
    all_features = get_features(feature_file_name=feature_file_name)
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'iterate_label2trainset/all_train.data', 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        # Every row carries a dummy 0 label (unlabeled corpus).
        fout.write('%d' % 0)
        uid_output.write('%s\n' % user['_id'])
        features = combine_features(user['mentions_1'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f in all_features:
                sorted_feature.append((all_features[f], features[f]))
        for f, v in user['mentions_1_1'].items():
            shifted = f + '_1'
            if shifted in all_features_1:
                sorted_feature.append((all_features_1[shifted], v))
        for pair in sorted(sorted_feature, key=lambda d: d[0]):
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
Esempio n. 29
0
def construct_test_set(attribute):
    """Write a label-balanced mallet-style test file for *attribute* from
    the test users' product and mention features."""
    all_features = get_features(
        feature_file_name=base_dir + '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print(balance_params)
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        features.update(user['mentions'])
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # unlabeled for this attribute
        # Random down-sampling keeps the label distribution balanced.
        if random.random() > balance_params[label]:
            continue
        sorted_feature = [(all_features[f], features[f])
                          for f in features if f in all_features]
        if len(sorted_feature) == 0:
            continue  # nothing to write for this user
        fout.write('%d' % label)
        for pair in sorted(sorted_feature, key=lambda d: d[0]):
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
def output_name_matrix_of_two_words():
    """Dump a two-feature gender data set built from 3-character names.

    For every screen name of the form "<surname><c1><c2>" whose first
    character is a known surname, the features are the index-0 ratios of c1
    and c2 in the character frequency table ``tf`` (per get_tf, index 0 counts
    non-male users, index 1 male users).  The label is 1 for male users,
    0 otherwise.  Stops after scanning 5000 users and writes the matrix via
    dump_train_valid_test as 'gender_name_simple.data'.
    """
    import sys
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = pickle.load(open('./tf.data'))
    x = []
    y = []
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        if finish_count > 5000:
            break
        for n in name:
            # BUGFIX: the original condition was `len(n) > 3 and len(n) < 3`,
            # which is always False so no name was ever length-filtered.
            # n[1] and n[2] below require exactly a 1-char surname plus a
            # 2-char given name, i.e. len(n) == 3.
            if n[0] not in lastnames or len(n) != 3:
                continue
            try:
                x0 = 1.0 * tf[n[1]][0] / sum(tf[n[1]])
                x1 = 1.0 * tf[n[2]][0] / sum(tf[n[2]])
            except:
                continue  # character absent from tf, or zero total count
            if user['information']['gender'] == 'm':
                y.append(1)
            else:
                y.append(0)
            x.append([x0, x1])
        bar.cursor.restore()
        bar.draw(value=finish_count)
    dump_train_valid_test(x, y, 'gender_name_simple.data')
Esempio n. 31
0
def get_features_and_labels(attribute):
    """Collect product-count features and float32 label vectors for labeled
    test users, then randomly subsample so every label vector is (on average)
    equally frequent.

    Returns (features, labels): two dicts keyed by user id.
    """
    from collections import Counter
    import random
    collection = Connection().jd.test_users
    features = dict()
    labels = dict()
    seen = []
    for user in collection.find():
        if len(user['products']) == 0:
            continue
        if sum(user['profile'][attribute]) == 0:
            continue  # user carries no label for this attribute
        uid = user['_id']
        features[uid] = dict(Counter(user['products']))
        labels[uid] = numpy.array(user['profile'][attribute], dtype='float32')
        seen.append(str(labels[uid]))
    # Keep-probability per label vector = rarest label count / its count.
    keep_prob = Counter(seen)
    floor = 1.0 * min(keep_prob.values())
    for key in keep_prob:
        keep_prob[key] = floor / keep_prob[key]
    for uid in list(features.keys()):
        if random.random() > keep_prob[str(labels[uid])]:
            features.pop(uid)
            labels.pop(uid)
    return features, labels
def construct_mallet_data(profile_key):
    """Write one mallet-format line per labeled weibo user.

    Line format: ``<uid> <label> <word>:<count> ...`` where words come from
    the general parsed reviews of all the user's behaviors.  Users whose
    profile has no 1-hot label for ``profile_key`` are skipped, and the
    collected lines are class-balanced (via ``balance``) before writing.
    """
    from pymongo import Connection
    from my_progress_bar import progress_bar
    from collections import Counter
    users = Connection().jd.weibo_users
    bar = progress_bar(users.count())
    fout = open(MATRIXES_DIR + 'mallet/construced_data.mallet', 'w')
    data = []
    for index, user in enumerate(users.find()):
        try:
            label = user['profile'][profile_key].index(1)
        except:
            continue  # no label for this profile key
        reviews = []
        for behavior in user['behaviors']:
            reviews += behavior['parsed_review']['review_general']
        reviews = Counter(reviews)
        reviews = ' '.join(map(lambda word: '%s:%d' % (word, reviews[word]),
                               reviews.keys()))
        line = '%s %d %s\n' % (user['_id'], label, reviews)
        data.append((label, line))
    data = balance(data, target_index=0)
    # BUGFIX: the original looped over an undefined name `balanced_data`
    # (NameError at runtime); iterate the balanced `data` instead.
    for label, line in data:
        fout.write(line.encode('utf8'))
        # NOTE(review): `index` is the last value from the scan loop, so the
        # bar stays pinned at the end while writing -- kept as original.
        bar.draw(index)
def get_all_uids():
    """Return every weibo user id as a utf-8 encoded byte string."""
    from pymongo import Connection
    users = Connection().jd.weibo_users
    return [doc['_id'].encode('utf8') for doc in users.find({}, {'_id': 1})]
Esempio n. 34
0
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=base_dir +
                                '/features/mention.feature')
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(self_training_file_dir + 'test_%s.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        features = dict(Counter(user['products']))
        for m in user['mentions']:
            features[m] = user['mentions'][m]
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%d' % f)
        fout.write('\n')
        bar.draw(index + 1)
Esempio n. 35
0
def construct_all_data():
    '''
    Write the full unlabeled training pool (dummy label 0) in mallet-style
    libsvm format, plus the matching uid list, using two feature spaces:
    mentions_1 + product counts, and the '_1'-suffixed mentions_1_1 features.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    # Second feature space starts right after the first one's largest id.
    all_features_1 = get_features(
        feature_file_name=base_dir + '/features/mention_1.feature',
        start_index=max(all_features.values()) + 1)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'mylabel2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'mylabel2trainset/all_train_uids.data',
                      'w')
    for index, user in enumerate(collection.find()):
        fout.write('0')  # every row shares the dummy label 0
        uid_output.write('%s\n' % user['_id'])
        feats = combine_features(user['mentions_1'],
                                 Counter(user['products']))
        pairs = [(all_features[f], feats[f]) for f in feats
                 if f in all_features]
        for f, v in user['mentions_1_1'].items():
            suffixed = f + '_1'
            if suffixed in all_features_1:
                pairs.append((all_features_1[suffixed], v))
        pairs.sort(key=lambda p: p[0])
        for pair in pairs:
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
Esempio n. 36
0
def get_tf():
    """Build a per-character gender frequency table from user screen names.

    Scans names of the form "<surname><given chars>" (surname must appear in
    ./lastname) and returns a dict mapping each given-name character to
    ``[count_for_non_male, count_for_male]``.
    """
    from helper import get_progressive_bar
    from pymongo import Connection
    users = Connection().user_profilling.users
    lastnames = [
        name.replace('\n', '').decode('utf8') for name in open('./lastname')
    ]
    bar = get_progressive_bar(users.count())
    finish_count = 0
    tf = dict()
    for user in users.find():
        name = user['screen_name']
        finish_count += 1
        for n in name:
            # BUGFIX: the original test was `len(n) > 3 and len(n) < 2`,
            # which is always False, so length was never filtered.  Accept
            # only 2-3 character names: 1-char surname + 1-2 given chars.
            if n[0] not in lastnames or len(n) > 3 or len(n) < 2:
                continue
            if user['information']['gender'] == 'm':
                gender = 1
            else:
                gender = 0
            for w in n[1:]:
                if w not in tf:
                    tf[w] = [0, 0]
                tf[w][gender] += 1
        bar.cursor.restore()
        bar.draw(value=finish_count)
    return tf
Esempio n. 37
0
def insert_age_vector():
    from collections import Counter
    users=Connection().jd.weibo_users
    all_vec=[]
    for user in users.find():
        profile=user['profile']
        if user['birthday'] is None:
            age_vec=[0,0]
            profile['age']=age_vec
            users.update({'_id':user['_id']},{'$set':{'profile':profile}})
            continue
        if u'年' not in user['birthday']:
            age_vec=[0,0]
            profile['age']=age_vec
            users.update({'_id':user['_id']},{'$set':{'profile':profile}})
            continue
        age=user['birthday']
        age=age[0:age.find(u'年')]
        if len(age)<4:
            age='19'+age
        age=int(age)
        if age<1950 or age>2010:
            age_vec=[0,0]
            profile['age']=age_vec
            users.update({'_id':user['_id']},{'$set':{'profile':profile}})
            continue
        if age<1987:
            age_vec=[1,0]
        else:
            age_vec=[0,1]
        profile['age']=age_vec
        users.update({'_id':user['_id']},{'$set':{'profile':profile}})
        all_vec.append(str(age_vec))
    print Counter(all_vec)
Esempio n. 38
0
def update_user_id():
    GRAPH_DATA_DIR = '/mnt/data1/weibo_graph/'
    id_map_file = open(GRAPH_DATA_DIR + 'id_map.txt')
    uids = dict()
    total_count = 107628903
    finish_count = 0
    #bar=get_progressive_bar(total_count=total_count)
    for line in id_map_file:
        line = line.replace('\n', '').split(' ')
        uids[line[0]] = line[1]
        finish_count += 1
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    #uids=set(uids)
    from pymongo import Connection
    users = Connection().user_profilling.users
    count = 0
    finish_count = 0
    u = set()
    for user in users.find({}, {'uid': True}):
        finish_count += 1
        uid = user['uid']
        u.add(uid)
        try:
            int_id = uids[uid]
        except Exception as e:
            continue
        users.update({'_id': user['_id']}, {'$set': {'int_id': int_id}})
        #bar.cursor.restore()
        #bar.draw(value=finish_count)
    uids = set(uids.keys())
    together = uids & u
    print len(together)
    print len(uids)
    print len(u)
Esempio n. 39
0
def construct_all_data():
    '''
    Write the full unlabeled training pool (dummy label 0) to
    label2trainset in mallet-style libsvm format, plus the uid list.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/all_train.data', 'w')
    uid_output = open(RAW_DATA_DIR + 'label2trainset/all_train_uids.data', 'w')
    for index, user in enumerate(collection.find()):
        # Product counts, overridden by mention counts on collision.
        feats = dict(Counter(user['products']))
        for mention in user['mentions']:
            feats[mention] = user['mentions'][mention]
        fout.write('0')  # dummy label for every row
        uid_output.write('%s\n' % user['_id'])
        pairs = [(all_features[f], feats[f]) for f in feats
                 if f in all_features]
        pairs.sort(key=lambda p: p[0])
        for pair in pairs:
            fout.write(' %s:%d' % pair)
        fout.write('\n')
        bar.draw(index + 1)
Esempio n. 40
0
def construct_train_set(attribute, training_count):
    '''
    Weakly label train users from mention constraint features and write a
    libsvm-style training file (constraint file format is the same as
    mallet's).

    For each user a two-class score is accumulated multiplicatively from the
    per-feature probabilities in the constraint file, weighted by mention
    counts; the argmax becomes the label (exact ties are skipped).
    NOTE(review): `training_count` is currently unused -- the truncation
    logic is commented out, so all labeled users are written.
    '''
    all_features = get_features(feature_file_name=feature_file_name)
    labeled_feature_file = open('%s/review_constraint_%s.constraints' %
                                (labeled_feature_file_dir, attribute))
    # feature word -> [P(label 0), P(label 1)], parsed from
    # "word 0:p0 1:p1" lines.
    labeled_features = dict()
    for line in labeled_feature_file:
        line = line[:-1].split(' ')
        labeled_features[line[0].decode('utf8')] = map(
            lambda d: float(d.split(':')[1]), line[1:])
    collection = Connection().jd.train_users

    bar = progress_bar(collection.count())
    confidence = []
    for index, user in enumerate(collection.find()):
        # Naive-Bayes-like accumulation: start at [1, 1], multiply in each
        # constrained mention scaled by its count, then normalize.
        label_distributed = [1, 1]
        for f, value in user['mentions'].items():
            if f in labeled_features:
                label_distributed[0] *= labeled_features[f][0] * value
                label_distributed[1] *= labeled_features[f][1] * value
        s = 1.0 * sum(label_distributed)
        label_distributed[0] /= s
        label_distributed[1] /= s
        if label_distributed[0] > label_distributed[1]:
            label = 0
        elif label_distributed[0] < label_distributed[1]:
            label = 1
        else:
            continue  # tie: no confident label, skip user
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        str_features = ' '.join(map(lambda f: '%s:%f' % f, sorted_feature))
        # Keep (uid, label, confidence margin, feature string) per user.
        confidence.append(
            (user['_id'], label,
             abs(label_distributed[0] - label_distributed[1]), str_features))
        bar.draw(index + 1)

    #confidence=sorted(confidence,key=lambda d:d[2],reverse=True)
    # confidence0/1 are only used for the printed class-size report below.
    confidence0 = filter(lambda d: d[1] == 0, confidence)
    confidence1 = filter(lambda d: d[1] == 1, confidence)
    #dimention=min(len(confidence0),len(confidence1),training_count/2)
    #confidence0=confidence0#[:dimention]
    #confidence1=confidence1#[:dimention]
    print len(confidence0), len(confidence1)
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_train.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_train_uids.data' % attribute, 'w')
    #for d in confidence0+confidence1:
    for d in confidence:
        fout.write('%d %s\n' % (d[1], d[3]))
        uid_output.write('%s\n' % d[0])
Esempio n. 41
0
def construct_test_set(attribute):
    all_features = get_features(feature_file_name=feature_file_name)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(RAW_DATA_DIR + 'label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'label2trainset/%s_test_uids.data' % attribute, 'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue
        if random.random() > balance_params[label]:
            continue
        features = user['mentions']
        #features=Counter(user['products'])
        #features=combine_features(user['mentions'],Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))
        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Esempio n. 42
0
def update_min_max_sum(entity):
    '''
    For every mention feature, scan all train users and record
    [min, max, sum, user-count] of its per-user mention values, then insert
    one document per mention into the train_<entity>_mentions collection.
    '''
    collection = Connection().jd['train_%s_mentions' % entity]
    collection_user = Connection().jd['train_%ss' % entity]
    # Feature vocabulary: one mention per line of the feature file.
    mentions = [
        line[:-1].decode('utf8')
        for line in open('../features/mention.feature')
    ]
    min_d = dict()
    max_d = dict()
    sum_d = dict()
    sum_u_d = dict()  # number of users that mention each feature
    for m in mentions:
        min_d[m] = float('inf')
        max_d[m] = -1
        sum_d[m] = 0
        sum_u_d[m] = 0
    for user in collection_user.find():
        for m in user['mentions']:
            # NOTE(review): raises KeyError if a user's mention is missing
            # from mention.feature -- presumably the data guarantees
            # coverage; verify against how mention.feature is generated.
            v = user['mentions'][m]
            if v < min_d[m]:
                min_d[m] = v
            if v > max_d[m]:
                max_d[m] = v
            sum_d[m] += v
            sum_u_d[m] += 1
    for m in mentions:
        collection.insert({
            '_id':
            m,
            'distribute': [min_d[m], max_d[m], sum_d[m], sum_u_d[m]]
        })
def output_description_matrix():
    """Vectorize user self-descriptions into a bag-of-words gender data set
    and dump it as 'gender_description.data' (label 1 = male, 0 = other)."""
    from sklearn.feature_extraction.text import CountVectorizer
    from pymongo import Connection
    vectorizer = CountVectorizer(min_df=1)
    users = Connection().user_profilling.users
    bar = get_progressive_bar(users.count())
    corpus = []
    labels = []
    done = 0
    for user in users.find():
        info = user['information']
        if 'descriptions' not in info:
            continue  # user has no self-description
        corpus.append(get_str_description(info['descriptions']))
        done += 1
        bar.cursor.restore()
        bar.draw(value=done)
        labels.append(1 if info['gender'] == 'm' else 0)
    matrix = vectorizer.fit_transform(corpus)
    dump_train_valid_test(matrix.toarray(), numpy.array(labels),
                          'gender_description.data')
Esempio n. 44
0
def get_features_and_labels(attribute):
    """Gather product-count feature dicts and float32 label vectors for all
    labeled test users, then subsample to equalize label-vector frequencies.

    Returns (features, labels): two dicts keyed by user id.
    """
    from collections import Counter
    import random
    collection = Connection().jd.test_users
    features = dict()
    labels = dict()
    observed = []
    for user in collection.find():
        if len(user['products']) == 0:
            continue
        if sum(user['profile'][attribute]) == 0:
            continue  # no label information for this attribute
        uid = user['_id']
        features[uid] = dict(Counter(user['products']))
        labels[uid] = numpy.array(user['profile'][attribute],
                                  dtype='float32')
        observed.append(str(labels[uid]))
    # Acceptance probability per label vector: rarest count / its count.
    freq = Counter(observed)
    floor = 1.0 * min(freq.values())
    for key in freq:
        freq[key] = floor / freq[key]
    for uid in list(features.keys()):
        if random.random() > freq[str(labels[uid])]:
            features.pop(uid)
            labels.pop(uid)
    return features, labels
Esempio n. 45
0
 def test_calling_delete_on_a_message_returned_removes_it_from_mongodb(
         self):
     # After writing one message, deleting the Message instance returned by
     # read() should remove it from the backing mongo collection, so a
     # subsequent read() sees an empty queue.
     collection = Connection().karait_test.queue_test
     queue = Queue(database='karait_test', queue='queue_test')
     queue.write(Message({'foo': 1}))
     self.assertEqual(1, collection.find({}).count())
     queue.read()[0].delete()
     self.assertEqual(0, len(queue.read()))
Esempio n. 46
0
def output_user_product_graph():
    """Dump the bipartite user-product edge list to graph.data.

    One "<uid> <pid>" line per purchase, for both the train and test user
    collections (train first, then test, appended to the same file).
    """
    def _write_edges(collection, fout):
        # One line per (user, purchased product) pair.
        bar = progress_bar(collection.count())
        for index, user in enumerate(collection.find()):
            uid = user['_id']
            for pid in user['products']:
                fout.write('%s %s\n' % (uid, pid))
            bar.draw(index + 1)

    fout = open(RAW_DATA_DIR + 'graph.data', 'w')
    # Original duplicated this loop verbatim for both collections;
    # factored into _write_edges with identical output.
    _write_edges(Connection().jd.train_users, fout)
    _write_edges(Connection().jd.test_users, fout)
Esempio n. 47
0
def get_train_user_products():
    """Return {user_id: {product_id: purchase_count}} for all train users."""
    collection = Connection().jd.train_users
    bar = progress_bar(collection.count())
    result = dict()
    for index, user in enumerate(collection.find()):
        result[user['_id']] = dict(Counter(user['products']))
        bar.draw(index)
    return result
Esempio n. 48
0
 def f(l_min, l_max):
     # Compute and store 2-Selmer data for curves without it, in a level
     # window.  NOTE(review): the parameters l_min/l_max are never used; the
     # query reads level_min/level_max (and address, user, password,
     # selmer2, max_time) from the enclosing scope, which is outside this
     # view -- confirm the parameters were meant to be used in the query.
     from pymongo import Connection
     C = Connection(address).research
     C.authenticate(user, password)
     C = C.ellcurves
     for v in C.find({'level':{'$gte':level_min, '$lt':level_max},
                      'sel2':{'$exists':False}}):
         # NOTE(review): eval() on stored 'weq' strings -- trusted-data
         # assumption; do not point this at an untrusted database.
         sel2 = selmer2(eval(v['weq']), max_time)
         C.update({'_id':v['_id']}, {'$set':{'sel2':sel2}})
Esempio n. 49
0
def get_companies_list(mongo_host, start_idx=None):
    """Return a cursor over company ``_id``s in ascending order, optionally
    starting at ``start_idx`` (inclusive)."""
    collection = Connection(mongo_host)['crunch']['company']
    criteria = {'_id': {'$gte': start_idx}} if start_idx else {}
    return collection.find(criteria, {'_id': 1}).sort([
        ('_id', pymongo.ASCENDING),
    ])
def construct_test_set(attribute):
    """Write the balanced test file and uid list for ``attribute`` into
    iterate_label2trainset, combining three disjoint feature spaces:
    mentions_0 + product counts, '_1'-suffixed mentions_1_1 features, and
    review word counts.  Values are written with '%f' formatting."""
    all_features = get_features(feature_file=feature_file_name)
    # Each subsequent feature map is offset past the previous one via
    # existent_features, so the three id spaces do not collide.
    all_features_1 = get_features(feature_file=base_dir +
                                  '/features/mention_1.feature',
                                  existent_features=all_features)
    review_featuers = get_features(feature_file=base_dir +
                                   '/features/review.feature',
                                   existent_features=all_features_1)
    collection = Connection().jd.test_users
    balance_params = get_balance_params(attribute, collection)
    print balance_params
    bar = progress_bar(collection.count())
    fout = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test.data' % attribute, 'w')
    uid_output = open(
        RAW_DATA_DIR + 'iterate_label2trainset/%s_test_uids.data' % attribute,
        'w')
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except Exception as e:
            continue  # no 1-hot label for this attribute
        # Randomly drop over-represented labels to balance classes.
        if random.random() > balance_params[label]:
            continue

        # Feature space 1: mentions_0 merged with product counts.
        features = combine_features(user['mentions_0'],
                                    Counter(user['products']))
        sorted_feature = []
        for f in features:
            if f not in all_features:
                continue
            sorted_feature.append((all_features[f], features[f]))

        # Feature space 2: mentions_1_1, looked up with a '_1' suffix.
        for f, v in user['mentions_1_1'].items():
            f = f + '_1'
            if f not in all_features_1:
                continue
            sorted_feature.append((all_features_1[f], v))

        # Feature space 3: review word counts.
        for f, v in Counter(user['review']).items():
            if f not in review_featuers:
                continue
            sorted_feature.append((review_featuers[f], v))

        if len(sorted_feature) == 0:
            continue
        fout.write('%d' % label)
        uid_output.write('%s\n' % user['_id'])
        # Diagnostic: report duplicate feature ids (spaces should be disjoint).
        keys = map(lambda d: d[0], sorted_feature)
        if not len(keys) == len(set(keys)):
            print Counter(keys).values()
        sorted_feature = sorted(sorted_feature, key=lambda d: d[0])
        for f in sorted_feature:
            fout.write(' %s:%f' % f)
        fout.write('\n')
        bar.draw(index + 1)
Esempio n. 51
0
def attribute_statistics(attribute):
    from collections import Counter
    print attribute
    collection = Connection().jd.test_users
    profiles = []
    for user in collection.find():
        if sum(user['profile'][attribute]) > 0:
            profiles.append(str(user['profile'][attribute]))
    print len(profiles)
    print Counter(profiles)
Esempio n. 52
0
def get_train_uids():
    # WARNING(review): despite the "get" name, this function DELETES every
    # document in jd.train_users -- it collects all _ids, then delete_one's
    # each -- and returns nothing.  Confirm the destructive intent before
    # calling; renaming is left to a coordinated change since callers may
    # depend on the current name.
    collection = Connection().jd.train_users
    uids = set()
    for user in collection.find():
        uids.add(user['_id'])
    # Re-opening the same collection is redundant but harmless.
    collection = Connection().jd.train_users
    bar = progress_bar(len(uids))
    for index, uid in enumerate(uids):
        collection.delete_one({'_id': uid})
        bar.draw(index + 1)
Esempio n. 53
0
def statistics(attribute,
               threshold=-1,
               feature_file_name=base_dir + '/features/mention.feature',
               show=False):
    """Compute per-feature label distributions P(label | feature) for
    ``attribute`` over test users.

    When ``show`` is False, returns {feature: [p0, p1]} (features whose raw
    counts sum below ``threshold`` are dropped).  When ``show`` is True,
    prints the 50 most label-discriminative features instead and returns
    None.
    """
    import random
    collection = Connection().jd.test_users
    # balance_params is computed but the balancing step below is commented
    # out, so all labeled users are counted.
    balance_params = get_balance_params(attribute, collection)
    all_features = get_features(feature_file_name)
    bar = progress_bar(collection.count())
    # feature -> [count for label 0, count for label 1]
    distribute = dict([f, [0., 0.]] for f in all_features)
    labels_distribute = [0., 0.]
    for index, user in enumerate(collection.find()):
        try:
            label = user['profile'][attribute].index(1)
        except:
            continue  # unlabeled user
        #if random.random()>balance_params[label]:
        #    continue
        # Mentions merged with product counts (products override mentions).
        features = dict(user['mentions'])
        products = Counter(user['products'])
        for p in products:
            features[p] = products[p]
        if len(features) < 10:
            continue  # too few features to be informative
        for f in features:
            if f in distribute:
                distribute[f][label] += 1  #features[f]
        labels_distribute[label] += 1
        bar.draw(index)
    # Drop rare features (raw user counts below threshold).
    for f in distribute.keys():
        if sum(distribute[f]) < threshold:
            distribute.pop(f)
    print labels_distribute
    # Normalize by class size, then renormalize each feature's pair so it
    # sums to 1 -- yielding P(label | feature) under balanced priors.
    for f in distribute:
        distribute[f][0] /= labels_distribute[0]
        distribute[f][1] /= labels_distribute[1]
    for f in distribute:
        s = sum(distribute[f])
        distribute[f][0] /= s
        distribute[f][1] /= s
    if not show:
        return distribute
    #distribute=filter(lambda d:d[1][0]<d[1][1], distribute.items())
    # Rank by how far the (smoothed) label split is from 50/50.
    distribute = sorted(distribute.items(),
                        key=lambda d: abs(1 - 2 * (d[1][0] + 0.1) /
                                          (sum(d[1]) + 0.1)),
                        reverse=True)
    #distribute=sorted(distribute,key=lambda d:sum(d[1]), reverse=True)
    print ''
    for d in distribute[:50]:
        print '%s 0:%0.3f 1:%0.3f' % (
            d[0].encode('utf8'),
            (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
            1 - (d[1][0] + 0.1) / (sum(d[1]) + 0.1),
        )
Esempio n. 54
0
File: ap.py Progetto: merbst/psage
 def f(l_min, l_max):
     # Compute and store a_p tables for curve number 1 in a level window.
     # NOTE(review): the parameters l_min/l_max are never used; the query
     # reads level_min/level_max (and address, user, password, pari, P) from
     # the enclosing scope, which is outside this view -- confirm the
     # parameters were meant to feed the query.
     from pymongo import Connection
     C = Connection(address).research
     C.authenticate(user, password)
     C = C.ellcurves
     for v in C.find({'level':{'$gte':level_min, '$lt':level_max},
                      'number':1,
                      'ap':{'$exists':False}}):
         E = pari('ellinit(%s,1)'%v['weq'])
         # a_p for every prime p in P, keyed by the prime as a string.
         ap = dict([(str(p),int(E.ellap(p))) for p in P])
         C.update({'_id':v['_id']}, {'$set':{'ap':ap}})
Esempio n. 55
0
def get_word_count():
    collection = Connection().jd.train_users
    count = dict()
    for user in collection.find():
        for m, v in user['mentions'].items():
            if m in count:
                count[m] += v
            else:
                count[m] = v
    for m, v in sorted(count.items(), key=lambda d: d[1]):
        print m, v
Esempio n. 56
0
def count_occur(words):
    from pymongo import Connection
    occur=dict([(w, 0) for w in words])
    collection=Connection().jd.train_users
    keys=set(occur.keys())
    for user in collection.find():
        for w in set(user['mentions'].keys())&keys:
            if w in occur:
                occur[w]+=1
    for w in words:
        v=occur[w]
        print '%s\n(%0.2f\\%%)'%(w.encode('utf8'),100.0*v/100000)
Esempio n. 57
0
def count_attribute(attribute):
    from pymongo import Connection
    from collections import Counter
    collection=Connection().jd.test_users
    a=[]
    for user in collection.find():
        try:
            label=user['profile'][attribute].index(1)
        except:
            continue
        a.append(label)
    print Counter(a),len(a)