Esempio n. 1
0
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    #sql = 'select userid from jobs_uinfotest'
    import pdb
    pdb.set_trace()
    position_dct = {}

    sql = 'select count(industry), industry from work_size group by industry'
    cur.execute(sql)
    positionlst = cur.fetchall()
    for industry in positionlst:
        position_dct[industry[1]] = []
        sq = 'select count(position_name) as nu, position_name from work_size where industry = "%s" group by position_name order by nu desc limit 1' % industry[
            1]
        cur.execute(sq)
        indusp = cur.fetchall()
        position_dct[industry[1]].append(indusp[0][1])
        position_dct[industry[1]].append(float(indusp[0][0]) / industry[0])
    utils.store_rst(position_dct, 'industryr.txt')
    print position_dct
    conn.commit()
    conn.close()
    end = time.clock()
    print(end - start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
Esempio n. 2
0
    first_fea = decesion_tree.keys()[0]
    fea_index = features.index(first_fea)
    if not decesion_tree[first_fea].has_key(test_data[fea_index]):
        return None
    if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType:
            if len(decesion_tree[first_fea][test_data[fea_index]]) == 0:
                return test_data
            return classify_prun(decesion_tree[first_fea][test_data[fea_index]], features, test_data)
    else:
        return decesion_tree[first_fea][test_data[fea_index]]

def classify_t(decesion_tree, features, test_data, mean_values=None):
    """Classify one sample by walking a nested-dict decision tree.

    Each node is a dict whose first key names the feature tested at that
    node; the value under that key maps feature values to either another
    node (a dict) or a leaf label.

    Args:
        decesion_tree: nested dict describing the tree.
        features: sequence of feature names; the position of a name is the
            position of its value in ``test_data``.
        test_data: sequence of feature values for the sample.
        mean_values: unused; kept so existing callers keep working.

    Returns:
        The leaf label reached, or ``1`` when the sample's value for the
        tested feature has no branch in the tree.

    Raises:
        IndexError: if a node dict is empty.
        ValueError: if the tested feature is not in ``features``.
    """
    # list(d)[0] yields the same key as d.keys()[0] on Python 2 and also
    # works on Python 3, where keys() is a non-indexable view.
    first_fea = list(decesion_tree)[0]
    fea_index = features.index(first_fea)
    branches = decesion_tree[first_fea]
    value = test_data[fea_index]
    if value not in branches:
        # Unseen feature value: fall back to the default label 1.
        return 1
    child = branches[value]
    # isinstance (not an exact type check) so dict subclasses such as
    # OrderedDict are descended into instead of being returned as labels.
    if isinstance(child, dict):
        return classify_t(child, features, test_data)
    return child
    
if __name__ == '__main__':
    # Script entry point: reads a tree with read_tree('size_tree'), formats the
    # hard-coded test CSV, hands both to prun(), stores the returned tree under
    # 'prunsize_tree' via utils.store_rst and prints it.
    # NOTE(review): read_tree, format_data and prun are defined outside this
    # view; presumably prun() prunes the tree against the dataset -- confirm.
    # The command-line argument check below is disabled; the path is fixed.
    #if len(sys.argv) != 3:
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    test_file = 'd:/jobs/dctree/size/test.csv'
    decesion_tree = read_tree('size_tree')
    dataset, features = format_data(test_file)
    prun_tree = prun(decesion_tree, features, dataset)
    utils.store_rst(prun_tree, 'prunsize_tree')
    print prun_tree
Esempio n. 3
0
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')

    sql = 'select position_name from work_sizetest'
    cur.execute(sql)
    wordprobdct = utils.read_rst('position_word')
    wordlst = cur.fetchall()
    i = 0
    result = []
    #     pdb.set_trace()
    for j in xrange(20000):
        tworks = wordlst[i:i + 2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
            position_prob[key] = get_position_prob(key, wordprobdct, tworks)
        sortedprob = sorted(position_prob.iteritems(),
                            key=lambda jj: jj[1],
                            reverse=True)
        #         for prob in sortedprob:
        #             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])

    utils.store_rst(result, 'positionlet')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 4
0
    for j in xrange(70000):
        poslst = positionlst[j:j+3]
        j += 3
        if not position_dct.has_key(poslst[1][0]):
            continue
        seg_lst = jieba.cut(poslst[0][0])
        for term in seg_lst:
            if position_word_dct[poslst[1][0]]['pos1'].has_key(term):
                position_word_dct[poslst[1][0]]['pos1'][term] += 1
            else:
                position_word_dct[poslst[1][0]]['pos1'][term] = 1
            position_word_dct[poslst[1][0]]['pos1']['total'] += 1
        seg_lst = jieba.cut(poslst[2][0])
        for term in seg_lst:
            if position_word_dct[poslst[1][0]]['pos2'].has_key(term):
                position_word_dct[poslst[1][0]]['pos2'][term] += 1
            else:
                position_word_dct[poslst[1][0]]['pos2'][term] = 1
            position_word_dct[poslst[1][0]]['pos2']['total'] += 1
        position_word_dct[poslst[1][0]]['total'] += 1
        position_word_dct['total'] += 1
    
    utils.store_rst(position_word_dct, 'position_word')
                
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 5
0
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    #sql = 'select userid from jobs_uinfotest'
    import pdb
    pdb.set_trace()
    position_dct = {}
    
    sql = 'select count(industry), industry from work_size group by industry'
    cur.execute(sql)
    positionlst = cur.fetchall()
    for industry in positionlst: 
        position_dct[industry[1]] = []
        sq = 'select count(position_name) as nu, position_name from work_size where industry = "%s" group by position_name order by nu desc limit 1' % industry[1]
        cur.execute(sq)
        indusp = cur.fetchall()
        position_dct[industry[1]].append(indusp[0][1])
        position_dct[industry[1]].append(float(indusp[0][0])/industry[0])
    utils.store_rst(position_dct, 'industryr.txt')
    print position_dct
    conn.commit()
    conn.close()
    end = time.clock()
    print (end-start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
    
Esempio n. 6
0
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    #sql = 'select userid from jobs_uinfotest'
    majorsql = 'select distinct(shortmar) from jobs_uinfo'
    sqlts = 'select distinct(shortmar) from jobs_uinfotest'
    cur.execute(majorsql)
    majorlst = cur.fetchall()
    cur.execute(sqlts)
    sharedct = {}
    shortmar = cur.fetchall()
    majordct = {}
    for major in majorlst:
        majordct[major[0]] = 1
    for major in shortmar:
        if majordct.has_key(major[0]):
            sharedct[major[0]] = 1
            print major[0]
        
    print len(sharedct)
    store_rst(sharedct, 'sharemajor')
    conn.commit()
    conn.close()
    end = time.clock()
    print (end-start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
    
Esempio n. 7
0
    return dataset,features

if __name__ == '__main__':
    # Script entry point: builds `tree_num` trees, each on `feature_num`
    # feature indices chosen by generate_feature_index, runs every tree on the
    # test data and stores the combined result under 'salary'.
    # NOTE(review): the helpers (get_labels, format_data, generate_*,
    # tree.rand, tree.rand_test, store_rst) are defined outside this view;
    # their exact semantics should be confirmed there.
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    train_file = 'd:/jobs/dctree/random/sal-train.csv'
    test_file = 'd:/jobs/dctree/random/sal-test.csv'

    # `labels` is rebound on every loop iteration below; the call is kept in
    # case get_labels has side effects.
    labels = get_labels(train_file, 14)
    train_dataset, train_features = format_data(train_file)
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    result = []
    # The leftover pdb.set_trace() breakpoint was removed here: it stopped the
    # batch run at an interactive debugger prompt before any tree was built.
    for j in range(tree_num):
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        # Single-argument print(...) outputs the same text on Python 2.
        print(features_index)
        print(features)
        train_set, labels = generate_train_file(train_dataset, features_index)
        decesion_tree = tree.rand(train_set, features, labels, 0.0005)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)

    finalrst = generate_result(result)
    store_rst(finalrst, 'salary')
    
end = time.clock()
print (end - start)
Esempio n. 8
0
        if len(decesion_tree[first_fea][test_data[fea_index]]) == 0:
            return test_data
        return classify_prun(decesion_tree[first_fea][test_data[fea_index]],
                             features, test_data)
    else:
        return decesion_tree[first_fea][test_data[fea_index]]


def classify_t(decesion_tree, features, test_data, mean_values=None):
    """Classify one sample by walking a nested-dict decision tree.

    Each node is a dict whose first key names the feature tested at that
    node; the value under that key maps feature values to either another
    node (a dict) or a leaf label.

    Args:
        decesion_tree: nested dict describing the tree.
        features: sequence of feature names; the position of a name is the
            position of its value in ``test_data``.
        test_data: sequence of feature values for the sample.
        mean_values: unused; kept so existing callers keep working.

    Returns:
        The leaf label reached, or ``1`` when the sample's value for the
        tested feature has no branch in the tree.

    Raises:
        IndexError: if a node dict is empty.
        ValueError: if the tested feature is not in ``features``.
    """
    # list(d)[0] yields the same key as d.keys()[0] on Python 2 and also
    # works on Python 3, where keys() is a non-indexable view.
    first_fea = list(decesion_tree)[0]
    fea_index = features.index(first_fea)
    branches = decesion_tree[first_fea]
    value = test_data[fea_index]
    if value not in branches:
        # Unseen feature value: fall back to the default label 1.
        return 1
    child = branches[value]
    # isinstance (not an exact type check) so dict subclasses such as
    # OrderedDict are descended into instead of being returned as labels.
    if isinstance(child, dict):
        return classify_t(child, features, test_data)
    return child


if __name__ == '__main__':
    # Script entry point: reads a tree with read_tree('salary_tree'), formats
    # the hard-coded test CSV, hands both to prun(), stores the returned tree
    # under 'prunsalary_tree' via utils.store_rst and prints it.
    # NOTE(review): read_tree, format_data and prun are defined outside this
    # view; presumably prun() prunes the tree against the dataset -- confirm.
    # The command-line argument check below is disabled; the path is fixed.
    #if len(sys.argv) != 3:
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    test_file = 'd:/jobs/dctree/salary/test.csv'
    decesion_tree = read_tree('salary_tree')
    dataset, features = format_data(test_file)
    prun_tree = prun(decesion_tree, features, dataset)
    utils.store_rst(prun_tree, 'prunsalary_tree')
    print prun_tree
Esempio n. 9
0
    sql = 'select industry, position_name from work_sizetest'
    cur.execute(sql)
    workprobdct = utils.read_rst('workprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []

    for j in xrange(20000):
        tworks = worklst[i:i + 2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
            #             pdb.set_trace()
            position_prob[key] = get_position_prob(key, workprobdct, tworks)
#         pdb.set_trace()
        sortedprob = sorted(position_prob.iteritems(),
                            key=lambda jj: jj[1],
                            reverse=True)
        #         for prob in sortedprob:
        #             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])

    utils.store_rst(result, 'position13')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 10
0
    for j in xrange(70000):
        poslst = positionlst[j:j + 3]
        j += 3
        if not position_dct.has_key(poslst[1][0]):
            continue
        seg_lst = jieba.cut(poslst[0][0])
        for term in seg_lst:
            if position_word_dct[poslst[1][0]]['pos1'].has_key(term):
                position_word_dct[poslst[1][0]]['pos1'][term] += 1
            else:
                position_word_dct[poslst[1][0]]['pos1'][term] = 1
            position_word_dct[poslst[1][0]]['pos1']['total'] += 1
        seg_lst = jieba.cut(poslst[2][0])
        for term in seg_lst:
            if position_word_dct[poslst[1][0]]['pos2'].has_key(term):
                position_word_dct[poslst[1][0]]['pos2'][term] += 1
            else:
                position_word_dct[poslst[1][0]]['pos2'][term] = 1
            position_word_dct[poslst[1][0]]['pos2']['total'] += 1
        position_word_dct[poslst[1][0]]['total'] += 1
        position_word_dct['total'] += 1

    utils.store_rst(position_word_dct, 'position_word')

    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 11
0
    sql = 'select industry, salary from work_sizetest'
    cur.execute(sql)
    salaryprobdct = utils.read_rst('salaryprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []

    for j in xrange(20000):
        salarys = worklst[i:i + 2]
        i += 2
        salary_prob = {}
        for key in range(7):
            #             pdb.set_trace()
            salary_prob[key] = get_salary_prob(key, salaryprobdct, salarys)
#         pdb.set_trace()
        sortedprob = sorted(salary_prob.iteritems(),
                            key=lambda jj: jj[1],
                            reverse=True)
        #         for prob in sortedprob:
        #             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])

    utils.store_rst(result, 'salary')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 12
0
    keyshare = 0
    keynos = 0
    keysharedct = copy.deepcopy(position_dct)
    for key in tst_dict.keys():
        if positiondct.has_key(key):
            keyshare += 1
            if not position_dct.has_key(key):
                keysharedct[key] = keynm
                keynm += 1
        else:
            keynos += 1
            print key
    for industry in industrydct.keys():
        keysharedct[industry] = keynm
        keynm += 1
    utils.store_rst(keysharedct, 'keyshare')
    
    print 'poslenght : %d' % len(positiondct)
    print 'tstlenght : %d' % len(tst_dict)
    print 'share : %d' % share
    print 'nos : %d' % nos
    print 'keyshare : %d' % keyshare
    print 'keynos : %d' % keynos
    conn.commit()
    conn.close()
    end = time.clock()
#     print (end-start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
Esempio n. 13
0
    cur.execute('set character_set_server=utf8')
    sql = 'select userid, size, salary from work_sizetest ;'
    cur.execute(sql)
    worksizelst = cur.fetchall()
    userid = ''
    worklst = []
    salarylst = []
    resultst = []
    pdb.set_trace()
    for work_size in worksizelst:
        if len(worklst) < 2:
            worklst.append(work_size[1])
            salarylst.append(work_size[2])
        else:
            worklst = sorted(worklst)
            salarylst = sorted(salarylst)
            resultst.append([random.randint(worklst[0], worklst[1]), random.randint(salarylst[0], salarylst[1])])
            worklst = [work_size[1]]
            salarylst = [work_size[2]]
    worklst = sorted(worklst)
    salarylst = sorted(salarylst)
    resultst.append([random.randint(worklst[0], worklst[1]), random.randint(salarylst[0], salarylst[1])])
    pdb.set_trace()
    utils.store_rst(resultst, 'wsresult.txt')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 14
0
        tfidf_dct[keylst[j]][2] = np.array(tfidf_dct[keylst[j]][2])
        tfidf_dct[keylst[j]][2] = np.log(tfidf_dct[keylst[j]][2])
        tfidf_dct[keylst[j]][1] = tfidf_dct[keylst[j]][1] * tfidf_dct[keylst[j]][2]

    tfidf_sort_dct = {}

    for key in keylst:
        #         print '===================',key,'==================='
        tfidf_sort_dct[key] = {}
        for i in range(len(tfidf_dct[key][0])):
            tfidf_sort_dct[key][tfidf_dct[key][0][i]] = tfidf_dct[key][1][i]
    #             print tfidf_dct[key][0][i],':',tfidf_dct[key][1][i]

    for key in tfidf_sort_dct.keys():
        tfidf_sort_dct[key] = sorted(tfidf_sort_dct[key].items(), key=lambda item: item[1], reverse=True)
        print "==============================", key, "============================================"
        for index, item in enumerate(tfidf_sort_dct[key]):
            print item[0], ":", item[1]
            if index > 50:
                break

    utils.store_rst(tfidf_sort_dct, "tfidf_dct")

    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 15
0
            salaryprobdct[works[1][1]]['salary2'][works[2][1]] = 1
        if salaryprobdct[works[1][1]]['industry1'].has_key(works[0][0]):
            salaryprobdct[works[1][1]]['industry1'][works[0][0]] += 1
        else:
            salaryprobdct[works[1][1]]['industry1'][works[0][0]] = 1
        if salaryprobdct[works[1][1]]['industry2'].has_key(works[2][0]):
            salaryprobdct[works[1][1]]['industry2'][works[2][0]] += 1
        else:
            salaryprobdct[works[1][1]]['industry2'][works[2][0]] = 1
        salaryprobdct[works[1][1]]['total'] += 1
    salaryprobdct['total'] = t
    for key in range(7):
        salaryprobdct[key]['salary1']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['salary1'].itervalues())
        salaryprobdct[key]['salary2']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['salary2'].itervalues())
        salaryprobdct[key]['industry1']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['industry1'].itervalues())
        salaryprobdct[key]['industry2']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['industry2'].itervalues())
    pdb.set_trace()
    for key in salaryprobdct.keys():
        if key == 'total':
            continue
        print str(key) +str(salaryprobdct[key]['salary1']['total'])
    
    utils.store_rst(salaryprobdct, 'salaryprobdct')
        
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 16
0
#             workprobdct[works[1][1]]['pos2'][works[2][1]] = 1
#         if workprobdct[works[1][1]]['industry1'].has_key(works[0][0]):
#             workprobdct[works[1][1]]['industry1'][works[0][0]] += 1
#         else:
#             workprobdct[works[1][1]]['industry1'][works[0][0]] = 1
#         if workprobdct[works[1][1]]['industry2'].has_key(works[2][0]):
#             workprobdct[works[1][1]]['industry2'][works[2][0]] += 1
#         else:
#             workprobdct[works[1][1]]['industry2'][works[2][0]] = 1
        workprobdct[works[1][1]]['total'] += 1
    workprobdct['total'] = t
    for key in position_dct.keys():
        workprobdct[key]['pos1']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['pos1'].itervalues())
        workprobdct[key]['pos2']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['pos2'].itervalues())
        workprobdct[key]['industry1']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['industry1'].itervalues())
        workprobdct[key]['industry2']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['industry2'].itervalues())
#     pdb.set_trace()
    for key in workprobdct.keys():
        if key == 'total':
            continue
        print key +str(workprobdct[key]['pos1']['total'])
    
    utils.store_rst(workprobdct, 'workletterprobdct')
        
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 17
0
        tfidf_dct[
            keylst[j]][1] = tfidf_dct[keylst[j]][1] * tfidf_dct[keylst[j]][2]

    tfidf_sort_dct = {}

    for key in keylst:
        #         print '===================',key,'==================='
        tfidf_sort_dct[key] = {}
        for i in range(len(tfidf_dct[key][0])):
            tfidf_sort_dct[key][tfidf_dct[key][0][i]] = tfidf_dct[key][1][i]
#             print tfidf_dct[key][0][i],':',tfidf_dct[key][1][i]

    for key in tfidf_sort_dct.keys():
        tfidf_sort_dct[key] = sorted(tfidf_sort_dct[key].items(),
                                     key=lambda item: item[1],
                                     reverse=True)
        print '==============================', key, '============================================'
        for index, item in enumerate(tfidf_sort_dct[key]):
            print item[0], ':', item[1]
            if index > 50:
                break

    utils.store_rst(tfidf_sort_dct, 'tfidf_dct')

    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 18
0
    return dataset,features

if __name__ == '__main__':
    # Script entry point: builds `tree_num` trees, each on `feature_num`
    # feature indices chosen by generate_feature_index, runs every tree on the
    # test data and stores the combined result under 'finalrut'.
    # NOTE(review): the helpers (get_labels, format_data, generate_*,
    # tree.rand, tree.rand_test, store_rst) are defined outside this view;
    # their exact semantics should be confirmed there.
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    train_file = 'd:/jobs/dctree/random/train.csv'
    test_file = 'd:/jobs/dctree/random/test.csv'
    
    # `labels` is rebound on every loop iteration below.
    labels = get_labels(train_file,7)
    train_dataset, train_features = format_data(train_file)
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    # One rand_test output per tree is collected here.
    result = []

    for j in range(tree_num):
        # Pick this tree's feature indices and the matching feature names.
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        print features_index
        print features
        train_set, labels = generate_train_file(train_dataset, features_index)
        # NOTE(review): meaning of the 0.001 argument is not visible here --
        # presumably a split/stop threshold; confirm in tree.rand.
        decesion_tree = tree.rand(train_set, features, labels, 0.001)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)
    
    # Combine the per-tree outputs and persist the final result.
    finalrst = generate_result(result)
    store_rst(finalrst, 'finalrut')
    
end = time.clock()
print (end - start)
Esempio n. 19
0
    cur.execute("set character_set_database=utf8")
    cur.execute("set character_set_results=utf8")
    cur.execute("set character_set_server=utf8")

    sql = "select position_name from work_sizetest"
    cur.execute(sql)
    wordprobdct = utils.read_rst("position_word")
    wordlst = cur.fetchall()
    i = 0
    result = []
    #     pdb.set_trace()
    for j in xrange(20000):
        tworks = wordlst[i : i + 2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
            position_prob[key] = get_position_prob(key, wordprobdct, tworks)
        sortedprob = sorted(position_prob.iteritems(), key=lambda jj: jj[1], reverse=True)
        #         for prob in sortedprob:
        #             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])

    utils.store_rst(result, "positionlet")
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 20
0
    cur.execute('set character_set_server=utf8')
    
    sql = 'select industry, position_name from work_sizetest'
    cur.execute(sql)
    workprobdct = utils.read_rst('workletterprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []
    
    for j in xrange(20000):
        tworks = worklst[i:i+2]
        i += 2
        position_prob = {}
        pdb.set_trace()
        for key in position_dct.keys():
#             pdb.set_trace()
            position_prob[key] = get_position_prob(key, workprobdct, tworks)
#         pdb.set_trace()
        sortedprob = sorted(position_prob.iteritems(), key=lambda jj:jj[1], reverse=True)
#         for prob in sortedprob:
#             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])
        
    utils.store_rst(result, 'positionlet')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 21
0
                letter_dct[poslst[1][0]][0].append(term)
            if not letterdct.has_key(term):
                letterdct[term] = 1
            else:
                letterdct[term] += 1
        seg_lst = jieba.cut(poslst[2][0])
        for term in seg_lst:
            if len(term) > 1:
                letter_dct[poslst[1][0]][1].append(term)
            if not letterdct.has_key(term):
                letterdct[term] = 1
            else:
                letterdct[term] += 1

    positions = letterdct.keys()
    positions_num = []
#     for letter in positions:
#         print letter
#         if letter == '\\':
#             continue
#         sq = 'insert into letter(name, type, num, stopped) values ("%s", "%s", %d, 0)' % (letter, 'work_size', letterdct[letter])
#         print sq
#         cur.execute(sq)
    utils.store_rst(letter_dct, 'letterdct')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 22
0
    usertst = cur.fetchall()
    i = 0
    for user in usertst:
        if majordct.has_key(user[1]):
            majorat = majordct[user[1]]
            mnu = max(majorat)
            index = majorat.index(mnu)
            resultlst.append(index)
        else:
            resultlst.append(0)
        i = i + 1
#     for user in usertst:
#         if majordct.has_key(user[1]):
#             majorat = majordct[user[1]]
#             mnu = max(majorat)
#             if mnu >= 0.8:
#                 index = majorat.index(mnu)
#                 resultlst.append(index)
#             else:
#                 resultlst.append(treelst[i])
#         else:
#             resultlst.append(treelst[i])
#         i = i + 1
    utils.store_rst(resultlst, 'degree.txt')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 23
0
    salarylst = []
    resultst = []
    pdb.set_trace()
    for work_size in worksizelst:
        if len(worklst) < 2:
            worklst.append(work_size[1])
            salarylst.append(work_size[2])
        else:
            worklst = sorted(worklst)
            salarylst = sorted(salarylst)
            resultst.append([
                random.randint(worklst[0], worklst[1]),
                random.randint(salarylst[0], salarylst[1])
            ])
            worklst = [work_size[1]]
            salarylst = [work_size[2]]
    worklst = sorted(worklst)
    salarylst = sorted(salarylst)
    resultst.append([
        random.randint(worklst[0], worklst[1]),
        random.randint(salarylst[0], salarylst[1])
    ])
    pdb.set_trace()
    utils.store_rst(resultst, 'wsresult.txt')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 24
0
        else:
            shortmar_dusdct[shortmar[0]] = {}
            shortmar_dusdct[shortmar[0]][shortmar[1]] = []
#     for shortmar in shortmartestlst:
#         if shortmar_dusdct.has_key(shortmar[0]):
#             if not shortmar_dusdct[shortmar[0]][shortmar[1]]:
#                 shortmar_dusdct[shortmar[0]][shortmar[1]] = ''
#         else:
#             shortmar_dusdct[shortmar[0]] = {}
#             shortmar_dusdct[shortmar[0]][shortmar[1]] = ''
    pdb.set_trace()
    for keym in shortmar_dusdct.keys():
        for key in shortmar_dusdct[keym].keys():
            sqlkey = 'select position_name, count(position_name) as nu from workexperience\
                                            where major = "%s" and industry = "%s" order by nu desc limit 5' % (keym, key)
            cur.execute(sqlkey)
            poslst = cur.fetchall()
            for pos in poslst:
                shortmar_dusdct[keym][key].append(pos[0])
    
    store_rst(shortmar_dusdct, 'shortmar_dusdct')
    
    conn.commit()
    conn.close()
    file.close()
except Exception as e:
    file.close()
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 25
0
                        else:
                            result.append(postdct[key][0])
                        flag = True
                        i += 1
                        break
                if not flag:
                    result.append('test')
                    l += 1
            flag = False
            positions = [position[0]]
    if position_dct.has_key(positions[0]):
        result.append(positions[0])
    elif position_dct.has_key(positions[1]):
        result.append(positions[1])
    else:
        result.append('test')

    print i
    print p
    print len(result)
    utils.store_rst(result, 'position.txt')
    print l
    conn.commit()
    conn.close()
    end = time.clock()
    print(end - start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
Esempio n. 26
0
    return dataset, features

if __name__ == '__main__':
    # Script entry point: builds `tree_num` trees, each on `feature_num`
    # feature indices chosen by generate_feature_index, runs every tree on the
    # test data and stores the combined result under 'finalrut'.
    # NOTE(review): the helpers (get_labels, format_data, generate_*,
    # tree.rand, tree.rand_test, store_rst) are defined outside this view;
    # their exact semantics should be confirmed there.
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    train_file = 'd:/jobs/dctree/random/train.csv'
    test_file = 'd:/jobs/dctree/random/test.csv'

    # `labels` is rebound on every loop iteration below.
    labels = get_labels(train_file, 7)
    train_dataset, train_features = format_data(train_file)
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    # One rand_test output per tree is collected here.
    result = []

    for j in range(tree_num):
        # Pick this tree's feature indices and the matching feature names.
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        print features_index
        print features
        train_set, labels = generate_train_file(train_dataset, features_index)
        # NOTE(review): meaning of the 0.001 argument is not visible here --
        # presumably a split/stop threshold; confirm in tree.rand.
        decesion_tree = tree.rand(train_set, features, labels, 0.001)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)

    # Combine the per-tree outputs and persist the final result.
    finalrst = generate_result(result)
    store_rst(finalrst, 'finalrut')

end = time.clock()
print(end - start)
Esempio n. 27
0
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    #sql = 'select userid from jobs_uinfotest'
    majorsql = 'select distinct(industry) from workexperience'
    sqlts = 'select distinct(industry) from workexperiencetest'
    cur.execute(majorsql)
    majorlst = cur.fetchall()
    cur.execute(sqlts)
    sharedct = {}
    shortmar = cur.fetchall()
    majordct = {}
    for major in majorlst:
        majordct[major[0]] = 1
    for major in shortmar:
        if majordct.has_key(major[0]):
            sharedct[major[0]] = 1
            #print major[0]
        else:
            print major[0]
        
    print len(sharedct)
    store_rst(sharedct, 'shareindustry')
    conn.commit()
    conn.close()
    end = time.clock()
    print (end-start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
    
Esempio n. 28
0
    fea_index = features.index(first_fea)
    if not decesion_tree[first_fea].has_key(test_data[fea_index]):
        return None
    if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType:
            if len(decesion_tree[first_fea][test_data[fea_index]]) == 0:
                return test_data
            return classify_prun(decesion_tree[first_fea][test_data[fea_index]], features, test_data)
    else:
        return decesion_tree[first_fea][test_data[fea_index]]

def classify_t(decesion_tree, features, test_data, mean_values=None):
    """Classify one sample by walking a nested-dict decision tree.

    Each level of the tree is a dict whose first key is a feature name. That
    key maps to a dict from feature values to either a nested subtree (a
    dict) or a leaf label (anything else).

    Args:
        decesion_tree: nested dict ``{feature: {value: subtree_or_label}}``.
        features: sequence of feature names; the position of a name gives the
            index of that feature's value in ``test_data``.
        test_data: sequence of feature values for the sample being classified.
        mean_values: accepted for interface compatibility; not used here.

    Returns:
        The leaf label reached by following the sample's values, or ``1``
        when the tree has no branch for one of the sample's values.

    Raises:
        IndexError: if a (sub)tree is an empty dict.
        ValueError: if a feature named in the tree is missing from
            ``features``.
    """
    # list(...)[0] picks the same key as the old ``keys()[0]`` and raises the
    # same IndexError on an empty dict, but also works when keys() is a view.
    first_fea = list(decesion_tree)[0]
    branches = decesion_tree[first_fea]
    value = test_data[features.index(first_fea)]

    if value not in branches:
        # No branch for this value: fall back to the fixed label 1.
        return 1

    node = branches[value]
    if isinstance(node, dict):
        # Inner node: keep descending with the same sample.
        return classify_t(node, features, test_data, mean_values)
    return node
    
if __name__ == '__main__':
    # Script entry point: read a tree via read_tree, format the test CSV,
    # pass both to prun, then store and print whatever prun returns.
    #
    # The command-line usage check below is disabled; the input path is
    # hard-coded instead.
    #if len(sys.argv) != 3:
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    # NOTE(review): absolute path on a local Windows drive -- the script only
    # runs on a machine that has this file.
    test_file = 'd:/jobs/dctree/salary/test.csv'
    # presumably 'salary_tree' names a tree stored by an earlier run -- TODO confirm
    decesion_tree = read_tree('salary_tree')
    dataset, features = format_data(test_file)
    # presumably prun prunes the tree against the test data -- verify in prun
    prun_tree = prun(decesion_tree, features, dataset)
    utils.store_rst(prun_tree, 'prunsalary_tree')
    print prun_tree
Esempio n. 29
0
    keyshare = 0
    keynos = 0
    keysharedct = copy.deepcopy(position_dct)
    for key in tst_dict.keys():
        if positiondct.has_key(key):
            keyshare += 1
            if not position_dct.has_key(key):
                keysharedct[key] = keynm
                keynm += 1
        else:
            keynos += 1
            print key
    for industry in industrydct.keys():
        keysharedct[industry] = keynm
        keynm += 1
    utils.store_rst(keysharedct, 'keyshare')

    print 'poslenght : %d' % len(positiondct)
    print 'tstlenght : %d' % len(tst_dict)
    print 'share : %d' % share
    print 'nos : %d' % nos
    print 'keyshare : %d' % keyshare
    print 'keynos : %d' % keynos
    conn.commit()
    conn.close()
    end = time.clock()
#     print (end-start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
Esempio n. 30
0
#                         ratio2 = industryr[industrys[1]][1]
#                     else:
#                         ratio2 = 0
#                     if ratio1 >= ratio2:
#                         if industryr.has_key(industrys[0]):
#                             result.append(industryr[industrys[0]][0])
#                         else:
#                             result.append('test')
#                     else:
#                         result.append(industryr[industrys[1]][0])
            flag = False
            positions = [position[0]]
            industrys = [position[1]]
    if position_dct.has_key(positions[0]): #handle the last 
        result.append(positions[0])
    elif position_dct.has_key(positions[1]):
        result.append(positions[1])
    else:
        result.append('test')
        
    print len(result)
    utils.store_rst(result, 'position.txt')
    conn.commit()
    conn.close()
    end = time.clock()
    print (end-start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
    
Esempio n. 31
0
            salaryprobdct[works[1][1]]['industry1'][works[0][0]] = 1
        if salaryprobdct[works[1][1]]['industry2'].has_key(works[2][0]):
            salaryprobdct[works[1][1]]['industry2'][works[2][0]] += 1
        else:
            salaryprobdct[works[1][1]]['industry2'][works[2][0]] = 1
        salaryprobdct[works[1][1]]['total'] += 1
    salaryprobdct['total'] = t
    for key in range(7):
        salaryprobdct[key]['salary1']['total'] = reduce(
            lambda x, y: x + y, salaryprobdct[key]['salary1'].itervalues())
        salaryprobdct[key]['salary2']['total'] = reduce(
            lambda x, y: x + y, salaryprobdct[key]['salary2'].itervalues())
        salaryprobdct[key]['industry1']['total'] = reduce(
            lambda x, y: x + y, salaryprobdct[key]['industry1'].itervalues())
        salaryprobdct[key]['industry2']['total'] = reduce(
            lambda x, y: x + y, salaryprobdct[key]['industry2'].itervalues())
    pdb.set_trace()
    for key in salaryprobdct.keys():
        if key == 'total':
            continue
        print str(key) + str(salaryprobdct[key]['salary1']['total'])

    utils.store_rst(salaryprobdct, 'salaryprobdct')

    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)
Esempio n. 32
0
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    
    sql = 'select industry, salary from work_sizetest'
    cur.execute(sql)
    salaryprobdct = utils.read_rst('salaryprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []
    
    for j in xrange(20000):
        salarys = worklst[i:i+2]
        i += 2
        salary_prob = {}
        for key in range(7):
#             pdb.set_trace()
            salary_prob[key] = get_salary_prob(key, salaryprobdct, salarys)
#         pdb.set_trace()
        sortedprob = sorted(salary_prob.iteritems(), key=lambda jj:jj[1], reverse=True)
#         for prob in sortedprob:
#             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])
        
    utils.store_rst(result, 'salary')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 33
0
    usertst = cur.fetchall()
    i = 0
    for user in usertst:
        if majordct.has_key(user[1]):
            majorat = majordct[user[1]]
            mnu = max(majorat)
            index = majorat.index(mnu)
            resultlst.append(index)
        else:
            resultlst.append(0)
        i = i + 1
#     for user in usertst:
#         if majordct.has_key(user[1]):
#             majorat = majordct[user[1]]
#             mnu = max(majorat)
#             if mnu >= 0.8:
#                 index = majorat.index(mnu)
#                 resultlst.append(index)
#             else:
#                 resultlst.append(treelst[i])
#         else:
#             resultlst.append(treelst[i])
#         i = i + 1
    utils.store_rst(resultlst, 'degree.txt')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 34
0
                        ratio2 = industryr[industrys[1]][1]
                    else:
                        ratio2 = 0
                    if ratio1 >= ratio2:
                        if industryr.has_key(industrys[0]):
                            result.append(industryr[industrys[0]][0])
                        else:
                            result.append("test")
                    else:
                        result.append(industryr[industrys[1]][0])
            flag = False
            positions = [position[0]]
            industrys = [position[1]]
    if position_dct.has_key(positions[0]):  # handle the last
        result.append(positions[0])
    elif position_dct.has_key(positions[1]):
        result.append(positions[1])
    else:
        result.append("test")

    print len(result)
    utils.store_rst(result, "position.txt")
    conn.commit()
    conn.close()
    end = time.clock()
    print (end - start)
except Exception as e:
    conn.commit()
    conn.close()
    print e
Esempio n. 35
0
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    
    sql = 'select industry, position_name from work_sizetest'
    cur.execute(sql)
    workprobdct = utils.read_rst('workprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []
    
    for j in xrange(20000):
        tworks = worklst[i:i+2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
#             pdb.set_trace()
            position_prob[key] = get_position_prob(key, workprobdct, tworks)
#         pdb.set_trace()
        sortedprob = sorted(position_prob.iteritems(), key=lambda jj:jj[1], reverse=True)
#         for prob in sortedprob:
#             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])
        
    utils.store_rst(result, 'position13')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print (end - start)
Esempio n. 36
0
if __name__ == "__main__":
    # Script entry point: build `tree_num` trees, each on `feature_num`
    # features chosen by generate_feature_index, run every tree on the test
    # set, then combine the per-tree results with generate_result and store
    # the outcome.
    # NOTE(review): looks like a random-forest style ensemble -- confirm
    # against tree.rand / generate_result.
    #
    # Leftover of a disabled command-line usage check:
    #    print "please use: python decision.py train_file test_file"
    #    sys.exit()
    # NOTE(review): both paths are hard-coded to a local Windows drive.
    train_file = "d:/jobs/dctree/random/degree-train.csv"
    test_file = "d:/jobs/dctree/random/degree-test.csv"

    # `labels` is rebound inside the loop before it is read anywhere in this
    # block, so this initial value is not used here.
    labels = get_labels(train_file, 0)
    train_dataset, train_features = format_data(train_file)
    # `test_features` is not read anywhere in this block.
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    # One entry per tree: whatever tree.rand_test returns for the test set.
    result = []

    for j in range(tree_num):
        # Choose this tree's feature indices and map them to feature names.
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        # Printed on every iteration, i.e. two output lines per tree.
        print features_index
        print features
        train_set, labels = generate_train_file(train_dataset, features_index)
        # NOTE(review): meaning of the 0.0005 argument is defined by
        # tree.rand (presumably a stopping threshold) -- TODO confirm.
        decesion_tree = tree.rand(train_set, features, labels, 0.0005)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)

    # Combine the per-tree results into one final result and store it.
    finalrst = generate_result(result)
    store_rst(finalrst, "finalrut")

end = time.clock()
print (end - start)
Esempio n. 37
0
                letter_dct[poslst[1][0]][0].append(term)
            if not letterdct.has_key(term):
                letterdct[term] = 1
            else:
                letterdct[term] += 1
        seg_lst = jieba.cut(poslst[2][0])
        for term in seg_lst:
            if len(term) > 1:
                letter_dct[poslst[1][0]][1].append(term)
            if not letterdct.has_key(term):
                letterdct[term] = 1
            else:
                letterdct[term] += 1

    positions = letterdct.keys()
    positions_num = []
    #     for letter in positions:
    #         print letter
    #         if letter == '\\':
    #             continue
    #         sq = 'insert into letter(name, type, num, stopped) values ("%s", "%s", %d, 0)' % (letter, 'work_size', letterdct[letter])
    #         print sq
    #         cur.execute(sq)
    utils.store_rst(letter_dct, 'letterdct')
    conn.commit()
    conn.close()
except Exception as e:
    conn.close()
    print e
end = time.clock()
print(end - start)