Beispiel #1
0
from jobs import utils
import pdb

# Count the positions at which the values read via utils.read_rst('result')
# equal the labels read from test_file (presumably predictions vs. ground
# truth -- confirm against jobs.utils).
test_file = 'd:/jobs/dctree/degree/test.csv'
labels = utils.get_labels(test_file, 0)
# The interactive pdb.set_trace() breakpoint that used to sit here was removed
# so the script can run unattended.
plabel = utils.read_rst('result')
# Index into plabel (instead of zip) so that a result list shorter than
# labels still raises IndexError rather than being silently truncated.
rst = sum(1 for i, label in enumerate(labels) if plabel[i] == label)

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
Beispiel #2
0
from jobs import utils
import pdb
#pdb.set_trace()

# Score each loaded result list against the labels read from test_file and
# print one match count per list. Only 'finalrut' is loaded; the disabled
# loaders for the result03*/04*/08*/09* batches were dropped.
test_file = 'd:/jobs/dctree/random/test.csv'
labels = utils.get_labels(test_file, 7)
plabel = [utils.read_rst('finalrut')]

# One counter per result list.
rst = [0] * len(plabel)
for i, label in enumerate(labels):
    for j, predicted in enumerate(plabel):
        if predicted[i] == label:
            rst[j] += 1

print(rst)
Beispiel #3
0
try:
    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    file = open('d:/jobs/dctree/position/train.csv', 'w+')
    sql = 'select userid, age, bstart_year, gender, shortmar from jobs_uinfo'
    cur.execute(sql)
    userdlst = cur.fetchall()
    sqlze = 'select wk.salary, wk.industry, wk.position_name from work_size as wk'
    position_dct = get_position_meta()
    major_dct = read_rst('sharemajor')
    cur.execute(sqlze)
    sizelst = cur.fetchall()
    i = 0
    for userd in userdlst:
        sizes = sizelst[i:i+3]
        i += 3
        if not position_dct.has_key(sizes[1][2]):
            continue
#         print userd
        userid = []
        userd = list(userd)
        if int(userd[1]) <= 20:
            userd[1] = '18'
        elif int(userd[1]) >= 60:
            userd[1] = '60'
Beispiel #4
0
                        passwd='123456',
                        db='jobs',
                        use_unicode=True,
                        charset='utf8')
 cur = conn.cursor()
 cur.execute('set character_set_client=utf8')
 cur.execute('set character_set_connection=utf8')
 cur.execute('set character_set_database=utf8')
 cur.execute('set character_set_results=utf8')
 cur.execute('set character_set_server=utf8')
 sql = 'select name, degreer0, degreer1, degreer2 from major ;'
 cur.execute(sql)
 majorlst = cur.fetchall()
 majordct = {}
 pdb.set_trace()
 treelst = utils.read_rst('result.txt')
 resultlst = []
 for major in majorlst:
     majordct[major[0]] = [major[1], major[2], major[3]]
 sq = 'select userid, major from jobs_uinfotest'
 cur.execute(sq)
 usertst = cur.fetchall()
 i = 0
 for user in usertst:
     if majordct.has_key(user[1]):
         majorat = majordct[user[1]]
         mnu = max(majorat)
         index = majorat.index(mnu)
         resultlst.append(index)
     else:
         resultlst.append(0)
Beispiel #5
0
from jobs import utils
import pdb
# Score sixteen result files (result050..result059, result060..result065)
# against the labels read from test_file and print one match count per file.
# The leading pdb.set_trace() breakpoint was removed so the script can run
# unattended.
test_file = 'd:/jobs/dctree/size/test.csv'
labels = utils.get_labels(test_file, 5)

# File names are built exactly as before; the disabled result03*/result04*
# loaders were dropped.
result_files = ['result05' + str(j) + '.txt' for j in range(10)]
result_files += ['result06' + str(j) + '.txt' for j in range(6)]
plabel = [utils.read_rst(name) for name in result_files]

# Size the counters from the loaded lists instead of a hard-coded 16 so the
# two can never drift apart.
rst = [0] * len(plabel)
for i, label in enumerate(labels):
    for j, predicted in enumerate(plabel):
        if predicted[i] == label:
            rst[j] += 1

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
Beispiel #6
0
from jobs import utils
import pdb
# pdb.set_trace()

# Print how many entries of 'sizeresult.txt' equal the label at the same
# position in test_file.
test_file = 'd:/jobs/dctree/size/test.csv'
labels = utils.get_labels(test_file, 5)
plabel = utils.read_rst('sizeresult.txt')

rst = sum(1 for idx, expected in enumerate(labels) if plabel[idx] == expected)

print(rst)
Beispiel #7
0
from jobs import utils
import pdb

# Print the number of positions at which 'sizeresult.txt' and
# 'sizeresult1.txt' hold equal values.
test_file = 'd:/jobs/dctree/salary/test.csv'
# NOTE(review): labels is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
labels = utils.get_labels(test_file, 6)
plabel1 = utils.read_rst('sizeresult.txt')
plabel2 = utils.read_rst('sizeresult1.txt')

rst = sum(1 for pos, first in enumerate(plabel1) if plabel2[pos] == first)

print(rst)
postdct = get_position.get_pos()

try:

    conn = MySQLdb.connect(host="localhost", user="******", passwd="123456", db="jobs", use_unicode=True, charset="utf8")
    cur = conn.cursor()
    cur.execute("set character_set_client=utf8")
    cur.execute("set character_set_connection=utf8")
    cur.execute("set character_set_database=utf8")
    cur.execute("set character_set_results=utf8")
    cur.execute("set character_set_server=utf8")
    # sql = 'select userid from jobs_uinfotest'

    #     pdb.set_trace()
    position_dct = {}
    industryr = utils.read_rst("industryr.txt")
    with codecs.open("position_meta.txt") as file:
        lines = file.readlines()
        for linet in lines:
            line = linet[:-2]
            uline = unicode(line)
            position_dct[uline] = "1"
    sql = "select position_name, industry from work_sizetest"
    cur.execute(sql)
    positionlst = cur.fetchall()
    positions = []
    industrys = []
    result = []
    flag = False
    i = 0
    for position in positionlst:
Beispiel #9
0
    conn = MySQLdb.connect(host='localhost',
                           user='******',
                           passwd='123456',
                           db='jobs',
                           use_unicode=True,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')

    sql = 'select industry, position_name from work_sizetest'
    cur.execute(sql)
    workprobdct = utils.read_rst('workprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []

    for j in xrange(20000):
        tworks = worklst[i:i + 2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
            #             pdb.set_trace()
            position_prob[key] = get_position_prob(key, workprobdct, tworks)
#         pdb.set_trace()
        sortedprob = sorted(position_prob.iteritems(),
                            key=lambda jj: jj[1],
                            reverse=True)
Beispiel #10
0
#coding:utf8
import sys
from jobs import utils
reload(sys)
import pdb
sys.setdefaultencoding('utf8')

# Print the number of positions at which 'position.txt' and 'positionlet'
# hold equal values.
test_file = 'd:/jobs/dctree/salary/test.csv'
# NOTE(review): labels is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
labels = utils.get_labels(test_file, 6)
plabel1 = utils.read_rst('position.txt')
plabel2 = utils.read_rst('positionlet')

rst = sum(1 for pos, first in enumerate(plabel1) if plabel2[pos] == first)

print(rst)
Beispiel #11
0
    conn = MySQLdb.connect(host='localhost',
                           user='******',
                           passwd='123456',
                           db='jobs',
                           use_unicode=True,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')

    sql = 'select industry, salary from work_sizetest'
    cur.execute(sql)
    salaryprobdct = utils.read_rst('salaryprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []

    for j in xrange(20000):
        salarys = worklst[i:i + 2]
        i += 2
        salary_prob = {}
        for key in range(7):
            #             pdb.set_trace()
            salary_prob[key] = get_salary_prob(key, salaryprobdct, salarys)
#         pdb.set_trace()
        sortedprob = sorted(salary_prob.iteritems(),
                            key=lambda jj: jj[1],
                            reverse=True)
Beispiel #12
0
 cur.execute('set character_set_results=utf8')
 cur.execute('set character_set_server=utf8')
 #sql = 'select userid from jobs_uinfotest'
 
 position_dct = {}
 with codecs.open('position_meta.txt') as file:
     lines = file.readlines()
     for linet in lines:
         line = linet[:-2]
         uline = unicode(line)
         lintlst = uline.split(',')
         position_dct[lintlst[0]] = lintlst[1]
 
 file = open('d:/jobs/xgboost/data.csv', 'w+')
 pdb.set_trace()
 keyshare = utils.read_rst('keyshare')
 sql = 'select position_name, industry from work_size'
 cur.execute(sql)
 datalst = []
 positionlst = cur.fetchall()
 i = 0
 for j in xrange(70000):
     worklst = positionlst[i:i+3]
    
     i += 1
     if not position_dct.has_key(worklst[1][0]):
         continue
     else:
         rst = []
         if keyshare.has_key(worklst[0][0]):
             rst.append(keyshare[worklst[0][0]])
Beispiel #13
0
from jobs import utils
import pdb

# Count the positions at which the values read via utils.read_rst('result')
# equal the labels read from test_file (presumably predictions vs. ground
# truth -- confirm against jobs.utils).
test_file = 'd:/jobs/dctree/degree/test.csv'
labels = utils.get_labels(test_file, 0)
# The interactive pdb.set_trace() breakpoint that used to sit here was removed
# so the script can run unattended.
plabel = utils.read_rst('result')
# Index into plabel (instead of zip) so that a result list shorter than
# labels still raises IndexError rather than being silently truncated.
rst = sum(1 for i, label in enumerate(labels) if plabel[i] == label)

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
Beispiel #14
0
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    sql = 'select jb.userid,jb.degree,jb.age,jb.start_age,jb.bstart_year,jb.gender,jb.start_salary,wk.size \
                                        from jobs_uinfotest as jb left join workexperiencetest as wk on \
                                        jb.userid = wk.userid and wk.num = 1'
    cur.execute(sql)
    file = open('d:/jobs/dctree/salary/salary.csv', 'w+')
    useridlst = cur.fetchall()
#     rsultlabel = utils.read_rst('result.txt')
#     pdb.set_trace()
    #wsresult = utils.read_rst('wsresult.txt')
#     sizeresult = utils.read_rst('sizeresult.txt')
    salaryresult = utils.read_rst('salary')
#     posresult = utils.read_rst('position13')
    #degreelst = utils.read_rst('degree.txt')
#     degreelst = utils.read_rst('result1.txt')
    file.write('id,degree,size,salary,position_name\n')
    i = 0
#     pdb.set_trace()
    j = 0
    for userid in useridlst:
        result = []
        result.append(userid[0])
        print i
#         result.append(degreelst[i])
#         result.append(sizeresult[i])
        result.append(8)
        result.append(8)
Beispiel #15
0
#coding:utf8
import sys
from jobs import utils
reload(sys)
import pdb
sys.setdefaultencoding('utf8')

# Print how many entries of 'finalrut' equal the label at the same position
# in test_file.
test_file = 'd:/jobs/dctree/random/test.csv'
labels = utils.get_labels(test_file, 9)
# NOTE(review): plabel1 is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
plabel1 = utils.read_rst('result.txt')
plabel2 = utils.read_rst('finalrut')

rst = sum(1 for pos, expected in enumerate(labels) if plabel2[pos] == expected)

print(rst)
Beispiel #16
0
                           db='jobs',
                           use_unicode=True,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    file = open('d:/jobs/dctree/position/train.csv', 'w+')
    sql = 'select userid, age, bstart_year, gender, shortmar from jobs_uinfo'
    cur.execute(sql)
    userdlst = cur.fetchall()
    sqlze = 'select wk.salary, wk.industry, wk.position_name from work_size as wk'
    position_dct = get_position_meta()
    major_dct = read_rst('sharemajor')
    cur.execute(sqlze)
    sizelst = cur.fetchall()
    i = 0
    for userd in userdlst:
        sizes = sizelst[i:i + 3]
        i += 3
        if not position_dct.has_key(sizes[1][2]):
            continue
#         print userd
        userid = []
        userd = list(userd)
        if int(userd[1]) <= 20:
            userd[1] = '18'
        elif int(userd[1]) >= 60:
            userd[1] = '60'
Beispiel #17
0
from jobs import utils
import pdb
 #pdb.set_trace()

# Score each loaded result list against the labels read from test_file and
# print one match count per list. Only 'finalrut' is loaded; the disabled
# loaders for the result03*/04*/08*/09* batches were dropped.
test_file = 'd:/jobs/dctree/random/test.csv'
labels = utils.get_labels(test_file, 7)
plabel = [utils.read_rst('finalrut')]

# One counter per result list.
rst = [0] * len(plabel)
for i, label in enumerate(labels):
    for j, predicted in enumerate(plabel):
        if predicted[i] == label:
            rst[j] += 1

print(rst)
Beispiel #18
0
from jobs import utils
import pdb

# Print the number of positions at which 'sizeresult.txt' and
# 'sizeresult1.txt' hold equal values.
test_file = 'd:/jobs/dctree/salary/test.csv'
# NOTE(review): labels is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
labels = utils.get_labels(test_file, 6)
plabel1 = utils.read_rst('sizeresult.txt')
plabel2 = utils.read_rst('sizeresult1.txt')

rst = sum(1 for pos, first in enumerate(plabel1) if plabel2[pos] == first)

print(rst)
Beispiel #19
0
#coding:utf8

from jobs import utils
import jieba
import jieba.posseg as pseg
import os
import pdb
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
reload(sys)
sys.setdefaultencoding('utf8')

# Build a tf-idf weight matrix from the token lists stored in 'letterdct'.
letter_dct = utils.read_rst('letterdct')
tfidf_dct = {}

# Each entry contributes its key and one space-joined document built from the
# first element of its value.
entries = letter_dct.items()
keys = [entry[0] for entry in entries]
corpus = [' '.join(entry[1][0]) for entry in entries]

# CountVectorizer turns the documents into a term-frequency matrix, where
# element a[i][j] is the frequency of word j in document i.
vectorizer = CountVectorizer()
# TfidfTransformer computes the tf-idf weight of every word.
transformer = TfidfTransformer()
# First convert the text into the term-frequency matrix, then compute tf-idf
# from those counts.
term_counts = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(term_counts)
# All words in the bag-of-words vocabulary.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm the installed version.
word = vectorizer.get_feature_names()
# Dense tf-idf matrix: element a[i][j] is the tf-idf weight of word j in
# document i.
weight = tfidf.toarray()
Beispiel #20
0
    total = total*(pos1prob + pos2prob + industry1prob + industry2prob)
    
    return total

try:    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    
    sql = 'select industry, position_name from work_sizetest'
    cur.execute(sql)
    workprobdct = utils.read_rst('workprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []
    
    for j in xrange(20000):
        tworks = worklst[i:i+2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
#             pdb.set_trace()
            position_prob[key] = get_position_prob(key, workprobdct, tworks)
#         pdb.set_trace()
        sortedprob = sorted(position_prob.iteritems(), key=lambda jj:jj[1], reverse=True)
#         for prob in sortedprob:
#             print prob[0] + str(prob[1])
Beispiel #21
0
try:

    conn = MySQLdb.connect(host="localhost", user="******", passwd="123456", db="jobs", use_unicode=True, charset="utf8")
    cur = conn.cursor()
    cur.execute("set character_set_client=utf8")
    cur.execute("set character_set_connection=utf8")
    cur.execute("set character_set_database=utf8")
    cur.execute("set character_set_results=utf8")
    cur.execute("set character_set_server=utf8")
    file = open("d:/jobs/dctree/position/test.csv", "w+")
    sql = "select userid, age, bstart_year, gender, shortmar from jobs_uinfo limit 50000, 20000"
    cur.execute(sql)
    userdlst = cur.fetchall()
    sqlze = "select wk.salary, wk.industry, wk.position_name from work_size as wk limit 150000, 60000"
    position_dct = get_position_meta()
    major_dct = read_rst("sharemajor")
    cur.execute(sqlze)
    sizelst = cur.fetchall()
    i = 0
    for userd in userdlst:
        sizes = sizelst[i : i + 3]
        i += 3
        if not position_dct.has_key(sizes[1][2]):
            continue
        print userd
        userid = []
        userd = list(userd)
        if int(userd[1]) <= 20:
            userd[1] = "18"
        elif int(userd[1]) >= 60:
            userd[1] = "60"
Beispiel #22
0
    total = total*(pos1prob + pos2prob + industry1prob + industry2prob)
    
    return total

try:    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    
    sql = 'select industry, position_name from work_sizetest'
    cur.execute(sql)
    workprobdct = utils.read_rst('workletterprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []
    
    for j in xrange(20000):
        tworks = worklst[i:i+2]
        i += 2
        position_prob = {}
        pdb.set_trace()
        for key in position_dct.keys():
#             pdb.set_trace()
            position_prob[key] = get_position_prob(key, workprobdct, tworks)
#         pdb.set_trace()
        sortedprob = sorted(position_prob.iteritems(), key=lambda jj:jj[1], reverse=True)
#         for prob in sortedprob:
Beispiel #23
0
    return total


try:
    conn = MySQLdb.connect(host="localhost", user="******", passwd="123456", db="jobs", use_unicode=True, charset="utf8")
    cur = conn.cursor()
    cur.execute("set character_set_client=utf8")
    cur.execute("set character_set_connection=utf8")
    cur.execute("set character_set_database=utf8")
    cur.execute("set character_set_results=utf8")
    cur.execute("set character_set_server=utf8")

    sql = "select position_name from work_sizetest"
    cur.execute(sql)
    wordprobdct = utils.read_rst("position_word")
    wordlst = cur.fetchall()
    i = 0
    result = []
    #     pdb.set_trace()
    for j in xrange(20000):
        tworks = wordlst[i : i + 2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
            position_prob[key] = get_position_prob(key, wordprobdct, tworks)
        sortedprob = sorted(position_prob.iteritems(), key=lambda jj: jj[1], reverse=True)
        #         for prob in sortedprob:
        #             print prob[0] + str(prob[1])
        result.append(sortedprob[0][0])
Beispiel #24
0
from jobs import utils
import pdb
# Score twenty result files (result10..result19, result20..result29) against
# the labels read from test_file and print one match count per file.
# The leading pdb.set_trace() breakpoint was removed so the script can run
# unattended.
test_file = 'd:/jobs/dctree/salary/test.csv'
labels = utils.get_labels(test_file, 6)

# File names are built exactly as before; the disabled result03*/result04*
# loaders were dropped.
result_files = ['result1' + str(j) + '.txt' for j in range(10)]
result_files += ['result2' + str(j) + '.txt' for j in range(10)]
plabel = [utils.read_rst(name) for name in result_files]

# Size the counters from the loaded lists instead of a hard-coded 20 so the
# two can never drift apart.
rst = [0] * len(plabel)
for i, label in enumerate(labels):
    for j, predicted in enumerate(plabel):
        if predicted[i] == label:
            rst[j] += 1

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
Beispiel #25
0
                           user='******',
                           passwd='123456',
                           db='jobs',
                           use_unicode=True,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    #sql = 'select userid from jobs_uinfotest'

    #     pdb.set_trace()
    position_dct = {}
    industryr = utils.read_rst('industryr.txt')
    with codecs.open('position_meta.txt') as file:
        lines = file.readlines()
        for linet in lines:
            line = linet[:-2]
            uline = unicode(line)
            position_dct[uline] = '1'
    sql = 'select position_name, industry from work_sizetest'
    cur.execute(sql)
    positionlst = cur.fetchall()
    positions = []
    industrys = []
    result = []
    flag = False
    i = 0
    for position in positionlst:
Beispiel #26
0
from jobs import utils
import pdb

# Count the positions at which the values read via utils.read_rst('salary')
# equal the labels read from test_file (presumably predictions vs. ground
# truth -- confirm against jobs.utils).
test_file = 'd:/jobs/dctree/salary/test.csv'
labels = utils.get_labels(test_file, 6)
# The interactive pdb.set_trace() breakpoint that used to sit here was removed
# so the script can run unattended.
plabel = utils.read_rst('salary')
# Index into plabel (instead of zip) so that a result list shorter than
# labels still raises IndexError rather than being silently truncated.
rst = sum(1 for i, label in enumerate(labels) if plabel[i] == label)

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
Beispiel #27
0
#coding:utf8
import sys
from jobs import utils
reload(sys)
import pdb
sys.setdefaultencoding('utf8')

# Print how many entries of 'finalrut' equal the label at the same position
# in test_file.
test_file = 'd:/jobs/dctree/random/test.csv'
labels = utils.get_labels(test_file, 9)
# NOTE(review): plabel1 is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
plabel1 = utils.read_rst('result.txt')
plabel2 = utils.read_rst('finalrut')

rst = sum(1 for pos, expected in enumerate(labels) if plabel2[pos] == expected)

print(rst)
Beispiel #28
0
    conn = MySQLdb.connect(host='localhost',
                           user='******',
                           passwd='123456',
                           db='jobs',
                           use_unicode=True,
                           charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')

    sql = 'select position_name from work_sizetest'
    cur.execute(sql)
    wordprobdct = utils.read_rst('position_word')
    wordlst = cur.fetchall()
    i = 0
    result = []
    #     pdb.set_trace()
    for j in xrange(20000):
        tworks = wordlst[i:i + 2]
        i += 2
        position_prob = {}
        for key in position_dct.keys():
            position_prob[key] = get_position_prob(key, wordprobdct, tworks)
        sortedprob = sorted(position_prob.iteritems(),
                            key=lambda jj: jj[1],
                            reverse=True)
        #         for prob in sortedprob:
        #             print prob[0] + str(prob[1])
Beispiel #29
0
from jobs import utils
import pdb

# Print the number of positions at which 'salaryresult.txt' and
# 'salaryresult1.txt' hold equal values.
test_file = 'd:/jobs/dctree/salary/test.csv'
# NOTE(review): labels is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
labels = utils.get_labels(test_file, 6)
# The interactive pdb.set_trace() breakpoint that used to sit here was removed
# so the script can run unattended.
plabel1 = utils.read_rst('salaryresult.txt')
plabel2 = utils.read_rst('salaryresult1.txt')
# Index into plabel2 (instead of zip) so that a shorter second list still
# raises IndexError rather than being silently truncated.
rst = sum(1 for i, label in enumerate(plabel1) if plabel2[i] == label)

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
postdct = get_position.get_pos()
        
try:
    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    #sql = 'select userid from jobs_uinfotest'
    import pdb
    pdb.set_trace()
    position_dct = {}
    industryr = utils.read_rst('industryr.txt');
    with codecs.open('position_meta.txt') as file:
        lines = file.readlines()
        for linet in lines:
            line = linet[:-2]
            uline = unicode(line)
            position_dct[uline] = '1'
    sql = 'select position_name, industry from work_sizetest'
    cur.execute(sql)
    positionlst = cur.fetchall()
    positions = []
    industrys = []
    result = []
    flag = False
    i = 0
    for position in positionlst:
Beispiel #31
0
#coding:utf8

from jobs import utils
import jieba
import jieba.posseg as pseg
import os
import pdb
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
reload(sys)
sys.setdefaultencoding('utf8')

# Build a tf-idf weight matrix from the token lists stored in 'letterdct'.
letter_dct = utils.read_rst('letterdct')
tfidf_dct = {}

# Each entry contributes its key and one space-joined document built from the
# first element of its value.
entries = letter_dct.items()
keys = [entry[0] for entry in entries]
corpus = [' '.join(entry[1][0]) for entry in entries]

# CountVectorizer turns the documents into a term-frequency matrix, where
# element a[i][j] is the frequency of word j in document i.
vectorizer = CountVectorizer()
# TfidfTransformer computes the tf-idf weight of every word.
transformer = TfidfTransformer()
# First convert the text into the term-frequency matrix, then compute tf-idf
# from those counts.
term_counts = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(term_counts)
# All words in the bag-of-words vocabulary.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm the installed version.
word = vectorizer.get_feature_names()
# Dense tf-idf matrix: element a[i][j] is the tf-idf weight of word j in
# document i.
weight = tfidf.toarray()
Beispiel #32
0
    total = total*salary1prob*salary2prob
#     pdb.set_trace()
    return total

try:    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    
    sql = 'select industry, salary from work_sizetest'
    cur.execute(sql)
    salaryprobdct = utils.read_rst('salaryprobdct')
    worklst = cur.fetchall()
    i = 0
    result = []
    
    for j in xrange(20000):
        salarys = worklst[i:i+2]
        i += 2
        salary_prob = {}
        for key in range(7):
#             pdb.set_trace()
            salary_prob[key] = get_salary_prob(key, salaryprobdct, salarys)
#         pdb.set_trace()
        sortedprob = sorted(salary_prob.iteritems(), key=lambda jj:jj[1], reverse=True)
#         for prob in sortedprob:
#             print prob[0] + str(prob[1])
Beispiel #33
0
try:
    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    sql = 'select jb.userid,jb.degree,jb.age,jb.start_age,jb.bstart_year,jb.gender,jb.start_salary,wk.size \
                                        from jobs_uinfotest as jb left join workexperiencetest as wk on \
                                        jb.userid = wk.userid and wk.num = 1'
    cur.execute(sql)
    file = open('d:/jobs/dctree/bresult.csv', 'w+')
    useridlst = cur.fetchall()
    rsultlabel = utils.read_rst('result.txt')
    pdb.set_trace()
    #wsresult = utils.read_rst('wsresult.txt')
    sizeresult = utils.read_rst('sizeresult.txt')
    salaryresult = utils.read_rst('salaryresult.txt')
    posresult = utils.read_rst('position.txt')
    #degreelst = utils.read_rst('degree.txt')
    degreelst = utils.read_rst('result.txt')
    rsultlabel = map(str, rsultlabel)
    file.write('id,degree,size,salary,position_name\n')
    i = 0
    print rsultlabel
    pdb.set_trace()
    j = 0
    for userid in useridlst:
        result = []
Beispiel #34
0
#coding:utf8
import sys
from jobs import utils
reload(sys)
import pdb
sys.setdefaultencoding('utf8')

# Print the number of positions at which the string form of a 'salary' entry
# equals the corresponding 'salaryresult1.txt' entry.
test_file = 'd:/jobs/dctree/salary/test.csv'
# NOTE(review): labels is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
labels = utils.get_labels(test_file, 6)
plabel1 = utils.read_rst('salaryresult1.txt')
plabel2 = utils.read_rst('salary')

# plabel2 entries are converted with str() before comparing, as before.
rst = sum(1 for pos, first in enumerate(plabel1) if str(plabel2[pos]) == first)

print(rst)
Beispiel #35
0
# coding:utf8
import sys
from jobs import utils

reload(sys)
import pdb

sys.setdefaultencoding("utf8")

# Print the number of positions at which "position.txt" and "positionlet"
# hold equal values.
test_file = "d:/jobs/dctree/salary/test.csv"
# NOTE(review): labels is loaded but not used below; the call is kept
# because its side effects are not visible from here -- confirm.
labels = utils.get_labels(test_file, 6)
plabel1 = utils.read_rst("position.txt")
plabel2 = utils.read_rst("positionlet")

rst = sum(1 for pos, first in enumerate(plabel1) if plabel2[pos] == first)

print(rst)
Beispiel #36
0
try:

    conn = MySQLdb.connect(host="localhost", user="******", passwd="123456", db="jobs", use_unicode=True, charset="utf8")
    cur = conn.cursor()
    cur.execute("set character_set_client=utf8")
    cur.execute("set character_set_connection=utf8")
    cur.execute("set character_set_database=utf8")
    cur.execute("set character_set_results=utf8")
    cur.execute("set character_set_server=utf8")
    sql = "select jb.userid,jb.degree,jb.age,jb.start_age,jb.bstart_year,jb.gender,jb.start_salary,wk.size \
                                        from jobs_uinfotest as jb left join workexperiencetest as wk on \
                                        jb.userid = wk.userid and wk.num = 1"
    cur.execute(sql)
    file = open("d:/jobs/baysian/position.csv", "w+")
    useridlst = cur.fetchall()
    rsultlabel = utils.read_rst("result.txt")
    pdb.set_trace()
    # wsresult = utils.read_rst('wsresult.txt')
    sizeresult = utils.read_rst("sizeresult.txt")
    salaryresult = utils.read_rst("salaryresult.txt")
    posresult = utils.read_rst("positionlet")
    # degreelst = utils.read_rst('degree.txt')
    degreelst = utils.read_rst("result1.txt")
    rsultlabel = map(str, rsultlabel)
    file.write("id,degree,size,salary,position_name\n")
    i = 0
    print rsultlabel
    #     pdb.set_trace()
    j = 0
    for userid in useridlst:
        result = []
Beispiel #37
0
start = time.clock()
try:
    
    conn = MySQLdb.connect(host='localhost', user='******', passwd='123456', db='jobs', use_unicode=True, charset='utf8')
    cur = conn.cursor()
    cur.execute('set character_set_client=utf8')
    cur.execute('set character_set_connection=utf8')
    cur.execute('set character_set_database=utf8')
    cur.execute('set character_set_results=utf8')
    cur.execute('set character_set_server=utf8')
    sql = 'select name, degreer0, degreer1, degreer2 from major ;'
    cur.execute(sql)
    majorlst = cur.fetchall()
    majordct = {}
    pdb.set_trace()
    treelst = utils.read_rst('result.txt')
    resultlst = []
    for major in majorlst:
        majordct[major[0]] = [major[1], major[2], major[3]]
    sq = 'select userid, major from jobs_uinfotest'    
    cur.execute(sq)
    usertst = cur.fetchall()
    i = 0
    for user in usertst:
        if majordct.has_key(user[1]):
            majorat = majordct[user[1]]
            mnu = max(majorat)
            index = majorat.index(mnu)
            resultlst.append(index)
        else:
            resultlst.append(0)
Beispiel #38
0
from jobs import utils
import pdb
# Score twenty result files (result10..result19, result20..result29) against
# the labels read from test_file and print one match count per file.
# The leading pdb.set_trace() breakpoint was removed so the script can run
# unattended.
test_file = 'd:/jobs/dctree/salary/test.csv'
labels = utils.get_labels(test_file, 6)

# File names are built exactly as before; the disabled result03*/result04*
# loaders were dropped.
result_files = ['result1' + str(j) + '.txt' for j in range(10)]
result_files += ['result2' + str(j) + '.txt' for j in range(10)]
plabel = [utils.read_rst(name) for name in result_files]

# Size the counters from the loaded lists instead of a hard-coded 20 so the
# two can never drift apart.
rst = [0] * len(plabel)
for i, label in enumerate(labels):
    for j, predicted in enumerate(plabel):
        if predicted[i] == label:
            rst[j] += 1

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)
Beispiel #39
0
from jobs import utils
import pdb

# Count the positions at which the values read via utils.read_rst('salary')
# equal the labels read from test_file (presumably predictions vs. ground
# truth -- confirm against jobs.utils).
test_file = 'd:/jobs/dctree/salary/test.csv'
labels = utils.get_labels(test_file, 6)
# The interactive pdb.set_trace() breakpoint that used to sit here was removed
# so the script can run unattended.
plabel = utils.read_rst('salary')
# Index into plabel (instead of zip) so that a result list shorter than
# labels still raises IndexError rather than being silently truncated.
rst = sum(1 for i, label in enumerate(labels) if plabel[i] == label)

# Parenthesised single-argument print behaves the same under Python 2 and
# also parses under Python 3.
print(rst)