cur.execute('set character_set_connection=utf8') cur.execute('set character_set_database=utf8') cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') #sql = 'select userid from jobs_uinfotest' import pdb pdb.set_trace() position_dct = {} sql = 'select count(industry), industry from work_size group by industry' cur.execute(sql) positionlst = cur.fetchall() for industry in positionlst: position_dct[industry[1]] = [] sq = 'select count(position_name) as nu, position_name from work_size where industry = "%s" group by position_name order by nu desc limit 1' % industry[ 1] cur.execute(sq) indusp = cur.fetchall() position_dct[industry[1]].append(indusp[0][1]) position_dct[industry[1]].append(float(indusp[0][0]) / industry[0]) utils.store_rst(position_dct, 'industryr.txt') print position_dct conn.commit() conn.close() end = time.clock() print(end - start) except Exception as e: conn.commit() conn.close() print e
first_fea = decesion_tree.keys()[0] fea_index = features.index(first_fea) if not decesion_tree[first_fea].has_key(test_data[fea_index]): return None if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType: if len(decesion_tree[first_fea][test_data[fea_index]]) == 0: return test_data return classify_prun(decesion_tree[first_fea][test_data[fea_index]], features, test_data) else: return decesion_tree[first_fea][test_data[fea_index]] def classify_t(decesion_tree, features, test_data, mean_values=None): first_fea = decesion_tree.keys()[0] fea_index = features.index(first_fea) if not decesion_tree[first_fea].has_key(test_data[fea_index]): return 1 if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType: return classify_t(decesion_tree[first_fea][test_data[fea_index]], features, test_data) else: return decesion_tree[first_fea][test_data[fea_index]] if __name__ == '__main__': #if len(sys.argv) != 3: # print "please use: python decision.py train_file test_file" # sys.exit() test_file = 'd:/jobs/dctree/size/test.csv' decesion_tree = read_tree('size_tree') dataset, features = format_data(test_file) prun_tree = prun(decesion_tree, features, dataset) utils.store_rst(prun_tree, 'prunsize_tree') print prun_tree
cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') sql = 'select position_name from work_sizetest' cur.execute(sql) wordprobdct = utils.read_rst('position_word') wordlst = cur.fetchall() i = 0 result = [] # pdb.set_trace() for j in xrange(20000): tworks = wordlst[i:i + 2] i += 2 position_prob = {} for key in position_dct.keys(): position_prob[key] = get_position_prob(key, wordprobdct, tworks) sortedprob = sorted(position_prob.iteritems(), key=lambda jj: jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, 'positionlet') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
for j in xrange(70000): poslst = positionlst[j:j+3] j += 3 if not position_dct.has_key(poslst[1][0]): continue seg_lst = jieba.cut(poslst[0][0]) for term in seg_lst: if position_word_dct[poslst[1][0]]['pos1'].has_key(term): position_word_dct[poslst[1][0]]['pos1'][term] += 1 else: position_word_dct[poslst[1][0]]['pos1'][term] = 1 position_word_dct[poslst[1][0]]['pos1']['total'] += 1 seg_lst = jieba.cut(poslst[2][0]) for term in seg_lst: if position_word_dct[poslst[1][0]]['pos2'].has_key(term): position_word_dct[poslst[1][0]]['pos2'][term] += 1 else: position_word_dct[poslst[1][0]]['pos2'][term] = 1 position_word_dct[poslst[1][0]]['pos2']['total'] += 1 position_word_dct[poslst[1][0]]['total'] += 1 position_word_dct['total'] += 1 utils.store_rst(position_word_dct, 'position_word') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
cur.execute('set character_set_client=utf8') cur.execute('set character_set_connection=utf8') cur.execute('set character_set_database=utf8') cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') #sql = 'select userid from jobs_uinfotest' import pdb pdb.set_trace() position_dct = {} sql = 'select count(industry), industry from work_size group by industry' cur.execute(sql) positionlst = cur.fetchall() for industry in positionlst: position_dct[industry[1]] = [] sq = 'select count(position_name) as nu, position_name from work_size where industry = "%s" group by position_name order by nu desc limit 1' % industry[1] cur.execute(sq) indusp = cur.fetchall() position_dct[industry[1]].append(indusp[0][1]) position_dct[industry[1]].append(float(indusp[0][0])/industry[0]) utils.store_rst(position_dct, 'industryr.txt') print position_dct conn.commit() conn.close() end = time.clock() print (end-start) except Exception as e: conn.commit() conn.close() print e
cur.execute('set character_set_connection=utf8') cur.execute('set character_set_database=utf8') cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') #sql = 'select userid from jobs_uinfotest' majorsql = 'select distinct(shortmar) from jobs_uinfo' sqlts = 'select distinct(shortmar) from jobs_uinfotest' cur.execute(majorsql) majorlst = cur.fetchall() cur.execute(sqlts) sharedct = {} shortmar = cur.fetchall() majordct = {} for major in majorlst: majordct[major[0]] = 1 for major in shortmar: if majordct.has_key(major[0]): sharedct[major[0]] = 1 print major[0] print len(sharedct) store_rst(sharedct, 'sharemajor') conn.commit() conn.close() end = time.clock() print (end-start) except Exception as e: conn.commit() conn.close() print e
return dataset,features if __name__ == '__main__': # print "please use: python decision.py train_file test_file" # sys.exit() train_file = 'd:/jobs/dctree/random/sal-train.csv' test_file = 'd:/jobs/dctree/random/sal-test.csv' labels = get_labels(train_file,14) train_dataset, train_features = format_data(train_file) test_dataset, test_features = format_data(test_file) tree_num = 3990 feature_num = 2 result = [] pdb.set_trace() for j in range(tree_num): features_index = generate_feature_index(train_features, feature_num) features = [train_features[l] for l in features_index] print features_index print features train_set, labels = generate_train_file(train_dataset, features_index) decesion_tree = tree.rand(train_set, features, labels, 0.0005) test_set = generate_test_file(test_dataset, features_index) rst = tree.rand_test(test_set, features, decesion_tree) result.append(rst) finalrst = generate_result(result) store_rst(finalrst, 'salary') end = time.clock() print (end - start)
if len(decesion_tree[first_fea][test_data[fea_index]]) == 0: return test_data return classify_prun(decesion_tree[first_fea][test_data[fea_index]], features, test_data) else: return decesion_tree[first_fea][test_data[fea_index]] def classify_t(decesion_tree, features, test_data, mean_values=None): # pdb.set_trace() first_fea = decesion_tree.keys()[0] fea_index = features.index(first_fea) if not decesion_tree[first_fea].has_key(test_data[fea_index]): return 1 if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType: return classify_t(decesion_tree[first_fea][test_data[fea_index]], features, test_data) else: return decesion_tree[first_fea][test_data[fea_index]] if __name__ == '__main__': #if len(sys.argv) != 3: # print "please use: python decision.py train_file test_file" # sys.exit() test_file = 'd:/jobs/dctree/salary/test.csv' decesion_tree = read_tree('salary_tree') dataset, features = format_data(test_file) prun_tree = prun(decesion_tree, features, dataset) utils.store_rst(prun_tree, 'prunsalary_tree') print prun_tree
sql = 'select industry, position_name from work_sizetest' cur.execute(sql) workprobdct = utils.read_rst('workprobdct') worklst = cur.fetchall() i = 0 result = [] for j in xrange(20000): tworks = worklst[i:i + 2] i += 2 position_prob = {} for key in position_dct.keys(): # pdb.set_trace() position_prob[key] = get_position_prob(key, workprobdct, tworks) # pdb.set_trace() sortedprob = sorted(position_prob.iteritems(), key=lambda jj: jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, 'position13') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
for j in xrange(70000): poslst = positionlst[j:j + 3] j += 3 if not position_dct.has_key(poslst[1][0]): continue seg_lst = jieba.cut(poslst[0][0]) for term in seg_lst: if position_word_dct[poslst[1][0]]['pos1'].has_key(term): position_word_dct[poslst[1][0]]['pos1'][term] += 1 else: position_word_dct[poslst[1][0]]['pos1'][term] = 1 position_word_dct[poslst[1][0]]['pos1']['total'] += 1 seg_lst = jieba.cut(poslst[2][0]) for term in seg_lst: if position_word_dct[poslst[1][0]]['pos2'].has_key(term): position_word_dct[poslst[1][0]]['pos2'][term] += 1 else: position_word_dct[poslst[1][0]]['pos2'][term] = 1 position_word_dct[poslst[1][0]]['pos2']['total'] += 1 position_word_dct[poslst[1][0]]['total'] += 1 position_word_dct['total'] += 1 utils.store_rst(position_word_dct, 'position_word') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
sql = 'select industry, salary from work_sizetest' cur.execute(sql) salaryprobdct = utils.read_rst('salaryprobdct') worklst = cur.fetchall() i = 0 result = [] for j in xrange(20000): salarys = worklst[i:i + 2] i += 2 salary_prob = {} for key in range(7): # pdb.set_trace() salary_prob[key] = get_salary_prob(key, salaryprobdct, salarys) # pdb.set_trace() sortedprob = sorted(salary_prob.iteritems(), key=lambda jj: jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, 'salary') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
keyshare = 0 keynos = 0 keysharedct = copy.deepcopy(position_dct) for key in tst_dict.keys(): if positiondct.has_key(key): keyshare += 1 if not position_dct.has_key(key): keysharedct[key] = keynm keynm += 1 else: keynos += 1 print key for industry in industrydct.keys(): keysharedct[industry] = keynm keynm += 1 utils.store_rst(keysharedct, 'keyshare') print 'poslenght : %d' % len(positiondct) print 'tstlenght : %d' % len(tst_dict) print 'share : %d' % share print 'nos : %d' % nos print 'keyshare : %d' % keyshare print 'keynos : %d' % keynos conn.commit() conn.close() end = time.clock() # print (end-start) except Exception as e: conn.commit() conn.close() print e
cur.execute('set character_set_server=utf8') sql = 'select userid, size, salary from work_sizetest ;' cur.execute(sql) worksizelst = cur.fetchall() userid = '' worklst = [] salarylst = [] resultst = [] pdb.set_trace() for work_size in worksizelst: if len(worklst) < 2: worklst.append(work_size[1]) salarylst.append(work_size[2]) else: worklst = sorted(worklst) salarylst = sorted(salarylst) resultst.append([random.randint(worklst[0], worklst[1]), random.randint(salarylst[0], salarylst[1])]) worklst = [work_size[1]] salarylst = [work_size[2]] worklst = sorted(worklst) salarylst = sorted(salarylst) resultst.append([random.randint(worklst[0], worklst[1]), random.randint(salarylst[0], salarylst[1])]) pdb.set_trace() utils.store_rst(resultst, 'wsresult.txt') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
tfidf_dct[keylst[j]][2] = np.array(tfidf_dct[keylst[j]][2]) tfidf_dct[keylst[j]][2] = np.log(tfidf_dct[keylst[j]][2]) tfidf_dct[keylst[j]][1] = tfidf_dct[keylst[j]][1] * tfidf_dct[keylst[j]][2] tfidf_sort_dct = {} for key in keylst: # print '===================',key,'===================' tfidf_sort_dct[key] = {} for i in range(len(tfidf_dct[key][0])): tfidf_sort_dct[key][tfidf_dct[key][0][i]] = tfidf_dct[key][1][i] # print tfidf_dct[key][0][i],':',tfidf_dct[key][1][i] for key in tfidf_sort_dct.keys(): tfidf_sort_dct[key] = sorted(tfidf_sort_dct[key].items(), key=lambda item: item[1], reverse=True) print "==============================", key, "============================================" for index, item in enumerate(tfidf_sort_dct[key]): print item[0], ":", item[1] if index > 50: break utils.store_rst(tfidf_sort_dct, "tfidf_dct") conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
salaryprobdct[works[1][1]]['salary2'][works[2][1]] = 1 if salaryprobdct[works[1][1]]['industry1'].has_key(works[0][0]): salaryprobdct[works[1][1]]['industry1'][works[0][0]] += 1 else: salaryprobdct[works[1][1]]['industry1'][works[0][0]] = 1 if salaryprobdct[works[1][1]]['industry2'].has_key(works[2][0]): salaryprobdct[works[1][1]]['industry2'][works[2][0]] += 1 else: salaryprobdct[works[1][1]]['industry2'][works[2][0]] = 1 salaryprobdct[works[1][1]]['total'] += 1 salaryprobdct['total'] = t for key in range(7): salaryprobdct[key]['salary1']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['salary1'].itervalues()) salaryprobdct[key]['salary2']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['salary2'].itervalues()) salaryprobdct[key]['industry1']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['industry1'].itervalues()) salaryprobdct[key]['industry2']['total'] = reduce(lambda x,y:x+y, salaryprobdct[key]['industry2'].itervalues()) pdb.set_trace() for key in salaryprobdct.keys(): if key == 'total': continue print str(key) +str(salaryprobdct[key]['salary1']['total']) utils.store_rst(salaryprobdct, 'salaryprobdct') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
# workprobdct[works[1][1]]['pos2'][works[2][1]] = 1 # if workprobdct[works[1][1]]['industry1'].has_key(works[0][0]): # workprobdct[works[1][1]]['industry1'][works[0][0]] += 1 # else: # workprobdct[works[1][1]]['industry1'][works[0][0]] = 1 # if workprobdct[works[1][1]]['industry2'].has_key(works[2][0]): # workprobdct[works[1][1]]['industry2'][works[2][0]] += 1 # else: # workprobdct[works[1][1]]['industry2'][works[2][0]] = 1 workprobdct[works[1][1]]['total'] += 1 workprobdct['total'] = t for key in position_dct.keys(): workprobdct[key]['pos1']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['pos1'].itervalues()) workprobdct[key]['pos2']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['pos2'].itervalues()) workprobdct[key]['industry1']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['industry1'].itervalues()) workprobdct[key]['industry2']['total'] = reduce(lambda x,y:x+y, workprobdct[key]['industry2'].itervalues()) # pdb.set_trace() for key in workprobdct.keys(): if key == 'total': continue print key +str(workprobdct[key]['pos1']['total']) utils.store_rst(workprobdct, 'workletterprobdct') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
tfidf_dct[ keylst[j]][1] = tfidf_dct[keylst[j]][1] * tfidf_dct[keylst[j]][2] tfidf_sort_dct = {} for key in keylst: # print '===================',key,'===================' tfidf_sort_dct[key] = {} for i in range(len(tfidf_dct[key][0])): tfidf_sort_dct[key][tfidf_dct[key][0][i]] = tfidf_dct[key][1][i] # print tfidf_dct[key][0][i],':',tfidf_dct[key][1][i] for key in tfidf_sort_dct.keys(): tfidf_sort_dct[key] = sorted(tfidf_sort_dct[key].items(), key=lambda item: item[1], reverse=True) print '==============================', key, '============================================' for index, item in enumerate(tfidf_sort_dct[key]): print item[0], ':', item[1] if index > 50: break utils.store_rst(tfidf_sort_dct, 'tfidf_dct') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
# NOTE(review): this `return` is the last statement of a function whose header
# lies above the visible range; its indentation is assumed to be one level.
    return dataset,features


if __name__ == '__main__':
    # print "please use: python decision.py train_file test_file"
    # sys.exit()
    train_file = 'd:/jobs/dctree/random/train.csv'
    test_file = 'd:/jobs/dctree/random/test.csv'
    # NOTE(review): `labels` is rebound inside the loop below by the second
    # value returned from generate_train_file.
    labels = get_labels(train_file,7)
    train_dataset, train_features = format_data(train_file)
    # `test_features` is not read again in the visible code.
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    result = []
    for j in range(tree_num):
        # Build one tree per iteration from the feature indices returned by
        # generate_feature_index, then collect that tree's test output.
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        print features_index
        print features
        train_set, labels = generate_train_file(train_dataset, features_index)
        decesion_tree = tree.rand(train_set, features, labels, 0.001)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)
    # Combine the per-tree outputs and store them under 'finalrut'.
    finalrst = generate_result(result)
    store_rst(finalrst, 'finalrut')
    # `start` is not assigned in the visible code; presumably set earlier in
    # the file -- confirm.
    end = time.clock()
    print (end - start)
cur.execute("set character_set_database=utf8") cur.execute("set character_set_results=utf8") cur.execute("set character_set_server=utf8") sql = "select position_name from work_sizetest" cur.execute(sql) wordprobdct = utils.read_rst("position_word") wordlst = cur.fetchall() i = 0 result = [] # pdb.set_trace() for j in xrange(20000): tworks = wordlst[i : i + 2] i += 2 position_prob = {} for key in position_dct.keys(): position_prob[key] = get_position_prob(key, wordprobdct, tworks) sortedprob = sorted(position_prob.iteritems(), key=lambda jj: jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, "positionlet") conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
cur.execute('set character_set_server=utf8') sql = 'select industry, position_name from work_sizetest' cur.execute(sql) workprobdct = utils.read_rst('workletterprobdct') worklst = cur.fetchall() i = 0 result = [] for j in xrange(20000): tworks = worklst[i:i+2] i += 2 position_prob = {} pdb.set_trace() for key in position_dct.keys(): # pdb.set_trace() position_prob[key] = get_position_prob(key, workprobdct, tworks) # pdb.set_trace() sortedprob = sorted(position_prob.iteritems(), key=lambda jj:jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, 'positionlet') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
letter_dct[poslst[1][0]][0].append(term) if not letterdct.has_key(term): letterdct[term] = 1 else: letterdct[term] += 1 seg_lst = jieba.cut(poslst[2][0]) for term in seg_lst: if len(term) > 1: letter_dct[poslst[1][0]][1].append(term) if not letterdct.has_key(term): letterdct[term] = 1 else: letterdct[term] += 1 positions = letterdct.keys() positions_num = [] # for letter in positions: # print letter # if letter == '\\': # continue # sq = 'insert into letter(name, type, num, stopped) values ("%s", "%s", %d, 0)' % (letter, 'work_size', letterdct[letter]) # print sq # cur.execute(sq) utils.store_rst(letter_dct, 'letterdct') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
usertst = cur.fetchall() i = 0 for user in usertst: if majordct.has_key(user[1]): majorat = majordct[user[1]] mnu = max(majorat) index = majorat.index(mnu) resultlst.append(index) else: resultlst.append(0) i = i + 1 # for user in usertst: # if majordct.has_key(user[1]): # majorat = majordct[user[1]] # mnu = max(majorat) # if mnu >= 0.8: # index = majorat.index(mnu) # resultlst.append(index) # else: # resultlst.append(treelst[i]) # else: # resultlst.append(treelst[i]) # i = i + 1 utils.store_rst(resultlst, 'degree.txt') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
salarylst = [] resultst = [] pdb.set_trace() for work_size in worksizelst: if len(worklst) < 2: worklst.append(work_size[1]) salarylst.append(work_size[2]) else: worklst = sorted(worklst) salarylst = sorted(salarylst) resultst.append([ random.randint(worklst[0], worklst[1]), random.randint(salarylst[0], salarylst[1]) ]) worklst = [work_size[1]] salarylst = [work_size[2]] worklst = sorted(worklst) salarylst = sorted(salarylst) resultst.append([ random.randint(worklst[0], worklst[1]), random.randint(salarylst[0], salarylst[1]) ]) pdb.set_trace() utils.store_rst(resultst, 'wsresult.txt') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
else: shortmar_dusdct[shortmar[0]] = {} shortmar_dusdct[shortmar[0]][shortmar[1]] = [] # for shortmar in shortmartestlst: # if shortmar_dusdct.has_key(shortmar[0]): # if not shortmar_dusdct[shortmar[0]][shortmar[1]]: # shortmar_dusdct[shortmar[0]][shortmar[1]] = '' # else: # shortmar_dusdct[shortmar[0]] = {} # shortmar_dusdct[shortmar[0]][shortmar[1]] = '' pdb.set_trace() for keym in shortmar_dusdct.keys(): for key in shortmar_dusdct[keym].keys(): sqlkey = 'select position_name, count(position_name) as nu from workexperience\ where major = "%s" and industry = "%s" order by nu desc limit 5' % (keym, key) cur.execute(sqlkey) poslst = cur.fetchall() for pos in poslst: shortmar_dusdct[keym][key].append(pos[0]) store_rst(shortmar_dusdct, 'shortmar_dusdct') conn.commit() conn.close() file.close() except Exception as e: file.close() conn.close() print e end = time.clock() print (end - start)
else: result.append(postdct[key][0]) flag = True i += 1 break if not flag: result.append('test') l += 1 flag = False positions = [position[0]] if position_dct.has_key(positions[0]): result.append(positions[0]) elif position_dct.has_key(positions[1]): result.append(positions[1]) else: result.append('test') print i print p print len(result) utils.store_rst(result, 'position.txt') print l conn.commit() conn.close() end = time.clock() print(end - start) except Exception as e: conn.commit() conn.close() print e
# NOTE(review): this `return` is the last statement of a function whose header
# lies above the visible range; its indentation is assumed to be one level.
    return dataset, features


if __name__ == '__main__':
    # print "please use: python decision.py train_file test_file"
    # sys.exit()
    train_file = 'd:/jobs/dctree/random/train.csv'
    test_file = 'd:/jobs/dctree/random/test.csv'
    # NOTE(review): `labels` is rebound inside the loop below by the second
    # value returned from generate_train_file.
    labels = get_labels(train_file, 7)
    train_dataset, train_features = format_data(train_file)
    # `test_features` is not read again in the visible code.
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    result = []
    for j in range(tree_num):
        # Build one tree per iteration from the feature indices returned by
        # generate_feature_index, then collect that tree's test output.
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        print features_index
        print features
        train_set, labels = generate_train_file(train_dataset, features_index)
        decesion_tree = tree.rand(train_set, features, labels, 0.001)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)
    # Combine the per-tree outputs and store them under 'finalrut'.
    finalrst = generate_result(result)
    store_rst(finalrst, 'finalrut')
    # `start` is not assigned in the visible code; presumably set earlier in
    # the file -- confirm.
    end = time.clock()
    print(end - start)
cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') #sql = 'select userid from jobs_uinfotest' majorsql = 'select distinct(industry) from workexperience' sqlts = 'select distinct(industry) from workexperiencetest' cur.execute(majorsql) majorlst = cur.fetchall() cur.execute(sqlts) sharedct = {} shortmar = cur.fetchall() majordct = {} for major in majorlst: majordct[major[0]] = 1 for major in shortmar: if majordct.has_key(major[0]): sharedct[major[0]] = 1 #print major[0] else: print major[0] print len(sharedct) store_rst(sharedct, 'shareindustry') conn.commit() conn.close() end = time.clock() print (end-start) except Exception as e: conn.commit() conn.close() print e
fea_index = features.index(first_fea) if not decesion_tree[first_fea].has_key(test_data[fea_index]): return None if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType: if len(decesion_tree[first_fea][test_data[fea_index]]) == 0: return test_data return classify_prun(decesion_tree[first_fea][test_data[fea_index]], features, test_data) else: return decesion_tree[first_fea][test_data[fea_index]] def classify_t(decesion_tree, features, test_data, mean_values=None): # pdb.set_trace() first_fea = decesion_tree.keys()[0] fea_index = features.index(first_fea) if not decesion_tree[first_fea].has_key(test_data[fea_index]): return 1 if type(decesion_tree[first_fea][test_data[fea_index]]) is types.DictType: return classify_t(decesion_tree[first_fea][test_data[fea_index]], features, test_data) else: return decesion_tree[first_fea][test_data[fea_index]] if __name__ == '__main__': #if len(sys.argv) != 3: # print "please use: python decision.py train_file test_file" # sys.exit() test_file = 'd:/jobs/dctree/salary/test.csv' decesion_tree = read_tree('salary_tree') dataset, features = format_data(test_file) prun_tree = prun(decesion_tree, features, dataset) utils.store_rst(prun_tree, 'prunsalary_tree') print prun_tree
# ratio2 = industryr[industrys[1]][1] # else: # ratio2 = 0 # if ratio1 >= ratio2: # if industryr.has_key(industrys[0]): # result.append(industryr[industrys[0]][0]) # else: # result.append('test') # else: # result.append(industryr[industrys[1]][0]) flag = False positions = [position[0]] industrys = [position[1]] if position_dct.has_key(positions[0]): #handle the last result.append(positions[0]) elif position_dct.has_key(positions[1]): result.append(positions[1]) else: result.append('test') print len(result) utils.store_rst(result, 'position.txt') conn.commit() conn.close() end = time.clock() print (end-start) except Exception as e: conn.commit() conn.close() print e
salaryprobdct[works[1][1]]['industry1'][works[0][0]] = 1 if salaryprobdct[works[1][1]]['industry2'].has_key(works[2][0]): salaryprobdct[works[1][1]]['industry2'][works[2][0]] += 1 else: salaryprobdct[works[1][1]]['industry2'][works[2][0]] = 1 salaryprobdct[works[1][1]]['total'] += 1 salaryprobdct['total'] = t for key in range(7): salaryprobdct[key]['salary1']['total'] = reduce( lambda x, y: x + y, salaryprobdct[key]['salary1'].itervalues()) salaryprobdct[key]['salary2']['total'] = reduce( lambda x, y: x + y, salaryprobdct[key]['salary2'].itervalues()) salaryprobdct[key]['industry1']['total'] = reduce( lambda x, y: x + y, salaryprobdct[key]['industry1'].itervalues()) salaryprobdct[key]['industry2']['total'] = reduce( lambda x, y: x + y, salaryprobdct[key]['industry2'].itervalues()) pdb.set_trace() for key in salaryprobdct.keys(): if key == 'total': continue print str(key) + str(salaryprobdct[key]['salary1']['total']) utils.store_rst(salaryprobdct, 'salaryprobdct') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)
cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') sql = 'select industry, salary from work_sizetest' cur.execute(sql) salaryprobdct = utils.read_rst('salaryprobdct') worklst = cur.fetchall() i = 0 result = [] for j in xrange(20000): salarys = worklst[i:i+2] i += 2 salary_prob = {} for key in range(7): # pdb.set_trace() salary_prob[key] = get_salary_prob(key, salaryprobdct, salarys) # pdb.set_trace() sortedprob = sorted(salary_prob.iteritems(), key=lambda jj:jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, 'salary') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
usertst = cur.fetchall() i = 0 for user in usertst: if majordct.has_key(user[1]): majorat = majordct[user[1]] mnu = max(majorat) index = majorat.index(mnu) resultlst.append(index) else: resultlst.append(0) i = i + 1 # for user in usertst: # if majordct.has_key(user[1]): # majorat = majordct[user[1]] # mnu = max(majorat) # if mnu >= 0.8: # index = majorat.index(mnu) # resultlst.append(index) # else: # resultlst.append(treelst[i]) # else: # resultlst.append(treelst[i]) # i = i + 1 utils.store_rst(resultlst, 'degree.txt') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
ratio2 = industryr[industrys[1]][1] else: ratio2 = 0 if ratio1 >= ratio2: if industryr.has_key(industrys[0]): result.append(industryr[industrys[0]][0]) else: result.append("test") else: result.append(industryr[industrys[1]][0]) flag = False positions = [position[0]] industrys = [position[1]] if position_dct.has_key(positions[0]): # handle the last result.append(positions[0]) elif position_dct.has_key(positions[1]): result.append(positions[1]) else: result.append("test") print len(result) utils.store_rst(result, "position.txt") conn.commit() conn.close() end = time.clock() print (end - start) except Exception as e: conn.commit() conn.close() print e
cur.execute('set character_set_results=utf8') cur.execute('set character_set_server=utf8') sql = 'select industry, position_name from work_sizetest' cur.execute(sql) workprobdct = utils.read_rst('workprobdct') worklst = cur.fetchall() i = 0 result = [] for j in xrange(20000): tworks = worklst[i:i+2] i += 2 position_prob = {} for key in position_dct.keys(): # pdb.set_trace() position_prob[key] = get_position_prob(key, workprobdct, tworks) # pdb.set_trace() sortedprob = sorted(position_prob.iteritems(), key=lambda jj:jj[1], reverse=True) # for prob in sortedprob: # print prob[0] + str(prob[1]) result.append(sortedprob[0][0]) utils.store_rst(result, 'position13') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print (end - start)
if __name__ == "__main__":
    # print "please use: python decision.py train_file test_file"
    # sys.exit()
    train_file = "d:/jobs/dctree/random/degree-train.csv"
    test_file = "d:/jobs/dctree/random/degree-test.csv"
    # NOTE(review): `labels` is rebound inside the loop below by the second
    # value returned from generate_train_file.
    labels = get_labels(train_file, 0)
    train_dataset, train_features = format_data(train_file)
    # `test_features` is not read again in the visible code.
    test_dataset, test_features = format_data(test_file)
    tree_num = 3990
    feature_num = 2
    result = []
    for j in range(tree_num):
        # Build one tree per iteration from the feature indices returned by
        # generate_feature_index, then collect that tree's test output.
        features_index = generate_feature_index(train_features, feature_num)
        features = [train_features[l] for l in features_index]
        print features_index
        print features
        train_set, labels = generate_train_file(train_dataset, features_index)
        decesion_tree = tree.rand(train_set, features, labels, 0.0005)
        test_set = generate_test_file(test_dataset, features_index)
        rst = tree.rand_test(test_set, features, decesion_tree)
        result.append(rst)
    # Combine the per-tree outputs and store them under "finalrut".
    finalrst = generate_result(result)
    store_rst(finalrst, "finalrut")
    # `start` is not assigned in the visible code; presumably set earlier in
    # the file -- confirm.
    end = time.clock()
    print (end - start)
letter_dct[poslst[1][0]][0].append(term) if not letterdct.has_key(term): letterdct[term] = 1 else: letterdct[term] += 1 seg_lst = jieba.cut(poslst[2][0]) for term in seg_lst: if len(term) > 1: letter_dct[poslst[1][0]][1].append(term) if not letterdct.has_key(term): letterdct[term] = 1 else: letterdct[term] += 1 positions = letterdct.keys() positions_num = [] # for letter in positions: # print letter # if letter == '\\': # continue # sq = 'insert into letter(name, type, num, stopped) values ("%s", "%s", %d, 0)' % (letter, 'work_size', letterdct[letter]) # print sq # cur.execute(sq) utils.store_rst(letter_dct, 'letterdct') conn.commit() conn.close() except Exception as e: conn.close() print e end = time.clock() print(end - start)