def query_major_rc():
    """Return the top-10 software-testing job titles as a JSON list of
    ``{'name': title, 'value': pay}`` dicts for the front-end chart.

    value: row[1] divided by 1000 and 12 — presumably an annual amount in
    yuan converted to monthly thousands after ``Money_Int`` normalises the
    rows; TODO confirm against money_int.Money_Int.
    """
    sql = ('SELECT TITLE, COUNT(TITLE), MOST_MONEY '
           'FROM job '
           'WHERE info LIKE "%软件测试%" '
           'GROUP BY title HAVING COUNT(*) > 0 '
           'ORDER BY COUNT(title) DESC '
           'LIMIT 10')
    # NOTE(review): credentials are hard-coded; move to config/env.
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    cur.execute(sql)
    # Money_Int presumably converts the salary column to sortable ints — TODO confirm.
    rows = money_int.Money_Int(list(cur.fetchall()))
    data_list = [{'name': row[0], 'value': int(row[1] / 1000 / 12)} for row in rows]
    return jsonify(data_list)
def Gensim_Search(text_url, keyword):
    """Train a Word2Vec model over the job keywords and return the words
    most similar to *keyword*, or None when the keyword is unknown.

    text_url: path of the corpus text file; built from the DB on first call.
    NOTE(review): the model is retrained and saved on every call even when
    the corpus file already exists — likely wasteful, left unchanged.
    """
    if not os.path.exists(text_url):
        # First run: dump every job keyword into one corpus file.
        cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                    db='51job', port=3306, charset='utf8')
        cur.execute('select keyword from job')
        text = ''.join(str(row[0]) for row in cur.fetchall())
        with open(text_url, 'w+', encoding='utf-8') as f:
            f.write(text)
        # Presumably normalises letter case in the file — TODO confirm.
        up_and_low.Up_And_Low(text_url)
    sentences = word2vec.Text8Corpus(text_url)
    model = word2vec.Word2Vec(sentences, size=200)
    model.save('./textmodel.model')
    try:
        # Uppercase to match the case normalisation applied to the corpus.
        return model.most_similar(keyword.upper())
    except Exception:
        # Keyword not in vocabulary (gensim raises KeyError) — best effort.
        return None
def query_talents():
    """Return the top-10 big-data '…师' job titles as JSON:
    ``{'names': [{'name': t}, …], 'values': [count, …], 'extra': [t, …]}``.
    """
    sql = ('SELECT title, COUNT(title) AS titlenum '
           'FROM job '
           'WHERE info LIKE "%大数据%" AND title LIKE "%师" '
           'GROUP BY title HAVING COUNT(*) > 0 '
           'ORDER BY COUNT(title) DESC '
           'LIMIT 10')
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    cur.execute(sql)
    data_dict = {'names': [], 'values': [], 'extra': []}
    # Single pass instead of the original three loops; the unused total
    # counter was dead code and has been dropped.
    for title, count in cur.fetchall():
        data_dict['names'].append({"name": title})
        data_dict['extra'].append(title)
        data_dict['values'].append(int(count))
    return jsonify(data_dict)
def query_city():
    """Return the top-10 provinces by big-data job count as a JSON list of
    ``{'name': province, 'value': count}`` dicts.
    """
    sql = ('SELECT place_province, COUNT(place_province) AS num '
           'FROM job '
           'WHERE info LIKE "%大数据%" '
           'GROUP BY place_province HAVING COUNT(*) > 0 '
           'ORDER BY COUNT(title) DESC '
           'LIMIT 10')
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    cur.execute(sql)
    # The original also summed the counts into an unused variable — dead code removed.
    data_list = [{'name': province, 'value': int(count)}
                 for province, count in cur.fetchall()]
    return jsonify(data_list)
def query_demand_dmt():
    """Return the top-10 multimedia job titles by posting count as a JSON
    list of ``{'name': title, 'value': count}`` dicts.
    """
    sql = ('SELECT TITLE, COUNT(TITLE) '
           'FROM job '
           'WHERE info LIKE "%多媒体%" '
           'GROUP BY title HAVING COUNT(*) > 0 '
           'ORDER BY COUNT(title) DESC '
           'LIMIT 10')
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    cur.execute(sql)
    # Unused running total from the original has been removed.
    data_list = [{'name': title, 'value': int(count)}
                 for title, count in cur.fetchall()]
    return jsonify(data_list)
def query_salary_bj():
    """Return per-district salary ranges inside Beijing as JSON:
    ``{'numsh': [highest…], 'numsl': [lowest…], 'names': [district…]}``.

    Fixes two alignment bugs in the original:
    * it removed '昌平区' from ``names`` by value but deleted hard-coded
      index 5 from ``numsh``/``numsl`` — the three parallel lists went out
      of sync whenever 昌平区 was not at position 5, and ``remove`` raised
      when it was absent;
    * it paired the lowest- and highest-salary lists by index, silently
      mismatching districts when the two sort orders differed. Pairing is
      now done by district name.
    """
    sql = ('SELECT place_city, least_money, most_money '
           'FROM job '
           'WHERE place_province="北京" AND place_city!="北京"')
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    cur.execute(sql)
    res = list(cur.fetchall())
    # The helpers presumably return sorted lists of dicts keyed
    # 地区名 / 最低工资 / 最高工资 — TODO confirm against money_least/money_most.
    low_first = {}
    for row in money_least.Money_Least(res):
        # setdefault keeps only the first (best-sorted) entry per district,
        # matching the original's first-occurrence de-duplication.
        low_first.setdefault(row["地区名"], row["最低工资"])
    high_first = {}
    for row in money_most.Money_Most(res):
        high_first.setdefault(row["地区名"], row["最高工资"])
    data_dict = {'numsh': [], 'numsl': [], 'names': []}
    for area, low in low_first.items():
        if area == '昌平区':
            # Deliberately excluded from the chart (kept from the original).
            continue
        data_dict['names'].append(area)
        data_dict['numsl'].append(low)
        data_dict['numsh'].append(high_first.get(area))
    return jsonify(data_dict)
def query_location():
    """Return the job count per province as a JSON list of
    ``{'name': province, 'value': count}`` dicts.
    """
    sql = ('SELECT DISTINCT place_province, COUNT(place_province) '
           'FROM job '
           'GROUP BY place_province HAVING COUNT(*) > 0')
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    cur.execute(sql)
    # Dead total-count accumulator from the original removed.
    data_list = [{'name': province, 'value': int(count)}
                 for province, count in cur.fetchall()]
    return jsonify(data_list)
def Resume(place, major, text):
    """Match a resume text against job postings with TF-IDF cosine
    similarity and return ECharts graph data (nodes + links).

    place: province/city filter substring.
    major: keyword/title/info filter substring.
    text:  the resume text; an empty string returns [].

    Returns a one-element list containing ``{'node': [...], 'link': [...]}``.

    Fixes over the original:
    * the SQL was built by concatenating *place* and *major* straight into
      the statement — an injection hole; it is now parameterized;
    * the >5-result branch iterated ``range(0, 8)`` unconditionally and
      raised IndexError when only 6 or 7 matches existed.

    NOTE(review): ``size0``/``size1``/``size2`` are not defined in this
    function — presumably module-level globals; confirm.
    """
    if text == '':
        return []
    cur = db_connect.Db_Connect(host='localhost', user='******', passwd='1234',
                                db='51job', port=3306, charset='utf8')
    # Parameterized query — same clause structure as the original.
    # NOTE(review): AND binds tighter than OR here, so the province test only
    # combines with the title test. Kept to preserve behaviour, but it looks
    # unintended; the keyword/info/title group probably wants parentheses.
    sql = ('select distinct title,company,least_money,most_money,keyword,info '
           'from job where keyword like %s or info like %s or title like %s '
           'and place_province like %s or place_city like %s limit 100')
    like_major = '%' + major + '%'
    like_place = '%' + place + '%'
    cur.execute(sql, (like_major, like_major, like_major, like_place, like_place))
    data = cur.fetchall()

    doc_test = text
    all_doc = []        # display strings: title-!-!-company-!-!-low-!-!-high-!-!-6th keyword
    all_doc_chara = []  # text used for similarity: cleaned title + info
    try:
        for row in data:
            all_doc.append(row[0] + '-!-!-' + row[1] + '-!-!-' + row[2] +
                           '-!-!-' + row[3] + '-!-!-' + row[4].split(' ')[5])
            all_doc_chara.append("".join(re.sub(',|!|\?', '', row[0].upper())) +
                                 "".join(re.sub(',|!|\?| ', ' ', row[5].upper())))
    except Exception:
        # NOTE(review): kept from the original — the first row with fewer
        # than 6 space-separated keywords silently truncates the candidate
        # list. A per-row guard would process more rows but change output.
        pass

    # Tokenize, build the TF-IDF model and score the resume against every posting.
    all_doc_list = [[word for word in jieba.cut(doc)] for doc in all_doc_chara]
    doc_test_list = [word for word in jieba.cut(doc_test)]
    dictionary = corpora.Dictionary(all_doc_list)
    corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
    doc_test_vec = dictionary.doc2bow(doc_test_list)
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))
    sim = index[tfidf[doc_test_vec]]
    # Highest similarity first; res items are (corpus_index, score).
    res = sorted(enumerate(sim), key=lambda item: -item[1])

    res_list = []
    child_res_dict = {'node': [], 'link': []}
    line_style = {"normal": {"width": 2.0, "curveness": 0.2, "color": '#FF6633'}}
    small_label = {'normal': {'position': 'inside', 'fontSize': 10, 'color': '#FF6633'}}

    if len(res) <= 5:
        cou_num = 1
        for i in range(0, len(res)):
            if res[i][1] > 0.005:  # similarity threshold
                parts = all_doc[res[i][0]].split('-!-!-')
                score = '%.2f' % (res[i][1] * 1000)
                company_name = '序号:' + str(i) + ',公司名:' + parts[1]
                job_name = ('序号:' + str(i) + ',职位名:' + parts[0] +
                            ',工资区间:' + parts[2] + '-' + parts[3])
                child_res_dict['node'].append(
                    {'name': company_name, 'value': score,
                     'x': (cou_num + 1) * 10, 'y': 200, 'symbolSize': size1,
                     'label': small_label, "draggable": "true"})
                child_res_dict['node'].append(
                    {'name': job_name, 'value': score,
                     'x': (cou_num + 1) * 10, 'y': 200, 'symbolSize': size2,
                     'label': small_label, "draggable": "true"})
                cou_num += 3
                # NOTE(review): kept from the original — the first link is a
                # company->company self loop; probably meant job->company.
                child_res_dict['link'].append(
                    {'source': company_name, 'target': company_name,
                     'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
                child_res_dict['link'].append(
                    {'source': job_name, 'target': company_name,
                     'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
        # Central hub node every match links back to.
        child_res_dict['node'].append(
            {'name': '分析结果', 'value': '', 'x': 10, 'y': 200, 'symbolSize': size0,
             'label': {'normal': {'position': 'inside', 'fontSize': 14,
                                  'color': '#FF6633'}},
             "draggable": "true"})
        copy_list = child_res_dict['link'].copy()
        # NOTE(review): stride 3 but only 2 links per match in this branch —
        # kept byte-for-byte from the original; looks like a leftover from
        # the commented-out third link.
        for j in range(0, len(copy_list), 3):
            child_res_dict['link'].append(
                {'source': copy_list[j]['source'],
                 'target': child_res_dict['node'][len(child_res_dict['node']) - 1]['name'],
                 'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
        res_list.append(child_res_dict)
    else:
        cou_num = 1
        # Cap at 8 best matches; min() fixes the IndexError the original hit
        # when only 6 or 7 matches existed.
        for i in range(0, min(8, len(res))):
            if res[i][1] > 0.005:
                parts = all_doc[res[i][0]].split('-!-!-')
                score = '%.2f' % (res[i][1] * 1000)
                company_name = '序号:' + str(i) + ',公司名:' + parts[1]
                job_name = ('序号:' + str(i) + ',职位名:' + parts[0] +
                            ',工资区间:' + parts[2] + '-' + parts[3])
                keyword_name = '序号:' + str(i) + ',关键词:' + parts[4]
                child_res_dict['node'].append(
                    {'name': company_name, 'value': score,
                     'x': (cou_num + 1) * 10, 'y': 200, 'symbolSize': size1,
                     'label': small_label, "draggable": "true"})
                child_res_dict['node'].append(
                    {'name': job_name, 'value': score,
                     'x': (cou_num + 1) * 10, 'y': 200, 'symbolSize': size2,
                     'label': small_label, "draggable": "true"})
                cou_num += 3
                # NOTE(review): first link is a company self loop, and the
                # keyword link targets a node that is never added (node_dict3
                # was commented out upstream) — both kept from the original.
                child_res_dict['link'].append(
                    {'source': company_name, 'target': company_name,
                     'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
                child_res_dict['link'].append(
                    {'source': job_name, 'target': company_name,
                     'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
                child_res_dict['link'].append(
                    {'source': keyword_name, 'target': company_name,
                     'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
        child_res_dict['node'].append(
            {'name': '分析结果', 'value': '', 'x': 10, 'y': 200, 'symbolSize': size0,
             'label': {'normal': {'position': 'inside', 'fontSize': 14,
                                  'color': '#FF6633'}},
             "draggable": "true"})
        copy_list = child_res_dict['link'].copy()
        # Here each match contributed 3 links, so stride 3 picks one per match.
        for j in range(0, len(copy_list), 3):
            child_res_dict['link'].append(
                {'source': copy_list[j]['source'],
                 'target': child_res_dict['node'][len(child_res_dict['node']) - 1]['name'],
                 'name': '', 'value': '', 'label': '', 'lineStyle': line_style})
        res_list.append(child_res_dict)
    return res_list