class AutoGrocery(object):
    """Wrap a Grocery model: train on demand, cache it to disk, and predict."""

    def __init__(self, name, train_data):
        self._train_data = train_data
        self._grocery = Grocery(project_dir + '/models/model_data/' + name)

    def train(self):
        self._grocery.train(self._train_data)

    def save(self):
        self._grocery.save()

    def load(self):
        self._grocery.load()

    def predicate(self, src):
        if not self._grocery.get_load_status():
            try:
                self.load()
            except ValueError:
                self.train()
                self.save()
        pr = self._grocery.predict(src)
        label = pr.predicted_y
        return label, pr.dec_values[label]
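A minimal usage sketch for AutoGrocery; the model name, the two training pairs, and the input text here are illustrative assumptions, not part of the original snippet:

# assumes `project_dir` is defined and writable, and tgrocery is installed
train_data = [('spam', 'win a free prize now'), ('ham', 'meeting moved to 3pm')]
ag = AutoGrocery('demo', train_data)
label, confidence = ag.predicate('free prize inside')  # trains and saves on first use
print label, confidence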
def tGrocery():
    outFile = open('testResult.tmp', 'w')
    [trainingSet, benchmark] = pickle.load(open('SampleSeg.pk'))
    testingSet = []
    correctLabel = []
    for i in xrange(len(benchmark)):
        print '%d out of %d' % (i, len(benchmark))
        testingSet.append(benchmark[i][1])
        correctLabel.append(benchmark[i][0])
    grocery = Grocery('test')
    grocery.train(trainingSet)
    grocery.save()
    # load
    new_grocery = Grocery('test')
    new_grocery.load()
    Prediction = []
    for i in xrange(len(testingSet)):
        print '%d out of %d' % (i, len(testingSet))
        prediction = new_grocery.predict(testingSet[i])
        Prediction.append(prediction)
        temp = correctLabel[i] + '<-->' + prediction + ' /x01' + testingSet[i] + '\n'
        outFile.write(temp)
    correct = 0
    for i in xrange(len(Prediction)):
        print Prediction[i], correctLabel[i],
        if Prediction[i] == correctLabel[i]:
            correct += 1
            print 'Correct'
        else:
            print 'False'
    print 'Correct Count:', correct
    print 'Accuracy: %f' % (1.0 * correct / len(Prediction))
def test(test_path):
    new_grocery = Grocery('cv_' + str(fold) + '_model')  # , custom_tokenize=segment)
    new_grocery.load()
    test_src = []
    with open(test_path) as f:
        for line in f:
            label, text = line.strip().split("|text|")
            label = yiji_label[classify_dict[label]]
            test_src.append((label, text))
    test_result = new_grocery.test(test_src)
    # print test_result
    # print test_result.accuracy_overall
    # accs = test_result.accuracy_labels
    recalls = test_result.recall_labels
    # print "Recall for each class: ", recalls
    predictlabels = test_result.predicted_y
    truelabels = test_result.true_y
    acc = accuracy_score(truelabels, predictlabels)
    macro_precision, macro_recall, macro_fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average='macro')
    print "Accuracy: ", acc, "Macro-average Precision:", macro_precision, \
        "Macro-average Recall:", macro_recall, "Macro-average Fscore:", macro_fscore
    labellist = [
        'safe_and_stable', 'industrial_information', 'politics',
        'culture_health', 'social_livelihood', 'economic_and_financial'
    ]
    precision, recall, fscore, _ = precision_recall_fscore_support(
        truelabels, predictlabels, average=None, labels=labellist)
    precisions = dict()
    recalls = dict()
    for idx, p in enumerate(precision):
        precisions[labellist[idx]] = p
    for idx, c in enumerate(recall):
        recalls[labellist[idx]] = c
def __init__(self, keyword):
    print '进行新闻分类'  # "running news classification"
    (db, cursor) = connectdb()
    cursor.execute("update task set status=1 where keyword=%s", [keyword])
    cursor.execute("select id, title from news where keyword=%s", [keyword])
    news = cursor.fetchall()
    new_grocery = Grocery('static/paris')
    new_grocery.load()
    for item in news:
        tag = new_grocery.predict(item['title'])
        # map the predicted Chinese labels to integer tags
        if tag == '新闻背景':      # news background
            tag = 1
        elif tag == '事实陈述':    # factual statement
            tag = 2
        elif tag == '事件演化':    # event development
            tag = 3
        elif tag == '各方态度':    # stakeholder attitudes
            tag = 4
        elif tag == '直接关联':    # directly related
            tag = 6
        elif tag == '暂无关联':    # not yet related
            tag = 7
        cursor.execute("update news set tag=%s where id=%s", [tag, item['id']])
    closedb(db, cursor)
    return
def predict_test(model_path, data):
    # load the model
    try:
        model_path = os.path.join(BASE_DIR, 'learn', model_path)
        new_grocery = Grocery(model_path.encode('utf-8'))
        new_grocery.load()
    except Exception as e:
        return {'IsErr': True, 'ErrDesc': u'学习模型加载不成功,请检查路径'}  # model failed to load; check the path
    # normalize the input data
    result = list()
    sentences = data.split(';')
    if sentences[-1] == '':
        sentences.pop()
    if len(sentences) == 0:
        return {'IsErr': True, 'ErrDesc': u'输入的句子结构有错误或没有数据'}  # malformed or empty input
    # tokenize, then classify
    stop_words = read_lines(os.path.join(BASE_DIR, 'learn', 's_w.txt'))
    for s in sentences:
        tmp_s = ''
        words = jieba.cut(s)
        for word in words:
            if word in stop_words:
                continue
            else:
                tmp_s += word + ' '
        result.append({
            'tag': str(new_grocery.predict(tmp_s.strip().encode('utf-8'))),
            'sentence': s,
        })
    return {'IsErr': False, 'ErrDesc': u'成功', 'data': result}  # u'成功' = success
class GroceryModel(object):
    def __init__(self):
        self.grocery = Grocery('TextClassify')

    def train(self, train_file):
        f = open(train_file, 'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0], ''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        self.grocery.train(dataset)
        self.grocery.save()

    def load_model(self):
        self.grocery.load()

    def test(self, test_src):
        self.load_model()
        f = open(test_src, 'r')
        line = f.readline().decode('utf8')
        dataset = []
        while line:
            tmp = line.split('\t')
            dataset.append((tmp[0], ''.join(tmp[1:])))
            line = f.readline().decode('utf8')
        f.close()
        result = self.grocery.test(dataset)
        print result

    def predict(self, text):
        print self.grocery.predict(text)
def get_data(ids, b_date, end_data, log, stop_word):
    b_date = b_date.strftime('%Y-%m-%d')
    end_data = end_data.strftime('%Y-%m-%d')
    # choose the data source
    df = load_data(ids, b_date, end_data)
    # df = load_data_excel()
    # df = pd.read_excel('data_treasure.xls')
    df['RateDate'] = pd.to_datetime(df['RateDate'])
    # df_group = df['RateDate'].groupby([df.RateDate.values.astype('datetime64[D]')]).size()
    res = list()
    log.info('Have %d comments need to process' % len(df))
    # load the classification model
    new_grocery = Grocery('sample2')
    new_grocery.load()
    for record_data in range(0, len(df)):
        # (optionally slice the content by date)
        # tmp_df = df[df['RateDate'] > df_group.index[record_data]][df['RateDate'] < df_group.index[record_data + 1]]
        # NLP processing
        content_sw, level, tag = nlp_process_with_sw(df.iloc[record_data], new_grocery, stop_word)
        # record the result
        res.append({
            'RateContent': json.dumps(content_sw, ensure_ascii=False),
            'RateDate': df.iloc[record_data]['RateDate'],
            'TreasureID': df.iloc[record_data]['TreasureID'],
            'Level': level,
            'Tag': tag,
            'Sentence': df.iloc[record_data]['RateContent'],
        })
    return res
def labelmaker(self):
    result = []
    grocery = Grocery('11c_20k_20171226')
    grocery.load()
    # sort the per-label decision values, highest confidence first
    label_confidence = sorted(grocery.predict(self.shorttext).dec_values.items(),
                              lambda x, y: cmp(x[1], y[1]), reverse=True)[0]
    result.append(label_confidence[0])  # highest-confidence label
    result.append(label_confidence[1])  # its confidence score
    return result
def test_grocery():
    grocery = Grocery('model_redian')
    grocery.train('trdata_4.txt')
    grocery.save()
    new_grocery = Grocery('model_redian')
    new_grocery.load()
    test_result = new_grocery.test('tedata_4.txt')
    print test_result.accuracy_labels
    print test_result.recall_labels
    test_result.show_result()
def phgrocery(text):
    # result_text = []
    model_grocery = Grocery('model_redian_5')
    model_grocery.load()
    result = int(model_grocery.predict(text).predicted_y)
    # print result
    # if result == 1:
    #     result_text.append(text)
    return result
def GET(self, name):
    # i = web.input(name=None)
    # url = "http://" + name
    # html = urllib2.urlopen(url).read()
    # soup = BeautifulSoup(html)
    # title = soup.html.head.title.contents.pop().encode('utf-8')
    title = name.encode('utf-8')
    new_grocery = Grocery('sample')
    new_grocery.load()
    return new_grocery.predict(title)
def test_main(self):
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    new_grocery = Grocery('test')
    new_grocery.load()
    assert grocery.get_load_status()
    assert grocery.predict('考生必读:新托福写作考试评分标准') == 'education'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
def predict_phrasing(self, text=u'曾被年轻人嫌弃,如今却媲美Zara'):
    '''
    :param text: sentence to score
    :return: decision value for the model's u'postive' label
    '''
    new_grocery = Grocery(self.model_name)
    new_grocery.load()
    result = new_grocery.predict(text)
    return result.dec_values[u'postive']  # label key matches the trained model's spelling
def test_main(self):
    grocery = Grocery(self.grocery_name)
    grocery.train(self.train_src)
    grocery.save()
    new_grocery = Grocery('test')
    new_grocery.load()
    assert grocery.get_load_status()
    result = grocery.predict('just a testing')
    print(result)
    result = grocery.predict('考生必读:新托福写作考试评分标准')
    print(result)
    print("type of result is :", type(result))
    assert str(grocery.predict('考生必读:新托福写作考试评分标准')) == 'education'
    assert str(grocery.predict('法网')) == 'sports'
    # cleanup
    if self.grocery_name and os.path.exists(self.grocery_name):
        shutil.rmtree(self.grocery_name)
class MyGrocery(object):
    def __init__(self, name):
        super(MyGrocery, self).__init__()
        self.grocery = Grocery(name)
        self.loaded = False
        self.correct = 1.0

    def train(self, src):
        lines = []
        for line in csv.reader(open(src)):
            label, s = line[0], line[1]
            text = s.decode('utf8')
            lines.append((label, text))
        self.grocery.train(lines)

    def save_model(self):
        self.grocery.save()

    def train_and_save(self, src):
        self.train(src)
        self.save_model()

    def load_model(self):
        if not self.loaded:
            self.grocery.load()
            self.loaded = True

    def predict(self, text):
        self.load_model()
        return self.grocery.predict(text)

    def test(self, src):
        self.load_model()
        total, wrong_num = 0.0, 0.0
        for line in csv.reader(open(src)):
            total += 1
            if line[0] != self.predict(line[1]):
                wrong_num += 1
        print "load test file from " + src
        correct = (total - wrong_num) / total
        self.correct = correct
        print "total: %d , wrong_num: %d, success percentage: %f" % (total, wrong_num, correct)
        result = dict(type="test", total=total, wrong_num=wrong_num, correct=correct)
        return json.dumps(result)
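A short usage sketch for MyGrocery; the model name and CSV paths are hypothetical, and each CSV row is assumed to hold "label,text" as the train() loop expects:

g = MyGrocery('demo_model')      # 'demo_model' is a placeholder name
g.train_and_save('train.csv')    # hypothetical path
print g.predict(u'考生必读:新托福写作考试评分标准')
print g.test('test.csv')         # hypothetical path; returns a JSON summary string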
def tgrocery_train(train_data, test_data):
    '''Train a TextGrocery model and predict labels for the test split.'''
    print("训练语料总数为: " + str(len(train_data)))  # total number of training samples
    test_corpus, test_label = test_split(test_data)
    grocery = Grocery('TextGrocery')
    print("start training......")
    grocery.train(train_data)
    grocery.save()
    new_grocery = Grocery('TextGrocery')
    new_grocery.load()
    predict_label = []
    for sample in test_corpus:
        label = new_grocery.predict(sample)
        predict_label.append(str(label))
    # print(predict_label)
    return test_corpus, test_label, predict_label
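A possible follow-up that scores the labels returned by tgrocery_train, assuming scikit-learn is available (it already appears in the test() snippet earlier) and that train_data/test_data are prepared as elsewhere in this file:

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

test_corpus, test_label, predict_label = tgrocery_train(train_data, test_data)
print "Accuracy:", accuracy_score(test_label, predict_label)
print precision_recall_fscore_support(test_label, predict_label, average='macro')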
def predict_corpus(input_file, output_csv):
    import csv
    csvfile = file(output_csv, 'wb')
    writer = csv.writer(csvfile)
    corpus = []
    f = xlrd.open_workbook(input_file)
    table = f.sheet_by_name('Sheet1')
    nrows = table.nrows  # number of rows
    for rownum in range(0, nrows):
        row = table.row_values(rownum)
        corpus.append(row[2].strip())
    corpus_grocery = Grocery(project_name)
    corpus_grocery.load()
    output = []
    for sentence in corpus:
        predict = corpus_grocery.predict(sentence)
        output.append((sentence, predict))
    writer.writerows(output)
    print('Done!')
    csvfile.close()
class jdParser(object):
    def __init__(self):
        self.clf = Grocery("./jdclf")
        self.clf.load()
        self.LINE_SPLIT = re.compile(u"[;。;\n]")

    def get_demand_and_duty(self, jdstr):
        linelist = [
            line.strip() for line in self.LINE_SPLIT.split(jdstr)
            if len(line.strip()) > 4
        ]
        result = {}
        demand = []
        duty = []
        for line in linelist:
            pred = str(self.clf.predict(line))
            if pred == "demand":
                demand.append(line)
            elif pred == "duty":
                duty.append(line)
        result['demand'] = '\n'.join(demand)
        result['duty'] = '\n'.join(duty)
        return result
class TagPredictor(object):
    def _custom_tokenize(self, line, **kwargs):
        try:
            kwargs["method"]
        except KeyError:
            method = str(self.kwargs["method"])
        else:
            method = str(kwargs["method"])
        if method == "normal":
            tokens = self.key_ext.calculateTokens(line,
                                                  doc_len_lower_bound=5,
                                                  doc_len_upper_bound=500,
                                                  method="normal")
        elif method == "processed":
            tokens = line.split(',')
        return tokens

    def __init__(self, *args, **kwargs):
        self.grocery_name = str(kwargs["grocery_name"])
        method = str(kwargs["method"])
        train_src = str(kwargs["train_src"])
        self.PREFIX = conf.load("predict_label")["prefix"]
        self.MODEL_DIR = conf.load("predict_label")["model_dir"]
        self.kwargs = kwargs
        if method == "normal":
            self.key_ext = keyExt()
            self.grocery = Grocery(self.grocery_name, custom_tokenize=self._custom_tokenize)
        elif method == "jieba":
            self.grocery = Grocery(self.grocery_name)
        elif method == "processed":
            self.grocery = Grocery(self.grocery_name, custom_tokenize=self._custom_tokenize)

    def trainFromDocs(self, *args, **kwargs):
        model = self.grocery.train(self.kwargs["train_src"])
        return model

    def autoEvaluation(self, *args, **kwargs):
        prune_threshold = float(kwargs["threshold"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]
        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except IndexError:
                    continue
                else:
                    train_data.append((line.split('\t', 1)[0],
                                       line.split('\t', 1)[1].split('\n', 1)[0]))
        print "#items before filtering:", len(train_data)
        print "-- Now we filter out the excluded docs --"
        train_data = [i for i in train_data if i[1] not in excluded_docs]
        print "#items after filtering:", len(train_data)
        print "-- Now we filter out the excluded labels --"
        train_data = [i for i in train_data if i[0] not in excluded_labels]
        print "#items after filtering:", len(train_data)
        n = len(train_data)  # number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        train_set = map(lambda x: train_data[x], indices[:n * 10 // 10])
        test_set = map(lambda x: train_data[x], indices[:n * 10 // 10])
        self.grocery.train(train_set)
        test_result = self.grocery.test(test_set)
        print '-- Accuracy after training --'
        print 'Accuracy, A-0:', test_result
        low_recall_label = []
        for item in test_result.recall_labels.items():
            if item[1] < prune_threshold:
                low_recall_label.append(item[0])
        new_train_set = [item for item in train_set if item[0] not in low_recall_label]
        new_test_set = [item for item in train_set if item[0] not in low_recall_label]
        self.grocery.train(new_train_set)
        new_test_result = self.grocery.test(new_test_set)
        print '-- Accuracy after training, with low-recall labels (less than', str(prune_threshold * 100), '%) pruned --'
        print 'Accuracy, A-1:', new_test_result
        return self.grocery, new_test_result

    def manualEvaluation(self, *args, **kwargs):
        n_docs = int(kwargs["n_docs"])
        excluded_labels = kwargs["excluded_labels"]
        excluded_docs = kwargs["excluded_docs"]
        train_data = []
        with open(self.kwargs["train_src"], 'rb') as f:
            for line in f:
                try:
                    line.split('\t', 1)[1]
                except IndexError:
                    continue
                else:
                    train_data.append((line.split('\t', 1)[0],
                                       line.split('\t', 1)[1].split('\n', 1)[0]))
        train_data = [item for item in train_data if item[0] not in excluded_labels]
        train_data = [i for i in train_data if i[1] not in excluded_docs]
        n = len(train_data)  # number of rows in your dataset
        indices = range(n)
        indices = shuffle(indices)
        test_set = map(lambda x: train_data[x], indices[0:n_docs])
        g = self.loadTrainModel()
        test_result = g.test(test_set)
        return test_set, test_result

    def saveTrainModel(self, *args, **kwargs):
        self.grocery.save()
        os.rename(self.PREFIX + self.grocery_name + '_train.svm',
                  self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return

    def loadTrainModel(self, *args, **kwargs):
        os.rename(self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm',
                  self.PREFIX + self.grocery_name + '_train.svm')
        self.grocery.load()
        os.rename(self.PREFIX + self.grocery_name + '_train.svm',
                  self.PREFIX + self.MODEL_DIR + self.grocery_name + '_train.svm')
        return self.grocery

    def predict(self, line, **kwargs):
        tag = self.grocery.predict(line)
        return tag

    def test(self, *args, **kwargs):
        test_src = str(kwargs["test_src"])
        test_result = self.grocery.test(test_src)
        print "Total Accuracy", test_result
        return test_result
class JdParserTop(object):
    def __init__(self):
        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、\.,。!!@()\r\n\(\)\-\+ - ]")
        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()
        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\- ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]")
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")
        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(
            u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")
        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")
        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
            医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")
        self.degreedic = set([line.strip() for line in codecs.open(base_dir + '/data/degrees.txt', 'rb', 'utf-8')])
        self.majordic = set([line.strip() for line in codecs.open(base_dir + '/data/majordic.txt', 'rb', 'utf-8')])
        self.skilldic = set([line.strip() for line in codecs.open(base_dir + '/data/skills.txt', 'rb', 'utf-8')])
        self.jobdic = set([line.strip() for line in codecs.open(base_dir + '/data/jobnames.txt', 'rb', 'utf-8')])
        jieba.load_userdict(base_dir + '/data/majordic.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')

    def clean_line(self, line):
        """Strip punctuation and numbering from both ends of a sentence."""
        line = self.CLEAN_LINE.sub("", line).strip()
        line = re.sub("\s+|^/d+[;’、,/。\.]", "", line)
        return line

    def clean_cnNum(self, line):
        """When extracting years of experience, map Chinese numerals (一二三...) to 1, 2, 3..."""
        line = unicode(line)
        a = [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十", u"两"]
        b = range(1, 11) + [2]
        table = dict((ord(aa), bb) for aa, bb in zip(a, b))
        return line.translate(table)

    def line2vec(self, line):
        """Convert a sentence to a vector by summing word2vec embeddings."""
        vec = np.zeros(50)
        for word in jieba.cut(line):
            if word in self.w2v.vocab:
                vec += self.w2v[word]
        return vec

    def clean_jobname(self, jobname):
        """Normalize a job title against the job-title dictionary (best LCS match)."""
        if jobname.lower() in self.jobdic:
            return jobname.lower()
        else:
            res = [(lcs_len(jobname, job), job) for job in self.jobdic]
            res.sort()
            return res[-1][1]
##########################################
# init
model_choose = "svm"  # svm, lda, rnn
grocery_name = "./SVM_models/svm_for_news"
corpus_path = "./Corpus/NewsClassCorpus/"
file_path = "./"
file_name = "post.txt"
t_text = delete_stop_words(codecs.open(file_path + file_name, encoding="UTF-8").read())
###########################################
# classify with the SVM model
if model_choose == "svm":
    tic = time.time()
    grocery = Grocery(grocery_name)
    grocery.load()
    t_pre_result = grocery.predict(delete_stop_words(t_text))
    toc = time.time()
    t_label = t_pre_result.predicted_y
    print("Sentiment: ", t_label)
    print("How much: ", t_pre_result.dec_values[t_label])
    print("Elapsed time of predict is: %s s" % (toc - tic))
elif model_choose == "lda":
    pass
elif model_choose == "rnn":
    pass
else:
    print("")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import MySQLdb
from tgrocery import Grocery
import sys
reload(sys)
sys.setdefaultencoding('utf8')

grocery = Grocery('sample')
dict_list = list()
conn = MySQLdb.connect(host='localhost', db='newsdata', user='******',
                       passwd='root', charset='utf8', use_unicode=False)
cur = conn.cursor()
cur.execute('select com_new_type_id, com_new_name from tbl_new where com_new_type_id is not null')
for row in cur.fetchall():
    dict_list.append(row)
grocery.train(dict_list)
grocery.save()
news_grocery = Grocery('sample')
news_grocery.load()
while True:
    result = news_grocery.predict(raw_input('please input title:'))
    print result
def load_first_classifier(model_path):
    new_grocery = Grocery(model_path)
    new_grocery.load()
    return new_grocery
class JdParserTop(object):
    def __init__(self):
        self.result = OrderedDict()
        self.result["jdFrom"] = ""
        self.result["pubTime"] = ""
        inc_keys = [
            "incName", "incScale", "incType", "incIndustry", "incLocation",
            "incUrl", "incStage", "incAliasName", "investIns", "incContactInfo",
            "incCity", "incZipCode", "incContactName", "incIntro"
        ]
        job_keys = [
            "jobType", "jobPosition", "jobCate", "jobSalary", "jobWorkAge",
            "jobDiploma", "jobNum", "jobWorkCity", "jobWorkLoc", "jobWelfare",
            "age", "jobEndTime", "email", "gender", "jobMajorList", "jobDesc"
        ]
        others_keys = [
            "keyWords", "isFullTime", "jdRemedy", "posType", "urgent",
            "holidayWelfare", "livingWelfare", "salaryCombine", "socialWelfare",
            "trafficWelfare", "jobDepartment", "jobReport", "jobReportDetail",
            "jobSubSize", "language", "overSea"
        ]
        jdInc = OrderedDict()
        for k in inc_keys:
            jdInc[k] = ""
        self.result["jdInc"] = jdInc
        jdJob = OrderedDict()
        for k in job_keys:
            jdJob[k] = ""
        self.result["jdJob"] = jdJob
        others = OrderedDict()
        for k in others_keys:
            others[k] = ""
        self.result["others"] = others
        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、%\.,/。!!@()\r\n\(\)\-\+ - `]")
        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()
        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\- ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]")
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")
        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(
            u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")
        self.MAIL = re.compile(u"\w+@[\w\.]+")
        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")
        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\
            医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")
        self.degreedic = set([line.strip() for line in codecs.open(base_dir + '/data/degrees.txt', 'rb', 'utf-8')])
        self.majordic = set([line.strip() for line in codecs.open(base_dir + '/data/majordic.txt', 'rb', 'utf-8')])
        self.skilldic = set([line.strip() for line in codecs.open(base_dir + '/data/skills.txt', 'rb', 'utf-8')])
        self.jobdic = set([line.strip() for line in codecs.open(base_dir + '/data/jobnames.txt', 'rb', 'utf-8')])
        self.citydic = set([line.strip() for line in codecs.open(base_dir + '/data/citydic.txt', 'rb', 'utf-8')])
        self.province_city = set([line.strip() for line in codecs.open(base_dir + '/data/province_city.txt', 'rb', 'utf-8')])
        self.SALARY = re.compile(u'万')
        jieba.load_userdict(base_dir + '/data/majordic.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')
        jieba.load_userdict(base_dir + '/data/citydic.txt')
        jieba.load_userdict(base_dir + '/data/province_city.txt')

    def refresh(self):
        self.result = OrderedDict()
        self.result["jdFrom"] = ""
        self.result["pubTime"] = ""
        inc_keys = [
            "incName", "incScale", "incType", "incIndustry", "incLocation",
            "incUrl", "incStage", "incAliasName", "investIns", "incContactInfo",
            "incCity", "incZipCode", "incContactName", "incIntro"
        ]
        job_keys = [
            "jobType", "jobPosition", "jobCate", "jobSalary", "jobWorkAge",
            "jobDiploma", "jobNum", "jobWorkCity", "jobWorkLoc", "jobWelfare",
            "age", "jobEndTime", "email", "gender", "jobMajorList", "jobDesc"
        ]
        others_keys = [
            "keyWords", "isFullTime", "jdRemedy", "posType", "urgent",
            "holidayWelfare", "livingWelfare", "salaryCombine", "socialWelfare",
            "trafficWelfare", "jobDepartment", "jobReport", "jobReportDetail",
            "jobSubSize", "language", "overSea"
        ]
        jdInc = OrderedDict()
        for k in inc_keys:
            jdInc[k] = ""
        self.result["jdInc"] = jdInc
        jdJob = OrderedDict()
        for k in job_keys:
            jdJob[k] = ""
        self.result["jdJob"] = jdJob
        others = OrderedDict()
        for k in others_keys:
            others[k] = ""
        self.result["others"] = others

    def clean_line(self, line):
        """Strip punctuation and numbering from both ends of a sentence."""
        line = self.CLEAN_LINE.sub("", line).strip()
        line = re.sub("\s+|^/d+[;’、,/。\.]", "", line)
        return line

    def clean_cnNum(self, line):
        """When extracting years of experience, map Chinese numerals (一二三...) to 1, 2, 3..."""
        line = unicode(line)
        a = [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十", u"两"]
        b = range(1, 11) + [2]
        table = dict((ord(aa), bb) for aa, bb in zip(a, b))
        return line.translate(table)

    def line2vec(self, line):
        """Convert a sentence to a vector by summing word2vec embeddings."""
        vec = np.zeros(50)
        for word in jieba.cut(line):
            if word in self.w2v.vocab:
                vec += self.w2v[word]
        return vec

    def clean_jobname(self, jobname):
        """Normalize a job title against the job-title dictionary (best LCS match)."""
        print jobname
        if jobname.lower() in self.jobdic:
            return jobname
        else:
            res = [(lcs_len(jobname, job), job) for job in self.jobdic]
            res.sort()
            return res[-1][1]

    def desc_extract(self, soup):
        line_list = soup.find_all("p")
        return '\n'.join([line.get_text() for line in line_list])

    # strip <img> tags, runs of 1-7 spaces, and &nbsp; remnants
    removeImg = re.compile('<img.*?>| {1,7}| ')
    # remove hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD = re.compile('<td>')
    # turn single or double <br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # strip all remaining tags
    removeExtraTag = re.compile('<.*?>')
    # collapse runs of blank lines
    removeNoneLine = re.compile('\n+')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        # strip() removes leading/trailing residue
        return x.strip()
class JdParser(object):
    def __init__(self):
        self.degreedic = set(line.strip() for line in codecs.open('./data/degrees.txt', 'rb', 'utf-8'))    # degree vocabulary
        self.majordic = set(line.strip() for line in codecs.open('./data/majordic.txt', 'rb', 'utf-8'))    # major vocabulary
        self.citydic = set(line.strip() for line in codecs.open("./data/citydic.txt", 'rb', 'utf-8'))      # city vocabulary
        self.firmnames = set(line.strip() for line in codecs.open('./data/firm.txt', 'rb', 'utf-8'))       # company abbreviations
        self.jobdic = set(line.strip() for line in codecs.open('./data/jobposition.txt', 'rb', 'utf-8'))   # job-title vocabulary
        self.skills = set(line.strip() for line in codecs.open('./data/skills.txt', 'rb', 'utf-8'))
        # self.wordlisttf = pickle.load(open('./data/wordlist.pkl'))       # 2000 most frequent words
        # self.w2vdict = json.load(open('./data/word2vec_50.json'))        # word2vec vectors for those 2000 words
        self.clf = Grocery("jdclf")  # sentence classifier: demand / duty / other
        self.clf.load()
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验|工作年限|工作经历|项目经[历验]|\d年经[历验]|.{1,2}年相关工作经验")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|厂|研究中心|研究所|学校|旅行社|中心/s|分?公司|研发中心|技术部|事.部|招聘|商务平台)")
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d+岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业|互联网|创业型|国企|央企")
        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
            文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
            |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
            |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|学历|经验|喜欢|较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|为.+提出|日常.+工作|指导|对.+进行|为.+提供|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同|完成|沟通|需求|秘书.{2,5}翻译")
        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
            |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动")
        self.SPLIT_JD = re.compile(
            u"岗位[【(]?[一二三四五六七八九][】)][::\s]|(^招聘岗位\S+|岗位\d|岗位[一二三四五六])[::\s]")
        self.CLEAR_NUM = re.compile(u"^\d[\.: :。、]|^[\((【]?\d[\))】\.]")
        self.CLEAR_COLO = re.compile(u"^[\s\.。)(【】,,]|[。;,\.;,]$|^\d[\.]")
        self.SKILL = re.compile(
            u"精通|了解|熟练|熟悉|掌握|懂得|优先|具备|具有|者优先|擅长|善于|较强的.{2,6}能力|良好的|有.+经验|能力|极强的")
        jieba.load_userdict('./data/majordic.txt')
        jieba.load_userdict('./data/skills.txt')
        jieba.load_userdict('./data/firm.txt')
        jieba.load_userdict('./data/degrees.txt')
        jieba.load_userdict('./data/benefits.txt')
        self.jdStr = ""
        self.linelist = []
        self.lineindex = defaultdict(int)
        self.result = OrderedDict()

    # split into sentences and preprocess
    def preprocess(self, jdstr):
        self.result.clear()
        jdstr = re.sub(u"[【】◆ \u25cf\u25c6\u2605]", "", jdstr.decode('utf-8'))
        self.linelist = [line.strip() for line in jdstr.split('\n') if len(line) > 1]
        self.jdStr = '\n'.join(self.linelist)
        for line in self.linelist:
            # print self.clf.predict(line), '\t', line
            self.lineindex[re.sub(u"[\s ]+", " ", line)] = 0

    def line2vec(self, line):
        vec = np.zeros(50)
        cnt = 1
        for word in jieba.cut(line):
            if word in self.w2vdict:
                vec += self.w2vdict[word]
                cnt += 1
        vec = vec / cnt
        return vec

    # extract the gender requirement
    def regular_sex(self):
        res = set()
        for line in self.linelist:
            if self.clf.predict(line) == 'demand' or self.DEMAND.search(line):
                findsex = self.SEX.search(line)
                if findsex:
                    getsex = re.search(u"性别不限|男|女", line.replace(u"男女不限", u"性别不限"))
                    if getsex:
                        res.add(getsex.group())
                        break
        if res:
            self.result['sex'] = ' / '.join(res)
        else:
            self.result['sex'] = u'性别不限'

    # extract the age requirement
    def regular_age(self):
        res = ''
        for line in self.linelist:
            if re.search(u'\d{2}后', line):
                continue
            findage = self.AGE.search(line)
            if findage:
                age = re.findall(u'\d{2}', line)
                if len(age) >= 2:
                    res = '-'.join(age)
                elif len(age) == 1:
                    if re.search(u'以上|不低于', line):
                        res = age[0] + u'以上'
                    if re.search(u"不超过|不高于|以下", line):
                        res = age[0] + '以下'
                    if re.search(u"左右|大约|大概", line):
                        res = age[0] + '左右'
                break
        if len(res) < 2:
            res = u'年龄不限'
        self.result['age'] = res
        return res

    # extract the major requirement
    def regular_major(self):
        res = []
        for line in self.linelist:
            findmajor = re.search(u"专业要求[::\s]", line)
            if findmajor:
                print 'major demand', line
                items = self.clean_line(line[findmajor.span()[1]:]).split()
                items = filter(lambda x: x not in self.degreedic and not re.search(u"薪酬|经验|元|\d+|月", x), items)
                res.append(' / '.join(items))
                break
        if not res:
            for line in self.linelist:
                if re.search(u"专业.限|.限专业", line) and not re.search(u"专业优先", line):
                    res.append(u"专业不限")
                    print 'major demand', line
                    break
                else:
                    findmajor = self.MAJOR.search(line)
                    if findmajor:
                        majoritem = re.split(u'[\s,,;; ]', findmajor.group())
                        for item in majoritem:
                            if re.search(u'学历|年龄|岁|学校|公司|性格|具有|具备|能够|经验|有|毕业|性别|男|女', item):
                                continue
                            print 'major item', item
                            if self.BENEFIT.search(line):
                                continue
                            print 'major item', item
                            if re.search(u"专业", item) and len(item) < 3:
                                continue
                            res.append(self.clean_line(item))
                        break
        if not res:
            for majorword in jieba.cut(line):
                if majorword in self.majordic or majorword[:-2] in self.majordic:
                    res.append(majorword)
        if re.search(u"[等及类]?相关专业", self.jdStr) and len(res) == 1:
            res[0] += u"等相关专业"
        if not res:
            res.append(u"专业不限")
        self.result['major'] = res

    # extract the degree requirement
    def regular_degree(self):
        """Look for explicit degree keywords first; fall back to tokenizing and matching the dictionary."""
        degree = [u'小学', u'初中', u'中专', u'中技', u'高中', u'专科', u'大专',
                  u'本科', u'硕士', u'博士', u'博士后']
        res = set()
        for line in self.linelist:
            finddegree = re.search(u"学历要求[::\s]", line)
            if finddegree:
                items = self.clean_line(line[finddegree.span()[1]:]).split()
                items = filter(lambda x: not re.search(u"薪酬|经验|元|月|年|\d+", x), items)
                res.add(' / '.join(items))
                break
        if not res:
            for line in self.linelist:
                if re.search(u"学历不限|学历要求不限|不限学历", line):
                    res.add(u"学历不限")
                    break
                else:
                    finddegree = self.DEGREE.search(line)
                    if finddegree:
                        res.add(finddegree.group())
                        break
        # if no explicit requirement was found, tokenize the whole text and match the dictionary
        if len(res) == 0:
            for word in jieba.cut(self.jdStr):
                if word in self.degreedic:
                    res.add(word)
        res = list(res)
        if len(res) == 1 and re.search(u'[及或]?以上', res[0]):
            tmp = res[0][:2]
            if tmp == u'全日':
                tmp = u'本科'
            elif tmp == u'研究':
                tmp = u'硕士'
            if tmp in degree:
                idx = degree.index(tmp)
                res = degree[idx:]
        self.result['degree'] = ' / '.join(res)

    # extract the years-of-experience requirement
    def regular_exp(self):
        cnyear = u'[半一二三四五六七八九十两]年|\d-\d{1,2}年|\d年及?以上|不少于\d年|\d年'
        res = set()
        jdStr = self.jdStr
        findexp = re.search(u'经验不限|(经验)?\d{1,2}年及以上|经验\d-\d{1,2}年', jdStr)
        if findexp:
            res = findexp.group()
            self.result['exp'] = res.replace(u"经验", "")
            return res
        findexp = self.EXP.search(jdStr)
        if findexp:
            pos = findexp.span()[1]
            jdStr = jdStr[max(0, pos - 25):min(pos + 15, len(jdStr))]
            exp = re.search(cnyear, jdStr)
            if exp:
                res.add(exp.group())
        if not res:
            exp = re.search(
                u"(\d-)?\d{1,2}年(工作|开发|项目)?经[历验]|(不少于)?([半\d]年)及?(以上)?经[历验]|经[历验]\s?(\d-)?\d{1,2}年",
                ' '.join(self.regular_jobtag()))
            if exp:
                res.add(exp.group())
            else:
                exp = re.search(cnyear, ' '.join(self.regular_jobtag()))
                if exp:
                    res.add(exp.group())
        self.result["exp"] = "-".join(res)
        self.result["exp"] = self.result['exp'].replace(u'经验', "").replace(u"经历", "")
        return res

    def regular_jobtag(self):
        """Job-tag information (full-time / intern, headcount, label lines)."""
        res = []
        job_tag = re.search(u"应届生|全职|兼职|实习生|应届毕业生|社招|急招|急聘", self.jdStr)
        if job_tag:
            res.append(job_tag.group())
        job_tag = re.search(u"招聘人数[::]?|招聘[::\s]|人数[::\s]", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split():
                if len(line.strip()) < 1:
                    continue
                else:
                    num = re.search(u"(\d+\-)?\d+人?|若干|\d+位", line)
                    if num:
                        res.append(u"招聘人数:" + num.group())
                    break
        job_tag = re.search(u"(职能类别|职位标签)[:: ]?", self.jdStr)
        if job_tag:
            jdstr = self.jdStr[job_tag.span()[1]:]
            for line in jdstr.split('\n'):
                if len(line.strip()) < 3:
                    continue
                else:
                    res.append("职业标签:" + line.strip())
                    break
                if len(line) > 25:
                    break
        # per product requirements, slice out experience-related sentences with finer-grained splitting
        linelist = [line for line in re.split(u"[,。;\s]", self.jdStr) if 5 < len(line) < 15]
        for line in linelist:
            if re.search(u"经验", line) and not re.search(u"月薪|地点|日期", line):
                if re.search(u"\d+k|[。?)\)\]]", line):
                    continue
                res.append(self.clean_line(line))
                break
        self.result["job_tag"] = res
        return res

    # strip leading numbering and punctuation from a sentence
    def clean_line(self, line):
        line = self.CLEAR_NUM.sub("", line.strip())
        line = self.CLEAR_COLO.sub("", line)
        return line

    # extract the work location
    def regular_workplace(self):
        res = set()
        jdstr = self.jdStr
        pos = list(re.finditer(u"(工作地.|上班地.|实习地.|地址|地点)[::\s]", jdstr))
        if pos:
            jdstr = jdstr[pos[0].span()[1]:]
            for line in jdstr.split():
                if len(line.strip()) < 2:
                    continue
                if len(line) < 26:
                    res.add(line.strip().replace(":", "").replace(":", ""))
                else:
                    for city in jieba.cut(line):
                        if city in self.citydic and city[:-1] not in res:
                            res.add(city)
                break
        if not res:
            for city in jieba.cut(jdstr):
                if city in self.citydic and city[:-1] not in res and u"国" not in city:
                    res.add(city)
                    break
        self.result["workplace"] = " / ".join(res)
        return res

    # extract certificates, awards, and similar requirements
    def regular_cert(self):
        res = set()
        linelist = [line for line in re.split(u"[\s ,。;,]", self.jdStr) if len(line) > 3]
        for line in linelist:
            findcert = re.search(
                u"(\S+证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|职业资格|律师证|会计证",
                line)
            if findcert:
                res.add(findcert.group())
            else:
                findcert = re.search(u"有(.+证)书?", line)
                if findcert:
                    res.add(findcert.group(1))
                else:
                    findcert = re.search(u"有.+资格", line)
                    if findcert:
                        res.add(findcert.group())
        self.result['cert'] = re.sub(u"[或及以上]", "", ' / '.join(res))
        if self.result['cert']:
            self.result['cert'] = self.result['cert'].split(' / ')
        else:
            self.result['cert'] = []

    # extract skills using the skill vocabulary
    def regular_skill(self, num=6):
        res = []
        for line in self.linelist:
            if self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
                for word in jieba.cut(line):
                    word = strQ2B(word).lower()
                    if word in self.skills:
                        res.append(word)
        sorted_words = [w[0] for w in Counter(res).most_common(2 * num)]
        for word in jieba.cut(self.result['job_name']):
            word = strQ2B(word).lower()
            if word in self.skills and word not in sorted_words:
                sorted_words.insert(0, word)
        after_top3 = sorted_words[3:]
        np.random.shuffle(after_top3)
        self.result['skill'] = sorted_words[:3] + after_top3[:num - 3]

    # extract job duties
    def regular_duty(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DUTY.finditer(jdStr))
        if len(pos) > 0:
            linelist = [re.sub("[\s ]+", " ", line)
                        for line in jdStr[pos[-1].span()[1]:].split("\n") if len(line) > 2]
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DUTY.search(line) or self.lineindex[line] == 1 or (
                        re.search(u".年来|谢谢|请在|公司介绍|举报|收藏|岗位职责", line)
                        and not re.search(u"了解", line)):
                    continue
                if re.search(u"要求[::\s]?|岗位要求", line) and len(line) < 6:
                    break
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]", line.strip()) \
                        or self.DUTY.search(line) or self.clf.predict(line) == 'duty':
                    res.append(line.strip())
                elif i < len(linelist) - 1 and self.clf.predict(linelist[i + 1]) == 'duty':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                if re.search(u"粉丝团", line) and len(line) < 12:
                    continue
                if self.DUTY.search(line) and self.clf.predict(line) == "duty":
                    if self.lineindex[line] != 1:
                        res.append(line)
        self.result["duty"] = "\n".join(res)
        for line in res:
            self.lineindex[line] = 1
        return res

    # extract job requirements
    def regular_demand(self):
        res = []
        jdStr = self.jdStr
        pos = list(self.START_DEMAND.finditer(jdStr))
        if len(pos) > 0:
            tmppos = pos[-1].span()[0]
            if re.search(u"具有|具备", jdStr[tmppos - 5:tmppos + 5]) or re.search(u"证书|证", jdStr[tmppos:tmppos + 8]):
                pos.pop()
            if pos:
                linelist = [re.sub("[\s ]+", " ", line)
                            for line in jdStr[pos[-1].span()[1]:].split("\n") if len(line) > 2]
            else:
                linelist = []
            for i in xrange(len(linelist)):
                line = linelist[i]
                if self.START_DEMAND.search(linelist[i]) or re.search(u"谢谢|请在|公司介绍|举报|收藏|\d+k?元|加分", line):
                    continue
                if re.match(u"\d{1,2}|\u25cf|[\uff0d(\(\-\+]|[a-z][\.、\s]", line) \
                        or self.DEMAND.search(line) or self.clf.predict(line) == 'demand':
                    res.append(line)
                elif i < len(linelist) - 1 and self.clf.predict(linelist[i + 1]) == 'demand':
                    res.append(line)
                else:
                    break
        if not res:
            for line in self.linelist:
                if self.lineindex[line] == 1 or len(line.split()) > 6:
                    continue  # skip lines that were already consumed elsewhere
                if self.clf.predict(line) == 'demand' or self.DEMAND.search(line):
                    res.append(line.strip())
        self.result['demand'] = '\n'.join(res)
        for line in res:
            self.lineindex[line] = 1
        return res

    # the job title being recruited for
    def regular_jobname(self):
        res = set()
        jdStr = self.jdStr
        findpos = re.search(u"(招聘岗位|招聘职位|职位名称|岗位名称|岗位[一二三四五六七八九])[:、:\s ]", jdStr)
        # if not findpos:
        #     findpos = re.search(u"(职位类别|职位职能)[::\s ]", jdStr)
        if findpos:
            pos = findpos.span()[1]
            linelist = jdStr[pos:].split("\n")
            for line in linelist:
                if len(line) < 2:
                    continue
                if len(line) >= 2 and len(line) < 20:
                    if re.search(u"职位描述|查看|地址|工作|分享|举报|下一条|时间|福利|待遇|周末|双休", line):
                        continue
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|[,。、\d!]+", "", line.strip()))
                break
        # if no explicit job-title line was found, scan line by line
        if not res:
            for line in self.linelist:
                if re.search(u"招聘|高薪|诚聘", line):
                    continue
                if len(line) < 6 and not re.search(u'岗位|岗位内容|工作内容|职责|任职|资格', line) \
                        and self.clf.predict(line) == 'job_name':
                    res.add(line)
                    break
                findPos = self.JOBNAME.search(line)
                if findPos and len(findPos.group()) < 20 and not re.match(u'\d', findPos.group()):
                    jobname = findPos.group()
                    res.add(re.sub(u"聘请|高薪诚聘|诚聘|急.|[,。、!]+", "", jobname))
                    break
                # res.add(re.sub(u"\(.+\)|(.+)|【.+】|[,。、\s\d]+|聘请|高薪诚聘|诚聘|急招|", "", line.strip()))
        # finally, tokenize and match against the job-title list
        if not res:
            for line in self.linelist:
                for word in jieba.cut(line.lower()):
                    if word in self.jobdic:
                        res.add(word)
            self.result["job_name"] = " / ".join(res)
            return res
        if not res:
            tag = re.search(u"实习生|兼职", self.jdStr)
            if tag:
                res.add(tag.group())
        self.result["job_name"] = strQ2B(" / ".join(res)).lower()
        return res

    # salary
    def regular_pay(self):
        pay = ""
        lagoup = re.search(
            u"(\d+[kK][-——]\d+[kK])|(\d{3,5}-\d{3,5}元?/[月日天])|(\d{3,5}-\d{3,5}元)|((\d+[-~]\d+)万.[年月])|底薪\d+(-\d+)?元?|\d{3,5}元(左右|以上)?|年薪\d+万?元(左右|以上)?",
            self.jdStr)  # for lagou.com postings, which carry no pay keywords such as 待遇
        if lagoup:
            pay = lagoup.group()
            self.result["pay"] = pay
            self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
            return pay
        findpay = self.PAY.search(self.jdStr)
        if findpay:
            pos = findpay.span()[1]
            jdstr = self.jdStr[max(0, pos - 5):min(pos + 10, len(self.jdStr))]
            if re.search(u"面议", jdstr):
                pay = u"面议"
            else:
                findpay = re.findall(u"\d{3,7}", jdstr)
                pay = "-".join(findpay)
        self.result["pay"] = pay.replace(u'k', '000').replace(u'K', '000')
        return pay

    # extract pay and benefits
    def regular_benefits(self):
        res = []
        jdStr = self.jdStr
        findpos = list(re.finditer(u"薪酬福利[::\s]|(福利|待遇)\s?[::]", jdStr))
        if not findpos:
            findpos = list(re.finditer(u"(晋升制度|工作环境|职位诱惑|你会获得什么)\s?[?\?::]", jdStr))
        if findpos:
            pos = findpos[-1].span()[1]
            linelist = jdStr[pos:].split('\n')
            for line in linelist:
                print 'benefits', line
                if len(line.strip()) < 3:
                    continue
                if re.match(ur"[((]?\d+", line) or self.BENEFIT.search(line):
                    res.append(line.strip())
                    self.lineindex[line.strip()] = 1
                else:
                    break
        if not res:
            for line in jdStr.split():
                if len(line) > 1 and re.search(u"带薪|双休|股票期权|五险一金|发展空间|福利|诱惑|休假|薪酬|补助|年假|弹性工作", line):
                    if re.search(u"福利|待遇|诱惑", line) and len(line.strip()) < 6:
                        continue
                    res.append(line.strip())
        if len(res) == 1 and re.search(u"险一金", res[0]) and not re.search(u"[,、]", res[0]):
            res[0] = self.clean_line(' '.join(jieba.cut(res[0])))
        self.result["benefits"] = "\n".join(res)
        return res
# -*- coding: utf-8 -*-
import sys
reload(sys)
sys.path.append('../../')
from config import *
from tgrocery import Grocery

STOP_WORDS_FILE = 'stopwords.txt'
USER_DICT_FILE = 'user_dict.txt'

model_fintext = Grocery('model_fintext')
model_fintext.load()

sys.path.append('../')
from get_es import *

es = Elasticsearch([{'host': ES_HOST, 'port': ES_PORT}])


def search(index_name):
    es_search_options = set_search_optional()
    es_result = get_search_result(es_search_options, index=index_name)
    # final_result = get_result_list(es_result)
    # return final_result
    return es_result


def get_result_list(es_result):
    final_result = []
    for item in es_result:
        final_result.append(item['_source'])
    return final_result
def test_sample(path, test_path):
    new_grocery = Grocery(path.encode('utf-8'))
    new_grocery.load()
    test_path = os.path.join(BASE_DIR, 'learn', test_path)
    res = new_grocery.test(test_path.encode('utf-8'))
    return str(res)
import rospy
from tgrocery import Grocery
from nlu.srv import *
import jieba
import jieba.posseg as pseg

model = '/home/hntea/ros-speech/nlu-model/model'
new_grocery = Grocery(model)


def handle_is_music(req):
    # print "Request = [%s]" % (req.topic)
    label = str(new_grocery.predict(req.topic))
    if label == 'music':
        ret = 1
    else:
        ret = 0
    return ret


def is_music_server():
    rospy.init_node('nlu_is_music')
    s = rospy.Service('is_music', IsMusic, handle_is_music)
    print "Service is ready"
    rospy.spin()


if __name__ == "__main__":
    print "Service: loading model..."
    new_grocery.load()  # load the trained model
    is_music_server()
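A matching client-side sketch; it assumes the IsMusic .srv request has a single `topic` field, as handle_is_music implies, and that the service node above is running:

import rospy
from nlu.srv import IsMusic

rospy.wait_for_service('is_music')
is_music = rospy.ServiceProxy('is_music', IsMusic)
resp = is_music(u'来一首周杰伦的歌')  # fills the `topic` request field, per the handler above
print resp  # single integer response; the exact field name depends on the .srv definition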
# -*- coding: utf-8 -*-
# Classification model
# Author: Alex
# Created Time: 2016-12-30 Friday 14:18:58
from tgrocery import Grocery

gr = Grocery("test")
gr.load()

lessVal = 0.2

# stats counters
statDict = {
    "total": 0,    # total number of classifications
    "notEq": 0,    # title and content predictions disagree
    "less": 0,     # predictions below the confidence threshold
    "title": 0,    # times the title's prediction was used
    "content": 0   # times the content's prediction was used
}


def getStatDict():
    return statDict


def classify(title, content):
    """
    Classifier
    :return cat: the predicted category
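The classify() snippet above is cut off after its docstring; a speculative sketch of the title/content reconciliation that its statDict and lessVal suggest might look like this (hypothetical, not the original body):

def classify(title, content):
    statDict["total"] += 1
    t_pred, c_pred = gr.predict(title), gr.predict(content)
    t_label, c_label = t_pred.predicted_y, c_pred.predicted_y
    if t_label != c_label:
        statDict["notEq"] += 1
    t_conf = t_pred.dec_values[t_label]
    c_conf = c_pred.dec_values[c_label]
    if max(t_conf, c_conf) < lessVal:
        statDict["less"] += 1  # both predictions fall below the threshold
    if t_conf >= c_conf:
        statDict["title"] += 1
        return t_label
    statDict["content"] += 1
    return c_label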
# -*- coding: utf-8 -*-
import csv, codecs
from tgrocery import Grocery
import preprocessing as pp

testFileName = '../data/test.txt'
outputFileName = '../output/upload.csv'

# test ##################################
# grocery = Grocery('sample')
grocery = Grocery('version1.0')
grocery.load()
print 'start test'
filetest = codecs.open(testFileName, 'r', 'utf-8')
test_reader = filetest.readlines()
fileOutput = codecs.open(outputFileName, 'w', 'utf-8')
i = 0
for line in test_reader:
    content = pp.getcontent(test_reader, i)
    i = i + 1
    # if (i > 10):
    #     break
    if (i % 5000 == 0):
        print ("%d " % (i)) + '#' * 30
    if (content == ''):
        print "test.py#" * 3 + line
class JdCRF(object):
    def __init__(self):
        self.data = []
        self.clf = Grocery("jdclf")
        self.clf.load()
        self.SEX = re.compile(u"性别不限|性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.DEGREE = re.compile(u"(全日制)?(初中|高中|中专|大专|专科|大学专科|中职|本科|大学本科|硕士|研究生|博士|博士后)(.?以上)?")
        self.MAJOR = re.compile(u"\S+(相关专业|专业优先|及其.专业|[类等]专业[优先]?)")
        self.EXP = re.compile(u"工作经验:|工作经[历验]|工作年限|年.{0,4}经[历验]|经[历验].{1,6}年")
        self.PUB_TIME = re.compile(u"(\d+)(天前发布)")
        self.INCNAME = re.compile(u"\S+(有限公司|酒店|银行|集团|研究中心|研究所|学校|旅行社|分?公司|研发中心|技术部|事.部|招聘)")
        self.NOT_INC = re.compile(u"职位|描述|收藏|推荐|地址|邮箱|主页|介绍|欢迎|加入|要求|简介|险一金|奖金|包吃住|社区|厂房|人员|职责")
        self.INCTAG = re.compile(
            u"大公司|五百强|全球500强|小公司|成长型公司|创业公司|私有经济|集体经济|集团|外企|已上市|稳定性高|平均年龄\d岁|妹纸多|学历高|福利待遇好|晋升机会大|民营公司|民营企业\
            |互联网|创业型|国企|央企")
        self.JOBNAME = re.compile(
            u'\S*(研发工程师|工程师|经理|助理|顾问|前台|秘书|主管|研究员|实习生|操作员|专员|教学人员|技术人员|管理员|业务员|公关|程序员|教师|老师|培训生|\
            文员|研究员|策划|主任|总监|设计师|分析师|架构师|摄影师|编辑|BD|游戏UI|Android(开发)?|PHP(开发)?|Python(开发)?|.?(急招|急聘|初级|中级|高级|方向).?[\s)】\)])|\
            |行政人事|网店设计|客服|会计|电话销售|外贸跟单|web前端|游戏UI|后.开发|产品运营|商业数据分析')
        self.START_DEMAND = re.compile(
            u"(岗位要求|应聘条件|任职要求|岗位资格|任职资格|岗位条件|工作要求|任职条件|人员条件|职位.求|职位条件|职位描述|岗位资格|职位资格|具备条件)[::\s]\
            |如果你.{0,10}[::\s]|我们希望你.{0,12}[::\s]|(要求|条件)[::\s]|你需要?具备什么.+[?\?::\s]|任职资格[::\s]")
        self.DEMAND = re.compile(
            u"熟悉|熟练|具有|善于|懂得|掌握|具备|能够|优先|不少于|不超过|至少|团队.作|良好的|工作经验|开发经验|实习经历|能力强|富有|以上学历|经验|喜欢|\
            较强的.{2,8}能力|相关专业|相关学历|者优先|精通|了解|及以上|技术全面|.强的责任心|[能有]独立|英文流利")
        self.DUTY = re.compile(
            u"跟进|协助|负责|配合|其他工作|领导交办的|对.+提供|审核|参与|提出|跟踪|报告|为.+提出|日常.+工作|指导|跟进|拓展|运营|用户|客户|协调|拟写|通过|协同\
            |完成|沟通|需求|秘书.{2,5}翻译")
        self.START_DUTY = re.compile(
            u"(岗位职责|岗位描述|职位描述|职责描述|任职描述|职位职责|工作职责|工作职能|职位职能|工作内容|实习内容|职位内容)[::\s]|做这样的事[::\s]|职责.{0,5}[::\s]")
        self.PAY = re.compile(u"薪酬|待遇|月薪|薪资|年薪|底薪|\d+k|\d+万|\d+元|工资|报酬|薪水|福利")
        self.BENEFIT = re.compile(
            u"周休|补助|补贴|假日|餐补|提成|交通补助|食宿|加班工资|期权|年假|领导|扁平化|管理|氛围|空间|休假|月假|带薪|全休|晋升|培训|舒适的|旅游|奖励|过节费|五险一金|奖金|\
            |弹性工作|氛围|成长空间|实训|培训|高薪|前景|旅游|活动|分红")

    def gen_data(self, fname='./data/lagou_train.txt'):
        fw = codecs.open('./data/jd_train_crf.txt', 'wb', 'utf-8')
        cnt = 1
        for line in codecs.open(fname, 'rb', 'utf-8'):
            if line.startswith(u"====="):
                fw.write(line)
                continue
            cnt += 1
            if len(line.strip()) > 1:
                pred = self.clf.predict(line)
                newline = pred + '\t\t' + line.strip() + '\t\t' + str(len(line)) + "\n"
                fw.write(newline)
        print cnt
        print 'done'

    def load_data(self, fname="./data/jd_train_crf.txt"):
        data = []
        tmp = []
        for line in codecs.open(fname, 'rb', 'utf-8'):
            if line.startswith(u"===="):
                data.append(tmp)
                tmp = []
                continue
            else:
                tag_data = line.strip().split('\t\t')
                if len(tag_data) == 3:
                    tmp.append(tuple(tag_data))
                else:
                    print '\t '.join(tag_data)
        n = len(data) / 2
        print 'train data', n
        print 'test data', len(data) - n
        return data[n:], data[:n]

    def word2features(self, sent, i):
        word = sent[i][0]
        postag = sent[i][1]
        # note: the parentheses around the conditional expressions matter; without them
        # 'x=%s' % '1' if cond else '0' collapses to plain '0' when cond is false
        features = [
            'bias',
            'word.lower=' + word.lower(),
            'word[:2]=' + word[:2],
            'word.isdigit=%s' % word.isdigit(),
            'postag=' + postag,
            'demand=%s' % ('1' if self.DEMAND.search(word) else '0'),
            'start_demand=%s' % ('1' if self.START_DEMAND.search(word) else '0'),
            'start_duty=%s' % ('1' if self.START_DUTY.search(word) else '0'),
            'duty=%s' % ('1' if self.DUTY.search(word) else '0'),
            'jobname=%s' % ('1' if self.JOBNAME.search(word) else '0'),
            'incname=%s' % ('1' if self.INCNAME.search(word) else '0'),
            'benefit=%s' % ('1' if self.BENEFIT.search(word) else '0'),
            'pred=%s' % self.clf.predict(word)
        ]
        if i > 0:
            word1 = sent[i - 1][0]
            postag1 = sent[i - 1][1]
            features.extend([
                '-1:postag=' + postag1,
                '-1:word.islower=' + word1[:3].lower(),
                '-1:start_demand=%s' % ('1' if self.START_DEMAND.search(word1) else '0'),
                '-1:start_duty=%s' % ('1' if self.START_DUTY.search(word1) else '0'),
                '-1:demand=%s' % ('1' if self.DEMAND.search(word1) else '0'),
                '-1:duty=%s' % ('1' if self.DUTY.search(word1) else '0'),
                '-1:jobname=%s' % ('1' if self.JOBNAME.search(word1) else '0'),
                '-1:incname=%s' % ('1' if self.INCNAME.search(word1) else '0'),
                '-1:benefit=%s' % ('1' if self.BENEFIT.search(word1) else '0'),
                '-1:pred=%s' % self.clf.predict(word1),
            ])
        else:
            features.append('BOS')
        if i < len(sent) - 1:
            word1 = sent[i + 1][0]
            postag1 = sent[i + 1][1]
            features.extend([
                '+1:word.lower=' + word1[:3].lower(),
                '+1:word.istitle=%s' % word1.istitle(),
                '+1:word.isupper=%s' % word1.isupper(),
                '+1:postag=' + postag1,
                '+1:postag[:2]=' + postag1[:2],
                '+1:start_demand=%s' % ('1' if self.START_DEMAND.search(word1) else '0'),
                '+1:start_duty=%s' % ('1' if self.START_DUTY.search(word1) else '0'),
                '+1:demand=%s' % ('1' if self.DEMAND.search(word1) else '0'),
                '+1:duty=%s' % ('1' if self.DUTY.search(word1) else '0'),
                '+1:jobname=%s' % ('1' if self.JOBNAME.search(word1) else '0'),
                '+1:incname=%s' % ('1' if self.INCNAME.search(word1) else '0'),
                '+1:benefit=%s' % ('1' if self.BENEFIT.search(word1) else '0'),
                '+1:pred=%s' % self.clf.predict(word1),
            ])
        else:
            features.append('EOS')
        return features

    def sent2features(self, sent):
        return [self.word2features(sent, i) for i in range(len(sent))]

    def sent2labels(self, sent):
        return [label for (label, token, postag) in sent]

    def sent2tokens(self, sent):
        return [token for (label, token, postag) in sent]

    def train(self, x_train, y_train):
        assert len(x_train) == len(y_train), "not the same %d %d" % (len(x_train), len(y_train))
        trainer = pycrfsuite.Trainer(verbose=False)
        for xseq, yseq in zip(x_train, y_train):
            trainer.append(xseq, yseq)
        trainer.set_params({
            'c1': 1.0,
            'c2': 1e-3,
            'max_iterations': 50,
            'feature.possible_transitions': True
        })
        trainer.train('jd_skill.crfsuite')

    def test(self, sent):
        tagger = pycrfsuite.Tagger()
        tagger.open('./jd_skill.crfsuite')
        print 'tokens   ', '\n '.join(self.sent2tokens(sent))
        print 'Predicted', '\t '.join(tagger.tag(self.sent2features(sent)))
        print 'Correct  ', '\t '.join(self.sent2labels(sent))
class XzParserTop(object): def __init__(self): self.result = OrderedDict() inc_keys = [ "jdFrom", "incName", "incAliasName", "incLogo", "incScale", "incType", "incIndustry", "incIntro", "incCity", "incLocation", "incZipCode", "incContactName", "incContactInfo", "incUrl" ] # job_keys = ["pub_time", "jobEndTime", "jobPosition", "jobCate", "jobSalary", "jobWorkAge","jobDiploma", "jobDesc", # "jobType","jobNum", "jobWorkCity","jobWorkLoc","jobWelfare", "jobMajorList", "age", "gender", "email", # "jobCVformat", "jobMinimumDays","jobSkill","jobCertificate"] jdInc = OrderedDict() for k in inc_keys: jdInc[k] = "" self.result["jdInc"] = jdInc jdJob = OrderedDict() # for k in job_keys: # jdJob[k] = "" self.result["jdJob"] = jdJob self.CLEAN_TEXT = re.compile( u"[^\u4e00-\u9fa5\w\d;::;,。、%\.,/。!!@()\r\n\(\)\-\+ - `]") self.clf = Grocery(base_dir + "/jdclf") self.clf.load() self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}") self.CLEAN_LINE = re.compile( u"^[\u2022(【\[\s\t\r\n\(\- ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]" ) self.CLEAN_JOBNAME = re.compile( u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]") self.SEX = re.compile(u"性别|男|女") self.JOB_TAG = re.compile(u"全职|实习") self.START_DEMAND = re.compile( u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?") self.START_DUTY = re.compile( u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]") self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]") self.DEMAND = re.compile(u"精通|掌握|熟悉|熟练|有.+经验") self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作") self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴") self.CERT = re.compile( u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|\ 医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书") self.degreedic = set([ line.strip() for line in codecs.open(base_dir + '/data/degrees.txt', 'rb', 'utf-8') ]) self.majordic = set([ line.strip() for line in codecs.open(base_dir + '/data/majordic_new.txt', 'rb', 'utf-8') ]) self.skilldic = set([ line.strip() for line in codecs.open(base_dir + '/data/skills.txt', 'rb', 'utf-8') ]) self.jobdic = set([ line.strip() for line in codecs.open(base_dir + '/data/jobnames.txt', 'rb', 'utf-8') ]) self.position = set([ line.strip() for line in codecs.open(base_dir + '/data/jobposition_new.txt', 'rb', 'utf-8') ]) self.position_prefix = set([ line.strip() for line in codecs.open( base_dir + '/data/jobposition_prefix.txt', 'rb', 'utf-8') ]) self.position_postfix = set([ line.strip() for line in codecs.open( base_dir + '/data/jobposition_postfix.txt', 'rb', 'utf-8') ]) self.citydic = set([ line.strip() for line in codecs.open(base_dir + '/data/citydic.txt', 'rb', 'utf-8') ]) self.province_city = set([ line.strip() for line in codecs.open(base_dir + '/data/province_city.txt', 'rb', 'utf-8') ]) self.city_area = set([ line.strip() for line in codecs.open(base_dir + '/data/city_area.txt', 'rb', 'utf-8') ]) jieba.load_userdict(base_dir + '/data/majordic_new.txt') jieba.load_userdict(base_dir + '/data/skills.txt') jieba.load_userdict(base_dir + '/data/jobposition.txt') jieba.load_userdict(base_dir + '/data/firm.txt') jieba.load_userdict(base_dir + '/data/degrees.txt') jieba.load_userdict(base_dir + '/data/benefits.txt') jieba.load_userdict(base_dir + '/data/citydic.txt') jieba.load_userdict(base_dir + '/data/province_city.txt') #new self.INTRO = re.compile(u"公司介绍|公司简介|企业简介|企业介绍|关于我们|单位简介|关于") self.JOBNAME_LINE = re.compile( 
u"岗位:|招聘岗位|实习生招聘|职位名称|招聘职位|实习岗位|岗位方向|定向岗位|岗位$|岗位名称") self.JOBNAME = re.compile( u".*?工程师|\S{2}专员$|\S{4,}岗$|工程师$|\S{4,}实习生招聘$|职位\d.*?分析|[^招聘]实习生$|研究员$\ |经理|.*?实习生[((].*?[))|培训生$]") self.CONTACTINFO = re.compile( u'联络方式|联络电话|固定电话|固话|电话|联系电话|QQ|联系方式|传真|Tel') self.CONTACTNAME = re.compile(u'联络|联系人$|联络人') self.NUMBER = re.compile( u'(?<=[^-Q/——_ 0123456789])([-/_ ——0123456789]{7,})') self.QQ = re.compile(u'QQ\d{6,}|QQ|qq') self.PUNCTION = re.compile(u'[~\],.;:: ,、。《》【】!#……<>;“”]') self.INC_URL = re.compile(u"(主页|网站|网址|官网):(.{0,5}[\w\d_/\.:\-]+)") self.MAIL = re.compile(u"\w+@[\w\.]+|\w+\.\w+@[\w\.]+|\w.*?at.*?com") self.FORMART = re.compile( u'(邮件|简历)名称?以?(主题|格式|标题){0,}为?[~\],.;::,、。()《》【】!#……()-<>;“”](.*)|\ ("|“){1,}(.*?姓名.*?)["”]|(“|"){1,}(.*?姓名.*?学校.*?)[”"]|("|“){1,}(\S{2,}-\S{2,}-\S{2,}.*?)[”"]|([ 姓名年级学校职位可入职时间-]{5,})' ) self.JOBLOC = re.compile(u"工作地址|上班地点|公司地址|工作地点|实习地点|总部地址|[^邮件](地址|地点)") self.MAJOR = re.compile(u"相关专业|等?专业|专业优先|以上学历|优先考虑|专业背景") self.AGE_LINE = re.compile(u"\d+周?岁|年龄|\d{2}岁") self.AGE = re.compile( u"\d{2}?\s?[\- -~到至]?\s?\d{2}周?岁|(至少|不低于|不超过|不大于|大概|大约|不少于|大于)\d+周?岁|\d+周?岁(以上|以下|左右|上下)" ) self.WEEKTIME = re.compile( u"(每|一)周(最好)?(至少|最少|保证|起码)?(实习|工作)?[\d一二三四五六七八九十].*?(天|日)(以上)?|实习天数[\d一二三四五六七](天|日)|\ (每|一)周.*?(最好|可到岗)?(至少|最少|保证|起码)?(实习|工作)?[\d一二三四五六七八九十].*?(天|日)(以上)?") self.JOBLENGTH = re.compile( u"(实习期)?(至少|保证|起码)?(工作)?[\d一二三四五六七八九十]个.*?月(或)?(以上)|\ 周期?.*?(\d个月[-―]{1,2}\d个月)|(实习期)?(至少|保证|起码)(工作)?[\d一二三四五六七八九十]个.*?月(以上)?|至少.{1,5}年(以上)?|(实习).{1,5}年以上" ) self.XUHAO = re.compile(u"[0123456789一二三四五六七八九]") self.JOBNUM_LINE = re.compile(u"招?(\d{1,}人)|招聘人数:|岗位人数|人数:") self.JOBNUM = re.compile( u"(\(| )?(\d{1,}[人名]?)|(\d{1,}[-_~]?\d{1,}[人名])") self.PHONE = re.compile( u'(?<!\d)\d{11,12}(?!\d)|\d{3,4} \d{7,8}|\d{3,4}-\d{7,8}|\d{3,4}—\d{7,8}|\d{3}-\d{3}-\d{4}|(?<!\d)\d{8}(?!\d) ' ) self.ENDTIME = re.compile( u"(\d.*?[号|日])之?.{0,5}前|截止时间(\d.*日)|截止日期(\d.*日)|(\d.*日)之前") self.DEGREE_LINE = re.compile(u"(最低)?学历要求|(以上)?学历") self.DEGREE = re.compile( u"(小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|Master)(?!优先)|(小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限|Master)$" ) self.SALARY = re.compile( u"\d{3,}元?\-?\d{3,}元|(本科|研究生|硕士)?\d{2,4}(元|RMB)/?(天|周|月|day|Day)|\dk[--]\dk" ) #找标题规则 self.FIRST = re.compile(u"一、|二、|三、|四、|五、|六、|七、|八、|九、") self.THIRD = re.compile(u"[\[【]\S{3,6}[\[】]|^\S{4,5}:$") #CNN模型 # self.model,self.word_idx_map,self.config = load_model() #创一个ac自动机用于关键词的匹配。 # builder = AcoraBuilder(list(self.position)) # self.ac = builder.build(builder) ''' 逐行去找信息 self.flag是标志位,主要处理抽取信息跨行的情况: 工作地址: 广东省深圳市XX路 self.extra_info有四个参数: 第一个line是一行文本(不带标签信息) 第二个idx是该行文本所在的行号 第三个add_desc是是否将该行文本加入职位描述信息 第四个clean_major为是否将之前得到的专业与技能信息进行清空 ''' def extra_info(self, line, idx=None, add_desc=True, clean_major=False): ''' Args: line (): 输入一行文本 idx (): 该行文本在段落中的行号 add_desc (): 是否将该行信息加入职位描述中,默认True clean_major (): 是否清空之前的优先专业与技能需求 Returns: ''' if self.jdType == "regular": jobName = deepcopy(self.jobName) elif self.jdType == "special": jobName = deepcopy(self.jobNameList[idx]) # 加入职位描述 if add_desc: if self.jdJob[jobName].has_key("jobDesc"): self.jdJob[jobName]["jobDesc"] += line + u'\n' else: self.jdJob[jobName]["jobDesc"] = line + u'\n' if clean_major: self.majorList = [] self.skillList = [] # 如果职位名发现人数情况 if self.JOBNUM.search(jobName): jobNum = re.search(u"\d{1,}人", jobName).group(0) jobName = jobName.split(jobNum)[0] self.jdJob[jobName]["jobNum"] = jobNum if self.flag == "workloc": self.jdJob[jobName]["jobWorkLoc"] = line self.flag = None if self.JOBLOC.search(line): print 'workloc' if (len(line) < 20 or 
                    len(line) > 100) and not re.search(u"[^城]市|路", line):
                pass
            elif line.count(u":") == 1:
                workloc = line.split(u":")[1]
                if len(workloc) > 60:
                    workloc = re.split(self.PUNCTION, workloc)[0]
                self.jdJob[jobName]["jobWorkLoc"] = workloc
                if not line.split(u":")[1].strip():
                    self.flag = "workloc"
            elif line.count(u":") > 1:
                for tag in filter(None, line.split(" ")):
                    if self.JOBLOC.search(tag):
                        if tag.count(u":") == 1:
                            self.jdJob[jobName]["jobWorkLoc"] = tag.split(u":")[1]
            # elif len(filter(None, self.PUNCTION.split(line))) > 1:
            #     self.jdJob[jobName]["jobWorkLoc"] = filter(None, self.PUNCTION.split(line))[-1]
            else:
                self.flag = "workloc"
        # majors and skills are matched against the dictionaries
        if self.DEMAND.search(line):
            word_split = jieba.cut(line, cut_all=True)  # segment, then look up in the skill dictionary
            print "demand"
            for word in word_split:
                word = word.lower()
                if word in self.skilldic:
                    self.skillList.append(word)
        if self.MAJOR.search(line):
            word_split = jieba.cut(line)  # segment, then look up in the major dictionary
            print "major"
            for word in word_split:
                word = word.lower()
                # print word
                word = re.sub(u"相关|学校|专业", u"", word)
                if word in self.majordic:
                    self.majorList.append(word)
        if self.FORMART.search(line):
            print "format"
            if line.count(u":") == 1 and len(line) < 30:
                self.jdJob[jobName]["jobCVformat"] = line.split(u":")[1]
            else:
                groups = filter(None, self.FORMART.search(line).groups())
                format = groups[np.argmax(map(lambda x: len(x), groups))]
                self.jdJob[jobName]["jobCVformat"] = format
        if self.ENDTIME.search(line):
            print "endtime"
            self.jdJob[jobName]["jobEndTime"] = self.ENDTIME.search(line).group()
        if self.MAIL.search(line):
            print "email"
            if len(self.MAIL.search(line).group()) > 8:
                self.jdJob[jobName]["email"] = self.MAIL.search(line).group()
            else:
                if line.count(u":") == 1:
                    self.jdJob[jobName]["email"] = line.split(u":")[1]
        if self.SALARY.search(line):
            print "salary"
            for item in re.split(u" |;|,|,", line):
                if self.SALARY.search(item):
                    if self.jdJob[jobName].has_key("jobSalary"):
                        self.jdJob[jobName]["jobSalary"] += u" " + self.SALARY.search(item).group()
                    else:
                        self.jdJob[jobName]["jobSalary"] = self.SALARY.search(item).group()
        if self.JOBNUM_LINE.search(line):
            print "jobnum"
            jobnum = self.JOBNUM.search(line)
            if jobnum:  # JOBNUM_LINE can match headings that carry no digits
                self.jdJob[jobName]["jobNum"] = jobnum.group()
        if self.WEEKTIME.search(line):
            print "weektime"
            self.jdJob[jobName]["jobMinimumDays"] = self.WEEKTIME.search(line).group()
        if self.JOBLENGTH.search(line):
            print "jobLength"
            self.jdJob[jobName]["jobLength"] = self.JOBLENGTH.search(line).group()
        if self.DEGREE.search(line):
            print "degree"
            line = re.sub(u"士研究生", u"士", line)
            print filter(lambda x: len(x) > 1, self.DEGREE.findall(line))
            self.jdJob[jobName]["jobDiploma"] = list(
                set(filter(None, self.DEGREE.findall(line)[0])))
        if self.AGE_LINE.search(line):
            print "age"
            findage = self.AGE.search(line)
            if findage:
                self.jdJob[jobName]["age"] = findage.group()
        if len(self.majorList) > 0:
            self.jdJob[jobName]["jobMajorList"] = list(set(self.majorList))
        if len(self.skillList) > 0:
            self.jdJob[jobName]["jobSkill"] = list(set(self.skillList))
        # elif self.WEEKTIME.search(line):
        #     print "worktime"
        #     if line.count(u":") == 2:
        #         worktime = self.CLEAN_TEXT.sub(u"", line.split(u":")[1])
        #         self.jdJob[jobName]["jobMinimumDays"] = worktime
        #         if not worktime:
        #             self.flag = "worktime"
        #     else:
        #         if self.flag == "worktime":
        #             self.jdJob[jobName]["jobMinimumDays"] += self.WORKTIME.search(line).group(0)
        #         else:
        #             if self.WEEKTIME.search(line).group(0).find(u":") < 2:
        #                 self.jdJob[jobName]["jobMinimumDays"] = self.WORKTIME.search(line).group(0)

    def refresh(self):
        self.result = OrderedDict()
        inc_keys = [
            "jdFrom", "incName", "incAliasName", "incLogo",
"incScale", "incType", "incIndustry", "incIntro", "incCity", "incLocation", "incZipCode", "incContactName", "incContactInfo", "incUrl" ] # job_keys = ["pub_time", "jobEndTime", "jobPosition", "jobCate", "jobSalary", "jobWorkAge", "jobDiploma", # "jobDesc", # "jobType", "jobNum", "jobWorkCity", "jobWorkLoc", "jobWelfare", "jobMajorList", "age", "gender", # "email", # "jobCVformat", "jobMinimumDays", "jobSkill", "jobCertificate"] jdInc = OrderedDict() for k in inc_keys: jdInc[k] = "" self.result["jdInc"] = jdInc jdJob = OrderedDict() # for k in job_keys: # jdJob[k] = "" self.result["jdJob"] = jdJob self.first = [] self.second = [] self.third = [] # 记录jd是何种类型的,是否名企界面,是否包含表格 self.jdType = None # 是否包含表格 self.has_table = False # 职位名列表 self.jobNameList = [] self.jobNameLine = [] self.jobType = [] # 专业列表和技能列表 self.majorList = [] self.skillList = [] self.intro_range = [] # 用来存职位信息 self.jdJob = defaultdict(lambda: defaultdict(unicode)) def replace_space(self, line): ''' 输入:一行文本 输出:将具有多个空格的文本替换为一个空格之后的文本 ''' regex = re.compile(u' +') line = re.sub(regex, u' ', line) return line def judge_eng(self, word): ''' 输入:一个单词 功能:判断这个单词是否为英文 ''' if len(re.split(u"\w", word.lower())) > 4: return True else: return False def clean_line(self, line): """ 清除一个句子首尾的标点符号 """ line = self.CLEAN_LINE.sub("", line).strip() line = re.sub("\s+|^/d+[;’、,/。\.]", "", line) return line def clean_cnNum(self, line): """ 经验年限提取时,中文一二三等转为123 """ line = unicode(line) a = [u"一", u"二", u"三", u"四", u"五", u"六", u"七", u"八", u"九", u"十", u"两"] b = range(1, 11) + [2] table = dict((ord(aa), bb) for aa, bb in zip(a, b)) return line.translate(table) def line2vec(self, line): """ 句子转换为向量 """ vec = np.zeros(50) for word in jieba.cut(line): if word in self.w2v.vocab: vec += self.w2v[word] return vec def clean_jobname(self, jobname): """ 职位名清洗 """ print jobname if jobname.lower() in self.jobdic: return jobname else: res = [(lcs_len(jobname, job), job) for job in self.jobdic] res.sort() return res[-1][1] def desc_extract(self, soup): line_list = soup.find_all("p") return '\n'.join([line.get_text() for line in line_list]) # 去除img标签,1-7位空格, removeImg = re.compile('<img.*?>| {1,7}| ') # 删除超链接标签 removeAddr = re.compile('<a.*?>|</a>') # 把换行的标签换为\n replaceLine = re.compile('<tr>|<div>|</div>|</p>') # 将表格制表<td>替换为\t replaceTD = re.compile('<td>') # 将换行符或双换行符替换为\n replaceBR = re.compile('<br><br>|<br>') # 将其余标签剔除 removeExtraTag = re.compile('<.*?>') # 将多行空行删除 removeNoneLine = re.compile('\n+') def replace(self, x): x = re.sub(self.removeImg, "", x) x = re.sub(self.removeAddr, "", x) x = re.sub(self.replaceLine, "\n", x) x = re.sub(self.replaceTD, "\t", x) x = re.sub(self.replaceBR, "\n", x) x = re.sub(self.removeExtraTag, "", x) x = re.sub(self.removeNoneLine, "\n", x) # strip()将前后多余内容删除 return x.strip()
# coding=utf-8
from tgrocery import Grocery

grocery = Grocery('sample')
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与'),
]
grocery.train(train_src)
# grocery.train('/home/wangjianfei/git/data/train_ch.txt')
# grocery.train('train_ch.txt')
grocery.save()

new_grocery = Grocery('sample')
new_grocery.load()
print(
    new_grocery.predict(
        'Abbott government spends $8 million on higher education media blitz'))
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
print("start test..................")
# grocery.test('/home/wangjianfei/git/data/test.txt')
# custom_grocery = Grocery('custom', custom_tokenize=list)
print(new_grocery.test(test_src))
class CoParserTop(object):
    def __init__(self):
        self.result = OrderedDict()
        co_keys = [
            'incName', 'incAliasName', 'incScale', 'incType', 'incIndustry',
            'incSubIndustry', 'incIntro', 'incIntroShort', 'incCity',
            'incLocation', 'locationInfo', 'incZipCode', 'incContactName',
            'incContactEmail', 'incContactPhone', 'incContactQQ', 'incUrl',
            'investIns', 'incStage', 'incLogo', 'incPhoto', 'incLabel',
            'prdInfo', 'leaderInfo', 'developInfo', 'incWeiboName',
            'incWeiboUrl', 'incWechatName', 'incWechatUrl', 'incWechatCode'
        ]
        coInc = OrderedDict()
        for k in co_keys:
            coInc[k] = ""
        self.result["coInc"] = coInc
        self.CLEAN_TEXT = re.compile(
            u"[^\u4e00-\u9fa5\w\d;::;,。、%\.,/。!!@()\r\n\(\)\-\+ - `]")
        self.clf = Grocery(base_dir + "/jdclf")
        self.clf.load()
        self.SPLIT_LINE = re.compile(u"[\r\n;::。!?;]|[ \s \xa0\u724b]{4,}")
        self.CLEAN_LINE = re.compile(
            u"^[\u2022(【\[\s\t\r\n\(\- ]?[\da-z12345789]{1,2}[\.,。、,::)】\]\)\s]|^[!@#¥%……&×()\(\){}:“|、-\-,。::\.]|^[一二三四五六七八九123456789\d]{0,2}[\.、\s:: ]|[,;。、\s \.]$|^[\s \u2022 \uff0d \u25cf]"
        )
        self.CLEAN_JOBNAME = re.compile(
            u"急聘|诚聘|高薪|包[食住宿餐]|.险一金|待遇|^急?招|职位编号\s?[\s\d::]")
        self.PAY = re.compile("(\d{3,}\-)?\d{3,}元")
        self.SEX = re.compile(u"性别|男|女")
        self.AGE = re.compile(u"\d+周?岁|年龄")
        self.JOB_TAG = re.compile(u"全职|实习")
        self.DEGREE = re.compile(
            u"小学|初中|高中|职技|本科|研究生|硕士|博士|教授|专科|大专|中专|无要求|不限|无限")
        self.MAIL = re.compile(u"\w+@[\w\.]+")
        self.ZIP = re.compile(u"(\d{6})")
        self.QQ = re.compile(u"\d{6,10}")
        self.PHONE = re.compile(
            u"1\d{10}|0\d{11}|\d{3,4}-\d{3,4}-\d{3,4}|\d{3,4}-\d{7,8}-\d{7,8}-\d{7,8}|\d{3,4}-\d{7,8}-\d{7,8}|\d{3,4}-\d{7,8}"
        )
        self.START_DEMAND = re.compile(
            u"(任职资格|岗位要求|工作要求|任职条件|任职要求|职位要求)[::\s】\n ]?")
        self.START_DUTY = re.compile(
            u"(工作内容|岗位职责|工作职责|职位描述|工作描述|职位介绍|职位职责|岗位描述)[::\s 】\n ]")
        self.START_BENEFIT = re.compile(u"(福利待遇|待遇|福利)[::\s\n】]")
        self.INC_URL = re.compile(u"(主页|网站|网址|官网).{0,3}[\w\d_/\.:\-]+")
        self.DEMAND = re.compile(u"精通|熟悉|熟练|有.+经验")
        self.DUTY = re.compile(u"负责|促成|为客户|安排的其.工作")
        self.BENEFIT = re.compile(u".险一金|福利|晋身|休假|带薪|补助|补贴")
        self.CERT = re.compile(
            u"(\S{2,8}证书|CET-\d|普通话|英语|口语|.语|日文|雅思|托福|托业)(至少)?(通过)?[\d一二三四五六七八九]级[及或]?(以上)?|(英语)?CET-\d级?(以上)?|医学.{0,3}证|会计.{0,3}证|律师.{0,3}证|有.{1,8}证书")
        self.degreedic = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/degrees.txt', 'rb', 'utf-8')
        ])
        self.majordic = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/majordic.txt', 'rb', 'utf-8')
        ])
        self.skilldic = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/skills.txt', 'rb', 'utf-8')
        ])
        self.jobdic = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/jobnames.txt', 'rb', 'utf-8')
        ])
        self.citydic = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/citydic.txt', 'rb', 'utf-8')
        ])
        self.province_city = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/province_city.txt', 'rb', 'utf-8')
        ])
        self.city_area = set([
            line.strip() for line in codecs.open(
                base_dir + '/data/city_area.txt', 'rb', 'utf-8')
        ])
        self.SALARY = re.compile(u'万')
        jieba.load_userdict(base_dir + '/data/majordic.txt')
        jieba.load_userdict(base_dir + '/data/skills.txt')
        jieba.load_userdict(base_dir + '/data/firm.txt')
        jieba.load_userdict(base_dir + '/data/degrees.txt')
        jieba.load_userdict(base_dir + '/data/benefits.txt')
        jieba.load_userdict(base_dir + '/data/citydic.txt')
        jieba.load_userdict(base_dir + '/data/province_city.txt')

    def refresh(self):
        self.result = OrderedDict()
        co_keys = [
            'incName', 'incAliasName', 'incScale', 'incType', 'incIndustry',
            'incSubIndustry', 'incIntro', 'incIntroShort',
            'incCity', 'incLocation', 'locationInfo', 'incZipCode',
            'incContactName', 'incContactEmail', 'incContactPhone',
            'incContactQQ', 'incUrl', 'investIns', 'incStage', 'incLogo',
            'incPhoto', 'incLabel', 'prdInfo', 'leaderInfo', 'developInfo',
            'incWeiboName', 'incWeiboUrl', 'incWechatName', 'incWechatUrl',
            'incWechatCode'
        ]
        coInc = OrderedDict()
        for k in co_keys:
            coInc[k] = ""
        self.result["coInc"] = coInc
#coding=utf-8
from tgrocery import Grocery

text_model = Grocery('all_no_town')
text_model.load()

# Given a text, predict its class_name and class_prob.
def predict(text):
    c = text_model.predict(' '.join(list(text)))  # space-join characters for char-level features
    class_name = str(c)
    class_prob = c.dec_values[class_name]
    return class_name, class_prob

print predict(u'100')
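# The ' '.join(list(text)) above is a manual way to get character-level
# tokens. A hedged alternative sketch: tgrocery also accepts a
# custom_tokenize callable (it appears commented out elsewhere in this
# file), so passing `list` makes Grocery split each text into single
# characters itself; 'all_no_town_char' is a hypothetical model name.
char_model = Grocery('all_no_town_char', custom_tokenize=list)
# ... train or load as usual; callers can then pass raw text without
# the space-joining preprocessing.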
    tdic['id'].append(_id)
    tdic['type'].append(_type)
    tdic['contents'].append(contents)
    i += 1

# train = pd.read_csv(train_file, header=0, delimiter="\t", quoting=3)
# test = pd.read_csv(test_file, header=1, delimiter="\t", quoting=3)
train = DataFrame(dic)
test = DataFrame(tdic)

# 'classfynews_instance' is the model save path
grocery = Grocery('classfynews_instance')
# train() expects (label, text) pairs, so zip the columns
train_in = zip(train['type'], train['contents'])
grocery.train(train_in)
print grocery.get_load_status()
# grocery.save()

copy_grocery = Grocery('classfynews_instance')
copy_grocery.load()
# copy_grocery = grocery
test_in = zip(test['type'], test['contents'])
# input like ['我是中国人', '台北*****']
# output like [11, 12]
test_result = copy_grocery.predict(test['contents'])
print test_result.predicted_y
# test_result = copy_grocery.test(test_in)
# print test_result.show_result()
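# Note: stock tgrocery's predict() scores one text at a time; passing a
# whole pandas Series as above relies on a modified fork. A minimal
# per-row sketch with the standard API (same names as above):
predictions = [copy_grocery.predict(text) for text in test['contents']]
print predictions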
def main():
    # Get market_sentiment of each word from NTUSD-Fin
    train_t = []
    train_s = []
    targetIn = {}
    targetDict = dict()
    with open('NTUSD-Fin/NTUSD_Fin_hashtag_v1.0.json', 'r', encoding='utf-8') as f:
        targetIn = json.load(f)
    N = len(targetIn)
    for i in range(N):
        word = "#" + targetIn[i]['token']
        targetDict[word] = targetIn[i]['market_sentiment']
        sg = str(GroupValue_s(str(targetDict[word] / 3.5)))
        train_s.append((sg, word))
    with open('NTUSD-Fin/NTUSD_Fin_word_v1.0.json', 'r', encoding='utf-8') as f:
        targetIn = json.load(f)
    N = len(targetIn)
    for i in range(N):
        word = targetIn[i]['token']
        targetDict[word] = targetIn[i]['market_sentiment']
        sg = str(GroupValue_s(str(targetDict[word] / 3.5)))
        train_s.append((sg, word))

    # Training file: load data & use tgrocery to train classification models
    TrainingFile = open('training_set.json', 'r')
    TrainingData = json.load(TrainingFile)
    TrainingFile.close()
    DataList = []
    grocery_t = Grocery("tweet")
    grocery_s = Grocery("snippet")
    for DataElement in TrainingData:
        tempt = DataManager()
        tempt.insertData(DataElement)
        tempt.group_t = GroupValue_t(tempt.sentiment)
        tempt.group_s = GroupValue_s(tempt.sentiment)
        line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ", DataElement["tweet"])
        train_t.append((str(tempt.group_t), line))
        if isinstance(DataElement["snippet"], list):
            for line in DataElement["snippet"]:
                train_s.append((str(tempt.group_s), line))
        elif DataElement["snippet"] != "":
            train_s.append((str(tempt.group_s), DataElement["snippet"]))
        else:
            tempt.group_s = 0.0
        DataList.append(tempt)
    grocery_t.train(train_t + train_s)
    grocery_t.save()
    grocery_s.train(train_s)
    grocery_s.save()

    # Save training data created by WordScore() and GroupValue_*()
    # Data will be used for LinearRegression() in BOTH.py
    outfile = open('TG_train.txt', 'w', encoding='utf-8')
    dataScore = []
    dataSentiment = []
    for row in DataList:
        dataSentiment.append([float(row.sentiment)])
        a = WordScore(row.tweet, targetDict)
        b = WordScore(row.snippet, targetDict)
        c = row.group_t
        d = row.group_s
        dataScore.append([a, b, c, d])
        print(a, b, c, d, file=outfile)
    outfile.close()

    '''
    # Train linear regression model
    model = LinearRegression()
    model.fit(dataScore, dataSentiment)
    # Test on training data
    print('(train)R-squared: %.3f' % model.score(dataScore, dataSentiment))  # 0.915
    predictions = model.predict(dataScore)
    rms = mean_squared_error(dataSentiment, predictions)
    print('RMSE: %.3f' % sqrt(rms))  # 0.110
    print('MSE: %.3f' % rms)  # 0.012
    '''

    # Testing file: load data & predict with the trained tgrocery models
    TestingFile = open('test_set.json', 'r')
    TestingData = json.load(TestingFile)
    TestingFile.close()
    DataList = []
    new_grocery_t = Grocery('tweet')
    new_grocery_t.load()
    new_grocery_s = Grocery('snippet')
    new_grocery_s.load()
    for DataElement in TestingData:
        tempt = DataManager()
        tempt.insertData(DataElement)
        line = re.sub("https?://[\w\-]+(\.[\w\-]+)+\S*", " ", DataElement["tweet"])
        tempt.group_t = float('{0}'.format(new_grocery_t.predict(line)))
        value = 0.0
        if isinstance(DataElement["snippet"], list):
            for line in DataElement["snippet"]:
                value = value + float('{0}'.format(new_grocery_s.predict(line)))
            value = value / len(DataElement["snippet"])
        elif DataElement["snippet"] != "":
            value = float('{0}'.format(new_grocery_s.predict(DataElement["snippet"])))
        tempt.group_s = value
        DataList.append(tempt)

    # Save testing data created by WordScore() and the classification predictions
    # Data will be used for LinearRegression() in BOTH.py
    outfile = open('TG_test.txt', 'w', encoding='utf-8')
    dataScore = []
    dataSentiment = []
    for row in DataList:
        dataSentiment.append([float(row.sentiment)])
        a = WordScore(row.tweet, targetDict)
        b = WordScore(row.snippet, targetDict)
        c = row.group_t
        d = row.group_s
        dataScore.append([a, b, c, d])
        print(a, b, c, d, file=outfile)
    outfile.close()
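# A hedged sketch of the deferred regression step (BOTH.py itself is not
# shown here): the four whitespace-separated scores written to TG_train.txt
# above can be read back and fit with sklearn. The file layout is assumed
# from the print(a, b, c, d) calls; the sentiment targets are not stored in
# TG_train.txt as written, so they must be supplied separately.
import numpy as np
from sklearn.linear_model import LinearRegression

def fit_from_file(path, targets):
    scores = np.loadtxt(path)  # shape (n_samples, 4): two WordScore values + two group values
    model = LinearRegression()
    model.fit(scores, targets)
    return model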
grocery = Grocery('sample')
# training texts can be passed in as a list
train_src = [
    ('education', '名师指导托福语法技巧:名词的复数形式'),
    ('education', '中国高考成绩海外认可 是“狼来了”吗?'),
    ('sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼'),
    ('sports', '四川丹棱举行全国长距登山挑战赛 近万人参与')
]
grocery.train(train_src)
# or from a file (tab-delimited by default; a custom delimiter is also supported)
# grocery.train('train_ch.txt')
# save the model
grocery.save()
# load the model (same name as the one saved)
new_grocery = Grocery('sample')
new_grocery.load()
# predict
new_grocery.predict('考生必读:新托福写作考试评分标准')
# education
# test
test_src = [
    ('education', '福建春季公务员考试报名18日截止 2月6日考试'),
    ('sports', '意甲首轮补赛交战记录:米兰客场8战不败国米10年连胜'),
]
new_grocery.test(test_src)
# prints the test accuracy
# 0.5
# file input is supported here as well
# new_grocery.test('test_ch.txt')
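# For the file-based variants commented out above, each line of
# train_ch.txt / test_ch.txt is assumed to be "label<TAB>text"
# (tab-delimited by default, matching the comments above), e.g.:
#   education<TAB>名师指导托福语法技巧:名词的复数形式
#   sports<TAB>四川丹棱举行全国长距登山挑战赛 近万人参与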
    ftest = open(path2, 'w')
    for line in open(path):
        if random.random() < theta:
            ftest.write(line)
        else:
            ftrain.write(line)
    ftrain.close()
    ftest.close()


def train(path, name):
    grocery = Grocery(name)
    grocery.train(path)
    grocery.save()


if __name__ == "__main__":
    data2tt(sys.argv[3], sys.argv[1], sys.argv[2], 0.02)
    train(sys.argv[1], "music")
    new_grocery = Grocery("music")
    new_grocery.load()
    n = 0
    for line in open(sys.argv[2], "r"):
        ls = line.strip().split("\t")
        predict = new_grocery.predict(ls[1])
        test = ls[0]
        result = 0
        if test == str(predict):
            result = 1
        n += result
        print predict, test, result
    print n
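# A hedged alternative to the manual scoring loop above: tgrocery's own
# test() accepts the same tab-delimited file and reports accuracy directly
# (assuming sys.argv[2] keeps the "label<TAB>text" layout used above).
result = new_grocery.test(sys.argv[2])
print result.accuracy_overall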