def update_status_db(et_info_url):
    """Update the status table: status=3 if the enterprise has a website, status=2 otherwise."""
    # status 3 = website found, status 2 = no website
    url_status = 3 if et_info_url['etwebsite'] else 2
    print utils.current_time(), 'Updating status table, setting status={}......'.format(url_status)
    conn = utils.get_local_db()
    et_status = {
        'etid': et_info_url['etid'],
        'url_status': url_status,
    }
    update_db(conn, et_status, 'et_info_status')
def get_lt_etid():
    logging.info('12. Fetching the remaining enterprises')
    print utils.current_time(), 'Connecting to ODPS..'
    o = ODPS('LTAIzEuNzcL6qJJ8', 'eUAgj9ijhWCvOQ3w5Uv3FkwhNxvPF2',
             'database_test', 'http://service.odps.aliyun.com/api')
    print utils.current_time(), 'Running query...'
    # query yesterday's partition
    pt = time.strftime('%Y%m%d', time.localtime(int(time.time() - 86400)))
    res = o.execute_sql(
        "select distinct etid from et_jobs where pt='{}' and isheadhunter=1".format(pt))
    print utils.current_time(), 'Processing query results...'
    etid_set = set()
    conn = utils.get_local_db()
    addtime = int(time.time())
    cnt = 0
    with res.open_reader() as reader:
        total = reader.count
        print utils.current_time(), '{} records to process!'.format(total)
        for record in reader:
            etid_set.add((record['etid'], ))
            # flush to MySQL in batches of 1000
            if len(etid_set) >= 1000:
                conn.executemany(
                    "insert into et_info_status(etid,addtime) values(%s,{}) "
                    "on duplicate key update etid=values(etid), addtime=values(addtime)".format(addtime),
                    list(etid_set))
                cnt += 1000
                print utils.current_time(), '{} records written so far!'.format(cnt)
                etid_set.clear()
        # write any remaining records in the final partial batch
        if len(etid_set) > 0:
            conn.executemany(
                "insert into et_info_status(etid,addtime) values(%s,{}) "
                "on duplicate key update etid=values(etid), addtime=values(addtime)".format(addtime),
                list(etid_set))
            cnt += len(etid_set)
            print utils.current_time(), '{} records written so far!'.format(cnt)
    conn.close()
    return total
def get_36kr_etid():
    logging.info('10. Fetching 36kr enterprises')
    print utils.current_time(), 'Connecting to database...'
    conn = utils.get_read_db(db='contact_datastore')
    print utils.current_time(), 'Querying etids to be collected...'
    res = conn.query("select etid from dt_daily_36kr")
    conn.close()
    print utils.current_time(), 'Query finished!'
    insert_list = []
    addtime = int(time.time())
    for x in res:
        insert_list.append([x['etid'], addtime])
    print utils.current_time(), 'Preparing to write to database...'
    conn = utils.get_local_db()
    total = len(insert_list)
    print utils.current_time(), 'Need to write', total, 'records in total!'
    # upsert in batches of 1000
    for i in range(0, total, 1000):
        start = i
        end = min(start + 1000, total)
        conn.executemany(
            "insert into et_info_status(etid,addtime) values(%s,%s) "
            "on duplicate key update etid=values(etid), addtime=values(addtime)",
            insert_list[start:end])
        print utils.current_time(), 'Written {}/{}!'.format(end, total)
    conn.close()
    print 'Write complete!'
    return total
def extract_all(use_random_forest):
    if use_random_forest:
        # label emails with the random forest model, dropping the negative class
        emails = rf_model()
        emails = [email for email in emails if email[0] != 'negatives_clean']
    else:
        # use the labelled MongoDB collections directly
        emails = []
        db = utils.get_local_db()
        for collection in db.collection_names():
            if collection != 'negatives_clean':
                for record in db.get_collection(collection).find():
                    emails.append([collection] + [record['Text']])

    # find features for each email
    email_data = []
    for email_set in emails:
        email = email_set[1]
        fields = features[email_set[0]]

        # extract named entities
        tokenized_email = nltk.word_tokenize(email)
        tagged_email = nltk.pos_tag(tokenized_email)
        named_entity_email = nltk.ne_chunk(tagged_email)
        entities = []

        # concatenate multi-word entities
        for branch in named_entity_email:
            if isinstance(branch, nltk.tree.Tree):
                entity = ''
                for sub_entity in branch:
                    entity += (sub_entity[0] + ' ')
                if [branch.label(), entity.strip()] not in entities:
                    entities.append([branch.label(), entity.strip()])

        # use entities to fill in fields
        matches = []
        for field in fields:
            field_matches = []
            for entity in entities:
                # compute semantic distance and threshold
                dist = 0
                description = describe(entity[1])
                if description:
                    for word in description.split():
                        a = wn.synsets(field[1])
                        b = wn.synsets(word)
                        if a and b:
                            a = a[0]
                            b = b[0]
                            segment = a.path_similarity(b)
                            if segment:
                                dist += segment
                if dist > 0.1:
                    field_matches.append([dist, entity[1]])
            field_matches.sort(key=lambda x: x[0], reverse=True)
            matches.append({field[1]: field_matches})
        email_data.append([email_set[0], email, matches])
    return email_data
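# Hedged consumer sketch (not part of the original module). From the code above, each
# element of extract_all()'s return value is [label, email_text, matches], where matches
# is a list of {field_name: [[score, entity], ...]} dicts sorted by descending score.
# best_match_per_field is an illustrative helper name introduced here, not an existing API.
def best_match_per_field(email_record):
    _, _, matches = email_record
    best = {}
    for field_dict in matches:
        for field_name, scored_entities in field_dict.items():
            # take the highest-scoring entity, or None if nothing passed the threshold
            best[field_name] = scored_entities[0][1] if scored_entities else None
    return best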
def get_etid():
    """
    Fetch unprocessed etids.
    :return: up to 200 rows from et_info_status whose url_status is 1
    """
    logging.info('%s Reading all unprocessed etids from the local table...' % utils.current_time())
    conn = utils.get_local_db()
    result = conn.query(
        "select etid from et_info_status where url_status=1 limit 200")
    conn.close()
    return result
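# Hedged usage sketch (an assumption, not part of the original pipeline): one way
# get_etid() and update_status_db() could be combined. `fetch_website` stands in for
# whatever collector resolves an etid to a website URL; it is passed as a callable so
# this sketch stays self-contained.
def process_pending_etids(fetch_website):
    for row in get_etid():
        et_info_url = {'etid': row['etid'], 'etwebsite': fetch_website(row['etid'])}
        # update_status_db writes status=3 when a website was found, status=2 otherwise
        update_status_db(et_info_url)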
def rf_model():
    percent_training = .70  # proportion of data to use for training

    # get emails from local mongodb
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # shuffle and split emails
    random.shuffle(emails)
    training_set = emails[:int(percent_training * len(emails))]
    testing_set = emails[int(percent_training * len(emails)):]
    training_labels = [row[0] for row in training_set]
    training_data = [row[1] for row in training_set]
    testing_data = [row[1] for row in testing_set]

    # tf-idf vectorize training set
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(training_data)
    X = X.toarray()

    # tf-idf vectorize testing set
    vectorized_testing_data = [
        vectorizer.transform([email]) for email in testing_data
    ]
    total = len(vectorized_testing_data)

    # create random forest
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(X[0]))) + 1)
    forest.fit(X, training_labels)

    # generate and return predictions
    tagged_emails = []
    for i in range(total):
        tagged_emails.append(
            [forest.predict(vectorized_testing_data[i])[0], testing_data[i]])
    return tagged_emails
def rf_categorize(email):
    # get training corpus
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # vectorize corpus
    labels = [row[0] for row in emails]
    data = [row[1] for row in emails]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # vectorize input
    email_vector = vectorizer.transform([email])

    # create random forest and return prediction
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(X[0]))) + 1)
    forest.fit(X, labels)
    return forest.predict(email_vector)[0]
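# Hedged usage sketch for rf_categorize (the sample text is invented for illustration).
# Note that rf_categorize rebuilds the corpus, vectorizer, and forest on every call, so
# for bulk tagging rf_model() above is the cheaper path.
def demo_rf_categorize():
    sample_email = "Please find attached the agenda for Thursday's meeting."  # made-up example input
    return rf_categorize(sample_email)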
import utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
# note: in newer scikit-learn releases cross_val_score lives in sklearn.model_selection
from sklearn.cross_validation import cross_val_score
from math import sqrt

if __name__ == '__main__':
    num_samples = 100  # number of random forests to compute and then average

    # get emails from local mongodb
    emails = []
    db = utils.get_local_db()
    for collection in db.collection_names():
        for record in db.get_collection(collection).find():
            emails.append([collection] + [record['Text']])

    # create labels and vectorize data
    labels = [row[0] for row in emails]
    vectorizer = TfidfVectorizer()
    data = vectorizer.fit_transform([row[1] for row in emails]).toarray()

    # create random forest and perform cross validation
    forest = RandomForestClassifier(n_estimators=int(sqrt(len(data[0]))) + 1)
    scores = cross_val_score(forest, data, labels, cv=num_samples)

    # write output to file
    output = open('random_forest_cross_validation.txt', 'w')
    for i in range(len(scores)):
        print(str(i) + ": " + str(scores[i]), file=output)
    output.close()