def update_author_networking_from_meta():
    # Scan all essays in fixed-size chunks and insert the author relations
    # extracted from each essay's metadata.
    start = 0
    db = Database()
    total = int(
        data_fetch("COUNT(*)", "essays", start=start, limit=None)[0][0])
    scan_count = start
    insert_count = 0
    limit = 10000
    while scan_count < total:
        if limit > total - scan_count:
            limit = total - scan_count
        data = data_fetch("*", "essays", start=start, limit=limit)
        for raw_item in data:
            obj = Essay(raw_item)
            response = obj.author_insert(db.conn)
            scan_count += 1
            if response:
                insert_count += response
            if scan_count % 1000 == 0:
                print("Scanned {} out of {} essays \n Inserted {} relations \n \n"
                      .format(scan_count, total, insert_count))
        start += limit
def get_word_freq():
    # Build one {word: count} dict per essay: load stop words, register the
    # spreadsheet keywords as a jieba user dictionary, then segment each
    # cleaned essay body.
    with open("../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    df = pd.read_excel("../../../data/output/关键词.xlsx")
    keywords = []
    for ind in df.keys():
        for word in df[ind]:
            if not pd.isna(word):
                keywords.append(str(word))
    with open("../../../data/output/keywords.txt", "w", encoding="utf8") as file:
        for words in keywords:
            file.write("{} 3 nt\n".format(words))
    jieba.load_userdict("../../../data/output/keywords.txt")
    frequency = []
    num = row_count("wechat_essays", host_IP="192.168.164.11", database="wechat_v1")
    n = 0
    limit = 1000
    while n < num:
        if num - n < limit:
            limit = num - n
        data = data_fetch("content",
                          "wechat_essays",
                          limit=limit,
                          start=n,
                          host_IP="192.168.164.11",
                          database="wechat_v1")
        for item in data:
            fre = {}
            cleaned = html_cleanup(item[0])
            seg = jieba.cut(cleaned)
            for word in seg:
                if word.replace(" ", "") == "":
                    continue
                if word not in stopwords:
                    if word in fre.keys():
                        fre[word] += 1
                    else:
                        fre[word] = 1
            frequency.append(fre)
        n += limit
        print("=== Done {} rows".format(n))
    with open("../../../data/output/word_freq.pickle", "wb") as file:
        pickle.dump(frequency, file)
    return frequency
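
# Hedged usage sketch (not part of the original pipeline): get_word_freq()
# pickles a list of per-essay {word: count} dicts; merging them into a single
# corpus-wide counter could look roughly like this. The function name and the
# use of collections.Counter are assumptions for illustration.
def merge_word_freq(path="../../../data/output/word_freq.pickle"):
    import pickle
    from collections import Counter
    with open(path, "rb") as f:
        per_essay = pickle.load(f)
    total = Counter()
    for fre in per_essay:
        total.update(fre)  # adds the per-essay counts into the running total
    return total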
def worker(w_id, start, end): with open("../../data/nlp/stop_words.pickle", "rb") as file: stopwords = pickle.load(file) df = pd.read_excel("../../data/output/关键词.xlsx") keywords = [] for ind in df.keys(): for word in df[ind]: if not pd.isna(word): keywords.append(str(word)) with open("../../data/output/keywords.txt", "w", encoding="utf8") as file: for words in keywords: file.write("{} 3 nt\n".format(words)) jieba.load_userdict("../../data/output/keywords.txt") n = start limit = 1000 with open("../../data/output/train{}.dat".format(w_id), "w", encoding="utf-8") as f: while n < end: if end - n < limit: limit = end - n data = data_fetch("content", "wechat_essays_v2", limit=limit, start=n, host_IP="192.168.164.11", database="wechat") for item in data: cleaned = html_cleanup(item[0]) seg = jieba.cut(cleaned) output = "" for word in seg: if word.replace(" ", "") == "": pass else: if word not in stopwords: output += word + " " f.write(output + "\n") n += limit print("id: {} === Done {} rows".format(id, n))
def __init__(self):
    g = Networks()
    count = 0
    relationships = data_fetch("`id`, `media_id`, `src_media_id`",
                               "media_media",
                               limit=99999,
                               host_IP="10.0.0.101",
                               database="processing")
    for item in relationships:
        if item[1] == item[2]:
            count += 1
        else:
            media1 = Media(item[1])
            media2 = Media(item[2])
            g.add_media(media1)
            g.add_media(media2)
            g.add_relationship(Relationship(media1, media2, item[0]))
def ntwk_rating():
    """
    :return:
    """
    try:
        data = json.loads(request.data)
        mids = data["medias"]
    # On a parsing error, return FAILURE immediately and attach the error
    # message in the `data` field.
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })
    relations = data_fetch("")
def worker(w_id, start, end): print("===============Process {} has started================".format(w_id)) db = Database() total = end scan_count = 0 limit = 10000 chunk_size = end - start media_inserted = 0 author_inserted = 0 while scan_count < (end - start): if limit > total - scan_count - start: limit = total - scan_count - start data = data_fetch("*", "essays", start=start, limit=limit, tail_condition="ORDER BY `insert_time` DESC") for raw_item in data: obj = Essay(raw_item) media_count, author_count = obj.extractor_info_insert(db.conn) author_count += obj.meta_author_insert(db.conn) scan_count += 1 if media_count + author_count: media_inserted += media_count author_inserted += author_count if scan_count % 1000 == 0: print( "Process {} has scanned {} out of {} essays \n Inserted {} author relations \n Inserted {} media relations\n" .format(w_id, scan_count, chunk_size, author_inserted, media_inserted)) start += limit print("===============Process {} has ended================".format(w_id))
    '2462acae8ed692945720e6acf6825c56', 'c0b0b4d041856d52c0c4c97adf8a5985',
    'f365ff75ae3c3b39d7dd5f0b9335ed3a', 'd4fa139b651f9efc5b5298a58e89f0ee',
    '54bf7dea08ac923ad6cb9bf894d50013', '036519b7296d1e5b0273e354cd310c06',
    '138df8b28925617a759e425c63e7f642', '6d4c13d26a41d6fc6c57d5c2aa652616',
    '9a6079cfa40e89c923754d677d2a586e', '26eb934f3637b266c44f2b2f46f5df2a'
]
nick_list = [
    '北京女主', '马铃薯精英网', '毕节市银行卡协会', '一诺法鼎财税', '财经野史',
    '独区企业服务平台', '每日股市秘闻', '融邦投资', '营销案例分析', '大安热线'
]
data_from_media = data_fetch(
    "`media_id`, `media_nick`, `src_media_id`, `src_media_nick`, `type`, `essay_id`",
    "media_media",
    condition=build_condition("media_id", id_list, "OR"),
    limit=None,
    host_IP="10.0.0.101",
    database="processing")
data_from_src = data_fetch(
    "`media_id`, `media_nick`, `src_media_id`, `src_media_nick`, `type`, `essay_id`",
    "media_media",
    condition=build_condition("src_media_id", id_list, "OR"),
    limit=None,
    host_IP="10.0.0.101",
    database="processing")

if __name__ == "__main__":
    _ = 1
from data_structure.graph.graph import Vertex, Edge, Graph
from utils.mysql import data_fetch


class Author(Vertex):
    def __init__(self, name, hobby):
        Vertex.__init__(self, name)
        self.hobby = hobby


dic = {}
data = data_fetch("`wechat_name`, `meta_content`", "wechat_essays_v1", limit=100000)
for item in data:
    wechat_name = item[0]
    content = item[1]
    entities = content.replace(":", " ").split(" ")
    # Drop boilerplate tokens such as "原创" (original content) and "点击"
    # (click) markers before keeping the entity list.
    entities = [obj for obj in entities if "原创" not in obj and "点击" not in obj]
    dic[wechat_name] = entities
# n = 0
# onehot = []
# for key in dic.keys():
#     n += len(dic[key])
#     onehot += dic[key]
                pass
            else:
                print(e)
    except Exception as e:
        print(e)
    return media_count, author_count


if __name__ == "__main__":
    _ = 1
    count = 0
    g = Networks()
    relationships = data_fetch("`id`, `media_id`, `src_media_id`",
                               "media_media",
                               limit=99999,
                               host_IP="10.0.0.101",
                               database="processing")
    for item in relationships:
        if item[1] == item[2]:
            count += 1
        else:
            media1 = Media(item[1])
            media2 = Media(item[2])
            g.add_media(media1)
            g.add_media(media2)
            g.add_relationship(Relationship(media1, media2, item[0]))
    conn_count = {}
    for item in g.vertices:
def worker(w_id, start, end):
    print("===================Process {} has Started==============".format(w_id))
    if w_id % 2 == 0:
        url = "http://192.168.164.15:49001/seg/s"
    else:
        url = "http://10.0.0.59:49001/seg/s"
    with open("../../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    dic_path = "../../../../data/output/account_name_unique_jieba.txt"
    jieba.load_userdict(dic_path)
    n = start
    limit = min(end - start, 30000)
    count = 0
    tmp = 0
    cou = 0
    while n < end:
        title_whole = []
        content_whole = []
        if end - n < limit:
            limit = end - n
        data = data_fetch("`title`, `content`",
                          "essays",
                          host_IP="192.168.164.15",
                          user_name="raw",
                          password="******",
                          database="raw",
                          limit=limit,
                          start=start,
                          tail_condition="ORDER BY `update_time`")
        for item in data:
            title_dic = {}
            content_dic = {}
            title = item[0]
            content = item[1]
            if title is None:
                t_result = None
            else:
                try:
                    title = replace_punctuation(
                        html_cleanup(title).replace(" ", "").replace("\n", ""))
                    t_result = "/".join(jieba.cut(title))
                except Exception as e:
                    print(e)
                    t_result = None
                    time.sleep(1)
            if content is None:
                c_result = None
            else:
                try:
                    content = replace_punctuation(
                        html_cleanup(content).replace(" ", "").replace("\n", ""))
                    c_result = "/".join(jieba.cut(content))
                except KeyError:
                    c_result = None
                except Exception as e:
                    print(e)
                    c_result = None
                    time.sleep(1)
            if t_result is None:
                pass
            else:
                t_wordlist = t_result.split("/")
                for item in t_wordlist:
                    if len(item) > 0 and item != " ":
                        if item in stopwords:
                            pass
                        elif isPunctuation(item):
                            pass
                        else:
                            if item in title_dic.keys():
                                title_dic[item] += 1
                            else:
                                title_dic[item] = 1
            if c_result is None:
                pass
            else:
                c_wordlist = c_result.split("/")
                for item in c_wordlist:
                    if len(item) > 0 and item != " ":
                        if item in stopwords:
                            pass
                        else:
                            if item in content_dic.keys():
                                content_dic[item] += 1
                            else:
                                content_dic[item] = 1
            title_whole.append(title_dic)
            content_whole.append(content_dic)
            count += 1
            if count % 10000 == 0:
                with open("../../../../data/output/w_freq0/title/result{}-{}.pickle"
                          .format(w_id, cou), "wb") as f:
                    pickle.dump(title_whole, f)
                with open("../../../../data/output/w_freq0/content/result{}-{}.pickle"
                          .format(w_id, cou), "wb") as f:
                    pickle.dump(content_whole, f)
                print("Process {} has processed {} essays... \n".format(w_id, count))
        n += limit
        cou += 1
        start += limit
    with open("../../../../data/output/w_freq0/title/result{}[-1].pickle"
              .format(w_id), "wb") as f:
        pickle.dump(title_whole, f)
    with open("../../../../data/output/w_freq0/content/result{}[-1].pickle"
              .format(w_id), "wb") as f:
        pickle.dump(content_whole, f)
    print("===================Process {} has ended==============".format(w_id))
def __init__(self, action, data):
    if action == "channel":
        self.uid = data[1]
        self.action = action
        self.time_occurred = data[5]
        channel_details = \
            data_fetch("`title`, `content`, `pid`, `id`, `show`",
                       "biz_channels",
                       "id={}".format(data[0]),
                       host_IP="192.168.164.11",
                       user_name="dwapi",
                       password="******",
                       database="dw_biz")[0]
        self.description = str({
            "id": data[2],
            "data": {
                "title": channel_details[0],
                "content": channel_details[1],
                "pid": channel_details[2],
                "id": channel_details[3],
                "show": channel_details[4]
            }
        }).replace("'", '"')
        self.related_id = data[0]
    if action == "search":
        self.uid = data[0]
        self.action = action
        self.time_occurred = data[7]
        self.description = str({
            "content": data[1],
            "count": data[3],
            "type": data[4],
            "status": data[2],
            "id": data[5]
        }).replace("'", '"')
        self.related_id = data[1][:min(len(data[1]), 32)]
    if action == "topic":
        self.uid = data[1]
        self.action = action
        self.time_occurred = data[5]
        topic_details = data_fetch(
            "`title`, `content`, `id`, `sources`, `filters`, `fav_count`, `status`, `only_match_title`, `uid`",
            "biz_topics",
            "id={}".format(data[0]),
            host_IP="192.168.164.11",
            user_name="dwapi",
            password="******",
            database="dw_biz")[0]
        # print(topic_details)
        self.description = str({
            "id": data[3],
            "enable": str(data[2]),
            "data": {
                # Indices follow the fetched column order above.
                "title": topic_details[0],
                "content": topic_details[1],
                "id": topic_details[2],
                "sources": topic_details[3],
                "filters": topic_details[4],
                "fav_count": topic_details[5],
                "status": topic_details[6],
                "only_match_title": topic_details[7],
                "uid": topic_details[8],
            }
        }).replace("'", '"')
        self.related_id = data[0]
    if action == "eclick":
        self.uid = data[0]
        self.action = action
        self.time_occurred = data[2]
        self.description = str({
            "outbound_time": data[3],
            "view_percent": data[4],
            "id": data[5]
        }).replace("'", '"')
        self.related_id = data[1]
    if action == "elike":
        self.uid = data[1]
        self.action = action
        self.time_occurred = data[4]
        self.description = str({"id": data[2]}).replace("'", '"')
        self.related_id = data[0]
    if action == "ecomment":
        self.uid = data[1]
        self.action = action
        self.time_occurred = data[9]
        self.description = str({
            "content": data[2],
            "to_uid": data[3],
            "f_id": data[4],
            "f_comment": data[5],
            "like_count": data[6],
            "status": data[7],
            "id": data[8]
        }).replace("'", '"')
        self.related_id = data[0]
    if action == "efav":
        self.uid = data[1]
        self.action = action
        self.time_occurred = data[5]
        self.description = str({
            "enable": data[2],
            "id": data[3]
        }).replace("'", '"')
        self.related_id = data[0]
import pickle
from datetime import datetime

from pymysql import IntegrityError

from utils.mysql import data_fetch, Database
from utils.text_utilizer import get_md5

with open("../../../../data/szqj/medias.pickle", "rb") as f:
    known_medias = pickle.load(f)

data = data_fetch(
    "`name`, `media_id`, `media_nick`, `platform_id`, `essay_id`, `essay_pubdate`",
    "author_media",
    host_IP="10.0.0.101",
    database="processing",
    limit=None)
connection = Database().conn
for item in data:
    platform_id = item[3]
    if platform_id == 1:
        platform = "WX"
    author_name = item[0]
    media_id = item[1]
    media_nick = item[2]
    essay_id = item[4]
    essay_pubdate = item[5]
    if author_name == media_nick:
        pass
    else:
        if author_name in known_medias:
            author_id = get_md5(platform + "-" + author_name)
            sql_cols = """`id`, `media_id`, `media_nick`, `platform_id`, `src_media_id`, `src_media_nick`, `type`, `essay_id`, `essay_pubdate`, `insert_time`"""
            sql_values = """VALUES ('{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}', '{}');""".format(
def worker(w_id, start, end):
    print("===================Process {} has Started==============".format(w_id))
    with open("../../../../data/nlp/idf.pickle", "rb") as file:
        idf = pickle.load(file)
    connection = Database().conn
    n = start
    limit = min(end - start, 10000)
    dic_path = "../../../../data/output/account_name_unique_jieba.txt"
    jieba.load_userdict(dic_path)
    count = 1
    dup = 0
    print(end)
    while n < end:
        if end - n < limit:
            limit = end - n
        data = data_fetch("`id`, `content`, `pubdate`",
                          "essays",
                          host_IP="192.168.164.15",
                          user_name="raw",
                          password="******",
                          database="raw",
                          limit=limit,
                          start=start,
                          tail_condition="ORDER BY `insert_time`")
        for item in data:
            if item is None:
                pass
            else:
                essay_id = item[0]
                content = item[1]
                pubdate = item[2]
                if content is None:
                    pass
                else:
                    # Skip essays whose keywords were already inserted.
                    duplicated = 1
                    try:
                        duplicated = row_count(
                            "essay_keywords",
                            condition="`essay_id` = '{}'".format(essay_id),
                            host_IP="10.0.0.101",
                            database="processing") > 0
                    except Exception as e:
                        print(e)
                    if duplicated:
                        dup += 1
                    else:
                        result = tfidf(content, idf, method=1)[0]
                        sql_cols = """`essay_id`, `content`, `pubdate`, `insert_time`"""
                        sql_values = """VALUES ('{}', '{}', '{}', '{}');""".format(
                            essay_id,
                            json.dumps(result, ensure_ascii=False),
                            pubdate,
                            datetime.now().replace(microsecond=0))
                        sql = """INSERT INTO `essay_keywords` ({}) {}""".format(
                            sql_cols, sql_values)
                        try:
                            with connection.cursor() as cur:
                                # print(sql)
                                cur.execute(sql)
                            connection.commit()
                            count += 1
                        except Exception as e:
                            print(e)
            if (count + dup) % 1000 == 0:
                print("Process {} has inserted {} essays, duplicates skipped {} \n"
                      .format(w_id, count, dup))
        n += limit
        start += limit
    print("===================Process {} has ended==============".format(w_id))
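
# Hedged design-choice sketch (assumed alternative, not the original code):
# building the INSERT by string formatting breaks on content containing quotes
# and is open to SQL injection; pymysql also supports parameterized execution,
# roughly as below. The helper name is hypothetical.
def insert_keywords(connection, essay_id, result, pubdate):
    import json
    from datetime import datetime
    sql = ("INSERT INTO `essay_keywords` "
           "(`essay_id`, `content`, `pubdate`, `insert_time`) "
           "VALUES (%s, %s, %s, %s)")
    with connection.cursor() as cur:
        # pymysql escapes each parameter, so quotes inside the JSON are safe
        cur.execute(sql, (essay_id,
                          json.dumps(result, ensure_ascii=False),
                          pubdate,
                          datetime.now().replace(microsecond=0)))
    connection.commit()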
def essay_tfidf():
    """
    :return:
    """
    try:
        data = json.loads(request.data)
        eid = data["essay_id"]
        limit = int(data["limit"])
        method = int(data["method"])
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })
    if limit <= 50:
        try:
            data = data_fetch("`content`",
                              "essay_keywords",
                              database="processing",
                              host_IP="10.0.0.101",
                              condition="`essay_id`='{}'".format(eid),
                              user_name="lduan",
                              password="******")
            if len(data) == 0:
                pass
            else:
                content_dic = json.loads(data[0][0], encoding="utf8")
                ll = list(content_dic.items())
                ll.sort(key=operator.itemgetter(1), reverse=True)
                tops = ll[:min(limit, len(ll))]
                output = {}
                for item in tops:
                    output[item[0]] = item[1]
                return json.dumps({
                    'code': 1,
                    'msg': 'SUCCESS',
                    'data': output
                }, ensure_ascii=False)
        except Exception as e:
            return json.dumps({
                'code': 0,
                'msg': 'FAILURE',
                'data': {
                    'error_msg': str(e)
                }
            })
    try:
        essay, pubdate = data_fetch("`content`, `pubdate`",
                                    "essays",
                                    condition="`id` = '{}'".format(eid),
                                    host_IP="192.168.164.15",
                                    user_name="raw",
                                    password="******",
                                    database="raw")[0]
        result, fifty = tfidf(essay, idf, limit, method=method)
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })
    sql_cols = """`essay_id`, `content`, `pubdate`, `insert_time`"""
    sql_values = """VALUES ('{}', '{}', '{}', '{}');""".format(
        eid, json.dumps(fifty, ensure_ascii=False), pubdate,
        datetime.now().replace(microsecond=0))
    sql = """INSERT INTO `essay_keywords` ({}) {}""".format(sql_cols, sql_values)
    try:
        with connection.cursor() as cur:
            cur.execute(sql)
        connection.commit()
        return json.dumps({'code': 1, 'msg': 'SUCCESS', 'data': result})
    except IntegrityError as e:
        if e.args[0] == 1062:
            return json.dumps({'code': 1, 'msg': 'SUCCESS', 'data': result})
        else:
            raise
    except Exception as e:
        return json.dumps({
            'code': 0,
            'msg': 'FAILURE',
            'data': {
                'error_msg': str(e)
            }
        })
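
# Hedged client-side usage sketch: the endpoint expects a JSON body with
# "essay_id", "limit" (values <= 50 can be served from the cached
# `essay_keywords` table) and "method", and replies with {"code", "msg",
# "data"}. The host and route path below are assumptions for illustration
# only; the essay_id value is a placeholder.
import requests

resp = requests.post(
    "http://127.0.0.1:5000/essay_tfidf",  # hypothetical route
    json={"essay_id": "some_essay_id", "limit": 20, "method": 1})
print(resp.json())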
def worker(w_id, start, end):
    print("===================Process {} has Started==============".format(w_id))
    if w_id % 2 == 0:
        url = "http://192.168.164.15:49001/seg/s"
    else:
        url = "http://10.0.0.59:49001/seg/s"
    with open("../../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    n = start
    limit = min(end - start, 10000)
    title_whole = []
    content_whole = []
    count = 0
    tmp = 0
    while n < end:
        if end - n < limit:
            limit = end - n
        data = data_fetch("`title`, `content`",
                          "essays",
                          host_IP="192.168.164.15",
                          user_name="raw",
                          password="******",
                          database="raw",
                          limit=limit,
                          start=start,
                          tail_condition="ORDER BY `update_time`")
        for item in data:
            title_dic = {}
            content_dic = {}
            title = item[0]
            content = item[1]
            if title is None:
                t_result = None
            else:
                try:
                    title = html_cleanup(title).replace(" ", "").replace("\n", "")
                    t_result = requests.post(url, data={"_q": title}).json()["data"]
                except Exception as e:
                    print(e)
                    t_result = None
                    time.sleep(1)
            if content is None:
                c_result = None
            else:
                try:
                    content = html_cleanup(content).replace(" ", "").replace("\n", "")
                    # if len(content) > tmp:
                    #     tmp = len(content)
                    #     print(len(content))
                    #     print(content)
                    if len(content) < 10000:
                        c_result = requests.post(url, data={"_q": content}).json()["data"]
                    else:
                        content_list = text_spliter(content)
                        reqtoolong = [
                            requests.post(url, data={"_q": item}).json()["data"]
                            for item in content_list
                        ]
                        c_result = reqtoolong[0]
                        for evenmore in reqtoolong[1:]:
                            c_result = c_result + " " + evenmore
                except KeyError:
                    c_result = None
                except Exception as e:
                    print(e)
                    c_result = None
                    time.sleep(1)
            if t_result is None:
                pass
            else:
                t_wordlist = t_result.split(" ")
                for item in t_wordlist:
                    if len(item) > 0:
                        # item_l = item.split("/")
                        # word = item_l[0]
                        # pos = item_l[1]
                        # if pos == "w":
                        #     pass
                        # else:
                        if item in stopwords:
                            pass
                        elif isPunctuation(item):
                            pass
                        else:
                            if item in title_dic.keys():
                                title_dic[item] += 1
                            else:
                                title_dic[item] = 1
            if c_result is None:
                pass
            else:
                c_wordlist = c_result[1:-1].split(" ")
                for item in c_wordlist:
                    if len(item) > 0:
                        # item_l = item.split("/")
                        # word = item_l[0]
                        # pos = item_l[1]
                        # if pos == "w":
                        #     pass
                        # else:
                        if item in stopwords:
                            pass
                        else:
                            if item in content_dic.keys():
                                content_dic[item] += 1
                            else:
                                content_dic[item] = 1
            title_whole.append(title_dic)
            content_whole.append(content_dic)
            count += 1
            if count % 1000 == 0:
                with open("../../../../data/output/w_freq/title/result{}.pickle"
                          .format(w_id), "wb") as f:
                    pickle.dump(title_whole, f)
                with open("../../../../data/output/w_freq/content/result{}.pickle"
                          .format(w_id), "wb") as f:
                    pickle.dump(content_whole, f)
                print("Process {} has processed {} essays... \n".format(w_id, count))
        n += limit
        start += limit
    with open("../../../../data/output/w_freq/title/result{}.pickle".format(w_id), "wb") as f:
        pickle.dump(title_whole, f)
    with open("../../../../data/output/w_freq/content/result{}.pickle".format(w_id), "wb") as f:
        pickle.dump(content_whole, f)
    print("===================Process {} has ended==============".format(w_id))
import pickle
import re

from utils.mysql import data_fetch
from utils.read_txt import read_txt
from utils.text_cleaner import html_cleanup

content = ""
essay_id = ""
start = 0
limit = 5000
data = data_fetch("`title`, `meta_content`, `content`",
                  "essays",
                  limit=limit,
                  start=start)
with open("../../../data/temp/essays_tmp.pickle", "wb") as f:
    pickle.dump(data, f)
with open("../../../data/temp/essays_tmp.pickle", "rb") as f:
    data = pickle.load(f)
keywords = read_txt("../../../data/nlp/essay_author/author_keywords.txt")
black_list = ["图片来源", "配图来源", "来源为网络", "数据来源", "请勿转载", "转载以及向", "来源为网络"]
count = 0
for content in data:
    title = content[0]
    meta_data = content[1]
    if content[2] is not None:
            cur.execute(sql)
        connection.commit()
    except Exception as e:
        print(sql)
        print(e)


host_IP = "192.168.164.11"
db_name = "dw_biz"
uname = "dwapi"
pword = "api@szqj"
channel_data = data_fetch(
    "`channel_id`, `uid`, `id`, `insert_time`, `update_time`",
    "biz_user_channels",
    limit=None,
    host_IP=host_IP,
    user_name=uname,
    password=pword,
    database=db_name)
search_data = data_fetch(
    "`uid`, `content`, `status`, `result_count`, `type`, `id`, `insert_time`, `update_time`",
    "biz_search_records",
    limit=None,
    host_IP=host_IP,
    user_name=uname,
    password=pword,
    database=db_name)
topic_data = data_fetch(
    "`topic_id`, `uid`, `enable`, `id`, `insert_time`, `update_time`",
    "biz_topic_subscribers",
    limit=None,