def info_extract(content):
    """Extract candidate account (media) names and author names from essay HTML."""
    account_associate_rules = read_txt("../../../data/nlp/essay_author/account_associate_rules.txt")
    author_associate_rules = read_txt("../../../data/nlp/essay_author/author_associate_rules.txt")
    author_blklist = read_txt("../../../data/nlp/essay_author/author_blklist.txt")
    account_blklist = read_txt("../../../data/nlp/essay_author/account_blklist.txt")
    count = 0
    media_list = []
    author_list = []
    if content is not None:
        content = html_cleanup(content)
        # Author names: apply each association rule, split candidates on "、",
        # and drop anything that matches the author blacklist.
        for auth_rule in author_associate_rules:
            result = re_between_findall(content, auth_rule)
            if result is not None:
                for item in result:
                    s = remove_punctuation(item[1], exception="、")
                    if s == "":
                        continue
                    authors = s.split("、")
                    for auth in authors:
                        if len(auth) < 2:
                            # a single-character token suggests a bad split; stop with this match
                            break
                        if any(blk_item in auth for blk_item in author_blklist):
                            continue
                        count += 1
                        author_list.append(auth)
        # Account (media) names: same idea, with the extra requirement that
        # removing punctuation did not change the candidate string.
        for acc_rule in account_associate_rules:
            result = re_between_findall(content, acc_rule)
            if result is not None:
                for item in result:
                    s = remove_punctuation(item[1], exception="、")
                    if s == "" or len(s) != len(item[1]):
                        continue
                    medias = s.split("、")
                    for media in medias:
                        if len(media) < 2:
                            break
                        # obj = SogouWeixin(media)
                        # info = obj.extract_user_info()
                        # print("What we found: {} \nWhat we got: {}".format(media, info["nickname"]))
                        # if media == info["nickname"]:
                        if len(media) > 22 and "、" not in media:
                            continue
                        if any(blk_item in media for blk_item in account_blklist):
                            continue
                        count += 1
                        media_list.append(media)
                        # print(media.replace(" ", "").replace("文丨", ""))
    return [list(set(media_list)), list(set(author_list))]
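# --- Hedged usage sketch (not part of the original file) ---
# One way info_extract might be driven: fetch a small batch of essays with the
# same data_fetch helper used in the other scripts here and print any account /
# author candidates. The table, host, and database values are copied from
# get_word_freq() below and may need adjusting; data_fetch is assumed to be
# available in scope the same way it is in the surrounding modules.
def demo_info_extract(batch_size=100):
    rows = data_fetch("content", "wechat_essays", limit=batch_size, start=0,
                      host_IP="192.168.164.11", database="wechat_v1")
    for row in rows:
        medias, authors = info_extract(row[0])
        if medias or authors:
            print("accounts: {} | authors: {}".format(medias, authors))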
def get_word_freq():
    # Load stop words and the keyword list, and register the keywords as a
    # jieba user dictionary so they are kept as single tokens.
    with open("../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    df = pd.read_excel("../../../data/output/关键词.xlsx")
    keywords = []
    for ind in df.keys():
        for word in df[ind]:
            if not pd.isna(word):
                keywords.append(str(word))
    with open("../../../data/output/keywords.txt", "w", encoding="utf8") as file:
        for words in keywords:
            file.write("{} 3 nt\n".format(words))
    jieba.load_userdict("../../../data/output/keywords.txt")

    # Walk the table in chunks and build one {word: count} dict per essay.
    frequency = []
    num = row_count("wechat_essays", host_IP="192.168.164.11", database="wechat_v1")
    n = 0
    limit = 1000
    while n < num:
        if num - n < limit:
            limit = num - n
        data = data_fetch("content", "wechat_essays", limit=limit, start=n,
                          host_IP="192.168.164.11", database="wechat_v1")
        for item in data:
            fre = {}
            cleaned = html_cleanup(item[0])
            seg = jieba.cut(cleaned)
            for word in seg:
                if word.replace(" ", "") == "":
                    continue
                if word not in stopwords:
                    if word in fre:
                        fre[word] += 1
                    else:
                        fre[word] = 1
            frequency.append(fre)
        n += limit
        print("=== Done {} rows".format(n))
    with open("../../../data/output/word_freq.pickle", "wb") as file:
        pickle.dump(frequency, file)
    return frequency
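# --- Hedged usage sketch (not part of the original file) ---
# get_word_freq() stores one {word: count} dict per essay. A minimal sketch,
# assuming the pickle path written above, that merges those per-essay dicts
# into a single corpus-level count and returns the top-k words.
import pickle
from collections import Counter


def top_corpus_words(path="../../../data/output/word_freq.pickle", k=20):
    with open(path, "rb") as f:
        per_essay = pickle.load(f)
    total = Counter()
    for fre in per_essay:
        total.update(fre)
    return total.most_common(k)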
def __init__(self, text):
    """
    Take raw text as input, then parse it and store the processed result.

    :param text: raw text input
    """
    # clean up HTML formatting noise
    self.text = html_cleanup(text)
    self.output_raw = str(HanLP.parseDependency(self.text))
    self.parsed, self.core_ind = self.parse()
    self.core_ind_update()
def worker(w_id, start, end):
    # Load stop words and register the keyword list as a jieba user dictionary,
    # then write one space-separated, segmented line per essay to a per-worker
    # training file.
    with open("../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    df = pd.read_excel("../../data/output/关键词.xlsx")
    keywords = []
    for ind in df.keys():
        for word in df[ind]:
            if not pd.isna(word):
                keywords.append(str(word))
    with open("../../data/output/keywords.txt", "w", encoding="utf8") as file:
        for words in keywords:
            file.write("{} 3 nt\n".format(words))
    jieba.load_userdict("../../data/output/keywords.txt")

    n = start
    limit = 1000
    with open("../../data/output/train{}.dat".format(w_id), "w", encoding="utf-8") as f:
        while n < end:
            if end - n < limit:
                limit = end - n
            data = data_fetch("content", "wechat_essays_v2", limit=limit, start=n,
                              host_IP="192.168.164.11", database="wechat")
            for item in data:
                cleaned = html_cleanup(item[0])
                seg = jieba.cut(cleaned)
                output = ""
                for word in seg:
                    if word.replace(" ", "") == "":
                        continue
                    if word not in stopwords:
                        output += word + " "
                f.write(output + "\n")
            n += limit
            print("id: {} === Done {} rows".format(w_id, n))
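# --- Hedged launcher sketch (not part of the original file) ---
# worker() covers a half-open row range [start, end), so a natural way to run
# it is to split the table across a few processes. row_count() is the same
# helper used in get_word_freq(); the table and connection details are copied
# from worker() above. This driver is an assumption, not the original one.
from multiprocessing import Process


def launch_workers(num_workers=4):
    total = row_count("wechat_essays_v2", host_IP="192.168.164.11", database="wechat")
    step = total // num_workers
    procs = []
    for w_id in range(num_workers):
        start = w_id * step
        end = total if w_id == num_workers - 1 else (w_id + 1) * step
        p = Process(target=worker, args=(w_id, start, end))
        p.start()
        procs.append(p)
    for p in procs:
        p.join()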
with open("../../../data/temp/essays_tmp.pickle", "wb") as f: pickle.dump(data, f) with open("../../../data/temp/essays_tmp.pickle", "rb") as f: data = pickle.load(f) keywords = read_txt("../../../data/nlp/essay_author/author_keywords.txt") black_list = ["图片来源", "配图来源", "来源为网络", "数据来源", "请勿转载", "转载以及向", "来源为网络"] count = 0 for content in data: title = content[0] meta_data = content[1] if content[2] is not None: content = html_cleanup(content[2]) # print(content) for kw in keywords: iter = re.finditer(kw, content) indices = [m.start(0) for m in iter] if len(indices) > 0: for ind in indices: interesting = content[max(0, ind - 10):min(ind + 50, len(content))] has_black_list = False for blk in black_list: if blk in interesting: has_black_list = True break if not has_black_list: # print(title) print(interesting + "\n")
def worker(w_id, start, end):
    print("===================Process {} has Started==============".format(w_id))
    if w_id % 2 == 0:
        url = "http://192.168.164.15:49001/seg/s"
    else:
        url = "http://10.0.0.59:49001/seg/s"
    with open("../../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    n = start
    limit = min(end - start, 10000)
    title_whole = []
    content_whole = []
    count = 0
    tmp = 0
    while n < end:
        if end - n < limit:
            limit = end - n
        data = data_fetch("`title`, `content`", "essays",
                          host_IP="192.168.164.15", user_name="raw",
                          password="******", database="raw",
                          limit=limit, start=start,
                          tail_condition="ORDER BY `update_time`")
        for item in data:
            title_dic = {}
            content_dic = {}
            title = item[0]
            content = item[1]
            if title is None:
                t_result = None
            else:
                try:
                    title = html_cleanup(title).replace(" ", "").replace("\n", "")
                    t_result = requests.post(url, data={"_q": title}).json()["data"]
                except Exception as e:
                    print(e)
                    t_result = None
                    time.sleep(1)
            if content is None:
                c_result = None
            else:
                try:
                    content = html_cleanup(content).replace(" ", "").replace("\n", "")
                    # if len(content) > tmp:
                    #     tmp = len(content)
                    #     print(len(content))
                    #     print(content)
                    if len(content) < 10000:
                        c_result = requests.post(url, data={"_q": content}).json()["data"]
                    else:
                        content_list = text_spliter(content)
                        reqtoolong = [
                            requests.post(url, data={"_q": item}).json()["data"]
                            for item in content_list
                        ]
                        c_result = reqtoolong[0]
                        for evenmore in reqtoolong[1:]:
                            c_result = c_result + " " + evenmore
                except KeyError:
                    c_result = None
                except Exception as e:
                    print(e)
                    c_result = None
                    time.sleep(1)
            if t_result is not None:
                t_wordlist = t_result.split(" ")
                for item in t_wordlist:
                    if len(item) > 0:
                        # item_l = item.split("/")
                        # word = item_l[0]
                        # pos = item_l[1]
                        # if pos == "w":
                        #     pass
                        # else:
                        if item in stopwords:
                            pass
                        elif isPunctuation(item):
                            pass
                        else:
                            if item in title_dic:
                                title_dic[item] += 1
                            else:
                                title_dic[item] = 1
            if c_result is not None:
                c_wordlist = c_result[1:-1].split(" ")
                for item in c_wordlist:
                    if len(item) > 0:
                        if item in stopwords:
                            pass
                        else:
                            if item in content_dic:
                                content_dic[item] += 1
                            else:
                                content_dic[item] = 1
            title_whole.append(title_dic)
            content_whole.append(content_dic)
            count += 1
            if count % 1000 == 0:
                with open("../../../../data/output/w_freq/title/result{}.pickle".format(w_id), "wb") as f:
                    pickle.dump(title_whole, f)
                with open("../../../../data/output/w_freq/content/result{}.pickle".format(w_id), "wb") as f:
                    pickle.dump(content_whole, f)
                print("Process {} has processed {} essays... \n".format(w_id, count))
        n += limit
        start += limit
    with open("../../../../data/output/w_freq/title/result{}.pickle".format(w_id), "wb") as f:
        pickle.dump(title_whole, f)
    with open("../../../../data/output/w_freq/content/result{}.pickle".format(w_id), "wb") as f:
        pickle.dump(content_whole, f)
    print("===================Process {} has ended==============".format(w_id))
def worker(w_id, start, end):
    print("===================Process {} has Started==============".format(w_id))
    if w_id % 2 == 0:
        url = "http://192.168.164.15:49001/seg/s"
    else:
        url = "http://10.0.0.59:49001/seg/s"
    with open("../../../../data/nlp/stop_words.pickle", "rb") as file:
        stopwords = pickle.load(file)
    dic_path = "../../../../data/output/account_name_unique_jieba.txt"
    jieba.load_userdict(dic_path)
    n = start
    limit = min(end - start, 30000)
    count = 0
    tmp = 0
    cou = 0
    while n < end:
        title_whole = []
        content_whole = []
        if end - n < limit:
            limit = end - n
        data = data_fetch("`title`, `content`", "essays",
                          host_IP="192.168.164.15", user_name="raw",
                          password="******", database="raw",
                          limit=limit, start=start,
                          tail_condition="ORDER BY `update_time`")
        for item in data:
            title_dic = {}
            content_dic = {}
            title = item[0]
            content = item[1]
            if title is None:
                t_result = None
            else:
                try:
                    title = replace_punctuation(
                        html_cleanup(title).replace(" ", "").replace("\n", ""))
                    t_result = "/".join(jieba.cut(title))
                except Exception as e:
                    print(e)
                    t_result = None
                    time.sleep(1)
            if content is None:
                c_result = None
            else:
                try:
                    content = replace_punctuation(
                        html_cleanup(content).replace(" ", "").replace("\n", ""))
                    c_result = "/".join(jieba.cut(content))
                except KeyError:
                    c_result = None
                except Exception as e:
                    print(e)
                    c_result = None
                    time.sleep(1)
            if t_result is not None:
                t_wordlist = t_result.split("/")
                for item in t_wordlist:
                    if len(item) > 0 and item != " ":
                        if item in stopwords:
                            pass
                        elif isPunctuation(item):
                            pass
                        else:
                            if item in title_dic:
                                title_dic[item] += 1
                            else:
                                title_dic[item] = 1
            if c_result is not None:
                c_wordlist = c_result.split("/")
                for item in c_wordlist:
                    if len(item) > 0 and item != " ":
                        if item in stopwords:
                            pass
                        else:
                            if item in content_dic:
                                content_dic[item] += 1
                            else:
                                content_dic[item] = 1
            title_whole.append(title_dic)
            content_whole.append(content_dic)
            count += 1
            if count % 10000 == 0:
                with open("../../../../data/output/w_freq0/title/result{}-{}.pickle".format(w_id, cou), "wb") as f:
                    pickle.dump(title_whole, f)
                with open("../../../../data/output/w_freq0/content/result{}-{}.pickle".format(w_id, cou), "wb") as f:
                    pickle.dump(content_whole, f)
                print("Process {} has processed {} essays... \n".format(w_id, count))
        n += limit
        cou += 1
        start += limit
    with open("../../../../data/output/w_freq0/title/result{}[-1].pickle".format(w_id), "wb") as f:
        pickle.dump(title_whole, f)
    with open("../../../../data/output/w_freq0/content/result{}[-1].pickle".format(w_id), "wb") as f:
        pickle.dump(content_whole, f)
    print("===================Process {} has ended==============".format(w_id))
def tfidf(content, idf, limit=50, method=0):
    with open("../../../../data/nlp/stop_words.pickle", "rb") as stopfile:
        stopwords = pickle.load(stopfile)
    # print("/".join(jieba.cut("众智财税智库告诉了我们一个严肃的道理")))
    content_dic = {}
    url = "http://10.0.0.59:49001/seg/s"
    # content = """<p>智联招聘日前发布的《2018年区块链人才供需与发展研究报告》显示,今年以来,区块链人才需求增长迅猛。以2017年第三季度的人才需求量为基数,2018年第二季度的区块链人才较2017年第三季度暴增636.83%。从分季度数据看,区块链人才的需求有逐步扩张的趋势,但波动也较大,随概念的热度呈现起伏态势。</p><p>区块链人才供应量充足但拥有技能的人十分稀少,过去一年向区块链相关岗位投递简历的人数远高于行业整体需求,是需求的3.6倍,从数量上看供给十分充足,人们对这个新兴领域的向往可见一斑。但具备区块链相关技能和工作经验的求职者,也就是存量人才仅占需求量的7%。</p><p><strong>区块链的人才需求主要集中在计算机、金融行业</strong></p><p>区块链职位最为集中的行业主要有互联网行业,占比35.2%居首;IT服务行业,占比20%;计算机软件行业,占比10.8%;以及基金证券行业,占比8.3%;网络游戏行业占比5.2%。需求结构与区块链技术落地的实际应用场景相关,业务发展速度较快的领域赢得了更多青睐。</p><p><strong>算法工程师和软件工程师是紧俏岗位</strong></p><p>算法工程师和软件工程师是紧俏岗位,但供给端基本空白:在企业需求方面,算法工程师是需求最多的岗位,占比10.9%,但投递量却很低。从存量人才结构上看,在核心技术岗位上的占比并不高,当前管理人员占比较高。</p><p><strong>区块链招聘需求集中在一线、新一线城市</strong></p><p>从目前区块链职位的城市分布来看,该领域的岗位需求主要集中在一线和新一线城市中。其中,北京、上海和深圳位于第一梯队,职位占比分别达到24%、20%和10%。杭州和广州紧随其后,分别占7%和5%。无一例外,北京、上海、广东、江苏、浙江和山东等省市,均颁布了区块链相关优惠政策和发展规划,鼓励区块链相关产业在当地创业和发展给予产业加持。</p><p><strong>区块链职位高薪难匹配存量人才薪酬更高</strong></p><p>需求的高速增长,加上满足条件的人才稀缺,企业想到的第一个手段就是通过高薪揽才。从薪酬分布区间来看,区块链招聘职位分布最多的区间为10000-15000元/月,占比23%;以及15000-25000元/月,占比29.2%。</p><p>智联招聘2018年第二季度全国37个主要城市的平均招聘薪酬为7832元/月,而区块链招聘职位中,8000元以上的高薪职位却占据了主流。可以看出,该领域的工资水平远远超过全国平均招聘薪酬,为了吸引供应有限的相关人才,企业不惜高薪抢人。</p><p>从投递供给人群看,他们当前薪酬主要集中在10001-15000元/月的区间,占比20.5%;6001-8000元/月区间占比19.1%,整体薪酬区间偏低,他们向往更高的薪酬。但从整体技能上来看,追求高薪确实存在挑战,这也导致虽然市场上有数倍于需求的人才供应量,但企业依然难招到合适的人才。</p><p>END</p><hr /><p><img src="https://mmbiz.qpic.cn/mmbiz_jpg/zq0bhlY6SQ5JvRRZKN4K9sNPBSicCqL0GNJ6kW8NiaFD3mXwPGc7QtiaGERNtGXqLxIO1KV4WYYRPeZbG2ibUTtfmQ/640?wx_fmt=jpeg"></p>"""
    content = replace_punctuation(html_cleanup(content))
    if content is None:
        c_result = None
    else:
        if method == 0:
            # Segment via the HTTP segmentation service, splitting very long texts.
            try:
                content = html_cleanup(content).replace(" ", "").replace("\n", "")
                if len(content) < 10000:
                    c_result = requests.post(url, data={"_q": content}).json()["data"]
                else:
                    content_list = text_spliter(content)
                    reqtoolong = [
                        requests.post(url, data={"_q": item}).json()["data"]
                        for item in content_list
                    ]
                    c_result = reqtoolong[0]
                    for evenmore in reqtoolong[1:]:
                        c_result = c_result + " " + evenmore
            except KeyError:
                c_result = None
            except Exception as e:
                print(e)
                c_result = None
                time.sleep(1)
        elif method == 1:
            # Segment locally with jieba.
            try:
                content = replace_punctuation(
                    html_cleanup(content).replace(" ", "").replace("\n", ""))
                c_result = "/".join(jieba.cut(content))
            except KeyError:
                c_result = None
            except Exception as e:
                print(e)
                c_result = None
                time.sleep(1)
        else:
            c_result = None
    if c_result is None:
        # Keep the return shape consistent with the normal path below.
        return {}, {}
    if method == 0:
        c_wordlist = c_result[1:-1].split(" ")
    elif method == 1:
        c_wordlist = c_result.split("/")
    else:
        c_wordlist = None
    for item in c_wordlist:
        if len(item) > 0 and item != " ":
            if item in content_dic:
                content_dic[item] += 1
            else:
                content_dic[item] = 1
    # Weight raw term counts: zero out pure numbers and stop words, then divide
    # each count by its value in the idf table (defaulting to 1 when missing).
    for item in content_dic.keys():
        if isPureNumber(item):
            content_dic[item] = 0
        elif item in stopwords:
            content_dic[item] = 0
        else:
            try:
                item_idf = idf[item]
            except KeyError:
                item_idf = 1
            try:
                content_dic[item] = content_dic[item] / item_idf
            except ZeroDivisionError:
                content_dic[item] = 0
    ll = list(content_dic.items())
    ll.sort(key=operator.itemgetter(1), reverse=True)
    llout = ll[:min(limit, len(ll))]
    llfifty = ll[:min(50, len(ll))]
    output = {}
    fiftyshades = {}
    for item in llout:
        output[item[0]] = item[1]
    for item in llfifty:
        fiftyshades[item[0]] = item[1]
    return output, fiftyshades
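# --- Hedged usage sketch (not part of the original file) ---
# tfidf() divides each raw term count by idf[item] (defaulting to 1 when a term
# is missing), so the `idf` argument is effectively a per-term divisor. A simple
# choice, assuming the per-essay frequency pickles written by the workers above,
# is the document frequency of each term. The glob pattern is an assumption
# about where those pickles live relative to this file.
import glob
import pickle


def build_divisor_table(pattern="../../../../data/output/w_freq0/content/*.pickle"):
    divisor = {}
    for path in glob.glob(pattern):
        with open(path, "rb") as f:
            for essay_dic in pickle.load(f):
                for word in essay_dic:
                    divisor[word] = divisor.get(word, 0) + 1
    return divisor

# Example call, with method=1 so segmentation runs locally through jieba:
#   top_terms, top_fifty = tfidf(essay_html, build_divisor_table(), limit=100, method=1)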
from nlp.dependency_parsing.parser import Sentence
from utils.text_cleaner import html_cleanup


def pruning(txt):
    s = Sentence(txt)
    output = ""
    for w in s.parsed:
        if w.dependency in s.core_ind or w.id in s.core_ind:
            output += w.word
    return output


if __name__ == "__main__":
    # txt = "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作"
    # txt = "张先生还具体帮助他确定了把画雄鹰、松鼠和麻雀作为主攻目标。"
    txt = "<p>这几天,有个新闻很重要,但被很多人忽略。就是北大资源和北大附中终于“在一起”了。<p>新闻是这样说的:近日,北大资源集团与北大附中战略合作签约仪式在京举行,双方宣布北大附中首批外埠学校将落地北大资源项目所在地――天津和开封。<p>邦主可以预见,看了这则新闻,很多人想拍邦主一板砖:这不就是学区房概念吗!只不过是北大附中!但也不值得邦主一惊一乍且专文论述吧!<p>且慢这么看!在这个最牛学区房的背后,邦主想表达的是,不要忘记这个“在一起”背后的一个关键信号:在万科郁亮所定义的“白银十年”,“服务”超越“关系”和资本,成为房地产市场一个新的核心竞争要素,而那些能击中客户需求的“痛点”,且能创造独特客户价值的房企,将有机会实现新一轮的“弯道超车”。<p><strong>1、“超车”的逻辑</strong><p>郁亮之所以把未来十年称为“白银十年”,意思是跟过去十年相比,虽然房产需求将大幅减少,但也有得做。而在这样的市场背景下,除了像融创这样定位于高端市场的公司继续满足改善性需求外,大量房企开始觊觎“社区运营”(其实年初孙宏斌也曾发微博说,融创今年要认真做好物业管理服务)。这样做有两方面的目的:一是开辟新的利润增长点,也就是从挣盖房子的钱,转变为挣为业主服务的钱;二是提升自身产品的附加价值,构筑差异化的市场竞争力。<p>最近在香港上市的“彩生活”,以及乐居和分众这几天正在力推的“实惠”,都属于前一种;而万科在社区内开食堂、运营养老公寓,北大资源在社区内开设新文化中心、北大医疗健康管理中心“双中心”,以及世茂房地产的“云服务”,则属于后一种。<p>在地产邦7月8日推送的“世茂副总裁蔡雪梅:高周转模式Out了,未来需要‘以利定产’”一文中,蔡雪梅认为,在房地产的“下半场”中,仅仅靠复制、靠提高周转率,这些招数都不足以胜出,“要能给客户全然不同的产品和服务,把服务作为整个开发商链条的重要一环,这个真的很有效。”<p>也就是说,未来服务做得好的房企,将有较高概率后来居上,成为市场的主流,而首要目标仍是冲规模的房企则要小心了。<p><strong>2、找准需求的“痛点”</strong><p>下半场是服务的竞争,其实现在已是共识了。不过,邦主在此提醒一声:不要为做服务而做,关键是要找准需求的“痛点”。比如说,万科开设“第五食堂”,确确实实解决了远郊小青年们的吃饭问题,而其养老公寓则解决的是,小青年们想把外地的父母接到一起生活,但可能又不太愿意住在同一个屋檐下的尴尬现实,况且,养老公寓还能提供必要的医疗服务。<p>邦主之所以拿“北大资源+北大附中”这个例子来重点分析,也主要是为了讲怎么找准市场需求的“痛点”。不少人知道五道口号称宇宙中心,就是因为五道口的房子是北京最好的学区房,每平方米高达10万元以上。<p>可以说,孩子的教育,是普天之下父母的一个“痛点”需求。北大资源集团董事长余丽告诉媒体,北大资源“追求”北大附中好多年,就是看中了这个“痛点”。<p>要知道,北大附中是名校中的名校,跟绝大多数学校把孩子当成考试机器不同的是,北大附中提倡的是一种“散养”式的教育理念,而这会让北大资源的客户“眼前一亮”,因为完全超出了预期嘛。<p>因此,做服务一定要找准“痛点”,不然就可能是隔靴搔痒。比如,海尔地产前几年就在做类似的事――开发业主之间沟通的“云平台”,但事实上一个社区的业主之间是很少沟通的,这件事后来也就不了了之了。<p><strong>3、服务要具备“稀缺性”</strong><p>物以稀为贵,稀缺的,才是最具价值且不可复制的。咱们中国人最喜欢一哄而上,你开“第五食堂”,我就开“第六食堂”,你有“云服务”,我就有“云云服务”,总之,你有我有全都有。<p>这种同质化的社区服务,不用猜今后肯定会大量涌现。这就要考验房企的运营管理能力了。如果管理得当,完全可以再出几个“彩生活”,但若管理不当,反而会拉低企业自身产品的竞争力。所以一定要慎之又慎。</p><p>不过,邦主认为最理想的状态是,你提供的“服务”,具有资源的稀缺性,有独特而不可复制的价值,五道口能成为宇宙中心,靠的就是学区房的“稀缺性”。<p>北大附中就具有这种“稀缺性”。北大附中此次落地天津和河南开封,是其建校以来首次直接投入师资和管理队伍在异地办校,可见其对扩张的慎重,但这也保证了这种学区房的稀缺性。<p>再比如北大资源今年暑期协办的北京大学“2014中学生考古文博训练营”,北大教授全程参与、讲解,北大资源项目所在的9个城市的中学生可报名参加,这种服务也具备“稀缺性”。<p>所以,我们可以得出,“白银十年”房企有两条路可走:一是社区运营管理制胜,可能管得好了,甚至都不收物业费了;二是独有资源制胜,试想,如果北大资源再带上北大医院,再次击中客户对于健康这一痛点需求,还有哪个开发商可以抵挡得住呢?<p>说到底,“白银十年”房企比拼的是整合优质、稀缺资源的能力。今后,会有更多的“北大资源+”、“万科+”、“世茂+”出现。至于“+”后面是什么,我们现在可能还难以想象。</p>"
    txt = html_cleanup(txt)
    sentences = txt.replace("!", "。").replace("?", "。").replace(".", "。").split("。")
    for sentence in sentences:
        print("Original: {} \n Pruned: {} \n".format(sentence, pruning(sentence)))
        print(Sentence(sentence))
        print("\n")