def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    #train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)

    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print len(set(train['major']))
    # keep only the first segmented token of the major, or 'none' if empty
    train['major'] = train['major'].apply(lambda x: " ".join(jieba.cut(x, cut_all=False)).split()[0]
                                          if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major'] = test['major'].apply(lambda x: " ".join(jieba.cut(x, cut_all=False)).split()[0]
                                        if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    print len(set(train['major']))
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])

    # fit one encoder on train and test together so the gender codes stay consistent across splits
    le = LabelEncoder()
    le.fit(list(train['gender']) + list(test['gender']))
    train['gender'] = le.transform(train['gender'])
    test['gender'] = le.transform(test['gender'])
    names = train.columns

    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    #test['age'] = test['age'].apply(lambda x: int(x.replace(u'岁', '').encode('ascii')))
    return train, test
def process(self):
    client = MongoClient('localhost', 44444)
    db_temp_train = client['vsm_all_second']
    # one collection per star rating (1-5)
    collections = {str(i): db_temp_train['collection%d' % i] for i in range(1, 6)}
    lineNum = 1
    pat = r"sa(\d)(.*)"
    with open(os.path.join(self.fileroot, self.filename), "r") as fr:
        for line in fr:
            # some review lines are malformed; skip them
            result = re.findall(pat, line)
            if not result:
                print("\n " + str(lineNum) + " something wrong !")
                continue
            starNum = result[0][0]
            if starNum in collections:
                collections[starNum].insert(dict(content=list(jieba.cut(result[0][1].strip()))))
            print('process {0} lines'.format(lineNum), end='\r\t')
            lineNum += 1
    client.close()
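# Side note on the insert() call above: Collection.insert() is deprecated in
# modern pymongo and removed in the 4.x driver; on a current driver the
# equivalent write would be (a sketch, assuming the same document shape):
#   collections[starNum].insert_one({'content': list(jieba.cut(result[0][1].strip()))})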
def neural_init(raw):
    """
    Initialize the fields of clean_person into values a neural network can accept.
    Strings are simply word-segmented.
    :param raw:
    :return:
    """
    # strip all whitespace from the text fields
    raw_without_space = map(lambda a: [re.sub('\s*', '', a[0]),
                                       re.sub('\s*', '', a[1]),
                                       a[2],
                                       re.sub('\s*', '', str(a[3])),
                                       re.sub('\s*', '', str(a[4])),
                                       a[5]], raw)
    jieba.load_userdict("data/jieba_dict.txt")
    raw_cut = [[jieba.cut(x[0], cut_all=False), jieba.cut(x[1], cut_all=False), x[2],
                jieba.cut(x[3], cut_all=False), jieba.cut(x[4], cut_all=False), x[5]]
               for x in raw_without_space]
    # drop stop words
    raw_without_sw = map(lambda a: [filter(not_in, a[0]), filter(not_in, a[1]), a[2],
                                    filter(not_in, a[3]), filter(not_in, a[4]), a[5]], raw_cut)
    # merge all the text lists together
    raw_text_all = map(lambda a: a[0] + a[1] + a[3] + a[4], raw_without_sw)
    # build the dictionary and the corpus
    dic_corpus = algorithm_collection.digitalize(raw_text_all)
    # build the dictionary matrix
    arr = matrix_former(dic_corpus[0], dic_corpus[1], dic_corpus)
    raw_digitalized = map(lambda a, b: np.hstack((a, [b[2], b[5]])), arr, raw_without_sw)
    return raw_digitalized
def get_skill(self, jobname="python", num=5):
    """Extract related skill phrases from the demand field of the JD."""
    key_words = {}
    jd_skill = self.clear_jd(self.jd_database[jobname]['demand'])
    for line in jd_skill:
        for word in jieba.cut(line):
            word = strQ2B(word).lower()
            if word in self.skill_words:
                key_words[word] = key_words.get(word, 0) + 1
    key_words = sorted(key_words.iteritems(), key=lambda x: x[1], reverse=True)
    res = [w[0] for w in key_words[:int(num * np.log(num))]]
    print 'key_words:'
    print '\n'.join(res)
    # skill words that appear in the job name itself go to the front
    for word in jieba.cut(jobname):
        word = strQ2B(word).lower()
        if word in self.skill_words and word not in res:
            res.insert(0, word)
    # keep the top 3 fixed and shuffle the rest
    after_top3 = res[3:]
    np.random.shuffle(after_top3)
    return res[:3] + after_top3[:num-3]
def calc_string_sim(self, a, b):
    try:
        a_seg_list = [x for x in jieba.cut(a, cut_all=False)]
        b_seg_list = [x for x in jieba.cut(b, cut_all=False)]
    except Exception, e:
        print e
        assert False
    # the original body stops after segmentation; a Jaccard overlap of the two
    # token sets is one plausible completion (an assumption, not the original code)
    inter = set(a_seg_list) & set(b_seg_list)
    union = set(a_seg_list) | set(b_seg_list)
    return float(len(inter)) / len(union) if union else 0.0
def process(filename=''):
    con = readFile(filename)
    if con == '':
        return
    global mydict
    global idfdict
    global alpha
    mydict1 = {}
    seg_list = list(jieba.cut(con, cut_all=False))
    print '/ '.join(jieba.cut(con, cut_all=False))
    print seg_list
    f = open('out.txt', 'w')
    #print len(seg_list)
    for o in seg_list:
        s = o.encode('utf-8')
        o = o.encode('gbk')
        if s not in PUNCTUATION:
            mydict.add(o)               # add to the global vocabulary
            mydict1.setdefault(o, 0)    # term-frequency map for this document
            mydict1[o] += 1
            f.write(o + '\n')
    f.close()
    mmax = 0
    for o in mydict1.keys():
        idfdict.setdefault(o, 0)
        inverindex.setdefault(o, set())
        idfdict[o] += 1                         # document frequency (used for IDF)
        inverindex[o].add(fileindex[filename])  # inverted-index entry
        if mydict1[o] > mmax:
            mmax = mydict1[o]
    for o in mydict1.keys():
        # normalize term frequencies (augmented TF) to improve accuracy
        mydict1[o] = alpha + (1 - alpha) * mydict1[o] / mmax
    return mydict1
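# Worked example of the augmented-TF normalization above, assuming alpha = 0.5
# (the actual value of the global alpha is not shown in this snippet):
#   raw counts {a: 4, b: 1}, so mmax = 4
#   a -> 0.5 + 0.5 * 4/4 = 1.0
#   b -> 0.5 + 0.5 * 1/4 = 0.625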
def TextProcessing(train_data_folder, test_data_folder):
    # read and segment every training file, keeping the folder name as the label
    train_data_list = []
    train_class_list = []
    train_data_folder_list = os.listdir(train_data_folder)
    for folder in train_data_folder_list:
        new_path_folder = os.path.join(train_data_folder, folder)
        files = os.listdir(new_path_folder)
        for file in files:
            new_path_file = os.path.join(new_path_folder, file)
            with open(new_path_file, 'r') as fp:
                raw = fp.read()
            words = list(jieba.cut(raw, cut_all=False))
            train_data_list.append(words)
            train_class_list.append(folder.decode('utf-8'))
    # read and segment every test file (unlabeled)
    test_data_list = []
    test_data_files = os.listdir(test_data_folder)
    for file in test_data_files:
        new_path_file = os.path.join(test_data_folder, file)
        with open(new_path_file, 'r') as fp:
            raw = fp.read()
        words = list(jieba.cut(raw, cut_all=False))
        test_data_list.append(words)
    # count word frequencies over the training set, most frequent first
    all_words_dict = {}
    for word_list in train_data_list:
        for word in word_list:
            if all_words_dict.has_key(word):
                all_words_dict[word] += 1
            else:
                all_words_dict[word] = 1
    all_words_tuple_list = sorted(all_words_dict.items(), key=lambda f: f[1], reverse=True)
    all_words_list = list(zip(*all_words_tuple_list)[0])
    return all_words_list, train_data_list, train_class_list, test_data_list
def CalculatorIDF(self):
    # compute IDF for every keyword (the slowest step, about 3300 s)
    for keyword in self.nkeywords:
        count = 0
        # the rows arrive mis-decoded; round-trip latin-1 -> gbk to repair them
        KeyWord = keyword[0].encode('latin-1').decode('gbk')
        print(KeyWord)
        for row in self.NTrainSet:
            Words = jieba.cut(row[0].encode('latin-1').decode('gbk'))
            if KeyWord in Words:
                count = count + 1
                continue
        # 295 is presumably the size of the negative training set; note this
        # raises ZeroDivisionError when the keyword never appears (the intended
        # smoothing may be 295/(count+0.5))
        count = log(295/count + 0.5)
        print(count)
        sql = "insert into nkeyword(word,DF) values('%s',%s)" % (keyword[0].encode('latin-1').decode('gbk'), count)
        mydatabase.ExeUpdateQuery(sql)
    for keyword in self.pkeywords:
        count = 0
        KeyWord = keyword[0].encode('latin-1').decode('gbk')
        print(KeyWord)
        for row in self.PTrainSet:
            Words = jieba.cut(row[0].encode('latin-1').decode('gbk'))
            if KeyWord in Words:
                count = count + 1
                continue
        # 1012 is presumably the size of the positive training set
        count = log(1012/count + 0.5)
        print(count)
        sql = "insert into pkeyword(word,DF) values('%s',%s)" % (keyword[0].encode('latin-1').decode('gbk'), count)
        mydatabase.ExeUpdateQuery(sql)
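# Why the encode('latin-1').decode('gbk') round trip above can work: if GBK
# bytes were mis-decoded as latin-1 on the way in, every original byte survives
# in the resulting string, so re-encoding as latin-1 recovers the bytes and
# decoding as GBK recovers the text. A minimal self-contained illustration:
mojibake = u'中文'.encode('gbk').decode('latin-1')   # how the data arrives
repaired = mojibake.encode('latin-1').decode('gbk')  # the repair used above
assert repaired == u'中文'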
def keyword_frequency(keyword, directory):
    freq_table = {}
    for source in glob.glob(os.path.join(directory, '*')):
        words = ''
        vect = CountVectorizer(ngram_range=(1, 3))
        analyzer = vect.build_analyzer()
        for f in glob.glob(os.path.join(source, '*.json')):
            j = json.load(open(f))
            if j['Language'] == 'chinese':
                # segment Chinese text before feeding it to the n-gram analyzer;
                # keep a separator so titles and contents do not fuse into one token
                words += ' '.join(jieba.cut(j['Title'])) + ' '
                words += ' '.join(jieba.cut(j['Content'])) + ' '
            elif j['Language'] == 'english':
                words += j['Title'] + ' '
                words += j['Content'] + ' '
        ngram_query = analyzer(words)
        fdist = nltk.FreqDist(ngram_query)
        freq = fdist.freq(keyword.lower())
        freq_table[os.path.basename(source)] = freq
    pprint.pprint(freq_table)
    sorted_list = sorted(freq_table, key=freq_table.get, reverse=True)
    print('=================')
    print("%s loves %s most." % (sorted_list[0], keyword))
    plt.bar(range(len(freq_table)), freq_table.values(), align="center")
    plt.xticks(range(len(freq_table)), list(freq_table.keys()))
    plt.show()
def get_position_prob(key, wordprobdct, works):
    seg_lst = jieba.cut(works[0][0])
    pos1prob = get_letter_position(key, "pos1", seg_lst, wordprobdct)
    seglst = jieba.cut(works[1][0])
    pos2prob = get_letter_position(key, "pos2", seglst, wordprobdct)
    # if workprobdct[key]['pos1'].has_key(works[0][1]):
    #     pos1prob = 1000*float(workprobdct[key]['pos1'][works[0][1]])/workprobdct[key]['pos1']['total']
    # else:
    #     pos1prob = 0.0001/workprobdct[key]['pos1']['total']
    # if workprobdct[key]['pos2'].has_key(works[1][1]):
    #     pos2prob = 500*float(workprobdct[key]['pos2'][works[1][1]])/workprobdct[key]['pos2']['total']
    # else:
    #     pos2prob = 0.0001/workprobdct[key]['pos2']['total']
    # if workprobdct[key]['industry1'].has_key(works[0][0]):
    #     industry1prob = float(workprobdct[key]['industry1'][works[0][0]])/workprobdct[key]['industry1']['total']
    # else:
    #     industry1prob = 0.0001/workprobdct[key]['industry1']['total']
    # if workprobdct[key]['industry2'].has_key(works[1][0]):
    #     industry2prob = float(workprobdct[key]['industry2'][works[1][0]])/workprobdct[key]['industry2']['total']
    # else:
    #     industry2prob = 0.0001/workprobdct[key]['industry2']['total']
    # prior for the key, times a heuristic 50:1 weighting of the two position probabilities
    total = float(wordprobdct[key]["total"]) / wordprobdct["total"]
    total = total * (50 * pos1prob + pos2prob)
    return total
def init_from_mongo(self):
    client = MongoClient('mongodb://localhost:27017/')
    db = client.ptt
    posts = db.gossiping_38k
    jieba.set_dictionary('extra_dict/dict.txt.big')
    jieba.analyse.set_stop_words("extra_dict/stop_words_cht.txt")
    for post in posts.find():
        # for the post content: one bag of word counts per post
        d = defaultdict(int)
        content = post['content']
        if post['score'] != 0:
            for l in content.split('\n'):
                if l:
                    for w in jieba.cut(l):
                        d[w] += 1
            if len(d) > 0:
                self.words.append(d)
                self.scores.append(1 if post['score'] > 0 else 0)
        # for the comments: one bag of word counts per comment
        for comment in post['comments']:
            l = comment['content'].strip()
            if l and comment['score'] != 0:
                d = defaultdict(int)
                for w in jieba.cut(l):
                    d[w] += 1
                if len(d) > 0:
                    self.c_words.append(d)
                    self.c_scores.append(1 if comment['score'] > 0 else 0)
    client.close()
def get_description_feature_old(com_description, pro_hisprojects, pro_descriptions, pro_otherinfo):
    try:
        # keep only segments that are known keywords, lower-cased
        com_seg = jieba.cut(com_description, cut_all=False)
        comlst = [seg.lower() for seg in com_seg if keyword_dct.has_key(seg.lower())]
        description_feature = [0 for i in range(len(keyword_dct))]
        his_seg = jieba.cut(pro_hisprojects, cut_all=False)
        hislst = [seg.lower() for seg in his_seg if keyword_dct.has_key(seg.lower())]
        deslst = []
        for description in pro_descriptions:
            des_seg = jieba.cut(description[0], cut_all=False)
            deslst += [seg.lower() for seg in des_seg if keyword_dct.has_key(seg.lower())]
        oth_seg = jieba.cut(pro_otherinfo, cut_all=False)
        othlst = [seg.lower() for seg in oth_seg if keyword_dct.has_key(seg.lower())]
        feature_lst = []
        pro_dec_lst = hislst + deslst + othlst
        com_count = Counter(comlst)
        pro_count = Counter(pro_dec_lst)
        for key in com_count.keys():
            description_feature[keyword_dct[key]] = np.exp(pro_count.get(key, 0.0) / float(com_count[key]))
    except:
        pdb.set_trace()
        traceback.print_exc()
    return description_feature
def repeatability(self, str_1, str_2):
    # decide whether the two sentences are near-duplicates; return 1 if so
    list_1 = jieba.cut(str_1)
    list_2 = jieba.cut(str_2)
    a = list(list_1)
    b = list(list_2)
    count_a = count_b = 0
    # count tokens of a that also occur in b, and vice versa
    for i in a:
        for j in b:
            if i == j:
                count_a += 1
                break
    for j in b:
        for i in a:
            if i == j:
                count_b += 1
                break
    if float(count_a)/len(a) > 0.78 or float(count_b)/len(b) > 0.78:
        return 1
    else:
        return 0
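# The double loops above are an O(len(a) * len(b)) way of counting, for each
# side, how many tokens also occur on the other side. A set-based sketch that
# computes the same ratios (a reformulation for clarity, not the original code):
def repeatability_fast(self, str_1, str_2):
    a = list(jieba.cut(str_1))
    b = list(jieba.cut(str_2))
    set_a, set_b = set(a), set(b)
    ratio_a = float(sum(1 for w in a if w in set_b)) / len(a)
    ratio_b = float(sum(1 for w in b if w in set_a)) / len(b)
    return 1 if ratio_a > 0.78 or ratio_b > 0.78 else 0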
def clean_text(text, remove_stopwords=True):
    #jieba.load_userdict("dict.txt")
    tokens = list(jieba.cut(text))
    # expand contractions token by token
    new_text = []
    for word in tokens:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    text = re.sub(r'\(.*?\)', '', text)  # drop parenthesized spans
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "", text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', '', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', '', text)
    text = re.sub(r'<br />', '', text)
    text = re.sub(r'\'', '', text)
    text = re.sub(r'\”', '', text)
    text = re.sub(r'\#', '', text)
    text = re.sub(r'\“', '', text)
    text = re.sub(r'\《', '', text)
    text = re.sub(r'\》', '', text)
    # re-segment the cleaned text
    text = jieba.cut(text)
    text = " ".join(text)
    return text
def addDictToJieba():
    ##### roadList
    content = open('../data_crawl/finalRoads.txt', 'r').read().strip('\n')
    contentList = content.split('\n')
    print len(contentList)
    ############# load district dict
    districtNameList = grab('/home/yr/intellicredit/data/' + 'districtNameList0503')
    test_sent = [
        "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n"
    ]
    ######## print cut before add dictionary
    #print test_sent[0].decode('utf-8')
    words = jieba.cut(test_sent[0].decode('utf-8'))
    #print('/'.join(words))
    #### add word_dictionary to jieba
    for w in districtNameList[:] + contentList:
        #print w
        jieba.add_word(w)
    #### add district names not in the dictionary to jieba
    jieba.add_word('浦东区')
    jieba.add_word('浦东新区')
    jieba.del_word('上海市')
    jieba.add_word('兰城路')
    words = jieba.cut(test_sent[0].decode('utf-8'))
def article_search(request):
    new_comment_list = []
    article_list = []
    keywords = request.GET.get('search_key')
    # full-mode segmentation of title and body, then exact keyword lookup
    for article in Article.objects.all():
        title_data = ';'.join(jieba.cut(article.title, cut_all=True))
        title_data_list = title_data.split(';')
        content_data = ';'.join(jieba.cut(article.body, cut_all=True))
        content_data_list = content_data.split(';')
        if keywords in content_data_list or keywords in title_data_list:
            article_list.append(article)
    try:
        user_name = request.session["username"]
        user = User.objects.get(name=user_name)
        new_comment_list = Comment.objects.filter(comment_reminder=user, comment_status="N")
    except:
        pass
    comment_count_user = len(new_comment_list)
    category_list = Category.objects.all().order_by('name')
    popular_article = Article.objects.all().order_by('-likes')[:5]
    views_article = Article.objects.all().order_by('views')[:5]
    user_count = User.objects.all().count()
    article_count = Article.objects.all().count()
    latest_login_user = User.objects.all().order_by('-last_login_time')[:5]
    return render(request, "search_list.html",
                  {"comment_list_user": new_comment_list,
                   "comment_count_user": comment_count_user,
                   "latest_login_user": latest_login_user,
                   "user_count": user_count,
                   "article_count": article_count,
                   "category_list": category_list,
                   "popular_article": popular_article,
                   "views_article": views_article,
                   "article_list": article_list})
def learn_model(data, target):
    bestwords = best_of_words(data, target)
    # split validation: 90% training, 10% test
    data_train, data_test, target_train, target_test = cross_validation.train_test_split(
        data, target, test_size=0.1, random_state=43)
    #classifier = BernoulliNB().fit(data_train,target_train)
    train_feature = []
    test_feature = []
    for i in range(len(data_train)):
        d = data_train[i]
        d = jieba.cut(d, cut_all=False)
        l = target_train[i]
        #tmp=[bigram(d),l]
        tmp = [dict([(word, True) for word in d if word in bestwords]), l]
        train_feature.append(tmp)
    for i in range(len(data_test)):
        d = data_test[i]
        d = jieba.cut(d, cut_all=False)
        l = target_test[i]
        #tmp=bigram(d)
        tmp = dict([(word, True) for word in d if word in bestwords])
        test_feature.append(tmp)
    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)
    predicted = classifier.classify_many(test_feature)
    evaluate_model(target_test, predicted)
    return classifier, bestwords
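# Shape of one training instance built above: a boolean bag-of-words over
# bestwords plus the label (tokens here are illustrative, not real data):
#   [{u'不错': True, u'推荐': True}, 'pos']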
def get_description_feature(com_description, pro_hisprojects, pro_descriptions, pro_otherinfo):
    try:
        com_seg = jieba.cut(com_description, cut_all=False)
        comlst = [seg.lower() for seg in com_seg if keyword_dct.has_key(seg.lower())]
        description_feature = [0 for i in range(len(keyword_dct))]
        his_seg = jieba.cut(pro_hisprojects, cut_all=False)
        hislst = [seg.lower() for seg in his_seg if keyword_dct.has_key(seg.lower())]
        deslst = []
        for description in pro_descriptions:
            des_seg = jieba.cut(description[0], cut_all=False)
            deslst += [seg.lower() for seg in des_seg if keyword_dct.has_key(seg.lower())]
        oth_seg = jieba.cut(pro_otherinfo, cut_all=False)
        othlst = [seg.lower() for seg in oth_seg if keyword_dct.has_key(seg.lower())]
        feature_lst = []
        # count how often each company keyword shows up in the project texts
        for seg in hislst:
            if seg in comlst:
                # pdb.set_trace()
                description_feature[keyword_dct[seg]] += 1
        for seg in deslst:
            if seg in comlst:
                # pdb.set_trace()
                description_feature[keyword_dct[seg]] += 1
        for seg in othlst:
            if seg in comlst:
                # pdb.set_trace()
                description_feature[keyword_dct[seg]] += 1
    except:
        pdb.set_trace()
        traceback.print_exc()
    return description_feature
def get_feature_word():
    # count word frequencies in the first corpus file
    with open('d:/naren/ter1.csv', 'r') as file:
        lines = file.readlines()
        for line in lines:
            line = line.decode('gbk').split('|')[0]
            seglst = jieba.cut(line, cut_all=False)
            for seg in seglst:
                if wordct.has_key(seg):
                    wordct[seg] += 1
                else:
                    wordct[seg] = 1
    # words = sorted(wordct.iteritems(), key=operator.itemgetter(1), reverse=True)
    # for word in words:
    #     print word[0], ':', word[1]
    print '=================divider==================='
    # add word frequencies from the spam file into the same counter
    with open('d:/naren/spam1.csv', 'r') as file:
        lines = file.readlines()
        for line in lines:
            line = line.decode('gbk').split('|')[0]
            seglst = jieba.cut(line, cut_all=False)
            for seg in seglst:
                if wordct.has_key(seg):
                    wordct[seg] += 1
                else:
                    wordct[seg] = 1
    wordsp = sorted(wordct.iteritems(), key=operator.itemgetter(1), reverse=True)
    for word in wordsp:
        print word[0], ':', word[1]
    # print len(words)
    print len(wordsp)
def cutWords(self, dataset, stop_words_path="None"):
    # segment each document; optionally drop stop words
    result = []
    if stop_words_path == "None":
        for i in xrange(len(dataset)):
            temp = " ".join(jieba.cut(dataset[i]))
            result.append(temp)
        return result
    else:
        stop_words = self.read_txt2(stop_words_path)
        # print stop_words
        for i in xrange(len(dataset)):
            tup = []
            temp = " ".join(jieba.cut(dataset[i]))
            temp = temp.split()
            # print temp
            if len(temp) != 0:
                for j in range(len(temp)):
                    # print j
                    # print stop_words
                    if temp[j] not in stop_words:
                        tup.append(temp[j])
                tup = " ".join(tup)
                result.append(tup)
        return result
def get_description_feature(com_description, pro_hisprojects, pro_descriptions, pro_otherinfo):
    description_feature = []
    try:
        com_seg = jieba.cut(com_description, cut_all=False)
        comlst = [seg.lower() for seg in com_seg if keyword_dct.has_key(seg.lower())]
        his_seg = jieba.cut(pro_hisprojects, cut_all=False)
        hislst = [seg.lower() for seg in his_seg if keyword_dct.has_key(seg.lower())]
        deslst = []
        for description in pro_descriptions:
            des_seg = jieba.cut(description[0], cut_all=False)
            deslst += [seg.lower() for seg in des_seg if keyword_dct.has_key(seg.lower())]
        oth_seg = jieba.cut(pro_otherinfo, cut_all=False)
        othlst = [seg.lower() for seg in oth_seg if keyword_dct.has_key(seg.lower())]
        feature_lst = []
        pro_dec_lst = hislst + deslst + othlst
        com_count = Counter(comlst)
        pro_count = Counter(pro_dec_lst)
        description_feature.append(len(com_count))
        inter_con = set(com_count.keys()).intersection(set(pro_count.keys()))
        description_feature.append(len(inter_con))
        description_feature.append(len(comlst))
        # pdb.set_trace()
        # total counts of the shared keywords on each side (the trailing [0]
        # keeps reduce from failing on an empty intersection)
        description_feature.append(reduce(lambda x, y: x+y, [com_count[key] for key in inter_con] + [0]))
        description_feature.append(reduce(lambda x, y: x+y, [pro_count[key] for key in inter_con] + [0]))
    except:
        pdb.set_trace()
        traceback.print_exc()
    return description_feature
def run_seg(pid, core_cnt, output_fname):
    ''''''
    mg_conn = MongoSource()
    conn_lst = [mg_conn.get_connection('finance', 'golden_pages'),
                mg_conn.get_connection('finance', 'usa_pages')]
    cnt = 0
    with open(output_fname, 'w') as fp:
        for conn in conn_lst:
            for i, page in enumerate(conn.find()):
                # shard the pages across worker processes by index
                if i % core_cnt != pid:
                    continue
                if cnt % 20000 == pid:
                    print 'Process %s dealed %s pages' % (pid, cnt)
                cnt += 1
                page_id = page['_id']
                # article_date may be stored as an int or as a string
                if isinstance(page['article_date'], int):
                    article_date = str(page['article_date'])
                else:
                    article_date = page['article_date'].encode('utf8')
                article_date = article_date[:-4]
                title = strip_words(page['title'])
                content = strip_words(page['content'])
                cut_words = [word for word in jieba.cut(title)]
                cut_words.extend([word for word in jieba.cut(content)])
                output_str = '%s\t%s\t%s\n' \
                    % (page_id.encode('utf8'), article_date, ' '.join(cut_words).encode('utf8'))
                fp.write(output_str)
def load_data():
    corpus_train = []  # documents
    target_train = []  # class labels
    filepath = 'E:python_pananteng/程序6:文本挖掘/文本分类/实例2/train'
    filelist = os.listdir(filepath)
    for num in range(len(filelist)):
        filetext = filepath + "/" + filelist[num]
        filename = os.path.basename(filetext)
        myfile = codecs.open(filetext, 'r', 'utf-8')
        temp = myfile.readlines()  # file contents
        myfile.close()
        # slice each file into 100 chunks; each chunk becomes one training document
        for i in range(0, 100):
            len_0 = len(temp)
            seg_list = jieba.cut(','.join(temp[int(i * len_0 / 100):int((i + 1) * len_0 / 100)]), cut_all=False)
            words = " ".join(seg_list)
            target_train.append(filename)
            corpus_train.append(words)
    # --------------#
    corpus_test = []
    target_test = []
    filepath = 'E:python_pananteng/程序6:文本挖掘/文本分类/实例2/test'
    filelist = os.listdir(filepath)
    for num in range(len(filelist)):
        filetext = filepath + "/" + filelist[num]
        myfile = open(filetext, 'r')
        temp = myfile.readlines()
        myfile.close()
        seg_list = jieba.cut(','.join(temp[1:]), cut_all=False)
        words = " ".join(seg_list)
        target_test.append(temp[0])
        corpus_test.append(words)
    return [[corpus_train, target_train], [corpus_test, target_test]]
def getQuerys(self):
    tree = ET.parse(self.url)
    root = tree.getroot()
    # register named entities from the news topics so they segment as one word
    jieba.add_word("黄世铭")
    jieba.add_word("特侦组")
    jieba.add_word("黄色小鸭")
    jieba.add_word("大统")
    jieba.add_word("太阳花")
    jieba.add_word("服贸协定")
    jieba.add_word("服贸")
    jieba.add_word("波卡")
    jieba.add_word("台商")
    jieba.add_word("北捷")
    jieba.add_word("郑捷")
    jieba.add_word("瓦斯")
    jieba.add_word("气爆")
    querys = []
    for query in root:
        # print "ok"
        queryDoc = []
        querys.append(queryDoc)
        for doc in query:
            if type(doc) is not ListType:
                for term in jieba.cut(simplify(doc.attrib["title"].rstrip("\n")), cut_all=True):
                    queryDoc.append(term)
                if doc[0].text != None:
                    for term in jieba.cut(simplify(doc[0].text.rstrip("\n")), cut_all=True):
                        queryDoc.append(term)
    return querys
def main():
    lib = load_workbook('D:\project\project-carNew\csv\dealer_name_mapping.xlsx')
    # create the output excel file
    out_file = 'D:\project\project-carNew\csv\dealer_name_mapping_result.xlsx'
    wt_wb = Workbook(write_only=True)
    wt_ws = wt_wb.create_sheet()
    wordsList = []
    std_wordsList = []
    all_wordsList = []
    # segment the raw dealer names (Sheet1) and the standard names (Sheet2)
    for row in lib['Sheet1'].iter_rows():
        fenci_list = list(jieba.cut(row[0].value, cut_all=False))
        wordsList.append(fenci_list)
        all_wordsList.append(fenci_list)
    for row in lib['Sheet2'].iter_rows():
        fenci_list = list(jieba.cut(row[0].value, cut_all=False))
        std_wordsList.append(fenci_list)
        all_wordsList.append(fenci_list)
    print printChinese(all_wordsList)
    myVocabLst = createVocabLst(all_wordsList)
    # write the header once, then one result row per raw name
    wt_ws.append(['left_word', 'ok_word', 'cos_nm'])
    row_nm = 0
    # for every raw name, find the standard name with the highest cosine similarity
    for words in wordsList:
        words_vec = setOfWords2Vec(myVocabLst, words)
        # print words_vec
        n = 0
        right_word = []
        for std_words in std_wordsList:
            std_word_vec = setOfWords2Vec(myVocabLst, std_words)
            cos_nm = cosVector(words_vec, std_word_vec)
            # print cos_nm
            if cos_nm > n:
                print cos_nm
                n = cos_nm
                right_word = std_words
        left_word = ''.join(words)
        ok_word = ''.join(right_word)
        writeRowList = [left_word, ok_word, n]
        print printChinese(writeRowList)
        wt_ws.append(writeRowList)
        row_nm += 1
        # if row_nm == 10: break
    wt_wb.save(out_file)
def __iter__(self):
    if self.filename:
        with codecs.open(self.filename, "r", encoding="utf8") as fp:
            for index, line in enumerate(fp):
                yield TaggedDocument(list(jieba.cut(line.strip().replace(" ", ""))), tags=[index])
    if self.sentences:
        for index, line in enumerate(self.sentences):
            yield TaggedDocument(list(jieba.cut(line.strip().replace(" ", ""))), tags=[index])
def jieba_cut(words=words):
    import jieba
    print 'before adding dict:'
    print u' /'.join(jieba.cut(words))
    print 'after adding dict:'
    jieba.load_userdict("/etc/jieba/jieba.dic")
    print u'/'.join(jieba.cut(words))
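# The file consumed by jieba.load_userdict above is plain text with one entry
# per line in the form "word [freq] [pos_tag]", where frequency and POS tag are
# optional. Illustrative contents only -- not the actual /etc/jieba/jieba.dic:
#   云计算 5 n
#   创新办 3 i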
def get_similarity(self, string1, string2):
    segments_list1 = jieba.cut(string1, cut_all=False)
    segments_list2 = jieba.cut(string2, cut_all=False)
    # count token pairs from the cartesian product that are equal
    total = 0
    for combination in itertools.product(*[segments_list1, segments_list2]):
        if combination[0] == combination[1]:
            total += 1
    return total
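# The pair count above equals, for each shared token, the product of its counts
# in the two strings. An equivalent Counter-based sketch (a reformulation for
# clarity, not the original code):
from collections import Counter

def get_similarity_counter(self, string1, string2):
    c1 = Counter(jieba.cut(string1, cut_all=False))
    c2 = Counter(jieba.cut(string2, cut_all=False))
    return sum(c1[w] * c2[w] for w in set(c1) & set(c2))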
def evaluate_features(feature_select, classify_method):
    posFeatures = []
    negFeatures = []
    # reviews are delimited by <review ...> ... </review>; accumulate the words
    # of one review, then turn them into a feature dict when the review closes
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        posWords = []
        for i in posSentences:
            if "<review" in i:
                continue
            if "</review" in i:
                posWords = [feature_select(posWords), 'pos']
                posFeatures.append(posWords)
                posWords = []
                continue
            i = i.decode('utf8')
            line = re.sub(r'[{}]+'.format(PUNCTUATION).decode("utf8"), "".decode("utf8"), i)
            posWords += jieba.cut(line, cut_all=False)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        negWords = []
        for i in negSentences:
            if "<review" in i:
                continue
            if "</review" in i:
                negWords = [feature_select(negWords), 'neg']
                negFeatures.append(negWords)
                negWords = []
                continue
            i = i.decode('utf8')
            line = re.sub(r'[{}]+'.format(PUNCTUATION).decode("utf8"), "".decode("utf8"), i)
            negWords += jieba.cut(line, cut_all=False)
    # get trainFeatures and testFeatures
    trainFeatures = posFeatures + negFeatures
    testFeatures = getTestFeatures(feature_select)
    classifier = nltk.classify.SklearnClassifier(classify_method)
    classifier.train(trainFeatures)
    # initiates referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    # puts correctly labeled sentences in referenceSets and the predictively labeled version in testSets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
    # prints metrics to show how well the feature selection did
    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos precision:', scores.precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', scores.recall(referenceSets['pos'], testSets['pos'])
    print 'neg precision:', scores.precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', scores.recall(referenceSets['neg'], testSets['neg'])
    print 'F1 Pos:', 2*scores.precision(referenceSets['pos'], testSets['pos'])*scores.recall(referenceSets['pos'], testSets['pos']) / \
        (scores.precision(referenceSets['pos'], testSets['pos']) + scores.recall(referenceSets['pos'], testSets['pos']))
    print 'F1 neg:', 2*scores.precision(referenceSets['neg'], testSets['neg'])*scores.recall(referenceSets['neg'], testSets['neg']) / \
        (scores.precision(referenceSets['neg'], testSets['neg']) + scores.recall(referenceSets['neg'], testSets['neg']))
def ProcessOneMusic(musicID):
    '''
    metas
    0 song title
    1 artist
    2 album name
    3 song description
    '''
    metas = []
    try:
        if not os.path.exists('basic_info/' + musicID + '.txt'):  # if(1):
            htmlFile = 'html/' + musicID + '.html'
            f = open(htmlFile, 'r')
            html = f.read()
            f.close()
            bs = BeautifulSoup(html, 'html.parser')
            description = bs.find('meta', {'name': 'description'})['content']
            title = bs.title.string
            # the title looks like "song - artist - ..."; bail out if it does not
            if len(title.split(' - ')) < 2:
                return
            metas.append(title.split(' - ')[0])
            metas.append(title.split(' - ')[1])
            # extract the album name (skip past the 5-character marker)
            begin = description.find('所属专辑:')
            end = description.find('。', begin)
            metas.append(description[begin+5:end])
            if '网易云音乐' not in description:
                metas.append(" ".join(jieba.cut(description.replace('\n', ''))))
            else:
                metas.append("no description")
            f = open('basic_info/' + musicID + '.txt', 'w')
            for m in metas:
                if m == None:
                    m = " "
                f.write(m.encode('utf8') + '\n')
            f.close()
        if not os.path.exists('lrc_jieba/' + musicID + '.txt'):
            lrc = api.saveLyrics(musicID)
            if lrc != None:
                f = open('lrc_jieba/' + musicID + '.txt', 'w')
                lrc = re.sub(r"\[.*?\]", "", lrc)  # strip the [mm:ss] timestamps
                lrc = " ".join(jieba.cut(lrc))     # segment the lyrics
                if lrc == None:
                    lrc = ' '
                f.write(lrc.encode('utf8') + '\n')
                f.close()
        global count
        print count, musicID, 'OK'
        count += 1
    except Exception, e:
        print e
        print count, musicID, 'Not OK'
        return
def clean_cut(s):
    # keep only letters, digits, and CJK characters, then segment
    fil = re.compile(u'[^a-zA-Z\d\u4e00-\u9fa5]+', re.UNICODE)
    s = fil.sub('', s)
    s = ' '.join(jb.cut(s))
    return s
def segment(sline):
    words = jieba.cut(sline, cut_all=False)
    return ' '.join(words)
    data_list.iloc[i] = sentence

train_data = load_data("train_data.csv")
test_data = load_data("test_data.csv")
with open("stop_words.csv", 'r') as fp:
    raw_data = fp.readlines()
stop_word = set([word.strip() for word in raw_data])

# keyword
keyword_list_train = train_data['keyword'].apply(split_keyword)
keyword_list_test = test_data['keyword'].apply(split_keyword)

# segment the titles
seg_list_train = train_data['title'].apply(lambda x: list(jieba.cut(x)))
seg_list_test = test_data['title'].apply(lambda x: list(jieba.cut(x)))

# remove stopword
remove_stopword(seg_list_train)
remove_stopword(seg_list_test)

# TF-IDF: fit on the training titles only, then transform the test titles
sentence_vectorizer = TfidfVectorizer()
title_list = [' '.join(words) for words in seg_list_train]
X_old = sentence_vectorizer.fit_transform(title_list)
title_list_test = [' '.join(words) for words in seg_list_test]
X_test_old = sentence_vectorizer.transform(title_list_test)
keyword_vectorizer = TfidfVectorizer()
        rstring += unichr(inside_code)
    return rstring

def p(s):
    # full-width to half-width, then strip ASCII parentheses
    s1 = strQ2B(s.decode())
    p = re.compile('[()]', re.S)
    s1 = p.sub('', s1)
    return s1

if __name__ == "__main__":
    jieba.add_word("TAG_NUMBER")
    line = "第TAG_NUMBER届奥林匹克运动会官方网站"
    line1 = "月工资TAG_NUMBER元"
    text = jieba.cut(line)
    text1 = jieba.cut(line1)
    print(" ".join(word for word in text1))
    line = "1234|||很高兴认识你234"
    mat_number = re.match(r"\d+\|\|\|", line)
    if mat_number:
        print(mat_number.group(0))
    # line3 = "2017年6月9日20时15分,“中星九号”广播电视直播卫星在西昌卫星发射中心成功发射。"
    # line2 = "6月24日20时15分下午,浙江绍兴市档案馆正在接受捐赠收藏5.12米“心系汶川”丝绸长卷珍品。"
    # line2_ = p(line2).encode("utf-8")
    # line3_ = p(line3).encode("utf-8")
    # # print(p(line3))
    # print (line3_)
    # line4='2017年6月9日20时15分,“中星九号”广播电视直播卫星在西昌卫星发射中心成功发射。'
def predict_one(s):
    # predict the class of a single sentence
    word_index = load_word_index(wiPath)
    model = load_LSTM_model(modelPath, weightPath)
    s = np.array(doc2num(list(jieba.cut(s)), MAX_SEQUENCE_LENGTH, word_index))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s)
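# Hypothetical usage (the sentence, paths, and the meaning of the returned
# class index are assumptions, not from the original code):
#   label = predict_one(u'这部电影真好看')  # e.g. array([[1]]) for the positive class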
# data = data[data["innovation"] > data['innovation'].quantile(0.1)]
# len2 = len(data)
# print("Rows before filtering:", len1)
# print("Rows after filtering:", len2)
dates = data['date_num'].tolist()  # needed below when merging by time window
jieba.load_userdict('./data/jieba.txt')
start = time.time()
stop_words = [
    word.strip() for word in open(
        './data/stop_words.txt', 'r', encoding='utf-8').readlines()
]
# print(stop_words)
print("Segmenting...")
# segment, dropping stop words and single-character tokens
data['splitword'] = data["content"].apply(lambda x: [
    word for word in jieba.cut(str(x))
    if (word not in stop_words) and len(word) > 1
])
splitword = data['splitword'].tolist()
'''
to_sort = list(zip(dates, splitword))
to_sort = sorted(to_sort, key=lambda x: x[0], reverse=True)
dates[:], splitword[:] = zip(*to_sort)
'''
# merge the keywords within each time window
time_word_dict = {}
count = {}
for i in range(len(dates)):
    if dates[i] in time_word_dict:
        time_word_dict[dates[i]] = time_word_dict[dates[i]] + splitword[i]