def load_data():
    train_list = []
    for line in open('../data/train_clean.json', 'r'):
        train_list.append(json.loads(line))
    train = pd.DataFrame(train_list)
    
    #train_work = train[names[-1]]
    test_list = []
    for line in open('../data/test_clean.json', 'r'):
        test_list.append(json.loads(line))
    test = pd.DataFrame(test_list)
    
    print('--- NLP on major, simply cut the first word')
    le = LabelEncoder()
    print(len(set(train['major'])))
    train['major'] = train['major'].apply(lambda x :  " ".join(jieba.cut(x, cut_all = False)).split()[0] if x is not None and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')
    test['major']  = test['major'].apply(lambda x :  " ".join(jieba.cut(x,  cut_all = False)).split()[0] if x is not None  and len(" ".join(jieba.cut(x)).split()) > 0 else 'none')

    print(len(set(train['major'])))
    le.fit(list(train['major']) + list(test['major']))
    train['major'] = le.transform(train['major'])
    test['major'] = le.transform(test['major'])
 
    le = LabelEncoder()
    train['gender'] = le.fit_transform(train['gender'])
    names =  train.columns
    
    le = LabelEncoder()
    test['gender'] = le.fit_transform(test['gender'])
    del train['_id']
    del test['_id']
    train = train.fillna(0)
    test = test.fillna(0)
    #test['age'] = test['age'].apply(lambda x : int(x.replace(u'岁','').encode('ascii')))
    return train, test
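Usage note: a minimal driver sketch, assuming this function lives in a module that already imports json, pandas as pd, jieba, and sklearn.preprocessing.LabelEncoder, and that the line-delimited JSON files under ../data/ exist.
if __name__ == '__main__':
    # Hypothetical invocation of load_data() above; paths and columns follow the snippet.
    train, test = load_data()
    print(train.shape, test.shape)
    print(train[['major', 'gender']].head())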
Example #2
	def process(self):
		client = MongoClient('localhost',44444)
		db_temp_train = client['vsm_all_second']
		collection1_temp_train = db_temp_train['collection1']
		collection2_temp_train = db_temp_train['collection2']		
		collection3_temp_train = db_temp_train['collection3']
		collection4_temp_train = db_temp_train['collection4']
		collection5_temp_train = db_temp_train['collection5']

		lineNum = 1
		pat = r"sa(\d)(.*)"
		with open(os.path.join(self.fileroot,self.filename),"r") as fr:
			for line in fr:
				# handle review lines whose format is malformed
				if not re.findall(pat,line):
					print("\n " + str(lineNum) + " something wrong !")
					continue

				
				result = re.findall(pat,line)
				starNum = result[0][0]
				if starNum ==   '1':
					collection1_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '2':
					collection2_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '3':
					collection3_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '4':
					collection4_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
				elif starNum == '5':
					collection5_temp_train.insert(dict(content = list(jieba.cut(result[0][1].strip()))))
	
				print('process {0} lines'.format(lineNum),end='\r\t')
				lineNum += 1
		client.close()
Example #3
def neural_init(raw):
    """
    Initialize the fields from clean_person into values a neural network can accept.
    String fields are only segmented, nothing more.
    :param raw:
    :return:
    """
    # strip whitespace from the text fields (indices 0, 1, 3, 4)
    raw_without_space = [[re.sub(r'\s*', '', a[0]), re.sub(r'\s*', '', a[1]), a[2],
                          re.sub(r'\s*', '', str(a[3])), re.sub(r'\s*', '', str(a[4])), a[5]]
                         for a in raw]

    jieba.load_userdict("data/jieba_dict.txt")
    raw_cut = [[jieba.cut(x[0], cut_all=False), jieba.cut(x[1], cut_all=False), x[2], jieba.cut(x[3], cut_all=False),
                jieba.cut(x[4], cut_all=False), x[5]] for x in raw_without_space]

    raw_without_sw = [[list(filter(not_in, a[0])), list(filter(not_in, a[1])), a[2],
                       list(filter(not_in, a[3])), list(filter(not_in, a[4])), a[5]]
                      for a in raw_cut]

    # merge all the token lists of one record into a single list
    raw_text_all = [a[0] + a[1] + a[3] + a[4] for a in raw_without_sw]

    # build the dictionary and corpus
    dic_corpus = algorithm_collection.digitalize(raw_text_all)

    # build the dictionary (term) matrix
    arr = matrix_former(dic_corpus[0], dic_corpus[1], dic_corpus)

    # append the remaining non-text fields to each feature row
    raw_digitalized = [np.hstack((a, [b[2], b[5]])) for a, b in zip(arr, raw_without_sw)]

    return raw_digitalized
Example #4
    def get_skill(self,jobname="python",num=5):
        """
        Extract skill phrases related to the keywords in the job demand text.
        """
        key_words = {}
        
        jd_skill = self.clear_jd(self.jd_database[jobname]['demand'])
        
        for line in jd_skill:
            for word in jieba.cut(line):
                word = strQ2B(word).lower()
                if word in self.skill_words:
                    key_words[word] = key_words.get(word,1)+1


        key_words = sorted(key_words.items(), key=lambda x: x[1], reverse=True)
        
        res = [ w[0] for w in key_words[:int(num*np.log(num))]]
        
        print('key_words:')
        print('\n'.join(res))

        for word in jieba.cut(jobname):
            word = strQ2B(word).lower()
            if word in self.skill_words and word not in res:
                res.insert(0,word)

        after_top3 = res[3:]
        np.random.shuffle(after_top3)

        return res[:3]+after_top3[:num-3]
 def calc_string_sim(self, a, b):
     try:
         a_seg_list = [x for x in jieba.cut(a, cut_all=False)]
         b_seg_list = [x for x in jieba.cut(b, cut_all=False)]
     except Exception as e:
         print(e)
         assert False
def process(filename = ''):
    con = readFile(filename)
    if con == '':
        return
    global mydict
    global idfdict
    global alpha
    mydict1 = {}
    seg_list = list(jieba.cut(con, cut_all = False))
    print '/ '.join(jieba.cut(con, cut_all = False))
    print seg_list
    f = open('out.txt', 'w')
    #print len(seg_list)
    for o in seg_list:
        s = o.encode('utf-8')
        o = o.encode('gbk')
        if s not in PUNCTUATION:
            mydict.add(o)  # add the token to the global dictionary
            mydict1.setdefault(o, 0)  # per-document term-frequency table
            mydict1[o] += 1
        f.write(o+'\n')
    f.close()
    mmax = 0
    for o in mydict1.keys():
        idfdict.setdefault(o, 0)
        inverindex.setdefault(o, set())
        idfdict[o] += 1  # document frequency, used later for IDF
        inverindex[o].add(fileindex[filename])
        if mydict1[o] > mmax:
            mmax = mydict1[o]
    for o in mydict1.keys():  # normalize term frequencies to improve accuracy
        mydict1[o] = alpha + (1-alpha) * mydict1[o] / mmax
    return mydict1
def TextProcessing(train_data_folder,test_data_folder):
    train_data_list=[]
    train_class_list=[]
    train_data_folder_list=os.listdir(train_data_folder)
    for folder in train_data_folder_list:
        new_path_folder=os.path.join(train_data_folder,folder)
        files=os.listdir(new_path_folder)
        for file in files:
            new_path_file=os.path.join(new_path_folder,file)
            with open(new_path_file,'r') as fp:
                raw=fp.read()
                words=list(jieba.cut(raw,cut_all=False))
                train_data_list.append(words)
                train_class_list.append(folder)


    test_data_list=[]
    test_data_files=os.listdir(test_data_folder)
    for file in test_data_files:
        new_path_file=os.path.join(test_data_folder,file)
        with open(new_path_file,'r') as fp:
            raw=fp.read()
            words=list(jieba.cut(raw,cut_all=False))
            test_data_list.append(words)

    all_words_list={}
    for word_list in train_data_list:
        for word in word_list:
            all_words_list[word] = all_words_list.get(word, 0) + 1
    all_words_tuple_list=sorted(all_words_list.items(),key=lambda f:f[1],reverse=True)
    all_words_list = [word for word, _ in all_words_tuple_list]
    return all_words_list,train_data_list,train_class_list,test_data_list
 def CalculatorIDF(self):
     # compute IDF statistics (the slowest step, roughly 3300 s)
     for keyword in self.nkeywords:
         count = 0
         KeyWord=keyword[0].encode('latin-1').decode('gbk')
         print(KeyWord)
         for row in self.NTrainSet:
             Words = jieba.cut(row[0].encode('latin-1').decode('gbk'))
             if KeyWord in Words:
                 count=count+1
                 continue
         count = log(295/count+0.5)
         print(count)
         sql = "insert into nkeyword(word,DF) values('%s',%s)" %(keyword[0].encode('latin-1').decode('gbk'),count)
         mydatabase.ExeUpdateQuery(sql)
          
     for keyword in self.pkeywords:
         count = 0
         KeyWord=keyword[0].encode('latin-1').decode('gbk')
         print(KeyWord)
         for row in self.PTrainSet:
             Words = jieba.cut(row[0].encode('latin-1').decode('gbk'))
             if KeyWord in Words:
                 count=count+1
                 continue
         count = log(1012/count+0.5)
         print(count)
         sql = "insert into pkeyword(word,DF) values('%s',%s)" %(keyword[0].encode('latin-1').decode('gbk'),count)
         mydatabase.ExeUpdateQuery(sql)
def keyword_frequency(keyword, directory):
    freq_table = {}

    for source in glob.glob(os.path.join(directory, '*')):
        words = ''
        vect = CountVectorizer(ngram_range=(1, 3))
        analyzer = vect.build_analyzer()

        for f in glob.glob(os.path.join(source, '*.json')):
            j = json.load(open(f))
            if j['Language'] == 'chinese':
                words += ' '.join(jieba.cut(j['Title']))
                words += ' '.join(jieba.cut(j['Content']))
            elif j['Language'] == 'english':
                words += j['Title']
                words += j['Content']
        ngram_query = analyzer(words)
        fdist = nltk.FreqDist(ngram_query)
        freq = fdist.freq(keyword.lower())
        freq_table[os.path.basename(source)] = freq

    pprint.pprint(freq_table)

    sorted_list = sorted(freq_table, key=freq_table.get, reverse=True)
    print('=================')
    print("%s loves %s most." % (sorted_list[0], keyword))

    plt.bar(range(len(freq_table)), freq_table.values(), align="center")
    plt.xticks(range(len(freq_table)), list(freq_table.keys()))
    plt.show()
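Usage note: a hypothetical call, assuming a directory layout like corpus/<source_name>/*.json where every JSON file carries Language, Title, and Content fields; the keyword and path are illustrative placeholders.
if __name__ == '__main__':
    # Hypothetical example; 'corpus' and the keyword are made up for illustration.
    keyword_frequency(u'音乐', 'corpus')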
Example #10
def get_position_prob(key, wordprobdct, works):
    seg_lst = jieba.cut(works[0][0])
    pos1prob = get_letter_position(key, "pos1", seg_lst, wordprobdct)
    seglst = jieba.cut(works[1][0])
    pos2prob = get_letter_position(key, "pos2", seglst, wordprobdct)
    #     if workprobdct[key]['pos1'].has_key(works[0][1]):
    #         pos1prob = 1000*float(workprobdct[key]['pos1'][works[0][1]])/workprobdct[key]['pos1']['total']
    #     else:
    #         pos1prob = 0.0001/workprobdct[key]['pos1']['total']
    #     if workprobdct[key]['pos2'].has_key(works[1][1]):
    #         pos2prob = 500*float(workprobdct[key]['pos2'][works[1][1]])/workprobdct[key]['pos2']['total']
    #     else:
    #         pos2prob = 0.0001/workprobdct[key]['pos2']['total']
    #     if workprobdct[key]['industry1'].has_key(works[0][0]):
    #         industry1prob = float(workprobdct[key]['industry1'][works[0][0]])/workprobdct[key]['industry1']['total']
    #     else:
    #         industry1prob = 0.0001/workprobdct[key]['industry1']['total']
    #     if workprobdct[key]['industry2'].has_key(works[1][0]):
    #         industry2prob = float(workprobdct[key]['industry2'][works[1][0]])/workprobdct[key]['industry2']['total']
    #     else:
    #         industry2prob = 0.0001/workprobdct[key]['industry2']['total']

    total = float(wordprobdct[key]["total"]) / wordprobdct["total"]
    total = total * (50 * pos1prob + pos2prob)

    return total
Example #11
    def init_from_mongo(self):
        client = MongoClient('mongodb://localhost:27017/') 
        db = client.ptt
        posts = db.gossiping_38k 
        jieba.set_dictionary('extra_dict/dict.txt.big')
        jieba.analyse.set_stop_words("extra_dict/stop_words_cht.txt")   
        for post in posts.find():
            #For content
            d = defaultdict(int)
            content = post['content']
            if post['score'] != 0:
                for l in content.split('\n'):
                    if l:
                        for w in jieba.cut(l):
                            d[w] += 1
            if len(d) > 0:
                self.words.append(d)
                self.scores.append(1 if post['score'] > 0 else 0)
            #For comments
            for comment in post['comments']:
                l = comment['content'].strip()
                if l and comment['score'] != 0:
                    d = defaultdict(int)
                    for w in jieba.cut(l):
                        d[w] += 1
                    if len(d) > 0:
                        self.c_words.append(d)
                        self.c_scores.append(1 if comment['score'] > 0 else 0)

        client.close()   
def get_description_feature_old(com_description, pro_hisprojects, pro_descriptions, pro_otherinfo):
    try:
        com_seg = jieba.cut(com_description, cut_all=False)
        comlst = [seg.lower() for seg in com_seg if keyword_dct.has_key(seg.lower())]
        description_feature = [0 for i in range(len(keyword_dct))]
        his_seg = jieba.cut(pro_hisprojects, cut_all=False)
        hislst = [seg.lower() for seg in his_seg if keyword_dct.has_key(seg.lower())]
        deslst = []
        for description in pro_descriptions:
            des_seg = jieba.cut(description[0], cut_all=False)
            deslst += [seg.lower() for seg in des_seg if keyword_dct.has_key(seg.lower())]
        oth_seg = jieba.cut(pro_otherinfo, cut_all=False)
        othlst = [seg.lower() for seg in oth_seg if keyword_dct.has_key(seg.lower())]
        feature_lst = []
        pro_dec_lst = hislst + deslst + othlst
        com_count = Counter(comlst)
        pro_count = Counter(pro_dec_lst)
        for key in com_count.keys():
            description_feature[keyword_dct[key]] = np.exp(pro_count.get(key, 0.0) / float(com_count[key]))

    except:
        pdb.set_trace()
        traceback.print_exc()


    return description_feature
Example #13
    def repeatability(self, str_1, str_2):  # returns 1 if the two sentences largely repeat each other, else 0
        list_1=jieba.cut(str_1)
        list_2=jieba.cut(str_2)
        
        a=list(list_1)
        b=list(list_2)

        count_a=count_b=0
        
        for i in a:
            for j in b:
                if i==j :
                    count_a+=1
                    break

        for j in b:
            for i in a:
                if i==j :
                    count_b+=1
                    break

        if  float(count_a)/len(a)>0.78 or float(count_b)/len(b)>0.78:
            return 1
        else:
            return 0
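Usage note: a hedged sketch, assuming checker is an instance of the surrounding class; the sentences are made up, and the exact result depends on how jieba segments them.
# Hypothetical usage: the first pair shares nearly all tokens, so the overlap
# ratio should exceed 0.78 and return 1; the second pair shares none and returns 0.
print(checker.repeatability(u'今天天气真好,适合出去玩', u'今天天气真好,很适合出去玩'))
print(checker.repeatability(u'今天天气真好', u'这本书的内容很有意思'))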
Example #14
def clean_text(text, remove_stopwords = True):
    #jieba.load_userdict("dict.txt")
    text = jieba.cut(text)
    text = " ".join(text)
    
    new_text = []
    for word in text.split():
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)
    
    text = re.sub('(.*?)', '', text)
    text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", "",text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', '', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', '', text)
    text = re.sub(r'<br />', '', text)
    text = re.sub(r'\'', '', text)
    text = re.sub(r'\”', '',text)
    text = re.sub(r'\#', '', text)
    text = re.sub(r'\“', '',text)
    text = re.sub(r'\《', '',text)
    text = re.sub(r'\》', '',text)
    
    
    text = jieba.cut(text)
    text = " ".join(text)
    
    return text
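Usage note: a hedged example, assuming jieba, re, and a contractions dict are defined at module level; the sample sentence is illustrative.
# Hypothetical usage: returns space-separated tokens with URLs and most punctuation removed.
sample = u'今天的天气真好!详情请看 https://example.com ,欢迎大家留言。'
print(clean_text(sample))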
def addDictToJieba():
    ##### roadList
    content=open('../data_crawl/finalRoads.txt','r').read().strip('\n')
    contentList=content.split('\n');print len(contentList)
    #############load district dict
    districtNameList=grab('/home/yr/intellicredit/data/'+'districtNameList0503')
    test_sent = [
    "李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿,上海市浦东区\n"
    ]
    ######## print cut before add dictionary
    #print test_sent[0].decode('utf-8')
    words = jieba.cut(test_sent[0].decode('utf-8'))
    #print('/'.join(words))

    #### add the word dictionary to jieba
    for w in districtNameList[:] + contentList:
        #print w
        jieba.add_word(w)
    #### add district names that are not in the dictionary to jieba
    jieba.add_word('浦东区')
    jieba.add_word('浦东新区')
    jieba.del_word('上海市')
    jieba.add_word('兰城路')



    words = jieba.cut(test_sent[0].decode('utf-8'))
Example #16
def article_search(request):
	new_comment_list = []
	article_list = []
	keywords = request.GET.get('search_key')
	for article in Article.objects.all():
		title_data = ';'.join(jieba.cut(article.title,cut_all=True))
		title_data_list = title_data.split(';')
		content_data = ';'.join(jieba.cut(article.body,cut_all=True))
		content_data_list = content_data.split(';')
		if keywords in content_data_list or keywords in title_data_list:
			article_list.append(article)
	try:
		user_name = request.session["username"]
		user = User.objects.get(name = user_name)
		new_comment_list = Comment.objects.filter(comment_reminder=user,comment_status="N")
	except:
		pass
	comment_count_user = len(new_comment_list)
	category_list = Category.objects.all().order_by('name')
	popular_article = Article.objects.all().order_by('-likes')[:5]
	views_article = Article.objects.all().order_by('views')[:5]
	user_count = User.objects.all().count()
	article_count = Article.objects.all().count()
	latest_login_user = User.objects.all().order_by('-last_login_time')[:5]		
	return render(request,"search_list.html",{"comment_list_user":new_comment_list,"comment_count_user":comment_count_user,\
			"latest_login_user":latest_login_user,"user_count":user_count,"article_count":article_count,\
			"category_list":category_list,"popular_article":popular_article,"views_article":views_article,"article_list":article_list})	
def learn_model(data,target):
    bestwords = best_of_words(data, target)
    # preparing data for split validation: 90% training, 10% test
    data_train,data_test,target_train,target_test = cross_validation.train_test_split(data,target,test_size=0.1,random_state=43)
    #classifier = BernoulliNB().fit(data_train,target_train)
    train_feature=[]
    test_feature=[]
    for i in range(len(data_train)):
        d=data_train[i]
        d=jieba.cut(d, cut_all=False)
        l=target_train[i]
        #tmp=[bigram(d),l]
        tmp = [dict([(word, True) for word in d if word in bestwords]), l]
        train_feature.append(tmp)
        
    for i in range(len(data_test)):
        d=data_test[i]
        d=jieba.cut(d, cut_all=False)
        l=target_test[i]
        #tmp=bigram(d)
        tmp = dict([(word, True) for word in d if word in bestwords])
        test_feature.append(tmp)
    
        
    classifier = SklearnClassifier(MultinomialNB())
    classifier.train(train_feature)
   
    predicted = classifier.classify_many(test_feature)
    
    evaluate_model(target_test,predicted)

    return classifier, bestwords
def get_description_feature(com_description, pro_hisprojects, pro_descriptions, pro_otherinfo):
    try:
        com_seg = jieba.cut(com_description, cut_all=False)
        comlst = [seg.lower() for seg in com_seg if keyword_dct.has_key(seg.lower())]
        description_feature = [0 for i in range(len(keyword_dct))]
        his_seg = jieba.cut(pro_hisprojects, cut_all=False)
        hislst = [seg.lower() for seg in his_seg if keyword_dct.has_key(seg.lower())]
        deslst = []
        for description in pro_descriptions:
            des_seg = jieba.cut(description[0], cut_all=False)
            deslst += [seg.lower() for seg in des_seg if keyword_dct.has_key(seg.lower())]
        oth_seg = jieba.cut(pro_otherinfo, cut_all=False)
        othlst = [seg.lower() for seg in oth_seg if keyword_dct.has_key(seg.lower())]
        feature_lst = []
        for seg in hislst:
            if seg in comlst:
                pdb.set_trace()
                description_feature[keyword_dct[seg]] += 1
        for seg in deslst:
            if seg in comlst:
                pdb.set_trace()
                description_feature[keyword_dct[seg]] += 1
        for seg in othlst:
            if seg in comlst:
                pdb.set_trace()
                description_feature[keyword_dct[seg]] += 1


    except:
        pdb.set_trace()
        traceback.print_exc()


    return description_feature
Example #19
def get_feature_word():
    with open('d:/naren/ter1.csv', 'r') as file:
        lines = file.readlines()
        for line in lines:
            line = line.decode('gbk').split('|')[0]
            seglst = jieba.cut(line, cut_all=False)
            for seg in seglst:
                if wordct.has_key(seg):
                    wordct[seg] += 1
                else:
                    wordct[seg] = 1

    # words = sorted(wordct.iteritems(), key=operator.itemgetter(1), reverse=True)
    # for word in words:
    #     print word[0],':',word[1]

    print '=================分割线==================='
    with open('d:/naren/spam1.csv', 'r') as file:
        lines = file.readlines()
        for line in lines:
            line = line.decode('gbk').split('|')[0]
            seglst = jieba.cut(line, cut_all=False)
            for seg in seglst:
                if wordct.has_key(seg):
                    wordct[seg] += 1
                else:
                    wordct[seg] = 1

    wordsp = sorted(wordct.iteritems(), key=operator.itemgetter(1), reverse=True)
    for word in wordsp:
        print word[0],':',word[1]
    # print len(words)
    print len(wordsp)
Example #20
 def cutWords(self, dataset, stop_words_path="None"):
     # segment the text and remove stop words
     result = []
     if stop_words_path == "None":
         for i in range(len(dataset)):
             temp = " ".join(jieba.cut(dataset[i]))
             result.append(temp)
         return result
     else:
         stop_words = self.read_txt2(stop_words_path)
         # print stop_words
         for i in range(len(dataset)):
             tup = []
             temp = " ".join(jieba.cut(dataset[i]))
             temp = temp.split()
             # print temp
             if len(temp) != 0:
                 for j in range(len(temp)):
                     # print j
                     # print stop_words
                     if temp[j] not in stop_words:
                         tup.append(temp[j])
             tup = " ".join(tup)
             result.append(tup)
         return result
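Usage note: a sketch of how this method might be called, assuming proc is an instance of the surrounding class and that read_txt2 can load the (hypothetical) stop-word file named below.
# Hypothetical usage: without a stop-word path only segmentation is applied;
# with one, the listed stop words are dropped as well.
docs = [u'我今天去了北京的图书馆', u'这个周末的天气很适合散步']
print(proc.cutWords(docs))
print(proc.cutWords(docs, stop_words_path='stop_words.txt'))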
def get_description_feature(com_description, pro_hisprojects, pro_descriptions, pro_otherinfo):

    description_feature = []

    try:
        com_seg = jieba.cut(com_description, cut_all=False)
        comlst = [seg.lower() for seg in com_seg if keyword_dct.has_key(seg.lower())]
        his_seg = jieba.cut(pro_hisprojects, cut_all=False)
        hislst = [seg.lower() for seg in his_seg if keyword_dct.has_key(seg.lower())]
        deslst = []
        for description in pro_descriptions:
            des_seg = jieba.cut(description[0], cut_all=False)
            deslst += [seg.lower() for seg in des_seg if keyword_dct.has_key(seg.lower())]
        oth_seg = jieba.cut(pro_otherinfo, cut_all=False)
        othlst = [seg.lower() for seg in oth_seg if keyword_dct.has_key(seg.lower())]
        feature_lst = []
        pro_dec_lst = hislst + deslst + othlst
        com_count = Counter(comlst)
        pro_count = Counter(pro_dec_lst)
        description_feature.append(len(com_count))
        inter_con = set(com_count.keys()).intersection(set(pro_count.keys()))
        description_feature.append(len(inter_con))
        description_feature.append(len(comlst))
        # pdb.set_trace()
        description_feature.append(reduce(lambda x, y:x+y, [com_count[key] for key in inter_con] + [0]))
        description_feature.append(reduce(lambda x, y:x+y, [pro_count[key] for key in inter_con] + [0]))

    except:
        pdb.set_trace()
        traceback.print_exc()

    return description_feature
Example #22
def run_seg(pid, core_cnt, output_fname):
    ''''''
    mg_conn = MongoSource()
    conn_lst = [mg_conn.get_connection('finance', 'golden_pages'),\
            mg_conn.get_connection('finance', 'usa_pages')]

    cnt = 0
    with open(output_fname, 'w') as fp:
        for conn in conn_lst:
            for i, page in enumerate(conn.find()):
                if i % core_cnt != pid:
                    continue
                if cnt % 20000 == pid:
                    print 'Process %s dealed %s pages' % (pid, cnt)
                cnt += 1

                page_id = page['_id']
                if isinstance(page['article_date'], int):
                    article_date = str(page['article_date'])
                else:
                    article_date = page['article_date'].encode('utf8')
                article_date = article_date[:-4]
                title = strip_words(page['title'])
                content = strip_words(page['content'])

                cut_words = [word for word in jieba.cut(title)]
                cut_words.extend([word for word in jieba.cut(content)])

                output_str = '%s\t%s\t%s\n' \
                        % (page_id.encode('utf8'), article_date, ' '.join(cut_words).encode('utf8'))
                fp.write(output_str)
Example #23
def load_data():
    corpus_train = []  # segmented training documents
    target_train = []  # training class labels
    filepath = 'E:python_pananteng/程序6:文本挖掘/文本分类/实例2/train'
    filelist = os.listdir(filepath)
    for num in range(len(filelist)):
        filetext = filepath + "/" + filelist[num]
        filename = os.path.basename(filetext)
        myfile = codecs.open(filetext, 'r', 'utf-8')
        temp = myfile.readlines()  # file contents
        myfile.close()
        for i in range(0, 100):  # split each file into 100 chunks
            len_0 = len(temp)
            seg_list = jieba.cut(','.join(temp[int(i * len_0 / 100):int((i + 1) * len_0 / 100)]), cut_all=False)
            words = " ".join(seg_list)
            target_train.append(filename)
            corpus_train.append(words)

    # -------------- test set --------------
    corpus_test = []  # segmented test documents
    target_test = []  # test class labels
    filepath = 'E:python_pananteng/程序6:文本挖掘/文本分类/实例2/test'
    filelist = os.listdir(filepath)
    for num in range(len(filelist)):
        filetext = filepath + "/" + filelist[num]
        myfile = open(filetext, 'r')
        temp = myfile.readlines()
        myfile.close()
        seg_list = jieba.cut(','.join(temp[1:]), cut_all=False)
        target_test.append(temp[0])
        corpus_test.append(" ".join(seg_list))
    return [[corpus_train, target_train], [corpus_test, target_test]]
Example #24
    def getQuerys(self):
        tree = ET.parse(self.url)
        root = tree.getroot()

        jieba.add_word("黄世铭")
        jieba.add_word("特侦组")
        jieba.add_word("黄色小鸭")
        jieba.add_word("大统")
        jieba.add_word("太阳花")
        jieba.add_word("服贸协定")
        jieba.add_word("服贸")
        jieba.add_word("波卡")
        jieba.add_word("台商")
        jieba.add_word("北捷")
        jieba.add_word("郑捷")
        jieba.add_word("瓦斯")
        jieba.add_word("气爆")

        querys = []
        for query in root:
            # print "ok"
            queryDoc = []
            querys.append(queryDoc)
            for doc in query:
                if type(doc) is not ListType:
                    for term in jieba.cut(simplify(doc.attrib["title"].rstrip("\n")), cut_all=True):
                        queryDoc.append(term)
                    if doc[0].text != None:
                        for term in jieba.cut(simplify(doc[0].text.rstrip("\n")), cut_all=True):
                            queryDoc.append(term)
        return querys
def main():

	lib = load_workbook('D:\project\project-carNew\csv\dealer_name_mapping.xlsx')
	# create a new Excel file
	out_file = 'D:\project\project-carNew\csv\dealer_name_mapping_result.xlsx'

	wt_wb = Workbook(write_only=True)
	wt_ws = wt_wb.create_sheet()

	wordsList = []
	std_wordsList = []
	all_wordsList = []

	for row in lib['Sheet1'].iter_rows():
		fenci_list = list(jieba.cut(row[0].value, cut_all=False))
		wordsList.append(fenci_list)
		all_wordsList.append(fenci_list)


	for row in lib['Sheet2'].iter_rows():
		fenci_list = list(jieba.cut(row[0].value, cut_all=False))
		std_wordsList.append(fenci_list)
		all_wordsList.append(fenci_list)


	print printChinese(all_wordsList)

	myVocabLst = createVocabLst(all_wordsList)

	row_nm=0
	for words in wordsList:
		words_vec = setOfWords2Vec(myVocabLst, words)
		# print words_vec

		n=0
		right_word= []
		for std_words in std_wordsList:
			std_word_vec = setOfWords2Vec(myVocabLst, std_words)
			cos_nm = cosVector(words_vec,std_word_vec)
			# print cos_nm
			if cos_nm>n:
				print cos_nm
				n = cos_nm
				right_word = std_words

		left_word = ''.join(words)
		ok_word = ''.join(right_word)

		writeRowList = [left_word,ok_word,n]
		print printChinese(writeRowList)
		if row_nm == 0:
			wt_ws.append(['left_word','ok_word','cos_nm'])
		else:
			wt_ws.append(writeRowList)
		row_nm+=1
		# if row_nm == 10:break


	wt_wb.save(out_file)
Example #26
	def __iter__(self):
		if self.filename:
			with codecs.open(self.filename,"r",encoding="utf8") as fp:
				for index,line in enumerate(fp):
					yield TaggedDocument(list(jieba.cut(line.strip().replace(" ",""))),tags=[index])
		if self.sentences:
			for index,line in enumerate(self.sentences):
				yield TaggedDocument(list(jieba.cut(line.strip().replace(" ",""))), tags = [index])
Example #27
def jieba_cut(words=words):
    import jieba

    print 'before adding dict:'
    print u' /'.join(jieba.cut(words))
    print 'after adding dict:'
    jieba.load_userdict("/etc/jieba/jieba.dic")
    print u'/'.join(jieba.cut(words))
 def get_similarity(self, string1, string2):
     segments_list1 = jieba.cut(string1, cut_all=False)
     segments_list2 = jieba.cut(string2, cut_all=False)
     total = 0
     for combination in itertools.product(*[segments_list1, segments_list2]):
         if combination[0] == combination[1]:
             total += 1
     return total
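Usage note: a hedged sketch, assuming matcher is an instance of the surrounding class; the score is a raw count of matching token pairs across the two segmentations, not a normalized similarity.
# Hypothetical usage: shared tokens such as 我 and 喜欢 each contribute to the count.
print(matcher.get_similarity(u'我喜欢看电影', u'我喜欢听音乐'))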
Example #29
def evaluate_features(feature_select, classify_method):
    posFeatures = []
    negFeatures = []
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        posWords = []
        for i in posSentences:
            if "<review" in i:
                continue
            if "</review" in i:
                posWords = [feature_select(posWords), 'pos']
                posFeatures.append(posWords)
                posWords = []
                continue
            i = i.decode('utf8')
            line = re.sub(r'[{}]+'.format(PUNCTUATION).decode("utf8"), "".decode("utf8"),i)
            posWords += jieba.cut(line, cut_all=False)
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        negWords = []
        for i in negSentences:
            if "<review" in i:
                continue
            if "</review" in i:
                negWords = [feature_select(negWords), 'neg']
                negFeatures.append(negWords)
                negWords = []
                continue
            i = i.decode('utf8')
            line = re.sub(r'[{}]+'.format(PUNCTUATION).decode("utf8"), "".decode("utf8"),i)
            negWords += jieba.cut(line, cut_all=False)

    #get trainFeatures and testFeatures
    trainFeatures = posFeatures + negFeatures
    testFeatures = getTestFeatures(feature_select)
    classifier = nltk.classify.SklearnClassifier(classify_method)
    classifier.train(trainFeatures)

    #initialize referenceSets and testSets
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)

    #puts correctly labeled sentences in referenceSets and the predictively labeled version in testsets
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)

    #prints metrics to show how well the feature selection did
    print 'train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures))
    print 'accuracy:', nltk.classify.util.accuracy(classifier, testFeatures)
    print 'pos precision:', scores.precision(referenceSets['pos'], testSets['pos'])
    print 'pos recall:', scores.recall(referenceSets['pos'], testSets['pos'])
    print 'neg precision:', scores.precision(referenceSets['neg'], testSets['neg'])
    print 'neg recall:', scores.recall(referenceSets['neg'], testSets['neg'])
    print 'F1 Pos:',2*scores.precision(referenceSets['pos'], testSets['pos'])* scores.recall(referenceSets['pos'], testSets['pos'])/ \
                    (scores.precision(referenceSets['pos'], testSets['pos'])+scores.recall(referenceSets['pos'], testSets['pos']))
    print 'F1 neg:',2*scores.precision(referenceSets['neg'], testSets['neg'])* scores.recall(referenceSets['neg'], testSets['neg'])/ \
                    (scores.precision(referenceSets['neg'], testSets['neg'])+scores.recall(referenceSets['neg'], testSets['neg']))
def ProcessOneMusic(musicID):
	
 	'''
 	metas
 	0 song title
 	1 artist
 	2 album name
 	3 song description
 	'''

 	metas = []
 	
 	try:
	 	if(not os.path.exists('basic_info/'+musicID+'.txt')):
	 	#if(1):
	 		htmlFile = 'html/'+ musicID+'.html'
			f = open(htmlFile,'r')
			html = f.read()
			f.close()
		 	bs = BeautifulSoup(html,'html.parser')

		 	description = bs.find('meta',{'name':'description'})['content']
		 	title = bs.title.string
		 	if(len(title.split(' - '))==0):
		 		return
	 		metas.append(title.split(' - ')[0])
		 	metas.append(title.split(' - ')[1])
		 	begin = description.find('所属专辑:')#获取专辑名称
		 	end = description.find('。',begin)
		 	metas.append(description[begin+5:end])
		 	if('网易云音乐' not in description):
		 		metas.append(" ".join(jieba.cut(description.replace('\n',''))))
		 	else:
		 		metas.append("no description")
		 	f = open('basic_info/'+musicID+'.txt','w')
		 	for m in metas:
		 		if(m==None): 
		 			m = " ";
		 		f.write(m.encode('utf8')+'\n')
		 	f.close()
		if(not os.path.exists('lrc_jieba/'+musicID+'.txt')):
		 	lrc = api.saveLyrics(musicID)
		 	if(lrc != None):
		 		f = open('lrc_jieba/'+musicID+'.txt','w')
		 		lrc = re.sub(r"\[.*?\]", "", lrc)#删除时间
		 		lrc = " ".join(jieba.cut(lrc))#分词
		 		if(lrc==None):
		 			lrc = ' '
		 		f.write(lrc.encode('utf8')+'\n')
		 		f.close()
		global count
		print count,musicID,'OK'
		count+=1
	except Exception,e:
		print e
		print count,musicID,'Not OK'
		return
Example #31
def clean_cut(s):
    fil = re.compile(u'[^a-zA-Z\d\u4e00-\u9fa5]+', re.UNICODE)
    s = fil.sub('', s)
    s = ' '.join(jb.cut(s))
    return s
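Usage note: a hedged example, assuming the module imports re and jieba as jb; the input string is illustrative.
# Hypothetical usage: punctuation and whitespace are stripped before segmentation,
# so only letters, digits and CJK characters reach jieba.
print(clean_cut(u'Python 3 与 jieba 分词, 真方便!!!'))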
Example #32
def segment(sline):
    words = jieba.cut(sline,cut_all=False)
    return ' '.join(words)
Example #33
        data_list.iloc[i] = sentence


train_data = load_data("train_data.csv")
test_data = load_data("test_data.csv")

with open("stop_words.csv", 'r') as fp:
    raw_data = fp.readlines()
    stop_word = set([word.strip() for word in raw_data])

# keyword
keyword_list_train = train_data['keyword'].apply(split_keyword)
keyword_list_test = test_data['keyword'].apply(split_keyword)

# segment the title
seg_list_train = train_data['title'].apply(lambda x: list(jieba.cut(x)))
seg_list_test = test_data['title'].apply(lambda x: list(jieba.cut(x)))

# remove stopword
remove_stopword(seg_list_train)
remove_stopword(seg_list_test)

# TF-IDF
sentence_vectorizer = TfidfVectorizer()
title_list = [' '.join(words) for words in seg_list_train]
X_old = sentence_vectorizer.fit_transform(title_list)

title_list_test = [' '.join(words) for words in seg_list_test]
X_test_old = sentence_vectorizer.transform(title_list_test)

keyword_vectorizer = TfidfVectorizer()
Example #34
        rstring += unichr(inside_code)
    return rstring


def p(s):
    s1 = strQ2B(s.decode())
    p = re.compile('[()]', re.S)
    s1 = p.sub('', s1)
    return s1


if __name__ == "__main__":
    jieba.add_word("TAG_NUMBER")
    line = "第TAG_NUMBER届奥林匹克运动会官方网站"
    line1 = "月工资TAG_NUMBER元"
    text = jieba.cut(line)

    text1 = jieba.cut(line1)
    print(" ".join(word for word in text1))
    line = "1234|||很高兴认识你234"
    mat_number = re.match(r"\d+\|\|\|", line)
    if mat_number:
        print(mat_number.group(0))

    # line3 = "2017年6月9日20时15分,“中星九号”广播电视直播卫星在西昌卫星发射中心成功发射。"
    # line2 = "6月24日20时15分下午,浙江绍兴市档案馆正在接受捐赠收藏5.12米“心系汶川”丝绸长卷珍品。"
    # line2_ = p(line2).encode("utf-8")
    # line3_ = p(line3).encode("utf-8")
    # # print(p(line3))
    # print (line3_)
    # line4='2017年6月9日20时15分,“中星九号”广播电视直播卫星在西昌卫星发射中心成功发射。'
def predict_one(s):  # prediction function for a single sentence
    word_index = load_word_index(wiPath)
    model = load_LSTM_model(modelPath, weightPath)
    s = np.array(doc2num(list(jieba.cut(s)), MAX_SEQUENCE_LENGTH, word_index))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s)
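Usage note: a hedged sketch, assuming wiPath, modelPath, and weightPath point to a previously saved word index and LSTM model and that MAX_SEQUENCE_LENGTH matches the value used in training; the sentence is illustrative.
# Hypothetical usage: prints the predicted class index for one sentence.
print(predict_one(u'这部电影的情节非常精彩'))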
Example #36
# data = data[data["innovation"] > data['innovation'].quantile(0.1)]
# len2 = len(data)
# print("筛选前的数据数:", len1)
# print("筛选后的数据数:", len2)
dates = data['date_num'].tolist()
jieba.load_userdict('./data/jieba.txt')
start = time.time()
stop_words = [
    word.strip() for word in open(
        './data/stop_words.txt', 'r', encoding='utf-8').readlines()
]
# print(stop_words)
print("正在分词...")
# segment and remove stop words
data['splitword'] = data["content"].apply(lambda x: [
    word for word in jieba.cut(str(x))
    if (word not in stop_words) and len(word) > 1
])
splitword = data['splitword'].tolist()
'''
to_sort = list(zip(dates, splitword))
to_sort = sorted(to_sort, key=lambda x:x[0], reverse=True)
dates[:], splitword[:] = zip(*to_sort)
'''

# merge the keywords within each time window
time_word_dict = {}
count = {}
for i in range(len(dates)):
    if dates[i] in time_word_dict:
        time_word_dict[dates[i]] = time_word_dict[dates[i]] + splitword[i]