Example #1
    def init_from_mongo(self):
        client = MongoClient('mongodb://localhost:27017/') 
        db = client.ptt
        posts = db.gossiping_38k 
        jieba.set_dictionary('extra_dict/dict.txt.big')
        jieba.analyse.set_stop_words("extra_dict/stop_words_cht.txt")   
        for post in posts.find():
            #For content
            d = defaultdict(int)
            content = post['content']
            if post['score'] != 0:
                for l in content.split('\n'):
                    if l:
                        for w in jieba.cut(l):
                            d[w] += 1
            if len(d) > 0:
                self.words.append(d)
                self.scores.append(1 if post['score'] > 0 else 0)
            #For comments
            for comment in post['comments']:
                l = comment['content'].strip()
                if l and comment['score'] != 0:
                    d = defaultdict(int)
                    for w in jieba.cut(l):
                        d[w] += 1
                    if len(d) > 0:
                        self.c_words.append(d)
                        self.c_scores.append(1 if comment['score'] > 0 else 0)

        client.close()   
Example #2
    def init(self, options):
        # type: (Dict) -> None
        if JIEBA:
            dict_path = options.get('dict')
            if dict_path and os.path.isfile(dict_path):
                jieba.set_dictionary(dict_path)

        if PYSTEMMER:
            class Stemmer(object):
                def __init__(self):
                    # type: () -> None
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    # type: (unicode) -> unicode
                    return self.stemmer.stemWord(word)
        else:
            class Stemmer(PorterStemmer):
                """All those porter stemmer implementations look hideous;
                make at least the stem method nicer.
                """
                def stem(self, word):
                    # type: (unicode) -> unicode
                    return PorterStemmer.stem(self, word, 0, len(word) - 1)

        self.stemmer = Stemmer()
Example #3
def start():
    sentence = raw_input('請輸入句子:')

    # jieba.enable_parallel(2) # enable parallel segmentation; the argument is the number of threads
    # jieba.disable_parallel() # disable parallel segmentation

    use_dict = True        # whether to use the traditional Chinese dictionary
    use_user_dict = False  # whether to use a user-defined dictionary

    if use_dict:
        jieba.set_dictionary('dict/dict.txt.big')

    if use_user_dict:
        jieba.load_userdict('dict/user_dict.txt')

    getFullMode(sentence)
    getFullModeHMM(sentence)
    getAccurate(sentence)
    getAccurateHMM(sentence)
    getNewWord(sentence)
    getSearch(sentence)
    getPostag(sentence)
    getTokenize(sentence)
    getKeyWord(sentence)
    getKeyWord(sentence, 'TextRank')
def testcase():
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")

if __name__ == "__main__":
    testcase()
    jieba.set_dictionary("foobar.txt")
    print "================================"
    testcase()



def main():
    pass


if __name__ == "__main__":
    main()
Example #5
def segment_pos(dir='rawdata', datetype='all', outdir='nohref_seg'):
	jieba.set_dictionary('dict/dict.txt.big')
	for tag in loadTag():
		jieba.add_word(tag)
	
	chinese_postagger = StanfordPOSTagger('tagger/chinese-distsim.tagger', 'tagger/stanford-postagger.jar', encoding='utf-8')
	
	for file in parseDateType(dir,datetype):
		dirname, filename = os.path.split(file)
		head = filename.split('.')[0]
		outfile = outdir + '/' + head + '.txt'
		if os.path.isfile(outfile):
			print 'pass %s...' %head
			continue

		print 'segment %s ...' %head
		f = open(outfile, 'w')
		dataList = readJson(file)
		p = re.compile("http[s]?://.*\n")
		for data in dataList:
			content = data['content']
			content = re.sub(p, '', content)
			segList = jieba.cut(content)
			wordList, tagList = postagging(chinese_postagger, segList)
			for w, t in zip(wordList, tagList):
				f.write(w.encode('utf-8'))
				f.write(' ')
				f.write(t)
				f.write(' ')
			f.write('\n')
		f.close()
def main():

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # jieba custom setting.
    jieba.set_dictionary('jieba_dict/dict.txt.big')

    # load stopwords set
    stopwordset = set()
    with io.open('jieba_dict/stopwords.txt','r',encoding='utf-8') as sw:
        for line in sw:
            stopwordset.add(line.strip('\n'))

    texts_num = 0

    output = io.open('wiki_seg.txt','w',encoding='utf-8')
    with io.open('wiki_zh_tw.txt','r',encoding='utf-8') as content :
        for line in content:
            words = jieba.cut(line, cut_all=False)
            for word in words:
                if word not in stopwordset:
                    output.write(word +' ')

            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("已完成前 %d 行的斷詞" % texts_num)
    output.close()
    def init(self, options):
        if JIEBA:
            dict = options.get('dict')
            if os.path.isfile(dict):
                jieba.set_dictionary(dict)
                print("Dictionary path:", dict)

        if CSTEMMER:
            class Stemmer(CStemmer):
                def stem(self, word):
                    return self(word.lower())
        elif PYSTEMMER:
            class Stemmer(object):
                def __init__(self):
                    self.stemmer = PyStemmer('porter')

                def stem(self, word):
                    return self.stemmer.stemWord(word)
        else:
            class Stemmer(PorterStemmer):
                """All those porter stemmer implementations look hideous;
                make at least the stem method nicer.
                """
                def stem(self, word):
                    word = word.lower()
                    return PorterStemmer.stem(self, word, 0, len(word) - 1)

        self.stemmer = Stemmer()
    def __init__(self, category=None, *args, **kwargs):
        super(crawlcatSpider, self).__init__(*args, **kwargs)
        db_conn = sqlite3.connect('crawlcat.sqlite')
        db = db_conn.cursor()
        
        # initialize keywords
        num_list = ['0','1','2','3','4','5','6','7','8','9','一','二','三','四','五','六','七','八','九','十']
        quantifier_list = ['个','款','种','大','条','件','佳','张图']
        num_keywords = ['排行'.decode('utf-8'),'神作'.decode('utf-8'),'盘点'.decode('utf-8')]
        
        for num in num_list:
            for quantifier in quantifier_list:
                num_keywords.append((num+quantifier).decode('utf-8'))

        keywords = num_keywords[:]
        
        db.execute('SELECT node_id,keywords,alias_id,type_id,cate_id FROM nodes WHERE type_id != 2')
        for item in db.fetchall():
            if item[3] == 0:
                # search terms: add to the word-segmentation dictionary
                self.keywords_dict[item[1]] = item[0] if item[2] == 0 else item[2]
                keywords.append(item[1])
            elif item[3] == 1:
                # curated keywords
                self.select_keywords_dict[item[4]] = item[0]
            elif item[3] == 3:
                # keywords for which segmentation failed
                self.keywords_dict[item[1]] = item[0] if item[2] == 0 else item[2]
                self.special_keywords_list.append(item[1])

        self.keywords_set = set(self.keywords_dict)
        self.select_keywords_set = set(num_keywords)
        
        # generate the jieba dictionary
        fp = open('userdict.txt','w')
        for word in keywords:
            fp.write("%s 3\n" % word.encode('utf-8'))
        fp.close()
        jieba.set_dictionary("userdict.txt")
        
        # initialize allowed domains
        db.execute('SELECT DISTINCT domain FROM website')
        for item in db.fetchall():
            self.allowed_domains.append(item[0])
        
        # initialize start URLs and rules
        db.execute("SELECT url,rules,cate_id,img_rule,src_attr,list_src_attr FROM website WHERE enabled = '1'")
        for item in db.fetchall():
            self.start_urls.append(item[0])
            self.url_sel_rules[item[0]] = {'urles':eval(item[1]),'cate_id':item[2],'img_rule':item[3],'src_attr':item[4],'list_src_attr':item[5]}
        
        # initialize already-stored URLs
        db.execute("SELECT url,title FROM feeds");
        for item in db.fetchall():
            self.stored_url_list.append(item[0])
            # take the first 8 characters
            self.stored_title_list.append(item[1][:8])
        
        db.close()
        db_conn.close()
Example #9
    def init(self, options):
        # type: (Dict) -> None
        if JIEBA:
            dict_path = options.get('dict')
            if dict_path and os.path.isfile(dict_path):
                jieba.set_dictionary(dict_path)

        self.stemmer = get_stemmer()
Example #10
def loadDictionaries():
    jieba.set_dictionary('../resource/dicts/dict.txt.big') # set traditional Chinese dictionary
    jieba.load_userdict('../resource/dicts/ptt_words.txt') # load PTT slang terms
    jieba.load_userdict('../resource/dicts/restaurant.txt') # load restaurant names
    jieba.load_userdict('../resource/dicts/taiwan_area.txt') # load Taiwan place names
    jieba.load_userdict('../resource/dicts/taiwan_words.txt') # load Taiwan-specific words
    jieba.load_userdict('../resource/dicts/taiwan_party.txt') # load Taiwan political parties
    jieba.analyse.set_stop_words('../resource/dicts/mystopwords.txt') # set stopwords
Example #11
 def testSetDictionary(self):
     jieba.set_dictionary("foobar.txt")
     for content in test_contents:
         result = jieba.cut(content)
         assert isinstance(result, types.GeneratorType), "Test SetDictionary Generator error"
         result = list(result)
         assert isinstance(result, list), "Test SetDictionary error on content: %s" % content
         print >> sys.stderr, " , ".join(result)
Example #12
 def load(self):
     # load jieba first
     if not jieba.initialized:
         jieba.set_dictionary(self.jieba_dict_path)
         jieba.initialize()
     self.pydict = {}
     f = None
     try:
         # py.txt
         f = open(self.dict_path)
         for line in f:
             try:
                 line = line.strip()
             except:
                 continue
             sps = line.split('\t')
             if len(sps) != 3:
                 print >>sys.stderr, 'bad format line [%s]' % line
                 continue
             word = sps[0]
             py = sps[1]
             freq = float(sps[2])
             if word in self.pydict:
                 wordInfoLen = len(self.pydict[word])
                 i = 0
                 dup = False
                 while i < wordInfoLen:
                     if self.pydict[word][i].py == py:
                         if self.pydict[word][i].freq < freq:
                             self.pydict[word][i].freq = freq
                         dup = True
                         break
                     if self.pydict[word][i].freq < freq:
                         break
                     i += 1
                 if not dup:
                     pyInfo = PyInfo()
                     pyInfo.py = py
                     pyInfo.freq = freq
                     self.pydict[word].insert(i, pyInfo)
                     wordInfoLen += 1
                     for j in range(i + 1, wordInfoLen):
                         if self.pydict[word][j].py == py:
                             del self.pydict[word][j]
                             break
             else:
                 pyInfo = PyInfo()
                 pyInfo.py = py
                 pyInfo.freq = freq
                 self.pydict[word] = [ pyInfo ]
     except Exception as e:
         try:
             f.close()
         except:
             pass
         return False
     self.is_load = True
     return True
Example #13
def init():
    """
    初始化jieba分词器设置
    :return:
    """
    # 设置自定义字典
    jieba.set_dictionary("data/jieba_dict.txt")
    # 设置工作目录
    jieba.tmp_dir = os.getcwd()
def tokenize(str_list):
    jieba.set_dictionary('dict.txt.big.txt')

    texts = list()
    for comment in str_list:
        comment_tokens = jieba.cut(comment, cut_all = False)
        texts.append(" ".join(comment_tokens).split(" "))

    return texts
Example #15
 def __init__(self):
     # get the current package path
     _package_path_ = _context_path
     self._user_dict = _package_path_+os.sep+"dic.data"
     self._user_stword = _package_path_+os.sep+"stword.data"
     # build the stop-word list
     self._stop_word_list = list(line.strip().decode("utf8") for line in open(self._user_stword,'r').readlines())
     # print(self._user_dict,self._user_stword)
     jieba.set_dictionary(self._user_dict)
     jieba.initialize()
Example #16
def tokenizer(doc_string, parse = False):
	current_file_path = os.path.dirname(os.path.abspath(__file__))
	jieba.set_dictionary(current_file_path+'/dict/dict.txt.big')
	seg_list = list(jieba.cut(doc_string, cut_all=False))
	seg_list = stopWords.rmStopWords(seg_list)
	if parse:
		seg_list = ",".join(seg_list)
		seg_list = seg_list.encode("utf-8")

	return seg_list
Example #17
    def init_jieba(self, seg_dic, userdic):

        """
        jieba custom setting.
        """
        jieba.load_userdict(userdic)
        jieba.set_dictionary(seg_dic)
        with open(userdic,'r',encoding='utf-8') as input:
            for word in input:
                word = word.strip('\n')
                jieba.suggest_freq(word, True)
Example #18
    def __init__(self): 

        print "init NLP toolkit"

        self.tagger = ner.SocketNER(host='localhost', port=1234)

        # parse list of stopwords
        self.stoplist=[i.strip() for i in open(stopwords_file)]
        self.stoplist+=weibo_stopwords

        # better support for traditional character
        jieba.set_dictionary(dico_file)
Example #19
 def participle(self):
     jieba.set_dictionary("dict/dict.txt")
     jieba.initialize()
     if(self.radioButton.isChecked()):
         self.result=jieba.cut(self.filetext,cut_all=True)
     elif(self.radioButton_2.isChecked()):
         self.result=jieba.cut(self.filetext,cut_all=False)
     elif(self.radioButton_3.isChecked()):
         self.result=jieba.cut_for_search(self.filetext)
     else:
         self.result=jieba.cut(self.filetext,cut_all=False)
     self.textBrowser.clear()
     self.textBrowser.setText('/'.join(self.result))
Example #20
def start():
    sentence = raw_input('請輸入句子:')
    use_dict = True # whether to use the traditional Chinese dictionary

    if use_dict:
        jieba.set_dictionary('dict/dict.txt.big')

    getFullMode(sentence)
    getFullModeHMM(sentence)
    getAccurate(sentence)
    getAccurateHMM(sentence)
    getNewWord(sentence)
    getSearch(sentence)
Example #21
def initialize():
	# Load conjunction data
	global CONJUNCTIONS
	CONJUNCTIONS = []
	
	from codecs import open
	with open('vendor/moedict.dict', 'r', encoding='utf8') as data:
		for entry in data:
			CONJUNCTIONS.append(entry.split()[0])
	
	# Load CJK parsing library
	jieba.set_dictionary('vendor/jieba_tc.dict')
	jieba.load_userdict('vendor/chewing.dict')
	jieba.initialize()
Example #22
def main():
	sys.stderr.write(' >>>>>>  python runed \n')

	jieba.set_dictionary( 'dict.txt.big' )

	#get our data as an array from read_in()
	for line in sys.stdin:
		sys.stderr.write( 'get' + line + '\n' )
		print( "0 " + " ".join( jieba.cut( line, cut_all=False ) ) ) 

		sys.stdout.flush()
		
	
	sys.stderr.write(' >>>>>>  python finished \n')
Example #23
def TextSeg(datas, lag):
    dict_path = "./Config/dict"
    if lag == "chs" and exists(dict_path): # 中文情况
        jieba.set_dictionary(dict_path) # jieba分词词典,可以修改
    datasseg = []
    for data in datas:
        if lag == "eng": # 英文情况
            word_list = nltk.word_tokenize(data)
        elif lag == "chs": # 中文情况
            word_cut = jieba.cut(data, cut_all=False) # 精确模式,返回的结构是一个可迭代的genertor
            word_list = list(word_cut) # genertor转化为list,每个词unicode格式
        # print " ".join(word_list)
        datasseg.append(word_list)
    return datasseg
Example #24
def run():
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print ("jieba " + str(time.clock() - start_time))
    
    start_time = time.clock()

    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)

    
    start_time = time.clock()

    for entry in info.entries:
        # word count of each word of summary
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(entry.summary))))
        # word count of each word of title
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        
        # Combine word count of both summary and title and title weights more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word

    print ("preprocess " + str(time.clock() - start_time))
        

#     result = Counter()
#     for entry in info.entries:
#         result.update(entry["bag_of_words"])
#     printList(result) 
        
    # Clustering them
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print ("clustering " + str(time.clock() - start_time))

    # Print the result        
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector), vector.data["title"], vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
Example #25
def init(jieba_parallel=False):
    # load English/Chinese stopwords, from nltk and zhon respectively
    global english_stopwords, chinese_stopwords
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    chinese_stopwords = {word[:-1] for word in codecs.open("stopwords.txt", "r", encoding="utf-8")}

    # set the jieba log level
    jieba.setLogLevel("INFO")
    # set the jieba dictionary file
    jieba.set_dictionary("./jieba_dict.txt")
    # change jieba's temporary working directory
    jieba.tmp_dir = os.getcwd()
    # enable parallel segmentation mode; the number of processes equals the number of CPU cores
    if jieba_parallel:
        jieba.enable_parallel()

    config.log.info("module algorithm has initialized successfully.")
Example #26
def handle(data):
    oper = json.loads(data)
    if oper[0] == 'cut':
        return json.dumps(tuple(jieba.cut(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'cut_for_search':
        return json.dumps(tuple(jieba.cut_for_search(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'tokenize':
        return json.dumps(tuple(jieba.tokenize(*oper[1], **oper[2]))).encode('utf-8')
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
Example #27
def handlemsg(data):
    oper = loadsjson(data)
    if oper[0] == 'c2m':
        return dumpsjson(mc.c2m.translate(*oper[1:]))
    elif oper[0] == 'm2c':
        return dumpsjson(mc.m2c.translate(*oper[1:]))
    elif oper[0] == 'c2m.raw':
        return dumpsjson(mc.c2m.rawtranslate(oper[1]))
    elif oper[0] == 'm2c.raw':
        return dumpsjson(mc.m2c.rawtranslate(oper[1]))
    elif oper[0] == 'modelname':
        return dumpsjson(mc.name())
    elif oper[0] == 'cut':
        return dumpsjson(tuple(jieba.cut(*oper[1], **oper[2])))
    elif oper[0] == 'cut_for_search':
        return dumpsjson(tuple(jieba.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'tokenize':
        return dumpsjson(tuple(jieba.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut':
        return dumpsjson(tuple(jiebazhc.cut(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.cut_for_search':
        return dumpsjson(
            tuple(jiebazhc.cut_for_search(*oper[1], **oper[2])))
    elif oper[0] == 'jiebazhc.tokenize':
        return dumpsjson(tuple(jiebazhc.tokenize(*oper[1], **oper[2])))
    elif oper[0] == 'add_word':
        jieba.add_word(*oper[1], **oper[2])
    elif oper[0] == 'load_userdict':
        jieba.load_userdict(*oper[1])
    elif oper[0] == 'set_dictionary':
        jieba.set_dictionary(*oper[1])
    elif oper[0] == 'stopserver':
        return b'stop'
    elif oper[0] == 'ping':
        return b'pong'
    else:
        return dumpsjson('Command not found')
Example #28
    def __init__(self):
        self.br = br = mechanize.Browser()
        self.br.set_handle_robots(False)  # ignore robots
        self.br.set_handle_refresh(False)
        self.sixHourBeforeTime = time.time() - 60 * 60 * 6
        self.db_address = "127.0.0.1"  #'54.251.147.205'

        if platform.system() == "Windows":
            self.features = "html5lib"
        else:
            self.features = "lxml"

        oauth_args = dict(
            client_id="482698495096073",
            client_secret="8c58b055fcb762a9780638dc401c85e2",
            grant_type="client_credentials",
        )

        oauth_curl_cmd = ["curl", "https://graph.facebook.com/oauth/access_token?" + urllib.urlencode(oauth_args)]
        oauth_response = subprocess.Popen(oauth_curl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[
            0
        ]
        print oauth_curl_cmd
        print str(oauth_response)
        try:
            oauth_access_token = urlparse.parse_qs(str(oauth_response))["access_token"][0]
            self.graph = facebook.GraphAPI(oauth_access_token)
        except KeyError:
            print ("Unable to grab an access token!")

            # self._pre_dict_combine('combine_dict.txt')
            # jieba.set_dictionary('combine_dict.txt')
        dict_path = os.path.dirname(os.path.abspath(__file__)) + "/dict.txt"
        print dict_path
        jieba.set_dictionary(dict_path)
        jieba.initialize()
 def extract(self, text):
     jieba.set_dictionary('/usr/lib/ckan/default/src/ckanext-data_recommendation/dict.txt.big')
     result = jieba.analyse.extract_tags(text, topK=5)
     result = [i.encode('utf8') for i in result]
     result = '[' + ','.join(result) + ']'
     return result
Example #30
title = soup.find_all('a', class_='DY5T1d')
first_art_link = title[0]['href'].replace('.','https://news.google.com',1)

#print(first_art_link)
art_request = requests.get(first_art_link)
art_request.encoding='utf8'
soup_art = BeautifulSoup(art_request.text,'html.parser')

art_content = soup_art.find_all('p')
art_texts = [p.text for p in art_content]
print(art_texts)
## Create Word Cloud

import jieba

jieba.set_dictionary('../../../../_MySyncDrive/RepositoryData/data/jiaba/dict.txt.big.txt')

art_words = [w for w in jieba.cut(' '.join(art_texts))]
## Fine-tune Word Cloud

from collections import Counter
import imageio
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from matplotlib import pyplot as plt


## Check font paths
## !fc-list :lang=zh


## Load stopwords
    try:
        msg = "From: %s\r\nTo: %s\r\nSubject: %s\r\nDate: %s\r\n\r\n%s" % (
            from_address, str.join(',', to_address), Header(
                subject, 'utf-8').encode(), formatdate(), content)

        server = smtplib.SMTP(smtp_server, port)  # SMTP server of the sender's mailbox; the port is 587
        server.ehlo(name=host)
        server.starttls()
        server.ehlo(name=host)
        server.login(from_address, my_pass)  # arguments are the sender's account and password
        server.sendmail(from_address, to_address,
                        msg)  # arguments are the sender's account, the recipients' accounts and the mail to send

        server.quit()  # close the connection
        print("邮件发送成功")
    except smtplib.SMTPException:  # if the statements in the try block fail, ret is set to False below
        ret = False
        print("邮件发送失败")
    return ret


if __name__ == '__main__':
    train = term_frequency_train('source/train.txt')
    print('load training set success')
    jieba.set_dictionary("source/dict.txt")
    jieba.initialize()
    # if you want output QR in cmd, try:
    # itchat.auto_login(enableCmdQR=True)
    itchat.auto_login(hotReload=True, exitCallback=ex, enableCmdQR=2)
    itchat.run()
Notes
-----

    These functions are based on the text normalization functions 
    provided in Text Analytics with Python 2ed.

"""

import unicodedata
import re
# from nltk.tokenize.toktok import ToktokTokenizer
import pandas as pd
import jieba

## Initialize Trad Chinese dictionary
jieba.set_dictionary('../../../RepositoryData/data/jiaba/dict.txt.jiebatw.txt')


## Normalize unicode characters
def remove_weird_chars(text):
    #     ```
    #     (NFKD) will apply the compatibility decomposition, i.e.
    #     replace all compatibility characters with their equivalents.
    #     ```
    text = unicodedata.normalize('NFKD', text).encode('utf-8',
                                                      'ignore').decode(
                                                          'utf-8', 'ignore')
    return text


## Remove extra linebreaks
Example #33
import config
import jieba
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

# ===========================
# Script purpose:
# Segment translated examples
# ===========================

# use downloaded segmenting index
jieba.set_dictionary("./data/raw/dict.txt.big.txt")

zh_translations = pd.read_feather(
    "./data/intermediate/zh_translations_filtered.feather", )


# use space delimiter to store segmented list as string
# (works because all sentence with English spaces were eliminated)
def segment(sentence):
    return " ".join(jieba.cut_for_search(sentence, HMM=True))


# process simplified sentences
zh_translations["simplified_segmented"] = zh_translations[
    "simplified"].progress_apply(segment)

# process traditional sentences
zh_translations["traditional_segmented"] = zh_translations[
Example #34
    def set_default_dict(tokenizer, path_default_dict):
        print("Setting Jieba Default Dictionary at " + str(path_default_dict))
        tokenizer.set_dictionary(path_default_dict)

        return tokenizer
Example #35
title = soup.find_all('a', class_='DY5T1d')
first_art_link = title[0]['href'].replace('.','https://news.google.com',1)

#print(first_art_link)
art_request = requests.get(first_art_link)
art_request.encoding='utf8'
soup_art = BeautifulSoup(art_request.text,'lxml')

art_content = soup_art.find_all('p')
art_texts = [p.text for p in art_content]
print(art_texts)
## Create Word Cloud

import jieba

jieba.set_dictionary('../../../Corpus/jiaba/dict.txt.big.txt')

art_words = [w for w in jieba.cut(' '.join(art_texts))]
## Fine-tune Word Cloud

from collections import Counter
import imageio
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from matplotlib import pyplot as plt


## Check font paths
## !fc-list :lang=zh


## Load stopwords
Example #36
# coding = utf-8
import numpy as np
import jieba
import preprocess as p
import os
from keras.preprocessing.sequence import pad_sequences

# stop_word_file = 'dicts/stop_words.txt'
jieba.set_dictionary('data/dict.txt.big')
jieba.initialize()
word_embedding_file = 'data/word_embedding_matrix.npy'


def get_word_data(char_data):
    seq_data = [''.join(l) for l in char_data]
    word_data = []
    # stop_words = [line.strip() for line in open(stop_word_file, 'r', encoding='utf-8')]
    for seq in seq_data:
        seq_cut = jieba.cut(seq, cut_all=False)
        word_data.append([w for w in seq_cut for n in range(len(w))])

    return word_data


def get_word2object():
    word2vec = {}
    f = open(r'data/word2vec.bin')  # load pre-trained word embedding
    i = 0
    for line in f:
        tep_list = line.split()
        if i == 0:
def set_dict():
    jieba.set_dictionary('dict.txt.big.txt')
Example #38
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# still under testing and refinement

import numpy
import jieba
jieba.set_dictionary("./data/jieba/dict.txt.big")
jieba.load_userdict("./data/jieba/userdict.txt")
import jieba.posseg
import jieba.analyse
from gensim.models.word2vec import Word2Vec
model = Word2Vec.load_word2vec_format("./data/vectors.bin", binary=True)
from mytools import time_me, get_current_time

def word_similarity(w1, w2):
    return model.similarity(w1, w2)
	
def sum_cosine(matrix, threshold):
    """
    1.计算语义Jaccard中分子total,即分词相似性矩阵的Cosine和
    2.计算m: 两个集合中没有达到语义匹配标准(由阈值threshold控制)的总片段个数或者两者中取最大值
    """
    total = 0
    count = 0
    row = matrix.shape[0]
    col = matrix.shape[1]
    zero_row = numpy.zeros([1,col])
    zero_col = numpy.zeros([row,1])
    max = matrix.max()
    while max > threshold:
        total += max
Example #39
                break


# get the criminal's ID and role into a dict
def readRoles():
    dic = {}
    for line in open("../resources/roles.txt"):
        senderQQ = line.split(",")[0]
        role = line.split(",")[1]
        dic[senderQQ] = role
    return dic


importlib.reload(sys)
# sys.setdefaultencoding("utf-8")

jieba.set_dictionary('../resources/dict.txt')
stopwords = [line.strip() for line in open('../resources/stopwords.txt', encoding='utf-8').readlines()]
endingwords = [line.strip() for line in open('../resources/endingwords.txt', encoding='utf-8').readlines()]
model = models.Word2Vec.load('model/wiki_corpus.model')
freqDict = getFreqDict()
sentDict = getSentDict()
# print(freqDict)
sentence_vecs, pca = sentence2Vec(model, sentDict.keys(), freqDict)

MiscreantDict = {}
ResponseQueue = queue.Queue()
EndMiscreats = []
roleDict = readRoles()
chatEngine()
Example #40
import jieba
from collections import Counter
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image
import numpy as np
import pymongo
import re
import test_NLU2
"""
jieba.set_dictionary('C:/Users/User/Desktop/dict.txt')
with open('C:/Users/User/Desktop/stop.txt', 'r', encoding='utf8') as f:  # Chinese stopwords; I forget where I got them, but they work reasonably well. Traditional Chinese resources really are scarce, so bear with it
    stops = f.read().split('\n')

testStr = """
#理財專欄作者:黃逸強最近的熱門新聞就是美國非裔男子遭白人警察壓制致死,引發全國性示威,暴動場面怵目驚心,很難想像這是標榜民主自由的美國。但這些負面消息都不影響股市的發展,美股依然上揚,還一再創近期新高,這一波的反彈令專家很不解,更別說是一般的散戶投資人。散戶是反向指標?雖然疫情趨緩很多城市開始解封,但要經濟復甦還言之過早,更別說美中貿易談判未解,美國又進一步對華為制裁,再加上街頭暴動猶如雪上加霜;經濟數據更是難看,非農就業人數大減二千多萬人,創二戰以來最慘,美股仍不甩利空,硬是漲逾數百點,真要找一個理由就是「市場把期待押在未來的復甦上。」儘管股市一直漲,散戶投資人反倒是越漲越害怕,美國個人投資者協會發布最新報告顯示,散戶投資人看空情緒升高至52.6%,創2013年來最高;反之,看多情緒23.6%,則是30年來最低水準。如果散戶看法是反指標,是否意味著市場未來向上的機率更高。台灣投資人也是一樣,舉一檔股票為例,看漲的「台灣五十ETF」只有1380張的融資,1萬多張的周均量,而看跌的「台灣五十反向ETF」,卻有近40萬張的融資,12萬張的周均量。股市愈漲做空的人愈多,是一個很奇怪的心態。漲跌不需要理由也有專家認為,最近股市大漲並不是看多的買盤所拉抬,而是空頭回補的力道所推升。依據籌碼分析,通常跌到深處融資斷頭、或漲到創高空頭回補,就是一種反轉訊號,所以有專家提醒投資人,目前這個泡沫應該要留心。其實股市漲跌並不一定要理由,那只是記者在寫稿時需要一些題材,所以上漲就去找利多的理由、下跌就去找利空消息來搪塞,都是事後諸葛無濟於事。過去很多次的上漲是無基之彈,因為實在找不到理由。像2008年的雷曼金融風暴後,股市也是沒人看好,在現金為王的氛圍中走了十年的多頭。所以股市難測,幾百萬人在進行的金錢遊戲,不是幾個數據或幾則新聞就能決定漲跌,專家的分析都是以現在的資訊,去推測未來的發展,猜對了只是運氣好,猜錯是正常。天災人禍不是意外而是無常,在做投資或資產配置時都是必須要納入的風險因子。別聽消息做股票市場很任性,當它要漲時再多的利空壞消息它還是漲,當它要跌再多的好消息也挽不住跌勢。所以回歸技術面,趨勢的力量不可擋,只要順勢操作,不要自作聰明去抓頭部或猜底部。每一個階段採用不同的工具,承平時期用基本面分析找到長線績優股,新冠肺炎把股市打到低點,基本面無用改用技術分析搶反彈;千萬不要聽消息面,新聞有太多雜訊反而會干擾投資人。有人因為漲太多、股價很高所以會怕,其實高是一種感覺,很抽象不能用來操盤。現階段不需要預測高點,拋開理智線勇敢下單,搭上趨勢的順風車,並設好停損點,就不怕懼高症。★延伸閱讀★投資不能一窩蜂!黃金變現二管道炒短恪守三原則!超前佈署以應萬變!沒有意外 只有號外﹗
"""
#stops.append('\n')  ## I noticed my article has many line-break characters, so they are added to the stopwords here; you can remove this
#stops.append('\n\n')
# terms = [t for t in jieba.cut(testStr, cut_all=True) if t not in stops]
terms = [t for t in jieba.cut(testStr) if t not in stops]


sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True)  ## this pattern comes up a lot with Counter; it sorts the items by how many times each one appears.




aa = sorted(Counter(terms).items(), key=lambda x:x[1], reverse=True)
"""
jieba.set_dictionary('C:/Users/User/Desktop/dict.txt')
Example #41
from bs4 import BeautifulSoup

import jieba
import csv
import json

from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pylab

horoscope = [
    "Aries", "Taurus", "Gemini", "Cancer", "Leo", "Virgo", "Libra", "Scorpio",
    "Sagittarius", "Capricornus", "Aquarius", "Pisces"
]
hash1 = {}
jieba.set_dictionary('dict.txt.big.txt')
jieba.load_userdict("userdict.txt")

#download the information from ptt
for i in range(len(horoscope)):
    url = "https://www.ptt.cc/bbs/" + horoscope[i] + "/index.html"
    title = []
    for round in range(10):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")
        tag_name = "div.title a"
        articles = soup.select(tag_name)
        page2 = "div.btn-group-paging a"
        paging = soup.select(page2)
        next_url = "https://www.ptt.cc" + paging[1]["href"]
        url = next_url
Example #42
    MODEL = 'ta_addtest_test7_LSTMGRU.h5'
    ### Read files
    # train files
    print('Loading train files ...')
    train_sentences, train_labels = load_data(TRAIN_X_PATH, TRAIN_Y_PATH)

    # test file
    print('Loading test file ...')
    with open(TEST_X_PATH, 'r', encoding='utf-8') as f:
        readin = f.readlines()
        # Use regular expression to get rid of the index
        test_sentences = [re.sub('^[0-9]+,', '', s) for s in readin[1:]]

    sentences = train_sentences + test_sentences

    jieba.set_dictionary(DICT_PATH)  # Change dictionary (Optional)
    print('Jieba cutting all sets ...')
    sentences = [list(jieba.cut(s, cut_all=False)) for s in sentences]

    # Train Word2Vec model
    print('Training Word2Vec model ...')
    emb_model = Word2Vec(sentences, size=emb_dim)
    emb_model.save(w2v_model)

    print('Jieba cutting train set ...')
    train_sentences = [
        list(jieba.cut(s, cut_all=False)) for s in train_sentences
    ]

    num_words = len(emb_model.wv.vocab) + 1  # +1 for OOV words
    emb_dim = emb_model.vector_size
    cut_phones, pos = [], 0
    sent = ''.join(cut_sent)
    phones = word2phones(sent, use_tone, sep='').split()
    for word in cut_sent:
        word_len = len(word)
        word_phones = ' '.join(phones[pos:pos + word_len])
        cut_phones.append(word_phones)
        pos += word_len
    return cut_phones


if __name__ == '__main__':

    # Configuration
    use_tone = False
    jieba.set_dictionary('scripts/dict.txt.big')
    jieba.initialize()

    # Read information of validated audios
    full_tsv = join(DATA_DIR, 'validated.tsv')
    full_df = pd.read_csv(full_tsv, sep='\t')

    # Exclude audios with english
    full_df = full_df[full_df.sentence.apply(contains_no_eng)]


    ''' Prepare AM data '''
    print('Preparing AM data...\r', end='')

    # Prepare spk_id, gender and utt_id for all audios
    client_spk = full_df[['client_id']].drop_duplicates()
Example #44
Some functions are used to generate the training corpus of the chatbot
'''

import json
import logging
import os
import jieba
import sys
import operator
from tqdm import tqdm
import pickle
import re
dict_path = os.path.join(os.getenv("JIEBA_DATA"), "dict.txt.big")
ptt_path = (os.getenv("DATA"))
jieba.set_dictionary(dict_path)
process_files = ['Gossiping', 'Boy-Girl']
marker = {'Gossiping': '>', 'NBA': '<', 'Boy-Girl': '^'}

#count_response = {}


def main():

    Filter = ArticleFilter()


def print2file(f, title, responses, marker='', separater=True):
    if marker != '':
        f.write(marker + ' ')
    title_cutted = jieba.cut(title.strip(), cut_all=False)
# record word-related information

# fix garbled Chinese output in the Windows cmd console (must be placed at the very top of the file)
from __future__ import unicode_literals
import os
import json
import sys
# statements required for handling Chinese; fixes character-encoding issues
reload(sys)
sys.setdefaultencoding('utf8')

import jieba
import jieba.posseg as pseg

# load the word-segmentation dictionary
jieba.set_dictionary("dict_file/dict.txt.big")
# load the user-defined dictionary
jieba.load_userdict("dict_file/user_dict.txt")

# load the custom module
import fileHandle

# POS filter (keep adjectives, adverbial adjectives, nominal adjectives, idioms, abbreviations, idiomatic expressions, verbs, verb morphemes, adverbial verbs, nominal verbs, and nouns)
ALLOW_SPEECH_TAGS = [
    'a', 'ad', 'an', 'i', 'j', 'l', 'v', 'vg', 'vd', 'vn', 'n'
]

# word positions
Word_Location = {
    'title': 1,
    'section-start': 2,
Example #46
    def run(self):
        """
        初始化对话
        :return:
        """
        self.load_custom_plugins()
        conversation = Conversation(mic=self.mic,
                                    persona=self.persona,
                                    profile=profile,
                                    iot_client=self.iot_client)
        conversation.handle_forever()


if __name__ == "__main__":
    loggingConfiger(info=args.info, debug=args.debug,
                    output=args.output)  # configure logging
    logger = logging.getLogger()

    if args.init:
        print('initializing...')
        device_init()
    else:
        jieba.set_dictionary(APP_RESOURCES_DATA_PATH +
                             'jieba.small.dict')  # set the Chinese word-segmentation dictionary
        jieba.initialize()
        app = App()
        if profile.remote_control_service_enable:  # listen for server messages
            app.launch_server_listen_thread()
        app.run()  # start service
Example #47
# new model
_ = os.system('mkdir ./model/' + model_name)

# new word2vec model
word2vec_lookup = Word2vec(model_name)
dloader = loader(word2vec_lookup, mode='new', model_name=model_name)

# pre-trained word2vec model
# dloader = loader(mode='pre_trained', model_name=model_name, sent_len=sent_len)

# loading the training data and pre-processing
dloader.data_loading(sent_len=sent_len)

# hyperparameters
np.set_printoptions(precision=2)
jieba.set_dictionary('./libs/dict_new.txt')
vec_size = len(dloader.word2vec_lookup['<unk>'])
oneHot_size = dloader.voca_size
enc_len = sent_len
dec_len = enc_len
n_layer1 = 512
l_r = 1e-3
epoch = 8
batch_size = 128
n_hiddens = 2
rnn_cell = tf.contrib.rnn.BasicLSTMCell
op = tf.train.AdamOptimizer
max_gradient_norm = 1

# seq2seq initializing
s2s = seq2seq_chatbot(oneHot_size=oneHot_size,
Example #48
#!/usr/bin/env python
# encoding=utf-8
# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG)
#           2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU)
# Apache 2.0

from __future__ import print_function
import sys
import jieba
reload(sys)
sys.setdefaultencoding('utf-8')

if len(sys.argv) < 3:
    sys.stderr.write(
        "word_segmentation.py <vocab> <trans> > <word-segmented-trans>\n")
    exit(1)

vocab_file = sys.argv[1]
trans_file = sys.argv[2]

jieba.set_dictionary(vocab_file)
for line in open(trans_file):
    key, trans = line.strip().split('\t', 1)
    words = jieba.cut(trans,
                      HMM=False)  # turn off new word discovery (HMM-based)
    new_line = key + '\t' + " ".join(words)
    print(new_line)
#     - TF (Term Frequency): $TF_{td}$ is the number of times a particular term t appears in a particular document d. It also reflects how important a term is within a document: the more often it appears, the better it represents that document.
#     - IDF (Inverse Document Frequency): N is the total number of documents, and the DF in $DF_t$ stands for Document Frequency, i.e. the number of documents in which term t appears. $\frac{DF_t}{N}$ is therefore the fraction of all documents that contain term t; its reciprocal is the inverse document frequency, which penalizes terms that appear in many documents, and taking the log merely smooths its influence on the score.
#
#
# 2. Cosine Similarity
# $$\cos{\theta} = \frac{A \cdot B}{\| {A} \|_2 \| {B} \|_2}$$
#     - if $A = [1,2,0,4]$ and $B = [3,2,1,0]$
#     - $\cos{\theta} = \frac{1 \cdot 3 + 2 \cdot 2 + 0 \cdot 1 + 4 \cdot 0} {\sqrt{1^2+2^2+0^2+4^2} \cdot \sqrt{3^2+2^2+1^2+0^2}}$
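#
# --- Illustrative sketch (not part of the original notebook) ---
# A minimal, self-contained check of the two formulas above, using the worked
# numbers A = [1, 2, 0, 4] and B = [3, 2, 1, 0]; `tfidf` is a hypothetical
# helper name that simply follows the definition given.
import math


def tfidf(tf_td, df_t, n_docs):
    # TF-IDF weight of term t in document d: TF_td * log(N / DF_t)
    return tf_td * math.log(n_docs / df_t)


def cosine_similarity(a, b):
    # cos(theta) = (A . B) / (||A||_2 * ||B||_2)
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(x * x for x in b))
    return dot / (norm_a * norm_b)


print(cosine_similarity([1, 2, 0, 4], [3, 2, 1, 0]))  # 7 / (sqrt(21) * sqrt(14)) ≈ 0.408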

# In[15]:
import jieba
import sys
import random
sys.path.append('../dict')

jieba.set_dictionary('../dict/dict.txt.big')  # if you work with traditional Chinese text, remember to download the traditional dictionary
import numpy as np
import pandas as pd
from collections import Counter
with open('../dict/stops.txt', 'r',
          encoding='utf8') as f:  # Chinese stopwords; I forget where I got them, but they work reasonably well. Traditional Chinese resources really are scarce, so bear with it
    stops = f.read().split('\n')

# read the sentiment data and build a dataframe
emo_dict = {'emo': [], 'text': []}

# add positive sentences
with open('positive.txt', encoding='utf8') as data:
    pos_train_list = data.readlines()
    random.shuffle(pos_train_list)
    for line in pos_train_list[:600]:
Example #50
#%%
import sys
import os
import logging
import jieba
import gensim
import pandas as pd
import numpy as np
from gensim.models.doc2vec import TaggedDocument

# logging information
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

jieba.set_dictionary('resources/dict.txt.big')

df_qa = pd.read_json('raw_data.json', encoding='utf8')
df_question = df_qa[['question', 'ans']].copy()
df_question.drop_duplicates(inplace=True)


def preProcess(item):
    # stopwords
    with open('resources/stops.txt', 'r', encoding='utf8') as f:
        stops = f.read().split('\n')
    # stops.append('\n')
    # stops.append('\n\n')
    terms = [t for t in jieba.cut(item, cut_all=False) if t not in stops]
    return terms

Example #51
import jieba, os, json

#Init
base_path = os.path.dirname(__file__)
config = json.load(open(os.path.join(base_path, 'config.json')))
fn = config['DataSet']['NPMCorpus']
op = config['DataSet']['NPMCorpus_seg']
jieba.set_dictionary(os.path.join(base_path, 'dict.txt.big'))


def main():

    #load stopword set
    stopword_set = set()
    with open(os.path.join(base_path, 'stop_words.txt'), 'r',
              encoding='utf-8') as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip('\n'))

    with open(op, 'a', newline='', encoding='utf-8-sig') as output:
        with open(fn, 'r', encoding='utf-8-sig') as sentences:
            for sentence in sentences:

                sentence = sentence.replace(" ", "")
                sentence = sentence.replace(",,,,", "")
                sentence = sentence.replace("\"", "")
                sentence = sentence.replace("□", "")
                sentence = sentence.replace("口", "")
                sentence = sentence.strip('\n')
                words = jieba.cut(sentence, cut_all=False)
Example #52
# encoding=utf-8

import csv
import jieba
import jieba.posseg as pseg

jieba.set_dictionary("../Head-first-Chinese-text-segmentation-master/data/dict.txt.big")

jieba.load_userdict("../Head-first-Chinese-text-segmentation-master/data/userdict_hp_all_character.txt")
# jieba.load_userdict("../Head-first-Chinese-text-segmentation-master/data/userdict_hp_all_place.txt")

"""
jieba.load_userdict("../Head-first-Chinese-text-segmentation-master/data/userdict_hp_all.txt")
"""
content=open("../context/HP1.txt","rb").read()

result = jieba.tokenize(u'%s' %content, 'utf-8')

"""
words=pseg.cut(content)
with open("out_0609_hp1.txt",'w',newline='',encoding='utf-8') as f:
    w=csv.writer(f)

    for word,flag in words:
        str=[word,flag]
        if(str[1]!='x'):
            w.writerow(str)
"""

for tk in result:
    #tk[0]=tk[0].encode('ascii','ignore')
Example #53
import pandas as pd
import tensorflow as tf

# Path of files
CSV_DATA = '../data/CookieTheft_51.csv'
DEMENTIA_DATA = '../data/'
CONTROL_DATA = '../data/'
WORDVEC_MODEL = '../wordvec_model/'
# Variables
DEMENTIA_NUM = 51
CONTROL_NUM = 51
WV_DIIM = 500
INDEX_CONTROL_START = 68  # The end of dementia id is 67
JIEBA_DICT = '../data/dict.txt.big'
jieba.set_dictionary(JIEBA_DICT)

punctuation = set(string.punctuation + "," + "、" + "」" + "「" + "。" + " " + "!")

# csv file of control subjects, transform to txt file


def csv_to_txt(file_name):
    csv = pd.read_csv(CSV_DATA, header=None)
    with open(file_name, 'w', encoding='utf8') as f:
        idx = INDEX_CONTROL_START
        for line in csv.iloc[1:, 1]:
            f.write(str(idx) + '\n')
            f.write(line + '\n')
            idx += 1
    print('Control subject CSV file to txt success ...')
Example #54
#!/usr/bin/python
# -*- coding: utf-8 -*-

import jieba
import re
sdict= '../../libs/dict.txt.big'
jieba.set_dictionary(sdict)
userdict= '../../libs/userdict.txt'
stop_words= '../../libs/stop_words.txt'
jieba.load_userdict(userdict)
def segment(sentence, cut_all=False):

    
    

    # jieba.analyse.set_stop_words(stop_words)
    sentence = sentence.replace('\n', '').replace('\u3000', '').replace('\u00A0', '')
    # sentence = ' '.join(jieba.cut(sentence, cut_all=cut_all))
    # jieba.cut_for_search takes two arguments: the string to segment and whether to use the HMM model. It is suited to building a search engine's inverted index, since it segments at a finer granularity
    sentence = ' '.join(jieba.cut_for_search(sentence))
    return re.sub('[a-zA-Z0-9.。::,,))((!!??*-_/”“\"]', '', sentence).split()
 def __init__(self, jieba_zh_path="dict.txt.big"):
     jieba.set_dictionary(jieba_zh_path)
Example #56
import json, jieba, sys, math
jieba.set_dictionary('./data/dict.txt.big')  # friendly to traditional Chinese


class Load:
    def __init__(self):
        return

    def loadTestData(self, path):
        test_data = self._load_json_data(path)
        return self._getTest_feature(test_data)

    def loadTrainData(self, path):
        train_data = self._load_json_data(path)
        return self._getTrain_feature_label(train_data)

    def loadTestID(self, path):
        test_data = self._load_json_data(path)

        test_id = []  # [<context>, <question>] for each row

        #get data position
        subjects = test_data['data']

        for subject in subjects:
            # subject contains title and *paragraphs*
            for paragraph in subject['paragraphs']:
                # paragraphs contains *context* and *qas*
                for qa in paragraph['qas']:

                    ######################################
Example #57
#coding=utf-8
import pandas as pd
import jieba
import re
import sys
jieba.set_dictionary('dict.txt.big')  # switch to the traditional Chinese dictionary
source = pd.read_csv('source.txt', sep='\t', header=None, encoding='utf8')
source.columns = ['date', 'string']
print(type(source), file=sys.stderr)
f = open('jieba_test.txt', 'w', encoding='utf8')
for index in range(0, len(source.index)):
    if (type(source['string'][index]) != str):
        source['string'][index] = str(source['string'][index])  # type conversion => str
    tmps = jieba.cut(source['string'][index], cut_all=False)
    source['string'][index] = ''
    for tmp in tmps:
        source['string'][index] += tmp + ' '
    # string clean-up: remove digits and punctuation
    source['string'][index] = re.sub(
        "[\s+\.\!\/_,$%^*(+\"\'\d]+|[+——!,。?、~@#¥%……&*()()::?+\d]+", " ",
        source['string'][index])
    source['string'][index] = re.sub("[\s+]+", " ", source['string'][index])
    print("index: " + str(index), file=sys.stderr)
    f.write(source['date'][index] + "\t" + source['string'][index] + "\n")
f.close()
Example #58
date_list = [2016, 9, 22]
date = date_list[0] * 10000 + date_list[1] * 100 + date_list[2]
today_date = (datetime.datetime.now().year) * 10000 + (
    datetime.datetime.now().month) * 100 + datetime.datetime.now().day


def get_next_date(date_in):
    date = datetime.datetime(date_in[0], date_in[1], date_in[2])
    date = date + datetime.timedelta(days=1)
    return [int(str(date)[0:4]), int(str(date)[5:7]), int(str(date)[8:10])]


stop_words = set(open("ref/stop_word.txt", "r").read().splitlines())
stop_words.update('\n', '\t', ' ')

jieba.set_dictionary('ref/dict.txt.big')
jieba.load_userdict("ref/userdict.txt")

if 1 > 0:

    try:
        db = DBConfig()
        db.dbConnect()
        query = "SELECT COUNT(*) from News WHERE date>=%s" % date
        db.executeQuery(query)
        news_num = int(db.results[0][0])

        query = "SELECT number, title, content from News WHERE date>=%s" % date
        db.executeQuery(query)

        texts = []
            print("Result retrieved...")
        except httplib.BadStatusLine:
            response = ''
        if response:
            html = response.read()
    return html
    
# Read a file listing terms and their classifications, one term per line. Each line consists of "Classification \t Term"
infile = open("drugList.txt")
lines = infile.readlines()
infile.close()

allWordList = ['drug-name', 'drug-type']    # To store all the possible features
drugDictList = []    # To store feature words of each drug in a list

jieba.set_dictionary('dict.txt.big')	# Read a better dictionary for text segmentation

i = 1
for line in lines:
    print "\nSearching for Entity:\t%s" % i
    i += 1
    fieldList = line.split("\t")
    queryStr = fieldList[1].strip()
    drugType = fieldList[0].strip()

    html = search('%s' % queryStr)
    soup = BeautifulSoup(html)    # Build a html object easy to parse
    
    drugDict = {'drug-name': queryStr, 'drug-type': drugType}    # Every drug has these two features
    # for content in soup.find_all("div", "c-abstract"): # Parse Baidu page
    for content in soup.find_all("span", "st"):    # Digest for each search result from google
Example #60
    for texts_num, line in enumerate(content):
        line = line.strip('\n')
        line = Converter('zh-hant').convert(line)
        line = line.split("\t")
        dict_page = {texts_num: line[1]}
        word_dictionary.append(dict_page)

        if (texts_num + 1) % 10000 == 0:
            print("已完成前 %d 行的store" % (texts_num + 1))

print(word_dictionary[15])

outputfile = open('F74056166.csv', 'w', encoding='utf-8')
#jieba.load_userdict("./wiki_seg.txt")
#jieba.set_dictionary("wiki_seg.txt")
jieba.set_dictionary('extra_dict/dict.txt.big')
model = "wiki.word2vec_50.bin"
model_w2v = word2vec.Word2Vec.load(model)
candidates = []
with open("wiki_seg.txt", encoding='utf-8') as f:
    for line in f:
        candidates.append(line.strip().split())

with open("oneinput.txt", encoding='utf-8') as inputline:
    for line in inputline:
        line = line.strip('\n')
        line = Converter('zh-hant').convert(line)
        output = line.split("\t", 1)
        text = output[0]
        answer = output[1].split("\t")
        eachans = []