def set_sentiment_and_viewpoint(self):
     self.hotelnlp = HotelNLP()
     self.thu = thulac("")
     comm_list = self.hotel_dao.get_remarks()
     print len(comm_list)
     sentiment_comm_list = []
     i = 0
     for comm in comm_list:
         if comm[8] is None or comm[9] is None:
             sentiment_value = None
             viewpoint = None
             remark = re.sub(u"\@",u"",comm[2])
             try:
                 sentiment_value = self.hotelnlp.sentiment(remark.encode("utf-8"))
                 sentiment_value = round(sentiment_value*1000)/1000
                 print sentiment_value
             except:
                 print comm[2]
                 traceback.print_exc()
             try:
                 viewpoint = self.hotelnlp.viewpoint(remark.encode("utf-8"),decoding="utf-8")
                 viewpoint = json.dumps(viewpoint, ensure_ascii=False)
             except:
                 print remark
                 traceback.print_exc()
             comm = {"guid":comm[0], "senti_value":sentiment_value, "viewpoint":viewpoint}
             sentiment_comm_list.append(comm)
         if len(sentiment_comm_list)==10000:
             i+=1
             print "update %d time"%i
             self.hotel_dao.update_remarks(sentiment_comm_list)
             sentiment_comm_list = []
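Note that the loop above writes to the DAO only when exactly 10,000 processed comments have accumulated, so a final partial batch is never written. A minimal final flush, reusing the same DAO call (a sketch, not part of the original snippet), would be:

     # flush whatever is left (fewer than 10,000 entries) once the loop finishes
     if sentiment_comm_list:
         self.hotel_dao.update_remarks(sentiment_comm_list)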
def createTable(num):
	start = time.time()
	thu = thulac.thulac()
	file = open('agri_economic.json', encoding='utf-8')
	print("begin!")
	f = json.load(file)
	count = 0
	file_text = ""
	for p in f:
		count += 1
		if int(count/100) != num:
			continue
		if count % 10 == 0:
			cur = time.time()
			print("now id : " + str(count) + "  table size :" )
			print("Running Time : " + str(int(cur-start)) + " s......")
		detail = p['detail']
#		if len(detail) > 600:
#			detail = detail[0:600]
		title = p['title']
		# word segmentation
		text = thu.cut(detail)
		wordList = createWordList(text)
		file_text += title
		for word in wordList:
			file_text += ' ' + word
		file_text += '\n'
				
	file_object = open('article'+str(num)+".txt",'w')
	file_object.write(file_text)
	file_object.close()
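createWordList is not shown in this listing. A minimal sketch, assuming thu.cut(detail) returns the usual thulac list of [word, tag] pairs and that only the word strings are kept (the later createTable variant's createWordSet would be the same idea, returning a set instead), might be:

def createWordList(text):
	# hypothetical helper: keep only the word part of each [word, tag] pair
	return [pair[0] for pair in text]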
 def count_word_frq(self):
     self.thu =  thulac("-input cs.txt")
     comm_list = self.hotel_dao.get_remarks()
     sentiment_comm_list = []
     i = 0
     for comm in comm_list:
         a_dict = {}
         try:
             cut_comm = map(lambda x: x.split("_"), self.thu.cut(comm[2].encode("utf-8")))
         except:
             cut_comm = []
             print comm[2]
             traceback.print_exc()
         for word in cut_comm:
             if word[1].decode("utf-8") == "a":
                 if word[0].decode("utf-8") not in a_dict:
                     a_dict[word[0].decode("utf-8")] = 1
                 else:
                     a_dict[word[0].decode("utf-8")] += 1
         comm = {"guid":comm[0], "word_freq":json.dumps(a_dict, ensure_ascii=False)}
         sentiment_comm_list.append(comm)
         if len(sentiment_comm_list)==10000:
             i+=1
             print "update %d time"%i
             self.hotel_dao.update_hotel_comm_word_freq(sentiment_comm_list)
             sentiment_comm_list = []
Example No. 4
    def preprocess(self, filepath, char_or_word='char'):
        cleaned_data = list()
        cleaned_label = list()
        with open(filepath, 'rb') as fr:
            for line in fr:
                items = line.strip().split('\t')
                label = items[0]
                review = items[1].decode()
                if char_or_word == 'word':
                    import thulac
                    cutter = thulac.thulac(seg_only=True, T2S=True, filt=True)
                    words = cutter.cut(review)
                    if len(words) < 1:
                        continue
                    words, _ = zip(*words)
                elif char_or_word == 'char':
                    words = list(review)
                else:
                    raise ValueError('You must make sure the value of '
                        '[char_or_word] is either char or word')

                if words in ([''], [' ']):
                    continue
                words = map(lambda kk: kk.decode(), words)
                cleaned_data.append(words)
                cleaned_label.append(int(label))

        return cleaned_data, cleaned_label
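A note on the 'word' branch above: thulac.thulac() loads its model at construction time, so building the cutter inside the per-line loop reloads the model for every review. A sketch of the usual restructuring (an assumption, not taken from the source; the method name preprocess_word is hypothetical) builds it once:

    def preprocess_word(self, filepath):
        # build the segmenter once and reuse it for every line
        import thulac
        cutter = thulac.thulac(seg_only=True, T2S=True, filt=True)
        cleaned_data, cleaned_label = list(), list()
        with open(filepath, 'rb') as fr:
            for line in fr:
                items = line.strip().split(b'\t')
                label, review = int(items[0]), items[1].decode()
                words = cutter.cut(review)  # same instance, no reload
                if len(words) < 1:
                    continue
                words, _ = zip(*words)
                cleaned_data.append(list(words))
                cleaned_label.append(label)
        return cleaned_data, cleaned_label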
def createTable(num):
	start = time.time()
	thu = thulac.thulac()
	file = open('agri_economic.json', encoding='utf-8')
	print("begin!")
	table = set()
	f = json.load(file)
	count = 0
	file_text = ""
	for p in f:
		count += 1
		if int(count/2000) != num:
			continue
		if count % 10 == 0:
			cur = time.time()
			print("now id : " + str(count) + "  table size :" + str(len(table)))
			print("Running Time : " + str(int(cur-start)) + " s......")
		detail = p['detail']
#		if len(detail) > 600:
#			detail = detail[0:600]
		title = p['title']
		table.add(title)
		# word segmentation
		text = thu.cut(detail)
		table = table | createWordSet(text)
				
	for t in table:
		file_text += t+' '
	file_object = open('table'+str(num)+".txt",'w')
	file_object.write(file_text)
	file_object.close()
Example No. 6
	def __init__(self):
		self.tfidf = joblib.load('predictor/model/tfidf.model')
		self.law = joblib.load('predictor/model/law.model')
		self.accu = joblib.load('predictor/model/accu.model')
		self.time = joblib.load('predictor/model/time.model')
		self.batch_size = 1
		
		self.cut = thulac.thulac(seg_only = True)
Example No. 7
def split(label='20020101am', output='20020101am_split.txt', \
		input='20020101am_clean.txt', lac = thulac.thulac(seg_only=True)):
#f_stopwords = open('./list/stop_words.txt','r')
#	stopwords = []
#	for line in f_stopwords.readlines():
#		stopwords.append(line.strip())
#	f_stopwords.close()
	lac.cut_f(input, output)
	print 'DONE: ', label
Example No. 8
def ch_seg_line(eachline):

    seg_line = ""

    thu1 = thulac.thulac(seg_only=True)  #only split but not tag
    seg_line = thu1.cut(eachline, text=True)  #splitted input_ch
    seg_line = seg_line.strip()

    return seg_line
Example No. 9
 def __init__(self,
              cwd=".",
              tfidf='statement_tfidf.model',
              gbt='statement_som_gbt.model'):
     print('train tfidf...', self.print_mem())
     self.tfidf = joblib.load(os.path.join(cwd, tfidf))
     print('train gbt...', self.print_mem())
     self.gbt = joblib.load(os.path.join(cwd, gbt))
     self.cut = thulac.thulac(seg_only=True)
Example No. 10
def CutArticle(article):
    file = open(article, 'rb')
    data = file.read().decode('utf-8')
    file.close()
    thu = thulac.thulac()
    text = thu.cut(data)
    length = len(text)
    demo = createGenerator(text)
    return demo, length
Example No. 11
def cut(content, method=1):
    """seg"""
    if method == 0:
        import thulac
        thu1 = thulac.thulac(seg_only=True, filt=True)
        words = thu1.cut(content, text=True)  # segment a single sentence
    else:
        words = content
    return words
Example No. 12
def testrmSpace():
	test_text1 = "而荔 波 肉又 丧 心 病 狂 的不肯悔改"
	test_text2 = "我爱北京天 安 门"
	thu = thulac.thulac(seg_only = True, rm_space = False)
	gold1 = thu.cut(test_text1, text = True)
	gold2 = thu.cut(test_text2, text = True)
	print(gold1, gold2)
	assert gold1 == "而 荔 波 肉 又 丧 心 病 狂 的 不 肯 悔改"
	assert gold2 == "我 爱 北京 天 安 门"
Example No. 14
def thulac(source_text):
    """THULAC segmentator."""
    url = 'http://localhost:5000/'
    data = {'source_text': source_text}
    # resp = requests.post(url, data=data)
    # return resp.text.split(' ')
    thu = thulacSeg.thulac(seg_only=True, model_path="thulac/models/")
    segtxt = [x[0] for x in thu.cut(source_text)]
    return segtxt
 def seg_pos(self, text):
     thu4car = thulac.thulac(user_dict= self.user_dict, seg_only=True)
     postagger = Postagger()
     postagger.load(self.pos_file)
     item = thu4car.cut(text, text=True)
     words = item.split(' ')
     postag = postagger.postag(words)
     print('|'.join([w+'_'+p for w, p in zip(words, postag)]))
     return words,postag
Example No. 17
 def __init__(self,filename):
     fin = open(filename,'r')
     self.text = fin.read()
     mthulac = thulac.thulac(user_dict=None, model_path=None, T2S=False, seg_only=False, filt=False)  # default mode
     lst = mthulac.cut(self.text)
     self.words = []
     for w in lst:
         self.words.append((w[0],w[1]))
     fin.close()
Example No. 18
def main(args):
    with open(args.input, encoding='utf8') as f:
        lines = f.read().splitlines()
        if args.format == 'lines':
            lines.append('<song>')
        tot = len(lines)
    parsed_line = []
    thu1 = thulac.thulac(seg_only=True)
    with open(args.output, encoding='utf8', mode='w') as f:
        cnt = 0
        for line in lines:
            if args.format == 'lines':
                line = HanziConv.toSimplified(line)
            if cnt % 100 == 0:
                print('status: %d/%d' % (cnt, tot))
            cnt += 1
            if line == '<song>':
                if len(parsed_line) == 0:
                    continue
                n = len(parsed_line)
                # cap the total length of each output line at maxlen
                for i in range(n):
                    l = len(parsed_line[i])
                    if l > args.maxlen:
                        continue
                    ctrl_list = parsed_line[i]
                    for k in range(i + 1, n + 1):
                        if k == n or l + len(parsed_line[k]) + 1 > args.maxlen:
                            f.write(' '.join(ctrl_list) + '\n')
                            break
                        ctrl_list.append('<lbreak>')
                        ctrl_list += parsed_line[k]
                        l += len(parsed_line[k]) + 1
                parsed_line = []
                continue
            # segment with thulac or jieba
            if args.segment == 0:
                seg_list = jieba.lcut(line)
            else:
                seg_list = thu1.cut(line)
                seg_list = [t[0] for t in seg_list]
            seg_list2 = []
            for word in seg_list:
                seg_list2 += parse_segged_word(word)
            seg_list = seg_list2
            if args.segment == 0:
                seg_list2 = []
                for word in seg_list:
                    if word == '<num>':
                        seg_list2.append(word)
                    else:
                        seg_list2 += list(word)
                seg_list = seg_list2
            if len(seg_list) > 0:
                parsed_line.append(seg_list)
    print('Finished')
Example No. 19
 def test_cut_from_file(self):
     thu = thulac("-input cs.txt")
     neg_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..\\..\\main\\python\\service\\nlp\\pos.txt')
     neg_docs = codecs.open(neg_path, 'r', 'utf-8').readlines()
     for sent in neg_docs:
         try:
             thu.cut(sent.encode("utf-8"))
         except:
             print sent
             continue
Example No. 20
    def run(self):
        starttime = time.time()
        thu = thulac.thulac(seg_only=True)

        print('Thulac:')
        print("".join([(x + '/ ') for x in thu.cut(self.test_text, text=True)
                       if x not in self.stopwords]))
        endtime = time.time()
        print('time cost:' + str(round((endtime - starttime), 4)) +
              ' seconds.\n')
Example No. 21
 def __init__(self, word2VecModelFilePath='Data/wiki_han_word2vec_300维度.model', weightpara=1e-3, isRemovePc=1, isUseThulac=True):
     self.weightpara = weightpara
     self.isRemovePc = isRemovePc
     self.model = Word2Vec.load(word2VecModelFilePath)
     self.word_index_map = {}
     for index, word in enumerate(self.model.wv.index2entity):
         self.word_index_map[word] = index
     self.vectors = self.model.wv.vectors
     self.isUseThulac = isUseThulac
     if isUseThulac == True:
         self._thulac = thulac.thulac(seg_only=True)
Example No. 22
 def __init__(self,
              pos_tags: bool = False,
              simplify: bool = False,
              filt: bool = False,
              only_tokens: bool = True,
              user_dict: str = None) -> None:
     if pos_tags:
         seg_only = False
     else:
         seg_only = True
     if user_dict and os.path.exists(user_dict):
         self.thunlp = thulac.thulac(seg_only=seg_only,
                                     T2S=simplify,
                                     filt=filt,
                                     user_dict=user_dict)
     else:
         self.thunlp = thulac.thulac(seg_only=seg_only,
                                     T2S=simplify,
                                     filt=filt)
     self._only_tokens = only_tokens
Example No. 23
def Segmentation(wiki_file, Words_dict, output_text):
    thu1 = thulac.thulac(user_dict=Words_dict, seg_only=True)
    input_file = open(wiki_file, 'r', encoding="UTF-8")
    output_object = open(output_text, 'w', encoding="UTF-8")
    for line in tqdm(input_file.readlines()):
        text = line.replace(" ", "")
        t = thu1.cut(text, text=True)
        output_object.write(t + "\n")
    input_file.close()
    output_object.close()
    print("清华分词处理完成")
Example No. 24
 def __init__(self):
     self.user_dict = None
     self.model_path = None  # None means the default model path
     self.T2S = True  # Traditional-to-Simplified conversion
     self.seg_only = True  # segmentation only
     self.filt = False  # filter out stop words
     self.tokenizer = thulac.thulac(user_dict=self.user_dict,
                                    model_path=self.model_path,
                                    T2S=self.T2S,
                                    seg_only=self.seg_only,
                                    filt=self.filt)
Example No. 25
def cut_text(alltext):
    count = 0
    cut = thulac.thulac(seg_only=True)
    train_text = []
    for text in alltext:
        count += 1
        if count % 2000 == 0:
            print(count)
        train_text.append(cut.cut(text, text=True))

    return train_text
Example No. 26
 def __init__(self):
     '''
     prop_dic is a dict holding every property name in the knowledge base and its frequency
     char_2_prop is an inverted index from characters to properties, used to speed up fuzzy matching
     '''
     self.prop_dic = pickle.load(open('../data/prop_dic.pkl', 'rb'))  # keys are not quoted
     self.char_2_prop = pickle.load(open('../data/char_2_prop.pkl', 'rb'))
     self.segger = thulac.thulac()
     self.question2mention = pickle.load(
         open('../data/question_2_mention.pkl', 'rb'))
     print('prop extractor loaded')
Example No. 27
def cut_qts_to_words(qts_file, saved_words_file):
  save_dir = os.path.dirname((saved_words_file))
  dumped_file = os.path.join(save_dir, 'qts_words_stat_result.pkl')

  if os.path.exists(dumped_file) and os.path.exists(saved_words_file):
    print('find preprocessed static, loading directly...')
    with open(dumped_file, 'rb') as f:
      char_counter, author_counter, vocab, word_counter, genre_counter = pickle.load(f)
  else:
    char_counter = Counter()  # character frequency counts
    author_counter = Counter()  # number of poems per author
    vocab = set()  # vocabulary
    word_counter = Counter()  # word frequency counts
    genre_counter = defaultdict(Counter)  # one Counter per POS tag

    fid_save = open(saved_words_file, 'w', encoding = 'utf-8')
    lex_analyzer = thulac.thulac()  # tokenizer
    line_cnt = 0
    with open(qts_file, 'r', encoding = 'utf-8') as f:
      for line in f:
        text_segs = line.split()
        author = text_segs[2]
        author_counter[author] += 1

        poem = text_segs[-1]
        # remove non-Chinese characters
        valid_char_list = [c for c in poem if '\u4e00' <= c <= '\u9fff' or c == ',' or c == '。']
        for char in valid_char_list:
          char_counter[char] += 1

        regularized_poem = ''.join(valid_char_list)
        word_genre_pairs = lex_analyzer.cut(regularized_poem)

        word_list = []
        for word, genre in word_genre_pairs:
          word_list.append(word)
          vocab.add(word)
          word_counter[word] += 1
          genre_counter[genre][word] += 1

        save_line = ' '.join(word_list)
        fid_save.write(save_line + '\n')

        if line_cnt % 10 == 0:
          print('%d poets processed.' % line_cnt)
        line_cnt += 1

    fid_save.close()
    # save the statistics to disk
    dumped_data = [char_counter, author_counter, vocab, word_counter, genre_counter]
    with open(dumped_file, 'wb') as f:
      pickle.dump(dumped_data, f)

  return char_counter, author_counter, genre_counter
Example No. 28
def clean_with_tf_idf(in_file_name, tf_idf_name):
    thu0 = thulac.thulac()
    data = json.load(open(in_file_name, encoding='utf-8'))
    tf_idf = json.load(open(tf_idf_name, encoding='utf-8'))
    for index_, item_ in enumerate(data):
        text_ = item_['text']
        label_ = item_['merged_label']
        tmp_ = ' '.join([c[0] for c in thu0.fast_cut(text_) if c[0] in tf_idf[str(label_)]])
        item_['text'] = tmp_
    json.dump(data, open('okoo-merged-clean-cut-data.json', 'w', encoding='utf-8'), ensure_ascii=False, indent=2,
              separators=(',', ': '))
Example No. 29
    def __init__(self):
        self.tfidf = joblib.load(
            os.path.join(data_path, 'generated/tfidf.model'))
        self.law = joblib.load(os.path.join(data_path, 'generated/law.model'))
        self.accu = joblib.load(os.path.join(data_path,
                                             'generated/accu.model'))
        self.time = joblib.load(os.path.join(data_path,
                                             'generated/time.model'))
        self.batch_size = 1

        self.cut = thulac.thulac(seg_only=True)
def parse_args():
    parser = argparse.ArgumentParser(description='Check pairwise similarities of .docx files.')
    parser.add_argument('--dir', help='the directory containing .docx files', required=True)
    parser.add_argument('--out', help='the output file path', required=True)
    parser.add_argument('--hash-width', help='the word length of a hashing block (default 8)', type=int, default=8)
    parser.add_argument('--hash-step', help='the word step between hashing block (default 1)', type=int, default=1)
    parser.add_argument('--sample-cnt', help='sample count (default 1000)', type=int, default=1000)

    args = parser.parse_args()
    args.thu = thulac.thulac(seg_only=True)
    return args
 def __init__(self):
     with open("chengyu_index_r.json", "r") as json_file:
         data = json_file.read()
         self.data_dict = json.loads(data)
         # for Chinese segmentation
         self.thu = thulac.thulac(
             user_dict=None,
             model_path=None,
             T2S=False,  # Traditional to Simplified
             seg_only=True,  # segmentation only
             filt=True,  # filter out meaningless words
             deli='_')  # delimiter between word and POS tag
Example No. 32
def file_tag(src_dir, dst_dir, user_dict_path):
    src_dir = pathlib.Path(src_dir)
    dst_dir = pathlib.Path(dst_dir)

    seg = thulac.thulac(user_dict=user_dict_path)

    for src_file in src_dir.glob('*.txt'):
        assert src_file.is_file(), "Not found {0}".format(src_file)

        tag_path = pathlib.Path(dst_dir / src_file.name)

        # read file and process
        fr = src_file.open(mode='r')
        contents = []
        for line in fr.readlines():

            # skip blank lines
            if len(line.strip()) == 0:
                continue
            line = content_process(line)
            contents.append(line)

        # word tag
        with tag_path.open('w') as fw:
            for content in contents:
                for seg_clip in seg.cut(content):
                    words, tag = seg_clip[0], seg_clip[1]
                    word_status = 'o'
                    for word in words:
                        print(words, len(words))
                        # filter url
                        if tag in ['nz', 'ns', 'n', 'ni', 'uw'
                                   ] and len(words) > 1 and len(words) < 10:
                            if word_status in ['IT', 'o']:
                                fw.write(word + ' ' + tag_map['n_begin'] +
                                         '\n')
                                word_status = 'BE'
                            else:
                                fw.write(word + ' ' + tag_map['n_internal'] +
                                         '\n')
                                word_status = 'IE'
                        elif tag in ['v', 'vn'
                                     ] and len(words) > 1 and len(words) < 10:
                            if word_status in ['IE', 'o']:
                                fw.write(word + ' ' + tag_map['v_begin'] +
                                         '\n')
                                word_status = 'BT'
                            else:
                                fw.write(word + ' ' + tag_map['v_internal'] +
                                         '\n')
                                word_status = 'IT'
                        else:
                            fw.write(word + ' ' + tag_map['other'] + '\n')
def preprocess(dataset: str):
    global segment_tool, dictionary
    print('Loading Segment Model...')
    segment_tool = thulac(rm_space=True)
    print('Loading dictionary')
    dictionary = set(map(lambda s: s.rstrip('\n'), open('dataset/dictionary.txt', encoding='utf-8').readlines()))

    dataset_list = (['train', 'test'], [dataset])
    for dataset_type, dataset_name in product(*dataset_list):
        with open('dataset/%s/%s_seg.txt' % (dataset_name, dataset_type), 'w', encoding='utf-8') as f:
            for line in handle_data('dataset/%s/%s.txt' % (dataset_name, dataset_type)):
                f.write(json.dumps(line, ensure_ascii=False) + '\n')
Example No. 34
def thulac_pos(string):
    print('THULAC的分词和词性标注:')
    num = len(string)
    print(num)
    start_time = datetime.now()
    for s in string:
        seg = thulac.thulac()  # load the model (a user dictionary may be supplied)
        pos_list = seg.cut(s)
    all_time = (datetime.now() - start_time).total_seconds()
    avg = all_time / num
    print('pos_tag time used: {} sec'.format(avg))
    print('\n\n')
Example No. 35
def main():
	files = getfiles(ROOT_ARTICLE)
	thu = thulac.thulac(seg_only=True)
	out_sentence = open(SENTENCE_FILE,'w')
	out_sentence_split = open(SENTENCE_SPLIT_FILE,'w')
	for f in tqdm(files):
		with open(f) as r:
			read = r.read()
			preprocessed_s = preprocess(read)
			preprocessed_ss = [thu.cut(sen, text=True) for sen in preprocessed_s]
			out_sentence.write('\n'.join(preprocessed_s)+'\n')
			out_sentence_split.write('\n'.join(preprocessed_ss)+'\n')
Example No. 36
def run_write():
    logger.info("write file 启动,加载数据...")
    update_path()
    set_logger_file()
    logger.info("加载jvm...")
    jpype.startJVM(common_keys.JVM_PATH, "-Djava.class.path=" + common_keys.JAR_PATH)
    logger.info("加载切词器...")
    thu1 = thulac.thulac(model_path=common_keys.THULAC_MODEL_PATH)

    file=create_single_file()

    write_file(file,thu1)
Example No. 37
def cut():
    cutmodel = thulac.thulac()
    train_dirs = os.listdir("data/train/")
    ans = {}
    for dir in train_dirs:
        if dir.endswith('.txt'):
            train = open("data/train/" + dir, 'r', encoding='utf-8').read()
            res = cutmodel.cut(train)
            ans[dir] = res
            print(dir + ' Done!')
    with open("data/cutresult.json", 'w') as f:
        json.dump(obj=ans, fp=f)
    f.close()
Example No. 38
def get_thulac_result(sentences):
    """
    Ref to: http://thulac.thunlp.org/
    Install by: `pip install thulac`
    """
    import thulac
    preds = []
    lac = thulac.thulac(seg_only=True)
    for sentence in sentences:
        sent_seg = lac.cut(sentence, text=True)
        sent_seg = to_unicode(sent_seg)
        preds.append(sent_seg)
    return preds
Example No. 39
def _tokenize(user_dict=None):
    tokenizer = thulac.thulac(user_dict=user_dict)

    def _tokenize(text):
        words = []
        pos = []
        pairs = tokenizer.cut(text)
        for pair in pairs:
            words.append(pair[0])
            pos.append(pair[1])
        return words, pos

    return _tokenize
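A brief usage sketch for the factory above (the sample sentence is arbitrary):

tokenize = _tokenize()
words, pos = tokenize("我爱北京天安门")
print(words)  # segmented words
print(pos)    # the matching POS tags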
Example No. 40
def cut_words():
    cutter = thulac.thulac(T2S=True, seg_only=True)
    with open('../data/reviews/reviews.txt', 'rb') as fr:
        sys.stdout.write('\r\rStart')
        sys.stdout.flush()
        fw = open('../data/reviews/cut_reviews.txt', 'wb')
        for i, line in enumerate(fr):
            items = line.strip().decode()
            words = cutter.cut(items)
            if len(words) < 2:
                continue
            words, _ = zip(*words)
            fw.write(' '.join(words) + '\n')
            sys.stdout.write('\r\rFinish %d' % i)
            sys.stdout.flush()
        fw.close()
Example No. 41
def main():
    src_dir = "/home/zihao/Text"
    out_dir = "/home/zihao/segText2"

    intput_filename_list = os.listdir(src_dir)
    intput_filename_list = [f for f in intput_filename_list if f.endswith(".txt")]

    thu = thulac.thulac("-seg_only")

    counter = 0
    for name in intput_filename_list:
        segText(thu, name, src_dir, out_dir)
        counter += 1
        if counter % 1000 == 0:
            print("Done {} passages.".format(counter))
            localtime = time.asctime( time.localtime(time.time()))
            print "Time :", localtime, "\n"

    print("All done!!")
 def set_sentiment_and_viewpoint(self):
     self.thu = thulac("")
     comm_list = dao.get_hotel_comments()
     sentiment_comm_list = []
     for comm in comm_list:
         if comm[7] is None or comm[8] is None:
             sentiment_value = None
             viewpoint = None
             try:
                 sentiment_value = self.hotelnlp.sentiment(comm[2].encode("utf-8"))
             except:
                 print comm[2]
                 traceback.print_exc()
             try:
                 viewpoint = self.hotelnlp.viewpoint(comm[2].encode("utf-8"),decoding="utf-8")
                 viewpoint = json.dumps(viewpoint, ensure_ascii=False)
             except:
                 print comm[2]
                 traceback.print_exc()
             comm = {"guid":comm[0], "senti_value":sentiment_value, "viewpoint":viewpoint}
             sentiment_comm_list.append(comm)
     print len(sentiment_comm_list)
     dao.update_hotel_comm(sentiment_comm_list)
Example No. 43
import os
import config
import json
import thulac

index = "law_thulac"
doc_type = "big_data"
dir_path = "/mnt/new/"
model_path = "/home/zhx/elasticsearch-5.5.2/plugins/thulac/models"

server_dir = os.path.dirname(os.path.realpath(__file__))
config_file = os.path.join(server_dir, 'config.py')
local_config_file = os.path.join(server_dir, 'local_config.py')

cutter = thulac.thulac(seg_only=True, model_path=model_path, T2S=True)


def cut(text):
    res = cutter.cut(text.encode('utf8'))
    result = ""
    first = True
    for x in res:
        if first:
            first = False
        else:
            result = result + " "
        result = result + x[0]
    return result


if __name__ == '__main__':
Example No. 44
#coding:utf-8

import thulac

thu1 = thulac.thulac(seg_only=True, model_path="请查看README下载相关模型放到thulac根目录或在这里写路径")  # segmentation-only mode
a = thu1.cut("我爱北京天安门")

print(a)
Example No. 45
import sys
import os
import json
import xlrd
reload(sys)
sys.setdefaultencoding('utf-8')
import logging
logging.basicConfig(level=logging.INFO,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S')
import thulac
from collections import defaultdict

# filt: remove useless (stop) words
# T2S: convert Traditional Chinese to Simplified
cutter = thulac.thulac(seg_only=True, T2S=True)


# Parse the sentiment lexicon from DUT (Dalian University of Technology)
def DUTParser():
	dut = xlrd.open_workbook(u'../dict/sentimentwords/DUT/file/情感词汇本体.xlsx')
	sheet1 = dut.sheet_by_index(0)
	words = sheet1.col_values(0)
	word_senses_num = sheet1.col_values(2)
	word_emotion_strengths = sheet1.col_values(5)
	word_emotion_types = sheet1.col_values(6)
	dut_dict = defaultdict(int)
	for i, word in enumerate(words):
		if i == 0:
			continue
		word = str(word).decode()
Example No. 46
#coding=utf-8

import thulac

thu1 = thulac.thulac("-seg_only")  #设置模式为行分词模式
thu1.run() #根据参数运行分词程序,从屏幕输入输出
print " ".join(thu1.cut("我爱北京天安门")) #进行一句话分词

#==============================================
thu2 = thulac.thulac("-input cs.txt") #设置模式为分词和词性标注模式
thu2.run() #根据参数运行分词和词性标注程序,从cs.txt文件中读入,屏幕输出结果
print " ".join(thu2.cut("我爱北京天安门")) #进行一句话分词和词性标注
Example No. 47
def testUserDict():
	test_text = "我爱北京天安门"
	thu = thulac.thulac(seg_only = True, user_dict = prefix + "/userDict.txt")
	gold = thu.cut(test_text, text = True)
	assert gold == "我爱北京天安门"
Example No. 48
#coding: utf-8
import thulac
import sys
prefix = sys.path[0]

thu = thulac.thulac(seg_only = True)

def readFile(file_name):
    with open(file_name) as result:
        for line in result:
            return line

def testCutFile():
    thu.cut_f(prefix +"/textForTest/input.txt", prefix +"/textForTest/output.txt")
    print(readFile(prefix +"/textForTest/output.txt"))
    assert readFile(prefix + "/textForTest/output.txt") == "我 爱 北京 天安门\n"

def testFastCut():
    test_text = "我爱北京天安门"
    gold = thu.fast_cut(test_text, text = True)
    assert gold == "我 爱 北京 天安门"

def testFastCutFile():
    thu.fast_cut_f(prefix +"/textForTest/input.txt", prefix +"/textForTest/output.txt")
    print(readFile(prefix +"/textForTest/output.txt"))
    assert readFile(prefix +"/textForTest/output.txt") == "我 爱 北京 天安门\n"
Example No. 49
import sys
import thulac

seg_only = False

if(len(sys.argv) >= 4 and sys.argv[3] == "-seg_only"):
	seg_only = True
lac = thulac.thulac(seg_only=seg_only)
lac.cut_f(sys.argv[1], sys.argv[2])
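Invocation of the command-line script above, inferred from its sys.argv handling (the file name cut_file.py is hypothetical):

# python cut_file.py input.txt output.txt            # segmentation + POS tagging
# python cut_file.py input.txt output.txt -seg_only  # segmentation only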
 def __init__(self):
     self.dao = TuniuDAO(dao_setting["host"], dao_setting["db"], dao_setting["user"], dao_setting["password"])
     self.thu =  thulac("-input cs.txt")
     self.hotelnlp = HotelNLP()
Example No. 51
def testSegOnly():
	test_text = "我爱北京天安门"
	thu = thulac.thulac(seg_only = True)
	gold = thu.cut(test_text, text = True)
	assert gold == "我 爱 北京 天安门"
Example No. 52
def testFilt():
	test_text = "我可以爱北京天安门"
	thu = thulac.thulac(seg_only = True, filt = True)
	gold = thu.cut(test_text, text = True)
	print(gold)
	assert gold == "我 爱 北京 天安门"
Example No. 53
def testT2S():
	test_text = "我愛北京天安門"
	thu = thulac.thulac(seg_only = True, T2S = True)
	gold = thu.cut(test_text, text = True)
	print(gold)
	assert gold == "我 爱 北京 天安门"
Example No. 54
reload(sys)
import xlrd
sys.setdefaultencoding('utf-8')

import logging
logging.basicConfig(level=logging.INFO,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S')
import thulac
from collections import defaultdict
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold

cutter = thulac.thulac(T2S=True, seg_only=True)

# Load the dictionaries: negation words, degree adverbs and sentiment words
def stop_words_parser():
    # Stop words: merged from a web stop-word list, the HIT list and the Sichuan University list
    stop_words = set()
    with open(u'../dict/stopwords/中文停用词库.txt') as fr:
        for line in fr:
            item = line.strip().decode()
            stop_words.add(item)
    with open(u'../dict/stopwords/哈工大停用词表.txt') as fr:
        for line in fr:
            item = line.strip().decode()
            stop_words.add(item)
    with open(u'../dict/stopwords/四川大学机器智能实验室停用词库.txt') as fr:
        for line in fr:
Example No. 55
 def __init__(self):
     self.thul = thulac.thulac()
Example No. 56
 def __init__(self):
     self.classifier = Bayes()
     self.thu = thulac("-seg_only")
     train_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'sentiment.marshal')
     self.load(train_file)
# -*- coding: utf-8 -*-
import thulac
import csv
import sys
import os
sys.path.append("..")

from Model.neo_models import Neo4j 
from Model.mongo_model import Mongo
from toolkit.vec_API import word_vector_model
from toolkit.tree_API import TREE
	
pre_load_thu = thulac.thulac()  # default mode
print('thulac open!')

neo_con = Neo4j()   # preload Neo4j
neo_con.connectDB()
print('neo4j connected!')

predict_labels = {}   # preloaded mapping from entities to labels
filePath = os.getcwd()
with open(filePath+'/toolkit/predict_labels.txt','r',encoding="utf-8") as csvfile:
	reader = csv.reader(csvfile, delimiter=' ')
	for row in reader:
		predict_labels[str(row[0])] = int(row[1])
print('predicted labels load over!')

# load the word vectors
wv_model = word_vector_model()
#wv_model.read_vec('toolkit/vector_5.txt') # for testing only, saves loading time
#wv_model.read_vec('toolkit/vector.txt')
Example No. 58
 def test_pos(self):
     thu2 = thulac("-input cs.txt") #设置模式为分词和词性标注模式
     # thu2.run() #根据参数运行分词和词性标注程序,从cs.txt文件中读入,屏幕输出结果
     print " ".join(thu2.cut("住宿都是途牛给推荐的,杭州的两天说实话,有点偏,吃饭打车都不太方便,又赶上下雨带着孩子游玩比较费劲")).decode("utf-8")
Example No. 59
 def test_cut(self):
     s = "住宿都是途牛给推荐的,杭州的两天说实话,有点偏,吃饭打车都不太方便,又赶上下雨带着孩子游玩比较费劲。不过住的挺舒服的。南京这个酒店不知道是订单到酒店的问题,还是什么问题,第一天到了,我们要的是两间房1天,可是酒店的订单是1间房2天。而且我们在这个酒店还预定了隔一天的房间,也没有订单。给途牛打电话,说肯定没问题。结果第3天回来,没房。投诉了半天,才解决的(以后还是提前给预定的酒店打电话确认)。在扬州的住宿非常好。"
     thu = thulac("-seg_only")
     print " ".join(thu.cut(s)).decode("utf-8")
Example No. 60
def testTagAndDeli():
	test_text = "我爱北京天安门"
	thu = thulac.thulac(deli = '#')
	gold = thu.cut(test_text, text = True)
	assert gold == "我#r 爱#v 北京#ns 天安门#ns"