def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
    jieba.enable_parallel(self.num_worker)
    cleaned_sentances = [
        ' '.join(jieba.lcut(i, HMM=use_hmm)) for i in cleaned_sentances
    ]
    jieba.disable_parallel()
    return cleaned_sentances

def fast_words_count(filepath, cut_a=False, output_t=False):
    jieba.enable_parallel(cpu_count())  # enable parallel segmentation (has no effect on Windows)
    with open(filepath, 'rb') as in_text:
        all_lines = in_text.read()
    cut_line = re.sub(
        "[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*(),、;:?!…―ˉˇ〃‘'“”~‖∶"'`|〔〕〈〉《》「」『』.〖〗【】()[]{}]+"
        .decode("utf-8"), "".decode("utf-8"), all_lines.decode("utf-8"))
    words_c = Counter(list(jieba.cut(cut_line, cut_all=cut_a)))
    if output_t:
        with open(filepath + '_Count', 'wb') as out_count:
            for word, freq in words_c.most_common():
                out_count.writelines(
                    ' '.join([word, str(freq)]).encode('utf-8') + '\n')
    else:
        # otherwise write an Excel spreadsheet
        import xlwt  # Excel writing package
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Words Count')
        worksheet.write(0, 0, label='Word')
        worksheet.write(0, 1, label='Freq.')
        i = 1
        for word, freq in words_c.most_common():
            worksheet.write(i, 0, label=word)
            worksheet.write(i, 1, label=freq)
            i += 1
        workbook.save(filepath + '_Count.xls')

def split(input_path, dict_path, output_path):
    '''Load a user dictionary, remove stopwords, and segment the text.'''
    jieba.enable_parallel(4)
    # build the stop_word table
    stop_word = defaultdict(int)
    with open('/Users/zt/Desktop/project/stop_words/stop_test.txt', 'r') as stop:
        for line in stop.readlines():
            stop_word[line.rstrip("\n")] = 1
    # load the user_dict
    jieba.load_userdict(dict_path)
    # read the input text
    with open(input_path, 'r') as i:
        text = i.read()
    # segment
    seg_list = list(jieba.cut(text))
    out = ''
    for word in seg_list:
        if stop_word[word] == 0:
            out += word + ' '
    # write the result
    with open(output_path, 'w') as o:
        o.write(out)
    return print('分词成功!')

def split_sentence(input_source_table, update_table_name):
    label = set()
    # build the stopword dictionary
    stopwords = {}
    file_stop = open('stop_words.txt', 'r')
    for eachWord in file_stop:
        stopwords[eachWord.strip()] = eachWord.strip()
    file_stop.close()
    # print(stopwords)
    jieba.enable_parallel(4)  # parallel segmentation
    to_cursor.execute(extract_sql % input_source_table)
    source_data = to_cursor.fetchall()
    # fetch the labels that already exist
    to_cursor.execute('select label from label_type2')
    old_label_type = to_cursor.fetchall()
    old_label_list = []
    for old_label in old_label_type:
        old_label_list.append(old_label[0])
    old_label_list = set(old_label_list)
    print("old_label_list!!!!!!!!!!!!", old_label_list)
    for eachLine in source_data:
        content = eachLine[2]
        line = content.strip()  # strip leading/trailing whitespace and treat as Unicode
        line1 = re.sub(
            r"[0-9\s+\.\!\/_,$%^*()?;;:-【】\"\']+|[+—!,;:。?、~@#¥%…&*()]+",
            "", line)
        word_list = list(jieba.cut(line1))  # segment each line with jieba
        result_string = []
        for word in word_list:
            if len(word) < 2:
                continue
            marry_count = 0
            for character in word:
                if character in stopwords:
                    marry_count += 1
            # print(marry_count / len(word))
            if marry_count / len(word) > 0.75:
                continue
            if word not in stopwords:
                result_string.append(word)
                if word not in label and word not in old_label_list:
                    label.add(word)
        to_cursor.execute(update_sql % (input_source_table, result_string, eachLine[0]))
        print(result_string)
    for label_i in label:
        print('insert into %s VALUES("%s", NULL)' % (update_table_name + "_label_type", label_i))
        to_cursor.execute('insert into %s VALUES("%s", NULL)' % (update_table_name + "_label_type", label_i))
    to_connect.commit()

def init(self, stopwords_file=None, puncs_file=None, user_dict=None,
         silent=None, thread=None):
    # set default values; cast wrong values back to the defaults
    if not isinstance(stopwords_file, str):
        stopwords_file = SegJb.DEFAULT_STPW
    if not isinstance(puncs_file, str):
        puncs_file = SegJb.DEFAULT_PUNC
    if not isinstance(user_dict, str):
        user_dict = SegJb.DEFAULT_DICT
    if not isinstance(silent, bool):
        silent = True
    if not isinstance(thread, int):
        thread = 1
    # initialize according to the settings
    if silent:
        jieba.setLogLevel(logging.ERROR)
    jieba.initialize()
    if thread > 1:
        jieba.enable_parallel(thread)
    if user_dict != '':
        jieba.load_userdict(user_dict)
    if stopwords_file != '':
        with open(stopwords_file, encoding='utf-8') as f:
            self.stopwords = {x: '' for x in f.read().split('\n')}
    if puncs_file != '':
        with open(puncs_file, encoding='utf-8') as f:
            self.puncs = {x: '' for x in f.read().split('\n')}

def main():
    enable_parallel(24)
    rptid_search = compile("(?<=<rptid:)[^>]*(?=>)")
    content_search = compile("(?<=<content:)[^>]*(?=>)")
    image_sub = compile("\[img\][^\[\]]+\[/img\]")
    br_sub = compile("\[br\]")
    for line in stdin:
        if not line.startswith("<flag:0>"):
            continue
        line = line.strip()
        result = rptid_search.search(line)
        if not result:
            continue
        rptid = result.group(0)
        result = content_search.search(line)
        if not result:
            continue
        content = result.group(0)
        content = image_sub.sub("", content)
        content = br_sub.sub(" ", content)
        seg_set = set([seg.encode("utf-8") for seg in cut(content)])
        for word in seg_set:
            stdout.write("%s\t%s\n" % (word, rptid))

def __init__(self, of):
    self.CurrentData = ""
    self.title = ""
    self.text = ""
    self.counter = 0
    self.file = open(of, 'w')
    jieba.enable_parallel(20)

def multi_process(article_path):
    jieba.enable_parallel()
    size = 0
    articles = []
    for ii_file in os.listdir(article_path):
        if ii_file == '.DS_Store':
            continue
        article_path_f = os.path.join(article_path, ii_file)
        with open(article_path_f, 'r') as f:
            i_file = f.read()
        print '---- processing %sth article ----' % size
        try:
            sg_list = jieba.posseg.cut(i_file)
            processed_article = [
                word.word for word in sg_list
                if word.flag == 'n' and word not in stop_words
            ]
            # some articles are empty or very short and should be excluded
            if len(processed_article) < 20:
                continue
            articles.append(processed_article)
            size += 1
        except:
            print '**** 分词异常 ****'
            continue
    return articles

def processChinese(textContent):
    jieba.enable_parallel(4)
    seg_generator = jieba.cut(textContent)  # segment with jieba (optional)
    seg_list = [i for i in seg_generator if i not in stopwords]
    seg_list = [i for i in seg_list if i != u' ']
    seg_list = r' '.join(seg_list)
    return seg_list

def jieba_cut(filename):
    """Return list with jieba.cut."""
    jieba.enable_parallel(4)
    with open(filename, 'r') as f:
        data = f.read()
    lst = [i for i in jieba.cut(data)]
    return lst

def clean_data(self):
    """Clean the sentences.

    Parameters
    ----------
    self: object

    Returns
    -------
    df: columns: `cleared_words, sentiment, dataset_class, counter, word_counts, word_to_number`.
    """
    jieba.setLogLevel(20)
    jieba.enable_parallel(4)
    df = self.read_data()
    stopwords = self.read_stopwords()
    df['cut_words'] = df['review'].map(jieba.lcut)
    df['cleared_words'] = apply_by_multiprocessing(
        df['cut_words'], remove_english_punctuation, workers=4)
    df['cleared_words'] = apply_by_multiprocessing(
        df['cleared_words'], remove_chinese_punctuation, workers=4)
    df['cleared_words'] = apply_by_multiprocessing(
        df['cleared_words'], remove_stopwords, stopwords=stopwords, workers=4)
    df['counter'] = apply_by_multiprocessing(
        df['cleared_words'], Counter, workers=4)
    df['word_counts'] = apply_by_multiprocessing(
        df['cleared_words'], len, workers=4)
    columns = 'dataset_class sentiment cleared_words counter word_counts'
    df = df.loc[:, columns.split()]
    return df

def get_all_keywords(file_name):
    word_lists = []  # keyword list
    jieba.enable_parallel(8)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()  # list of text lines
    for List in Lists:
        cut_list = list(jieba.cut(List))
        for word in cut_list:
            word_lists.append(word)
    word_lists_set = set(word_lists)  # drop duplicates
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print("共有%d个关键词" % length)
    information = pd.read_excel('/Users/huazi/Desktop/zhanlang2.xlsx')
    world_number_list = []
    word_copy = []
    for w in word_lists_set:
        if len(w) == 1:
            continue
        if word_lists.count(w) > 3:
            world_number_list.append(word_lists.count(w))
            word_copy.append(w)
    information['key'] = word_copy
    information['count'] = world_number_list
    information.to_excel('sun_2.xlsx')

def __init__(self, data_path='./data/context'):
    logger.info('fastTextfeature loading corpus ...')
    self.label_list = ['Military', 'Economy', 'Culture', 'Sports', 'Auto', 'Medicine']
    # enumerate all files
    jieba.enable_parallel(8)
    self.context, self.label = [], []
    for file in tqdm(os.listdir(path=data_path)):
        try:
            label = file.split('_')[0]
            filePath = os.path.join(data_path, file)
            with open(filePath, 'r', encoding='utf-8') as fd:
                context = fd.read().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
            self.context.append(context)
            self.label.append(self.label_list.index(label))
        except:
            logger.warning('file %s have some problem ...' % file)
    self.context = [' '.join(list(jieba.cut(context))) for context in tqdm(self.context)]
    self.train_context, self.test_context, self.train_label, self.test_label = \
        train_test_split(self.context, self.label, test_size=0.05)
    train_data_fd = open('./data/fastTextData/train_data', 'w+')
    for label, context in zip(self.train_label, self.train_context):
        train_data_fd.write("__label__" + str(label) + '\t' + context + '\n')
    train_data_fd.close()
    valid_data_fd = open('./data/fastTextData/valid_data', 'w+')
    for label, context in zip(self.test_label, self.test_context):
        valid_data_fd.write("__label__" + str(label) + '\t' + context + '\n')
    valid_data_fd.close()
    logger.debug('self.train_context shape: %d' % len(self.train_context))
    logger.debug('self.test_context shape: %d' % len(self.test_context))
    logger.debug('self.train_label shape: %d' % len(self.train_label))
    logger.debug('self.test_label shape: %d' % len(self.test_label))

def print_and_sorted(self, comments_t):
    from operator import itemgetter
    import jieba.posseg as pseg
    import jieba
    jieba.enable_parallel(4)  # enable parallel segmentation; the argument is the number of processes
    cci = 0
    allow_pos = ['ns', 'n']
    for ci in comments_t:
        cci += 1
        if cci % 50 == 0:
            logging.info('pro ' + str(cci))
        words = pseg.cut(ci)
        for word, flag in words:
            if flag in allow_pos:
                if word in self.vectobi:
                    self.vectobi[word] += 1
                else:
                    self.vectobi[word] = 1
    logging.info("sorted...")
    word_freq = sorted(self.vectobi.iteritems(), key=itemgetter(1), reverse=True)
    freqfile = open('freq.txt', 'w')
    for i in word_freq:
        freqfile.write(i[0] + '\t' + str(i[1]) + '\n')
    freqfile.close()
    logging.info("length:" + str(len(word_freq)) + " writed 100 in freq.txt")

def get_datasest():
    if not os.path.exists('../data/wangyi_title_cut.txt'):
        # Chinese word segmentation
        jieba.enable_parallel()
        # one line per title
        line_num = 0
        with open('../data/wangyi_title_cut.txt', 'w') as fw:
            # punctuation is not handled
            with open("../data/wangyi_title.txt", 'r') as f:
                for line in f.readlines():
                    line = line.strip()
                    line = line.replace("\r\n", "")
                    line = line.replace(" ", "")
                    line_seg = jieba.cut(line)  # generator of tokens
                    line_seg = " ".join(line_seg)
                    line_num += 1
                    fw.write(line_seg + '\n')
        print('setence_num in raw corpus:', line_num)

    with open("../data/wangyi_title_cut.txt", 'r') as f:
        corpus = f.readlines()
    print('setence_num in corpus_seg', len(corpus))

    x_train = []
    # y = np.concatenate(np.ones(len(docs)))
    for idx, text in enumerate(corpus):
        word_list = text.strip().split(' ')
        document = TaggededDocument(word_list, tags=[idx])
        x_train.append(document)
    return x_train

def __init__(self, user_dict_file=user_dict_file_path):
    self.userdict = user_dict_file
    logger.info(u'正在加载自定义词典...')
    jieba.load_userdict(self.userdict)
    jieba.enable_parallel(8)
    logging.info(u'正在建构词袋...')
    self.load_word_bag()

def __init__(self, verbose=0):
    '''
    1. Initialize parameters.
    2. Load the user dictionary and the stopword list.

    :param verbose: the larger the value, the more detail is printed; 0 prints nothing.
    :type verbose: int
    '''
    # initialize parameters
    self.verbose = verbose
    # set the number of jieba worker processes
    jieba.enable_parallel(10)
    # -------------- region start : 2. load user dictionary and stopword list -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print '-' * 20
        logging.debug('2. 加载用户字典和stop word列表')
        print '2. 加载用户字典和stop word列表'
    # -------------- code start : begin -------------
    jieba.load_userdict(os.path.dirname(__file__) + '/userdict.txt')
    self.stopword_list = io.open(os.path.dirname(__file__) + '/stopword.txt', 'r',
                                 encoding='utf8').read().strip().split()
    self.exclude_word_list = set(['886', '88'])
    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print '-' * 20

def __init__(self, user_dict_file="TechWord.txt"): self.userdict = user_dict_file print('正在加载自定义词典...') jieba.load_userdict(self.userdict) jieba.enable_parallel(8) print('正在建构词袋...') self.load_word_bag()
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether to add part-of-speech tags
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
        print("segment ok. input file count:", count)

def __init__(self):
    self.elastic = Elastic("qabot")
    self.doc_type = "fqa"
    self.related = None
    logging.info("Bot initialized.")
    jieba.enable_parallel(4)
    jieba.initialize()

def gen_vocab(file_list, added_vocab_list, vocab_dict_file):
    jieba.enable_parallel(30)
    for vocab_file in added_vocab_list:
        jieba.load_userdict(vocab_file)
    vocab_dict = dict()
    idx = 0
    for file_name in file_list:
        logging.info("process %s" % file_name)
        with open(file_name, "r", encoding="utf-8") as fp:
            content = fp.read()
            words = jieba.cut(content)
            for w in words:
                if w in vocab_dict:
                    vocab_dict[w] += 1
                else:
                    vocab_dict[w] = 1
            #lines = fp.readlines()
            #logging.info("readlines %d"%(len(lines)))
            #for line in tqdm(lines):
            #    line = line.strip()
            #    words = jieba.cut(line)
            #    for w in words:
            #        if w in vocab_dict:
            #            vocab_dict[w] += 1
            #        else:
            #            vocab_dict[w] = 1
            #    idx += 1
    logging.info("vocab_dict len:%d" % len(vocab_dict))
    wfp = open(vocab_dict_file, "wb")
    pickle.dump(vocab_dict, wfp)
    wfp.close()

def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})
        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ", x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)
    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values, labels,
                                                        stratify=labels,
                                                        random_state=1234,
                                                        test_size=test_size,
                                                        shuffle=True)
    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]), flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(
            x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test

def word_seg():
    corpus = []
    sentences = []
    jieba.enable_parallel(40)
    word_list = []
    with open(os.path.join(DIRNAME, "baike.json"), "r") as f:
        for i, l in enumerate(f):
            print(i)
            entity = json.loads(l)
            description = entity['description']
            description_seg = []
            sentence_list = re.split(u'。|!|?|\?|!|', description)
            for sentence in sentence_list:
                cur_word_list = list(jieba.cut(sentence))
                description_seg.append(cur_word_list)
                word_list += cur_word_list
                sentences.append(cur_word_list)
            entity['description'] = description_seg
            corpus.append(entity)
    with open(os.path.join(DIRNAME, "baike_seg.json"), "w") as f:
        json.dump(corpus, f)
    with open(os.path.join(DIRNAME, "baike_sentences.json"), "w") as f:
        json.dump(sentences, f)
    word_list = list(set(word_list))
    word_list.insert(0, 0)
    with open(os.path.join(DIRNAME, "baike_word_list.json"), "w") as f:
        json.dump(word_list, f)
    word_index = {}
    for i, word in enumerate(word_list):
        word_index[word] = i
    with open(os.path.join(DIRNAME, "baike_word_index.json"), "w") as f:
        json.dump(word_index, f)

def __init__(self, n_core=16):
    self.rootdir = os.getcwd()
    self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
    self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
    jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
    self.n_CORE = n_core
    jieba.enable_parallel(self.n_CORE - 1)

def train_embedding():
    stop_list = []
    with open('./data/stop_words.txt', 'r', encoding='utf8') as fr:
        for line in fr:
            if line.strip() != ' ':
                stop_list.append(line.strip())
    print(len(stop_list))
    jieba.enable_parallel(16)
    with open('./data/medical.csv', 'r') as fr:
        reader = csv.reader(fr)
        for i in reader:
            # print(i[0])
            jieba.add_word(i[0])
    sentences = []
    with open('./data/corpus.txt', 'r', encoding='utf8') as fr:
        lines = fr.readlines()
        for line in lines:
            sentence = jieba.lcut(line.strip())
            res = []
            for i in sentence:
                if i not in stop_list:
                    res.append(i)
            if len(res) > 0:
                sentences.append(res)
    model = gensim.models.Word2Vec(sentences, size=300, window=5, min_count=0, workers=16)
    # print(model.wv.word_vec('口腔'))
    model.wv.save_word2vec_format('wv300.bin')

def __init__(self, user_dicts_directory=None, stop_word_dicts_directory=None):
    '''
    :param user_dicts_directory: directory of user dictionaries
    :param stop_word_dicts_directory: directory of stopword dictionaries
    '''
    # try parallel segmentation (works on Unix, not on Windows)
    try:
        from multiprocessing import cpu_count
        jieba.enable_parallel(cpu_count() - 1)
    except:
        pass

    # load the user dictionaries
    if user_dicts_directory is not None:
        filenames = os.listdir(user_dicts_directory)
        for filename in filenames:
            jieba.load_userdict(user_dicts_directory + filename)

    # load the stopword dictionaries
    self.stop_words = set()
    if stop_word_dicts_directory is not None:
        filenames = os.listdir(stop_word_dicts_directory)
        for filename in filenames:
            for line in open(stop_word_dicts_directory + filename, encoding='utf-8-sig'):
                self.stop_words.add(line.strip())

def CreateTagsByID(weibo_dir, dest_dir, emoji_path, customized_path, stopwords_path, topK=40):
    '''
    Walk every file in the scraped_data folder, aggregate the posts of each user ID,
    extract TF-IDF tags for that ID, and save them in the user_tags folder.

    Args:
        weibo_dir: directory of the scraped Weibo post data
        dest_dir: output directory for each user's aggregated TF-IDF tags
        emoji_path: path of the custom emoticon dictionary (after merging)
        customized_path: path of the custom user dictionary (after merging)
        stopwords_path: path of the stopword list (after merging)
        topK: keep at most topK tags
    Returns:
    '''
    jieba.load_userdict(emoji_path)
    jieba.load_userdict(customized_path)
    analyse.set_stop_words(stopwords_path)
    jieba.enable_parallel()
    for filename in os.listdir(weibo_dir):
        res = defaultdict()
        if filename[0] == ".":
            continue
        raw_data = pd.read_csv(weibo_dir + filename)
        text_per_uid = raw_data.groupby("uid")["weibotxt"].sum()
        for idx in text_per_uid.index:
            res[str(idx)] = jieba.analyse.extract_tags(text_per_uid[idx], topK=topK)
        with open(dest_dir + filename[:-4] + ".json", "w") as f:
            json.dump(res, f)

def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]
    return corpus

def __init__(self):
    self.CURRENT_PATH = path.dirname(path.abspath(__file__))
    self.NW_PATH = path.join(self.CURRENT_PATH, 'words/wc_cn/newwords.txt')
    self.SW_PATH = path.join(self.CURRENT_PATH, 'words/wc_cn/stopwords.txt')

    # add user dict
    jieba.load_userdict(self.NW_PATH)  # must before jieba.enable_parallel
    jieba.enable_parallel(4)

def get_words(sentence_list):
    words = []
    jieba.enable_parallel(8)
    for raw in sentence_list:
        result = postag.cut(raw)
        raw = [x.word for x in result if (len(x.word) > 1 and 'n' in x.flag)]
        words += raw
    return words

def __init__(self, vocabulary, labels, model):
    jieba.enable_parallel(multiprocessing.cpu_count())
    self.model = C.load_model(model)
    self.vocab = get_vocab(vocabulary)
    self.x_dim = len(self.vocab)
    self.y_dim = get_size(labels)
    self.x = C.sequence.input_variable(self.x_dim, is_sparse=True)
    self.model = self.model(self.x)
    self.predictor = C.argmax(self.model)

def save_jieba_result():
    # enable parallel segmentation
    jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), '../pjl_comment.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = " ".join(jieba.cut(comment_text))  # join the jieba tokens into a space-separated string
    with codecs.open('pjl_jieba.txt', 'a', encoding='utf-8') as f:
        f.write(cut_text)

def cutall():
    if request.method == 'POST':
        text = request.form.get('text')
        text = text.strip()
        jieba.enable_parallel(4)
        seg_list = jieba.cut(text, cut_all=True)
        return jsonify({'data': list(seg_list)})
    return render_template('api/cutall.html')

def cut_word(sentence, parallel=False, processnum=2):
    if parallel:
        # enable parallel segmentation; the argument is the number of processes (not supported on Windows)
        jieba.enable_parallel(processnum=processnum)
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
        # disable parallel segmentation
        jieba.disable_parallel()
    else:
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
    return word_list

def __init__(self):
    # read stopwords
    stopwords_file = open("slave/parsers/stopwords")
    # stopwords_file = open("stopwords")
    stopwords = []
    for line in stopwords_file.readlines():
        stopwords.append(line.strip())
    self.stopwords = set(stopwords)
    jieba.enable_parallel(4)

def __init__(self, of, data, questionList, model):
    self.CurrentData = ""
    self.title = ""
    self.text = ""
    self.counter = 0
    self.file = open(of, 'wb')
    self.data = data
    self.model = model
    self.questionList = questionList
    self.pages = []
    jieba.enable_parallel(20)

def cutText(inTxt='/Users/v_niur/Desktop/py/bb/tmp/simple.txt',
            outTxt='/Users/v_niur/Desktop/py/bb/tmp/cut_simple.txt'):
    """Segment a Chinese text file and write the result to another file."""
    jieba.enable_parallel(4)
    startTime = time.time()
    content = open(inTxt, "rb").read()  # read the whole file into one string
    words = list(jieba.cut(content))
    endTime = time.time()
    timeCost = endTime - startTime
    with open(outTxt, 'w+') as f:
        for w in words:
            # print >> f, w.encode("utf-8"), "/",
            print >> f, w.encode("utf-8"), ' ',
    print 'speed', len(content), "bytes/second"

def use_jieba(self):
    import jieba.posseg as pseg
    import jieba
    jieba.enable_parallel(4)  # enable parallel segmentation; the argument is the number of processes
    cuted_t = []
    allow_pos = ['ns', 'n', 'vn', 'v']  # part-of-speech tags to keep
    for i in self.title_content:
        words = pseg.cut(i)
        flit_words = []
        for word, flag in words:
            if flag in allow_pos:
                flit_words.append(word)
        cuted_t.append(flit_words)
    return cuted_t

def __init__(self, processnum=1):
    logger.info('Initializing jieba...')
    jieba.initialize()
    logger.info('Successfully initialized jieba.')
    if processnum == 0:
        processnum = multiprocessing.cpu_count()
    if processnum > 1:
        logger.info(
            'jieba running in parallel mode with %d processes.',
            processnum
        )
        jieba.enable_parallel(processnum)

def prepare_data():
    print "----- Preparing Data -----"
    import cPickle
    import jieba
    # load data, and make into DataFrame df
    tags, texts, uniq_labels = make_news_data()
    df = pd.DataFrame({'label': tags, 'txt': texts})
    # jieba segmentation and serialization
    jieba.enable_parallel(4)
    df['seg_word'] = df.txt.map(lambda x: ' '.join(jieba.cut(x)))
    # tested: a DataFrame containing utf-8 text can be dumped & loaded through a file opened with plain open()
    cPickle.dump(df, open('df.pickle', 'wb'))
    # df = cPickle.load(open('df.pickle', 'rb'))
    return df

def init(jieba_parallel=False):
    # load English/Chinese stopwords, from nltk and zhon respectively
    global english_stopwords, chinese_stopwords
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    chinese_stopwords = {word[:-1] for word in codecs.open("stopwords.txt", "r", encoding="utf-8")}

    # set jieba's log level
    jieba.setLogLevel("INFO")
    # set jieba's dictionary file
    jieba.set_dictionary("./jieba_dict.txt")
    # change jieba's temporary working directory
    jieba.tmp_dir = os.getcwd()
    # enable parallel segmentation with one process per CPU core
    if jieba_parallel:
        jieba.enable_parallel()

    config.log.info("module algorithm has initialized successfully.")

def use_jieba():
    import jieba
    import codecs
    import re
    k = 0
    data_file = codecs.open('train.txt', 'w', 'utf-8')
    data_paragraph_name = codecs.open('paragraph_name.txt', 'w', 'utf-8')
    logging.info("start to write paragraph...")
    jieba.enable_parallel(4)  # enable parallel segmentation; the argument is the number of processes
    for i in title_content:
        data_file.write('_*' + str(k) + ' ')
        data_paragraph_name.write(title_name[k] + '\t' + title_id[k] + '\n')
        words = " ".join(jieba.cut(
            re.sub(u'\s|!|。|,|“|”|(|)|《|》|\(|\)|:|、', ' ', i), cut_all=False))
        data_file.write(words + u'\n')
        k += 1
    data_file.close()
    data_paragraph_name.close()

def __init__(self, num_of_parallel=10, verbose=0):
    """
    1. Initialize parameters.
    2. Load the user dictionary and the stopword list.

    Parameters
    ----------
    num_of_parallel : int
        number of parallel worker processes
    verbose : int
        the larger the value, the more detail is printed; 0 prints nothing.
    """
    # initialize parameters
    self.verbose = verbose
    # set the number of jieba worker processes
    jieba.enable_parallel(num_of_parallel)
    # -------------- region start : 2. load user dictionary and stopword list -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print '-' * 20
        logging.debug('2. 加载用户字典和stop word列表')
        print '2. 加载用户字典和stop word列表'
    # -------------- code start : begin -------------
    # region add user dictionaries
    jieba.load_userdict(os.path.dirname(__file__) + '/userdict.txt')
    # add 261,529 user entries (scraped from the online Xinhua dictionary)
    jieba.load_userdict(os.path.dirname(__file__) + '/vocabulary_len2_xiandaihanyu.txt')
    # endregion
    self.stopword_list = io.open(os.path.dirname(__file__) + '/stopword.txt', 'r',
                                 encoding='utf8').read().strip().split()
    self.exclude_word_list = set(['886', '88'])
    # -------------- code start : end -------------
    if verbose > 1:
        logging.debug('-' * 20)
        print '-' * 20

def visit_offcanvas(request):
    # known bug: if the same client refreshes several times at once, the responses
    # may return simultaneously and the rendered content can get mixed up
    ip = None
    if request.META.has_key('HTTP_X_FORWARDED_FOR'):
        ip = request.META['HTTP_X_FORWARDED_FOR']
    else:
        ip = request.META['REMOTE_ADDR']
    logger.info("%s BEGIN. POST:%s, GET:%s" % (ip, str(request.POST), str(request.GET)))

    global is_first_load
    mutex_update_news.acquire()
    if is_first_load:
        # print "[LOG %s] init news." % (time.strftime("%Y-%m-%d %X", time.localtime()))
        logger.info("init news.")
        if platform.system() == "Linux":
            jieba.enable_parallel(8)
        jieba.initialize()
        # jieba.set_dictionary('data/dict.txt.big')
        update_base()
        init_news2()
        thread.start_new_thread(thread_update_news, ("",))
        is_first_load = False
    mutex_update_news.release()

    queryDict = None
    if request.method == 'GET':
        queryDict = request.GET
    elif request.method == 'POST':
        queryDict = request.POST
    jsondata = get_jsondata(queryDict)
    fp = open('django_composite/offcanvas.html')
    t = Template(fp.read())
    fp.close()
    html = t.render(Context(jsondata))
    logger.info("%s END." % ip)
    return HttpResponse(html)

def main():
    parser = ArgumentParser()
    parser.add_argument("corpus_file", help="raw corpus file (input)")
    parser.add_argument("threshold", help="idf threshold (input)", type=int)
    parser.add_argument("template_path", help="template file path to dump in json format (output)")
    args = parser.parse_args()
    corpus_file = args.corpus_file
    threshold = args.threshold
    template_path = args.template_path

    enable_parallel(24)
    rptid_search = compile("(?<=<rptid:)[^>]*(?=>)")
    content_search = compile("(?<=<content:)[^>]*(?=>)")
    image_sub = compile("\[img\][^\[\]]+\[/img\]")
    br_sub = compile("\[br\]")

    rptid_set = set()
    df_dict = dict()
    with open(corpus_file, 'r') as fd:
        for line in fd:
            if not line.startswith("<flag:0>"):
                continue
            line = line.strip()
            result = rptid_search.search(line)
            if not result:
                continue
            rptid = result.group(0)
            if rptid in rptid_set:
                continue
            rptid_set.add(rptid)
            result = content_search.search(line)
            if not result:
                continue
            content = result.group(0)
            content = image_sub.sub("", content)
            content = br_sub.sub("\n", content)
            seg_set = set([seg.encode("utf-8") for seg in cut(content)])
            for word in seg_set:
                if word not in df_dict:
                    df_dict[word] = 0
                df_dict[word] += 1

    idf_dict = {}
    for word in df_dict:
        if df_dict[word] > threshold:
            idf_dict[word] = log(float(len(rptid_set)) / df_dict[word])
    word_list = list(idf_dict)
    word_index_dict = dict((word_list[index], index) for index in xrange(len(word_list)))  # word -> index
    index_word_dict = dict((index, word_list[index]) for index in xrange(len(word_list)))  # index -> word
    with open(template_path, 'w') as fd:
        fd.write(dumps({"word_idf_dict": idf_dict,
                        "word_index_dict": word_index_dict,
                        "index_word_dict": index_word_dict},
                       indent=4, ensure_ascii=False))

#coding=utf-8
'''
Created on 2014-2-22

@author: yuzhang
'''
import jieba.posseg as jbp
import jieba as jb
import time

jb.enable_parallel()
jb.initialize()

text = '''
'''

start = time.clock()
for i in range(1000000):
    jb.cut(text)
print time.clock() - start

start = time.clock()
for i in range(1000000):
    jbp.cut(text)
print time.clock() - start

import codecs
import binascii
import time

jieba.set_dictionary('/Users/fan/anaconda/bin/Workspace/sentiment/0616/big/jieba356726.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/big/cute.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/big/jieba356726.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/food/fooddict2027.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/menu/menu50806_new.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/negativewords.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/positivewords.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/negative.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/more.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/question.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/stop/stopword2292.txt')
jieba.enable_parallel(6)

def deleteBadWords(StrIn):
    Str_BadWords = u'延伸閱讀|連絡方式|電話預約|電話|營業時間|週一|週二|週三|週四|週五|週六|週日|周一|周二|周三|周四|周五|周六|\
|周日|假日|公休|平日|地址|粉絲團|星期|禮拜|時間限制|您或許對這些文章有興趣|造訪日期|全年無休|最後點餐|營業|AM|PM|上一篇|下一篇|\
|分享此文|您可能喜歡的文章|懶人包|臉書|Facebook|facebook|FB|fb|全世界便宜住宿看這兒|下載愛食記App隨時觀看|按個讚啦|喜歡我的分享嗎|\
|瘋台灣民宿網|官方網站|瀏覽人次|最新消息|餐廳名稱|消費時間|無圖文版|網誌|Postedonby|新鮮關注回聲|Christabelle的藝想世界部落格由製作|\
|也許對這些文章也有興趣|發表迴響|電子郵件|必要欄位標記|電子郵件|個人網站|輸入圖片顯示文字好證明你不是機器人|站內搜尋分類|最新動態|\
|並不會被公開|你的位址 |迴響名稱|用餐日期|留言|載入中|文章文章|粉絲頁|發表|每人平均價位|按個讚|推薦你閱讀|Instagram|instagram|\
|美食地圖|版權所有|網友回應|歡迎加入|標籤|著作權聲明|非經授權|不得轉載'
    strClean = re.sub(Str_BadWords, '', StrIn)
    return strClean

def EnglishFullToHalf(StrIn):
    def transform(ele):
        alphabetInt = int(repr(ele.group('number'))[4:8], 16)

#Title segmentation
import jieba
import csv

csv.field_size_limit(1000000000)
jieba.enable_parallel(20)
content = csv.reader(open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/news_8.csv', 'r'))
f = open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/edu.csv', 'w')
#csvfile = open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/title.txt', 'w')
writer = csv.writer(f)
for line in content:
    if line[2] != 'title':
        words = ' '.join(jieba.cut(line[2]))
        # string = words + '\n'
        writer.writerow([words.encode('utf-8')])
        # f.write(string.encode('utf-8'))

def buildCorpus(filename="PART_III.txt"):
    jieba.enable_parallel(4)
    source, target = extract(filename)
    source = [tokenize(s, "s") for s in source]
    target = [tokenize(t, "t") for t in target]
    return source, target

import re

# Chinese word segmentation
import jieba

__author__ = "Lucas Kjaero"

# Optimize Chinese segmenter for running in parallel.
try:
    from multiprocessing import cpu_count
    jieba.enable_parallel(cpu_count())
except NotImplementedError:
    pass

number_pattern = re.compile("[0-9]+(.)*[0-9]*")

def drop_punctuation_and_numbers(iterable_text):
    """A generator that returns tokens in a text if they are not punctuation or numbers. Input must be iterable"""
    for token in iterable_text:
        if token not in ",.?;'[]()`~!@#$%^&*/+_-=<>{}:,。?!·;:‘“、\"" and number_pattern.match(token) is None:
            yield token

def segment_sentence(input_text, split_compounds=False):
    """Segment a Chinese sentence, returns a generator containing the words.
    If you select split_compounds, it will return all possible words in the sentence, including overlaps."""
    return drop_punctuation_and_numbers(jieba.cut(input_text, cut_all=split_compounds))

def participle(data):
    jieba.enable_parallel(4)
    result = jieba.cut(data, cut_all=False)
    return result

def init_context(self):
    jieba.initialize()
    jieba.enable_parallel(4)

self.send_header("Content-Length", str(len(res_str))) self.end_headers() self.wfile.write(res_str) image_sub = compile("\[img\][^\[\]]+\[/img\]") br_sub = compile("\[br\]") idf_file = "./download/idf-new.json" tempalte_file = "./download/template-new.json" sex_file = "./download/sex.tsv" banned_file = "./download/banned-dict.json" political_file = "./download/political-dict.json" model_file = "./download/model-new.pickle" enable_parallel(6) sex_set = load_sex_set(sex_file) banned_dict = load_banned_dict(banned_file) political_dict = load_political_dict(political_file) political_name_set = set(name.encode("utf-8") for name in political_dict["name_list"]) political_verb_set = set(verb.encode("utf-8") for verb in political_dict["verb_list"]) with open(idf_file, 'r') as fd: idf_dict = loads(fd.read()) idf_dict = dict((key.encode("utf-8"), idf_dict[key]) for key in idf_dict) # word -> idf with open(tempalte_file, 'r') as fd: index_dict = loads(fd.read()) index_dict = dict((int(key), index_dict[key].encode("utf-8")) for key in index_dict) # index -> word word_dict = dict((index_dict[key], key) for key in index_dict) # word -> index
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
jieba.enable_parallel(4)
import jieba.posseg as pseg

def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')
    print("")

if __name__ == "__main__":
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")
    cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
    cuttest("草泥马和欺实马是今年的流行词汇")
    cuttest("伊藤洋华堂总府店")
    cuttest("中国科学院计算技术研究所")
    cuttest("罗密欧与朱丽叶")

def __init__(self, ranker):
    jieba.initialize()
    jieba.enable_parallel(8)
    self.ranker = ranker

>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
...    print w.word, w.flag
...
我 r
爱 v
北京 ns
天安门 ns

Feature 5) : parallel segmentation
Principle: the target text is split by line, the lines are handed to several Python processes that segment them in parallel, and the results are merged, which gives a considerable speed-up.
Built on Python's standard multiprocessing module; Windows is currently not supported.
Usage:
jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes
jieba.disable_parallel()  # disable parallel mode
Example:
import urllib2
import sys,time
import sys
sys.path.append("../../")
import jieba
jieba.enable_parallel(4)

url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = list(jieba.cut(content))
t2 = time.time()

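For reference, a minimal Python 3 sketch of the same enable/disable pattern; the corpus path and the process count of 4 are placeholders, not part of the original example:

import time
import jieba

jieba.enable_parallel(4)                   # spawn 4 worker processes (POSIX only)
try:
    with open("corpus.txt", "rb") as f:    # placeholder corpus path
        content = f.read().decode("utf-8")
    t1 = time.time()
    words = list(jieba.cut(content))       # module-level cut now runs over the worker pool
    print("%d tokens in %.2f seconds" % (len(words), time.time() - t1))
finally:
    jieba.disable_parallel()               # restore single-process segmentation
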
# -*- coding: utf-8 -*-
# Project : LM4paper
# Created by igor on 16-12-23

import time
import jieba

jieba.enable_parallel(10)

data_path = '/data/lawdata/raw/raw.document.txt'

t1 = time.time()
content = open(data_path, "rb").read()
words = " ".join(jieba.cut(content))
t2 = time.time()
tm_cost = t2 - t1

log_f = open("/data/lawdata/raw/raw.tokenized.document.txt", 'wb')
log_f.write(words.encode('utf8'))

print('speed %s bytes/second' % (len(content)/tm_cost))

#=======================================================
# load train data
train_file = pd.read_table("train_sample.txt", names=["ID", "Value", "String"])
print "load's file length:", len(train_file)
# load test data
test_file = pd.read_table("test.txt", names=["ID", "String"])
# print test_file.head()
# print np.array(test_file["String"])  # convert to np.ndarray
print train_file.tail()
#==========================================================
# jieba word segmentation
# parallel mode
jieba.enable_parallel(1)  # argument is the number of parallel processes; one works best here
jieba.set_dictionary("dict_for_jieba.txt")  # set the dictionary path
trainData = []
for s in train_file["String"]:
    trainData.append("/".join(jieba.cut(s)))
print len(trainData)
print trainData[0]
jieba.disable_parallel()  # turn off the worker processes
#============================================================
# TF-IDF: extract features.

            endMonth = item
            li[li.index(item)] = ''
        elif re.search('\d', item) and not startMonth:
            startMonth = item
            li[li.index(item)] = ''
        elif re.search('\d', item) and startMonth and not endMonth:
            endMonth = item
            li[li.index(item)] = ''
    print startYear, '年', startMonth, endYear, '年', endMonth,
    fliterList = [u'大学', u'学院', u'专业', u'本科', u'研究生']
    print ''.join(li).replace(u'大学', u'大学#').replace(u'学院', u'学院#').replace(u'专业', u'专业#') \
        .replace(u'本科', u'本科#').replace(u'研究生', u'研究生#').replace(u'博士', u'博士#') \
        .replace(u'校', u'校#').replace(u',', u'#')

def funx(arg, dire, files):
    for file in files:
        if file.endswith('.txt'):
            print os.path.join(dire, file)
            jieba.load_userdict(os.path.join(dire, file))

if __name__ == "__main__":
    dicPath = os.path.join(os.path.expanduser('~'), 'Desktop', u'分词字典')
    os.path.walk(dicPath, funx, ())
    jieba.enable_parallel(4)  # parallel segmentation
    BASEPATH = os.path.join(os.path.expanduser('~'), 'Desktop', 'test.html')
    praseHTML(BASEPATH)

def cutword(data):
    jieba.enable_parallel(4)
    result = jieba.cut(data, cut_all=True)
    return result