Example #1
 def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
     jieba.enable_parallel(self.num_worker)
     cleaned_sentances = [
         ' '.join(jieba.lcut(i, HMM=use_hmm)) for i in cleaned_sentances
     ]
     jieba.disable_parallel()
     return cleaned_sentances
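For context, here is a minimal standalone sketch of the same pattern outside the class (a sketch only; it assumes jieba is installed, and parallel mode works only on POSIX systems):

import jieba

def segment_sentences(sentences, use_hmm=False, workers=4):
    # enable parallel segmentation, tokenize each sentence, then switch parallel mode off again
    jieba.enable_parallel(workers)
    tokenized = [' '.join(jieba.lcut(s, HMM=use_hmm)) for s in sentences]
    jieba.disable_parallel()
    return tokenized

print(segment_sentences(['我爱北京天安门']))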
Example #2
def fast_words_count(filepath, cut_a=False, output_t=False):
    jieba.enable_parallel(cpu_count())  # enable parallel mode; only effective on non-Windows platforms
    with open(filepath, 'rb') as in_text:
        all_lines = in_text.read()
        cut_line = re.sub(
            "[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*(),、;:?!…―ˉˇ〃‘'“”~‖∶"'`|〔〕〈〉《》「」『』.〖〗【】()[]{}]+"
            .decode("utf-8"), "".decode("utf-8"), all_lines.decode("utf-8"))
        words_c = Counter(list(jieba.cut(cut_line, cut_all=cut_a)))

    if output_t:
        with open(filepath + '_Count', 'wb') as out_count:
            for word, freq in words_c.most_common():
                out_count.writelines(
                    ' '.join([word, str(freq)]).encode('utf-8') + '\n')
    else:  # otherwise write an Excel spreadsheet
        import xlwt  # Excel writer package
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Words Count')
        worksheet.write(0, 0, label='Word')
        worksheet.write(0, 1, label='Freq.')
        i = 1
        for word, freq in words_c.most_common():
            worksheet.write(i, 0, label=word)
            worksheet.write(i, 1, label=freq)
            i += 1
        workbook.save(filepath + '_Count.xls')
Example #3
def split(input_path, dict_path, output_path):
    '''
    Load the user dictionary, remove stop words, and segment the text.
    '''
    jieba.enable_parallel(4)

    # build the stop_word set
    stop_word = defaultdict(int)
    with open('/Users/zt/Desktop/project/stop_words/stop_test.txt',
              'r') as stop:
        for line in stop.readlines():
            stop_word[line.rstrip("\n")] = 1

    # load the user_dict
    jieba.load_userdict(dict_path)

    # read the input text
    with open(input_path, 'r') as i:
        text = i.read()

    # segment
    seg_list = list(jieba.cut(text))
    out = ''
    for word in seg_list:
        if stop_word[word] == 0:
            out += word + '  '

    # write the result to a file
    with open(output_path, 'w') as o:
        o.write(out)

    print('Segmentation finished!')
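A hypothetical call of the function above (the paths are placeholders; note that the stop-word path is hard-coded inside the function):

split('corpus.txt', 'userdict.txt', 'corpus_segmented.txt')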
Example #4
def split_sentence(input_source_table, update_table_name):
    label = set()

    # build the stop-word dictionary
    stopwords = {}
    file_stop = open('stop_words.txt', 'r')
    for eachWord in file_stop:
        stopwords[eachWord.strip()] = eachWord.strip()
    file_stop.close()
    # print(stopwords)

    jieba.enable_parallel(4)  # parallel segmentation

    to_cursor.execute(extract_sql % input_source_table)
    source_data = to_cursor.fetchall()

    # fetch the existing labels
    to_cursor.execute('select label from label_type2')
    old_label_type = to_cursor.fetchall()
    old_label_list = []
    for old_label in old_label_type:
        old_label_list.append(old_label[0])
    old_label_list = set(old_label_list)
    print("old_label_list!!!!!!!!!!!!", old_label_list)

    for eachLine in source_data:
        content = eachLine[2]
        line = content.strip()  # strip any leading/trailing whitespace from the line
        line1 = re.sub(
            r"[0-9\s+\.\!\/_,$%^*()?;;:-【】\"\']+|[+—!,;:。?、~@#¥%…&*()]+", "",
            line)
        word_list = list(jieba.cut(line1))  # segment the line with jieba
        result_string = []

        for word in word_list:
            if len(word) < 2:
                continue

            marry_count = 0
            for character in word:
                if character in stopwords:
                    marry_count += 1
            # print(marry_count / len(word))
            if marry_count / len(word) > 0.75:
                continue

            if word not in stopwords:
                result_string.append(word)
            if word not in label and word not in old_label_list:
                label.add(word)
        to_cursor.execute(update_sql %
                          (input_source_table, result_string, eachLine[0]))
        print(result_string)

    for label_i in label:
        print('insert into %s VALUES("%s", NULL)' %
              (update_table_name + "_label_type", label_i))
        to_cursor.execute('insert into %s VALUES("%s", NULL)' %
                          (update_table_name + "_label_type", label_i))
    to_connect.commit()
Example #5
    def init(self,
             stopwords_file=None,
             puncs_file=None,
             user_dict=None,
             silent=None,
             thread=None):
        # set default values; cast invalid values to their defaults
        if not isinstance(stopwords_file, str):
            stopwords_file = SegJb.DEFAULT_STPW
        if not isinstance(puncs_file, str):
            puncs_file = SegJb.DEFAULT_PUNC
        if not isinstance(user_dict, str):
            user_dict = SegJb.DEFAULT_DICT
        if not isinstance(silent, bool):
            silent = True
        if not isinstance(thread, int):
            thread = 1

        # initialize according to the settings
        if silent:
            jieba.setLogLevel(logging.ERROR)
        jieba.initialize()
        if thread > 1:
            jieba.enable_parallel(thread)
        if user_dict != '':
            jieba.load_userdict(user_dict)
        if stopwords_file != '':
            with open(stopwords_file, encoding='utf-8') as f:
                self.stopwords = {x: '' for x in f.read().split('\n')}
        if puncs_file != '':
            with open(puncs_file, encoding='utf-8') as f:
                self.puncs = {x: '' for x in f.read().split('\n')}
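A hypothetical usage sketch (it assumes SegJb can be constructed without arguments and that the DEFAULT_* constants point to bundled resource files):

seg = SegJb()
seg.init(silent=True, thread=4)  # every other argument falls back to its default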
Example #6
def main():

    # assumes module-level imports: from re import compile; from sys import stdin, stdout;
    # from jieba import enable_parallel, cut
    enable_parallel(24)

    rptid_search = compile("(?<=<rptid:)[^>]*(?=>)")
    content_search = compile("(?<=<content:)[^>]*(?=>)")
    image_sub = compile("\[img\][^\[\]]+\[/img\]")
    br_sub = compile("\[br\]")

    for line in stdin:
        if not line.startswith("<flag:0>"):
            continue
        line = line.strip()
    
        result = rptid_search.search(line)
        if not result:
            continue
        rptid = result.group(0)
    
        result = content_search.search(line)
        if not result:
            continue
        content = result.group(0)
        content = image_sub.sub("", content)
        content = br_sub.sub(" ", content)
        
        seg_set = set([seg.encode("utf-8") for seg in cut(content)])

        for word in seg_set:
            stdout.write("%s\t%s\n" % (word, rptid))
Example #7
 def __init__(self, of):
     self.CurrentData = ""
     self.title = ""
     self.text = ""
     self.counter = 0
     self.file = open(of, 'w')
     jieba.enable_parallel(20)
Example #8
def multi_process(article_path):
    jieba.enable_parallel()
    size = 0
    articles = []
    for ii_file in os.listdir(article_path):
        if ii_file == '.DS_Store':
            continue
        article_path_f = os.path.join(article_path, ii_file)
        with open(article_path_f, 'r') as f:
            i_file = f.read()
        print '---- processing %sth article ----' % size
        try:
            sg_list = jieba.posseg.cut(i_file)
            processed_article = [
                word.word for word in sg_list
                if word.flag == 'n' and word.word not in stop_words
            ]
            ## skip articles that are empty or very short
            if len(processed_article) < 20:
                continue
            articles.append(processed_article)
            size += 1
        except:
            print '**** segmentation error ****'
            continue
    return articles
Example #9
def processChinese(textContent):
    jieba.enable_parallel(4)
    seg_generator = jieba.cut(textContent)  # segment with jieba (optional step)
    seg_list = [i for i in seg_generator if i not in stopwords]
    seg_list = [i for i in seg_list if i != u' ']
    seg_list = r' '.join(seg_list)
    return seg_list
Example #10
def jieba_cut(filename):
    """Return list with jieba.cut."""
    jieba.enable_parallel(4)
    with open(filename, 'r') as f:
        data = f.read()
        lst = [i for i in jieba.cut(data)]
    return lst
Example #11
    def clean_data(self):
        """Clean the sentences.
        Parameters
        ----------
        self: object

        Returns
        -------
        df:
            columns: `cleared_words, sentiment, dataset_class,
                counter, word_counts, word_to_number`.
        """
        jieba.setLogLevel(20)
        jieba.enable_parallel(4)
        df = self.read_data()
        stopwords = self.read_stopwords()
        df['cut_words'] = df['review'].map(jieba.lcut)
        df['cleared_words'] = apply_by_multiprocessing(
            df['cut_words'], remove_english_punctuation, workers=4)
        df['cleared_words'] = apply_by_multiprocessing(
            df['cleared_words'], remove_chinese_punctuation, workers=4)
        df['cleared_words'] = apply_by_multiprocessing(
            df['cleared_words'], remove_stopwords, stopwords=stopwords, workers=4)
        df['counter'] = apply_by_multiprocessing(
            df['cleared_words'], Counter, workers=4)
        df['word_counts'] = apply_by_multiprocessing(
            df['cleared_words'], len, workers=4)
        columns = 'dataset_class sentiment cleared_words counter word_counts'
        df = df.loc[:, columns.split()]
        return df
Example #12
def get_all_keywords(file_name):
    word_lists = []  # list of keywords
    jieba.enable_parallel(8)
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        Lists = f.readlines()  # list of text lines
        for List in Lists:
            cut_list = list(jieba.cut(List))
            for word in cut_list:
                word_lists.append(word)
    word_lists_set = set(word_lists)  # remove duplicates
    word_lists_set = list(word_lists_set)
    length = len(word_lists_set)
    print("There are %d keywords in total" % length)
    information = pd.read_excel('/Users/huazi/Desktop/zhanlang2.xlsx')
    world_number_list = []
    word_copy = []
    for w in word_lists_set:
        if (len(w) == 1):
            continue
        if (word_lists.count(w) > 3):
            world_number_list.append(word_lists.count(w))
            word_copy.append(w)
    information['key'] = word_copy
    information['count'] = world_number_list
    information.to_excel('sun_2.xlsx')
Example #13
    def __init__(self, data_path='./data/context'):
        logger.info('fastTextfeature loading corpus ...')
        self.label_list = ['Military', 'Economy', 'Culture', 'Sports', 'Auto', 'Medicine']

        # enumerate all the data files
        jieba.enable_parallel(8)
        self.context, self.label = [], []
        for file in tqdm(os.listdir(path=data_path)):
            try:
                label = file.split('_')[0]
                filePath = os.path.join(data_path, file)
                with open(filePath, 'r', encoding='utf-8') as fd:
                    context = fd.read().replace('\n', ' ').replace('\r', ' ').replace('\t', ' ')
                self.context.append(context)
                self.label.append(self.label_list.index(label))
            except:
                logger.warning('file %s has a problem ...' % file)
        self.context = [' '.join(list(jieba.cut(context))) for context in tqdm(self.context)]
        self.train_context, self.test_context, self.train_label, self.test_label =\
            train_test_split(self.context, self.label, test_size=0.05)

        train_data_fd = open('./data/fastTextData/train_data', 'w+')
        for label, context in zip(self.train_label, self.train_context):
            train_data_fd.write("__label__" + str(label) + '\t' + context + '\n')
        train_data_fd.close()

        valid_data_fd = open('./data/fastTextData/valid_data', 'w+')
        for label, context in zip(self.test_label, self.test_context):
            valid_data_fd.write("__label__" + str(label) + '\t' + context + '\n')
        valid_data_fd.close()

        logger.debug('self.train_context shape: %d' % len(self.train_context))
        logger.debug('self.test_context shape: %d' % len(self.test_context))
        logger.debug('self.train_label shape: %d' % len(self.train_label))
        logger.debug('self.test_label shape: %d' % len(self.test_label))
Example #14
 def print_and_sorted(self, comments_t):
     from operator import itemgetter    
     import jieba.posseg as pseg
     import jieba
     jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes
     cci = 0
     allow_pos = ['ns', 'n']
     for ci in comments_t:
         cci += 1
         if cci%50 == 0:
             logging.info('pro ' + str(cci))
         words = pseg.cut(ci)
         for word, flag in words:
             if flag in allow_pos:
                 if word in self.vectobi:
                     self.vectobi[word] += 1
                 else:
                     self.vectobi[word] = 1
     logging.info("sorted...")
     word_freq = sorted(self.vectobi.iteritems(), key=itemgetter(1),
         reverse=True)
     freqfile = open('freq.txt', 'w')
     for i in word_freq:
         freqfile.write(i[0] + '\t' + str(i[1]) + '\n')
     freqfile.close()
     logging.info("length:" + str(len(word_freq)) + " writed 100 in freq.txt")
Example #15
def get_datasest():
    if not os.path.exists('../data/wangyi_title_cut.txt'):
        # Chinese word segmentation
        jieba.enable_parallel()
        # one line per title
        line_num = 0
        with open('../data/wangyi_title_cut.txt', 'w') as fw:
            # punctuation is not handled here
            with open("../data/wangyi_title.txt", 'r') as f:
                for line in f.readlines():
                    line = line.strip()
                    line = line.replace("\r\n", "")
                    line = line.replace(" ", "")
                    line_seg = jieba.cut(line)  # generator of tokens
                    line_seg = " ".join(line_seg)
                    line_num += 1
                    fw.write(line_seg + '\n')

        print('sentence_num in raw corpus:', line_num)

    with open("../data/wangyi_title_cut.txt", 'r') as f:
        corpus = f.readlines()
        print('sentence_num in corpus_seg', len(corpus))

    x_train = []
    #y = np.concatenate(np.ones(len(docs)))
    for idx, text in enumerate(corpus):
        word_list = text.strip().split(' ')
        document = TaggededDocument(word_list, tags=[idx])
        x_train.append(document)

    return x_train
Example #16
 def __init__(self, user_dict_file=user_dict_file_path):
     self.userdict = user_dict_file
     logger.info(u'Loading the user dictionary...')
     jieba.load_userdict(self.userdict)
     jieba.enable_parallel(8)
     logging.info(u'Building the bag of words...')
     self.load_word_bag()
Example #17
    def __init__(self, verbose=0):
        '''
            1. Initialize parameters
            2. Load the user dictionary and the stop-word list
        :param verbose: the larger the value, the more detail is printed; 0 prints nothing.
        :type verbose: int

        '''
        # initialize parameters
        self.verbose = verbose
        # set the number of jieba parallel processes
        jieba.enable_parallel(10)

        # -------------- region start : 2. load the user dictionary and stop-word list -------------
        if verbose > 1:
            logging.debug('-' * 20)
            print '-' * 20
            logging.debug('2. load the user dictionary and stop-word list')
            print '2. load the user dictionary and stop-word list'
        # -------------- code start -------------

        jieba.load_userdict(os.path.dirname(__file__) + '/userdict.txt')
        self.stopword_list = io.open(os.path.dirname(__file__) +
                                     '/stopword.txt',
                                     'r',
                                     encoding='utf8').read().strip().split()
        self.exclude_word_list = set(['886', '88'])

        # -------------- code end -------------
        if verbose > 1:
            logging.debug('-' * 20)
            print '-' * 20
Example #18
 def __init__(self, user_dict_file="TechWord.txt"):
     self.userdict = user_dict_file
     print('Loading the user dictionary...')
     jieba.load_userdict(self.userdict)
     jieba.enable_parallel(8)
     print('Building the bag of words...')
     self.load_word_bag()
Example #19
def segment_file(in_file, out_file, word_sep=' ', pos_sep='/', is_pos=True):
    """
    segment input file to output file
    :param in_file:
    :param out_file:
    :param word_sep:
    :param pos_sep:
    :param is_pos: whether POS tagging is required
    :return:
    """
    jieba.enable_parallel()
    with open(in_file, 'r', encoding='utf-8') as fin, open(out_file, 'w', encoding='utf-8') as fout:
        count = 0
        for line in fin:
            in_line = line.strip()
            seg_line = ''
            if is_pos:
                words = posseg.lcut(in_line)
                for word, pos in words:
                    seg_line += word + pos_sep + pos + word_sep
            else:
                words = jieba.lcut(in_line)
                for word in words:
                    seg_line += word + word_sep
            fout.write(seg_line + "\n")
            count += 1
    print("segment ok. input file count:", count)
Example #20
 def __init__(self):
     self.elastic = Elastic("qabot")
     self.doc_type = "fqa"
     self.related = None
     logging.info("Bot initialized.")
     jieba.enable_parallel(4)
     jieba.initialize()
Example #21
def gen_vocab(file_list, added_vocab_list, vocab_dict_file):
    jieba.enable_parallel(30)
    for vocab_file in added_vocab_list:
        jieba.load_userdict(vocab_file)
    vocab_dict = dict()
    idx = 0
    for file_name in file_list:
        logging.info("process %s" % file_name)
        with open(file_name, "r", encoding="utf-8") as fp:
            content = fp.read()
            words = jieba.cut(content)
            for w in words:
                if w in vocab_dict:
                    vocab_dict[w] += 1
                else:
                    vocab_dict[w] = 1
            #lines = fp.readlines()
            #logging.info("readlines %d"%(len(lines)))
            #for line in tqdm(lines):
            #    line = line.strip()
            #    words = jieba.cut(line)
            #    for w in words:
            #        if w in vocab_dict:
            #            vocab_dict[w] += 1
            #        else:
            #            vocab_dict[w] = 1
            #    idx += 1
    logging.info("vocab_dict len:%d" % len(vocab_dict))
    wfp = open(vocab_dict_file, "wb")
    pickle.dump(vocab_dict, wfp)
    wfp.close()
Example #22
def load_chinese_data(file_path, save_path, test_size=0.1, verbose=True):
    if os.path.exists(save_path):
        data = pd.read_csv(save_path, sep=",", header=0)
    else:
        data = pd.read_excel(file_path, sheet_name="sheet1")
        data = data.rename(index=str, columns={"分类": "label", "正文": "text"})

        # tokenization
        jieba.enable_parallel(16)
        data["tokens"] = data["text"].apply(lambda x: jieba.cut(x.strip()))
        data["tokens"] = [" ".join(x) for x in data["tokens"]]
        data["tokens"] = data["tokens"].apply(
            lambda x: re.sub(" +", " ",
                             x.strip().replace("\n", " ").replace("\t", " ")))
        data.to_csv(save_path, sep=",", header=True, index=False, na_rep="")

    label_encoder = preprocessing.LabelEncoder()
    labels = label_encoder.fit_transform(data.label.values)

    x_train, x_test, y_train, y_test = train_test_split(data.tokens.values,
                                                        labels,
                                                        stratify=labels,
                                                        random_state=1234,
                                                        test_size=test_size,
                                                        shuffle=True)

    if verbose:
        print("sample tokenized text: {}".format(data["tokens"].values[0]),
              flush=True)
        print("labels: {}".format(data.label.unique()), flush=True)
        print("train set shape: {}, test set shape: {}".format(
            x_train.shape, x_test.shape))

    return x_train, x_test, y_train, y_test
Example #23
def word_seg():
	corpus = []
	sentences = []
	jieba.enable_parallel(40)
	word_list = []
	with open(os.path.join(DIRNAME, "baike.json"), "r") as f:
		for i, l in enumerate(f):
			print(i)
			entity = json.loads(l)
			description = entity['description']
			description_seg = []
			sentence_list = re.split(u'。|!|?|\?|!|', description)
			for sentence in sentence_list:
				cur_word_list = list(jieba.cut(sentence))
				description_seg.append(cur_word_list)
				word_list += cur_word_list
				sentences.append(cur_word_list)
			entity['description'] = description_seg
			corpus.append(entity)
	with open(os.path.join(DIRNAME, "baike_seg.json"), "w") as f:
		json.dump(corpus, f)
	with open(os.path.join(DIRNAME, "baike_sentences.json"), "w") as f:
		json.dump(sentences, f)
	word_list = list(set(word_list))
	word_list.insert(0,0)
	with open(os.path.join(DIRNAME, "baike_word_list.json"), "w") as f:
		json.dump(word_list, f)
	word_index = {}
	for i, word in enumerate(word_list):
		word_index[word] = i
	with open(os.path.join(DIRNAME, "baike_word_index.json"), "w") as f:
		json.dump(word_index, f)
Example #24
 def __init__(self,n_core = 16):
     self.rootdir = os.getcwd()
     self.STOP_WORDS_LIST = self.load_txt(path.join(self.rootdir, 'resources', 'stopwords_utf8.txt'))
     self.STOP_WORDS_LIST = set([re.sub('\n', '', item) for item in self.STOP_WORDS_LIST])
     jieba.load_userdict(path.join(self.rootdir, 'resources', 'emotion_user_dict.txt'))
     self.n_CORE=n_core
     jieba.enable_parallel(self.n_CORE-1)
Example #25
def train_embedding():

    stop_list = []
    with open('./data/stop_words.txt', 'r', encoding='utf8') as fr:
        for line in fr:
            if line.strip() != ' ':
                stop_list.append(line.strip())

    print(len(stop_list))

    jieba.enable_parallel(16)
    with open('./data/medical.csv', 'r') as fr:
        reader = csv.reader(fr)
        for i in reader:
            # print(i[0])
            jieba.add_word(i[0])
    sentences = []
    with open('./data/corpus.txt', 'r', encoding='utf8') as fr:
        lines = fr.readlines()
        for line in lines:
            sentence = jieba.lcut(line.strip())
            res = []
            for i in sentence:
                if i not in stop_list:
                    res.append(i)
            if len(res) > 0:
                sentences.append(res)
    model = gensim.models.Word2Vec(sentences,
                                   size=300,
                                   window=5,
                                   min_count=0,
                                   workers=16)
    # print(model.wv.word_vec('口腔'))

    model.wv.save_word2vec_format('wv300.bin')
Example #26
 def __init__(self,
              user_dicts_directory=None,
              stop_word_dicts_directory=None):
     '''
     :param user_dicts_directory: directory of user dictionaries
     :param stop_word_dicts_directory: directory of stop-word files
     '''
     # try to enable parallel segmentation (works on Unix-like systems, not on Windows)
     try:
         from multiprocessing import cpu_count
         jieba.enable_parallel(cpu_count() - 1)
     except:
         pass
     # load the user dictionaries
     if user_dicts_directory is not None:
         filenames = os.listdir(user_dicts_directory)
         for filename in filenames:
             jieba.load_userdict(user_dicts_directory + filename)
     # load the stop-word dictionaries
     self.stop_words = set()
     if stop_word_dicts_directory is not None:
         filenames = os.listdir(stop_word_dicts_directory)
         for filename in filenames:
             for line in open(stop_word_dicts_directory + filename,
                              encoding='utf-8-sig'):
                 self.stop_words.add(line.strip())
Example #27
def CreateTagsByID(weibo_dir, dest_dir, emoji_path, customized_path, stopwords_path, topK = 40):
    '''
    Walk every file in the scraped_data folder, aggregate the posts belonging to the same ID,
    extract that ID's tags with TF-IDF, and save them in the user_tags folder.
    Args:
        weibo_dir: directory containing the scraped Weibo post data
        dest_dir: directory where each user's aggregated TF-IDF tags are saved
        emoji_path: path to the custom emoji dictionary (after merging)
        customized_path: path to the custom user dictionary (after merging)
        stopwords_path: path to the stop-word list (after merging)
        topK: keep at most topK tags
    Returns:

    '''
    jieba.load_userdict(emoji_path)
    jieba.load_userdict(customized_path)
    analyse.set_stop_words(stopwords_path)
    jieba.enable_parallel()
    for filename in os.listdir(weibo_dir):
        res = defaultdict()
        if(filename[0] == "."):
            continue
        raw_data = pd.read_csv(weibo_dir + filename)
        text_per_uid = raw_data.groupby("uid")["weibotxt"].sum()
        for idx in text_per_uid.index:
            res[str(idx)] = jieba.analyse.extract_tags(text_per_uid[idx], topK = topK)
        with open(dest_dir + filename[:-4] + ".json", "w") as f:
            json.dump(res, f)
Example #28
def tf_idf(texts):
    jieba.load_userdict("./model/dict.txt")
    jieba.analyse.set_idf_path("./model/idf.txt")
    jieba.analyse.set_stop_words("./model/chinese_stopwords.txt")
    jieba.enable_parallel(8)

    corpus = [filter(jieba.analyse.extract_tags(s, topK=15)) for s in texts]  # filter here is presumably a module-level helper, not the builtin
    return corpus
Example #29
 def __init__(self):
     self.CURRENT_PATH = path.dirname(path.abspath(__file__))
     self.NW_PATH = path.join(self.CURRENT_PATH, 'words/wc_cn/newwords.txt')
     self.SW_PATH = path.join(self.CURRENT_PATH,
                              'words/wc_cn/stopwords.txt')
     # add user dict
     jieba.load_userdict(self.NW_PATH)  # must be called before jieba.enable_parallel
     jieba.enable_parallel(4)
Example #30
def get_words(sentence_list):
    words = []
    jieba.enable_parallel(8)
    for raw in sentence_list:
        result = postag.cut(raw)  # postag is assumed to be jieba.posseg (import jieba.posseg as postag)
        raw = [x.word for x in result if (len(x.word) > 1 and 'n' in x.flag)]
        words += raw
    return words
Example #31
 def __init__(self, vocabulary, labels, model):
     jieba.enable_parallel(multiprocessing.cpu_count())
     self.model = C.load_model(model)
     self.vocab = get_vocab(vocabulary)
     self.x_dim = len(self.vocab)
     self.y_dim = get_size(labels)
     self.x = C.sequence.input_variable(self.x_dim, is_sparse=True)
     self.model = self.model(self.x)
     self.predictor = C.argmax(self.model)
Example #32
def save_jieba_result():
    # enable multi-process segmentation
    jieba.enable_parallel(4)
    dirs = path.join(path.dirname(__file__), '../pjl_comment.txt')
    with codecs.open(dirs, encoding='utf-8') as f:
        comment_text = f.read()
    cut_text = " ".join(jieba.cut(comment_text))  # join the jieba tokens into a space-separated string
    with codecs.open('pjl_jieba.txt', 'a', encoding='utf-8') as f:
        f.write(cut_text)
Example #33
def cutall():
    if request.method == 'POST':
        text = request.form.get('text')
        text = text.strip()
        jieba.enable_parallel(4)
        seg_list = jieba.cut(text, cut_all=True)
        return jsonify(
           {'data': list(seg_list)}
        )
    return render_template('api/cutall.html')
Example #34
def cut_word(sentence, parallel=False, processnum=2):
    if parallel:
        # enable parallel mode; the argument is the number of processes (not supported on Windows)
        jieba.enable_parallel(processnum=processnum)
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
        # disable parallel mode
        jieba.disable_parallel()
    else:
        word_list = jieba.lcut(sentence, cut_all=False, HMM=True)
    return word_list
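A quick usage sketch for the helper above (the sample sentence is arbitrary; it assumes import jieba at module level):

print(cut_word('我来到北京清华大学'))                              # sequential
print(cut_word('我来到北京清华大学', parallel=True, processnum=4))  # parallel, POSIX only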
Example #35
    def __init__(self):

        # read stopwords
        stopwords_file = open("slave/parsers/stopwords")
        # stopwords_file = open("stopwords")
        stopwords = []
        for line in stopwords_file.readlines():
            stopwords.append(line.strip())
        self.stopwords = set(stopwords)

        jieba.enable_parallel(4)
Example #36
	def __init__(self, of, data, questionList, model):
		self.CurrentData = ""
		self.title = ""
		self.text = ""
		self.counter = 0
		self.file = open(of, 'wb')

		self.data = data
		self.model = model
		self.questionList = questionList
		self.pages =[]

		jieba.enable_parallel(20)
Example #37
def cutText(inTxt = '/Users/v_niur/Desktop/py/bb/tmp/simple.txt',outTxt = '/Users/v_niur/Desktop/py/bb/tmp/cut_simple.txt'):
    """将中文文本进行分词并导出"""
    jieba.enable_parallel(4)
    startTime = time.time()
    content = open(inTxt,"rb").read()#将该目录下的文本的内容导入为一个字符串
    words = list(jieba.cut(content))
    endTime= time.time()
    timeCost = endTime - startTime
    with open(outTxt,'w+') as f:
        for w in words:
            # print >> f,w.encode("utf-8"), "/" ,
            print >> f, w.encode( "utf-8" ),' ',
    print 'speed', len(content) / timeCost, "bytes/second"
Example #38
 def use_jieba(self):
     import jieba.posseg as pseg
     import jieba
     jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes
     cuted_t = []
     allow_pos = ['ns', 'n', 'vn', 'v']  # POS tags to keep
     for i in self.title_content:
         words = pseg.cut(i)
         flit_words = []
         for word, flag in words:
             if flag in allow_pos:
                 flit_words.append(word)
         cuted_t.append(flit_words)
     return cuted_t
Example #39
    def __init__(self, processnum=1):
        logger.info('Initializing jieba...')
        jieba.initialize()
        logger.info('Successfully initialized jieba.')

        if processnum == 0:
            processnum = multiprocessing.cpu_count()

        if processnum > 1:
            logger.info(
                'jieba running in parallel mode with %d processes.',
                processnum
            )
            jieba.enable_parallel(processnum)
Example #40
def prepare_data():
    print "----- Prepareing Data -----"
    import cPickle
    import jieba
    # load data, and make into DataFrame df
    tags, texts, uniq_labels = make_news_data()
    df = pd.DataFrame({'label': tags, 'txt': texts})

    # jieba segmentation and serialization
    jieba.enable_parallel(4)
    df['seg_word'] = df.txt.map(lambda x: ' '.join(jieba.cut(x)))
    # tested: a DataFrame containing utf-8 text can be dumped & loaded through a file opened with plain open()
    cPickle.dump(df, open('df.pickle', 'wb'))
    # df = cPickle.load(open('df.pickle', 'rb'))

    return df
Example #41
def init(jieba_parallel=False):
    # load English/Chinese stop words, from nltk and zhon respectively
    global english_stopwords, chinese_stopwords
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    chinese_stopwords = {word[:-1] for word in codecs.open("stopwords.txt", "r", encoding="utf-8")}

    # set the jieba log level
    jieba.setLogLevel("INFO")
    # set the jieba dictionary file
    jieba.set_dictionary("./jieba_dict.txt")
    # change jieba's temporary working directory
    jieba.tmp_dir = os.getcwd()
    # enable parallel mode with one process per CPU core
    if jieba_parallel:
        jieba.enable_parallel()

    config.log.info("module algorithm has initialized successfully.")
Example #42
def use_jieba():
    import jieba
    import codecs
    import re
    k = 0
    data_file = codecs.open('train.txt', 'w', 'utf-8')
    data_paragraph_name = codecs.open('paragraph_name.txt', 'w', 'utf-8')
    logging.info("start to write paragraph...")
    jieba.enable_parallel(4)  # enable parallel mode; the argument is the number of processes
    for i in title_content:
        data_file.write('_*'+str(k)+' ')
        data_paragraph_name.write(title_name[k] + '\t' + title_id[k] + '\n')
        words = " ".join(jieba.cut(
            re.sub(u'\s|!|。|,|“|”|(|)|《|》|\(|\)|:|、',' ',i),
            cut_all=False))
        data_file.write(words+u'\n')
        k += 1
    data_file.close()
    data_paragraph_name.close()
Example #43
    def __init__(self,
                 num_of_parallel=10,
                 verbose=0):
        """
            1. 初始化参数
            2. 加载用户字典和stop word列表

        Parameters
        ----------
        num_of_parallel : int
            并行的线程数
        verbose: int
            数值越大,打印越多的详细信息,设置为0时,什么信息都不显示.
        """
        # initialize parameters
        self.verbose = verbose
        # set the number of jieba parallel processes
        jieba.enable_parallel(num_of_parallel)

        # -------------- region start : 2. load the user dictionary and stop-word list -------------
        if verbose > 1:
            logging.debug('-' * 20)
            print '-' * 20
            logging.debug('2. load the user dictionary and stop-word list')
            print '2. load the user dictionary and stop-word list'
        # -------------- code start -------------


        # region: add user dictionaries
        jieba.load_userdict(os.path.dirname(__file__) + '/userdict.txt')
        # add 261,529 user-dictionary entries (scraped from the online Xinhua dictionary)
        jieba.load_userdict(os.path.dirname(__file__) + '/vocabulary_len2_xiandaihanyu.txt')
        # endregion
        self.stopword_list = io.open(os.path.dirname(__file__) + '/stopword.txt', 'r',
                                     encoding='utf8').read().strip().split()
        self.exclude_word_list = set(['886', '88'])

        # -------------- code end -------------
        if verbose > 1:
            logging.debug('-' * 20)
            print '-' * 20
Example #44
def visit_offcanvas(request):
  # bug: if the same client refreshes several times at once, responses may come back simultaneously and get mixed together
  ip = None
  if request.META.has_key('HTTP_X_FORWARDED_FOR'):  
    ip =  request.META['HTTP_X_FORWARDED_FOR']  
  else:  
    ip = request.META['REMOTE_ADDR'] 
  logger.info("%s BEGIN. POST:%s, GET:%s"%(ip,str(request.POST),str(request.GET)))

  global is_first_load
  mutex_update_news.acquire()
  if is_first_load:
    #print "[LOG %s] init news."%(time.strftime("%Y-%m-%d %X", time.localtime()))
    logger.info("init news.")
    if platform.system() == "Linux":
      jieba.enable_parallel(8)
    jieba.initialize()
    #jieba.set_dictionary('data/dict.txt.big')
    update_base()
    init_news2()
    thread.start_new_thread(thread_update_news, ("",))
    is_first_load = False
  mutex_update_news.release()

  queryDict=None
  if request.method == 'GET':
    queryDict = request.GET
  elif request.method == 'POST':
    queryDict = request.POST
  jsondata = get_jsondata(queryDict)
  fp = open('django_composite/offcanvas.html')  
  t = Template(fp.read())  
  fp.close()  
  html = t.render(Context(jsondata))  
  logger.info("%s END."%ip)
  return HttpResponse(html) 
Example #45
def main():

    # assumes module-level imports: from argparse import ArgumentParser; from re import compile;
    # from jieba import enable_parallel, cut; from math import log; from json import dumps
    parser = ArgumentParser()
    parser.add_argument("corpus_file", help = "row corpus file (input)")
    parser.add_argument("threshold", help = "idf threshold (input)", type = int)
    parser.add_argument("template_path", help = "template file path to dump in json format (output)")
    args = parser.parse_args()

    corpus_file = args.corpus_file
    threshold = args.threshold
    template_path = args.template_path

    enable_parallel(24)
    
    rptid_search = compile("(?<=<rptid:)[^>]*(?=>)")
    content_search = compile("(?<=<content:)[^>]*(?=>)")
    image_sub = compile("\[img\][^\[\]]+\[/img\]")
    br_sub = compile("\[br\]")

    rptid_set = set()
    df_dict = dict()
    with open(corpus_file, 'r') as fd:
        for line in fd:
            if not line.startswith("<flag:0>"):
                continue
            line = line.strip()

            result = rptid_search.search(line)
            if not result:
                continue
            rptid = result.group(0)
            if rptid in rptid_set:
                continue
            rptid_set.add(rptid)

            result = content_search.search(line)
            if not result:
                continue
            content = result.group(0)
            content = image_sub.sub("", content)
            content = br_sub.sub("\n", content)
            
            seg_set = set([seg.encode("utf-8") for seg in cut(content)])
            for word in seg_set:
                if word not in df_dict:
                    df_dict[word] = 0
                df_dict[word] += 1

    idf_dict = {}
    for word in df_dict:
        if df_dict[word] > threshold:
            idf_dict[word] = log(float(len(rptid_set)) / df_dict[word])

    word_list = list(idf_dict)
    word_index_dict = dict((word_list[index], index) for index in xrange(len(word_list))) # word -> index
    index_word_dict = dict((index, word_list[index]) for index in xrange(len(word_list))) # index -> word

    with open(template_path, 'w') as fd:
        fd.write(dumps({"word_idf_dict": idf_dict,
                        "word_index_dict": word_index_dict,
                        "index_word_dict": index_word_dict}, indent=4, ensure_ascii=False))
Example #46
#coding=utf-8
'''
Created on 2014-2-22

@author: yuzhang
'''

import jieba.posseg as jbp
import jieba as jb
import time

jb.enable_parallel()
jb.initialize()
text = '''
'''

start = time.clock()
for i in range(1000000):
    jb.cut(text)
print time.clock() - start

start = time.clock()
for i in range(1000000):
    jbp.cut(text)
print time.clock() - start
Example #47
import codecs
import binascii
import time

jieba.set_dictionary('/Users/fan/anaconda/bin/Workspace/sentiment/0616/big/jieba356726.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/big/cute.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/big/jieba356726.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/food/fooddict2027.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/menu/menu50806_new.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/negativewords.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/positivewords.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/negative.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/more.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/sentiment/question.txt')
jieba.load_userdict('/Users/fan/anaconda/bin/Workspace/sentiment/0616/stop/stopword2292.txt')
jieba.enable_parallel(6)

def deleteBadWords(StrIn):
    Str_BadWords = u'延伸閱讀|連絡方式|電話預約|電話|營業時間|週一|週二|週三|週四|週五|週六|週日|周一|周二|周三|周四|周五|周六|\
                    |周日|假日|公休|平日|地址|粉絲團|星期|禮拜|時間限制|您或許對這些文章有興趣|造訪日期|全年無休|最後點餐|營業|AM|PM|上一篇|下一篇|\
                    |分享此文|您可能喜歡的文章|懶人包|臉書|Facebook|facebook|FB|fb|全世界便宜住宿看這兒|下載愛食記App隨時觀看|按個讚啦|喜歡我的分享嗎|\
                    |瘋台灣民宿網|官方網站|瀏覽人次|最新消息|餐廳名稱|消費時間|無圖文版|網誌|Postedonby|新鮮關注回聲|Christabelle的藝想世界部落格由製作|\
                    |也許對這些文章也有興趣|發表迴響|電子郵件|必要欄位標記|電子郵件|個人網站|輸入圖片顯示文字好證明你不是機器人|站內搜尋分類|最新動態|\
                    |並不會被公開|你的位址 |迴響名稱|用餐日期|留言|載入中|文章文章|粉絲頁|發表|每人平均價位|按個讚|推薦你閱讀|Instagram|instagram|\
                    |美食地圖|版權所有|網友回應|歡迎加入|標籤|著作權聲明|非經授權|不得轉載'
    strClean = re.sub(Str_BadWords,'',StrIn)
    return strClean

def EnglishFullToHalf(StrIn):
    def transform(ele):
        alphabetInt = int(repr(ele.group('number'))[4:8],16)
Example #48
#Title segmentation
import jieba
import csv
csv.field_size_limit(1000000000)
jieba.enable_parallel(20)

content = csv.reader(open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/news_8.csv','r'))
f = open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/Sina/edu.csv','w')
#csvfile = open('/media/sunzeyeah/Personal/SENIOR/Thesis/Data/Chinese/title.txt','w')
writer = csv.writer(f)
for line in content:
 if line[2] != 'title':
  words = ' '.join(jieba.cut(line[2]))
# string = words + '\n'
  writer.writerow([words.encode('utf-8')])
#  f.write(string.encode('utf-8'))
Example #49
def buildCorpus(filename="PART_III.txt"):
	jieba.enable_parallel(4)
	source,target = extract(filename)
	source = [tokenize(s,"s") for s in source]
	target = [tokenize(t,"t") for t in target]
	return source,target
Example #50
import re

# Chinese word segmentation
import jieba

__author__ = "Lucas Kjaero"

# Optimize Chinese segmenter for running in parallel.
try:
    from multiprocessing import cpu_count

    jieba.enable_parallel(cpu_count())
except NotImplementedError:
    pass

number_pattern = re.compile("[0-9]+(.)*[0-9]*")


def drop_punctuation_and_numbers(iterable_text):
    """A generator that returns tokens in a text if they are not punctuation or numbers. Input must be iterable"""
    for token in iterable_text:
        if token not in ",.?;'[]()`~!@#$%^&*/+_-=<>{}:,。?!·;:‘“、\"" and number_pattern.match(token) is None:
            yield token


def segment_sentence(input_text, split_compounds=False):
    """Segment a Chinese sentence, returns a generator containing the words.
    If you select split_compounds, it will return all possible words in the sentence, including overlaps."""
    return drop_punctuation_and_numbers(jieba.cut(input_text, cut_all=split_compounds))

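A short usage sketch for the two helpers above:

# tokens with punctuation and numbers dropped
print(list(segment_sentence("我爱北京天安门")))
# include overlapping compound words as well
print(list(segment_sentence("中国科学院计算技术研究所", split_compounds=True)))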
Example #51
def participle(data):
    jieba.enable_parallel(4)
    result=jieba.cut(data,cut_all=False)
    return result
Example #52
 def init_context(self):
     jieba.initialize()
     jieba.enable_parallel(4)
Example #53
        self.send_header("Content-Length", str(len(res_str)))
        self.end_headers()
        self.wfile.write(res_str)


image_sub = compile("\[img\][^\[\]]+\[/img\]")
br_sub = compile("\[br\]")

idf_file = "./download/idf-new.json"
tempalte_file = "./download/template-new.json"
sex_file = "./download/sex.tsv"
banned_file = "./download/banned-dict.json"
political_file = "./download/political-dict.json"
model_file = "./download/model-new.pickle"

enable_parallel(6)

sex_set = load_sex_set(sex_file)
banned_dict = load_banned_dict(banned_file)

political_dict = load_political_dict(political_file)
political_name_set = set(name.encode("utf-8") for name in political_dict["name_list"])
political_verb_set = set(verb.encode("utf-8") for verb in political_dict["verb_list"])

with open(idf_file, 'r') as fd:
    idf_dict = loads(fd.read())
    idf_dict = dict((key.encode("utf-8"), idf_dict[key]) for key in idf_dict) # word -> idf
with open(tempalte_file, 'r') as fd:
    index_dict = loads(fd.read())
    index_dict = dict((int(key), index_dict[key].encode("utf-8")) for key in index_dict) # index -> word
word_dict = dict((index_dict[key], key) for key in index_dict) # word -> index
Example #54
#encoding=utf-8
from __future__ import print_function
import sys
sys.path.append("../../")
import jieba
jieba.enable_parallel(4)
import jieba.posseg as pseg

def cuttest(test_sent):
    result = pseg.cut(test_sent)
    for w in result:
        print(w.word, "/", w.flag, ", ", end=' ')  
    print("")


if __name__ == "__main__":
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
    cuttest("雷猴回归人间。")
    cuttest("工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作")
    cuttest("我需要廉租房")
    cuttest("永和服装饰品有限公司")
    cuttest("我爱北京天安门")
    cuttest("abc")
    cuttest("隐马尔可夫")
    cuttest("雷猴是个好网站")
    cuttest("“Microsoft”一词由“MICROcomputer(微型计算机)”和“SOFTware(软件)”两部分组成")
    cuttest("草泥马和欺实马是今年的流行词汇")
    cuttest("伊藤洋华堂总府店")
    cuttest("中国科学院计算技术研究所")
    cuttest("罗密欧与朱丽叶")
Example #55
 def __init__(self, ranker):
     jieba.initialize()
     jieba.enable_parallel(8)
     self.ranker = ranker
Example #56
>>> import jieba.posseg as pseg
>>> words = pseg.cut("我爱北京天安门")
>>> for w in words:
...    print w.word, w.flag
...
我 r
爱 v
北京 ns
天安门 ns
Feature 5): parallel segmentation

How it works: the target text is split by line, the lines are dispatched to multiple Python processes to be segmented in parallel, and the results are merged, which gives a considerable speed-up.
It is based on Python's built-in multiprocessing module; Windows is currently not supported.
Usage:

jieba.enable_parallel(4) # enable parallel mode; the argument is the number of processes
jieba.disable_parallel() # disable parallel mode
Example:
import urllib2
import sys,time
import sys
sys.path.append("../../")
import jieba
jieba.enable_parallel(4)

url = sys.argv[1]
content = open(url,"rb").read()
t1 = time.time()
words = list(jieba.cut(content))

t2 = time.time()
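For comparison, a Python 3 sketch of the same benchmark (the input path is taken from the command line, as above):

import sys
import time

import jieba

jieba.enable_parallel(4)

with open(sys.argv[1], "rb") as f:
    content = f.read()

t1 = time.time()
words = list(jieba.cut(content))
t2 = time.time()

print("speed %.0f bytes/second" % (len(content) / (t2 - t1)))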
Example #57
# -*- coding: utf-8 -*-
# Project : LM4paper
# Created by igor on 16-12-23

import time

import jieba

jieba.enable_parallel(10)

data_path= '/data/lawdata/raw/raw.document.txt'
t1 = time.time()
content = open(data_path,"rb").read()
words = " ".join(jieba.cut(content))
t2 = time.time()
tm_cost = t2 - t1
log_f = open("/data/lawdata/raw/raw.tokenized.document.txt",'wb')
log_f.write(words.encode('utf8'))

print('speed %s bytes/second' % (len(content)/tm_cost))
Example #58
#=======================================================
#load  train data
train_file = pd.read_table("train_sample.txt",names = ["ID","Value","String"])
print "load's file length:",len(train_file)
#load test data
test_file = pd.read_table("test.txt",names = ["ID","String"])
# print test_file.head()
# print np.array(test_file["String"])
#convert to np.ndarray
print train_file.tail()

#==========================================================
#jieba participle
#parallel
jieba.enable_parallel(1)  # argument: the number of parallel processes; one worked best here
jieba.set_dictionary("dict_for_jieba.txt")  # set the dictionary file

trainData = []
for s in train_file["String"]:
    trainData.append("/".join(jieba.cut(s)))

print len(trainData)
print trainData[0]


jieba.disable_parallel() #turn off processes

#============================================================
#TF-IDF :Extract features.
Example #59
                        endMonth = item
                        li[li.index(item)] = ''
                    elif re.search('\d', item) and not startMonth:
                        startMonth = item
                        li[li.index(item)] = ''
                    elif re.search('\d', item) and startMonth and not endMonth:
                        endMonth = item
                        li[li.index(item)] = ''
                print startYear, '年',startMonth, endYear, '年',endMonth,
                fliterList = [u'大学', u'学院',u'专业', u'本科', u'研究生']
                print ''.join(li).replace(u'大学',u'大学#').replace(u'学院',u'学院#').replace(u'专业',u'专业#') \
                    .replace(u'本科', u'本科#').replace(u'研究生',u'研究生#').replace(u'博士',u'博士#') \
                    .replace(u'校', u'校#').replace(u',', u'#')



def funx(arg, dire, files):
    for file in files:
        if file.endswith('.txt'):
            print os.path.join(dire, file)
            jieba.load_userdict(os.path.join(dire, file))

if __name__ == "__main__":

    dicPath = os.path.join(os.path.expanduser('~'), 'Desktop', u'分词字典')
    os.path.walk(dicPath, funx, ())

    jieba.enable_parallel(4)  # parallel segmentation
    BASEPATH = os.path.join(os.path.expanduser('~'), 'Desktop', 'test.html')
    praseHTML(BASEPATH)
Example #60
def cutword(data):
    jieba.enable_parallel(4)
    result=jieba.cut(data,cut_all=True)
    return result