Example #1
def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    # The subdirectory names are in fact the category names
    # print("Segmenting... Please wait.")

    # Get all the files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"
        seg_dir = seg_path + mydir + "/"
        if not os.path.exists(seg_dir):  # Create the segmented-output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # Get all texts of one category in the unsegmented corpus

        # Traverse and process all files in the category directory
        for file_path in file_list:
            fullname = class_path + file_path
            content = readfile(fullname)
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the segmented file
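The listings on this page call several helpers that are never shown: readfile, savefile, readbunchobj and writebunchobj, and they also assume imports such as os, jieba, pickle, Bunch and TfidfVectorizer. Below is a minimal sketch of how those helpers are commonly written (raw binary I/O plus pickle); the originals may differ in details such as error or encoding handling.

import pickle


def readfile(path):
    # Read a file's raw bytes; the segmentation code above operates on bytes.
    with open(path, "rb") as fp:
        return fp.read()


def savefile(path, content):
    # Write bytes back to disk.
    with open(path, "wb") as fp:
        fp.write(content)


def readbunchobj(path):
    # Load a pickled Bunch object.
    with open(path, "rb") as fp:
        return pickle.load(fp)


def writebunchobj(path, bunchobj):
    # Pickle a Bunch object to disk.
    with open(path, "wb") as fp:
        pickle.dump(bunchobj, fp)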
Example #2
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    stpwrdlst = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)
    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})

    if train_tfidf_path is not None:
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                     sublinear_tf=True,
                                     max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                     sublinear_tf=True,
                                     max_df=0.5)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_

    writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功!!!")
Example #3
def corpus_segment(corpus_path, seg_path):
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path

    print("Segmenting like crazy...")

    # Get all the files under each directory (category)
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # Build the path of the category subdirectory
        seg_dir = seg_path + mydir + "/"  # Build the corresponding output directory for segmented files
        if not os.path.exists(seg_dir):  # Create the segmented-output directory if it does not exist
            os.makedirs(seg_dir)
        file_list = os.listdir(class_path)  # Get all texts of one category in the unsegmented corpus
        for file_path in file_list:  # Traverse all files in the category directory
            fullname = class_path + file_path  # Build the full file path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # Read the file content
            content = content.replace('\r\n'.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'), ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(seg_dir + file_path, ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented corpus directory
    print("Chinese corpus segmentation finished!!!")
Example #4
def tf_idf(bunch):
    stpwrdlst = readfile("stopwords.txt").splitlines()
    tfidfspace = Bunch(target_name=bunch.target_name, tdm=[], vocabulary={})
    trainbunch = readbunchobj("train_word_bag/tfdifspace.dat")
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                 sublinear_tf=True,
                                 max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    print(bunch.contents)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    return tfidfspace.tdm
def bunch2Space(bunch):
    stopword_path = "train_word_bag/hlt_stop_words.txt"
    stpwrdlst = readfile(stopword_path).splitlines()
    tfidfspace = Bunch(target_name=bunch.target_name,
                       label=bunch.label,
                       filenames=bunch.filenames,
                       tdm=[],
                       vocabulary={})
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    tfidfspace.vocabulary = vectorizer.vocabulary_
    return tfidfspace
def corpus2Bunch(filename):
    wordbag_path = "upload_word_bag/" + filename + "_set.dat"  # Bunch storage path
    seg_path = "upload_corpus_seg/" + filename + ".txt"  # Path of the segmented corpus file
    # Create a Bunch instance
    bunch = Bunch(filenames=[seg_path], contents=[readfile(seg_path)])

    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text object!!!")
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    catelist = os.listdir(corpus_path)  # Get all subdirectories under corpus_path
    '''
    The subdirectory names are the category names.
    In train_corpus/it/21.txt, 'train_corpus/' is corpus_path and 'it' is one member of catelist.
    '''
    print("Segmenting like crazy...")
    # Get all the files under each directory (category)
    for mydir in catelist:
        '''
        Here mydir is the 'it' in train_corpus/it/21.txt (i.e. one category in catelist).
        '''
        class_path = corpus_path + mydir + "/"  # Build the category subdirectory path, e.g. train_corpus/it/
        seg_dir = seg_path + mydir + "/"  # Build the corresponding output directory, e.g. train_corpus_seg/it/

        if not os.path.exists(seg_dir):  # Create the segmented-output directory if it does not exist
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)  # Get all texts of one category in the unsegmented corpus
        '''
        For train_corpus/it/ containing
        21.txt,
        22.txt,
        23.txt
        ...
        file_list = ['21.txt', '22.txt', ...]
        '''
        for file_path in file_list:  # Traverse all files in the category directory
            fullname = class_path + file_path  # Build the full file path, e.g. train_corpus/it/21.txt
            content = readfile(fullname)  # Read the file content
            '''At this point content holds every character of the original text, including extra
            spaces, blank lines, carriage returns and so on. Next we strip out these irrelevant
            characters, leaving compact text separated only by punctuation.
            '''
            content = content.replace('\r\n'.encode('utf-8'),
                                      ''.encode('utf-8')).strip()  # Delete line breaks
            content = content.replace(' '.encode('utf-8'),
                                      ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
            content_seg = jieba.cut(content)  # Segment the file content
            savefile(
                seg_dir + file_path,
                ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented corpus directory

    print("Chinese corpus segmentation finished!!!")
def corpus2Bunchtest(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # Get all subdirectories under seg_path, i.e. the category info
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    # Get all the files under each directory
    for file_path in catelist:  # Traverse the files in the directory
        fullname = seg_path + file_path  # Build the full file path
        id = file_path[:-4]  # Strip the '.txt' extension to use as an id
        bunch.filenames.append(id)
        bunch.contents.append(readfile(fullname))  # Read the file content

    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text object!!!")
def vector_space(filename):
    stopword_path = "upload_word_bag/hlt_stop_words.txt"
    bunch_path = "upload_word_bag/" + filename + "_set.dat"
    space_path = "upload_word_bag/" + filename + "space.dat"
    train_tfidf_path = "train_word_bag/tfdifspace.dat"
    stpwrdlst = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)
    tfidfspace = Bunch(filenames=bunch.filenames, tdm=[], vocabulary={})
    trainbunch = readbunchobj(train_tfidf_path)
    tfidfspace.vocabulary = trainbunch.vocabulary
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst,
                                 sublinear_tf=True,
                                 max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    writebunchobj(space_path, tfidfspace)
    print("if-idf词向量空间实例创建成功!!!")
def corpus_segment(filename):
    # Segment the uploaded file
    corpus_path = "./upload_corpus/" + filename + ".txt"  # Path of the unsegmented corpus file
    seg_path = "./upload_corpus_seg/" + filename + ".txt"  # Path of the segmented corpus file
    '''
    corpus_path is the path of the unsegmented corpus
    seg_path is the path where the segmented corpus is stored
    '''
    content = readfile(corpus_path)  # Read the file content
    '''At this point content holds every character of the original text, including extra
    spaces, blank lines, carriage returns and so on. Next we strip out these irrelevant
    characters, leaving compact text separated only by punctuation.
    '''
    content = content.replace('\r\n'.encode('utf-8'),
                              ''.encode('utf-8')).strip()  # Delete line breaks
    content = content.replace(' '.encode('utf-8'),
                              ''.encode('utf-8')).strip()  # Delete empty lines and extra spaces
    content_seg = jieba.cut(content)  # Segment the file content
    savefile(seg_path,
             ' '.join(content_seg).encode('utf-8'))  # Save the processed file to the segmented corpus directory
Example #11
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)  # Extend the original list with the new list (catelist)

    # Get all the files under each category directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))

    # Store bunch in wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
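These examples also rely on Bunch, a dict-like container from scikit-learn that allows attribute access (older code imported it from sklearn.datasets.base; newer versions expose it as sklearn.utils.Bunch). A minimal sketch of how it behaves:

from sklearn.utils import Bunch

bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
bunch.label.append("art")   # attribute access...
print(bunch["label"])       # ...and key access refer to the same data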
Example #12
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # get the categories
    # create a bunch
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)

    # obtain the files under each category path
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # build the full path of the category directory
        file_list = os.listdir(class_path)  # get all files under class_path
        for file_path in file_list:  # visit all the files under the path
            fullname = class_path + file_path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # read the txt file

    # store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("The construction of the text object is finished!!!")
def corpus_segment(corpus_path, seg_path):
    '''
    corpus_path is the path of the corpus before segmentation
    seg_path is the path of the corpus after segmentation
    '''
    catelist = os.listdir(corpus_path)
    '''
    catelist records all the folder names in corpus_path, such as 'art', 'literature', 'education', ...
    '''
    print("jieba is working...")
    # obtain the files under each folder
    for mydir in catelist:
        class_path = corpus_path + mydir + "/"  # e.g. train_corpus/art/
        seg_dir = seg_path + mydir + "/"  # e.g. train_corpus_seg/art/

        if not os.path.exists(seg_dir):  # create train_corpus_seg if it does not exist
            os.makedirs(seg_dir)

        file_list = os.listdir(class_path)

        for file_path in file_list:  # visit every file in file_list
            fullname = class_path + file_path  # build the full path, e.g. train_corpus/art/21.txt
            content = readfile(fullname)  # read the .txt file
            '''delete the extra whitespace, empty strings and line breaks
            '''
            content = content.replace(
                '\r\n'.encode('utf-8'),
                ''.encode('utf-8')).strip()  # delete line breaks
            content = content.replace(
                ' '.encode('utf-8'),
                ''.encode('utf-8')).strip()  # delete whitespace
            content_seg = jieba.cut(content)  # segment the file content
            savefile(seg_dir + file_path,
                     ' '.join(content_seg).encode('utf-8'))
            # save the segmented file into seg_path

    print("The segmentation of the texts is finished!!!")
Example #14
def corpus2Bunch(wordbag_path, seg_path):
    catelist = os.listdir(seg_path)  # Get all subdirectories under seg_path, i.e. the category info
    # Create a Bunch instance
    bunch = Bunch(target_name=[], label=[], filenames=[], contents=[])
    bunch.target_name.extend(catelist)
    '''
    extend(addlist) is a Python list method: it extends the original list with
    the new list (addlist).
    '''
    # Get all the files under each directory
    for mydir in catelist:
        class_path = seg_path + mydir + "/"  # Build the category subdirectory path
        file_list = os.listdir(class_path)  # Get all files under class_path
        for file_path in file_list:  # Traverse the files in the category directory
            fullname = class_path + file_path  # Build the full file path
            bunch.label.append(mydir)
            bunch.filenames.append(fullname)
            bunch.contents.append(readfile(fullname))  # Read the file content
            '''append(element) is a Python list method: it adds element to the original list. Note the difference from extend().'''
    # Store the bunch at wordbag_path
    with open(wordbag_path, "wb") as file_obj:
        pickle.dump(bunch, file_obj)
    print("Finished building the text object!!!")
def vector_space(stopword_path, bunch_path, space_path, train_tfidf_path=None):
    # Read the stop words
    stpwrdlst = readfile(stopword_path).splitlines()
    bunch = readbunchobj(bunch_path)  # Load the segmented Bunch object
    # Build the tf-idf vector space object
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label, filenames=bunch.filenames, tdm=[],
                       vocabulary={})
    '''
    We already introduced Bunch in the earlier sections.
    target_name, label and filenames are members we defined ourselves and have covered before, so no more on them here.
    Now let's talk about tdm and vocabulary (also created by us):
    tdm stores the computed TF-IDF weight matrix.
    Remember, what the classifier needs later is exactly the training set's tdm and its labels, so this member is
    very important.
    vocabulary is the term index, for example
    vocabulary={"我": 0, "喜欢": 1, "相国大人": 2}, where the numbers correspond to the columns of the tdm matrix.
    We are building the vector space right now, so initially tdm and vocabulary are naturally empty.
    If you assign vocabulary some custom content at this step, you are doing it wrong.
    '''
    '''
    The two lines below are equivalent to:
    vectorizer = CountVectorizer()  # build a term-frequency (TF) counter; it can actually do more than that
    transformer = TfidfTransformer()  # build a TF-IDF calculator
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # vectorizer.fit_transform(corpus) takes the text corpus and produces the term-frequency matrix
    # feeding that matrix into transformer.fit_transform(...) produces the TF-IDF weight matrix
    As the names suggest:
    Tfidf-Transformer + Count-Vectorizer = Tfidf-Vectorizer
    The code below does both steps in one go (see the equivalence sketch after this example).
    Note that CountVectorizer() and TfidfVectorizer() both have a member called vocabulary_ (with a trailing underscore).
    Its meaning is the same as the vocabulary we defined when building the Bunch object, except that one is a fitted
    attribute and the other is an external input; in principle they should be kept consistent.
    Clearly, the vocabulary we defined when creating tfidfspace above should be assigned this vocabulary_.
    '''

    # Build the happy one-step gadget; more formally: initialize the vector space model with TfidfVectorizer.
    # It gives us the TF-IDF weight matrix as well as the vocabulary_ that defines the axes of our vector space.
    if train_tfidf_path is not None:
        # Load the training set's TF-IDF vector space
        trainbunch = readbunchobj(train_tfidf_path)
        tfidfspace.vocabulary = trainbunch.vocabulary
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                     vocabulary=trainbunch.vocabulary)
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)

    else:
        vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
        # Now tdm stores the tf-idf weight matrix
        tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
        tfidfspace.vocabulary = vectorizer.vocabulary_
    '''
    About the parameters, you only need to know these:
    stop_words:
    pass in the stop words; when we later obtain vocabulary_, the stop words will have been removed from the text.
    vocabulary:
    explained above, no further comment.
    sublinear_tf:
    use a sublinear strategy for the tf value; e.g. instead of the raw term frequency we use 1 + log(tf).
    smooth_idf:
    when computing idf = log(numerator/denominator), the denominator could be 0; smooth_idf uses
    log(numerator/(1+denominator)) instead. It is enabled by default, so there is no need to worry about it.
    norm:
    normalization; TF-IDF is TF*IDF, and TF may or may not be normalized. Normalization is generally used
    and is enabled by default.
    max_df:
    some terms have too high a document frequency (if a term appears in every document, is it still useful for
    distinguishing categories? Of course not), so we can set a threshold, e.g. a float 0.5 (range [0.0, 1.0]),
    meaning that a term appearing in more than 50% of the documents in the whole dataset is also treated as a
    temporary stop word.
    You can also pass an int, e.g. max_df=10, meaning that a term appearing in more than 10 documents in the
    whole dataset is also treated as a temporary stop word.
    min_df:
    the opposite of max_df: the lower a term's document frequency, the better it seems to distinguish texts,
    but if it is too low, e.g. only 1 out of 10000 documents contains the term, adding a dimension to the
    vector space just for that single document is not worth it.
    Of course, max_df and min_df are ignored when the vocabulary parameter is given.
    '''
    writebunchobj(space_path, tfidfspace)
    print("tf-idf vector space created successfully!!!")