Example #1
import os

import gensim
import jieba
import numpy as np
from sklearn import decomposition

from DingQiMin.DataProcessing.Normal import improveBayes  # project-local stop-word helper


def createVec(modelDir, srcDir, disDir):
    # turn each article under srcDir into a 28 * 40 vector (Word2Vec + PCA)
    # and save it, together with its one-hot label, as .npy files under disDir
    # remove any output left over from a previous run
    vecDir = os.path.join(disDir, 'vec')
    labelDir = os.path.join(disDir, 'label')
    for folder in (vecDir, labelDir):
        os.makedirs(folder, exist_ok=True)
        for name in os.listdir(folder):
            os.remove(os.path.join(folder, name))

    model = gensim.models.Word2Vec.load(modelDir)

    classList = os.listdir(srcDir)
    classnum = len(classList)  # number of classes == length of the one-hot label
    classth = 0  # running class index, used for the label
    for classname in classList:
        fileth = 0
        for fname in os.listdir(os.path.join(srcDir, classname)):
            # the article body is expected on the line right after a '$' marker
            with open(os.path.join(srcDir, classname, fname),
                      'r',
                      encoding='utf-8') as file:
                lines = file.readlines()
            if '$\n' in lines:
                pos = lines.index('$\n')
                sentence = lines[pos + 1]

                rubbishWords = improveBayes.someRubbishWords()
                # segment the text with jieba and drop the stop words
                # (filtering in the comprehension avoids mutating the list while iterating it)
                text = [word for word in jieba.cut(sentence)
                        if word not in rubbishWords]
                if len(text) >= 40:
                    tempVec = []
                    Label = [0] * classnum
                    Label[classth] = 1
                    for word in text:
                        if word in model.wv:
                            tempVec.append(model.wv[word])
                    # reduce the (n_words, vec_dim) matrix to 28 principal components
                    pca = decomposition.PCA(n_components=28)
                    pca.fit(tempVec)
                    # print(pca.components_.shape)  # word -> vec
                    # print(Label)  # one hot representation

                    # write the flattened vector and the one-hot label as .npy files
                    vec = pca.components_
                    try:
                        # components_ has shape (28, vec_dim); the reshape assumes 40-dim word vectors
                        vec = vec.reshape([28 * 40])
                        label = np.array(Label)
                        fileth += 1
                        vecfilename = os.path.join(
                            vecDir, '%.5d-of-%.5d.npy' % (classth, fileth))
                        labelfilename = os.path.join(
                            labelDir, '%.5d-of-%.5d.npy' % (classth, fileth))
                        np.save(vecfilename, vec)
                        np.save(labelfilename, label)
                    except ValueError:
                        # skip samples whose PCA output does not have the expected 28 * 40 size
                        print('ValueError')

        classth += 1
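
A minimal usage sketch; all three paths below are illustrative, and the Word2Vec model is assumed to hold 40-dimensional vectors so that the 28 * 40 reshape succeeds:

createVec(r'D:\Git\model', r'D:\news', r'D:\VecTest')

# load one sample back
vec = np.load(r'D:\VecTest\vec\00000-of-00001.npy')      # shape (1120,) == 28 * 40
label = np.load(r'D:\VecTest\label\00000-of-00001.npy')  # one-hot label of the first class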
Example #2
import os

import gensim
import jieba
import numpy as np
import tensorflow as tf
from sklearn import decomposition

from DingQiMin.DataProcessing.Normal import improveBayes


def createVec(dirname):
    #  the result will be written to D:\VecTest in the tfrecords format
    model = gensim.models.Word2Vec.load(r'D:\Git\model')

    classList = os.listdir(dirname)
    classnum = len(classList)  # number of classes == length of the one-hot label
    classth = 0  # running class index, used for the label
    for classname in classList:
        fileth = 0
        for fname in os.listdir(os.path.join(dirname, classname)):
            # the article body is expected on the line right after a '$' marker
            with open(os.path.join(dirname, classname, fname),
                      'r',
                      encoding='utf-8') as file:
                lines = file.readlines()
            if '$\n' in lines:
                pos = lines.index('$\n')
                sentence = lines[pos + 1]
                rubbishWords = improveBayes.someRubbishWords()
                # segment the text with jieba and drop the stop words
                text = [word for word in jieba.cut(sentence)
                        if word not in rubbishWords]
                if len(text) >= 40:
                    tempVec = []
                    Label = [0] * classnum
                    Label[classth] = 1
                    for word in text:
                        if word in model.wv:
                            tempVec.append(model.wv[word])
                    # reduce the (n_words, vec_dim) matrix to 28 principal components
                    pca = decomposition.PCA(n_components=28)
                    pca.fit(tempVec)
                    # print(pca.components_.shape)  # word -> vec
                    # print(Label)  # one hot representation

                    #  serialise the PCA matrix and the one-hot label into one tfrecords file
                    fileth += 1
                    vec_raw = pca.components_.tobytes()   # tostring() is deprecated
                    label_raw = np.array(Label).tobytes()
                    filename = 'D:\\VecTest\\vecData.tfrecords-%.5d-of-%.5d' % (
                        classth, fileth)
                    # TF 1.x API; in TF 2.x this is tf.io.TFRecordWriter
                    writer = tf.python_io.TFRecordWriter(filename)
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'vec': _bytes_feature(vec_raw),
                            # 'label':_int64_feature(np.argmax(Label))
                            'label': _bytes_feature(label_raw)
                        }))
                    writer.write(example.SerializeToString())
                    writer.close()
                    #  end of writing tfrecords
        classth += 1
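
The _bytes_feature helper used above is not part of the listing; a standard definition, following the usual TFRecord pattern, together with a quick way to read a record back through the same TF 1.x API, would look roughly like this:

def _bytes_feature(value):
    # wrap a raw byte string as a TFRecord bytes feature
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


# parse one record back (the file name is illustrative)
for record in tf.python_io.tf_record_iterator(r'D:\VecTest\vecData.tfrecords-00000-of-00001'):
    example = tf.train.Example.FromString(record)
    raw = example.features.feature['vec'].bytes_list.value[0]
    vec = np.frombuffer(raw, dtype=np.float32).reshape(28, -1)  # switch to float64 if PCA upcasts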
Example #3
import os

import jieba

from DingQiMin.DataProcessing.Normal import improveBayes


def afterCut(srcDir, disDir):
    # segment the body of every article under srcDir and filter out stop words
    classth = 0  # used for label
    for classname in os.listdir(srcDir):
        classnum = len(os.listdir(srcDir))
        fileth = 0
        for fname in os.listdir(os.path.join(srcDir, classname)):
            # the article body is expected on the line right after a '$' marker
            with open(os.path.join(srcDir, classname, fname), 'r', encoding='utf-8') as file:
                lines = file.readlines()
            if '$\n' in lines:
                pos = lines.index('$\n')
                sentence = lines[pos + 1]

                rubbishWords = improveBayes.someRubbishWords()
                # segment the text with jieba and drop the stop words
                text = [word for word in jieba.cut(sentence)
                        if word not in rubbishWords]

        classth += 1
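
improveBayes.someRubbishWords() is a project-local helper that is not shown in these listings; judging from how it is used, it returns a collection of stop words to discard after segmentation. For a quick standalone test one could substitute a small hand-made set (the words below are purely illustrative):

def someRubbishWords():
    # tiny illustrative stop-word set; the real helper returns the project's full list
    return {'的', '了', '是', '在', '和', '，', '。', ' ', '\n'}


# sample sentence: "This is a simple sentence for testing word segmentation."
text = [word for word in jieba.cut('这是一个用于测试分词的简单句子。')
        if word not in someRubbishWords()]
print(text)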