Example #1
def map_word_list(read_directory1, read_directory2, write_filename):

    # total number of files under read_directory1
    file_number = np.sum(
        [len(files) for root, dirs, files in os.walk(read_directory1)])

    result = []

    for i in range(file_number):

        word_list = []
        f = open(read_directory2 + '/' + str(i + 1) + '.txt')
        line = f.readline()
        while line:
            word_list.append(line.strip())
            line = f.readline()

        f.close()

        vsm = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        vsm = vsm.T
        for each in vsm:
            result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
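These examples assume import os and import numpy as np, plus two project-specific helpers whose definitions are not part of this listing: reflect_vsm_to_wordlist, which maps a VSM weight vector back onto its word list, and quick_write_list_to_text, which writes a list of strings to a file one per line. The sketch below is a plausible reconstruction inferred only from how the helpers are called here, not their actual implementation.

def reflect_vsm_to_wordlist(weights, word_list, threshold=1e-6):
    # Plausible reconstruction: return the words whose weight in the
    # VSM row exceeds the threshold, ordered by descending weight.
    pairs = [(w, word_list[k]) for k, w in enumerate(weights) if w > threshold]
    pairs.sort(reverse=True)
    return [word for _, word in pairs]

def quick_write_list_to_text(lines, write_filename):
    # Plausible reconstruction: write every list element on its own line.
    with open(write_filename, 'w') as f:
        f.write('\n'.join(str(line) for line in lines) + '\n')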
Example #2
def get_final_center(read_filename1, read_filename2, write_filename):

    result = []

    word_list = []
    get_text_to_single_list(word_list, read_filename2)

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
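Examples #2 and #3 also use get_text_to_single_list, another helper that is not shown; from its call site it appears to fill the list passed in with one stripped line per row of the input file. A minimal sketch under that assumption:

def get_text_to_single_list(target_list, read_filename):
    # Assumed behavior: append every non-empty, stripped line of the
    # file to the list supplied by the caller.
    with open(read_filename) as f:
        for line in f:
            line = line.strip()
            if line:
                target_list.append(line)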
Example #3
def get_final_center(read_filename1, read_filename2, write_filename):

    result = []

        
    word_list = []
    get_text_to_single_list(word_list, read_filename2)
        
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))
    
    quick_write_list_to_text(result, write_filename)
Example #4
def map_word_list(read_filename1, read_filename2, write_filename):
    
    word_list = []
    # open in text mode so the comma-separated fields are read as str, not bytes
    f = open(read_filename2)
    line = f.readline()
    while line:
        word_list.append(line.strip().split(',')[0])
        line = f.readline()
    
    f.close()
    
    word_result = []
    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        word_result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))
    
    quick_write_list_to_text(word_result, write_filename)
Example #5
def sample_real_center(read_filename1, read_filename2, write_filename):

    result = []

    word_list = []
    f = open(read_filename2)
    line = f.readline()
    while line:
        word_list.append(line.strip().split()[0])
        line = f.readline()
    f.close()

    # keep only the first 1000 words of the vocabulary
    word_list = word_list[0:1000]

    vsm = np.loadtxt(read_filename1)
    vsm = vsm.T
    for each in vsm:
        result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))

    quick_write_list_to_text(result, write_filename)
Example #6
def map_word_list(read_directory1, read_filename2, write_filename):
    
    # total number of files under read_directory1
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    word_list = []
    # open in text mode so the words are read as str rather than bytes
    f = open(read_filename2)
    line = f.readline()
    while line:
        word_list.append(line.strip())
        line = f.readline()
    
    f.close()
    
    result = []
    for i in range(file_number):
        vsm = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        vsm = vsm.T
        for each in vsm:
            result.append(" ".join(reflect_vsm_to_wordlist(each, word_list)))
    
    quick_write_list_to_text(result, write_filename)
Example #7
def merge_all_center(read_directory1, read_directory2, start_batch, end_batch):
    # total number of files under read_directory1 (not used below)
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    new_word_list = []
    
    for i in range(start_batch, end_batch):
        word_list = []
        # open in text mode so the words are read as str rather than bytes
        f1 = open(read_directory2 + '/' + str(i) + '.txt')
        line = f1.readline()
        while line:
            word_list.append(line.split()[0])
            line = f1.readline()
        
        f1.close()
        
        center = np.loadtxt(read_directory1 + '/' + str(i) + '.txt')
        # heuristic from the original code: a single center is loaded by
        # np.loadtxt as a 1-D vector whose length (the vocabulary size) is
        # >= 200, so wrap it to keep the loop below iterating over rows
        if len(center) >= 200:
            center = np.array([center])
        
        for each in center:
            word_result = reflect_vsm_to_wordlist(each, word_list)
            for word in set(word_result).difference(new_word_list):
                new_word_list.append(word)
    
    result = []
    all_count = 0
    for i in range(start_batch, end_batch):
        word_list = []
        f1 = open(read_directory2 + '/' + str(i) + '.txt')
        line = f1.readline()
        while line:
            word_list.append(line.split()[0])
            line = f1.readline()
        
        f1.close()
        
        center = np.loadtxt(read_directory1 + '/' + str(i) + '.txt')
        if len(center) >= 200:
            center = np.array([center])
        
        this_result = []
        for each in center:
            tf_dict = {}
            for k in range(len(each)):
                if each[k] > 0.000001:
                    tf_dict[word_list[k]] = each[k]
                
            tf_dict2 = {}
            for each1 in new_word_list:
                if each1 in tf_dict.keys():
                    tf_dict2[each1] = tf_dict[each1]
                else:
                    tf_dict2[each1] = 0
            
            this_line = np.zeros(len(new_word_list))
            count = 0
            for key in new_word_list:
                this_line[count] = tf_dict2[key]
                count += 1
            
            # keep the numeric row for this center (writing is left to the caller)
            this_result.append(this_line)
            all_count += 1
        
        result.append(this_result)
        #quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
    
    return result, new_word_list, all_count
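Unlike the other examples, this batched variant returns its result instead of writing it, so the caller is expected to flatten the rows and write them out itself. A hypothetical usage sketch; the directory names, batch range, and output paths below are placeholders, not values from the source:

# hypothetical call; 'centers' and 'word_lists' are placeholder directories
rows, vocabulary, n_centers = merge_all_center('centers', 'word_lists', 1, 6)

lines = []
for batch_rows in rows:
    for row in batch_rows:
        # join each numeric row into a space-separated string for writing
        lines.append(" ".join(str(v) for v in row))

quick_write_list_to_text(vocabulary, 'merged_word_list.txt')
quick_write_list_to_text(lines, 'merged_centers.txt')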
Example #8
def merge_all_center(read_directory1, read_directory2, read_filename, write_directory, write_filename):
    # total number of files under read_directory1
    file_number = np.sum([len(files) for root, dirs, files in os.walk(read_directory1)])
    
    new_word_list = []
    
    batch_index = []
    f = open(read_filename)
    line = f.readline()
    while line:
        batch_index.append(line.split()[0])
        line = f.readline()
        
    f.close()
    
    
    for i in range(file_number):
        word_list = []
        # open in text mode so the words are read as str rather than bytes
        f1 = open(read_directory2 + '/' + batch_index[i] + '.txt')
        line = f1.readline()
        while line:
            word_list.append(line.split()[0])
            line = f1.readline()
        
        f1.close()
        
        
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        center = center.T
        
        for each in center:
            word_result = reflect_vsm_to_wordlist(each, word_list)
            for word in set(word_result).difference(new_word_list):
                new_word_list.append(word)
    
    
    for i in range(file_number):
        word_list = []
        f1 = open(read_directory2 + '/' + batch_index[i] + '.txt')
        line = f1.readline()
        while line:
            word_list.append(line.split()[0])
            line = f1.readline()
        
        f1.close()
        
        center = np.loadtxt(read_directory1 + '/' + str(i + 1) + '.txt')
        center = center.T
        
        result = []
        for each in center:
            tf_dict = {}
            for k in range(len(each)):
                if each[k] > 0.0001:
                    tf_dict[word_list[k]] = each[k]
                
            tf_dict2 = {}
            for each1 in new_word_list:
                if each1 in tf_dict.keys():
                    tf_dict2[each1] = tf_dict[each1]
                else:
                    tf_dict2[each1] = 0
            
            this_line = []
            for key in new_word_list:
                this_line.append(str(tf_dict2[key]))
            
            # join each row into a single string so it can be written out
            result.append(" ".join(this_line))
        
        quick_write_list_to_text(result, write_directory + '/' + str(i + 1) + '.txt')
    
    quick_write_list_to_text(new_word_list, write_filename)