def pre_process():
    """Write a stemmed, stop-word-filtered copy of every file under ``path1``.

    ``path1`` is expected to contain one level of sub-directories, each
    holding files. For every ``path1/<dir>/<file>`` a file
    ``path2/<dir>/<file>`` is written (the sub-directory is created when
    missing; ``path2`` itself must already exist because ``mkdir`` does not
    create parents).

    Each source line is read as bytes and decoded as UTF-8 with undecodable
    bytes dropped. Every non-ASCII-letter character is replaced by a space,
    the line is split on whitespace, and each token is lower-cased and
    stemmed. Stems that are not in the English stop-word list are written to
    the target file, one per line.

    NOTE(review): the stop-word test is applied to the *stemmed* token, so a
    stop word whose stem differs from the word itself is not filtered. This
    is kept as-is so the output stays unchanged.
    """
    # Loop-invariant: load the stop-word list and build the stemmer once
    # instead of once per line / once per word. A set gives O(1) membership.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    for category in listdir(path1):
        source_root = path1 + '/' + category
        target_root = path2 + '/' + category
        if not path.exists(target_root):
            mkdir(target_root)

        for file_name in listdir(source_root):
            source_file = source_root + '/' + file_name
            target_file = target_root + '/' + file_name
            # Context managers close both handles even if processing fails;
            # the target is opened first, as before.
            with open(target_file, 'w') as out, open(source_file, 'rb') as src:
                for raw_line in src:
                    line = raw_line.decode("utf-8", errors='ignore')
                    for token in re.sub("[^a-zA-Z]", " ", line).split():
                        stem = stemmer.stem(token.lower())
                        if stem not in stop_words:
                            out.write('%s\n' % stem)
# Example no. 2
# 0
def read_file():
    """Write a stemmed, stop-word-filtered copy of every file under ``srcPath``.

    Prints ``Read data:`` and then walks one level of sub-directories below
    ``srcPath``. For every ``srcPath/<dir>/<file>`` a file
    ``toPath/<dir>/<file>`` is written (the sub-directory is created when
    missing; ``toPath`` itself must already exist because ``mkdir`` does not
    create parents).

    Each source line is read as bytes and decoded as UTF-8 with undecodable
    bytes dropped. Every non-ASCII-letter character is replaced by a space,
    the line is split on whitespace, and each token is lower-cased and
    stemmed. Stems that are not in the English stop-word list are written to
    the target file, one per line.

    NOTE(review): the stop-word test is applied to the *stemmed* token, so a
    stop word whose stem differs from the word itself is not filtered. This
    is kept as-is so the output stays unchanged.
    """
    print('Read data:')

    # Loop-invariant: load the stop-word list and build the stemmer once
    # instead of once per line / once per word. A set gives O(1) membership.
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    for folder in os.listdir(srcPath):
        target_folder = toPath + '/' + folder
        if not path.exists(target_folder):
            mkdir(target_folder)

        source_folder = srcPath + '/' + folder + '/'
        for file_name in os.listdir(source_folder):
            source_file = source_folder + file_name
            target_file = target_folder + '/' + file_name
            # Context managers close both handles even if processing fails;
            # the target is opened first, as before.
            with open(target_file, 'w') as out, open(source_file, 'rb') as src:
                for raw_line in src:
                    line = raw_line.decode("utf-8", errors='ignore')
                    for token in re.sub("[^a-zA-Z]", " ", line).split():
                        stem = stemmer.stem(token.lower())
                        if stem not in stop_words:
                            out.write('%s\n' % stem)