def get_stop_words(self):
    """Load the stop-word list from ``stopwords_list.csv`` in ``self.data_path``.

    Returns:
        list: values of the CSV's 'Stopwords' column, or an empty list
        when the file does not exist.
    """
    print(' -> Getting stop word list...')
    csv_path = self.data_path + 'stopwords_list.csv'
    # Guard clause: bail out early when there is nothing to load.
    if not os.path.isfile(csv_path):
        print(' -> Stop Words File is not found')
        return []
    print(' -> Stop Words File is found')
    frame = DataManager().load_csv(file=csv_path, encoding='utf-8')
    return frame['Stopwords'].tolist()
def get_including_words(self):
    """Load the including-word list from ``including_words_list.csv`` in ``self.data_path``.

    Returns:
        list: values of the CSV's 'Includingwords' column, or an empty
        list when the file does not exist. The list is echoed to stdout
        before being returned.
    """
    print(' -> Getting including word list...')
    csv_path = self.data_path + 'including_words_list.csv'
    if os.path.isfile(csv_path):
        print(' -> Including Words File is found')
        frame = DataManager().load_csv(file=csv_path, encoding='utf-8')
        words = frame['Includingwords'].tolist()
    else:
        print(' -> Including Words File is not found')
        words = []
    print(words)
    return words
def get_including_words(self, path):
    """Load the including-word list from ``including_words_list.csv`` inside *path*.

    Args:
        path: directory containing the CSV (no trailing separator expected).

    Returns:
        list: values of the 'Includingwords' column, or an empty list when
        the file is absent. The list is echoed to stdout before returning.
    """
    file = 'including_words_list.csv'
    including_words_list = []
    full_path = path + '/' + file
    if os.path.isfile(full_path):
        print(' ..Including Words File is found..')
        dm = DataManager()
        # BUG FIX: previously loaded `path + 'including_words_list.csv'`
        # (missing the '/' separator), which was not the path that the
        # os.path.isfile check above verified.
        df = dm.load_csv(file=full_path, encoding='utf-8')
        including_words_list = df['Includingwords'].tolist()
    else:
        print(' ..Including Words File is not found..')
    print(including_words_list)
    return including_words_list
def get_stop_words(self, path):
    """Load the stop-word list from ``stopwords_list.csv`` inside *path*.

    Args:
        path: directory containing the CSV (no trailing separator expected).

    Returns:
        list: values of the 'Stopwords' column, or an empty list when the
        file is absent.
    """
    file = 'stopwords_list.csv'
    stop_words_list = []
    full_path = path + '/' + file
    if os.path.isfile(full_path):
        print(' ..Stop Words File is found..')
        dm = DataManager()
        # BUG FIX: previously loaded the hard-coded path
        # 'data/doc2vec_test_data/0702/stopwords_list.csv' regardless of
        # the `path` argument; now loads the same file that was checked.
        df = dm.load_csv(file=full_path, encoding='utf-8')
        stop_words_list = df['Stopwords'].tolist()
    else:
        print(' ..Stop Words File is not found..')
    return stop_words_list
def pre_prosseccing(self):
    """Run the full text-preprocessing pipeline on the configured CSV.

    Loads ``<data_path><data_file_name>.csv``, pickles the raw frame and
    its 'job_description' column, then runs cleansing -> tokenization ->
    stop-word removal -> lemmatization -> bigram building -> word
    filtering, pickles the filtered corpus, and returns it.

    Returns:
        tuple: (data['id'], list of filtered token lists, one per document).
    """
    dm = DataManager()
    data = dm.load_csv(file=self.data_path + self.data_file_name + '.csv',
                       encoding='utf-8')

    # Snapshot the raw frame and the raw descriptions for later reuse.
    with open(self.data_path + self.data_file_name + '.documents', 'wb') as f:
        pickle.dump(data, f)
    with open(self.data_path + self.data_file_name + '_tm.documents', 'wb') as f:
        pickle.dump(data['job_description'], f)

    # Text pipeline; each stage feeds the next in this exact order.
    sentences = self.data_text_cleansing(data)
    data_words = list(self.sent_to_words(sentences))
    data_words_nostops = self.remove_stopwords(data_words)
    data_lemmatized = self.lematization(data_words_nostops)
    bigram = self.make_ngram(data_lemmatized, n=2)

    data_lemmatized_filter = self.word_filtering(bigram)
    # Debug dump of every filtered document (kept from the original).
    for idx, doc in enumerate(data_lemmatized_filter):
        print(f'[{idx}] : {doc}')

    with open(self.data_path + self.data_file_name + '.corpus', 'wb') as f:
        pickle.dump(data_lemmatized_filter, f)
    print('=== end preprocessing ===')
    return data['id'], data_lemmatized_filter
def pre_prosseccing(self):
    """Preprocess job titles from a fixed input CSV and write the result.

    Cleans and tokenizes the 'job Title' column, removes stop words,
    re-joins each title's tokens into a single whitespace-separated
    string, and saves id / Job_Title / posting_id columns to
    ``job_title_adj_extend.csv``.

    NOTE(review): input and output paths are hard-coded — consider
    deriving them from self.data_path like the sibling variant; confirm
    with callers before changing.
    """
    dm = DataManager()
    data = dm.load_csv(
        file='analysis/doc2vec_test_data/0702/job_title.csv',
        encoding='utf-8')
    print(data['job Title'].size)

    sentences = self.data_text_cleansing(data, 'job Title')
    data_words = list(self.sent_to_words(sentences))
    data_words_nostops = self.remove_stopwords(data_words)

    # Re-join each tokenized title into one plain string per row.
    data_words_nostops = [' '.join(tokens) for tokens in data_words_nostops]
    print(data_words_nostops)

    df = pd.DataFrame({
        'id': data['id'],
        'Job_Title': data_words_nostops,
        'posting_id': data['posting_id'],
    })
    df.to_csv('analysis/doc2vec_test_data/0702/job_title_adj_extend.csv',
              mode='w', encoding='utf-8')