Example #1
 def get_stop_words(self):
     print('   -> Getting stop word list...')
     file = 'stopwords_list.csv'
     stop_words_list = []
     if os.path.isfile(self.data_path + file):
         print('     -> Stop Words File is found')
         dm = DataManager()
         df = dm.load_csv(file=self.data_path + file, encoding='utf-8')
         stop_words_list = df['Stopwords'].tolist()
     else:
         print('     -> Stop Words File is not found')
     return stop_words_list
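Every example here goes through DataManager.load_csv, whose implementation is not shown. A minimal sketch of the assumed interface, inferred from the call sites (a thin wrapper around pandas.read_csv returning a DataFrame whose columns support .tolist()):

    import pandas as pd

    class DataManager:
        def load_csv(self, file, encoding='utf-8'):
            # Assumed behavior: read the CSV into a DataFrame so callers
            # can do df['Stopwords'].tolist() as in the example above.
            return pd.read_csv(file, encoding=encoding)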
Example #2
 def get_including_words(self):
     print('    -> Getting including word list...')
     file = 'including_words_list.csv'
     including_words_list = []
     if os.path.isfile(self.data_path + file):
         print('     -> Including Words File is found')
         dm = DataManager()
         df = dm.load_csv(file=self.data_path+file, encoding='utf-8')
         including_words_list = df['Includingwords'].tolist()
     else:
         print('     -> Including Words File is not found')
     print(including_words_list)
     return including_words_list
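These loaders only require a CSV with the expected header column: 'Stopwords' for Example #1 and 'Includingwords' here. A hypothetical including_words_list.csv might look like this (the values below are made up for illustration):

    Includingwords
    python
    backend
    engineer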
Example #3
 def get_including_words(self, path):
     file = 'including_words_list.csv'
     including_words_list = []
     if os.path.isfile(path + '/' + file):
         print('  ..Including Words File is found..')
         dm = DataManager()
         df = dm.load_csv(file=path + '/' + file, encoding='utf-8')
         including_words_list = df['Includingwords'].tolist()
     else:
         print('  ..Including Words File is not found..')
     print(including_words_list)
     return including_words_list
Example #4
 def get_stop_words(self, path):
     file = 'stopwords_list.csv'
     stop_words_list = []
     if os.path.isfile(path + '/' + file):
         print('  ..Stop Words File is found..')
         dm = DataManager()
         df = dm.load_csv(file=path + '/' + file, encoding='utf-8')
         stop_words_list = df['Stopwords'].tolist()
     else:
         print('  ..Stop Words File is not found..')
     return stop_words_list
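Examples #3 and #4 build paths with string concatenation (path + '/' + file), which breaks if path already ends in a separator or on non-POSIX systems; os.path.join is the idiomatic alternative. A sketch of Example #4 rewritten that way, keeping the assumed DataManager interface:

    import os

    def get_stop_words(self, path):
        # os.path.join handles separators portably and keeps the
        # existence check and the load pointing at the same file.
        file_path = os.path.join(path, 'stopwords_list.csv')
        stop_words_list = []
        if os.path.isfile(file_path):
            df = DataManager().load_csv(file=file_path, encoding='utf-8')
            stop_words_list = df['Stopwords'].tolist()
        return stop_words_list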
Example #5
    def pre_processing(self):
        dm = DataManager()
        data = dm.load_csv(file=self.data_path + self.data_file_name + '.csv',
                           encoding='utf-8')

        with open(self.data_path + self.data_file_name + '.documents', 'wb') as f:
            pickle.dump(data, f)
        with open(self.data_path + self.data_file_name + '_tm.documents', 'wb') as f:
            pickle.dump(data['job_description'], f)
        # # Fetch posting_id from the revised job_title
        # posting_ids = data['posting_id']
        # posting_list = posting_ids.to_list()
        #
        # # Build the description_data set keyed by posting_id
        # des_data = [data['job_description'][id] for id in posting_ids]
        # title_data = [data['job_title'][id] for id in posting_ids]
        # id_list = [i for i in range(len(posting_list))]
        # df = pd.DataFrame({'id': posting_list, 'job_title': title_data, 'job_description': des_data, 'posting_id':posting_list})
        # df.to_csv('data/doc2vec_test_data/0702/merge_0629_adj.csv', mode='w', encoding='utf-8')

        # Load the revised description set and run preprocessing
        # data = dm.load_csv(file='data/doc2vec_test_data/0702/merge_0629_adj.csv', encoding='utf-8')
        sentences = self.data_text_cleansing(data)
        data_words = list(self.sent_to_words(sentences))
        data_words_nostops = self.remove_stopwords(data_words)
        data_lemmatized = self.lemmatization(data_words_nostops)

        bigram = self.make_ngram(data_lemmatized, n=2)

        # bigram = self.make_bigram(data_words_nostops)
        # data_lemmatized = self.lemmatization(bigram)
        # for i in range(len(bigram)):
        #     print(f'[{i}] : {bigram[i]}')

        data_lemmatized_filter = self.word_filtering(bigram)
        for i in range(len(data_lemmatized_filter)):
            print(f'[{i}] : {data_lemmatized_filter[i]}')
        # uniquewords = self.make_unique_words(data_lemmatized)
        with open(self.data_path + self.data_file_name+'.corpus', 'wb') as f:
            pickle.dump(data_lemmatized_filter, f)

        print('=== end preprocessing ===')
        return data['id'], data_lemmatized_filter
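pre_processing returns the document ids alongside the filtered, lemmatized token lists, and the file paths throughout these examples point at doc2vec test data. One plausible consumer (an assumption; the training code is not part of these examples) is gensim's Doc2Vec:

    from gensim.models.doc2vec import Doc2Vec, TaggedDocument

    # 'preprocessor' is a hypothetical instance of the class above.
    ids, corpus = preprocessor.pre_processing()
    documents = [TaggedDocument(words=tokens, tags=[str(doc_id)])
                 for doc_id, tokens in zip(ids, corpus)]
    model = Doc2Vec(documents, vector_size=100, window=5,
                    min_count=2, workers=4, epochs=20)
    model.save('doc2vec.model')  # hypothetical output path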
Example #6
    def pre_processing(self):
        dm = DataManager()
        data = dm.load_csv(
            file='analysis/doc2vec_test_data/0702/job_title.csv',
            encoding='utf-8')
        print(data['job Title'].size)
        sentences = self.data_text_cleansing(data, 'job Title')
        data_words = list(self.sent_to_words(sentences))
        data_words_nostops = self.remove_stopwords(data_words)
        # bigram = self.make_bigram(data_words_nostops)
        # data_lemmatized = self.lematization(data_words_nostops)
        # data_words_nostops_sorted = [sorted(i) for i in data_words_nostops]
        data_words_nostops = [' '.join(i) for i in data_words_nostops]
        print(data_words_nostops)
        df = pd.DataFrame({
            'id': data['id'],
            'Job_Title': data_words_nostops,
            'posting_id': data['posting_id']
        })
        # df = df.append(data_words_nostops)

        df.to_csv('analysis/doc2vec_test_data/0702/job_title_adj_extend.csv',
                  mode='w',
                  encoding='utf-8')
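Helper methods such as sent_to_words and remove_stopwords are called in Examples #5 and #6 but never shown. A minimal sketch of how such helpers are commonly written on top of gensim (an assumption about this codebase, not its actual implementation):

    import gensim

    def sent_to_words(self, sentences):
        for sentence in sentences:
            # Tokenize and lowercase; deacc=True also strips punctuation.
            yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

    def remove_stopwords(self, texts):
        # Reuses get_stop_words from Example #1 to drop unwanted tokens.
        stop_words = set(self.get_stop_words())
        return [[word for word in doc if word not in stop_words]
                for doc in texts]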