def __init__(self, fname_parties, fname_ngram, ngram_min_freq=3, train_ratio=0.8, train=True):
    """Load labeled clean-text samples from a parties CSV.

    Builds ``self.data`` (clean article texts) and ``self.label``
    (0 for rows whose press is '더불어민주당', 1 otherwise), then
    constructs the bigram counter-vector helper from *fname_ngram*.

    NOTE(review): ``train_ratio`` and ``train`` are currently unused —
    the TODO below (shuffle + train/test split) is not implemented yet.
    """
    # TODO
    # 1. shuffle
    # 2. divide as train and test
    rows = csv2dictlist(fname_parties)
    self.data = []
    self.label = []
    for row in rows:
        text = row['clean_text']
        press = row['press']
        if text == '':
            # skip rows that carry no usable text
            continue
        self.data.append(text)
        self.label.append(0 if press == '더불어민주당' else 1)
    self.bigram_cvec = BigramCounterVector(fname_ngram, min_freq=ngram_min_freq)
def __init__(self, fname, min_freq):
    """Read a ngram file and load ngram words that are over min_freq.

    Keeps only bigrams whose 'frequency' column is at least *min_freq*;
    each kept bigram is stored as a tuple of its two words. The counter
    vector has one slot per kept bigram plus one extra slot (presumably
    for out-of-vocabulary bigrams — confirm with callers).
    """
    # todo: Error check if fname has csv ext or not
    self.fname = fname
    self.min_freq = min_freq
    self.bigram_list = csv2dictlist(fname=fname)
    self.cvec_words = [
        tuple(row['2gram word'].split())
        for row in self.bigram_list
        if int(row['frequency']) >= min_freq
    ]
    self.cvec = [0] * (len(self.cvec_words) + 1)
def save_clean_text(press: str, dirname: str, out_dirname: str) -> None:
    """Clean every article text of one press' CSV and save a *_clean copy.

    Reads the sewolho CSV for *press* under *dirname*, adds a
    'clean_text' field to each row, and writes the result to
    *out_dirname* with the filename suffix changed to '_clean.csv'.

    Args:
        press: press name used to locate the input CSV.
        dirname: directory containing the input CSV.
        out_dirname: directory to write the cleaned CSV to (created if
            missing).
    """
    # locate and load the input file
    fname = get_fname_sewolho(press, dirname)
    total_news = csv2dictlist(fname)

    # robustness: an empty CSV would crash on total_news[0].keys() below
    if not total_news:
        print('no rows found in [' + fname + ']; nothing saved')
        return

    # add the cleaned text column
    for news in total_news:
        news['clean_text'] = clean_text(news['text'])

    # determine output file name; basename is portable, unlike split('/')
    out_fname = os.path.basename(fname).replace('.csv', '_clean.csv')
    out_fname = os.path.join(out_dirname, out_fname)

    # exist_ok avoids the isdir/makedirs race of the check-then-create idiom
    os.makedirs(out_dirname, exist_ok=True)

    # save to out_fname (utf-8-sig so Excel detects the encoding)
    with open(out_fname, 'w', newline='', encoding='utf-8-sig') as csvoutput:
        writer = csv.DictWriter(csvoutput, fieldnames=total_news[0].keys())
        writer.writeheader()
        writer.writerows(total_news)
    print('saved to [' + out_fname + ']')
def get_texts(fname: str):
    """Return the 'text' column of every row in the CSV at *fname*."""
    return [row['text'] for row in csv2dictlist(fname)]
def get_cleantexts(fname: str):
    """Return the 'clean_text' column of every row in the CSV at *fname*."""
    return [row['clean_text'] for row in csv2dictlist(fname)]
from tools import csv2dictlist, dictlist2csv
import random

# Fraction of the shuffled rows that go into the training split.
TRAIN_RATIO = 0.8


def main():
    """Shuffle the merged parties CSV and split it into train/test files."""
    # open data file and convert to list of dict
    all_dict = csv2dictlist('data/train/parties_merged.csv')

    # shuffle data randomly
    # NOTE(review): seed fixed so the train/test split is reproducible
    # across runs; remove the seed call to re-randomize each run.
    random.seed(0)
    random.shuffle(all_dict)

    # calculate number of train and test rows
    total = len(all_dict)
    num_train = int(total * TRAIN_RATIO)
    num_test = total - num_train
    print('total:', total)
    print('num_train:', num_train)
    print('num_test:', num_test)

    # split data into two and save as csv files
    dictlist2csv(dict_list=all_dict[:num_train], out_name='train.csv')
    dictlist2csv(dict_list=all_dict[num_train:], out_name='test.csv')


# guard so importing this module does not trigger the split side effects
if __name__ == '__main__':
    main()