def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    # Playlist titles stored as CSV; pass delimiter='\t' for TSV input.
    df = pd.read_csv(self.file_path)

    datasets = []
    for _, row in df.iterrows():
        # Only the title is used; the genre and weight columns are ignored.
        datasets.append([row["playlist"]])
    print("tokenizer ending")

    for line in datasets:
        if not line[0]:
            break
        if len(line[0]) < 3:
            continue
        tokenized_line = tokenizer(line[0][:-1])
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        # Keep only titles between 7 and 25 tokens long (BOS/EOS included).
        if len(index_of_words) > 25:
            continue
        elif len(index_of_words) < 7:
            continue
        self.data.append(index_of_words)
    print(np.shape(self.data))
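# The classes in this file only show __init__; the rest of the
# torch.utils.data.Dataset interface is implied. Below is a minimal sketch of
# the __len__/__getitem__ pair such a class would need. The class name and the
# `data` parameter are placeholders, not names taken from this repo.
import torch
from torch.utils.data import Dataset

class TokenIdDataset(Dataset):
    def __init__(self, data):
        # `data` is a list of [BOS, ..., EOS] token-id lists, as built above.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # One sequence as a LongTensor, ready for an embedding layer.
        return torch.tensor(self.data[index], dtype=torch.long)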
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    # Lyrics stored as a tab-separated file.
    df = pd.read_csv(self.file_path, delimiter='\t')
    print("data read: --------OK-------")

    datasets = []
    for _, row in df.iterrows():
        # Only the lyrics are used; the genre and score columns are ignored.
        datasets.append([row["lyrics"]])
    print("tokenizer ending")
    print(len(datasets))

    for line in datasets:
        if not line[0]:
            break
        if len(line[0]) < 3:
            continue
        tokenized_line = tokenizer(line[0][:-1])
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        # Drop lyrics longer than the 1024-token context window.
        if len(index_of_words) > 1024:
            continue
        self.data.append(index_of_words)
    print(np.shape(self.data))
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    file = open(self.file_path, 'r', encoding='utf-8')
    lines = file.read()
    # Documents are delimited by the GPT-2 end-of-text marker.
    lines = lines.split("<|endoftext|>")
    lines = [tokenizer(line) for line in lines]

    # Split long documents into chunks of at most 1020 tokens, leaving room
    # for the BOS/EOS tokens added below.
    datasets = []
    for line in lines:
        while True:
            if len(line) > 1020:
                datasets.append(line[:1020])
                line = line[1020:]
            else:
                datasets.append(line)
                break

    for line in datasets:
        if not line:
            break
        if len(line) < 3:
            continue
        tokenized_line = line[:-1]  # already tokenized; same as before from here on
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        self.data.append(index_of_words)
    print(np.shape(self.data))
    file.close()
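# A small, self-contained check of the chunking loop above, using a toy list
# of integers as a stand-in for tokenizer output: a 2500-token document should
# yield chunks of 1020, 1020 and 460 tokens, all of which stay under the
# 1024-token context limit once BOS and EOS are added.
tokens = list(range(2500))
chunks = []
while True:
    if len(tokens) > 1020:
        chunks.append(tokens[:1020])
        tokens = tokens[1020:]
    else:
        chunks.append(tokens)
        break
print([len(c) for c in chunks])  # [1020, 1020, 460]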
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.sentence_list = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    df = pd.read_csv(self.file_path)
    for line in df['script']:
        tokenized_line = tokenizer(str(line))
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        self.sentence_list.append(index_of_words)
    print("sentence list length :", len(self.sentence_list))
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    file = open(self.file_path, 'r', encoding='utf-8')
    while True:
        line = file.readline()
        if not line:
            break
        # Strip the trailing newline, tokenize, and wrap with BOS/EOS.
        tokenized_line = tokenizer(line[:-1])
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        self.data.append(index_of_words)
    file.close()
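# Since self.data holds variable-length token-id lists, batching them with a
# DataLoader needs a padding collate function. A minimal sketch, assuming the
# pad id can be looked up as vocab[vocab.padding_token] (gluonnlp-style vocab);
# adjust to however the vocab actually exposes its padding token.
import torch
from torch.nn.utils.rnn import pad_sequence

def make_collate_fn(pad_id):
    """Pad variable-length token-id sequences into one LongTensor batch."""
    def collate(batch):
        seqs = [torch.as_tensor(ids, dtype=torch.long) for ids in batch]
        return pad_sequence(seqs, batch_first=True, padding_value=pad_id)
    return collate

# Hypothetical usage with any of the dataset classes above:
# loader = torch.utils.data.DataLoader(
#     dataset, batch_size=8, shuffle=True,
#     collate_fn=make_collate_fn(vocab[vocab.padding_token]))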
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    file = open(self.file_path, 'r', encoding='utf-8')
    lines = file.read()
    # Split into documents on the end-of-text marker, then into lines.
    lines = lines.split("<|endoftext|>")
    lines = [line.split("\n") for line in lines]

    # Regroup each document into blocks of 20 lines.
    datasets = []
    print("tokenizer start")
    for line in lines:
        now = ""
        for i, l in enumerate(line):
            if i % 20 == 0 and i != 0:
                datasets.append(now)
                now = ""
            now = now + "\n" + l
            if i == len(line) - 1:
                datasets.append(now)
    print("tokenizer ending")

    for line in datasets:
        if not line:
            break
        if len(line) < 3:
            continue
        tokenized_line = tokenizer(line[:-1])  # same as before from here on
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        self.data.append(index_of_words)
    print(np.shape(self.data))
    file.close()
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.data = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    file = open(self.file_path, 'r', encoding='utf-8')
    lines = file.read()
    lines = lines.split("\n")

    # Regroup the raw lines into blocks of 30 lines each.
    datasets = []
    now = ""
    for i, line in enumerate(lines):
        if i % 30 == 0 and i != 0:
            datasets.append(now)
            now = ""
        now = now + "\n" + line
    print("tokenizer ending")

    for line in datasets:
        if not line:
            break
        if len(line) < 3:
            continue
        tokenized_line = tokenizer(line[:-1])  # same as before from here on
        index_of_words = ([vocab[vocab.bos_token]]
                          + vocab[tokenized_line]
                          + [vocab[vocab.eos_token]])
        self.data.append(index_of_words)
    print(np.shape(self.data))
    file.close()
def __init__(self, file_path, vocab, tokenizer):
    self.file_path = file_path
    self.sentence_list = []
    self.vocab = vocab
    self.tokenizer = tokenizer

    df = pd.read_csv(self.file_path)
    df["genre"] = df["genre"].str.strip("[]").str.split(",")
    # df['genre'] = df['genre'].fillna('none')

    ### set up gen_to_idx and gen_to_vocab
    gen_to_vocab = {}
    genres = [
        "SF", "TV영화", "공포", "느와르", "다큐멘터리", "드라마", "멜로",
        "로맨스", "모험", "무협", "뮤지컬", "미스터리", "범죄", "서부",
        "서스펜스", "스릴러", "애니메이션", "액션", "멜로/로맨스", "가족",
        "서사", "전쟁", "코미디", "판타지",
    ]
    print(f"We have {len(genres)} genres")

    # Map each genre to an existing vocab token, starting at vocab index 6.
    gen_to_idx = {}
    for idx, gen in enumerate(genres):
        gen_to_idx[gen] = idx + 6
    idx_to_gen = {v: k for k, v in gen_to_idx.items()}
    for idx, gen in idx_to_gen.items():
        gen_to_vocab[gen] = vocab.idx_to_token[idx]

    count = 0
    err = 0
    for idx in range(len(df)):
        line = df.loc[idx, "content"]
        genres = df.loc[idx, "genre"]
        tokenized_line = tokenizer(str(line))
        if genres == "'none'":
            # No genre information: plain BOS + synopsis + EOS.
            print(genres)
            index_of_words = ([vocab[vocab.bos_token]]
                              + vocab[tokenized_line]
                              + [vocab[vocab.eos_token]])
        else:
            # Prepend the genre tokens before the synopsis.
            tmp = []
            for gen in genres:
                try:
                    tmp.append(gen_to_vocab[gen.strip("' '")])
                except KeyError:
                    pass  # genre not in the list above
            if len(tmp) > 0:
                count += 1
            else:
                err += 1
            index_of_words = ([vocab[vocab.bos_token]]
                              + vocab[tmp]
                              + vocab[tokenized_line]
                              + [vocab[vocab.eos_token]])
        self.sentence_list.append(index_of_words)

    print(f"average length of data : {sum(df['content'].str.len()) / len(df)}")
    print("total data :", len(self.sentence_list))
    print("=== test genres ===")
    print(f"we got {count} synopses that have genres.")
    print(f"we lost {err} synopses because their genres are not in the list.")
    print(f"match full == count + err: {len(self.sentence_list) == count + err}")
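# For clarity, each genre-conditioned sample built above has the layout
# [BOS] [genre tokens] [synopsis tokens] [EOS]. The toy ids below are made up
# for illustration only; the real ids come from the KoGPT2 vocab lookups above.
bos_id, eos_id = 0, 1            # hypothetical BOS/EOS ids
genre_ids = [6, 7]               # e.g. the vocab indices mapped to two genres
synopsis_ids = [1523, 88, 407]   # hypothetical tokenizer output for a synopsis

index_of_words = [bos_id] + genre_ids + synopsis_ids + [eos_id]
print(index_of_words)  # [0, 6, 7, 1523, 88, 407, 1]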