Example #1
    def __init__(self, file_path, vocab, tokenizer):
        # requires pandas as pd and numpy as np imported at module level
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer

        # read the playlist column from the CSV file
        df = pd.read_csv(self.file_path)  # pass delimiter='\t' for TSV input
        datasets = []
        for _, row in df.iterrows():
            datasets.append([row["playlist"]])  # the genre and weight columns are not used

        print("tokenizer ending")
        for line in datasets:
            if not line[0]:
                break
            if len(line[0]) < 3:
                continue
            tokenized_line = tokenizer(line[0][:-1])

            # wrap the token indices with BOS/EOS and keep only sequences of 7-25 tokens
            index_of_words = [vocab[vocab.bos_token]] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
            if len(index_of_words) > 25:
                continue
            elif len(index_of_words) < 7:
                continue

            self.data.append(index_of_words)

        print(np.shape(self.data))
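Every example builds its index sequences the same way: the vocab object maps a single token to an index and a list of tokens to a list of indices, which is why [vocab[vocab.bos_token]] + vocab[tokenized_line] + [vocab[vocab.eos_token]] concatenates into one flat list. Below is a minimal sketch of that lookup convention with a toy stand-in; the real classes receive the vocab that ships with their tokenizer, and ToyVocab and its contents are assumptions for illustration only.

# Toy vocab illustrating the lookup convention the examples rely on (an assumption,
# not the real vocab class): vocab[token] -> int, vocab[list_of_tokens] -> list of ints.
class ToyVocab:
    bos_token = "<s>"
    eos_token = "</s>"

    def __init__(self, tokens):
        self.token_to_idx = {tok: i for i, tok in enumerate(tokens)}

    def __getitem__(self, tokens):
        if isinstance(tokens, (list, tuple)):
            return [self.token_to_idx[t] for t in tokens]
        return self.token_to_idx[tokens]

vocab = ToyVocab(["<s>", "</s>", "나는", "노래", "를"])
tokenized_line = ["나는", "노래", "를"]
index_of_words = [vocab[vocab.bos_token]] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
print(index_of_words)  # [0, 2, 3, 4, 1]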
Example #2
    def __init__(self, file_path, vocab, tokenizer):
        # requires pandas as pd and numpy as np imported at module level
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer

        # read the lyrics column from the tab-separated file
        df = pd.read_csv(self.file_path, delimiter='\t')
        print("data read: --------OK-------")
        datasets = []
        for _, row in df.iterrows():
            datasets.append([row["lyrics"]])  # the genre and score columns are not used

        print("tokenizer ending")

        print(np.shape(datasets))

        for line in datasets:
            if not line[0]:
                break
            if len(line[0]) < 3:
                continue
            tokenized_line = tokenizer(line[0][:-1])
            # wrap the token indices with BOS/EOS and drop sequences longer than 1024 tokens
            index_of_words = [vocab[vocab.bos_token]] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
            if len(index_of_words) > 1024:
                continue

            self.data.append(index_of_words)

        print(np.shape(self.data))
Example #3
    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer
        file = open(self.file_path, 'r', encoding='utf-8')

        lines = file.read()
        lines = lines.split("<|endoftext|>")
        lines = [tokenizer(line) for line in lines]
        datasets = []

        for line in lines:
            # lines are already tokenized above; split each one into chunks of at most 1020 tokens
            while True:
                if len(line) > 1020:
                    datasets.append(line[:1020])
                    line = line[1020:]
                else:
                    datasets.append(line)
                    break

        #now = ""
        #for i, line in enumerate(lines):
        # 	if i % 20 == 0 and i != 0:
        # 		datasets.append(now)
        # 		now = ""
        # 	now = now + "\n" + line

        for line in datasets:
            if not line:
                break
            if len(line) < 3:
                continue

            tokenized_line = line[:-1]

            ### unchanged from here on
            index_of_words = [
                vocab[vocab.bos_token],
            ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
            self.data.append(index_of_words)

        print(np.shape(self.data))

        file.close()
Example #4
    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.sentence_list = []
        self.vocab = vocab
        self.tokenizer = tokenizer

        df = pd.read_csv(self.file_path)

        for line in df['script']:
            tokenized_line = tokenizer(str(line))
            index_of_words = [
                vocab[vocab.bos_token],
            ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
            self.sentence_list.append(index_of_words)
        print("sentence list length :", len(self.sentence_list))
Example #5
    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer
        file = open(self.file_path, 'r', encoding='utf-8')

        while True:
            line = file.readline()
            if not line:
                break
            tokenized_line = tokenizer(line[:-1])  # drop the trailing newline before tokenizing
            index_of_words = [
                vocab[vocab.bos_token],
            ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]

            self.data.append(index_of_words)

        file.close()
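All of these __init__ variants leave a list of BOS/EOS-wrapped index sequences in self.data (or self.sentence_list), so the rest of each Dataset class is usually just __len__ and __getitem__. The following is a minimal sketch of that companion code, assuming a PyTorch-style Dataset; the class name TextDataset and the batch_size=1 loader call are illustrative assumptions, not taken from the source.

import numpy as np
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, file_path, vocab, tokenizer):
        # In the real classes this body is one of the __init__ variants above;
        # all that matters here is that it fills self.data with lists of indices.
        self.data = []

    def __len__(self):
        # number of BOS ... EOS index sequences collected in __init__
        return len(self.data)

    def __getitem__(self, index):
        # one sequence of vocab indices as a numpy array
        return np.array(self.data[index])

# Usage sketch: batch_size=1 sidesteps padding, since the stored sequences
# have different lengths.
# dataset = TextDataset("train.txt", vocab, tokenizer)
# loader = DataLoader(dataset, batch_size=1, shuffle=True)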
Example #6
    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer
        file = open(self.file_path, 'r', encoding='utf-8')

        lines = file.read()
        lines = lines.split("<|endoftext|>")
        # keep each document as a list of its lines so the loop below can group them
        lines = [line.split("\n") for line in lines]

        datasets = []

        print("tokenizer start")
        for line in lines:
            now = ""
            for i, l in enumerate(line):
                if i % 20 == 0 and i != 0:
                    datasets.append(now)
                    now = ""
                now = now + "\n" + l
                if i == len(line) - 1:
                    datasets.append(now)

        print("tokenizer ending")
        for line in datasets:
            if not line:
                break
            if len(line) < 3:
                continue

            tokenized_line = tokenizer(line[:-1])

            ### unchanged from here on
            index_of_words = [
                vocab[vocab.bos_token],
            ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
            self.data.append(index_of_words)

        print(np.shape(self.data))

        file.close()
Example #7
    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer
        file = open(self.file_path, 'r', encoding='utf-8')

        lines = file.read()
        lines = lines.split("\n")

        datasets = []
        now = ""
        for i, line in enumerate(lines):
            if i % 30 == 0 and i != 0:
                datasets.append(now)
                now = ""
            now = now + "\n" + line

        # lines = lines.split("<|endoftext|>")
        # lines = [line.split("\n") for line in lines]
        # lines = [str(line) for line in lines]
        #
        # datasets = []
        #
        # print("tokenizer start")
        # for line in lines:
        # 	now = ""
        # 	for i, l in enumerate(line):
        # 		if i % 20 == 0 and i != 0:
        # 			datasets.append(now)
        # 			now = ""
        # 		now = now + "\n" + l
        # 		if i == len(line) - 1:
        # 			datasets.append(now)

        # while 1:
        # 	if len(line) > 20:
        # 		datasets.append("\n".join(line[20:]))
        # 		line = line[:20]
        # 	else:
        # 		datasets.append(line)
        # 		break

        # now = ""
        # for i, line in enumerate(lines):
        # 	if i % 20 == 0 and i != 0:
        # 		datasets.append(now)
        # 		now = ""
        # 	now = now + "\n" + line

        print("tokenizer ending")
        for line in datasets:
            if not line:
                break
            if len(line) < 3:
                continue

            tokenized_line = tokenizer(line[:-1])

            ### unchanged from here on
            index_of_words = [
                vocab[vocab.bos_token],
            ] + vocab[tokenized_line] + [vocab[vocab.eos_token]]
            self.data.append(index_of_words)

        print(np.shape(self.data))

        file.close()
Example #8
    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.sentence_list = []
        self.vocab = vocab
        self.tokenizer = tokenizer

        df = pd.read_csv(self.file_path)

        df["genre"] = df["genre"].str.strip("[]").str.split(",")
        # df['genre'] = df['genre'].fillna('none')

        ### set up gen_to_idx and gen_to_vocab
        gen_to_vocab = {}
        genres = [
            "SF",
            "TV영화",
            "공포",
            "느와르",
            "다큐멘터리",
            "드라마",
            "멜로",
            "로맨스",
            "모험",
            "무협",
            "뮤지컬",
            "미스터리",
            "범죄",
            "서부",
            "서스펜스",
            "스릴러",
            "애니메이션",
            "액션",
            "멜로/로맨스",
            "가족",
            "서사",
            "전쟁",
            "코미디",
            "판타지",
        ]
        print(f"We have {len(genres)} genres")
        gen_to_idx = {}
        for idx, gen in enumerate(genres):
            gen_to_idx[gen] = idx + 6
        idx_to_gen = {v: k for k, v in gen_to_idx.items()}

        for idx, gen in idx_to_gen.items():
            gen_to_vocab[gen] = vocab.idx_to_token[idx]

        count = 0
        err = 0
        for idx in range(len(df)):
            line = df.loc[idx, "content"]
            genres = df.loc[idx, "genre"]
            tokenized_line = tokenizer(str(line))
            if genres == "'none'":
                print(genres)
                index_of_words = ([
                    vocab[vocab.bos_token],
                ] + vocab[tokenized_line] + [vocab[vocab.eos_token]])
            else:
                tmp = []

                for gen in genres:
                    try:
                        tmp.append(gen_to_vocab[gen.strip("' '")])
                    except Exception as e:
                        pass
                if len(tmp) > 0:
                    count += 1
                else:
                    err += 1
                index_of_words = ([
                    vocab[vocab.bos_token],
                ] + vocab[tmp] + vocab[tokenized_line] +
                                  [vocab[vocab.eos_token]])
            self.sentence_list.append(index_of_words)

        print(
            f"average length of data : {sum(df['content'].str.len()) / len(df)}"
        )

        print("total data :", len(self.sentence_list))

        print("=== test genres ===")
        print(f"we got {count} synos which have genres.")
        print(f"we lose {err} synos because their genres are not included.")
        print(
            f"match full == count + err {len(self.sentence_list) == count+err}"
        )
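Example #8 conditions each synopsis on its genres by borrowing existing vocab tokens: genre names are mapped to vocab indices starting at 6, the tokens at those indices are looked up, and those tokens are prepended between the BOS token and the tokenized synopsis. Here is the mapping in isolation with a toy stand-in for the vocab; ToyVocab and the shortened genre list are assumptions for illustration, while the real code indexes vocab.idx_to_token on the vocab passed into __init__.

# Minimal sketch of the genre-to-token mapping used in Example #8.
class ToyVocab:
    def __init__(self, tokens):
        self.idx_to_token = tokens

genres = ["SF", "공포", "드라마"]                      # shortened genre list
vocab = ToyVocab([f"<tok{i}>" for i in range(30)])     # pretend vocab tokens

# genre name -> index, offset by 6 so the first vocab entries are skipped
gen_to_idx = {gen: idx + 6 for idx, gen in enumerate(genres)}
idx_to_gen = {v: k for k, v in gen_to_idx.items()}

# genre name -> the vocab token sitting at that index
gen_to_vocab = {gen: vocab.idx_to_token[idx] for idx, gen in idx_to_gen.items()}

print(gen_to_vocab)  # {'SF': '<tok6>', '공포': '<tok7>', '드라마': '<tok8>'}
# In Example #8 these tokens are converted back to indices via vocab[tmp]
# and spliced in right after the BOS token.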