def prepare(self, dataset_root, extracted_path):
    label_file_path = self._get_annotation_file(dataset_root)
    self.move_extracteds(dataset_root, extracted_path, self.label_desc_file)

    self.logger.info("Reading the annotation file")
    annotations = {}
    annotation_count = self.get_line_count(label_file_path)
    with open(label_file_path, "r", encoding="utf-8") as f:
        # Each annotation line starts with "<category> <document id> ...".
        for line in xtqdm(f, total=annotation_count):
            a = line.strip().split(" ")
            cat = a[0]
            document_id = a[1]
            if document_id in annotations:
                annotations[document_id] += [cat]
            else:
                annotations[document_id] = [cat]

    descs = ReutersNewsResource.read_descriptions(dataset_root, self.kind)

    self.logger.info("Making the annotated files")
    pathes = []
    for t in ["train", "test"]:
        file_path = os.path.join(dataset_root,
                                 "{}_{}.txt".format(self.kind, t))
        self.logger.info("Annotating the {} file".format(t))
        data_path = os.path.join(extracted_path,
                                 "lyrl2004_tokens_{}.csv".format(t))
        total_count = self.get_line_count(data_path)
        f = open(file_path, "w", encoding="utf-8")
        with open(data_path, "r", encoding="utf-8") as df:
            for line in xtqdm(df, total=total_count):
                doc_id, words = line.strip().split(",")
                if doc_id in annotations:
                    ann = " ".join(annotations[doc_id])
                    if self.kind == "regions":
                        f.write("\t".join([ann, words]) + "\n")
                    else:
                        # Also write the deduplicated parent categories,
                        # excluding the root placeholders.
                        ps = [descs[d].parent for d in annotations[doc_id]]
                        ps = [p for p in ps if p not in ["Root", "None"]]
                        ps = list(set(ps))
                        ps = " ".join(ps)
                        f.write("\t".join([ann, ps, words]) + "\n")
        f.close()
        pathes.append(file_path)
        self.trush(data_path)

    self.trush(label_file_path)
    return pathes[0]
def prepare(self, dataset_root, _):
    original_file_path = os.path.join(dataset_root, self.original_file)
    write_file_path = os.path.splitext(original_file_path)[0] + ".txt"
    write_file = open(write_file_path, mode="w", encoding="utf-8", newline="")
    writer = csv.writer(write_file, delimiter="\t")

    self.logger.info("Preprocessing {}".format(original_file_path))
    with open(original_file_path, encoding="utf-8") as rf:
        data = json.load(rf)["data"]

    # Pick the row builder that matches the dataset version
    # (e.g. a version string "1.1" resolves to make_row_1_1).
    make_row = getattr(
        self, "make_row_{}".format(self.version.replace(".", "_")))

    for article in xtqdm(data):
        for paragraph in article["paragraphs"]:
            context = paragraph["context"].replace("\n", " ")
            for qa in paragraph["qas"]:
                question = qa["question"].strip().replace("\n", "")
                row = make_row(context, question, qa)
                writer.writerow(row)

    self.trush(original_file_path)
    write_file.close()
    return write_file_path
def _get_annotation_file(self, dataset_root):
    label_file_path = os.path.join(dataset_root, self.label_file)
    if os.path.exists(label_file_path):
        return label_file_path

    self.logger.info("Downloading the annotation file")
    dl_file_path = label_file_path + ".gz"
    # Stream the response so the progress bar reflects the actual download.
    r = requests.get(self.label_url, stream=True)
    total_size = int(r.headers.get("content-length", 0))
    with open(dl_file_path, "wb") as f:
        chunk_size = 1024
        limit = total_size / chunk_size
        for data in xtqdm(r.iter_content(chunk_size=chunk_size),
                          total=limit, unit="B", unit_scale=True):
            f.write(data)

    # Decompress the downloaded .gz into the plain annotation file.
    with gzip.open(dl_file_path, "rb") as g:
        with open(label_file_path, "wb") as f:
            for ln in g:
                f.write(ln)
    self.trush(dl_file_path)

    return label_file_path
def label_by_dir(self, file_path, target_dir, dir_and_label, task_size=10):
    label_dirs = dir_and_label.keys()
    dirs = [
        d for d in os.listdir(target_dir)
        if os.path.isdir(os.path.join(target_dir, d)) and d in label_dirs
    ]

    write_flg = True
    for d in dirs:
        self.logger.info("Extracting {} (labeled by {}).".format(
            d, dir_and_label[d]))
        label = dir_and_label[d]
        dir_path = os.path.join(target_dir, d)
        pathes = [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
        pathes = [p for p in pathes if os.path.isfile(p)]
        task_length = int(math.ceil(len(pathes) / task_size))
        for i in xtqdm(range(task_length)):
            index = i * task_size
            tasks = pathes[index:(index + task_size)]
            lines = Parallel(n_jobs=-1)(
                delayed(self._make_pair)(label, t) for t in tasks)
            mode = "w" if write_flg else "a"
            with open(file_path, mode=mode, encoding="utf-8") as f:
                for ln in lines:
                    f.write(ln)
            write_flg = False
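# A hedged usage sketch for label_by_dir. The directory names and label values
# below are illustrative assumptions, not values taken from this code:
#
#     self.label_by_dir(file_path="train.txt",
#                       target_dir="extracted/train",
#                       dir_and_label={"neg": 0, "pos": 1},
#                       task_size=10)
#
# Files under each matching directory are turned into labeled lines by
# self._make_pair in parallel batches of task_size; the first batch opens
# file_path with mode "w" and later batches append with "a".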
def save_dataset(self, dataset_root):
    save_file_path = os.path.join(dataset_root, self._get_file_name(None))
    if os.path.exists(save_file_path):
        self.logger.info("The dataset file already exists.")
        return save_file_path

    url = self.test_download_url if self.test_mode else self.download_url

    # download and save it as raw file
    self.logger.info("Begin downloading the {} dataset from {}.".format(
        self.name, url))
    resp = requests.get(url, stream=True)  # use the selected url (test or real)
    if not resp.ok:
        raise Exception("Cannot get dataset from {}.".format(url))

    # save content in response to file
    total_size = int(resp.headers.get("content-length", 0))
    file_name = self._get_file_name(resp)
    _, ext = os.path.splitext(file_name)
    save_file_path = os.path.abspath(os.path.join(dataset_root, file_name))
    self.logger.info("The dataset is saved to {}".format(save_file_path))
    with open(save_file_path, "wb") as f:
        chunk_size = 1024
        limit = total_size / chunk_size
        for data in xtqdm(resp.iter_content(chunk_size=chunk_size),
                          total=limit, unit="B", unit_scale=True):
            f.write(data)

    return save_file_path
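# A hedged usage sketch for save_dataset (the directory name is an assumption):
#
#     raw_path = self.save_dataset("data/")
#
# If the target file already exists it is returned immediately; otherwise the
# dataset is streamed from self.download_url (or self.test_download_url in
# test mode) with a progress bar, and the absolute path of the saved file is
# returned.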
def _prepare_polarity_v1(self, dataset_root, extracted_path):
    polarity_file = os.path.join(dataset_root, "review_polarity_v1.txt")
    with open(polarity_file, mode="w", encoding="utf-8") as f:
        for e in self.extract_targets:
            p = os.path.join(extracted_path, os.path.basename(e))
            label = 0 if e.endswith(".neg") else 1
            label_name = "negative" if label == 0 else "positive"
            self.logger.info("Extracting {} data.".format(label_name))
            total = self.get_line_count(p)
            with open(p, mode="r", errors="replace", encoding="utf-8") as rf:
                for ln in xtqdm(rf, total=total):
                    review = ln.strip().replace("\t", " ")
                    f.write("\t".join([str(label), review]) + "\n")

    return polarity_file
def preprocess_file(self, dataset_root, path):
    write_file_name = os.path.basename(path).replace(".jsonl", ".txt")
    write_file_path = os.path.join(dataset_root, write_file_name)
    write_file = open(write_file_path, mode="w", encoding="utf-8")

    file_kind = path.split("_")[-1]
    self.logger.info("Preprocessing {} file".format(file_kind))
    total_count = self.get_line_count(path)
    with open(path, encoding="utf-8") as rf:
        for line in xtqdm(rf, total=total_count):
            preprocessed = self.preprocess_jsonl(line)
            if preprocessed:
                w_line = "\t".join(preprocessed) + "\n"
                write_file.write(w_line)

    write_file.close()
    return write_file_path
def _prepare_subjectivity(self, dataset_root, extracted_path):
    subjectivity_file = os.path.join(dataset_root, "subjectivity.txt")
    with open(subjectivity_file, mode="w", encoding="utf-8") as f:
        for e in self.extract_targets:
            # subjective(plot) = 1
            label = 1 if e.startswith("plot.") else 0
            label_name = "subjective" if label == 1 else "objective"
            self.logger.info("Extracting {} data.".format(label_name))
            p = os.path.join(extracted_path, os.path.basename(e))
            total = self.get_line_count(p)
            with open(p, mode="r", errors="replace", encoding="utf-8") as sb:
                for ln in xtqdm(sb, total=total):
                    review = ln.strip().replace("\t", " ")
                    f.write("\t".join([str(label), review]) + "\n")

    return subjectivity_file
def make(self, path_or_paths, vocab_size=-1, min_word_count=0,
         target_column_indexes=(), separator="\t", reserved_words=()):
    vocab = Counter()
    paths = path_or_paths
    if isinstance(paths, str):
        paths = [paths]

    for p in paths:
        self.logger.info("Read {} to make vocabulary.".format(p))
        count = self.get_line_count(p)
        for words in xtqdm(self.fetch_line(p, target_column_indexes,
                                           separator), total=count):
            for w in words:
                vocab[w] += 1

    _vocab = [
        k_v[0] for k_v in vocab.most_common()
        if k_v[1] >= min_word_count
    ]
    if self.unknown and self.unknown not in _vocab:
        _vocab.insert(0, self.unknown)
    if self.end_of_sentence and self.end_of_sentence not in _vocab:
        _vocab.insert(0, self.end_of_sentence)
    if len(reserved_words) > 0:
        for w in reserved_words:
            _vocab.insert(0, w)

    if vocab_size > 0:
        _vocab = _vocab[:vocab_size]

    self.logger.info(
        "The vocabulary count is {}. You can see it in {}.".format(
            len(_vocab), self._vocab_file_path))
    with open(self._vocab_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(_vocab))

    self._vocab = dict(zip(_vocab, range(len(_vocab))))
    self.__rev_vocab = {}
def _prepare_polarity(self, dataset_root, extracted_path):
    polarity_file_path = os.path.join(dataset_root, "review_polarity.txt")
    negative_path = os.path.join(extracted_path, "txt_sentoken/neg")
    positive_path = os.path.join(extracted_path, "txt_sentoken/pos")

    with open(polarity_file_path, mode="w", encoding="utf-8") as f:
        for i, p in enumerate([negative_path, positive_path]):
            label = i  # negative = 0, positive = 1
            label_name = "negative" if label == 0 else "positive"
            self.logger.info("Extracting {} data.".format(label_name))
            for txt in xtqdm(os.listdir(p)):
                with open(os.path.join(p, txt), encoding="utf-8") as tf:
                    lines = [
                        ln.strip().replace("\t", " ") for ln in tf.readlines()
                    ]
                    review = " ".join(lines)
                    f.write("\t".join([str(label), review]) + "\n")

    return polarity_file_path
def train_test_split(self, original_file_path, test_size):
    if test_size < 0 or test_size > 1:
        self.logger.error(
            "test_size has to be between 0 and 1. "
            "If you don't want to split, set it to 0.")
        return []
    elif test_size == 0 or test_size == 1:
        return []

    self.logger.info("Splitting into train & test files.")
    total_count = self.get_line_count(original_file_path)
    test_count = int(round(total_count * test_size))
    # Use a set for constant-time membership checks while streaming the file.
    test_targets = set(random.sample(range(total_count), test_count))

    base, ext = os.path.splitext(original_file_path)
    train_test_path = [base + x + ext for x in ["_train", "_test"]]
    train_file = open(train_test_path[0], "wb")
    test_file = open(train_test_path[1], "wb")
    with open(original_file_path, "rb") as f:
        i = 0
        for line in xtqdm(f, total=total_count):
            target = test_file if i in test_targets else train_file
            target.write(line)
            i += 1
    train_file.close()
    test_file.close()

    self.logger.info(
        "Train & test files are {} ({} rows) & {} ({} rows, {:.2f}%).".format(
            os.path.basename(train_test_path[0]), total_count - test_count,
            os.path.basename(train_test_path[1]), test_count,
            test_count / total_count * 100))

    self.trush(original_file_path)
    return train_test_path
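# A hedged usage sketch for train_test_split (the path and ratio are
# illustrative assumptions):
#
#     paths = self.train_test_split("data/reviews.txt", test_size=0.2)
#     # -> ["data/reviews_train.txt", "data/reviews_test.txt"]
#
# Roughly 20% of the lines are sampled into the test file, the rest go to the
# train file, and the original file is removed via self.trush afterwards.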
def prepare(self, dataset_root, extracted_path):
    newsgroup20_path = os.path.join(dataset_root, "newsgroup20.txt")
    dataset_path = os.path.join(extracted_path, "20news-18828")

    with open(newsgroup20_path, mode="wb") as f:
        for gp in os.listdir(dataset_path):
            group_path = os.path.join(dataset_path, gp)
            if not os.path.isdir(group_path):
                continue
            if len(self.group_filter) > 0 and gp not in self.group_filter:
                continue

            self.logger.info("Extracting {} news data.".format(gp))
            for news in xtqdm(os.listdir(group_path)):
                group_name = gp
                category_name = self.get_category(gp)
                news_path = os.path.join(group_path, news)
                subject, author, text = self.parse(path=news_path)
                ln = "\t".join([
                    group_name, category_name, subject, author, text
                ]) + "\n"
                f.write(ln.encode("utf-8"))

    return newsgroup20_path
def make(self, path_or_paths, vocab_size=-1, min_word_freq=0,
         separator="\t", reserved_words=(), target_column_indexes=()):
    vocab = Counter()
    paths = path_or_paths
    if isinstance(paths, str):
        paths = [paths]

    self.max_len = 0
    for p in paths:
        self.logger.info("Read {} to make vocabulary.".format(p))
        count = self.get_line_count(p)
        for words in xtqdm(self.fetch_line(p, target_column_indexes,
                                           separator), total=count):
            for w in words:
                vocab[w] += 1
            if len(words) > self.max_len:
                self.max_len = len(words)

    _vocab = [k_v[0] for k_v in vocab.most_common()
              if k_v[1] >= min_word_freq]

    _rv = reserved_words
    if len(_rv) == 0:
        _rv = [w for w in
               [self.padding, self.unknown, self.end_of_sentence] if w]
    _vocab = list(_rv) + _vocab

    if vocab_size > 0:
        _vocab = _vocab[:vocab_size]

    self.logger.info(
        "The vocabulary count is {}. You can see it in {}.".format(
            len(_vocab), self._vocab_file_path))
    with open(self._vocab_file_path, "w", encoding="utf-8") as f:
        f.write("\n".join(_vocab))

    self._vocab = dict(zip(_vocab, range(len(_vocab))))
    self.__rev_vocab = {}
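# A hedged usage sketch for make (the file path and parameter values are
# illustrative assumptions):
#
#     vocab.make("data/reviews_train.txt",
#                vocab_size=30000,
#                min_word_freq=3,
#                target_column_indexes=[1],
#                separator="\t")
#
# Reserved words (or the padding/unknown/end-of-sentence tokens when no
# reserved words are given) are placed at the head of the vocabulary, followed
# by the remaining words in descending frequency order, truncated to vocab_size.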
def _prepare_rating(self, dataset_root, extracted_path):
    rating_file_path = os.path.join(dataset_root, "review_rating.txt")
    rating_dir = os.path.join(extracted_path, "scaledata")
    rating_file = open(rating_file_path, "w", encoding="utf-8")

    for user in os.listdir(rating_dir):
        user_dir = os.path.join(rating_dir, user)
        if not os.path.isdir(user_dir):
            continue

        sub_in_review_file = os.path.join(user_dir, "subj." + user)
        user_rating_file = os.path.join(user_dir, "rating." + user)
        total = self.get_line_count(sub_in_review_file)
        self.logger.info("Extracting user {}'s rating data.".format(user))
        with open(sub_in_review_file, "r", encoding="utf-8") as sr:
            with open(user_rating_file, "r", encoding="utf-8") as ur:
                for review, rating in xtqdm(zip(sr, ur), total=total):
                    _rv = review.strip().replace("\t", " ")
                    _r = rating.strip()
                    rating_file.write("\t".join([_r, _rv]) + "\n")

    rating_file.close()
    return rating_file_path