Example #1
    def prepare(self, dataset_root, extracted_path):
        label_file_path = self._get_annotation_file(dataset_root)
        self.move_extracteds(dataset_root, extracted_path,
                             self.label_desc_file)
        self.logger.info("Reading the annotation file")
        annotations = {}
        annotation_count = self.get_line_count(label_file_path)
        with open(label_file_path, "r", encoding="utf-8") as f:
            for line in xtqdm(f, total=annotation_count):
                a = line.strip().split(" ")
                cat = a[0]
                document_id = a[1]
                if document_id in annotations:
                    annotations[document_id] += [cat]
                else:
                    annotations[document_id] = [cat]
        descs = ReutersNewsResource.read_descriptions(dataset_root, self.kind)

        self.logger.info("Make annotated file")
        pathes = []
        for t in ["train", "test"]:
            file_path = os.path.join(dataset_root,
                                     "{}_{}.txt".format(self.kind, t))
            self.logger.info("Annotating the {} file".format(t))
            data_path = os.path.join(extracted_path,
                                     "lyrl2004_tokens_{}.csv".format(t))
            total_count = self.get_line_count(data_path)

            f = open(file_path, "w", encoding="utf-8")
            with open(data_path, "r", encoding="utf-8") as df:
                for line in xtqdm(df, total=total_count):
                    doc_id, words = line.strip().split(",")
                    if doc_id in annotations:
                        ann = " ".join(annotations[doc_id])
                        if self.kind == "regions":
                            f.write("\t".join([ann, words]) + "\n")
                        else:
                            ps = [descs[d].parent for d in annotations[doc_id]]
                            ps = [p for p in ps if p not in ["Root", "None"]]
                            ps = list(set(ps))
                            ps = " ".join(ps)
                            f.write("\t".join([ann, ps, words]) + "\n")
            f.close()
            pathes.append(file_path)
            self.trush(data_path)
        self.trush(label_file_path)

        return pathes[0]
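
The annotation loop in Example #1 groups RCV1 category codes by document id with a manual if/else on a plain dict. Below is a minimal, self-contained sketch of the same grouping step using collections.defaultdict; the sample rows are invented for illustration, but they follow the "category doc_id" layout the parsing code above assumes.

from collections import defaultdict

# Invented sample rows in the "category doc_id ..." layout parsed above.
sample_lines = ["CCAT 2286 1", "GCAT 2286 1", "MCAT 2287 1"]

annotations = defaultdict(list)
for line in sample_lines:
    cat, document_id = line.strip().split(" ")[:2]
    annotations[document_id].append(cat)

print(dict(annotations))  # {'2286': ['CCAT', 'GCAT'], '2287': ['MCAT']}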
Example #2
    def prepare(self, dataset_root, _):
        original_file_path = os.path.join(dataset_root, self.original_file)
        write_file_path = os.path.splitext(original_file_path)[0] + ".txt"
        write_file = open(write_file_path,
                          mode="w",
                          encoding="utf-8",
                          newline="")
        writer = csv.writer(write_file, delimiter="\t")

        self.logger.info("Preprocessing {}".format(original_file_path))
        with open(original_file_path, encoding="utf-8") as rf:
            data = json.load(rf)["data"]

        make_row = getattr(
            self, "make_row_{}".format(self.version.replace(".", "_")))

        for article in xtqdm(data):
            for paragraph in article["paragraphs"]:
                context = paragraph["context"].replace("\n", " ")
                for qa in paragraph["qas"]:
                    question = qa["question"].strip().replace("\n", "")
                    row = make_row(context, question, qa)
                    writer.writerow(row)

        self.trush(original_file_path)
        write_file.close()

        return write_file_path
Example #3
    def _get_annotation_file(self, dataset_root):
        label_file_path = os.path.join(dataset_root, self.label_file)
        if os.path.exists(label_file_path):
            return label_file_path

        self.logger.info("Downloading the annotation file")
        dl_file_path = label_file_path + ".gz"

        r = requests.get(self.label_url)
        total_size = int(r.headers.get("content-length", 0))
        with open(dl_file_path, "wb") as f:
            chunk_size = 1024
            limit = total_size / chunk_size
            for data in xtqdm(r.iter_content(chunk_size=chunk_size),
                              total=limit,
                              unit="B",
                              unit_scale=True):
                f.write(data)

        with gzip.open(dl_file_path, "rb") as g:
            with open(label_file_path, "wb") as f:
                for ln in g:
                    f.write(ln)

        self.trush(dl_file_path)
        return label_file_path
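
Both _get_annotation_file above and save_dataset in Example #5 stream the HTTP response in fixed-size chunks and hand the chunk iterator to a progress bar whose total is the content length divided by the chunk size. A minimal sketch of that download pattern with plain requests and tqdm follows; the URL and output name are placeholders, and xtqdm is assumed to be a thin wrapper around tqdm.

import requests
from tqdm import tqdm

url = "https://example.com/data.gz"  # placeholder URL
out_path = "data.gz"                 # placeholder output file
chunk_size = 1024

resp = requests.get(url, stream=True)
total_size = int(resp.headers.get("content-length", 0))
with open(out_path, "wb") as f:
    # total is the expected number of chunks; unit_scale renders readable sizes.
    for chunk in tqdm(resp.iter_content(chunk_size=chunk_size),
                      total=total_size / chunk_size,
                      unit="B", unit_scale=True):
        f.write(chunk)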
Example #4
    def label_by_dir(self, file_path, target_dir, dir_and_label, task_size=10):
        label_dirs = dir_and_label.keys()
        dirs = [
            d for d in os.listdir(target_dir)
            if os.path.isdir(os.path.join(target_dir, d)) and d in label_dirs
        ]

        write_flg = True
        for d in dirs:
            self.logger.info("Extracting {} (labeled by {}).".format(
                d, dir_and_label[d]))
            label = dir_and_label[d]
            dir_path = os.path.join(target_dir, d)
            pathes = [os.path.join(dir_path, f) for f in os.listdir(dir_path)]
            pathes = [p for p in pathes if os.path.isfile(p)]
            task_length = int(math.ceil(len(pathes) / task_size))
            for i in xtqdm(range(task_length)):
                index = i * task_size
                tasks = pathes[index:(index + task_size)]
                lines = Parallel(n_jobs=-1)(delayed(self._make_pair)(label, t)
                                            for t in tasks)
                mode = "w" if write_flg else "a"
                with open(file_path, mode=mode, encoding="utf-8") as f:
                    for ln in lines:
                        f.write(ln)
                write_flg = False
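
label_by_dir above batches the files in each labeled directory into groups of task_size and builds the output lines for a batch in parallel with joblib. A minimal sketch of the Parallel/delayed pattern it relies on is shown below; make_pair is a stand-in for the class's _make_pair helper and the task names are invented.

from joblib import Parallel, delayed

def make_pair(label, path):
    # Stand-in worker: pair a label with something derived from the path.
    return "{}\t{}\n".format(label, path)

tasks = ["a.txt", "b.txt", "c.txt"]  # invented file names
# n_jobs=-1 uses all available cores; results keep the order of the inputs.
lines = Parallel(n_jobs=-1)(delayed(make_pair)(1, t) for t in tasks)
print(lines)  # ['1\ta.txt\n', '1\tb.txt\n', '1\tc.txt\n']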
Example #5
    def save_dataset(self, dataset_root):
        save_file_path = os.path.join(dataset_root, self._get_file_name(None))
        if os.path.exists(save_file_path):
            self.logger.info("The dataset file already exists.")
            return save_file_path

        url = self.test_download_url if self.test_mode else self.download_url
        # download and save it as raw file
        self.logger.info("Begin downloading the {} dataset from {}.".format(
            self.name, url))
        resp = requests.get(url, stream=True)
        if not resp.ok:
            raise Exception("Can not get dataset from {}.".format(url))

        # save content in response to file
        total_size = int(resp.headers.get("content-length", 0))
        file_name = self._get_file_name(resp)
        _, ext = os.path.splitext(file_name)
        save_file_path = os.path.abspath(os.path.join(dataset_root, file_name))
        self.logger.info("The dataset is saved to {}".format(save_file_path))
        with open(save_file_path, "wb") as f:
            chunk_size = 1024
            limit = total_size / chunk_size
            for data in xtqdm(resp.iter_content(chunk_size=chunk_size),
                              total=limit,
                              unit="B",
                              unit_scale=True):
                f.write(data)

        return save_file_path
Example #6
File: movie_review.py Project: yk/chazutsu
    def _prepare_polarity_v1(self, dataset_root, extracted_path):
        polarity_file = os.path.join(dataset_root, "review_polarity_v1.txt")
        with open(polarity_file, mode="w", encoding="utf-8") as f:
            for e in self.extract_targets:
                p = os.path.join(extracted_path, os.path.basename(e))
                label = 0 if e.endswith(".neg") else 1
                label_name = "negative" if label == 0 else "positive"
                self.logger.info("Extracting {} data.".format(label_name))
                total = self.get_line_count(p)
                with open(p, mode="r", errors="replace",
                          encoding="utf-8") as p:
                    for ln in xtqdm(p, total=total):
                        review = ln.strip().replace("\t", " ")
                        f.write("\t".join([str(label), review]) + "\n")

        return polarity_file
Example #7
    def preprocess_file(self, dataset_root, path):
        write_file_name = os.path.basename(path).replace(".jsonl", ".txt")
        write_file_path = os.path.join(dataset_root, write_file_name)
        write_file = open(write_file_path, mode="w", encoding="utf-8")
        file_kind = path.split("_")[-1]

        self.logger.info("Preprocessing {} file".format(file_kind))
        total_count = self.get_line_count(path)
        with open(path, encoding="utf-8") as rf:
            for line in xtqdm(rf, total=total_count):
                preprocessed = self.preprocess_jsonl(line)
                if preprocessed:
                    w_line = "\t".join(preprocessed) + "\n"
                    write_file.write(w_line)

        write_file.close()
        return write_file_path
Example #8
File: movie_review.py Project: yk/chazutsu
    def _prepare_subjectivity(self, dataset_root, extracted_path):
        subjectivity_file = os.path.join(dataset_root, "subjectivity.txt")
        with open(subjectivity_file, mode="w", encoding="utf-8") as f:
            for e in self.extract_targets:
                # subjective(plot) = 1
                label = 1 if e.startswith("plot.") else 0
                label_name = "subjective" if label == 1 else "objective"
                self.logger.info("Extracting {} data.".format(label_name))
                p = os.path.join(extracted_path, os.path.basename(e))
                total = self.get_line_count(p)
                with open(p, mode="r", errors="replace",
                          encoding="utf-8") as sb:
                    for ln in xtqdm(sb, total=total):
                        review = ln.strip().replace("\t", " ")
                        f.write("\t".join([str(label), review]) + "\n")

        return subjectivity_file
Example #9
    def make(self,
             path_or_paths,
             vocab_size=-1,
             min_word_count=0,
             target_column_indexes=(),
             separator="\t",
             reserved_words=()):
        vocab = Counter()
        paths = path_or_paths
        if isinstance(paths, str):
            paths = [paths]

        for p in paths:
            self.logger.info("Read {} to make vocabulary.".format(p))
            count = self.get_line_count(p)
            for words in xtqdm(self.fetch_line(p, target_column_indexes,
                                               separator),
                               total=count):
                for w in words:
                    vocab[w] += 1

        _vocab = [
            word for word, count in vocab.most_common()
            if count >= min_word_count
        ]
        if self.unknown and self.unknown not in _vocab:
            _vocab.insert(0, self.unknown)
        if self.end_of_sentence and self.end_of_sentence not in _vocab:
            _vocab.insert(0, self.end_of_sentence)
        if len(reserved_words) > 0:
            for w in reserved_words:
                _vocab.insert(0, w)

        if vocab_size > 0:
            _vocab = _vocab[:vocab_size]

        self.logger.info(
            "The vocabulary count is {}. You can see it in {}.".format(
                len(_vocab), self._vocab_file_path))
        with open(self._vocab_file_path, "w", encoding="utf-8") as f:
            f.write("\n".join(_vocab))
        self._vocab = dict(zip(_vocab, range(len(_vocab))))
        self.__rev_vocab = {}
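
Once make has run, self._vocab maps each kept token to its row index in the written vocabulary file, so encoding text reduces to dictionary lookups with a fallback to the unknown token. A minimal sketch of that lookup outside the class is shown below; the token list and "<unk>" marker are invented for illustration.

# Tokens in frequency order with an unknown marker at index 0,
# mirroring how make() builds the token -> id mapping.
tokens = ["<unk>", "the", "movie", "was", "great"]
vocab = dict(zip(tokens, range(len(tokens))))

def encode(words, unknown="<unk>"):
    # Map each word to its id, falling back to the unknown id.
    return [vocab.get(w, vocab[unknown]) for w in words]

print(encode(["the", "movie", "was", "terrible"]))  # [1, 2, 3, 0]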
Example #10
File: movie_review.py Project: yk/chazutsu
    def _prepare_polarity(self, dataset_root, extracted_path):
        polarity_file_path = os.path.join(dataset_root, "review_polarity.txt")
        negative_path = os.path.join(extracted_path, "txt_sentoken/neg")
        positive_path = os.path.join(extracted_path, "txt_sentoken/pos")

        with open(polarity_file_path, mode="w", encoding="utf-8") as f:
            for i, p in enumerate([negative_path, positive_path]):
                label = i  # negative = 0, positive = 1
                label_name = "negative" if label == 0 else "positive"
                self.logger.info("Extracting {} data.".format(label_name))
                for txt in xtqdm(os.listdir(p)):
                    with open(os.path.join(p, txt), encoding="utf-8") as tf:
                        lines = [
                            ln.strip().replace("\t", " ")
                            for ln in tf.readlines()
                        ]
                        review = " ".join(lines)
                        f.write("\t".join([str(label), review]) + "\n")

        return polarity_file_path
Example #11
    def train_test_split(self, original_file_path, test_size):
        if test_size < 0 or test_size > 1:
            self.logger.error(
                "test_size have to be between 0 ~ 1." \
                "if you don't want to split, please set 0.")
            return []
        elif test_size == 0 or test_size == 1:
            return []

        self.logger.info("Split to train & test file.")

        total_count = self.get_line_count(original_file_path)
        test_count = int(round(total_count * test_size))
        # a set keeps the per-line membership check O(1)
        test_targets = set(random.sample(range(total_count), test_count))

        base, ext = os.path.splitext(original_file_path)
        train_test_path = [base + x + ext for x in ["_train", "_test"]]
        train_file = open(train_test_path[0], "wb")
        test_file = open(train_test_path[1], "wb")

        with open(original_file_path, "rb") as f:
            i = 0
            for line in xtqdm(f, total=total_count):
                target = test_file if i in test_targets else train_file
                target.write(line)
                i += 1

        train_file.close()
        test_file.close()

        self.logger.info(
            "Train & test files are {} ({} rows) & {} ({} rows, {:.2f}%).".format(
                os.path.basename(train_test_path[0]), total_count - test_count,
                os.path.basename(train_test_path[1]), test_count,
                test_count / total_count * 100))

        self.trush(original_file_path)

        return train_test_path
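
train_test_split above draws test_count row indices up front with random.sample and routes each line to the train or test file by index membership. A small self-contained sketch of the same sampling idea on an in-memory list follows; the rows are invented, and a set keeps the membership check O(1).

import random

rows = ["row{}".format(i) for i in range(10)]  # stand-in for file lines
test_size = 0.3
test_count = int(round(len(rows) * test_size))
test_targets = set(random.sample(range(len(rows)), test_count))

train = [r for i, r in enumerate(rows) if i not in test_targets]
test = [r for i, r in enumerate(rows) if i in test_targets]
print(len(train), len(test))  # 7 3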
Example #12
File: news_group20.py Project: yk/chazutsu
    def prepare(self, dataset_root, extracted_path):
        newsgroup20_path = os.path.join(dataset_root, "newsgroup20.txt")
        dataset_path = os.path.join(extracted_path, "20news-18828")
        with open(newsgroup20_path, mode="wb") as f:
            for gp in os.listdir(dataset_path):
                group_path = os.path.join(dataset_path, gp)
                if not os.path.isdir(group_path):
                    continue
                if len(self.group_filter) > 0 and gp not in self.group_filter:
                    continue

                self.logger.info("Extracting {} news data.".format(gp))
                for news in xtqdm(os.listdir(group_path)):
                    group_name = gp
                    category_name = self.get_category(gp)
                    news_path = os.path.join(group_path, news)
                    subject, author, text = self.parse(path=news_path)
                    ln = "\t".join([
                        group_name, category_name, subject, author, text
                    ]) + "\n"
                    f.write(ln.encode("utf-8"))

        return newsgroup20_path
Example #13
File: vocabulary.py Project: yk/chazutsu
    def make(self,
             path_or_paths, vocab_size=-1, min_word_freq=0,
             separator="\t", reserved_words=(), target_column_indexes=()):
        vocab = Counter()
        paths = path_or_paths
        if isinstance(paths, str):
            paths = [paths]

        self.max_len = 0
        for p in paths:
            self.logger.info("Read {} to make vocabulary.".format(p))
            count = self.get_line_count(p)
            for words in xtqdm(self.fetch_line(p, target_column_indexes,
                               separator), total=count):
                for w in words:
                    vocab[w] += 1
                if len(words) > self.max_len:
                    self.max_len = len(words)

        _vocab = [word for word, count in vocab.most_common()
                  if count >= min_word_freq]
        _rv = reserved_words
        if len(_rv) == 0:
            _rv = [w for w in
                   [self.padding, self.unknown, self.end_of_sentence] if w]
        _vocab = list(_rv) + _vocab

        if vocab_size > 0:
            _vocab = _vocab[:vocab_size]

        self.logger.info(
            "The vocabulary count is {}. You can see it in {}.".format(
                len(_vocab), self._vocab_file_path))
        with open(self._vocab_file_path, "w", encoding="utf-8") as f:
            f.write("\n".join(_vocab))
        self._vocab = dict(zip(_vocab, range(len(_vocab))))
        self.__rev_vocab = {}
Example #14
File: movie_review.py Project: yk/chazutsu
    def _prepare_rating(self, dataset_root, extracted_path):
        rating_file_path = os.path.join(dataset_root, "review_rating.txt")
        rating_dir = os.path.join(extracted_path, "scaledata")

        rating_file = open(rating_file_path, "w", encoding="utf-8")
        for user in os.listdir(rating_dir):
            user_dir = os.path.join(rating_dir, user)
            if not os.path.isdir(user_dir):
                continue

            sub_in_review_file = os.path.join(user_dir, "subj." + user)
            user_rating_file = os.path.join(user_dir, "rating." + user)
            total = self.get_line_count(sub_in_review_file)
            self.logger.info("Extracting user {}'s rating data.".format(user))
            with open(sub_in_review_file, "r", encoding="utf-8") as sr:
                with open(user_rating_file, "r", encoding="utf-8") as ur:
                    for review, rating in xtqdm(zip(sr, ur), total=total):
                        _rv = review.strip().replace("\t", " ")
                        _r = rating.strip()
                        rating_file.write("\t".join([_r, _rv]) + "\n")

        rating_file.close()

        return rating_file_path