Example #1
def analyze_dataset_intent(dataset_dir: Path):
    train_data = json_load(dataset_dir / "train.json")
    train_intent_counter = Counter(map(lambda d: d["intent"], train_data))
    train_token_counter = Counter(
        chain.from_iterable(map(lambda d: d["text"].split(), train_data)))

    logger.info(
        f"Analyzing training data...\n"
        f"# Training data: {len(train_data)}\n"
        f"Example training data format: {json.dumps(train_data[0], indent=2)}\n\n"
        f"# Intents: {len(train_intent_counter)}\n"
        f"Intent distribution: {pretty_list(train_intent_counter.most_common())}]\n"
        f"# Unique words: {len(train_token_counter)}")

    val_data = json_load(dataset_dir / "eval.json")
    val_intent_counter = Counter(map(lambda d: d["intent"], val_data))
    val_token_counter = Counter(
        chain.from_iterable(map(lambda d: d["text"].split(), val_data)))

    logger.info(
        f"Analyzing validation data...\n"
        f"# Validation data: {len(val_data)}\n"
        f"Example validation data format: {json.dumps(val_data[0], indent=2)}\n\n"
        f"# Intents: {len(val_intent_counter)}\n"
        f"Intent distribution: {pretty_list(val_intent_counter.most_common())}\n"
        f"# Unique words: {len(val_token_counter)}")

    assert set(train_intent_counter) == set(val_intent_counter)

    test_data = json_load(dataset_dir / "test_release.json")
    test_token_counter = Counter(
        chain.from_iterable(map(lambda d: d["text"].split(), test_data)))
    logger.info(
        f"Analyzing testing data...\n"
        f"# Testing data: {len(test_data)}\n"
        f"Example validation data format: {json.dumps(test_data[0], indent=2)}\n\n"
        f"# Unique words: {len(test_token_counter)}")

    train_tokens = set(train_token_counter.keys())
    val_tokens = set(val_token_counter.keys())
    test_tokens = set(test_token_counter.keys())

    common_words = train_tokens & val_tokens & test_tokens

    logger.info(
        "\n"
        f"# Common words: {len(common_words)}\n"
        f"# In val but not in train: {len(val_tokens - train_tokens)}\n"
        f"# In test but not in train: {len(test_tokens - train_tokens)}")
Example #2
    def from_pretrained(cls, model_dir: Path):
        word_list = json_load(model_dir / "dictionary.json")
        embeddings = np.load(model_dir / "word_vectors.npy")

        word_list = [cls.PAD_TOKEN, cls.OOV_TOKEN] + word_list
        embeddings = np.concatenate([
            np.zeros((1, embeddings.shape[1])),
            embeddings.mean(axis=0).reshape(1, -1),
            embeddings,
        ])

        return cls(word_list, embeddings)
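
A hypothetical call site for the classmethod above; the owning class name Vocab, its embeddings attribute, and the use of torch.nn.Embedding are assumptions, not part of the original code:

from pathlib import Path

import torch

# "Vocab" stands in for whatever class defines from_pretrained above.
vocab = Vocab.from_pretrained(Path("pretrained/glove"))
embedding_layer = torch.nn.Embedding.from_pretrained(
    torch.tensor(vocab.embeddings, dtype=torch.float),
    freeze=False,
    padding_idx=0,  # PAD_TOKEN is prepended at index 0 by from_pretrained
)
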
Example #3
 def __init__(self, **kwargs):
     super(JSONStorage, self).__init__(**kwargs)
     tasks = {}
     if os.path.exists(self.path):
         tasks = json_load(self.path, int_keys=True)
     if len(tasks) == 0:
         self.data = {}
     elif isinstance(tasks, dict):
         self.data = tasks
     elif isinstance(tasks, list):
         self.data = {int(task['id']): task for task in tasks}
     self._save()
Example #4
def main(args):
    logger.info(args)

    logger.info("Analyzing context.json...")
    context = json_load(args.dataset_dir / "context.json")
    logger.info(f"#contexts: {len(context)}")
    context_lengths = sorted(map(len, context))
    logger.info("\n".join([
        "About the lengths (character level)",
        f"mean:\t{sum(context_lengths) / len(context):.2f}",
        f"min:\t{min(context_lengths)}",
        f"max:\t{max(context_lengths)}",
        f"90%:\t{context_lengths[int(len(context) * 0.9)]}",
        f">510:\t{sum(map(lambda t: len(t) > 510, context))} / {len(context)}",
    ]))

    def print_counter(counter):
        return "{\n" + "\n".join(f"  {k} -> {v}"
                                 for k, v in sorted(counter.items())) + "\n}"

    def analyze(json_name, is_private=False):
        logger.info(f"Analyzing {json_name}...")
        data = json_load(args.dataset_dir / json_name)
        logger.info(f"#training examples: {len(data)}")

        num_paragraph_counter = Counter(
            map(lambda d: len(d["paragraphs"]), data))
        logger.info(
            f"About the related paragraphs: {print_counter(num_paragraph_counter)}"
        )

        question_length_counter = Counter(
            map(lambda d: len(d["question"]) // 10, data))
        logger.info(
            f"About the question lengths: {print_counter(question_length_counter)}"
        )

        if not is_private:
            assert all(
                all(a["text"] == context[d["relevant"]][a["start"]:a["start"] +
                                                        len(a["text"])]
                    for a in d["answers"]) for d in data)

    analyze("train.json")
    analyze("public.json")
    analyze("private.json", is_private=True)
Example #5
def main(args):
    set_seed(args.seed)

    logger.info(
        f"Loading training data from {args.dataset_dir / 'train.json'}...")
    all_data = json_load(args.dataset_dir / "train.json")

    logger.info("Random shuffling the data...")
    random.shuffle(all_data)

    train_size = int(args.train_ratio * len(all_data))
    val_size = len(all_data) - train_size
    logger.info(f"Splitting the dataset into [{train_size}, {val_size}] sizes")

    train_data, val_data = all_data[:train_size], all_data[train_size:]

    json_dump(train_data, args.dataset_dir / "train_splitted.json")
    json_dump(val_data, args.dataset_dir / "val_splitted.json")
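
Example #5 also assumes a json_dump counterpart to json_load; a minimal sketch (the serialization options here are guesses):

import json
from pathlib import Path
from typing import Union


def json_dump(obj, path: Union[str, Path]) -> None:
    """Write obj to path as UTF-8 JSON."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)
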
Example #6
    def __init__(
        self,
        contexts: List[str],
        data: List[dict],
        tokenizer: Optional[BertTokenizer] = None,
        test: bool = False,
        include_nonrelevant: int = 0,
        split_name: str = "no_name",
        cache_dir: Optional[Path] = None,
        skip_preprocess: bool = False,
    ):
        super().__init__()
        self._contexts = contexts
        self._raw_data = data
        self.tokenizer = tokenizer
        self.test = test
        self.split_name = split_name

        if skip_preprocess:
            return

        cache_path = ((cache_dir /
                       f"_{split_name}_preprocessed_{include_nonrelevant}.json"
                       ) if cache_dir and split_name else None)

        if cache_path and cache_path.is_file():
            logger.info(
                f"Loading cached preprocessed dataset from {cache_path}...")
            self.data = json_load(cache_path)
        else:
            self.data = self.preprocess_dataset(
                self.tokenizer,
                contexts,
                data,
                include_nonrelevant=include_nonrelevant,
                test=self.test,
            )
            if cache_path:
                logger.info(
                    f"Saving cached preprocessed dataset to {cache_path}...")
                json_dump(self.data, cache_path)
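
A hypothetical way the dataset above might be constructed; the class name QADataset, the tokenizer checkpoint, and the file paths are assumptions:

from pathlib import Path

from transformers import BertTokenizer

# Hypothetical construction; QADataset stands in for the class whose __init__ is shown above.
contexts = json_load(Path("data/context.json"))
train_raw = json_load(Path("data/train.json"))
train_set = QADataset(
    contexts,
    train_raw,
    tokenizer=BertTokenizer.from_pretrained("bert-base-chinese"),
    split_name="train",
    cache_dir=Path("cache"),
)
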
Example #7
    def analyze(json_name, is_private=False):
        logger.info(f"Analyzing {json_name}...")
        data = json_load(args.dataset_dir / json_name)
        logger.info(f"#training examples: {len(data)}")

        num_paragraph_counter = Counter(
            map(lambda d: len(d["paragraphs"]), data))
        logger.info(
            f"About the related paragraphs: {print_counter(num_paragraph_counter)}"
        )

        question_length_counter = Counter(
            map(lambda d: len(d["question"]) // 10, data))
        logger.info(
            f"About the question lengths: {print_counter(question_length_counter)}"
        )

        if not is_private:
            assert all(
                all(a["text"] == context[d["relevant"]][a["start"]:a["start"] +
                                                        len(a["text"])]
                    for a in d["answers"]) for d in data)
Example #8
    def create_project_dir(cls, project_name, args):
        """
        Create project directory in args.root_dir/project_name, and initialize there all required files
        If some files are missed, restore them from defaults.
        If config files are specified by args, copy them in project directory
        :param project_name:
        :param args:
        :return:
        """
        dir = cls.get_project_dir(project_name, args)
        if args.force:
            delete_dir_content(dir)
        os.makedirs(dir, exist_ok=True)

        # config = json_load(args.config_path) if args.config_path else json_load(find_file('default_config.json'))
        config = json_load(
            args.config_path) if args.config_path else json_load(
                'utils/schema/default_config.json')

        def already_exists_error(what, path):
            raise RuntimeError(
                '{path} {what} already exists. Use "--force" option to recreate it.'
                .format(path=path, what=what))

        input_path = args.input_path or config.get('input_path')

        # save label config
        config_xml = 'config.xml'
        config_xml_path = os.path.join(dir, config_xml)
        label_config_file = args.label_config or config.get('label_config')
        if label_config_file:
            copy2(label_config_file, config_xml_path)
            print(label_config_file + ' label config copied to ' +
                  config_xml_path)
        else:
            if os.path.exists(config_xml_path) and not args.force:
                already_exists_error('label config', config_xml_path)
            if not input_path:
                # create default config with polygons only if input data is not set
                default_label_config = 'examples/adv_region_image/config.xml'
                copy2(default_label_config, config_xml_path)
                print(default_label_config + ' label config copied to ' +
                      config_xml_path)
            else:
                with io.open(config_xml_path, mode='w') as fout:
                    fout.write('<View></View>')
                print('Empty config has been created in ' + config_xml_path)

        config['label_config'] = config_xml

        if args.source:
            config['source'] = {
                'type': args.source,
                'path': args.source_path,
                'params': args.source_params
            }
        else:
            # save tasks.json
            tasks_json = 'tasks.json'
            tasks_json_path = os.path.join(dir, tasks_json)
            if input_path:
                tasks = cls._load_tasks(input_path, args, config_xml_path)
            else:
                tasks = {}
            with io.open(tasks_json_path, mode='w') as fout:
                json.dump(tasks, fout, indent=2)
            config['input_path'] = tasks_json
            config['source'] = {
                'name': 'Tasks',
                'type': 'tasks-json',
                'path': os.path.abspath(tasks_json_path)
            }
            logger.debug(
                '{tasks_json_path} input file with {n} tasks has been created from {input_path}'
                .format(tasks_json_path=tasks_json_path,
                        n=len(tasks),
                        input_path=input_path))

        if args.target:
            config['target'] = {
                'type': args.target,
                'path': args.target_path,
                'params': args.target_params
            }
        else:
            completions_dir = os.path.join(dir, 'completions')
            if os.path.exists(completions_dir) and not args.force:
                already_exists_error('output dir', completions_dir)
            if os.path.exists(completions_dir):
                delete_dir_content(completions_dir)
                print(completions_dir +
                      ' output dir already exists. Cleared it.')
            else:
                os.makedirs(completions_dir, exist_ok=True)
                print(completions_dir + ' output dir has been created.')
            config['output_dir'] = 'completions'
            config['target'] = {
                'name': 'Completions',
                'type': 'completions-dir',
                'path': os.path.abspath(completions_dir)
            }

        if 'ml_backends' not in config or not isinstance(
                config['ml_backends'], list):
            config['ml_backends'] = []
        if args.ml_backends:
            for url in args.ml_backends:
                config['ml_backends'].append(
                    cls._create_ml_backend_params(url, project_name))

        if args.sampling:
            config['sampling'] = args.sampling
        if args.port:
            config['port'] = args.port
        if args.host:
            config['host'] = args.host
        if args.allow_serving_local_files:
            config['allow_serving_local_files'] = True

        # create config.json
        config_json = 'config.json'
        config_json_path = os.path.join(dir, config_json)
        if os.path.exists(config_json_path) and not args.force:
            already_exists_error('config', config_json_path)
        with io.open(config_json_path, mode='w') as f:
            json.dump(config, f, indent=2)

        print('')
        print(
            'Label Studio has been successfully initialized. Check project states in '
            + dir)
        print('Start the server: label-studio start ' + dir)
        return dir
Example #9
 def from_json(cls, context_json: Path, data_json: Path, **kwargs):
     contexts = json_load(context_json)
     data = json_load(data_json)
     return cls(contexts, data, **kwargs)
Example #10
 def load(cls, json_path: Path, **kwargs):
     data = json_load(json_path)
     return cls(data, **kwargs)
Example #11
 def _load_ids(self):
     if self._save_to_file_enabled and os.path.exists(self._ids_file):
         self._ids_keys_map = json_load(self._ids_file, int_keys=True)
         self._keys_ids_map = {item['key']: id for id, item in self._ids_keys_map.items()}
Example #12
 def get(self, id):
     filename = os.path.join(self.path, str(id) + '.json')
     if os.path.exists(filename):
         return json_load(filename)
Example #13
 def _get_objects(self):
     self.data = json_load(self.path, int_keys=True)
     return (str(id) for id in self.data)
Example #14
 def items(self):
     for key in self.ids():
         filename = os.path.join(self.path, str(key) + '.json')
         yield key, json_load(filename)
Example #15
 def load(cls, config_json):
     return cls(json_load(config_json))