Example #1
    def download_data(self, data_path):
        if not is_done(Path(data_path)):
            download_decompress(
                url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
                download_path=data_path)
            mark_done(data_path)
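All of these readers rely on the is_done/mark_done pair to make downloads idempotent: download only when the target directory has not yet been marked, then mark it. Below is a minimal sketch of that convention, assuming the helpers simply check for and create a marker file inside the directory (the marker file name is an assumption for illustration, not taken from the examples).

from pathlib import Path

_DONE_MARKER = '.done'  # assumed marker file name, for illustration only

def is_done(path) -> bool:
    """Return True if the directory was already prepared."""
    return (Path(path) / _DONE_MARKER).exists()

def mark_done(path) -> None:
    """Create the marker file so later runs skip the download."""
    target = Path(path)
    target.mkdir(parents=True, exist_ok=True)
    (target / _DONE_MARKER).touch()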
Example #2
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads the ``'dstc2_v2.tar.gz'`` archive from the ipavlov internal server,
        decompresses and saves files to ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
            ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
            ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``.
        """
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train': self._read_from_file(
                Path(data_path, self._data_fname('trn')), dialogs),
            'valid': self._read_from_file(
                Path(data_path, self._data_fname('val')), dialogs),
            'test': self._read_from_file(
                Path(data_path, self._data_fname('tst')), dialogs)
        }
        return data
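A hedged usage sketch for a reader like the one above; the import path and the data directory are assumptions made for illustration, not taken from the example.

# Hypothetical usage; the module path and data directory are assumed.
from deeppavlov.dataset_readers.dstc2_reader import DSTC2DatasetReader

reader = DSTC2DatasetReader()
data = reader.read(data_path='downloads/dstc2')

print(len(data['train']), len(data['valid']), len(data['test']))
x_0, y_0 = data['train'][0]  # each split is a list of (x_i, y_i) tuples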
Example #3
    def build(data_path: str) -> Path:
        """Download and parse common misspellings list from `Wikipedia <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file
        """
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()
            data = []
            for pair in raw:
                typo, corrects = pair.strip().split('->')
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            log.info('Built')
        return fname
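If the helper above is used as-is, the result is a two-column tab-separated file of (typo, correction) pairs. A short, hedged sketch of consuming it, assuming build is in scope (in the source it is a static method of the reader class) and using an arbitrary target directory:

import csv

# Hypothetical usage; 'downloads' is an arbitrary target directory.
tsv_path = build('downloads')  # returns downloads/typos_wiki/misspelings.tsv

with tsv_path.open(encoding='utf8') as tsvfile:
    pairs = [tuple(row) for row in csv.reader(tsvfile, delimiter='\t')]

print(pairs[:3])  # list of (typo, correction) tuples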
Example #4
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads ``'kvrest_public.tar.gz'``, decompresses, saves files to ``data_path``.

        Parameters:
            data_path: path to save data
            dialogs: flag indices whether to output list of turns or list of dialogs

        Returns:
            dictionary with ``'train'`` containing dialogs from ``'kvret_train_public.json'``, ``'valid'`` containing dialogs from ``'kvret_valid_public.json'``, ``'test'`` containing dialogs from ``'kvret_test_public.json'``. Each fields is a list of tuples ``(x_i, y_i)``.
        """

        required_files = (self._data_fname(dt)
                          for dt in ('train', 'dev', 'test'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(
                self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('train')),
                                 dialogs),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('dev')),
                                 dialogs),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('test')),
                                 dialogs)
        }
        return data
Example #5
    def build(data_path: str):
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()
            data = []
            for pair in raw:
                typo, corrects = pair.strip().split('->')
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            log.info('Built')
        return fname
Example #6
    def build(data_path: str):
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            download(fname, url)

            with fname.open() as f:
                data = []
                # the downloaded file is the raw HTML page, so skip ahead to the <pre> block
                for line in f:
                    if line.strip().endswith('<pre>'):
                        break
                for line in f:
                    if line.strip().startswith('</pre>'):
                        break
                    # '->' appears as the HTML-escaped '-&gt;' in the page source
                    data.append(line.strip().split('-&gt;'))

            with fname.open('w', newline='') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            print('Built', file=sys.stderr)
        return fname
Example #7
    def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
        """
        Downloads the ``'dstc2_v2.tar.gz'`` archive from the ipavlov internal server,
        decompresses and saves files to ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
            ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
            ``'dstc2-tst.jsonlist'``. Each field is a list of tuples ``(x_i, y_i)``.
        """
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info(f"[downloading data from {self.url} to {data_path}]")
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('trn')),
                                 dialogs),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('val')),
                                 dialogs),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('tst')),
                                 dialogs)
        }
        return data
Example #8
    def read(self, data_path: str, queries_per_intent: Optional[int] = None, test_validate_split: float = 0.5,
             *args, **kwargs) -> \
            Dict[str, List[Dict[str, Any]]]:
        """
        Each query in the output has the following form:
            { 'intent': intent_name,
              'data': [ { 'text': text, ('entity': slot_name)? } ]
            }

        Args:
            data_path: A path to a folder with dataset files.
            queries_per_intent: Number of queries to load for each intent. None to load all.
                If the requested number is greater than available in file, all queries are returned.
            test_validate_split: Proportion of `_validate` files to be used as test dataset (since Snips
                is split into training and validation sets without a separate test set).
        """
        data_path = Path(data_path)
        intents = [
            'AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
            'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'
        ]

        if not is_done(data_path):
            url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        use_full_file = queries_per_intent is None or queries_per_intent > 70
        training_data = []
        validation_data = []
        test_data = []

        for intent in intents:
            intent_path = data_path / intent
            train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
            validate_file_name = f"validate_{intent}.json"

            train_queries = self._load_file(intent_path / train_file_name,
                                            intent, queries_per_intent)
            validate_queries = self._load_file(
                intent_path / validate_file_name, intent, queries_per_intent)
            num_test_queries = round(
                len(validate_queries) * test_validate_split)

            training_data.extend(train_queries)
            validation_data.extend(validate_queries[num_test_queries:])
            test_data.extend(validate_queries[:num_test_queries])

        return {
            'train': training_data,
            'valid': validation_data,
            'test': test_data
        }
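The validation files double as the test source here: with the default test_validate_split of 0.5, the first half of each intent's validate queries goes to the test split and the remainder stays in valid. A small standalone illustration of that slicing (the query values are fabricated):

# Standalone illustration of the test/valid slicing used above; queries are made up.
validate_queries = [f'query_{i}' for i in range(10)]
test_validate_split = 0.5

num_test_queries = round(len(validate_queries) * test_validate_split)  # 5
test_part = validate_queries[:num_test_queries]   # first 5 queries -> 'test'
valid_part = validate_queries[num_test_queries:]  # remaining 5 -> 'valid'

assert len(test_part) == 5 and len(valid_part) == 5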
Example #9
    def _download_data(self, data_path):
        """Download archive with the InsuranceQA dataset files and decompress if there is no dataset files in `data_path`.

        Args:
            data_path: A path to a folder where dataset files are stored.
        """
        if not is_done(Path(data_path)):
            download_decompress(
                url="http://files.deeppavlov.ai/datasets/insuranceQA-master.zip",
                download_path=data_path)
            mark_done(data_path)
Example #10
    def build(data_path: str):
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            print('Built', file=sys.stderr)
        return fname
Example #11
    def build(data_path: str):
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Example #12
    def __init__(self, data_dir=None, *args, **kwargs):
        if data_dir is None:
            data_dir = paths.USR_PATH
        data_dir = Path(data_dir)
        if self.dict_name is None:
            self.dict_name = args[0] if args else kwargs.get(
                'dictionary_name', 'dictionary')

        data_dir = data_dir / self.dict_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            print('Trying to build a dictionary in {}'.format(data_dir),
                  file=sys.stderr)
            if data_dir.is_dir():
                shutil.rmtree(data_dir)
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i + 1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            print('built', file=sys.stderr)
        else:
            print('Loading a dictionary from {}'.format(data_dir),
                  file=sys.stderr)

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
Example #13
    def __init__(self, data_dir=None, *args, **kwargs):
        if data_dir is None:
            data_dir = paths.USR_PATH
        data_dir = Path(data_dir)
        if self.dict_name is None:
            self.dict_name = args[0] if args else kwargs.get('dictionary_name', 'dictionary')

        data_dir = data_dir / self.dict_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            print('Trying to build a dictionary in {}'.format(data_dir), file=sys.stderr)
            if data_dir.is_dir():
                shutil.rmtree(data_dir)
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i+1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            print('built', file=sys.stderr)
        else:
            print('Loading a dictionary from {}'.format(data_dir), file=sys.stderr)

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
Example #14
    def read(self, data_path, dialogs=False):
        # TODO: mkdir if it doesn't exist

        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            print('Loading dstc2 from `{}` to `{}`'.format(self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train': self._read_from_file(
                Path(data_path, self._data_fname('trn')), dialogs),
            'valid': self._read_from_file(
                Path(data_path, self._data_fname('val')), dialogs),
            'test': self._read_from_file(
                Path(data_path, self._data_fname('tst')), dialogs)
        }
        return data
Example #15
    def __init__(self,
                 data_dir: [Path, str] = '',
                 *args,
                 dictionary_name: str = 'dictionary',
                 **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            log.info('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i + 1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.info('built')
        else:
            log.info('Loading a dictionary from {}'.format(data_dir))

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
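The words_trie built above is not a pointer-based trie but a plain dict that maps every prefix to the sorted list of its one-character extensions, with complete words mapping to an empty list. A small standalone illustration with a fabricated two-word vocabulary:

from collections import defaultdict

words = {'cat', 'car'}  # fabricated vocabulary for illustration

words_trie = defaultdict(set)
for word in words:
    for i in range(len(word)):
        words_trie[word[:i]].add(word[:i + 1])  # prefix -> prefix extended by one character
    words_trie[word] = set()                    # complete words have no extensions
words_trie = {k: sorted(v) for k, v in words_trie.items()}

# e.g. {'': ['c'], 'c': ['ca'], 'ca': ['car', 'cat'], 'cat': [], 'car': []} (key order may vary)
print(words_trie)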
Example #16
    def read(self,
             data_path: str,
             dialogs: bool = False,
             encoding='utf-8') -> Dict[str, List]:
        """
        Downloads the ``'simple_dstc2.tar.gz'`` archive from the internet,
        decompresses and saves files to ``data_path``.

        Parameters:
            data_path: path to save DSTC2 dataset
            dialogs: flag which indicates whether to output list of turns or
             list of dialogs

        Returns:
            dictionary that contains ``'train'`` field with dialogs from
            ``'simple-dstc2-trn.json'``, ``'valid'`` field with dialogs
            from ``'simple-dstc2-val.json'`` and ``'test'`` field with
            dialogs from ``'simple-dstc2-tst.json'``.
            Each field is a list of tuples ``(user turn, system turn)``.
        """
        # materialize the file list so it can be reused in the log message below
        required_files = [self._data_fname(dt) for dt in ('trn', 'val', 'tst')]
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info(f"{[Path(data_path, f) for f in required_files]}")
            log.info(f"[downloading data from {self.url} to {data_path}]")
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('trn')),
                                 dialogs, encoding),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('val')),
                                 dialogs, encoding),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('tst')),
                                 dialogs, encoding)
        }
        log.info(f"There are {len(data['train'])} samples in train split.")
        log.info(f"There are {len(data['valid'])} samples in valid split.")
        log.info(f"There are {len(data['test'])} samples in test split.")
        return data
Example #17
    def read(self, data_path, dialogs=False):
        required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
        if not all(Path(data_path, f).exists() for f in required_files):
            log.info('[downloading data from {} to {}]'.format(
                self.url, data_path))
            download_decompress(self.url, data_path)
            mark_done(data_path)

        data = {
            'train':
            self._read_from_file(Path(data_path, self._data_fname('trn')),
                                 dialogs),
            'valid':
            self._read_from_file(Path(data_path, self._data_fname('val')),
                                 dialogs),
            'test':
            self._read_from_file(Path(data_path, self._data_fname('tst')),
                                 dialogs)
        }
        return data
Example #18
    def __init__(self, data_dir: [Path, str]='', *args, dictionary_name: str='dictionary', **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            log.info('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i+1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.info('built')
        else:
            log.info('Loading a dictionary from {}'.format(data_dir))

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
Example #19
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Example #20
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Example #21
    def read(self, data_path, data_types=["train"]):
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
        data_path will be read)
        Args:
            data_path: directory with files
            data_types: types of considered data (possible: "train", "valid", "test")

        Returns:
            dictionary with types from data_types.
            Each field of dictionary is a list of tuples (x_i, y_i)
        """

        for data_type in data_types:
            if not Path(data_path).joinpath(data_type + ".csv").exists():
                print("Loading {} data from {} to {}".format(
                    data_type, self.url, data_path))
                download(source_url=self.url,
                         dest_file_path=Path(data_path).joinpath(data_type +
                                                                 ".csv"))
                mark_done(data_path)

        data = {}
        for data_type in data_types:
            data[data_type] = pd.read_csv(
                Path(data_path).joinpath(data_type + ".csv"))

        new_data = {'train': [], 'valid': [], 'test': []}

        for field in data_types:
            for i in range(data[field].shape[0]):
                new_data[field].append((data[field].loc[i, 'text'],
                                        data[field].loc[i, "intents"].split(",")))

        return new_data
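A hedged illustration of the CSV layout this reader expects: a 'text' column and an 'intents' column holding comma-separated labels (the file contents and intent names below are made up):

import pandas as pd
from io import StringIO

# Fabricated train.csv contents matching the columns accessed above.
csv_text = (
    'text,intents\n'
    'book a table for two,BookRestaurant\n'
    'will it rain tomorrow,"GetWeather,AskAboutWeather"\n'
)
df = pd.read_csv(StringIO(csv_text))

samples = [(df.loc[i, 'text'], df.loc[i, 'intents'].split(','))
           for i in range(df.shape[0])]
# [('book a table for two', ['BookRestaurant']),
#  ('will it rain tomorrow', ['GetWeather', 'AskAboutWeather'])]
print(samples)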
Example #22
    def read(self, data_path: str, url: Optional[str] = None,
             *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """
        Args:
            data_path: A path to a folder with dataset files.
            url: A url to the archive with the dataset to download if the data folder is empty.
        """
        data_path = Path(data_path)

        if url is None:
            url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

        if not is_done(data_path):
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        alternative_data_path = data_path / "aclImdb"
        if alternative_data_path.exists():
            data_path = alternative_data_path

        data = {"train": [],
                "test": []}
        for data_type in data.keys():
            for label in ["neg", "pos"]:
                labelpath = data_path / data_type / label
                if not labelpath.exists():
                    raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
                for filename in labelpath.glob("*.txt"):
                    with filename.open(encoding='utf-8') as f:
                        text = f.read()
                    data[data_type].append((text, [label]))

            if not data[data_type]:
                raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                                   "probably data dirs are empty")

        return data
Example #23
    def read(self, data_path, data_types=["train"]):
        """
        Read dataset from data_path directory.
        Reading files are all data_types + extension
        (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form
        data_path will be read)
        Args:
            data_path: directory with files
            data_types: types of considered data (possible: "train", "valid", "test")

        Returns:
            dictionary with types from data_types.
            Each field of dictionary is a list of tuples (x_i, y_i)
        """

        for data_type in data_types:
            if not Path(data_path).joinpath(data_type + ".csv").exists():
                print("Loading {} data from {} to {}".format(data_type, self.url, data_path))
                download(source_url=self.url,
                         dest_file_path=Path(data_path).joinpath(data_type + ".csv"))
                mark_done(data_path)

        data = {}
        for data_type in data_types:
            data[data_type] = pd.read_csv(Path(data_path).joinpath(data_type + ".csv"))

        new_data = {'train': [],
                    'valid': [],
                    'test': []}

        for field in data_types:
            for i in range(data[field].shape[0]):
                new_data[field].append(
                    (data[field].loc[i, 'text'], data[field].loc[i, "intents"].split(",")))

        return new_data
Example #24
    def read(self,
             data_path: Union[List, str],
             language: Optional[str] = None,
             data_types: Optional[List[str]] = None,
             **kwargs) -> Dict[str, List]:
        """Reads UD dataset from data_path.

        Args:
            data_path: can be either
                1. a directory containing files. The file for data_type 'mode'
                is then data_path / {language}-ud-{mode}.conllu
                2. a list of files, containing the same number of items as data_types
            language: a language to detect filename when it is not given
            data_types: which dataset parts among 'train', 'dev', 'test' are returned

        Returns:
            a dictionary containing dataset fragments (see ``read_infile``) for given data types
        """
        if data_types is None:
            data_types = ["train", "dev"]
        elif isinstance(data_types, str):
            data_types = [data_types]
        for data_type in data_types:
            if data_type not in ["train", "dev", "test"]:
                raise ValueError(
                    "Unknown data_type: {}, only train, dev and test "
                    "datatypes are allowed".format(data_type))
        if isinstance(data_path, str):
            data_path = Path(data_path)
        if isinstance(data_path, Path):
            if data_path.exists():
                is_file = data_path.is_file()
            else:
                is_file = (len(data_types) == 1)
            if is_file:
                # path to a single file
                data_path, reserve_data_path = [data_path], None
            else:
                # path to data directory
                if language is None:
                    raise ValueError("You must implicitly provide language "
                                     "when providing data directory as source")
                reserve_data_path = data_path
                data_path = [
                    data_path / "{}-ud-{}.conllu".format(language, mode)
                    for mode in data_types
                ]
                reserve_data_path = [
                    reserve_data_path / language /
                    "{}-ud-{}.conllu".format(language, mode)
                    for mode in data_types
                ]
        else:
            data_path = [Path(data_path) for data_path in data_path]
            reserve_data_path = None
        if len(data_path) != len(data_types):
            raise ValueError(
                "The number of input files in data_path and data types "
                "in data_types must be equal")
        has_missing_files = any(not filepath.exists()
                                for filepath in data_path)
        if has_missing_files and reserve_data_path is not None:
            has_missing_files = any(not filepath.exists()
                                    for filepath in reserve_data_path)
            if not has_missing_files:
                data_path = reserve_data_path
        if has_missing_files:
            # Files are downloaded from the Web repository
            dir_path = data_path[0].parent
            language = language or get_language(data_path[0].parts[-1])
            url = self.URL + "{}.tar.gz".format(language)
            log.info('[downloading data from {} to {}]'.format(url, dir_path))
            dir_path.mkdir(exist_ok=True, parents=True)
            download_decompress(url, dir_path)
            mark_done(dir_path)
        data = {}
        for mode, filepath in zip(data_types, data_path):
            if mode == "dev":
                mode = "valid"
#             if mode == "test":
#                 kwargs["read_only_words"] = True
            data[mode] = read_infile(filepath, **kwargs)
        return data
Example #25
    def _download_data(self, data_path: str) -> None:
        """Download dataset"""
        url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip"
        download_decompress(url, data_path)
        mark_done(data_path)
Example #26
    def _download_data(self, data_path: str) -> None:
        """Download dataset"""
        url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip"
        download_decompress(url, data_path)
        mark_done(data_path)