def download_data(self, data_path):
    if not is_done(Path(data_path)):
        download_decompress(
            url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
            download_path=data_path)
        mark_done(data_path)
def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
    """
    Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
    decompresses and saves files to ``data_path``.

    Parameters:
        data_path: path to save DSTC2 dataset
        dialogs: flag which indicates whether to output list of turns or
            list of dialogs

    Returns:
        dictionary that contains ``'train'`` field with dialogs from
        ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
        ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
        ``'dstc2-tst.jsonlist'``. Each field is a list of tuples
        ``(x_i, y_i)``.
    """
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info('[downloading data from {} to {}]'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(
            Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(
            Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(
            Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def build(data_path: str) -> Path: """Download and parse common misspellings list from `Wikipedia <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_ Args: data_path: target directory to download the data to Returns: path to the resulting tsv-file """ data_path = Path(data_path) / 'typos_wiki' fname = data_path / 'misspelings.tsv' if not is_done(data_path): url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines' page = requests.get(url) tree = html.fromstring(page.content) raw = tree.xpath('//pre/text()')[0].splitlines() data = [] for pair in raw: typo, corrects = pair.strip().split('->') for correct in corrects.split(','): data.append([typo.strip(), correct.strip()]) fname.parent.mkdir(parents=True, exist_ok=True) with fname.open('w', newline='', encoding='utf8') as tsvfile: writer = csv.writer(tsvfile, delimiter='\t') for line in data: writer.writerow(line) mark_done(data_path) log.info('Built') return fname
def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]: """ Downloads ``'kvrest_public.tar.gz'``, decompresses, saves files to ``data_path``. Parameters: data_path: path to save data dialogs: flag indices whether to output list of turns or list of dialogs Returns: dictionary with ``'train'`` containing dialogs from ``'kvret_train_public.json'``, ``'valid'`` containing dialogs from ``'kvret_valid_public.json'``, ``'test'`` containing dialogs from ``'kvret_test_public.json'``. Each fields is a list of tuples ``(x_i, y_i)``. """ required_files = (self._data_fname(dt) for dt in ('train', 'dev', 'test')) if not all(Path(data_path, f).exists() for f in required_files): log.info('[downloading dstc2 from {} to {}]'.format( self.url, data_path)) download_decompress(self.url, data_path) mark_done(data_path) data = { 'train': self._read_from_file(Path(data_path, self._data_fname('train')), dialogs), 'valid': self._read_from_file(Path(data_path, self._data_fname('dev')), dialogs), 'test': self._read_from_file(Path(data_path, self._data_fname('test')), dialogs) } return data
def build(data_path: str):
    data_path = Path(data_path) / 'typos_wiki'

    fname = data_path / 'misspelings.tsv'

    if not is_done(data_path):
        url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

        page = requests.get(url)
        tree = html.fromstring(page.content)
        raw = tree.xpath('//pre/text()')[0].splitlines()
        data = []
        for pair in raw:
            typo, corrects = pair.strip().split('->')
            for correct in corrects.split(','):
                data.append([typo.strip(), correct.strip()])

        fname.parent.mkdir(parents=True, exist_ok=True)
        with fname.open('w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for line in data:
                writer.writerow(line)

        mark_done(data_path)

        log.info('Built')
    return fname
def build(data_path: str):
    data_path = Path(data_path) / 'typos_wiki'

    fname = data_path / 'misspelings.tsv'

    if not is_done(data_path):
        url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

        download(fname, url)

        with fname.open() as f:
            data = []
            for line in f:
                if line.strip().endswith('<pre>'):
                    break
            for line in f:
                if line.strip().startswith('</pre>'):
                    break
                data.append(line.strip().split('->'))

        with fname.open('w', newline='') as tsvfile:
            writer = csv.writer(tsvfile, delimiter='\t')
            for line in data:
                writer.writerow(line)

        mark_done(data_path)

        print('Built', file=sys.stderr)
    return fname
def read(self, data_path: str, dialogs: bool = False) -> Dict[str, List]:
    """
    Downloads ``'dstc2_v2.tar.gz'`` archive from ipavlov internal server,
    decompresses and saves files to ``data_path``.

    Parameters:
        data_path: path to save DSTC2 dataset
        dialogs: flag which indicates whether to output list of turns or
            list of dialogs

    Returns:
        dictionary that contains ``'train'`` field with dialogs from
        ``'dstc2-trn.jsonlist'``, ``'valid'`` field with dialogs from
        ``'dstc2-val.jsonlist'`` and ``'test'`` field with dialogs from
        ``'dstc2-tst.jsonlist'``. Each field is a list of tuples
        ``(x_i, y_i)``.
    """
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info(f"[downloading data from {self.url} to {data_path}]")
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def read(self, data_path: str, queries_per_intent: Optional[int] = None,
         test_validate_split: float = 0.5, *args, **kwargs) -> Dict[str, List[Dict[str, Any]]]:
    """
    Each query in the output has the following form:

        {
            'intent': intent_name,
            'data': [{'text': text, ('entity': slot_name)?}]
        }

    Args:
        data_path: A path to a folder with dataset files.
        queries_per_intent: Number of queries to load for each intent. None to load all.
            If the requested number is greater than available in file, all queries are returned.
        test_validate_split: Proportion of `_validate` files to be used as test dataset
            (since Snips is split into training and validation sets without a separate test set).
    """
    data_path = Path(data_path)
    intents = [
        'AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
        'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'
    ]

    if not is_done(data_path):
        url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
        log.info('[downloading data from {} to {}]'.format(url, data_path))
        download_decompress(url, data_path)
        mark_done(data_path)

    use_full_file = queries_per_intent is None or queries_per_intent > 70
    training_data = []
    validation_data = []
    test_data = []

    for intent in intents:
        intent_path = data_path / intent
        train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
        validate_file_name = f"validate_{intent}.json"

        train_queries = self._load_file(intent_path / train_file_name, intent, queries_per_intent)
        validate_queries = self._load_file(intent_path / validate_file_name, intent, queries_per_intent)
        num_test_queries = round(len(validate_queries) * test_validate_split)

        training_data.extend(train_queries)
        validation_data.extend(validate_queries[num_test_queries:])
        test_data.extend(validate_queries[:num_test_queries])

    return {'train': training_data, 'valid': validation_data, 'test': test_data}
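# Self-contained sketch of the validate/test split performed in the reader
# above: `test_validate_split` carves off the first part of each validate
# file as test data and keeps the remainder as the validation set. The query
# strings below are toy stand-ins for the loaded Snips queries.
validate_queries = [f'q{i}' for i in range(10)]
test_validate_split = 0.5
num_test_queries = round(len(validate_queries) * test_validate_split)
test_part = validate_queries[:num_test_queries]    # first 5 queries go to 'test'
valid_part = validate_queries[num_test_queries:]   # remaining 5 queries go to 'valid'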
def _download_data(self, data_path): """Download archive with the InsuranceQA dataset files and decompress if there is no dataset files in `data_path`. Args: data_path: A path to a folder where dataset files are stored. """ if not is_done(Path(data_path)): download_decompress( url= "http://files.deeppavlov.ai/datasets/insuranceQA-master.zip", download_path=data_path) mark_done(data_path)
def build(data_path: str):
    data_path = Path(data_path) / 'kartaslov'

    fname = data_path / 'orfo_and_typos.L1_5.csv'

    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

        download(fname, url)

        mark_done(data_path)

        print('Built', file=sys.stderr)
    return fname
def build(data_path: str):
    data_path = Path(data_path) / 'kartaslov'

    fname = data_path / 'orfo_and_typos.L1_5.csv'

    if not is_done(data_path):
        url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

        download(fname, url)

        mark_done(data_path)

        log.info('Built')
    return fname
def __init__(self, data_dir=None, *args, **kwargs):
    if data_dir is None:
        data_dir = paths.USR_PATH
    data_dir = Path(data_dir)
    if self.dict_name is None:
        self.dict_name = args[0] if args else kwargs.get('dictionary_name', 'dictionary')

    data_dir = data_dir / self.dict_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        print('Trying to build a dictionary in {}'.format(data_dir), file=sys.stderr)
        if data_dir.is_dir():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')

        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i + 1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}

        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        print('built', file=sys.stderr)
    else:
        print('Loading a dictionary from {}'.format(data_dir), file=sys.stderr)

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
def __init__(self, data_dir=None, *args, **kwargs):
    if data_dir is None:
        data_dir = paths.USR_PATH
    data_dir = Path(data_dir)
    if self.dict_name is None:
        self.dict_name = args[0] if args else kwargs.get('dictionary_name', 'dictionary')

    data_dir = data_dir / self.dict_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        print('Trying to build a dictionary in {}'.format(data_dir), file=sys.stderr)
        if data_dir.is_dir():
            shutil.rmtree(data_dir)
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')

        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i+1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}

        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        print('built', file=sys.stderr)
    else:
        print('Loading a dictionary from {}'.format(data_dir), file=sys.stderr)

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
def read(self, data_path, dialogs=False):
    # TODO: mkdir if it doesn't exist
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        print('Loading dstc2 from `{}` to `{}`'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(
            Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(
            Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(
            Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
    data_dir = expand_path(data_dir) / dictionary_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        log.info('Trying to build a dictionary in {}'.format(data_dir))
        if data_dir.is_dir():
            shutil.rmtree(str(data_dir))
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')

        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i + 1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}

        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        log.info('built')
    else:
        log.info('Loading a dictionary from {}'.format(data_dir))

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
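# To make the `words_trie` layout built above concrete, this small
# self-contained sketch reproduces the same prefix-expansion loop on a toy
# vocabulary (the boundary markers added by `_normalize` are omitted here).
from collections import defaultdict

words = {'cat', 'car'}
words_trie = defaultdict(set)
for word in words:
    for i in range(len(word)):
        words_trie[word[:i]].add(word[:i + 1])  # each prefix maps to its one-char extensions
    words_trie[word] = set()
words_trie = {k: sorted(v) for k, v in words_trie.items()}
# words_trie == {'': ['c'], 'c': ['ca'], 'ca': ['car', 'cat'], 'car': [], 'cat': []}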
def read(self, data_path: str, dialogs: bool = False, encoding='utf-8') -> Dict[str, List]:
    """
    Downloads ``'simple_dstc2.tar.gz'`` archive from internet,
    decompresses and saves files to ``data_path``.

    Parameters:
        data_path: path to save DSTC2 dataset
        dialogs: flag which indicates whether to output list of turns or
            list of dialogs

    Returns:
        dictionary that contains ``'train'`` field with dialogs from
        ``'simple-dstc2-trn.json'``, ``'valid'`` field with dialogs from
        ``'simple-dstc2-val.json'`` and ``'test'`` field with dialogs from
        ``'simple-dstc2-tst.json'``. Each field is a list of tuples
        ``(user turn, system turn)``.
    """
    # A list (not a generator) so it can be both checked and logged below.
    required_files = [self._data_fname(dt) for dt in ('trn', 'val', 'tst')]
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info(f"{[Path(data_path, f) for f in required_files]}")
        log.info(f"[downloading data from {self.url} to {data_path}]")
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs, encoding),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs, encoding),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs, encoding)
    }
    log.info(f"There are {len(data['train'])} samples in train split.")
    log.info(f"There are {len(data['valid'])} samples in valid split.")
    log.info(f"There are {len(data['test'])} samples in test split.")
    return data
def read(self, data_path, dialogs=False):
    required_files = (self._data_fname(dt) for dt in ('trn', 'val', 'tst'))
    if not all(Path(data_path, f).exists() for f in required_files):
        log.info('[downloading data from {} to {}]'.format(self.url, data_path))
        download_decompress(self.url, data_path)
        mark_done(data_path)

    data = {
        'train': self._read_from_file(Path(data_path, self._data_fname('trn')), dialogs),
        'valid': self._read_from_file(Path(data_path, self._data_fname('val')), dialogs),
        'test': self._read_from_file(Path(data_path, self._data_fname('tst')), dialogs)
    }
    return data
def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
    data_dir = expand_path(data_dir) / dictionary_name

    alphabet_path = data_dir / 'alphabet.pkl'
    words_path = data_dir / 'words.pkl'
    words_trie_path = data_dir / 'words_trie.pkl'

    if not is_done(data_dir):
        log.info('Trying to build a dictionary in {}'.format(data_dir))
        if data_dir.is_dir():
            shutil.rmtree(str(data_dir))
        data_dir.mkdir(parents=True)

        words = self._get_source(data_dir, *args, **kwargs)
        words = {self._normalize(word) for word in words}

        alphabet = {c for w in words for c in w}
        alphabet.remove('⟬')
        alphabet.remove('⟭')

        save_pickle(alphabet, alphabet_path)
        save_pickle(words, words_path)

        words_trie = defaultdict(set)
        for word in words:
            for i in range(len(word)):
                words_trie[word[:i]].add(word[:i+1])
            words_trie[word] = set()
        words_trie = {k: sorted(v) for k, v in words_trie.items()}

        save_pickle(words_trie, words_trie_path)

        mark_done(data_dir)
        log.info('built')
    else:
        log.info('Loading a dictionary from {}'.format(data_dir))

    self.alphabet = load_pickle(alphabet_path)
    self.words_set = load_pickle(words_path)
    self.words_trie = load_pickle(words_trie_path)
def build(data_path: str) -> Path: """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_ Args: data_path: target directory to download the data to Returns: path to the resulting csv-file """ data_path = Path(data_path) / 'kartaslov' fname = data_path / 'orfo_and_typos.L1_5.csv' if not is_done(data_path): url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv' download(fname, url) mark_done(data_path) log.info('Built') return fname
def build(data_path: str) -> Path: """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_ Args: data_path: target directory to download the data to Returns: path to the resulting csv-file """ data_path = Path(data_path) / 'kartaslov' fname = data_path / 'orfo_and_typos.L1_5.csv' if not is_done(data_path): url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv' download(fname, url) mark_done(data_path) log.info('Built') return fname
def read(self, data_path, data_types=["train"]): """ Read dataset from data_path directory. Reading files are all data_types + extension (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form data_path will be read) Args: data_path: directory with files data_types: types of considered data (possible: "train", "valid", "test") Returns: dictionary with types from data_types. Each field of dictionary is a list of tuples (x_i, y_i) """ for data_type in data_types: if not Path(data_path).joinpath(data_type + ".csv").exists(): print("Loading {} data from {} to {}".format( data_type, self.url, data_path)) download(source_url=self.url, dest_file_path=Path(data_path).joinpath(data_type + ".csv")) mark_done(data_path) data = {} for data_type in data_types: data[data_type] = pd.read_csv( Path(data_path).joinpath(data_type + ".csv")) new_data = {'train': [], 'valid': [], 'test': []} for field in data_types: for i in range(data[field].shape[0]): new_data[field].append((data[field].loc[i, 'text'], data[field].loc[i, "intents"].split(","))) return new_data
def read(self, data_path: str, url: Optional[str] = None,
         *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
    """
    Args:
        data_path: A path to a folder with dataset files.
        url: A url to the archive with the dataset to download if the data folder is empty.
    """
    data_path = Path(data_path)

    if url is None:
        url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

    if not is_done(data_path):
        log.info('[downloading data from {} to {}]'.format(url, data_path))
        download_decompress(url, data_path)
        mark_done(data_path)

    alternative_data_path = data_path / "aclImdb"
    if alternative_data_path.exists():
        data_path = alternative_data_path

    data = {"train": [], "test": []}
    for data_type in data.keys():
        for label in ["neg", "pos"]:
            labelpath = data_path / data_type / label
            if not labelpath.exists():
                raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
            for filename in labelpath.glob("*.txt"):
                with filename.open(encoding='utf-8') as f:
                    text = f.read()
                data[data_type].append((text, [label]))

        if not data[data_type]:
            raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                               "probably data dirs are empty")

    return data
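# Self-contained sketch of the directory layout the loop above expects
# (aclImdb/<split>/<label>/*.txt). It writes a toy tree into a temporary
# directory and re-reads it with the same glob pattern; file names and review
# text are illustrative, not taken from the actual dataset.
import tempfile
from pathlib import Path

root = Path(tempfile.mkdtemp()) / 'aclImdb'
for data_type in ('train', 'test'):
    for label in ('neg', 'pos'):
        d = root / data_type / label
        d.mkdir(parents=True)
        (d / '0_1.txt').write_text('a toy review', encoding='utf-8')

data = {'train': [], 'test': []}
for data_type in data:
    for label in ('neg', 'pos'):
        for filename in (root / data_type / label).glob('*.txt'):
            data[data_type].append((filename.read_text(encoding='utf-8'), [label]))
# data['train'] == [('a toy review', ['neg']), ('a toy review', ['pos'])]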
def read(self, data_path, data_types=["train"]): """ Read dataset from data_path directory. Reading files are all data_types + extension (i.e for data_types=["train", "valid"] files "train.csv" and "valid.csv" form data_path will be read) Args: data_path: directory with files data_types: types of considered data (possible: "train", "valid", "test") Returns: dictionary with types from data_types. Each field of dictionary is a list of tuples (x_i, y_i) """ for data_type in data_types: if not Path(data_path).joinpath(data_type + ".csv").exists(): print("Loading {} data from {} to {}".format(data_type, self.url, data_path)) download(source_url=self.url, dest_file_path=Path(data_path).joinpath(data_type + ".csv")) mark_done(data_path) data = {} for data_type in data_types: data[data_type] = pd.read_csv(Path(data_path).joinpath(data_type + ".csv")) new_data = {'train': [], 'valid': [], 'test': []} for field in data_types: for i in range(data[field].shape[0]): new_data[field].append( (data[field].loc[i, 'text'], data[field].loc[i, "intents"].split(","))) return new_data
def read(self, data_path: Union[List, str],
         language: Optional[str] = None,
         data_types: Optional[List[str]] = None,
         **kwargs) -> Dict[str, List]:
    """Reads UD dataset from data_path.

    Args:
        data_path: can be either
            1. a directory containing files. The file for data_type 'mode'
               is then data_path / {language}-ud-{mode}.conllu
            2. a list of files, containing the same number of items as data_types
        language: a language to detect filename when it is not given
        data_types: which dataset parts among 'train', 'dev', 'test' are returned

    Returns:
        a dictionary containing dataset fragments (see ``read_infile``) for given data types
    """
    if data_types is None:
        data_types = ["train", "dev"]
    elif isinstance(data_types, str):
        data_types = [data_types]
    for data_type in data_types:
        if data_type not in ["train", "dev", "test"]:
            raise ValueError("Unknown data_type: {}, only train, dev and test "
                             "datatypes are allowed".format(data_type))
    if isinstance(data_path, str):
        data_path = Path(data_path)
    if isinstance(data_path, Path):
        if data_path.exists():
            is_file = data_path.is_file()
        else:
            is_file = (len(data_types) == 1)
        if is_file:
            # path to a single file
            data_path, reserve_data_path = [data_path], None
        else:
            # path to data directory
            if language is None:
                raise ValueError("You must explicitly provide language "
                                 "when providing data directory as source")
            reserve_data_path = data_path
            data_path = [data_path / "{}-ud-{}.conllu".format(language, mode)
                         for mode in data_types]
            reserve_data_path = [reserve_data_path / language / "{}-ud-{}.conllu".format(language, mode)
                                 for mode in data_types]
    else:
        data_path = [Path(data_path) for data_path in data_path]
        reserve_data_path = None
    if len(data_path) != len(data_types):
        raise ValueError("The number of input files in data_path and data types "
                         "in data_types must be equal")
    has_missing_files = any(not filepath.exists() for filepath in data_path)
    if has_missing_files and reserve_data_path is not None:
        has_missing_files = any(not filepath.exists() for filepath in reserve_data_path)
        if not has_missing_files:
            data_path = reserve_data_path
    if has_missing_files:
        # Files are downloaded from the Web repository
        dir_path = data_path[0].parent
        language = language or get_language(data_path[0].parts[-1])
        url = self.URL + "{}.tar.gz".format(language)
        log.info('[downloading data from {} to {}]'.format(url, dir_path))
        dir_path.mkdir(exist_ok=True, parents=True)
        download_decompress(url, dir_path)
        mark_done(dir_path)
    data = {}
    for mode, filepath in zip(data_types, data_path):
        if mode == "dev":
            mode = "valid"
        # if mode == "test":
        #     kwargs["read_only_words"] = True
        data[mode] = read_infile(filepath, **kwargs)
    return data
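# Self-contained sketch of the filename convention the reader above builds when
# given a data directory: one '{language}-ud-{mode}.conllu' path per requested
# data type, with a per-language subdirectory as the fallback ("reserve")
# location. The directory and language code here are illustrative assumptions.
from pathlib import Path

data_dir = Path('./downloads/ud')
language = 'en_ewt'
data_types = ['train', 'dev']

primary = [data_dir / '{}-ud-{}.conllu'.format(language, mode) for mode in data_types]
reserve = [data_dir / language / '{}-ud-{}.conllu'.format(language, mode) for mode in data_types]
# primary[0] -> downloads/ud/en_ewt-ud-train.conllu
# reserve[0] -> downloads/ud/en_ewt/en_ewt-ud-train.conllu
# Note: the 'dev' part is exposed as the 'valid' key in the returned dictionary.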
def _download_data(self, data_path: str) -> None: """Download dataset""" url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip" download_decompress(url, data_path) mark_done(data_path)
def _download_data(self, data_path: str) -> None: """Download dataset""" url = "https://github.com/SamTube405/Amazon-E-commerce-Data-set/archive/master.zip" download_decompress(url, data_path) mark_done(data_path)