Example #1
    def build(data_path: str):
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            download(fname, url)

            # The downloaded page is raw HTML; keep only the lines between
            # <pre> and </pre>, where each entry reads "misspelling-&gt;correction"
            # ('-&gt;' is the HTML-escaped '->').
            with fname.open() as f:
                data = []
                for line in f:  # skip everything up to the opening <pre>
                    if line.strip().endswith('<pre>'):
                        break
                for line in f:  # collect entries until the closing </pre>
                    if line.strip().startswith('</pre>'):
                        break
                    data.append(line.strip().split('-&gt;'))

            # Overwrite the downloaded page with a clean tab-separated table.
            with fname.open('w', newline='') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            print('Built', file=sys.stderr)
        return fname
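
The is_done and mark_done helpers come from DeepPavlov's data utilities and are not shown in these examples. A minimal sketch of the idea, assuming the usual marker-file convention (an empty .done file in the data directory; the marker name is an assumption):

from pathlib import Path

def is_done(path) -> bool:
    # The dataset counts as built once the marker file exists.
    return (Path(path) / '.done').exists()

def mark_done(path) -> None:
    # Create the directory if needed, then drop an empty marker file.
    path = Path(path)
    path.mkdir(parents=True, exist_ok=True)
    (path / '.done').touch()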
Example #2
def download_resource(url, dest_paths):
    dest_paths = list(dest_paths)

    if url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
Example #3
def download_resource(url: str, dest_paths: Iterable[Path]) -> None:
    dest_paths = list(dest_paths)

    if check_md5(url, dest_paths):
        log.info(f'Skipped {url} download because of matching hashes')
    elif url.endswith(('.tar.gz', '.gz', '.zip')):
        download_path = dest_paths[0].parent
        download_decompress(url, download_path, dest_paths)
    else:
        file_name = url.split('/')[-1]
        dest_files = [dest_path / file_name for dest_path in dest_paths]
        download(dest_files, url)
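
check_md5 is likewise not shown. A plausible building block for it: hash each destination file in streaming fashion and compare the digest with an expected value (where the expected digests come from, e.g. a sidecar .md5 file published next to the URL, is an assumption):

import hashlib
from pathlib import Path

def file_md5(path: Path, chunk_size: int = 2 ** 16) -> str:
    # Stream the file in chunks so large archives need not fit in memory.
    md5 = hashlib.md5()
    with path.open('rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()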
Example #4
    def __init__(self, data_dir: str = '', data_url: str = DB_URL, batch_size: int = None,
                 shuffle: bool = None, seed: int = None, **kwargs):

        download_dir = expand_path(data_dir)
        download_path = download_dir.joinpath(data_url.split("/")[-1])
        # force_download=False keeps an already-downloaded copy of the database.
        download(download_path, data_url, force_download=False)

        self.connect = sqlite3.connect(str(download_path), check_same_thread=False)
        self.db_name = self.get_db_name()
        self.doc_ids = self.get_doc_ids()
        self.doc2index = self.map_doc2idx()
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.random = Random(seed)
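
The get_db_name, get_doc_ids and map_doc2idx methods sit outside this snippet. For orientation, a sketch of how the document ids could be fetched over the same connection, assuming a table named documents with an id column (both names are hypothetical):

    def get_doc_ids(self):
        # Pull every document id from the (assumed) documents table.
        cursor = self.connect.cursor()
        cursor.execute('SELECT id FROM documents')
        ids = [row[0] for row in cursor.fetchall()]
        cursor.close()
        return ids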
Example #5
    def build(data_path: str):
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            print('Built', file=sys.stderr)
        return fname
Example #6
    def read(self, data_path, file_name: str = 'ontonotes_senna.pckl', provide_senna_pos=False, provide_senna_ner=False):
        path = Path(data_path).resolve() / file_name
        if not path.exists():
            # Fetch the pickled dataset on first use.
            download(str(path), self.URL)
        with open(path, 'rb') as f:
            dataset = pickle.load(f)

        dataset_filtered = {}
        for key, data in dataset.items():
            dataset_filtered[key] = []
            for (toks, pos, ner), tags in data:
                if not provide_senna_pos and not provide_senna_ner:
                    dataset_filtered[key].append((toks, tags))
                else:
                    x = [toks]
                    if provide_senna_pos:
                        x.append(pos)
                    if provide_senna_ner:
                        x.append(ner)
                    dataset_filtered[key].append((x, tags))

        return dataset_filtered
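
A hypothetical call illustrating the shape of the result: with both flags off each item is (tokens, tags); with provide_senna_pos=True the first element becomes [tokens, pos]. The reader class name below is assumed:

reader = OntonotesReader()  # hypothetical owner of the read() method above
dataset = reader.read('data/ontonotes', provide_senna_pos=True)
(tokens, pos), tags = dataset['train'][0]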
Example #7
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Example #8
    def read(self, data_path, data_types=["train"]):
        """
        Read a dataset from the data_path directory.
        The files read are all data_types plus the ".csv" extension
        (i.e. for data_types=["train", "valid"] the files "train.csv" and "valid.csv"
        from data_path will be read).

        Args:
            data_path: directory with the files
            data_types: types of data to consider (possible: "train", "valid", "test")

        Returns:
            dictionary keyed by the types from data_types.
            Each value is a list of (x_i, y_i) tuples
        """

        for data_type in data_types:
            if not Path(data_path).joinpath(data_type + ".csv").exists():
                print("Loading {} data from {} to {}".format(data_type, self.url, data_path))
                download(source_url=self.url,
                         dest_file_path=Path(data_path).joinpath(data_type + ".csv"))
                mark_done(data_path)

        data = {}
        for data_type in data_types:
            data[data_type] = pd.read_csv(Path(data_path).joinpath(data_type + ".csv"))

        new_data = {'train': [],
                    'valid': [],
                    'test': []}

        for field in data_types:
            for i in range(data[field].shape[0]):
                new_data[field].append(
                    (data[field].loc[i, 'text'], data[field].loc[i, "intents"].split(",")))

        return new_data
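
A hypothetical usage of this reader (the owning class name is assumed); each returned item pairs a text with its list of intent labels:

reader = CsvClassificationReader()  # hypothetical owner of the read() method above
data = reader.read('data/intents', data_types=['train', 'valid'])
text, intents = data['train'][0]  # intents is a list of label strings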
Example #9
def download_resources(args):
    if args.all:
        urls = ALL_URLS
    else:
        urls = REQ_URLS

    # The target directories do not depend on the current url, so set them up once.
    download_path = Path('../download')
    download_path.mkdir(exist_ok=True)
    embeddings_path = download_path.joinpath('embeddings')

    for url in urls:
        dest_path = download_path

        if url in EMBEDDING_URLS:
            embeddings_path.mkdir(exist_ok=True)
            dest_path = embeddings_path.joinpath(url.split("/")[-1])
            download(dest_path, url)

        elif url in DATA_URLS:
            dest_path = download_path.joinpath(url.split("/")[-1].split(".")[0])
            download_decompress(url, dest_path)

        else:
            download_decompress(url, dest_path)
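
download_resources only needs an object with a boolean all attribute; a minimal argparse wiring sketch (the flag spelling is an assumption):

import argparse

parser = argparse.ArgumentParser(description='Download DeepPavlov resources')
parser.add_argument('-all', action='store_true',
                    help='fetch optional resources in addition to the required ones')
download_resources(parser.parse_args())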
Example #10
    def __init__(self, emb_folder: str, emb_url: str, save_path: str, load_path: str,
                 context_limit: int = 450, question_limit: int = 150, char_limit: int = 16,
                 level: str = 'token', *args, **kwargs):
        self.emb_folder = expand_path(emb_folder)
        self.level = level
        self.emb_url = emb_url
        self.emb_file_name = Path(emb_url).name
        self.save_path = expand_path(save_path)
        self.load_path = expand_path(load_path)
        self.context_limit = context_limit
        self.question_limit = question_limit
        self.char_limit = char_limit
        self.loaded = False

        self.NULL = "<NULL>"
        self.OOV = "<OOV>"

        self.emb_folder.mkdir(parents=True, exist_ok=True)

        if not (self.emb_folder / self.emb_file_name).exists():
            download(self.emb_folder / self.emb_file_name, self.emb_url)

        if self.load_path.exists():
            self.load()
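
A hypothetical instantiation (class name and paths are assumed). The constructor downloads the embedding file only when it is missing and restores saved state when load_path exists:

embedder = SquadVocabEmbedder(  # hypothetical owner of the __init__ above
    emb_folder='data/embeddings',
    emb_url='http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt',
    save_path='data/squad/vocab.pckl',
    load_path='data/squad/vocab.pckl')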
Example #11
    def read(self,
             data_path: str,
             url: str = None,
             format: str = "csv",
             class_sep: str = ",",
             *args,
             **kwargs) -> dict:
        """
        Read a dataset from the data_path directory.
        The files read are all data_types plus the extension
        (i.e. for data_types=["train", "valid"] the files "train.csv" and "valid.csv"
        from data_path will be read).

        Args:
            data_path: directory with the files
            url: URL to download the data files from if data_path does not exist or is empty
            format: extension of the files. Possible values: ``"csv", "json"``
            class_sep: string separating the labels in the labels column
            sep (str): delimiter for ``"csv"`` files. Default: ``","``
            header (int): row number to use as the column names
            names (array): list of column names to use
            orient (str): indication of the expected JSON string format
            lines (boolean): read the file as one JSON object per line. Default: ``False``

        Returns:
            dictionary keyed by the types from data_types.
            Each value is a list of (x_i, y_i) tuples
        """
        data_types = ["train", "valid", "test"]

        train_file = kwargs.get('train', 'train.csv')

        if not Path(data_path, train_file).exists():
            if url is None:
                raise Exception(
                    "data path {} does not exist or is empty, and download url parameter not specified!"
                    .format(data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url,
                     dest_file_path=Path(data_path, train_file))

        data = {"train": [], "valid": [], "test": []}
        for data_type in data_types:
            file_name = kwargs.get(data_type,
                                   '{}.{}'.format(data_type, format))
            file = Path(data_path).joinpath(file_name)
            if file.exists():
                if format == 'csv':
                    keys = ('sep', 'header', 'names')
                    options = {k: kwargs[k] for k in keys if k in kwargs}
                    df = pd.read_csv(file, **options)
                elif format == 'json':
                    keys = ('orient', 'lines')
                    options = {k: kwargs[k] for k in keys if k in kwargs}
                    df = pd.read_json(file, **options)
                else:
                    raise Exception(
                        'Unsupported file format: {}'.format(format))

                x = kwargs.get("x", "text")
                y = kwargs.get('y', 'labels')
                if isinstance(x, list):
                    data[data_type] = [([row[x_] for x_ in x],
                                        str(row[y]).split(class_sep))
                                       for _, row in df.iterrows()]
                else:
                    data[data_type] = [(row[x], str(row[y]).split(class_sep))
                                       for _, row in df.iterrows()]
            else:
                log.warning("Cannot find {} file".format(file))

        return data
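
A hypothetical call showing how the pandas-specific kwargs pass through to pd.read_csv (the URL, file layout and column names are assumptions):

data = reader.read('data/my_dataset',
                   url='http://example.com/my_dataset/train.csv',  # hypothetical
                   format='csv', sep='\t', names=['text', 'labels'])
train_pairs = data['train']  # list of (text, [label, ...]) tuples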
Example #12
def _build_slot_vals(slot_vals_json_path='data/'):
    url = 'http://files.deeppavlov.ai/datasets/dstc_slot_vals.json'
    download(slot_vals_json_path, url)
Example #13
    def _download_slot_vals(self):
        url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
        download(self.save_path, url)
Example #14
    def load(self):
        url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
        download(self.save_path, url)
Example #15
    def _download_slot_vals(self):
        url = 'http://files.deeppavlov.ai/datasets/dstc_slot_vals.json'
        download(self.save_path, url)
Example #16
def _build_slot_vals(slot_vals_json_path='data/'):
    url = 'http://files.deeppavlov.ai/datasets/dstc_slot_vals.json'
    download(slot_vals_json_path, url)
Example #17
    def read(self, data_path: str, url: str = None,
             format: str = "csv", class_sep: str = ",",
             *args, **kwargs) -> dict:
        """
        Read a dataset from the data_path directory.
        The files read are all data_types plus the extension
        (i.e. for data_types=["train", "valid"] the files "train.csv" and "valid.csv"
        from data_path will be read).

        Args:
            data_path: directory with the files
            url: URL to download the data files from if data_path does not exist or is empty
            format: extension of the files. Possible values: ``"csv", "json"``
            class_sep: string separating the labels in the labels column
            sep (str): delimiter for ``"csv"`` files. Default: ``","``
            header (int): row number to use as the column names
            names (array): list of column names to use
            orient (str): indication of the expected JSON string format
            lines (boolean): read the file as one JSON object per line. Default: ``False``

        Returns:
            dictionary keyed by the types from data_types.
            Each value is a list of (x_i, y_i) tuples
        """
        data_types = ["train", "valid", "test"]

        train_file = kwargs.get('train', 'train.csv')

        if not Path(data_path, train_file).exists():
            if url is None:
                raise Exception("data path {} does not exist or is empty, and download url parameter not specified!".format(data_path))
            log.info("Loading train data from {} to {}".format(url, data_path))
            download(source_url=url, dest_file_path=Path(data_path, train_file))

        data = {"train": [],
                "valid": [],
                "test": []}
        for data_type in data_types:
            file_name = kwargs.get(data_type, '{}.{}'.format(data_type, format))
            file = Path(data_path).joinpath(file_name)
            if file.exists():
                if format == 'csv':
                    keys = ('sep', 'header', 'names')
                    options = {k: kwargs[k] for k in keys if k in kwargs}
                    df = pd.read_csv(file, **options)
                elif format == 'json':
                    keys = ('orient', 'lines')
                    options = {k: kwargs[k] for k in keys if k in kwargs}
                    df = pd.read_json(file, **options)
                else:
                    raise Exception('Unsupported file format: {}'.format(format))

                x = kwargs.get("x", "text")
                y = kwargs.get('y', 'labels')
                if isinstance(x, list):
                    data[data_type] = [([row[x_] for x_ in x], str(row[y]).split(class_sep)) for _, row in df.iterrows()]
                else:
                    data[data_type] = [(row[x], str(row[y]).split(class_sep)) for _, row in df.iterrows()]
            else:
                log.warning("Cannot find {} file".format(file))

        return data
Example #18
    def _download_slot_vals(self):
        url = 'http://files.deeppavlov.ai/datasets/dstc_slot_vals.json'
        download(self.save_path, url)
Example #19
def _build_slot_vals(slot_vals_json_path='data/'):
    url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
    download(slot_vals_json_path, url)
Example #20
def _build_slot_vals(slot_vals_json_path='data/'):
    url = 'http://lnsigo.mipt.ru/export/datasets/dstc_slot_vals.json'
    download(slot_vals_json_path, url)
Example #21
import os

from deeppavlov.core.data.utils import download

if not os.path.exists('data/models/glove.txt'):
    download(
        'data/models/glove.txt',
        source_url='http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt')
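
The existence check can also lean on download's own force_download flag, as Example #4 does; a sketch under that assumption:

from pathlib import Path

from deeppavlov.core.data.utils import download

dest = Path('data/models/glove.txt')
dest.parent.mkdir(parents=True, exist_ok=True)
# force_download=False (as in Example #4) should keep an existing copy.
download(dest, 'http://files.deeppavlov.ai/embeddings/glove.6B.100d.txt',
         force_download=False)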