Code Example #1
File: typos_reader.py, Project: RileyShe/DeepPavlov
    def build(data_path: str) -> Path:
        """Download and parse common misspellings list from `Wikipedia <https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting tsv-file
        """
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()
            data = []
            for pair in raw:
                typo, corrects = pair.strip().split('->')
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='', encoding='utf8') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            log.info('Built')
        return fname
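
Every reader in this listing guards its download with the is_done/mark_done pair imported from DeepPavlov's data utilities. A minimal sketch of the marker-file convention these helpers appear to implement is shown below; the '.done' marker filename and the exact behavior are assumptions, not the library's confirmed internals.

    from pathlib import Path

    _DONE_MARKER = '.done'  # assumed marker filename

    def is_done(path: Path) -> bool:
        """True if a previous run finished building this directory."""
        return (Path(path) / _DONE_MARKER).exists()

    def mark_done(path: Path) -> None:
        """Drop an empty marker file so later runs skip the download."""
        path = Path(path)
        path.mkdir(parents=True, exist_ok=True)
        (path / _DONE_MARKER).touch()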
Code Example #2
    def download_data(self, data_path):
        if not is_done(Path(data_path)):
            download_decompress(
                url="http://lnsigo.mipt.ru/export/datasets/insuranceQA-master.zip",
                download_path=data_path)
            mark_done(data_path)
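
The snippet relies on download_decompress to fetch an archive and unpack it into download_path. A rough stand-in under those assumptions, handling only .zip archives with requests and the standard library (the real DeepPavlov helper also supports .tar.gz archives and reports progress), might look like:

    import io
    import zipfile
    from pathlib import Path

    import requests

    def download_decompress(url: str, download_path) -> None:
        # Sketch only: fetch the whole archive into memory and unpack it.
        Path(download_path).mkdir(parents=True, exist_ok=True)
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(download_path)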
Code Example #3
    def build(data_path: str):
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            page = requests.get(url)
            tree = html.fromstring(page.content)
            raw = tree.xpath('//pre/text()')[0].splitlines()
            data = []
            for pair in raw:
                typo, corrects = pair.strip().split('->')
                for correct in corrects.split(','):
                    data.append([typo.strip(), correct.strip()])

            fname.parent.mkdir(parents=True, exist_ok=True)
            with fname.open('w', newline='') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            log.info('Built')
        return fname
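
The parsing loop in both Wikipedia variants splits each raw line on '->' and fans a single typo out to every comma-separated correction. For an input line in that format (the entry below is illustrative, not quoted from the list):

    raw_line = 'adress->address, addressee'
    typo, corrects = raw_line.strip().split('->')
    rows = [[typo.strip(), correct.strip()] for correct in corrects.split(',')]
    print(rows)  # [['adress', 'address'], ['adress', 'addressee']]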
Code Example #4
File: typos.py, Project: youlei5898/DeepPavlov
    def build(data_path: str):
        data_path = Path(data_path) / 'typos_wiki'

        fname = data_path / 'misspelings.tsv'

        if not is_done(data_path):
            url = 'https://en.wikipedia.org/wiki/Wikipedia:Lists_of_common_misspellings/For_machines'

            download(fname, url)

            with fname.open() as f:
                data = []
                for line in f:
                    if line.strip().endswith('<pre>'):
                        break
                for line in f:
                    if line.strip().startswith('</pre>'):
                        break
                    data.append(line.strip().split('-&gt;'))

            with fname.open('w', newline='') as tsvfile:
                writer = csv.writer(tsvfile, delimiter='\t')
                for line in data:
                    writer.writerow(line)

            mark_done(data_path)

            print('Built', file=sys.stderr)
        return fname
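
Unlike the lxml-based variants, this version saves the raw HTML and scans it line by line, so the arrow is still HTML-escaped as -&gt; when the split runs, which is why the delimiter differs from the '->' used above. A quick check of that behavior:

    import html

    escaped = 'adress-&gt;address'   # how the arrow appears in raw HTML source
    print(escaped.split('-&gt;'))    # ['adress', 'address']
    print(html.unescape(escaped))    # 'adress->address'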
Code Example #5
File: snips_reader.py, Project: yinjie1230/DeepPavlov
    def read(self, data_path: str, queries_per_intent: Optional[int] = None,
             test_validate_split: float = 0.5, *args, **kwargs) -> Dict[str, List[Dict[str, Any]]]:
        """
        Each query in the output has the following form:
            { 'intent': intent_name,
              'data': [ { 'text': text, ('entity': slot_name)? } ]
            }

        Args:
            data_path: A path to a folder with dataset files.
            queries_per_intent: Number of queries to load for each intent. None to load all.
                If the requested number is greater than the number available in the file, all queries are returned.
            test_validate_split: Proportion of `_validate` files to be used as test dataset (since Snips
                is split into training and validation sets without a separate test set).
        """
        data_path = Path(data_path)
        intents = [
            'AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic',
            'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'
        ]

        if not is_done(data_path):
            url = 'http://files.deeppavlov.ai/datasets/snips.tar.gz'
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        use_full_file = queries_per_intent is None or queries_per_intent > 70
        training_data = []
        validation_data = []
        test_data = []

        for intent in intents:
            intent_path = data_path / intent
            train_file_name = f"train_{intent}{'_full' if use_full_file else ''}.json"
            validate_file_name = f"validate_{intent}.json"

            train_queries = self._load_file(intent_path / train_file_name,
                                            intent, queries_per_intent)
            validate_queries = self._load_file(
                intent_path / validate_file_name, intent, queries_per_intent)
            num_test_queries = round(
                len(validate_queries) * test_validate_split)

            training_data.extend(train_queries)
            validation_data.extend(validate_queries[num_test_queries:])
            test_data.extend(validate_queries[:num_test_queries])

        return {
            'train': training_data,
            'valid': validation_data,
            'test': test_data
        }
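
A usage sketch for this reader follows; the class name SnipsReader is assumed from the file name, and the download path is arbitrary:

    reader = SnipsReader()
    dataset = reader.read('snips_data', queries_per_intent=10)
    sample = dataset['train'][0]
    print(sample['intent'])  # e.g. 'AddToPlaylist'
    print(''.join(chunk['text'] for chunk in sample['data']))  # full query text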
Code Example #6
    def _download_data(self, data_path):
        """Download archive with the InsuranceQA dataset files and decompress if there is no dataset files in `data_path`.

        Args:
            data_path: A path to a folder where dataset files are stored.
        """
        if not is_done(Path(data_path)):
            download_decompress(
                url="http://files.deeppavlov.ai/datasets/insuranceQA-master.zip",
                download_path=data_path)
            mark_done(data_path)
Code Example #7
File: typos.py, Project: CuteCha/DeepPavlov
    def build(data_path: str):
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            print('Built', file=sys.stderr)
        return fname
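
The kartaslov variants call a plain download helper rather than download_decompress, since the target is a single CSV file. A minimal stand-in, assuming the helper simply streams the response to the destination path (the real one likely also reports progress):

    from pathlib import Path

    import requests

    def download(dest_file_path, source_url: str) -> None:
        # Sketch only: stream the remote file to disk in chunks.
        dest_file_path = Path(dest_file_path)
        dest_file_path.parent.mkdir(parents=True, exist_ok=True)
        with requests.get(source_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with dest_file_path.open('wb') as f:
                for chunk in response.iter_content(chunk_size=32768):
                    f.write(chunk)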
Code Example #8
    def build(data_path: str):
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Code Example #9
    def __init__(self, data_dir=None, *args, **kwargs):
        if data_dir is None:
            data_dir = paths.USR_PATH
        data_dir = Path(data_dir)
        if self.dict_name is None:
            self.dict_name = args[0] if args else kwargs.get(
                'dictionary_name', 'dictionary')

        data_dir = data_dir / self.dict_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            print('Trying to build a dictionary in {}'.format(data_dir),
                  file=sys.stderr)
            if data_dir.is_dir():
                shutil.rmtree(data_dir)
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i + 1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            print('built', file=sys.stderr)
        else:
            print('Loading a dictionary from {}'.format(data_dir),
                  file=sys.stderr)

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
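
The "trie" built here is a flat dict mapping every proper prefix to the sorted list of its one-character extensions, with complete words mapping to an empty list. A toy run makes the resulting shape concrete:

    from collections import defaultdict

    words = {'ab', 'ad'}
    words_trie = defaultdict(set)
    for word in words:
        for i in range(len(word)):
            words_trie[word[:i]].add(word[:i + 1])  # prefix -> extensions
        words_trie[word] = set()                    # full words are leaves
    words_trie = {k: sorted(v) for k, v in words_trie.items()}
    print(dict(sorted(words_trie.items())))
    # {'': ['a'], 'a': ['ab', 'ad'], 'ab': [], 'ad': []}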
Code Example #10
File: typos.py, Project: CuteCha/DeepPavlov
    def __init__(self, data_dir=None, *args, **kwargs):
        if data_dir is None:
            data_dir = paths.USR_PATH
        data_dir = Path(data_dir)
        if self.dict_name is None:
            self.dict_name = args[0] if args else kwargs.get('dictionary_name', 'dictionary')

        data_dir = data_dir / self.dict_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            print('Trying to build a dictionary in {}'.format(data_dir), file=sys.stderr)
            if data_dir.is_dir():
                shutil.rmtree(data_dir)
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i+1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            print('built', file=sys.stderr)
        else:
            print('Loading a dictionary from {}'.format(data_dir), file=sys.stderr)

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
Code Example #11
    def __init__(self,
                 data_dir: [Path, str] = '',
                 *args,
                 dictionary_name: str = 'dictionary',
                 **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            log.info('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i + 1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.info('built')
        else:
            log.info('Loading a dictionary from {}'.format(data_dir))

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
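
The alphabet build removes '⟬' and '⟭' because _normalize (not shown in any of these snippets) evidently brackets each word with those begin/end sentinels, which are markup rather than real characters of the language. A guess at such a normalizer, purely for illustration:

    def _normalize(word: str) -> str:
        # Assumed behavior: lowercase and wrap with boundary sentinels so
        # prefix/suffix edits can be modeled like ordinary substitutions.
        return '⟬{}⟭'.format(word.strip().lower())

    print(_normalize('Typo'))  # '⟬typo⟭'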
Code Example #12
File: typos.py, Project: RileyShe/DeepPavlov
    def __init__(self, data_dir: [Path, str] = '', *args, dictionary_name: str = 'dictionary', **kwargs):
        data_dir = expand_path(data_dir) / dictionary_name

        alphabet_path = data_dir / 'alphabet.pkl'
        words_path = data_dir / 'words.pkl'
        words_trie_path = data_dir / 'words_trie.pkl'

        if not is_done(data_dir):
            log.info('Trying to build a dictionary in {}'.format(data_dir))
            if data_dir.is_dir():
                shutil.rmtree(str(data_dir))
            data_dir.mkdir(parents=True)

            words = self._get_source(data_dir, *args, **kwargs)
            words = {self._normalize(word) for word in words}

            alphabet = {c for w in words for c in w}
            alphabet.remove('⟬')
            alphabet.remove('⟭')

            save_pickle(alphabet, alphabet_path)
            save_pickle(words, words_path)

            words_trie = defaultdict(set)
            for word in words:
                for i in range(len(word)):
                    words_trie[word[:i]].add(word[:i+1])
                words_trie[word] = set()
            words_trie = {k: sorted(v) for k, v in words_trie.items()}

            save_pickle(words_trie, words_trie_path)

            mark_done(data_dir)
            log.info('built')
        else:
            log.info('Loading a dictionary from {}'.format(data_dir))

        self.alphabet = load_pickle(alphabet_path)
        self.words_set = load_pickle(words_path)
        self.words_trie = load_pickle(words_trie_path)
Code Example #13
File: typos_reader.py, Project: netsafe/DeepVesnin
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Code Example #14
File: typos_reader.py, Project: RileyShe/DeepPavlov
    def build(data_path: str) -> Path:
        """Download misspellings list from `github <https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv>`_

        Args:
            data_path: target directory to download the data to

        Returns:
            path to the resulting csv-file
        """
        data_path = Path(data_path) / 'kartaslov'

        fname = data_path / 'orfo_and_typos.L1_5.csv'

        if not is_done(data_path):
            url = 'https://raw.githubusercontent.com/dkulagin/kartaslov/master/dataset/orfo_and_typos/orfo_and_typos.L1_5.csv'

            download(fname, url)

            mark_done(data_path)

            log.info('Built')
        return fname
Code Example #15
    def read(self, data_path: str, catalog: list,
             **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Load data from specific catalog

        Parameters:
            data_path: where the dataset is located
            catalog: names of the specific subcategories

        Returns:
            dataset: loaded dataset
        """

        logger.info(f"Ecommerce loader is loaded with catalog {catalog}")

        if not isinstance(catalog, list):
            catalog = [catalog]

        ec_data_global: List[Any] = []
        data_path = Path(expand_path(data_path))

        if not is_done(data_path):
            self._download_data(data_path)

        if data_path.is_dir():
            for fname in data_path.rglob("*.txt"):
                if any(cat in fname.name for cat in catalog):
                    logger.info(f"File {fname.name} is loaded")
                    ec_data_global += self._load_amazon_ecommerce_file(fname)

        dataset = {
            'train': [((item['Title'], [], {}), item) for item in ec_data_global],
            'valid': [],
            'test': []
        }

        logger.info(f"In total {len(ec_data_global)} items are loaded")
        return dataset
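
Each 'train' entry pairs a (title, history, state) query tuple with the full parsed item, so downstream code unpacks it in two steps. A hypothetical usage, with the reader class name and the catalog value assumed:

    reader = EcommerceReader()
    dataset = reader.read('ecommerce_data', catalog=['Electronics'])
    (query, history, state), item = dataset['train'][0]
    print(query)         # the item's Title
    print(sorted(item))  # the other parsed fields of the record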
Code Example #16
File: imdb_reader.py, Project: netsafe/DeepVesnin
    def read(self, data_path: str, url: Optional[str] = None,
             *args, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """
        Args:
            data_path: A path to a folder with dataset files.
            url: A url to the archive with the dataset to download if the data folder is empty.
        """
        data_path = Path(data_path)

        if url is None:
            url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

        if not is_done(data_path):
            log.info('[downloading data from {} to {}]'.format(url, data_path))
            download_decompress(url, data_path)
            mark_done(data_path)

        alternative_data_path = data_path / "aclImdb"
        if alternative_data_path.exists():
            data_path = alternative_data_path

        data = {"train": [],
                "test": []}
        for data_type in data.keys():
            for label in ["neg", "pos"]:
                labelpath = data_path / data_type / label
                if not labelpath.exists():
                    raise RuntimeError(f"Cannot load data: {labelpath} does not exist")
                for filename in labelpath.glob("*.txt"):
                    with filename.open(encoding='utf-8') as f:
                        text = f.read()
                    data[data_type].append((text, [label]))

            if not data[data_type]:
                raise RuntimeError(f"Could not load the '{data_type}' dataset, "
                                   "probably data dirs are empty")

        return data
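
A usage sketch for the IMDb reader; the class name ImdbReader is assumed from the file name, and the path is arbitrary:

    reader = ImdbReader()
    data = reader.read('imdb_data')
    text, label = data['train'][0]
    print(label)       # ['neg'] or ['pos']
    print(text[:80])   # start of the review text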
Code Example #17
    def read(self, data_path: str, catalog: list, **kwargs) -> Dict[str, List[Tuple[Any, Any]]]:
        """Load data from specific catalog

        Parameters:
            data_path: where the dataset is located
            catalog: names of the specific subcategories

        Returns:
            dataset: loaded dataset
        """

        logger.info(f"Ecommerce loader is loaded with catalog {catalog}")

        if not isinstance(catalog, list):
            catalog = [catalog]

        ec_data_global: List[Any] = []
        data_path = Path(expand_path(data_path))

        if not is_done(data_path):
            self._download_data(data_path)

        if data_path.is_dir():
            for fname in data_path.rglob("*.txt"):
                if any(cat in fname.name for cat in catalog):
                    logger.info(f"File {fname.name} is loaded")
                    ec_data_global += self._load_amazon_ecommerce_file(fname)

        dataset = {
            'train': [((item['Title'], [], {}), item) for item in ec_data_global],
            'valid': [],
            'test': []
        }

        logger.info(f"In total {len(ec_data_global)} items are loaded")
        return dataset