Example #1
    def test_create_delete(self):
        io_utils.create_path(self.empty_file)
        io_utils.create_folder(self.cur_dir)

        self.assertEqual(None, io_utils.check_file_readable(self.empty_file))
        self.assertEqual(None, io_utils.check_folder_readable(self.cur_dir))

        cfolder = 'test_folder'
        cfile = os.path.join(cfolder, 'test_file.txt')

        io_utils.create_path(cfile)
        with open(cfile, 'w') as ostream:
            ostream.write('')
        self.assertEqual(None, io_utils.check_file_readable(cfile))

        io_utils.delete_file(cfile)
        with self.assertRaises(Exception) as context:
            io_utils.check_file_readable(cfile)
        self.assertTrue('missing or not readable' in str(context.exception))

        io_utils.delete_folder(cfolder)
        io_utils.create_folder(cfolder)
        self.assertEqual(None, io_utils.check_folder_readable(cfolder))

        io_utils.delete_folder(cfolder)
        with self.assertRaises(Exception) as context:
            io_utils.check_folder_readable(cfolder)
        self.assertTrue('missing' in str(context.exception))

        io_utils.create_path('nofile.txt')
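This test fixes the contract of the two checkup helpers: each returns None on success and raises an exception whose message contains 'missing or not readable' (for files) or 'missing' (for folders). A minimal sketch of implementations consistent with that contract follows; the exact exception type and wording used by io_utils are assumptions:

import os

def check_file_readable(path):
    """Raise when 'path' is not an existing, readable file (sketch)"""
    # hypothetical reimplementation; the real io_utils may differ
    if not os.path.isfile(path) or not os.access(path, os.R_OK):
        raise IOError("File '{}' is missing or not readable".format(path))

def check_folder_readable(folder):
    """Raise when 'folder' is not an existing, readable directory (sketch)"""
    if not os.path.isdir(folder) or not os.access(folder, os.R_OK):
        raise IOError("Folder '{}' is missing or not readable".format(folder))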
Example #2
def stream(path, input_field='noisy', target_field='clean'):
    """Iterate through the data, one entry at a time"""

    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            yield entry[input_field], entry[target_field]
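Since stream() is a generator, callers can consume it lazily. A usage sketch, where the file name and its contents are hypothetical:

# one JSON object per line, e.g. {"noisy": "helo wrld", "clean": "hello world"}
for noisy, clean in stream('queries.jsonl'):
    print(noisy, '->', clean)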
Example #3
def stream(path):
    """Iterate through the data, one entry at a time"""

    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            line = line.strip()
            yield line
Example #4
def stream_field(path, field):
    """Iterate through the data, one entry at a time"""

    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            yield entry[field]
Example #5
    def setUp(self):
        """Set up local variables"""

        self.jsonl_file = os.path.join(os.path.dirname(__file__),
                                       'sample.jsonl')
        self.txt_file = io_utils.change_extension(self.jsonl_file, 'txt')
        self.copy_txt = io_utils.change_extension(self.jsonl_file, 'copy.txt')

        io_utils.check_file_readable(self.jsonl_file)
Example #6
    def load_from_file(
        self,
        path,
        candidate='suggestion',
        gold='clean',
    ):
        """Load data from jsonl file"""
        io_utils.check_file_readable(path)
        self.data = json_controller.stream(path, candidate, gold)
Example #7
    def setUp(self):
        """Set up local variables"""

        self.arpa = os.path.join(os.path.dirname(__file__), 'sample-model.arpa')
        self.bin = io_utils.change_extension(self.arpa, 'bin')
        self.tmp = io_utils.change_extension(self.arpa, 'tmp.bin')

        io_utils.check_file_readable(self.arpa)
        io_utils.check_file_readable(self.bin)
Example #8
    def __init__(self, path, header="@dd", order=3, unk='<unk>'):
        """Load language model from file"""

        io_utils.check_file_readable(path)

        self.logger = logging.getLogger(__name__)

        self.order = order
        self.model = RecordTrie(header)
        self.model.load(path)
        self.unk = unk
Example #9
    def test_checkups(self):
        self.assertEqual(None, io_utils.check_file_readable(self.empty_file))
        self.assertEqual(None, io_utils.check_folder_readable(self.cur_dir))

        with self.assertRaises(Exception) as context:
            io_utils.check_file_readable('file.txt')
        self.assertTrue('missing or not readable' in str(context.exception))

        with self.assertRaises(Exception) as context:
            io_utils.check_folder_readable('folder')
        self.assertTrue('missing' in str(context.exception))
Example #10
def stream_chunk(path, n=100):
    """Iterate through the data, one chunk at a time"""

    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        data = []
        for line in istream:
            line = line.strip()
            data.append(line)
            if len(data) == n:
                yield data
                data = []
        # do not drop the final, possibly smaller, chunk
        if data:
            yield data
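A hypothetical consumer then processes the file in fixed-size batches:

# 'corpus.txt' is a hypothetical input file
for chunk in stream_chunk('corpus.txt', n=1000):
    print('processing a batch of', len(chunk), 'lines')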
Example #11
def stream(path, chunksize=1, header=None, names=None, sep=','):
    """Iterate through the data, one chunk at a time"""

    io_utils.check_file_readable(path)
    return pd.read_csv(
        path,
        iterator=True,
        chunksize=chunksize,
        header=header,
        names=names,
        sep=sep,
        encoding='utf-8')
Example #12
def load(path):
    """Load entire data"""

    LOGGER.info("Load data from '{}' text file".format(path))
    io_utils.check_file_readable(path)

    data = []
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            data.append(line.strip())

    LOGGER.info("Loaded {} sentences".format(len(data)))
    return data
Example #13
    def __init__(self, path):
        """Build trie on ARPA n-grams"""

        io_utils.check_file_readable(path)
        self.logger = logging.getLogger(__name__)
        self.logger.info("Load ARPA model from {}".format(path))

        self.order = None
        self.total = {}
        self.trie = RecordTrie("@dd", self.load_ngram_tuples(path))

        self.logger.info(
            "Loaded a {}-gram LM with {} counts".format(self.order, self.total))
Example #14
def stream_chunk(path, n, input_field='noisy', target_field='clean'):
    """Iterate through the data, one chunk at a time"""

    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        input_seqs, output_seqs = [], []
        for line in istream:
            entry = json.loads(line)
            input_seqs.append(entry[input_field])
            output_seqs.append(entry[target_field])
            if len(input_seqs) == n:
                yield input_seqs, output_seqs
                input_seqs, output_seqs = [], []
        # do not drop the final, possibly smaller, chunk
        if input_seqs:
            yield input_seqs, output_seqs
Example #15
def load_field(path, field):
    """Load data for specific field"""

    LOGGER.info("Load data from '{}' json file".format(path))
    io_utils.check_file_readable(path)

    data = []
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            if field in entry:
                data.append(entry[field])

    LOGGER.info("Loaded {} entries".format(len(data)))
    return data
Example #16
    def __init__(self, hunspell_file, personal_file):
        """Read contents of both vocabularies"""

        self.logger = logging.getLogger(__name__)

        io_utils.check_file_readable(hunspell_file)
        io_utils.check_file_readable(personal_file)

        # load external hunspell dictionary
        self.hdict = text_controller.load(hunspell_file)

        # load personal dictionary
        self.pdict = text_controller.load(personal_file)

        # initialize the combined content
        self.mix_content = None
Example #17
    def test_decompress(self):
        io_utils.decompress(self.archive, self.tmp_file)
        self.assertEqual(None, io_utils.check_file_readable(self.tmp_file))

        with self.assertRaises(Exception) as context:
            io_utils.decompress(self.empty_file, self.tmp_file)
        self.assertTrue('not a bz2 archive' in str(context.exception))
Example #18
def stream_field(path, field, header=None, names=None, sep=','):
    """Iterate through the 'field' data, one chunk at a time"""

    io_utils.check_file_readable(path)

    with open(path, 'r', encoding='utf-8') as istream:
        for entry in pd.read_csv(
                istream,
                usecols=[field],
                header=header,
                iterator=True,
                chunksize=1,
                names=names,
                sep=sep,
                encoding='utf-8'):
            for _, value in entry[field].items():
                yield value
Example #19
def load_fields(path, fields):
    """Load data for specific fields"""

    LOGGER.info("Load data from '{}' json file".format(path))
    io_utils.check_file_readable(path)

    data = {field: [] for field in fields}
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            for field in fields:
                if field in entry:
                    data[field].append(entry[field])
    for field in fields:
        LOGGER.info("Loaded {} entries for field={}".format(
            len(data[field]), field))

    return data
Example #20
def load_configuration(path):
    """Load configuration from yaml file"""

    io_utils.check_file_readable(path)

    conf = {}
    with open(path, 'r') as stream:
        try:
            conf = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            raise CaughtException(
                "Exception encountered during YAML load: {}".format(exc))

    if not conf:
        raise ConfigError("Empty configuration in '{}'".format(path))
    if not isinstance(conf, dict):
        raise ConfigError("Not a dict object stored in '{}'".format(path))

    return conf
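A usage sketch; the file name and its keys are hypothetical:

# conf.yaml might contain, for instance:
#   model:
#     order: 3
#     path: sample-model.bin
conf = load_configuration('conf.yaml')
print(conf['model']['order'])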
Example #21
def load(path, header=None, names=None, sep=',', fields=None):
    """Load entire data"""

    if fields:
        LOGGER.info("Load {} columns from '{}' csv file".format(fields, path))
    else:
        LOGGER.info("Load data from '{}' csv file".format(path))

    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        data = pd.read_csv(
            istream,
            usecols=fields,
            header=header,
            names=names,
            sep=sep,
            encoding='utf-8')

    LOGGER.info("Loaded {} entries".format(len(data)))
    return data
Example #22
    def setUp(self):
        """Set up local variables"""

        self.extractor = WikiExtraction()
        self.sample = os.path.join(os.path.dirname(__file__), 'sample.bz2')
        self.data = os.path.join(os.path.dirname(__file__),
                                 'sample-corpus.txt')
        self.vocab = os.path.join(os.path.dirname(__file__),
                                  'sample-vocab.txt')

        io_utils.check_file_readable(self.sample)
        io_utils.check_file_readable(self.data)
        io_utils.check_file_readable(self.vocab)

        # temporary files
        self.files = {
            'dld': os.path.join(os.path.dirname(__file__), 'dld.xml'),
            'xml': io_utils.change_extension(self.sample, 'xml'),
            'jsonl': io_utils.change_extension(self.sample, 'jsonl'),
            'txt': io_utils.change_extension(self.sample, 'txt'),
            'wvoc': io_utils.change_extension(self.sample, 'wvoc.txt'),
            'wplot': io_utils.change_extension(self.sample, 'wvoc.png'),
            'cvoc': io_utils.change_extension(self.sample, 'cvoc.txt'),
            'cplot': io_utils.change_extension(self.sample, 'cvoc.png'),
        }
Example #23
    def __init__(self, path=None, counts=None, token='word'):
        """Load tokens from path or from a counts dictionary"""

        if token not in ('word', 'char'):
            raise ConfigError("Method expects a 'word' or a 'char' token")

        self.logger = logging.getLogger(__name__)
        self.token = token

        if path and isinstance(path, str):
            self.tokens = defaultdict(int)
            self.occurrences = 0

            io_utils.check_file_readable(path)
            with open(path, 'r', encoding='utf-8') as istream:
                for line in istream:
                    if token == 'word':
                        for word in line.split():
                            self.occurrences += 1
                            self.tokens[word] += 1
                    elif token == 'char':
                        for char in line.strip():
                            self.occurrences += 1
                            self.tokens[char] += 1

            self.logger.info("Read {:,} {}s with {:,} occurrences".format(
                len(self.tokens), self.token, self.occurrences))

        elif counts and isinstance(counts, dict):
            self.tokens = counts.copy()
            self.occurrences = sum(counts.values())

            self.logger.info("Loaded {:,} {}s with {:,} occurrences".format(
                len(self.tokens), self.token, self.occurrences))

        else:
            raise ConfigError('Method expects a file path or a dictionary')
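The listing omits the enclosing class name, so assuming it is called something like Vocabulary, the two construction paths would look like:

# 'Vocabulary' is a hypothetical name for the enclosing class
vocab_from_file = Vocabulary(path='sample-corpus.txt', token='word')
vocab_from_counts = Vocabulary(counts={'hello': 3, 'world': 1}, token='word')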
Example #24
    def setUp(self):
        """Set up local variables"""

        self.dic = os.path.join(os.path.dirname(__file__), 'index.dic')
        self.aff = os.path.join(os.path.dirname(__file__), 'index.aff')
        self.samples = os.path.join(os.path.dirname(__file__),
                                    'sample-queries.jsonl')

        io_utils.check_file_readable(self.aff)
        io_utils.check_file_readable(self.dic)
        io_utils.check_file_readable(self.samples)
Example #25
    def __init__(self, dic_file, aff_file, extra_dic=None):
        """
        Load the dictionary and affix files for spell checking.
        Allow adding an extra dictionary.
        """

        io_utils.check_file_readable(dic_file)
        io_utils.check_file_readable(aff_file)

        self.hunspell = HunSpell(dic_file, aff_file)
        if extra_dic:
            io_utils.check_file_readable(extra_dic)
            self.hunspell.add_dic(extra_dic)
Example #26
    def setUp(self):
        """Set up local variables"""

        self.csv_file = os.path.join(os.path.dirname(__file__), 'sample.csv')
        self.jsonl_file = io_utils.change_extension(self.csv_file, 'jsonl')
        self.ft_model = '/usr/share/ccquery/models/fastText/lid.176.bin'

        io_utils.check_file_readable(self.csv_file)
        io_utils.check_file_readable(self.jsonl_file)
        io_utils.check_file_readable(self.ft_model)

        self.copy_csv = io_utils.change_extension(self.csv_file, 'copy.csv')
        self.copy_jsonl = io_utils.change_extension(self.csv_file,
                                                    'copy.jsonl')
Example #27
    def setUp(self):
        """Set up local variables"""

        self.mfile = os.path.join(os.path.dirname(__file__),
                                  'sample-model.bin')
        sqfile = os.path.join(os.path.dirname(__file__),
                              'sample-sentences.txt')
        scfile = os.path.join(os.path.dirname(__file__), 'sample-scores.txt')

        io_utils.check_file_readable(self.mfile)
        io_utils.check_file_readable(sqfile)
        io_utils.check_file_readable(scfile)

        self.model = LanguageModel(self.mfile, order=3)
        self.data = read_data(sqfile)
        self.scores = read_data(scfile, to_float=True)
Example #28
    def setUp(self):
        """Set up local variables"""

        nlp = 'fr_core_news_sm'
        aff = os.path.join(os.path.dirname(__file__), 'index.aff')
        dic = os.path.join(os.path.dirname(__file__), 'index.dic')

        ngram = os.path.join(os.path.dirname(__file__), '..', 'ngram',
                             'sample-model.bin')

        io_utils.check_file_readable(aff)
        io_utils.check_file_readable(dic)
        io_utils.check_file_readable(ngram)

        # load baseline
        self.model = B1Correction()
        self.model.load_spacy(nlp, disable=['ner', 'parser'])
        self.model.load_hunspell(dic, aff)
        self.model.load_ngram(ngram)
Example #29
    def add_extra_dictionary(self, dic_file):
        """Add an extra dictionary to the current instance"""
        io_utils.check_file_readable(dic_file)
        self.hunspell.add_dic(dic_file)
Example #30
    def setUp(self):
        """Set up local variables"""
        self.txt_file = os.path.join(os.path.dirname(__file__), 'sample.txt')
        io_utils.check_file_readable(self.txt_file)