def test_create_delete(self):
    io_utils.create_path(self.empty_file)
    io_utils.create_folder(self.cur_dir)
    self.assertEqual(None, io_utils.check_file_readable(self.empty_file))
    self.assertEqual(None, io_utils.check_folder_readable(self.cur_dir))

    cfolder = 'test_folder'
    cfile = os.path.join(cfolder, 'test_file.txt')

    io_utils.create_path(cfile)
    with open(cfile, 'w') as ostream:
        ostream.write('')
    self.assertEqual(None, io_utils.check_file_readable(cfile))

    io_utils.delete_file(cfile)
    with self.assertRaises(Exception) as context:
        io_utils.check_file_readable(cfile)
    self.assertTrue('missing or not readable' in str(context.exception))

    io_utils.delete_folder(cfolder)
    io_utils.create_folder(cfolder)
    self.assertEqual(None, io_utils.check_folder_readable(cfolder))

    io_utils.delete_folder(cfolder)
    with self.assertRaises(Exception) as context:
        io_utils.check_folder_readable(cfolder)
    self.assertTrue('missing' in str(context.exception))

    io_utils.create_path('nofile.txt')
def stream(path, input_field='noisy', target_field='clean'):
    """Iterate through the data, one entry at a time"""
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            yield entry[input_field], entry[target_field]
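# Usage sketch (illustrative, not part of the original module): stream the
# parallel noisy/clean pairs from a hypothetical 'corrections.jsonl' file,
# where each line is a JSON object carrying the default 'noisy' and 'clean'
# fields.
def demo_stream_pairs(path='corrections.jsonl'):
    for noisy, clean in stream(path):
        print('{} -> {}'.format(noisy, clean))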
def stream(path):
    """Iterate through the data, one entry at a time"""
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            yield line.strip()
def stream_field(path, field):
    """Iterate through the data, one entry at a time"""
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            yield entry[field]
def setUp(self):
    """Set up local variables"""
    self.jsonl_file = os.path.join(os.path.dirname(__file__), 'sample.jsonl')
    self.txt_file = io_utils.change_extension(self.jsonl_file, 'txt')
    self.copy_txt = io_utils.change_extension(self.jsonl_file, 'copy.txt')
    io_utils.check_file_readable(self.jsonl_file)
def load_from_file(self, path, candidate='suggestion', gold='clean'):
    """Load data from jsonl file"""
    io_utils.check_file_readable(path)
    self.data = json_controller.stream(path, candidate, gold)
def setUp(self):
    """Set up local variables"""
    self.arpa = os.path.join(os.path.dirname(__file__), 'sample-model.arpa')
    self.bin = io_utils.change_extension(self.arpa, 'bin')
    self.tmp = io_utils.change_extension(self.arpa, 'tmp.bin')
    io_utils.check_file_readable(self.arpa)
    io_utils.check_file_readable(self.bin)
def __init__(self, path, header="@dd", order=3, unk='<unk>'):
    """Load language model from file"""
    io_utils.check_file_readable(path)
    self.logger = logging.getLogger(__name__)
    self.order = order
    self.model = RecordTrie(header)
    self.model.load(path)
    self.unk = unk
def test_checkups(self):
    self.assertEqual(None, io_utils.check_file_readable(self.empty_file))
    self.assertEqual(None, io_utils.check_folder_readable(self.cur_dir))

    with self.assertRaises(Exception) as context:
        io_utils.check_file_readable('file.txt')
    self.assertTrue('missing or not readable' in str(context.exception))

    with self.assertRaises(Exception) as context:
        io_utils.check_folder_readable('folder')
    self.assertTrue('missing' in str(context.exception))
def stream_chunk(path, n=100):
    """Iterate through the data, one chunk at a time"""
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        data = []
        for line in istream:
            data.append(line.strip())
            if len(data) == n:
                yield data
                data = []
        # flush the last, possibly incomplete, chunk
        # (the original dropped it silently)
        if data:
            yield data
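# Usage sketch (illustrative): process a plain-text corpus in fixed-size
# batches without loading the whole file into memory; 'corpus.txt' is a
# placeholder path.
def demo_count_batches(path='corpus.txt'):
    lines = 0
    for chunk in stream_chunk(path, n=100):
        lines += len(chunk)
    print('{} lines read'.format(lines))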
def stream(path, chunksize=1, header=None, names=None, sep=','):
    """Iterate through the data, one chunk at a time"""
    io_utils.check_file_readable(path)
    return pd.read_csv(
        path, iterator=True, chunksize=chunksize,
        header=header, names=names, sep=sep, encoding='utf-8')
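# Usage sketch (illustrative): the reader returned above yields pandas
# DataFrame chunks; 'queries.csv' is a placeholder for a CSV file with a
# header row.
def demo_first_chunk(path='queries.csv'):
    for chunk in stream(path, chunksize=5, header=0):
        print(chunk.shape)  # at most 5 rows per chunk
        break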
def load(path):
    """Load entire data"""
    LOGGER.info("Load data from '{}' text file".format(path))
    io_utils.check_file_readable(path)
    data = []
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            data.append(line.strip())
    LOGGER.info("Loaded {} sentences".format(len(data)))
    return data
def __init__(self, path):
    """Build trie on ARPA n-grams"""
    io_utils.check_file_readable(path)
    self.logger = logging.getLogger(__name__)
    self.logger.info("Load ARPA model from {}".format(path))
    self.order = None
    self.total = {}
    self.trie = RecordTrie("@dd", self.load_ngram_tuples(path))
    self.logger.info(
        "Loaded a {}-gram LM with {} counts".format(self.order, self.total))
def stream_chunk(path, n, input_field='noisy', target_field='clean'):
    """Iterate through the data, one chunk at a time"""
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        input_seqs, output_seqs = [], []
        for line in istream:
            entry = json.loads(line)
            input_seqs.append(entry[input_field])
            output_seqs.append(entry[target_field])
            if len(input_seqs) == n:
                yield input_seqs, output_seqs
                input_seqs, output_seqs = [], []
        # flush the last, possibly incomplete, chunk
        # (the original dropped it silently)
        if input_seqs:
            yield input_seqs, output_seqs
def load_field(path, field):
    """Load data for specific field"""
    LOGGER.info("Load data from '{}' json file".format(path))
    io_utils.check_file_readable(path)
    data = []
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            if field in entry:
                data.append(entry[field])
    LOGGER.info("Loaded {} entries".format(len(data)))
    return data
def __init__(self, hunspell_file, personal_file):
    """Read contents of both vocabularies"""
    self.logger = logging.getLogger(__name__)
    io_utils.check_file_readable(hunspell_file)
    io_utils.check_file_readable(personal_file)

    # load external hunspell dictionary
    self.hdict = text_controller.load(hunspell_file)

    # load personal dictionary
    self.pdict = text_controller.load(personal_file)

    # initialize the combined content
    self.mix_content = None
def test_decompress(self):
    io_utils.decompress(self.archive, self.tmp_file)
    self.assertEqual(None, io_utils.check_file_readable(self.tmp_file))

    with self.assertRaises(Exception) as context:
        io_utils.decompress(self.empty_file, self.tmp_file)
    self.assertTrue('not a bz2 archive' in str(context.exception))
def stream_field(path, field, header=None, names=None, sep=','):
    """Iterate through the 'field' data, one chunk at a time"""
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        for entry in pd.read_csv(
                istream, usecols=[field], header=header,
                iterator=True, chunksize=1,
                names=names, sep=sep, encoding='utf-8'):
            # Series.iteritems() was removed in pandas 2.0;
            # items() is the long-supported equivalent
            for _, value in entry[field].items():
                yield value
def load_fields(path, fields):
    """Load data for specific fields"""
    LOGGER.info("Load data from '{}' json file".format(path))
    io_utils.check_file_readable(path)
    data = {field: [] for field in fields}
    with open(path, 'r', encoding='utf-8') as istream:
        for line in istream:
            entry = json.loads(line)
            for field in fields:
                if field in entry:
                    data[field].append(entry[field])
    for field in fields:
        LOGGER.info("Loaded {} entries for field={}".format(
            len(data[field]), field))
    return data
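# Usage sketch (illustrative): read two fields at once from a hypothetical
# 'queries.jsonl' file; note that entries missing a field are skipped for
# that field only, so the returned lists are not guaranteed to be aligned.
def demo_load_fields(path='queries.jsonl'):
    data = load_fields(path, ['noisy', 'clean'])
    print(len(data['noisy']), len(data['clean']))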
def load_configuration(path):
    """Load configuration from yaml file"""
    io_utils.check_file_readable(path)
    conf = {}
    with open(path, 'r') as stream:
        try:
            # safe_load avoids executing arbitrary YAML tags and does not
            # require the explicit Loader argument mandatory in PyYAML >= 6
            conf = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            raise CaughtException(
                "Exception encountered during YAML load: {}".format(exc))
    if not conf:
        raise ConfigError("Empty configuration in '{}'".format(path))
    if not isinstance(conf, dict):
        raise ConfigError("Not a dict object stored in '{}'".format(path))
    return conf
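# Usage sketch (illustrative): load a hypothetical 'config.yaml' file and
# read one of its top-level sections; load_configuration() already rejects
# empty or non-dict documents, so the result can be used as a plain dict.
def demo_read_config(path='config.yaml'):
    conf = load_configuration(path)
    print(conf.get('model', {}))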
def load(path, header=None, names=None, sep=',', fields=None):
    """Load entire data"""
    if fields:
        LOGGER.info("Load {} columns from '{}' csv file".format(fields, path))
    else:
        LOGGER.info("Load data from '{}' csv file".format(path))
    io_utils.check_file_readable(path)
    with open(path, 'r', encoding='utf-8') as istream:
        data = pd.read_csv(
            istream, usecols=fields, header=header,
            names=names, sep=sep, encoding='utf-8')
    LOGGER.info("Loaded {} entries".format(len(data)))
    return data
def setUp(self):
    """Set up local variables"""
    self.extractor = WikiExtraction()
    self.sample = os.path.join(os.path.dirname(__file__), 'sample.bz2')
    self.data = os.path.join(os.path.dirname(__file__), 'sample-corpus.txt')
    self.vocab = os.path.join(os.path.dirname(__file__), 'sample-vocab.txt')
    io_utils.check_file_readable(self.sample)
    io_utils.check_file_readable(self.data)
    io_utils.check_file_readable(self.vocab)

    # temporary files
    self.files = {
        'dld': os.path.join(os.path.dirname(__file__), 'dld.xml'),
        'xml': io_utils.change_extension(self.sample, 'xml'),
        'jsonl': io_utils.change_extension(self.sample, 'jsonl'),
        'txt': io_utils.change_extension(self.sample, 'txt'),
        'wvoc': io_utils.change_extension(self.sample, 'wvoc.txt'),
        'wplot': io_utils.change_extension(self.sample, 'wvoc.png'),
        'cvoc': io_utils.change_extension(self.sample, 'cvoc.txt'),
        'cplot': io_utils.change_extension(self.sample, 'cvoc.png'),
    }
def __init__(self, path=None, counts=None, token='word'):
    """Load tokens from path or from a counts dictionary"""
    if token not in ('word', 'char'):
        raise ConfigError("Method expects a 'word' or a 'char' token")

    self.logger = logging.getLogger(__name__)
    self.token = token

    if path and isinstance(path, str):
        self.tokens = defaultdict(int)
        self.occurrences = 0
        io_utils.check_file_readable(path)
        with open(path, 'r', encoding='utf-8') as istream:
            for line in istream:
                if token == 'word':
                    for word in line.split():
                        self.occurrences += 1
                        self.tokens[word] += 1
                elif token == 'char':
                    for char in line.strip():
                        self.occurrences += 1
                        self.tokens[char] += 1
        self.logger.info("Read {:,} {}s with {:,} occurrences".format(
            len(self.tokens), self.token, self.occurrences))
    elif counts and isinstance(counts, dict):
        self.tokens = counts.copy()
        self.occurrences = sum(counts.values())
        self.logger.info("Loaded {:,} {}s with {:,} occurrences".format(
            len(self.tokens), self.token, self.occurrences))
    else:
        raise ConfigError('Method expects a file path or a dictionary')
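# Usage sketch (illustrative; the enclosing class is not shown above, so
# 'Vocabulary' is a hypothetical stand-in for its name): build word counts
# either from a placeholder 'corpus.txt' file or from an existing dictionary.
def demo_vocabulary():
    vocab = Vocabulary('corpus.txt', token='word')   # count tokens from file
    print(len(vocab.tokens), vocab.occurrences)
    vocab = Vocabulary(counts={'foo': 3, 'bar': 1})  # reuse known counts
    print(vocab.occurrences)  # 4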
def setUp(self):
    """Set up local variables"""
    self.dic = os.path.join(os.path.dirname(__file__), 'index.dic')
    self.aff = os.path.join(os.path.dirname(__file__), 'index.aff')
    self.samples = os.path.join(
        os.path.dirname(__file__), 'sample-queries.jsonl')
    io_utils.check_file_readable(self.aff)
    io_utils.check_file_readable(self.dic)
    io_utils.check_file_readable(self.samples)
def __init__(self, dic_file, aff_file, extra_dic=None):
    """
    Load the dictionary and affix files for spell checking.
    Allow adding an extra dictionary.
    """
    io_utils.check_file_readable(dic_file)
    io_utils.check_file_readable(aff_file)
    self.hunspell = HunSpell(dic_file, aff_file)
    if extra_dic:
        io_utils.check_file_readable(extra_dic)
        self.hunspell.add_dic(extra_dic)
def setUp(self):
    """Set up local variables"""
    self.csv_file = os.path.join(os.path.dirname(__file__), 'sample.csv')
    self.jsonl_file = io_utils.change_extension(self.csv_file, 'jsonl')
    self.ft_model = '/usr/share/ccquery/models/fastText/lid.176.bin'
    io_utils.check_file_readable(self.csv_file)
    io_utils.check_file_readable(self.jsonl_file)
    io_utils.check_file_readable(self.ft_model)
    self.copy_csv = io_utils.change_extension(self.csv_file, 'copy.csv')
    self.copy_jsonl = io_utils.change_extension(self.csv_file, 'copy.jsonl')
def setUp(self):
    """Set up local variables"""
    self.mfile = os.path.join(os.path.dirname(__file__), 'sample-model.bin')
    sqfile = os.path.join(os.path.dirname(__file__), 'sample-sentences.txt')
    scfile = os.path.join(os.path.dirname(__file__), 'sample-scores.txt')
    io_utils.check_file_readable(self.mfile)
    io_utils.check_file_readable(sqfile)
    io_utils.check_file_readable(scfile)
    self.model = LanguageModel(self.mfile, order=3)
    self.data = read_data(sqfile)
    self.scores = read_data(scfile, to_float=True)
def setUp(self):
    """Set up local variables"""
    nlp = 'fr_core_news_sm'
    aff = os.path.join(os.path.dirname(__file__), 'index.aff')
    dic = os.path.join(os.path.dirname(__file__), 'index.dic')
    ngram = os.path.join(
        os.path.dirname(__file__), '..', 'ngram', 'sample-model.bin')
    io_utils.check_file_readable(aff)
    io_utils.check_file_readable(dic)
    io_utils.check_file_readable(ngram)

    # load baseline
    self.model = B1Correction()
    self.model.load_spacy(nlp, disable=['ner', 'parser'])
    self.model.load_hunspell(dic, aff)
    self.model.load_ngram(ngram)
def add_extra_dictionary(self, dic_file):
    """Add an extra dictionary to the current instance"""
    io_utils.check_file_readable(dic_file)
    self.hunspell.add_dic(dic_file)
def setUp(self):
    """Set up local variables"""
    self.txt_file = os.path.join(os.path.dirname(__file__), 'sample.txt')
    io_utils.check_file_readable(self.txt_file)