def make_filename(interrogation, savename):
    """
    Create a filename for a saved interrogation

    A ``savename`` containing ``/`` is treated as an explicit path and
    returned untouched. Otherwise the name is passed through ``makesafe``
    (keeping hyphens and the datatype), given a ``.p`` extension, and
    prefixed with the basename of the corpus recorded in
    ``interrogation.query['corpus']`` when one can be determined.

    :param interrogation: object being saved; its ``query`` attribute is
                          consulted only when it is a dict
    :param savename: requested name, with or without ``.p``
    :type savename: str
    :returns: ``<corpusname>-<savename>.p``, or ``<savename>.p`` when no
              corpus name is available
    """
    if '/' in savename:
        return savename

    # normalise so that exactly one '.p' ends up on the sanitised name
    if savename.endswith('.p'):
        savename = savename[:-2]
    savename = makesafe(savename, drop_datatype=False, hyphens_ok=True)
    if not savename.endswith('.p'):
        savename = savename + '.p'

    firstpart = ''
    if hasattr(interrogation, 'query') and isinstance(
            interrogation.query, dict):
        corpus = interrogation.query.get('corpus', False)
        if corpus:
            # one chain, so that the Datalist result is not overwritten by
            # the later ``name`` check
            if isinstance(corpus, STRINGTYPE):
                firstpart = corpus
            elif isinstance(corpus, Datalist):
                firstpart = Corpus(corpus).name
            elif hasattr(corpus, 'name'):
                firstpart = corpus.name

    # the corpus may be given as a path; only its last component is used
    firstpart = os.path.basename(firstpart)
    if firstpart:
        return firstpart + '-' + savename
    return savename
def __init__(self, data):
    """
    Build the dict and mirror every entry as an attribute

    :param data: a mapping, or a list that ``OrderedDict`` accepts
    """
    from corpkit.process import makesafe
    if isinstance(data, list):
        data = OrderedDict(data)
    # expose each entry under the name makesafe derives from its key
    for key, value in data.items():
        attr_name = makesafe(key)
        setattr(self, attr_name, value)
    self.query = None
    super(Interrodict, self).__init__(data)
def load_all_results(data_dir='saved_interrogations', **kwargs):
    """
    Load every saved interrogation in data_dir into a dict:

       >>> r = load_all_results()

    :param data_dir: path to saved data
    :type data_dir: str
    :param n: (keyword) if given, load only the first ``n`` files (for
              testing)
    :param root: (keyword) if given, its ``update()`` method is called
                 after each file
    :param note: (keyword) if given and more than three files are found,
                 ``note.progvar.set()`` receives the percentage done after
                 each file
    :returns: dict with filenames (minus the ``.p`` extension) as keys
    """
    import os
    from time import localtime, strftime
    from corpkit.other import load
    from corpkit.process import makesafe
    from corpkit.interrogation import Interrodict

    def _timestamp():
        """Return the current local time as HH:MM:SS"""
        return strftime("%H:%M:%S", localtime())

    root = kwargs.get('root', False)
    note = kwargs.get('note', False)
    limit = kwargs.get('n', False)

    datafiles = [f for f in os.listdir(data_dir)
                 if os.path.isfile(os.path.join(data_dir, f))
                 and f.endswith('.p')]

    # just load first n (for testing)
    if limit:
        datafiles = datafiles[:limit]

    output = {}
    loaded = 0
    total = len(datafiles)
    for index, f in enumerate(datafiles):
        # drop only the trailing extension; str.replace would also remove
        # any other '.p' occurring inside the name
        loadname = f[:-len('.p')]
        try:
            output[loadname] = load(f, loaddir=data_dir)
        except Exception:
            # best effort: report the file and carry on with the rest
            print('%s: %s failed to load. Try using load to find out the matter.'
                  % (_timestamp(), f))
        else:
            print('%s: %s loaded as %s.' % (_timestamp(), f, makesafe(loadname)))
            loaded += 1
        if note and total > 3:
            note.progvar.set((index + 1) * 100.0 / total)
        if root:
            root.update()

    print('%s: %d interrogations loaded from %s.'
          % (_timestamp(), loaded, os.path.basename(data_dir)))
    return Interrodict(output)
def folderise(folder):
    """
    Move each .txt file in folder into its own subfolder

    The subfolder is named by passing the file's basename (without
    extension) through ``makesafe``; it is created when it does not exist.

    :param folder: directory whose ``*.txt`` files are moved
    """
    import os
    import shutil
    from glob import glob
    from corpkit.process import makesafe

    for textfile in glob(os.path.join(folder, '*.txt')):
        stem = os.path.splitext(os.path.basename(textfile))[0]
        target_dir = os.path.join(folder, makesafe(stem))
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        shutil.move(textfile, target_dir)
def __getitem__(self, key):
    """
    Allow slicing, integer indexing and lookup by name

    :param key: a slice (returns a ``Corpora`` of the selected items), an
                int (position in ``self.data``), or an attribute name
    :returns: the selected item(s); ``None`` when a name is not found and
              is not a number
    """
    from corpkit.process import makesafe
    if isinstance(key, slice):
        # slice.indices() resolves start, stop and step against our length
        return Corpora([self[ii] for ii in range(*key.indices(len(self)))])
    if isinstance(key, int):
        # look the entry up again under the attribute name makesafe gives it
        return self.__getitem__(makesafe(self.data[key]))
    try:
        return self.__getattribute__(key)
    except Exception:
        # numeric names are retried with a 'c' prefix
        from corpkit.process import is_number
        if is_number(key):
            return self.__getattribute__('c' + key)
    return None
def __init__(self, data):
    """
    Store data and expose each item as an attribute

    :param data: collection of items; may be empty or ``None``
    """
    from corpkit.process import makesafe
    self.current = 0
    self.high = len(data) if data else 0
    self.data = data
    if not data or len(data) == 0:
        return
    # each item becomes reachable under the name makesafe derives for it
    for item in data:
        setattr(self, makesafe(item), item)
def __getitem__(self, key):
    """
    Allow slicing, integer indexing and lookup by name via self.subcorpora

    :param key: a slice (returns a ``Datalist`` of the selected items), an
                int (position in ``self.subcorpora``), or an attribute name
    :returns: the selected item(s); ``None`` when a name is not found and
              is not a number
    """
    from corpkit.process import makesafe
    if isinstance(key, slice):
        # slice.indices() resolves start, stop and step against the length
        bounds = key.indices(len(self.subcorpora))
        return Datalist([self[ii] for ii in range(*bounds)])
    if isinstance(key, int):
        return self.subcorpora.__getitem__(makesafe(self.subcorpora[key]))
    try:
        return self.subcorpora.__getattribute__(key)
    except Exception:
        # numeric names are retried with a 'c' prefix
        # NOTE(review): this retry reads from self, not self.subcorpora -
        # confirm that is intended
        from corpkit.process import is_number
        if is_number(key):
            return self.__getattribute__('c' + key)
    return None
def __init__(self, data):
    """
    Fill the dict from data and mirror every entry as an attribute

    :param data: mapping providing ``items()``
    """
    from corpkit.process import makesafe
    for name, value in data.items():
        attr_name = makesafe(name)
        setattr(self, attr_name, value)
    dict.__init__(self, data)
def __setitem__(self, key, value):
    """
    Store value under key and mirror it as an attribute

    The attribute name is whatever ``makesafe`` returns for ``key``.
    """
    from corpkit.process import makesafe
    attr_name = makesafe(key)
    setattr(self, attr_name, value)
    super(Interrodict, self).__setitem__(key, value)
# Initialise the object from ``path`` and work out what kind of data it holds.
#
# :param path: one of
#     - a list or Datalist: ``self.path`` becomes the absolute parent
#       directory of ``path[0].path`` and ``path`` is kept as ``self.data``
#     - a string: used as the path
#     - any object with a truthy ``path`` attribute
# :param level: (keyword, popped, default 'c') 'c' for corpus, 's' for
#     subcorpus, 'f' for file; the checks below may change it to 's' or 'f'
#     before it is stored as ``self.level``
# :param datatype: (keyword, popped, default None) stored as
#     ``self.datatype``; set to 'parse' for '.xml' files and for paths ending
#     in '-parsed', otherwise it may be filled in by ``determine_datatype``
# :param print_info: (keyword, default True) controls the printed messages
# :param load_saved: (keyword, default False) when true and a
#     'saved_interrogations' directory exists, files whose names start with
#     ``self.name`` are loaded with ``load`` and attached as attributes named
#     via ``makesafe``; 'features', 'wordclasses' and 'postags' are skipped
#
# Attributes assigned: data, datatype, path, name, singlefile, level.
#
# NOTE(review): when ``path`` matches none of the three accepted forms,
# ``self.path`` does not appear to be assigned before it is read - confirm.
# NOTE(review): the saved-file name is derived with
# ``filename.replace(self.name + '-', '')``, which removes every occurrence
# of that text rather than only the leading prefix - confirm this is intended.
# NOTE(review): this block's original indentation is not visible here, so the
# exact nesting of the file / '-parsed' / level checks should be confirmed
# against the real file before relying on this summary.
def __init__(self, path, **kwargs): import re import operator import glob import os from os.path import join, isfile, isdir, abspath, dirname, basename from corpkit.process import determine_datatype # levels are 'c' for corpus, 's' for subcorpus and 'f' for file. Which # one is determined automatically below, and processed accordingly. We # assume it is a full corpus to begin with. self.data = None level = kwargs.pop('level', 'c') self.datatype = kwargs.pop('datatype', None) print_info = kwargs.get('print_info', True) if isinstance(path, (list, Datalist)): self.path = abspath(dirname(path[0].path.rstrip('/'))) self.name = basename(self.path) self.data = path elif isinstance(path, STRINGTYPE): self.path = abspath(path) self.name = basename(path) elif hasattr(path, 'path') and path.path: self.path = abspath(path.path) self.name = basename(path.path) # this messy code figures out as quickly as possible what the datatype # and singlefile status of the path is. it's messy because it shortcuts # full checking where possible some of the shortcutting could maybe be # moved into the determine_datatype() funct. 
self.singlefile = False if os.path.isfile(self.path): if self.path.endswith('.xml'): self.datatype = 'parse' self.singlefile = True else: if not isdir(self.path): if isdir(join('data', path)): self.path = abspath(join('data', path)) if self.path.endswith('-parsed'): self.datatype = 'parse' if len([d for d in os.listdir(self.path) if isdir(join(self.path, d))]) > 0: self.singlefile = False if len([d for d in os.listdir(self.path) if isdir(join(self.path, d))]) == 0: level = 's' else: if level == 'c': if not self.datatype: self.datatype, self.singlefile = determine_datatype( self.path) if isdir(self.path): if len([d for d in os.listdir(self.path) if isdir(join(self.path, d))]) == 0: level = 's' # if initialised on a file, process as file if self.singlefile and level == 'c': level = 'f' self.level = level # load each interrogation as an attribute if kwargs.get('load_saved', False): from corpkit.other import load from corpkit.process import makesafe if os.path.isdir('saved_interrogations'): saved_files = glob.glob(r'saved_interrogations/*') for filepath in saved_files: filename = os.path.basename(filepath) if not filename.startswith(self.name): continue not_filename = filename.replace(self.name + '-', '') not_filename = os.path.splitext(not_filename)[0] if not_filename in ['features', 'wordclasses', 'postags']: continue variable_safe = makesafe(not_filename) try: setattr(self, variable_safe, load(filename)) if print_info: print( '\tLoaded %s as %s attribute.' % (filename, variable_safe)) except AttributeError: if print_info: print( '\tFailed to load %s as %s attribute. Name conflict?' % (filename, variable_safe)) if print_info: print('Corpus: %s' % self.path)