def __init__(self, path, **kwargs):
    """
    Resolve where the corpus lives and classify it as corpus, subcorpus or file.

    :param path: a file or directory path, or a list/Datalist whose first
                 item has a ``path`` attribute. In the second case the
                 corpus directory is the parent of that first item's path
                 and the sequence itself is kept in ``self.data``.
    :param level: (keyword, popped) starting level: ``'c'`` for corpus
                  (the default), ``'s'`` for subcorpus, ``'f'`` for file.
                  It is adjusted below according to what is on disk.
    :param datatype: (keyword, popped) datatype, if already known. When it
                     is falsy and a directory is opened at level ``'c'``,
                     ``determine_datatype()`` is asked instead.
    :param print_info: (keyword, left in ``kwargs``) print the resolved
                       path; defaults to True.

    Sets ``self.data``, ``self.path``, ``self.name``, ``self.datatype``,
    ``self.singlefile`` and ``self.level``.

    :raises IndexError: if ``path`` is an empty list/Datalist
    :raises OSError: from ``os.listdir`` when the path is not a file and
                     cannot be listed as a directory
    """
    import os
    from os.path import join, isdir, abspath, dirname, basename
    from process import determine_datatype
    from corpus import Datalist

    self.data = None
    level = kwargs.pop('level', 'c')
    self.datatype = kwargs.pop('datatype', None)
    print_info = kwargs.get('print_info', True)

    if isinstance(path, (list, Datalist)):
        self.path = abspath(dirname(path[0].path.rstrip('/')))
        self.data = path
    else:
        self.path = abspath(path)
    # Take the name from the normalised path: basename() of a raw path with
    # a trailing separator ('corpus/') is the empty string.
    self.name = basename(self.path)

    if print_info:
        print('\nCorpus at: %s' % self.path)

    self.singlefile = False
    if os.path.isfile(self.path):
        if self.path.endswith('.xml'):
            self.datatype = 'parse'
        self.singlefile = True
    else:
        if self.path.endswith('-parsed'):
            # the '-parsed' suffix is enough to know the datatype, so the
            # call to determine_datatype() is skipped
            self.datatype = 'parse'
        elif level == 'c' and not self.datatype:
            self.datatype, self.singlefile = determine_datatype(self.path)
        # a directory without any subdirectory is handled as a subcorpus;
        # any() stops at the first subdirectory found
        if not any(isdir(join(self.path, d)) for d in os.listdir(self.path)):
            level = 's'

    # if initialised on a file, process as file
    if self.singlefile and level == 'c':
        level = 'f'
    self.level = level
def __init__(self, path, **kwargs):
    """
    Resolve where the corpus lives and classify it as corpus, subcorpus or file.

    :param path: path to a single file or to a directory
    :param level: (keyword, popped) starting level: ``'c'`` for corpus
                  (the default), ``'s'`` for subcorpus, ``'f'`` for file.
                  It is adjusted below according to what is on disk.
    :param datatype: (keyword, popped) datatype, if already known. When it
                     is falsy and a directory is opened at level ``'c'``,
                     ``determine_datatype()`` is asked instead.
    :param print_info: (keyword, left in ``kwargs``) print the resolved
                       path; defaults to True.

    Sets ``self.path`` (absolute), ``self.name``, ``self.datatype``,
    ``self.singlefile`` and ``self.level``.

    :raises OSError: from ``os.listdir`` when the path is not a file and
                     cannot be listed as a directory
    """
    import os
    from os.path import join, isdir

    level = kwargs.pop("level", "c")
    self.datatype = kwargs.pop("datatype", None)
    print_info = kwargs.get("print_info", True)

    self.path = os.path.abspath(path)
    # Use the normalised path from here on: basename() and endswith() on a
    # raw path with a trailing separator ("corpus-parsed/") would give an
    # empty name and miss the "-parsed" suffix.
    self.name = os.path.basename(self.path)

    if print_info:
        print("\nCorpus at: %s\n" % self.path)

    self.singlefile = False
    if os.path.isfile(self.path):
        if self.path.endswith(".xml"):
            self.datatype = "parse"
        self.singlefile = True
    else:
        if self.path.endswith("-parsed"):
            # the suffix is enough to know the datatype
            self.datatype = "parse"
        elif level == "c" and not self.datatype:
            # imported here because this is the only branch that needs it
            from process import determine_datatype
            self.datatype, self.singlefile = determine_datatype(self.path)
        # a directory without any subdirectory is handled as a subcorpus;
        # any() stops at the first subdirectory found
        if not any(isdir(join(self.path, d)) for d in os.listdir(self.path)):
            level = "s"

    # if initialised on a file, process as file
    if self.singlefile and level == "c":
        level = "f"
    self.level = level
def __init__(self, path, **kwargs):
    """
    Locate a corpus on disk, work out its level and index its contents.

    :param path: path to a single file, to a directory of files, or to a
                 directory of subdirectories
    :param level: (keyword, popped) starting level: ``'c'`` for corpus
                  (the default), ``'s'`` for subcorpus, ``'f'`` for file.
                  It is adjusted below according to what is on disk.
    :param print_info: (keyword, left in ``kwargs``) print a summary of
                       what was found; defaults to True.

    Attributes set:

    * ``abspath`` / ``path`` / ``name`` -- absolute path, the same path
      relative to the current working directory, and its last component
    * ``datatype`` / ``singlefile`` -- ``'parse'`` for ``.xml`` files and
      ``*-parsed`` directories, otherwise whatever ``determine_datatype()``
      returns for a directory; ``datatype`` stays None for any other file
    * ``subcorpora`` / ``structure`` / ``files`` -- Datalists and the
      mapping built from them; None where they do not apply to the level
    * ``features`` -- initialised to False
    * one extra attribute per subcorpus and per file, named after a
      sanitised form of its name

    :raises OSError: from ``os.listdir`` when a directory cannot be listed
    """
    import os
    import re
    import operator
    from os.path import join, isdir

    level = kwargs.pop('level', 'c')
    print_info = kwargs.get('print_info', True)

    path = os.path.abspath(path)
    self.abspath = path
    self.path = os.path.relpath(path)
    self.name = os.path.basename(path)

    by_name = operator.attrgetter('name')

    # Default, so that a single file that is not XML still has the attribute
    # instead of raising AttributeError on first access.
    self.datatype = None
    self.singlefile = False
    if os.path.isfile(path):
        if path.endswith('.xml'):
            self.datatype = 'parse'
        self.singlefile = True
    else:
        if path.endswith('-parsed'):
            # the suffix is enough to know the datatype
            self.datatype = 'parse'
        else:
            # imported here because this is the only branch that needs it
            from process import determine_datatype
            self.datatype, self.singlefile = determine_datatype(path)
        # A directory without any subdirectory is handled as a subcorpus.
        # This now also applies to '-parsed' directories, which previously
        # stayed at level 'c' and ended up with no files indexed.
        if not any(isdir(join(path, d)) for d in os.listdir(path)):
            level = 's'

    self.structure = None
    self.subcorpora = None
    self.files = None

    # if initialised on a file, process as file
    if self.singlefile and level == 'c':
        level = 'f'

    all_files = []

    # For corpora, make Datalist of subcorpora, make structure dict, make a
    # Datalist of files, and print useful information
    if level == 'c':
        if print_info:
            print('\nCorpus at: %s\n' % self.abspath)
        subdirs = [join(self.path, d) for d in os.listdir(self.path)
                   if isdir(join(self.path, d))]
        self.subcorpora = Datalist(sorted([Subcorpus(d) for d in subdirs],
                                          key=by_name))
        struct = {}
        for sbc in self.subcorpora:
            file_list = Datalist(sorted(
                [File(f, sbc.path) for f in os.listdir(sbc.path)
                 if not f.startswith('.')],
                key=by_name))
            struct[sbc] = file_list
            if print_info:
                print('Subcorpus: %s\n\t%s\n' % (
                    sbc.name,
                    '\n\t'.join([f.name for f in file_list[:10]])))
                if len(file_list) > 10:
                    print('... and %s more ... \n' % str(len(file_list) - 10))
            all_files.extend(file_list)
        self.structure = struct

    # for subcorpora, we only need the filelist and a simple structure dict
    elif level == 's':
        all_files = sorted([File(f, self.path) for f in os.listdir(self.path)
                            if not f.startswith('.')],
                           key=by_name)
        self.files = Datalist(all_files)
        self.structure = {'.': self.files}
        if print_info:
            print('\nCorpus created with %d files:\n\t%s\n' % (
                len(self.files),
                '\n\t'.join([i.name for i in self.files][:10])))
            if len(self.files) > 10:
                print('... and %s more ... \n' % str(len(self.files) - 10))

    # for non File, we will add files attribute
    if level != 'f':
        self.files = Datalist(all_files)

    # this is the future home of the output of .get_stats()
    self.features = False

    # Set accessible attribute names for subcorpora and files. The pattern
    # is a raw string so that \W reaches the regex engine without relying on
    # Python passing an unknown string escape through.
    variable_safe_r = re.compile(r'[\W0-9_]+', re.UNICODE)
    if self.subcorpora:
        for subcorpus in self.subcorpora:
            # NOTE(review): subcorpus names are cut at ',' while file names
            # below are cut at '.'; kept as found -- confirm it is intended.
            variable_safe = variable_safe_r.sub(
                '', subcorpus.name.lower().split(',')[0])
            setattr(self, variable_safe, subcorpus)
    if self.files:
        for f in self.files:
            variable_safe = variable_safe_r.sub(
                '', f.name.lower().split('.')[0])
            setattr(self, variable_safe, f)
def __init__(self, path, **kwargs):
    """
    Work out the corpus location and whether it is a corpus, subcorpus or file.

    :param path: a file or directory path, or a list/Datalist whose first
                 item has a ``path`` attribute. In the second case the
                 corpus directory is the parent of that first item's path
                 and the sequence itself is kept in ``self.data``.
    :param level: (keyword, popped) starting level: ``'c'`` for corpus
                  (the default), ``'s'`` for subcorpus, ``'f'`` for file.
                  It is adjusted below according to what is on disk.
    :param datatype: (keyword, popped) datatype, if already known. When it
                     is falsy and a directory is opened at level ``'c'``,
                     ``determine_datatype()`` is asked instead.
    :param print_info: (keyword, left in ``kwargs``) print the resolved
                       path; defaults to True.

    Sets ``self.data``, ``self.path``, ``self.name``, ``self.datatype``,
    ``self.singlefile`` and ``self.level``.

    :raises IndexError: if ``path`` is an empty list/Datalist
    :raises OSError: from ``os.listdir`` when the path is not a file and
                     cannot be listed as a directory
    """
    import os
    from os.path import join, isdir, abspath, dirname, basename
    from process import determine_datatype
    from corpus import Datalist

    self.data = None
    level = kwargs.pop('level', 'c')
    self.datatype = kwargs.pop('datatype', None)
    print_info = kwargs.get('print_info', True)

    given_members = isinstance(path, (list, Datalist))
    if given_members:
        self.data = path
        location = dirname(path[0].path.rstrip('/'))
    else:
        location = path
    self.path = abspath(location)
    # The name comes from the normalised path: basename() of a raw path that
    # ends in a separator ('corpus/') would be the empty string.
    self.name = basename(self.path)

    if print_info:
        print('\nCorpus at: %s' % self.path)

    is_file = os.path.isfile(self.path)
    self.singlefile = is_file
    if is_file:
        if self.path.endswith('.xml'):
            self.datatype = 'parse'
    else:
        if self.path.endswith('-parsed'):
            # the '-parsed' suffix is enough to know the datatype, so the
            # call to determine_datatype() is skipped
            self.datatype = 'parse'
        elif level == 'c' and not self.datatype:
            self.datatype, self.singlefile = determine_datatype(self.path)
        # one directory scan, stopping at the first subdirectory; a
        # directory with none is handled as a subcorpus
        has_subdirs = any(isdir(join(self.path, entry))
                          for entry in os.listdir(self.path))
        if not has_subdirs:
            level = 's'

    # if initialised on a file, process as file
    if self.singlefile and level == 'c':
        level = 'f'
    self.level = level