def _read_blocks(self):
    """Yield a ``(name, contents)`` pair for every entry of every corpus.

    Each path in ``self._filenames`` is opened with
    ``alpinocorpus.CorpusReader`` and the entries returned by
    ``corpus.entries()`` are yielded one by one as
    ``(entry.name(), entry.contents())``.

    This is a generator, so the encoding check and the import of
    ``alpinocorpus`` only run once iteration starts.

    Raises:
        ValueError: if ``self._encoding`` is set to something other than
            ``None`` or a spelling of UTF-8.
    """
    import codecs

    requested = self._encoding
    if requested is not None:
        # Normalise through the codec registry so that 'UTF-8', 'utf8',
        # 'utf_8', ... are all recognised as the same encoding.
        try:
            is_utf8 = codecs.lookup(requested).name == 'utf-8'
        except (LookupError, TypeError, ValueError):
            # Unknown codec name or a non-string value: not UTF-8.
            is_utf8 = False
        if not is_utf8:
            raise ValueError('Encoding specified in XML files, '
                             'cannot be overriden.')

    # Imported lazily, after validation, so that a bad configuration is
    # reported without requiring the optional dependency.
    import alpinocorpus

    for filename in self._filenames:
        corpus = alpinocorpus.CorpusReader(filename)
        for entry in corpus.entries():
            yield entry.name(), entry.contents()
#!/usr/bin/python
# Run the XQuery program stored in "names.xq" (looked up next to this script)
# against the corpus given on the command line and print the contents of
# every entry the query returns.
#
# Usage: <script> corpus

import os.path
import sys

import alpinocorpus

if __name__ == "__main__":
    # Exactly one argument (the corpus path) is required.
    if len(sys.argv) != 2:
        print("%s: corpus" % sys.argv[0])
        sys.exit(1)

    # The query file lives in the same directory as the script itself.
    scriptDir = os.path.dirname(sys.argv[0])
    scriptPath = os.path.join(scriptDir, "names.xq")

    # Read the query with a context manager so the handle is always closed.
    with open(scriptPath, 'r') as scriptFile:
        script = scriptFile.read()

    reader = alpinocorpus.CorpusReader(sys.argv[1])
    for entry in reader.xquery(script):
        print(entry.contents())
# Load the corpus catalogue: every section of corpora.ini describes one corpus.
ini.read("corpora.ini")

# corpora maps section name -> settings dict. Only the "shortdesc", "desc"
# and "path" options are copied; any other option is ignored.
corpora = {}
for corpus in ini.sections():
    corpora[corpus] = {}
    for name, value in ini.items(corpus):
        if name in ("shortdesc", "desc"):
            # Descriptions are kept as text: decode byte strings as UTF-8 and
            # pass already-decoded text through unchanged.
            corpora[corpus][name] = (
                value.decode("utf-8") if isinstance(value, bytes) else value
            )
        elif name == "path":
            corpora[corpus][name] = value

# Names of the corpora that could not be opened.
# NOTE(review): presumably these are dropped from `corpora` by code that
# follows this section -- confirm.
removes = []
for name, corpusData in corpora.items():
    print("Opening %s" % (corpusData['path'],))

    # Open the corpus and remember its reader and entry count; a corpus that
    # raises RuntimeError here is queued in `removes` and skipped.
    try:
        c = alpinocorpus.CorpusReader(corpusData['path'])
        corpusData['entries'] = c.size()
        corpusData['reader'] = c
    except RuntimeError:
        removes.append(name)
        continue

    # Size in bytes: for a path ending in '/', the sum of the '.dact' files
    # directly inside that directory; otherwise the size of the path itself.
    # Filesystem errors fall back to 0. Only OSError is caught, so Ctrl-C and
    # SystemExit are no longer swallowed here.
    try:
        if corpusData['path'].endswith('/'):
            size = 0
            for filename in os.listdir(corpusData['path']):
                if filename.endswith('.dact'):
                    size += os.stat(
                        os.path.join(corpusData['path'], filename)).st_size
        else:
            size = os.stat(corpusData['path']).st_size
    except OSError:
        size = 0