Exemple #1
0
 def _read_blocks(self):
     """Generate a ``(name, contents)`` pair for every corpus entry.

     Each path in ``self._filenames`` is opened with
     ``alpinocorpus.CorpusReader`` and its entries are yielded in the
     order the reader returns them.

     Because this is a generator, nothing runs until the first item is
     requested; that includes the import and the encoding check below.

     Raises:
         ValueError: if ``self._encoding`` is anything other than
             ``None``, ``'utf8'`` or ``'utf-8'``.
     """
     # Imported here rather than at module level, as in the original.
     import alpinocorpus

     accepted_encodings = (None, 'utf8', 'utf-8')
     if self._encoding not in accepted_encodings:
         raise ValueError('Encoding specified in XML files, '
                          'cannot be overriden.')

     for corpus_path in self._filenames:
         reader = alpinocorpus.CorpusReader(corpus_path)
         for item in reader.entries():
             yield item.name(), item.contents()
#!/usr/bin/python

import alpinocorpus
import os.path
import sys

if __name__ == "__main__":
  # Command-line entry point: run the XQuery stored in "names.xq" (looked up
  # in the directory part of sys.argv[0]) against the corpus named by the
  # single command-line argument, and print the contents of each result.
  if len(sys.argv) != 2:
    # Wrong argument count: print a usage line and exit with status 1.
    print("%s: corpus" % sys.argv[0])
    sys.exit(1)

  scriptDir = os.path.dirname(sys.argv[0])
  scriptPath = os.path.join(scriptDir, "names.xq")
  # Read the query through a context manager so the file handle is closed
  # as soon as the text has been read instead of being left open.
  with open(scriptPath, 'r') as scriptFile:
    script = scriptFile.read()

  reader = alpinocorpus.CorpusReader(sys.argv[1])

  for entry in reader.xquery(script):
    print(entry.contents())

# Build the ``corpora`` mapping from "corpora.ini": one dict per section,
# holding only the "path", "shortdesc" and "desc" options.
# NOTE(review): ``ini`` is created outside this excerpt; presumably a
# ConfigParser-style object -- confirm.
ini.read("corpora.ini")
corpora = {}
for corpus in ini.sections():
    corpora[corpus] = {}
    for name, value in ini.items(corpus):
        if name == "path":
            # The path is stored exactly as read.
            corpora[corpus][name] = value
        elif name in ("shortdesc", "desc"):
            # Descriptions are decoded from UTF-8 before being stored.
            corpora[corpus][name] = value.decode("utf-8")
        # Any other option is ignored.

# Names of corpora for which creating or sizing the reader raised
# RuntimeError.
removes = []

for name, corpusData in corpora.iteritems():
    print("Opening %s" % corpusData['path'])
    try:
        c = alpinocorpus.CorpusReader(corpusData['path'])
        corpusData['entries'] = c.size()
        corpusData['reader'] = c
    except RuntimeError:
        # Record the corpus by name and move on to the next one.
        removes.append(name)
        continue
    # Best-effort size in bytes. A path ending in '/' is treated as a
    # directory and the sizes of its '*.dact' files are added up; any other
    # path is stat'ed directly.
    try:
        if corpusData['path'].endswith('/'):
            size = sum(
                os.stat(os.path.join(corpusData['path'], filename)).st_size
                for filename in os.listdir(corpusData['path'])
                if filename.endswith('.dact'))
        else:
            size = os.stat(corpusData['path']).st_size
    except OSError:
        # Only filesystem errors fall back to 0; a bare ``except`` would
        # also have swallowed KeyboardInterrupt and SystemExit.
        size = 0