コード例 #1
0
ファイル: exportgraf.py プロジェクト: FrankNagel/qlc
def main(argv):
    log = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    
    conf = appconfig('config:development.ini', relative_to='.')
    config = None
    if not pylons.test.pylonsapp:
        config = load_environment(conf.global_conf, conf.local_conf)
    
    # Create the tables if they don't already exist
    #metadata.create_all(bind=Session.bind)

    c = ContextObj() 
    py_obj = PylonsContext() 
    py_obj.tmpl_context = c
    pylons.tmpl_context._push_object(c)
    #corpushistory = model.meta.Session.query(model.Corpusversion).all()
    corpusversion = model.meta.Session.query(model.Corpusversion).order_by(model.Corpusversion.updated).first()
    c.iso_time = corpusversion.updated.strftime("%Y-%m-%d")
    c.version_number = "{0}.{1}".format(corpusversion.version, corpusversion.revision)

    
    # template_entries_seg
    mylookup = TemplateLookup(directories=config['pylons.paths']['templates'])
    template_header = open(os.path.join(config['pylons.paths']['templates'][0], 'base', 'graf-header.hdr')).read()    
    template_entries = open(os.path.join(config['pylons.paths']['templates'][0], 'base', 'graf-entries.txt')).read()    
    template_entries_seg = open(os.path.join(config['pylons.paths']['templates'][0], 'base', 'graf-entries.xml')).read()
    template_annotations = open(os.path.join(config['pylons.paths']['templates'][0], 'base', 'graf-annotations.xml')).read()
    #template_annotations_seg = open(os.path.join(config['pylons.paths']['templates'][0], 'base', 'graf-annotations-seg.xml')).read()        

    metadata_file = codecs.open(os.path.join(config['pylons.paths']['static_files'], 'downloads', "xml", "sources.csv"), "w", "utf-8")
    metadata_file.write("ID\tTYPE\tLANGUAGES\tIS_READY\tTITLE\tCOMPONENT\n")
         
    #http://www.cidles.eu/quanthistling/book/minor1987/hto/spa?format=xml
    for b in quanthistling.dictdata.books.list + quanthistling.dictdata.toolboxfiles.list:
        #if b['bibtex_key'] != "leach1969":
        #    continue

        c.book = model.meta.Session.query(model.Book).filter_by(bibtex_key=b['bibtex_key']).first()
        
        if c.book:

            # escape characters for XML
            c.bookinfo = escape(c.book.bookinfo())
            c.book_title = escape(c.book.title)
            c.book_author = escape(c.book.author)

            # collect book data
            languages = [ l.language_iso.langcode for dictdata in c.book.dictdata for l in dictdata.src_languages + dictdata.tgt_languages if l.language_iso]
            components = [ dictdata.component.name for dictdata in c.book.dictdata ]
            metadata_file.write(u"{0}\t{1}\t{2}\t{3}\t{4}\t{5}\n".format(c.book.bibtex_key, "dictionary", ",".join(languages), c.book.is_ready, c.book.bookinfo(), ",".join(components)))

            print "Exporting XML data for %s..." % b['bibtex_key']
            #temppath = tempfile.mkdtemp()
            temppath = os.path.join(config['pylons.paths']['static_files'], 'downloads', 'xml', b['bibtex_key'])
            if not os.path.exists(temppath):
                os.mkdir(temppath)
            else:
                files = glob.glob(os.path.join(temppath, "*"))
                for f in files:
                    os.remove(f)

            for c.dictdata in c.book.dictdata:
                
    
                c.url_for = url_for
                c.base_url = "http://www.quanthistling.info/data"
                #c.relative_url = url_for(controller='book', action='dictdata', bibtexkey=c.book.bibtex_key, startpage=c.dictdata.startpage, endpage=c.dictdata.endpage, format='html')

                #c.heading = c.book.bookinfo()
                c.basename = "dict-%s-%i-%i" % (b['bibtex_key'], c.dictdata.startpage, c.dictdata.endpage)

                print "  getting entries..."
                
                c.entries = model.meta.Session.query(model.Entry).filter(model.Entry.dictdata_id==c.dictdata.id).order_by("startpage", "pos_on_page").all()

                print "  getting annotations..."
                
                annotations = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==c.dictdata.id).order_by("startpage", "pos_on_page").all()
                c.annotations = collections.defaultdict(dict)
                for a in annotations:
                    if not c.annotations[a.entry_id]:
                        c.annotations[a.entry_id] = collections.defaultdict(list)
                    c.annotations[a.entry_id][(a.start, a.end)].append(a)

                print "  getting counts..."
                
                c.count_heads = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==c.dictdata.id).filter(model.Annotation.value==u"head").count()
                c.count_translations = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==c.dictdata.id).filter(model.Annotation.value==u"translation").count()
                c.count_pos = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==c.dictdata.id).filter(model.Annotation.value==u"pos").count()
                c.count_examples_src = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==c.dictdata.id).filter(model.Annotation.value==u"example-src").count()
                c.count_examples_tgt = model.meta.Session.query(model.Annotation).join(model.Entry, model.Annotation.entry_id==model.Entry.id).filter(model.Entry.dictdata_id==c.dictdata.id).filter(model.Annotation.value==u"example-tgt").count()
                c.count_manually_corrected = model.meta.Session.query(model.Entry).filter(model.Entry.dictdata_id==c.dictdata.id).filter(model.Entry.has_manual_annotations==True).count()

                #xml =  render('/derived/book/dictdata.xml')
                #xml = literal(template.render_unicode(c))

                print "  header..."

                # write header
                xml = Template(template_header, lookup=mylookup).render_unicode(c=c)
                oFile = open(os.path.join(temppath, "%s.hdr" % c.basename),'wb')
                oFile.write(xml.encode("utf-8"))
                oFile.close()

                print "  base data..."

                # write base data file
                xml = Template(template_entries, lookup=mylookup).render_unicode(c=c)
                oFile = open(os.path.join(temppath, "%s.txt" % c.basename),'wb')
                oFile.write(xml.encode("utf-8"))
                oFile.close()
    
                print "  entries..."

                # write entry file
                xml = Template(template_entries_seg, lookup=mylookup).render_unicode(c=c)
                oFile = open(os.path.join(temppath, "%s-entries.xml" % c.basename),'wb')
                oFile.write(xml.encode("utf-8"))
                oFile.close()

                print "  formatting annotations..."

                c.annotationtypes = [ "pagelayout", "formatting" ]
                c.annotationname = "formatting"
            
                xml = Template(template_annotations, lookup=mylookup).render_unicode(c=c)
                oFile = open(os.path.join(temppath, "%s-formatting.xml" % c.basename),'wb')
                oFile.write(xml.encode("utf-8"))
                oFile.close()          

                print "  dictinterpretation annotations..."
                c.annotationtypes = [ "dictinterpretation", "orthographicinterpretation", "errata" ]

                c.annotationname = "dictinterpretation"

                xml = Template(template_annotations, lookup=mylookup).render_unicode(c=c)
                oFile = open(os.path.join(temppath, "%s-dictinterpretation.xml" % c.basename),'wb')
                oFile.write(xml.encode("utf-8"))
                oFile.close()

            # create archive
            myzip = zipfile.ZipFile(os.path.join(config['pylons.paths']['static_files'], 'downloads', 'xml', '%s.zip' % b['bibtex_key']), 'w', zipfile.ZIP_DEFLATED)
            for file in glob.glob(os.path.join(temppath, "*.*")):
                myzip.write(file, os.path.basename(file))
            myzip.close()
            #shutil.rmtree(temppath)

    metadata_file.close()
    myzip = zipfile.ZipFile(os.path.join(config['pylons.paths']['static_files'], 'downloads', 'xml', 'data.zip'), 'w', zipfile.ZIP_DEFLATED)
    graf_dirs = [d for d in glob.glob(os.path.join(config['pylons.paths']['static_files'], 'downloads', 'xml', "*")) if os.path.isdir(d)]
    for d in graf_dirs:
        bibtex_key = d[d.rfind(os.sep)+1:]
        for f in glob.glob(os.path.join(d, "*.*")):
            myzip.write(f, os.path.join(bibtex_key, os.path.basename(f)))
    f = os.path.join(config['pylons.paths']['static_files'], 'downloads', 'xml', 'sources.csv')
    myzip.write(f, os.path.basename(f))
    myzip.close()


    pylons.tmpl_context._pop_object()