Example 1
def run_cms(jfile, procfilename, cmdline):
    processor = postextraction.SimpleConstructionMatchingSystem(
        cmdline.extdir, None, None, cmdline.verbose)
    doc = processor.post_process(mnjson.loadfile(jfile),
                                 None,
                                 cmdline.matchfield,
                                 cmdline.posfield,
                                 cmdline.reportfield,
                                 cmdline.forcetagger)
    mnjson.writefile(procfilename, doc)
Example 2
def writefile(self):
    '''
    Write out the JSON file and reset the in-memory data: chunking keeps
    the individual file sizes manageable.
    '''
    if self.chunkno:
        outfname = '%s/%s_%03d.%s' % (self.outdir, self.infbase,
                                      self.chunkno, self.ext)
    else:
        outfname = '%s/%s.%s' % (self.outdir, self.infbase, self.ext)
    mnjson.writefile(outfname, self.jdata)
    self.jdata = mnjson.getJSONRoot(lang='fa', docs=[], sents=[])
    self.sentidx = -1
    self.content = None
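The docstring above describes chunked output; below is a standalone sketch of the same chunk-numbered naming scheme. It is a hypothetical helper, not part of the original code, and it uses the plain json module only because the examples never show mnjson's import or internals.

import json

def write_chunk(outdir, infbase, chunkno, ext, jdata):
    # mirrors writefile() above: chunk 0 gets 'base.ext', later chunks 'base_001.ext', ...
    if chunkno:
        outfname = '%s/%s_%03d.%s' % (outdir, infbase, chunkno, ext)
    else:
        outfname = '%s/%s.%s' % (outdir, infbase, ext)
    with open(outfname, 'w') as outf:
        json.dump(jdata, outf)
    return outfname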
Example 3
def main():
    cmdline = process_command_line()
    # read and process sentences in SPL file
    insplf = codecs.open(cmdline.infile.decode('utf-8').encode('utf-8'),
                         "r",
                         encoding="utf-8")

    if not cmdline.prov:
        cmdline.prov = cmdline.name
    if not cmdline.corpus:
        cmdline.corpus = cmdline.name

    # call converter to json object
    json_obj = mnjson.convert_spl_to_json(insplf, cmdline.lang, cmdline.name,
                                          cmdline.corpus, cmdline.desc,
                                          cmdline.prov, cmdline.type,
                                          cmdline.comments)

    # add message if set
    if cmdline.message:
        for sent in json_obj[u'sentences']:
            sent[u'comment'] = cmdline.message.decode('utf-8').encode('utf-8')

    # serialize the json object to file
    if cmdline.outputfile:
        fname = cmdline.outputfile.decode('utf-8').encode('utf-8')
    else:
        (filebase, filext) = os.path.splitext(
            cmdline.infile.decode('utf-8').encode('utf-8'))
        fname = filebase + u'.json'

    # write out the file
    mnjson.writefile(fname, json_obj)

    # validate and return status
    if mnjson.validate(json_obj, cmdline.schema) is None:
        return 0
    else:
        print >> sys.stderr, "Error:", fname, "failed validation."
        return 1
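The return-status logic above relies on mnjson.validate() returning None when the document passes schema validation. A minimal helper built on that convention follows; the import line and helper name are assumptions, since the examples omit their import statements.

import mnjson  # assumed import; the examples above do not show their import lines

def is_valid_mnjson(json_obj, schemafile):
    # per the convention above, validate() returns None when the document is valid
    return mnjson.validate(json_obj, schemafile) is None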
Example 4
def main():
    """
    Runs subcorpus generation.
    """
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    
    # add some custom cmdline parameters
    aparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Script for generating a subcorpus from a set of JSON files "\
        "usually from a large corpus.  It uses the CMS's target and source word "\
        "matching system.")
    aparser.add_argument("filelistfile",help="File containing list of corpus files "\
                         "in JSON format.")
    aparser.add_argument("outputfile",help="Name of resulting subcorpus file")
    aparser.add_argument("-l","--lang",help="Language",required=True)
    aparser.add_argument("--doc",help="Unique name to give to the subcorpus")
    aparser.add_argument("--corpus",help="Unique name of corpus")
    aparser.add_argument("--desc",help="Description of subcorpus")
    aparser.add_argument("-e", "--engine", help="Querying engine (CMS).  Options are"\
                         " (rdflib, redland, sesame)."\
                         " Use of sesame must be suffixed with the server ip or dns "\
                         " demarked with a colon, e,g, seseame:servername.",
                         default='sesame:localhost')
    aparser.add_argument("--force-pos-tag", dest="forcepos", action="store_true",
                         help="Force POS tagging in CMS. Overwrite existing tags")
    aparser.add_argument("--force-cache", dest="forcecache", action="store_true",
                         help="Force cache regeneration (CMS)")
    aparser.add_argument("--lem", help="Override default lemma field name ('lem')",
                         default="lem")
    aparser.add_argument("--pos", help="Override default POS field name ('pos')",
                         default="pos")
    aparser.add_argument("--trans-en",dest="translateEn",action="store_true",
                         help="For non-English languages, this option allows frame and "\
                         "frame families names to be given in English.  Translation is "\
                         "accomplished via Interwiki links.")
    aparser.add_argument("--config", dest="configfname",
                         help="Configuration filename to override the default "\
                         " which can also be overridden by environment variable"\
                         " %s (default=%s)" % (CONFIG_ENV,DEFAULT_CONFIGFNAME))
    aparser.add_argument("--mode", dest="configmode",
                         help="Used to activate a mode defined in the config file.")
    aparser.add_argument("--cache-dir", dest="cachedir", default=None,
                         help="To override default cache directories")
    aparser.add_argument("--use-se", dest="useSE", default='localhost',
                         help="Use Sparql endpoint at the specified server address.")


    cmdline = aparser.parse_args()
    
    # for logging / error messages
    msgformat = '%(asctime)-15s - %(message)s'
    dateformat = '%Y-%m-%dT%H:%M:%SZ'
    logging.basicConfig(format=msgformat, datefmt=dateformat,
                        level=logging.INFO)

    # parse the config file
    cfname = None
    if cmdline.configfname:
        cfname = cmdline.configfname
    else:
        cfname = os.environ.get(CONFIG_ENV)
        if not cfname:
            cfname = DEFAULT_CONFIGFNAME
    config = MetaNetConfigParser(cfname,"makesubcorpus",cmdline.configmode)

    startproctime = datetime.now(tzlocal()).strftime("%Y-%m-%d %H:%M:%S %z")

    docheader = mnjson.getJSONDocumentHeader(name=cmdline.doc,
                                             corp=cmdline.corpus,
                                             desc=cmdline.desc,
                                             prov=cmdline.doc,
                                             type='subcorpus',
                                             lang=cmdline.lang,
                                             pubdate=startproctime)
    jdata = mnjson.getJSONRoot(lang=cmdline.lang, docs=[docheader])
    
    # start up metanetrep, cnmapper, cms
    jdata[u'start_processing_time'] = startproctime
    lang = cmdline.lang
    
    paramrec = {}
    
    tfamlist = config.getList('targetfamilies', lang)
    tsnamelist = config.getList('targetframes', lang)
    tconlist = config.getList('targetconcepts', lang)
    tcongrouplist = config.getList('targetconceptgroups', lang)
    sfamlist = config.getList('sourcefamilies', lang)
    ssnamelist = config.getList('sourceframes', lang)
    sconlist = config.getList('sourceconcepts', lang)
    
    filterparams = {}
    filterparams[u'targetfamilies'] = tfamlist
    filterparams[u'targetframes'] = tsnamelist
    filterparams[u'targetconcepts'] = tconlist
    filterparams[u'targetconceptgroups'] = tcongrouplist
    filterparams[u'sourcefamilies'] = sfamlist
    filterparams[u'sourceframes'] = ssnamelist
    filterparams[u'sourceconcepts'] = sconlist
    paramrec[u'filterparams'] = filterparams

    
    # configure and initialize Conceptual Network Mapper
    tconranking = config.getListFromComp('cnms','targetconceptranking', lang)
    secondaryminscore = config.getFloatFromComp('cnms','secondaryminscore', lang)
    mappinglimit = config.getIntFromComp('cnms','sourcelimit', lang)
    conceptmode = config.getValue('casemode',default='general')
    expansionTypes = config.getListFromComp('cnms','expansiontypes',lang=lang)
    expansionScoreScale = config.getFloatFromComp('cnms','expansionscorescale',lang=lang,
                                                  default=1.0)
    disableclosestframe = config.getFlagFromComp('cnms','disableclosestframe',lang=lang)
    fndatadir = config.getValue('framenetdatadir',lang=lang)
    wikdatadir = config.getValue('wiktionarydatadir',lang=lang)
    pwfdatadir = config.getValue('persianwordformsdatadir')
    mrdatadir = config.getValue('mrdatadir',lang=lang)
    
    cnmsparams = {}
    cnmsparams[u'targetconceptranking'] = tconranking
    cnmsparams[u'secondaryminscore'] = secondaryminscore
    cnmsparams[u'sourcelimit'] = mappinglimit
    cnmsparams[u'expansiontypes'] = expansionTypes
    cnmsparams[u'expansionscorescale'] = expansionScoreScale
    cnmsparams[u'disableclosestframe'] = disableclosestframe
    paramrec[u'cnms'] = cnmsparams
    paramrec[u'casemode'] = conceptmode
    paramrec[u'framenetdatadir'] = fndatadir
    paramrec[u'wiktionarydatadir'] = wikdatadir
    paramrec[u'persianwordformsdatadir'] = pwfdatadir
    paramrec[u'mrdatadir'] = mrdatadir
    
    fndata = None
    wikdata = None
    pwforms = None
    if lang=='en':
        if ('fn' in expansionTypes) or (not disableclosestframe):
            if not fndatadir:
                logging.error('FN expansion requires "framenetdatadir" parameter')
            else:
                fndata = FrameNet(cachedir=fndatadir)
        if ('wik' in expansionTypes):
            if not wikdatadir:
                logging.error('Wiktionary expansion requires "wiktionarydatadir" parameter')
            else:
                wikdata = Wiktionary(dbdir=wikdatadir)
    if lang=='fa':
        if not pwfdatadir:
            logging.warn('Persian extraction/mapping not using precomputed word forms.'\
                         ' Set "persianwordformsdatadir" to enable.')
        pwforms = PersianWordForms(pwfdir=pwfdatadir)

    # configure and initialize MetaNet Repository
    metanetrep = MetaNetRepository(lang, useSE=cmdline.useSE,
                                   mrbasedir=mrdatadir,
                                   fndata=fndata,wikdata=wikdata,pwforms=pwforms)
    metanetrep.initLookups()
    
    cnmapper = ConceptualNetworkMapper(lang, cmdline.cachedir,
                                       targetConceptRank=tconranking,
                                       disableFN=disableclosestframe,
                                       expansionTypes=expansionTypes,
                                       expansionScoreScale=expansionScoreScale,
                                       sourceMappingLimit=mappinglimit, 
                                       minSecondaryScore=secondaryminscore,
                                       metanetrep=metanetrep,
                                       conceptMode=conceptmode)
    
    excludedfamilies = config.getListFromComp('cms','excludedfamilies', lang)
    metarcfname = config.getValueFromComp('cms','metarcfile',lang)    

    cms = ConstructionMatchingSystem(lang, posfield=cmdline.pos, lemfield=cmdline.lem,
                                     useSE=cmdline.useSE,
                                     forcecache=cmdline.forcecache,
                                     engine=cmdline.engine, nodepcheck=True,
                                     excludedFamilies=excludedfamilies,
                                     metanetrep=metanetrep,cnmapper=cnmapper,
                                     metarcfname=metarcfname)
    
    maxsentlength = config.getValueFromComp('cms','maxsentlength', lang)
    if maxsentlength:
        cms.setMaxSentenceLength(int(maxsentlength))
    disablewcache = config.getValueFromComp('cms','disablewcaching', lang)
    if disablewcache and (disablewcache.lower() == 'true'):
        cms.setSearchWordCaching(False)
    cxnpatternfname = config.getValueFromComp('cms','cxnpatternfile',lang)
    cmsparams = {}
    cmsparams[u'excludedfamilies'] = excludedfamilies
    cmsparams[u'metarcfile'] = metarcfname
    cmsparams[u'maxsentlength'] = maxsentlength
    cmsparams[u'disablewcaching'] = disablewcache
    cmsparams[u'cxnpatternfile'] = cxnpatternfname
    paramrec[u'cms'] = cmsparams
    
    cms.prepSearchWords(tfamlist,tsnamelist,tconlist,tcongrouplist,
                        sfamlist,ssnamelist,sconlist,
                        translateEn=cmdline.translateEn)
    
    sentence_counter = 0
    subcorpus = []
    with codecs.open(cmdline.filelistfile, encoding='utf-8') as flist:
        for line in flist:
            infilename = line.strip()
            if not infilename:
                continue
            in_jdata = mnjson.loadfile(infilename)
            sc_sents = cms.getSubcorpus(in_jdata[u'sentences'],
                                        forcePOScomp=cmdline.forcepos)
            for sent in sc_sents:
                sent[u'cfile'] = infilename
                sent[u'cid'] = sent[u'id']
                sent[u'cidx'] = sent[u'idx']
                sent[u'id'] = u'%s:%s' % (cmdline.doc, sentence_counter + 1)
                sent[u'idx'] = sentence_counter
                sentence_counter += 1
                subcorpus.append(sent)
            logging.info("added %d sentences from %s",len(sc_sents),infilename)
                
    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    jdata[u'parameters'] = paramrec
    jdata[u'end_processing_time'] = datetime.now(tzlocal()).strftime("%Y-%m-%d %H:%M:%S %z")
    jdata[u'sentences'] = subcorpus
    logging.info(u'writing %s with %d sentences',cmdline.outputfile, len(subcorpus))
    mnjson.writefile(cmdline.outputfile, jdata)
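The --engine help above documents a 'name:server' value such as sesame:localhost. A hypothetical helper, not part of the original code, that splits such a value could look like this:

def split_engine_option(engine):
    # e.g. 'sesame:localhost' -> ('sesame', 'localhost'); 'rdflib' -> ('rdflib', None)
    if ':' in engine:
        name, server = engine.split(':', 1)
        return name, server
    return engine, None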
Example 5
def main():
    """
    Runs LM to concept mapping.
    """
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Convert gold standard annotation to JSON format.")
    parser.add_argument('goldfile',
                        help="Excel file containing gold standard annotation")
    parser.add_argument('-l','--lang',required=True,
                        help="Language being processed")
    parser.add_argument('-v','--verbose',action='store_true',
                        help='print verbose messages')
    cmdline = parser.parse_args()
    
    msgformat = '%(asctime)-15s - %(message)s'
    dateformat = '%Y-%m-%dT%H:%M:%SZ'
    if cmdline.verbose:
        deflevel = logging.INFO
    else:
        deflevel = logging.WARN
    logging.basicConfig(format=msgformat, datefmt=dateformat, level=deflevel)
    
    # load the input excel file
    wb = load_workbook(cmdline.goldfile)
    
    sentences = []
    idx = 0
    sentbycon = {}
    for tconcept in wb.get_sheet_names():
        logging.info('processing tab %s',tconcept)
        ws = wb.get_sheet_by_name(tconcept)
        sentbycon[tconcept] = {}
        for row in ws.rows[1:]:
            (id,text,tlemma,status,sform,sschema,sconcept) = (cell.value for cell in row[0:7])
            if (not text) or (not status) or status.lower()=='p':
                continue
            sent = mnjson.getJSONSentence(id,idx,text)
            if status.lower()=='n':
                sentences.append(sent)
                idx += 1
                continue
            if not sform:
                # then need to skip, because it's marked as Y but no answer was given
                continue
            if sconcept:
                sconcept = sconcept.replace(u' ',u'_').upper()
            #create lm
            sstart = text.lower().find(sform.lower())
            send = sstart + len(sform)
            source = {'form':sform,
                      'concept':sconcept,
                      'start':sstart,
                      'end':send}
            if sschema:
                source['schemanames'] = [sname.strip() for sname in sschema.split(u',')]
            target = {'lemma':tlemma,
                      'concept':tconcept}
            lm = {u'name':u'%s %s'%(target['lemma'],source['form']),
                  u'target':target,
                  u'source':source,
                  u'extractor':'Gold'}
            sent['lms'] = [lm]
            sentences.append(sent)
            idx += 1
            if sconcept:
                if sconcept in sentbycon[tconcept]:
                    sentbycon[tconcept][sconcept].append(sent)
                else:
                    sentbycon[tconcept][sconcept] = [sent]
    logging.info("Running POS tagger...")
    computePOS(cmdline.lang, sentences)
    # attempt to fill in missing form / lemma fields
    logging.info("Filling in missing fields in LMs...")
    for sent in sentences:
        if 'lms' in sent:
            if 'word' not in sent:
                continue
            for lm in sent['lms']:
                # do target
                target = lm['target']
                mwetarg = None
                if u' ' in target['lemma']:
                    # it's a multiword
                    mwetarg = target['lemma'].split(u' ')
                for widx in range(len(sent['word'])):
                    w = sent['word'][widx]
                    if 'form' in target:
                        break
                    if mwetarg:
                        if mwetarg[0].lower()==w['lem'].lower():
                            tfparts = []
                            for midx in range(len(mwetarg)):
                                if mwetarg[midx].lower()==sent['word'][widx+midx]['lem'].lower():
                                    tfparts.append(sent['word'][widx+midx]['form'])
                                else:
                                    tfparts = []
                                    break
                            if tfparts:
                                target['form'] = u' '.join(tfparts)
                                target['start'] = w['start']
                                target['end'] = w['start'] + len(target['form'])
                                break
                                
                    else:
                        if target['lemma'].lower()==w['lem'].lower():
                            target['form'] = w['form']
                            target['start'] = w['start']
                            target['end'] = w['end']
                            break
                if 'form' not in target:
                    # backup, search for matching wforms
                    for widx in range(len(sent['word'])):
                        w = sent['word'][widx]
                        if 'form' in target:
                            break
                        if mwetarg:
                            if mwetarg[0].lower()==w['form'].lower():
                                tfparts = []
                                for midx in range(len(mwetarg)):
                                    if mwetarg[midx].lower()==sent['word'][widx+midx]['form'].lower():
                                        tfparts.append(sent['word'][widx+midx]['form'])
                                    else:
                                        tfparts = []
                                        break
                                if tfparts:
                                    target['form'] = u' '.join(tfparts)
                                    target['start'] = w['start']
                                    target['end'] = w['start'] + len(target['form'])
                                    break
                                    
                        else:
                            if target['lemma'].lower()==w['form'].lower():
                                target['form'] = w['form']
                                target['start'] = w['start']
                                target['end'] = w['end']
                                break
                source = lm['source']
                mwesource = None
                if u' ' in source['form']:
                    mwesource = source['form'].split(u' ')
                for widx in range(len(sent['word'])):
                    w = sent['word'][widx]
                    if 'lemma' in source:
                        break
                    if mwesource:
                        if mwesource[0].lower()==w['form'].lower():
                            slparts = []
                            for midx in range(len(mwesource)):
                                if mwesource[midx].lower()==sent['word'][widx+midx]['form'].lower():
                                    slparts.append(sent['word'][widx+midx]['lem'])
                                else:
                                    slparts = []
                                    break
                            if slparts:
                                source['lemma'] = u' '.join(slparts)
                                break                                
                    else:
                        if source['form'].lower()==w['form'].lower():
                            source['lemma'] = w['lem']
                            break
        if 'word' in sent:
            del sent['word']
        if 'ctext' in sent:
            del sent['ctext']
    fbase, _ = os.path.splitext(cmdline.goldfile)
    # create m4d gold file
    docheader = mnjson.getJSONDocumentHeader(fbase,
                                             "Gold Standard %s"%(cmdline.lang),
                                             "Gold Standard Document %s (%s)"%(fbase,cmdline.lang),
                                             "%s_MN_Analysis_Team"%(fbase),
                                             "mixed", len(sentences), cmdline.lang)
    jdata = mnjson.getJSONRoot(cmdline.lang, docs=[docheader], sents=sentences)
    mnjson.writefile(fbase+'_m4d_gold.json', jdata)
    # create m4m gold file
    
    docheaders = []
    msentences= []
    idx = 0
    for tconcept in sentbycon.iterkeys():
        for sconcept in sentbycon[tconcept].iterkeys():
            docnum = 1
            sentcount = 0
            docname = u'%s_%s_%d' % (tconcept,sconcept,docnum)
            for sent in sentbycon[tconcept][sconcept]:
                sentcount += 1
                sent['id'] = 'gold:%s:%d' % (docname,sentcount)
                sent['idx'] = idx
                msentences.append(sent)
                if sentcount == 5:
                    docheader = mnjson.getJSONDocumentHeader(docname,
                                                             "m4mapping gold %s"%(cmdline.lang),
                                                             "gold standard m4mapping set",
                                                             docname,
                                                             "mixed", sentcount, cmdline.lang)
                    docheaders.append(docheader)
                    sentcount = 0
                    docnum += 1
                    docname = u'%s_%s_%d' % (tconcept,sconcept,docnum)
            if sentcount > 0:
                docheader = mnjson.getJSONDocumentHeader(docname,
                                                         "m4mapping gold %s"%(cmdline.lang),
                                                         "gold standard m4mapping set",
                                                         docname,
                                                         "mixed", sentcount, cmdline.lang)
                docheaders.append(docheader)
    jdata = mnjson.getJSONRoot(cmdline.lang, docs=docheaders, sents=msentences)
    mnjson.writefile(fbase+'_m4m_gold.json', jdata)
    
    #
    # to create the input versions, reload from file (to get fresh copies of the data)
    #
    
    jdata = mnjson.loadfile(fbase+'_m4d_gold.json')    
    for sent in jdata['sentences']:
        if 'lms' in sent:
            del sent['lms']
    mnjson.writefile(fbase+'_m4d_input.json',jdata)

    jdata = mnjson.loadfile(fbase+'_m4m_gold.json')
    for sent in jdata['sentences']:
        if 'lms' in sent:
            for lm in sent['lms']:
                for field in ('concept','schemaname','schemanames','schemauri','schemauris'):
                    for tgsc in ('target','source'):
                        if field in lm[tgsc]:
                            del lm[tgsc][field]
    mnjson.writefile(fbase+'_m4m_input.json',jdata)
Example 6
def runLMDetectionInstance((infilename, outfilename, cmdline, config)):
    """Run single instance of LM detection on one input JSON file.
    This method takes a single tuple argument and is intended to be
    run via :py:mod:`multiprocessing`.
    
    :param infilename: input file name
    :type infilename: str
    :param outfilename: output file name
    :type outfilename: str
    :param cmdline: command line parameters
    :type cmdline: output of :py:mod:`argparse` parsing
    :param config: configuration parameters, passed on to runLMDetection
    
    """
    if os.path.exists(outfilename) and (not cmdline.force):
        logging.info("Skipping %s because result file exists.", infilename)
        return

    # read input file and prep settings
    logging.info('start LM detection on %s', infilename)

    if cmdline.verbose:
        msgformat = u'%(asctime)-15s - ' + os.path.basename(
            infilename) + ' - %(message)s'
        dateformat = u'%Y-%m-%dT%H:%M:%SZ'
        lfmt = logging.Formatter(msgformat, dateformat)

        # root logger: remove and re-add handlers
        rlogger = logging.getLogger()
        for h in list(rlogger.handlers):
            rlogger.removeHandler(h)

        # info handler: screen (stderr)
        infohand = logging.StreamHandler()
        infohand.setFormatter(lfmt)

        infohand.setLevel(logging.INFO)
        rlogger.setLevel(logging.INFO)

        # debug handler: to file
        if cmdline.debug:
            dbglogfname = os.path.basename(infilename) + '.debug.log'
            debughand = logging.FileHandler(dbglogfname,
                                            mode='w',
                                            encoding='utf-8')
            debughand.setLevel(logging.DEBUG)
            debughand.setFormatter(lfmt)
            rlogger.addHandler(debughand)
            rlogger.setLevel(logging.DEBUG)
        rlogger.addHandler(infohand)

    jdata = mnjson.loadfile(infilename)

    logger = None
    jdata = runLMDetection(jdata, cmdline, config, logger)
    if not jdata:
        logging.error('LMDetection returned an empty result for %s', infilename)
        raise RuntimeError('LMDetection returned an empty result for %s' % infilename)

    # write output file
    mnjson.writefile(outfilename, jdata)
    logging.info('done LM detection')
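The docstring above notes that the worker takes a single tuple argument so it can be mapped over with :py:mod:`multiprocessing`. Below is a minimal dispatch sketch; the output naming, pool size, and helper name are assumptions, and cmdline/config are assumed to be picklable.

import multiprocessing

def dispatch_lm_detection(infilenames, cmdline, config, poolsize=4):
    # one tuple per input file, matching the worker's single-argument signature
    jobs = [(inf, inf + '.lmdetect.json', cmdline, config) for inf in infilenames]
    pool = multiprocessing.Pool(processes=poolsize)
    try:
        pool.map(runLMDetectionInstance, jobs)
    finally:
        pool.close()
        pool.join()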
Example 7
def main():
    global SEPARATOR, MIN_SCORE
    cmdline = process_command_line()
    cwd = os.getcwd()
    
    if cmdline.intdir:
        intdir = cmdline.intdir
    else:
        intdir = "./int_"+os.path.basename(cmdline.infile)
    
    # if the intermediate directory exists, first delete it
    if os.path.exists(intdir):
        shutil.rmtree(intdir)
    
    os.makedirs(intdir)
    shutil.copy(cmdline.infile,intdir)

    # change to intermediate files dir
    os.chdir(intdir)
    intpath = os.getcwd()
    if not intpath.endswith('/'):
        intpath += '/'
    
    jsonfiles = subprocess.check_output(["testset2json",
                                         "-p",
                                         "-d",
                                         SEPARATOR,
                                         cmdline.infile]).splitlines()
    jsonfiles.sort()

    METADNS = "http://www.iarpa.gov/Metaphor/DetectSchema"
    XSINS = "http://www.w3.org/2001/XMLSchema-instance"
    M4SCHEMA = 'http://www.iarpa.gov/Metaphor/DetectSchema m4detectSchema_11.xsd'
    
    et.register_namespace("metad",METADNS)
    et.register_namespace("xsi",XSINS)
    
    rsroot = et.Element('metad:ResultSet')
    rsroot.set('xsi:schemaLocation',M4SCHEMA)
    rsroot.set('xmlns:metad',METADNS)
    rsroot.set('xmlns:xsi',XSINS)
    rsroot.set('teamId',cmdline.teamname)
    rscount = 0

    logroot = et.Element('metad:LogFile')
    logroot.set('xsi:schemaLocation',M4SCHEMA)
    logroot.set('xmlns:metad',METADNS)
    logroot.set('xmlns:xsi',XSINS)
    logroot.set('teamId',cmdline.teamname)
    logcount = 0
    
    logentry = et.SubElement(logroot, 'metad:TestStartTime')
    logentry.text = get_time()
    
    processor = postextraction.SimpleWordlistSystem(cmdline.extdir)
    prdir = None
    if cmdline.extdir:
        prdir = cmdline.extdir + '/persian'
    persianextractor = PersianMetaphorExtractor(prdir)
    
    for jfile in jsonfiles:
        lmflag = "0"
        lmsentno = "999"
        lmtarget = ""
        lmsource = ""
        
        # process filename
        (fbase,textid) = os.path.splitext(jfile)[0].split(SEPARATOR)

        # get lang from inside
        jfdoc = mnjson.loadfile(jfile)
        lang = jfdoc['lang']

        # start log entry
        logentry = et.SubElement(logroot, 'metad:LogEntry')
        logentry.set('id',textid)

        # record start time
        logstart = et.SubElement(logentry, 'metad:StartTime')
        logstart.text = get_time()
        print >> sys.stderr, logstart.text,"- starting processing on",textid
        
        # run pipeline
        result = et.SubElement(rsroot, 'metad:Result')
        result.set('id',textid)

        procfilename = 'processed.'+jfile
        errfilename = 'errfile.'+jfile
        errfile = codecs.open(errfilename,"w+",encoding='utf-8')
        
        seed_start_time = time.time()
        
        parsemetcmd = ['parsemet',
                       '-l',lang,
                       '-s','seeds.ei',
                       '-j',jfile]
        russiancmd = ['pipeline_russian',
                      '-f',jfile,
                      '-t','json',
                      '-o',procfilename]
                
        if cmdline.extdir:
            parsemetcmd.append('-d')
            parsemetcmd.append(cmdline.extdir+'/seeds')

        if lang=="en":
            parsemetcmd.insert(1, '-x')
            procfile = codecs.open(procfilename,"w",encoding='utf-8')
            subprocess.call(parsemetcmd,
                            stdout=procfile,
                            stderr=errfile)
            procfile.flush()
            procfile.close()
        elif (lang=="es") or (lang=="ru"):
            procfile = codecs.open(procfilename,"w",encoding='utf-8')
            subprocess.call(parsemetcmd,
                            stdout=procfile,
                            stderr=errfile)
            procfile.flush()
            procfile.close()
        elif lang=="fa":
            jobj = mnjson.loadfile(jfile)
            jobj = persianextractor.find_LMs(jobj)
            persianextractor.writejson(jobj, procfilename)

        procfile = codecs.open(procfilename,"r",encoding='UTF-8')

        seed_elapsed_time = time.time() - seed_start_time
        msgpf("SBS processing time: %fs",(seed_elapsed_time))

        # load the resulting json file
        # do post_processing
        word_start_time = time.time()
        doc = processor.post_process(json.load(procfile,encoding='UTF-8'))
        word_elapsed_time = time.time() - word_start_time
        msgpf("SWS processing time: %fs",(word_elapsed_time))
        
        # save the resulting json for debugging
        mnjson.writefile(procfilename+'.post', doc)
        
        highscorelmlist = []
        for sentence in doc['sentences']:
            if 'lms' in sentence:
                lmlist = sentence['lms']
                if len(lmlist) < 1:
                    continue
                highscorelmlist.append((lmlist[0],sentence))
        
        # choose the highest scoring lm from all the sentences in the json
        if len(highscorelmlist) > 0:
            highscorelmlist.sort(key=lambda lmtuple:lmtuple[0]['score'], reverse=True)
            (lm, sentence) = highscorelmlist[0]
            if lm['score'] >= MIN_SCORE:
                lmflag = "1"
                lmsentno = sentence['id'].split(':')[-1]
                # use text if there, or lemma if not
                if 'text' in lm['target']:
                    lmtarget = lm['target']['text']
                else:
                    lmtarget = lm['target']['lemma']
                    tmatch = re.search(r'^(\w+)\.(a|v|n|j)$', lmtarget)
                    if tmatch:
                        lmtarget = tmatch.group(1)
                if 'text' in lm['source']:
                    lmsource = lm['source']['text']
                else:
                    lmsource = lm['source']['lemma']
                    smatch = re.search(r'^(\w+)\.(a|v|n|j)$', lmsource)
                    if smatch:
                        lmsource = smatch.group(1)

        # check doc if LMs were found
        # currently reports only 1st LM match in the whole text
        rsflag = et.SubElement(result,'metad:LmFlag')
        rsflag.text = lmflag
        rssent = et.SubElement(result,'metad:LmSentence')
        rssent.text = lmsentno
        rstarget = et.SubElement(result,'metad:LmTargetText')
        rstarget.text = lmtarget
        rssource = et.SubElement(result,'metad:LmSourceText')
        rssource.text = lmsource
        rscount += 1

        # record end time
        logend = et.SubElement(logentry, 'metad:EndTime')
        logend.text = get_time()
        print >> sys.stderr, logend.text,"- ended processing on",textid

        # record processing flag
        logflag = et.SubElement(logentry, 'metad:ProcessingFlag')
        logflag.text = lmflag
        print >> sys.stderr, "Processing flag for",textid,'=',lmflag
        logcount += 1

    logentry = et.SubElement(logroot, 'metad:TestEndTime')
    logentry.text = get_time()

    rsroot.set("count",str(rscount))
    logroot.set("count",str(logcount))
    
    # open the input file to read the test id
    intree = et.parse(cmdline.infile)
    inroot = intree.getroot()
    testid = inroot.get('testId')
    rsroot.set('testId',testid)
    logroot.set('testId',testid)

    # write result file
    tmpoutfile = os.path.basename(cmdline.outfile)
    rstree = et.ElementTree(rsroot)
    rstree.write(tmpoutfile,encoding='UTF-8',xml_declaration=True)
    tmplogfile = os.path.basename(cmdline.logfile)
    logtree = et.ElementTree(logroot)
    logtree.write(tmplogfile,encoding='UTF-8',xml_declaration=True)

    # change back to original cwd
    os.chdir(cwd)

    # copy out pretty printed file using xmllint
    finaloutfile = codecs.open(cmdline.outfile,"w",encoding='utf-8')
    subprocess.call(['xmllint','--format',intpath+tmpoutfile],
                    stdout=finaloutfile)
    finallogfile = codecs.open(cmdline.logfile,"w",encoding='utf-8')
    subprocess.call(['xmllint','--format',intpath+tmplogfile],
                    stdout=finallogfile)
    finaloutfile.flush()
    finallogfile.flush()
    return 0
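The lemma cleanup above strips a trailing part-of-speech suffix such as '.v' or '.n' before reporting the target and source text. A standalone version of that step (hypothetical helper, not in the original) could be reused as follows:

import re

def strip_pos_suffix(lemma):
    # e.g. 'attack.v' -> 'attack'; mirrors the lemma cleanup in the loop above
    m = re.search(r'^(\w+)\.(a|v|n|j)$', lemma)
    return m.group(1) if m else lemma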