def run_cms(jfile, procfilename, cmdline):
    processor = postextraction.SimpleConstructionMatchingSystem(cmdline.extdir,None,None,cmdline.verbose)
    doc = processor.post_process(mnjson.loadfile(jfile),
                                 None,
                                 cmdline.matchfield,
                                 cmdline.posfield,
                                 cmdline.reportfield,
                                 cmdline.forcetagger)
    mnjson.writefile(procfilename, doc)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Diff JSON to determine performance against gold",
        epilog="Note:")

    # required (positional) args
    parser.add_argument("goldfile", help="gold JSON file")
    parser.add_argument("resfile", help="results JSON file")
    parser.add_argument('-t',
                        '--testname',
                        help='m4 test name',
                        default='m4detect')
    parser.add_argument('-s', '--score', dest='scoret',
                        type=float,
                        default=-9999.0,
                        help="Score threshhold for m4detect. If negative,"\
                        " it must be quoted with a space inserted before the - sign.")
    parser.add_argument('-q',
                        '--quiet',
                        help="Only print the stats",
                        action='store_true')

    cmdline = parser.parse_args()

    goldjdata = mnjson.loadfile(cmdline.goldfile)
    resjdata = mnjson.loadfile(cmdline.resfile)
    if resjdata['lang'] == 'fa':
        for sent in resjdata['sentences']:
            if 'lms' in sent:
                for lm in sent['lms']:
                    if ('seed' in lm) and (lm['seed'] == 'NA'):
                        lm['target']['form'] = lm['target']['lemma']
                        lm['source']['form'] = lm['source']['lemma']
    if cmdline.testname == 'm4detect':
        m4detect_cmp(goldjdata,
                     resjdata,
                     scoret=cmdline.scoret,
                     quiet=cmdline.quiet)
    elif cmdline.testname == 'm4mapping':
        m4mapping_cmp(goldjdata, resjdata, quiet=cmdline.quiet)
def run_sbs(jfile, procfilename, cmdline):
    
    # bypass if the output file already exists and has non-zero size
    if os.path.exists(procfilename):
        if os.path.getsize(procfilename):
            return 0
    
    errfilename = 'error.'+jfile
    errfile = codecs.open(errfilename,'w+',encoding='utf-8')
    
    # language is taken from the command line
    lang = cmdline.lang

    seedfile = "seeds."+cmdline.seedext
    parsemetcmd = ['parsemet','-l', lang,'-s', seedfile,'-j', jfile]
    if cmdline.extdir:
        parsemetcmd.append('-d')
        parsemetcmd.append(cmdline.extdir+'/seeds')
    
    if lang=="fa":     
        prdir = None
        if cmdline.extdir:
            prdir = cmdline.extdir + '/persian'
        persianextractor = PersianMetaphorExtractor(prdir)
        jobj = mnjson.loadfile(jfile)
        persianextractor.parse(jobj)
        jobj = persianextractor.find_LMs(jobj)
        persianextractor.writejson(jobj, procfilename)
    else:
        if cmdline.noparse:
            parsemetcmd.append('-n')
        if lang=='en':
            parsemetcmd.append('-x')
        procfile = codecs.open(procfilename,"w",encoding='utf-8')
        subprocess.call(parsemetcmd,
                        stdout=procfile,
                        stderr=errfile)
        procfile.flush()
        procfile.close()
def processFile(self, fname):
    try:
        jdata = mnjson.loadfile(fname)
        tscount = 0
        scount = 0
        for sentence in jdata['sentences']:
            scount += 1
            if 'CMS' not in sentence:
                continue
            if 'targetlist' not in sentence['CMS']:
                continue
            tlist = sentence['CMS']['targetlist']
            sent = None
            for tmatch in tlist:
                if tmatch['schemaname'] in self.ts2con:
                    tscount += 1
                    # add sentence to DB if not added already
                    if not sent:
                        sent = self.getSent(sentence['id'],
                                            sentence['text'],
                                            sentence['mtext'])
                    # add target to DB
                    targ = self.getTarget(
                        self.ts2con[tmatch['schemaname']],
                        tmatch['lemma'].lower(), tmatch['schemaname'])
                    # add target to sent join
                    self.joinTargetSent(sent, targ, tmatch['start'],
                                        tmatch['end'], tmatch['form'])
                    # forget source lemmas -- too slow
                    #for word in sentence['word']:
                    #    lem = self.getLemma(word['lem'].lower())
                    #    self.joinLemmaSent(sent,lem,word['start'],word['end'],word['form'])
        self.logger.info('file %s: found %d target terms in %d sentences',
                         os.path.basename(fname), tscount, scount)
    except:
        self.logger.error("error processing %s because\n%s", fname,
                          traceback.format_exc())
def main():
    """
    Runs subcorpus generation.
    """
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    
    # add some custom cmdline parameters
    aparser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Script for generating a subcorpus from a set of JSON files "\
        "usually from a large corpus.  It uses the CMS's target and source word "\
        "matching system.")
    aparser.add_argument("filelistfile",help="File containing list of corpus files "\
                         "in JSON format.")
    aparser.add_argument("outputfile",help="Name of resulting subcorpus file")
    aparser.add_argument("-l","--lang",help="Language",required=True)
    aparser.add_argument("--doc",help="Unique name to give to the subcorpus")
    aparser.add_argument("--corpus",help="Unique name of corpus")
    aparser.add_argument("--desc",help="Description of subcorpus")
    aparser.add_argument("-e", "--engine", help="Querying engine (CMS).  Options are"\
                         " (rdflib, redland, sesame)."\
                         " Use of sesame must be suffixed with the server ip or dns "\
                         " demarked with a colon, e,g, seseame:servername.",
                         default='sesame:localhost')
    aparser.add_argument("--force-pos-tag", dest="forcepos", action="store_true",
                         help="Force POS tagging in CMS. Overwrite existing tags")
    aparser.add_argument("--force-cache", dest="forcecache", action="store_true",
                         help="Force cache regeneration (CMS)")
    aparser.add_argument("--lem", help="Override default lemma field name ('lem')",
                         default="lem")
    aparser.add_argument("--pos", help="Override default POS field name ('pos')",
                         default="pos")
    aparser.add_argument("--trans-en",dest="translateEn",action="store_true",
                         help="For non-English languages, this option allows frame and "\
                         "frame families names to be given in English.  Translation is "\
                         "accomplished via Interwiki links.")
    aparser.add_argument("--config", dest="configfname",
                         help="Configuration filename to override the default "\
                         " which can also be overridden by environment variable"\
                         " %s (default=%s)" % (CONFIG_ENV,DEFAULT_CONFIGFNAME))
    aparser.add_argument("--mode", dest="configmode",
                         help="Used to activate a mode defined in the config file.")
    aparser.add_argument("--cache-dir", dest="cachedir", default=None,
                         help="To override default cache directories")
    aparser.add_argument("--use-se", dest="useSE", default='localhost',
                         help="Use Sparql endpoint at the specified server address.")


    cmdline = aparser.parse_args()
    
    # for logging / error messages
    msgformat = '%(asctime)-15s - %(message)s'
    dateformat = '%Y-%m-%dT%H:%M:%SZ'
    logging.basicConfig(format=msgformat, datefmt=dateformat,
                        level=logging.INFO)

    # parse the config file
    cfname = None
    if cmdline.configfname:
        cfname = cmdline.configfname
    else:
        cfname = os.environ.get(CONFIG_ENV)
        if not cfname:
            cfname = DEFAULT_CONFIGFNAME
    config = MetaNetConfigParser(cfname,"makesubcorpus",cmdline.configmode)

    startproctime = datetime.now(tzlocal()).strftime("%Y-%m-%d %H:%M:%S %z")

    docheader = mnjson.getJSONDocumentHeader(name=cmdline.doc,
                                             corp=cmdline.corpus,
                                             desc=cmdline.desc,
                                             prov=cmdline.doc,
                                             type='subcorpus',
                                             lang=cmdline.lang,
                                             pubdate=startproctime)
    jdata = mnjson.getJSONRoot(lang=cmdline.lang, docs=[docheader])
    
    # start up metanetrep, cnmapper, cms
    jdata[u'start_processing_time'] = startproctime
    lang = cmdline.lang
    
    paramrec = {}
    
    tfamlist = config.getList('targetfamilies', lang)
    tsnamelist = config.getList('targetframes', lang)
    tconlist = config.getList('targetconcepts', lang)
    tcongrouplist = config.getList('targetconceptgroups', lang)
    sfamlist = config.getList('sourcefamilies', lang)
    ssnamelist = config.getList('sourceframes', lang)
    sconlist = config.getList('sourceconcepts', lang)
    
    filterparams = {}
    filterparams[u'targetfamilies'] = tfamlist
    filterparams[u'targetframes'] = tsnamelist
    filterparams[u'targetconcepts'] = tconlist
    filterparams[u'targetconceptgroups'] = tcongrouplist
    filterparams[u'sourcefamilies'] = sfamlist
    filterparams[u'sourceframes'] = ssnamelist
    filterparams[u'sourceconcepts'] = sconlist
    paramrec[u'filterparams'] = filterparams

    
    # configure and initialize Conceptual Network Mapper
    tconranking = config.getListFromComp('cnms','targetconceptranking', lang)
    secondaryminscore = config.getFloatFromComp('cnms','secondaryminscore', lang)
    mappinglimit = config.getIntFromComp('cnms','sourcelimit', lang)
    conceptmode = config.getValue('casemode',default='general')
    expansionTypes = config.getListFromComp('cnms','expansiontypes',lang=lang)
    expansionScoreScale = config.getFloatFromComp('cnms','expansionscorescale',lang=lang,
                                                  default=1.0)
    disableclosestframe = config.getFlagFromComp('cnms','disableclosestframe',lang=lang)
    fndatadir = config.getValue('framenetdatadir',lang=lang)
    wikdatadir = config.getValue('wiktionarydatadir',lang=lang)
    pwfdatadir = config.getValue('persianwordformsdatadir')
    mrdatadir = config.getValue('mrdatadir',lang=lang)
    
    cnmsparams = {}
    cnmsparams[u'targetconceptranking'] = tconranking
    cnmsparams[u'secondaryminscore'] = secondaryminscore
    cnmsparams[u'sourcelimit'] = mappinglimit
    cnmsparams[u'expansiontypes'] = expansionTypes
    cnmsparams[u'expansionscorescale'] = expansionScoreScale
    cnmsparams[u'disableclosestframe'] = disableclosestframe
    paramrec[u'cnms'] = cnmsparams
    paramrec[u'casemode'] = conceptmode
    paramrec[u'framenetdatadir'] = fndatadir
    paramrec[u'wiktionarydatadir'] = wikdatadir
    paramrec[u'persianwordformsdatadir'] = pwfdatadir
    paramrec[u'mrdatadir'] = mrdatadir
    
    fndata = None
    wikdata = None
    pwforms = None
    if lang=='en':
        if ('fn' in expansionTypes) or (not disableclosestframe):
            if not fndatadir:
                logging.error('FN expansion requires "framenetdatadir" parameter')
            else:
                fndata = FrameNet(cachedir=fndatadir)
        if ('wik' in expansionTypes):
            if not wikdatadir:
                logging.error('Wiktionary expansion requires "wiktionarydatadir" parameter')
            else:
                wikdata = Wiktionary(dbdir=wikdatadir)
    if lang=='fa':
        if not pwfdatadir:
            logging.warn('Persian extraction/mapping not using precomputed word forms.'\
                         ' Set "persianwordformsdatadir" to enable.')
        pwforms = PersianWordForms(pwfdir=pwfdatadir)

    # configure and initialize MetaNet Repository
    metanetrep = MetaNetRepository(lang, useSE=cmdline.useSE,
                                   mrbasedir=mrdatadir,
                                   fndata=fndata,wikdata=wikdata,pwforms=pwforms)
    metanetrep.initLookups()
    
    cnmapper = ConceptualNetworkMapper(lang, cmdline.cachedir,
                                       targetConceptRank=tconranking,
                                       disableFN=disableclosestframe,
                                       expansionTypes=expansionTypes,
                                       expansionScoreScale=expansionScoreScale,
                                       sourceMappingLimit=mappinglimit, 
                                       minSecondaryScore=secondaryminscore,
                                       metanetrep=metanetrep,
                                       conceptMode=conceptmode)
    
    excludedfamilies = config.getListFromComp('cms','excludedfamilies', lang)
    metarcfname = config.getValueFromComp('cms','metarcfile',lang)    

    cms = ConstructionMatchingSystem(lang, posfield=cmdline.pos, lemfield=cmdline.lem,
                                     useSE=cmdline.useSE,
                                     forcecache=cmdline.forcecache,
                                     engine=cmdline.engine, nodepcheck=True,
                                     excludedFamilies=excludedfamilies,
                                     metanetrep=metanetrep,cnmapper=cnmapper,
                                     metarcfname=metarcfname)
    
    maxsentlength = config.getValueFromComp('cms','maxsentlength', lang)
    if maxsentlength:
        cms.setMaxSentenceLength(int(maxsentlength))
    disablewcache = config.getValueFromComp('cms','disablewcaching', lang)
    if disablewcache and (disablewcache.lower() == 'true'):
        cms.setSearchWordCaching(False)
    cxnpatternfname = config.getValueFromComp('cms','cxnpatternfile',lang)
    cmsparams = {}
    cmsparams[u'excludedfamilies'] = excludedfamilies
    cmsparams[u'metarcfile'] = metarcfname
    cmsparams[u'maxsentlength'] = maxsentlength
    cmsparams[u'disablewcaching'] = disablewcache
    cmsparams[u'cxnpatternfile'] = cxnpatternfname
    paramrec[u'cms'] = cmsparams
    
    cms.prepSearchWords(tfamlist,tsnamelist,tconlist,tcongrouplist,
                        sfamlist,ssnamelist,sconlist,
                        translateEn=cmdline.translateEn)
    
    sentence_counter = 0
    subcorpus = []
    with codecs.open(cmdline.filelistfile, encoding='utf-8') as flist:
        for line in flist:
            infilename = line.strip()
            if not infilename:
                continue
            in_jdata = mnjson.loadfile(infilename)
            sc_sents = cms.getSubcorpus(in_jdata[u'sentences'],
                                        forcePOScomp=cmdline.forcepos)
            for sent in sc_sents:
                sent[u'cfile'] = infilename
                sent[u'cid'] = sent[u'id']
                sent[u'cidx'] = sent[u'idx']
                sent[u'id'] = u'%s:%s' % (cmdline.doc,sentence_counter+1)
                sent[u'idx'] = sentence_counter
                subcorpus.append(sent)
            logging.info("added %d sentences from %s",len(sc_sents),infilename)
                
    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    jdata[u'parameters'] = paramrec
    jdata[u'end_processing_time'] = datetime.now(tzlocal()).strftime("%Y-%m-%d %H:%M:%S %z")
    jdata[u'sentences'] = subcorpus
    logging.info(u'writing %s with %d sentences',cmdline.outputfile, len(subcorpus))
    mnjson.writefile(cmdline.outputfile, jdata)
def main():
    """
    Runs LM to concept mapping.
    """
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Convert gold standard annotation to JSON format.")
    parser.add_argument('goldfile',
                        help="Excel file containing gold standard annotation")
    parser.add_argument('-l','--lang',required=True,
                        help="Language being processed")
    parser.add_argument('-v','--verbose',action='store_true',
                        help='print verbose messages')
    cmdline = parser.parse_args()
    
    msgformat = '%(asctime)-15s - %(message)s'
    dateformat = '%Y-%m-%dT%H:%M:%SZ'
    if cmdline.verbose:
        deflevel = logging.INFO
    else:
        deflevel = logging.WARN
    logging.basicConfig(format=msgformat, datefmt=dateformat, level=deflevel)
    
    # load the input excel file
    wb = load_workbook(cmdline.goldfile)
    
    sentences = []
    idx = 0
    sentbycon = {}
    for tconcept in wb.get_sheet_names():
        logging.info('processing tab %s',tconcept)
        ws = wb.get_sheet_by_name(tconcept)
        sentbycon[tconcept] = {}
        for row in ws.rows[1:]:
            (id,text,tlemma,status,sform,sschema,sconcept) = (cell.value for cell in row[0:7])
            if (not text) or (not status) or status.lower()=='p':
                continue
            sent = mnjson.getJSONSentence(id,idx,text)
            if status.lower()=='n':
                sentences.append(sent)
                idx += 1
                continue
            if not sform:
                # then need to skip, because it's marked as Y but no answer was given
                continue
            if sconcept:
                sconcept = sconcept.replace(u' ',u'_').upper()
            #create lm
            sstart = text.lower().find(sform.lower())
            send = sstart + len(sform)
            source = {'form':sform,
                      'concept':sconcept,
                      'start':sstart,
                      'end':send}
            if sschema:
                source['schemanames'] = [sname.strip() for sname in sschema.split(u',')]
            target = {'lemma':tlemma,
                      'concept':tconcept}
            lm = {u'name':u'%s %s'%(target['lemma'],source['form']),
                  u'target':target,
                  u'source':source,
                  u'extractor':'Gold'}
            sent['lms'] = [lm]
            sentences.append(sent)
            idx += 1
            if sconcept:
                if sconcept in sentbycon[tconcept]:
                    sentbycon[tconcept][sconcept].append(sent)
                else:
                    sentbycon[tconcept][sconcept] = [sent]
    logging.info("Running POS tagger...")
    computePOS(cmdline.lang, sentences)
    # attempt to fill in missing form / lemma fields
    logging.info("Filling in missing fields in LMs...")
    for sent in sentences:
        if 'lms' in sent:
            if 'word' not in sent:
                continue
            for lm in sent['lms']:
                # do target
                target = lm['target']
                mwetarg = None
                if u' ' in target['lemma']:
                    # it's a multiword
                    mwetarg = target['lemma'].split(u' ')
                for widx in range(len(sent['word'])):
                    w = sent['word'][widx]
                    if 'form' in target:
                        break
                    if mwetarg:
                        if mwetarg[0].lower()==w['lem'].lower():
                            tfparts = []
                            for midx in range(len(mwetarg)):
                                if mwetarg[midx].lower()==sent['word'][widx+midx]['lem'].lower():
                                    tfparts.append(sent['word'][widx+midx]['form'])
                                else:
                                    tfparts = []
                                    break
                            if tfparts:
                                target['form'] = u' '.join(tfparts)
                                target['start'] = w['start']
                                target['end'] = w['start'] + len(target['form'])
                                break
                                
                    else:
                        if target['lemma'].lower()==w['lem'].lower():
                            target['form'] = w['form']
                            target['start'] = w['start']
                            target['end'] = w['end']
                            break
                if 'form' not in target:
                    # backup, search for matching wforms
                    for widx in range(len(sent['word'])):
                        w = sent['word'][widx]
                        if 'form' in target:
                            break
                        if mwetarg:
                            if mwetarg[0].lower()==w['form'].lower():
                                tfparts = []
                                for midx in range(len(mwetarg)):
                                    if mwetarg[midx].lower()==sent['word'][widx+midx]['form'].lower():
                                        tfparts.append(sent['word'][widx+midx]['form'])
                                    else:
                                        tfparts = []
                                        break
                                if tfparts:
                                    target['form'] = u' '.join(tfparts)
                                    target['start'] = w['start']
                                    target['end'] = w['start'] + len(target['form'])
                                    break
                                    
                        else:
                            if target['lemma'].lower()==w['form'].lower():
                                target['form'] = w['form']
                                target['start'] = w['start']
                                target['end'] = w['end']
                                break
                source = lm['source']
                mwesource = None
                if u' ' in source['form']:
                    mwesource = source['form'].split(u' ')
                for widx in range(len(sent['word'])):
                    w = sent['word'][widx]
                    if 'lemma' in source:
                        break
                    if mwesource:
                        if mwesource[0].lower()==w['form'].lower():
                            slparts = []
                            for midx in range(len(mwesource)):
                                if mwesource[midx].lower()==sent['word'][widx+midx]['form'].lower():
                                    slparts.append(sent['word'][widx+midx]['lem'])
                                else:
                                    slparts = []
                                    break
                            if slparts:
                                source['lemma'] = u' '.join(slparts)
                                break                                
                    else:
                        if source['form'].lower()==w['form'].lower():
                            source['lemma'] = w['lem']
                            break
        if 'word' in sent:
            del sent['word']
        if 'ctext' in sent:
            del sent['ctext']
    fbase, _ = os.path.splitext(cmdline.goldfile)
    # create m4d gold file
    docheader = mnjson.getJSONDocumentHeader(fbase,
                                             "Gold Standard %s"%(cmdline.lang),
                                             "Gold Standard Document %s (%s)"%(fbase,cmdline.lang),
                                             "%s_MN_Analysis_Team"%(fbase),
                                             "mixed", len(sentences), cmdline.lang)
    jdata = mnjson.getJSONRoot(cmdline.lang, docs=[docheader], sents=sentences)
    mnjson.writefile(fbase+'_m4d_gold.json', jdata)
    # create m4m gold file
    
    docheaders = []
    msentences= []
    idx = 0
    for tconcept in sentbycon.iterkeys():
        for sconcept in sentbycon[tconcept].iterkeys():
            docnum = 1
            sentcount = 0
            docname = u'%s_%s_%d' % (tconcept,sconcept,docnum)
            for sent in sentbycon[tconcept][sconcept]:
                sentcount += 1
                sent['id'] = 'gold:%s:%d' % (docname,sentcount)
                sent['idx'] = idx
                msentences.append(sent)
                if sentcount == 5:
                    docheader = mnjson.getJSONDocumentHeader(docname,
                                                             "m4mapping gold %s"%(cmdline.lang),
                                                             "gold standard m4mapping set",
                                                             docname,
                                                             "mixed", sentcount, cmdline.lang)
                    docheaders.append(docheader)
                    sentcount = 0
                    docnum += 1
                    docname = u'%s_%s_%d' % (tconcept,sconcept,docnum)
            if sentcount > 0:
                docheader = mnjson.getJSONDocumentHeader(docname,
                                                         "m4mapping gold %s"%(cmdline.lang),
                                                         "gold standard m4mapping set",
                                                         docname,
                                                         "mixed", sentcount, cmdline.lang)
                docheaders.append(docheader)
    jdata = mnjson.getJSONRoot(cmdline.lang, docs=docheaders, sents=msentences)
    mnjson.writefile(fbase+'_m4m_gold.json', jdata)
    
    #
    # to create the input versions-- need to reload from file
    #
    
    jdata = mnjson.loadfile(fbase+'_m4d_gold.json')    
    for sent in jdata['sentences']:
        if 'lms' in sent:
            del sent['lms']
    mnjson.writefile(fbase+'_m4d_input.json',jdata)

    jdata = mnjson.loadfile(fbase+'_m4m_gold.json')
    for sent in jdata['sentences']:
        if 'lms' in sent:
            for lm in sent['lms']:
                for field in ('concept','schemaname','schemanames','schemauri','schemauris'):
                    for tgsc in ('target','source'):
                        if field in lm[tgsc]:
                            del lm[tgsc][field]
    mnjson.writefile(fbase+'_m4m_input.json',jdata)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re, pprint
from mnformats import mnjson
from depparsing.dep2json import parse
from depparsing.parsemet import sanitized

ntdotre = re.compile(ur'[.]([^0-9])', flags=re.U)
ntmarkre = re.compile(ur'[!?](.)', flags=re.U)
endre = re.compile(ur'([.!?])+$', flags=re.U)

print sys.argv[1]
jdata = mnjson.loadfile(sys.argv[1])
in_sentences = jdata['sentences']
# out_jdata = parse(jdata['lang'],
#                   [ntmarkre.sub(r',\1,',
#                                 ntdotre.sub(r',\1',
#                                             endre.sub(r'\1',sanitized(s['text'])))) for s in in_sentences])
out_jdata = parse(jdata['lang'], [s['ctext'] for s in in_sentences])
pprint.pprint(out_jdata)
def runLMDetectionInstance((infilename, outfilename, cmdline, config)):
    """Run single instance of LM detection on one input JSON file.
    This method takes a single tuple argument and is intended to be
    run via :py:mod:`multiprocessing`.
    
    :param infilename: input file name
    :type infilename: str
    :param outfilename: output file name
    :type outfilename: str
    :param cmdline: command line parameters
    :type cmdline: parse output from :py:mod:`argparse`
    :param config: parsed configuration settings
    
    A sketch of a multiprocessing driver follows this function.
    """
    if os.path.exists(outfilename) and (not cmdline.force):
        logging.info("Skipping %s because result file exists.", infilename)
        return

    # read input file and prep settings
    logging.info('start LM detection on %s', infilename)

    if cmdline.verbose:
        msgformat = u'%(asctime)-15s - ' + os.path.basename(
            infilename) + ' - %(message)s'
        dateformat = u'%Y-%m-%dT%H:%M:%SZ'
        lfmt = logging.Formatter(msgformat, dateformat)

        # root logger: remove and re-add handlers
        rlogger = logging.getLogger()
        for h in list(rlogger.handlers):
            rlogger.removeHandler(h)

        # info handler: screen (stderr)
        infohand = logging.StreamHandler()
        infohand.setFormatter(lfmt)

        infohand.setLevel(logging.INFO)
        rlogger.setLevel(logging.INFO)

        # debug handler: to file
        if cmdline.debug:
            dbglogfname = os.path.basename(infilename) + '.debug.log'
            debughand = logging.FileHandler(dbglogfname,
                                            mode='w',
                                            encoding='utf-8')
            debughand.setLevel(logging.DEBUG)
            debughand.setFormatter(lfmt)
            rlogger.addHandler(debughand)
            rlogger.setLevel(logging.DEBUG)
        rlogger.addHandler(infohand)

    jdata = mnjson.loadfile(infilename)

    logger = None
    jdata = runLMDetection(jdata, cmdline, config, logger)
    if not jdata:
        logging.error('LM detection returned no data for %s', infilename)
        raise RuntimeError('LM detection returned no data for %s' % infilename)

    # write output file
    mnjson.writefile(outfilename, jdata)
    logging.info('done LM detection')
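# Sketch (not part of the original script): runLMDetectionInstance() takes a
# single tuple argument so that it can be dispatched via multiprocessing, as
# its docstring notes.  A minimal driver under that assumption could look like
# the following; the nprocs default and the (infile, outfile) pairing are
# illustrative, and cmdline/config must be picklable for pool.map to work.
def runLMDetectionParallel(filepairs, cmdline, config, nprocs=4):
    """Fan runLMDetectionInstance out over a process pool (illustrative)."""
    from multiprocessing import Pool
    pool = Pool(processes=nprocs)
    # each job is the single tuple that runLMDetectionInstance expects
    jobs = [(inf, outf, cmdline, config) for (inf, outf) in filepairs]
    pool.map(runLMDetectionInstance, jobs)
    pool.close()
    pool.join()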
def main():
    global SEPARATOR, MIN_SCORE
    cmdline = process_command_line()
    cwd = os.getcwd()
    
    if cmdline.intdir:
        intdir = cmdline.intdir
    else:
        intdir = "./int_"+os.path.basename(cmdline.infile)
    
    # if the intermediate directory exists, first delete it
    if os.path.exists(intdir):
        shutil.rmtree(intdir)
    
    os.makedirs(intdir)
    shutil.copy(cmdline.infile,intdir)

    # change to intermediate files dir
    os.chdir(intdir)
    intpath = os.getcwd()
    if not intpath.endswith('/'):
        intpath += '/'
    
    jsonfiles = subprocess.check_output(["testset2json",
                                         "-p",
                                         "-d",
                                         SEPARATOR,
                                         cmdline.infile]).splitlines()
    jsonfiles.sort()

    METADNS = "http://www.iarpa.gov/Metaphor/DetectSchema"
    XSINS = "http://www.w3.org/2001/XMLSchema-instance"
    M4SCHEMA = 'http://www.iarpa.gov/Metaphor/DetectSchema m4detectSchema_11.xsd'
    
    et.register_namespace("metad",METADNS)
    et.register_namespace("xsi",XSINS)
    
    rsroot = et.Element('metad:ResultSet')
    rsroot.set('xsi:schemaLocation',M4SCHEMA)
    rsroot.set('xmlns:metad',METADNS)
    rsroot.set('xmlns:xsi',XSINS)
    rsroot.set('teamId',cmdline.teamname)
    rscount = 0

    logroot = et.Element('metad:LogFile')
    logroot.set('xsi:schemaLocation',M4SCHEMA)
    logroot.set('xmlns:metad',METADNS)
    logroot.set('xmlns:xsi',XSINS)
    logroot.set('teamId',cmdline.teamname)
    logcount = 0
    
    logentry = et.SubElement(logroot, 'metad:TestStartTime')
    logentry.text = get_time()
    
    processor = postextraction.SimpleWordlistSystem(cmdline.extdir)
    prdir = None
    if cmdline.extdir:
        prdir = cmdline.extdir + '/persian'
    persianextractor = PersianMetaphorExtractor(prdir)
    
    for jfile in jsonfiles:
        lmflag = "0"
        lmsentno = "999"
        lmtarget = ""
        lmsource = ""
        
        # process filename
        (fbase,textid) = os.path.splitext(jfile)[0].split(SEPARATOR)

        # get lang from inside
        jfdoc = mnjson.loadfile(jfile)
        lang = jfdoc['lang']

        # start log entry
        logentry = et.SubElement(logroot, 'metad:LogEntry')
        logentry.set('id',textid)

        # record start time
        logstart = et.SubElement(logentry, 'metad:StartTime')
        logstart.text = get_time()
        print >> sys.stderr, logstart.text,"- starting processing on",textid
        
        # run pipeline
        result = et.SubElement(rsroot, 'metad:Result')
        result.set('id',textid)

        procfilename = 'processed.'+jfile
        errfilename = 'errfile.'+jfile
        errfile = codecs.open(errfilename,"w+",encoding='utf-8')
        
        seed_start_time = time.time()
        
        parsemetcmd = ['parsemet',
                       '-l',lang,
                       '-s','seeds.ei',
                       '-j',jfile]
        russiancmd = ['pipeline_russian',
                      '-f',jfile,
                      '-t','json',
                      '-o',procfilename]
                
        if cmdline.extdir:
            parsemetcmd.append('-d')
            parsemetcmd.append(cmdline.extdir+'/seeds')

        if lang=="en":
            parsemetcmd.insert(1, '-x')
            procfile = codecs.open(procfilename,"w",encoding='utf-8')
            subprocess.call(parsemetcmd,
                            stdout=procfile,
                            stderr=errfile)
            procfile.flush()
            procfile.close()
        elif (lang=="es") or (lang=="ru"):
            procfile = codecs.open(procfilename,"w",encoding='utf-8')
            subprocess.call(parsemetcmd,
                            stdout=procfile,
                            stderr=errfile)
            procfile.flush()
            procfile.close()
        elif lang=="fa":
            jobj = mnjson.loadfile(jfile)
            jobj = persianextractor.find_LMs(jobj)
            persianextractor.writejson(jobj, procfilename)

        procfile = codecs.open(procfilename,"r",encoding='UTF-8')

        seed_elapsed_time = time.time() - seed_start_time
        msgpf("SBS processing time: %fs",(seed_elapsed_time))

        # load the resulting json file
        # do post_processing
        word_start_time = time.time()
        doc = processor.post_process(json.load(procfile,encoding='UTF-8'))
        word_elapsed_time = time.time() - word_start_time
        msgpf("SWS processing time: %fs",(word_elapsed_time))
        
        # save the resulting json for debugging
        mnjson.writefile(procfilename+'.post', doc)
        
        highscorelmlist = []
        for sentence in doc['sentences']:
            if 'lms' in sentence:
                lmlist = sentence['lms']
                if len(lmlist) < 1:
                    continue
                highscorelmlist.append((lmlist[0],sentence))
        
        # choose the highest scoring lm from all the sentences in the json
        if len(highscorelmlist) > 0:
            highscorelmlist.sort(key=lambda lmtuple:lmtuple[0]['score'], reverse=True)
            (lm, sentence) = highscorelmlist[0]
            if lm['score'] >= MIN_SCORE:
                lmflag = "1"
                lmsentno = sentence['id'].split(':')[-1]
                # use text if there, or lemma if not
                if 'text' in lm['target']:
                    lmtarget = lm['target']['text']
                else:
                    lmtarget = lm['target']['lemma']
                    tmatch = re.search('^(\w+)\.(a|v|n|j)$',lmtarget)
                    if tmatch:
                        lmtarget = tmatch.group(1)
                if 'text' in lm['source']:
                    lmsource = lm['source']['text']
                else:
                    lmsource = lm['source']['lemma']
                    smatch = re.search('^(\w+)\.(a|v|n|j)$',lmsource)
                    if smatch:
                        lmsource = smatch.group(1)

        # check doc if LMs were found
        # currently reports only 1st LM match in the whole text
        rsflag = et.SubElement(result,'metad:LmFlag')
        rsflag.text = lmflag
        rssent = et.SubElement(result,'metad:LmSentence')
        rssent.text = lmsentno
        rstarget = et.SubElement(result,'metad:LmTargetText')
        rstarget.text = lmtarget
        rssource = et.SubElement(result,'metad:LmSourceText')
        rssource.text = lmsource
        rscount += 1

        # record end time
        logend = et.SubElement(logentry, 'metad:EndTime')
        logend.text = get_time()
        print >> sys.stderr, logend.text,"- ended processing on",textid

        # record processing flag
        logflag = et.SubElement(logentry, 'metad:ProcessingFlag')
        logflag.text = lmflag
        print >> sys.stderr, "Processing flag for",textid,'=',lmflag
        logcount += 1

    logentry = et.SubElement(logroot, 'metad:TestEndTime')
    logentry.text = get_time()

    rsroot.set("count",str(rscount))
    logroot.set("count",str(logcount))
    
    # open the input file to read the test id
    intree = et.parse(cmdline.infile)
    inroot = intree.getroot()
    testid = inroot.get('testId')
    rsroot.set('testId',testid)
    logroot.set('testId',testid)

    # write result file
    tmpoutfile = os.path.basename(cmdline.outfile)
    rstree = et.ElementTree(rsroot)
    rstree.write(tmpoutfile,encoding='UTF-8',xml_declaration=True)
    tmplogfile = os.path.basename(cmdline.logfile)
    logtree = et.ElementTree(logroot)
    logtree.write(tmplogfile,encoding='UTF-8',xml_declaration=True)

    # change back to original cwd
    os.chdir(cwd)

    # copy out pretty printed file using xmllint
    finaloutfile = codecs.open(cmdline.outfile,"w",encoding='utf-8')
    subprocess.call(['xmllint','--format',intpath+tmpoutfile],
                    stdout=finaloutfile)
    finallogfile = codecs.open(cmdline.logfile,"w",encoding='utf-8')
    subprocess.call(['xmllint','--format',intpath+tmplogfile],
                    stdout=finallogfile)
    finaloutfile.flush()
    finallogfile.flush()
    return 0
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Import JSON files containing extracted LMs into MetaNet LM database")
    parser.add_argument('filenamelist',
                        help="Text file containing JSON file(s) to import")
    parser.add_argument('-l','--lang', help="Language of input files.",
                        required=True)
    parser.add_argument('-f','--force',
                        action='store_true',
                        help="Re-import files already imported")
    parser.add_argument('-e','--from-empty',dest='fromempty',action='store_true',
                        help='Assume population from empty DB.')
    parser.add_argument('-u','--mdb-user',dest='mdbuser',default='mdbuser',
                        help='MetaNet LM database username')
    parser.add_argument('-p','--mdb-pw',dest='mdbpw',default=None,required=True,
                        help='MetaNet LM database password')
    parser.add_argument('-n','--mdb-name',dest='mdbname',
                        help='MetaNet LM database name. Default: lang + mnlm')
    parser.add_argument('-s','--single',action='store_true',
                        help='Process single JSON file (input file is JSON)')
    parser.add_argument('-v','--verbose',action='store_true',
                        help='Display more status messages')
    parser.add_argument('-c','--concept-only',dest='conceptonly',action='store_true',
                        help='Import LMs only if they map to concepts')
    parser.add_argument('-d','--doc-uref',dest='docureffield', default='name',
                        help='Reference documents by given field.')
    cmdline = parser.parse_args()
    
    # proc title manipulation to hide PW
    pstr = setproctitle.getproctitle()
    pstr = re.sub(ur'(-p|--mdb-pw)(=|\s+)(\S+)',ur'\1\2XXXX',pstr)
    setproctitle.setproctitle(pstr)
    
    # this routine has to write its own files
    msgformat = '%(asctime)-15s - %(message)s'
    dateformat = '%Y-%m-%dT%H:%M:%SZ'
    if cmdline.verbose:
        deflevel = logging.INFO
    else:
        deflevel = logging.WARN
    logging.basicConfig(format=msgformat, datefmt=dateformat, level=deflevel)
        
    if cmdline.mdbname:
        mdbname = cmdline.mdbname
    else:
        mdbname = cmdline.lang + 'mnlm'

    lmdb = MetaNetLMDB(socket='/tmp/mysql.sock',
                       user=cmdline.mdbuser,
                       passwd=cmdline.mdbpw,
                       dbname=mdbname)
    if cmdline.single:
        flist = [cmdline.filenamelist]
    else:
        flist = codecs.open(cmdline.filenamelist,encoding='utf-8')
        
    for line in flist:
        jfile = line.strip()
        (dirname,basename) = os.path.split(jfile)
        if not dirname:
            dirname = '.'
        cmpfname = '%s/.imported.%s.tmp' % (dirname,basename)
        if not cmdline.force:
            if os.path.exists(cmpfname):
                logging.warn('Skipping %s because already imported.',jfile)
                continue
        try:
            logging.info('start importing %s', jfile)
            jdata = mnjson.loadfile(jfile)
            lmdb.importLMs(jdata, cmdline.fromempty, cmdline.conceptonly, cmdline.docureffield)
            open(cmpfname, 'a').close()
            logging.info('end importing %s', jfile)
        except:
            logging.error(traceback.format_exc())
            logging.error("Error parsing %s",jfile)
def main():
    """
	Compare m4detect output file to expected output from gsdb2mndetectinput.py
	"""
    global METAPHORPREF
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="Compares gold standard answers CSV with XML and JSON "\
        "extractor output",
        epilog="")

    # required (positional) args
    parser.add_argument("-g",
                        "--gold",
                        dest="goldcsv",
                        help="CSV files containing gold standard answers",
                        required=True)
    parser.add_argument("-x",
                        "--xml",
                        dest="xmlfile",
                        help="XML file containing extractor output",
                        required=True)
    parser.add_argument("-j",
                        "--json",
                        dest="jsonfile",
                        help="JSON file containing extractor output",
                        required=True)
    parser.add_argument("-o",
                        "--outputfile",
                        dest="outputfile",
                        help="CSV file containing comparison output")
    parser.add_argument("-v",
                        "--verbose",
                        action='store_true',
                        help="Display verbose error messages")
    cmdline = parser.parse_args()

    logLevel = logging.WARN
    if cmdline.verbose:
        logLevel = logging.INFO
    logging.basicConfig(level=logLevel,
                        format='%(asctime)s %(levelname)s %(message)s')

    if not cmdline.outputfile:
        cmdline.outputfile = 'comp-' + cmdline.goldcsv

    xdoc = minidom.parse(cmdline.xmlfile)
    csvfile = open(cmdline.goldcsv, 'rb')
    cdoc = csv.reader(csvfile, encoding='utf-8')
    jdata = mnjson.loadfile(cmdline.jsonfile)
    lang = jdata['lang']
    METAPHORPREF = METAPHORPREF % (lang)
    sentences = jdata['sentences']

    sid2sent = {}
    for sent in sentences:
        sid2sent[sent['id']] = sent['idx']

    testsetid = '%sdetect' % lang

    # Pre-process GOLD standard answers, there will be a row for everything
    # including negatives
    goldlmrows = {}
    i = 0
    for row in cdoc:
        i += 1
        if i == 1:
            header = list(row)
        else:
            rowdict = {}
            for k in range(len(row)):
                rowdict[header[k]] = row[k]
            sentId = '%s:%s%03d:%s' % (testsetid, lang, int(
                rowdict['ti_num']), rowdict['sent_num'])
            if sentId in goldlmrows:
                goldlmrows[sentId].append(rowdict)
            else:
                goldlmrows[sentId] = [rowdict]

    # Parse xml file: Our system's answers
    # An entry will exist only for answers (not for negatives)
    metads = xdoc.getElementsByTagName("metad:Result")

    testresponses = {}
    for metad in metads:
        tId = unicode(metad.getAttribute("testItemId")).encode('utf8')
        sId = unicode(metad.getAttribute("sentenceId")).encode('utf8')
        LmTT = unicode(
            metad.getElementsByTagName("metad:LmTargetText")
            [0].firstChild.data).encode('utf8')
        LmST = unicode(
            metad.getElementsByTagName("metad:LmSourceText")
            [0].firstChild.data).encode('utf8')
        sentId = '%s:%s:%s' % (testsetid, tId, sId)
        sent = sentences[sid2sent[sentId]]
        # find the LM
        theLM = None
        if LmTT and LmST:
            for lm in sent['lms']:
                if (lm['target']['lemma'] == LmTT) and (lm['source']['lemma']
                                                        == LmST):
                    theLM = lm
                    break
        else:
            logging.error('XML parsing error: a <Result> with no lemmas')
            # negative example

        if sentId in testresponses:
            testresponses[sentId].append({
                'target': LmTT,
                'source': LmST,
                'lm': theLM
            })
        else:
            testresponses[sentId] = [{
                'target': LmTT,
                'source': LmST,
                'lm': theLM
            }]

    #logging.warning('testresponses:\n%s',pprint.pformat(testresponses))
    ofile = codecs.open(cmdline.outputfile, "w", encoding='utf-8')

    truepos = 0
    goldcount = 0
    rcount = 0

    #logging.info(u'testresponses:\n%s',pprint.pformat(testresponses))
    #sys.exit(1)

    printCompRow(ofile, None, header=True)

    # iterating through gold rows
    for sentId in sorted(goldlmrows.keys()):
        goldrows = goldlmrows[sentId]
        responserows = testresponses.get(sentId)
        countedresponses = set()

        if (len(goldrows) == 1) and (not goldrows[0]['tform']) and (
                not goldrows[0]['sform']):
            # it's a negative example.
            if not responserows:
                # we correctly got nothing
                logging.debug(u'true negative: gold=%s\n\n\nresponse=%s',
                              pprint.pformat(goldrows),
                              pprint.pformat(responserows))
                printCompRow(ofile, sentId, gold=goldrows[0], mode='trueneg')
        else:
            for grow in goldrows:
                goldcount += 1
                try:
                    if grow['tspan'] and grow['sspan']:
                        gtspan = tuple(
                            [int(cpos) for cpos in grow['tspan'].split(u',')])
                        gsspan = tuple(
                            [int(cpos) for cpos in grow['sspan'].split(u',')])
                    else:
                        gtspan = (-1, -1)
                        gsspan = (-1, -1)
                except ValueError:
                    logging.error(u'error extracting spans: %s',
                                  pprint.pformat(grow))
                    raise
                foundMatch = False
                if responserows:
                    for i, rrow in enumerate(responserows):
                        if i in countedresponses:
                            continue
                        rtspan = (rrow['lm']['target']['start'],
                                  rrow['lm']['target']['end'])
                        rsspan = (rrow['lm']['source']['start'],
                                  rrow['lm']['source']['end'])
                        if overlaps(gtspan, rtspan) and overlaps(
                                gsspan, rsspan):
                            # it's a match
                            logging.debug(
                                u'true positive: gold=%s\n\n\nresponse=%s',
                                pprint.pformat(goldrows),
                                pprint.pformat(responserows))
                            printCompRow(ofile,
                                         sentId,
                                         gold=grow,
                                         result=rrow,
                                         mode='truepos')
                            countedresponses.add(i)
                            foundMatch = True
                            truepos += 1
                            rcount += 1
                        else:
                            logging.debug(
                                u'no overlap: %s <> %s, and %s <> %s',
                                pprint.pformat(gtspan), pprint.pformat(rtspan),
                                pprint.pformat(gsspan), pprint.pformat(rsspan))
                if not foundMatch:
                    # false negative
                    logging.debug(u'false negative: gold=%s\n\n\nresponse=%s',
                                  pprint.pformat(goldrows),
                                  pprint.pformat(responserows))
                    printCompRow(ofile, sentId, gold=grow, mode='falseneg')

        # remaining responserows had no match
        if not responserows:
            continue
        for i, rrow in enumerate(responserows):
            if i in countedresponses:
                continue
            logging.debug('false positive: gold=%s\n\n\nresponse=%s',
                          pprint.pformat(goldrows),
                          pprint.pformat(responserows))
            printCompRow(ofile,
                         sentId,
                         gold=goldrows[0],
                         result=rrow,
                         mode='falsepos')
            rcount += 1
            # these had no gold match: false positives

    recall = 0.0
    precision = 0.0
    if goldcount > 0:
        recall = float(truepos) / float(goldcount)
        print "Recall: %d / %d = %.4f" % (truepos, goldcount, recall)
    if rcount > 0:
        precision = float(truepos) / float(rcount)
        print "Precision: %d / %d = %.4f" % (truepos, rcount, precision)
    if recall and precision:
        print "Accuracy: %.4f" % (recall * precision)

    return 0
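# Sketch (assumption, not the original helper): the matching loop above relies
# on an overlaps() function defined elsewhere in this script.  Assuming spans
# are (start, end) character-offset tuples and (-1, -1) marks an unknown span,
# the overlap criterion could be implemented as follows.
def overlaps(span1, span2):
    """Return True if two (start, end) character spans intersect."""
    (s1, e1), (s2, e2) = span1, span2
    if s1 < 0 or s2 < 0:
        # spans that could not be extracted are never counted as matches
        return False
    return s1 < e2 and s2 < e1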