Example 1
def main():
    """
    Runs affect computation.
    """
    # ------------------------------------------------------------------- #
    # INITIALIZATION

    m4test = ix.IARPATestCommand(
        'metaa', 'Computes LM affect in terms of polarity and intensity.')
    cmdline = m4test.parseCmdLine()
    jdata = m4test.getJSON()

    # ------------------------------------------------------------------- #
    # MAIN APPLICATION LOGIC

    lang = jdata['lang']
    if lang != 'fa':
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(jdata['sentences'])
        tt.run(jdata['sentences'])
        tt.processLMs(jdata['sentences'])

    # Spanish function words (articles and prepositions) stripped from
    # multiword lemmas before retrying the affect lookup
    esfilterwords = set([
        'a', 'desde', 'detrás', 'ante', 'en', 'segun', 'bajo', 'entre', 'sin',
        'con', 'hacia', 'sobre', 'contra', 'hasta', 'la', 'el', 'los', 'tras',
        'de', 'por', 'para'
    ])

    aff_system = AffectLookup(jdata['lang'], cmdline.extdir)
    for sent in jdata['sentences']:
        for lm in sent['lms']:
            tg = lm['target']
            sc = lm['source']
            tlemma = tg['lemma'] if 'lemma' in tg else tg['form']
            slemma = sc['lemma'] if 'lemma' in sc else sc['form']
            affect = aff_system.getLMAffect(tlemma.lower(), slemma.lower())
            if affect == 999 and lang == 'es':
                # 999 marks a failed lookup: strip Spanish function words
                # from any multiword lemma and retry once.
                if ' ' in tlemma:
                    tlemma = u' '.join(w for w in tlemma.split()
                                       if w not in esfilterwords)
                if ' ' in slemma:
                    slemma = u' '.join(w for w in slemma.split()
                                       if w not in esfilterwords)
                affect = aff_system.getLMAffect(tlemma.lower(),
                                                slemma.lower())
            lm['affect'] = affect

    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    m4test.writeOutput(jdata)
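
The Spanish retry is the only language-specific branch above. A minimal standalone sketch of that fallback; the function name and the trimmed word set are illustrative, not part of the original module:

# Drop function words from a multiword lemma before retrying the
# affect lookup (trimmed, illustrative subset of esfilterwords).
ES_FILTER = {'contra', 'la', 'de', 'por'}

def strip_function_words(lemma, filterwords=ES_FILTER):
    """Remove function words from a multiword lemma."""
    if ' ' not in lemma:
        return lemma
    return ' '.join(w for w in lemma.split() if w not in filterwords)

print(strip_function_words('guerra contra la pobreza'))  # guerra pobreza
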
Example 2
def computePOS(lang, sentences):
    """ Compute POS tags and add them under a 'word' node in each sentence.
    The 'word' node is a list of dicts, each describing one word in the
    sentence.  Uses TreeTagger for EN, ES, and RU, and a custom HMM tagger
    for FA.

    :param lang: language code ('en', 'es', 'ru', or 'fa')
    :type lang: str
    :param sentences: list of sentence dicts
    :type sentences: list
    """
    if lang == 'fa':
        pt = PersianPOSTagger()
        for sent in sentences:
            sent['ctext'] = pt.cleanText(sent['text'])
            tags = pt.run_hmm_tagger(sent['ctext'])
            #print 'sentence %d: %s\n%s' % (sidx, sent['text'],pprint.pformat(tags))
            sent['word'] = pt.getWordList(sent['text'], sent['ctext'], tags,
                                          'pos', 'lem')
    else:
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(sentences)
        tt.run(sentences)
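
For orientation, a sketch of the shape one sentence dict plausibly takes after tagging; the 'pos' and 'lem' key names come from the getWordList call above, while 'form' and all values are invented for illustration:

# Assumed post-tagging shape of a single sentence dict (values invented):
sent = {
    'text': 'The economy is a sick patient.',    # original sentence
    'ctext': 'The economy is a sick patient .',  # cleaned text
    'word': [                                    # one dict per token
        {'form': 'economy', 'pos': 'NN', 'lem': 'economy'},
        {'form': 'sick', 'pos': 'JJ', 'lem': 'sick'},
        # ... one entry per remaining token
    ],
}
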
Example 3
    def __init__(self, exdir=None, wldir=None, cxndir=None, verbose=False):
        if exdir:
            self.exdir = exdir
        elif 'MNEXTRACTPATH' in os.environ:
            self.exdir = os.environ['MNEXTRACTPATH']
        else:
            raise ValueError(
                'no extraction dir: pass exdir or set MNEXTRACTPATH')
        self.wldir = self.exdir + '/wordlists'
        self.cxndir = self.exdir + '/cxns'
        if wldir:
            self.wldir = wldir
        if cxndir:
            self.cxndir = cxndir
        self.verbose = verbose

        # per-language tagger and wordlist/cxn tables (initialized here
        # so the snippet is self-contained; the loop below assigns into
        # them by language key)
        self.taggers = {}
        self.twlist_by_lang = {}
        self.twlists_by_lang = {}
        self.swlists_by_lang = {}
        self.old_twlist_by_lang = {}
        self.old_swlist_by_lang = {}
        self.tword_rank_by_lang = {}
        self.cxns_by_lang = {}
        self.cxn_ranks_by_lang = {}

        # set up taggers for all supported langs and pre-load all
        # wordlists and cxn lists
        for l in TAGGER_LANGNAME:
            if TAGGER_LANGNAME[l]:
                self.taggers[l] = mnjson.MNTreeTagger(l)
            else:
                self.taggers[l] = None
            wldir = self.wldir + '/' + l + '/'
            cxndir = self.cxndir + '/' + l + '/'
            tfile = wldir + "target." + DOMAIN
            sfile = wldir + "source." + DOMAIN
            cfile = cxndir + "cxns." + DOMAIN
            if os.path.exists(tfile) and os.path.exists(sfile):
                (dtwlist, dtwlists,
                 dswlists) = self.get_domained_wordlists(tfile, sfile)
                self.twlist_by_lang[l] = dtwlist
                self.twlists_by_lang[l] = dtwlists
                self.swlists_by_lang[l] = dswlists
                self.old_twlist_by_lang[l] = self.get_wordlist(tfile)
                self.old_swlist_by_lang[l] = self.get_wordlist(sfile)
                self.tword_rank_by_lang[l] = self.get_ranked_wordlist(tfile)
            if os.path.exists(cfile):
                (dcxns, dcxn_ranking) = self.get_cxns(cfile)
                self.cxns_by_lang[l] = dcxns
                self.cxn_ranks_by_lang[l] = dcxn_ranking
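
The constructor above builds every resource path by string concatenation from exdir. A sketch of the equivalent layout with os.path.join; the function name and the 'general' DOMAIN value are assumptions, not taken from the module:

import os

def resource_paths(exdir, lang, domain='general'):
    # Mirrors the path construction in __init__ above; 'general' is an
    # assumed value for DOMAIN.
    wldir = os.path.join(exdir, 'wordlists', lang)
    cxndir = os.path.join(exdir, 'cxns', lang)
    return {
        'targets': os.path.join(wldir, 'target.' + domain),
        'sources': os.path.join(wldir, 'source.' + domain),
        'cxns': os.path.join(cxndir, 'cxns.' + domain),
    }

print(resource_paths('/opt/mnextract', 'en'))
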
Example 4
def main():
    """
    Runs LM to concept mapping.
    """
    global REMAP_CONCEPT
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    m4test = ix.IARPATestCommand('metam',
                                 'Map LMs to target and source concepts.')

    # fetch the arg parser in case custom cmdline parameters are needed
    aparser = m4test.getArgParser()
    cmdline, config = m4test.parseCmdLineConfig('m4mapping')
    in_jdata = m4test.getJSON()

    # ------------------------------------------------------------------- #
    # MAIN APPLICATION LOGIC

    lang = in_jdata['lang']
    mappingsystems = config.getList('mappingsystems', lang=lang)
    if not mappingsystems:
        mappingsystems = ['CNMS', 'DSMS', 'DLS']
    secondaryMappingThreshold = config.getFloat('secondarymappingthreshold',
                                                lang=lang,
                                                default=0.1)
    secondaryMinScore = config.getFloatFromComp('cnms',
                                                'secondaryminscore',
                                                lang=lang,
                                                default=0.1)
    mappingLimit = config.getIntFromComp('cnms',
                                         'sourcelimit',
                                         lang=lang,
                                         default=2)
    if secondaryMappingThreshold:
        m4test.setSecondaryMappingThreshold(secondaryMappingThreshold)
    conceptrank = config.getListFromComp('cnms',
                                         'targetconceptranking',
                                         lang=lang)
    expansionTypes = config.getListFromComp('cnms',
                                            'expansiontypes',
                                            lang=lang)
    expansionScoreScale = config.getFloatFromComp('cnms',
                                                  'expansionscorescale',
                                                  lang=lang,
                                                  default=1.0)
    dsmsdefaultrank = config.getIntFromComp('dsms',
                                            'defaultrank',
                                            lang=lang,
                                            default=2)
    dsmsdefaultscore = config.getFloatFromComp('dsms',
                                               'defaultscore',
                                               lang=lang,
                                               default=0.10)
    dsmsScoreStr = ':%s:%s' % (dsmsdefaultrank, dsmsdefaultscore)

    # initialize CNMS system
    # this is always used at least for target concept lookups
    cnmap = ConceptualNetworkMapper(in_jdata['lang'],
                                    cmdline.cachedir,
                                    useSE=cmdline.useSE,
                                    govOnly=True,
                                    disableFN=True,
                                    targetConceptRank=conceptrank,
                                    expansionTypes=expansionTypes,
                                    expansionScoreScale=expansionScoreScale)

    # ------------------------------------------------------------------- #
    # Invoke here the parser and add tags to the sentences element of the JSON input
    in_sentences = in_jdata['sentences']

    # run POS/Lemmatizer for all languages except Persian (CNMS)
    if lang != 'fa':
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(in_sentences)
        tt.run(in_sentences)
        tt.processLMs(in_sentences)

    # run dependency parser for EN, RU, and ES (needed by DSMS)
    if lang in ('en', 'ru', 'es') and 'DSMS' in mappingsystems:
        ss = [s['ctext'] for s in in_sentences]
        logger.info('begin parsing sentence block, lang: %s, len: %d' %
                    (lang, len(ss)))
        out_jdata = parse(in_jdata['lang'], ss)
        logger.info('end parsing sentence block')
        mapping = Assigner(lang)
    else:
        out_jdata = in_jdata

    currentTestItem = ''
    parser_name = parserdesc(lang).name
    for in_sent, parsed_sent in zip(in_sentences, out_jdata['sentences']):
        testItemId = in_sent['id'].split(u':')[1]
        if testItemId != currentTestItem:
            currentTestItem = testItemId
            logger.warning('mapping sentences in %s', currentTestItem)

        if 'lms' not in in_sent:
            continue

        for lm in in_sent['lms']:
            source, target = lm['source'], lm['target']
            # ===============================================================
            # TARGET CONCEPT MAPPING: ALWAYS USE CNMS
            # ===============================================================
            cnmap.runTargetMapping(lm)
            lm['extractor'] = 'CNMS'

            # remap targetconcepts if needed.  this is a hack to deal with
            # IARPA's inconsistency about concept coverage
            if target.get('concept') in REMAP_CONCEPT:
                target['concept'] = REMAP_CONCEPT[target['concept']]

            # ================================================================
            # CNMS
            # ================================================================
            if 'CNMS' in mappingsystems:
                cnmap.runSourceMapping(lm,
                                       sourceMappingLimit=mappingLimit,
                                       minSecondaryScore=secondaryMinScore)

            # ================================================================
            # DSMS MAPPING SYSTEM (formerly KMS)
            # ================================================================

            if ((source.get('concept') in (None, 'NULL', 'NONE', ''))
                    and ('DSMS' in mappingsystems)
                    and (lang in ('en', 'ru', 'es'))):
                target_f = (target['form'] if 'form' in target
                            else target['lemma'])
                source_f = (source['form'] if 'form' in source
                            else source['lemma'])
                found_lms = False

                words = sorted(parsed_sent['word'], key=lambda w: w['idx'])
                twords = sorted(in_sent['word'], key=lambda w: w['idx'])

                #                 logger.info(pformat(in_sent['word']))

                # Try looking for a relation first
                relations = parsed_sent[parser_name]['relations']
                found_lms = find_lm5(target_f, source_f, relations)

                if not found_lms:
                    found_lms = find_lm3(target_f, source_f, twords)

#                 if not found_lms:
#                     found_lms = find_lm4(target, source, words)

                logger.debug('DSMS: found_lms: %s' % found_lms)

                if found_lms:
                    target_l, source_l, _r = found_lms[0]
                    target['rlemma'] = target_l
                    source['rlemma'] = source_l
                    if _r != '-':
                        r = _r.split('.')[0] if '.' in _r else _r
                        dimensions = mapping.assign2(source_l, r)
                    else:
                        dimensions = mapping.gassign(source_l, target_l)

                    scon = dimensions[0].upper() if dimensions else None
                else:
                    scon = None
                    target_l = (target['lemma'] if 'lemma' in target
                                else target['form'])
                    source_l = (source['lemma'] if 'lemma' in source
                                else source['form'])
                    # dd = ', '.join(' '.join(d) for d in deprels(words))
                    # log('could not find %s - %s in %s' %
                    #     (target_f, source_f, dd))

                source['concept'] = (scon + dsmsScoreStr) if scon else 'NONE'
                if scon:
                    if source.get('extractor'):
                        source['extractor'] += ':DSMS'
                    else:
                        source['extractor'] = 'DSMS'

    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    m4test.writeOutput(in_jdata)
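
The DSMS branch encodes a default rank and score directly into the concept string via dsmsScoreStr. A small illustration using the default config values read above; the concept name is invented:

# How source['concept'] is assembled in the DSMS branch.
dsmsdefaultrank, dsmsdefaultscore = 2, 0.10   # config defaults above
dsmsScoreStr = ':%s:%s' % (dsmsdefaultrank, dsmsdefaultscore)
scon = 'DISEASE'                              # illustrative concept
print(scon + dsmsScoreStr)                    # DISEASE:2:0.1
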
Example 5
def main():
    """
    Runs source dimension identification.
    """
    # ------------------------------------------------------------------- #
    # INITIALIZATION
    m4test = ix.IARPATestCommand(
        'metas', 'Map LMs with concepts to source dimensions.')
    cmdline = m4test.parseCmdLine()
    jdata = m4test.getJSON()

    # Run the parser
    #     parsed_jdata = parse(jdata['lang'],  [s['text'] for s in jdata['sentences']])

    # ------------------------------------------------------------------- #
    # MAIN APPLICATION LOGIC

    lang = jdata['lang']
    in_sentences = jdata['sentences']
    if lang != 'fa':
        tt = mnjson.MNTreeTagger(lang)
        tt.cleanText(in_sentences)
        tt.run(in_sentences)
        tt.processLMs(in_sentences)

    cnmap = ConceptualNetworkMapper(lang, cmdline.cachedir)
    for sent in jdata['sentences']:
        if 'lms' not in sent:
            continue
        for lm in sent['lms']:
            # Note that dimension here is of the form
            # CONCEPT.Dimension, e.g. DISEASE.Type.  An UNRESOLVED problem
            # for the GMR system is what happens when the CONCEPT part we
            # calculated doesn't match what IARPA provides in the XML.

            # INTEGRATE NEW SYSTEM HERE!

            source, target = lm['source'], lm['target']
            source_f = source['form']
            target_f = target['form']
            source_l = source['lemma'] if 'lemma' in source else source['form']
            target_l = target['lemma'] if 'lemma' in target else target['form']
            source_pos = source['pos'] if 'pos' in source else ''
            target_pos = target['pos'] if 'pos' in target else ''

            sschemas, sourceconceptdim = cnmap.getSourceSchemasAndDimensionFromLemma(
                source_l, source_pos)
            if not sourceconceptdim:
                sschemas, sourceconceptdim = cnmap.getSourceSchemasAndDimensionFromLemma(
                    source_f)

            if not USE_DARIOS or lang == 'fa':
                source['dimension'] = sourceconceptdim
                lm['extractor'] = 'WMS'
            else:
                source_c = source['concept'].lower()
                ((_, dim, sdim),
                 confident) = subdim_match(_lang[lang], source_l, target_l,
                                           source_c)
                sd_pair = u'%s.%s' % (dim.upper(), capwords(sdim, '_'))
                source['dimension'] = (sd_pair if confident
                                       else sourceconceptdim)
                lm['extractor'] = 'DMS' if confident else 'WMS'

            source['schemas'] = sschemas

    # ------------------------------------------------------------------- #
    # OUTPUT FILE GENERATION
    m4test.writeOutput(jdata)
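
The DMS branch formats its result as CONCEPT.Dimension. A short illustration of how sd_pair is built, reusing the DISEASE.Type example from the comments above:

from string import capwords

dim, sdim = 'disease', 'type'
sd_pair = u'%s.%s' % (dim.upper(), capwords(sdim, '_'))
print(sd_pair)  # DISEASE.Type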