def setUp(self):
     """Build the fixtures shared by the test methods.

     Creates four parsed expressions together with their ``ExtendedExpr``
     wrappers, a ``BilingualPhraseSet`` holding a single entry, two
     ``ParallelMWE`` objects and one ``BilingualExpr``.  It also assigns a
     synonym dictionary to the class attribute ``ParallelMWE.synonymDict``.
     """
     unittest.TestCase.setUp(self)

     # Raw expressions: a plain one, one with a wildcard leaf, one with a
     # '?' leaf plus wildcards, and a full sentence with a string literal.
     self.myexpr = pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
     self.myexprw = pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
     self.myexprq = pgf.readExpr("CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))")
     self.myexprs = pgf.readExpr('(PredVP (DetCN (DetQuant IndefArt NumSg) (PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) (UseQuantPN DefArt (SymbPN (MkSymb "U"))))) (UseComp (CompNP (MassNP (UseN dollar_N)))))')

     # Wrapped counterparts; the second constructor argument is always None.
     self.extExpr = ExtendedExpr(self.myexpr, None)
     self.extExprW = ExtendedExpr(self.myexprw, None)
     self.extExprQ = ExtendedExpr(self.myexprq, None)
     self.extExprS = ExtendedExpr(self.myexprs, None)

     # Phrase set with one "source ||| target ||| alignment" entry.
     self.bilingualPhraseSet = BilingualPhraseSet()
     self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")

     # Parallel MWE without numbered wildcards.
     self.mwe1 = ParallelMWE()
     self.mwe1.parse("( MassNP ( UseN safety_N ) ) | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )")

     # Parallel MWE with two numbered wildcards on each side.
     self.mwe2 = ParallelMWE()
     self.mwe2.parse("( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) ) | ( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )")

     # Bilingual expression parsed with ignoreFreq=True.
     self.bilphrase = BilingualExpr()
     self.bilphrase.parse("( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) )  | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )", ignoreFreq=True)

     # Class-level state: this assignment is visible outside this test.
     synonyms = {"politics_N": set(["policy_N"])}
     ParallelMWE.synonymDict = synonyms
 def testExtractCandidateMWEs(self):
     """Check candidate MWE extraction on a pair with identical sides.

     Both sides of the bilingual expression are ``self.extExpr``.  Exactly
     four candidates are expected, and each candidate -- once its first two
     " | "-separated fields are dropped -- must parse into a ParallelMWE
     whose sides are equal.
     """
     pair = BilingualExpr()
     pair.set_exprs(self.extExpr, self.extExpr)
     self.assertTrue(pair.is_equal_sides())

     candidates = pair.extract_candidate_mwes()
     self.assertEqual(len(candidates), 4)
     for candidate in candidates:
         parsed = ParallelMWE()
         # Skip the two leading fields; only the remainder is parsed.
         fields = candidate.split(" | ")
         remainder = " | ".join(fields[2:])
         parsed.parse(remainder)
         self.assertTrue(parsed.is_equal_sides())
import gzip
import sys

if __name__ == "__main__":
    # Script entry point: parse the command line, optionally load a synonym
    # dictionary and a gzip file of additional reference MWEs, then read
    # lines from standard input.
    parser = argparse.ArgumentParser(description='Chooses rules.')
    parser.add_argument('--use_synonyms')
    parser.add_argument('--inverse_synonyms',action='store_true')
    parser.add_argument('--additional_references')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args(sys.argv[1:])
    set_debug(args.debug)
    
    #read synonyms
    if args.use_synonyms:
        ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms)
    
    mweset=ParallelMWESet()
    
    if args.additional_references:
        # Each line of the gzip file is parsed as one ParallelMWE and added to
        # the set.  Lines are handed to parse() exactly as read (not stripped).
        myfile=gzip.open(args.additional_references)
        try:
            for line in myfile:
                mwe=ParallelMWE()
                mwe.parse(line)
                mweset.add(mwe)
        finally:
            # Close the handle even when parse()/add() raises, so the file
            # is not left open on the error path.
            myfile.close()
    
    #read mwes
    mwelist=list()
    for line in sys.stdin:
        line=line.strip()
class ExtendedExprTest(unittest.TestCase):
    """Tests for ExtendedExpr, BilingualExpr, BilingualPhraseSet and ParallelMWE."""
    
    def setUp(self):
        """Build the expressions, phrase set and MWEs used by the tests.

        Note that this also assigns ``ParallelMWE.synonymDict``, which is a
        class attribute and therefore stays set after the test finishes.
        """
        unittest.TestCase.setUp(self)
        self.myexpr=pgf.readExpr("AdjCN (PositA crucial_A) (UseN item_N)")
        self.myexprw=pgf.readExpr("AdjCN (PositA crucial_A) (UseN wildcard_1)")
        self.myexprq=pgf.readExpr("CompoundCN ? wildcard_3 (AdjCN (PositA wildcard_1) (UseN wildcard_2))")
        self.myexprs=pgf.readExpr('(PredVP (DetCN (DetQuant IndefArt NumSg) (PossNP (AdjCN (PositA complete_A) (UseN collapse_N)) (UseQuantPN DefArt (SymbPN (MkSymb "U"))))) (UseComp (CompNP (MassNP (UseN dollar_N)))))')
        self.extExpr=ExtendedExpr(self.myexpr,None)
        self.extExprW=ExtendedExpr(self.myexprw,None)
        self.extExprQ=ExtendedExpr(self.myexprq,None)
        self.extExprS=ExtendedExpr(self.myexprs,None)
        self.bilingualPhraseSet=BilingualPhraseSet()
        self.bilingualPhraseSet.add("NATO ||| la OTAN ||| 0-0 0-1")
        
        self.mwe1=ParallelMWE()
        self.mwe1.parse("( MassNP ( UseN safety_N ) ) | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( UseN security_N ) )")
        
        self.mwe2=ParallelMWE()
        self.mwe2.parse("( PossNP ( UseN wildcard_1 ) ( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) ) ) | ( PossNP ( UseN wildcard_1 ) ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) ) )")
        
        self.bilphrase=BilingualExpr()
        self.bilphrase.parse("( MassNP ( AdjCN ( PositA wildcard_2 ) ( UseN politics_N ) ) )  | ( DetCN ( DetQuant wildcard_IGNORE wildcard_IGNORE ) ( AdjCN ( PositA wildcard_2 ) ( UseN policy_N ) ) )", ignoreFreq=True)
        
        synDict=dict()
        synDict["politics_N"]=set(["policy_N"])
        ParallelMWE.synonymDict=synDict
    
    def testNonLeafFunList(self):
        """The non-leaf functions of the plain expression are listed in order."""
        listOfFuns=self.extExpr.get_non_leaf_funtions()
        # assertEqual instead of a bare assert: it still runs under -O and
        # reports both values when the comparison fails.
        self.assertEqual(listOfFuns, ['AdjCN', 'PositA', 'UseN'])
        
    def testLeafFunList(self):
        """Leaf functions are listed in order for the plain, sentence and '?' expressions."""
        listOfFuns=self.extExpr.get_leaf_functions()
        self.assertEqual(listOfFuns, ['crucial_A','item_N'])
        
        # The string literal "U" in the sentence is expected as 'String_U'.
        listOfFuns=self.extExprS.get_leaf_functions()
        self.assertEqual(listOfFuns , ['IndefArt','NumSg','complete_A','collapse_N','DefArt','String_U','dollar_N'])
        
        listOfFuns=self.extExprQ.get_leaf_functions()
        self.assertEqual(listOfFuns, ['?','wildcard_3','wildcard_1','wildcard_2']) 
    
    def testWildcardFunList(self):
        """Only the wildcard leaf is reported for the wildcard expression."""
        listOfFuns=self.extExprW.get_wildcard_leaf_functions()
        self.assertEqual(listOfFuns,['wildcard_1'])
    
    def testExtractCandidateMWEs(self):
        """A pair with identical sides yields four candidates with equal sides."""
        bilExpr=BilingualExpr()
        bilExpr.set_exprs(self.extExpr,self.extExpr)
        self.assertTrue(bilExpr.is_equal_sides())
        
        candidateMWEs=bilExpr.extract_candidate_mwes()
        self.assertEqual(len(candidateMWEs), 4)
        for mwestr in candidateMWEs:
            mwe =ParallelMWE()
            # The first two " | "-separated fields are dropped before parsing.
            mwe.parse(" | ".join(mwestr.split(" | ")[2:]))
            self.assertTrue(mwe.is_equal_sides())
    
    def testPrint(self):
        """str() of an ExtendedExpr can be re-read by pgf into an equal expression."""
        strrep=str(self.extExpr)
        myexpragain=pgf.readExpr(strrep)
        self.assertEqual(str(self.myexpr), str(myexpragain))
        
        strrep=str(self.extExprS)
        myexpragain=pgf.readExpr(strrep)
        self.assertEqual(str(self.myexprs), str(myexpragain))
    
    def testBilingualPhraseSet(self):
        """The full target phrase and each of its single words are found for "NATO"."""
        self.assertTrue(self.bilingualPhraseSet.contains_biligual_phrase("NATO", "OTAN"))
        self.assertTrue(self.bilingualPhraseSet.contains_biligual_phrase("NATO", "la OTAN"))
        self.assertTrue(self.bilingualPhraseSet.contains_biligual_phrase("NATO", "la"))
    
    def testCompositionally(self):
        """mwe1 must not be flagged as reproducing the politics/policy bilingual phrase."""
        self.assertFalse(self.mwe1.is_bilexpr_matched_or_reproduced(self.bilphrase).reproduced)
# Example #5
# 0
if __name__ == "__main__":
    # Script entry point: parse the command line, optionally load synonyms,
    # read parallel MWEs from standard input and then iterate over the gzip
    # file of bilingual expressions given by --bilingual_exprs (required).
    parser = argparse.ArgumentParser(description='select minimum amount of parallel MWEs to reproduce the bilingual expr.')
    parser.add_argument('--only_print_scores',action='store_true')
    parser.add_argument('--bilingual_exprs',required=True)
    parser.add_argument('--use_synonyms')
    parser.add_argument('--inverse_synonyms',action='store_true')
    parser.add_argument('--invert_synonym_direction',action='store_true')
    # No type= is given, so args.threshold is a string (default '2').
    parser.add_argument('--threshold',default='2')
    parser.add_argument('--debug', action='store_true')
    args = parser.parse_args(sys.argv[1:])
    
    set_debug(args.debug)
    
    if args.use_synonyms:
        ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms,args.invert_synonym_direction)
    
    mwes=list()
    #read MWEs
    # Every stdin line (stripped) is parsed as one ParallelMWE, in input order.
    for line in sys.stdin:
        line=line.strip()
        mwe=ParallelMWE()
        mwe.parse(line)
        mwes.append(mwe)
    
    # Taken from the first MWE only.
    # NOTE(review): raises IndexError when stdin is empty -- confirm that
    # callers always supply at least one MWE.
    reprlistofnonleafs=mwes[0].get_representative()
    
    bilExprs=list()
    #read bilingual exprs
    # NOTE(review): the handle returned by gzip.open() is never closed explicitly.
    for line in gzip.open(args.bilingual_exprs,'r'):
        line=line.strip()
#!/usr/bin/env python
# coding=utf-8
# -*- encoding: utf-8 -*-

from lib.abstractLearningLib import BilingualExpr, set_debug, \
    GFProbabilisticBilingualDictionary, ParallelMWE
import sys
import argparse

if __name__ == "__main__":
    # Script entry point: read bilingual expressions from standard input, one
    # per line, and print every candidate MWE extracted from each of them.
    #
    # Example input line (taken from a former debugging stub):
    #   1 | BaseNP (UsePN (SymbPN (MkSymb "Wilders"))) (DetCN (DetQuant (PossPron he_Pron) NumPl) (UseN supporter_N)) | BaseNP (UsePN (SymbPN (MkSymb "Wilders"))) (DetCN (DetQuant (PossPron it_Pron) NumPl) (UseN backer_N))
    cli = argparse.ArgumentParser(description='Chooses rules.')
    cli.add_argument('--use_synonyms')
    for switch in ('--inverse_synonyms', '--debug'):
        cli.add_argument(switch, action='store_true')
    options = cli.parse_args(sys.argv[1:])

    set_debug(options.debug)

    # The synonym dictionary is only loaded when a path was supplied.
    if options.use_synonyms:
        ParallelMWE.load_synonym_dict(options.use_synonyms, options.inverse_synonyms)

    for raw_line in sys.stdin:
        stripped = raw_line.strip()
        expression = BilingualExpr()
        expression.parse(stripped)
        for candidate in expression.extract_candidate_mwes():
            print(candidate)
 
 # Filter script: reads one parallel MWE per line from standard input and
 # evaluates the filters selected on the command line for each of them.
 # What is done with the final isValid value is outside this excerpt.
 parser = argparse.ArgumentParser(description='filter final MWEs')
 parser.add_argument('--different_sides',action='store_true')
 parser.add_argument('--contains_lexical',action='store_true')
 parser.add_argument('--not_contains_lexical',action='store_true')
 parser.add_argument('--contains_non_wildcard',action='store_true')
 parser.add_argument('--contains_wildcard',action='store_true')
 parser.add_argument('--not_contains_wildcard',action='store_true')
 parser.add_argument('--use_synonyms')
 parser.add_argument('--inverse_synonyms',action='store_true')
 args = parser.parse_args(sys.argv[1:])
 
 inputSource=sys.stdin
 
 # The synonym dictionary is only loaded when --use_synonyms was given.
 if args.use_synonyms:
     ParallelMWE.load_synonym_dict(args.use_synonyms, args.inverse_synonyms)
 
 for line in inputSource:
     line=line.strip()
     mwe=ParallelMWE()
     mwe.parse(line)
     
     # Starts True; each selected filter can only turn it False (AND-combined).
     isValid=True
     
     # --different_sides: reject MWEs for which is_equal_sides() is true.
     if args.different_sides:
         isValid = isValid and not mwe.is_equal_sides()
     
     # --contains_lexical: require at least one side (slexpr or tlexpr) to
     # return a non-empty result from get_open_leaf_functions().
     if args.contains_lexical:
         isValid = isValid and ( len(mwe.slexpr.get_open_leaf_functions())>0 or len(mwe.tlexpr.get_open_leaf_functions())>0 )
     
     if args.not_contains_lexical: