def main(argv):
    """Run the NER pipeline: add features to the data, then call the Java MEMM.

    argv[0] names the training file and argv[1] the test file.  Supplying
    any third argument passes '-print' to the Java program.  Whatever the
    Java program writes to standard output is printed when it finishes.
    """
    if len(argv) < 2:
        print('USAGE: python NER.py trainFile testFile')
        exit(0)

    # With no third argument an empty string is still handed to Java.
    print_flag = '-print' if len(argv) > 2 else ''

    factory = FeatureFactory()

    # Load both data sets first, then attach the features to each.
    raw_train = factory.readData(argv[0])
    raw_test = factory.readData(argv[1])
    train_features = factory.setFeaturesTrain(raw_train)
    test_features = factory.setFeaturesTest(raw_test)

    # Hand the augmented data to the Java side (it reads the .json names
    # listed in the command below).
    factory.writeData(train_features, 'trainWithFeatures')
    factory.writeData(test_features, 'testWithFeatures')

    command = [
        'java', '-cp', '../java/classes', '-Xmx2G', 'MEMM',
        'trainWithFeatures.json', 'testWithFeatures.json', print_flag,
    ]
    memm_stdout, _ = Popen(command, stdout=PIPE).communicate()

    print(memm_stdout)
Example #2
0
def test():
    """Build one address per feed type, print them, and pretty-print the
    change-feed and resolution-feed conversions."""
    import pprint
    from FeatureFactory import FeatureFactory

    # Features feed: fetched by reference and left unpopulated.
    features_factory = FeatureFactory.getInstance(FeatureType.ADDRESS, FeedType.FEATURES)
    feature_addr = features_factory.get(ref='one_feat')

    # Change feed: populated through its setters.
    change_factory = FeatureFactory.getInstance(FeatureType.ADDRESS, FeedType.CHANGEFEED)
    change_addr = change_factory.get(ref='two_chg')
    change_addr.setVersion(100)
    change_addr.setObjectType('Parcel')
    change_addr.setAddressNumber(100)
    change_addr.setAddressId(100)
    change_addr.setRoadName('Smith Street')

    # Resolution feed: populated through its setters.
    resolution_factory = FeatureFactory.getInstance(FeatureType.ADDRESS, FeedType.RESOLUTIONFEED)
    resolution_addr = resolution_factory.get(ref='three_res')
    resolution_addr.setChangeId(200)
    resolution_addr.setVersion(200)
    resolution_addr.setAddressNumber(200)
    resolution_addr.setRoadName('Jones Road')

    print('%s %s %s' % (feature_addr, change_addr, resolution_addr))

    change_result = change_factory.convert(change_addr, ActionType.UPDATE)
    resolution_result = resolution_factory.convert(resolution_addr, ApprovalType.UPDATE)

    for converted in (change_result, resolution_result):
        pprint.pprint(converted)
Example #3
0
def test():
    """Exercise the address factories of the three feeds.

    One address is fetched from each factory, the change-feed and
    resolution-feed ones are filled in through their setters, all three
    are printed, and the two UPDATE conversions are pretty-printed.
    """
    import pprint
    from FeatureFactory import FeatureFactory

    af1 = FeatureFactory.getInstance(FeatureType.ADDRESS, FeedType.FEATURES)
    a1 = af1.get(ref='one_feat')

    af2 = FeatureFactory.getInstance(FeatureType.ADDRESS, FeedType.CHANGEFEED)
    a2 = af2.get(ref='two_chg')
    # Setters are applied in the listed order.
    for setter, value in (
            (a2.setVersion, 100),
            (a2.setObjectType, 'Parcel'),
            (a2.setAddressNumber, 100),
            (a2.setAddressId, 100),
            (a2.setRoadName, 'Smith Street')):
        setter(value)

    af3 = FeatureFactory.getInstance(FeatureType.ADDRESS,
                                     FeedType.RESOLUTIONFEED)
    a3 = af3.get(ref='three_res')
    for setter, value in (
            (a3.setChangeId, 200),
            (a3.setVersion, 200),
            (a3.setAddressNumber, 200),
            (a3.setRoadName, 'Jones Road')):
        setter(value)

    print(' '.join(str(addr) for addr in (a1, a2, a3)))

    r2 = af2.convert(a2, ActionType.UPDATE)
    r3 = af3.convert(a3, ApprovalType.UPDATE)

    pprint.pprint(r2)
    pprint.pprint(r3)
Example #4
0
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    partId is accepted but not used in this function.  ch_aux is handed to
    FeatureFactory.readTestData to obtain the test data; the training data
    is read from "../data/train".

    Returns the standard output captured from the Java MEMM run with
    '-submit'.  The two temporary JSON files are removed afterwards, also
    when starting or running Java raises.
    """
    print('== Running your code ...')

    featureFactory = FeatureFactory()

    # read the train and test data
    trainData = featureFactory.readData("../data/train")
    testData = featureFactory.readTestData(ch_aux)

    # add the features
    trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData)
    testDataWithFeatures = featureFactory.setFeaturesTest(testData)

    # write the updated data into JSON files
    featureFactory.writeData(trainDataWithFeatures, "trainWithFeaturesSubmit")
    featureFactory.writeData(testDataWithFeatures, "testWithFeaturesSubmit")

    # run MEMM
    try:
        output = Popen([
            'java', '-cp', 'classes', '-Xmx1G', 'MEMM',
            'trainWithFeaturesSubmit.json', 'testWithFeaturesSubmit.json',
            '-submit'
        ],
                       stdout=PIPE).communicate()[0]
    finally:
        # Clean up in all cases so a failed launch (e.g. java missing from
        # PATH) does not leave the temporary files behind.
        os.remove('trainWithFeaturesSubmit.json')
        os.remove('testWithFeaturesSubmit.json')

    print('== Finished running your code')

    return output
 def setUp(self):
     '''Create the three address feed references, a factory for each, and a DataManager.'''
     feeds = (FeedType.FEATURES, FeedType.CHANGEFEED, FeedType.RESOLUTIONFEED)
     # features / changefeed / resolutionfeed references, in that order
     self.af, self.ac, self.ar = [FeedRef((FeatureType.ADDRESS, feed)) for feed in feeds]
     # one factory per reference, same order
     self.aff, self.afc, self.afr = [
         FeatureFactory.getInstance(ref) for ref in (self.af, self.ac, self.ar)
     ]
     self.dm = DataManager()
 def setUp(self):
     '''Build address feed references (features, changefeed, resolutionfeed),
     their factories, and a fresh DataManager for each test.'''
     address = FeatureType.ADDRESS

     # feed references
     self.af = FeedRef((address, FeedType.FEATURES))
     self.ac = FeedRef((address, FeedType.CHANGEFEED))
     self.ar = FeedRef((address, FeedType.RESOLUTIONFEED))

     # matching factories
     get_factory = FeatureFactory.getInstance
     self.aff = get_factory(self.af)
     self.afc = get_factory(self.ac)
     self.afr = get_factory(self.ar)

     self.dm = DataManager()
    def setUp(self):
        '''Prepare a DataManager, the address feed references, the changefeed and
        resolutionfeed factories, and a test address.'''
        self.dm = DataManager(ref_int)

        address = FeatureType.ADDRESS
        self.af = FeedRef((address, FeedType.FEATURES))
        self.ac = FeedRef((address, FeedType.CHANGEFEED))
        self.ar = FeedRef((address, FeedType.RESOLUTIONFEED))

        self.afc, self.afr = [
            FeatureFactory.getInstance(ref) for ref in (self.ac, self.ar)
        ]
        # NOTE(review): `af` is a bare name here, not self.af; it must be
        # defined outside this method -- confirm that is intended.
        self.addr_r = _getTestAddress(af[FeedType.FEATURES])
 def setUp(self):
     '''Create the DataManager, address feed references, two factories and a test address.'''
     self.dm = DataManager(ref_int)

     feeds = (FeedType.FEATURES, FeedType.CHANGEFEED, FeedType.RESOLUTIONFEED)
     self.af, self.ac, self.ar = [FeedRef((FeatureType.ADDRESS, feed)) for feed in feeds]

     # only the changefeed and resolutionfeed factories are kept
     get_factory = FeatureFactory.getInstance
     self.afc = get_factory(self.ac)
     self.afr = get_factory(self.ar)
     # NOTE(review): `af` below is not self.af; it is resolved outside
     # this method -- verify against the enclosing module.
     self.addr_r = _getTestAddress(af[FeedType.FEATURES])
def output(partId, ch_aux):
  """Uses the student code to compute the output for test cases.

  partId is not used in this function.  ch_aux is passed to
  FeatureFactory.readTestData; training data is read from "../data/train".
  Returns the captured standard output of the Java MEMM '-submit' run.
  The temporary JSON files are deleted even if the Java step raises.
  """
  print('== Running your code ...')

  featureFactory = FeatureFactory()

  # read the train and test data
  trainData = featureFactory.readData("../data/train")
  testData = featureFactory.readTestData(ch_aux)

  # add the features
  trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData)
  testDataWithFeatures = featureFactory.setFeaturesTest(testData)

  # write the updated data into JSON files
  featureFactory.writeData(trainDataWithFeatures, "trainWithFeaturesSubmit")
  featureFactory.writeData(testDataWithFeatures, "testWithFeaturesSubmit")

  # run MEMM
  command = ['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
             'trainWithFeaturesSubmit.json', 'testWithFeaturesSubmit.json',
             '-submit']
  try:
    output = Popen(command, stdout=PIPE).communicate()[0]
  finally:
    # Previously skipped whenever Popen/communicate raised, which left
    # the temporary files on disk.
    os.remove('trainWithFeaturesSubmit.json')
    os.remove('testWithFeaturesSubmit.json')

  print('== Finished running your code')

  return output
Example #10
0
def main(argv):
    """Add features to the train/test files named in argv and run the Java MEMM.

    argv[0] is the training file, argv[1] the test file; a third argument
    of any value makes the Java program receive '-print'.  The Java
    program's standard output is printed at the end.
    """
    if len(argv) < 2:
        print('USAGE: python NER.py trainFile testFile')
        exit(0)

    if len(argv) > 2:
        java_option = '-print'
    else:
        java_option = ''

    factory = FeatureFactory()

    # Read both inputs before any features are computed.
    train_raw = factory.readData(argv[0])
    test_raw = factory.readData(argv[1])

    train_ready = factory.setFeaturesTrain(train_raw)
    test_ready = factory.setFeaturesTest(test_raw)

    # Written under these base names; the Java command reads the .json files.
    factory.writeData(train_ready, 'trainWithFeatures')
    factory.writeData(test_ready, 'testWithFeatures')

    java_command = ['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
                    'trainWithFeatures.json', 'testWithFeatures.json',
                    java_option]
    memm = Popen(java_command, stdout=PIPE)
    captured = memm.communicate()[0]

    print(captured)
def main(argv):
    """Run the NER pipeline on the files named in argv.

    With an empty argv the defaults "../data/train" and "../data/dev" are
    used; with exactly one argument the usage line is printed and the
    program exits.  A third argument of any value passes '-print' to the
    Java MEMM program, whose standard output is printed at the end.

    The caller's argv list is left unmodified.
    """
    # defaults -- chosen locally instead of being appended to the caller's list
    if not argv:
        train_path, test_path = "../data/train", "../data/dev"
    elif len(argv) < 2:
        print('USAGE: python NER.py trainFile testFile')
        exit(0)
    else:
        train_path, test_path = argv[0], argv[1]

    # Set this to -print to print
    printOp = '-print' if argv and len(argv) > 2 else ''

    featureFactory = FeatureFactory()

    # read the train and test data
    trainData = featureFactory.readData(train_path)
    testData = featureFactory.readData(test_path)

    # add the features
    trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData)
    testDataWithFeatures = featureFactory.setFeaturesTest(testData)

    # write the updated data into JSON files
    featureFactory.writeData(trainDataWithFeatures, 'trainWithFeatures')
    featureFactory.writeData(testDataWithFeatures, 'testWithFeatures')

    # run MEMM
    output = Popen(['java', '-cp', 'classes', '-Xmx1G', 'MEMM',
                    'trainWithFeatures.json', 'testWithFeatures.json',
                    printOp], stdout=PIPE).communicate()[0]

    print(output)
Example #12
0
 def test(self):
     '''Reset the module-level refsnap, build the address/group factories and run test1 inside a DataManager context.'''
     global refsnap
     refsnap = dict.fromkeys((0, 1, 2))

     # address factories keyed by feed type
     af = {}
     for ft in (FeedType.FEATURES, FeedType.CHANGEFEED, FeedType.RESOLUTIONFEED):
         af[ft] = FeatureFactory.getInstance(FeedRef((FeatureType.ADDRESS, ft)))

     # group factories keyed by feed type (built but not passed on below)
     gf = {}
     for ft in (FeedType.CHANGEFEED, FeedType.RESOLUTIONFEED):
         gf[ft] = FeatureFactory.getInstance(FeedRef((FeatureType.GROUPS, ft)))

     # the group changefeed factory is additionally stored under key 3
     af[3] = FeatureFactory.getInstance(FeedRef((FeatureType.GROUPS, FeedType.CHANGEFEED)))

     with DataManager() as dm:
         dm.registermain(self)
         self.test1(dm, af)
def test():
    """Convert a hand-filled address through the change and resolution
    factories and pretty-print every result under a short heading."""
    from pprint import pprint as pp

    # One factory per feed, created in features/changefeed/resolution order.
    af_f, af_c, af_r = [
        FeatureFactory.getInstance(FeedRef(FeatureType.ADDRESS, feed))
        for feed in (FeedType.FEATURES, FeedType.CHANGEFEED,
                     FeedType.RESOLUTIONFEED)
    ]

    # Source address for the change-feed conversions.
    ac1 = af_f.get()
    for name, value in (
            ('_components_addressType', 'Road'),
            ('_components_addressNumber', 100),
            ('_components_roadName', 'The Terrace'),
            ('_version', 1),
            ('_components_addressId', 100),
            ('_workflow_sourceUser', '******')):
        setattr(ac1, name, value)

    ac1a = af_c.convert(ac1, ActionType.ADD)
    ac1r = af_c.convert(ac1, ActionType.RETIRE)
    ac1u = af_c.convert(ac1, ActionType.UPDATE)

    # Source address for the resolution-feed conversions.
    ar1 = af_c.get()
    for name, value in (
            ('_version', 100),
            ('_changeId', 100),
            ('_components_addressType', 'Road'),
            ('_components_addressNumber', 100),
            ('_components_roadName', 'The Terrace')):
        setattr(ar1, name, value)

    ar1a = af_r.convert(ar1, ApprovalType.ACCEPT)
    ar1d = af_r.convert(ar1, ApprovalType.DECLINE)
    ar1u = af_r.convert(ar1, ApprovalType.UPDATE)

    report = (
        ('CHGF-ADD', ac1a),
        ('CHGF-RET', ac1r),
        ('CHGF-UPD', ac1u),
        ('RESF-ACC', ar1a),
        ('RESF-DEC', ar1d),
        ('RESF-UPD', ar1u),
    )
    for heading, converted in report:
        print(heading)
        pp(converted)
 def _processResolutionGroup(self,feat,cid,etft):
     '''Builds a group object from feat, fetches the group's address list and attaches it as the group's entities.
     @param feat: dict representation of feature before object processing
     @type feat: Dict
     @param cid: Change ID or group change ID
     @type cid: Integer
     @param etft: Feed/Feature identifier
     @type etft: FeedRef
     @return: Instantiated feature object
     '''
     group = self.factory.get(model=feat['properties'])
     #HACK the cid is replaced by a "cid/address?count=N" string for the request
     query = '{}/address?count={}'.format(cid,MAX_FEATURE_COUNT)
     ce,feat2 = self.api.getOneFeature(etft,query)
     if any(ce.values()):
         # the failure is logged only; processing continues with feat2
         aimslog.error('Single-feature request failure {}'.format(ce))
     address_ref = FeedRef((FeatureType.ADDRESS,FeedType.RESOLUTIONFEED))
     address_factory = FeatureFactory.getInstance(address_ref)
     addresses = []
     for entry in feat2['entities']:
         address = address_factory.get(model=entry['properties'])
         address._setEntities([self._populateEntity(sub) for sub in entry['entities']])
         addresses.append(address)
     group._setEntities(addresses)
     return group
def test():
    """Fill in two addresses by hand, run every change-feed and
    resolution-feed conversion on them, and pretty-print the results."""
    from pprint import pprint as pp

    def factory_for(feed):
        # Factory for the ADDRESS feature type on the given feed.
        return FeatureFactory.getInstance(FeedRef(FeatureType.ADDRESS,feed))

    af_f = factory_for(FeedType.FEATURES)
    af_c = factory_for(FeedType.CHANGEFEED)
    af_r = factory_for(FeedType.RESOLUTIONFEED)

    # ---- change feed ----
    ac1 = af_f.get()
    ac1._components_addressType = 'Road'
    ac1._components_addressNumber = 100
    ac1._components_roadName = 'The Terrace'
    ac1._version = 1
    ac1._components_addressId = 100
    ac1._workflow_sourceUser = '******'

    change_results = [
        af_c.convert(ac1,action)
        for action in (ActionType.ADD,ActionType.RETIRE,ActionType.UPDATE)
    ]

    # ---- resolution feed ----
    ar1 = af_c.get()
    ar1._version = 100
    ar1._changeId = 100
    ar1._components_addressType = 'Road'
    ar1._components_addressNumber = 100
    ar1._components_roadName = 'The Terrace'

    resolution_results = [
        af_r.convert(ar1,approval)
        for approval in (ApprovalType.ACCEPT,ApprovalType.DECLINE,ApprovalType.UPDATE)
    ]

    headings = ('CHGF-ADD','CHGF-RET','CHGF-UPD','RESF-ACC','RESF-DEC','RESF-UPD')
    for heading,converted in zip(headings,change_results + resolution_results):
        print(heading)
        pp(converted)
 def _processAddressEntity(self,feat):        
     '''Processes feature data into an address object via _processSimpleEntity
     @param feat: dict representation of feature before object processing
     @type feat: Dict
     @return: Instantiated Address entity
     '''
     # constructor used: the `get` of the address/resolution-feed factory
     etft = FeedRef((FeatureType.ADDRESS,FeedType.RESOLUTIONFEED))
     builder = FeatureFactory.getInstance(etft).get
     return self._processSimpleEntity(builder,feat)
Example #17
0
 def __init__(self, language):
     '''Start with empty label, class and sentence lists and create the helpers.
     @param language: stored as-is on the instance
     '''
     self.language = language
     # filled while reading data
     self.total_labels, self.klasses = [], []
     self.train_sentences, self.test_sentenses = [], []
     # collaborators (factory is created before the decoder)
     self.factory = FeatureFactory()
     self.viterbi = Viterbi()
 def castTo(self,requiredtype,address):
     '''Casts an address object to the format of the requested feed, using that feed's address factory
     @param requiredtype: Address format requirement; must be one of the keys of FeedType.reverse
     @param address: Address object being cast
     @type address: Address
     @return: Address
     '''
     known_types = FeedType.reverse.keys()
     if requiredtype not in known_types:
         raise Exception('unknown feed/address type')
     target = FeedRef((FeatureType.ADDRESS,requiredtype))
     return FeatureFactory.getInstance(target).cast(address)
Example #19
0
 def getInstance(data,etft=FeedRef((FeatureType.ADDRESS,FeedType.FEATURES))): 
     '''Builds an Entity object from data using the factory for etft (default: address/features feed)
     @param data: Dict containing AF Entity object attributes
     @param etft: Address Entity feedref; the default FeedRef is created once, when the function is defined
     @type etft: FeedRef
     @return: Populated Entity object 
     '''
     # imported at call time rather than at module level
     from FeatureFactory import FeatureFactory
     return FeatureFactory.getInstance(etft).get(model=data)
Example #20
0
 def getInstance(data,
                 etft=FeedRef((FeatureType.ADDRESS, FeedType.FEATURES))):
     '''Returns an Entity object populated from data
     @param data: Dict containing AF Entity object attributes
     @param etft: Address Entity feedref selecting the factory; defaults to address/features
     @type etft: FeedRef
     @return: Populated Entity object 
     '''
     from FeatureFactory import FeatureFactory
     entity_factory = FeatureFactory.getInstance(etft)
     entity = entity_factory.get(model=data)
     return entity
Example #21
0
 def clone(a,b=None):
     '''Copies the instance attributes of A onto B, creating B (as type A) if not provided
     @param a: Feature object to-be cloned
     @type a: Feature
     @param b: Feature object being overwritten (optional)
     @type b: Feature
     @return: B carrying A's attributes. This is a shallow copy: attribute values are shared with A, not duplicated
     '''
     # Test for None explicitly: `not b` would also throw away a supplied
     # target that merely evaluates as false.
     if b is None:
         # imported only when a target has to be built
         from FeatureFactory import FeatureFactory
         b = FeatureFactory.getInstance(a.type).get()
     #duplicates only attributes set in source object
     for attr in list(vars(a)): setattr(b,attr,getattr(a,attr))
     return b
Example #22
0
 def clone(a, b=None):
     '''Clones attributes of A to B and instantiates B (as type A) if not provided
     @param a: Feature object to-be cloned
     @type a: Feature
     @param b: Feature object being overwritten (optional)
     @type b: Feature
     @return: B with every instance attribute of A assigned to it (shallow: the values themselves are not copied)
     '''
     if b is None:
         # Only None means "no target given"; a falsy-but-real target is
         # kept.  The project import is deferred to the branch that needs it.
         from FeatureFactory import FeatureFactory
         b = FeatureFactory.getInstance(a.type).get()
     #duplicates only attributes set in source object
     for attr in list(vars(a)):
         setattr(b, attr, getattr(a, attr))
     return b
Example #23
0
def main():
    """Run the NER pipeline on the fixed "../data/train" and "../data/dev" files.

    The usage line is printed unconditionally, then the data is read,
    given features and written out, and the Java MEMM is run with
    '-print'; its standard output is printed.
    """
    print('USAGE: python NER.py trainFile testFile')
    factory = FeatureFactory()

    # Read both data sets before computing any features.
    train_raw = factory.readData("../data/train")
    test_raw = factory.readData("../data/dev")

    train_ready = factory.setFeaturesTrain(train_raw)
    test_ready = factory.setFeaturesTest(test_raw)

    # Written under these base names; the Java command reads the .json files.
    factory.writeData(train_ready, 'trainWithFeatures')
    factory.writeData(test_ready, 'testWithFeatures')

    java_command = [
        'java', '-cp', 'classes', '-Xmx1G', 'MEMM',
        'trainWithFeatures.json', 'testWithFeatures.json', '-print',
    ]
    captured, _ = Popen(java_command, stdout=PIPE).communicate()

    print(captured)
Example #24
0
def main(argv):
    """Run the NER pipeline on the files named in argv.

    An empty argv selects the defaults "../data/train" and "../data/dev";
    a single argument prints the usage line and exits.  Any third argument
    passes '-print' to the Java MEMM program.  The Java program's standard
    output is printed at the end.

    argv itself is never modified.
    """
    # defaults -- kept in a local copy so the caller's list is not appended to
    if not argv:
        args = ["../data/train", "../data/dev"]
    elif len(argv) < 2:
        print('USAGE: python NER.py trainFile testFile')
        exit(0)
    else:
        args = list(argv)

    # Set this to -print to print
    printOp = ''
    if len(args) > 2:
        printOp = '-print'

    featureFactory = FeatureFactory()

    # read the train and test data
    trainData = featureFactory.readData(args[0])
    testData = featureFactory.readData(args[1])

    # add the features
    trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData)
    testDataWithFeatures = featureFactory.setFeaturesTest(testData)

    # write the updated data into JSON files
    featureFactory.writeData(trainDataWithFeatures, 'trainWithFeatures')
    featureFactory.writeData(testDataWithFeatures, 'testWithFeatures')

    # run MEMM
    output = Popen([
        'java', '-cp', 'classes', '-Xmx2G', 'MEMM', 'trainWithFeatures.json',
        'testWithFeatures.json', printOp
    ],
                   stdout=PIPE).communicate()[0]

    print(output)
def output(partId, ch_aux):
    """Uses the student code to compute the output for test cases.

    partId is unused here.  ch_aux is given to FeatureFactory.readTestData
    to produce the test data; the training data comes from "../data/train".

    Returns the standard output captured from the Java MEMM '-submit' run.
    The temporary JSON files are removed whether or not that run raises.
    """
    print("== Running your code ...")

    featureFactory = FeatureFactory()

    # read the train and test data
    trainData = featureFactory.readData("../data/train")
    testData = featureFactory.readTestData(ch_aux)

    # add the features
    trainDataWithFeatures = featureFactory.setFeaturesTrain(trainData)
    testDataWithFeatures = featureFactory.setFeaturesTest(testData)

    # write the updated data into JSON files
    featureFactory.writeData(trainDataWithFeatures, "trainWithFeaturesSubmit")
    featureFactory.writeData(testDataWithFeatures, "testWithFeaturesSubmit")

    temp_files = (
        "trainWithFeaturesSubmit.json",
        "testWithFeaturesSubmit.json",
    )

    # run MEMM
    try:
        output = Popen(
            [
                "java",
                "-cp",
                "classes",
                "-Xmx1G",
                "MEMM",
                temp_files[0],
                temp_files[1],
                "-submit",
            ],
            stdout=PIPE,
        ).communicate()[0]
    finally:
        # Without the finally, an exception from Popen/communicate left
        # both temporary files behind.
        for temp_file in temp_files:
            os.remove(temp_file)

    print("== Finished running your code")

    return output
Example #26
0
 def testgrpresfeedAUD(self,dm,af):
     '''Accept a test group on the resolution feed and poll until a response arrives.
     The af argument is not used in this method.'''
     ver = 6977370
     cid = 4117720
     #pull a test group built with the group/resolution-feed factory
     group_ref = FeedRef((FeatureType.GROUPS,FeedType.RESOLUTIONFEED))
     grp_r = self.gettestgroup(FeatureFactory.getInstance(group_ref))

     aimslog.info('*** GROUP Resolution ACCEPT '+str(time.clock()))
     rqid1 = 4321234
     dm.acceptGroup(grp_r,rqid1)
     resp = None
     #poll every 5 seconds; there is no timeout, so this blocks until a response shows up
     while True:
         _,resp,_ = self.testresp(dm,FeedType.RESOLUTIONFEED)
         if not resp:
             time.sleep(5)
             continue
         print('%s %s' % (rqid1,resp[0].meta.requestId))
         break
     ver += 1
Example #27
0
 def testuseractions(self,dm):
     '''create and submit user actions, add/update/delete, waiting for a response after each'''
     def await_response(request_id):
         #poll the admin feed every 5 seconds until a response arrives (no timeout)
         while True:
             _,_,resp = self.testresp(dm,FeedType.ADMIN)
             if resp:
                 print('%s %s' % (request_id,resp[0].meta.requestId))
                 return
             time.sleep(5)

     ver = 100
     uid = 100
     rqid = 100

     etft = FeedRef((FeatureType.USERS,FeedType.ADMIN))
     uf = FeatureFactory.getInstance(etft)

     user = uf.get('local_test_user')
     user.setUserId(uid)
     user._version = ver
     user._userName = '******'
     user._email = '*****@*****.**'
     user._requiresProgress = 'False'
     user._organisation = 'LINZ'
     user._role = 'follower'

     #add
     dm.addUser(user,rqid)
     await_response(rqid)

     #update
     rqid += 1
     user._userName = '******'
     dm.updateUser(user,rqid)
     await_response(rqid)

     #delete
     rqid += 1
     dm.deleteUser(user,rqid)
     await_response(rqid)
Example #28
0
 def __init__(self,params,queues):
     '''Initialise new DataSync object splitting out config parameters
     @param params: Four configuration values, unpacked in order into ref, etft, ftracker and conf
     @type params: List<?>
     @param queues: IOR queues, read under the keys 'in', 'out' and 'resp'
     @type queues: Dict<String,Queue.Queue>        
     '''
     super(DataSync,self).__init__()
     self.start_time = time.time()
     self.updater_running = False
     self.ref,self.etft,self.ftracker,self.conf = params
     #one zeroed entry per value of FEEDS
     self.data_hash = dict.fromkeys(FEEDS.values(),0)
     self.factory = FeatureFactory.getInstance(self.etft)
     self.updater = DataUpdater.getInstance(self.etft) # unevaluated class
     self.inq,self.outq,self.respq = [queues[name] for name in ('in','out','resp')]
Example #29
0
def test_cluster(w, auto):
	"""Call FeatureFactory.cluster_100(w, auto) when outputpath lists at least one entry."""
	# NOTE(review): the original comment spoke of checking for an empty
	# directory, but clustering runs when the listing is NOT empty.
	# The original author also planned to add try/except here.
	if not os.listdir(outputpath):
		return
	FeatureFactory.cluster_100(w, auto)
Example #30
0
class Perceptron:
    """Sequence tagger keeping one ``Klass`` per label, decoded with ``Viterbi``.

    Attributes:
        total_labels: labels in order of first appearance in the training data.
        klasses: one ``Klass`` per entry of ``total_labels``.
        language: value given to the constructor (stored, not used here).
        train_sentences: sentences parsed from the training file.
        test_sentenses: sentences parsed from the test file (name kept as-is).
        factory: ``FeatureFactory`` used to compute and update features.
        viterbi: ``Viterbi`` instance used for training statistics and decoding.
    """

    def __init__(self, language):
        self.total_labels = []
        self.klasses = []
        self.language = language
        self.train_sentences = []
        self.test_sentenses = []
        self.factory = FeatureFactory()
        self.viterbi = Viterbi()

    def read_data(self, train_file, test_file):
        """Read the training data, then the test data."""
        self.read_training_data(train_file)
        self.read_testing_data(test_file)

    def _store_sentence(self, sentence, sentences):
        """Append *sentence* to *sentences* unless it contains '-DOCSTART-'."""
        if '-DOCSTART-' not in sentence.full_sentence:
            sentences.append(sentence)

    def _read_sentences(self, lines, sentences, collect_labels=False):
        """Parse token lines into ``Sentence`` objects appended to *sentences*.

        Each non-blank line must hold at least four whitespace-separated
        fields (an IndexError is raised otherwise); blank lines end the
        current sentence.  With *collect_labels* the fourth field of every
        line is recorded in ``total_labels`` on first sight.

        Returns the number of instances read, including those of skipped
        '-DOCSTART-' sentences.
        """
        instance_count = 0
        current = Sentence()
        for line in lines:
            fields = line.strip().split()
            if not fields:
                # A blank line closes the sentence.  Leading or repeated
                # blank lines are ignored; they used to fall through to the
                # token branch and raise IndexError.
                if current.size() != 0:
                    self._store_sentence(current, sentences)
                    current = Sentence()
                continue
            instance = EngInstance(fields[0], fields[1], fields[2], fields[3])
            instance_count += 1
            current.add(instance)
            if collect_labels and fields[3] not in self.total_labels:
                self.total_labels.append(fields[3])
        # Keep the last sentence when the input does not end with a blank
        # line; it was silently dropped before.
        if current.size() != 0:
            self._store_sentence(current, sentences)
        return instance_count

    def read_training_data(self, train_file):
        """Load training sentences and labels, then build the classes and
        train the Viterbi helper on them."""
        instance_count = self._read_sentences(
            train_file, self.train_sentences, collect_labels=True)

        print('total number of training instances %d '
              'total number of training sentences %d'
              % (instance_count, len(self.train_sentences)))

        self.klasses_init()
        self.viterbi.train(self.total_labels, self.train_sentences)

    def klasses_init(self):
        """Append one ``Klass`` per known label to ``klasses``."""
        for label in self.total_labels:
            self.klasses.append(Klass(label))

    def tag_klass(self, tag):
        """Return the first class whose ``tag`` equals *tag*, or None."""
        for klass in self.klasses:
            if klass.tag == tag:
                return klass
        return None

    def read_testing_data(self, test_file):
        """Load test sentences; labels are not added to ``total_labels``."""
        instance_count = self._read_sentences(test_file, self.test_sentenses)

        print('total number of testing instances %d '
              'total number of testing sentences %d'
              % (instance_count, len(self.test_sentenses)))

    def computeFeatures(self):
        """Compute features for every training sentence, then every test sentence."""
        for sentence in self.train_sentences:
            self.factory.compute_sentence_features_eng(sentence)
        for sentence in self.test_sentenses:
            self.factory.compute_sentence_features_eng(sentence)

    def train(self, iterations=10):
        """Run *iterations* passes over the training sentences.

        Each sentence is decoded; for every wrongly tagged instance the
        guessed class is adjusted with '-' and the gold class with '+'.
        After each sentence the features are updated and every class's
        ``update`` is called.  The weights are averaged at the end.
        """
        for iteration in range(1, iterations + 1):
            error = 0
            for sentence in self.train_sentences:
                path = self.classify(sentence)
                for index, instance in enumerate(sentence.instances):
                    predicted = path[index]
                    if predicted == instance.label:
                        instance.predicted_label = instance.label
                        continue
                    guess = self.tag_klass(predicted)
                    instance.predicted_label = predicted
                    gold = self.tag_klass(instance.label)
                    error += 1
                    guess.adjust(instance.features, '-')
                    gold.adjust(instance.features, '+')
                self.factory.features_update(sentence)
                for klass in self.klasses:
                    klass.update()
            print('Iteration %d: number of errors %d' % (iteration, error))
        for klass in self.klasses:
            klass.average_weights()

    def classify(self, sentence):
        """Return the label path the Viterbi decoder gives for *sentence*."""
        return self.viterbi.viterbi(sentence, self.klasses)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator/denominator as a float, or 0 for a zero denominator."""
        try:
            return float(numerator) / denominator
        except ZeroDivisionError:
            return 0

    def test(self):
        """Tag the test sentences and print a confusion table and P/R/F rows.

        The training sentences are re-tagged first so that
        ``features_update`` runs on them with the final predictions.
        """
        correct = 0
        wrong = 0
        report_summary = defaultdict(int)

        for sentence in self.train_sentences:
            path = self.classify(sentence)
            for index, instance in enumerate(sentence.instances):
                instance.predicted_label = path[index]
            self.factory.features_update(sentence)

        for sentence in self.test_sentenses:
            path = self.classify(sentence)
            for index, instance in enumerate(sentence.instances):
                guess = self.tag_klass(path[index])
                # NOTE(review): gold is None for a test label never seen in
                # training, and the next line then fails -- confirm the data
                # rules this out.
                gold = self.tag_klass(instance.label)
                report_summary[(gold.tag, guess.tag)] += 1
                if guess.tag != gold.tag:
                    gold.FN += 1
                    guess.FP += 1
                    wrong += 1
                else:
                    gold.TP += 1
                    # matches on 'O' are not counted as correct
                    if guess.tag != 'O':
                        correct += 1

        # Table header, then one row per gold label (LaTeX-style separators).
        print(' '.join('%s &' % label for label in self.total_labels))
        for label_1 in self.total_labels:
            cells = [str(label_1)]
            cells.extend('& %s' % report_summary[(label_1, label_2)]
                         for label_2 in self.total_labels)
            cells.append("\\\\ \\hline")
            print(' '.join(cells))
        print('%s %s' % (correct, wrong))

        for klass in self.klasses:
            P = self._ratio(klass.TP, klass.TP + klass.FP)
            R = self._ratio(klass.TP, klass.TP + klass.FN)
            try:
                F = 2 * P * R / (P + R) * 100
            except ZeroDivisionError:
                F = 0
            print("%s & %.2f & %.2f & %.2f" % (klass.tag, P * 100, R * 100, F))
Example #31
0
def test_cluster(w, auto):
    """Run FeatureFactory.cluster_100(w, auto) if outputpath contains any entry."""
    # NOTE(review): the original comment mentioned checking for an empty
    # directory, yet the call is made only when the listing is non-empty.
    # A try/except around the listing was planned by the original author.
    has_entries = bool(os.listdir(outputpath))
    if has_entries:
        FeatureFactory.cluster_100(w, auto)
Example #32
0
        #write past feature name to file
        with open('last', 'w') as f:
            f.write(feature)

    if feature == '1':
        fe = features.FEATURE
    elif feature == '2':
        fe = features.FEATURE2
    elif feature == '3':
        fe = features.FEATURE3
    elif feature == 'stress':
        fe = features.STRESSTEST

    #options for cluster:
    k = raw_input('Number of clusters (k): ') or 'auto'
    if k != 'auto':
        k = int(k)
    t = int(raw_input('Number of iterations: ') or 1)
    weightlist = raw_input('enter weight vector, or auto: ') or '1 0 0'
    if weightlist == 'auto':
        w = 'auto'
    else:
        w = weightlist.split(" ")
        w = map(float, w)
    euclidean = raw_input('Use Euclidean distance? [Y/N]: ') or 'N'
    if euclidean == 'Y' or euclidean == 'y': euclid = True

    FeatureFactory = FeatureFactory(44100, fe, mp3list, k, t, run_before,
                                    euclid)
    test_cluster(w, auto)