Esempio n. 1
0
class Nuts4Nuts(object):
    LOWER_THRESHOLD = 0.33
    UPPER_THRESHOLD = 0.66
    TRAINER_LEARNINGRATE = 0.02
    TRAINER_LRDECAY = 1.0
    DATATXT_RHO = 0.2
    DATATXT_DBPEDIA = True
    W_PARENT_TYPE = 1
    W_IS_PARENT = 1
    W_HAS_LOCALITY = 1
    BIAS = 0.1
    SCORE_WEIGHT_NN = 0.66
    SCORE_WEIGHT_TEMPLATES = 0.7
    SET_MATCH_THRESHOLD = 0.6

    def __init__(self,
                 datatxt_app_id,
                 datatxt_app_key,
                 lang='it'
                 ):

        self.lang = lang

        #create network and modules
        self.net = FeedForwardNetwork()
        self.inp = LinearLayer(9, name='input')
        self.h1 = LinearLayer(4, name='h1')
        self.h2 = LinearLayer(4, name='h2')
        self.hrho = LinearLayer(1, name='hrho')
        self.hsig = SigmoidLayer(3, name='hsig')
        self.outp = LinearLayer(1, name='output')

        # add modules
        self.net.addOutputModule(self.outp)
        self.net.addInputModule(self.inp)
        self.net.addModule(self.h1)
        self.net.addModule(self.h2)
        self.net.addModule(self.hrho)
        self.net.addModule(self.hsig)

        # create connections
        self.net.addConnection(FullConnection(self.inp, self.h1, name='input->h1'))
        self.net.addConnection(FullConnection(self.inp, self.h2, name='input->h2'))

        # self.net.addConnection(FullConnection(self.inp, self.h1,
        #                                       name='input->h1',
        #                                       inSliceFrom=0,
        #                                       inSliceTo=4))
        # self.net.addConnection(FullConnection(self.inp, self.h2,
        #                                       name='input->h2',
        #                                       inSliceFrom=4,
        #                                       inSliceTo=8))
        self.net.addConnection(FullConnection(self.inp, self.hrho,
                                              name='input->hrho',
                                              inSliceFrom=8,
                                              inSliceTo=9))
        self.net.addConnection(FullConnection(self.h1, self.hsig))
        self.net.addConnection(FullConnection(self.h2, self.hsig))
        self.net.addConnection(FullConnection(self.hrho, self.hsig))
        self.net.addConnection(FullConnection(self.hsig, self.outp))

        # finish up
        self.net.sortModules()

        self.ds = SupervisedDataSet(9, 1)

        self.trainer = BackpropTrainer(self.net, self.ds,
                                       learningrate=self.TRAINER_LEARNINGRATE,
                                       lrdecay=self.TRAINER_LRDECAY)

        self.dq = DataTXTQuerist(app_id=datatxt_app_id,
                                 app_key=datatxt_app_key)

        self.dq.set_params(lang=lang,
                           rho=self.DATATXT_RHO,
                           dbpedia=self.DATATXT_DBPEDIA)

    def _treat_sample(self, sample):
        newsample0 = sample[0:3]
        newsample1 = sample[4:7]
        rho0 = sample[3]
        rho1 = sample[7]

        wsample0 = (self.W_PARENT_TYPE*(newsample0[0]+self.BIAS),
                    self.W_IS_PARENT*(newsample0[1]+self.BIAS),
                    self.W_HAS_LOCALITY*(newsample0[2]+self.BIAS),
                    rho0)

        wsample1 = (self.W_PARENT_TYPE*(newsample1[0]+self.BIAS),
                    self.W_IS_PARENT*(newsample1[1]+self.BIAS),
                    self.W_HAS_LOCALITY*(newsample1[2]+self.BIAS),
                    rho1)

        return wsample0 + wsample1 + tuple([rho0-rho1])

    def add_sample(self, sample, target):
        sample = self._treat_sample(sample)
        self.ds.addSample(sample, target)

    def train(self, nsteps=None):
        if nsteps:
            for _ in range(0, nsteps):
                self.trainer.train()
        else:
            self.trainer.trainUntilConvergence()

    def activate_from_sample(self, sample):
        sample = self._treat_sample(sample)
        return self.net.activate(sample)

    def activate(self, candidate0, candidate1):
        feat0 = candidate0.features.dump_features()
        feat1 = candidate1.features.dump_features()
        return self.activate_from_sample(feat0+feat1)

    def _decide(self, nnres):
        result = None
        if nnres < self.LOWER_THRESHOLD:
            result = 0
        elif nnres > self.UPPER_THRESHOLD:
            result = 1

        return result

    def _dedup_candidates(self, candidates):
        names = [c.name for c in candidates]
        for name in names:
            dups = [c for c in enumerate(candidates) if c[1].name == name]
            if len(dups) > 1:
                max_feat = max([c[1].features.rho for c in dups])
                dedups = [c for c in dups if c[1].features.rho == max_feat][0]
                dups.reverse()
                for d in dups:
                    if d[0] != dedups[0]:
                        del candidates[d[0]]

        return candidates

    def _select_couples(self, candidates):
        logger.setLevel(logging.DEBUG)
        logger.debug('call _select_couples')

        winning_candidates = defaultdict(int)
        for c in combinations(candidates, 2):
            nnres = self.activate(c[0], c[1])
            result = self._decide(nnres)
            logger.debug('couple: (cand0: {cand0}, cand1: {cand1}), nnres: {nnres}'.format(
                         cand0=c[0],
                         cand1=c[1],
                         nnres=nnres))
            if result is not None:
                winning_candidates[c[result]] += 1

        logger.debug(winning_candidates)
        for cand in candidates:
            cand.score = winning_candidates[cand]/float(len(candidates))

        max_score = max(cand.score for cand in candidates)

        selected_places = [c for c in candidates if c.score >= max_score]

        return selected_places

    def _lau3_from_lau2(self, candidates):
        lau2 = [c for c in candidates if c.type == '/LAU2']
        lau3 = [c for c in candidates if c.type == '/LAU3']

        logger.debug(lau2)
        logger.debug(lau3)

        winning_lau3s = list()

        for cand in lau3:
            lau3_fathers = [father
                            for father in lau2
                            if father.name.lower() in [cf.lower() for cf in cand.fathers]
                            ]
            if len(lau3_fathers) == 1:
                winning_lau3s.append((cand, lau3_fathers[0]))

        # logger.debug(winning_lau3s)
        if len(winning_lau3s) == 1:
            winning_lau3s[0][0].score = 1.0
            return [winning_lau3s[0][0]]
        elif len(winning_lau3s) > 1:
            if len(frozenset(father for lau3, father in winning_lau3s)) == 1:
                winning_lau3s[0][1].score = 1.0
                return [winning_lau3s[0][1]]

        return winning_lau3s


    def from_candidates(self, candidates):
        logger.debug('candidates: %s' % candidates)
        logger.debug('len(candidates): %s' % len(candidates))

        candidates = self._dedup_candidates(candidates)

        logger.debug('(deduped) candidates: %s' % candidates)
        logger.debug('(deduped) len(candidates): %s' % len(candidates))

        if len(candidates) == 0:
            logger.debug('No candidates found')
            return candidates

        if len(candidates) == 1:
            candidates[0].score = 1.0
            return candidates

        winning_lau3s = self._lau3_from_lau2(candidates)
        if winning_lau3s:
            return winning_lau3s

        return self._select_couples(candidates)

    def find_municipality(self, page):
        ta = TemplateAnalyzer(page=page, lang=self.lang)
        candidates_from_templates = ta.analyze_templates()

        logger.debug('candidates from templates: {candidates}'.format(
                     candidates=candidates_from_templates))

        ag = AbstractGetter(page=page, lang=self.lang)
        self.abstract = ag.get_abstract()
        querytext = self.dq.query(self.abstract)

        pg = PlacesGetter(page=page,
                          queryres=querytext)


        candidates_for_nn = pg.get_candidates()

        logger.debug('candidates: {candidates}'.format(candidates=candidates_for_nn))

        common_candidates = set([c.name for c in candidates_from_templates]).intersection(
                                 set([c.name for c in candidates_for_nn])
                                 )

        logger.debug(common_candidates)
        if common_candidates:
            result = [c
                      for c in candidates_from_templates
                      for name in common_candidates
                      if c.name == name
                      ]

            if len(result) == 1:
                result[0].set_match()
                result[0].score = 1.0

            return result

        else:
            candidates_from_nn = self.from_candidates(candidates=candidates_for_nn)

            for c in candidates_from_nn:
                c.score = c.score*self.SCORE_WEIGHT_NN

            for c in candidates_from_templates:
                c.score = c.score*self.SCORE_WEIGHT_TEMPLATES

            candidates_from_templates = self._lau3_from_lau2(candidates_from_templates) or \
                                        candidates_from_templates

            logger.debug(candidates_from_templates)
            logger.debug(candidates_from_nn)
            if candidates_from_templates and candidates_from_nn:
                merge_candidates = candidates_from_nn + candidates_from_templates
                total_candidates = self._lau3_from_lau2(merge_candidates) or \
                    merge_candidates
                if len(total_candidates) == 1:
                    total_candidates[0].set_match()
                return total_candidates
            else:
                if candidates_from_templates:
                    if len(candidates_from_templates) == 1 and \
                       candidates_from_templates[0].score > self.SET_MATCH_THRESHOLD:
                       candidates_from_templates[0].set_match()
                    return candidates_from_templates

                else:
                    if len(candidates_from_nn) == 1 and \
                       candidates_from_nn[0].score > self.SET_MATCH_THRESHOLD:
                       candidates_from_nn[0].set_match()
                    return candidates_from_nn

    def show(self):
        for mod in self.net.modules:
            print "Module: {name} ({mod})".format(name=mod.name, mod=mod)
            if mod.paramdim > 0:
                print "* parameters:", mod.params
            for conn in self.net.connections[mod]:
                print conn
                try:
                    paramdim = len(conn.params)
                except:
                    paramdim = conn.paramdim
                for cc in range(paramdim):
                    print conn.whichBuffers(cc), conn.params[cc]

    def save(self, filename='nut4nutsNN.xml'):
        logger.info('Writing NN to file: {filename}'.format(filename=filename))
        NetworkWriter.writeToFile(self.net, filename)

    def load(self, filename='nut4nutsNN.xml'):
        logger.info('Loading NN from file: {filename}'.format(filename=filename))
        self.net = NetworkReader.readFrom(filename)
Esempio n. 2
0
in_to_hidden = FullConnection(inLayer, hiddenLayer)
hidden_to_out = FullConnection(hiddenLayer, outLayer)
nn.addConnection(in_to_hidden)
nn.addConnection(hidden_to_out)
nn.sortModules()

#trainer = BackpropTrainer(nn, ds, learningrate=0.01, momentum=0.1)
ga = GA(ds.evaluateModuleMSE, nn, minimize=True)

for epoch in range(0, 100):
    nn = ga.learn(0)[0]
    print('Epoch: ', epoch)
   # if epoch % 100 == 0:
        #error = trainer.train()
        #print('Error: ', error)


result = []
real = []
# Testing
for i in range(0, testing_range): 
    xs, ys = iman.next_norm()
    result.append(nn.activate(xs[0]))
    real.append(ys[0])

plt.plot(result, 'r--', label='Predicted')
plt.plot(real, label='Real Data')
plt.legend(loc='best')
plt.show()