Exemple #1
0
 def num_votes_from_tr(tr):
     """Return the vote count stored in the ``vot`` cell of a table row.

     ``tr`` must expose an lxml-style ``xpath()`` method.  The text nodes of
     every direct ``<td>`` child whose class contains ``"vot"`` but not
     ``"pvot"`` are collected, cleaned with the module helpers ``c`` and
     ``digits_only`` (defined elsewhere -- presumably whitespace cleanup and
     digit extraction; confirm), and the single non-empty result is
     returned as an ``int``.

     Raises:
         AssertionError: if the row does not yield exactly one vote count.
     """
     # There may be two <td> with class "vot", but only one won't be empty.
     cell_texts = tr.xpath('./td[contains(@class, "vot") and '
                           'not(contains(@class, "pvot"))]/text()')
     # Skip percentage texts, then drop values that are empty once cleaned.
     # The '%' test must look at each text node: testing the list itself
     # only matches an element that is exactly '%', and then rejects
     # every item instead of just the percentage.
     vot_nums = [digits for digits in
                 (digits_only(c(text)) for text in cell_texts
                  if '%' not in text)
                 if digits]
     assert len(vot_nums) == 1, \
         'expected exactly one vote count, got %r' % (vot_nums,)
     return int(vot_nums[0])
Exemple #2
0
    def preprocess(self, job):
        """Parse the results page held by *job* into ``job.meta['preprocess']``.

        ``job.data`` is parsed with html5lib (lxml tree builder, HTML
        namespace disabled) and the following keys are written to the
        freshly created ``job.meta['preprocess']`` dict:

        * ``results``: one dict per row group of the ``TVOTOS`` table, with
          an ``agrupacion`` entry (id, nombre, votos) and, when present,
          a ``listas`` list (id, nombre, votos) and a ``formula`` entry
          (id, nombre).
        * ``mesas``: ``total`` / ``escrutadas`` integers from the ``pt1``
          table.
        * ``electores``: ``total`` / ``votantes`` integers from the ``pt2``
          table.
        * ``parsed_timestamp``: ``time.time()`` taken after parsing.

        Raises:
            AssertionError: if the response status is not 2xx, or a row does
                not yield exactly one vote count.
            IndexError: if an expected table, header or cell is missing.
        """
        assert (200 <= job.response.status < 300)
        out = job.meta['preprocess'] = {}
        out['results'] = []

        doc = html5lib.parse(job.data, treebuilder='lxml',
                             namespaceHTMLElements=False)
        html = doc.getroot()

        def num_votes_from_tr(tr):
            # Vote count of a row.  There may be 2 <td> with class "vot",
            # but only one won't be empty.
            cell_texts = tr.xpath('./td[contains(@class, "vot") and '
                                  'not(contains(@class, "pvot"))]/text()')
            # Skip percentage texts, then drop values that are empty once
            # cleaned.  The '%' test must look at each text node, not at
            # the list: `'%' in <list>` only matches an element that is
            # exactly '%', and then rejects every item.
            vot_nums = [digits for digits in
                        (digits_only(c(text)) for text in cell_texts
                         if '%' not in text)
                        if digits]
            assert len(vot_nums) == 1, \
                'expected exactly one vote count, got %r' % (vot_nums,)
            return int(vot_nums[0])

        def int_after_header(table, label):
            # Integer held by the first element following the <th> whose
            # text contains *label*.
            cell = table.xpath(
                './/th[contains(., "%s")]/following-sibling::*' % label)[0]
            return int(digits_only(cell.text))

        def has_r1_class(tr):
            # Key handed to flip_flop to split the rows into groups
            # (presumably on alternation of the "r1" class -- confirm
            # against flip_flop).
            return 'r1' in tr.attrib['class']

        ### RESULTS PARSING

        tvotos = html.xpath('.//table[@id="TVOTOS"]')[0]
        # We skip the first <tr> since it's the title row; the rest of the
        # <tr> are repeated structures like this one:
        # AGRUP 1:       (a) tr
        # AGRUP 1:       (a)   th.sigla/text() -- group name
        # AGRUP 1: [OPT] (b) tr.agrupa
        # AGRUP 1: [OPT] (b)   th.[agrupa,sigla]/text() -- group list
        # AGRUP 1: [OPT] (c) tr.agrupa
        # AGRUP 1: [OPT] (c)   th.sigla/text() -- group formula
        rows = tvotos.xpath('.//tr')[1:]

        for trs in flip_flop(rows, has_r1_class):
            bigrow = {}

            for tr in trs:
                th = tr.xpath('./th[1]')[0]
                if not bigrow:
                    # (a) group name: always the first row of the group
                    bigrow['agrupacion'] = {'id': c(th.attrib['id']),
                                            'nombre': c(th.text),
                                            'votos': num_votes_from_tr(tr)}
                elif 'agrupa' in th.attrib['class']:
                    # (b) group list
                    aglist = {'id': c(th.attrib['id']),
                              'nombre': c(th.text),
                              'votos': num_votes_from_tr(tr)}
                    bigrow.setdefault('listas', []).append(aglist)
                else:
                    # (c) group formula (carries no vote count)
                    bigrow['formula'] = {'id': c(th.attrib['id']),
                                         'nombre': c(th.text)}

            out['results'].append(bigrow)

        ### MESAS PARSING
        table = html.xpath('.//div[@class="pt1"]/table[@class="tablin"]')[0]
        out['mesas'] = {
            'total': int_after_header(table, 'Totales'),
            'escrutadas': int_after_header(table, 'Escrutadas')}

        ### ELECTORES PARSING
        table = html.xpath('.//div[@class="pt2"]/table[@class="tablin"]')[0]
        out['electores'] = {
            'total': int_after_header(table, 'Totales'),
            'votantes': int_after_header(table, 'Votantes')}

        out['parsed_timestamp'] = time.time()