Example #1
0
 def __init__(self, htmlDataFilename=ls.MS_DATAFILE_ABSPATH):
     """Store the HTML data file path and run the whole pipeline.

     Constructing the object creates a ConcursoSlider for conc.ConcursoHTML
     and then immediately calls process_flow(), print_concursos() and
     save_concursos_in_db(), in that order.

     htmlDataFilename -- path of the HTML data file; defaults to
                         ls.MS_DATAFILE_ABSPATH.
     """
     self.htmlDataFilename = htmlDataFilename
     self.concursoSlider = ConcursoSlider(conc.ConcursoHTML)
     # The steps are executed strictly in the order listed here.
     pipeline = (
         self.process_flow,
         self.print_concursos,
         self.save_concursos_in_db,
     )
     for runStep in pipeline:
         runStep()
Example #2
0
class ConcursoHTMLScraper(object):
    """Parse concursos out of a saved HTML file and store the new ones.

    Instantiating the class runs the whole pipeline: the HTML file is parsed
    into ``self.concursos``, every concurso is printed, and the concursos
    beyond the count reported by the ConcursoSlider are bulk-inserted
    through it.
    """

    def __init__(self, htmlDataFilename=ls.MS_DATAFILE_ABSPATH):
        """Store the file path and run parse, print and save in that order.

        htmlDataFilename -- path of the HTML data file; defaults to
                            ls.MS_DATAFILE_ABSPATH.
        """
        self.htmlDataFilename = htmlDataFilename
        self.bsObj = None
        # Start with an empty list so the print/save/__str__ steps never hit
        # an AttributeError when parsing does not assign any concursos.
        self.concursos = []
        self.concursoSlider = ConcursoSlider(conc.ConcursoHTML)
        self.process_flow()
        self.print_concursos()
        self.save_concursos_in_db()

    def process_flow(self):
        """Parse the HTML file, then convert the parsed concursos' fields."""
        self.parseToDataStru()
        self.convert_concursos_fieldtypes()

    def parseToDataStru(self):
        """Build the soup object and extract the concursos from it.

        ``self.concursos`` is replaced with the result of
        processRowsAcrossTable() only when a soup object is available.
        """
        self.createSoupObj()
        if self.bsObj is not None:
            self.concursos = processRowsAcrossTable(self.bsObj)

    def convert_concursos_fieldtypes(self):
        """Call transport_dict_into_attrs() on every parsed concurso."""
        for concurso in self.concursos:
            concurso.transport_dict_into_attrs()

    def createSoupObj(self):
        """Read the HTML data file and store its BeautifulSoup tree in bsObj."""
        # Close the file deterministically instead of leaking the handle.
        with open(self.htmlDataFilename) as htmlFile:
            htmlText = htmlFile.read()
        # Because of Portuguese accents in the headers and in the SIM/NAO row
        # values, and because the HTML is probably iso-8859-1 (Latin-1) rather
        # than UTF-8, unicode() raises UnicodeDecodeError unless its optional
        # `errors` parameter is set to 'ignore' or 'replace'.
        # 'ignore' was chosen because only the first character of the field
        # acumuladoSimNao is read, so it is either 'S' or 'N', whose codes are
        # the same in ASCII and UTF-8.
        htmlText = unicode(htmlText, errors="ignore")
        self.bsObj = bf.BeautifulSoup(htmlText)

    def print_concursos(self):
        """Print a banner followed by one line per concurso.

        Each line shows the concurso's "nDoConc" value (-1 when it is None)
        and the concurso itself.
        """
        rule = "\n" + "=" * 30 + "\n"
        print(rule + "============ print_concursos() ============" + rule)
        for concurso in self.concursos:
            nDoConc = concurso["nDoConc"]
            if nDoConc is None:
                nDoConc = -1
            print("%s %s" % (nDoConc, concurso))

    def save_concursos_in_db(self):
        """Bulk-insert the concursos that the database does not hold yet.

        The number of concursos already stored is taken from
        concursoSlider.get_total_concursos(); only list positions from that
        count onwards are inserted.  Nothing is done when the HTML file does
        not hold more concursos than that count.

        NOTE(review): this presumably relies on self.concursos being ordered
        so that position i holds concurso number i + 1 -- confirm against
        processRowsAcrossTable().

        Exits the process via sys.exit(0) when a concurso to be inserted has
        no "nDoConc" value; concursos collected so far are then not inserted.
        """
        print("========== save_concursos_in_db() ============")
        total_db_concursos = self.concursoSlider.get_total_concursos()
        total_html_concursos = len(self.concursos)
        print("total_db_concursos %s" % (total_db_concursos,))
        print("total_html_concursos %s" % (total_html_concursos,))
        if total_html_concursos <= total_db_concursos:
            return
        concursos_to_insert = []
        # Walk the list positions that lie beyond what the database reports.
        for index in range(total_db_concursos, total_html_concursos):
            concurso = self.concursos[index]
            expectedNDoConc = concurso["nDoConc"]
            print("expectedNDoConc %s" % (expectedNDoConc,))
            if expectedNDoConc is None:
                print("Stopping expectedNDoConc == None.")
                sys.exit(0)
            concurso.transport_dict_into_attrs()
            concursos_to_insert.append(concurso)
        self.concursoSlider.bulk_insert(concursos_to_insert)

    def __str__(self):
        """Return a banner, every concurso separated by rules, and the total."""
        rule = "\n" + "=" * 30 + "\n"
        parts = [rule, "============ Concursos ============", rule]
        for concurso in self.concursos:
            parts.append(str(concurso))
            parts.append(rule)
        parts.append("Total: %d" % (len(self.concursos)))
        return "".join(parts)