def executa(self, search_url) :
        """Run one scraping request against `search_url` through the
        Tor-proxied socket, parse the response and check for a CAPTCHA.

        NOTE(review): this block is cut off by a scrape artifact right after
        the `try:` below — the except/IP-rotation logic is missing here.
        """
        self.SEARCH_URL = search_url
        
        # NOTE(review): shadows the builtin `list`; the `None` assignment is dead.
        list = None
        list = [u'Favorecido:' , u'Valor:' , u'Observação do Documento:']
    
        # Route all new sockets through the Tor SOCKS proxy (configured elsewhere).
        socket.socket = socks.socksocket
        socket.create_connection = create_connection
        br = Browser() 
        print search_url 
        print "ID = " + str(Consulta.ID)
        gravalog(self,search_url + " cont = " + str(Consulta.ID) + "\n")
        # The " " body makes this a POST request.
        LRequest = urllib2.Request(search_url," " )
        LResponse = br.open(LRequest)
        page = bs_parse(LResponse.read())
        
        # (original note, translated) "could be moved outside!!!!"
        
        soup = bs_parse(LResponse.get_data())
        img_captcha = soup.find('img', alt='captcha')
        if img_captcha != None :
            # If a CAPTCHA is found, the system switches its IP address (via Tor).
            # NOTE(review): truncated — the try's handler continues past this chunk.
			try:
                print "CAPTCHA!!!"
                gravalog(self,"CAPTCHA\n")
# ---- Ejemplo n.º 2 (scraped-example delimiter; vote count "0") ----
def parse_guide(totalguide,zip,headend):
    """takes a list of guides, iterates through each, then parses the data creating a list of Guide objects
    """
    global guide
    
    count=1
    for a in totalguide:
        print "Parsing page:%s"%count
        count+=1
        pageparsed = bs_parse(a.read())
        soup = BeautifulSoup(''.join(str(pageparsed)))
        shows=soup.findAll(attrs={"class":"shows"})
        channels=soup.findAll(attrs={"class":"channel"})
        channelandshow=soup.findAll(attrs={"id":"grid"})
        
        for a in range(0,len(channels)):
            channelmatch = re.search(r'name="[^"]+', str(channels[a]))
            #print channelmatch.group()
            #wholematch=re.findall(r'title="[\w\s\(\)\-\:\.\$\'\!\,\&]+',str(shows[a]))
            wholematch=re.findall(r'title="[^"]+',str(shows[a]))
            
            #titlematch=re.findall(r'title="[\w\s\(\)\:\.\$\'\!\,\&]+',str(shows[a]))
            titlematch=re.findall(r'">[^<]+</a>',str(shows[a]))
            #titlematch=re.findall(r'title="[^:]+',str(shows[a]))
            typematch=re.findall(r'<a href="/[^/]+',str(shows[a]))
    
            for a in range(0,len(titlematch)):
                timematch=re.search(r'[0-9]+:[0-9]+[a-z]+[a-z]+',wholematch[a])
                lengthmatch=re.search(r'\([0-9]+[^\)]+',wholematch[a])
                if timematch and lengthmatch: 
                    
                    #parsing the html
                    parsedtitle=parse_show_html(titlematch[a][2:-4])
                                           
                    guide.append(Guide(parsedtitle,channelmatch.group()[6:],timematch.group(),lengthmatch.group()[1:],typematch[a][10:],zip,headend))
# ---- Ejemplo n.º 3 (scraped-example delimiter; vote count "0") ----
#string de conexão do banco de dados
conn_string = "host='localhost' dbname='portaltransparencia' user='******' password='******'"

#inicialização do objeto de simulação de navegador
br = Browser()

try:
    conn = psycopg2.connect(conn_string)
except:
    print "problema ao conectar no banco de dados"

#primeira solicitação no portal da transparência
try:
    LRequest = urllib2.Request(SEARCH_URL, " ")
    LResponse = br.open(LRequest)
    page = bs_parse(LResponse.read())
    print SEARCH_URL
    print page
    #f.write(page)
except:
    print "problema ao realizar primeira consulta na web"

br.close()

#grava array com orgaos superiores e apresenta logs do processo na tela
print "################### Orgaos ###################"
a = []
b = []

#cursor para naveção no banco de dados
cursor = conn.cursor()
class Consulta_Favorecido ():
    """Scraper session for payee ("favorecido") queries against the
    transparency portal, routed through the Tor network.

    NOTE(review): the constructor below writes to attributes of a class
    named ``Consulta`` (``Consulta.controller``, ``Consulta.ID``) that is
    not defined in this chunk — verify it exists elsewhere and is not a
    copy/paste slip for ``Consulta_Favorecido``.
    """

    # attributes
    controller = None                    # controller object injected via __init__
    SEARCH_URL = None                    # advanced-search URL for this session
    tor_control_hostname = "127.0.0.1"   # local Tor/proxy host
    tor_control_port = "8118"            # local proxy port (Privoxy-style default)
    tor_control_password = "******"      # Tor control password (redacted)
    contador = 0                         # request counter
    ID = ""                              # session/query identifier
    ver = "4"                            # scraper version string shown in logs
    arquivo = ''                         # path of the output data file
    f = None                             # open handle to the output data file

    # constructor
    def __init__(self, d_inic, m_inic, ano, search_url, controller, nao_cria):
        arquivo = '//home//raul//Documents//unb_python//data//data' + str(d_inic) + "-" + str(m_inic) + "-" + str(ano) + '.txt'
        if nao_cria == 1:
            self.f = open(arquivo,'a')
        else :
            self.f = open(arquivo,'w')
        
        Consulta.controller = controller
                
        self.SEARCH_URL = search_url  
        
        socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 9050)
        
		#abre conector na rede tor
		try: 
			socket.socket = socks.socksocket
			socket.create_connection = create_connection
		except:
			print "problema ao abrir socket na rede tor"
		
        br = Browser() 
  
        #prepara para iniciar consultas
        print "################### Consulta Avancada Portal Transparencia ###################"
        gravalog(self,"\n\n\n################### Consulta Avancada Portal Transparencia ###################\n\n")
        print "################### versao" + self.ver + " ###################"
        gravalog(self,"\n################### versao " + self.ver + " ###################\n\n")
    
        try :
			LRequest = urllib2.Request(SEARCH_URL," " )
			LResponse = br.open(LRequest)
			page = bs_parse(LResponse.read())
			print SEARCH_URL
			print page
			#f.write(page)
		except :
			print "problema ao realizar primeira consulta na web"
        
        gravalog(self,(page.text).encode('utf-8', 'ignore'))
    
        br.close()    
        
		#Consulta.ID = newID(self, Consulta.controller)
        Consulta.ID = 000000000
        
        #Objeto para captura de logs.
        x = logging.getLogger("logarqui")
        x.setLevel(logging.DEBUG)
        
        #captura logs e grava em arquivo.
        h1 = logging.FileHandler("//home//raul//Documents//unb_python//data//log//erros" + str(d_inic) + "-" + str(m_inic) + "-" + str(ano) + '.log')
        f = logging.Formatter("%(levelname)s %(asctime)s %(funcName)s %(lineno)d %(message)s")
        h1.setFormatter(f)
        h1.setLevel(logging.DEBUG)
        x.addHandler(h1)