def _download_heft(self, heft: Heft, browser: mechanize.Browser):
    """Download one issue's PDF and return the path it was saved to.

    Follows the issue's PDF link with *browser*, writes the payload below
    ``self._dir`` under the name rendered from ``self.pdf_name_template``,
    then navigates back so the caller's page state is preserved.
    """
    # Render the per-issue file name from the template.
    filename = self.pdf_name_template.format(heft=heft)
    # Fixed: the computed filename was never used -- every issue was
    # written to (and returned as) the same hard-coded path, so each
    # download clobbered the previous one.
    target = f"{self._dir}/{filename}"
    pdf_data = browser.follow_link(heft.pdf_link)
    # "bw+" truncates any stale file before writing the fresh download.
    with open(target, "bw+") as pdf_file:
        pdf_file.write(pdf_data.read())
    browser.back()
    return target
def authorize(n):
    """Brute-force a numeric code field on a web form.

    Tries every digit string of length 1..n (inclusive) against the form's
    code field and returns the first one whose submission does not contain
    the failure message, or "0" if none succeeds.

    NOTE(review): the <URL_GOES_HERE>/<FIELD_NAME>/<FAILURE_MESSAGE>
    placeholder strings must be filled in before this can run.
    """
    from itertools import product

    number = "0"

    # Code lengths between 1 and n inclusive.
    # Fixed: range(1, n) silently skipped length n.
    for k in range(1, n + 1):

        # product() yields every k-digit string directly -- including ones
        # with repeated digits (0000, 1111, ...) -- so there is no need to
        # take permutations of n copies of the alphabet and de-duplicate
        # through a set as the original did.
        perms = (''.join(p) for p in product('0123456789', repeat=k))
        print("Printing permutations for k = " + str(k))

        for permutation in perms:

            br = Browser()
            br.open("<URL_GOES_HERE>")

            # if a page has multiple forms, change the index appropriately
            # e.g. the 4th form would have index 3
            br.form = list(br.forms())[0]

            print("Trying permutation: " + permutation)

            # copy and paste this line to fill in all the fields
            br.form["<FIELD_NAME>"] = "<VALUE_FOR_FIELD>"

            # the line that guesses at the field
            br.form["<NAME_OF_CODE_FIELD>"] = permutation

            # prints the finished form, can remove to reduce I/O costs
            print(br.form)

            # submits the form and grabs the html page after the submit
            response = br.submit()
            htmlFile = response.get_data()

            # most sites show a failure message when the code is wrong; its
            # absence means the guess worked, otherwise go back and retry
            if "<FAILURE_MESSAGE>" not in htmlFile:
                # Fixed: this used to read 'number = perm' (a NameError --
                # the loop variable is 'permutation') and only broke the
                # inner loop, so brute-forcing continued after success.
                return permutation
            else:
                br.back()
    return number
Exemple #3
0
class Lockerz:
    """Python 2 client that logs into lockerz.com and answers the daily
    questions (mechanize Browser + BeautifulSoup)."""

    def __init__( self, name, password ):
        # Site credentials; the Browser instance keeps the session cookies.
        self.name = name
        self.password = password
        self.br = Browser()

    def connect( self ):
        """Log in; return True when the locker page title confirms it."""
        self.br.open( "http://www.lockerz.com" )
        self.br.select_form( nr=0 )
        self.br["handle"] = self.name
        self.br["password"] = self.password

        self.br.submit()
        return "Lockerz : My Locker" in self.br.title()

    def answer_all( self, generator, recursive=False ):
        """Answer every daily on the first page; with recursive=True keep
        following the "< Previous Posts" link until there are none left."""
        page = self.br.open( "http://www.lockerz.com/dailies" );
        self._answer_all( page, generator )
        # ..
        if recursive:
            i = 0
            while True:
                try:
                    i+=1
                    page = self.br.follow_link( text_regex="< Previous Posts" )
                    print "-- page %d" % i
                    self._answer_all( page, generator )
                except LinkNotFoundError:
                    # no more "Previous Posts" pages -- done
                    break

    def answer( self, id, answer ):
        """POST one answer for the daily *id*, then return to the list."""
        d = urllib.urlencode( { "id": id, "a": answer, "o": None } )
        r = self.br.open( "http://www.lockerz.com/daily/answer", d );
        print r.read()
        self.br.back()

    def getPTZ( self ):
        """Scrape the current PTZ point balance from the home page."""
        s = BeautifulSoup( self.br.open( "http://www.lockerz.com" ).read() )
        return s.find( "span", attrs={ "class": "ptz_value" } ).string
 
    def _answer_all( self, page, generator ):
        """Answer every daily entry found on *page* with a random sentence.
        Entries already answered lack an "id" attribute (KeyError)."""
        s = BeautifulSoup( page.read() )
        e = s.findAll( "div", attrs={ "class": re.compile( "dailiesEntry dailyIndex*" ) } )
        for i in e:
            try:
                self.answer( i["id"], generator.getRandomSentence() )
            except KeyError:
                print "Already answered ..."    
Exemple #4
0
    def test_reload_read_incomplete(self):
        """A response that was only partially read must be reloaded on
        .back(); one that was fully read must come from the cache."""
        import mechanize
        from mechanize._response import test_response

        class RecordingBrowser(TestBrowser):
            # TestBrowser that remembers whether reload() was invoked.
            def __init__(self):
                TestBrowser.__init__(self)
                self.reloaded = False

            def reload(self):
                self.reloaded = True
                TestBrowser.reload(self)

        br = RecordingBrowser()
        body = "The quick brown fox jumps over the lazy dog." * 100
        data = "<html><head><title></title></head><body>%s</body></html>" % body

        class FixedResponseHandler(mechanize.BaseHandler):
            # Serve the same canned HTML for every request.
            def http_open(self, request):
                return test_response(data, [("content-type", "text/html")])

        br.add_handler(FixedResponseHandler())

        # Navigating away before the response is fully read
        # (.read_incomplete is True) must trigger a .reload() on .back().
        partial = br.open("http://example.com")
        partial.read(10)
        br.open('http://www.example.com/blah')
        self.assertFalse(br.reloaded)
        br.back()
        self.assertTrue(br.reloaded)

        # A fully-read response must NOT be reloaded when going back.
        br.reloaded = False
        br.response().read()
        br.open('http://www.example.com/blah')
        br.back()
        self.assertFalse(br.reloaded)
Exemple #5
0
 def test_reload_read_incomplete(self):
     """A partially-.read() response must be re-fetched (reload) when we
     browse back to it, so that links near the end become reachable."""
     from mechanize import Browser
     browser = Browser()
     r1 = browser.open(urljoin(self.uri, "bits/mechanize_reload_test.html"))
     # if we don't do anything and go straight to another page, most of the
     # last page's response won't be .read()...
     r2 = browser.open(urljoin(self.uri, "mechanize"))
     self.assert_(len(r1.get_data()) < 4097)  # we only .read() a little bit
     # ...so if we then go back, .follow_link() for a link near the end (a
     # few kb in, past the point that always gets read in HTML files because
     # of HEAD parsing) will only work if it causes a .reload()...
     r3 = browser.back()
     browser.follow_link(text="near the end")
     # ... good, no LinkNotFoundError, so we did reload.
     # we have .read() the whole file
     self.assertEqual(len(r3._seek_wrapper__cache.getvalue()), 4202)
 def test_reload_read_incomplete(self):
     """Duplicate of the previous method -- at class-creation time this
     later definition wins.  NOTE(review): looks like an accidental
     copy/paste; consider deleting one copy."""
     from mechanize import Browser
     browser = Browser()
     r1 = browser.open(urljoin(self.uri, "bits/mechanize_reload_test.html"))
     # if we don't do anything and go straight to another page, most of the
     # last page's response won't be .read()...
     r2 = browser.open(urljoin(self.uri, "mechanize"))
     self.assert_(len(r1.get_data()) < 4097)  # we only .read() a little bit
     # ...so if we then go back, .follow_link() for a link near the end (a
     # few kb in, past the point that always gets read in HTML files because
     # of HEAD parsing) will only work if it causes a .reload()...
     r3 = browser.back()
     browser.follow_link(text="near the end")
     # ... good, no LinkNotFoundError, so we did reload.
     # we have .read() the whole file
     self.assertEqual(len(r3._seek_wrapper__cache.getvalue()), 4202)
def get_br():
    """Build a mechanize Browser configured for scraping weibo.cn."""
    #todo low
    #headers
    #Accept-Encoding: identity
    # Host: _login.weibo.cn
    # Referer: http://weibo.cn/
    # Connection: close
    # User-Agent: Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)
    br = Browser(factory=RobustFactory(), history=NoHistory(),)
    cj = cookielib.LWPCookieJar()
    # NOTE(review): back() is monkey-patched with an external back_func --
    # presumably because NoHistory() above disables the built-in history;
    # verify against back_func's definition.
    br.back = back_func
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    #br.set_handle_gzip(True)  # gzip is not yet an official feature in mechanize
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # follow HTML meta-refresh, waiting at most 10 seconds
    br.set_handle_refresh(HTTPRefreshProcessor(), max_time=10)
    br.addheaders = [('User-agent', USER_AGENT)]
    return br
                                                    print u'Favorecido:' 
                                                    gravalog(self, u'Favorecido:' )
                                                    favorecido = 1
                                                m = re.search(u'Valor:' , col.string)
                                                if m != None :
                                                    print u'Valor:' 
                                                    gravalog(self, u'Valor:' )
                                                    valor = 1
                                                m = re.search(u'Observação do Documento:' , col.string)
                                                if m != None :
                                                    print u'Observação do Documento:' 
                                                    gravalog(self, 'Observação do Documento:' )
                                                    observacao = 1
                           
                                entra = 0
                                br.back()
                            except Exception, ex:

                                logarqui.exception
                                logarqui.error
                                logarqui.exception("\nProvlema na gravação de logs! \n" + search_url)
                    
                            logarqui.debug("Finishing f!")
                                #sys.exitPortalTranspareciaef)
                                #print col.string
                                #print col
                                #print row2
    
        if len(list) != 0 :
            while len(list) % 3 != 0 :
                list.append(' ')
Exemple #9
0
class RegPublDownloader(LegalSource.Downloader):
    """Mirrors documents from regeringen.se into a local directory tree."""

    def __init__(self, baseDir="data"):
        # All downloads land below <baseDir>/regpubl/downloaded
        self.dir = baseDir + "/regpubl/downloaded"
        if not os.path.exists(self.dir):
            Util.mkdir(self.dir)
        # Per-module ini file; remembers e.g. the date of the last update.
        self.config = ConfigObj("%s/%s.ini" % (self.dir, __moduledir__))

        # Why does this say "super() argument 1 must be type, not classobj"
        # super(RegPublDownloader,self).__init__()
        self.browser = Browser()

    def DownloadAll(self):
        """Page through the full document listing, downloading everything."""
        # we use mechanize instead of our own Robot class to list
        # available documents since we can't get the POST/cookie based
        # search to work.
        doctype = '160'
        log.info(u'Selecting documents of type %s' % doctype)
        self.browser.open(
            "http://www.regeringen.se/sb/d/108/action/browse/c/%s" % doctype)
        log.info(u'Posting search form')
        self.browser.select_form(nr=1)
        self.browser.submit()

        pagecnt = 1
        done = False
        while not done:
            log.info(u'Result page #%s' % pagecnt)
            for l in self.browser.links(url_regex=r'/sb/d/108/a/\d+'):
                self._downloadSingle(l.absolute_url)
                # _downloadSingle navigates away; return to the result list
                self.browser.back()
            try:
                self.browser.find_link(text='N\xe4sta sida')
                self.browser.follow_link(text='N\xe4sta sida')
            except LinkNotFoundError:
                log.info(u'No next page link found, this was the last page')
                done = True
            pagecnt += 1
        # Remember when we last ran so DownloadNew can be incremental.
        self.config['last_update'] = datetime.date.today()
        self.config.write()

    def DownloadNew(self):
        """Download documents added since the last recorded update.

        The targeted "last 30 days"/"last 12 months" queries are not
        implemented yet, so every branch currently falls back to a full
        download (previously the >30/>365 branches silently did nothing).
        """
        if 'last_update' in self.config:
            then = datetime.datetime.strptime(self.config['last_update'],
                                              '%Y-%m-%d')
        else:
            # assume last update was more than a year ago
            # (fixed: timedelta(-367) placed 'then' 367 days in the
            # FUTURE, inverting every age comparison below)
            then = datetime.datetime.now() - datetime.timedelta(days=367)

        now = datetime.datetime.now()
        age = (now - then).days
        # fixed: the '> 365' test used to sit behind '> 30' and was
        # therefore unreachable; check the larger window first.
        if age > 365:
            # TODO: post a "last 12 months" query instead of a full crawl
            self.DownloadAll()
        elif age > 30:
            # TODO: post a "last 30 days" query instead of a full crawl
            self.DownloadAll()
        else:
            # post a full query
            self.DownloadAll()

    def _downloadSingle(self, url):
        """Download one document's index page and its linked PDFs."""
        docid = re.match(r'http://www.regeringen.se/sb/d/108/a/(\d+)',
                         url).group(1)

        fname = "%s/%s/index.html" % (self.dir, docid)
        log.info(u'    Loading docidx %s' % url)
        self.browser.open(url)
        if not os.path.exists(fname):
            Util.ensureDir(fname)
            self.browser.retrieve(url, fname)

        for l in self.browser.links(url_regex=r'/download/(\w+\.pdf).*'):
            filename = re.match(
                r'http://www.regeringen.se/download/(\w+\.pdf).*',
                l.absolute_url).group(1)
            # note; the url goes to a redirect script; however that
            # part of the URL tree (/download/*) is off-limits for
            # robots. But we can figure out the actual URL anyway!
            if len(docid) > 4:
                path = "c6/%02d/%s/%s" % (int(
                    docid[:-4]), docid[-4:-2], docid[-2:])
            else:
                path = "c4/%02d/%s" % (int(docid[:-2]), docid[-2:])
            fileurl = "http://regeringen.se/content/1/%s/%s" % (path, filename)

            df = "%s/%s/%s" % (self.dir, docid, filename)
            if not os.path.exists(df):
                log.info(u'        Downloading %s' % (fileurl))
                self.browser.retrieve(fileurl, df)
            else:
                log.info(u'        Already downloaded %s' % (fileurl))
Exemple #10
0
# Example mechanize session (Python 2): inspect the current page, fill and
# submit a form, then exercise history (.back()/.reload()) and link
# following.  NOTE(review): 'br' and 'response1' are set up earlier in the
# file (a Browser and the response of its first .open()).
assert br.viewing_html()
print br.title()
print response1.geturl()
print response1.info()  # headers
print response1.read()  # body
response1.close()  # (shown for clarity; in fact Browser does this for you)

br.select_form(name="order")
# Browser passes through unknown attributes (including methods)
# to the selected HTMLForm (from ClientForm).
br["cheeses"] = ["mozzarella",
                 "caerphilly"]  # (the method here is __setitem__)
response2 = br.submit()  # submit current form

# print currently selected form (don't call .submit() on this, use br.submit())
print br.form

response3 = br.back()  # back to cheese shop (same data as response1)
# the history mechanism returns cached response objects
# we can still use the response, even though we closed it:
response3.seek(0)
response3.read()
response4 = br.reload()  # fetches from server

for form in br.forms():
    print form
# .links() optionally accepts the keyword args of .follow_/.find_link()
for link in br.links(url_regex="python.org"):
    print link
    br.follow_link(link)  # takes EITHER Link instance OR keyword args
    br.back()
# ASP.NET land-records form: the survey-number <select> is normally
# populated client-side, so it is recreated here manually before a value
# can be assigned.  NOTE(review): 'br', 'Item' and 'job' come from earlier
# in the file.
br.form['ctl00$ContentPlaceHolder1$ddlVillage']=['003']

# Re-create the dynamic <select> control so mechanize knows about it.
br.form.new_control('select', 'ctl00$ContentPlaceHolder1$ddlSurveyNo',{'__select' : {'name': 'ctl00$ContentPlaceHolder1$ddlSurveyNo', 'id': 'ctl00$ContentPlaceHolder1$ddlSurveyNo', 'class': 'form-control'} })
br.form.fixup()

# Add the one option we intend to select, then pick it.
item2 = Item(br.form.find_control(name='ctl00$ContentPlaceHolder1$ddlSurveyNo'),
           {'contents': '172', 'value': '172', 'label': 172})
br.form['ctl00$ContentPlaceHolder1$ddlSurveyNo']=['172']

# Captcha text is supplied by an external 'job' helper.
br.form['ctl00$ContentPlaceHolder1$txt_captcha_1'] = job.get_captcha_text()

response = br.submit(id='ContentPlaceHolder1_btnGo')
print(response.read())
print(response.geturl()) # URL of the page we just opened
print(response.info())   # headers
br.back()   # go back

# Leftover OCR experiments for solving the captcha automatically
# (kept commented out for reference).
#img = cv2.imread('/Users/srinivas/captcha_original.png')
#custom_config = r'--oem 3 --psm 6'
#text= pytesseract.image_to_string(img, config=custom_config)
#print(text)
#pytesseract.pytesseract.tesseract_cmd = r'/usr/local/Cellar/tesseract/4.1.1/bin/tesseract'
#custom_oem_psm_config = r'--oem 3 --psm 8'
#text = pytesseract.image_to_string(Image.open('/Users/srinivas/captcha_gray.png'),lang='eng', config='--psm 8  -c tessedit_char_whitelist=0123456789')
#text = detectText('/Users/srinivas/captcha_gray.png')
#pprint.pprint(text)

#with open('/Users/srinivas/decoded_image.png', 'wb') as file_to_save:
 #   decoded_image_data = base64.decodebytes(base64_img_bytes)
  #  file_to_save.write(decoded_image_data)
#img.save('/Users/srinivas/captcha_original.png')
Exemple #12
0
class Downloader(object):
    """ Downloads all words from an online dictionary site.

    Words are appended to 'tmp.dict' as 'word=meaning' lines; the raw page
    of the most recent request is mirrored to 'tmp.html' for debugging.
    """

    def __init__(self):

        self.browser = Browser()
        self.browser.set_handle_robots(False)
        # Append mode, so interrupted runs can be resumed.
        # NOTE(review): this handle stays open for the object's lifetime.
        self.words = open('tmp.dict', 'ab')

    def parse_word(self, url):
        """ Downloads word description.
        """

        print('Parsing:', url)
        page = self.browser.follow_link(tag="a", url=url).read()
        page = self.browser.follow_link(text_regex=r'taisyti').read()

        # two follow_link calls above -> two steps back to the word list
        self.browser.back()
        self.browser.back()

        # the edit page holds '<h2>word</h2> ... <textarea>meaning</textarea>'
        word, meaning = page.split('<textarea')
        word = word.split('<h2>')[-1].split('</h2>')[0]
        meaning = meaning.split('>', 1)[1]
        meaning = meaning.split('</textarea>')[0]

        for search, replace in [
                ('\n', '',),
                #('\x8d', u'\u2013\u0308'.encode('utf-8'),),
                ]:
            word = word.replace(search, replace)
            meaning = meaning.replace(search, replace)

        self.words.write(word)
        self.words.write('=')
        self.words.write(meaning)
        self.words.write('\n')

    def parse_page(self, url):
        """ Downloads all words from single page.
        """

        print('Parsing:', url)
        page = self.browser.open(url).read()

        page = page.split('<table cellpadding="6"><tr valign="top"><td>')[1]
        page = page.split('</td></tr></table>')[0]
        page = page.replace('</td>\n<td>', '')

        # fixed: close the debug dump instead of leaking the file handle
        with open('tmp.html', 'wb') as dump:
            dump.write(page)

        for a in page.split('\n'):
            try:
                word_url = a.split('\"')[1]
            except IndexError:
                continue
            # remember where we are so we can recover if a word fails
            oldurl = self.browser.geturl()
            try:
                self.parse_word(word_url)
            except Exception as e:
                print("Error:", e)
                self.browser.open(oldurl)

        # be polite to the server
        time.sleep(10)

    def parse_letter(self, url):
        """ Downloads all words from given letter page.
        """

        print('Parsing:', url)

        page = self.browser.open(url).read()
        page = page.split('</a></p></td></tr></table>')[0]
        # fixed: close the debug dump instead of leaking the file handle
        with open('tmp.html', 'wb') as dump:
            dump.write(page)
        try:
            # the last link before the cut-off is the page count
            pages_count = int(page.split('\">')[-1])
        except ValueError:
            pages_count = 1

        for i in range(pages_count):
            self.parse_page(url + str(i + 1) + '/')

        time.sleep(60)

    def parse(self, url, skip):
        """ Downloads all words from given url, skipping the first *skip*
        letters (useful for resuming an interrupted run).
        """

        page = self.browser.open(url).read()
        page = page.split('bgcolor="#FFD780" colspan="2">')[1]
        page = page.split('</td></tr><tr>', 1)[0]

        for i, a in enumerate(page.split(' | ')):
            if i < skip:
                continue
            letter_url = a.split('\"')[1]
            self.words.write('#LETTER ({1}):{0}\n'.format(letter_url, i))
            self.parse_letter(url + letter_url)
			flag = 1
	else:
		if '</SELECT>' not in line:
			x = line.split('"')
			try:
				classtype.write(x[1])
				classtype.write('\n')
				class_type.append(x[1])
			except:
				break

#make sure the directory for this semester exists
if not os.path.exists(os.getcwd()+"/html/%s" % semester):
	os.makedirs(os.getcwd()+"/html/%s" % semester)

# For each subject code, submit the search form and save the resulting
# page under html/<semester>/<subject>.html.
# NOTE(review): 'browser', 'class_type' and 'semester' are set up earlier
# in the file.
for c in class_type:
	# throttle requests so we don't hammer the server
	time.sleep(5)
	f = ''.join(c)
	# strip '&' so the subject code yields a safe file name
	path=os.getcwd()+"/html/%s/%s.html" % (semester, f.replace('&', ''))
	temp = open(path, "w")
	browser.select_form(nr=0)
	item = browser.find_control(id="subj_id").get("%s" % c)
	item.selected = True
	response = browser.submit()
	content = response.read()
	temp.write(content)
	print "Wrote %s" % path
	temp.close()
	time.sleep(5)
	# return to the search page for the next subject
	browser.back()
class RequestQuery:

    def __init__(self,config):
        # Browser for all Savannah HTML interactions (closed in __del__).
        self.br=Browser()

        self.config = config
        
        # Initialise connections to SiteDB, PhEDEx and the three physics
        # DBS instances used by store-results requests.
        self.mySiteDB = SiteDBJSON()
        self.phedex = PhEDEx({"endpoint":"https://cmsweb.cern.ch/phedex/datasvc/json/prod/"}, "json")
        self.dbsPhys01 = DbsApi(url = dbs_base_url+"phys01/DBSReader/")
        self.dbsPhys02 = DbsApi(url = dbs_base_url+"phys02/DBSReader/")
        self.dbsPhys03 = DbsApi(url = dbs_base_url+"phys03/DBSReader/")
        
    def __del__(self):
        # Release the mechanize browser's resources when this query object
        # is garbage-collected.
        self.br.close()

    def login2Savannah(self):
        """
        Log into Savannah with the username/password from the config.
        The account needs admin privileges for store-results requests.
        Returns True on success, False otherwise.
        """
        login_url = 'https://savannah.cern.ch/account/login.php?uri=%2F'
        tasks_url = 'https://savannah.cern.ch/task/?group=cms-storeresults'

        self.br.open(login_url)

        ## Form 0 is the site-wide 'Search' form;
        ## the login form is form 1.
        self.br.select_form(nr=1)

        user = self.config["SavannahUser"]
        self.br['form_loginname'] = user
        self.br['form_pw'] = self.config["SavannahPasswd"]
        self.br.submit()

        landing = self.br.open(tasks_url)

        # A successful login shows "Logged in as <user>" on the page.
        if re.search('Logged in as ' + user, landing.read()):
            return True
        print('login unsuccessful, please check your username and password')
        return False
    
    def selectQueryForm(self,**kargs):
        """
        Configure the Savannah query form ("Test" report, 150 rows per
        page) and apply any extra filters passed as keyword arguments,
        then submit it.

        Supported keyword filters (matched against option labels):
          approval_status -> "resolution_id" select control
          task_status     -> "status_id" select control
          team            -> "custom_sb5" select control
        Unknown keywords are ignored, as before.
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")

            def _select_by_label(control_name, label):
                # Pick the option whose label matches by assigning its value.
                # (Fixed duplication: this loop used to be copy-pasted once
                # per filter below.)
                control = self.br.find_control(control_name, type="select")
                for item in control.items:
                    if item.attrs['label'] == label:
                        control.value = [item.attrs['value']]

            ## Use right query form labelled Test
            _select_by_label("report_id", "Test")

            ##select number of entries displayed per page
            control = self.br.find_control("chunksz",type="text")
            control.value = "150"

            ##check additional searching parameters
            filter_controls = {"approval_status": "resolution_id",
                               "task_status": "status_id",
                               "team": "custom_sb5"}
            for arg in kargs:
                if arg in filter_controls:
                    _select_by_label(filter_controls[arg], kargs[arg].strip())

            response = self.br.submit()
            response.read()

        return

    def getScramArchByCMSSW(self):
        """
        Scrape the ReleasesXML feed and map each available CMSSW release
        label to the list of ScramArch names it is built for.
        Returns e.g. {'CMSSW_X_X_X': ['slc5_amd64_gcc472', ...], ...}
        """
        # One-off browser: this page lives outside the Savannah session.
        url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
        scraper = Browser()
        scraper.set_handle_robots(False)
        soup = BeautifulSoup(scraper.open(url).read())

        archByCmssw = {}
        for arch in soup.find_all('architecture'):
            # ScramArch name covering every release listed under it
            arch_name = arch.get('name').encode('ascii', 'ignore')
            for project in arch.find_all('project'):
                release_label = project.get('label').encode('ascii', 'ignore')
                archByCmssw.setdefault(release_label, []).append(arch_name)

        return archByCmssw
      
    def createValueDicts(self):       
        """
        Init dictionaries by value/label:
        - Releases by Value
        - Physics group by value
        - DBS url by value
        - DBS rul by label
        - Status of savannah request by value 
        - Status of savannah ticket by value (Open/Closed/Any)
        """      
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")
            
            # CMSSW release <select> ("custom_sb2"): value -> label
            control = self.br.find_control("custom_sb2",type="select")
            self.ReleaseByValueDict = self.getLabelByValueDict(control)

            # physics group <select> ("custom_sb3"): value -> label
            control = self.br.find_control("custom_sb3",type="select")
            self.GroupByValueDict = self.getLabelByValueDict(control)

            # DBS instance <select> ("custom_sb4"): mapped in both directions
            control = self.br.find_control("custom_sb4",type="select")
            self.DBSByValueDict = self.getLabelByValueDict(control)
            self.DBSByLabelDict = self.getValueByLabelDict(control)

            # approval status <select> ("resolution_id"): value -> label
            control = self.br.find_control("resolution_id",type="select")
            self.StatusByValueDict = self.getLabelByValueDict(control)

            # ticket status <select> ("status_id"): label -> value
            control = self.br.find_control("status_id",type="select")
            self.TicketStatusByLabelDict = self.getValueByLabelDict(control)

        return
    
    def getDatasetOriginSites(self, dbs_url, data):
        """
        Get the origin sites for each block of the dataset.
        Return a list block origin sites.
        """
        
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listBlocks(detail=True,dataset=data)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listBlocks(detail=True,dataset=data)

        pnnList = set()
        for block in response:
            pnnList.add(block['origin_site_name'])
        psnList = self.mySiteDB.PNNstoPSNs(pnnList)
        
        return psnList, list(pnnList)

    def phEDExNodetocmsName(self, nodeList):
        """
        Strip the PhEDEx suffixes (_MSS/_Disk/_Buffer/_Export) from each
        node name and return the de-duplicated CMS names, preserving the
        order of first appearance.
        """
        cms_names = []
        for node in nodeList:
            cms = node
            for suffix in ('_MSS', '_Disk', '_Buffer', '_Export'):
                cms = cms.replace(suffix, '')
            if cms not in cms_names:
                cms_names.append(cms)
        return cms_names
    
    def setGlobalTagFromOrigin(self, dbs_url,input_dataset):
        """
        Get the global tag of the dataset from the source dbs url. If it is not set, then set global tag to 'UNKNOWN'
        """
        
        globalTag = ""
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listOutputConfigs(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listOutputConfigs(dataset=input_dataset)
        
        globalTag = response[0]['global_tag']
        # GlobalTag cannot be empty
        if globalTag == '':
            globalTag = 'UNKNOWN'
            
        return globalTag
    
    def isDataAtUrl(self, dbs_url,input_dataset):
        """
        Returns True if the dataset is at the dbs url, if not returns False
        """
        local_dbs = dbs_url.split('/')[5]
        if local_dbs == 'phys01':
            response = self.dbsPhys01.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys02':
            response = self.dbsPhys02.listDatasets(dataset=input_dataset)
        elif local_dbs == 'phys03':
            response = self.dbsPhys03.listDatasets(dataset=input_dataset)
        # This means that the dataset is not at the url
        if not response:
            return False
        else:
            return True
         
    def getLabelByValueDict(self, control):
        """
        Build a value -> label dictionary from a <select> control's items.
        """
        return {opt.attrs['value']: opt.attrs['label']
                for opt in control.items}
    
    def getValueByLabelDict(self, control):
        """
        Build a label -> value dictionary from a <select> control's items.
        """
        return {opt.attrs['label']: opt.attrs['value']
                for opt in control.items}
    
    def getRequests(self,**kargs):
        """
        getRequests Actually goes through all the savannah requests and create json files if the 
        ticket is not Closed and the status of the item is Done.
        It also reports back the summary of the requests in savannah
        """
        requests = []
        
        # Open Browser and login into Savannah
        self.br=Browser()
        self.isLoggedIn = self.login2Savannah()
        
        if self.isLoggedIn:
            if not kargs:
                self.selectQueryForm(approval_status='1',task_status='0')
            else:
                self.selectQueryForm(**kargs)
            self.createValueDicts()
        
            self.br.select_form(name="bug_form")
            response = self.br.submit()

            html_ouput = response.read()
            
            scramArchByCMSSW = self.getScramArchByCMSSW()
            self.nodeMappings = self.phedex.getNodeMap()
            
            for link in self.br.links(text_regex="#[0-9]+"):
                response = self.br.follow_link(link)
                
                try:
                    ## Get Information
                    self.br.select_form(name="item_form")

                    ## remove leading &nbsp and # from task
                    task = link.text.replace('#','').decode('utf-8').strip()
                    print("Processing ticket: %s" % task)
                    
                    ## Get input dataset name
                    control = self.br.find_control("custom_tf1",type="text")
                    input_dataset = control.value
                    input_primary_dataset = input_dataset.split('/')[1].replace(' ','')
                    input_processed_dataset = input_dataset.split('/')[2].replace(' ','')
                    data_tier = input_dataset.split('/')[3].replace(' ','')
                    
                    ## Get DBS URL by Drop Down
                    control = self.br.find_control("custom_sb4",type="select")
                    dbs_url = self.DBSByValueDict[control.value[0]]

                    ## Get DBS URL by text field (for old entries)
                    if dbs_url=='None':
                        control = self.br.find_control("custom_tf4",type="text")
                        dbs_url = control.value.replace(' ','')
                    else: # Transform input value to a valid DBS url
                        #dbs_url = "https://cmsweb.cern.ch/dbs/prod/"+dbs_url+"/DBSReader"
                        dbs_url = dbs_base_url+dbs_url+"/DBSReader"
                        
                    ## Get Release
                    control = self.br.find_control("custom_sb2",type="select")
                    release_id = control.value
                    
                    ## Get current request status
                    control = self.br.find_control("status_id",type="select")
                    request_status_id = control.value
                    RequestStatusByValueDict = self.getLabelByValueDict(control)
                    
                    # close the request if deprecated release was used
                    try:
                        release = self.ReleaseByValueDict[release_id[0]]
                    except:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid anymore, since the given CMSSW release is deprecated. If your request should be still processed, please reopen the request and update the CMSSW release to a more recent *working* release.\n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to CMSSW not valid" % task)
                            continue
                    
                    # close the request if release has not ScramArch match
                    if release not in scramArchByCMSSW:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid, there is no ScramArch match for the given CMSSW release.\n"
                            msg+= "If your request should be still processed, please reopen the request and update the CMSSW release according to: https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML \n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
                            self.br.back()
                            print("I tried to Close ticket %s due to ScramArch mismatch" % task)
                            continue
                    else: 
                        index=len(scramArchByCMSSW[release])
                        scram_arch = scramArchByCMSSW[release][index-1]

                    # close the request if dataset is not at dbs url
                    try:
                        data_at_url = self.isDataAtUrl(dbs_url,input_dataset)
                    except:
                        print('I got an error trying to look for dataset %s at %s, please look at this ticket: %s' %(input_dataset,dbs_url,task))
                        continue
                    if not data_at_url:
                        msg = "Your request is not valid, I could not find the given dataset at %s\n" % dbs_url
                        msg+= "If your request should be still processed, please reopen the request and change DBS url properly \n"
                        msg+= "\n"
                        msg+= "Thanks,\n"
                        msg+= "Your StoreResults team"
                        self.closeRequest(task,msg)
                        self.br.back()
                        print("I tried to Close ticket %s, dataset is not at DBS url" % task)
                        continue
                        
                    # Avoid not approved Tickets
                    #if not RequestStatusByValueDict[request_status_id[0]] == "Done":
                    #    continue

                    ## Get Physics Group
                    control = self.br.find_control("custom_sb3",type="select")
                    group_id = control.value[0]
                    group_squad = 'cms-storeresults-'+self.GroupByValueDict[group_id].replace("-","_").lower()

                    ## Get Dataset Version
                    control = self.br.find_control("custom_tf3",type="text")
                    dataset_version = control.value.replace(' ','')
                    if dataset_version == "": dataset_version = '1'
                                        
                    ## Get current status
                    control = self.br.find_control("resolution_id",type="select")
                    status_id = control.value

                    ## Get assigned to
                    control = self.br.find_control("assigned_to",type="select")
                    AssignedToByValueDict = self.getLabelByValueDict(control)
                    assignedTo_id = control.value

                    ##Assign task to the physics group squad
                    if AssignedToByValueDict[assignedTo_id[0]]!=group_squad:
                        assignedTo_id = [self.getValueByLabelDict(control)[group_squad]]
                        control.value = assignedTo_id
                        self.br.submit()

                    # Set default Adquisition Era for StoreResults 
                    acquisitionEra = "StoreResults"

                    ## Construction of the new dataset name (ProcessingString)
                    ## remove leading hypernews or physics group name and StoreResults+Version
                    if input_processed_dataset.find(self.GroupByValueDict[group_id])==0:
                        new_dataset = input_processed_dataset.replace(self.GroupByValueDict[group_id],"",1)
                    else:
                        stripped_dataset = input_processed_dataset.split("-")[1:]
                        new_dataset = '_'.join(stripped_dataset)
                    
                except Exception as ex:
                    self.br.back()
                    print("There is a problem with this ticket %s, please have a look to the error:" % task)
                    print(str(ex))
                    print(traceback.format_exc())
                    continue
                
                self.br.back()
                
                # Get dataset site info:
                psnList, pnnList = self.getDatasetOriginSites(dbs_url,input_dataset)
                
                infoDict = {}
                # Build store results json
                # First add all the defaults values
                infoDict["RequestType"] = "StoreResults"
                infoDict["UnmergedLFNBase"] = "/store/unmerged" 
                infoDict["MergedLFNBase"] = "/store/results/" + self.GroupByValueDict[group_id].replace("-","_").lower()
                infoDict["MinMergeSize"] = 1500000000
                infoDict["MaxMergeSize"] = 5000000000
                infoDict["MaxMergeEvents"] = 100000
                infoDict["TimePerEvent"] = 40
                infoDict["SizePerEvent"] = 512.0
                infoDict["Memory"] = 2394
                infoDict["CmsPath"] = "/uscmst1/prod/sw/cms"                                        
                infoDict["Group"] = "DATAOPS"
                infoDict["DbsUrl"] = dbs_url
                
                # Add all the information pulled from Savannah
                infoDict["AcquisitionEra"] = acquisitionEra
                infoDict["GlobalTag"] = self.setGlobalTagFromOrigin(dbs_url,input_dataset)
                infoDict["DataTier"] = data_tier
                infoDict["InputDataset"] = input_dataset
                infoDict["ProcessingString"] = new_dataset
                infoDict["CMSSWVersion"] = release
                infoDict["ScramArch"] = scram_arch
                infoDict["ProcessingVersion"] = dataset_version                    
                infoDict["SiteWhitelist"] = psnList
                
                # Create report for Migration2Global
                report = {}
                 
                #Fill json file, if status is done
                if self.StatusByValueDict[status_id[0]]=='Done' and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                    self.writeJSONFile(task, infoDict)
                    report["json"] = 'y'
                else:
                    report["json"] = 'n'
                    
                report["task"] = int(task)
                report["InputDataset"] = input_dataset
                report["ProcessingString"] = new_dataset
                report["ticketStatus"] = self.StatusByValueDict[status_id[0]]
                report["assignedTo"] = AssignedToByValueDict[assignedTo_id[0]]
                report["localUrl"] = dbs_url
                report["sites"] = psnList
                report["pnns"] = pnnList

                # if the request is closed, change the item status to report to Closed
                if report["ticketStatus"] == "Done" and RequestStatusByValueDict[request_status_id[0]] == "Closed":
                    report["ticketStatus"] = "Closed"

                requests.append(report)
                    
            # Print out report
            self.printReport(requests)
        # Close connections
        self.br.close()
        
        return requests

    def closeRequest(self,task,msg):
        """
        Close a specific Savannah ticket and record *msg* as the reason.

        Loads the ticket form, switches the status control to the value
        mapped to "Closed", writes *msg* into the comment field, makes the
        mandatory DBS drop-down valid, submits, and finally removes any
        JSON file previously written for the ticket.
        """
        if not self.isLoggedIn:
            return

        # Load the ticket page and consume the response body.
        page = self.br.open('https://savannah.cern.ch/task/?'+str(task))
        page.read()

        self.br.select_form(name="item_form")

        # Flip the ticket status to "Closed".
        status_ctl = self.br.find_control("status_id",type="select")
        status_ctl.value = [self.TicketStatusByLabelDict["Closed"]]

        # Record the closing reason in the comment field.
        comment_ctl = self.br.find_control("comment",type="textarea")
        comment_ctl.value = msg

        # The DBS drop-down is mandatory; for old requests it may still be
        # 'None', in which case the ticket cannot be closed — fix it first.
        self.setDBSDropDown()

        self.br.submit()

        # Drop the JSON ticket file, if one was created earlier.
        self.removeJSONFile(task)

        self.br.back()
        return

    def setDBSDropDown(self):
        """
        Ensure the mandatory DBS drop-down ("custom_sb4") holds a value.

        For old tickets the drop-down may read 'None'; then the DBS
        instance is inferred from the free-text URL field ("custom_tf4")
        and the drop-down is set accordingly.

        Raises:
            RuntimeError: if the free-text URL matches none of phys01,
                phys02 or phys03.
        """
        dropdown = self.br.find_control("custom_sb4",type="select")
        selected = self.DBSByValueDict[dropdown.value[0]]

        # Drop-down already carries a real value: nothing to fix.
        if selected != 'None':
            return

        # Old entries keep the DBS URL in a free-text field instead.
        url = self.br.find_control("custom_tf4",type="text").value.replace(' ','')

        # Probe the known instances in the same order as before.
        for instance in ("phys01", "phys02", "phys03"):
            if instance in url:
                dropdown.value = [self.DBSByLabelDict[instance]]
                return

        msg = 'DBS URL of the old request is neither phys01, phys02 nor phys03. Please, check!'
        print(msg)
        raise RuntimeError(msg)

    def writeJSONFile(self, task, infoDict):
        """
        Write a JSON file for *task* under ComponentDir.

        The file is named ``Ticket_<task>.json`` and contains *infoDict*
        wrapped in a ``createRequest`` envelope.  Nothing is written when
        the file already exists (existing tickets are never overwritten).

        Args:
            task: Savannah ticket number, used to build the file name.
            infoDict: request parameters to serialize.
        """
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.path.exists(filename):
            request = {'createRequest':infoDict} ## CHECK THIS BEFORE FINISHING
            # 'with' guarantees the handle is closed; the original called
            # 'jsonfile.close' without parentheses, leaking the file handle.
            with open(filename,'w') as jsonfile:
                jsonfile.write(json.dumps(request,sort_keys=True, indent=4))

        return

    def removeJSONFile(self,task):
        """
        Delete the JSON file previously written for *task*, if present.

        Silently does nothing when no such file exists.
        """
        target = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        # Only attempt removal when the file is actually there.
        if os.path.exists(target):
            os.remove(target)

        return

    def printReport(self, requests):
        """
        Print a fixed-width summary table of the processed requests.

        One row per report dict produced by getRequests: ticket number,
        status, whether a JSON file was written, assignee, local DBS
        instance (5th path segment of the URL), sites and pnns.
        """
        fmt = "%20s %10s %5s %35s %10s %50s %50s"
        print(fmt % ('Savannah Ticket','Status','json','Assigned to','local DBS','Sites','pnns'))
        print(fmt % ('-'*20,'-'*10,'-'*5,'-'*35,'-'*10,'-'*50,'-'*50))

        for entry in requests:
            row = (entry["task"],
                   entry["ticketStatus"],
                   entry["json"],
                   entry["assignedTo"],
                   entry["localUrl"].split('/')[5],
                   ', '.join(entry["sites"]),
                   ', '.join(entry["pnns"]))
            print(fmt % row)
    #print "rad 159"
    #print response10.read()
    #print list(br.links())
    # Parse the report page and pull the company name from the title cell.
    root = lxml.html.fromstring(response10.read())
    namn = root.cssselect("td.reportTitle h1")[0] 
    namn = namn.text_content()
    #print type(namn)
    namn= namn.encode('utf-8') # 'namn' is an lxml ElementUnicodeResult; this converts it to a plain (UTF-8 byte) string
    #print type(namn)
    print namn


    # The figure of interest sits in the pink table rows.
    # NOTE(review): the hard-coded index 14 assumes a fixed table layout — verify.
    oms = root.cssselect("tr.bgLightPink td")
    print oms[14].text_content()

    # Go back to the previous page and search for the company by name.
    br.back()

    br.select_form(name="f_search")
    br["what"]=namn
    response11 = br.submit()

    #print response11.read()

    root = lxml.html.fromstring(response11.read())
    print root.cssselect("td.text11grey6 span") # sometimes this goes wrong, i.e. the returned list is empty. Why? (22 May 2013)
    if len(root.cssselect("td.text11grey6 span"))>1:

        # 'verksamhet' = line of business; read from the tail text of the second span.
        verksamhet = root.cssselect("td.text11grey6 span")[1]
        verksamhet =  verksamhet.tail
    else:
        verksamhet = "Verksamhet ej funnen"
Exemple #16
0
     if notas != None:
         hayNota[k] = True  # we have a grade
         nota = mean(notas)
         j = -1
         # Map the average grade to one of three outcome buckets:
         # 0 = failed, 1 = second-chance exam possible, 2 = passed.
         if nota < R1[k]:
             log("No pasaste :(")
             j = 0
         elif nota < R2[k]:
             log("Se puede con examen de segunda")
             j = 1
         else:
             log("Pasaste!!!!")
             j = 2
         # Play the song configured for this course/outcome until a key is pressed.
         try:
             mixer.music.load(MUSICA[k][j])
         except PygameError:
             raise MusicFileError(
                 "No se encontro el archivo de musica ingresado, {}. Verifica que lo escribiste bien y que esta en este mismo directorio"
                 .format(MUSICA[k][j]))
         mixer.music.play(-1, START[k][j])
         r = raw_input('presiona cualquier tecla para parar ')
         mixer.music.stop()
     browser.back()
 # Throttle the "no grades yet" message to at most one per MENSAJES_CADA seconds.
 if t0 == None:
     log("Aun no hay notas")
     t0 = time.time()
 else:
     if time.time() - t0 >= MENSAJES_CADA:
         log("Aun no hay notas")
         t0 = time.time()
 time.sleep(10)
Exemple #17
0
class RequestQuery:
    """Query and manage CMS StoreResults tickets on the Savannah tracker.

    Drives a mechanize Browser through the Savannah web forms: logs in,
    runs the query form labelled "Test", scrapes each matching ticket,
    reassigns it to the corresponding physics-group squad, writes a JSON
    summary per ticket and can close invalid tickets.

    NOTE(review): this class uses Python 2-only constructs
    (``raise RuntimeError, msg`` and ``str.decode``); it will not run
    under Python 3 as written.
    """

    def __init__(self,config):
        """Create the browser and log in to Savannah.

        Args:
            config: mapping with at least the keys ``SavannahUser``,
                ``SavannahPasswd`` and ``ComponentDir``.
        """
        self.br=Browser()

        self.config = config

        # Login is attempted immediately; the other methods are no-ops
        # unless this succeeded.
        self.isLoggedIn = self.login2Savannah()

    def __del__(self):
        # Close the browser when the object is garbage-collected.
        # NOTE(review): relying on __del__ for cleanup is fragile — an
        # explicit close method would be safer.
        self.br.close()

    def closeRequest(self,task,msg):
        """Close Savannah ticket *task*, inserting *msg* as a comment."""
        if self.isLoggedIn:
            self.createValueDicts()
            
            response = self.br.open('https://savannah.cern.ch/task/?'+str(task))

            html = response.read()

            self.br.select_form(name="item_form")

            # Switch the ticket status control to the value mapped to "Closed".
            control = self.br.find_control("status_id",type="select")
            control.value = [self.TicketStatusByLabelDict["Closed"]]

            #Put reason to the comment field
            control = self.br.find_control("comment",type="textarea")
            control.value = msg
                        
            #DBS Drop Down is a mandatory field, if set to None (for old requests), it is not possible to close the request
            self.setDBSDropDown()
                        
            self.br.submit()

            #remove JSON ticket
            self.removeJSONFile(task)
            
        return
                
    def createValueDicts(self):       
        """Build the value<->label lookup dicts from the query form controls.

        Populates ReleaseByValueDict, GroupByValueDict, DBSByValueDict,
        DBSByLabelDict, StatusByValueDict and TicketStatusByLabelDict.
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")
            
            control = self.br.find_control("custom_sb2",type="select")
            self.ReleaseByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb3",type="select")
            self.GroupByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("custom_sb4",type="select")
            self.DBSByValueDict = self.getLabelByValueDict(control)
            self.DBSByLabelDict = self.getValueByLabelDict(control)

            control = self.br.find_control("resolution_id",type="select")
            self.StatusByValueDict = self.getLabelByValueDict(control)

            control = self.br.find_control("status_id",type="select")
            self.TicketStatusByLabelDict = self.getValueByLabelDict(control)

        return

    def setDBSDropDown(self):
        """Ensure the mandatory DBS drop-down holds a valid value.

        For old entries where the drop-down reads 'None', the DBS instance
        is inferred from the free-text URL field and the drop-down is set
        accordingly; otherwise a RuntimeError is raised.
        """
        ## Get DBS URL by Drop Down
        control = self.br.find_control("custom_sb4",type="select")
        dbs_url = self.DBSByValueDict[control.value[0]]

        ## Get DBS URL by text field (for old entries)
        if dbs_url=='None':
            tmp = self.br.find_control("custom_tf4",type="text")
            dbs_url = tmp.value.replace(' ','')

            if dbs_url.find("analysis_02")!=-1:
                control.value = [self.DBSByLabelDict["cms_dbs_ph_analysis_02"]]
            elif dbs_url.find("analysis_01")!=-1:
                control.value = [self.DBSByLabelDict["cms_dbs_ph_analysis_01"]]
            elif dbs_url.find("local_09")!=-1:
                control.value = [self.DBSByLabelDict["cms_dbs_ph_prod_local_09"]]
            else:
                msg = 'DBS URL of the old request is neither analysis_01, analysis_02 nor local_09. Please, check!'
                logging.error(msg)
                # NOTE(review): Python 2-only raise syntax.
                raise RuntimeError, msg

        return
                
    def getLabelByValueDict(self, control):
        """Return a dict mapping option value -> option label for *control*."""
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[value] = label
                
        return d

    def getRequests(self,**kargs):
        """Scrape all tickets matching the query and return a list of dicts.

        Keyword args are forwarded to selectQueryForm (approval_status,
        task_status, team).  For each ticket link the form fields are read,
        the ticket may be reassigned or closed, and a JSON file is written
        for tickets whose status is 'Done'.

        Returns:
            list of per-ticket info dicts.
        """
        requests = []
        
        if self.isLoggedIn:
            self.selectQueryForm(**kargs)
            self.createValueDicts()
        
            self.br.select_form(name="bug_form")
            response = self.br.submit()

            # NOTE(review): 'html_ouput' (sic) is never used; the read()
            # only consumes the response body.
            html_ouput = response.read()

            # Ticket links look like "#12345".
            for link in self.br.links(text_regex="#[0-9]+"):
                    response = self.br.follow_link(link)
    
                    ## Get Information
                    self.br.select_form(name="item_form")

                    ## Get input dataset name
                    control = self.br.find_control("custom_tf1",type="text")
                    input_dataset = control.value.split('/')
                    input_primary_dataset = input_dataset[1].replace(' ','')
                    input_processed_dataset = input_dataset[2].replace(' ','')
                    
                    ## Get DBS URL by Drop Down
                    control = self.br.find_control("custom_sb4",type="select")
                    dbs_url = self.DBSByValueDict[control.value[0]]

                    ## Get DBS URL by text field (for old entries)
                    if dbs_url=='None':
                        control = self.br.find_control("custom_tf4",type="text")
                        dbs_url = control.value.replace(' ','')
                    else: # Transform input value to a valid DBS url
                        dbs_url = "http://cmsdbsprod.cern.ch/"+dbs_url+"/servlet/DBSServlet"
                        
                    ## Get Release
                    control = self.br.find_control("custom_sb2",type="select")
                    release_id = control.value

                    ## Get Physics Group
                    control = self.br.find_control("custom_sb3",type="select")
                    group_id = control.value[0]
                    group_squad = 'cms-storeresults-'+self.GroupByValueDict[group_id].replace("-","_").lower()

                    ## Get Dataset Version
                    control = self.br.find_control("custom_tf3",type="text")
                    dataset_version = control.value.replace(' ','')
                                        
                    ## Get current status
                    control = self.br.find_control("resolution_id",type="select")
                    status_id = control.value
                
                    ## Get current request status
                    control = self.br.find_control("status_id",type="select")
                    request_status_id = control.value
                    RequestStatusByValueDict = self.getLabelByValueDict(control)

                    ## Get assigned to
                    control = self.br.find_control("assigned_to",type="select")
                    AssignedToByValueDict = self.getLabelByValueDict(control)
                    assignedTo_id = control.value

                    ##Assign task to the physics group squad
                    if AssignedToByValueDict[assignedTo_id[0]]!=group_squad:
                        control.value = [self.getValueByLabelDict(control)[group_squad]]
                        self.br.submit()

                    ## Construction of the new dataset name
                    ## remove leading hypernews or physics group name and StoreResults+Version

                    if len(dataset_version)>0:
                        dataset_prefix = "StoreResults-"+dataset_version
                    else:
                        dataset_prefix = "StoreResults"
                    
                    if input_processed_dataset.find(self.GroupByValueDict[group_id])==0:
                        new_dataset = input_processed_dataset.replace(self.GroupByValueDict[group_id],dataset_prefix,1)
                    else:
                        stripped_dataset = input_processed_dataset.split("-")[1:]
                        new_dataset = dataset_prefix+'-'+'-'.join(stripped_dataset)
                
                    self.br.back()

                    ## remove leading &nbsp and # from task
                    # NOTE(review): str.decode — Python 2 only.
                    task = link.text.replace('#','').decode('utf-8').strip()

                    infoDict = {}
                
                    infoDict["primaryDataset"] = input_primary_dataset
                    infoDict["processedDataset"] = input_processed_dataset
                    infoDict["outputDataset"] = new_dataset
                    infoDict["physicsGroup"] = self.GroupByValueDict[group_id]
                    infoDict["inputDBSURL"] = dbs_url

                    # close the request if deprecated release was used
                    # NOTE(review): bare 'except:' hides unrelated errors; the
                    # intended case is a KeyError on a deprecated release id.
                    try:
                        infoDict["cmsswRelease"] = self.ReleaseByValueDict[release_id[0]]
                    except:
                        if len(self.ReleaseByValueDict)>0 and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                            msg = "Your request is not valid anymore, since the given CMSSW release is deprecated. If your request should be still processed, please reopen the request and update the CMSSW release to a more recent *working* release.\n"
                            msg+= "\n"
                            msg+= "Thanks,\n"
                            msg+= "Your StoreResults team"
                            self.closeRequest(task,msg)
            
                    
                    #Fill json file, if status is done
                    if self.StatusByValueDict[status_id[0]]=='Done' and RequestStatusByValueDict[request_status_id[0]] != "Closed":
                        self.writeJSONFile(task, infoDict)

                    infoDict["task"] = int(task)
                    infoDict["ticketStatus"] = self.StatusByValueDict[status_id[0]]
                    infoDict["assignedTo"] = AssignedToByValueDict[assignedTo_id[0]]

                    # Report closed tickets as "Closed" rather than "Done".
                    if infoDict["ticketStatus"] == "Done" and RequestStatusByValueDict[request_status_id[0]] == "Closed":
                        infoDict["ticketStatus"] = "Closed"

                    requests.append(infoDict)
                    
        return requests

    def getValueByLabelDict(self, control):
        """Return a dict mapping option label -> option value for *control*."""
        d = {}
        for item in control.items:
            value = item.attrs['value']
            label = item.attrs['label']
            d[label] = value

        return d

    def login2Savannah(self):
        """Log in to Savannah with the configured credentials.

        Returns:
            True when the post-login page shows "Logged in as <user>",
            False otherwise.
        """
        login_page='https://savannah.cern.ch/account/login.php?uri=%2F'
        savannah_page='https://savannah.cern.ch/task/?group=cms-storeresults'
        
        self.br.open(login_page)

        ## 'Search' form is form 0
        ## login form is form 1
        self.br.select_form(nr=1)

        username = self.config["SavannahUser"]
    
        self.br['form_loginname']=username
        self.br['form_pw']=self.config["SavannahPasswd"]
        
        self.br.submit()
        
        response = self.br.open(savannah_page)
        
        # Check to see if login was successful
        if not re.search('Logged in as ' + username, response.read()):
            logging.error('login unsuccessful, please check your username and password')
            return False
        else:
            return True

    def selectQueryForm(self,**kargs):       
        """Select and submit the query form labelled "Test".

        Sets 150 entries per page and applies optional filters given as
        keyword args: approval_status, task_status, team.
        """
        if self.isLoggedIn:
            self.br.select_form(name="bug_form")

            ## Use right query form labelled Test
            control = self.br.find_control("report_id",type="select")

            for item in control.items:
                if item.attrs['label'] == "Test":
                    control.value = [item.attrs['value']]
                    
            ##select number of entries displayed per page
            control = self.br.find_control("chunksz",type="text")
            control.value = "150"

            ##check additional searching parameter
            for arg in kargs:
                if arg == "approval_status":
                    control = self.br.find_control("resolution_id",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

                elif arg == "task_status":
                    control = self.br.find_control("status_id",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]
                            
                elif arg == "team":
                    control = self.br.find_control("custom_sb5",type="select")
                    for item in control.items:
                        if item.attrs['label'] == kargs[arg].strip():
                            control.value = [item.attrs['value']]

            response = self.br.submit()
            # Consume the response body; the result page is re-read later
            # through the browser's link list.
            response.read()

        return

    def removeJSONFile(self,task):
        """Delete the JSON file written for *task*, if it exists."""
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'

        if os.access(filename,os.F_OK):
            os.remove(filename)

        return
            
    def writeJSONFile(self, task, infoDict):
        """Write *infoDict* as Ticket_<task>.json unless the file exists."""
        ##check if file already exists
        filename = self.config["ComponentDir"]+'/Ticket_'+str(task)+'.json'
        if not os.access(filename,os.F_OK):
            jsonfile = open(filename,'w')
            jsonfile.write(json.dumps(infoDict,sort_keys=True, indent=4))
            # NOTE(review): missing parentheses — 'close' is referenced but
            # never called, so the handle is only closed by the GC.
            jsonfile.close

        return
# login successful, home page redirect
print "\n***", rsp.geturl()
print "Logged in properly on home page; click Account link"
# sanity-check: we were redirected to the home page and are authenticated
assert rsp.geturl() == "http://us.pycon.org/2011/home/", rsp.geturl()
page = rsp.read()
assert "Logout" in page, "Logout not in page"
rsp = br.follow_link(text_regex="Account")

# account page
print "\n***", rsp.geturl()
print "Email address parseable on Account page; go back"
assert rsp.geturl() == "http://us.pycon.org/2011/account/email/", rsp.geturl()
page = rsp.read()
assert "Email Addresses" in page, "Missing email addresses"
# scrape the primary e-mail from the first bold cell of the first table row
print "    Primary e-mail: %r" % str(BS(page).find("table").find("tr").find("td").find("b").string)
rsp = br.back()

# back to home page
print "\n***", rsp.geturl()
print "Back works, on home page again; click Logout link"
assert rsp.geturl() == "http://us.pycon.org/2011/home/", rsp.geturl()
rsp = br.follow_link(url_regex="logout")

# logout page
print "\n***", rsp.geturl()
print "Confirm on Logout page and Log in link at the top"
assert rsp.geturl() == "http://us.pycon.org/2011/account/logout/", rsp.geturl()
page = rsp.read()
assert "Log in" in page, "Log in not in page"
print "\n*** DONE"
Exemple #19
0
class RegPublDownloader(LegalSource.Downloader):
    """Downloader for documents published on regeringen.se.

    Lists documents of a given type through the site's browse/search
    forms and mirrors both the index pages and the linked PDF files
    under ``<baseDir>/regpubl/downloaded``.
    """
    
    def __init__(self,baseDir="data"):
        """Prepare the download directory, config file and browser.

        Args:
            baseDir: root data directory (default "data").
        """
        self.dir = baseDir + "/regpubl/downloaded"
        if not os.path.exists(self.dir):
            Util.mkdir(self.dir)
        self.config = ConfigObj("%s/%s.ini" % (self.dir, __moduledir__))

        # Why does this say "super() argument 1 must be type, not classobj"
        # super(RegPublDownloader,self).__init__()
        # NOTE(review): that error suggests LegalSource.Downloader is a
        # Python 2 old-style class, which super() cannot handle.
        self.browser = Browser()
    
    def DownloadAll(self):
        """Download every listed document of the hard-coded doctype.

        Walks all result pages, downloading each document, then records
        today's date as 'last_update' in the config file.
        """
        # we use mechanize instead of our own Robot class to list
        # available documents since we can't get the POST/cookie based
        # search to work.
        doctype = '160'
        log.info(u'Selecting documents of type %s' % doctype)
        self.browser.open("http://www.regeringen.se/sb/d/108/action/browse/c/%s" % doctype)
        log.info(u'Posting search form')
        self.browser.select_form(nr=1)
        self.browser.submit()

        pagecnt = 1
        done = False
        while not done:
            log.info(u'Result page #%s' % pagecnt)
            for l in self.browser.links(url_regex=r'/sb/d/108/a/\d+'):
                self._downloadSingle(l.absolute_url)
                self.browser.back()
            # Follow the "Next page" ('N\xe4sta sida') link until it is gone.
            try:
                self.browser.find_link(text='N\xe4sta sida')
                self.browser.follow_link(text='N\xe4sta sida')
            except LinkNotFoundError:
                log.info(u'No next page link found, this was the last page')
                done = True
            pagecnt += 1
        self.config['last_update'] = datetime.date.today()    
        self.config.write()
        
    def DownloadNew(self):
        """Download only documents added since the last recorded run.

        NOTE(review): ``timedelta(-367)`` produces a date in the *future*,
        and the ``> 365`` branch is unreachable because any such value
        already matches ``> 30`` — both look like bugs; confirm intent.
        """
        if 'last_update' in self.config:
            then = datetime.datetime.strptime(self.config['last_update'], '%Y-%m-%d')
        else:
            # assume last update was more than a year ago
            then = datetime.datetime.now() - datetime.timedelta(-367)
        
        now =  datetime.datetime.now()
        if (now - then).days > 30:
            pass
            # post a "last 30 days" query
        elif (now - then).days > 365:
            pass
            # post a "last 12 months" query
        else:
            # post a full query
            self.DownloadAll()        
        
    def _downloadSingle(self,url):
        """Mirror one document index page plus its linked PDF files.

        Args:
            url: document page URL of the form
                http://www.regeringen.se/sb/d/108/a/<docid>
        """
        docid = re.match(r'http://www.regeringen.se/sb/d/108/a/(\d+)', url).group(1)

        fname = "%s/%s/index.html" % (self.dir, docid)
        log.info(u'    Loading docidx %s' % url)
        self.browser.open(url)
        if not os.path.exists(fname):
            Util.ensureDir(fname)
            self.browser.retrieve(url,fname)
        
        for l in self.browser.links(url_regex=r'/download/(\w+\.pdf).*'):
            filename = re.match(r'http://www.regeringen.se/download/(\w+\.pdf).*',l.absolute_url).group(1)
            # note; the url goes to a redirect script; however that
            # part of the URL tree (/download/*) is off-limits for
            # robots. But we can figure out the actual URL anyway!
            if len(docid) > 4:
                path = "c6/%02d/%s/%s" % (int(docid[:-4]),docid[-4:-2],docid[-2:])
            else:
                path = "c4/%02d/%s" % (int(docid[:-2]),docid[-2:])
            fileurl = "http://regeringen.se/content/1/%s/%s" % (path,filename)
            
            df = "%s/%s/%s" % (self.dir,docid, filename)
            if not os.path.exists(df):
                log.info(u'        Downloading %s' % (fileurl))
                self.browser.retrieve(fileurl, df)
            else:
                log.info(u'        Already downloaded %s' % (fileurl))
class Session(object):
    """A configured mechanize browser session for the Casesearch site."""

    def __init__(self):
        """Build and configure the underlying mechanize browser.

        Attributes:
            browser (`mechanize._mechanize.Browser`): browser object in session
        """
        browser = Browser()

        # keep cookies across requests
        browser.set_cookiejar(cookielib.LWPCookieJar())

        # standard browser behaviour
        browser.set_handle_equiv(True)
        browser.set_handle_gzip(True)
        browser.set_handle_redirect(True)
        browser.set_handle_referer(True)
        browser.set_handle_robots(False)

        # follow refresh 0 without hanging on refresh > 0
        browser.set_handle_refresh(_http.HTTPRefreshProcessor(),
                                   max_time=1)

        # spoof a user agent
        browser.addheaders = [("User-agent", HEADER)]

        self.browser = browser

    def close(self):
        """Close the current browser session.

        Returns:
            None
        """
        self.browser.close()

    def case_id_form(self, case):
        """Submit `case` through the case-number search form.

        Args:
            case (`str`): case ID to be scraped

        Returns:
            response (`str`): HTML response when the case looks like a
            foreclosure, otherwise an empty string
        """
        # pick the first form named "inquiryFormByCaseNum", if any
        target = next(
            (f for f in self.browser.forms()
             if f.attrs["name"] == "inquiryFormByCaseNum"), None)
        if target is not None:
            self.browser.form = target

        # fill in the case ID and submit
        self.browser.form["caseId"] = case
        self.browser.submit()
        response = self.browser.response().read()

        self.browser.back()

        is_foreclosure = any(
            case_type in response.upper()
            for case_type in ("FORECLOSURE",
                              "FORECLOSURE RIGHTS OF REDEMPTION"))
        return response if is_foreclosure else ''

    def disclaimer_form(self):
        """Accept the disclaimer and continue to the case-search page.

        Returns:
            None
        """
        self.browser.open(URL)

        # the disclaimer page holds a single form
        self.browser.select_form(nr=0)

        # tick the acceptance checkbox and submit
        self.browser.form["disclaimer"] = ['Y']
        self.browser.submit()

    @staticmethod
    def server_running():
        """Check the status of the Casesearch servers.

        Returns:
            `True` if server is up, `False` otherwise
        """
        return urlopen(URL).getcode() == 200
            
        # NOTE(review): this fragment is the tail of a scraping loop whose
        # header lies before this chunk; `fonts`, `table2`, `mech`, `f`,
        # `count`, `sleep` and the warrant fields (`warrant_number`, `rest`,
        # `last`, ...) are all bound earlier.
        moredeets = fonts[5].findAll('b')
        warranttype = moredeets[1].get_text(strip=True)
        court = moredeets[2].get_text(strip=True)

        agency = fonts[6].findAll('b')[0].get_text(strip=True)

        due = fonts[7].findAll('b')[0].get_text(strip=True)

        charges = []

        # one charge per table row; [1:] skips the header row
        for row in table2.findAll('tr')[1:]:
            col = row.findAll('td')
            crime = col[0].get_text(strip=True)
            charges.append(crime)

        problems = ' and '.join(charges)

        # pipe-delimited record; the trailing "\n" terminates the line on write
        fullrecord = (warrant_number, rest, last, dob, eyes, hair, race, sex, height, weight, address, apt, city, state, issued, status, warranttype, court, agency, due, problems, "\n")
        print rest.upper() + " " + last.upper()

        f.write("|".join(fullrecord))
        count = count + 1

        # navigate back to the results page and throttle before the next hit
        mech.back()
        sleep(1)

f.flush()
f.close()
Exemple #22
0
# NOTE(review): continuation of a PyCon 2011 login walkthrough; `br`
# (mechanize Browser), `rsp` (previous response) and `BS` (BeautifulSoup)
# are bound earlier, outside this chunk.  Each section asserts the URL and
# page contents before following the next link.
print '\n***', rsp.geturl()
print 'Logged in properly on home page; click Account link'
assert rsp.geturl() == 'http://us.pycon.org/2011/home/', rsp.geturl()
page = rsp.read()
assert 'Logout' in page, 'Logout not in page'
rsp = br.follow_link(text_regex='Account')

# account page
print '\n***', rsp.geturl()
print 'Email address parseable on Account page; go back'
assert rsp.geturl() == 'http://us.pycon.org/2011/account/email/', rsp.geturl()
page = rsp.read()
assert 'Email Addresses' in page, 'Missing email addresses'
# first <b> in the first table row holds the primary address
print '    Primary e-mail: %r' % str(
    BS(page).find('table').find('tr').find('td').find('b').string)
rsp = br.back()

# back to home page
print '\n***', rsp.geturl()
print 'Back works, on home page again; click Logout link'
assert rsp.geturl() == 'http://us.pycon.org/2011/home/', rsp.geturl()
rsp = br.follow_link(url_regex='logout')

# logout page
print '\n***', rsp.geturl()
print 'Confirm on Logout page and Log in link at the top'
assert rsp.geturl() == 'http://us.pycon.org/2011/account/logout/', rsp.geturl()
page = rsp.read()
assert 'Log in' in page, 'Log in not in page'
print '\n*** DONE'
Exemple #23
0
def main(ra, senha, arquivos):

    if not ra:
        ra = raw_input("Digite seu RA: ")

    if not senha:
        senha = getpass.getpass("Senha: ")

    br = Browser()
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)

    br.addheaders = [('User-agent',
        'Mozilla/5.0 (X11; Linux x86_64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1')]

    #link = 'https://progradweb.ufscar.br/progradweb/servlet/Superior'
    link = 'http://progradweb.ufscar.br:8080/progradweb/servlet/Superior'
    br.open(link)
    br.select_form(name="fSuperior")
    br.form["Usuario"] = ra
    br.form["sess"] = senha
    br.submit()

    br.select_form(name="fPrincipal")
    resp = br.submit()
    #Corrige nested FORMs
    soup = BeautifulSoup(resp.get_data())
    resp.set_data(soup.prettify())
    br.set_response(resp)

    br.select_form(name="fHistorico")
    pagina = br.submit()

    data = pagina.get_data()

    # Possui mais de 1 enfase?
    if data.find("Clique em uma das &ecirc;nfases abaixo para ver o") != -1:
        links = list(br.links(url_regex=re.compile(r"^javascript:submita")))

        print 'Enfases:'
        for index, link in enumerate(links, start=1):
            print '({}) - {}'.format(index, link.text)

        n = int(raw_input("Digite o numero da enfase: "))

        pattern = re.compile(r'''
        javascript:submita\(\'
        (\d*)\',%20\'
        (\d*)\',%20\'
        (\d)
        ''', re.VERBOSE)

        enfase, ano, semestre = pattern.search(links[n - 1].url).groups()

        br.back()
        br.select_form(name="fHistorico")

        br.form.new_control('text', 'RA', {'value': ra})
        br.form.new_control('text', 'Enfase', {'value': enfase})
        br.form.new_control('text', 'AnoIni', {'value': ano})
        br.form.new_control('text', 'SemIni', {'value': semestre})
        br.form.new_control('text', 'Tipo', {'value': '1'})
        br.form.new_control('text', 'MaisEnfase', {'value': 'S'})
        br.form.new_control('text', 'Modo', {'value': '2'})
        br.form.new_control('text', 'CodigoCurso', {'value': ''})
        br.form.new_control('text', 'Certificado', {'value': '0'})
        br.form.new_control('text', 'Consulta', {'value': '0'})
        br.form.new_control('text', 'sC', {'value': '51'})
        br.form.fixup()
        pagina = br.submit()

    html = BeautifulSoup(pagina.get_data())
    linhas = html.findAll('tr')

    creditos_aprovados = 0
    creditos_solicitados = 0
    creditos_desistentes = 0
    creditos_trancados = 0
    creditos_reprovados = 0
    nota_ponderada = 0

    if arquivos:
        materias = []

    for lin in linhas:
        if len(lin) == 21:
            materia = lin.findAll('td')

            nome = materia[2].text.encode('utf-8')
            nota = materia[3].text
            if nota == '&nbsp;':
                nota = 0
            nota = float(nota)

            resultado = materia[5].text
            creditos = int(materia[6].text)

            if arquivos:
                materia = {'nome': nome,
                        'nota': nota,
                        'resultado': resultado,
                        'creditos': creditos}
                materias.append(materia)

            if resultado == 'Aprovado':
                creditos_aprovados += creditos
            elif resultado == 'Cancelado':
                creditos_solicitados -= creditos
                creditos_trancados += creditos
            elif (resultado == 'Reprovado nota' or
                    resultado == 'Reprovado nota/freq.' or
                    resultado == 'Pendente'):
                creditos_reprovados += creditos
            elif resultado == 'Desistente':
                creditos_desistentes += creditos
            elif resultado == 'Afastado':
                creditos -= creditos

            creditos_solicitados += creditos
            nota_ponderada += creditos * nota

    # Realiza o cálculo
    ira = ((nota_ponderada / creditos_solicitados) *
            (2 - 2 * (creditos_desistentes / creditos_solicitados) -
                creditos_trancados / creditos_solicitados)) * 1000
    ira = int(ceil(ira))

    print "Seu ira é {}.".format(ira)

    if arquivos:
        keys = ['nome', 'nota', 'resultado', 'creditos']
        for arquivo in arquivos:
            out = csv.DictWriter(arquivo, keys)
            out.writer.writerow(keys)
            out.writerows(materias)
        
    return 0
    def executa(self, search_url):
        """Scrape a transparency-portal search results page over a SOCKS proxy.

        Opens `search_url`, rotates to a new proxy identity when a captcha
        is detected, then walks the results tables looking for IT-related
        payees; for each hit it follows the detail link and collects the
        Favorecido / Valor / Observação fields.

        NOTE(review): `list` shadows the builtin; `Consulta`, `gravalog`,
        `newID`, `create_connection`, `bs_parse`, `socks` and `logging`
        are bound elsewhere in the file — confirm against the full module.
        """

        self.SEARCH_URL = search_url

        list = None
        list = [u'Favorecido:', u'Valor:', u'Observação do Documento:']

        # route all sockets through the SOCKS proxy before opening the page
        socket.socket = socks.socksocket
        socket.create_connection = create_connection
        br = Browser()
        print search_url
        print "ID = " + str(Consulta.ID)
        gravalog(self, search_url + " cont = " + str(Consulta.ID) + "\n")
        LRequest = urllib2.Request(search_url, " ")
        LResponse = br.open(LRequest)
        page = bs_parse(LResponse.read())

        # could be moved outside!!!!

        soup = bs_parse(LResponse.get_data())
        img_captcha = soup.find('img', alt='captcha')
        if img_captcha != None:
            # if a captcha is found, the system switches to a new IP address
            try:
                print "CAPTCHA!!!"
                gravalog(self, "CAPTCHA\n")
            finally:
                # rotate identity, rebuild the browser and retry the request
                Consulta.ID = newID(self, Consulta.controller)
                br.close()
                socket.socket = socks.socksocket
                socket.create_connection = self.create_connection
                br = Browser()
                print search_url + " cont = " + str(Consulta.ID)
                gravalog(self,
                         search_url + " cont = " + str(Consulta.ID) + "\n")
                LRequest = urllib2.Request(search_url, " ")
                LResponse = br.open(LRequest)
                page = bs_parse(LResponse.read())
        entra = 0

        # walk the HTML page looking up the payee (Favorecido) in each
        # hypertext link
        for table in page.findAll("table"):
            for row2 in table.findAll('tr'):
                #             print row2
                for col in row2.findAll('td'):
                    for href in col.findAll('a'):
                        print href
                        gravalog(self,
                                 str(href).encode('utf-8', 'ignore') + '\n')
                        #resp = br.follow_link(text_regex=href.string)
                        #html = resp.read()
                        #print html
                    if col.string != None:
                        m = re.search('a href', col.string)
                        if m != None:
                            print 'Link!!!'
                            gravalog(self, 'Link!!!\n')
                            print col.string
                            gravalog(
                                self,
                                str(col.string).encode('utf-8', 'ignore') +
                                '\n')
                        # flag the row when the payee matches any IT-related
                        # keyword
                        m = re.search('INFORMATICA', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('TECNOLOGIA DA INFORMACAO', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('TELECOMUNICACOES', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('TELECOMUNICACAO', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('NETWORKS', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('NETWORK', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('REDE', col.string)
                        if m != None:
                            entra = 1
                        m = re.search('REDES', col.string)
                        if m != None:
                            entra = 1
                        if entra == 1:
                            logarqui = logging.getLogger("logarqui")
                            logarqui.debug("Inside f!")
                            try:
                                # follow the detail link and harvest the
                                # Favorecido / Valor / Observação fields
                                print 'BINGO!'
                                gravalog(self, 'BINGO!\n')
                                print href.string
                                gravalog(
                                    self,
                                    str(href.string).encode('utf-8', 'ignore')
                                    + '\n')
                                LResponse = br.follow_link(
                                    text_regex=href.string)
                                html = LResponse.read()
                                print html
                                gravalog(self, html + '\n')
                                page = bs_parse(html)
                                cont = 3
                                for table in page.findAll("table"):
                                    for row2 in table.findAll('tr'):
                                        #             print row2
                                        # per-row flags: set when the label
                                        # cell was seen, so the NEXT cell is
                                        # captured as the value
                                        favorecido = 0
                                        valor = 0
                                        observacao = 0
                                        for col in row2.findAll('td'):
                                            if favorecido == 1:
                                                texto = str(col.string).decode(
                                                    'utf8').encode(
                                                        'utf8',
                                                        'ignore').replace(
                                                            "'", "").replace(
                                                                ";",
                                                                "").replace(
                                                                    "--", "")
                                                print texto
                                                gravalog(self, texto + '\n')
                                                list.append(texto)
                                            if valor == 1:
                                                texto = str(col.string).decode(
                                                    'utf8').encode(
                                                        'utf8',
                                                        'ignore').replace(
                                                            "'", "").replace(
                                                                ";",
                                                                "").replace(
                                                                    "--", "")
                                                print texto
                                                gravalog(self, texto + '\n')
                                                list.append(texto)
                                            if observacao == 1:
                                                texto = str(col.string).decode(
                                                    'utf8').encode(
                                                        'utf8',
                                                        'ignore').replace(
                                                            "'", "").replace(
                                                                ";",
                                                                "").replace(
                                                                    "--", "")
                                                print texto
                                                gravalog(self, texto + '\n')
                                                list.append(texto)
                                                print list
                                            if col.string != None:
                                                m = re.search(
                                                    u'Favorecido:', col.string)
                                                if m != None:
                                                    print u'Favorecido:'
                                                    gravalog(
                                                        self, u'Favorecido:')
                                                    favorecido = 1
                                                m = re.search(
                                                    u'Valor:', col.string)
                                                if m != None:
                                                    print u'Valor:'
                                                    gravalog(self, u'Valor:')
                                                    valor = 1
                                                m = re.search(
                                                    u'Observação do Documento:',
                                                    col.string)
                                                if m != None:
                                                    print u'Observação do Documento:'
                                                    gravalog(
                                                        self,
                                                        'Observação do Documento:'
                                                    )
                                                    observacao = 1

                                entra = 0
                                br.back()
                            except Exception, ex:

                                # NOTE(review): bare attribute accesses below
                                # do nothing; only .exception(...) logs
                                logarqui.exception
                                logarqui.error
                                logarqui.exception(
                                    "\nProvlema na gravação de logs! \n" +
                                    search_url)

                            logarqui.debug("Finishing f!")
Exemple #25
0
# NOTE(review): continuation of a mechanize demo script; `br` was opened on
# an earlier page outside this chunk.
assert br.viewing_html()
print br.title()
print br.geturl()
#print br.info()  # headers
#print br.read()  # body
#br.close()  # (shown for clarity; in fact Browser does this for you)

# NOTE(review): the name= value looks like a pasted "field=value" pair —
# confirm it matches the form's actual name attribute.
br.select_form(name="vb_login_username=User Name")
# Browser passes through unknown attributes (including methods)
# to the selected HTMLForm (from ClientForm).
br["vb_login_username"] = ["sleven"]  # (the method here is __setitem__)
response2 = br.submit()  # submit current form

# print currently selected form (don't call .submit() on this, use br.submit())
print br.form

response3 = br.back()  # back to cheese shop (same data as response1)
# the history mechanism returns cached response objects
# we can still use the response, even though we closed it:
response3.seek(0)
response3.read()
response4 = br.reload()  # fetches from server

for form in br.forms():
    print form
# .links() optionally accepts the keyword args of .follow_/.find_link()
for link in br.links(url_regex="python.org"):
    print link
    br.follow_link(link)  # takes EITHER Link instance OR keyword args
    br.back()