Example #1
def rtnHTMLformat(tmpddGenrcgenPresent, sppPrefx, pthwcod, ouPthwpng):
    inpx = '\n'.join(tmpddGenrcgenPresent)  # inpx="ALDH2 color \nALDH3A1	color"
    request = mechanize.Request(
        "http://www.genome.jp/kegg/tool/map_pathway2.html")
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form["unclassified"] = inpx
    form["org"] = sppPrefx
    request2 = form.click()
    response2 = mechanize.urlopen(request2)
    a = str(response2.read()).split('href="/kegg-bin/show_pathway?')[1]
    code = a.split('/')[0]  # response2.read()
    request = mechanize.Request(
        "http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args" % (code, pthwcod))  # request=mechanize.Request("http://www.genome.jp/kegg-bin/show_pathway?%s/%s.args"%('13171478854246','hsa00410'))
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[1]
    status = ' NOT '
    try:
        imgf = str(forms[1]).split('/mark_pathway')[1].split('/')[0]
        os.system("wget --quiet http://www.genome.jp/tmp/mark_pathway%s/%s.png -O %s" % (imgf, pthwcod, ouPthwpng))
        status = ' '
    except:
        pass
    return 'A pathway image was%ssuccessfully produced...' % status
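
A minimal usage sketch for the function above, assuming the input format shown in the inline comment (one "GENE color" entry per list element); the gene list, organism prefix and output filename below are hypothetical, while "hsa00410" is the pathway code mentioned in the comments:

gene_colors = ["ALDH2 red", "ALDH3A1 blue"]   # hypothetical gene/color pairs
msg = rtnHTMLformat(gene_colors, "hsa", "hsa00410", "hsa00410_marked.png")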
Example #2
    def getScheduleByCSV(self, userid, startday, period):
        _res = self.br.open("http://" + self.HOSTNAME +
                            "/cgi-def/dnet/dnet.cgi?page=schpsetexport")
        _forms = mechanize.ParseResponse(_res)
        _form = _forms[0]

        _form["uid"] = [userid]
        forms = mechanize.ParseResponse(
            self.br.open(_form.click(name='s_add', nr=0)))
        form = forms[0]

        ## Set the start date
        sdate = datetime.datetime.strptime(startday, '%Y-%m-%d')
        form["syear"] = [sdate.strftime("%Y")]
        form["smonth"] = [sdate.strftime("%m")]
        form["sday"] = [sdate.strftime("%d")]

        ## Set the end date
        edate = sdate + timedelta(days=period)
        form["eyear"] = [edate.strftime("%Y")]
        form["emonth"] = [edate.strftime("%m")]
        form["eday"] = [edate.strftime("%d")]
        response = self.br.open(form.click(name="s_ok", nr=0))

        return response.read()
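
A brief usage note: the method returns the raw CSV text for a window of `period` days starting at `startday`. A hypothetical call (the `client` object stands in for whatever class defines this method and already holds `self.br` and `self.HOSTNAME`):

csv_text = client.getScheduleByCSV("user01", "2016-04-01", 7)   # 7 days from 2016-04-01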
 def login_to_deviantart(self, credentials):
     print("Logging in")
     retry_count = 0
     while 1:
         try:
             response = self.agent.open(self.HOME_URL)
             is_login_form = lambda f: (
                 hasattr(f, "attrs") and f.attrs.get("id") == "login")
             for f in filter(is_login_form, mechanize.ParseResponse(response)):
                 f["username"] = credentials["deviantart.com"][0]
                 f["password"] = credentials["deviantart.com"][2]
                 # submit the login form; the original code called f.click()
                 # outside the loop and never opened the resulting request
                 self.agent.open(f.click())
             #
             # dunno how to translate this to py mechanize:
             """
             if len(self.agent.cookie_jar) < 3:
                 print ("Log on unsuccessful (maybe wrong login/pass combination?)")
                 print ("You might not be able to fetch the age restricted resources")
             else:
                 print ("Log on successful")
             self.agent.pluggable_parser.default = mechanize.Download
             """
         except Exception:  #i.e. let KeyboardInterrupt through.
             traceback.print_exc()
             if retry_count < 3:
                 retry_count += 1
                 print("Will retry after 1 second")
                 time.sleep(1)
                 continue
             else:
                 print("Login failed after 3 retries")
                 print(
                     "You might not be able to fetch the age restricted resources"
                 )
         break
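
The commented-out cookie check above can be approximated in mechanize by keeping a reference to the CookieJar when the agent is built; a sketch under that assumption (the three-cookie threshold is carried over from the original comment):

cookie_jar = mechanize.CookieJar()
agent = mechanize.Browser()
agent.set_cookiejar(cookie_jar)
# ... open the home page and submit the login form as above ...
if len(cookie_jar) < 3:
    print("Log on unsuccessful (maybe wrong login/pass combination?)")
    print("You might not be able to fetch the age restricted resources")
else:
    print("Log on successful")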
Example #4
def getPropertyPins(streetName):

    url = r'https://taxcommissioner.dekalbcountyga.gov/TaxCommissioner/TCSearch.asp'
    request = mechanize.Request(url)
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()

    form = forms[0]

    form['StreetName'] = streetName  # use the function argument rather than sys.argv
    propertyList = mechanize.urlopen(form.click()).read()

    tree = html.fromstring(propertyList)
    pins = tree.xpath('//tr/td[1]/a/@href')
    addresses = tree.xpath('//tr/td[1]/a/text()')

    pinList = []
    i = 0
    for pin in pins:
        #print pin
        newpin = pin.split('=')
        pinList.append([newpin[3], addresses[i]])
        print newpin[3] + '\t' + addresses[i]
        i = i + 1

    return pinList
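
A short usage sketch; the street name is hypothetical, and each returned entry pairs a parcel PIN with its address as scraped from the results table:

pin_list = getPropertyPins("PONCE DE LEON AVE")   # hypothetical street name
# pin_list looks like [['<pin>', '<address>'], ...]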
def get_vorlage(session_id, url):
    try:
        response = mechanize.urlopen(mechanize.Request(url))
        pprint.pprint(response)
    except URLError:
        return
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    for form in forms:
        # All forms are iterated. Might not all be attachment-related.
        for control in form.controls:
            if control.name == 'DT':
                print control.name, control.value
                request2 = form.click()
                try:
                    response2 = mechanize.urlopen(request2)
                    form_url = response2.geturl()
                    if "getfile.asp" in form_url:
                        #print "SUCCESS:", response2.info()
                        pdf = response2.read()
                        md5 = hashlib.md5(pdf).hexdigest()
                        scraperwiki.sqlite.save(
                            unique_keys=['session_id', 'dt', 'md5', 'size'],
                            data={
                                'session_id': session_id,
                                'dt': control.value,
                                'md5': md5,
                                'size': len(pdf)
                            })
                        continue
                except mechanize.HTTPError, response2:
                    print "HTTP ERROR :("
                except URLError:
                    pass
Example #6
def _get_results(form, dbg=False):
    # click the form
    clicked_form = form.click()
    # then get the results page
    result = mechanize.urlopen(clicked_form)

    #### EXPORTING RESULTS FILE
    # so what I do is that I fetch the first results page,
    # click the form/link to get all hits as a colon separated
    # ascii table file

    # get the form
    resultform = mechanize.ParseResponse(result, backwards_compat=False)
    result.close()
    resultform = resultform[0]
    # set colon as delimiter of the table (could use anything I guess)
    #~ resultform.find_control('export_delimiter').items[1].selected =  True
    resultform.find_control('export_delimiter').toggle('colon')
    resultform_clicked = resultform.click()
    result_table = mechanize.urlopen(resultform_clicked)
    data = result_table.read()
    result_table.close()
    if dbg:
        return resultform, result_table, data
    else:
        return data
Example #7
def slurp_with_login_and_pwd():
    import sys
    import mechanize
    # sys.path.append('ClientCookie-1.0.3')
    # from mechanize import ClientCookie
    # sys.path.append('ClientForm-0.1.17')
    # import ClientForm

    # Create special URL opener (for User-Agent) and cookieJar
    cookieJar = mechanize.CookieJar()

    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cookieJar))
    opener.addheaders = [("User-agent", "Mozilla/5.0 (compatible)")]
    mechanize.install_opener(opener)
    fp = mechanize.urlopen("http://login.yahoo.com")
    forms = mechanize.ParseResponse(fp)
    fp.close()

    # print forms on this page
    for form in forms:
        print "***************************"
        print form

    form = forms[0]
    form["login"] = "******"  # use your userid
    form["passwd"] = "password"  # use your password
    fp = mechanize.urlopen(form.click())
    fp.close()
    fp = mechanize.urlopen(
        "https://class.coursera.org/ml-003/lecture/download.mp4?lecture_id=1"
    )  # use your group
    fp.readlines()
    fp.close()
Example #8
def uploadFileToAquaforum(uploadFilename, requestedFileName):
    '''
    returns response page
    '''

    # build opener. Can be extended to handle cookies/proxies
    opener = mechanize.build_opener()
    # goto upload page
    request3 = mechanize.Request(FORUM_UPLOAD_URL)
    response3 = opener.open(request3)

    # parse form on upload page and add file
    forms = mechanize.ParseResponse(response3, backwards_compat=False)
    form = forms[0]
    filectr = form.find_control("imgfile")
    # filectr.add_file(open('/home/jasper/avatar.jpg'),"image/jpeg","avatar.jpg")
    theFile = open(uploadFilename, 'rb')
    filectr.add_file(theFile, "image/jpeg", os.path.split(
        requestedFileName)[-1])
    # obtain form data
    request4 = form.click()  # urllib2.Request object
    theFile.close()
    request4.add_header('Referer', response3.geturl())
    response4 = opener.open(request4)
    return response4.read()
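
A brief usage sketch; FORUM_UPLOAD_URL is defined elsewhere in the original module, and the local file path here is hypothetical:

result_page = uploadFileToAquaforum("/tmp/avatar.jpg", "avatar.jpg")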
Example #9
 def open_form(self, form_url):
     self.password_manager.add_password(None, form_url, self.username,
                                        self.password)
     page = mechanize.urlopen(mechanize.Request(form_url))
     forms = mechanize.ParseResponse(page, backwards_compat=False)
     form = forms[0]  # works in this case, but selecting the form by name would be more robust
     return form
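
One way to address the comment above is to pick the parsed form by its name attribute instead of by position; a sketch with a hypothetical form name:

def pick_form(forms, name):
    # return the first parsed HTMLForm whose name attribute matches
    for f in forms:
        if f.name == name:
            return f
    raise LookupError("no form named %r" % name)

# form = pick_form(forms, "loginform")   # hypothetical name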
Example #10
def secondButtonPress(widget):
    global flow
    coursecode_text = objects['coursecode'].get_text()
    serialno_text = objects['serialno'].get_text()
    theUrl = bnextUrl + coursecode_text
    value = 0
    ctr = 0
    if coursecode_text == 'CSE304':
        indexes = ['1', '2', '3', '4']
    else:
        indexes = [serialno_text]
    while True:
        r = br.open(theUrl)
        br.select_form(nr=value)
        br.submit()
        response = br.response()
        html = response.read()
        soup = getsoup(html)
        if value == 0:
            for xserialno_text in indexes:
                tds = [
                    a.renderContents() for a in soup.findAll('table')
                    [2].findAll('font', attrs={'color': 'black'})
                ]
                index = (int(xserialno_text) - 1) * 9 + 8
                venue = (int(xserialno_text) - 1) * 9 + 3
                print tds[venue], venue

                if int(tds[index]) > 0:
                    body = coursecode_text + ":" + serialno_text
                    sendMail(body)
                    value = 1
                    continue
                else:
                    ctr += 1
                    print ctr
                    continue
            continue
        if value == 1:
            tds = soup.findAll('td', attrs={'align': 'center'})
            start = 5
            index = 0
            while start < len(tds):
                if venue == tds[start]:
                    break
                start += 5
            inputs = soup.findAll('input', attrs={'type': 'radio'})
            inp = inputs[index]
            val = inp['value']
            forms = mechanize.ParseResponse(br.response(),
                                            backwards_compat=False)
            form = forms[0]
            br.select_form(nr=0)
            br.form.set_value([val], name='clsnbr1')
            br.submit()
            print br.response().read()
            break
    exit(0)
Example #11
def main(argv):
    search = argv
    # Google Login credentials
    username = '******'  #argv[1]
    password = '******'  #argv[2]

    # Where to save the CSV file
    pathname = 'trend_data/' + search + '_trends.csv'  #argv[3]

    queries = ('q=' + query for query in argv[1:])

    br = mechanize.Browser()

    # Create cookie jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Act like we're a real browser
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    # Login in to Google
    response = br.open(
        'https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/'
    )
    forms = mechanize.ParseResponse(response)
    form = forms[0]
    form['Email'] = username
    form['Passwd'] = password
    response = br.open(form.click())

    # Get CSV from Google Trends
    trends_url = 'http://www.google.com/trends/trendsReport?'
    query_params = '&'.join(queries)
    response = br.open(trends_url + query_params + '&export=1')

    # Remove headers and footers from Google's CSV
    # Use last date in date range
    reader = csv.reader(StringIO(response.read()))
    dates = []
    values = []
    for row in reader:
        try:
            date, value = row
        except ValueError:
            continue
        if re.search('[0-9]{4}-[0-9]{2}-[0-9]{2}', date):
            dates.append(date[-10:])  # Uses last date in time period
            values.append(value)

    with open(pathname, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['Date', search])
        for row in zip(dates, values):
            writer.writerow(row)
    def SendRequestToGoogle(self, username, password):
        br = mechanize.Browser()
        # Create cookie jar
        cj = cookielib.LWPCookieJar()
        br.set_cookiejar(cj)
        br.set_handle_robots(False)
        # Act like we're a real browser
        br.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]
        response = br.open(
            'https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/'
        )
        forms = mechanize.ParseResponse(response)
        form = forms[0]
        form['Email'] = username
        form['Passwd'] = password
        response = br.open(form.click())

        keyList = self.queryDict.keys()
        keyCount = len(keyList)
        i = 0
        TermsDone = open(self.DoneTermsFile, 'ab')
        print "\n\n log-in success! \n\n"
        while i < keyCount:
            SearchTerm = keyList[i]
            Queries = self.queryDict[SearchTerm]
            WorldQuery = Queries[0]
            USQuery = Queries[1]
            FiscalEnd = Queries[2]  # ending month of fiscal year
            sleep(random.uniform(40, 70))
            WorldResponse = br.open(WorldQuery)
            WorldResult = csv.reader(StringIO(WorldResponse.read()))
            sleep(random.uniform(30, 60))
            USResponse = br.open(USQuery)  # searchterm : query
            USResult = csv.reader(StringIO(USResponse.read()))

            TempWorld = Queries[3]  # temporary file paths
            TempUS = Queries[4]
            # Send contents out for writing intermediate output CSV files
            W_Error = self.IntermediateCSV(WorldResult, TempWorld)
            US_Error = self.IntermediateCSV(USResult, TempUS)
            if W_Error == -1 or US_Error == -1:
                self.ErrorHandler(SearchTerm)  # quota limit
            else:
                i = i + 1
                # keep track of downloaded CSV files, prevent repeats
                TermsDone.write(SearchTerm + ',' + FiscalEnd + '\n')
                print "%s\t\tDONE" % (SearchTerm)  # for monitoring
        TermsDone.close()
Example #13
def grab_redirect(link):
    response = mechanize.urlopen(link['href'])
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    data = mechanize.urlopen(form.click()).read()
    soup = BeautifulSoup.BeautifulSoup(data)
    for div in soup('div'):
        if 'class' in dict(div.attrs) and \
           div['class'] == 'urlworkaround':
            txt = ''.join([str(x) for x in div.contents])
            lsoup = BeautifulSoup.BeautifulSoup(txt)
            link = lsoup('a')[0]
            return link['href']
    raise Exception('no href')
Example #14
def _get_form():
    # GET SERVER RESPONSE
    try:
        response = mechanize.urlopen(SPLAT_FORM_URL)
    except mechanize.URLError:
        raise Exception('No response from server : {0}'.format(SPLAT_FORM_URL))

    # PARSE SERVER RESPONSE
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()

    # GET FORM
    form = forms[0]
    return form
Example #15
def start_cloning(options):
    link = options['link']
    user = options['user']
    password = options['password']
    response = mechanize.urlopen(link)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form['txtIdentifiant'] = user
    form['txtMDP'] = password
    website = mechanize.urlopen(form.click())
    data = website.read()
    outfile = open('index.html', 'wt')
    print >> outfile, """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html dir="ltr" lang="fr" xml:lang="fr" xmlns="http://www.w3.org/1999/xhtml"
class="yui3-js-enabled" id="yui_3_2_0_1_1326674808791714">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
"""
    soup = BeautifulSoup.BeautifulSoup(data)
    title = soup('title')
    print >> outfile, str(title[0])
    divs = soup('div')
    for div in divs:
        if 'class' in dict(div.attrs):
            if div['class'] == 'course-content':
                vstr = '\n'.join([str(x) for x in div.contents[1:]])
                # Eliminate wrong divs
                lsoup = BeautifulSoup.BeautifulSoup(vstr)
                for ldiv in lsoup.findAll('div'):
                    if ('class' in dict(ldiv.attrs) and ldiv['class']
                            in ['left side', 'right side', 'jumpmenu']):
                        ldiv.extract()
                replace = {}
                for link in lsoup.findAll('a'):
                    if 'href' in dict(link.attrs):
                        try:
                            replace[link['href']] = grab_redirect(link)
                        except:
                            pass
                page_txt = str(lsoup)
                for k, v in replace.items():
                    nw_key = str(k) + "&amp;redirect=1"
                    page_txt = page_txt.replace(nw_key, str(v))
                    page_txt = page_txt.replace(str(k), str(v))
                print >> outfile, page_txt
    outfile.close()
Example #16
def connect_to_form(url, formurl):
    print "\nConnecting to Web Page...",
    # Connect to URL (add error handling!!)
    request = mechanize.Request(mechanize.urljoin(url, formurl))
    response = mechanize.urlopen(request)
    monkeypatch_mechanize()
    print "Success."

    # Retrieve forms
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()

    if len(forms) <= 0:
        raise FormNotFound('No Forms were found on the web page.')

    return forms
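
A minimal usage sketch with hypothetical URLs; the helper returns every parsed form on the page or raises FormNotFound when there are none:

forms = connect_to_form("http://www.example.com/", "search.asp")
form = forms[0]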
    def login(self,username,password):
        " Login to Droptask "
        login_redirect = conf.login_redirect
        login_url = conf.base_url+login_redirect #Fetch url from conf file and concatenate '/login' to the base url
        my_params = {'email':username,'password':password}
        params_encoded = urllib.urlencode(my_params)
        self.browser.method='POST'
        headers = {'Host': 'auth.droptask.com','User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:46.0) Gecko/20100101 Firefox/46.0','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8','Accept-Language': 'en-US,en;q=0.5','Accept-Encoding': 'gzip, deflate, br','Referer': 'https://auth.droptask.com/login?continue=https%3A%2F%2Fapp.droptask.com%2Fauth&clientId=5c8af5ea-fa7c-4cb3-80e8-04b361d9e297&source=web','Cookie':' _ga=GA1.2.2000288028.1449634785; connect.sid=s%3A08sLYE3zBHEDS4T72OeDhDzqUCW-MNxB.bU6Bdt5xDsHeg6yIyIQKPmL4NG%2BgqRaBOMTMkax44N0','Connection': 'keep-alive'}
        login_response = self.post(url=login_url,data=params_encoded,headers=headers)
        forms = mechanize.ParseResponse(login_response, backwards_compat=False)   
        result_flag = False 
        if (len(forms)!= 0) and (forms[0].find_control("password") != None):
            self.write("    -Login failed")
            result_flag = False
        else:
            self.write("    -Login success")
            result_flag = True

        return result_flag
Example #18
def next_page(ind): 
    #time.sleep(4)
    br = mechanize.Browser()
    for attempt in range(5):
        try:
            response = br.open(url)
            break
        except:
            pass
    #br.select_form()
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    form = forms[0]
    form.set_all_readonly(False)
    form['startitem']=str(ind*5+5)
    form.set_all_readonly(True)
    response = form.click()
    #response1 = br.submit()
    #print response1
    #br.open(mechanize.urlopen(response).read()
    return mechanize.urlopen(response).read()
Example #19
 def write_page(self, page_name, new_contents, comment=""):
     """
     write raw content to page 
     """
     url = self.get_page_url(page_name)
     response = self.browser.open(url + "?action=edit")
     forms = mechanize.ParseResponse(response, backwards_compat=False)
     edit_form = self._find_edit_form(forms)
     old_contents = edit_form.get_value(name="text")
     if old_contents.replace("\r\n",
                             "\n") == new_contents.replace("\r\n", "\n"):
         print "No changes to make"
         return
     edit_form.set_value(name="text", value=new_contents)
     edit_form.set_value(name="comment", value=comment)
     request = edit_form.click(name="save")
     response = self.browser.open(request)
     response_text = response.read()
     if "Your changes have been saved" not in response_text:
         #print "Previous content: '%s'" % old_contents
         #print "Attempted new content: '%s'" % new_contents
         raise Exception("error writing page")
Example #20
def RA_prep_search(location, position, date, lenghtOfStay):
    scrape_url = 'http://www.reserveamerica.com/unifSearchResults.do'
    req = mechanize.Request(scrape_url)
    req.add_header("Referer", scrape_url)
    req.add_header('user-agent', 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36')
#    req.add_header("Accept-Encoding", "gzip, deflate")
    req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
    req.add_header('Origin', 'http://www.reserveamerica.com')
    r1 = mechanize.urlopen(req)
    forms = mechanize.ParseResponse(r1)
    form =  forms[0]
    form.set_all_readonly(False)
    form['locationCriteria'] = location
    form['locationPosition'] = position
    form['interest'] = ["camping"]
    form['lookingFor'] = ['2003']
    form['camping_2003_3012'] = '3'
#    form['camping_2003_moreOptions'] = ['false']
    form['campingDate'] = date
    form['lengthOfStay'] = str(lenghtOfStay)
#    print form
    return form.click()
def RA_prep_search(location, position, date, lenghtOfStay, accessNeeds=False):
    scrape_url = 'http://www.reserveamerica.com/unifSearchResults.do'
    req = mechanize.Request(scrape_url)
    req.add_header("Referer", scrape_url)
    req.add_header(
        'user-agent',
        'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
    )
    #    req.add_header("Accept-Encoding", "gzip, deflate")
    req.add_header(
        'Accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    req.add_header('Origin', 'http://www.reserveamerica.com')
    r1 = mechanize.urlopen(req)
    forms = mechanize.ParseResponse(r1, backwards_compat=False)
    form = forms[0]
    form.set_all_readonly(False)
    form['locationCriteria'] = location
    form['locationPosition'] = position
    form['interest'] = ["camping"]
    form['lookingFor'] = ['2003']
    form['camping_2003_3012'] = '3'
    #    print form
    #    control = form.find_control("camping_2003_3009")
    #    for item in control.items:
    #        print " name=%s values=%s" % (item.name, str([label.text  for label in item.get_labels()]))
    #    return
    if accessNeeds:
        form['camping_2003_moreOptions'] = ['true']
        form['camping_2003_3009'] = ['true']
    else:
        form['camping_2003_moreOptions'] = []
        form['camping_2003_3009'] = []
    form['campingDate'] = date
    form['lengthOfStay'] = str(lenghtOfStay)
    return form.click()
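
The function returns the request produced by form.click() rather than the results page itself, so the caller still has to open it; a sketch with hypothetical search parameters:

req = RA_prep_search("Yosemite National Park, CA", "", "06/01/2016", 2)
results_html = mechanize.urlopen(req).read()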
Example #22
    with open(sys.argv[1]) as input_file:
        for row_o in csv.reader(input_file):

            # first row is username, password, and pause length
            if any(row_o) & (line_o == 0):
                username, password, pause = row_o
                pause = int(pause) if (pause_override == None) else pause_override

                # echo login and pause length information
                logging.info('Username: '******'Password: '******'Pause: ' + str(pause))

                # login to Google with username and password
                response = br.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
                forms = mechanize.ParseResponse(response)
                form = forms[0]
                form['Email'] = username
                form['Passwd'] = password
                response = br.open(form.click())

            # remaining rows are query and countries
            elif any(row_o) & (line_o > 0) & (start_line <= line_o) & (line_o <= stop_line):

                # output filename root
                output_root = sys.argv[1].replace('.csv', '_' + str(line_o)) 

                # pause before subsequent queries
                if (line_o > 1): time.sleep(pause)

                # generate query url
Example #23
		<frame name="buttons" src="/ITE/common/html/buttons.html" marginwidth="0" marginheight="0" scrolling="no" frameborder="0" noresize="noresize"\>
		<frame name="nada" src="/ITE/common/html/nada.htm" marginwidth="0" marginheight="0" scrolling="0" frameborder="0" noresize="noresize"\>
	</frameset>
	<noframes>
	<body>
		<p>
		Seu Browser não suporta frames.
		</p>
	</body>
	</noframes>
</html>
'''

url2 = 'https://wwws3.hsbc.com.br/HWB-SIMULADOR/servlets/SrvSimulador?ServletState=10'

url3 = 'https://wwws3.hsbc.com.br/HWB-SIMULADOR/servlets/SrvSimulador?ServletState=30'

import sys
import mechanize

request = mechanize.Request(url)
response = mechanize.urlopen(request)
forms = mechanize.ParseResponse(response, backwards_compat=False)
#response.close()
## f = open("example.html")
## forms = mechanize.ParseFile(f, "http://example.com/example.html",
##                              backwards_compat=False)
## f.close()
form = forms[0]
print form  # very useful!
Example #24
    def get_session(self, session_url=None, session_id=None):
        """
        Load session details for the given detail page URL or numeric ID
        """
        # Read either session_id or session_url from the opposite
        if session_id is not None:
            session_url = self.urls['SESSION_DETAIL_PRINT_PATTERN'] % session_id
        elif session_url is not None:
            parsed = parse.search(self.urls['SESSION_DETAIL_PARSE_PATTERN'],
                                  session_url)
            session_id = parsed['session_id']

        logging.info("Getting session %d from %s", session_id, session_url)

        session = Session(numeric_id=session_id)

        time.sleep(self.config.WAIT_TIME)
        response = self.user_agent.open(session_url)
        # forms for later attachment download
        mechanize_forms = mechanize.ParseResponse(response,
                                                  backwards_compat=False)
        # seek(0) is necessary to reset response pointer.
        response.seek(0)
        html = response.read()
        html = html.replace('&nbsp;', ' ')
        parser = etree.HTMLParser()
        dom = etree.parse(StringIO(html), parser)

        # check for page errors
        try:
            page_title = dom.xpath('//h1')[0].text
            if 'Fehlermeldung' in page_title:
                logging.info("Page %s cannot be accessed due to server error",
                             session_url)
                if self.options.verbose:
                    print "Page %s cannot be accessed due to server error" % session_url
                return
            if 'Berechtigungsfehler' in page_title:
                logging.info("Page %s cannot be accessed due to permissions",
                             session_url)
                if self.options.verbose:
                    print "Page %s cannot be accessed due to permissions" % session_url
                return
        except:
            pass
        try:
            error_h3 = dom.xpath('//h3[@class="smc_h3"]')[0].text.strip()
            if 'Keine Daten gefunden' in error_h3:
                logging.info("Page %s does not contain any agenda items",
                             session_url)
                if self.options.verbose:
                    print "Page %s does not contain agenda items" % session_url
                return
        except:
            pass

        session.original_url = session_url

        # Session title
        try:
            session.title = dom.xpath(
                self.xpath['SESSION_DETAIL_TITLE'])[0].text
        except:
            logging.critical(
                'Cannot find session title element using XPath SESSION_DETAIL_TITLE'
            )
            raise TemplateError(
                'Cannot find session title element using XPath SESSION_DETAIL_TITLE'
            )

        # Committee link
        try:
            links = dom.xpath(self.xpath['SESSION_DETAIL_COMMITTEE_LINK'])
            for link in links:
                href = link.get('href')
                parsed = parse.search(
                    self.urls['COMMITTEE_DETAIL_PARSE_PATTERN'], href)
                if parsed is not None:
                    session.committee_id = parsed['committee_id']
        except:
            logging.critical(
                'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH'
            )
            raise TemplateError(
                'Cannot find link to committee detail page using SESSION_DETAIL_COMMITTEE_LINK_XPATH'
            )

        # Session identifier, date, address etc
        tds = dom.xpath(self.xpath['SESSION_DETAIL_IDENTIFIER_TD'])
        if len(tds) == 0:
            logging.critical(
                'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
            )
            raise TemplateError(
                'Cannot find table fields using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
            )
        else:
            for n in range(0, len(tds)):
                try:
                    tdcontent = tds[n].text.strip()
                    nextcontent = tds[n + 1].text.strip()
                except:
                    continue
                if tdcontent == 'Sitzung:':
                    session.identifier = nextcontent
                elif tdcontent == 'Gremium:':
                    session.committee_name = nextcontent
                elif tdcontent == 'Datum:':
                    datestring = nextcontent
                    if tds[n + 2].text == 'Zeit:':
                        if len(tds) > n + 3 and tds[n + 3].text is not None:
                            datestring += ' ' + tds[n + 3].text
                    session.date_start = datestring
                elif tdcontent == 'Raum:':
                    session.address = " ".join(tds[n + 1].xpath('./text()'))
                elif tdcontent == 'Bezeichnung:':
                    session.description = nextcontent
            if not hasattr(session, 'identifier'):
                logging.critical(
                    'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD'
                )
                raise TemplateError(
                    'Cannot find session identifier using XPath SESSION_DETAIL_IDENTIFIER_TD'
                )

        # Agendaitems
        found_attachments = []
        rows = dom.xpath(self.xpath['SESSION_DETAIL_AGENDA_ROWS'])
        if len(rows) == 0:
            logging.critical(
                'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
            raise TemplateError(
                'Cannot find agenda using XPath SESSION_DETAIL_AGENDA_ROWS')
        else:
            agendaitems = {}
            agendaitem_id = None
            public = True
            for row in rows:
                row_id = row.get('id')
                row_classes = row.get('class').split(' ')
                fields = row.xpath('td')
                number = fields[0].xpath('./text()')
                if len(number) > 0:
                    number = number[0]
                if number == []:
                    number = None
                #print "number: %s" % number
                if row_id is not None:
                    # Agendaitem main row
                    agendaitem_id = row_id.rsplit('_', 1)[1]
                    agendaitems[agendaitem_id] = {}
                    agendaitems[agendaitem_id]['id'] = int(agendaitem_id)
                    if number is not None:
                        agendaitems[agendaitem_id]['number'] = number
                    agendaitems[agendaitem_id]['subject'] = "; ".join(
                        fields[1].xpath('./text()'))
                    agendaitems[agendaitem_id]['public'] = public
                    # submission links
                    links = row.xpath(
                        self.
                        xpath['SESSION_DETAIL_AGENDA_ROWS_SUBMISSION_LINK'])
                    submissions = []
                    for link in links:
                        href = link.get('href')
                        if href is None:
                            continue
                        parsed = parse.search(
                            self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'], href)
                        if parsed is not None:
                            submission = Submission(numeric_id=int(
                                parsed['submission_id']),
                                                    identifier=link.text)
                            submissions.append(submission)
                            # Add submission to submission queue
                            if hasattr(self, 'submission_queue'):
                                self.submission_queue.add(
                                    int(parsed['submission_id']))
                    if len(submissions):
                        agendaitems[agendaitem_id]['submissions'] = submissions
                    """
                    Note: we don't scrape agendaitem-related attachments for now,
                    based on the assumption that they are all found via submission
                    detail pages. All we do here is get a list of attachment IDs
                    in found_attachments
                    """
                    #attachments = []
                    forms = row.xpath('.//form')
                    for form in forms:
                        for hidden_field in form.xpath('input'):
                            if hidden_field.get('name') != 'DT':
                                continue
                            attachment_id = hidden_field.get('value')
                            #attachments.append(attachment_id)
                            found_attachments.append(attachment_id)
                    #if len(attachments):
                    #    agendaitems[agendaitem_id]['attachments'] = attachments

                elif 'smc_tophz' in row_classes:
                    # additional (optional) row for the agendaitem
                    label = fields[1].text
                    value = fields[2].text
                    if label is not None and value is not None:
                        label = label.strip()
                        value = value.strip()
                        #print (label, value)
                        if label in ['Ergebnis:', 'Beschluss:']:
                            if value in self.config.RESULT_STRINGS:
                                agendaitems[agendaitem_id][
                                    'result'] = self.config.RESULT_STRINGS[
                                        value]
                            else:
                                logging.warn(
                                    "String '%s' not found in configured RESULT_STRINGS",
                                    value)
                                if self.options.verbose:
                                    print "WARNING: String '%s' not found in RESULT_STRINGS\n" % value
                                agendaitems[agendaitem_id]['result'] = value
                        elif label == 'Bemerkung:':
                            agendaitems[agendaitem_id]['result_note'] = value
                        elif label == 'Abstimmung:':
                            agendaitems[agendaitem_id]['voting'] = value
                        else:
                            logging.critical(
                                "Agendaitem info label '%s' is unknown", label)
                            raise ValueError(
                                'Agendaitem info label "%s" is unknown' %
                                label)

                elif 'smcrowh' in row_classes:
                    # Subheading (public / nonpublic part)
                    if fields[
                            0].text is not None and "Nicht öffentlich" in fields[
                                0].text.encode('utf-8'):
                        public = False
            #print json.dumps(agendaitems, indent=2)
            session.agendaitems = agendaitems.values()

        # session-related attachments
        containers = dom.xpath(self.xpath['SESSION_DETAIL_ATTACHMENTS'])
        for container in containers:
            classes = container.get('class')
            if classes is None:
                continue
            classes = classes.split(' ')
            if self.xpath[
                    'SESSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                continue
            attachments = []
            rows = container.xpath('.//tr')
            for row in rows:
                forms = row.xpath('.//form')
                for form in forms:
                    #print "Form: ", form
                    name = " ".join(row.xpath('./td/text()')).strip()
                    for hidden_field in form.xpath('input'):
                        if hidden_field.get('name') != 'DT':
                            continue
                        attachment_id = hidden_field.get('value')
                        # make sure to add only those which aren't agendaitem-related
                        if attachment_id not in found_attachments:
                            attachment = Attachment(identifier=attachment_id,
                                                    name=name)
                            # Traversing the whole mechanize response to submit this form
                            for mform in mechanize_forms:
                                #print "Form found: '%s'" % mform
                                for control in mform.controls:
                                    if control.name == 'DT' and control.value == attachment_id:
                                        #print "Found matching form: ", control.name, control.value
                                        attachment = self.get_attachment_file(
                                            attachment, mform)
                            attachments.append(attachment)
                            found_attachments.append(attachment_id)
            if len(attachments):
                session.attachments = attachments

        oid = self.db.save_session(session)
        if self.options.verbose:
            logging.info("Session %d stored with _id %s", session_id, oid)
Example #25
 def __init__(self):
     self.url = "https://store.steampowered.com/join/"
     self.site_data = urllib2.urlopen(self.url)
     self.forms = mechanize.ParseResponse(self.site_data, backwards_compat=False)
     self.form = self.forms[1] #currently true, but this line will cause this script to eventually break
     self.captchagid = self.form.find_control(id="captchagid").value
Example #26
def getStars(age, zeta):
    """
    Returns a list of stars (for use by e.g. phoenix grid) with necessary parameters to determine
    their characteristics and magnitude.
    
    Inputs:
    -------
    
    age: float
    
        The age of the stellar population, in years
    
    zeta: float
    
        The metal content of the stars, where the Sun has 0.019 as its metal content.
    
    Output:
    -------
    
    star_list: list of lists
    
        The output stars. The list has the following columns:
            Z: [M/H]
            Age (Gyr)
            M_ini (M_\odot)
            M_act (M_\odot)
            Te
            log(g)
            int_IMF
            Johnson,I
    """
    result_str = re.compile(r"The results are available at <a href=(.*?)>output\d*\.dat</a>")
    request = mechanize.Request("http://stev.oapd.inaf.it/cgi-bin/cmd_2.5")
    response = mechanize.urlopen(request)
    forms = mechanize.ParseResponse(response,backwards_compat=False)
    response.close()
    form = forms[0]
    #The reasoning here is that I can *get* Johnson filters in JWST pysynphot, but can't figure
    #out how to do the Spitzer equivalents.
#    form["photsys_file"] = ["tab_mag_odfnew/tab_mag_2mass_spitzer_wise.dat"]
    form["photsys_file"] = ["tab_mag_odfnew/tab_mag_ubvrijhk.dat"]
    #0 = single isochrone, single metallicity.
    #1 = sequence of isochrones, different ages, constant metallicity
    #2 = sequence of isochrones, constant age, different metallicities
    form["isoc_val"] = ["0"]
    #Age for single-single
    form["isoc_age"] = '%g' % (age)
    form["isoc_zeta"] = '%g' % (zeta)
    request2 = form.click()
    response2 = mechanize.urlopen(request2)
    response_value = response2.read()
    response_url = response2.geturl()
    match = result_str.search(response_value)
    star_list = []
    if match is not None:
        output_url = match.group(1)
        response_result = mechanize.urlopen(mechanize.urljoin(response_url,output_url))
        output_lines = response_result.read().split("\n")
        output_lines = output_lines[13:]
        for line in output_lines:
            if line != "":
                #Z, log(age/yr), M_ini, M_act, logL/Lo, logTe, logG, mbol, U, B, V, R, I, J, H, K, int_IMF, stage
                items = line.split()
                star = [None]
                star.append(getZ(float(items[0])))
                star.append(10**float(items[1]))
                star.append(float(items[2]))
                star.append(float(items[3]))
                star.append(10**float(items[5]))
                star.append(float(items[6]))
                star.append(float(items[6]))
                star.append(float(items[16]))
                star.append(float(items[12]))
                star_list.append(star)
    return star_list
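
A brief usage sketch: a 1 Gyr population at roughly solar metal content (0.019, as described in the docstring above):

star_list = getStars(1.0e9, 0.019)
print(len(star_list))    # number of isochrone points returned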
Example #27
 def _getForm(self, response):
     forms = mechanize.ParseResponse(response, backwards_compat=False)       
     return forms[0]
Example #28
def calculate_RH(seq_list,
                 pore_size=100,
                 ion_pairing_agent='TFA',
                 pH=2,
                 proxy=''):
    # Find cached RHs in database.
    output = dict([(seq, None) for seq in seq_list])
    database = shelve.open(DATABASE_FILENAME, writeback=True)
    for seq in seq_list:
        if (seq in database
                and (pore_size, ion_pairing_agent, pH) in database[seq]):

            output[seq] = database[seq][(pore_size, ion_pairing_agent, pH)]
    remaining_seq = []
    for seq, RH in output.items():
        if RH is None:
            remaining_seq.append(seq)

    if not remaining_seq:
        return output

    # If there are undefined RHs, obtain them at the SSRCalc site:
    if proxy:
        proxy_handler = urllib2.ProxyHandler({'http': proxy})
        opener = urllib2.build_opener(proxy_handler)
        opener.addheaders = [
            ('Host', '2ip.ru\n'),
            ('User-Agent',
             'Mozilla/5.0 (Windows; U; Windows NT 5.1; ru; rv:1.8.0.2) '
             'Gecko/20060308 Firefox/1.5.0.2\n'),
            ('Accept',
             'text/xml,application/xml,application/xhtml+xml,text/html;'
             'q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5\n'),
            ('Accept-Language', 'ru-ru,ru;q=0.8,en-us;q=0.5,en;q=0.3\n'),
            ('Accept-Charset', 'windows-1251,utf-8;q=0.7,*;q=0.7\n'),
            ('X-Forwarded-For', '44.55.66.77\n'), ('Pragma', 'no-cache\n'),
            ('Referer', 'http://www.test.com\n'), ('Keep-Alive', '500\n'),
            ('Connection', 'close\n'),
            ('Content-Type', 'application/x-www-form-urlencoded\r\n\r\n')
        ]
        urllib2.install_opener(opener)
    request = urllib2.Request('http://hs2.proteome.ca/SSRCalc/SSRCalcX.html')
    response = urllib2.urlopen(request)
    forms = mechanize.ParseResponse(response, backwards_compat=False)
    response.close()
    form = forms[0]
    if ion_pairing_agent == 'FA':
        form['sver'] = ['ssrFA']
    elif pH == 10:
        form['sver'] = ['ssrXT']
    elif pore_size == 100:
        form['sver'] = ['ssr100']
    elif pore_size == 300:
        form['sver'] = ['ssr300']
    form['seqs'] = "\n".join(remaining_seq)
    result = urllib2.urlopen(form.click())
    result = result.read()

    processed_seq_re = re.compile(
        r'(?<=\<tr class\=\"bodyText\"\>\<td\>)\S+\n?\S+')
    processed_RH_re = re.compile(r'(\(\d+\)\<\/td\>\n\<td\>\s+)(-?\d+.?\d+)')

    processed_seq = processed_seq_re.findall(result)
    processed_RH = [float(rh[1]) for rh in processed_RH_re.findall(result)]
    processed_data = dict(zip(processed_seq, processed_RH))

    # Caching obtained data.
    for seq, RH in processed_data.items():
        seq = remove_html_tags(seq)
        entry = database.get(seq, {})
        entry[(pore_size, ion_pairing_agent, pH)] = RH
        database[seq] = entry
        output[seq] = RH

    database.close()

    return output
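
A minimal usage sketch with hypothetical peptide sequences; the defaults correspond to a 100 Å pore size with TFA at pH 2, and sequences the server did not return stay mapped to None:

rh_by_seq = calculate_RH(["SAMPLER", "PEPTIDEK"], pore_size=100,
                         ion_pairing_agent="TFA", pH=2)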
Example #29
    def get_submission(self, submission_url=None, submission_id=None):
        """
        Load submission (Vorlage) details for the submission given by detail page URL
        or numeric ID
        """
        # Read either submission_id or submission_url from the opposite
        if submission_id is not None:
            submission_url = self.urls[
                'SUBMISSION_DETAIL_PRINT_PATTERN'] % submission_id
        elif submission_url is not None:
            parsed = parse.search(self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                                  submission_url)
            submission_id = parsed['submission_id']

        logging.info("Getting submission %d from %s", submission_id,
                     submission_url)
        submission = Submission(numeric_id=submission_id)
        try_until = 1
        try_counter = 0
        try_found = False

        while (try_counter < try_until):
            try_counter += 1
            try_found = False
            time.sleep(self.config.WAIT_TIME)
            try:
                response = self.user_agent.open(submission_url)
            except urllib2.HTTPError, e:
                if e.code == 404:
                    sys.stderr.write(
                        "URL not found (HTTP 404) error caught: %s\n" %
                        submission_url)
                    sys.stderr.write(
                        "Please check BASE_URL in your configuration.\n")
                    sys.exit(1)
            mechanize_forms = mechanize.ParseResponse(response,
                                                      backwards_compat=False)
            response.seek(0)
            html = response.read()
            html = html.replace('&nbsp;', ' ')
            parser = etree.HTMLParser()
            dom = etree.parse(StringIO(html), parser)
            # Fetch the page again if an unknown, randomly occurring error is returned without any error message (observed in Duisburg, presumably a broken server config)
            try:
                page_title = dom.xpath('//h1')[0].text
                if 'Fehler' in page_title:
                    try_until = 3
                    try_found = True
                    logging.info(
                        "Original RIS Server Bug, restart scraping submission %s",
                        submission_url)
            except:
                pass
            if (try_found == False):
                # check for page errors
                try:
                    if 'Fehlermeldung' in page_title:
                        logging.info(
                            "Page %s cannot be accessed due to server error",
                            submission_url)
                        if self.options.verbose:
                            print "Page %s cannot be accessed due to server error" % submission_url
                        return
                    if 'Berechtigungsfehler' in page_title:
                        logging.info(
                            "Page %s cannot be accessed due to permissions",
                            submission_url)
                        if self.options.verbose:
                            print "Page %s cannot be accessed due to permissions" % submission_url
                        return
                except:
                    pass

                submission.original_url = submission_url

                # Session title
                try:
                    stitle = dom.xpath(self.xpath['SUBMISSION_DETAIL_TITLE'])
                    submission.title = stitle[0].text
                except:
                    logging.critical(
                        'Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE'
                    )
                    raise TemplateError(
                        'Cannot find submission title element using XPath SUBMISSION_DETAIL_TITLE'
                    )

                # Submission identifier, date, type etc
                tds = dom.xpath(self.xpath['SUBMISSION_DETAIL_IDENTIFIER_TD'])
                if len(tds) == 0:
                    logging.critical(
                        'Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD'
                    )
                    logging.critical('HTML Dump:' + html)
                    raise TemplateError(
                        'Cannot find table fields using XPath SUBMISSION_DETAIL_IDENTIFIER_TD'
                    )
                else:
                    current_category = None
                    for n in range(0, len(tds)):
                        try:
                            tdcontent = tds[n].text.strip()
                        except:
                            continue
                        if tdcontent == 'Name:':
                            submission.identifier = tds[n + 1].text.strip()
                        elif tdcontent == 'Art:':
                            submission.type = tds[n + 1].text.strip()
                        elif tdcontent == 'Datum:':
                            submission.date = tds[n + 1].text.strip()
                        elif tdcontent == 'Name:':
                            submission.identifier = tds[n + 1].text.strip()
                        elif tdcontent == 'Betreff:':
                            submission.subject = '; '.join(
                                tds[n + 1].xpath('./text()'))
                        elif tdcontent == 'Referenzvorlage:':
                            link = tds[n + 1].xpath('a')[0]
                            href = link.get('href')
                            parsed = parse.search(
                                self.urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                                href)
                            submission.superordinate = {
                                'identifier': link.text.strip(),
                                'numeric_id': parsed['submission_id']
                            }
                            # add superordinate submission to queue
                            if hasattr(self, 'submission_queue'):
                                self.submission_queue.add(
                                    parsed['submission_id'])
                        # subordinate submissions are added to the queue
                        elif tdcontent == 'Untergeordnete Vorlage(n):':
                            current_category = 'subordinates'
                            for link in tds[n + 1].xpath('a'):
                                href = link.get('href')
                                parsed = parse.search(
                                    self.
                                    urls['SUBMISSION_DETAIL_PARSE_PATTERN'],
                                    href)
                                if hasattr(self, 'submission_queue'
                                           ) and parsed is not None:
                                    #add subordinate submission to queue
                                    self.submission_queue.add(
                                        parsed['submission_id'])
                        else:
                            if current_category == 'subordinates':
                                for link in tds[n + 1].xpath('a'):
                                    href = link.get('href')
                                    parsed = parse.search(
                                        self.urls[
                                            'SUBMISSION_DETAIL_PARSE_PATTERN'],
                                        href)
                                    if hasattr(self, 'submission_queue'
                                               ) and parsed is not None:
                                        self.submission_queue.add(
                                            parsed['submission_id'])

                    if not hasattr(submission, 'identifier'):
                        logging.critical(
                            'Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
                        )
                        raise TemplateError(
                            'Cannot find session identifier using SESSION_DETAIL_IDENTIFIER_TD_XPATH'
                        )

                # "Beratungsfolge"(list of sessions for this submission)
                # This is currently not parsed for scraping, but only for
                # gathering session-attachment ids fpr later exclusion
                found_attachments = []
                rows = dom.xpath(self.xpath['SUBMISSION_DETAIL_AGENDA_ROWS'])
                for row in rows:
                    formfields = row.xpath(
                        './/input[@type="hidden"][@name="DT"]')
                    if len(formfields):
                        attachment_id = formfields[0].get('value')
                        if attachment_id is not None:
                            found_attachments.append(attachment_id)

                # submission-related attachments
                submission.attachments = []
                containers = dom.xpath(
                    self.xpath['SUBMISSION_DETAIL_ATTACHMENTS'])
                for container in containers:
                    try:
                        classes = container.get('class').split(' ')
                    except:
                        continue
                    if self.xpath[
                            'SUBMISSION_DETAIL_ATTACHMENTS_CONTAINER_CLASSNAME'] not in classes:
                        continue
                    rows = container.xpath('.//tr')
                    for row in rows:
                        forms = row.xpath('.//form')
                        for form in forms:
                            name = " ".join(row.xpath('./td/text()')).strip()
                            for hidden_field in form.xpath(
                                    'input[@name="DT"]'):
                                attachment_id = hidden_field.get('value')
                                if attachment_id in found_attachments:
                                    continue
                                attachment = Attachment(
                                    identifier=attachment_id, name=name)
                                #print attachment_id
                                # Traversing the whole mechanize response to submit this form
                                #print mechanize_forms
                                for mform in mechanize_forms:
                                    #print "Form found: '%s'" % mform
                                    for control in mform.controls:
                                        if control.name == 'DT' and control.value == attachment_id:
                                            attachment = self.get_attachment_file(
                                                attachment, mform)
                                            submission.attachments.append(
                                                attachment)

                # forcing overwrite=True here
                oid = self.db.save_submission(submission)
Example #30
import re
import string

import mechanize
import scraperwiki
from urllib2 import urlopen
from urlparse import urljoin

base = "http://www.dleg.state.mi.us/bcs_corp/"
search_page = "sr_corp.asp"
result_page = "dt_corp.asp"

# Next page link: rs_corp.asp?s_button=sname&v_search=a&hiddenField=&offset=40

# Get the main name search form
print urljoin(base, search_page)
main_page = urlopen(urljoin(base, search_page))
br = mechanize.Browser()
br.open(base)
forms = mechanize.ParseResponse(main_page, backwards_compat=False)
form = forms[0]
print form

# Search for something:
form.set_value("a", name="v_search")

br.open(form.click())

# Find all URLS that begin with 'dt_corp.asp'
# for letter in string.lowercase:
#     print letter

# You got cookie.
# So share it maybe?
from bs4 import BeautifulSoup