Example #1
import mechanize as me  # the source project appears to alias mechanize as "me"


def login_user(username, password):
    # Create the agent and log in.
    agent = me.Browser()
    print("Attempting to login to Garmin Connect...")
    login(agent, username, password)
    return agent
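
# The login() helper called above is defined elsewhere in the source project and is
# not shown on this page.  A minimal, hypothetical sketch of what a mechanize-based
# form login generally looks like is given below; the URL and the control names are
# placeholders, not Garmin Connect's real sign-in flow (which goes through an SSO
# service and needs considerably more work than this).
def login(agent, username, password, url='https://example.com/signin'):
    agent.open(url)
    agent.select_form(nr=0)        # assume the first form on the page is the login form
    agent['username'] = username   # placeholder control names
    agent['password'] = password
    return agent.submit()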
Example #2
import mechanize
import sys
import httplib
import argparse
import logging
import time 
from urlparse import urlparse
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

start_time = time.time()
br = mechanize.Browser()  # initiating the browser
br.addheaders = [
    ('User-agent',
     'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11')
]
br.set_handle_robots(False)
br.set_handle_refresh(False)

payloads = ['<svg "ons>', '" onfocus="alert(1);', 'javascript:alert(1)']
blacklist = ['.png', '.jpg', '.jpeg', '.mp3', '.mp4', '.avi', '.gif', '.svg',
             '.pdf']
xssLinks = []            # TOTAL CROSS SITE SCRIPTING FINDINGS


class color:
    BLUE = '\033[94m'
    RED = '\033[91m'
    GREEN = '\033[92m'
Example #3
# William Gurecky
#

import mechanize as mz
import re
import os

if not os.path.exists('endfvii'):
    os.makedirs('endfvii')

outdir = 'endfvii/'
#outdir = 'endfvi/'

# Set target page
target = 'http://t2.lanl.gov/nis/data/endf/endfvii-n.html'
#target = 'http://t2.lanl.gov/nis/data/endf/endfvi-n.html'

# Open up browser instance
br = mz.Browser()
br.open(target)

keywrd = re.compile("neutron")
links = list(br.links())
for link in links:
    if keywrd.findall(link.url):
        print("Downloading: " + link.url)
        br.follow_link(link)
        material = str(br.geturl()).split('/')[-2:]
        br.retrieve(br.geturl(), outdir + ''.join(material))
        br.back()
Example #4
import mechanize
import argparse

sr = argparse.ArgumentParser()
sr.add_argument('-u', dest='url', action='store', help='The URL to analyze')
results = sr.parse_args()

moilla = mechanize.Browser()
moilla.addheaders = [('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11')]
moilla.set_handle_robots(False)
moilla.set_handle_refresh(False)

back = ['.png', '.jpg', '.jpeg', '.mp3', '.mp4', '.avi', '.gif', '.svg', '.pdf']
XSSpay = ['<svg "ons>', '"onfocus="alert(1);', 'javascript:alert(1)']

class color:
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'

xssurl = results.url
if not xssurl:
    print color.RED + """NOT URL"""
else:
    try:
        abc = 0
        for ba in back:
            if ba in xssurl:
                print color.RED + """Not a good url to test"""
                abc = 1
        if abc == 0:
Example #5
def get_bd_index_all_mechanize(file_name):
    
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y_%m_%d_%H_%M_%S')
    f_error = open('errors_' + st, 'w')
    f_success = open('success_' + st, 'w')
    f_success.write('ID\tgg_index_common\tgg_hk\tgg_new\tgg_site\tbd_index_chinese\t'
                    +'bd_index_common\tbd_news_chinese\tbd_news_common\tbd_site\n')
    
    all_schools = []
    with codecs.open(file_name, 'r', 'utf-8') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            all_schools.append(line)
    #print all_schools
    
    count = 0
    for school in all_schools[0:]:
        sleep(random.random() * 2)
        
        while True:
            
            nums = []
            print school, nums
            try:
                ############################## gg
                # school names inside quotes
                if len(school) > 4:
                    target_en = "\"" + school[2] + "\" " + school[4]
                else:
                    target_en = "\"" + school[2] + "\""
                
                #target_ch = "\"" + school[3] + "\""
                site = 'site:' + school[1]
                print target_en, site
                
                # gg search
                # en                
                br = mechanize.Browser()
                br.set_handle_robots(False)     # ignore robots
                br.set_handle_refresh(False)    # can sometimes hang without this
                br.set_handle_redirect(True)
                br.set_handle_referer(True)
                br.addheaders = [('user-agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
                url = "http://www.google.com"
                search_box='q'
                
                br.open(url)
                #htmlFile = br.response()
                br.select_form(nr=0)
                br.form[search_box] = target_en
                response = br.submit()
                #response_html = response.get_data()
                soup = BeautifulSoup(response)
                target = soup.find('div', {'id': 'resultStats'})
                all_txt = target.text
                str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                nums_en_index = re.findall(r'\d+', str_txt)
                nums.append(''.join(nums_en_index))
                
                # gg hk
                # en
                br = mechanize.Browser()
                br.set_handle_robots(False)     # ignore robots
                br.set_handle_refresh(False)    # can sometimes hang without this
                br.set_handle_redirect(True)
                br.set_handle_referer(True)
                br.addheaders = [('user-agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
                url = "http://www.google.com.hk"
                search_box='q'
                
                br.open(url)
                #htmlFile = br.response()
                br.select_form(nr=0)
                br.form[search_box] = target_en
                response = br.submit()
                #response_html = response.get_data()
                soup = BeautifulSoup(response)
                target = soup.find('div', {'id': 'resultStats'})
                all_txt = target.text
                str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                nums_en_index = re.findall(r'\d+', str_txt)
                nums.append(''.join(nums_en_index))
                
                
                # gg news
                # en
                br = mechanize.Browser()
                br.set_handle_robots(False)     # ignore robots
                br.set_handle_refresh(False)    # can sometimes hang without this
                br.set_handle_redirect(True)
                br.set_handle_referer(True)
                br.addheaders = [('user-agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
                url = "http://news.google.com"
                search_box='q'
                
                br.open(url)
                #htmlFile = br.response()
                br.select_form(nr=0)
                br.form[search_box] = target_en
                response = br.submit()
                #response_html = response.get_data()
                soup = BeautifulSoup(response)
                target = soup.find('div', {'id': 'resultStats'})
                all_txt = target.text
                str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                nums_en_index = re.findall(r'\d+', str_txt)
                nums.append(''.join(nums_en_index))
                
                
                # gg site
                site = 'site:' + school[1]
                br = mechanize.Browser()
                br.set_handle_robots(False)     # ignore robots
                br.set_handle_refresh(False)    # can sometimes hang without this
                br.set_handle_redirect(True)
                br.set_handle_referer(True)
                br.addheaders = [('user-agent','Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
                url = "http://www.google.com"
                search_box='q'
                
                br.open(url)
                #htmlFile = br.response()
                br.select_form(nr=0)
                br.form[search_box] = site
                response = br.submit()
                #response_html = response.get_data()
                soup = BeautifulSoup(response)
                target = soup.find('div', {'id': 'resultStats'})
                all_txt = target.text
                str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                nums_en_index = re.findall(r'\d+', str_txt)
                nums.append(''.join(nums_en_index))
                
                ############################################### baidu
                # school names without quotes
                if len(school) > 4:
                    target_en = school[2] + " " + school[4]
                else:
                    target_en = school[2]
                target_ch = school[3]
                target_en = string.replace(target_en, ' ', '%20')
                target_ch = string.replace(target_ch, ' ', '%20')
                print target_ch, target_en, site
                
                while True:
                    nums_bd = []
                    try:
                        # baidu search
                        # ch
                        url = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=' \
                            +target_ch
                        url = url.encode('utf8')
                        response = urllib2.urlopen(url, timeout = 10)
                        response_html = response.read()
                        soup = BeautifulSoup(response_html)
                        #bs_html = soup.body.prettify(encoding='utf-8')
                        target = soup.find('div', {'class': 'nums'})
                        all_txt = target.text
                        str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                        nums_en_index = re.findall(r'\d+', str_txt)
                        nums_bd.append(''.join(nums_en_index))
                        
                        # baidu search
                        # en
                        url = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=' \
                            + target_en
                        response = urllib2.urlopen(url, timeout = 30)
                        response_html = response.read()
                        soup = BeautifulSoup(response_html)
                        #bs_html = soup.body.prettify(encoding='utf-8')
                        target = soup.find('div', {'class': 'nums'})
                        all_txt = target.text
                        str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                        nums_en_index = re.findall(r'\d+', str_txt)
                        nums_bd.append(''.join(nums_en_index))
                        
                        # baidu news
                        # ch
                        url = 'http://news.baidu.com/ns?cl=2&rn=20&tn=news&word='+target_ch
                        url = url.encode('utf8')
                        response = urllib2.urlopen(url, timeout = 30)
                        response_html = response.read()
                        soup = BeautifulSoup(response_html)
                        #bs_html = soup.body.prettify(encoding='utf-8')
                        target = soup.find('span', {'class': 'nums'})
                        all_txt = target.text
                        str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                        nums_en_index = re.findall(r'\d+', str_txt)
                        nums_bd.append(''.join(nums_en_index))
                        
                        
                        # baidu news
                        # en
                        url = 'http://news.baidu.com/ns?cl=2&rn=20&tn=news&word='+target_en
                        response = urllib2.urlopen(url, timeout = 30)
                        response_html = response.read()
                        soup = BeautifulSoup(response_html)
                        #bs_html = soup.body.prettify(encoding='utf-8')
                        target = soup.find('span', {'class': 'nums'})
                        all_txt = target.text
                        str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                        nums_en_index = re.findall(r'\d+', str_txt)
                        nums_bd.append(''.join(nums_en_index))
                        
                        # baidu site
                        url = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd=' \
                            + site
                        response = urllib2.urlopen(url, timeout = 30)
                        response_html = response.read()
                        soup = BeautifulSoup(response_html)
                        #bs_html = soup.body.prettify(encoding='utf-8')
                        target = soup.find('div', {'class': 'nums'})
                        all_txt = target.text
                        str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii','ignore')
                        nums_en_index = re.findall(r'\d+', str_txt)
                        nums_bd.append(''.join(nums_en_index))
                        
                        nums += nums_bd
                        one_line = school[0] + '\t' + '\t'.join(nums)
                        f_success.write(one_line + '\n')
                        f_success.flush()
                        break
                        
                    except Exception:
                        print sys.exc_info()[:2]
                        pass
                
                print 'count =', count, '&&&\t', one_line + '\n'
                count += 1
                
                #if count % 10 == 0: # sleep 10 min every 10 runs 
                    #sleep(600)
                break
                
            except Exception:
                print sys.exc_info()[:2]
                print 'count =', count, ' &&&' + '\t'.join(school) + '\n'
                count += 1
                #if count % 10 == 0: # sleep 10 min every 10 runs 
                sleep(600) # sleep 10 min if failed
                f_error.write(school[0] + '\t' + school[2] + '\n')
                f_error.flush()
                pass
    f_error.close()
    f_success.close()
Example #6
    def setUp(self):
        self.server = multiprocessing.Process(target=run_server)
        self.server.start()
        self.browser = mechanize.Browser()
        time.sleep(3)
import scraperwiki
import mechanize
import re
import urlparse
import lxml.html


# ASPX pages are some of the hardest challenges because they use javascript and forms to navigate
# Almost always the links go through the function __doPostBack(eventTarget, eventArgument)
# which you have to simulate in the mechanize form handling library

# This example shows how to follow the Next page link

url3 = 'http://home.btconnect.com/haltontransport/servicechanges.htm'
br3 = mechanize.Browser()

# sometimes the server is sensitive to this information
br3.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
response3 = br3.open(url3)


html3 = response3.read()
print html3
videoimgs = re.findall('<TD.*?>(.*?)</TD>', html3, re.MULTILINE | re.DOTALL | re.VERBOSE )
print videoimgs
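
# The page above is plain HTML, but on a real ASPX listing the "Next page" link usually
# fires __doPostBack(eventTarget, eventArgument) instead of being an ordinary href.  A
# minimal, hypothetical sketch of how that call is commonly simulated with mechanize is
# given below; the example event target is a placeholder, not a value taken from the
# page fetched above.
def follow_do_postback(browser, event_target, event_argument=''):
    browser.select_form(nr=0)             # ASPX pages normally expose a single server form
    browser.form.set_all_readonly(False)  # the __EVENT* fields are hidden/read-only
    browser['__EVENTTARGET'] = event_target
    browser['__EVENTARGUMENT'] = event_argument
    return browser.submit()               # POST back, i.e. "click" the Next link

# e.g. next_response = follow_do_postback(br3, 'ctl00$gvResults$NextPage')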



scraperwiki.sqlite.execute("delete from swdata") 

record = {}
def GetCoordinatesFromPageSoup():
    br = mechanize.Browser()
    url = "http://en.wikipedia.org/wiki/Toronto"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())
def GetCoordinatesSoup():
    # Get the information on the census divisions:
    # Name, Province, Population, Illustrative
    # If there is no illustrative, fill it with Name

    br = mechanize.Browser()

    url = "http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())

    table = soup.find_all('tr')

    print "Looking at the table:"
    rownum = 0
    locations = []
    errorRows = []
    for row in table:
        rownum += 1
        print "Here comes the row ", rownum
        #print row.text
        print "------------------"
        columns = row.find_all('td')
        try:
            # Test that the first column is a number, indicating it was used in the 2011 census
            int(columns[0].find('span', 'sorttext').text)

            #print columns[0].find('span','sorttext').text
            #print "*"
            #print columns[1].find('span','sorttext').text
            #print "**"

            name = columns[2].find('span', 'sorttext').text.encode('utf-8')
            print name
            print "And href"
            locURL = columns[2].find('span', 'sorttext')
            print locURL
            locURL = locURL.a['href'].encode('utf-8')
            print locURL
            locURL = urljoin(url, locURL)
            print locURL
            latlon = GetCoordinatesFromWikipediaPage(br, locURL)
            print "***"
            province = columns[4].find('span', 'sorttext').text.encode('utf-8')
            print province
            print "****4 - getting population"
            population = int(columns[5].text.replace(',', ''))
            print population
            # int(data.replace(',', ''))
            print "*****5 - Getting illustrative"
            illustrative = columns[8].text.encode('utf-8')
            illustrativeURL = ''
            print illustrative
            if illustrative != '':
                illustrativeURL = columns[8]
                #print illustrativeURL
                illustrativeURL = illustrativeURL.a['href'].encode('utf-8')
                print illustrativeURL
                illustrativeURL = urljoin(url, illustrativeURL)
                print illustrativeURL
                if latlon is None:
                    print "Calculating latlon from Illustrative"
                    latlon = GetCoordinatesFromWikipediaPage(
                        br, illustrativeURL)
            else:
                print "NO ILLUSTRATIVE"

            print "******6"
            print latlon
            if latlon is None:
                latlon = None, None
            #print columns[8].a # gives the href
            print "*******7"
            #print columns[8].a.text # alternate
            thisLocation = [
                name, province, population, illustrative, latlon[0], latlon[1]
            ]
            locations.append(thisLocation)
            print thisLocation
            '''
            time.sleep(4)
            print "clicking link"
            link = browser.find_element_by_link_text(name)
            link.click()
            browser.wait_for_page_to_load("5000")

            # In this section, I'm having difficulty getting the coordinates from the 
            # wikipedia page after I click the link to the district.  Would be nice!
            print "Getting Coordinates maybe"
            wholepage = find_element_by_xpath("//*").get_attribute("outerHTML")
            print wholepage

            # maybe can try getting from here:  <span class="geo">

            geo = browser.find_element_by_class.name('geo')
            print "Printing coordinates maybe"
            print geo.text

            print "Going back"
            browser.back()
            
            if rownum > 3:
                print "we done"
                return locations
            '''
        except (ValueError, AttributeError):
            pass
        except:
Example #10
    def make_browser(self):
        browser = mechanize.Browser()
        self._configure_user_agent(browser)
        return browser
def GetCoordinatesSelenium():
    # start on the wikipedia census division page
    browser = webdriver.Firefox()

    url = 'http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population'

    browser.get(url)

    # Get the information on the census divisions:
    # Name, Province, Population, Illustrative
    # If there is no illustrative, fill it with Name

    br = mechanize.Browser()

    url = "http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())

    table = soup.find_all('tr')

    print "Looking at the table:"
    rownum = 0
    locations = []
    for row in table:
        rownum += 1
        print "Here comes the row ", rownum
        #print row.text
        print "------------------"
        columns = row.find_all('td')
        try:
            # Test that the first column is a number, indicating it was used in the 2011 census
            int(columns[0].find('span', 'sorttext').text)

            #print columns[0].find('span','sorttext').text
            #print "*"
            #print columns[1].find('span','sorttext').text
            #print "**"

            name = columns[2].find('span', 'sorttext').text
            print name
            print "***"
            province = columns[4].find('span', 'sorttext').text
            print province
            print "****"
            population = int(columns[5].text.replace(',', ''))
            print population
            # int(data.replace(',', ''))
            print "*****"
            illustrative = columns[8].text
            print illustrative
            if illustrative == '':
                illustrative = name
                print illustrative
            #print "******"
            #print columns[8].a # gives the href
            #print "*******"
            #print columns[8].a.text # alternate
            thisLocation = [
                name.encode('utf-8'), province, population,
                illustrative.encode('utf-8')
            ]
            locations.append(thisLocation)
            '''
            time.sleep(4)
            print "clicking link"
            link = browser.find_element_by_link_text(name)
            link.click()
            browser.wait_for_page_to_load("5000")

            # In this section, I'm having difficulty getting the coordinates from the 
            # wikipedia page after I click the link to the district.  Would be nice!
            print "Getting Coordinates maybe"
            wholepage = find_element_by_xpath("//*").get_attribute("outerHTML")
            print wholepage

            # maybe can try getting from here:  <span class="geo">

            geo = browser.find_element_by_class.name('geo')
            print "Printing coordinates maybe"
            print geo.text

            print "Going back"
            browser.back()
            '''
            print "we done"
            return 5
        except:
            None
            print "Error"
            if rownum > 2:
                return 0
            #return 0
        print "=================="
    def login(self, className):
        """
        Login into coursera and obtain the necessary session cookies.
        """
        hn, fn = tempfile.mkstemp()
        cookies = cookielib.LWPCookieJar()
        handlers = [
            urllib2.HTTPHandler(),
            urllib2.HTTPSHandler(),
            urllib2.HTTPCookieProcessor(cookies)
        ]

        # prepend a proxy handler if defined
        if (self.proxy):
            proxy = urllib2.ProxyHandler({'http': self.proxy})
            handlers = [proxy] + handlers

        opener = urllib2.build_opener(*handlers)

        url = self.lecture_url_from_name(className)
        req = urllib2.Request(url)

        try:
            res = opener.open(req)
        except urllib2.HTTPError as e:
            if e.code == 404:
                raise Exception("Unknown class %s" % className)

        # get the csrf token
        csrfcookie = [c for c in cookies if c.name == "csrf_token"]
        if not csrfcookie: raise Exception("Failed to find csrf cookie")
        csrftoken = csrfcookie[0].value
        opener.close()

        # call the authenticator url:
        cj = cookielib.MozillaCookieJar(fn)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                      urllib2.HTTPHandler(),
                                      urllib2.HTTPSHandler())

        opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken))
        opener.addheaders.append(
            ('Referer', 'https://accounts.coursera.org/signin'))
        opener.addheaders.append(('X-CSRFToken', csrftoken))
        req = urllib2.Request(self.LOGIN_URL)

        data = urllib.urlencode({
            'email': self.username,
            'password': self.password
        })
        req.add_data(data)

        try:
            opener.open(req)
        except urllib2.HTTPError as e:
            if e.code == 401:
                raise Exception("Invalid username or password")

        # check if we managed to login
        sessionid = [c.name for c in cj if c.name == "CAUTH"]
        if not sessionid:
            raise Exception("Failed to authenticate as %s" % self.username)

        # all should be ok now, mechanize can handle the rest if we give it the
        # cookies
        br = mechanize.Browser()
        #br.set_debug_http(True)
        #br.set_debug_responses(False)
        #br.set_debug_redirects(True)
        br.set_handle_robots(False)
        br.set_cookiejar(cj)

        if self.proxy:
            br.set_proxies({"http": self.proxy})

        self.browser = br

        # also use this cookiejar for other mechanize operations (e.g., urlopen)
        opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
        mechanize.install_opener(opener)
Example #13
    def __init__(self):
        self.mb = mechanize.Browser()
Example #14
from bs4 import BeautifulSoup
import mechanize

mtrace = mechanize.Browser()
mtrace.set_handle_robots(False)

main_url = 'https://www.findandtrace.com/trace-mobile-number-location'
mtrace.open(main_url)
mtrace.select_form(name='trace')
mtrace['mobilenumber'] = ' '
response = mtrace.submit().read()

soup = BeautifulSoup(response, 'html.parser')
tbl = soup.find_all('table', class_='shop_table')

data = tbl[0].find('tfoot')
c = 0
for i in data:
    c += 1
    if c in (1, 4, 6, 8):
        continue
    th = i.find('th')
    td = i.find('td')
    print(th.text, td.text)

data = tbl[2].find('tfoot')
c = 0
for i in data:
    c += 1
    if c in (2, 20, 22, 26):
        th = i.find('th')
Example #15
def create_browser():
    br = mechanize.Browser()

    # Ignore robots.txt
    br.set_handle_robots(False)
    return br
def GetCoordinatesNoEncode(outFile):
    # Get the information on the census divisions:
    # Name, Province, Population, Illustrative
    # If there is no illustrative, fill it with Name

    br = mechanize.Browser()

    url = "http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())

    table = soup.find_all('tr')

    print "Looking at the table:"
    rowNum = 0
    locations = []
    errorRows = []
    for row in table:
        print "Evaluating row: ", rowNum
        #print row.text
        print "-----------------------"
        columns = row.find_all('td')
        try:
            # Test that the first column is a number, indicating it was used in the 2011 census
            int(columns[0].find('span', 'sorttext').text)

            # Column 2 - Census Division
            name = columns[2].find('span', 'sorttext').text
            print name
            print "*** Getting href to the census division's wikipedia page"
            locURL = columns[2].find('span', 'sorttext')
            locURL = locURL.a['href'].encode('utf-8')
            locURL = urljoin(url, locURL)
            print locURL

            # Get the lat/lon coordinates from the census division page
            latlon = GetCoordinatesFromWikipediaPage(br, locURL)

            # Column 4 - Province (abbreviated)
            province = columns[4].find('span', 'sorttext').text

            # Column 5 - Population from 2011 census
            population = int(columns[5].text.replace(',', ''))

            # Column 8 - Illustrative census subdivision
            illustrative = columns[8].text
            illustrativeURL = ''

            if illustrative != '':
                illustrativeURL = columns[8]
                #print illustrativeURL
                illustrativeURL = illustrativeURL.a['href']
                illustrativeURL = urljoin(url, illustrativeURL)
                print "*** Getting href to the census illustrative division's wikipedia page"
                print illustrativeURL
                # If we couldn't retrieve the latlon from the division's page, try the subdivision's page
                if latlon is None:
                    print "**** Calculating latlon from Illustrative"
                    latlon = GetCoordinatesFromWikipediaPage(
                        br, illustrativeURL)
            else:
                print "NO ILLUSTRATIVE"

            #print latlon
            if latlon is None:
                print "!!!!! NO COORDINATES !!!!!!"
                latlon = None, None

            # Create a list of the data we used to calculate the population and location
            thisLocation = [
                name, province, population, illustrative, latlon[0], latlon[1]
            ]
            # Append this division's list to the master list of all divisions
            locations.append(thisLocation)
            print "*** This location"
            print thisLocation

        except (ValueError, AttributeError):
            pass
        except:
Example #17
'''
#select replace(' sd d ',' ','')
#all = scraperwiki.sqlite.select('replace("CompanyNumber"," ","") from malta_companies')

#for a in all:
#    print a

# - Helper to extract a substring when we know the tokens on either side of it


def extract(text, sub1, sub2):
    """extract a substring between two substrings sub1 and sub2 in text"""
    return text.split(sub1)[-1].split(sub2)[0]


b = mechanize.Browser()
b.addheaders = [('User-agent',
                 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
l = range(21, 1000, 10)


def process_companies(response, letter, page):
    root = lxml.html.fromstring(response)
    #print lxml.html.tostring(root)
    results = root.xpath(
        '//tr[contains(@class,"rgRow")]/.| //tr[contains(@class,"rgAltRow")]/.'
    )
    print 'Processing: ', letter, 'page: ', page
    if results:
        for tr in results:
            record = {}
Example #18
        url = ACTIVITIES % (currentIndex, increment)
        response = agent.open(url)
        search = json.loads(response.get_data())


parser = argparse.ArgumentParser(
    description='Garmin Data Scraper',
    epilog='Because the hell with APIs!',
    add_help='How to use',
    prog='python download.py -u <username> -o <output dir>')
parser.add_argument('-u',
                    '--user',
                    required=True,
                    help='Garmin username. This will NOT be saved!')
parser.add_argument('-o', '--output', required=True, help='Output directory.')

args = vars(parser.parse_args())
password = getpass('Garmin account password (NOT saved): ')
username = args['user']
output = args['output']

# Create the agent and log in.
agent = me.Browser()
login(agent, username, password)

# Create output directory (if it does not already exist).
if not os.path.exists(output):
    os.mkdir(output)

# Scrape all the activities.
activities(agent, output)
Example #19
def download_problems():
    global to_return

    # Clearing download directory for new downloads
    folder = "./downloads"
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)

    # Getting list of indices of problems to download
    download_list = [int(x) for x in request.form['prob_indices'].split(',')]

    # adjust your page display settings here
    options = {
        'quiet': '',
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'no-outline': None
    }

    zip_probs = zipfile.ZipFile('./downloads/zipped_problems.zip', 'w')
    # print(download_list)
    for prob_indx in download_list:
        pdfName = to_return['problems'][int(
            prob_indx)]['id'] + ". " + to_return['problems'][int(
                prob_indx)]['name'] + '.pdf'

        # opening and saving questions.
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]
        prob_url = to_return['problems'][int(prob_indx)]['link']
        response = br.open(prob_url)
        data = response.read()

        bsoup = BeautifulSoup(data, features="lxml")

        css = ""
        for stylesheet in bsoup.find_all('link', rel="stylesheet"):
            css_url = "https:" + stylesheet.get('href')
            br1 = mechanize.Browser()
            br1.set_handle_robots(False)
            br1.addheaders = [(
                'User-agent',
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
            )]
            response1 = br1.open(css_url)
            data1 = response1.read()

            temp = BeautifulSoup(data1, features="lxml")
            css += str(temp)

        css = "<style>" + css + "</style>"
        ques = bsoup.find('div', class_="ttypography")
        ques = str(ques)
        tags = bsoup.find_all('div', class_="roundbox sidebox")[2]
        tags = str(tags)

        html = css + ques + "<div style=\"margin:2em\"></p>" + tags
        pdfkit.from_string(html,
                           os.path.join(folder, pdfName),
                           options=options)
        zip_probs.write(os.path.join(folder, pdfName),
                        pdfName,
                        compress_type=zipfile.ZIP_DEFLATED)

    zip_probs.close()

    try:
        return send_file('./downloads/zipped_problems.zip',
                         attachment_filename='zipped_problems.zip')
    except Exception as e:
        return str(e)
def redditData(i, subreddits, debug, minDelay, maxDelay, b):
    """Function to scrape data from reddit subreddits
    subreddits = list of subreddits
    debug = print updates on screen
    minDelay = minimum delay between each scrape
    maxDelay = maximum delay between each scrape
    b = connection to AWS bucket
    """

    # intantiate an instance of mechanize with headers

    br = mechanize.Browser()

    header = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Referer': 'http://www.reddit.com'
    }
    # Cookie Jar
    cj = cookielib.LWPCookieJar()

    # Browser Options
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # iterate through the subreddits. Stop the program when the user
    # terminates or there is no more data
    for subreddit in subreddits:
        page = 1
        titles = []
        while True:
            url = base_url + str(subreddit).strip().replace(
                '\t', '') + '?count=' + str(25 * (page - 1))
            page += 1
            if page % 3 == 0:
                titles = []
            if debug:
                sys.stdout.write('Visiting reddit url :: ' + url + '\n' + '\n')

            logging.info('Visiting reddit url :: ' + url + '\n' + '\n')

            # wrap the request.
            request = urllib2.Request(url, None, header)
            br.open(request)
            html = br.response().read()
            soup = BeautifulSoup(html, 'lxml')

            siteTable = soup.find(attrs={'id': 'siteTable'})
            divs = siteTable.findAll('div')

            for div in divs:
                try:
                    timestamp = div['data-timestamp']
                    date_time = str(
                        parser.parse(div.find('time')['datetime']).replace(
                            second=0).isoformat()).replace(
                                ':00+00:00', '+00:00')
                    day = str(
                        datetime.fromtimestamp(int(timestamp) / 1000).date())
                    time_post = str(
                        datetime.fromtimestamp(int(timestamp) /
                                               1000).isoformat())
                    title = unidecode(
                        div.find(attrs={
                            'class': 'title'
                        }).find('a').getText())
                    rank = div.find(attrs={'class': 'rank'}).getText()
                    link = div.find(attrs={'class': 'title'}).find('a')['href']
                    comment_link = div.find(attrs={
                        'class': 'flat-list buttons'
                    }).find('a')['href']

                    logging.info('Visiting reddit comment url :: ' +
                                 comment_link + '\n' + '\n')

                    # wrap the request.
                    request_comment = urllib2.Request(comment_link, None,
                                                      header)
                    br.open(request_comment)
                    html_comment = br.response().read()
                    soup_comment = BeautifulSoup(html_comment, 'lxml')

                    comment_dict = {}
                    words = title

                    comments = soup_comment.findAll(attrs={'class': 'comment'})

                    # get comment 1 and 3 children
                    comment_dict['commentary_1'] = {}
                    comment_dict['commentary_1']['child_comments'] = {}
                    comment_dict['commentary_2'] = {}
                    comment_dict['commentary_2']['child_comments'] = {}
                    comment_dict['commentary_3'] = {}
                    comment_dict['commentary_3']['child_comments'] = {}
                    try:
                        comment_dict['commentary_1']['words'] = unidecode(
                            comments[0].find(attrs={
                                'class': 'usertext warn-on-unload'
                            }).getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            comments[0].find(attrs={
                                'class': 'usertext warn-on-unload'
                            }).getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_1']['words'] = ''
                    try:
                        comment_dict['commentary_1']['points'] = unidecode(
                            comments[0].find(attrs={
                                'class': 'score likes'
                            }).getText())
                    except:
                        comment_dict['commentary_1']['points'] = ''
                    try:
                        comment_dict['commentary_1']['time'] = unidecode(
                            comments[0].find(attrs={
                                'class': 'tagline'
                            }).find('time')['title'])
                    except:
                        comment_dict['commentary_1']['time'] = ''
                    try:
                        comment_dict['commentary_1']['user'] = unidecode(
                            comments[0].find(attrs={
                                'class': 'tagline'
                            }).findAll('a')[1].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_1']['user'] = ''

                    try:
                        children = comments[0].findAll(
                            attrs={'class': 'usertext warn-on-unload'})
                    except:
                        pass
                    try:
                        comment_dict['commentary_1']['child_comments'][
                            'comment_1'] = unidecode(
                                children[1].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[1].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_1']['child_comments'][
                            'comment_1'] = ''

                    try:
                        comment_dict['commentary_1']['child_comments'][
                            'comment_2'] = unidecode(
                                children[2].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[2].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_1']['child_comments'][
                            'comment_2'] = ''

                    try:
                        comment_dict['commentary_1']['child_comments'][
                            'comment_3'] = unidecode(
                                children[3].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[3].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_1']['child_comments'][
                            'comment_3'] = ''

                    # get comment 2 and 3 children
                    try:
                        comment_dict['commentary_2']['words'] = unidecode(
                            comments[1].find(attrs={
                                'class': 'usertext warn-on-unload'
                            }).getText())
                        words = words + ' ' + unidecode(
                            comments[1].find(attrs={
                                'class': 'usertext warn-on-unload'
                            }).getText())
                    except:
                        comment_dict['commentary_2']['words'] = ''
                    try:
                        comment_dict['commentary_2']['points'] = unidecode(
                            comments[1].find(attrs={
                                'class': 'score likes'
                            }).getText())
                    except:
                        comment_dict['commentary_2']['points'] = ''
                    try:
                        comment_dict['commentary_2']['time'] = unidecode(
                            comments[1].find(attrs={
                                'class': 'tagline'
                            }).find('time')['title'])
                    except:
                        comment_dict['commentary_2']['time'] = ''
                    try:
                        comment_dict['commentary_2']['user'] = unidecode(
                            comments[1].find(attrs={
                                'class': 'tagline'
                            }).findAll('a')[1].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_2']['user'] = ''

                    try:
                        children = comments[0].findAll(
                            attrs={'class': 'usertext warn-on-unload'})
                    except:
                        children = []
                    try:
                        comment_dict['commentary_2']['child_comments'][
                            'comment_1'] = unidecode(
                                children[1].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[1].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_2']['child_comments'][
                            'comment_1'] = ''

                    try:
                        comment_dict['commentary_2']['child_comments'][
                            'comment_2'] = unidecode(
                                children[2].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[2].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_2']['child_comments'][
                            'comment_2'] = ''

                    try:
                        comment_dict['commentary_2']['child_comments'][
                            'comment_3'] = unidecode(
                                children[3].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[3].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_2']['child_comments'][
                            'comment_3'] = ''

                    # get comment 3 and 3 children
                    try:
                        comment_dict['commentary_3']['words'] = unidecode(
                            comments[2].find(attrs={
                                'class': 'usertext warn-on-unload'
                            }).getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            comments[2].find(attrs={
                                'class': 'usertext warn-on-unload'
                            }).getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_3']['words'] = ''
                    try:
                        comment_dict['commentary_3']['points'] = unidecode(
                            comments[2].find(attrs={
                                'class': 'score likes'
                            }).getText())
                    except:
                        comment_dict['commentary_3']['points'] = ''
                    try:
                        comment_dict['commentary_3']['time'] = unidecode(
                            comments[2].find(attrs={
                                'class': 'tagline'
                            }).find('time')['title'])
                    except:
                        comment_dict['commentary_3']['time'] = ''
                    try:
                        comment_dict['commentary_3']['user'] = unidecode(
                            comments[2].find(attrs={
                                'class': 'tagline'
                            }).findAll('a')[1].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_3']['user'] = ''

                    try:
                        children = comments[2].findAll(
                            attrs={'class': 'usertext warn-on-unload'})
                    except:
                        children = []
                    try:
                        comment_dict['commentary_3']['child_comments'][
                            'comment_1'] = unidecode(
                                children[1].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[1].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_3']['child_comments'][
                            'comment_1'] = ''

                    try:
                        comment_dict['commentary_3']['child_comments'][
                            'comment_2'] = unidecode(
                                children[2].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[2].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_3']['child_comments'][
                            'comment_2'] = ''

                    try:
                        comment_dict['commentary_3']['child_comments'][
                            'comment_3'] = unidecode(
                                children[3].getText().replace('\n', ' '))
                        words = words + ' ' + unidecode(
                            children[3].getText().replace('\n', ' '))
                    except:
                        comment_dict['commentary_3']['child_comments'][
                            'comment_3'] = ''
                    abstract = trimArticle(words, 50)

                    if '/r/' in link:
                        link = base_url + link[1:]

                    logging.info(
                        'Successfully got 3 comments from reddit comment url :: '
                        + comment_link + '\n' + '\n')

                    if title in titles:
                        logging.info('Scraped all posts from sub reddit :: ' +
                                     url + '\n' + '\n')
                        break

                    titles.append(title)

                    # write the fellow summary to file
                    file_name = 'reddit_' + title.replace(
                        ' ', '-') + '_' + day + '.json'
                    file_name = ''.join(c for c in file_name
                                        if c in valid_chars)

                    if os.name == 'nt':
                        f = open('reddit_jsons//' + file_name, 'wb')
                    else:
                        f = open('reddit_jsons/' + file_name, 'wb')
                    folder = 'reddit_jsons'
                    logging.info('Opened ' + 'reddit_jsons//' + file_name +
                                 '.json' + ' for writing')

                    data = {
                        'abstract': abstract,
                        'external_id': 'reddit_' + title.replace(' ', '-'),
                        'date': date_time,
                        'title': title,
                        'words': words,
                        'meta': {
                            'reddit': {
                                'comments': str(comment_dict),
                                'link': link,
                                'rank': rank
                            }
                        },
                        'url': comment_link
                    }

                    f.write(json.dumps(data))
                    f.close()
                    logging.info('File written ' + file_name + '.json' + '')
                    if os.name == 'nt':
                        uploadDataS3(folder + '//' + file_name, b)
                    else:
                        uploadDataS3(folder + '/' + file_name, b)

                    if debug:
                        sys.stdout.write(file_name + ' written' + '\n')

                except Exception as e:
                    # print str(e)
                    pass

            wait_time = random.randint(minDelay, maxDelay)
            sys.stdout.write('Sleeping for :: ' + str(wait_time) + '\n')
            logging.info('Sleeping for :: ' + str(wait_time) + '\n')
            sys.stdout.write('******************************************' +
                             '\n')
            sys.stdout.write('******************************************' +
                             '\n')
            time.sleep(wait_time)
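
# A hypothetical call to redditData(), sketched only to show how the parameters
# documented in the docstring fit together.  The index i, the subreddit paths and
# the bucket object are placeholder values, not ones from the original project
# (which also defines base_url, uploadDataS3 and trimArticle elsewhere).
#
#   bucket = None  # normally a boto S3 bucket connection passed in as "b"
#   redditData(0, ['r/Python/', 'r/programming/'], debug=True,
#              minDelay=30, maxDelay=90, b=bucket)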
Example #21
import os
import codecs
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import pyperclip
import mechanize
import cookielib
import xml.etree.ElementTree
import requests
# declare the browser

br = mechanize.Browser(factory=mechanize.RobustFactory())

cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

chromedriver = 'C:\\chromedriver.exe'
browser = webdriver.Chrome(chromedriver)

paginas = codecs.open('pages.txt', encoding='iso-8859-1')

br.set_handle_robots(False)
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_debug_responses(True)
def login(user, pw, cookiePath):

    br = mechanize.Browser()

    cj = cookielib.LWPCookieJar()

    br.set_cookiejar(cj)
    br.set_handle_robots(False)

    br.open("https://www.onlinetvrecorder.com/v2/?go=home")

    # login form
    br.select_form('fhomelogin')

    br['email'] = user
    br['password'] = pw

    # result = br.submit().read()

    # change 02/19
    # not working since the base url is wrong...

    loginURL = "https://www.onlinetvrecorder.com/v2/?go=login"
    params = {
        u'email': user,
        u'password': pw,
        u'rememberlogin': '******',
        u'btn_login': '******'
    }
    data = urllib.urlencode(params)
    response = br.open(loginURL, data)
    result = response.read()

    em = ''
    pw = ''

    # get user and pw and set cookies
    m = re.search('otr_email=(.*?);', result)
    if (m != None):
        em = m.group(1)
    m = re.search('otr_password=(.*?);', result)
    if (m != None):
        pw = m.group(1)

    date = datetime.datetime.now()
    ts = time.mktime(date.timetuple())
    ts = ts + 86400

    c = cookielib.Cookie(version=0,
                         name='otr_email',
                         value=em,
                         port=None,
                         port_specified=False,
                         domain='onlinetvrecorder.com',
                         domain_specified=False,
                         domain_initial_dot=False,
                         path='/',
                         path_specified=True,
                         secure=False,
                         expires=ts,
                         discard=True,
                         comment=None,
                         comment_url=None,
                         rest={'HttpOnly': None},
                         rfc2109=False)
    cj.set_cookie(c)
    c = cookielib.Cookie(version=0,
                         name='otr_email',
                         value=em,
                         port=None,
                         port_specified=False,
                         domain='www.onlinetvrecorder.com',
                         domain_specified=False,
                         domain_initial_dot=False,
                         path='/',
                         path_specified=True,
                         secure=False,
                         expires=ts,
                         discard=True,
                         comment=None,
                         comment_url=None,
                         rest={'HttpOnly': None},
                         rfc2109=False)
    cj.set_cookie(c)

    c = cookielib.Cookie(version=0,
                         name='otr_password',
                         value=pw,
                         port=None,
                         port_specified=False,
                         domain='onlinetvrecorder.com',
                         domain_specified=False,
                         domain_initial_dot=False,
                         path='/',
                         path_specified=True,
                         secure=False,
                         expires=ts,
                         discard=True,
                         comment=None,
                         comment_url=None,
                         rest={'HttpOnly': None},
                         rfc2109=False)
    cj.set_cookie(c)
    c = cookielib.Cookie(version=0,
                         name='otr_password',
                         value=pw,
                         port=None,
                         port_specified=False,
                         domain='www.onlinetvrecorder.com',
                         domain_specified=False,
                         domain_initial_dot=False,
                         path='/',
                         path_specified=True,
                         secure=False,
                         expires=ts,
                         discard=True,
                         comment=None,
                         comment_url=None,
                         rest={'HttpOnly': None},
                         rfc2109=False)
    cj.set_cookie(c)

    #now reload
    response = br.reload()
    result = response.read()

    x = ItemClass()
    x.state = 'not logged in'
    x.id = '0'
    x.decode = '0'
    x.value = '0'

    # info from website
    match = re.search('my_user_id="(?P<id>.*?)";.*?my_ut="(?P<state>.*?)"',
                      result)
    if (match != None):
        if (match.group('state') != ''):
            cj.save(cookiePath, ignore_discard=True, ignore_expires=True)

            x.id = match.group('id')
            x.state = match.group('state').title()

        match = re.search(
            '<a.href="history.decodings".*?<div.*?>(?P<value>[^<]*)<', result,
            re.DOTALL)
        if (match != None):
            x.decode = match.group('value')

        match = re.search(
            '<div.id="cssmenuright">.*?<a.href="points.*?>(?P<value>[^<]*)<',
            result, re.DOTALL)
        if (match != None):
            x.value = match.group('value')

    return x
    def __init__(self, args):
        self.args = args
        self.fixed_password = args.password is not None
        self.last_connect = 0

        if args.enable_funk:
            if not args.platform:
                args.platform = platform.system() + ' ' + platform.release()
            if not args.hostname:
                args.hostname = socket.gethostname()
            if not args.hwaddr:
                args.hwaddr = []
                for iface in netifaces.interfaces():
                    try:
                        mac = netifaces.ifaddresses(iface)[
                            netifaces.AF_LINK][0]['addr']
                        assert mac != '00:00:00:00:00:00'
                        args.hwaddr.append(mac)
                    except Exception:
                        # skip interfaces without a usable MAC address
                        pass
            else:
                args.hwaddr = [n.strip() for n in args.hwaddr.split(',')]

            certs = []
            if args.certs:
                now = datetime.datetime.now()
                for f in args.certs.split(','):
                    cert = tncc.x509cert(f.strip())
                    if now < cert.not_before:
                        print 'WARNING: %s is not yet valid' % f
                    if now > cert.not_after:
                        print 'WARNING: %s is expired' % f
                    certs.append(cert)
            args.certs = certs

        self.br = mechanize.Browser()

        self.cj = cookielib.LWPCookieJar()
        self.br.set_cookiejar(self.cj)

        # Browser options
        self.br.set_handle_equiv(True)
        self.br.set_handle_redirect(True)
        self.br.set_handle_referer(True)
        self.br.set_handle_robots(False)

        # Follows refresh 0, but does not hang on refresh > 0
        self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                                   max_time=1)

        # Want debugging messages?
        if debug:
            self.br.set_debug_http(True)
            self.br.set_debug_redirects(True)
            self.br.set_debug_responses(True)

        if args.user_agent:
            self.user_agent = args.user_agent
        else:
            self.user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'

        self.br.addheaders = [('User-agent', self.user_agent)]

        self.last_action = None
        self.needs_2factor = False
        self.key = None
        self.pass_postfix = None
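# A hedged usage sketch for the constructor above: it expects an argparse-style
# namespace.  The attribute values below are assumptions derived from what
# __init__ reads, and VpnClient is a placeholder name for the real class.
import argparse

args = argparse.Namespace(
    password=None,      # no fixed password -> fixed_password stays False
    enable_funk=False,  # skip the host-checker branch (platform/hwaddr/certs)
    user_agent=None,    # fall back to the built-in Firefox User-Agent string
)
# client = VpnClient(args)   # hypothetical class name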
Exemple #24
0
def readAeWeb(sTime, eTime=None, res=60):
    """This function reads ae data from the WDC kyoto website
    
    Parameters
    ----------
    sTime : datetime
        the earliest time you want data for
    eTime : Optional[datetime]
        the latest time you want data for.  if this is None, eTime will
        be equal to sTime.  eTime must not be more than 366 days after
        sTime.  default = None
    res : Optional[int]
        the time resolution desired, either 1 or 60 minutes.  default=60

    Notes
    -----
    You should not use this. Use the general function gme.ind.ae.readAe instead.
    
    Example
    -------
        import datetime as dt
        aeList = gme.ind.readAeWeb(dt.datetime(2011,1,1,1,50),eTime=dt.datetime(2011,1,1,10,0))
        
    written by AJ, 20130131

    """
    import datetime as dt
    import mechanize

    assert (isinstance(
        sTime, dt.datetime)), logging.error('sTime must be a datetime object')
    if (eTime == None): eTime = sTime
    assert (isinstance(
        eTime, dt.datetime)), logging.error('eTime must be a datetime object')
    assert (eTime >= sTime), logging.error('eTime must be >= sTime')
    assert (res == 1 or res == 60), logging.error('res must be 1 or 60')
    delt = eTime - sTime
    assert (delt.days <= 366), logging.error('cannot read more than 366 days')

    br = mechanize.Browser()
    br.set_handle_robots(False)  # no robots
    br.set_handle_refresh(False)  # can sometimes hang without this
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    if (res == 60):
        sCent = sTime.year / 100
        sTens = (sTime.year - sCent * 100) / 10
        sYear = sTime.year - sCent * 100 - sTens * 10
        sMonth = sTime.strftime("%m")
        eCent = eTime.year / 100
        eTens = (eTime.year - eCent * 100) / 10
        eYear = eTime.year - eCent * 100 - eTens * 10
        eMonth = eTime.strftime("%m")

        br.open('http://wdc.kugi.kyoto-u.ac.jp/dstae/index.html')

        br.form = list(br.forms())[0]

        #fill out the page fields
        br.form.find_control('SCent').value = [str(sCent)]
        br.form.find_control('STens').value = [str(sTens)]
        br.form.find_control('SYear').value = [str(sYear)]
        br.form.find_control('SMonth').value = [sMonth]
        br.form.find_control('ECent').value = [str(eCent)]
        br.form.find_control('ETens').value = [str(eTens)]
        br.form.find_control('EYear').value = [str(eYear)]
        br.form.find_control('EMonth').value = [eMonth]

        br.form.find_control('Output').value = ['AE']
        br.form.find_control('Out format').value = ['IAGA2002']
        br.form.find_control('Email').value = "*****@*****.**"

    else:
        tens = (sTime.year) / 10
        year = sTime.year - tens * 10
        month = sTime.strftime("%m")
        dtens = sTime.day / 10
        day = sTime.day - dtens * 10
        htens = sTime.hour / 10
        hour = sTime.hour - htens * 10
        ehtens = eTime.hour / 10
        ehour = eTime.hour - ehtens * 10
        minute_tens = sTime.minute / 10
        minute = sTime.minute - minute_tens * 10
        eminute_tens = eTime.minute / 10
        eminute = eTime.minute - eminute_tens * 10
        ddtens = delt.days / 10
        dday = delt.days - ddtens * 10

        br.open('http://wdc.kugi.kyoto-u.ac.jp/aeasy/index.html')

        br.form = list(br.forms())[0]

        #fill out the fields
        br.form.find_control('Tens').value = [str(tens)]
        br.form.find_control('Year').value = [str(year)]
        br.form.find_control('Month').value = [str(month)]
        br.form.find_control('Day_Tens').value = [str(dtens)]
        br.form.find_control('Days').value = [str(day)]
        #br.form.find_control('Hour_Tens').value = [str(htens)]
        br.form.find_control('Hour').value = [str(htens) + str(hour)]
        br.form.find_control('min').value = [str(minute_tens) + str(minute)]
        if (ddtens < 9): ddtens = '0' + str(ddtens)
        br.form.find_control('Dur_Day_Tens').value = [str(ddtens)]
        br.form.find_control('Dur_Day').value = [str(dday)]
        br.form.find_control('Dur_Hour').value = [str(ehtens) + str(ehour)]
        br.form.find_control('Dur_Min').value = [
            str(eminute_tens) + str(eminute)
        ]
        br.form.find_control('Output').value = ['AE']
        br.form.find_control('Out format').value = ['IAGA2002']
        br.form.find_control('Email').value = "*****@*****.**"

    response = br.submit()

    #get the data
    lines = response.readlines()

    aeList = []
    for l in lines:
        #check for headers
        if (l[0] == ' ' or l[0:4] == 'DATE'): continue
        try:
            aeList.append(aeRec(webLine=l, res=res))
        except Exception, e:
            logging.exception(e)
            logging.exception('problem initializing ae object')

    return aeList
Exemple #25
0
def testProxy(url, proxy):
    browser = mechanize.Browser()
    browser.set_proxies(proxy)
    page = browser.open(url)
    source_code = page.read()
    print source_code
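# A minimal usage sketch for testProxy(); the proxy address and URL below are
# placeholders, not values from the original script.  mechanize's set_proxies()
# takes a dict mapping scheme to proxy host:port.
proxy = {'http': '127.0.0.1:8080'}
testProxy('http://example.com', proxy)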
Exemple #26
0
rangeInputLower = int(raw_input("Lower row limit? "))
rangeInputUpper = int(raw_input("Upper row limit? "))


## inputs
input1 = "input/townsOnly.csv"
input2 = "input/malefirstnames.csv"


######################
# Mechanize Set up

#  set up the browser
driver = mechanize.Browser()

# Enable cookie support for mechanize 
cookiejar = cookielib.LWPCookieJar() 
driver.set_cookiejar( cookiejar ) 

# Browser options
driver.set_handle_equiv(True)
driver.set_handle_gzip(True)
driver.set_handle_redirect(True)
driver.set_handle_referer(True)
driver.set_handle_robots(False)

# Follow meta-refresh redirects, but give up after 1 second so slow refresh
# loops do not hang the script
driver.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
driver.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
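# The example above stops after configuration.  A hedged sketch of how the
# configured driver might then be used with the CSV inputs; SEARCH_URL and the
# form field name 'q' are hypothetical, not taken from the original script.
import csv

SEARCH_URL = 'http://example.com/search'

with open(input1, 'rb') as f:            # Python 2: open CSV files in binary mode
    towns = [row[0] for row in csv.reader(f)]

for town in towns[rangeInputLower:rangeInputUpper]:
    driver.open(SEARCH_URL)
    driver.select_form(nr=0)             # assume the first form on the page
    driver.form['q'] = town              # hypothetical field name
    response = driver.submit()
    print town, len(response.read())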
Exemple #27
0
#!/usr/bin/env python2
#Modules
import mechanize
import itertools
import cookielib
import sys
from bs4 import BeautifulSoup
from re import search, findall
from urllib import urlopen
#Stuff related to Mechanize browser module
br = mechanize.Browser()  # keep the browser instance in the short variable "br"
# set cookies
cookies = cookielib.LWPCookieJar()
br.set_cookiejar(cookies)
# Mechanize settings
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_debug_http(False)
br.set_debug_responses(False)
br.set_debug_redirects(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
), ('Accept',
    'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                 ('Accept-Encoding', 'br')]
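# A hedged sketch of how this configured browser is typically paired with the
# BeautifulSoup import above; the URL is a placeholder, not from the original.
response = br.open('http://example.com')
soup = BeautifulSoup(response.read(), 'html.parser')
for link in soup.find_all('a'):
    print link.get('href')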
# Banner
        raw_hours = root.cssselect('tr')
        days = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
        opening = {}
        for i, day in enumerate(days):
            daily_hours = raw_hours[i].cssselect('td')[1].text_content().split('-')
            opening[day + '_opening'] = daily_hours[0]
            opening[day + '_closing'] = daily_hours[1]
        store['opening'] = opening
        m = re.compile(r'new google.maps.LatLng(.*?);').search(store_html)
        store['location'] = m.group(1).replace('(','').replace(')','')
        print store
        scraperwiki.sqlite.save(unique_keys=["name"], data=store)

# And finally - international stores. 
INTERNATIONAL_URL = 'http://corporate.marksandspencer.com/aboutus/where/international_stores'
browser = mechanize.Browser()
browser.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
browser.open(INTERNATIONAL_URL)
#print browser.response().read()
html = browser.response().get_data().replace('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">','').replace('<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">','<html>')
response = mechanize.make_response(
    html, [("Content-Type", "text/html")],
    INTERNATIONAL_URL, 200, "OK")
browser.set_response(response)
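# set_response() above swaps the cleaned-up HTML back into the browser, so any
# later form or link parsing operates on the repaired markup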
#browser.select_form(nr=0)
import json
import mechanize
import re
import scraperwiki
import urllib2
import lxml.html
Exemple #29
0
import os, time

try:
    import mechanize
except ImportError:
    os.system('pip2 install mechanize')
    time.sleep(1)
    print 'Then type: python2 boss'

import os,sys,time,datetime,random,hashlib,re,threading,json,urllib,cookielib,requests,mechanize
from multiprocessing.pool import ThreadPool
from requests.exceptions import ConnectionError
from mechanize import Browser


reload(sys)
sys.setdefaultencoding('utf8')
br = mechanize.Browser()
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),max_time=1)
br.addheaders = [('User-Agent', 'Opera/9.80 (Android; Opera Mini/32.0.2254/85. U; id) Presto/2.12.423 Version/12.16')]
# note: the next assignment replaces the Opera header set on the line above
br.addheaders = [('user-agent','Dalvik/1.6.0 (Linux; U; Android 4.4.2; NX55 Build/KOT5506) [FBAN/FB4A;FBAV/106.0.0.26.68;FBBV/45904160;FBDM/{density=3.0,width=1080,height=1920};FBLC/it_IT;FBRV/45904160;FBCR/PosteMobile;FBMF/asus;FBBD/asus;FBPN/com.facebook.katana;FBDV/ASUS_Z00AD;FBSV/5.0;FBOP/1;FBCA/x86:armeabi-v7a;]')]

def keluar():
	print 'Thanks.'
	os.sys.exit()

def acak(b):
    # interleave each character of b with '!' plus a random letter from w;
    # cetak() is defined elsewhere in the original script
    w = 'ahtdzjc'
    d = ''
    for i in b:
        d += '!'+w[random.randint(0,len(w)-1)]+i
    return cetak(d)
Exemple #30
0
def cli(fr, msg, notifs, bdays):
    browser = mechanize.Browser()
    browser.set_handle_robots(False)  # ignore robots.txt restrictions
    cookies = mechanize.CookieJar()
    browser.set_cookiejar(cookies)
    browser.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7'
    )]
    browser.set_handle_refresh(False)  #Sometimes hangs without this
    bday_people_links = []  # profile links of people who have their birthdays today
    bday_people_names = []  # their names
    i = 1
    try:
        if (bdays):
            url = 'http://www.facebook.com/events/birthdays/'
            soup = authenticate(browser, url, email,
                                pwd)  #Parses the html and stores in 'soup'
            bday_box = soup.find(
                'div',
                attrs={
                    'class': '_4-u2 _tzh _fbBirthdays__todayCard _4-u8'
                })  #Finds the html with the div tags and given attributes
            bday_box_narrow = bday_box.find_all(
                'a', attrs={'data-hovercard-prefer-more-content-show': '1'}
            )  # finds all <a> tags with the given attribute; this is the list of birthdays
            click.echo("%d people have their birthdays today :\n" %
                       (len(bday_box_narrow)))
            for a in bday_box_narrow:
                print str(i) + ')', a.text  # print the names of people whose birthday is today
                bday_people_names += [a.text]  # store the name
                bday_people_links += [a.get('href')]  # store the profile link
                i += 1
        else:
            url = 'http://www.facebook.com/login.php'
            soup = authenticate(browser, url, email,
                                pwd)  #Parses the html and stores in 'soup'
            if (fr):  #To find number of new friend request
                fr_num_box = soup.find('span',
                                       attrs={
                                           'id': 'requestsCountValue'
                                       })  #Finds span tags with the given ID
                click.echo(
                    "You have %s new friend requests" % (fr_num_box.text)
                )  #Displays and gives the string between the span tags (<span>...</span>)
            if (msg):  #To find number of unread messages
                msg_num_box = soup.find(
                    'span', attrs={'id': 'mercurymessagesCountValue'})
                click.echo("You have %s unread messages" % (msg_num_box.text))
            if (notifs):  #To find the number of unseen notifications
                notifs_num_box = soup.find(
                    'span', attrs={'id': 'notificationsCountValue'})
                click.echo("You have %s unread notifications" %
                           (str(int(notifs_num_box.text) + 1)))
    except AttributeError:
        click.echo("Either the password or email id you've entered is wrong")