def login_user(username, password):
    # Create the agent and log in.
    agent = me.Browser()
    print("Attempting to login to Garmin Connect...")
    login(agent, username, password)
    return agent
import mechanize
import sys
import httplib
import argparse
import logging
import time
from urlparse import urlparse
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
start_time = time.time()

br = mechanize.Browser()  # initiating the browser
br.addheaders = [
    ('User-agent',
     'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11')
]
br.set_handle_robots(False)
br.set_handle_refresh(False)

payloads = ['<svg "ons>', '" onfocus="alert(1);', 'javascript:alert(1)']
blacklist = ['.png', '.jpg', '.jpeg', '.mp3', '.mp4', '.avi', '.gif', '.svg', '.pdf']
xssLinks = []  # TOTAL CROSS SITE SCRIPTING FINDINGS


class color:
    BLUE = '\033[94m'
    RED = '\033[91m'
    GREEN = '\033[92m'
# William Gurecky
#
import mechanize as mz
import re
import os

if not os.path.exists('endfvii'):
    os.makedirs('endfvii')
outdir = 'endfvii/'
#outdir = 'endfvi/'

# Set target page
target = 'http://t2.lanl.gov/nis/data/endf/endfvii-n.html'
#target = 'http://t2.lanl.gov/nis/data/endf/endfvi-n.html'

# Open up browser instance
br = mz.Browser()
br.open(target)

keywrd = re.compile("neutron")
links = list(br.links())
for link in links:
    if keywrd.findall(link.url):
        print("Downloading: " + link.url)
        br.follow_link(link)
        material = str(br.geturl()).split('/')[-2:]
        br.retrieve(br.geturl(), outdir + ''.join(material))
        br.back()
import mechanize
import argparse

sr = argparse.ArgumentParser()
sr.add_argument('-u', dest='url', action='store', help='The URL to analyze')
results = sr.parse_args()

moilla = mechanize.Browser()
moilla.addheaders = [('User-agent',
                      'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11)Gecko/20071127 Firefox/2.0.0.11')]
moilla.set_handle_robots(False)
moilla.set_handle_refresh(False)

back = ['.png', '.jpg', '.jpeg', '.mp3', '.mp4', '.avi', '.gif', '.svg', '.pdf']
XSSpay = ['<svg "ons>', '"onfocus="alert(1);', 'javascript:alert(1)']


class color:
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'


xssurl = results.url
if not xssurl:
    print color.RED + """NOT URL"""
else:
    try:
        abc = 0
        for ba in back:
            if ba in xssurl:
                print color.RED + """Not a good url to test"""
                abc = 1
        if abc == 0:
def _gg_result_count(url, query):
    """Submit `query` to the first form on a Google-style search page at `url`
    and return the digits of the reported result count (the resultStats div)."""
    br = mechanize.Browser()
    br.set_handle_robots(False)   # ignore robots
    br.set_handle_refresh(False)  # can sometimes hang without this
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.addheaders = [('user-agent',
                      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3')]
    br.open(url)
    br.select_form(nr=0)
    br.form['q'] = query
    response = br.submit()
    soup = BeautifulSoup(response)
    target = soup.find('div', {'id': 'resultStats'})
    all_txt = target.text
    str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii', 'ignore')
    return ''.join(re.findall(r'\d+', str_txt))


def _bd_result_count(url, tag='div', timeout=30):
    """Fetch a Baidu result page and return the digits of the reported result
    count (the element with class 'nums')."""
    response = urllib2.urlopen(url, timeout=timeout)
    soup = BeautifulSoup(response.read())
    target = soup.find(tag, {'class': 'nums'})
    all_txt = target.text
    str_txt = unicodedata.normalize('NFKD', all_txt).encode('ascii', 'ignore')
    return ''.join(re.findall(r'\d+', str_txt))


def get_bd_index_all_mechanize(file_name):
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y_%m_%d_%H_%M_%S')
    f_error = open('errors_' + st, 'w')
    f_success = open('success_' + st, 'w')
    f_success.write('ID\tgg_index_common\tgg_hk\tgg_new\tgg_site\tbd_index_chinese\t'
                    'bd_index_common\tbd_news_chinese\tbd_news_common\tbd_site\n')

    # Read the tab-separated school list.
    all_schools = []
    with codecs.open(file_name, 'r', 'utf-8') as f:
        for line in f.readlines():
            all_schools.append(line.strip().split('\t'))

    count = 0
    for school in all_schools:
        sleep(random.random() * 2)
        while True:
            nums = []
            print school, nums
            try:
                # Google: school name inside quotes
                if len(school) > 4:
                    target_en = "\"" + school[2] + "\" " + school[4]
                else:
                    target_en = "\"" + school[2] + "\""
                site = 'site:' + school[1]
                print target_en, site

                nums.append(_gg_result_count("http://www.google.com", target_en))     # gg index
                nums.append(_gg_result_count("http://www.google.com.hk", target_en))  # gg hk
                nums.append(_gg_result_count("http://news.google.com", target_en))    # gg news
                nums.append(_gg_result_count("http://www.google.com", site))          # gg site

                # Baidu: school name without quotes
                if len(school) > 4:
                    target_en = school[2] + " " + school[4]
                else:
                    target_en = school[2]
                target_ch = school[3]
                target_en = string.replace(target_en, ' ', '%20')
                target_ch = string.replace(target_ch, ' ', '%20')
                print target_ch, target_en, site

                while True:
                    nums_bd = []
                    try:
                        bd_search = 'http://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&tn=baidu&wd='
                        bd_news = 'http://news.baidu.com/ns?cl=2&rn=20&tn=news&word='
                        nums_bd.append(_bd_result_count((bd_search + target_ch).encode('utf8'), 'div', timeout=10))  # bd index ch
                        nums_bd.append(_bd_result_count(bd_search + target_en, 'div'))                               # bd index en
                        nums_bd.append(_bd_result_count((bd_news + target_ch).encode('utf8'), 'span'))               # bd news ch
                        nums_bd.append(_bd_result_count(bd_news + target_en, 'span'))                                # bd news en
                        nums_bd.append(_bd_result_count(bd_search + site, 'div'))                                    # bd site

                        nums += nums_bd
                        one_line = school[0] + '\t' + '\t'.join(nums)
                        f_success.write(one_line + '\n')
                        f_success.flush()
                        break
                    except Exception:
                        print sys.exc_info()[:2]

                print 'count =', count, '&&&\t', one_line + '\n'
                count += 1
                break
            except Exception:
                print sys.exc_info()[:2]
                print 'count =', count, ' &&&' + '\t'.join(school) + '\n'
                count += 1
                sleep(600)  # sleep 10 min if failed
                f_error.write(school[0] + '\t' + school[2] + '\n')
                f_error.flush()

    f_error.close()
    f_success.close()
def setUp(self):
    self.server = multiprocessing.Process(target=run_server)
    self.server.start()
    self.browser = mechanize.Browser()
    time.sleep(3)
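# A matching tearDown is not shown above; a minimal sketch, assuming the same
# unittest-style test case owns the server process started in setUp:
def tearDown(self):
    # Stop the background server and wait for it to exit.
    self.server.terminate()
    self.server.join()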
import scraperwiki
import mechanize
import re
import urlparse
import lxml.html

# ASPX pages are some of the hardest challenges because they use javascript and forms to navigate.
# Almost always the links go through the function __doPostBack(eventTarget, eventArgument),
# which you have to simulate in the mechanize form handling library.
# This example shows how to follow the Next page link.
url3 = 'http://home.btconnect.com/haltontransport/servicechanges.htm'
br3 = mechanize.Browser()
# sometimes the server is sensitive to this information
br3.addheaders = [('User-agent',
                   'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
response3 = br3.open(url3)
html3 = response3.read()
print html3

videoimgs = re.findall('<TD.*?>(.*?)</TD>', html3, re.MULTILINE | re.DOTALL | re.VERBOSE)
print videoimgs

scraperwiki.sqlite.execute("delete from swdata")
record = {}
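# The __doPostBack simulation mentioned above is not shown in this excerpt; a
# minimal sketch, assuming an ASPX page whose "Next" link calls
# javascript:__doPostBack('gridPager$NextButton', '') -- the control name is
# illustrative, not taken from the page above.
br3.select_form(nr=0)              # the single server-side <form runat="server">
br3.form.set_all_readonly(False)   # __EVENTTARGET etc. are hidden/readonly controls
br3.form['__EVENTTARGET'] = 'gridPager$NextButton'
br3.form['__EVENTARGUMENT'] = ''
next_page = br3.submit().read()    # the response is the next page of results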
def GetCoordinatesFromPageSoup():
    br = mechanize.Browser()
    url = "http://en.wikipedia.org/wiki/Toronto"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())
def GetCoordinatesSoup():
    # Get the information on the census divisions:
    # Name, Province, Population, Illustrative
    # If there is no illustrative, fill it with Name
    br = mechanize.Browser()
    url = "http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())
    table = soup.find_all('tr')
    print "Looking at the table:"
    rownum = 0
    locations = []
    errorRows = []
    for row in table:
        rownum += 1
        print "Here comes the row ", rownum
        #print row.text
        print "------------------"
        columns = row.find_all('td')
        try:
            # Test that the first column is a number, indicating it was used in 2011 census
            int(columns[0].find('span', 'sorttext').text)
            #print columns[0].find('span','sorttext').text
            #print columns[1].find('span','sorttext').text
            name = columns[2].find('span', 'sorttext').text.encode('utf-8')
            print name
            print "And href"
            locURL = columns[2].find('span', 'sorttext')
            print locURL
            locURL = locURL.a['href'].encode('utf-8')
            print locURL
            locURL = urljoin(url, locURL)
            print locURL
            latlon = GetCoordinatesFromWikipediaPage(br, locURL)
            print "***"
            province = columns[4].find('span', 'sorttext').text.encode('utf-8')
            print province
            print "****4 - getting population"
            population = int(columns[5].text.replace(',', ''))
            print population
            print "*****5 - Getting illustrative"
            illustrative = columns[8].text.encode('utf-8')
            illustrativeURL = ''
            print illustrative
            if illustrative != '':
                illustrativeURL = columns[8]
                #print illustrativeURL
                illustrativeURL = illustrativeURL.a['href'].encode('utf-8')
                print illustrativeURL
                illustrativeURL = urljoin(url, illustrativeURL)
                print illustrativeURL
                if latlon is None:
                    print "Calculating latlon from Illustrative"
                    latlon = GetCoordinatesFromWikipediaPage(br, illustrativeURL)
            else:
                print "NO ILLUSTRATIVE"
            print "******6"
            print latlon
            if latlon is None:
                latlon = None, None
            #print columns[8].a       # gives the href
            #print columns[8].a.text  # alternate
            print "*******7"
            thisLocation = [name, province, population, illustrative, latlon[0], latlon[1]]
            locations.append(thisLocation)
            print thisLocation
            '''
            time.sleep(4)
            print "clicking link"
            link = browser.find_element_by_link_text(name)
            link.click()
            browser.wait_for_page_to_load("5000")
            # In this section, I'm having difficulty getting the coordinates from the
            # wikipedia page after I click the link to the district. Would be nice!
            print "Getting Coordinates maybe"
            wholepage = find_element_by_xpath("//*").get_attribute("outerHTML")
            print wholepage
            # maybe can try getting from here: <span class="geo">
            geo = browser.find_element_by_class.name('geo')
            print "Printing coordinates maybe"
            print geo.text
            print "Going back"
            browser.back()
            if rownum > 3:
                print "we done"
                return locations
            '''
        except (ValueError, AttributeError):
            pass
        except:
def make_browser(self):
    browser = mechanize.Browser()
    self._configure_user_agent(browser)
    return browser
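# _configure_user_agent is not defined in this excerpt; a minimal sketch of what
# such a helper might look like (the UA string is illustrative only):
def _configure_user_agent(self, browser):
    # mechanize identifies itself as "Python-urllib" by default, which many sites reject.
    browser.addheaders = [('User-agent',
                           'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0')]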
def GetCoordinatesSelenium():
    # start on the wikipedia census division page
    browser = webdriver.Firefox()
    url = 'http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population'
    browser.get(url)

    # Get the information on the census divisions:
    # Name, Province, Population, Illustrative
    # If there is no illustrative, fill it with Name
    br = mechanize.Browser()
    url = "http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())
    table = soup.find_all('tr')
    print "Looking at the table:"
    rownum = 0
    locations = []
    for row in table:
        rownum += 1
        print "Here comes the row ", rownum
        #print row.text
        print "------------------"
        columns = row.find_all('td')
        try:
            # Test that the first column is a number, indicating it was used in 2011 census
            int(columns[0].find('span', 'sorttext').text)
            #print columns[0].find('span','sorttext').text
            #print columns[1].find('span','sorttext').text
            name = columns[2].find('span', 'sorttext').text
            print name
            print "***"
            province = columns[4].find('span', 'sorttext').text
            print province
            print "****"
            population = int(columns[5].text.replace(',', ''))
            print population
            print "*****"
            illustrative = columns[8].text
            print illustrative
            if illustrative == '':
                illustrative = name
            print illustrative
            #print columns[8].a       # gives the href
            #print columns[8].a.text  # alternate
            thisLocation = [name.encode('utf-8'), province, population, illustrative.encode('utf-8')]
            locations.append(thisLocation)
            '''
            time.sleep(4)
            print "clicking link"
            link = browser.find_element_by_link_text(name)
            link.click()
            browser.wait_for_page_to_load("5000")
            # In this section, I'm having difficulty getting the coordinates from the
            # wikipedia page after I click the link to the district. Would be nice!
            print "Getting Coordinates maybe"
            wholepage = find_element_by_xpath("//*").get_attribute("outerHTML")
            print wholepage
            # maybe can try getting from here: <span class="geo">
            geo = browser.find_element_by_class.name('geo')
            print "Printing coordinates maybe"
            print geo.text
            print "Going back"
            browser.back()
            '''
            print "we done"
            return 5
        except:
            print "Error"
            if rownum > 2:
                return 0
            #return 0
        print "=================="
def login(self, className):
    """
    Login into coursera and obtain the necessary session cookies.
    """
    hn, fn = tempfile.mkstemp()
    cookies = cookielib.LWPCookieJar()
    handlers = [
        urllib2.HTTPHandler(),
        urllib2.HTTPSHandler(),
        urllib2.HTTPCookieProcessor(cookies)
    ]
    # prepend a proxy handler if defined
    if (self.proxy):
        proxy = urllib2.ProxyHandler({'http': self.proxy})
        handlers = [proxy] + handlers
    opener = urllib2.build_opener(*handlers)

    url = self.lecture_url_from_name(className)
    req = urllib2.Request(url)
    try:
        res = opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 404:
            raise Exception("Unknown class %s" % className)

    # get the csrf token
    csrfcookie = [c for c in cookies if c.name == "csrf_token"]
    if not csrfcookie:
        raise Exception("Failed to find csrf cookie")
    csrftoken = csrfcookie[0].value
    opener.close()

    # call the authenticator url:
    cj = cookielib.MozillaCookieJar(fn)
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj),
                                  urllib2.HTTPHandler(),
                                  urllib2.HTTPSHandler())
    opener.addheaders.append(('Cookie', 'csrftoken=%s' % csrftoken))
    opener.addheaders.append(('Referer', 'https://accounts.coursera.org/signin'))
    opener.addheaders.append(('X-CSRFToken', csrftoken))

    req = urllib2.Request(self.LOGIN_URL)
    data = urllib.urlencode({'email': self.username, 'password': self.password})
    req.add_data(data)
    try:
        opener.open(req)
    except urllib2.HTTPError as e:
        if e.code == 401:
            raise Exception("Invalid username or password")

    # check if we managed to login
    sessionid = [c.name for c in cj if c.name == "CAUTH"]
    if not sessionid:
        raise Exception("Failed to authenticate as %s" % self.username)

    # all should be ok now, mechanize can handle the rest if we give it the
    # cookies
    br = mechanize.Browser()
    #br.set_debug_http(True)
    #br.set_debug_responses(False)
    #br.set_debug_redirects(True)
    br.set_handle_robots(False)
    br.set_cookiejar(cj)
    if self.proxy:
        br.set_proxies({"http": self.proxy})
    self.browser = br

    # also use this cookiejar for other mechanize operations (e.g., urlopen)
    opener = mechanize.build_opener(mechanize.HTTPCookieProcessor(cj))
    mechanize.install_opener(opener)
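# After the install_opener call above, the module-level mechanize.urlopen shares
# the same authenticated cookie jar as self.browser; a minimal usage sketch (the
# class URL below is illustrative only):
index_html = mechanize.urlopen('https://class.coursera.org/someclass/lecture/index').read()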
def __init__(self):
    self.mb = mechanize.Browser()
from bs4 import BeautifulSoup
import mechanize

mtrace = mechanize.Browser()
mtrace.set_handle_robots(False)

main_url = 'https://www.findandtrace.com/trace-mobile-number-location'
mtrace.open(main_url)
mtrace.select_form(name='trace')
mtrace['mobilenumber'] = ' '
response = mtrace.submit().read()

soup = BeautifulSoup(response, 'html.parser')
tbl = soup.find_all('table', class_='shop_table')

data = tbl[0].find('tfoot')
c = 0
for i in data:
    c += 1
    if c in (1, 4, 6, 8):
        continue
    th = i.find('th')
    td = i.find('td')
    print(th.text, td.text)

data = tbl[2].find('tfoot')
c = 0
for i in data:
    c += 1
    if c in (2, 20, 22, 26):
        th = i.find('th')
def create_browser():
    br = mechanize.Browser()
    # Ignore robots.txt
    br.set_handle_robots(False)
    return br
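# A minimal usage sketch for the factory above (the URL is illustrative only):
br = create_browser()
response = br.open('http://example.com/')
print br.title()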
def GetCoordinatesNoEncode(outFile):
    # Get the information on the census divisions:
    # Name, Province, Population, Illustrative
    # If there is no illustrative, fill it with Name
    br = mechanize.Browser()
    url = "http://en.wikipedia.org/wiki/List_of_census_divisions_of_Canada_by_population"
    response = br.open(url)
    soup = BeautifulSoup(br.response().read())
    table = soup.find_all('tr')
    print "Looking at the table:"
    rowNum = 0
    locations = []
    errorRows = []
    for row in table:
        print "Evaluating row: ", rowNum
        #print row.text
        print "-----------------------"
        columns = row.find_all('td')
        try:
            # Test that the first column is a number, indicating it was used in 2011 census
            int(columns[0].find('span', 'sorttext').text)

            # Column 2 - Census Division
            name = columns[2].find('span', 'sorttext').text
            print name
            print "*** Getting href to the census division's wikipedia page"
            locURL = columns[2].find('span', 'sorttext')
            locURL = locURL.a['href'].encode('utf-8')
            locURL = urljoin(url, locURL)
            print locURL

            # Get the lat/lon coordinates from the census division page
            latlon = GetCoordinatesFromWikipediaPage(br, locURL)

            # Column 4 - Province (abbreviated)
            province = columns[4].find('span', 'sorttext').text

            # Column 5 - Population from 2011 census
            population = int(columns[5].text.replace(',', ''))

            # Column 8 - Illustrative census subdivision
            illustrative = columns[8].text
            illustrativeURL = ''
            if illustrative != '':
                illustrativeURL = columns[8]
                #print illustrativeURL
                illustrativeURL = illustrativeURL.a['href']
                illustrativeURL = urljoin(url, illustrativeURL)
                print "*** Getting href to the census illustrative division's wikipedia page"
                print illustrativeURL
                # If we couldn't retrieve the latlon from the division's page, try the subdivision's page
                if latlon is None:
                    print "**** Calculating latlon from Illustrative"
                    latlon = GetCoordinatesFromWikipediaPage(br, illustrativeURL)
            else:
                print "NO ILLUSTRATIVE"

            #print latlon
            if latlon is None:
                print "!!!!! NO COORDINATES !!!!!!"
                latlon = None, None

            # Create a list of the data we used to calculate the population and location
            thisLocation = [name, province, population, illustrative, latlon[0], latlon[1]]
            # Append this division's list to the master list of all divisions
            locations.append(thisLocation)
            print "*** This location"
            print thisLocation
        except (ValueError, AttributeError):
            pass
        except:
'''
#select replace(' sd d ',' ','')
#all = scraperwiki.sqlite.select('replace("CompanyNumber"," ","") from malta_companies')
#for a in all:
#    print a
'''

# Helper to extract a string when we know the delimiters on both sides
def extract(text, sub1, sub2):
    """extract a substring between two substrings sub1 and sub2 in text"""
    return text.split(sub1)[-1].split(sub2)[0]


b = mechanize.Browser()
b.addheaders = [('User-agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')]
l = range(21, 1000, 10)


def process_companies(response, letter, page):
    root = lxml.html.fromstring(response)
    #print lxml.html.tostring(root)
    results = root.xpath('//tr[contains(@class,"rgRow")]/.| //tr[contains(@class,"rgAltRow")]/.')
    print 'Processing: ', letter, 'page: ', page
    if results:
        for tr in results:
            record = {}
url = ACTIVITIES % (currentIndex, increment)
response = agent.open(url)
search = json.loads(response.get_data())

parser = argparse.ArgumentParser(
    description='Garmin Data Scraper',
    epilog='Because the hell with APIs!',
    add_help='How to use',
    prog='python download.py -u <username> -o <output dir>')
parser.add_argument('-u', '--user', required=True,
                    help='Garmin username. This will NOT be saved!')
parser.add_argument('-o', '--output', required=True,
                    help='Output directory.')
args = vars(parser.parse_args())

password = getpass('Garmin account password (NOT saved): ')
username = args['user']
output = args['output']

# Create the agent and log in.
agent = me.Browser()
login(agent, username, password)

# Create output directory (if it does not already exist).
if not os.path.exists(output):
    os.mkdir(output)

# Scrape all the activities.
activities(agent, output)
def download_problems():
    global to_return
    # Clearing download directory for new downloads
    folder = "./downloads"
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        if os.path.isfile(file_path) or os.path.islink(file_path):
            os.unlink(file_path)
        elif os.path.isdir(file_path):
            shutil.rmtree(file_path)

    # Getting list of indices of problems to download
    download_list = [int(x) for x in request.form['prob_indices'].split(',')]

    # adjust your page display settings here
    options = {
        'quiet': '',
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'no-outline': None
    }

    zip_probs = zipfile.ZipFile('./downloads/zipped_problems.zip', 'w')
    # print(download_list)
    for prob_indx in download_list:
        pdfName = to_return['problems'][int(prob_indx)]['id'] + ". " + \
            to_return['problems'][int(prob_indx)]['name'] + '.pdf'

        # opening and saving questions.
        br = mechanize.Browser()
        br.set_handle_robots(False)
        br.addheaders = [(
            'User-agent',
            'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
        )]
        prob_url = to_return['problems'][int(prob_indx)]['link']
        response = br.open(prob_url)
        data = response.read()
        bsoup = BeautifulSoup(data, features="lxml")

        css = ""
        for stylesheet in bsoup.find_all('link', rel="stylesheet"):
            css_url = "https:" + stylesheet.get('href')
            br1 = mechanize.Browser()
            br1.set_handle_robots(False)
            br1.addheaders = [(
                'User-agent',
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
            )]
            response1 = br1.open(css_url)
            data1 = response1.read()
            temp = BeautifulSoup(data1, features="lxml")
            css += str(temp)
        css = "<style>" + css + "</style>"

        ques = bsoup.find('div', class_="ttypography")
        ques = str(ques)
        tags = bsoup.find_all('div', class_="roundbox sidebox")[2]
        tags = str(tags)
        html = css + ques + "<div style=\"margin:2em\"></p>" + tags
        pdfkit.from_string(html, os.path.join(folder, pdfName), options=options)
        zip_probs.write(os.path.join(folder, pdfName), pdfName,
                        compress_type=zipfile.ZIP_DEFLATED)
    zip_probs.close()

    try:
        return send_file('./downloads/zipped_problems.zip',
                         attachment_filename='zipped_problems.zip')
    except Exception as e:
        return str(e)
def _parse_comment(comment):
    """Pull text, points, time, author and up to three child comments out of a
    single reddit comment node; anything missing comes back as ''."""
    info = {'child_comments': {}}
    extra_words = []
    try:
        info['words'] = unidecode(comment.find(
            attrs={'class': 'usertext warn-on-unload'}).getText().replace('\n', ' '))
        extra_words.append(info['words'])
    except:
        info['words'] = ''
    try:
        info['points'] = unidecode(comment.find(attrs={'class': 'score likes'}).getText())
    except:
        info['points'] = ''
    try:
        info['time'] = unidecode(comment.find(attrs={'class': 'tagline'}).find('time')['title'])
    except:
        info['time'] = ''
    try:
        info['user'] = unidecode(comment.find(
            attrs={'class': 'tagline'}).findAll('a')[1].getText().replace('\n', ' '))
    except:
        info['user'] = ''
    try:
        children = comment.findAll(attrs={'class': 'usertext warn-on-unload'})
    except:
        children = []
    for n in (1, 2, 3):
        try:
            child = unidecode(children[n].getText().replace('\n', ' '))
            extra_words.append(child)
        except:
            child = ''
        info['child_comments']['comment_%d' % n] = child
    return info, extra_words


def redditData(i, subreddits, debug, minDelay, maxDelay, b):
    """Function to scrape data from reddit subreddits

    subreddits = list of subreddits
    debug      = print updates on screen
    minDelay   = minimum delay between each scrape
    maxDelay   = maximum delay between each scrape
    b          = connection to AWS bucket
    """
    # instantiate an instance of mechanize with headers
    br = mechanize.Browser()
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 5.1; rv:14.0) Gecko/20100101 Firefox/14.0.1',
        'Referer': 'http://www.reddit.com'
    }
    # Cookie jar
    cj = cookielib.LWPCookieJar()
    # Browser options
    br.set_cookiejar(cj)
    br.set_handle_equiv(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # iterate through the subreddits; stop when the user terminates or there is
    # no more data
    for subreddit in subreddits:
        page = 1
        titles = []
        while True:
            url = base_url + str(subreddit).strip().replace('\t', '') \
                + '?count=' + str(25 * (page - 1))
            page += 1
            if page % 3 == 0:
                titles = []
            if debug:
                sys.stdout.write('Visiting reddit url :: ' + url + '\n\n')
            logging.info('Visiting reddit url :: ' + url + '\n\n')

            # wrap the request
            request = urllib2.Request(url, None, header)
            br.open(request)
            html = br.response().read()
            soup = BeautifulSoup(html, 'lxml')
            siteTable = soup.find(attrs={'id': 'siteTable'})
            divs = siteTable.findAll('div')

            for div in divs:
                try:
                    timestamp = div['data-timestamp']
                    date_time = str(parser.parse(div.find('time')['datetime'])
                                    .replace(second=0).isoformat()).replace(':00+00:00', '+00:00')
                    day = str(datetime.fromtimestamp(int(timestamp) / 1000).date())
                    time_post = str(datetime.fromtimestamp(int(timestamp) / 1000).isoformat())
                    title = unidecode(div.find(attrs={'class': 'title'}).find('a').getText())
                    rank = div.find(attrs={'class': 'rank'}).getText()
                    link = div.find(attrs={'class': 'title'}).find('a')['href']
                    comment_link = div.find(attrs={'class': 'flat-list buttons'}).find('a')['href']
                    logging.info('Visiting reddit comment url :: ' + comment_link + '\n\n')

                    # wrap the request
                    request_comment = urllib2.Request(comment_link, None, header)
                    br.open(request_comment)
                    html_comment = br.response().read()
                    soup_comment = BeautifulSoup(html_comment, 'lxml')

                    # top three comments, each with up to three child comments
                    comment_dict = {}
                    words = title
                    comments = soup_comment.findAll(attrs={'class': 'comment'})
                    for n in (1, 2, 3):
                        try:
                            info, extra = _parse_comment(comments[n - 1])
                        except IndexError:
                            info, extra = {'words': '', 'points': '', 'time': '',
                                           'user': '', 'child_comments': {}}, []
                        comment_dict['commentary_%d' % n] = info
                        for w in extra:
                            words = words + ' ' + w

                    abstract = trimArticle(words, 50)
                    if '/r/' in link:
                        link = base_url + link[1:]
                    logging.info('Successfully got 3 comments from reddit comment url :: '
                                 + comment_link + '\n\n')

                    if title in titles:
                        logging.info('Scraped all posts from sub reddit :: ' + url + '\n\n')
                        break
                    titles.append(title)

                    # write the post summary to file
                    file_name = 'reddit_' + title.replace(' ', '-') + '_' + day + '.json'
                    file_name = ''.join(c for c in file_name if c in valid_chars)
                    if os.name == 'nt':
                        f = open('reddit_jsons//' + file_name, 'wb')
                    else:
                        f = open('reddit_jsons/' + file_name, 'wb')
                    folder = 'reddit_jsons'
                    logging.info('Opened ' + 'reddit_jsons//' + file_name + '.json' + ' for writing')
                    data = {
                        'abstract': abstract,
                        'external_id': 'reddit_' + title.replace(' ', '-'),
                        'date': date_time,
                        'title': title,
                        'words': words,
                        'meta': {
                            'reddit': {
                                'comments': str(comment_dict),
                                'link': link,
                                'rank': rank
                            }
                        },
                        'url': comment_link
                    }
                    f.write(json.dumps(data))
                    f.close()
                    logging.info('File written ' + file_name + '.json')

                    if os.name == 'nt':
                        uploadDataS3(folder + '//' + file_name, b)
                    else:
                        uploadDataS3(folder + '/' + file_name, b)
                    if debug:
                        sys.stdout.write(file_name + ' written' + '\n')
                except Exception as e:
                    # print str(e)
                    pass

            wait_time = random.randint(minDelay, maxDelay)
            sys.stdout.write('Sleeping for :: ' + str(wait_time) + '\n')
            logging.info('Sleeping for :: ' + str(wait_time) + '\n')
            sys.stdout.write('******************************************' + '\n')
            sys.stdout.write('******************************************' + '\n')
            time.sleep(wait_time)
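# A minimal invocation sketch for redditData above, assuming boto (v2) supplies
# the S3 bucket handle and that base_url, uploadDataS3, etc. are defined in the
# same module; the bucket and subreddit names are illustrative only.
import boto
bucket = boto.connect_s3().get_bucket('my-reddit-scrapes')
redditData(0, ['r/python/', 'r/programming/'], True, 30, 90, bucket)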
import os
import codecs
from bs4 import BeautifulSoup
import re
from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import pyperclip
import mechanize
import cookielib
import xml.etree.ElementTree
import requests

# declare the browser
br = mechanize.Browser(factory=mechanize.RobustFactory())
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)

chromedriver = 'C:\\chromedriver.exe'
browser = webdriver.Chrome(chromedriver)
paginas = codecs.open('pages.txt', encoding='iso-8859-1')

br.set_handle_robots(False)
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_debug_responses(True)
def _make_cookie(name, value, domain, expires):
    """Build an OTR session cookie for the given domain."""
    return cookielib.Cookie(version=0, name=name, value=value, port=None,
                            port_specified=False, domain=domain,
                            domain_specified=False, domain_initial_dot=False,
                            path='/', path_specified=True, secure=False,
                            expires=expires, discard=True, comment=None,
                            comment_url=None, rest={'HttpOnly': None},
                            rfc2109=False)


def login(user, pw, cookiePath):
    br = mechanize.Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.set_handle_robots(False)
    br.open("https://www.onlinetvrecorder.com/v2/?go=home")

    # login form
    br.select_form('fhomelogin')
    br['email'] = user
    br['password'] = pw
    # result = br.submit().read()

    # change 02/19
    # not working since the base url is wrong...
    loginURL = "https://www.onlinetvrecorder.com/v2/?go=login"
    params = {
        u'email': user,
        u'password': pw,
        u'rememberlogin': '******',
        u'btn_login': '******'
    }
    data = urllib.urlencode(params)
    response = br.open(loginURL, data)
    result = response.read()

    # get user and pw from the response and set cookies
    em = ''
    pw = ''
    m = re.search('otr_email=(.*?);', result)
    if (m != None):
        em = m.group(1)
    m = re.search('otr_password=(.*?);', result)
    if (m != None):
        pw = m.group(1)

    date = datetime.datetime.now()
    ts = time.mktime(date.timetuple())
    ts = ts + 86400
    for name, value in (('otr_email', em), ('otr_password', pw)):
        for domain in ('onlinetvrecorder.com', 'www.onlinetvrecorder.com'):
            cj.set_cookie(_make_cookie(name, value, domain, ts))

    # now reload
    response = br.reload()
    result = response.read()

    x = ItemClass()
    x.state = 'not logged in'
    x.id = '0'
    x.decode = '0'
    x.value = '0'

    # info from website
    match = re.search('my_user_id="(?P<id>.*?)";.*?my_ut="(?P<state>.*?)"', result)
    if (match != None):
        if (match.group('state') != ''):
            cj.save(cookiePath, ignore_discard=True, ignore_expires=True)
            x.id = match.group('id')
            x.state = match.group('state').title()
            match = re.search('<a.href="history.decodings".*?<div.*?>(?P<value>[^<]*)<',
                              result, re.DOTALL)
            if (match != None):
                x.decode = match.group('value')
            match = re.search('<div.id="cssmenuright">.*?<a.href="points.*?>(?P<value>[^<]*)<',
                              result, re.DOTALL)
            if (match != None):
                x.value = match.group('value')
    return x
def __init__(self, args):
    self.args = args
    self.fixed_password = args.password is not None
    self.last_connect = 0

    if args.enable_funk:
        if not args.platform:
            args.platform = platform.system() + ' ' + platform.release()
        if not args.hostname:
            args.hostname = socket.gethostname()
        if not args.hwaddr:
            args.hwaddr = []
            for iface in netifaces.interfaces():
                try:
                    mac = netifaces.ifaddresses(iface)[netifaces.AF_LINK][0]['addr']
                    assert mac != '00:00:00:00:00:00'
                    args.hwaddr.append(mac)
                except:
                    pass
        else:
            args.hwaddr = [n.strip() for n in args.hwaddr.split(',')]

        certs = []
        if args.certs:
            now = datetime.datetime.now()
            for f in args.certs.split(','):
                cert = tncc.x509cert(f.strip())
                if now < cert.not_before:
                    print 'WARNING: %s is not yet valid' % f
                if now > cert.not_after:
                    print 'WARNING: %s is expired' % f
                certs.append(cert)
            args.certs = [n.strip() for n in args.certs.split(',')]
        args.certs = certs

    self.br = mechanize.Browser()
    self.cj = cookielib.LWPCookieJar()
    self.br.set_cookiejar(self.cj)

    # Browser options
    self.br.set_handle_equiv(True)
    self.br.set_handle_redirect(True)
    self.br.set_handle_referer(True)
    self.br.set_handle_robots(False)

    # Follows refresh 0 but does not hang on refresh > 0
    self.br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

    # Want debugging messages?
    if debug:
        self.br.set_debug_http(True)
        self.br.set_debug_redirects(True)
        self.br.set_debug_responses(True)

    if args.user_agent:
        self.user_agent = args.user_agent
    else:
        self.user_agent = 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    self.br.addheaders = [('User-agent', self.user_agent)]

    self.last_action = None
    self.needs_2factor = False
    self.key = None
    self.pass_postfix = None
def readAeWeb(sTime, eTime=None, res=60):
    """This function reads ae data from the WDC kyoto website

    Parameters
    ----------
    sTime : datetime
        the earliest time you want data for
    eTime : Optional[datetime]
        the latest time you want data for.  if this is None, eTime will be
        equal to sTime.  eTime must not be more than 366 days after sTime.
        default = None
    res : Optional[int]
        the time resolution desired, either 1 or 60 minutes.  default=60

    Notes
    -----
    You should not use this.  Use the general function gme.ind.ae.readAe instead.

    Example
    -------
        import datetime as dt
        aeList = gme.ind.readAeWeb(dt.datetime(2011,1,1,1,50),
                                   eTime=dt.datetime(2011,1,1,10,0))

    written by AJ, 20130131
    """
    import datetime as dt
    import mechanize

    assert (isinstance(sTime, dt.datetime)), \
        logging.error('sTime must be a datetime object')
    if (eTime == None):
        eTime = sTime
    assert (isinstance(eTime, dt.datetime)), \
        logging.error('eTime must be a datetime object')
    assert (eTime >= sTime), logging.error('eTime < sTime')
    assert (res == 1 or res == 60), logging.error('res must be 1 or 60')
    delt = eTime - sTime
    assert (delt.days <= 366), logging.error('cant read more than 366 days')

    br = mechanize.Browser()
    br.set_handle_robots(False)   # no robots
    br.set_handle_refresh(False)  # can sometimes hang without this
    br.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
    )]

    if (res == 60):
        # hourly resolution: use the Dst/AE request form
        sCent = sTime.year / 100
        sTens = (sTime.year - sCent * 100) / 10
        sYear = sTime.year - sCent * 100 - sTens * 10
        sMonth = sTime.strftime("%m")
        eCent = eTime.year / 100
        eTens = (eTime.year - eCent * 100) / 10
        eYear = eTime.year - eCent * 100 - eTens * 10
        eMonth = eTime.strftime("%m")

        br.open('http://wdc.kugi.kyoto-u.ac.jp/dstae/index.html')
        br.form = list(br.forms())[0]

        # fill out the page fields
        br.form.find_control('SCent').value = [str(sCent)]
        br.form.find_control('STens').value = [str(sTens)]
        br.form.find_control('SYear').value = [str(sYear)]
        br.form.find_control('SMonth').value = [sMonth]
        br.form.find_control('ECent').value = [str(eCent)]
        br.form.find_control('ETens').value = [str(eTens)]
        br.form.find_control('EYear').value = [str(eYear)]
        br.form.find_control('EMonth').value = [eMonth]
        br.form.find_control('Output').value = ['AE']
        br.form.find_control('Out format').value = ['IAGA2002']
        br.form.find_control('Email').value = "*****@*****.**"
    else:
        # 1-minute resolution: use the aeasy request form
        tens = (sTime.year) / 10
        year = sTime.year - tens * 10
        month = sTime.strftime("%m")
        dtens = sTime.day / 10
        day = sTime.day - dtens * 10
        htens = sTime.hour / 10
        hour = sTime.hour - htens * 10
        ehtens = eTime.hour / 10
        ehour = eTime.hour - ehtens * 10
        minute_tens = sTime.minute / 10
        minute = sTime.minute - minute_tens * 10
        eminute_tens = eTime.minute / 10
        eminute = eTime.minute - eminute_tens * 10
        ddtens = delt.days / 10
        dday = delt.days - ddtens * 10

        br.open('http://wdc.kugi.kyoto-u.ac.jp/aeasy/index.html')
        br.form = list(br.forms())[0]

        # fill out the fields
        br.form.find_control('Tens').value = [str(tens)]
        br.form.find_control('Year').value = [str(year)]
        br.form.find_control('Month').value = [str(month)]
        br.form.find_control('Day_Tens').value = [str(dtens)]
        br.form.find_control('Days').value = [str(day)]
        #br.form.find_control('Hour_Tens').value = [str(htens)]
        br.form.find_control('Hour').value = [str(htens) + str(hour)]
        br.form.find_control('min').value = [str(minute_tens) + str(minute)]
        if (ddtens < 9):
            ddtens = '0' + str(ddtens)
        br.form.find_control('Dur_Day_Tens').value = [str(ddtens)]
        br.form.find_control('Dur_Day').value = [str(dday)]
        br.form.find_control('Dur_Hour').value = [str(ehtens) + str(ehour)]
        br.form.find_control('Dur_Min').value = [str(eminute_tens) + str(eminute)]
        br.form.find_control('Output').value = ['AE']
        br.form.find_control('Out format').value = ['IAGA2002']
        br.form.find_control('Email').value = "*****@*****.**"

    response = br.submit()

    # get the data
    lines = response.readlines()
    aeList = []
    for l in lines:
        # check for headers
        if (l[0] == ' ' or l[0:4] == 'DATE'):
            continue
        cols = l.split()
        try:
            aeList.append(aeRec(webLine=l, res=res))
        except Exception, e:
            logging.exception(e)
            logging.exception('problem assigning initializing ae object')
def testProxy(url, proxy):
    browser = mechanize.Browser()
    browser.set_proxies(proxy)
    page = browser.open(url)
    source_code = page.read()
    print source_code
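# A minimal usage sketch: mechanize's set_proxies expects a scheme-to-proxy
# mapping, so the dict below (host and port are illustrative) routes both plain
# and TLS traffic through the same proxy.
testProxy('http://example.com/',
          {'http': '127.0.0.1:8080', 'https': '127.0.0.1:8080'})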
rangeInputLower = raw_input("What lower rowlimit?")
rangeInputLower = int(rangeInputLower)
rangeInputUpper = raw_input("What upper rowlimit?")
rangeInputUpper = int(rangeInputUpper)

## inputs
input1 = "input/townsOnly.csv"
input2 = "input/malefirstnames.csv"

######################
# Mechanize Set up

# set up the browser
driver = mechanize.Browser()

# Enable cookie support for mechanize
cookiejar = cookielib.LWPCookieJar()
driver.set_cookiejar(cookiejar)

# Browser options
driver.set_handle_equiv(True)
driver.set_handle_gzip(True)
driver.set_handle_redirect(True)
driver.set_handle_referer(True)
driver.set_handle_robots(False)

# Follow refresh 0 but don't hang on refresh > 0 (copied from stackexchange)
driver.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)

driver.addheaders = [('User-agent',
                      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
#!/usr/bin/env python2
# Modules
import mechanize
import itertools
import cookielib
import sys
from bs4 import BeautifulSoup
from re import search, findall
from urllib import urlopen

# Stuff related to the Mechanize browser module
br = mechanize.Browser()  # Shortening the call by assigning it to a variable "br"

# set cookies
cookies = cookielib.LWPCookieJar()
br.set_cookiejar(cookies)

# Mechanize settings
br.set_handle_equiv(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
br.set_debug_http(False)
br.set_debug_responses(False)
br.set_debug_redirects(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
), ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
   ('Accept-Encoding', 'br')]

# Banner
raw_hours = root.cssselect('tr')
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
opening = {}
for i, day in enumerate(days):
    daily_hours = raw_hours[i].cssselect('td')[1].text_content().split('-')
    opening[day + '_opening'] = daily_hours[0]
    opening[day + '_closing'] = daily_hours[1]
store['opening'] = opening

m = re.compile(r'new google.maps.LatLng(.*?);').search(store_html)
store['location'] = m.group(1).replace('(', '').replace(')', '')
print store
scraperwiki.sqlite.save(unique_keys=["name"], data=store)

# And finally - international stores.
INTERNATIONAL_URL = 'http://corporate.marksandspencer.com/aboutus/where/international_stores'
browser = mechanize.Browser()
browser.addheaders = [('User-agent',
                       'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
browser.open(INTERNATIONAL_URL)
#print browser.response().read()
html = browser.response().get_data().replace(
    '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">', '').replace(
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">', '<html>')
response = mechanize.make_response(
    html, [("Content-Type", "text/html")],
    INTERNATIONAL_URL, 200, "OK")
browser.set_response(response)
#browser.select_form(nr=0)

import json
import mechanize
import re
import scraperwiki
import urllib2
import lxml.html
try:
    import mechanize
except ImportError:
    os.system('pip2 install request')
    time.sleep(1)
    os.system('Then type: python2 boss')

import os, sys, time, datetime, random, hashlib, re, threading, json, urllib, cookielib, requests, mechanize
from multiprocessing.pool import ThreadPool
from requests.exceptions import ConnectionError
from mechanize import Browser

reload(sys)
sys.setdefaultencoding('utf8')

br = mechanize.Browser()
br.set_handle_robots(False)
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.addheaders = [('User-Agent', 'Opera/9.80 (Android; Opera Mini/32.0.2254/85. U; id) Presto/2.12.423 Version/12.16')]
br.addheaders = [('user-agent', 'Dalvik/1.6.0 (Linux; U; Android 4.4.2; NX55 Build/KOT5506) [FBAN/FB4A;FBAV/106.0.0.26.68;FBBV/45904160;FBDM/{density=3.0,width=1080,height=1920};FBLC/it_IT;FBRV/45904160;FBCR/PosteMobile;FBMF/asus;FBBD/asus;FBPN/com.facebook.katana;FBDV/ASUS_Z00AD;FBSV/5.0;FBOP/1;FBCA/x86:armeabi-v7a;]')]


def keluar():
    # exit the script
    print 'Thanks.'
    os.sys.exit()


def acak(b):
    # interleave the input string with random filler characters
    w = 'ahtdzjc'
    d = ''
    for i in b:
        d += '!' + w[random.randint(0, len(w) - 1)] + i
    return cetak(d)
def cli(fr, msg, notifs, bdays):
    browser = mechanize.Browser()
    browser.set_handle_robots(False)   # Allows everything to be written
    cookies = mechanize.CookieJar()
    browser.set_cookiejar(cookies)
    browser.addheaders = [(
        'User-agent',
        'Mozilla/5.0 (X11; U; Linux i686; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.517.41 Safari/534.7'
    )]
    browser.set_handle_refresh(False)  # Sometimes hangs without this

    bday_people_links = []  # List to store profile links of people who have their birthdays today
    bday_people_names = []
    i = 1
    try:
        if (bdays):
            url = 'http://www.facebook.com/events/birthdays/'
            soup = authenticate(browser, url, email, pwd)  # Parses the html and stores it in 'soup'
            # Finds the html with the div tags and given attributes
            bday_box = soup.find('div', attrs={'class': '_4-u2 _tzh _fbBirthdays__todayCard _4-u8'})
            # Finds all a tags with the given attribute; this will be the list of bdays
            bday_box_narrow = bday_box.find_all('a', attrs={'data-hovercard-prefer-more-content-show': '1'})
            click.echo("%d people have their birthdays today :\n" % (len(bday_box_narrow)))
            for a in bday_box_narrow:
                # prints names of people who have their birthdays today
                print str(i) + ')', a.text
                bday_people_names += [a.text]         # stores names of people who have their birthdays today
                bday_people_links += [a.get('href')]  # stores links of profiles of people who have their bdays today
                i += 1
        else:
            url = 'http://www.facebook.com/login.php'
            soup = authenticate(browser, url, email, pwd)  # Parses the html and stores it in 'soup'
            if (fr):  # To find the number of new friend requests
                # Finds span tags with the given ID
                fr_num_box = soup.find('span', attrs={'id': 'requestsCountValue'})
                # Displays the string between the span tags (<span>...</span>)
                click.echo("You have %s new friend requests" % (fr_num_box.text))
            if (msg):  # To find the number of unread messages
                msg_num_box = soup.find('span', attrs={'id': 'mercurymessagesCountValue'})
                click.echo("You have %s unread messages" % (msg_num_box.text))
            if (notifs):  # To find the number of unseen notifications
                notifs_num_box = soup.find('span', attrs={'id': 'notificationsCountValue'})
                click.echo("You have %s unread notifications" % (str(int(notifs_num_box.text) + 1)))
    except AttributeError:
        click.echo("Either the password or email id you've entered is wrong")
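# authenticate() is referenced above but not shown; a minimal sketch of what it
# might look like, assuming the login form is the first form on the page and
# uses the classic 'email'/'pass' field names (both are assumptions, not taken
# from the code above), with BeautifulSoup imported elsewhere in the module.
def authenticate(browser, url, email, pwd):
    browser.open(url)
    browser.select_form(nr=0)   # the login form is usually the first form on the page
    browser['email'] = email
    browser['pass'] = pwd
    response = browser.submit()
    return BeautifulSoup(response.read(), 'lxml')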