def download_linke(coords, proxy, port, saveFile, saveMode): print proxy, port print proxy != "" url = "http://www.soda-is.com/eng/services/service_invoke/gui.php?" + "xml_descript=soda_tl.xml&Submit2=Month" session = Session() session.verify = False if proxy != "": proxies = {proxy: port} session.proxies = proxies br = RoboBrowser(session=session, parser="lxml") br.open(url) linke_form = br.get_forms()[1] num = len(coords) index = 0 with open(saveFile, saveMode) as f: try: for coord in coords: inlon, inlat = coord linke_form["lat"].value = inlat linke_form["lon"].value = inlon sf = linke_form.submit_fields.getlist("execute") br.submit_form(linke_form, submit=sf[0]) linke_table = br.find("table", {"cellspacing": "0", "cellpadding": "2"}) linkes = get_monthly_linke_str(get_linke_values(linke_table)) s = "%s,%s,%s\n" % (format(inlon, "0.5f"), format(inlat, "0.5f"), linkes) if len(s) > 48: f.write(s) print "Done with point %i of %i: (%s, %s)" % ( index + 1, num, format(inlon, "0.5f"), format(inlat, "0.5f"), ) index += 1 br.back() print "DONE!" except Exception as e: not_dl = list(coords[index:]) with open(saveFile + "_notdownloaded.txt", "w") as nd: for c in not_dl: nd.write("%s,%s\n" % (str(c[0]), str(c[1]))) print e
def scrape_site(url, cookie_file="", ses=False, is_rss=False):
    from http.cookiejar import LWPCookieJar
    from robobrowser import RoboBrowser
    from requests import Session

    s = Session()
    if cookie_file:
        s.cookies = LWPCookieJar(cookie_file)
        try:
            s.cookies.load(ignore_discard=True)
        except:
            # Cookies don't exist yet
            pass
    s.headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; rv:39.0)'
    s.headers['Accept'] = 'text/html'
    s.headers['Connection'] = 'keep-alive'
    if is_rss:
        parser = 'xml'
    else:
        parser = 'html5lib'
    browser = RoboBrowser(session=s, parser=parser)
    browser.open(url)
    if ses:
        return browser, s
    else:
        return browser
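# A minimal usage sketch for scrape_site above. The URL and cookie file path are
# placeholders, not taken from the original snippet; persisting cookies with
# LWPCookieJar.save() is an assumption about how the cookie_file is meant to be reused.
if __name__ == '__main__':
    browser, session = scrape_site('https://example.com/feed',
                                   cookie_file='cookies.txt', ses=True, is_rss=True)
    print(browser.find('title'))
    # Save cookies so the next run can reload them with load(ignore_discard=True).
    session.cookies.save(ignore_discard=True)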
def take_action(self, parsed_args):
    config_dir = '~/.kddcup2015-cli'
    config_dir = os.path.expanduser(config_dir)
    if parsed_args.topN:
        topN = int(parsed_args.topN)
    else:
        topN = 10
    base = 'https://www.kddcup2015.com'
    rank_url = '/'.join([base, 'submission-rank.html'])
    browser = RoboBrowser()
    browser.open(rank_url)
    html_str = str(browser.parsed)
    html = pq(html_str)
    country_teams = list(
        map(lambda x: x.text.strip(), html('.country_team')[:topN]))
    scores = list(
        map(lambda x: x.text.strip(), html('.td_result')[:topN]))
    entries = list(
        map(lambda x: x.text.strip(), html('.td_result + td')[:topN]))
    last_subs = list(
        map(lambda x: x.text.strip(), html('.td_result + td + td')[:topN]))
    return (
        ('Team', 'Score', 'Entries', 'Last Submission UTC'),
        (list(zip(country_teams, scores, entries, last_subs)))
    )
def parseWeek(year, week): """ parses a specific week on http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1 which contains a csv of the fan duel player prices stores this info in fanduel_prices collection """ logger = makeLogger(str(year) + '_' + str(week), r'./logs_RotoFDStats/') startTime = datetime.now() logger.debug('Starting %d', year) client = MongoClient('localhost', 27017) db = client['nfl_data'] col_fanduel_prices = db['fanduel_prices'] if col_fanduel_prices.find({'year': year, 'yeek': week}).count(): logger.debug('Already parsed %d %d', year, week) closeLogger(logger) return None wait = random.uniform(1.5,3.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening main page') browser = RoboBrowser(history=False, parser='html.parser', user_agent=get_user_agent(logger), timeout=10) url = "http://rotoguru1.com/cgi-bin/fyday.pl?week={}&year={}&game=fd&scsv=1".format(week, year) browser = open_or_follow_link(logger, browser, 'open', url) docs = [] try: data = browser.find('pre').text lines = data.split('\n') header = lines[0] header = header.split(';') lines = lines[1:] for line in lines: doc = {} if not line: continue for index, each in enumerate(line.split(';')): doc[cleanKey(header[index])] = convertToNumber(each) docs.append(doc) except: logger.exception("Parse fail: %s", url) try: logger.debug('Bulk Creating docs') col_fanduel_prices.insert_many(docs) except: logger.exception('insert_many error') logger.debug('parseWeek time elapsed: ' + str(datetime.now() - startTime)) closeLogger(str(year) + '_' + str(week))
def get_bracket_data(year):
    url = 'http://espn.go.com/mens-college-basketball/tournament/bracket/_/id/{}22/'.format(year)
    b = RoboBrowser()
    b.open(url)
    data = []
    for item in b.find_all(attrs={'class': 'match'}):
        t1, t2 = [(get_id(a['href']), a['title']) for a in item('a')]
        s1, s2 = ' '.join(item.find('dd').stripped_strings).split()
        data.append([t1, t2, s1, s2])
    return data
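# A small sketch of calling get_bracket_data above. It assumes get_id and the ESPN
# page layout from the snippet; 2015 is only an example year. Each team entry is an
# (id, title) tuple as built in the function.
for team1, team2, score1, score2 in get_bracket_data(2015):
    print(team1, score1, '-', team2, score2)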
def get_mp3_url(lecture_url):
    browser = RoboBrowser()
    browser.open(lecture_url)
    link = browser.get_link(href=re.compile("\\.mp3$"))
    if link is not None:
        return link["href"]
    else:
        return None
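# A hedged usage sketch for get_mp3_url above. The lecture URL and output filename are
# placeholders, and downloading the file with requests is an assumption layered on top
# of the function, not part of the original snippet.
mp3_url = get_mp3_url('http://example.com/lecture')
if mp3_url:
    import requests
    with open('lecture.mp3', 'wb') as f:
        f.write(requests.get(mp3_url).content)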
class RoboBrowserTestCase(StaticLiveServerTestCase, base.AbstractBrowser):

    def setUp(self):
        super().setUp()
        self.browser = RoboBrowser(history=True, parser='html.parser')

    def load(self, url):
        self.browser.open(self.live_server_url + url)

    def get_title(self):
        return self.browser.find('title').text

    def get_form(self, selector):
        return RoboBrowserForm(self.browser, selector)
def testMethod_findXssFailuresInAForm_shouldOnlySaveXssFailuresThatAreNotAlreadyInTheList(self):
    url = "http://www.remikya.com/Controllers/SearchController.php"
    xssFinder = XssFinder(url)
    browser = RoboBrowser()
    browser.open(url)
    form = browser.get_form(id="form")
    xssFinder.findXssFailuresInAForm(browser, form)
    xssFinder.findXssFailuresInAForm(browser, form)
    xssFinderListLength = len(xssFinder.getListOfLinks())
    EXPECTED_ANSWER = 1
    self.assertEqual(EXPECTED_ANSWER, xssFinderListLength)
def take_action(self, parsed_args): config_dir = '~/.kddcup2015-cli' config_dir = os.path.expanduser(config_dir) if os.path.isdir(config_dir): config = ConfigParser.ConfigParser(allow_no_value=True) config.readfp(open(config_dir + '/config')) if parsed_args.username: username = parsed_args.username else: username = config.get('user', 'username') if parsed_args.password: password = parsed_args.password else: password = config.get('user', 'password') entry = parsed_args.entry message = parsed_args.message base = 'https://www.kddcup2015.com' login_url = '/'.join([base, 'user-ajaxlogin.html']) submit_url = '/'.join([base, 'submission-make.html']) submission_url = '/'.join(([base, 'submission.html'])) browser = RoboBrowser() response = browser.session.post( login_url, dict(email=username, pwd=password)).json() if response['rs'] == 'error': self.app.stdout.write(response['msg']) browser.open(submit_url) form = browser.get_form() form['_f'].value = open(entry) if message: form['description'] = message browser.submit_form(form) sleep(5) browser.open(submission_url) html_str = str(browser.parsed) html = pq(html_str) times = list(map( lambda x: datetime_parser.parse(x.text), html('.td_result +td+td+td+td'))) newest_index = times.index(max(times)) score = html('.td_result')[newest_index * 2].text.strip() self.app.stdout.write(score + '\n')
def scrape_snotel_sites(url=None):
    if not url:
        url = "http://www.wcc.nrcs.usda.gov/nwcc/yearcount?network=sntl&counttype=statelist&state="
    browser = RoboBrowser(parser="html5lib")
    browser.open(url)
    browser.response.raise_for_status()
    table = browser.find_all("table")[4]
    sites = []  # list of sites with name and code
    cols = [t.text.strip() for t in table.tr.find_all("th")]
    for row in table.find_all("tr"):
        if row.td and row.td.text.strip() == 'SNTL':
            items = [i.text.strip() for i in row.find_all("td")]
            sites.append(dict(zip(cols, items)))
    return sites
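# A short usage sketch for scrape_snotel_sites above. The dict keys come from whatever
# header row the NRCS page serves, so the exact column names are not assumed here.
sites = scrape_snotel_sites()
print(len(sites), 'SNOTEL sites found')
for site in sites[:3]:
    print(site)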
def lookUpNetCTLPan(self, sequ):
    seq = '>seq' + '\n' + sequ
    browser = RoboBrowser(user_agent='Mozilla/5.0', history=True)
    browser.allow_redirects = True
    Query = "http://tools.immuneepitope.org/stools/netchop/netchop.do?app=netchop"
    browser.open(Query)
    net_form = browser.get_form(action="upload-submit.do")
    net_form['sequences'].value = seq
    net_form['formtype'].value = 'netctlpan_select'
    net_form['length'].value = '9'
    net_form['species'].value = "human"
    net_form['supertype'].value = 'A2'
    net_form['allele'].value = "HLA-A02:01"  # self.amerLength
    print(net_form)
    net_form.serialize()
    browser.submit_form(net_form, submit="Submit")
    print(browser)
    table_form = browser.get_form(action="tableViewctlpan.do?thePage=1")
    print(table_form)
    return
def scrape(q):
    query = q
    ph = re.compile('(\(\d{3}\)\ \d{3}-\d{4})')
    ad = re.compile('[A-Z]{2}\ (\d{5})')
    site = re.compile('(?<=\?q=).*(?=&sa)')
    result = {
        'name': '!NO DATA!',
        'address': '!NO DATA!',
        'phone': '!NO DATA!',
        'website': '!NO DATA!',
        'blurb': '!NO DATA!'
    }
    # uses RoboBrowser to submit the Google search
    browser = RoboBrowser(user_agent='Firefox', parser='html.parser')
    browser.open('http://google.com/')
    form = browser.get_form(action='/search')  # <RoboForm q=>
    form['q'].value = query
    browser.submit_form(form, form.submit_fields['btnG'])
    result['query'] = query
    if browser.find("div", {"class": "_B5d"}):
        result['name'] = browser.find("div", {"class": "_B5d"}).text.encode('utf-8')
        stuff = browser.find("div", {"class": "_uXc"})
        address = stuff.find(text=ad)
        if address:
            result['address'] = address.encode('utf-8')
        phone = stuff.find(text=ph)
        if phone:
            result['phone'] = phone.encode('utf-8')
        blurb = stuff.find("span")
        if blurb:
            result['blurb'] = blurb.text.encode('utf-8')
        website = stuff.find("a", string="Website")
        if website:
            website = website.get('href').encode('utf-8')
            result['website'] = site.search(website).group()
    print result
    delay = random.randint(5, 10)
    print "Waiting " + str(delay) + " seconds..."
    time.sleep(delay)
    return result
def get_schedule(employee_info, shift_period):
    browser = RoboBrowser(parser='lxml')
    login_handler(browser, employee_info)
    start_of_week = shift_period['start_of_week']
    end_of_week = shift_period['end_of_week']
    browser.open('https://www.rsishifts.com/Schedules/SchedulePrintByUser.aspx?'
                 'StartDate=' + start_of_week + '&EndDate=' + end_of_week)
    employee_name = employee_info['employee_name']
    schedule = find_schedule(browser, employee_name)
    if schedule:
        return convert_schedule_to_datetime(start_of_week, schedule)
    else:
        return False
def get_digitised_pages(self, entity_id=None):
    '''
    Returns the number of pages (images) in a digitised file.
    Note that you don't need a session id to access these pages,
    so there's no need to go through get_url().
    '''
    # url = 'http://recordsearch.naa.gov.au/scripts/Imagine.asp?B={}&I=1&SE=1'.format(entity_id)
    url = 'http://recordsearch.naa.gov.au/SearchNRetrieve/Interface/ViewImage.aspx?B={}'.format(entity_id)
    br = RoboBrowser(parser='lxml')
    br.open(url)
    try:
        pages = int(br.find('span', attrs={'id': "lblEndPage"}).string)
    except AttributeError:
        pages = 0
    return pages
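# A minimal calling sketch for get_digitised_pages above. "client" is a hypothetical
# instance of whatever class defines the method, and the entity_id value is a placeholder,
# not a real RecordSearch barcode.
pages = client.get_digitised_pages(entity_id=12345)  # hypothetical client and id
print('Digitised file has {} pages'.format(pages))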
def testMethod_getAllFieldNamesInAForm_shouldReturnTheNameAttributeOfAllFormNodes(self):
    url = "http://www.remikya.com/Controllers/LoginController.php"
    xssFinder = XssFinder(url)
    browser = RoboBrowser()
    browser.open(url)
    form = browser.get_form(action="/Controllers/LoginController.php")
    fieldNames = xssFinder.getAllFieldNamesFromAForm(form)
    FIRST_EXPECTED_ANSWER = "nom_utilisateur"
    SECOND_EXPECTED_ANSWER = "mot_de_passe"
    THIRD_EXPECTED_ANSWER = "Connecter"
    self.assertEqual(FIRST_EXPECTED_ANSWER, fieldNames[0])
    self.assertEqual(SECOND_EXPECTED_ANSWER, fieldNames[1])
    self.assertEqual(THIRD_EXPECTED_ANSWER, fieldNames[2])
class StatusChecker(object): def __init__(self, config): self.browser = RoboBrowser() self.config = config def post(self, url, data): full_url = self.base_url + url data_json = json.dumps(data) return self.session.post(full_url, data=data_json) def get(self, url): full_url = self.base_url + url return self.session.get(full_url) def status(self, username, password, name=None): status = Status(name or username) self.browser.open(self.base_url) login = self.post("/login", dict( username=username, lastName=password, password=password, rememberMe=False)) login_json = login.json() if not login_json['success']: print "Unable to log in: {}".format(login_json['error']) print login.text return status account_summary = self.get('/account/summary') try: account_summary.json() except: print "Account summary text:" print account_summary.text raise status.fees_cents = account_summary.json()['accountSummary']['fees'] status_response = self.get('/loans/0/20/Status') for loan in status_response.json()['loans']: status.add_loan(loan) return status @property def session(self): return self.browser.session @property def base_url(self): return self.config['library']['base_url']
class FakeMail(object):

    def __init__(self):
        self.browser = RoboBrowser(history=True)
        self.browser.open('http://10minutemail.com/')
        with open('10minmail.txt', 'w') as f:
            f.write(str(self.browser.parsed))
        if self.browser.get_link('Blocked'):
            raise BlockedException('too many login attempts')

    def get_address(self):
        address = self.browser.find("div", {"id": "copyAddress"})
        print address

    def read_mail(self):
        pass
def __init__(self, email, passwd=None, metrics=None, dt1=None, dt2=None,
             write_out=False, filename=None):
    self.email = email
    if passwd is None:
        from getpass import getpass
        self.passwd = getpass('Password for %s: ' % self.email)
    else:
        self.passwd = passwd
    if metrics is None:
        self.metrics = ['steps', 'distance', 'floors', 'active-minutes',
                        'calories-burned', 'heart-rate']
    else:
        self.metrics = metrics
    if dt2 is None:
        self.dt2 = datetime.now()
    else:
        self.dt2 = dt2
    if dt1 is None:
        from datetime import timedelta
        self.dt1 = self.dt2 - timedelta(days=7)
    else:
        self.dt1 = dt1
    self.browser = RoboBrowser(parser='lxml')
    self.write_out = write_out
    self.filename = filename
    self.date, self.metric, self.data = (None, None, None)
def __init__(self, appname, username, password, *args, **kwargs):
    self.loggedin = False
    self.browser = RoboBrowser(history=True)
    self.appname = appname
    self.username = username
    self.password = password
    self.baseurl = 'https://' + self.appname + '.infusionsoft.com'
def __init__(self):
    # Browse url :
    self.result = None
    self.browser = RoboBrowser(parser="html.parser")
    self.browser.session.headers = config.headers
    # Mount with custom SSL Adapter
    self.browser.session.mount('https://', HTTPSAdapter())
def __init__(self, user_id, password):
    self.user_id = user_id
    self.password = password
    self.browser = RoboBrowser()
    self.run_id = ''
    self.pre_id = ''
    self.res = {}
def build_cache():
    """
    Get current data from the website http://www.lfd.uci.edu/~gohlke/pythonlibs/

    Returns
    -------
    Dictionary containing package details
    """
    data = {}
    soup = RoboBrowser()
    soup.open(MAIN_URL)
    links = soup.find(class_="pylibs").find_all("a")
    for link in links:
        if link.get("onclick") is not None:
            jsfun = link.get("onclick").split('"')
            mlstr = jsfun[0].split("(")[1].strip()[1:-2]
            ml = list(map(int, mlstr.split(",")))
            mi = jsfun[1]
            url = parse_url(ml, mi)
            # Details = [package, version, pyversion, --, arch]
            details = url.split("/")[-1].split("-")
            pkg = details[0].lower().replace("_", "-")
            # Not using EXEs and ZIPs
            if len(details) != 5:
                continue
            # arch = win32 / win_amd64 / any
            arch = details[4]
            arch = arch.split(".")[0]
            # ver = cpXX / pyX / pyXXx
            pkg_ver = details[1]
            py_ver = details[2]
            py_ver_key = py_ver + "-" + arch
            if pkg in data.keys():
                if py_ver_key in data[pkg].keys():
                    data[pkg][py_ver_key].update({pkg_ver: url})
                else:
                    data[pkg][py_ver_key] = {pkg_ver: url}
            else:
                data[pkg] = {py_ver_key: {pkg_ver: url}}
    return data
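# A hedged usage sketch for build_cache above. It assumes MAIN_URL and parse_url are
# defined in the same module; "numpy" is only an example key and the exact
# Python-version/arch keys depend on what the Gohlke page lists at the time.
cache = build_cache()
numpy_builds = cache.get('numpy', {})
for py_ver_key, versions in numpy_builds.items():
    print(py_ver_key, sorted(versions))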
def scrape_cache(query, total): browser = RoboBrowser() listings = [] for i in range(1, total, 14): offset = i # which listing to start at per page. Increment by 14 browser.open('http://www.bing.com/search?q=%s&first=%d' % (query, offset)) # Database Schema - A sqlite database is used to make data queries more efficient. # id (Primary Key) - ID of the item # orig_url - Original URL of the site. # cache_url - Cached URL of the site. # desc - Quick description of the site. # grab all search attribute strings capt_list = browser.select('.b_caption') for capt in capt_list: # start a new listing listing = {} # display original url listing['orig_url'] = re.sub('<[^>]*>', '', str(capt.select('cite')[0])) # display description listing['desc'] = capt.p.string # '|' delimited list, containing the ids needed to cache id_string = capt.select('div.b_attribution')[0].get('u') print(id_string) if (id_string != None): ids = id_string.split('|') listing['cache_url'] = "http://cc.bingj.com/cache.aspx?q=%s&d=%s&mkt=en-US&setlang=en-US&w=%s" % (query, ids[2], ids[3]) else: listing['cache_url'] = None print(listing) listings.append(listing) print(":: End of dump %d" % i) # delay between page grabs time.sleep(1) # listings is given as an output object return(listings)
def __init__(self, name):
    self._name = name
    self._url = "https://www.kicktipp.de/" + self._name + "/"
    self._url_login = self._url + "profil/login"
    self._url_logout = self._url + "profil/logout"
    self._url_tippabgabe = self._url + "tippabgabe"
    self._browser = RoboBrowser()
def __init__(self, un="*****@*****.**", pw='62IS1DSDBgyTM8b7GUl', appname='if188', **kwargs):
    self.un = un
    self.pw = pw
    self.appname = appname
    self.client_id = "aa8fnmbza344ypd9anqeq62v"
    self.secret = "VsNrwPpHDN"
    self.redirect_uri = "http://jlmarks.org/infusionsoftcallback"
    self.browser = RoboBrowser(history=True)
def __init__(self, output_path=None, username=None, password=None):
    self.browser = RoboBrowser(history=True)
    self.output_path = output_path or tempfile.TemporaryDirectory().name
    self.username = username or os.environ['STITCHBOT_USERNAME']
    self.password = password or os.environ['STITCHBOT_PASSWORD']
    self.logger = logger.getChild('StitchBot')
def main(): args = docopt(__doc__, version="dailyprogrammer-dl v{}".format(__version__)) # Configure logging logLevel = logging.INFO #default if args['--verbose']: logLevel = logging.DEBUG elif args['--quiet']: logLevel = logging.ERROR logging.basicConfig(format='%(levelname)s: %(message)s', level=logLevel) logging.debug(args) # Process command line arguments challengeURL = args['<challengeurl>'] # Parse project page for title and description logging.info("Parsing daily challenge: {}".format(challengeURL)) browser = RoboBrowser() browser.session.headers['User-Agent'] = "dailyprogrammer-dl v{} by /u/zod77".format(__version__) browser.open(challengeURL) title = browser.find('a',class_='title').string description = browser.find_all('div',class_="md") description = description[1] descriptionHTML = "".join(str(t) for t in description.contents) # remove outer <div> projectName = generateProjectName(title) # Init project skeleton logging.info("Generating project") projectPath = os.path.abspath(projectName) os.mkdir(projectPath) # Write out project files pyTemplate = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)),"boilerplate.txt")) shutil.copy(pyTemplate, os.path.join(projectPath,"{}.py".format(projectName))) # Generate README.md h = html2text.HTML2Text() descriptionMD = h.handle(descriptionHTML) readme = os.path.join(projectPath,"README.md") with open(readme, "w") as f: f.write(descriptionMD) return
offset_inicio:sub_trecho.find(string_fim):].lstrip() def create_dict_from_resultset(resultset, fields): dict = {} for tag in resultset: # modificar para algo assim --> field = ([f for field in fields] in str(tag.next_sibling.next_element)): for key in fields: if str(fields[key]).lstrip() in str(tag): dict[key] = str(tag.next_sibling).replace("\n", "").lstrip() return dict #------- LOGIN -------------------- session = Session() session.verify = False browser = RoboBrowser(session=session, parser="html5lib") browser.open("https://ditec.pf.gov.br/sistemas/criminalistica/meus_dados.php") form_login = browser.get_form() form_login["usuario"].value = config.LOGIN["SISCRIM"]["USER"] form_login["senha"].value = config.LOGIN["SISCRIM"]["PASS"] browser.submit_form(form_login) #------- PRENCHE MATERIAL -------------------- numero_material = "846/2017" form_mat1 = browser.get_form() form_mat1["tipo_busca"].value = "numero_material" form_mat1["numero_busca"].value = numero_material browser.submit_form(form_mat1) #------- NAVEGA P/ MATERIAL -------------------- browser.follow_link(browser.get_link(str(numero_material)))
class Siding: def __init__(self, usuario, contraseña): self.browser = RoboBrowser(parser="html.parser") self.usuario = usuario self.contraseña = contraseña self.ramos_alumno = defaultdict(dict) self.ramos_administrador = defaultdict(dict) self.ramos_ayudante = defaultdict(dict) def __repr__(self): return "Siding - " + self.usuario def login(self): "Inicia sesion en Siding" self.browser.open("http://www.ing.uc.cl/") forma = self.browser.get_form(id="form-siding") forma["login"] = self.usuario forma["passwd"] = self.contraseña self.browser.submit_form(forma) if "Datos de ingreso incorrectos" in self.browser.parsed.text: raise CredencialesIncorrectas() def cargar_ramos(self): self.browser.open( "https://intrawww.ing.puc.cl/siding/" + "dirdes/ingcursos/cursos/index.phtml") ramos = self.browser.find(class_="ColorFondoZonaTrabajo") ramos = ramos.find_all("tr")[1:] comenzar = False lista = None for ramo in ramos: titulo = ramo.find("td").text.strip() if titulo == "": continue if "Cursos donde es alumno" in titulo: comenzar = True dic = self.ramos_alumno continue if "Cursos donde es administrador" in titulo: dic = self.ramos_administrador continue if "Cursos donde es ayudante" in titulo: dic = self.ramos_ayudante continue if not comenzar: continue titulo = titulo.split() sigla = titulo[0] seccion = titulo[1].split(".")[1] nombre = " ".join(titulo[2:]) local_link = ramo.find("a") if local_link is not None: link = "https://intrawww.ing.puc.cl" + local_link.get("href") id_ = link.split("=")[-1] else: link = local_link id_ = None dic_ramo = { "sigla": sigla, "nombre": nombre, "seccion": seccion, "link": link, "id": id_ } dic[sigla][seccion] = dic_ramo def subir_anuncio(self, sigla, seccion, asunto, mensaje): ramo = self.ramos_administrador[sigla][seccion] link = "https://intrawww.ing.puc.cl/siding/dirdes/ingcursos/" + \ "cursos/index.phtml?accion_curso=avisos&acc_aviso=nuevo" + \ "&id_curso_ic={}".format(ramo["id"]) form = None while form is None: self.browser.open(link) form = self.browser.get_form( action="?accion_curso=avisos&acc_aviso=ingresar_aviso&" + \ "id_curso_ic={}".format(ramo["id"])) form["asunto"].value = sigla + " - Nuevo aviso - " + asunto form["contenido_aviso"].value = mensaje print(form["asunto"].value) print(form["contenido_aviso"].value) #self.browser.submit_form(form) def subir_anuncio_multiple(self, sigla, secciones, asunto, mensaje): for seccion in secciones: self.subir_anuncio(sigla, seccion, asunto, mensaje) print("Se han subido todos los anuncios")
var = 1402713001 csv_ofile = open("student_data_IT_4.csv", 'w', newline='') writer = csv.writer(csv_ofile) writer.writerow([ 'Name', 'Father', 'roll_no', 'student_no', 'branch', 'year', 'section', 'current_sem', 'DOB', 'category', 'hostler', 'Addmission_mod', 'contact', 'parent_contact', 'address', 'email', '10th%', '12th%', 'B_tech%', 'sem_1_marks', 'sem_1_attendance', 'sem_2_marks', 'sem_2_attendance', 'sem_3_marks', 'sem_3_attendance', 'sem_4_marks', 'sem_4_attendance', 'sem_5_marks', 'sem_5_attendance', 'sem_6_marks', 'sem_6_attendance', 'sem_7_marks', 'sem_7_attendance', 'sem_8_marks', 'sem_8_attendance' ]) for i in range(120): br = RoboBrowser() br.open("http://10.10.156.201/login-student.php") form = br.get_form('password-form') form['username1'].value = var form['password'].value = str(var) br.submit_form(form) container = br.find_all('table') if len(container) != 0 and len(container) == 6: row = [] row.append(extract_special(container[2])) row.append(extractor(container[3])) row.append(extract_special(container[4])) relevant = important(extractor(container[5])) row.append(relevant)
def parseYear(team_name, year_url, year): """ parses a schedule for a specific year on http://www.pro-football-reference.com/years/{YEAR}/games.htm follows all the "boxscore" links (column[3]) to get stadium and weather conditions (game_info) stores schedule info in nfl_data.schedule stores game_info in nfl_data.game_info with schedule ids """ logger = makeLogger( cleanKey(team_name) + '_' + str(year), r'./logs_pfrTeamStats/') startTime = datetime.now() logger.debug('Starting %d', year) schedule_list = [] gameInfo_list = [] client = MongoClient('localhost', 27017) db = client['nfl_data'] col_team_stats_weekly = db['team_stats_weekly'] #need to fix this to actually detect duplicate # if col_team_stats_weekly.find({'year': year}).count(): # logger.debug('Already parsed %s', year) # closeLogger(logger) # return None wait = random.uniform(1.5, 3.5) logger.debug('Waiting %f', wait) time.sleep(wait) logger.debug('Opening main page') browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link(logger, browser, 'open', year_url) table = browser.find(id='games') rows = table.find_all('tr') header = [ cleanKey(each.attrs['data-stat']) for each in rows[0].find_all('th') ] rows = rows[1:] row_dicts = [] for index, row in enumerate(rows): logger.debug('Row %d of %d', index, len(rows)) try: week_number = convertToNumber(row.find('th').text) row_values = [ convertToNumber(value.text) for value in row.find_all('td') ] row_values.insert(0, week_number) row_dict = dict(zip(header, row_values)) row_dict['year'] = year row_dict['team_name'] = team_name row_dict['year_url'] = year_url if row_dict['game_date'].lower() == 'playoffs': continue row_dicts.append(row_dict) except: logger.exception(row) logger.debug('team_stats_weekly.inert_many') col_team_stats_weekly.insert_many(row_dicts) logger.debug('parseYear time elapsed: ' + str(datetime.now() - startTime)) closeLogger(logger)
def mainScript(host, username, password, flashFirmware, upgradeFilename, flashSleepDelay, activeMethod, activeCommand, splitCommand, ddnsService, connectRetryDelay, interCommandDelay): br = RoboBrowser(history=True, parser="html.parser", timeout=15) print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Authenticating")) srp6authenticate(br, host, username, password) print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' GETing : http://' + host + ' to aquire authenticated CSRFtoken') br.open('http://' + host) print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' GET completed: ' + str(br.response)) if activeMethod == 'VodafoneDDNS' or activeMethod == 'VodafoneDDNS2': token = br.find(lambda tag: tag.has_attr('name') and tag.has_attr('type') and tag['name'] == 'CSRFtoken')['value'] else: token = br.find(lambda tag: tag.has_attr('name') and tag['name'] == 'CSRFtoken')['content'] print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' Got authenticated CSRFtoken: ' + token) success = False if flashFirmware: print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' Flash Firmware option is enabled. Activemethod = ' + activeMethod) if activeMethod == 'VodafoneDDNS': # DGA0130 Vodafone NZ VANT-9 Ultra Hub upgradeurlpostfix = '/modals/upgrade.lp?action=upgradefw' elif activeMethod == 'VodafoneDDNS2': # DNA0130 Vodafone NZ VBNT-Z Ultrahub Plus upgradeurlpostfix = '/modals/settings/firmwareUpdate.lp?action=upgradefw' else: upgradeurlpostfix = '/modals/gateway-modal.lp?action=upgradefw' filedata = {'CSRFtoken': token, 'upgradefile': ('test.rbi', open(upgradeFilename, 'rb'))} print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' POSTing firmware to: ' + 'http://' + host + upgradeurlpostfix) r = br.session.post('http://' + host + upgradeurlpostfix, files=filedata) print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' Fimrware POST completed: ' + str(br.response)) br._update_state(r) print(r.text) if r.text == '{ "success":"true" }': print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Modem reports flashing commenced successfully")) success = True print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Waiting for reboot... Sleeping for %s s") % (flashSleepDelay)) time.sleep(int(flashSleepDelay)) else: success = True if success: backUp = False attempt = 0 while not backUp: attempt += 1 print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Connect attempt %i") % (attempt)) try: br.open('http://' + host) print ('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' Response: ' + str(br.response)) if br.response.ok: backUp = True except Exception: print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _('Failed to connect, attempt %i. 
Retrying') % (attempt)) time.sleep(int(connectRetryDelay)) pass print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Modem up")) if not splitCommand: runCommand(br, host, token, activeMethod, activeCommand, ddnsService) else: print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Splitting command up using semicolons")) for subCommand in [s for s in activeCommand.split(';') if len(s) > 0]: runCommand(br, host, token, activeMethod, subCommand, ddnsService) print('{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Sleeping...") + str(int(interCommandDelay)) + ' seconds') time.sleep(int(interCommandDelay)) result = '{0:%Y-%m-%d %H:%M:%S}'.format(datetime.datetime.now()) + ' ' + _("Please try a ssh connection now to ") + host + _(" with username root and password root (change password immediately with passwd!) Rebooting your modem now is recommended to stop any services that have been disabled.") print(result) return result
import sys
import os
from robobrowser import RoboBrowser

# Credentials are normally taken from environment variables.
AMAZON_EMAIL = "*****@*****.**"
AMAZON_PASSWORD = "******"
#AMAZON_EMAIL = os.environ['AMAZON_EMAIL']
#AMAZON_PASSWORD = os.environ['AMAZON_PASSWORD']

# Create the RoboBrowser object.
browser = RoboBrowser(
    parser='html.parser',  # parser used by Beautiful Soup
    # Use a regular browser User-Agent (here, Firefox's) to avoid the
    # "cookies are disabled" message that otherwise blocks login.
    user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:45.0) Gecko/20100101 Firefox/45.0'
)


def main():
    # Open the order history page.
    print('Navigating...', file=sys.stderr)
    browser.open('https://www.amazon.co.jp/gp/css/order-history')

    # Confirm that we have been redirected to the sign-in page.
    assert 'Amazonサインイン' in browser.parsed.title.string

    # Fill in the sign-in form with name="signIn".
    # The value of the form's name attribute can be found with the browser's developer tools.
extract usage data from hallon mobile broadband
"""
from robobrowser import RoboBrowser
from bs4 import BeautifulSoup as Soup
from os import path
import sys
import yaml
import json

with open(path.join(path.dirname(sys.argv[0]), ".hallon-credentials.yaml")) as f:
    CREDENTIALS = yaml.safe_load(f)

URL = "https://www.hallon.se/mina-sidor"

br = RoboBrowser(parser="lxml")
br.open(URL)
form = br.get_form(action="/logga-in")
form["UserName"].value = CREDENTIALS["username"]
form["Password"].value = CREDENTIALS["password"]
br.submit_form(form)

usage = br.select("p.usage")[0].text.replace(",", ".").split()
remaining = round(float(usage[0]), 2)
total = int(usage[2])
used = round(float(total - remaining), 2)
used_pct = round(used * 100 / total, 1)
days_remaining = int(br.select("p.usage-daysleft")[0].text.split()[0])

print(json.dumps({"total": total,
class MullVad: accountnumber = "6798499523758101" website = "https://www.mullvad.net/account/login/" br = RoboBrowser(parser='html.parser', history=True) #wallet = Wallet() #Login with given accountnumber def login(self): self.br.open(self.website) form = self.br.get_form() form['account_number'].value = self.accountnumber self.br.session.headers['Referer'] = self.website self.br.submit_form(form) #Purchase 1 month VPN def purchase(self): form = self.br.get_form() form['months'].value = "1" self.br.session.headers['Referer'] = self.br.url self.br.submit_form(form) month_price = "" bitcoin_address = "" payment_info_page = str(self.br.parsed) #Get the price for one month and bitcoin address from html code for line in payment_info_page.split("\n"): if "1 month = " in line: month_price = line.strip().split(" ")[3] if 'input readonly' in line: bitcoin_address_line = line.strip().split(" ")[3].split("=")[1] bitcoin_address = bitcoin_address_line.partition( '"')[-1].rpartition('"')[0] print(month_price) print(bitcoin_address) #if pay(month_price, bitcoin_address): # setupVPN() #else: # print("Error: payment failed") #Pay for 1 month using bitcoins and the electrum wallet def pay(self, price, bitcoin_address): #Start electrum daemon os.system('electrum --testnet daemon start') #Load electrum default wallet os.system('electrum --testnet daemon load_wallet') #Check balance in wallet is enough for payment balance = os.popen('electrum --testnet getbalance').read() balance = float( balance.split("\n")[1].split(":")[1].replace('"', "").replace( " ", "").replace(",", "")) print(balance) if balance >= price: transaction = os.popen('electrum --testnet payto ' + bitcoin_address + ' ' + str(price) + '| electrum --testnet broadcast -').read() #Check if transaction was successfull and return state of transaction transaction_complete = transaction.find('true') if transaction_complete == -1: transaction_complete = False else: transaction_complete = True print('transaction = ' + str(transaction_complete)) return transaction_complete else: print('Insufficient balance, transaction cancelled') return False #Setup the VPN def setupVPN(): print("Time to setup the vpn!")
from robobrowser import RoboBrowser from tqdm import tqdm import requests import re browser = RoboBrowser(parser='html.parser') #lines = ["Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"] url = 'https://greatnonprofits.org/state/' second = '/sort:review_count/direction:desc/page:' def linkcleaner(links): newlist = [] links = list(set(links)) for link in links: if 'GreatNonprofits' not in link: if (link != "http://twitter.com/" and link != "http://twitter.com/share" and link != "https://twitter.com/" and link != "https://twitter.com/share" and link != "https://twitter.com/?lang=en" and link != "http://twitter.com/?lang=en" and link != "//twitter.com/share"): if ' ' not in link and 'status' not in link and 'search' not in link and 'intent/' not in link and 'hashtag/' not in link and 'share?' not in link: newlist.append(link) return newlist links_by_state = dict()
import json
import random
import time

import requests
from bs4 import BeautifulSoup
from robobrowser import RoboBrowser

# ====================================== #
# =========== LOGIN AND AUTH =========== #
# ====================================== #

br = RoboBrowser()
br.open('https://dragcave.net/')
form = br.get_form()
print('==================')
form['username'] = input('Username: ')
form['password'] = input('Password: ')
print('==================')
br.submit_form(form)

# =====================================#
# =========== MODE STARTUP =========== #
# =====================================#

f = open('eggpedia.json', 'r')
eggpedia = json.load(f)
f.close()

biome_codes = {
    'Coast': '1',
    'Desert': '2',
    'Forest': '3',
    'Jungle': '4',
class Scrape(object): def __init__(self, cookie): self._cookies = parse_cookie(cookie) self._data_dir = self._get_data_dir() self._ts = datetime.datetime.now().isoformat() self.measures = None self.props = [] self.browser = RoboBrowser( history=True, parser='html5lib', user_agent=UA) self.browser.session.cookies.update(self._cookies) def _get_data_dir(self): dirname = os.path.dirname(os.path.abspath(__file__)) if 'site-packages' not in dirname: data_dir = os.path.normpath(os.path.join(dirname, '..', 'data')) else: data_dir = os.path.normpath(os.path.join(os.getcwd(), 'data')) mkdir(data_dir) return data_dir def fetch_measures(self): self.browser.open(URLS.get('measures')) # get measures measures_raw = self.browser.find(id='ListElections1__ctl0') pattern = re.compile(r'^PROPOSITION (?P<prop>[0-9]+) - (?P<description>.*)') measures = [] for a in measures_raw.find_all('a'): measure = pattern.match(a.text).groupdict() measure['url'] = a['href'] measures.append(measure) self.measures = { 'timestamp': self._ts, 'measures': measures, } with open(os.path.join(self._data_dir, 'measures.json'), 'w') as f: f.write(json.dumps(self.measures)) def fetch_prop(self, prop, url): link = os.path.join(URLS['measures'], url) self.browser.open(link) cf = self.browser.find(text='Campaign Finance:') body = cf.parent.parent tables = body.find_all('table') new = [] data = { 'prop': prop, 'committees': {}, } for table in tables: if table.find('span', text='COMMITTEE ID'): for row in table.find_all('tr'): if row.find(text='COMMITTEE ID'): continue cols = row.find_all('td') c = {} c_id = cols[0].find('span').text a = cols[1].find('a') c['name'] = a.text c['link'] = a['href'] c['position'] = cols[2].find('span').text data['committees'][c_id] = c data['timestamp'] = self._ts prop_dir = os.path.join(self._data_dir, prop) mkdir(prop_dir) with open(os.path.join(prop_dir, 'prop.json'), 'w') as f: f.write(json.dumps(data)) self.props.append(data) return data def fetch_committee(self, prop, committee_id, link): committee_dir = os.path.join(self._data_dir, prop, committee_id) mkdir(committee_dir) ''' url = 'http://cal-access.sos.ca.gov%s' % link print link self.browser.open(url) cf = self.browser.find(text='Campaign Finance:') body = cf.parent.parent ''' ''' 'contributions', 'expenditures', 'late1', 'late2', 'late3', 'http://cal-access.sos.ca.gov/Campaign/Committees/DetailContributionsReceivedExcel.aspx?id=1406518&session=2017', 'http://cal-access.sos.ca.gov/Campaign/Committees/DetailContributionsMadeExcel.aspx?id=1406518&session=2017', 'http://cal-access.sos.ca.gov/Campaign/Committees/DetailExpendituresMadeExcel.aspx?id=1406518&session=2017', 'http://cal-access.sos.ca.gov/Campaign/Committees/DetailLateExcel.aspx?id=1406518&session=2017&view=LATE1', 'http://cal-access.sos.ca.gov/Campaign/Committees/DetailLateExcel.aspx?id=1406518&session=2017&view=LATE2', 'http://cal-access.sos.ca.gov/Campaign/Committees/DetailLateExcel.aspx?id=1406518&session=2017&view=LATE3', ''' links = { 'contributions_received': '%(prefix)sDetailContributionsReceivedExcel.aspx?id=%(id)s&session=2017' % { 'prefix': URLS.get('committees'), 'id': committee_id, }, 'contributions_made': '%(prefix)sDetailContributionsMadeExcel.aspx?id=%(id)s&session=2017' % { 'prefix': URLS.get('committees'), 'id': committee_id, }, 'expenditures_made': '%(prefix)sDetailExpendituresMadeExcel.aspx?id=%(id)s&session=2017' % { 'prefix': URLS.get('committees'), 'id': committee_id, }, 'late_and_5k_plus_contributions_received': 
'%(prefix)sDetailLateExcel.aspx?id=%(id)s&session=2017&view=LATE1' % { 'prefix': URLS.get('committees'), 'id': committee_id, }, 'late_contributions_made': '%(prefix)sDetailLateExcel.aspx?id=%(id)s&session=2017&view=LATE2' % { 'prefix': URLS.get('committees'), 'id': committee_id, }, 'late_independent_expenditures': '%(prefix)sDetailLateExcel.aspx?id=%(id)s&session=2017&view=LATE3' % { 'prefix': URLS.get('committees'), 'id': committee_id, }, } data = {} data['timestamp'] = self._ts for kind, link in links.iteritems(): self.browser.open(link) csv_data = self.browser.find('body').text with open(os.path.join(committee_dir, '%s.csv' % kind), 'w') as f: f.write(csv_data) with open(os.path.join(committee_dir, '%s.csv' % kind), 'r') as f: #reader = csv.DictReader(f, delimiter='\t') reader = csv.reader(f, delimiter='\t') header = next(reader, None) rows = [] for row in reader: rows.append(row) data[kind] = { 'header': header, 'data': rows, } time.sleep(THROTTLE_TIME) with open(os.path.join(committee_dir, 'committee.json'), 'w') as f: f.write(json.dumps(data))
for x in college: for u in range(low, high): # IF condition to concatenate USN if u < 10: usn = x + year + branch + '00' + str(u) elif u < 100: usn = x + year + branch + '0' + str(u) else: usn = x + year + branch + str(u) # opens the vtu result login page, gets the usn and opens the result page url = "http://results.vtu.ac.in/vitaviresultcbcs/index.php" if semc == '7': url = "http://results.vtu.ac.in/vitaviresultnoncbcs/index.php" br = RoboBrowser() br.open(url) form = br.get_form() form['lns'].value = usn br.submit_form(form) soup = br.parsed # Finds all the table elements and stores in array tds tds = soup.findAll('td') ths = soup.findAll('th') divs = soup.findAll('div', attrs={'class': 'col-md-12'}) divCell = soup.findAll('div', attrs={'class': 'divTableCell'}) try: sem = divs[5].div.text sem = sem.strip('Semester : ')
def cli(prob_id, filename): # get latest submission id, so when submitting should have not equal id last_id, b, c, d, e = get_latest_verdict(config.username) # Browse to Codeforces browser = RoboBrowser(parser = 'html.parser') browser.open('http://codeforces.com/enter') enter_form = browser.get_form('enterForm') enter_form['handleOrEmail'] = config.username enter_form['password'] = config.password browser.submit_form(enter_form) try: checks = list(map(lambda x: x.getText()[1:].strip(), browser.select('div.caption.titled'))) if config.username not in checks: click.secho('Login Failed.. Wrong password.', fg = 'red') return except Exception as e: click.secho('Login Failed.. Maybe wrong id/password.', fg = 'red') return click.secho('[{0}] login successful! '.format(config.username), fg = 'green') click.secho('Submitting [{1}] for problem [{0}]'.format(prob_id, filename), fg = 'green') browser.open('https://codeforces.com/contest/'+prob_id[:-1]+'/problem/'+prob_id[-1]) submit_form = browser.get_form(class_ = 'submitForm') try: submit_form['sourceFile'] = filename except Exception as e: click.secho('File {0} not found in current directory'.format(filename)) return browser.submit_form(submit_form) if browser.url[-3:] != '/my': click.secho('Failed submission, probably you have submit the same file before', fg = 'red') return click.secho('[{0}] submitted ...'.format(filename), fg = 'green') hasStarted = False while True: id_, verdict_, time_, memory_, passedTestCount_ = get_latest_verdict(config.username) if id_ != last_id and verdict_ != 'TESTING' and verdict_ != None: if verdict_ == 'OK': click.secho('OK - Passed {} tests'.format(passedTestCount_), fg = 'green') else: click.secho("{} on test {}".format(verdict_, passedTestCount_ + 1), fg = 'red') click.secho('{} MS | {} KB'.format(time_, memory_), fg = ('green' if verdict_ == 'OK' else 'red')) break elif verdict_ == 'TESTING' and (not hasStarted): click.secho("Judgment has begun", fg='green') hasStarted = True time.sleep(0.5)
from robobrowser import RoboBrowser

br = RoboBrowser()
br.open("https://<url>")
form = br.get_form()
form['username'] = "******"
form['password'] = "******"
br.submit_form(form)
print(str(br.parsed))
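# A hedged follow-up sketch to the generic login snippet above: one way to check whether
# the login worked is to look for a post-login marker in the page. The "Logout" link text
# is an assumption, not something the original snippet verifies.
if br.get_link('Logout') is not None:
    print('Login appears successful')
else:
    print('Login may have failed; check the form field names and credentials')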
def run(wait): """Starts the scrapping proccess. creates a process per year between minyear and maxyear """ logger = makeLogger('main', r'./logs_pfrTeamStats/') startTime = datetime.now() logger.debug('start time: ' + str(startTime)) logger.debug('waiting %d seconds', wait) time.sleep(wait) logger.debug('Opening main page') browser = RoboBrowser(history=False, parser='html5lib', user_agent=get_user_agent(logger), timeout=10) browser = open_or_follow_link( logger, browser, 'open', "http://www.pro-football-reference.com/teams/") table_body = browser.find(id='teams_active').find('tbody') rows = table_body.find_all('tr') team_url_tups = [] for index, row in enumerate(rows): logger.debug('Row %d of %d', index, len(rows)) try: team_link = row.find('th').find('a') if team_link: team_url = 'http://www.pro-football-reference.com' + team_link[ 'href'] team_name = team_link.text team_url_tups.append((team_url, team_name)) except: logger.exception(row) pool = Pool(processes=int(get_proxy_count() / 2.5)) results = [] for team_url, team_name in team_url_tups: #print parseTeam(team_url, team_name) results.append(pool.apply_async(parseTeam, ( team_url, team_name, ))) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). year_url_tups = [] for result in results: year_url_tup = result.get() if year_url_tup: year_url_tups += (year_url_tup) logger.debug('Done gathering %d year urls', len(year_url_tups)) pool = Pool(processes=int(get_proxy_count() / 2)) logger.debug('Shuffling year_urls') random.shuffle(year_url_tups) logger.debug('Starting to parse year_urls') for team_name, year_url, year in year_url_tups: #parseYear(team_name, year_url, year) pool.apply_async(parseYear, ( team_name, year_url, year, )) pool.close( ) #Prevents any more tasks from being submitted to the pool. Once all the tasks have been completed the worker processes will exit. pool.join( ) #Wait for the worker processes to exit. One must call close() or terminate() before using join(). logger.debug('run time: ' + str(datetime.now() - startTime)) closeLogger('main')
## search terms
terms = [
    'chafee', 'clinton', "o'malley", 'sanders', 'webb', 'warren', 'bush',
    'carson', 'christie', 'cruz', 'fiorina', 'gilmore', 'graham', 'huckabee',
    'jindal', 'kasich', 'pataki', 'paul', 'perry', 'rubio', 'santorum',
    'trump', 'walker', 'romney', 'election', 'presidential', 'cycle',
    'primary', 'primaries', 'candidate', 'race'
]

## dates to search in 2015
months, days = range(1, 9), range(1, 32)
dates = itertools.product(months, days)

## search the archives for potentially relevant material
browser = RoboBrowser(history=True)
relevant_urls = []
bad_urls = []
for date in dates:
    m, d = date[0], date[1]
    archive_url = 'http://www.wsj.com/public/page/archive-2015-' + str(m) + '-' + str(d) + '.html'
    try:
        browser.open(archive_url)
        articles = browser.find_all('h2')
        for article in articles:
            if any(word in article.get_text().lower() for word in terms):
                relevant_urls.append(article.find('a').get('href'))
    except:
        bad_urls.append(archive_url)
        pass
class Paper:
    no = 0
    title = ""
    author = ""
    journal = ""
    ISSN = ""
    year = 0
    cited = 0
    doi = ""
    IF = 0


browser = RoboBrowser(history=True)
browser.open("http://apps.webofknowledge.com/WOS_GeneralSearch_input.do;jsessionid=C0869EEDE01F91FB8B7F92ED05EB8972?product=WOS&search_mode=GeneralSearch&SID=E1XUuzYJfkILEXCQF8V&preferencesSaved=")

papers = []

if (len(sys.argv) == 2):
    filename = str(sys.argv[1])
else:
    filename = "files/top20.csv"

df = pd.read_csv(filename, header=0)
titles = df.values[:, 0]
fileparts = filename.split('.')
fileresult = fileparts[0] + "_result.csv"
r
r.cookies
r = requests.get("http://www.google.com")
r
r.cookies
r.url
r = requests.get("http://www.google.com", redirect=False)
requests.request?
r = requests.get("http://www.google.com", allow_redirects=False)
r.url
r.status_code
r.headers["location"]
requests.request?
import robobrowser
from robobrowser import RoboBrowser
b = RoboBrowser(parser="lxml.html")
b
b.open("http://www.chandrashekar.info")
b.url
b.contents
b.response
b.response.status_code
b.links
dir(b)
b.get_links()
b = RoboBrowser(parser="lxml")
b.open("http://www.chandrashekar.info")
b.get_links()
b.get_links()
b.forms
dir(b)
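# A consolidated sketch of the interactive session above as a plain script. It keeps only
# the calls that worked in the transcript (the earlier redirect=False and parser="lxml.html"
# attempts are dropped in favor of allow_redirects=False and parser="lxml"); the probed
# attributes and dir() calls are omitted.
import requests
from robobrowser import RoboBrowser

r = requests.get("http://www.google.com", allow_redirects=False)
print(r.status_code, r.headers.get("location"))

b = RoboBrowser(parser="lxml")
b.open("http://www.chandrashekar.info")
print(b.url, b.response.status_code)
for link in b.get_links():
    print(link.get("href"))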
def setUp(self):
    super().setUp()
    self.browser = RoboBrowser(history=True, parser='html.parser')
import os
import shutil

from robobrowser import RoboBrowser

extension = {
    "C++": "cpp",
    "C": "c",
    "C++14": "cpp",
    "Java": "java",
    "Python": "py",
    "CPP": "cpp",
    "JAVA": "java"
}

username = raw_input("Enter your spoj username:")
password = raw_input("Enter your spoj password:")
browser = RoboBrowser(parser="html5lib")
browser.open('http://www.spoj.com/')
form = browser.get_form(id='login-form')
form['login_user'].value = username
form['password'].value = password
browser.submit_form(form)

browser.open('http://www.spoj.com/myaccount')
problems = browser.find(id="user-profile-tables").find_all('td')

try:
    os.mkdir("spoj_solutions")
except:
    shutil.rmtree("spoj_solutions")
    os.mkdir("spoj_solutions")

for problem in problems:
def get_non_legistar_entries(past_entries, city, search_regex): positive_results = [] new_agendas = [] browser = RoboBrowser(history=True) header = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 'Accept-Encoding': 'none', 'Accept-Language': 'en-US,en;q=0.8', 'Connection': 'keep-alive' } s = requests.Session() s.headers = header browser = RoboBrowser(session=s, parser="lxml") agenda_url = city["agenda_site"] #non-Legistar sites need to be very specific - these sites could throw anything at you. #if you need to add another city, follow this format: #if if city["short"] == "berkeley": try: browser.open(agenda_url) links = browser.find_all("a", title="Agenda") except: print("There was a problem opening the URL: " + agenda_url) print("Aborting search for agendas from " + city["name"]) return [], [] for link in links: url = city["root_site"] + str(link['href']) meetingid = url[url.rfind("/") + 1:url.rfind(".aspx")] #print(meetingid) if not any(meetingid in entry for entry in past_entries): new_agendas = new_agendas + [meetingid] browser.follow_link(link) content = str(browser.response.content) content = content.lower() content = content[content.find("innercontentcontainer"):] term_match = [] m = re.findall(search_regex, content.lower()) if m is not None and len(m) > 0: term_match = term_match + list(set(m)) browser.back() if (len(term_match) > 0): page_body = str(browser.response.content) index1 = page_body.find(meetingid) page_body = page_body[0:index1] index2 = page_body.rfind("<tr>") page_body = page_body[index2:] deets = re.findall('[\\d]+/[\\d]+', page_body) meeting_date = deets[0] matches = "" for term in set(term_match): for bogus in ['-', ' ']: if bogus in term: term = term.replace(bogus, "") matches = matches + "#" + term + ", " positive_results.append( (meetingid, "#" + city["short"] + " #" + city["hash_tag"] + " city meeting on " + meeting_date + " about " + matches, url)) elif city["short"] == "berkeleyprc" or city["short"] == "berkeleyp&j": try: browser.open(agenda_url) links = browser.find_all("a", title=re.compile(".genda")) except: print("There was a problem opening the URL: " + agenda_url) print("Aborting search for agendas from " + city["name"]) return [], [] for link in links: meetingid = str(link) url = city["root_site"] + str(link['href']).replace(" ", "%20") #print(url) pdf_index = url.rfind(".pdf") if pdf_index < 0: meetingid = url[url.rfind("/") + 1:] if not any(meetingid in entry for entry in past_entries): new_agendas = new_agendas + [meetingid] continue meetingid = url[url.rfind("/") + 1:pdf_index] if not any(meetingid in entry for entry in past_entries): new_agendas = new_agendas + [meetingid] browser.follow_link(link) content = browser.response.content term_match = search_pdf(meetingid, content, search_regex) browser.back() if (len(term_match) > 0): searchdex = str(link['title']) deets = searchdex.split() meeting_date = deets[0].lower() for bogus in string.ascii_letters: if bogus in meeting_date: meeting_date = meeting_date.replace(bogus, "") matches = "" for term in set(term_match): for bogus in ['-', ' ']: if bogus in term: term = term.replace(bogus, "") matches = matches + "#" + term + ", " positive_results.append( (meetingid, "#" + city["short"] + " #" + city["hash_tag"] + " mtg on " + meeting_date + " about " + matches, url)) else: return [], [] return 
new_agendas, positive_results
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 03 15:29:35 2017

@author: d_floriello

enel scraper
"""

from robobrowser import RoboBrowser
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

browser = RoboBrowser()
login_url = 'https://smistaweb.enel.it/tpauth/JavaNotEnabled.html'
browser.open(login_url)
form = browser.get_form(id='form_id')

# Use a separate variable for the Selenium driver so the RoboBrowser
# instance is still available to submit the form below.
driver = webdriver.Firefox()
driver.get(login_url)

# Profilo: Z:\Lorenzo\Entrust Profile\GABRIELE BERTHOLET.epf
# Password: Axopower_123
form['profile'].value = "Z:\Lorenzo\Entrust Profile\GABRIELE BERTHOLET.epf"
form['password'].value = "Axopower_123"
browser.submit_form(form)

binary = FirefoxBinary("C:/Program Files (x86)/Mozilla Firefox/firefox.exe")
login_url = 'https://smistaweb.enel.it/tpauth/JavaNotEnabled.html'
for i, line in enumerate(f): movie_id = line.replace('\n', '') if i + 1 < start_line: continue if i + 1 > end_line: break retry_count = 0 while True: retry_count += 1 if retry_count >= 20: break print(i + 1, movie_id, end='\t') try: browser = RoboBrowser(history=True, parser='html.parser', timeout=10) browser.open('http://www.imdb.com/title/'+movie_id) poster_tag = str(browser.find(class_=re.compile(r'\bposter\b'))) browser.select('id.titleDetails') country = str(browser.find(href=re.compile(r'\?country')).text) print(country, end='\t') browser.select('id.titleStoryLine') genres = browser.find_all(href=re.compile(r'\?ref_=tt_stry_gnr')) for j in range(len(genres)): genres[j] = str(genres[j]).split('> ')[1].split('<')[0] genres = str(genres).replace('[', '').replace(']', '') genres = genres.replace(', ', ':')
import re

import config
from robobrowser import RoboBrowser

br = RoboBrowser()
br.open(
    "https://192.168.100.1:6082/php/uid.php?vsys=1&rule=2&url=http://chitkara.cloud/ChitkaraLocalCloud/home.php",
    verify=False)
form = br.get_form()
form['user'] = config.DATACOUP_USERNAME
form['passwd'] = config.DATACOUP_PASSWORD
br.submit_form(form)

src = str(br.parsed)
start = '<title>'
end = '</title>'
result = re.search('%s(.*)%s' % (start, end), src).group(1)
print(result)
class TinderApi(): def __init__(self, data_folder): self.get_headers = { 'app_version': '6.9.4', 'platform': 'ios', "User-agent": "Tinder/7.5.3 (iPhone; iOS 10.3.2; Scale/2.00)", "Accept": "application/json" } self.get_message_headers = { "accept": "application/json", "platform": "web", "tinder-version": "2.46.1" } self.headers = self.get_headers.copy() self.headers['content-type'] = "application/json" self.host = "https://api.gotinder.com" self.browser = RoboBrowser() self.data_folder = data_folder self.page_token = None def get_person_data(self, data): if "user" in data: person = data['user'] type = "recommendation" elif 'person' in data: person = data['person'] type = "match" else: person = data type = "person" return person, type def download_people_data_api(self, data_list, folder_path, photos, insta, messages, rename_images, amount, force_overwrite=False, log_to_widget=True, thread_update_signal=None): downloaded_data = [] if not isinstance(data_list, list): data_list = [data_list] total = len(data_list) if amount > 0: total = min(total, amount) for i in range(total): if thread_update_signal is not None: thread_update_signal.emit("Downloading " + str(folder_path) + ": " + str(i + 1) + "/" + str(total)) log.i("API", "Downloading " + str(i + 1) + "/" + str(total), log_to_widget) updated_data = self.download_person_data(data_list[i], folder_path, photos, insta, messages, rename_images, force_overwrite, log_to_widget, thread_update_signal) downloaded_data.append(updated_data) log.i("API", "Data Downloaded!", log_to_widget) return downloaded_data def download_person_data(self, data, base_folder, photos, insta, messages, rename_images, force_overwrite=False, log_to_widget=True, thread_update_signal=None): person_data, type = self.get_person_data(data) id = person_data['_id'] name = person_data['name'] path = base_folder + "/" + str(name) + "_" + str(id) + "/" person_data['path'] = str(os.path.abspath(path)) log.i( "API", "Downloading " + type + ": " + name + " " + id + " to: " + str(person_data['path']), log_to_widget) if os.path.exists(path): log.d("API", "Person path already exists: " + person_data['path'], log_to_widget) else: os.makedirs(path) log.d("API", "Person path created: " + person_data['path'], log_to_widget) person_data['local_path'] = str(os.path.abspath(path)) if insta and 'instagarm' in person_data: self.download_instagram_photos(person_data['instagram'], path, rename_images, force_overwrite, log_to_widget, thread_update_signal) if photos and 'photos' in person_data: self.download_photos(person_data['photos'], path, rename_images, force_overwrite, log_to_widget, thread_update_signal) if messages and 'match' in type: data['messages'] = self.download_messages(data, log_to_widget, thread_update_signal) data['AI_Dating_metadata'] = {} data['AI_Dating_metadata']['last_updated_datetime'] = str( datetime.now().strftime("%d-%b-%Y %H:%M:%S")) data['AI_Dating_metadata']['last_updated_timestamp'] = str( datetime.utcnow()) self.write_data_to_file(data, path, log_to_widget, thread_update_signal) return data def download_messages(self, match_data, log_to_widget=True, thread_update_signal=None): log.d("API", "Downloading match messages", log_to_widget) messages = self.get_messages(match_data, 100, None, log_to_widget, thread_update_signal) log.d( "API", "Downloaded messages: " + str(match_data["_id"] + ": " + str(messages)), log_to_widget) if messages is not None and 'data' in messages: return messages['data']['messages'] return [] def write_data_to_file(self, data, base_path, 
log_to_widget=True, thread_update_signal=None): log.d("API", "Data written to: " + str(base_path), log_to_widget) with open(base_path + 'data.yaml', 'w') as fp: yaml.dump(data, fp) def download_photos(self, photos_list, base_path, rename, force_overwrite=False, log_to_widget=True, thread_update_signal=None): for i in range(len(photos_list)): photo = photos_list[i] log.d("API", "Downloading full-size photos", log_to_widget) filename, skipped = self.download_file(photo['url'], base_path, rename, i, "", force_overwrite, log_to_widget) if filename is not None: photo['local_path'] = str(os.path.abspath(filename)) if 'processedFiles' in photo: processed_files = photo['processedFiles'] small_photo = processed_files[len(processed_files) - 1] log.d("API", "Downloading small photo", log_to_widget) filename, skipped = self.download_file( small_photo['url'], base_path + "/small/", rename, i, "_small", force_overwrite, log_to_widget=log_to_widget) if filename is not None: small_photo['local_path'] = str(os.path.abspath(filename)) def download_instagram_photos(self, instagram_data, base_path, rename, force_overwrite=False, log_to_widget=True, thread_update_signal=None): if 'photos' not in instagram_data.keys(): log.d("API", "NO instagram photos", log_to_widget) return log.d("API", "Downloading instagram photos", log_to_widget) for i in range(len(instagram_data['photos'])): filename, skipped = self.download_file( instagram_data['photos'][i]['image'], base_path + "instagram/", rename, i, "", force_overwrite, log_to_widget) if filename is not None: instagram_data['photos'][i]['local_path'] = str( os.path.abspath(filename)) def download_file(self, url, base_path, rename, index, postfix="", force_overwrite=False, log_to_widget=True, thread_update_signal=None): try: file_name = str(index) + postfix + ".jpg" if not rename: file_name = (url.split("/")[-1] + '.jpg').split('?')[0] full_filename = base_path + file_name if not os.path.exists(base_path): os.makedirs(base_path) log.d("API", "File path created: " + base_path, log_to_widget) if not os.path.exists(full_filename) or force_overwrite: self.browser.open(url) with open(full_filename, "wb") as image_file: image_file.write(self.browser.response.content) if force_overwrite: log.d("API", "Forcing Re-Download: " + full_filename, log_to_widget) else: log.i("API", "Downloading: " + full_filename, log_to_widget) return full_filename, False else: log.d( "API", "File already downloaded (force_overwrite=False): " + full_filename, log_to_widget) return full_filename, True except Exception as e: log.e("API", "EXCEPTION!: " + str(e), log_to_widget) return None, False def read_data(self, file_path, log_to_widget=True, thread_update_signal=None): try: with open(file_path, "r") as f: try: data = json.load(f) except Exception as e: try: data = yaml.safe_load(f) except Exception as e: return None log.i("API", "Data read from file: " + str(file_path), log_to_widget) return data except Exception as e: log.e( "API", "Exception reading data from file : " + str(os.path.abspath(file_path)) + ", Exc: " + str(e), log_to_widget) return None def reload_data_from_disk(self, folder_path, merged_filename, photos, insta, messages, force_overwrite=False, log_to_widget=True, thread_update_signal=None): list = [] try: for subdir, dirs, files in os.walk(folder_path): total_dirs = len(dirs) for i in range(len(dirs)): data_path = os.path.join(subdir, dirs[i]) + "/" data_file_path = data_path + "data.yaml" try: if os.path.exists(data_file_path): with open(data_file_path) as yf: data = 
yaml.safe_load(yf) person_data, type = self.get_person_data(data) person_data['path'] = os.path.abspath( data_path ) # Updating the data path just in case if photos and 'photos' in person_data: self.download_photos( person_data['photos'], data_path, True, force_overwrite, log_to_widget=log_to_widget) if insta and 'instagram' in person_data and 'photos' in person_data[ 'instagram']: self.download_instagram_photos( person_data['instagram'], data_path, True, force_overwrite, log_to_widget=log_to_widget) if messages and 'match' in type: data['messages'] = self.download_messages( data, log_to_widget) log.d("API", "Updating " + type + " data file", log_to_widget) data['AI_Dating_metadata'] = {} data['AI_Dating_metadata'][ 'last_updated_datetime'] = str( datetime.now().strftime( "%d-%b-%Y %H:%M:%S")) data['AI_Dating_metadata'][ 'last_updated_timestamp'] = str( datetime.utcnow()) self.write_data_to_file( data, data_path, log_to_widget, thread_update_signal) log.d("API", "Updated", log_to_widget) list.append(data) log.i( "API", str(i + 1) + "/" + str(total_dirs) + " - " + str(dirs[i]) + " " + person_data['name'], log_to_widget) else: log.i( "API", str(i + 1) + "/" + str(total_dirs) + " - " + str(dirs[i]) + " SKIPPED", log_to_widget) except Exception as e: log.e("API", "Exception reloading data " + str(e), log_to_widget) if thread_update_signal is not None: thread_update_signal.emit( str(folder_path) + "\t" + str(i + 1) + "/" + str(total_dirs)) break except Exception as e: log.e("API", "Exception in reloading from disk: " + str(e), log_to_widget) try: with open(merged_filename, "w+") as f: json.dump(list, f) except Exception as e: log.e( "API", "Could not save merged file " + merged_filename + ": " + str(e), log_to_widget) return list def get_fb_access_token(self, email, password, log_to_widget=True, thread_update_signal=None): token = fb_auth_token.get_fb_access_token(email, password) log.e("TOKEN", "Gotten token: " + str(token), log_to_widget) return token def get_fb_user_id(self, fb_token, log_to_widget=True, thread_update_signal=None): fb_id = fb_auth_token.get_fb_id(fb_token) log.e("FB_ID", "Gotten fb user id: " + str(fb_id), log_to_widget) return fb_id def get_auth_token(self, fb_auth_token, fb_user_id, log_to_widget=True, thread_update_signal=None): log.d("API", "get_auth_token: " + fb_auth_token + "\t" + fb_user_id, log_to_widget) if "error" in fb_auth_token: return {"error": "could not retrieve fb_auth_token"} if "error" in fb_user_id: return {"error": "could not retrieve fb_user_id"} url = self.host + '/v2/auth/login/facebook' req = requests.post(url, headers=self.headers, data=json.dumps({ 'token': fb_auth_token, 'facebook_id': fb_user_id })) try: log.d("API", "Sending JSON request", log_to_widget) json_request = req.json() log.i("API", "Token JSON status: " + str(json_request['meta']['status']), log_to_widget) tinder_auth_token = json_request["data"]["api_token"] self.headers.update({"X-Auth-Token": tinder_auth_token}) self.get_headers.update({"X-Auth-Token": tinder_auth_token}) self.get_message_headers.update( {"X-Auth-Token": tinder_auth_token}) log.s("API", "You have been successfully authorized!") return tinder_auth_token except Exception as e: log.e("API", "Error getting Tinder Token " + str(e), log_to_widget) return { "error": "Something went wrong. Sorry, but we could not authorize you." 
} def authverif(self, fb_access_token, fb_user_id, log_to_widget=True, thread_update_signal=None): res = self.get_auth_token(fb_access_token, fb_user_id) if "error" in res: return False return True def get_recommendations(self, log_to_widget=True, thread_update_signal=None): ''' Returns a list of users that you can swipe on ''' try: r = requests.get('https://api.gotinder.com/user/recs', headers=self.headers) json = r.json() log.i( "API", "get_recommendations: Got response. Status: " + str(json['status']) + ": " + utils.error_code_to_message[json['status']], log_to_widget) return json except requests.exceptions.RequestException as e: log.e("API", "Something went wrong with getting recomendations:" + str(e), log_to_widget) def get_updates(self, last_activity_date="", log_to_widget=True, thread_update_signal=None): ''' Returns all updates since the given activity date. The last activity date is defaulted at the beginning of time. Format for last_activity_date: "2017-07-09T10:28:13.392Z" ''' try: url = self.host + '/updates' r = requests.post(url, headers=self.headers, data=json.dumps( {"last_activity_date": last_activity_date})) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong with getting updates:" + str(e), log_to_widget) def get_self(self, log_to_widget=True, thread_update_signal=None): ''' Returns your own profile data ''' try: url = self.host + '/profile' r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not get your data:" + str(e), log_to_widget) def change_preferences(self, **kwargs): ''' ex: change_preferences(age_filter_min=30, gender=0) kwargs: a dictionary - whose keys become separate keyword arguments and the values become values of these arguments age_filter_min: 18..46 age_filter_max: 22..55 age_filter_min <= age_filter_max - 4 gender: 0 == seeking males, 1 == seeking females distance_filter: 1..100 discoverable: true | false {"photo_optimizer_enabled":false} ''' try: url = self.host + '/profile' r = requests.post(url, headers=self.headers, data=json.dumps(kwargs)) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not change your preferences:" + str(e), log_to_widget) def get_meta(self, log_to_widget=True, thread_update_signal=None): ''' Returns meta data on yourself. Including the following keys: ['globals', 'client_resources', 'versions', 'purchases', 'status', 'groups', 'products', 'rating', 'tutorials', 'travel', 'notifications', 'user'] ''' try: url = self.host + '/meta' r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not get your metadata:" + str(e), log_to_widget) def get_meta_v2(self, log_to_widget=True, thread_update_signal=None): ''' Returns meta data on yourself from V2 API. Including the following keys: ['account', 'client_resources', 'plus_screen', 'boost', 'fast_match', 'top_picks', 'paywall', 'merchandising', 'places', 'typing_indicator', 'profile', 'recs'] ''' try: url = self.host + '/v2/meta' r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. 
Could not get your metadata:" + str(e), log_to_widget) def update_location(self, lat, lon, log_to_widget=True, thread_update_signal=None): ''' Updates your location to the given float inputs Note: Requires a passport / Tinder Plus ''' try: url = self.host + '/passport/user/travel' r = requests.post(url, headers=self.headers, data=json.dumps({ "lat": lat, "lon": lon })) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not update your location:" + str(e), log_to_widget) def reset_real_location(self, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/passport/user/reset' r = requests.post(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not update your location:" + str(e), log_to_widget) def get_recs_v2(self, log_to_widget=True, thread_update_signal=None): ''' This works more consistently then the normal get_recommendations becuase it seeems to check new location ''' try: url = self.host + '/v2/recs/core?locale=en-US' r = requests.get(url, headers=self.headers) return r.json() except Exception as e: log.e("API", 'excepted', log_to_widget) def set_webprofileusername(self, username): ''' Sets the username for the webprofile: https://www.gotinder.com/@YOURUSERNAME ''' try: url = self.host + '/profile/username' r = requests.put(url, headers=self.headers, data=json.dumps({"username": username})) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not set webprofile username:"******"API", "Something went wrong. Could not delete webprofile username:"******"API", "Something went wrong. Could not get that person:" + str(e), log_to_widget) def get_messages(self, match_data=None, count=100, page_token=None, log_to_widget=True, thread_update_signal=None): # https://api.gotinder.com/v2/matches/5e762f611d443d01005c86975ea8db0a728e280100783a6e/messages?locale=en&count=100 # https://api.gotinder.com/v2/matches/5cae0e962d5de015002490965ea8db0a728e280100783a6e/messages?locale=en&count=100&page_token= try: path = '/v2/matches/%s/messages?locale=en&count=%s' % ( match_data["_id"], count) if page_token is not None: path += "&page_token=%s" % page_token r = requests.get(self.host + path, headers=self.headers) print("Messages url: " + str(self.host + path)) r_json = r.json() if 'next_page_token' in r_json['data']: new_data = self.get_messages(match_data, 100, r_json['data']['next_page_token'], log_to_widget, thread_update_signal) for message in new_data['data']['messages']: message[ 'page_token'] = page_token # This will be needed to get messages r_json['data']['messages'] = r_json['data'][ 'messages'] + new_data['data']['messages'] r_json["match_id"] = match_data["_id"] return r_json except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not get messages:" + str(e), log_to_widget) def send_msg(self, match_id, msg, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/user/matches/%s' % match_id r = requests.post(url, headers=self.headers, data=json.dumps({"message": msg})) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. 
Could not send your message:" + str(e), log_to_widget) def unmatch(self, match_id, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/user/matches/%s' % match_id r = requests.delete(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not unmatch person:" + str(e), log_to_widget) def superlike(self, person_id, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/like/%s/super' % person_id r = requests.post(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not superlike:" + str(e), log_to_widget) def like(self, person_id, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/like/%s' % person_id r = requests.get(url, headers=self.get_headers) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not like:" + str(e), log_to_widget) def dislike(self, person_id, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/pass/%s' % person_id r = requests.get(url, headers=self.get_headers) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not dislike:" + str(e), log_to_widget) def report(self, person_id, cause, explanation='', log_to_widget=True, thread_update_signal=None): ''' There are three options for cause: 0 : Other and requires an explanation 1 : Feels like spam and no explanation 4 : Inappropriate Photos and no explanation ''' try: url = self.host + '/report/%s' % person_id r = requests.post(url, headers=self.headers, data={ "cause": cause, "text": explanation }) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not report:" + str(e), log_to_widget) def match_info(self, match_id, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/matches/%s' % match_id r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not get your match info:" + str(e), log_to_widget) def get_matches(self, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/v2/matches' r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not get your match iself.page_tokennfo:" + str(e), log_to_widget) def all_matches(self, amount=60, message=0, page_token=None, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/v2/matches?locale=en&count=' + str( amount) + '&message=' + str(message) + '&is_tinder_u=false' log.d("API", "All matches page: " + str(page_token), log_to_widget) if page_token: url = url + '&page_token=' + page_token r = requests.get(url, headers=self.headers) json = r.json() log.d("API", "All matches keys " + str(json.keys()), log_to_widget) log.d("API", "All matches data " + str(json['data'].keys()), log_to_widget) log.d( "API", "All matches data matches " + str(len(json['data']['matches'])) + " " + str(json['data']['matches'][0].keys()), log_to_widget) log.d("API", "All matches meta " + str(json['meta'].keys()), log_to_widget) log.i( "API", "all_matches: Got response. 
Status: " + str(json['meta']['status']) + ": " + utils.error_code_to_message[json['meta']['status']], log_to_widget) if 'next_page_token' in json['data']: new_data = self.all_matches(amount, message, json['data']['next_page_token']) for match in new_data['data']['matches']: match[ 'page_token'] = page_token # This will be needed to get messages json['data']['matches'] = json['data']['matches'] + new_data[ 'data']['matches'] self.page_token = json['data']['next_page_token'] elif message <= 0: new_data = self.all_matches(amount, 1, None) json['data']['matches'] = json['data']['matches'] + new_data[ 'data']['matches'] log.i("API", "Total matches " + str(len(json['data']["matches"])), log_to_widget) return json except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not get your match info:" + str(e), log_to_widget) def fast_match_info(self, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/v2/fast-match/preview' r = requests.get(url, headers=self.headers) count = r.headers['fast-match-count'] # image is in the response but its in hex.. return count except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not get your fast-match count:" + str(e), log_to_widget) def trending_gifs(self, limit=3, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/giphy/trending?limit=%s' % limit r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. Could not get the trending gifs:" + str(e), log_to_widget) def gif_query(self, query, limit=3, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/giphy/search?limit=%s&query=%s' % (limit, query) r = requests.get(url, headers=self.headers) return r.json() except requests.exceptions.RequestException as e: log.e("API", "Something went wrong. Could not get your gifs:" + str(e), log_to_widget) # def see_friends(self, log_to_widget=True, thread_update_signal=None): # try: # url = self.host + '/group/friends' # r = requests.get(url, headers=self.headers) # return r.json()['results'] # except requests.exceptions.RequestException as e: # log.e("API", "Something went wrong. Could not get your Facebook friends:" +str(e), log_to_widget) """ FEATURES """ def get_match_info(self, log_to_widget=True, thread_update_signal=None): matches = self.get_updates()['matches'] now = datetime.utcnow() match_info = {} for match in matches[:len(matches)]: try: person = match['person'] person_id = person['_id'] # This ID for looking up person name = person['name'] id = match['id'] msg_count = match['message_count'] photos = self.get_photos(person) bio = "" if 'bio' in person.keys(): bio = person['bio'] gender = person['gender'] avg_succ_rate = self.get_avg_successRate(person) messages = match['messages'] age = self.calculate_age(match['person']['birth_date']) distance = self.get_person(person_id)['results']['distance_mi'] last_activity_date = match['last_activity_date'] match_info[person_id] = { "name": name, "match_id": id, # This ID for messaging "message_count": msg_count, "photos": photos, "bio": bio, "gender": gender, "avg_successRate": avg_succ_rate, "messages": messages, "age": age, "distance": distance, "last_activity_date": last_activity_date, } log.d("API", name + "_" + id) except Exception as ex: template = "An exception of type {0} occurred. 
Arguments:\n{1!r}" message = template.format(type(ex).__name__, ex.args) log.e("API", message) # continue log.i("API", "All data stored in variable: match_info") filename = self.data_folder + 'match_info.json' with open(filename, 'w') as fp: json.dump(match_info, fp) log.i("API", "All data dumped to file: " + str(os.path.abspath(filename))) return match_info def get_match_id_by_name(self, name, log_to_widget=True, thread_update_signal=None): ''' Returns a list_of_ids that have the same name as your input ''' global match_info list_of_ids = [] for match in match_info: if match_info[match]['name'] == name: list_of_ids.append(match_info[match]['match_id']) if len(list_of_ids) > 0: return list_of_ids return {"error": "No matches by name of %s" % name} def get_photos(self, person, log_to_widget=True, thread_update_signal=None): ''' Returns a list of photo urls ''' photos = person['photos'] photo_urls = [] for photo in photos: photo_urls.append(photo['url']) return photo_urls def calculate_age(self, birthday_string, log_to_widget=True, thread_update_signal=None): ''' Converts from '1997-03-25T22:49:41.151Z' to an integer (age) ''' birthyear = int(birthday_string[:4]) birthmonth = int(birthday_string[5:7]) birthday = int(birthday_string[8:10]) today = date.today() return today.year - birthyear - ((today.month, today.day) < (birthmonth, birthday)) def get_avg_successRate(self, person, log_to_widget=True, thread_update_signal=None): ''' SuccessRate is determined by Tinder for their 'Smart Photos' feature ''' photos = person['photos'] curr_avg = 0 for photo in photos: try: photo_successRate = photo['successRate'] curr_avg += photo_successRate except: return -1 return curr_avg / len(photos) def sort_by_value(self, sortType, log_to_widget=True, thread_update_signal=None): ''' Sort options are: 'age', 'message_count', 'gender' ''' global match_info return sorted(match_info.items(), key=lambda x: x[1][sortType], reverse=True) def see_friends(self, log_to_widget=True, thread_update_signal=None): try: url = self.host + '/group/friends' r = requests.get(url, headers=self.headers) return r.json()['results'] except requests.exceptions.RequestException as e: log.e( "API", "Something went wrong. 
Could not get your Facebook friends:" + str(e), log_to_widget) def see_friends_profiles(self, name=None, log_to_widget=True, thread_update_signal=None): friends = self.see_friends() if name == None: return friends else: result_dict = {} name = name.title() # upcases first character of each word for friend in friends: if name in friend["name"]: result_dict[friend["name"]] = friend if result_dict == {}: return "No friends by that name" return result_dict def convert_from_datetime(self, difference, log_to_widget=True, thread_update_signal=None): secs = difference.seconds days = difference.days m, s = divmod(secs, 60) h, m = divmod(m, 60) return ("%d days, %d hrs %02d min %02d sec" % (days, h, m, s)) def get_last_activity_date(self, now, ping_time, log_to_widget=True, thread_update_signal=None): ping_time = ping_time[:len(ping_time) - 5] datetime_ping = datetime.strptime(ping_time, '%Y-%m-%dT%H:%M:%S') difference = now - datetime_ping since = self.convert_from_datetime(difference) return since def how_long_has_it_been(self, log_to_widget=True, thread_update_signal=None): global match_info now = datetime.utcnow() times = {} for person in match_info: name = match_info[person]['name'] ping_time = match_info[person]['last_activity_date'] since = self.get_last_activity_date(now, ping_time) times[name] = since log.i("API", name, "----->", since) return times def pause(self, log_to_widget=True, thread_update_signal=None): ''' In order to appear as a real Tinder user using the app... When making many API calls, it is important to pause a... realistic amount of time between actions to not make Tinder... suspicious! ''' nap_length = 3 * random() log.d("API", 'Napping for %f seconds...' % nap_length) sleep(nap_length)
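# Hedged usage sketch for the TinderApi class above. It assumes the surrounding
# module's `fb_auth_token` helper works and that valid Facebook credentials are
# supplied; "FB_EMAIL", "FB_PASSWORD" and the "data/" folders are placeholders.
api = TinderApi(data_folder="data/")
fb_token = api.get_fb_access_token("FB_EMAIL", "FB_PASSWORD")
fb_id = api.get_fb_user_id(fb_token)
if api.authverif(fb_token, fb_id):
    matches = api.all_matches(amount=60)
    api.download_people_data_api(matches['data']['matches'], "data/matches",
                                 photos=True, insta=False, messages=True,
                                 rename_images=True, amount=10)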
def prepare_browser():
    br = RoboBrowser(parser='html.parser', history=True)
    return br
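# Minimal usage sketch for prepare_browser(); example.com is only a placeholder.
br = prepare_browser()
br.open("https://example.com")
print(br.find("title"))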
import re
from robobrowser import RoboBrowser
import pdb  # debugger
import pandas as spreadsheet
import requests
from bs4 import BeautifulSoup

##### -- GLOBALS --
page = requests.get('https://pathofexile.gamepedia.com/Prophecy#Upgrading_uniques')
soup = BeautifulSoup(page.content, "lxml")
browser = RoboBrowser()
browser.open("https://poe.trade/")
form = browser.get_form(id="search")
fated_uniques = [[], [], []]
prices = [[0 for x in range(57)], [0 for x in range(57)], [0 for x in range(57)]]
##### -- --

##### -- FATED SEARCH / GET NAMES
fated_page = soup.find(class_='mw-parser-output')
h2 = fated_page.find(id="Fated_Uniques")
fates = (h2.parent.next_sibling.next_sibling.next_sibling.next_sibling
         .next_sibling.next_sibling.next_sibling)
fates_rows = fates.tbody.find_all('tr')
del fates_rows[0]
fated_uniques[0] = [row.td.get_text() for row in fates_rows]
count = 0
for row in fates_rows:
    acts = row.td.next_sibling.em.find_all(class_="c-item-hoverbox__activator")
    fated_uniques[1].append(acts[0].get_text())
    fated_uniques[2].append(acts[1].get_text())
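# Hedged alternative to the .next_sibling chain above: find_next_sibling() skips
# the whitespace text nodes that make the chained version fragile. This assumes
# the fated-uniques table is the first <table> following the heading's parent.
fates_table = h2.parent.find_next_sibling("table")
if fates_table is not None:
    alt_rows = fates_table.find_all("tr")[1:]  # skip the header row
    print(len(alt_rows), "fated unique rows found")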
def main():
    # obtain tagtree page
    browser = RoboBrowser(history=True)
    browser.open(tagtree_url)

    # obtain list of items from html
    tagtree = browser.find("ul")

    # dump taglist to `tagtree-clickable.html`, clickable, pretty printed
    with open(tagtree_clickable_fname, 'w') as f:
        f.write(tagtree.prettify())

    # remove all <a href=> tags to facilitate parsing
    for match in tagtree.findAll('a'):
        match.replaceWithChildren()

    # dump taglist to `tagtree.html`, plain, no <a href>
    with open(tagtree_fname, 'w') as f:
        f.write(str(tagtree))

    # obtain xml data from file
    treexml = etree.parse(tagtree_fname)

    # connect to the database
    conn = sqlite3.connect(db_fname)
    c = conn.cursor()

    # create a dict from XHTML matching tags to their ancestors
    treedict = {}
    for element in treexml.getiterator():
        # ignore repeated elements
        if (str(element.text) != "None"):
            # find all ancestor elements
            ancestorlist = []
            for ancestor in element.iterancestors():
                if (str(ancestor.tag) != "ul"):  # ignore ul tags
                    ancestorlist.append(ancestor.text)
            # add element and ancestors to dictionary (if not empty)
            if len(ancestorlist) != 0:
                treedict[element.text] = ancestorlist

    # write treedict to formatted json
    with open(json_fname, 'w') as json_file:
        json_file.write(
            json.dumps(treedict, sort_keys=True, indent=2, separators=(',', ': ')))

    # insert into SQLite database
    for tagname, ancestors in treedict.items():
        # element[0] is always the first parent, so insert that
        c.execute("""UPDATE tags SET parent = ? WHERE tagname = ?""",
                  [ancestors[0], tagname])

        # find all images with this tagname and add additional tags for it
        image_query = c.execute(
            """SELECT imageid FROM taglink WHERE tagname = ?""", [tagname])
        for img_id in image_query:
            # link current images to all ancestor tags
            # OR IGNORE used to avoid duplicating taglinks
            for tag in ancestors:
                c.execute(
                    """INSERT OR IGNORE INTO taglink (imageid, tagname) VALUES (?,?)""",
                    [img_id[0], tag])

    # Save (commit) the database changes
    conn.commit()

    # close sqlite database once finished
    conn.close()
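# A hedged sketch of the SQLite schema main() appears to assume, inferred from
# the UPDATE/SELECT/INSERT statements above; the column types and the UNIQUE
# constraint are guesses, not taken from the original project.
SCHEMA = """
CREATE TABLE IF NOT EXISTS tags (
    tagname TEXT PRIMARY KEY,
    parent  TEXT
);
CREATE TABLE IF NOT EXISTS taglink (
    imageid INTEGER,
    tagname TEXT,
    UNIQUE (imageid, tagname)  -- lets INSERT OR IGNORE skip duplicate links
);
"""
# conn.executescript(SCHEMA) would create these tables before main() runs.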
import re

from robobrowser import RoboBrowser

browser = RoboBrowser()
browser.open("https://duckduckgo.com")

# Must find the proper id in the html
form = browser.get_form(id="search_form_homepage")
form["q"].value = "python"
browser.submit_form(form)

links = browser.get_links()
for link in links:
    print(link)
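# Hedged follow-up to the DuckDuckGo search above: keep only absolute result
# URLs. The filter is a guess at the result markup, not a verified selector.
for link in browser.get_links():
    href = link.get("href", "")
    if href.startswith("http"):
        print(href)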
#!/usr/bin/env python2
# -*- coding: iso-8859-15 -*-
import re

from robobrowser import RoboBrowser

browser = RoboBrowser()
browser.open("https://www.celio.com/")

# We are directed to celio's registration page
# and grab the signup form
signup_form = browser.get_form(class_="register")

# Fill in the field values
signup_form["user[titleCode]"].value = "mr"
signup_form["user[lastName]"].value = "Sym"
signup_form["user[firstName]"].value = "Secure Your Mail"
signup_form["user[birthdayDay]"].value = "21"
signup_form["user[birthdayMonth]"].value = "03"
signup_form["user[birthdayYear]"].value = "1989"
signup_form["user[mobilephoneCode]"].value = "612345678"
signup_form["user[defaultAdressline1]"].value = "08 rue secureyourmail"
signup_form["user[defaultAdress.postalCode]"].value = "85123"
signup_form["user[defaultAddress.town]"].value = "fraise"
signup_form["user[defaultAdress.country.isocode]"].value = "FR"
signup_form["user[email]"].value = "*****@*****.**"
signup_form["user[emailConfirmation]"].value = "*****@*****.**"
signup_form["user[password]"].value = "secureyourmail123"
signup_form["user[passwordConfirmation]"].value = "secureyourmail123"

# Submit the form
browser.submit_form(signup_form)
def main():
    browser = RoboBrowser(history=True)

    # phone / chat were presumably module-level dicts in the original source;
    # they are initialised here so the function is self-contained.
    phone = {}
    chat = {}

    ##
    # First, get the stuff off of the general page.
    #
    # There is the Presented, Handled, Abandoned
    browser.open("http://support.infusiontest.com/csdashboard/general.php")
    generalresults = BeautifulSoup(browser.response.content, 'html.parser')

    phone['presented'] = generalresults.find('div', {'id': 'presented'}).find('div', {'class': 'data'}).text.strip('\r\n ')
    phone['abandoned'] = generalresults.find('div', {'id': 'queued'}).find('div', {'class': 'data'}).text.strip('\r\n ')
    phone['handled'] = generalresults.findAll('div', {'id': 'handled'})[0].find('div', {'class': 'data'}).text.strip('\r\n ')
    phone['abandonedpct'] = generalresults.findAll('div', {'id': 'handled'})[1].find('div', {'class': 'data'}).text.strip('\r\n ')

    chat['presented'] = generalresults.find('div', {'id': 'diverted'}).find('div', {'class': 'data'}).text.strip('\r\n ')
    chat['abandoned'] = generalresults.find('div', {'id': 'sla'}).find('div', {'class': 'data'}).text.strip('\r\n ')
    chat['handled'] = generalresults.find('div', {'id': 'abandoned'}).find('div', {'class': 'data'}).text.strip('\r\n ')
    chat['abandonedpct'] = generalresults.find('div', {'id': 'asa'}).find('div', {'class': 'data'}).text.strip('\r\n ')

    browser.open("http://support.infusiontest.com/csdashboard/stats.php")
    statsresults = BeautifulSoup(browser.response.content, 'html.parser')
    phone['asa'] = statsresults.find('div', {'id': 'phone'}).find('div', {'class': 'data'}).text.strip('\r\n ')
    # Note: this reads the same 'phone' div; the original may have intended a 'chat' div here.
    chat['asa'] = statsresults.find('div', {'id': 'phone'}).find('div', {'class': 'data'}).text.strip('\r\n ')

    browser.open("https://docs.google.com/forms/d/1UvD_au-S6YaDGQ-u23Lth5l-JFrrpUiQT6yVFrj64BA/viewform")
    submitform = browser.get_form()
    submitform.fields['entry.1210668230'].value = '5pm'
    submitform.fields['entry.339838906'].value = phone['asa']
    submitform.fields['entry.335804195'].value = phone['presented']
    submitform.fields['entry.950389349'].value = phone['handled']
    submitform.fields['entry.125377286'].value = phone['abandoned']
    submitform.fields['entry.73700777'].value = phone['abandonedpct']
    submitform.fields['entry.941849183'].value = chat['asa']
    submitform.fields['entry.1083299158'].value = chat['presented']
    submitform.fields['entry.487211652'].value = chat['handled']
    submitform.fields['entry.1724578827'].value = chat['abandoned']
    submitform.fields['entry.1590181783'].value = chat['abandonedpct']
    browser.submit_form(submitform)
def get_medicare_email(request, mmg):
    """
    :param request:
    :param mmg:
    :return:
    """
    mmg_back = mmg
    mmg_back['status'] = "FAIL"
    mmg_back['mmg_email'] = ""

    PARSER = settings.BS_PARSER
    if not PARSER:
        if settings.DEBUG:
            print('Default Parser for BeautifulSoup:', 'lxml')
        PARSER = 'lxml'

    # Call the default page
    rb = RoboBrowser()
    # Set the default parser (lxml)
    # This avoids BeautifulSoup reporting an issue in the console/log
    rb.parser = PARSER

    target_page = "https://www.mymedicare.gov/myaccount.aspx"

    # Open the form to start the login
    rb.open(target_page)

    # Get the form content
    page = rb.parsed
    if settings.DEBUG:
        print("===============================")
        print("on page:", rb.url)
        print("MyAccount:", page)

    my_email = rb.find("div", attrs={"class": "ctl00_ctl00_ContentPlaceHolder1_ctl00_ctl00_ctl00_ctl01_UserInfo_pnlEmailSettings"})
    if settings.DEBUG:
        print("What email information:", my_email)

    for addr in my_email:
        mail_addr = my_email.find("div", attrs={"class": "myaccount-data"})
        mail_address = mail_addr.text
        mmg_back['mmg_email'] = mail_address

    if rb.url == target_page:
        mmg_back['url'] = rb.url
        mmg_back['status'] = "OK"

    if settings.DEBUG:
        print("Email:", mail_address)
        print("url:", rb.url)

    return mmg_back
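# Hedged usage sketch for get_medicare_email(); it assumes a Django settings
# module with BS_PARSER and DEBUG configured. The mmg dict keys mirror the ones
# the function reads and writes; the request argument is unused by the body.
mmg = {'status': '', 'mmg_email': '', 'url': ''}
result = get_medicare_email(request=None, mmg=mmg)
print(result['status'], result['mmg_email'])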