def scrape_latest(scrape_target: Text):
    # This header is added to cron requests by GAE, and stripped from any external
    # requests. See
    # https://cloud.google.com/appengine/docs/standard/python3/scheduling-jobs-with-cron-yaml#validating_cron_requests
    if not flask.request.headers.get('X-Appengine-Cron'):
        return 'Attempted to access internal endpoint.', status.HTTP_403_FORBIDDEN
    scrapers.scrape(scrape_target)
    return 'Successfully scraped latest %s.' % scrape_target, status.HTTP_200_OK
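# A minimal, self-contained sketch of how a handler like scrape_latest might be
# wired up as a Flask view for an App Engine cron job. The route path, the
# status constants, and the stand-in scrape() below are assumptions for
# illustration, not the project's actual wiring.
import flask

HTTP_200_OK = 200
HTTP_403_FORBIDDEN = 403

app = flask.Flask(__name__)


def scrape(scrape_target):
    """Stand-in for scrapers.scrape(); the real module is not shown in this snippet."""
    print('scraping', scrape_target)


@app.route('/cron/scrape/<scrape_target>')
def scrape_latest_view(scrape_target):
    # GAE adds X-Appengine-Cron to cron requests and strips it from external
    # traffic, so this check rejects anything that did not come from cron.
    if not flask.request.headers.get('X-Appengine-Cron'):
        return 'Attempted to access internal endpoint.', HTTP_403_FORBIDDEN
    scrape(scrape_target)
    return 'Successfully scraped latest %s.' % scrape_target, HTTP_200_OK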
def scrape_dep_sites():
    """Scrape two of the websites associated with deputy sessions, the CV and
    overview pages, and dump the HTMLs into folders."""
    scrape_counter = 0

    # load the URLs into a list
    db_name_user = '******'
    table_command = 'SELECT dep_name, dep_url FROM dep_name_url'
    urls = ftt.get_list_from_table(db_name_user, table_command)

    cv_fldr = "/media/radu/romparl/CD/htmls/politicians/cv_pages/"
    smry_fldr = "/media/radu/romparl/CD/htmls/politicians/summary_pages/"
    err_fldr = "/media/radu/romparl/CD/errors/"

    # go through the URLs and scrape; ignore the 2016 session, no CVs yet
    for u in urls:
        if '2016' not in u[1]:
            session = u[1][-4:]
            scrape_counter += 1
            print(u[0], session, scrape_counter)

            cv_u = u[1] + '&pag=0'
            cv_file = str(u[0]) + '-' + session + '-cv.txt'
            smry_u = u[1] + '&pag=1'
            smry_file = str(u[0]) + '-' + session + '-summary.txt'

            try:
                cv_uht = sc.scrape(cv_u)
                cv_dictio = {
                    'url': cv_uht[0],
                    'html': cv_uht[1],
                    'scrape_time_utc': cv_uht[2]
                }
                ftt.json_file_dump(cv_fldr, cv_file, cv_dictio)

                smry_uht = sc.scrape(smry_u)
                smry_dictio = {
                    'url': smry_uht[0],
                    'html': smry_uht[1],
                    'scrape_time_utc': smry_uht[2]
                }
                ftt.json_file_dump(smry_fldr, smry_file, smry_dictio)
            except Exception:
                # record the failure and carry on with the next deputy
                err_file = str(u[0]) + '.txt'
                tb = traceback.format_exc()
                error = {
                    'message': tb,
                    'error_index': scrape_counter,
                    'list_element': u[0]
                }
                ftt.json_file_dump(err_fldr, err_file, error)
                continue
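# Hypothetical shapes of the two helpers the function above relies on,
# sc.scrape() and ftt.json_file_dump(); the real modules are not shown in this
# snippet, so treat these as assumptions about their contracts.
import datetime
import json
import os

import requests


def scrape(url):
    # Assumed contract: return a (url, html, scrape_time_utc) tuple, matching
    # how cv_uht and smry_uht are unpacked above.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return url, response.text, datetime.datetime.utcnow().isoformat()


def json_file_dump(folder, file_name, dictio):
    # Assumed contract: serialise a dict to JSON inside the given folder.
    with open(os.path.join(folder, file_name), 'w', encoding='utf-8') as f:
        json.dump(dictio, f, ensure_ascii=False)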
    print('Got {0} out of {1} total'.format(str(contained_report),
                                            str(total_report)))
    raise dke.MissingPlayersException(
        'Total missing players at price point: ' + str(miss_len))


if __name__ == "__main__":
    args = get_args()
    uploader = nba_upload if args.l == 'NBA' else nfl_upload
    if not args.keep_pids:
        uploader.create_upload_file()
    if args.pids:
        player_map = uploader.map_pids(args.pids)
    if args.s == _YES:
        try:
            scrapers.scrape(args.source)
        except KeyError:
            raise dke.InvalidProjectionSourceException(
                'You must choose from the following data sources {}.'.format(
                    scrapers.scrape_dict.keys()))
    rosters, remove = [], []
    for x in range(0, int(args.i)):
        rosters.append(run(args.l, remove, args))
        if args.pids:
            uploader.update_upload_csv(player_map,
                                       rosters[x].sorted_players()[:])
        if None not in rosters:
            for roster in rosters:
                for player in roster.players:
                    remove.append(player.name)
    missing = list(filter(lambda x: x.marked != "Y" and x.cost > min_cost,
                          all_players))
    miss_len = len(missing)
    if e_raise < miss_len:
        print("Got {0} out of {1} total".format(str(contained_report),
                                                str(total_report)))
        raise dke.MissingPlayersException(
            "Total missing players at price point: " + str(miss_len))


if __name__ == "__main__":
    args = get_args()
    if not args.keep_pids:
        upload.create_upload_file()
    if args.pids:
        player_map = upload.map_pids(args.pids)
    if args.s == _YES:
        try:
            scrapers.scrape(args.source)
        except KeyError:
            raise dke.InvalidProjectionSourceException(
                "You must choose from the following data sources {}.".format(
                    scrapers.scrape_dict.keys()))
    rosters, remove = [], []
    for x in range(0, int(args.i)):
        rosters.append(run(cons.POSITIONS[args.l], args.l, remove, args))
        if args.pids:
            upload.update_upload_csv(player_map,
                                     rosters[x].sorted_players()[:])
        if None not in rosters:
            for roster in rosters:
                for player in roster.players:
                    remove.append(player.name)
        else:
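# A guess at the shape of scrapers.scrape implied by the KeyError handling
# above: a dict of scrape callables keyed by projection-source name. The
# source names here are placeholders, not the project's real ones.
def _scrape_source_a():
    pass  # fetch and store projections for hypothetical source A


def _scrape_source_b():
    pass  # fetch and store projections for hypothetical source B


scrape_dict = {
    'source_a': _scrape_source_a,
    'source_b': _scrape_source_b,
}


def scrape(source):
    # An unknown source name raises KeyError, which the caller converts into
    # dke.InvalidProjectionSourceException listing scrape_dict.keys().
    return scrape_dict[source]()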
import time

from utils import intput_output
from utils.http import get_page
from utils.validation import get_errors
import scrapers

urls = intput_output.import_urls()
output = []

for url in urls:
    html = get_page(url)
    result = scrapers.scrape(url, html)
    errors = get_errors(result)
    output.append({'url': url, 'errors': errors, 'result': result})
    print('Scraped {} with {} errors'.format(url, len(errors)))
    time.sleep(1)  # to avoid spamming

intput_output.export_results(output)
print('Scrape complete! See: ' + intput_output.OUTPUT_FILE)
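# Assumed shapes of the intput_output helpers used above (keeping the project's
# own module spelling); the real import/export formats may differ.
import json

OUTPUT_FILE = 'results.json'  # assumption: where export_results() writes


def import_urls(path='urls.txt'):
    # One URL per line, blank lines skipped.
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


def export_results(output, path=OUTPUT_FILE):
    # Persist the list of {'url', 'errors', 'result'} dicts built in the loop.
    with open(path, 'w') as f:
        json.dump(output, f, indent=2)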
def jobs_route_get():
    if session.get('signedIn') is None:
        return redirect('/login')

    # Search-engine site filters: exclude common job boards and prefer company
    # career pages plus a few applicant-tracking hosts.
    sites = ('-site:yelp.com/* -site:dice.com/* -site:indeed.com/* '
             '-site:monster.com/* -site:glassdoor.com/ -site:jobs.climber.com/* '
             '-site:ziprecruiter.com/* site:jobs.*.com/* OR site:careers.*.com/* '
             'OR site:*.com/careers/* OR site:*.com/jobs/* OR site:*.org/careers/* '
             'OR site:*.org/jobs/* OR site:jobs.lever.co/* '
             'OR site:boards.greenhouse.io/* OR site:linkedin.com/jobs/view/* ')

    results = pref_sql(
        "SELECT skills, exclusions, postype, field, explevel FROM user WHERE uid = '{0}'",
        (session['uid'], ))

    if len(results) and None not in results[0][:5]:  # we have something in the database
        skills = results[0][0].split(";")
        exclusions = results[0][1].split(";")
        postype = results[0][2].split(";")
        fields = results[0][3].split(";")
        experience_level = results[0][4].split(";")
        experience_level = experience_level[0]

        # Join the stored preferences into query fragments.
        all_fields = fields[0]
        for field in range(1, len(fields)):
            if fields[field] != '':
                all_fields += ' OR ' + fields[field]

        all_positions = postype[0]
        for pos in range(1, len(postype)):
            if postype[pos] != '':
                all_positions += ' ' + postype[pos]

        all_exclusions = exclusions[0]
        for exclusion in range(1, len(exclusions)):
            if exclusions[exclusion] != '':
                all_exclusions += ' -' + exclusions[exclusion]
        if experience_level in ('New Grad', 'Intern', 'Entry Level'):
            all_exclusions += " -senior -lead"

        all_skills = '"' + skills[0] + '"'
        for skill in range(1, len(skills)):
            if skills[skill] != '':
                all_skills += ' OR ' + '"' + skills[skill] + '"'

        query = (sites + all_positions + ' ' + experience_level + ' ' +
                 all_fields + ' ' + all_skills + ' -' + all_exclusions)
        jobs, summaries, num, full_desc = scraper.scrape(query)

        # For each scraped description, record which stored skills, position
        # types, and fields it mentions.
        match_skills = []
        match_pos = []
        match_fields = []
        for desc in full_desc:
            skilz = findAllMatches(skills, desc)
            positionz = findAllMatches(postype, desc)
            fieldz = findAllMatches(fields, desc)
            match_skills.append(skilz)
            match_pos.append(positionz)
            match_fields.append(fieldz)
        print(match_skills)

        return render_template("jobs.html",
                               jobs=jobs,
                               summaries=summaries,
                               num=num,
                               match_skills=match_skills,
                               match_pos=match_pos,
                               match_fields=match_fields,
                               signedIn=True)
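# A minimal sketch of the findAllMatches helper used above, assuming it returns
# the subset of preference terms that appear (case-insensitively) in a job
# description; the project's real implementation may differ.
def findAllMatches(terms, desc):
    desc_lower = desc.lower()
    return [term for term in terms if term and term.lower() in desc_lower]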