# socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9050)
# socket.socket = socks.socksocket
# session = requesocks.session()
# session.proxies = {'http': 'socks5://localhost:9050', 'https': 'socks5://localhost:9050'}

# Output directory named after this script file (e.g. "emich" for emich.py).
folder = os.path.basename(__file__).split(".")[0]
base_url = "http://www.emich.edu/directory/?page={}&first-name=First+Name&last-name={}&student=on&x=0&y=0"
# Browser-like User-Agent so the directory does not reject scripted requests.
HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36"}


def get_name(name):
    """Scrape every directory page for last name `name` into <folder>/<name>.csv.

    Skips names whose CSV already exists. Returns the list of row dicts
    ({"Name": ..., "Email": ...}), or None when skipped.
    """
    filename = "{}/{}.csv".format(folder, name)
    if not os.path.exists(filename):
        print(name)
        # BUG FIX: the original passed headers= as a keyword argument to
        # str.format(), where extra keywords are silently ignored — the
        # User-Agent was never sent. Forward it to pq()'s fetch instead.
        url = base_url.format(1, name)
        # Page 1's second ".pagination" element reads like "Results of M";
        # ceil(M / 10) gives the page count (10 results per page).
        num_pages = int(ceil(int(pq(url, headers=HEADERS)(".pagination").eq(1).text().split(" ")[2]) / 10.0))

        def get_page(page, name=""):
            # One row dict per result <tr>: skip the header row (:gt(0)) and
            # the expandable ".details" rows.
            return [
                {
                    "Name": "{}, {}".format(
                        pq(x).find("td").eq(1).text().encode("utf-8"),
                        pq(x).find("td").eq(2).text().encode("utf-8"),
                    ),
                    "Email": pq(x).find("a[href^=mailto]").text().encode("utf-8"),
                }
                for x in pq(base_url.format(page, name), headers=HEADERS)("table tr:not(.details):gt(0)")
            ]

        # max(1, ...) guards num_pages == 0 — ThreadPoolExecutor raises
        # ValueError for max_workers <= 0 when a name has no results.
        with concurrent.futures.ThreadPoolExecutor(max_workers=max(1, num_pages)) as thread:
            dat = list(itertools.chain(*list(thread.map(partial(get_page, name=name), range(1, num_pages + 1)))))
        pd.DataFrame(data=dat, columns=["Name", "Email"]).to_csv(filename, index=False)
        return dat
    else:
        print("{} Skipped".format(name))


names = get_last_names(40000)
# Fan out 50 names at a time; the short sleep throttles request bursts.
with concurrent.futures.ThreadPoolExecutor(max_workers=50) as thread:
    for i in range(0, 40000, 50):
        thread.map(get_name, names[i:i + 50])
        time.sleep(.1)
compile_csvs(folder)
linecache.checkcache(filename) line = linecache.getline(filename, lineno, f.f_globals) print 'EXCEPTION IN ({}, LINE {} "{}"): {}'.format(filename, lineno, line.strip(), exc_obj) folder = os.path.basename(__file__).split(".")[0] base_url = "http://www.slu.edu/peoplefinder/json/json_index.php" def get_name(name): filename = "{}/{}.csv" if not(os.path.exists(filename)): print name data = {"q": name} def get_data(): dat = json.loads(pq(base_url,data=data,method="post")("p").text()).get("resultSet").get("result") return [{"Name":x.get("fullname")[0],"Email":x.get("email")[0] if x.get("email") else None} for x in dat if x.get("affiliation")[0].lower() == "student"] try: dat = get_data() pd.DataFrame(data=dat,columns=def_col).to_csv(filename.format(folder,name),index=False) except: return get_name(name) else: print "{} Skipped".format(name) with concurrent.futures.ThreadPoolExecutor(max_workers=50) as thread: thread.map(get_name,get_last_names(2500)) print compile_csvs(folder) print("--- %s seconds ---" % (time.time() - start_time))
# NOTE(review): this is a mid-function fragment — it opens inside the body
# of a `get_data(browser=..., major=..., first_name=..., layer=...)` call
# (visible from its own recursive call below), and the dangling `else:`
# pairs with an `if` that lies before this excerpt. Because the enclosing
# `def` and that `if` are out of view, the statement nesting here cannot be
# reconstructed safely; the code is left byte-identical.
# Observable flow: parse the selenium element's outerHTML with pyquery; if
# ".partialResults" is present at layer 0, recurse once per single-character
# first-name prefix to page past the partial-results cap; write the names to
# CSV via pandas. The top-level try scrapes every major, deduplicates with
# compile_csvs("clemson", duplicate="Name"), and on any exception prints the
# traceback location (sys.exc_info) before sleeping and retrying the
# close/compile tail.
jq = pq(x.get_attribute("outerHTML")) over = len(jq(".partialResults")) dat = [{"Name": pq(x).text().strip()} for x in jq(".resultsList li a p.name")] if over and layer == 0: dat = itertools.chain( *[ get_data(browser=browser, major=major, first_name=x, layer=layer + 1) for x in get_character_permutations(num_characters=1) ] ) pd.DataFrame(data=list(dat), columns=["Name"]).to_csv(filename, index=False) return dat else: print "Skipped {} {}".format(major, first_name) return [] try: dat = list(itertools.chain(*[get_data(major=i) for i in majors])) browser.close() compile_csvs("clemson", duplicate="Name") print len(pd.read_csv("clemson.csv")) except Exception as e: exc_type, exc_obj, exc_tb = sys.exc_info() fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1] print (exc_type, fname, exc_tb.tb_lineno) print e time.sleep(10) browser.close() compile_csvs() print "{} students found".format(len(pd.read_csv("clemson.csv")))