def main():
    # Crawl target: the list of common English names on en-name.xiao84.com
    url = "http://en-name.xiao84.com/changjian/"
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    ip_list = get_proxies(1, need_check=True)
    page = 1
    data = []
    while True:
        try:
            # Per-page URL
            page_url = 'http://en-name.xiao84.com/changjian/p{}.html'.format(page)
            proxy = dict2proxy2(random.choice(ip_list))
            soup = get_bs(page_url, headers, proxy)
            names = soup.find_all('td', class_='en-col')
            for item in names:
                enname = item.find('a', class_='enname').get_text()
                print(enname)
                data.append(enname)
            page += 1
        except Exception:
            # Stop once a page fails to load or parse (usually past the last page)
            break
    with open('eng_names1.txt', 'w', encoding='utf-8') as fout:
        for name in data:
            fout.write(name + '\n')

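# ---------------------------------------------------------------------------
# The main() above relies on helpers that are not shown in this snippet
# (get_bs, dict2proxy2). The sketch below is only an assumption of what they
# might look like: a requests + BeautifulSoup fetch and a proxy-dict builder.
# Names, signatures, and the "ip:port" entry format are hypothetical, not the
# project's actual code.
import requests
from bs4 import BeautifulSoup


def get_bs(url, headers, proxy=None):
    # Fetch a page (optionally through a proxy) and return a parsed soup.
    resp = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def dict2proxy2(ip_entry):
    # Turn an "ip:port" string into the proxies mapping requests expects.
    return {'http': 'http://' + ip_entry, 'https': 'http://' + ip_entry}
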
def start_crawler(max_fail_count=10):
    agents_list = agents.get_user_agents()
    target_urls = targeturls.get_urls()
    fail_count = 1
    try_count = 1
    with open('crawler_log.txt', 'w') as file:
        while fail_count <= max_fail_count:
            proxies_list = proxies.get_proxies()
            while True:
                try:
                    user_agent = random.choice(agents_list)
                    target_url = random.choice(target_urls)
                    user_proxy = 'http://' + random.choice(proxies_list)
                    print('try count = {0}, proxy = {1}, url = {2}'.format(try_count, user_proxy, target_url))
                    file.write('try count = {0}, fail count = {1}, proxy = {2}, url = {3}, agent = {4}\n'.format(
                        try_count, fail_count, user_proxy, target_url, user_agent))
                    html_response = requests.get(target_url,
                                                 proxies={'http': user_proxy, 'https': user_proxy},
                                                 headers={'User-Agent': user_agent, 'Connection': 'keep-alive'})
                    try_count += 1
                    with open("target.html", "wb") as page:
                        page.write(html_response.content)
                    time.sleep(random.uniform(1.1, 3.3))
                except Exception:
                    # On failure, drop out and fetch a fresh proxy list
                    print('failed!')
                    break
            fail_count += 1

def getRandomProxy():
    proxy = random.choice(proxies.get_proxies())
    if proxy:
        proxy = {'http': 'http://%s:%s' % (proxy['IP'], proxy['PORT'])}
        return proxy
    else:
        return False

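# A minimal usage sketch for getRandomProxy(), assuming the surrounding module
# already imports requests. The function name and URL parameter are
# illustrative placeholders, not part of the original code.
def fetch_with_random_proxy(url):
    proxy = getRandomProxy()
    if not proxy:
        return None
    # proxy is e.g. {'http': 'http://1.2.3.4:8080'}
    return requests.get(url, proxies=proxy, timeout=10)
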
def start_crawler(max_fail_count=10):
    agents_list = agents.get_user_agents()
    target_urls = targeturls.get_urls()
    fail_count = 1
    try_count = 1
    with open('crawler_log.txt', 'w') as file:
        while fail_count <= max_fail_count:
            proxies_list = proxies.get_proxies()
            while True:
                try:
                    user_agent = random.choice(agents_list)
                    user_proxy = random.choice(proxies_list)
                    target_url = random.choice(target_urls)
                    print('try count = {0}, proxy = {1}, url = {2}'.format(try_count, user_proxy, target_url))
                    file.write('try count = {0}, fail count = {1}, proxy = {2}, url = {3}, agent = {4}\n'.format(
                        try_count, fail_count, user_proxy, target_url, user_agent))
                    html_response = requests.get(target_url,
                                                 proxies={'http': user_proxy},
                                                 headers={'User-Agent': user_agent})
                    try_count += 1
                    with open("target.html", "wb") as page:
                        page.write(html_response.content)
                    time.sleep(random.uniform(1.1, 3.3))
                except Exception:
                    # On failure, drop out and fetch a fresh proxy list
                    break
            fail_count += 1

def save_contest(contestId):
    global proxy_pool
    global thread_count, MAX_PROBLEM_SUBS, BASE_DIR, URL_CONTEST_SUBMISSION
    print('Thread ' + str(contestId) + ' started')
    thread_start_time = time.time()
    submissions = submissions_from_contest(contestId)
    print('Contest: ' + str(contestId) + ' has ' + str(len(submissions)) + ' submissions')
    sources, count, subs_checked = {}, 0, 0
    submissions = sorted(submissions, key=lambda k: k['id'])
    using_proxy = next(proxy_pool)
    for sub in submissions:
        subs_checked = subs_checked + 1
        if sub['verdict'] != 'OK':
            continue
        count = count + 1
        if count > MAX_PROBLEM_SUBS:
            print('Thread ' + str(contestId) + ' reached maximum problem submissions')
            break
        percentage = subs_checked / len(submissions) * 100
        prob_id = sub['problem']['index']
        sub_id = str(sub['id'])
        if prob_id not in sources:
            sources[prob_id] = {}
        code, ext = 'nothing', 'no-ext'
        while True:
            try:
                code, ext = submission_code(sub, using_proxy)
                break
            except Exception as e:
                print('Thread ' + str(contestId) + ' has Exception, waiting')
                print(e)
                time.sleep(4)
                using_proxy = next(proxy_pool)
        sources[prob_id][sub_id] = {'code': code, 'ext': ext}
        if count % 10 == 0:
            save_progress(sources, contestId)
            print("\nSaved contest {}: {}/{} {:.2f}% saved: {} time: "
                  .format(contestId, subs_checked, len(submissions), percentage, count) +
                  time.strftime("%H:%M:%S", time.gmtime(time.time() - thread_start_time)) + "\n")
        if count % 500 == 0:
            proxy_pool = cycle(get_proxies(NUM_PROXIES))
    thread_count = thread_count - 1
    print('Thread ' + str(contestId) + ' finished')

def redirect(platform, arg):
    proxies_ = proxies.get_proxies()
    try:
        if platform == 'douyin':
            real_url = douyin.get_real_url(arg, proxies_)
        elif platform == 'kuaishou':
            real_url = kuaishou.get_real_url(arg, proxies_)
        else:
            real_url = '未找到文件'  # "file not found" - unsupported platform
    except Exception as e:
        real_url = e
    return real_url

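# Hedged usage sketch for redirect(): the platform string selects the resolver
# module and arg is presumably the room/user id that module expects. The ids
# below are placeholders, not real rooms.
# real_url = redirect('douyin', '123456')    # resolve via douyin.get_real_url
# real_url = redirect('kuaishou', 'abcdef')  # resolve via kuaishou.get_real_url
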
def vote():
    emails = get_emails()
    for item in emails:
        print(item)
        proxies = get_proxies()
        # Pick one of the first five proxies at random (assumes the list has at least five entries)
        proxy_index = random.randrange(0, 5)
        proxy = proxies[proxy_index]
        print(proxy)
        full_name = item[0]
        email = item[1]
        print(full_name)
        print(email)
        proxy_config = Proxy()
        proxy_config.proxy_type = ProxyType.MANUAL
        proxy_config.http_proxy = proxy
        proxy_config.ssl_proxy = proxy
        capabilities = webdriver.DesiredCapabilities.CHROME
        proxy_config.add_to_capabilities(capabilities)
        chrome_options = Options()
        chrome_options.headless = True
        driver = webdriver.Chrome("./chromedriver",
                                  desired_capabilities=capabilities,
                                  options=chrome_options)
        driver.get("")  # FILL IN WITH WEBSITE
        time.sleep(10)  # give page time to load
        form_full_name = driver.find_element_by_xpath('')  # FILL IN WITH XPATH
        form_full_name.send_keys(full_name)
        form_radio_button = driver.find_element_by_xpath('')  # FILL IN WITH XPATH
        form_radio_button.click()
        form_video_dropdown = driver.find_element_by_xpath('')  # FILL IN WITH XPATH
        form_video_dropdown.click()
        form_video_choice = driver.find_element_by_xpath('')  # FILL IN WITH XPATH
        form_video_choice.click()
        form_email_address = driver.find_element_by_xpath('')  # FILL IN WITH XPATH
        form_email_address.send_keys(email)
        form_submit_button = driver.find_element_by_xpath('')  # FILL IN WITH XPATH
        form_submit_button.click()
        time.sleep(10)  # give time to submit
        get_confirmation_text = driver.find_element_by_css_selector('')  # FILL IN WITH CSS SELECTOR
        if get_confirmation_text.text == "":  # FILL IN WITH CONFIRMATION MESSAGE
            print("Successfully voted!")
        else:
            print("Did not vote successfully.")
        driver.quit()
        time.sleep(300)  # wait 5 mins

def main():
    # Crawl target: the public security bureau (公安) directory on lawtime.cn
    url = "http://www.lawtime.cn/gongan/"
    # Request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    ip_list = get_proxies(1)
    # Output columns: 名称 = name, 电话 = phone, 地址 = address
    data = {'名称': [], '电话': [], '地址': []}
    soup = get_bs(url, headers)
    provinces = soup.find('p').find_all('a')
    urls = {}
    for province in provinces:
        key = province.get_text()
        urls[key] = province['href']
    city_urls = []
    try:
        # Iterate over the province URLs
        for url in urls:
            url = urls[url]
            proxy = dict2proxy(random.choice(ip_list))
            soup = get_bs(url, headers, proxy)
            cities = soup.find_all('div', class_='mcol')
            for city in cities:
                city_url = city.find('span', class_='right')
                if city_url is not None:
                    city_url = city_url.a['href']
                else:
                    city_url = re.sub('province', 'city', url)
                page = 1
                while True:
                    try:
                        tmp = city_url.split('/')[:-1]
                        tmp.append('p{}'.format(page))
                        tmp.append(city_url.split('/')[-1])
                        url = '/'.join(tmp)
                        # url='http://www.lawtime.cn/gongan/city/p22/taiyuan'
                        print(url)
                        soup = get_bs(url, headers)
                        items = soup.find_all('dl', class_="newline")
                        if len(items) > 0:
                            for item in items:
                                name = item.find('dt', class_='mccname').a.get_text()
                                tmp = item.find_all('dd')
                                data['名称'].append(name)
                                data['电话'].append(tmp[1].get_text())
                                data['地址'].append(tmp[2].get_text())
                        else:
                            item = soup.find('div', class_="mcol")
                            data['名称'].append(item.get_text())
                            data['电话'].append('')
                            data['地址'].append('')
                            break
                        time.sleep(0.5)
                        page += 1
                    except Exception:
                        print('done, last page: {}'.format(page))
                        break
                time.sleep(1)
        data1 = pd.DataFrame(data)
        data1.to_excel('gongan.xlsx', index=False)
    except Exception:
        # Save whatever was collected before the failure
        data = pd.DataFrame(data)
        data.to_excel('gongan.xlsx', index=False)

# https://medium.com/mitre-attack/att-ck-content-available-in-stix-2-0-via-public-taxii-2-0-server-317e5c41e214
#
# Requires:
#   pip install stix2
#   pip install taxii2-client
#
# Debugging tool to list out all the items
# Example:
#   python list
from stix2 import TAXIICollectionSource, Filter
from taxii2client import Server
from proxies import get_proxies

server = Server("https://cti-taxii.mitre.org/taxii/", proxies=get_proxies())
api_root = server.api_roots[0]

# Three collections: Enterprise ATT&CK, PRE-ATT&CK, MOBILE
for collection in api_root.collections:
    print(collection.title + ": " + collection.id)

# collection = Collection("https://cti-taxii.mitre.org/stix/collections/95ecc380-afe9-11e4-9b6c-751b66dd541e/")

# Supply the collection to TAXIICollectionSource (uses the last collection from the loop above)
tc_source = TAXIICollectionSource(collection)

# Create filters to retrieve content from Enterprise ATT&CK
filter_objs = {
# -*- coding: utf-8 -*-
# pragma pylint: disable=unused-argument, no-self-use
# (c) Copyright IBM Corp. 2010, 2020. All Rights Reserved.
"""
Debugging tool.

Example:
    get_technique_info.py AppleScript
"""
import sys
from fn_mitre_integration.lib.mitre_attack import MitreAttack
import json
from proxies import get_proxies

if len(sys.argv) < 2:
    print("Usage get_technique_info.py tech_id <mitigation>")
    sys.exit()

tech_id = sys.argv[1]
mitigation_only = False
if len(sys.argv) == 3:
    mitigation_only = True

if mitigation_only:
    mitigations = MitreAttack(opts=None, function_opts=get_proxies()).get_tech_mitigation(tech_id)
    print(str(mitigations))
else:
    tech = MitreAttack().get_tech(tech_id)
    print(str(tech))

    for chunk in chunks:
        p = mp.Process(target=check_proxy, args=(chunk, valid_proxies_list))
        prcs.append(p)
        p.start()
    for p in prcs:
        p.join()
    return valid_proxies_list


#%%
if __name__ == '__main__':
    WORK_DIR = '/home/parser/'
    pl = get_proxies(WORK_DIR, 200)
    pl = check_proxies(pl)
    try:
        with open(WORK_DIR + 'proxieslist.txt', 'r') as prx:
            proxies_list = prx.read().split('\n')
    except Exception:
        proxies_list = None
    if proxies_list:
        pl.extend(proxies_list)
        pl = list(set(pl))

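# check_proxy is not shown in this fragment; the multiprocessing loop above
# assumes a worker with the signature check_proxy(chunk, valid_proxies_list).
# A rough sketch under that assumption (valid_proxies_list would need to be a
# multiprocessing.Manager().list() so results are shared across processes):
import requests


def check_proxy(chunk, valid_proxies_list):
    # Try each proxy in the chunk against a lightweight endpoint and keep the
    # ones that respond within the timeout.
    for proxy in chunk:
        try:
            requests.get('http://httpbin.org/ip',
                         proxies={'http': 'http://' + proxy,
                                  'https': 'http://' + proxy},
                         timeout=5)
            valid_proxies_list.append(proxy)
        except requests.RequestException:
            continue
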
def download_text(path):
    """
    This function receives a list of urls. It downloads the content as plain text,
    separates the different paragraphs and sends them to preprocessing.

    values: Numpy array with the cosine similarity of each text fragment - query
    indexes: Positions of the numpy array where cosine similarity surpasses a threshold
    clean_text: Plain text with the relevant text
    """
    proxies = get_proxies()
    proxy_pool = cycle(proxies)
    # First step is to load the file with the urls and the word
    doc_path = path + "/url_list.txt"
    url_doc = open(doc_path, "r")
    for line in url_doc:
        content = line.split(" ", 1)
        url = content[0]
        word = content[1]
        clean_plain_text = ''
        clean_text = ''
        final = []
        if check_resource_retrieved_before(url, path):
            print("Resource " + url + " has already been searched, skipping...")
            print(url)
            print(word)
        else:
            for i in range(1, 11):
                # Get a proxy from the pool
                # proxy = next(proxy_pool)
                print("Request #%d" % i)
                try:
                    time.sleep(30)
                    response = requests.get(url)
                    # response = requests.get(url, proxies={"http": proxy, "https": proxy})
                    print('Response HTTP Status Code: ', response.status_code)
                    # print('Response HTTP Response Body: ', response.content)
                    # print(response.json())
                    # Here we must make a clear separation between a PDF and normal HTML.
                    # If it is a PDF we download it to a temporary file and then process
                    # that temporary PDF like normal text.
                    texts = []  # so the decode check below also works for the PDF branch
                    if url.endswith("pdf"):
                        with open('temporal.pdf', 'wb') as f:
                            f.write(response.content)
                        clean_plain_text = parser.from_file('temporal.pdf')
                    else:
                        data = response.text
                        print("We have stored the content in data")
                        soup = BeautifulSoup(data, 'html.parser')
                        texts = soup.find_all(text=True)
                        for t in texts:
                            if t.parent.name not in blacklist:
                                # We have obtained the html (except the blacklist) as plain text
                                clean_plain_text += '{} '.format(t)
                    print("We have stored the lines")
                    lines = clean_plain_text.split('\n \n')
                    # It would be better to store this in another object, since it is
                    # risky to modify the same object that is being iterated over.
                    for line in lines[:]:
                        if len(line.strip()) < 3 or '^' in line or len(line.split()) < 2:
                            lines.remove(line)
                    print("We have stripped the lines")
                    values = apply_tfidf(lines, word.rstrip("\n"))
                    if len(texts) == 0:
                        print("This text was not correctly downloaded due to an error while decoding")
                    if max(values) < max_value:
                        # If the max cosine similarity is lower than the acceptable max_value,
                        # we may be working with text in another language, so we skip this text.
                        if max(values) == 0:
                            print("No matching was found")
                        else:
                            print("Cosine similarity matrix values may be too small to be valuable text.")
                        print("Proceeding to ignore this document...\n")
                    else:
                        indexes = np.where(values > threshold * max(values))
                        print("Threshold employed for this document is " + str(threshold * max(values)))
                        for i in range(len(indexes[0])):
                            text = remove_punctuation(lines[indexes[0][i]])
                            text_ = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore')
                            clean_text += text_.decode("utf-8") + "\n\n"
                        store_text(path, url, clean_text, word)
                    break
                except Exception:
                    print("Web page " + url + " could not be scraped")
                    # Most free proxies often get connection errors; you would have to retry
                    # the entire request with another proxy. We simply skip retries here.
                    print("Skipping: connection error")
                    exc_type, exc_obj, exc_tb = sys.exc_info()
                    print("Error in line " + str(exc_tb.tb_lineno) + " " + str(exc_type))
                    traceback.print_exc()

import urllib.request, urllib.parse
import json, threading
import os, time, datetime
from itertools import cycle

from submission_getter import submission_code
from proxies import get_proxies, get_proxies_manually

MAX_PROBLEM_SUBS = 1000 * 1000 * 10
MAX_THREADS = 5
NUM_PROXIES = 50
BASE_DIR = 'Surse'
URL_CONTEST_SUBMISSION = 'http://codeforces.com/api/contest.status?contestId={contestId}'

thread_count = 0
proxy_pool = cycle(get_proxies(NUM_PROXIES))


def submissions_from_contest(contestId):
    url = URL_CONTEST_SUBMISSION.format(contestId=contestId)
    # url = urllib.parse.quote(url)
    submissions = json.loads(urllib.request.urlopen(url).read())
    return submissions['result']


def save_progress(sources, contestId):
    for prob_id, data in sources.items():
        # Write each problem's submissions to its own JSON file and close the handle
        with open(BASE_DIR + '/' + str(contestId) + prob_id + '.txt', 'w') as file_obj:
            json.dump(data, file_obj, indent=2)

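# get_proxies(NUM_PROXIES) comes from the local proxies module and is not shown
# here. A common pattern for such a helper is to scrape a free proxy listing;
# the sketch below assumes that pattern (the URL, table layout and "ip:port"
# return format are assumptions, not the project's actual implementation).
import requests
from bs4 import BeautifulSoup


def get_proxies_sketch(limit):
    # Scrape up to `limit` "ip:port" strings from a free proxy list page.
    resp = requests.get('https://free-proxy-list.net/', timeout=10)
    soup = BeautifulSoup(resp.text, 'html.parser')
    proxies = []
    for row in soup.select('table tbody tr')[:limit]:
        cells = row.find_all('td')
        if len(cells) >= 2:
            proxies.append(cells[0].text + ':' + cells[1].text)
    return proxies
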
import sys
from fn_mitre_integration.lib.mitre_attack import MitreAttack
import json
from proxies import get_proxies

if len(sys.argv) < 2:
    print("Usage lookup.py item_name type_name[optional] collection_name[optional]")
    sys.exit()

item_name = sys.argv[1]
type_name = None
collection_name = None

if len(sys.argv) > 2:
    type_name = sys.argv[2]

if len(sys.argv) > 3:
    collection_name = sys.argv[3]

attack = MitreAttack(opts=None, function_opts=get_proxies())
attack.connect_server()

item = attack.lookup_item(item_name)
item_dict = json.loads(item.serialize())
print(item_dict)

    for chunk in chunks:
        p = mp.Process(target=check_proxy, args=(chunk, valid_proxies_list))
        prcs.append(p)
        p.start()
    for p in prcs:
        p.join()
    return valid_proxies_list


#%%
if __name__ == '__main__':
    WORK_DIR = 'D:'
    pl = get_proxies(WORK_DIR, 1000)
    pl = check_proxies(pl)
    try:
        with open(WORK_DIR + 'new_proxieslist.txt', 'r') as prx:
            proxies_list = prx.read().split('\n')
    except Exception:
        proxies_list = None
    if proxies_list:
        pl.extend(proxies_list)
        pl = list(set(pl))

def scraper():
    # Variables for each scrape
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }
    max_results_per_area = 100
    areas = [
        "Madrid+provincia", "Las+Palmas+provincia", "Galicia",
        "Barcelona+provincia", "Cádiz+provincia"
    ]
    columns = [
        "Area", "Job Title", "Company Name", "Location", "Summary", "Salary"
    ]
    df = pd.DataFrame(columns=columns)

    # Scraping loop:
    proxies = get_proxies()
    proxy_pool = cycle(proxies)
    for area in areas:
        for start in range(0, max_results_per_area, 10):
            print("*")
            proxy = next(proxy_pool)
            url = ("http://es.indeed.com/jobs?q=Junior+developer&l=" + str(area) +
                   "&jt=fulltime&lang=en&start=" + str(start))
            try:
                # requests expects the proxy mapping to be keyed by scheme ("http"/"https")
                page = requests.get(url,
                                    proxies={"http": proxy, "https": proxy},
                                    headers=HEADERS)
                time.sleep(15)  # separate page grabs
                soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
                for div in soup.find_all(name="div", attrs={"class": "row"}):
                    num = len(df) + 1
                    job_post = []
                    job_post.append(area)
                    # Title
                    for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
                        job_post.append(a["title"])
                    # Company Name
                    company = div.find_all(name="span", attrs={"class": "company"})
                    if len(company) > 0:
                        for b in company:
                            job_post.append(b.text.strip())
                    else:
                        try_again = div.find_all(name="span", attrs={"class": "result-link-source"})
                        for span in try_again:
                            job_post.append(span.text.strip())
                    # Location
                    spans = div.find_all(name="span", attrs={"class": "location"})
                    for span in spans:
                        job_post.append(span.text)
                    # Summary
                    dv = div.find_all(name="div", attrs={"class": "summary"})
                    for d in dv:
                        job_post.append(d.text.strip())
                    # Salary
                    try:
                        span = div.find(name="span", attrs={"class": "salary"})
                        job_post.append(span.text.strip())
                    except Exception:
                        job_post.append("Nothing Found")
                    # Pass to pandas
                    df.loc[num] = job_post
            except Exception:
                # Retry the same page once with the next proxy from the pool
                proxy = next(proxy_pool)
                page = requests.get(url, proxies={"http": proxy, "https": proxy})
                time.sleep(15)  # separate page grabs
                soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
                for div in soup.find_all(name="div", attrs={"class": "row"}):
                    num = len(df) + 1
                    job_post = []
                    job_post.append(area)
                    # Title
                    for a in div.find_all(name="a", attrs={"data-tn-element": "jobTitle"}):
                        job_post.append(a["title"])
                    # Company Name
                    company = div.find_all(name="span", attrs={"class": "company"})
                    if len(company) > 0:
                        for b in company:
                            job_post.append(b.text.strip())
                    else:
                        try_again = div.find_all(name="span", attrs={"class": "result-link-source"})
                        for span in try_again:
                            job_post.append(span.text.strip())
                    # Location
                    spans = div.find_all(name="span", attrs={"class": "location"})
                    for span in spans:
                        job_post.append(span.text)
                    # Summary
                    dv = div.find_all(name="div", attrs={"class": "summary"})
                    for d in dv:
                        job_post.append(d.text.strip())
                    # Salary
                    try:
                        span = div.find(name="span", attrs={"class": "salary"})
                        job_post.append(span.text.strip())
                    except Exception:
                        job_post.append("Nothing Found")
                    # Pass to pandas
                    df.loc[num] = job_post

    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        # more options can be specified also
        print(df)
    # Save to CSV
    df.to_csv("sample.csv", quoting=csv.QUOTE_ALL, encoding='utf-8')

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pragma pylint: disable=unused-argument, no-self-use
# (c) Copyright IBM Corp. 2010, 2020. All Rights Reserved.
"""
Debugging tool: get_tech_mitigation

Example:
    get_tech_mitigation.py AppleScript
"""
import sys
from fn_mitre_integration.lib.mitre_attack import MitreAttack
from fn_mitre_integration.lib.mitre_attack_utils import get_techniques
import json
from proxies import get_proxies

if len(sys.argv) < 2:
    print("Usage get_tech_mitigation.py <tactic name(s)>")
    sys.exit()

tactic_name = sys.argv[1]
tactics = tactic_name.split(", ")

if len(tactics) == 1:
    # Single tactic: query the MITRE ATT&CK server directly
    techs = MitreAttack(opts=None, function_opts=get_proxies()).get_tactic_techniques(tactic_name)
else:
    # Multiple tactics: use the helper that handles comma-separated names
    techs = get_techniques(tactic_name)

print(str(techs))