Example #1
def main():
    # Crawl target: the common English names listing, one page at a time
    url = "http://en-name.xiao84.com/changjian/"
    # url='https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=python&page=5&s=104&click=0'
    # Request headers
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    ip_list = get_proxies(1, need_check=True)
    page = 1

    data = []
    while True:

        try:
            # build the URL of the current page
            page_url = 'http://en-name.xiao84.com/changjian/p{}.html'.format(page)

            proxy = dict2proxy2(random.choice(ip_list))
            soup = get_bs(page_url, headers, proxy)
            names = soup.find_all('td', class_='en-col')
            for item in names:
                enname = item.find('a', class_='enname').get_text()
                print(enname)
                data.append(enname)
            page += 1
        except Exception:
            # requesting the next page failed, so assume the last page was reached
            break
    with open('eng_names1.txt', 'w') as fout:
        for name in data:
            fout.write(name + '\n')
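
get_bs and dict2proxy2 are not defined in this snippet; a minimal sketch of those two helpers, with assumed names and an assumed proxy-dict shape, could look like this:

import requests
from bs4 import BeautifulSoup

def dict2proxy2(ip_dict):
    # assumed entry shape: {'ip': '1.2.3.4', 'port': '8080'}
    address = 'http://{}:{}'.format(ip_dict['ip'], ip_dict['port'])
    return {'http': address, 'https': address}

def get_bs(url, headers, proxy=None):
    # fetch the page (optionally through a proxy) and hand it to BeautifulSoup
    response = requests.get(url, headers=headers, proxies=proxy, timeout=10)
    return BeautifulSoup(response.text, 'html.parser')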
Example #2
def start_crawler(max_fail_count=10):
    agents_list = agents.get_user_agents()
    target_urls = targeturls.get_urls()
    fail_count = 1
    try_count = 1
    with open('crawler_log.txt', 'w') as file:
        while fail_count <= max_fail_count:
            proxies_list = proxies.get_proxies()
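            # inner loop: keep requesting random target URLs through random proxies and user agents until a request fails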
            while True:
                try:
                    user_agent = random.choice(agents_list)
                    target_url = random.choice(target_urls)
                    user_proxy = 'http://' + random.choice(proxies_list)

                    print('try count = {0}, proxy = {1}, url = {2}'.format(try_count, user_proxy, target_url))
                    file.write('try count = {0}, fail count = {1}, proxy = {2}, url = {3}, agent = {4}\n'.format(try_count,
                        fail_count, user_proxy, target_url, user_agent))

                    html_response = requests.get(target_url, proxies={'http': user_proxy, 'https': user_proxy},
                        headers={'User-Agent': user_agent,
                                'Connection': 'keep-alive' })

                    try_count += 1
                    with open("target.html", "wb") as page:
                        page.write(html_response.content)
                    time.sleep(random.uniform(1.1, 3.3))
                except Exception:
                    print('failed!')
                    break
            fail_count += 1
Example #3
def getRandomProxy():
    proxy = random.choice(proxies.get_proxies())
    if proxy:
        proxy = {'http': 'http://%s:%s' % (proxy['IP'], proxy['PORT'])}
        return proxy
    else:
        return False
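
The mapping returned above is exactly what requests expects for its proxies argument; a hedged usage sketch (the target URL is just a placeholder):

import requests

proxy = getRandomProxy()
if proxy:
    # route the request through the randomly chosen proxy
    response = requests.get('http://example.com', proxies=proxy, timeout=10)
else:
    # no proxy available, fall back to a direct request
    response = requests.get('http://example.com', timeout=10)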
Example #4
def start_crawler(max_fail_count=10):
    agents_list = agents.get_user_agents()
    target_urls = targeturls.get_urls()
    fail_count = 1
    try_count = 1
    with open('crawler_log.txt', 'w') as file:
        while fail_count <= max_fail_count:
            proxies_list = proxies.get_proxies()
            while True:
                try:
                    user_agent = random.choice(agents_list)
                    user_proxy = random.choice(proxies_list)
                    target_url = random.choice(target_urls)
                    print('try count = {0}, proxy = {1}, url = {2}'.format(try_count, user_proxy, target_url))
                    file.write('try count = {0}, fail count = {1}, proxy = {2}, url = {3}, agent = {4}\n'.format(try_count,
                        fail_count, user_proxy, target_url, user_agent))
                    html_response = requests.get(target_url, proxies={'http': user_proxy}, headers={'User-Agent': user_agent})
                    try_count += 1
                    with open("target.html", "wb") as page:
                        page.write(html_response.content)
                    time.sleep(random.uniform(1.1, 3.3))
                except Exception:
                    break
            fail_count += 1
Example #5
def save_contest(contestId):
    global proxy_pool
    global thread_count, MAX_PROBLEM_SUBS, BASE_DIR, URL_CONTEST_SUBMISSION

    print('Thread ' + str(contestId) + ' started')
    thread_start_time = time.time()

    submissions = submissions_from_contest(contestId)
    print('Contest: ' + str(contestId) + ' has ' + str(len(submissions)) +
          ' submissions')

    sources, count, subs_checked = {}, 0, 0
    submissions = sorted(submissions, key=lambda k: k['id'])
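    # proxy_pool is an itertools.cycle over the scraped proxies; next() rotates to a fresh one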
    using_proxy = next(proxy_pool)
    for sub in submissions:
        subs_checked = subs_checked + 1

        if sub['verdict'] != 'OK':
            continue

        count = count + 1
        if count > MAX_PROBLEM_SUBS:
            print('Thread ' + str(contestId) +
                  ' reached maximum problem submissions')
            break

        percentage = subs_checked / len(submissions) * 100

        prob_id = sub['problem']['index']
        sub_id = str(sub['id'])
        if prob_id not in sources.keys():
            sources[prob_id] = {}

        code, ext = 'nothing', 'no-ext'

        while True:
            try:
                code, ext = submission_code(sub, using_proxy)
                break
            except Exception as e:
                print('Thread ' + str(contestId) + ' has Exception, waiting')
                print(e)
                time.sleep(4)
                using_proxy = next(proxy_pool)

        sources[prob_id][sub_id] = {'code': code, 'ext': ext}

        if count % 10 == 0:
            save_progress(sources, contestId)
            print("\nSaved contest {}: {}/{}  {:.2f}%  saved: {}  time: "\
              .format(contestId, subs_checked, len(submissions), percentage, count) + \
              time.strftime("%H:%M:%S", time.gmtime(time.time() - thread_start_time)) + "\n")

        if count % 500 == 0:
            proxy_pool = cycle(get_proxies(NUM_PROXIES))

    thread_count = thread_count - 1
    print('Thread ' + str(contestId) + ' finished')
Example #6
def redirect(platform, arg):
    proxies_ = proxies.get_proxies()
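    # hand the fetched proxy list to the platform-specific URL resolver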
    try:
        if platform == 'douyin':
            real_url = douyin.get_real_url(arg, proxies_)
        elif  platform == 'kuaishou':
            real_url = kuaishou.get_real_url(arg, proxies_)
        else:
            real_url = '未找到文件'  # "file not found"
    except Exception as e:
        real_url = str(e)  # return the error text instead of a URL
    return real_url
Example #7
def vote():
    emails = get_emails()
    for item in emails:
        print(item)
        proxies = get_proxies()
        proxy_index = random.randrange(0, 5)
        proxy = proxies[proxy_index]
        print(proxy)
        full_name = item[0]
        email = item[1]
        print(full_name)
        print(email)
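        # route the headless Chrome session through the chosen proxy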
        proxy_config = Proxy()
        proxy_config.proxy_type = ProxyType.MANUAL
        proxy_config.http_proxy = proxy
        proxy_config.ssl_proxy = proxy
        capabilities = webdriver.DesiredCapabilities.CHROME
        proxy_config.add_to_capabilities(capabilities)
        chrome_options = Options()
        chrome_options.headless = True
        driver = webdriver.Chrome("./chromedriver",
                                  desired_capabilities=capabilities,
                                  options=chrome_options)
        driver.get("")  # FILL IN WITH WEBSITE
        time.sleep(10)  # give page time to load
        form_full_name = driver.find_element_by_xpath('')  #FILL IN WITH X PATH
        form_full_name.send_keys(full_name)
        form_radio_button = driver.find_element_by_xpath(
            '')  #FILL IN WITH X PATH
        form_radio_button.click()
        form_video_dropdown = driver.find_element_by_xpath(
            '')  #FILL IN WITH X PATH
        form_video_dropdown.click()
        form_video_choice = driver.find_element_by_xpath(
            '')  #FILL IN WITH X PATH
        form_video_choice.click()
        form_email_address = driver.find_element_by_xpath(
            '')  #FILL IN WITH X PATH
        form_email_address.send_keys(email)
        form_submit_button = driver.find_element_by_xpath(
            '')  #FILL IN WITH X PATH
        form_submit_button.click()
        time.sleep(10)  # give time to submit
        get_confirmation_text = driver.find_element_by_css_selector(
            '')  #FILL IN WITH CSS SELECTOR
        if get_confirmation_text.text == "":  #FILL IN WITH CONFIRMATION MESSAGE
            print("Successfully voted!")
        else:
            print("Did not vote successfully.")
        driver.quit()
        time.sleep(300)  # wait 5 mins
Example #8
def main():
    # Crawl target: the lawtime.cn public security bureau directory (name, phone, address), province by province
    url = "http://www.lawtime.cn/gongan/"
    # url='https://search.jd.com/Search?keyword=python&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=python&page=5&s=104&click=0'
    # Request headers
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    ip_list = get_proxies(1)

    data = {'名称': [], '电话': [], '地址': []}  # columns: name, phone, address
    soup = get_bs(url, headers)
    provinces = soup.find('p').find_all('a')
    urls = {}
    for province in provinces:
        key = province.get_text()
        urls[key] = province['href']
    city_urls = []
    try:
        # province URLs
        for url in urls:
            url = urls[url]
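            # fetch each province page through a randomly chosen proxy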
            proxy = dict2proxy(random.choice(ip_list))
            soup = get_bs(url, headers, proxy)
            cities = soup.find_all('div', class_='mcol')
            for city in cities:
                city_url = city.find('span', class_='right')
                if city_url is not None:
                    city_url = city_url.a['href']
                else:
                    city_url = re.sub('province', 'city', url)
                page = 1
                while True:
                    try:
                        tmp = city_url.split('/')[:-1]
                        tmp.append('p{}'.format(page))
                        tmp.append(city_url.split('/')[-1])
                        url = '/'.join(tmp)
                        # url='http://www.lawtime.cn/gongan/city/p22/taiyuan'
                        print(url)
                        soup = get_bs(url, headers)
                        items = soup.find_all('dl', class_="newline")
                        if len(items) > 0:
                            for item in items:
                                name = item.find(
                                    'dt', class_='mccname').a.get_text()
                                tmp = item.find_all('dd')
                                data['名称'].append(name)
                                data['电话'].append(tmp[1].get_text())
                                data['地址'].append(tmp[2].get_text())
                        else:
                            item = soup.find('div', class_="mcol")
                            data['名称'].append(item.get_text())
                            data['电话'].append('')
                            data['地址'].append('')
                            break

                        time.sleep(0.5)
                        page += 1
                    except Exception:
                        print('done, last page: {}'.format(page))
                        break
                time.sleep(1)
        data1 = pd.DataFrame(data)
        data1.to_excel('gongan.xlsx', index=False)
    except Exception:
        # save whatever was collected if the crawl fails part-way through
        data = pd.DataFrame(data)
        data.to_excel('gongan.xlsx', index=False)
Example #9
# https://medium.com/mitre-attack/att-ck-content-available-in-stix-2-0-via-public-taxii-2-0-server-317e5c41e214
#
#   Need pip install stix2
#        pip install taxii2-client
#

#
#   Debugging tool to list out all the items
#   Example:
#   python list

from stix2 import TAXIICollectionSource, Filter
from taxii2client import Server
from proxies import get_proxies

server = Server("https://cti-taxii.mitre.org/taxii/", proxies=get_proxies())

api_root = server.api_roots[0]

#
# Three collections: Enterprise ATT&CK, PRE-ATT&CK, MOBILE
#
for collection in api_root.collections:
    print(collection.title + ": " + collection.id)
    #collection = Collection("https://cti-taxii.mitre.org/stix/collections/95ecc380-afe9-11e4-9b6c-751b66dd541e/")

    # Supply the collection to TAXIICollection
    tc_source = TAXIICollectionSource(collection)

    # Create filters to retrieve content from Enterprise ATT&CK
    filter_objs = {
        "techniques": Filter("type", "=", "attack-pattern"),  # assumed minimal filter
    }
Example #10
# -*- coding: utf-8 -*-
# pragma pylint: disable=unused-argument, no-self-use
# (c) Copyright IBM Corp. 2010, 2020. All Rights Reserved.

"""
    Debugging tool:

    Example:
        get_technique_info.py AppleScript
"""
import sys
from fn_mitre_integration.lib.mitre_attack import MitreAttack
import json
from proxies import get_proxies

if len(sys.argv) < 2:
    print("Usage get_technique_info.py tech_id <mitigation>")
    sys.exit()

tech_id = sys.argv[1]

mitigation_only = False
if len(sys.argv) == 3:
    mitigation_only = True

if mitigation_only:
    mitigations = MitreAttack(opts=None, function_opts=get_proxies()).get_tech_mitigation(tech_id)
    print(str(mitigations))
else:
    tech = MitreAttack().get_tech(tech_id)
    print(str(tech))
Example #11
    for chunk in chunks:
        p = mp.Process(target=check_proxy, args=(chunk, valid_proxies_list))
        prcs.append(p)
        p.start()

    for p in prcs:
        p.join()

    return valid_proxies_list


#%%
if __name__ == '__main__':
    WORK_DIR = '/home/parser/'

    pl = get_proxies(WORK_DIR, 200)

    pl = check_proxies(pl)

    try:
        with open(WORK_DIR + 'proxieslist.txt', 'r') as prx:
            proxies_list = prx.read().split('\n')
    except Exception:
        proxies_list = None

    if proxies_list:
        pl.extend(proxies_list)

    pl = list(set(pl))
Example #12
def download_text(path):
    """
    This function receives a list of urls. It downloads the content
    as plain text. Separates different paragraphs and sends them to preprocess.
        
    values: Numpy array with the cosine similarity of each text fragment - query
    indexes: Positions of the numpy array where cosine simlarity surpass a threshold
    clean_text: Plain text with the relevant text
    """

    proxies = get_proxies()
    proxy_pool = cycle(proxies)
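    # a proxy pool is prepared here, but the requests below currently go direct (the proxy argument is commented out)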

    # First step is to load the file with the urls and the word
    doc_path = path + "/url_list.txt"
    url_doc = open(doc_path, "r")

    for line in url_doc:
        content = line.split(" ", 1)
        url = content[0]
        word = content[1]
        clean_plain_text = ''
        clean_text = ''
        final = []
        if check_resource_retrieved_before(url, path):
            print("Resource " + url +
                  " has already been searched, skipping...")

            print(url)
            print(word)
        else:
            for i in range(1, 11):
                #Get a proxy from the pool
                #proxy = next(proxy_pool)
                print("Request #%d" % i)

                try:
                    time.sleep(30)
                    response = requests.get(url)
                    #response = requests.get(url)
                    #response = requests.get(url,proxies={"http": proxy, "https": proxy})
                    print('Response HTTP Status Code: ', response.status_code)
                    #print('Response HTTP Response Body: ', response.content)

                    #print(response.json())

                    #Here we must clearly separate PDFs from normal HTML.
                    #If it is a PDF we download it and store it in a temporary file;
                    #afterwards we process that temporary PDF like normal text.

                    if url.endswith("pdf"):
                        with open('temporal.pdf', 'wb') as f:
                            f.write(response.content)
                        clean_plain_text = parser.from_file('temporal.pdf')

                    else:
                        data = response.text
                        print("We have stored the content in data")
                        soup = BeautifulSoup(data, 'html.parser')
                        texts = soup.find_all(text=True)
                        for t in texts:
                            if t.parent.name not in blacklist:
                                clean_plain_text += '{} '.format(
                                    t
                                )  #We have obtained the html (except the blacklist) as plain text

                    print("We have stored the lines")
                    lines = clean_plain_text.split('\n \n')

                    for line in lines[:]:
                        #It would be better to store the result in another object; modifying the object being iterated over is awkward (lines[:] iterates over a copy)
                        if len(line.strip()) < 3 or '^' in line or len(
                                line.split()) < 2:
                            lines.remove(line)

                    print("We have stripped the lines")
                    values = apply_tfidf(lines, word.rstrip("\n"))

                    if not url.endswith("pdf") and len(texts) == 0:
                        print(
                            "This text was not correctly downloaded due to an error while decoding"
                        )

                    if max(values) < max_value:
                        #If the maximum cosine similarity is below the acceptable max_value,
                        #we may be dealing with text in another language, so we discard this text
                        if max(values) == 0:
                            print("No matching was found")
                        else:
                            print(
                                "Cosine similarity matrix values may be too small to be valuable text."
                            )
                            print("Proceeding to ignore this document...\n")
                    else:
                        indexes = np.where(values > threshold * max(values))
                        print("Threshold employed for this document is " +
                              str(threshold * max(values)))

                        for i in range(len(indexes[0])):
                            text = remove_punctuation(lines[indexes[0][i]])
                            text_ = unicodedata.normalize('NFKD', text).encode(
                                'ASCII', 'ignore')

                            clean_text += text_.decode("utf-8") + "\n\n"
                        store_text(path, url, clean_text, word)
                    break
                except Exception:
                    print("Web page " + url + " could not be scraped")

                    #Most free proxies often get connection errors; you would have to retry the
                    #entire request with another proxy to make it work. We simply skip retries here,
                    #as that is beyond the scope of this tutorial and we are only downloading a single url.
                    print("Skipping. Connection error")
                    exc_type, exc_obj, exc_tb = sys.exc_info()
                    print("Error in line " + str(exc_tb.tb_lineno) + "  " +
                          str(exc_type))
                    traceback.print_exc()
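
The commented-out lines above mark where the proxy rotation would plug in; a minimal sketch of that pattern, reusing get_proxies and the url variable from this example:

from itertools import cycle
import requests

proxy_pool = cycle(get_proxies())
proxy = next(proxy_pool)  # rotate to the next proxy before each attempt
response = requests.get(url,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=30)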
Example #13
import urllib.request, urllib.parse
import json, threading
import os, time, datetime
from itertools import cycle

from submission_getter import submission_code
from proxies import get_proxies, get_proxies_manually

MAX_PROBLEM_SUBS = 1000 * 1000 * 10
MAX_THREADS = 5
NUM_PROXIES = 50
BASE_DIR = 'Surse'
URL_CONTEST_SUBMISSION = 'http://codeforces.com/api/contest.status?contestId={contestId}'

thread_count = 0
proxy_pool = cycle(get_proxies(NUM_PROXIES))
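# round-robin iterator over freshly scraped proxies; save_contest() draws from it with next()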


def submissions_from_contest(contestId):
    url = URL_CONTEST_SUBMISSION.format(contestId=contestId)
    # url = urllib.parse.quote(url)
    submissions = json.loads(urllib.request.urlopen(url).read())
    return submissions['result']


def save_progress(sources, contestId):
    for prob_id, data in sources.items():
        with open(BASE_DIR + '/' + str(contestId) + prob_id + '.txt', 'w') as file_obj:
            json.dump(data, file_obj, indent=2)
Example #14
import sys
from fn_mitre_integration.lib.mitre_attack import MitreAttack
import json
from proxies import get_proxies

if len(sys.argv) < 2:
    print(
        "Usage lookup.py item_name type_name[optional] collection_name[optional]"
    )
    sys.exit()

item_name = sys.argv[1]
type_name = None
collection_name = None
if len(sys.argv) > 2:
    type_name = sys.argv[2]

if len(sys.argv) > 3:
    collection_name = sys.argv[3]

attack = MitreAttack(opts=None, function_opts=get_proxies())

attack.connect_server()

item = attack.lookup_item(item_name)

item_dict = json.loads(item.serialize())

print(item_dict)
Example #15
    for chunk in chunks:
        p = mp.Process(target=check_proxy, args=(chunk, valid_proxies_list))
        prcs.append(p)
        p.start()

    for p in prcs:
        p.join()

    return valid_proxies_list


#%%
if __name__ == '__main__':
    WORK_DIR = 'D:'

    pl = get_proxies(WORK_DIR, 1000)

    pl = check_proxies(pl)

    try:
        with open(WORK_DIR + 'new_proxieslist.txt', 'r') as prx:
            proxies_list = prx.read().split('\n')
    except Exception:
        proxies_list = None

    if proxies_list:
        pl.extend(proxies_list)

    pl = list(set(pl))
Example #16
def scraper():
    # variables for each scrape
    HEADERS = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
    }
    max_results_per_area = 100
    areas = [
        "Madrid+provincia", "Las+Palmas+provincia", "Galicia",
        "Barcelona+provincia", "Cádiz+provincia"
    ]
    columns = [
        "Area", "Job Title", "Company Name", "Location", "Summary", "Salary"
    ]
    df = pd.DataFrame(columns=columns)

    # Scraping loop:

    proxies = get_proxies()
    proxy_pool = cycle(proxies)

    for area in areas:
        for start in range(0, max_results_per_area, 10):
            print("*")
            proxy = next(proxy_pool)
            url = "http://es.indeed.com/jobs?q=Junior+developer&l=" + str(
                area) + "&jt=fulltime&lang=en&start=" + str(start)
            try:
                page = requests.get(url,
                                    proxies={
                                        "http://": proxy,
                                        "https://": proxy
                                    },
                                    headers=HEADERS)
                time.sleep(15)  # separate page grabs
                soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
                for div in soup.find_all(name="div", attrs={"class": "row"}):
                    num = (len(df) + 1)
                    job_post = []
                    job_post.append(area)
                    # Title
                    for a in div.find_all(
                            name="a", attrs={"data-tn-element": "jobTitle"}):
                        job_post.append(a["title"])
                    # Company Name
                    company = div.find_all(name="span",
                                           attrs={"class": "company"})
                    if len(company) > 0:
                        for b in company:
                            job_post.append(b.text.strip())
                    else:
                        try_again = div.find_all(
                            name="span", attrs={"class": "result-link-source"})
                        for span in try_again:
                            job_post.append(span.text.strip())
                    # Location
                    spans = div.find_all(name="span",
                                         attrs={"class": "location"})
                    for span in spans:
                        job_post.append(span.text)
                    # Summary
                    dv = div.find_all(name="div", attrs={"class": "summary"})
                    for d in dv:
                        job_post.append(d.text.strip())
                    # Salary
                    try:
                        span = div.find(name="span", attrs={"class": "salary"})
                        job_post.append(span.text.strip())
                    except Exception:
                        job_post.append("Nothing Found")
                    # Pass to pandas
                    df.loc[num] = job_post
            except Exception:
                proxy = next(proxy_pool)
                page = requests.get(url,
                                    proxies={
                                        "http://": proxy,
                                        "https://": proxy
                                    })
                time.sleep(15)  # separate page grabs
                soup = BeautifulSoup(page.text, "lxml", from_encoding="utf-8")
                for div in soup.find_all(name="div", attrs={"class": "row"}):
                    num = len(df) + 1
                    job_post = []
                    job_post.append(area)
                    # Title
                    for a in div.find_all(
                            name="a", attrs={"data-tn-element": "jobTitle"}):
                        job_post.append(a["title"])
                    # Company Name
                    company = div.find_all(name="span",
                                           attrs={"class": "company"})
                    if len(company) > 0:
                        for b in company:
                            job_post.append(b.text.strip())
                    else:
                        try_again = div.find_all(
                            name="span", attrs={"class": "result-link-source"})
                        for span in try_again:
                            job_post.append(span.text.strip())
                    # Location
                    spans = div.find_all(name="span",
                                         attrs={"class": "location"})
                    for span in spans:
                        job_post.append(span.text)
                    # Summary
                    dv = div.find_all(name="div", attrs={"class": "summary"})
                    for d in dv:
                        job_post.append(d.text.strip())
                    # Salary
                    try:
                        span = div.find(name="span", attrs={"class": "salary"})
                        job_post.append(span.text.strip())
                    except Exception:
                        job_post.append("Nothing Found")
                    # Pass to pandas
                    df.loc[num] = job_post
    with pd.option_context('display.max_rows', None, 'display.max_columns',
                           None):  # more options can be specified also
        print(df)
    # save to csv
    df.to_csv("sample.csv", quoting=csv.QUOTE_ALL, encoding='utf-8')
Example #17
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pragma pylint: disable=unused-argument, no-self-use
# (c) Copyright IBM Corp. 2010, 2020. All Rights Reserved.
"""
    Debugging tool
        get_tech_mitigation AppleScript

"""
import sys
from fn_mitre_integration.lib.mitre_attack import MitreAttack
from fn_mitre_integration.lib.mitre_attack_utils import get_techniques
import json
from proxies import get_proxies

if len(sys.argv) < 2:
    print("Usage get_tech_mitigation.py <tech name>")
    sys.exit()

tactic_name = sys.argv[1]

tactics = tactic_name.split(", ")

if len(tactics) == 1:
    techs = MitreAttack(
        opts=None,
        function_opts=get_proxies()).get_tactic_techniques(tactic_name)
else:
    techs = get_techniques(tactic_name)

print(str(techs))