import os

import requests
from bs4 import BeautifulSoup
# Proxies is the project's own proxy-list helper (its module is not part of this example)


class Scraper:
    @staticmethod
    def get_source_code_submissions(proxies,
                                    proxy_index,
                                    contest_id,
                                    submission_ids,
                                    pbar,
                                    submission_index=0,
                                    iterations=1000):
        if iterations > 0:
            # Use a placeholder user agent instead of the requests default
            headers = {'User-Agent': 'anything'}

            # if all proxies have been exhausted, get new list of proxies
            if proxy_index >= len(proxies):
                proxies = Proxies.get_proxies()
                proxy_index = 0

            # Set the proxy for the requests
            proxy_dict = {
                'http': f'{proxies[proxy_index]}',
                'https': f'{proxies[proxy_index]}',
            }

            try:
                # Go over all submissions and make a request to the submission page
                for submission in submission_ids[submission_index:]:
                    response = requests.get(
                        f'https://codeforces.com/contest/{contest_id}/submission/{submission}',
                        timeout=10,
                        headers=headers,
                        proxies=proxy_dict,
                        allow_redirects=False)

                    # Transform to soup object with html parser
                    soup = BeautifulSoup(response.content, 'html.parser')

                    # Find the actual program code on page
                    solution = soup.find("pre", {
                        "id": "program-source-text"
                    }).text.strip()

                    os.makedirs(f'data/contests_solutions_code/{contest_id}/',
                                exist_ok=True)

                    with open(
                            f'data/contests_solutions_code/{contest_id}/{submission}.txt',
                            "w") as text_file:
                        text_file.write(solution)

                    # Record progress so a retry after a proxy failure resumes here, then advance the progress bar
                    submission_index += 1
                    pbar.update()

            except Exception:
                # On any failure (timeout, block, parse error), rotate to the next proxy and retry
                proxy_index += 1
                iterations -= 1
                Scraper.get_source_code_submissions(proxies, proxy_index,
                                                    contest_id, submission_ids,
                                                    pbar, submission_index,
                                                    iterations)
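Every example in this listing calls a Proxies.get_proxies() helper that is never defined here. As a rough, hypothetical sketch only: Example #1 treats the return value as a list of proxy URLs it can index into (Examples #2 and #3 appear to use a variant that returns a requests-style dict), so a minimal list-returning version might look like the following, assuming a public proxy list such as free-proxy-list.net as the source.

# proxies.py -- hypothetical stand-in for the Proxies helper assumed above.
# The source site and its table layout are assumptions, not part of the original code.
import requests
from lxml import etree


class Proxies:
    @staticmethod
    def get_proxies():
        # Fetch a public proxy list and return entries as 'http://host:port' strings
        response = requests.get('https://free-proxy-list.net/', timeout=10)
        selector = etree.HTML(response.text)
        proxies = []
        for row in selector.xpath('//table//tr'):
            cells = row.xpath('./td/text()')
            # Keep rows whose first cell looks like an IP address
            if len(cells) >= 2 and cells[0].count('.') == 3:
                proxies.append(f'http://{cells[0]}:{cells[1]}')
        return proxies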
Example #2
import re

import requests
import simplejson
from lxml import etree

from headers import Headers
from proxies import Proxies


def baiduzhidaosearch(keyword, page):
    ret = {
        'code': 1002,
        'msg': 'failure',
        'data': []
    }
    try:
        page = int(page) * 10
        # Older GBK-encoded variant of the search URL, kept for reference:
        # url = 'http://zhidao.baidu.com/search?word=%s&ie=gbk&site=-1&sites=0&date=0&pn=%s' % (keyword.encode('utf-8 ').decode('gbk','ignore'), page)
        url = u'http://zhidao.baidu.com/search?ct=17&pn=%s&tn=ikaslist&rn=10&word=%s' % (page, keyword)
        headers = Headers.getHeaders()
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=60, proxies=proxies)
        if req.status_code == 200:
            ret['code'] = 1001
            ret['msg'] = 'success'
            req.encoding = 'gbk'
            html = req.text.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
            selector = etree.HTML(html)

            # Collect the question ids from the result links
            ids = []
            urls = selector.xpath('//div[@class="list"]/dl/dt[1]/a/@href')
            for u in urls:
                match_obj = re.search(r'question/(.*?).html', u, re.M | re.I)
                ids.append(match_obj.group(1))

            # Collect the raw markup of each title link
            titles = []
            title_nodes = selector.xpath('//div[@class="list"]/dl/dt[1]/a')
            for t in title_nodes:
                titles.append(etree.tostring(t, encoding='utf8', method="html").decode('utf-8'))

            # Pair each question id with the title text extracted from its markup
            for cid, title_html in zip(ids, titles):
                ret['data'].append({
                    'cid': cid,
                    'title': re.search(r'"ti">(.*?)</a>', title_html, re.M | re.I).group(1)
                })
    except Exception as e:
        print(e)
    return simplejson.dumps(ret)
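A hypothetical invocation of baiduzhidaosearch, decoding the JSON string it returns (the surrounding module is not shown, so this assumes the function is in scope):

# Hypothetical usage sketch; the result is the JSON string produced by simplejson.dumps above.
import simplejson

result = simplejson.loads(baiduzhidaosearch(u'python', 1))
if result['code'] == 1001:
    for item in result['data']:
        print(item['cid'], item['title'])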
Example #3
import time

import requests
from lxml import etree

from headers import Headers
from config import Config
from mysqldao import MysqlDao
from proxies import Proxies

url_main = 'http://www.bttiantang.com/movie.php?/order/id/'

page_count = 10
# 1367 is the total number of pages; update this value periodically
for one in range(1, page_count):
    url = url_main + str(one)
    print(url)
    headers = Headers.getHeaders()
    try:
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=30, proxies=proxies)
        code = req.status_code
        print(code)
        if code == 200:
            html = req.content
            selector = etree.HTML(html)
            content_urls = selector.xpath('//*[contains(@class,"litpic")]/a[1]/@href')
            for content_url in content_urls:
                content_url = Config.url_main + content_url
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                insert_value = '"' + content_url + '",0,"' + created_at + '"'
                mysqlDao = MysqlDao()
                sql = 'select url from bttiantang_url ORDER BY id desc limit 0,1 '
                ret = mysqlDao.execute(sql)
                for r in ret:
Example #4
import math
import os
import random
import threading

import pandas as pd
from tqdm import tqdm
# Proxies and Scraper are the project's own helpers (their modules are not part of this example)


def get_solutions(amt_threads=40):
    # Get all contests that have already been scraped
    scraped_contests = []

    for _, dirs, _ in os.walk('data/contests_solutions_code'):
        scraped_contests += [int(d) for d in dirs]

    # Get the metadata files to find submission ids to scrape solutions for
    metadata_files = [
        file for file in os.listdir('data/contests_solutions_metadata')
        if os.path.isfile(f'data/contests_solutions_metadata/{file}')
    ]

    if len(metadata_files) == 0:
        print(
            'Run "get_solutions_metadata" first to fetch the solutions to scrape before running this function'
        )
        return

    for metadata_file in metadata_files:
        # Open the solutions metadata file where solutions will be coupled to
        solutions_df = pd.read_csv(
            f'data/contests_solutions_metadata/{metadata_file}')

        # Get all unique contests in this dataset that are not scraped yet
        contests = list(
            solutions_df[~solutions_df['contestId'].isin(scraped_contests)]
            ['contestId'].unique())

        for contest in contests:
            print(f'starting with contest {contest}...')
            contest_submissions = solutions_df[solutions_df['contestId'] ==
                                               contest]['solutionId']

            # Work with amt of threads to parallelize the requests
            threads = []

            submissions_per_thread = math.ceil(
                len(contest_submissions) / amt_threads)

            proxies = Proxies.get_proxies()

            # Progress bar to indicate current progress and speed
            pbar = tqdm(total=len(contest_submissions))

            for index in range(0, len(contest_submissions),
                               submissions_per_thread):
                # Let every thread start with a random proxy to spread the search space
                proxy_index = random.randrange(0, len(proxies))

                # Get solutions for the contests from the scraper
                threads.append(
                    threading.Thread(
                        target=Scraper.get_source_code_submissions,
                        args=(
                            proxies,
                            proxy_index,
                            contest,
                            contest_submissions[index:index +
                                                submissions_per_thread],
                            pbar,
                        )))

            for t in threads:
                t.start()

            for t in threads:
                t.join()
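A minimal, hypothetical entry point for running Example #4 end-to-end; it assumes the metadata files under data/contests_solutions_metadata have already been produced, as the function itself requires.

# Hypothetical entry point; get_solutions is the function defined above.
if __name__ == '__main__':
    get_solutions(amt_threads=40)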