import os

import requests
from bs4 import BeautifulSoup

from proxies import Proxies


def get_source_code_submissions(proxies, proxy_index, contest_id, submission_ids, pbar,
                                submission_index=0, iterations=1000):
    if iterations > 0:
        # Specify a minimal user agent header
        headers = {'User-Agent': 'anything'}

        # If all proxies have been exhausted, get a new list of proxies
        if proxy_index >= len(proxies):
            proxies = Proxies.get_proxies()
            proxy_index = 0

        # Set the proxy for the requests
        proxy_dict = {
            'http': f'{proxies[proxy_index]}',
            'https': f'{proxies[proxy_index]}',
        }

        try:
            # Go over all remaining submissions and request each submission page
            for submission in submission_ids[submission_index:]:
                response = requests.get(
                    f'https://codeforces.com/contest/{contest_id}/submission/{submission}',
                    timeout=10,
                    headers=headers,
                    proxies=proxy_dict,
                    allow_redirects=False)

                # Transform to a soup object with the html parser
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find the actual program code on the page
                solution = soup.find('pre', {'id': 'program-source-text'}).text.strip()

                os.makedirs(f'data/contests_solutions_code/{contest_id}/', exist_ok=True)
                with open(f'data/contests_solutions_code/{contest_id}/{submission}.txt',
                          'w') as text_file:
                    text_file.write(solution)

                # Track how far we got so a retry resumes at the right submission,
                # and update the progress bar
                submission_index += 1
                pbar.update()
        except Exception:
            # On any failure, switch to the next proxy and retry from where we stopped
            proxy_index += 1
            iterations -= 1
            Scraper.get_source_code_submissions(proxies, proxy_index, contest_id,
                                                submission_ids, pbar, submission_index,
                                                iterations)
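# The scraper above assumes a Proxies.get_proxies() helper that returns a list of
# 'host:port' strings it can index into. That helper is not part of this snippet;
# the sketch below is illustrative only, and the proxy source (free-proxy-list.net)
# and its table layout are assumptions, not the repository's actual implementation.
import requests
from bs4 import BeautifulSoup


class Proxies:

    @staticmethod
    def get_proxies():
        # Hypothetical free proxy source; replace with whatever provider you use.
        response = requests.get('https://free-proxy-list.net/', timeout=10)
        soup = BeautifulSoup(response.content, 'html.parser')
        proxies = []
        table = soup.find('table')
        if table is not None:
            for row in table.find_all('tr')[1:]:
                cells = row.find_all('td')
                if len(cells) >= 2:
                    proxies.append(f'{cells[0].text.strip()}:{cells[1].text.strip()}')
        return proxies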
# -*- coding: utf-8 -*-
import re

import chardet
import requests
import simplejson
from lxml import etree

from headers import Headers
from proxies import Proxies


def baiduzhidaosearch(keyword, page):
    ret = {
        'code': 1002,
        'msg': 'failure',
        'data': []
    }
    try:
        # Baidu Zhidao paginates in steps of 10 results
        page = int(page) * 10
        keyword_u = keyword.encode('utf-8')
        # Debug: report the detected encoding of the keyword
        print(chardet.detect(keyword_u))
        url = 'http://zhidao.baidu.com/search?ct=17&pn=%s&tn=ikaslist&rn=10&word=%s' % (page, keyword)
        headers = Headers.getHeaders()
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=60, proxies=proxies)
        if req.status_code == 200:
            ret['code'] = 1001
            ret['msg'] = 'success'
            ids = []
            titles_html = []
            # The result page is served as GBK; normalise it to UTF-8
            req.encoding = 'gbk'
            html = req.text.encode('utf-8', errors='ignore').decode('utf-8', errors='ignore')
            selector = etree.HTML(html)
            # Question ids are embedded in the result links
            urls = selector.xpath('//div[@class="list"]/dl/dt[1]/a/@href')
            for u in urls:
                match_obj = re.search(r'question/(.*?)\.html', u, re.M | re.I)
                ids.append(match_obj.group(1))
            # Titles keep their highlight markup, so serialise the whole anchor element
            titles = selector.xpath('//div[@class="list"]/dl/dt[1]/a')
            for t in titles:
                titles_html.append(etree.tostring(t, encoding='unicode', method='html'))
            for cid, title in zip(ids, titles_html):
                ret['data'].append({
                    'cid': cid,
                    'title': re.search(r'"ti">(.*?)</a>', title, re.M | re.I).group(1),
                })
    except Exception as e:
        print(e)
    return simplejson.dumps(ret)
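# A small usage example for the search helper above: page is zero-based and the
# keyword is an arbitrary illustration. The function returns a JSON string, so
# decode it before reading the 'code', 'msg' and 'data' fields.
import simplejson

result = simplejson.loads(baiduzhidaosearch('python', 0))
if result['code'] == 1001:
    for item in result['data']:
        print(item['cid'], item['title'])
else:
    print('search failed:', result['msg'])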
import time

import requests
from lxml import etree

from headers import Headers
from config import Config
from mysqldao import MysqlDao
from proxies import Proxies

url_main = 'http://www.bttiantang.com/movie.php?/order/id/'
page_count = 10  # 1367 is the total number of pages; update this value periodically

for one in range(1, page_count):
    url = url_main + str(one)
    print(url)
    headers = Headers.getHeaders()
    try:
        proxies = Proxies.get_proxies()
        req = requests.get(url, headers=headers, timeout=30, proxies=proxies)
        code = req.status_code
        print(code)
        if code == 200:
            html = req.content
            selector = etree.HTML(html)
            # Detail-page links sit inside the "litpic" thumbnail elements
            content_urls = selector.xpath('//*[contains(@class,"litpic")]/a[1]/@href')
            for content_url in content_urls:
                content_url = Config.url_main + content_url
                created_at = time.strftime('%Y-%m-%d %H:%M:%S')
                insert_value = '"' + content_url + '",0,"' + created_at + '"'
                mysqlDao = MysqlDao()
                # Fetch the most recently stored URL (snippet is truncated below)
                sql = 'select url from bttiantang_url ORDER BY id desc limit 0,1'
                ret = mysqlDao.execute(sql)
                for r in ret:
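# Both request loops above also rely on a Headers.getHeaders() helper that is not
# included here. The sketch below is a minimal, assumed implementation that simply
# rotates through a small pool of desktop User-Agent strings; the exact strings and
# any additional header fields are illustrative, not the project's real values.
import random


class Headers:
    # Illustrative pool of desktop User-Agent strings.
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
        '(KHTML, like Gecko) Version/17.0 Safari/605.1.15',
        'Mozilla/5.0 (X11; Linux x86_64; rv:121.0) Gecko/20100101 Firefox/121.0',
    ]

    @staticmethod
    def getHeaders():
        # Pick a random User-Agent for each request to look less like a bot.
        return {'User-Agent': random.choice(Headers.USER_AGENTS)}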
import math
import os
import random
import threading

import pandas as pd
from tqdm import tqdm

from proxies import Proxies


def get_solutions(amt_threads=40):
    # Get all contests that have already been scraped
    scraped_contests = []
    for _, dirs, _ in os.walk('data/contests_solutions_code'):
        scraped_contests += [int(d) for d in dirs]

    # Get the metadata files that list the submission ids to scrape solutions for
    metadata_files = [
        file for file in os.listdir('data/contests_solutions_metadata')
        if os.path.isfile(f'data/contests_solutions_metadata/{file}')
    ]
    if len(metadata_files) == 0:
        print('Run "get_solutions_metadata" first to fetch the solutions to scrape '
              'before running this function')

    for metadata_file in metadata_files:
        # Open the solutions metadata file that couples submissions to contests
        solutions_df = pd.read_csv(f'data/contests_solutions_metadata/{metadata_file}')

        # Get all unique contests in this dataset that have not been scraped yet
        contests = list(
            solutions_df[~solutions_df['contestId'].isin(scraped_contests)]
            ['contestId'].unique())

        for contest in contests:
            print(f'starting with contest {contest}...')
            contest_submissions = solutions_df[solutions_df['contestId'] ==
                                               contest]['solutionId']

            # Use amt_threads worker threads to parallelise the requests
            threads = []
            submissions_per_thread = math.ceil(len(contest_submissions) / amt_threads)
            proxies = Proxies.get_proxies()

            # Progress bar to indicate current progress and speed
            pbar = tqdm(total=len(contest_submissions))
            for index in range(0, len(contest_submissions), submissions_per_thread):
                # Let every thread start with a random proxy to spread the search space
                proxy_index = random.randrange(0, len(proxies))

                # Get solutions for this contest's slice of submissions from the scraper
                threads.append(
                    threading.Thread(
                        target=Scraper.get_source_code_submissions,
                        args=(
                            proxies,
                            proxy_index,
                            contest,
                            contest_submissions[index:index + submissions_per_thread],
                            pbar,
                        )))

            for t in threads:
                t.start()
            for t in threads:
                t.join()
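# Hedged entry-point example: assuming get_solutions() is exposed at module level
# as above (the original repository may wrap these functions in a Scraper class,
# as the Scraper.get_source_code_submissions reference suggests), the whole
# scraping pipeline can be started with:
if __name__ == '__main__':
    get_solutions(amt_threads=40)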