def get_project(name):
    project_page = etree.HTML(get_page('https://www.openhub.net/p/' + name).decode('utf-8'))
    project_name = project_page.xpath(u"//*[@id=\"project_header\"]/div[1]/h1/a")[0].text
    project_tag = project_page.xpath(u"//*[@id=\"project_tags\"]/p")[0].text
    similar_projects = project_page.xpath(u"//*[@id=\"similar_projects\"]")[0].text
    manager = project_page.xpath(u"//*[@id=\"page_contents\"]/div[3]/div[2]/div/dl/dd[5]/a")[0].text
    licenses = project_page.xpath(u"//*[@id=\"page_contents\"]/div[3]/div[2]/div/dl/dd[3]")[0].text
    location_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/enlistments').decode('utf-8'))
    location_table = location_page.xpath(u"//table//tbody")[0]
    locations = [c.getchildren()[0].text.strip() for c in location_table.getchildren()]
    code_location = '\t'.join(locations)
    project = {"update_time": datetime.datetime.now().isoformat(),
               "project_name": project_name,
               "project_tag": project_tag,
               "similar_projects": similar_projects,
               "manager": manager,
               "licenses": licenses,
               "code_location": code_location}
    for key in project:
        if project[key] is None:
            project[key] = ''
    return project
    # //*[@id="analyses_language_table"]
def _task_list_handle(self, request, template):
    '''Generic handler for the full task list.'''
    dicts = self._get_role(request)
    if not dicts.has_key('is_operator'):
        return request, get_template(template), None
    name = request.GET.get('name')
    email = request.GET.get('email')
    duty = request.GET.get('duty')
    result = request.GET.get('result')
    als = Task.objects.filter(operator=request.user, uuid__category=self.ApplicationModel.__name__)
    if name:
        als = als.filter(uuid__name__contains=name)
    if email:
        als = als.filter(uuid__email__contains=email)
    if duty:
        als = als.filter(operator_duty=duty)
    if result:
        als = als.filter(result=result)
    als = als.order_by('-allot_time')
    dicts['p'] = get_page(als, request)
    dicts['result_list'] = dict(Task.result_choices)
    dicts['duty_list'] = dict(HelpDeskUser.duty_choices)
    return request, get_template(template), dicts
def ajax_list(request):
    '''Fetch and return the news list via ajax.'''
    if request.session['stunum']:  # already logged in
        print request.session['stunum']
        news = News.objects.all()
        if 'date' in request.GET:
            date = request.GET['date']
            startDate = date.split('--')[0]
            endDate = date.split('--')[1]
            # print startDate, endDate
            news = news.filter(rel_time__gte=startDate)  # on or after the start date
            news = news.filter(rel_time__lte=endDate)    # on or before the end date
        # The len(news) > 0 checks skip the remaining filters once an earlier
        # filter has already emptied the queryset.
        if len(news) > 0 and 'key_word' in request.GET:
            keyword = request.GET['key_word']
            news = news.filter(title__contains=keyword)
        if len(news) > 0 and 'department' in request.GET:
            department = request.GET['department']
            news = news.filter(section=department)
        if len(news) > 0 and 'zhuanti' in request.GET:
            zhuanti = request.GET['zhuanti']
            news = utils.handle_zhuanti(zhuanti, news)
        reDict = utils.get_page(news, 10, request.GET['page'])
        # reDict['data_list'] = json.dumps(reDict['data_list'])
        # print reDict  # ['data_list']
        return HttpResponse(json.dumps(reDict), content_type='application/json')
    else:
        return HttpResponse(u'error')
def findPerson(query):
    """
    returns the name that shows up the most from the google search of the query
    arguments: string of the question
    return: name of a person
    """
    file = open("words.txt")
    words = file.read()
    l = utils.search(query)
    goodWords = []
    exp = "[A-Z][a-z][a-z]+ [A-Z][a-z]+"
    for pages in l:
        text = re.sub("[\t\n ]", " ", utils.get_page(pages))
        result = re.findall(exp, text)
        for x in result:
            z = x.split(" ")
            if z[0].lower() not in words and z[1].lower() not in words:
                goodWords.append(x)
    wordcounts = {}
    for word in goodWords:
        if wordcounts.has_key(word):
            wordcounts[word] += 1
        else:
            wordcounts[word] = 1
    person = wordcounts.keys()[0]
    for word in wordcounts:
        if wordcounts[word] > wordcounts[person]:
            person = word
    return person
def POST(self, favorite_id):
    """ save changes. """
    post_sent = web.input()
    # post_sent['date'] = datetime.datetime.now()  # uncomment if you want to update the date!
    old_favorite = db.load_fav_by_id(favorite_id)  # load it again
    flag_get_title = False
    flag_save_page = False
    # update post_sent with old page_path
    # if save_page is True it will be overwritten to new page_path
    # otherwise old value used.
    post_sent['page_path'] = old_favorite['page_path']
    # checkboxes
    if post_sent.has_key('get_title'):
        flag_get_title = True
    if post_sent.has_key('save_page'):
        db.delete_saved_page(favorite_id)  # remove previous page
        flag_save_page = True
    # if either of the two flags is True -> call utils.get_page
    if flag_get_title or flag_save_page:
        post_sent = utils.get_page(post_sent, flag_save_page, flag_get_title)
    db.update_favorite(favorite_id, post_sent)  # update
    raise web.seeother('/')  # go home
def history(self, request, template):
    '''Application history for the current user.'''
    dicts = self._get_role(request)
    if not dicts.has_key('is_applicant'):
        return HttpResponseRedirect(self._get_login_url(request))
    name = request.GET.get('name')
    email = request.GET.get('email')
    status = request.GET.get('status')
    page = request.GET.get('page')
    user = HelpDeskUser.objects.filter(user=request.user, role='0')
    user = user[0].user
    # dep = [u.department for u in user]
    # apps = self.ApplicationModel.objects.filter(department__in=dep).order_by('-apply_time')
    # only the logged-in user's applications
    apps = self.ApplicationModel.objects.filter(submit_user=user).order_by('-apply_time')
    if name:
        apps = apps.filter(name__contains=name)
    if email:
        apps = apps.filter(email__contains=email)
    if status:
        apps = apps.filter(status=status)
    p = get_page(apps, request)
    dicts['p'] = p
    dicts['statuslist'] = dict(self.ApplicationModel.status_choices)
    return render_to_response(request, get_template(template), dicts)
def list(self, request, template):
    """
    Adds file upload and xls import on top of the original list view, plus a hook
    for processing the data after pagination.
    """
    u = request.user
    ls = self._get_list(request)
    # If an overridden _get_list returns None the filter below would fail,
    # so fall back to an empty queryset for robustness.
    if ls is None:
        ls = self.DefaultModel.objects.none()
    args = {}
    for ak in self.list_args.keys():
        if re.search('_doption$', ak):
            if request.GET.get(ak, None):
                datestr = (request.GET.get(ak, None)).split('-')
                args[str(self.list_args.get(ak))] = datetime.strptime((''.join((datestr[0], '-', datestr[1], '-01'))), '%Y-%m-%d')
        elif re.search('_option$', self.list_args.get(ak)):
            if request.GET.get(ak, None) and request.GET.get(ak + '_option', None):
                args[str(ak + '__' + request.GET.get(ak + '_option', None))] = str(request.GET.get(ak, None))
        # elif re.search('_extra$', self.list_args.get(ak)):
        #     if request.GET.get(ak, None):
        #         ls = self._extra_filter(request, ls, ak, self.list_args[ak])
        else:
            if request.GET.get(ak, None):
                try:
                    args[str(self.list_args.get(ak))] = str(request.GET.get(ak, None))
                except UnicodeEncodeError:
                    args[str(self.list_args.get(ak))] = request.GET.get(ak, None)
    ls = ls.filter(**args)
    ls = self._extra_filter(request, ls)
    if request.GET.get('excel'):
        if request.method == "POST":
            cols = request.POST.getlist("cols")
            return self.csv_export(request, ls, cols)
    try:
        p = get_page(ls, request)
    except EmptyPage:
        return HttpResponseRedirect('./')
    c_list = []
    if self.csv_columns:
        for c in self.csv_columns:
            c_list.append(c[0].decode("utf-8"))
    p = self._deal_page_data(request, p)
    list_dicts = {'p': p, 'excel_cs': c_list}
    list_dicts.update(self._get_list_dicts(request))
    if request.GET.get('upload'):
        if request.method == "POST":
            return self.upload(request, template, list_dicts)
    return render_to_response(request, template, list_dicts)
def download_episode(number):
    try:
        download_episode(get_episodes()[(number.real) - 1])
    except AttributeError:
        # no .real => is _not_ an integer
        page = utils.get_page("http://musicforprogramming.net/?{0}".format("c=" + number if not number.startswith("c") else number))
        url, songname = re.findall(r"(http:\/\/datashat\.net\/(music_for_programming_.+\.mp3))\"", page)[0]
        print(url, songname)
        utils.store(url, songname, overwrite=False)
def imguralbum(url, opt_store=True):
    html = utils.get_page(url)
    names = []
    for s in re.findall(r"<a.+?class=\"zoom\".+?href=\"(.+?)\">", html):
        r = re.search(r"([^/]+?)(.png|.jpg|.jpeg)$", s)
        if opt_store:
            utils.store("https:" + s, r.group(1) + r.group(2))
        names.append(r.group(1) + r.group(2))
    return names
def last_comic(download=True, return_number=False):
    text = utils.get_page('http://megatokyo.com')
    # print(strip_image(text))
    strip_number = int(strip_image(text)[0][:-4])  # removing .png || .gif
    if download:
        dump_single(strip_number, image_format=strip_image(text)[0][-4:])
    if return_number:
        return strip_number
def dump_single(number, image_format=None):
    if not image_format:
        text = utils.get_page('http://megatokyo.com/strip/{0}'.format(number))  # retrieving the image format
        # print(strip_image(text))
        strip_name = strip_image(text)[0]
    else:
        strip_name = str(number) + image_format
    utils.store('http://megatokyo.com/strips/{0}'.format(strip_name), strip_name, overwrite=False)
def parse_category_by_type(category, subcategory, link, project_index, type='popular'):
    if subcategory:
        message = 'Parse subcategory "{0}" of "{1}"'.format(subcategory, category)
    else:
        message = 'Parse category "{0}"'.format(category)
    print message
    projects = []
    stop = False
    page_count = 1
    while not stop and (page_count <= MAX_PAGE_PARSE or MAX_PAGE_PARSE < 0):
        page = get_page('{0}{2}/?page={1}'.format(link, page_count, type))
        page_count += 1
        project_blocks = page.cssselect('.project')
        stop = len(project_blocks) == 0
        for block in project_blocks:
            try:
                location = block.cssselect('.location-name')[0].text.strip()
            except Exception:
                location = ''
            project = {
                'category': category,
                'subcategory': subcategory,
                'name': block.cssselect('.project-card > h2 > strong > a')[0].text.strip(),
                'description': block.cssselect('.project-card > p')[0].text.strip(),
                'location': location,
                'founder': block.cssselect('.project-card > h2 > span')[0].text.strip()[3:],
                'funded': None,
                'funded_date': None,
                'pledged': None,
                'days left': None,
            }
            stats = block.cssselect('.project-stats > li')
            for stat in stats:
                stat_name = ''.join(stat.xpath("text()")).strip()
                if stat_name in {'funded', 'pledged'}:
                    value = stat.cssselect('strong')[0].text.replace('%', '').replace('$', '').replace(',', '').strip()
                    project[stat_name] = float(value)
                elif stat_name == 'days left':
                    value = stat.cssselect('.num')[0].text.strip()
                    project[stat_name] = int(value)
                elif stat_name in ['hours left', 'hour left', 'min left', 'mins left']:
                    project['days left'] = 0
                else:
                    value = stat_name
                    project['days left'] = -1
                    project['funded_date'] = str(datetime.datetime.strptime(value, '%b %d, %Y'))
            h = hashit(project)
            if h not in project_index:
                project_index.add(h)
                projects.append(project)
    print '{0}. Ended!!'.format(message)
    return projects
def get_project_news(name):
    project_page = etree.HTML(get_page('https://www.openhub.net/p/' + name).decode('utf-8'))
    project_age = project_page.xpath(u"//*[@id=\"factoids\"]/li[3]/div/span[1]/a")[0].text.strip()
    team_size = project_page.xpath(u"//*[@id=\"factoids\"]/li[1]/div/a[2]")[0].text.strip()
    project_activity = project_page.xpath(u"//*[@id=\"project_header_activity_indicator\"]/div")[0].text.strip()
    factoids_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/factoids').decode('utf-8'))
    comments = ''.join(factoids_page.xpath(u"//*[@id=\"page_contents\"]")[0].itertext()).replace(u'\xa0', '').strip()
    # team_size_per_month = project_page.xpath(u"//*[@id=\"factoids\"]/li[3]/div/span[2]/a")[0].text
    # print(team_size_per_month)
    # contributor = project_page.xpath(u"")[0].text
    # print(contributor)
    ratings_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/reviews/summary').decode('utf-8'))
    community_score = ratings_page.xpath(u"//*[@id=\"average_rating_details_2\"]")[0].text.replace(u'\xa0', '').strip()
    cost_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/estimated_cost').decode('utf-8'))
    costs = [''.join(i.itertext()).strip().replace(',', '').split('\n') for i in cost_page.xpath('.//div[@class="controls"]')][1:]
    lines = [i.attrib['value'] for i in cost_page.xpath('.//option')]
    codebase_size = int(costs[0][0])
    estimated_effort = int(costs[1][0])
    estimated_cost = int(costs[2][1])
    cocomo = {
        'codebase_size': codebase_size,
        'estimated_effort': estimated_effort,
        'estimated_cost': estimated_cost,
        "all_code": lines[0],
        'logic_code_only': lines[1],
        'markup_only': lines[2],
        'build_scripts_only': lines[3]
    }
    language_page = etree.HTML(get_page('https://www.openhub.net/p/' + name + '/analyses/latest/languages_summary').decode('utf-8'))
    languages_table = language_page.xpath(u"//*[@id=\"analyses_language_table\"]")[0]
    data = [x for c in languages_table.getchildren() for x in c.getchildren()][2:-2]
    data = [[''.join(j.itertext()).strip() for j in i.getchildren()][1:] for i in data]
    languages = [{"code_name": line[0],
                  "code_lines": line[1],
                  "comment_lines": line[2],
                  "comment_ratio": line[3],
                  "blank_lines": line[4],
                  "total_lines": line[5],
                  "total_percentage": line[6]} for line in data]
    project_news = {"update_time": datetime.datetime.now().isoformat(),
                    'team_size': team_size,
                    'project_age': project_age,
                    'activity': project_activity,
                    'comments': comments,
                    'languages': json.dumps(languages),
                    'cost': json.dumps(cocomo)}
    for key in project_news:
        if project_news[key] is None:
            project_news[key] = ''
    return project_news
def _task_handle(self, request, template):
    '''Generic handler for the pending task list.'''
    dicts = self._get_role(request)
    if not dicts.has_key('is_operator'):
        return request, get_template(template), None
    al = Task.objects.filter(operator=request.user, result='0', uuid__category=self.ApplicationModel.__name__)
    dicts['p'] = get_page(al, request)
    return request, get_template(template), dicts
def search(request):
    from sphinxapi import SphinxClient, SPH_MATCH_EXTENDED, SPH_SORT_RELEVANCE
    term = request.GET.get('term', '')
    category = None
    args = [u'term=%s' % term]
    template_name = 'board/search.html'
    if term:
        sphinx = SphinxClient()
        sphinx.SetServer(settings.SPHINX_SERVER, settings.SPHINX_PORT)
        sphinx.SetMatchMode(SPH_MATCH_EXTENDED)
        sphinx.SetSortMode(SPH_SORT_RELEVANCE)
        cid = request.GET.get('c')
        if cid:
            try:
                cid = int(cid)
            except (TypeError, ValueError):
                raise Http404
            category = get_object_or_404(Category, cid)
        if category:
            sphinx.SetFilter('category_id', [category])
            args.append(u'c=%s' % cid)
        user_settings = get_user_settings(request.user)
        try:
            page = int(request.GET.get('page', '1'))
            if page < 1:
                raise Http404
        except ValueError:
            raise Http404
        # sphinx.SetLimits(page * user_settings.ppp, user_settings.ppp)
        if request.GET.get('adv_submit.x'):
            template_name = 'board/advanced_search.html'
            u = User.objects.filter(username=term)
            if u:
                q = QuerySetPaginator(Post.objects.filter(user=u), user_settings.ppp)
            else:
                q = Paginator([], 1).page(1)
        else:
            result = sphinx.Query(u'@@relaxed %s' % term)
            if not result.has_key('total_found'):
                template_name = 'board/search_unavailable.html'
            pages = result.get('total_found', 0) / user_settings.ppp
            if pages > 0 and page > pages:
                raise Http404
            ids = [m['id'] for m in result.get('matches', [])]
            q = QuerySetPaginator(Post.view_manager.filter(id__in=ids), user_settings.ppp)
            q = get_page(request.GET.get('page', 1), q)
    else:
        q = Paginator([], 1).page(1)
    return render_to_response(template_name, {
        'result': q,
        'term': term,
        'category': category,
        'args': u'&'.join([''] + args),
    }, context_instance=RequestContext(request, processors=extra_processors))
def list_by_tag(request, name):
    '''Posts filtered by tag.'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    return render_to_response('index.html', {
        'title': name + ' - Tag',
        'posts': utils.get_page(Post.objects.filter(tags__name=name), page),
    }, context_instance=RequestContext(request))
def crawl_daili66(self, page_count=4):
    start_url = 'http://www.66ip.cn/{0}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            selector = etree.HTML(html)
            trs = selector.xpath('.//div[@id="main"]/div[1]/div[1]/table/tr')
            for i in range(1, len(trs)):
                ip = trs[i].xpath('.//td[1]/text()')[0]
                port = trs[i].xpath('.//td[2]/text()')[0]
                yield ':'.join([ip, port])
def home(request):
    """Home page."""
    try:
        page = int(request.GET.get("page", "1"))
    except ValueError:
        page = 1
    return render_to_response(
        "index.html",
        {"index": True, "keywords": settings.SITE_DESC, "posts": utils.get_page(Post.objects.all(), page)},
        context_instance=RequestContext(request),
    )
def list(self, request, template):
    """
    Implements object listing and querying; subclasses generally do not need to override this method.
    """
    u = request.user
    ls = self._get_list(request)
    args = {}
    for ak in self.list_args.keys():
        if re.search('_doption$', ak):
            if request.GET.get(ak, None):
                datestr = (request.GET.get(ak, None)).split('-')
                args[str(self.list_args.get(ak))] = datetime.strptime((''.join((datestr[0], '-', datestr[1], '-01'))), '%Y-%m-%d')
        elif re.search('_option$', self.list_args.get(ak)):
            if request.GET.get(ak, None) and request.GET.get(ak + '_option', None):
                args[str(ak + '__' + request.GET.get(ak + '_option', None))] = str(request.GET.get(ak, None))
        else:
            if request.GET.get(ak, None):
                try:
                    args[str(self.list_args.get(ak))] = str(request.GET.get(ak, None))
                except UnicodeEncodeError:
                    args[str(self.list_args.get(ak))] = request.GET.get(ak, None)
    pri_contains = request.GET.get("pcontains", None)
    if pri_contains:
        ls = ls.model.objects_uc.filter_by_username_contains_pri(pri_contains, ls)
    pri = request.GET.get("p", None)
    if pri:
        order = 0
        ls = ls.model.objects_uc.filter_by_jobnum_str(pri, ls, order)
    attstr = request.GET.get("a", None)
    if attstr:
        order = 1
        ls = ls.model.objects_uc.filter_by_jobnum_str(attstr, ls, order)
    print args
    ls = ls.filter(**args)
    if request.GET.get('excel'):
        if request.method == "POST":
            cols = request.POST.getlist("cols")
            return self.csv_export(request, ls, cols)
    p = get_page(ls, request)
    c_list = []
    if self.csv_columns:
        for c in self.csv_columns:
            c_list.append(c[0].decode("utf-8"))
    list_dicts = {'p': p, 'excel_cs': c_list}
    list_dicts.update(self._get_list_dicts(request))
    return render_to_response(request, template, list_dicts)
def get(self, PAGE_RE):
    username = self.get_username()
    login = True if username else False
    if login:
        v = self.request.get('v')
        if v and v.isdigit():
            p = utils.get_page(PAGE_RE, page_id=int(v))
        else:
            p = utils.get_page(PAGE_RE)
        if p is None:
            self.error(404)
        else:
            self.render('/templates/editpage.html', login=login, username=username, page=p)
    else:
        self.redirect('/login')
def home(request):
    '''Home page.'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    return render_to_response('index.html', {
        'index': True,
        'keywords': settings.SITE_DESC,
        'posts': utils.get_page(Post.objects.all(), page),
    }, context_instance=RequestContext(request))
def _allot_handle(self, request, template):
    '''Generic handler for the list of applications awaiting assignment.'''
    dicts = self._get_role(request)
    if not dicts.has_key('is_alloter'):
        return request, get_template(template), None
    apps = self.ApplicationModel.objects.filter(status__in=('0', '5')).order_by('-apply_time')
    undocount = apps.count()
    dicts['p'] = get_page(apps, request)
    dicts['undocount'] = undocount
    return request, get_template(template), dicts
def thread(request, thread_id):
    try:
        thr = Thread.view_manager.get(pk=thread_id)
    except Thread.DoesNotExist:
        raise Http404
    if not thr.category.can_read(request.user):
        raise PermissionError
    render_dict = {}
    if request.user.is_authenticated():
        render_dict.update({"watched": WatchList.objects.filter(user=request.user, thread=thr).count() != 0})
    if request.POST:
        if not thr.category.can_post(request.user):
            raise PermissionError
        postform = PostForm(request.POST)
        if postform.is_valid():
            postobj = Post(thread=thr,
                           user=request.user,
                           text=postform.cleaned_data['post'],
                           )
            postobj.save()  # this needs to happen before many-to-many private is assigned
            if len(postform.cleaned_data['private']) > 0:
                _log.debug('thread(): new post private = %s' % postform.cleaned_data['private'])
                postobj.private = postform.cleaned_data['private']
                postobj.is_private = True
                postobj.save()
            postobj.notify()
            return HttpResponseRedirect(reverse('board_locate_post', args=(postobj.id,)))
    else:
        postform = PostForm()
    # this must come after the post so new messages show up
    post_list = Post.view_manager.posts_for_thread(thread_id, request.user)
    user_settings = get_user_settings(request.user)
    if user_settings.reverse_posts:
        post_list = post_list.order_by('-odate')
    post_list = QuerySetPaginator(post_list, user_settings.ppp)
    render_dict.update({
        'result': get_page(request.GET.get('page', 1), post_list),
        'thr': thr,
        'postform': postform,
        'category': thr.category,
    })
    return render_to_response('board/thread.html', render_dict, context_instance=RequestContext(request, processors=extra_processors))
def crawl_xicidaili(self, page_count=4):
    start_url = 'http://www.xicidaili.com/nn/{0}'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            selector = etree.HTML(html)
            trs = selector.xpath('.//table[@id="ip_list"]/tr')
            for i in range(1, len(trs)):
                ip = trs[i].xpath('.//td[2]/text()')[0]
                port = trs[i].xpath('.//td[3]/text()')[0]
                yield ':'.join([ip, port])
def get_entry(id):
    contents = utils.get_page(show_entry_base_url % id)
    soup = BeautifulSoup(contents)
    for th in soup.find_all('th'):
        if th.has_attr('id') and th['id'] == "modelname":
            entry = Entry(id, th.string)
    return entry
def crawl_ip3366(self, page_count=4):
    start_url = 'http://www.ip3366.net/free/?stype=1&page={0}'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            selector = etree.HTML(html)
            table = selector.xpath('.//div[@id="list"]/table')[0]
            trs = table.xpath('.//tr')
            for i in range(1, len(trs)):
                ip = trs[i].xpath('.//td[1]/text()')[0]
                port = trs[i].xpath('.//td[2]/text()')[0]
                yield ':'.join([ip, port])
def index(request, page=1):
    '''Home page, with the page number taken from the URL.'''
    # try:
    #     page = int(request.GET.get('page', '1'))
    # except ValueError:
    #     page = 1
    page = int(page)
    return render_to_response('index.html', {
        'index': True,
        'keywords': settings.SITE_DESC,
        'posts': utils.get_page(Post.objects.all(), page),
    }, context_instance=RequestContext(request))
def show_verify(self, request, template):
    if get_role(request) == 'admin' or get_role(request) == 'gra':
        list = self.DefaultModel.objects.filter(activate_state='1').values("verify").annotate(count=Count('verify')).order_by("-count")
    else:
        c = get_perm_college_sch(request)
        if c:
            list = self.DefaultModel.objects.filter(activate_state='1', eduinfo__apply_major__school_code=c.code).values("verify").annotate(count=Count('verify')).order_by("-count")
        else:
            list = None
    p = get_page(list, request)
    dicts = get_top_left_modules(request, 5)
    dicts.update({"p": p})
    return render_to_response(request, template, dicts)
def get(self, PAGE_RE):
    v = self.request.get('v')
    if v and v.isdigit():
        p = utils.get_page(PAGE_RE, page_id=int(v))
    else:
        p = utils.get_page(PAGE_RE)
    username = self.get_username()
    login = True if username else False
    if p is None:
        if login:
            content = ''  # empty content
            p = utils.create_page(PAGE_RE, content)
            self.redirect('/_edit' + PAGE_RE)
        else:
            self.redirect('/login')
    else:
        self.render('/templates/wikipage.html', login=login, username=username, page=p)
def parse_navigation():
    start_page = get_page(to_absolute_url('/discover/'))
    nav = dict()
    for el in start_page.cssselect('.navigation > li > a'):  # get first level categories
        if el.text in EXCLUDE_NAVIGATION:
            continue
        link = to_absolute_url(el.attrib['href'])
        nav[el.text] = {
            'link': link,
            'children': dict(),
        }
        category_page = get_page(link)
        for sub_el in category_page.cssselect('.subnavigation > li > a'):  # look for second level categories
            if sub_el.text in EXCLUDE_NAVIGATION:
                continue
            sub_category_link = to_absolute_url(sub_el.attrib['href'])
            nav[el.text]['children'][sub_el.text] = {
                'link': sub_category_link,
            }
    return nav
def crawl_ip3366(self):
    for i in range(1, 4):
        start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>(\d+)</td>')
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
def crawl_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
            find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_daili66(self, page_count=4):
    start_url = "http://www.66ip.cn/{}.html"
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc(".containerbox table tr:gt(0)").items()
            for tr in trs:
                ip = tr.find("td:nth-child(1)").text()
                port = tr.find("td:nth-child(2)").text()
                yield ":".join([ip, port])
def menu(self):
    for i in range(1, 2):
        target_url = 'http://www.ygdy8.net/html/gndy/oumei/list_7_{}.html'.format(i)
        # Build an xpath selector tree for filtering
        selector = build_xpath_tree(get_page(target_url, 'gb2312'))
        # Grab the <a> tags of all the detail pages
        items = selector.xpath('//div[@class="co_content8"]//tr//a[2]')
        # Extract the URL and text (the movie name) from each <a> tag
        for item in items:
            next_url = item.xpath('@href')[0]
            name = re.findall(u'《(.*)》', item.xpath('text()')[0])[0]
            print(name)
            self.get_info(next_url, movie_info={'name': windows_name_format(name)})
def crawl_proxy360(self):
    """
    Crawl Proxy360
    :return: proxies
    """
    start_url = 'http://www.proxy360.cn/Region/China'
    print('Crawling', start_url)
    html = get_page(start_url)
    if html:
        doc = pq(html)
        lines = doc('div[name="list_proxy_ip"]').items()
        for line in lines:
            ip = line.find('.tbBottomLine:nth-child(1)').text()
            port = line.find('.tbBottomLine:nth-child(2)').text()
            yield ':'.join([ip, port])
def resource_manage():
    query_data = request.args
    page = int(query_data.get('page', 1))
    size = int(query_data.get('size', current_app.config['ARTICLE_PER_PAGE']))
    if page <= 0 or size <= 0:
        abort(400)
    offset, limit = get_page(page, size)
    session = getSessionFactory().get_session()
    resources = session.query(Resources) \
        .order_by(Resources.id.desc()).offset(offset).limit(limit).all()
    resources_data = [resource.get_map_data() for resource in resources]
    return render_template('resourceUpload.html', resources=resources_data)
def scraper_main_gumtree(url):
    """
    Reads pages with offers from GumTree and provides URLs to said offers.
    """
    # Loading the page
    page = get_page(url)
    # Putting the offer's URLs together
    offers = page.element("div[class='view'] div[class='title'] a")
    return {
        "url": url,
        "offers_urls": ["https://www.gumtree.pl" + off.attrib.get("href") for off in offers]
    }
def crawl_kuaidaili(self, page_count=4):
    start_url = "http://www.kuaidaili.com/free/inha/{}"
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc("#list tbody tr").items()
            for tr in trs:
                ip = tr.find("td:nth-child(1)").text()
                port = tr.find("td:nth-child(2)").text()
                yield ":".join([ip, port])
def crawl_xicidaili(self, page_count=4):
    start_url = "http://www.xicidaili.com/nn/{}"
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc("#ip_list tr:gt(0)").items()
            for tr in trs:
                ip = tr.find("td:nth-child(2)").text()
                port = tr.find("td:nth-child(3)").text()
                yield ":".join([ip, port])
def _get_quiz_json(self, quiz_id, session_id):
    headers = self._auth_headers_with_json()
    data = {"contentRequestBody": {"argument": []}}
    reply = get_page(self._session, POST_OPENCOURSE_API_QUIZ_SESSION_GET_STATE,
                     json=True,
                     post=True,
                     data=json.dumps(data),
                     headers=headers,
                     user_id=self._user_id,
                     class_name=self._course_name,
                     quiz_id=quiz_id,
                     session_id=session_id)
    return reply['contentResponseBody']['return']
def scrape():
    for url in remaining_urls():
        try:
            id = get_page_identifier(url)
            page = get_page(url)
            metadata_to_save = capture_metadata(page)
            save_captured_data(metadata_to_save, id)
            print(
                'Saved volume {}, issue {}, page {} - {:.2f}% finished'.format(
                    id['volume'], id['issue'], id['page'],
                    100 * len(os.listdir('Metadata')) / len(os.listdir('PNGs'))))
        except Exception as error:
            print('Failure at volume {}, issue {}, page {}: {}'.format(
                id['volume'], id['issue'], id['page'], error))
            print('Skipping for now')
def crawl_goubanjia(self):
    start_url = 'http://www.goubanjia.com/free/gngn/index{page}.shtml'
    for page in range(1, 11):
        html = get_page(start_url.format(page=page))
        if html:
            doc = pq(html.text)
            tds = doc('td.ip').items()
            for td in tds:
                td.find('p').remove()
                port_code = td.find('.port').attr['class'].replace('port ', '')
                port = proxy_port(port_code)
                td.find('.port').remove()
                ip = td.text().replace(' ', '')
                proxy = ip + port
                yield proxy
def show_tag(request, tagname):
    '''Posts filtered by tag.'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    c = {
        'settings': global_settings,
        'title': tagname + ' - Tag',
        'no_sidebar': False,
        'posts': utils.get_page(Post.objects.filter(tags__name=tagname), page),
    }
    c.update(common_response(request))
    return render_to_response('%s/index.html' % THEME, c, context_instance=RequestContext(request))
def get_danmu(cls, cid):
    """Fetch the danmaku (bullet comments) for the given video."""
    danmu_url = 'https://api.bilibili.com/x/v1/dm/list.so?oid={}'.format(cid)
    res = get_page(danmu_url)
    danmu = []
    try:
        response = res.content.decode('utf-8')
        selector = Selector(text=response)
        for d in selector.css('d'):
            txt = d.css('::text').extract_first()
            danmu.append({'txt': txt})
    except Exception as e:
        print(e)
    return danmu
def crawl_daili66(self, page_count=5):
    """
    Proxy source: 66ip
    :param page_count: number of pages
    :return: proxy
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        r = get_page(url)
        html = etree.HTML(r)
        proxy_trs = html.xpath("//div[@id='main']//table//tr")
        for i in range(1, len(proxy_trs)):  # the first row is the header, skip it
            ip = proxy_trs[i].xpath("./td[1]")[0].text
            port = proxy_trs[i].xpath("./td[2]")[0].text
            yield ":".join([ip, port])
def IP3366Crawler(self):
    """
    ip3366 (cloud proxy): parse the html pages to get proxies
    :return:
    """
    start_url = 'http://www.ip3366.net/free/?stype=1&page={page}'
    urls = [start_url.format(page=i) for i in range(1, 8)]
    # \s* matches whitespace, so the pattern spans the line break
    ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
    for url in urls:
        html = get_page(url)
        re_ip_address = ip_address.findall(str(html))
        for adress, port in re_ip_address:
            result = f"{adress}:{port}"
            yield result.strip()
def crawl_goubanjia(self):
    """
    Crawl Goubanjia
    :return: proxies
    """
    start_url = 'http://www.goubanjia.com/'
    html = get_page(start_url)
    # print(html)
    if html:
        doc = pq(html)
        tds = doc('.table.table-hover tbody tr').items()
        for td in tds:
            td.find('p').remove()  # drop the <p> node, its content is duplicated
            ip_class = td.find('.ip')
            ip = ip_class.text().replace('\n', '')
            yield ip
def summary(request, *args, **kwargs):
    if not request.session.get('is_login', False) or request.session['user_type'] != 'tender':
        return redirect("/index")
    error_msg = ''
    page = utils.get_page(args)  # get the current page number
    _projects_dict = utils.get_all_projects()
    page_list = utils.get_pages(page, _projects_dict)
    projects_dict, page, project_id = utils.adjust_info(_projects_dict, page, 1)
    if request.method == 'GET':
        return render(request, 'tender_projectsum.html', {
            'projects_dict': projects_dict,
            "page_list": page_list,
            "page": page
        })
def craw_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
        # Not sure why this is so convoluted; a single regex match would do
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
            # the site has only one page, so loop over however many rows matched
            find_ip = re.compile('<td>\s*?(\d+\.\d+\.\d+\.\d+)\s*?</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile('<td>\s*?(\d+)\s*?</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_data5u(self):
    """
    data5u proxy
    :return:
    """
    start_url = 'http://www.data5u.com'
    html = get_page(start_url)
    # print(html)
    # \s* matches whitespace, so the pattern spans the line break
    ip_adress = re.compile(
        '<ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>'
    )
    re_ip_adress = ip_adress.findall(str(html))
    for adress, port in re_ip_adress:
        result = f"{adress}:{port}"
        yield result.strip()
def crawl_daili66(self, page_count=10):
    """
    Crawl the 66ip proxies
    :param page_count: number of pages
    :return: proxy
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
def crawl_xicidaili(self):
    for i in range(1, 3):
        start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
        print('Crawling', start_url)
        html = get_page(start_url)
        if html:
            find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
            trs = find_trs.findall(html)
            for tr in trs:
                find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(tr)
                find_port = re.compile('<td>(\d+)</td>')
                re_port = find_port.findall(tr)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
def crawl_kuaidaili(self):
    '''
    Crawl Kuaidaili
    :return:
    '''
    for i in range(1, 6):
        start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
        html = get_page(start_url)
        if html:
            ip_address = re.compile('<td data-title="IP">(.*?)</td>')
            re_ip_address = ip_address.findall(html)
            port = re.compile('<td data-title="PORT">(.*?)</td>')
            re_port = port.findall(html)
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_ip3366(self):
    '''
    Crawl ip3366 proxies
    :return:
    '''
    for page in range(1, 7):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        html = get_page(start_url)
        if html:
            doc = pq(html)
            trs = doc('#list table tbody tr').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
def crawl_kuaidaili(self, page_count=5):
    """
    Proxy source: Kuaidaili
    :param page_count: number of pages
    :return: proxy
    """
    base_url = 'https://www.kuaidaili.com/free/inha/{}/'
    urls = [base_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        time.sleep(1)
        html = etree.HTML(get_page(url))
        proxy_trs = html.xpath("//div[@id='list']/table//tr")
        for i in range(1, len(proxy_trs)):
            ip = proxy_trs[i].xpath("./td[1]")[0].text
            port = proxy_trs[i].xpath("./td[2]")[0].text
            yield ":".join([ip, port])
def scrape(url):
    while url is not None:
        # sometimes, requests loads a malformed page
        # we know all pages exist with PNG data, next url, etc
        # keep retrying until we get it
        while True:
            try:
                page = get_page(url)
                id = get_page_identifier(url)
                save_poem(page, id)
                url = next_page_url(page)
                print(url)
                delay()
            except Exception as error:
                print('Failure at volume {}, issue {}, page {}: {}'.format(
                    id['volume'], id['issue'], id['page'], error))
def crawl_daili66(self, page_count=34):
    proxy = []
    start_url = "http://www.66ip.cn/areaindex_{}/1.html"
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            soup = BeautifulSoup(html.decode("gbk"), 'lxml')
            trs = soup.find("div", class_="containerbox boxindex").find_all("tr")
            for tr in trs[1:]:
                IP = tr.find_all("td")[0].get_text()
                PORT = tr.find_all("td")[1].get_text()
                TYPE = "http"
                proxy.append([IP, PORT, TYPE])
    return proxy
def index(request):
    '''Home page with pagination.'''
    try:
        page = int(request.GET.get('page', '1'))
    except ValueError:
        page = 1
    c = {
        'posts': utils.get_page(Post.objects.all(), page),
        'settings': global_settings,
        'no_sidebar': False,
    }
    c.update(common_response(request))
    return render_to_response('%s/index.html' % THEME, c, context_instance=RequestContext(request))
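# The blog views above call utils.get_page(queryset, page), but that helper itself is not
# included in this listing. Below is a minimal sketch of what such a helper might look like,
# assuming it wraps Django's Paginator and clamps out-of-range page numbers; the name,
# per_page default, and fallback behaviour are illustrative assumptions, not the project's
# actual implementation.
def get_page(queryset, page, per_page=10):
    from django.core.paginator import Paginator, EmptyPage, PageNotAnInteger
    paginator = Paginator(queryset, per_page)
    try:
        return paginator.page(page)                     # the requested page
    except PageNotAnInteger:
        return paginator.page(1)                        # non-numeric page -> first page
    except EmptyPage:
        return paginator.page(paginator.num_pages)      # past the end -> last page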
def crawl_daili66(self, page_count=4):
    """
    Get proxies from daili66
    :param page_count: Page number
    :return: proxy
    """
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
def crawl_daili66(self, page_count=4):
    '''
    Crawl the 66ip proxies
    '''
    t_url = 'https://www.66ip.cn/{}.html'
    # crawl the first 4 pages
    urls = [t_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
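# Most of the proxy-pool crawlers above expect get_page(url) to return the page HTML as a
# string (it is fed straight to pq(), etree.HTML(), or re.findall()), while a few variants
# also pass an encoding or use a response object. Below is a minimal sketch of the
# plain-string variant, assuming a requests-based fetch with a desktop User-Agent; the real
# helper is not shown in this listing and may add retries, proxies, or encoding handling.
def get_page(url, charset=None):
    import requests
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            if charset:
                response.encoding = charset  # e.g. 'gb2312' for the ygdy8 pages above
            return response.text
    except requests.RequestException:
        return None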