def manual_redirect(out, obj, verbose=False):
    """Follow a <meta http-equiv="refresh"> redirect the fetcher missed.

    Parses ``out['page']`` into soup and scans its <meta> tags. When a
    refresh tag whose content mentions a url is found and the target
    differs from ``obj.url``, the object's url is updated and the page is
    re-fetched via ``get_page(obj, overw=True)``. On every failure path
    (soup failed, no meta tags, missing tag attributes) ``obj.page`` is
    set from ``out['page']`` and ``obj.soup`` is cleared.

    NOTE(review): an identical second definition of ``manual_redirect``
    appears later in this file and shadows this one — confirm which is
    intended to survive.

    Returns the (mutated) ``obj``.
    """
    souped = soupypages.makeSoup(out['page'])
    if souped['pass']:
        parsed = souped['soup']
        meta_tags = parsed.find_all('meta')
        if meta_tags:
            for tag in meta_tags:
                try:
                    if tag['http-equiv'].lower() == 'refresh':
                        try:
                            # Only act when the refresh content names a url.
                            if 'url' in tag['content'].lower():
                                target = prepend_url(
                                    obj.url,
                                    re.split(u'url=', tag['content'].lower())[1])
                                if target != obj.url:
                                    obj.url = target
                                    obj = get_page(obj, overw=True)
                                    if verbose:
                                        print('Broken auto-redirect; manual redirect performed.')
                        except KeyError:
                            # Tag lacked 'content' (or a KeyError escaped the
                            # redirect helpers): keep the raw page instead.
                            obj.page = out['page']
                            obj.soup = ''
                except KeyError:
                    # Tag lacked 'http-equiv': keep the raw page instead.
                    obj.page = out['page']
                    obj.soup = ''
        else:
            obj.page = out['page']
            obj.soup = ''
    else:
        obj.page = out['page']
        obj.soup = ''
    return obj
def manual_redirect(out, obj, verbose=False):
    """Manually perform a <meta http-equiv="refresh"> redirect.

    If ``out['page']`` soups successfully and contains a refresh meta tag
    pointing at a different url, ``obj.url`` is rewritten and the page is
    re-fetched with ``get_page(obj, overw=True)``. Any failure path falls
    back to storing the raw page on ``obj`` with an empty soup.

    Returns the (mutated) ``obj``.
    """
    def _use_raw_page():
        # Shared fallback: keep the un-redirected page, clear the soup.
        obj.page = out['page']
        obj.soup = ''

    souped = soupypages.makeSoup(out['page'])
    if not souped['pass']:
        _use_raw_page()
        return obj

    metas = souped['soup'].find_all('meta')
    if not metas:
        _use_raw_page()
        return obj

    for m in metas:
        try:
            refresh = m['http-equiv'].lower() == 'refresh'
        except KeyError:
            # No http-equiv attribute on this tag.
            _use_raw_page()
            continue
        if not refresh:
            continue
        try:
            content = m['content'].lower()
            if content.find('url') > -1:
                new_url = prepend_url(obj.url, re.split(u'url=', content)[1])
                if new_url != obj.url:
                    obj.url = new_url
                    obj = get_page(obj, overw=True)
                    if verbose:
                        print('Broken auto-redirect; manual redirect performed.')
        except KeyError:
            # Missing 'content' attribute (or a KeyError from the redirect
            # helpers): fall back to the raw page.
            _use_raw_page()
    return obj
def manual_redirect_02():
    '''
    Check for pages with incorrect links that provide re-directs that were
    not followed for some reason by urllib2. 1000 seems to be an ideal
    cutoff length for page size. "Dead zone" from 1000-3000ish.

    Scans Department documents with suspiciously short pages; when a page
    is really a <meta http-equiv="refresh"> stub, rewrites the document's
    Link to the redirect target, re-fetches the page, and updates the
    document in place.
    '''
    dept_conn = connectMon.MongoConn({'db_name': 'EduCrawl',
                                      'coll_name': 'Department'})
    # Pages shorter than ~1000 chars are usually redirect stubs (see docstring).
    dept_conn.query({"$where": "this.Page.length<1000"})
    ##soupypages.makeSoup(short_pages[0][1].strip(r')|(').split(',')[0])
    for i in range(dept_conn.LastQLen):
        # NOTE(review): cursor-style .next() — presumably a pymongo cursor; verify.
        cp = dept_conn.LastQ.next()
        if cp['Page']:
            soup = soupypages.makeSoup(cp['Page'])
            meta = soup['soup'].find_all('meta')
            for m in meta:
                try:
                    if m['http-equiv'].lower() == 'refresh':
                        try:
                            # BUG FIX: str.find returns -1 when absent (truthy,
                            # which then crashed re.split(...)[1] with IndexError)
                            # and 0 when 'url' is at the start (falsy, wrongly
                            # skipping the redirect). Compare against -1 explicitly.
                            if m['content'].lower().find('url') != -1:
                                cp['Link'] = soupypages_helper.prepend_url(
                                    cp['Link'],
                                    re.split(u'url=', m['content'].lower())[1])
                                new_page = soupypages.makePage(cp['Link'])
                                if new_page['pass']:
                                    cp['Page'] = new_page['page']
                                    dept_conn.coll.update({"_id": cp["_id"]}, cp)
                                    print(cp["_id"])
                                    # One successful repair per document.
                                    break
                        except KeyError:
                            # Tag lacked 'content'; try the next meta tag.
                            pass
                except KeyError:
                    # Tag lacked 'http-equiv'; try the next meta tag.
                    pass
def get_soup(obj, overw=False):
    """Ensure ``obj`` carries a soup attribute, fetching the page if needed.

    If ``obj`` already has a soup and ``overw`` is false, nothing is done
    beyond printing a hint. Otherwise the page is fetched (when absent)
    and souped via ``taste_soup(obj, soupypages.makeSoup(obj.page))``.

    Returns the (possibly updated) ``obj``.
    """
    # Guard clause: soup exists and caller did not ask to overwrite.
    if 'soup' in dir(obj) and overw != True:
        print('Soup already on table. To reorder, call with overw=True')
        return obj
    if 'page' in dir(obj):
        return taste_soup(obj, soupypages.makeSoup(obj.page))
    # No page yet: fetch it first, then recurse to build the soup.
    obj = get_page(obj)
    return get_soup(obj)
def manual_redirect_02():
    '''
    Check for pages with incorrect links that provide re-directs that were
    not followed for some reason by urllib2. 1000 seems to be an ideal
    cutoff length for page size. "Dead zone" from 1000-3000ish.

    Walks Department documents whose stored Page is short enough to be a
    <meta http-equiv="refresh"> stub, follows the embedded redirect, and
    updates the document with the re-fetched page.
    '''
    dept_conn = connectMon.MongoConn({
        'db_name': 'EduCrawl',
        'coll_name': 'Department'
    })
    # Short pages are usually redirect stubs (see docstring for the cutoff).
    dept_conn.query({"$where": "this.Page.length<1000"})
    ##soupypages.makeSoup(short_pages[0][1].strip(r')|(').split(',')[0])
    for i in range(dept_conn.LastQLen):
        # NOTE(review): cursor-style .next() — presumably a pymongo cursor; verify.
        cp = dept_conn.LastQ.next()
        if cp['Page']:
            soup = soupypages.makeSoup(cp['Page'])
            meta = soup['soup'].find_all('meta')
            for m in meta:
                try:
                    if m['http-equiv'].lower() == 'refresh':
                        try:
                            # BUG FIX: str.find returns -1 when 'url' is absent
                            # (truthy, leading to IndexError on re.split(...)[1])
                            # and 0 when it is at position 0 (falsy, wrongly
                            # skipping the redirect). Compare explicitly.
                            if m['content'].lower().find('url') != -1:
                                cp['Link'] = soupypages_helper.prepend_url(
                                    cp['Link'],
                                    re.split(u'url=', m['content'].lower())[1])
                                new_page = soupypages.makePage(cp['Link'])
                                if new_page['pass']:
                                    cp['Page'] = new_page['page']
                                    dept_conn.coll.update({"_id": cp["_id"]}, cp)
                                    print(cp["_id"])
                                    # One successful repair per document.
                                    break
                        except KeyError:
                            # Tag lacked 'content'; try the next meta tag.
                            pass
                except KeyError:
                    # Tag lacked 'http-equiv'; try the next meta tag.
                    pass