Example #1
def get_search_result_parser(base_url, page_idx):
    # substitute the requested page index into the URL template
    page_url = re.sub("PAGEIDX", str(page_idx), base_url)
    opener = proxy_opener()
    # equivalent to: urllib2.build_opener(urllib2.ProxyHandler({"http": "http://localhost:3128"})).open(page_url)
    html = opener.open(page_url)
    search_results_parser = BeautifulSoup(html)
    return search_results_parser
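Every example on this page assumes a proxy_opener() helper. Its definition is not shown here, but the commented-out line in Example #1 suggests it wraps urllib2's proxy machinery; a minimal sketch along those lines (Python 2; the localhost:3128 address comes from that comment and will vary per deployment) might be:

import urllib2

def proxy_opener():
    # Build an opener that routes HTTP traffic through a local proxy
    # (e.g. Squid on port 3128, as hinted in Example #1).
    return urllib2.build_opener(
        urllib2.ProxyHandler({"http": "http://localhost:3128"}))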
Example #2
def _get_url(url):
    """ Retrieves the page at the given url via the proxy opener. """
    # if not settings_local.PROXY:
    proxy_url = proxy_opener()
    html = proxy_url.open(url)
    # else:
    #     response = urllib2.urlopen(url)
    #     html = response.read()
    return html
Example #3
def __getHTMLPage_Containing_SearchResult(url_base, index_offset):
    # set up fields for any type of search
    search_results_per_page = 25
    # Python 2 integer division: offsets 0-24 map to page 1, 25-49 to page 2, ...
    search_page_num = str(1 + (index_offset / search_results_per_page))
    howFarDownThePage = index_offset % search_results_per_page
    url = url_base + "&page=" + search_page_num
    # use a proxy handler as developing behind a firewall
    proxy_url = proxy_opener()
    html = proxy_url.open(url)
    return html, howFarDownThePage
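A quick check of the pagination arithmetic above (Python 2 integer division; the offsets are made-up inputs):

# offset 0  -> page 1, position 0;  offset 24 -> page 1, position 24
# offset 25 -> page 2, position 0;  offset 60 -> page 3, position 10
for offset in (0, 24, 25, 60):
    page = 1 + (offset / 25)    # integer division in Python 2
    position = offset % 25
    print offset, '-> page', page, 'position', position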
Example #4
def __get_image_properties_from_imageSpecific_page(id):
    """ Slower but more thorough method for finding metadata """
    page_url = BASE_IMAGE_PROPERTIES_URL + "?asset=" + id
    proxy_url = proxy_opener()
    html = proxy_url.open(page_url)
    page_html_parser = BeautifulSoup(html)
    # check for style, because there are two divs with id "info"
    containing_div = page_html_parser.find('div', id="info", style=True)
    artist = containing_div.find('dd')    # first dd
    title = artist.findNextSibling('dd').findNextSibling('dd')
    date = title.findNextSibling('dd')    # note, not just numeric
    access = containing_div('dd')[-1]     # last dd in containing_div
    meta = {'artist': artist.renderContents(),
            'title': title.renderContents(),
            'date': date.renderContents(),
            'access': access.renderContents()}
    return (title.renderContents(), meta)
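The sibling walk above depends on the fixed order of <dd> elements inside the info div. A self-contained illustration of the same BeautifulSoup 3 navigation pattern (the HTML fragment is invented for demonstration):

from BeautifulSoup import BeautifulSoup

html = ('<div id="info" style="x">'
        '<dd>Artist</dd><dd>skip</dd><dd>Title</dd><dd>1905</dd><dd>Open</dd>'
        '</div>')
parser = BeautifulSoup(html)
div = parser.find('div', id="info", style=True)    # style=True requires the attribute
artist = div.find('dd')                            # first dd -> 'Artist'
title = artist.findNextSibling('dd').findNextSibling('dd')    # two dd siblings on -> 'Title'
print artist.renderContents(), title.renderContents()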
Example #5
            jobinfo = JobInfo.objects.get(id=job.arg)
        except Exception, ex:
            print 'oh no, i except'
    arg = json.loads(jobinfo.arg)
    record = Record.objects.get(id=arg['record'], manager='unitedsearch')
    try:
        # startswith is a method, so it must be called, not compared
        if jobinfo.status.startswith('Complete'):
            return
        url = arg['url']
        print 'ready to download image at: ' + url
        storage = get_storage()
        if storage:
            print 'storage in workers.py is valid at 32'
        else:
            print 'storage is invalid at 32'
        proxy = proxy_opener()
        # where you get a url error
        file = proxy.open(url)
        image_data = file.read()
        print 'unitedsearch.workers.py -- image_data: ' + str(len(image_data))
        size = len(image_data)

        image_file = StringIO.StringIO(image_data)
        if image_file:
            print 'have image file'
        else:
            print 'do not have image file'
        #size = file.info().get('content-length')
        #setattr(file, 'size', int(size if size else 0))
        setattr(image_file, 'size', int(size if size else 0))
        mimetype = file.info().get('content-type')
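The StringIO step above wraps the downloaded bytes in a file-like object and attaches the size attribute that storage backends typically expect. A standalone illustration of the pattern (Python 2; the payload is fabricated):

import StringIO

image_data = 'fake image bytes'    # stand-in for file.read() output
image_file = StringIO.StringIO(image_data)
# StringIO instances accept arbitrary attributes, so size can be bolted on
setattr(image_file, 'size', len(image_data))
print image_file.size, repr(image_file.read())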
Example #6
def _get_url(url):
    """ Retrieves the page at the given url via the proxy opener. """
    proxy_url = proxy_opener()
    html = proxy_url.open(url)
    return html