# NOTE: these methods assume the enclosing module imports urllib, urllib2,
# sys, json, htmlops, and BeautifulSoup, and defines SITE_NAME,
# PYTHON_VERIFICATION_CODE, and get_website_name_for_tracker_update.
def get_tracker_info(self, website):
    url_to_call = SITE_NAME + '/tracker_infos/get_tracker_info_from_script/' + PYTHON_VERIFICATION_CODE
    data = {'website': get_website_name_for_tracker_update(website)}
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    data = urllib.urlencode(data)
    req = urllib2.Request(url_to_call, data, headers)
    resultset = None
    try:
        response = urllib2.urlopen(req)
        response_page = response.read()
        clean_response_page = htmlops.removeCommentsAndJS(response_page)
        soup = BeautifulSoup(clean_response_page)
        # pull the divs that carry the tracker info and its CSS selectors
        div = soup.find('div', attrs={'id': '_tracker_info_'})
        csses_div = soup.find('div', attrs={'id': 'csses'})
        # build up the result set from the div attributes
        resultset = {
            'title_xpath': div['title_xpath'].encode("ascii", "ignore"),
            'title_and_price_xpath': div['title_and_price_xpath'].encode("ascii", "ignore"),
            'price_xpath': div['price_xpath'].encode("ascii", "ignore"),
            'pimg_xpath': div['pimg_xpath'].encode("ascii", "ignore"),
            'csses': json.loads(csses_div.text.encode("ascii", "ignore")),
            'details_xpath': div['details_xpath'].encode("ascii", "ignore")
        }
    except Exception:
        print "FATAL ERROR: Couldn't get tracker info"
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return resultset
def get_bloggers_url(self):
    url_to_call = SITE_NAME + '/bloggers/get_list_for_parsing/' + PYTHON_VERIFICATION_CODE
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url_to_call, None, headers)
    try:
        print url_to_call
        response = urllib2.urlopen(req)
        response_page = response.read()
        clean_response_page = htmlops.removeCommentsAndJS(response_page)
        soup = BeautifulSoup(clean_response_page)
        # pull one div per blogger
        main_div = soup.findAll('div', attrs={'id': '_parsing_urls_'})
        # build up the result set
        resultset = []
        for div in main_div:
            resultset.append({
                'blog': div['blog'].encode("ascii", "ignore"),
                'facebooklink': div['facebooklink'].encode("ascii", "ignore"),
                'twitterlink': div['twitterlink'].encode("ascii", "ignore"),
                'instagramlink': div['instagramlink'].encode("ascii", "ignore"),
                'pinterestlink': div['pinterestlink'].encode("ascii", "ignore"),
                'bloggerid': div['bloggerid'].encode("ascii", "ignore"),
                'ready_to_parse': div['ready_to_parse'].encode("ascii", "ignore")
            })
        # return the prepared url list
        return resultset
    except Exception:
        print "FATAL ERROR: Couldn't get url list"
        print "Unexpected error:", sys.exc_info()[0]
        raise
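# A minimal usage sketch for get_bloggers_url, assuming 'client' is an
# instance of the enclosing class (its name is not shown here) and that
# 'ready_to_parse' arrives as the string '1' when a blog is queued (both
# are assumptions, not confirmed by this file).
def _demo_list_parseable_blogs(client):
    for blogger in client.get_bloggers_url():
        if blogger['ready_to_parse'] == '1':
            print blogger['bloggerid'], blogger['blog']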
def get_pending_jobs(self, job_types_str):
    data = {'job_types_str': job_types_str}
    url_to_call = SITE_NAME + '/backend_ops/get_pending_jobs/' + PYTHON_VERIFICATION_CODE
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url_to_call, urllib.urlencode(data), headers)
    resultset = []
    try:
        response = urllib2.urlopen(req)
        response_page = response.read()
        clean_response_page = htmlops.removeCommentsAndJS(response_page)
        soup = BeautifulSoup(clean_response_page)
        # pull one div per pending job
        divs = soup.findAll('div', attrs={'class': '_single_backend_op_'})
        # build up the result set
        for div in divs:
            resultset.append({
                'type': div['type'].encode("ascii", "ignore"),
                'url': div['url'].encode("ascii", "ignore"),
                'pid': div['pid'].encode("ascii", "ignore"),
                'id': div['id'].encode("ascii", "ignore"),
            })
    except Exception:
        print "FATAL ERROR: Couldn't get pending jobs"
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return resultset
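# A hedged dispatch sketch for get_pending_jobs: the job-type strings and the
# handling below are hypothetical; this file only shows that each job carries
# 'type', 'url', 'pid', and 'id' fields.
def _demo_dispatch_jobs(client):
    for job in client.get_pending_jobs('get_prod_detail,update_prod_info'):
        if job['type'] == 'get_prod_detail':
            print "would fetch details for", job['url']
        else:
            print "skipping job", job['id'], "of type", job['type']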
def update_get_prod_detail_jobs(self, prod_page_links):
    # POST the links as numbered form fields: {'0': link0, '1': link1, ...}
    data = {}
    for key_index, link in enumerate(prod_page_links):
        data[str(key_index)] = link
    url_to_call = SITE_NAME + '/backend_ops/add_get_prod_detail_jobs/' + PYTHON_VERIFICATION_CODE
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url_to_call, urllib.urlencode(data), headers)
    retval = '0'
    try:
        response = urllib2.urlopen(req)
        response_page = response.read()
        clean_response_page = htmlops.removeCommentsAndJS(response_page)
        soup = BeautifulSoup(clean_response_page)
        # the server reports success via a single result div
        div = soup.find('div', attrs={'class': '_result_'})
        retval = div['retval'].encode("ascii", "ignore")
    except Exception:
        print "FATAL ERROR: Couldn't add get_prod_detail jobs"
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return '1' == retval
def update_prod_info_in_db(self, prod_info):
    url_to_call = SITE_NAME + '/products/update_prod_info_from_script/' + PYTHON_VERIFICATION_CODE
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url_to_call, urllib.urlencode(prod_info), headers)
    retval = '0'
    try:
        response = urllib2.urlopen(req)
        response_page = response.read()
        clean_response_page = htmlops.removeCommentsAndJS(response_page)
        soup = BeautifulSoup(clean_response_page)
        # the server reports success via a single result div
        div = soup.find('div', attrs={'class': '_result_'})
        retval = div['retval'].encode("ascii", "ignore")
    except Exception:
        print "FATAL ERROR: Couldn't update prod info"
        print "Unexpected error:", sys.exc_info()[0]
        raise
    return '1' == retval
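# A sketch of the enqueue/report round trip implied by the two methods above,
# assuming 'client' is an instance of this class; the prod_info field names
# are guesses, since update_prod_info_in_db simply url-encodes whatever dict
# it is handed.
def _demo_report_product(client, prod_page_links):
    if client.update_get_prod_detail_jobs(prod_page_links):
        prod_info = {'url': prod_page_links[0], 'title': 'Example', 'price': '9.99'}
        if not client.update_prod_info_in_db(prod_info):
            print "server rejected the product update"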
# NOTE: this fuller definition shadows the get_tracker_info defined above;
# being later in the class body, it is the one that takes effect.
def get_tracker_info(self, website):
    url_to_call = SITE_NAME + '/tracker_infos/get_tracker_info_from_script/' + PYTHON_VERIFICATION_CODE
    data = {'website': get_website_name_for_tracker_update(website)}
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    data = urllib.urlencode(data)
    req = urllib2.Request(url_to_call, data, headers)
    resultset = None
    try:
        response = urllib2.urlopen(req)
        response_page = response.read()
        clean_response_page = htmlops.removeCommentsAndJS(response_page)
        soup = BeautifulSoup(clean_response_page)
        # pull the divs that carry the tracker info and its CSS selectors
        div = soup.find('div', attrs={'id': '_tracker_info_'})
        csses_div = soup.find('div', attrs={'id': 'csses'})
        # build up the result set from the div attributes
        resultset = {
            'titlexpath': div['titlexpath'].encode("ascii", "ignore"),
            'title_price_xpath': div['title_price_xpath'].encode("ascii", "ignore"),
            'pricexpath': div['pricexpath'].encode("ascii", "ignore"),
            'oldpricexpath': div['oldpricexpath'].encode("ascii", "ignore"),
            'pimg_xpath': div['pimg_xpath'].encode("ascii", "ignore"),
            'pimg_xpath1': div['pimg_xpath1'].encode("ascii", "ignore"),
            'pimg_xpath2': div['pimg_xpath2'].encode("ascii", "ignore"),
            'pimg_xpath3': div['pimg_xpath3'].encode("ascii", "ignore"),
            'pimg_xpath4': div['pimg_xpath4'].encode("ascii", "ignore"),
            'pimg_xpath5': div['pimg_xpath5'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath1': div['urllib2_pimg_xpath1'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath2': div['urllib2_pimg_xpath2'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath3': div['urllib2_pimg_xpath3'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath4': div['urllib2_pimg_xpath4'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath5': div['urllib2_pimg_xpath5'].encode("ascii", "ignore"),
            'image_and_title_parent_xpath': div['image_and_title_parent_xpath'].encode("ascii", "ignore"),
            'image_and_details_container_xpath': div['image_and_details_container_xpath'].encode("ascii", "ignore"),
            'csses': json.loads(csses_div.text.encode("ascii", "ignore")),
            'details_xpath': div['details_xpath'].encode("ascii", "ignore"),
            'title_xpath_regex': div['titlexpath_regex'].encode("ascii", "ignore"),
            'title_and_price_xpath_regex': div['title_price_xpath_regex'].encode("ascii", "ignore"),
            'pricexpath_regex': div['pricexpath_regex'].encode("ascii", "ignore"),
            'oldpricexpath_regex': div['oldpricexpath_regex'].encode("ascii", "ignore"),
            'pimg_xpath_regex': div['pimg_xpath_regex'].encode("ascii", "ignore"),
            'pimg_xpath1_regex': div['pimg_xpath1_regex'].encode("ascii", "ignore"),
            'pimg_xpath2_regex': div['pimg_xpath2_regex'].encode("ascii", "ignore"),
            'pimg_xpath3_regex': div['pimg_xpath3_regex'].encode("ascii", "ignore"),
            'pimg_xpath4_regex': div['pimg_xpath4_regex'].encode("ascii", "ignore"),
            'pimg_xpath5_regex': div['pimg_xpath5_regex'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath1_regex': div['urllib2_pimg_xpath1_regex'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath2_regex': div['urllib2_pimg_xpath2_regex'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath3_regex': div['urllib2_pimg_xpath3_regex'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath4_regex': div['urllib2_pimg_xpath4_regex'].encode("ascii", "ignore"),
            'urllib2_pimg_xpath5_regex': div['urllib2_pimg_xpath5_regex'].encode("ascii", "ignore"),
            'image_and_title_parent_xpath_regex': div['image_and_title_parent_xpath_regex'].encode("ascii", "ignore"),
            'image_and_details_container_xpath_regex': div['image_and_details_container_xpath_regex'].encode("ascii", "ignore"),
            'is_image_in_og_image_meta_tag': div['is_image_in_og_image_meta_tag'].encode("ascii", "ignore"),
            'pinterest_position': div['pinterest_position'].encode("ascii", "ignore")
        }
    except KeyError:
        print "################# Error Start ##################"
        print "Likely bad tracker info"
        print website
        print "################# Error End ##################"
    return resultset
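# A hedged sketch of how the downloaded tracker info might be consumed with
# lxml (lxml is an assumption; this file only fetches the xpaths, it does not
# show them being applied).
def _demo_extract_title(page_source, tracker_info):
    from lxml import html  # local import to keep this sketch self-contained
    tree = html.fromstring(page_source)
    matches = tree.xpath(tracker_info['titlexpath'])
    if not matches:
        return None
    first = matches[0]
    # xpath() can yield strings (e.g. from text()) or elements
    return first if isinstance(first, basestring) else first.text_content().strip()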