def create_new_cursor(conn):
    """Open a cursor on ``conn``.

    Returns whatever ``conn.cursor()`` produces, or ``None`` when that call
    raises; the exception is reported through ``g_log.error``.
    """
    g_log.debug('Now try to create a new cur.')
    try:
        return conn.cursor()
    except Exception as err:
        g_log.error(err)
    return None
def open_url(URL):
    """Open ``URL`` with ``urlopen`` and hand back the result.

    Returns the object returned by ``urlopen(URL)``; if anything raises
    (including while formatting the debug message), the exception is logged
    and ``None`` is returned instead.
    """
    try:
        g_log.debug('Connecting the %s' % URL)
        return urlopen(URL)
    except Exception as err:
        g_log.error(err)
        return None
def GoThroughPage(self):
    """Scroll every ``blank-row`` element of the current page into view.

    Each element is scrolled to via JavaScript, followed by a one-second
    pause. Returns ``True`` when the walk completes, ``False`` (after
    logging the error) if any step raises.
    """
    scroll_js = "arguments[0].scrollIntoView();"
    try:
        g_log.debug('Now get all the hidden items in page')
        # On the Taobao page the blank rows carry the class "blank-row".
        placeholders = self.driver.find_elements_by_class_name('blank-row')
        for row in placeholders:
            self.driver.execute_script(scroll_js, row)
            time.sleep(1)
    except Exception as err:
        g_log.error(err)
        return False
    return True
def getExternalLinks(bsObj, excludeUrl):
    """Collect the distinct external ``href`` values found in ``bsObj``.

    A link counts as external when its ``href`` starts with ``http``,
    ``https`` or ``www`` and does not contain ``excludeUrl`` anywhere.

    Args:
        bsObj: object exposing ``findAll(name, href=<compiled regex>)`` whose
            results expose ``attrs['href']``.
        excludeUrl: text spliced verbatim (NOT regex-escaped) into the
            pattern as the part that must not occur in the link.

    Returns:
        List of unique hrefs in the order they were found. An empty list is
        returned when the lookup (or the pattern compilation) raises; the
        error is logged via ``g_log.error``.
    """
    g_log.debug("get the external links exclude url: %s" % excludeUrl)
    # find out all the http/https/www links and exclude the current url
    try:
        links = bsObj.findAll(
            "a",
            href=re.compile("^(http|www|https)((?!" + excludeUrl + ").)*$"))
    except Exception as e:
        g_log.error(e)
        # Without this return ``links`` would be unbound below and the
        # function would die with UnboundLocalError instead of reporting
        # "no links" like getInternalLinks does.
        return []

    externalLinks = []
    for link in links:
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks
def waitForLoad(self):
    """Pause, then report whether the page looks fully loaded.

    Sleeps ``TIME_OUT`` seconds, reads ``document.readyState`` and
    ``jQuery.active`` through the driver, and returns ``True`` only when the
    document state is ``'complete'`` and jQuery reports zero active requests.
    If reading ``jQuery.active`` fails, the failure is logged and the count is
    treated as zero.
    """
    g_log.debug("wait for the page is loaded.")
    time.sleep(TIME_OUT)
    ready_state = self.driver.execute_script('return document.readyState;')
    try:
        pending_ajax = int(
            self.driver.execute_script('return jQuery.active;'))
    except Exception as err:
        g_log.error("get jquery state error: %s" % err)
        pending_ajax = 0
    return pending_ajax == 0 and ready_state == 'complete'
def getInternalLinks(bsObj, includeUrl):
    """Collect the distinct internal ``href`` values found in ``bsObj``.

    A link counts as internal when its ``href`` does not start with ``http``
    and either starts with ``/`` or contains ``includeUrl`` (which is spliced
    into the regular expression as-is). Returns the unique hrefs in discovery
    order, or an empty list when the lookup raises (the error is logged).
    """
    g_log.debug("get the internal links include url: %s" % includeUrl)
    # find out the links start with /
    try:
        anchors = bsObj.findAll(
            "a",
            href=re.compile("^(?!http|https)(/|.*" + includeUrl + ")"))
    except Exception as err:
        g_log.error(err)
        return []

    found = []
    for anchor in anchors:
        target = anchor.attrs['href']
        if target is None or target in found:
            continue
        found.append(target)
    return found
def GoToSearchPage(self, page_id):
    """Jump to result page ``page_id`` using the page-number form.

    Types ``int(page_id)`` into the element with class ``J_Input``, clicks
    the element with class ``J_Submit`` and then calls ``waitForLoad``.
    Returns ``True`` when every step ran, ``False`` when ``page_id`` is below
    1 or any step raised (both cases are logged).
    """
    try:
        if page_id < 1:
            g_log.error('page id is not correct %d' % page_id)
            return False

        g_log.debug('Now get the page Input bar')
        page_box = self.driver.find_element_by_class_name('J_Input')
        page_box.clear()
        page_box.send_keys(int(page_id))

        g_log.debug('Now Click the submit button')
        go_button = self.driver.find_element_by_class_name('J_Submit')
        go_button.click()

        self.waitForLoad()
    except Exception as err:
        g_log.error(err)
        return False
    return True
def __init__(self, driver=None, proxy=None):
    """Attach a web driver to this object, creating a PhantomJS one if needed.

    Args:
        driver: an existing driver to reuse. When ``None`` a new
            ``webdriver.PhantomJS`` is started with its resource timeout set
            to ``TIME_OUT * 1000``.
        proxy: only used when a new driver is created. Indexable pair where
            ``proxy[0]`` is passed as ``--proxy=`` (ip:port) and ``proxy[1]``
            as ``--proxy-type=`` (http or https). ``None`` means no proxy.

    In every case the page-load timeout of the resulting driver is set to
    ``TIME_OUT``.
    """
    if driver is None:
        # Work on a copy: DesiredCapabilities.PHANTOMJS is a dict shared at
        # class level, so writing into it directly would leak this timeout
        # into every other driver created in the same process.
        cap = webdriver.DesiredCapabilities.PHANTOMJS.copy()
        # A custom user agent can be set the same way through
        # cap["phantomjs.page.settings.userAgent"] if ever needed.
        cap["phantomjs.page.settings.resourceTimeout"] = TIME_OUT * 1000
        g_log.debug(cap)
        if proxy is None:
            self.driver = webdriver.PhantomJS(desired_capabilities=cap)
        else:
            # proxy should be either http or https, and with ip:port
            # information
            g_log.debug(proxy[0] + ' ' + proxy[1])
            self.driver = webdriver.PhantomJS(
                service_args=[
                    '--proxy=' + proxy[0],
                    '--proxy-type=' + proxy[1],
                ],
                desired_capabilities=cap)
    else:
        self.driver = driver
    self.driver.set_page_load_timeout(TIME_OUT)
def create_new_connect(host='127.0.0.1', unix_socket='/var/run/mysqld/mysqld.sock', user='******', passwd='#####', db='mysql'):
    """Open a PyMySQL connection using UTF-8 / unicode settings.

    Args:
        host: server host name or address.
        unix_socket: path of the server's unix socket.
        user: account name.
        passwd: account password (never written to the log).
        db: schema to select after connecting.

    Returns:
        The connection returned by ``pymysql.connect``, or ``None`` when the
        attempt raises (the exception is logged via ``g_log.error``).
    """
    # The password is masked on purpose: debug logs must not carry
    # clear-text credentials.
    g_log.debug('now try to connect the db with %s, %s, %s, %s, %s' %
                (host, unix_socket, user, '<hidden>', db))
    try:
        conn = pymysql.connect(host=host,
                               unix_socket=unix_socket,
                               user=user,
                               passwd=passwd,
                               db=db,
                               use_unicode=True,
                               charset="utf8")
        return conn
    except Exception as e:
        g_log.error(e)
        return None
def SearchItems(self, words):
    """Type ``words`` into the search bar and submit the search.

    Refuses (warning + ``False``) while ``self.element`` equals ``None``.
    Otherwise fills the element with id ``q``, clicks the element with class
    ``icon-btn-search`` and calls ``waitForLoad``. Returns ``True`` when all
    steps ran and ``False`` (after logging) when any of them raised.
    """
    if self.element == None:
        g_log.warn("The main page has not opened when try to search items")
        return False

    try:
        # search bar is with id 'q'
        g_log.debug('Now get the search bar')
        query_box = self.driver.find_element_by_xpath('//*[@id="q"]')
        query_box.clear()
        query_box.send_keys(words)

        g_log.debug('Now get the search button')
        submit = self.driver.find_element_by_class_name('icon-btn-search')
        submit.click()

        self.waitForLoad()
        return True
    except Exception as err:
        g_log.error(err)
        return False
def get_every_page(page_item, page_loc, db_conn, db_cursor):
    """Scrape one search-result page and store its products in the database.

    Goes to result page ``page_loc``, scrolls through it, then for every
    element with class ``PRODUCT_PANEL`` reads name, currency, price, link
    and sell count, writes them to ``cellphones`` and writes each
    ``feature-item`` text to ``phone_features``.

    All SQL values are passed as bound parameters: the values are scraped
    page text and must never be spliced into the statement.

    Args:
        page_item: page object providing ``GoToSearchPage``,
            ``GoThroughPage`` and a ``driver`` attribute.
        page_loc: result-page number handed to ``GoToSearchPage``.
        db_conn: not used by this function; kept for interface
            compatibility.
        db_cursor: DB-API cursor using ``%s`` placeholders (as PyMySQL
            does) and exposing ``connection.commit()``.

    Returns:
        ``False`` when an exception aborts the page (it is logged);
        ``None`` otherwise. This contract is kept as-is for callers.
    """
    try:
        page_item.GoToSearchPage(page_loc)
        page_item.GoThroughPage()
        products = page_item.driver.find_elements_by_class_name(PRODUCT_PANEL)
        for product_item in products:
            product_name = product_currency = product_price = None
            product_link = product_num = None
            try:
                full_name = product_item.find_element_by_xpath(
                    './/div[2]/div[1]/a').text.strip()
                first_feature = product_item.find_element_by_xpath(
                    'div[2]/div[1]/a/span/span[1]').text.strip()
                # The anchor text is the name followed by the feature tags;
                # keep only the part before the first feature.
                product_name = full_name[
                    0:full_name.index(first_feature)].strip()
                product_currency = product_item.find_element_by_xpath(
                    './/div[2]/div[1]/span[2]/span').text.strip()
                product_price = product_item.find_element_by_xpath(
                    './/div[2]/div[1]/span[2]/strong').text.strip()
                product_link = product_item.find_element_by_class_name(
                    'product-title').get_attribute('href')
                product_num = product_item.find_element_by_xpath(
                    './/div[2]/div[2]/div[2]/span/span').text.strip()
            except Exception as e:
                # Partial result: whatever was read before the failure is
                # kept, the remaining fields get defaults below.
                g_log.error(e)

            # A product without a name cannot be stored; every other field
            # falls back to a default. Applied after the try so a None href
            # returned by get_attribute is covered as well.
            if product_name is None:
                g_log.error("Get Product Name Fail")
                continue
            if product_currency is None:
                g_log.error("Get Product Currency Fail")
                product_currency = u'¥'
            if product_price is None:
                g_log.error("Get price Fail")
                product_price = 0
            if product_link is None:
                g_log.error("Get link Fail")
                product_link = ''
            if product_num is None:
                g_log.error("Get monthly sell num fail")
                product_num = 0

            g_log.debug(
                "Now fetched product info are %s, %s, %s, link: %s, %s" %
                (product_name, product_currency, product_price, product_link,
                 product_num))
            g_log.debug('Now add cellphones into DB')
            db_cursor.execute(
                'replace into cellphones '
                '(name, search_link, currency, price, sell_num) '
                'values (%s, %s, %s, %s, %s)',
                (product_name, product_link, product_currency,
                 int(product_price), product_num))
            db_cursor.connection.commit()

            # Feature rows are best-effort: a failure here is logged and the
            # loop moves on to the next product.
            try:
                db_cursor.execute(
                    'select id from cellphones where name=%s',
                    (product_name,))
                product_id = db_cursor.fetchone()[0]
                g_log.debug('Now add features into DB %d' % product_id)
                product_feature = []
                features = product_item.find_elements_by_class_name(
                    "feature-item")
                for item in features:
                    feature_text = item.text
                    product_feature.append(feature_text)
                    db_cursor.execute(
                        'replace into phone_features '
                        '(cellphone_id, feature_item) values (%s, %s)',
                        (int(product_id), feature_text))
                g_log.debug(product_feature)
                db_cursor.connection.commit()
            except Exception as e:
                g_log.error(e)
    except Exception as e:
        g_log.error(e)
        return False
def close_conn(conn):
    """Log the release and call ``conn.close()``."""
    g_log.debug("now release connection")
    conn.close()
def close_cursor(cursor):
    """Log the release and call ``cursor.close()``."""
    g_log.debug("now release cursor")
    cursor.close()