Beispiel #1
0
def create_new_cursor(conn):
    '''
    Ask ``conn`` for a new cursor.

    Returns the cursor produced by ``conn.cursor()``; if that call raises,
    the exception is logged and None is returned instead.
    '''
    g_log.debug('Now try to create a new cur.')
    try:
        new_cursor = conn.cursor()
    except Exception as err:
        g_log.error(err)
        return None
    return new_cursor
Beispiel #2
0
def open_url(URL):
    '''
    [Description]: call urlopen on URL and hand back whatever it returns;
    any exception is logged and None is returned instead
    '''
    try:
        # the log call stays inside the try so a formatting error is handled too
        g_log.debug('Connecting the %s' % URL)
        response = urlopen(URL)
    except Exception as err:
        g_log.error(err)
        return None
    return response
Beispiel #3
0
 def GoThroughPage(self):
     '''
     Scroll every element with class "blank-row" into view, one at a time.

     Waits one second after each scroll. Returns True when all rows were
     visited, or False (after logging the error) if any driver call raised.
     '''
     try:
         g_log.debug('Now get all the hidden items in page')
         # the taobao page marks its blank rows with the class "blank-row"
         placeholder_rows = self.driver.find_elements_by_class_name('blank-row')
         for row in placeholder_rows:
             self.driver.execute_script("arguments[0].scrollIntoView();", row)
             time.sleep(1)
     except Exception as err:
         g_log.error(err)
         return False
     return True
Beispiel #4
0
def getExternalLinks(bsObj, excludeUrl):
    '''
    Collect the distinct external hrefs of all <a> tags found in ``bsObj``.

    A link counts as external when its href starts with http, https or www
    and the literal text ``excludeUrl`` does not occur after that prefix.

    bsObj: object exposing a BeautifulSoup-style ``findAll``.
    excludeUrl: text to exclude; it is matched literally, not as a regex.

    Returns a list of hrefs in first-seen order, or an empty list when the
    lookup fails (the error is logged).
    '''
    g_log.debug("get the external links exclude url: %s" % excludeUrl)
    try:
        # re.escape keeps characters such as '.', '?' or '+' in the url from
        # being interpreted as regex syntax.
        pattern = re.compile("^(http|www|https)((?!" +
                             re.escape(excludeUrl) + ").)*$")
        links = bsObj.findAll("a", href=pattern)
    except Exception as e:
        g_log.error(e)
        # Without this return, ``links`` would be unbound below (NameError).
        return []

    externalLinks = []
    seen = set()
    for link in links:
        href = link.attrs['href']
        if href is not None and href not in seen:
            seen.add(href)
            externalLinks.append(href)
    return externalLinks
Beispiel #5
0
 def waitForLoad(self):
     '''
     Sleep for TIME_OUT, then report whether the page looks fully loaded.

     The page counts as loaded when ``document.readyState`` is 'complete'
     and ``jQuery.active`` is 0. If reading ``jQuery.active`` raises, the
     error is logged and the value is treated as 0.

     Returns True or False.
     '''
     g_log.debug("wait for the page is loaded.")
     time.sleep(TIME_OUT)
     ready_state = self.driver.execute_script('return document.readyState;')
     try:
         jquery_active = int(
             self.driver.execute_script('return jQuery.active;'))
     except Exception as err:
         g_log.error("get jquery state error: %s" % err)
         jquery_active = 0
     return jquery_active == 0 and ready_state == 'complete'
Beispiel #6
0
def getInternalLinks(bsObj, includeUrl):
    '''
    Collect the distinct internal hrefs of all <a> tags found in ``bsObj``.

    A link counts as internal when its href does not start with http and
    either starts with "/" or contains the literal text ``includeUrl``.

    bsObj: object exposing a BeautifulSoup-style ``findAll``.
    includeUrl: text to look for; it is matched literally, not as a regex.

    Returns a list of hrefs in first-seen order, or an empty list when the
    lookup fails (the error is logged).
    '''
    g_log.debug("get the internal links include url: %s" % includeUrl)
    try:
        # re.escape keeps characters such as '.', '?' or '+' in the url from
        # being interpreted as regex syntax (which could alter the match or
        # make the pattern fail to compile).
        pattern = re.compile("^(?!http|https)(/|.*" +
                             re.escape(includeUrl) + ")")
        links = bsObj.findAll("a", href=pattern)
    except Exception as e:
        g_log.error(e)
        return []

    internalLinks = []
    seen = set()
    for link in links:
        href = link.attrs['href']
        if href is not None and href not in seen:
            seen.add(href)
            internalLinks.append(href)
    return internalLinks
Beispiel #7
0
 def GoToSearchPage(self, page_id):
     '''
     Type ``page_id`` into the 'J_Input' element and click 'J_Submit'.

     A ``page_id`` below 1 is rejected. After the click, ``waitForLoad`` is
     called; its result is not inspected.

     Returns True when every step ran, False when ``page_id`` was rejected
     or any step raised (the error is logged).
     '''
     try:
         # the check lives inside the try so a bad page_id type is handled too
         if page_id < 1:
             g_log.error('page id is not correct %d' % page_id)
             return False
         g_log.debug('Now get the page Input bar')
         page_box = self.driver.find_element_by_class_name('J_Input')
         page_box.clear()
         page_box.send_keys(int(page_id))
         g_log.debug('Now Click the submit button')
         self.driver.find_element_by_class_name('J_Submit').click()
         self.waitForLoad()
     except Exception as err:
         g_log.error(err)
         return False
     return True
Beispiel #8
0
 def __init__(self, driver=None, proxy=None):
     '''
     Attach to an existing web driver or start a new PhantomJS one.

     driver: an existing driver to reuse; its page-load timeout is set to
         TIME_OUT. When None, a PhantomJS driver is created.
     proxy: optional pair used only when a driver is created; ``proxy[0]``
         is passed as --proxy (ip:port) and ``proxy[1]`` as --proxy-type
         (http or https).
     '''
     if driver is not None:
         self.driver = driver
         self.driver.set_page_load_timeout(TIME_OUT)
         return

     # NOTE(review): no page-load timeout is set on a driver created here,
     # only the PhantomJS resource timeout below - confirm this is intended.

     # Work on a copy: DesiredCapabilities.PHANTOMJS is a class-level dict
     # shared by every user of selenium in this process.
     cap = webdriver.DesiredCapabilities.PHANTOMJS.copy()
     # A custom user agent could be set the same way through the
     # "phantomjs.page.settings.userAgent" key.
     # TIME_OUT * 1000: presumably seconds converted to milliseconds.
     cap["phantomjs.page.settings.resourceTimeout"] = TIME_OUT * 1000
     g_log.debug(cap)

     if proxy is None:
         self.driver = webdriver.PhantomJS(desired_capabilities=cap)
     else:
         # proxy should be either http or https, and with ip:port information
         g_log.debug(proxy[0] + '   ' + proxy[1])
         self.driver = webdriver.PhantomJS(
             service_args=['--proxy=' + proxy[0],
                           '--proxy-type=' + proxy[1]],
             desired_capabilities=cap)
Beispiel #9
0
def create_new_connect(host='127.0.0.1',
                       unix_socket='/var/run/mysqld/mysqld.sock',
                       user='******',
                       passwd='#####',
                       db='mysql'):
    '''
    Open a pymysql connection (unicode enabled, utf8 charset).

    host, unix_socket, user, passwd, db: forwarded unchanged to
        ``pymysql.connect``.

    Returns the connection, or None when connecting raises (the error is
    logged).
    '''
    # The password is masked so credentials never end up in the log.
    masked_passwd = '***' if passwd else ''
    g_log.debug('now try to connect the db with %s, %s, %s, %s, %s' %
                (host, unix_socket, user, masked_passwd, db))
    try:
        conn = pymysql.connect(host=host,
                               unix_socket=unix_socket,
                               user=user,
                               passwd=passwd,
                               db=db,
                               use_unicode=True,
                               charset="utf8")
    except Exception as e:
        g_log.error(e)
        return None
    return conn
Beispiel #10
0
 def SearchItems(self, words):
     '''
     Type ``words`` into the search bar and click the search button.

     Refuses to run while ``self.element`` is None. After the click,
     ``waitForLoad`` is called; its result is not inspected.

     Returns True when every step ran, False when ``self.element`` is None
     or any step raised (the error is logged).
     '''
     if self.element is None:
         g_log.warn("The main page has not opened when try to search items")
         return False

     try:
         g_log.debug('Now get the search bar')
         # the search bar is the element with id 'q'
         search_box = self.driver.find_element_by_xpath('//*[@id="q"]')
         search_box.clear()
         search_box.send_keys(words)
         g_log.debug('Now get the search button')
         self.driver.find_element_by_class_name('icon-btn-search').click()
         self.waitForLoad()
     except Exception as err:
         g_log.error(err)
         return False
     return True
Beispiel #11
0
def get_every_page(page_item, page_loc, db_conn, db_cursor):
    '''
    Go to result page ``page_loc`` first, then store every product found on
    it in the local db.

    page_item: page object providing GoToSearchPage, GoThroughPage and a
        ``driver`` attribute.
    page_loc: page number handed to ``page_item.GoToSearchPage``.
    db_conn: not used by this function; commits go through
        ``db_cursor.connection``.
    db_cursor: cursor used for all statements. Values are bound with the
        driver's ``%s`` placeholders rather than pasted into the SQL text.

    Each product is written to ``cellphones`` and its feature texts to
    ``phone_features``. A product whose name cannot be read is skipped;
    other unreadable fields fall back to defaults. A failure while storing
    the features of one product is logged and the loop continues.

    Returns False when an error outside those handled cases aborts the
    page; otherwise returns None.
    '''
    try:
        page_item.GoToSearchPage(page_loc)
        page_item.GoThroughPage()
        products = page_item.driver.find_elements_by_class_name(PRODUCT_PANEL)
        for product_item in products:
            product_name = product_currency = product_price = product_link = product_num = None
            try:
                full_name = product_item.find_element_by_xpath(
                    './/div[2]/div[1]/a').text.strip()
                first_feature = product_item.find_element_by_xpath(
                    'div[2]/div[1]/a/span/span[1]').text.strip()
                # the name is the part of the title before the first feature
                product_name = full_name[0:full_name.index(first_feature
                                                           )].strip()
                product_currency = product_item.find_element_by_xpath(
                    './/div[2]/div[1]/span[2]/span').text.strip()
                product_price = product_item.find_element_by_xpath(
                    './/div[2]/div[1]/span[2]/strong').text.strip()
                product_link = product_item.find_element_by_class_name(
                    'product-title').get_attribute('href')
                product_num = product_item.find_element_by_xpath(
                    './/div[2]/div[2]/div[2]/span/span').text.strip()
            except Exception:
                # Fields are read in order, so everything from the failing
                # lookup onwards is still None here.
                if product_name is None:
                    g_log.error("Get Product Name Fail")
                    continue
                if product_currency is None:
                    g_log.error("Get Product Currency Fail")
                    product_currency = u'¥'
                if product_price is None:
                    g_log.error("Get price Fail")
                    product_price = 0
                if product_link is None:
                    g_log.error("Get link Fail")
                    product_link = ''
                if product_num is None:
                    g_log.error("Get monthly sell num fail")
                    product_num = 0

            g_log.debug(
                "Now fetched product info are %s, %s, %s, link: %s, %s" %
                (product_name, product_currency, product_price, product_link,
                 product_num))
            g_log.debug('Now add cellphones into DB')
            # Scraped text may contain quotes, so let the driver escape it.
            db_cursor.execute(
                'replace into cellphones (name, search_link, currency, price, sell_num) values (%s, %s, %s, %s, %s)',
                (product_name, product_link, product_currency,
                 int(product_price), product_num))
            db_cursor.connection.commit()
            try:
                db_cursor.execute(
                    'select id from cellphones where name=%s',
                    (product_name, ))
                product_id = db_cursor.fetchone()[0]
                g_log.debug('Now add features into DB %d' % product_id)
                product_feature = []
                features = product_item.find_elements_by_class_name(
                    "feature-item")
                for item in features:
                    feature_text = item.text
                    product_feature.append(feature_text)
                    db_cursor.execute(
                        'replace into phone_features (cellphone_id, feature_item) values (%s, %s)',
                        (int(product_id), feature_text))
                g_log.debug(product_feature)
                db_cursor.connection.commit()
            except Exception as e:
                g_log.error(e)
    except Exception as e:
        g_log.error(e)
        return False
Beispiel #12
0
def close_conn(conn):
    '''
    Log the release, then call ``conn.close()``.
    '''
    g_log.debug('now release connection')
    conn.close()
Beispiel #13
0
def close_cursor(cursor):
    '''
    Log the release, then call ``cursor.close()``.
    '''
    g_log.debug('now release cursor')
    cursor.close()