def get_page():
    """Visit every URL in the module-level ``data`` and record where it lands.

    Reads the module-level names ``data`` (iterated as ``(url, pid, mid)``
    tuples), ``num`` and ``table``.  For each URL the browser driver loads
    the page, the function waits 20 seconds, and then ``update_db`` is called
    with the driver, the original URL, the ids, the driver's current URL and
    the database handles.  URLs that raise while loading are logged and
    skipped.

    Does nothing when ``data`` is empty.  The database connection and the
    driver are released even if processing fails part-way through.
    Returns ``None``.
    """
    if not data:
        return

    conn, cur = mysql_database.connect()
    try:
        driver = phantomjs.start()
        try:
            counter = 0
            for url, pid, mid in data:
                try:
                    driver.get(url)
                    # Only successfully requested pages are counted.
                    counter += 1
                    print('{} visiting {}'.format(counter, url))
                    ## allow time for page to load (or possibly redirect)
                    time.sleep(20)
                except Exception as e:
                    logging.info(
                        'An error occured trying to visit {}. \n{}'.format(
                            url, e))
                    continue
                else:
                    current_url = driver.current_url
                    update_db(driver, url, pid, mid, current_url, conn, cur,
                              num, table)
        finally:
            # Always shut the browser down, otherwise a failure in update_db
            # would leave the driver process running.
            driver.quit()
    finally:
        mysql_database.disconnect(conn, cur)
    return
def check():
    """Visit every URL in the module-level ``data`` and record where it lands.

    Reads the module-level ``data`` (iterated as ``(url, pid)`` tuples).
    A PhantomJS driver is started with a 1124x850 window; for each URL the
    page is loaded, the function waits 20 seconds, and ``update_db`` is
    called with the original URL, the pid, the driver's current URL, the
    database handles and ``sys.argv[1]``.  URLs that raise while loading are
    reported on stdout and skipped.

    Does nothing when ``data`` is empty.  The database connection and the
    driver are released even if processing fails part-way through.
    Returns ``None``.
    """
    if not data:
        return

    conn, cur = mysql_database.connect()
    try:
        print('starting the driver...')
        driver = webdriver.PhantomJS(
            executable_path=local_path.phantomjs_path)
        try:
            driver.set_window_size(1124, 850)
            counter = 0
            for url, pid in data:
                try:
                    driver.get(url)
                    ## allow time for page to load (or possibly redirect)
                    # Only successfully requested pages are counted.
                    counter += 1
                    print('{} visiting {}'.format(counter, url))
                    time.sleep(20)
                except Exception as e:
                    print(str(e))
                    print('An error occured trying to visit {}.'.format(url))
                    continue
                else:
                    current_url = driver.current_url
                    update_db(url, pid, current_url, conn, cur, sys.argv[1])
        finally:
            # Always shut the browser down, otherwise a failure in update_db
            # would leave the PhantomJS process running.
            driver.quit()
    finally:
        mysql_database.disconnect(conn, cur)
    return
def select_posts(table):
    """Return the ``(url, pid)`` rows of *table* that are due for testing.

    Selects rows whose ``pubdate`` is more than two days before the current
    date and whose ``tested`` column equals ``"not yet"``, prints how many
    were found, and returns them as a list.

    Args:
        table: Name of the table to query.  It is interpolated directly into
            the SQL text (identifiers cannot be bound as parameters), so it
            must come from a trusted source.

    Returns:
        A list of the rows returned by the cursor.
    """
    query = (
        'SELECT url, pid from {} '
        'WHERE pubdate < CURDATE() - INTERVAL 2 DAY '
        'AND tested = "not yet" '
    ).format(table)

    conn, cur = mysql_database.connect()
    try:
        cur.execute(query)
        data = list(cur.fetchall())
        print('no. of posts to be tested: {}'.format(len(data)))
    finally:
        # Release the connection even when the query fails.
        mysql_database.disconnect(conn, cur)
    return data
def select(num, table):
    """Return the rows of *table* that are due for test round *num*.

    Round ``'1'`` is the first test (after 2 days) and round ``'2'`` is the
    second test (after 14 days).  The actual query is issued by
    ``selectData``, which receives the cursor, the day count, the table name
    and ``'tested'`` / ``'retested'`` respectively (presumably the name of
    the status column -- confirm against ``selectData``).

    Args:
        num: Test round, the string ``'1'`` or ``'2'``.
        table: Name of the table to query.

    Returns:
        A list of the rows fetched from the cursor after ``selectData`` ran.

    Raises:
        SystemExit: If *num* is not a supported round.  This is checked
            before a database connection is opened so nothing is leaked.
    """
    # round -> (days to wait before testing, value passed on to selectData)
    rounds = {
        '1': ('2', 'tested'),      ## 1st test after 2 days
        '2': ('14', 'retested'),   ## 2nd test after 14 days
    }
    if num not in rounds:
        sys.exit('unable to select appropriate data to test')
    days, status = rounds[num]

    conn, cur = mysql_database.connect()
    try:
        selectData(cur, days, table, status)
        data = list(cur.fetchall())
        logging.info('\nno. of posts to be tested: {}'.format(len(data)))
    finally:
        # Release the connection even when the query fails.
        mysql_database.disconnect(conn, cur)
    return data
def _existing_urls(cur, table):
    """Return the set of values in the ``url`` column of *table*."""
    cur.execute('SELECT url from {}'.format(table))
    ## flatten the tuple of one-column rows into a set of strings
    return {ele for row in cur.fetchall() for ele in row}


def save_to_db(table, queue):
    """Drain *queue* and insert every post whose URL is not yet in *table*.

    Each queue item is indexed as a mapping of parallel sequences under the
    keys ``"content"``, ``"url"``, ``"uid"``, ``"pid"`` and ``"mid"``.  For
    every position in ``"content"`` the post is inserted (and committed)
    unless its URL is already stored; failed inserts are logged and do not
    stop the run.  Afterwards the saved/skipped totals and the table's row
    count are logged.

    Args:
        table: Name of the target table.  It is interpolated directly into
            the SQL text (identifiers cannot be bound as parameters), so it
            must come from a trusted source.
        queue: Object providing ``empty()`` and ``get()``.

    Returns:
        None.
    """
    insert_sql = (
        'INSERT INTO {} (content, url, uid, pid, mid, pubdate, tested, '
        'testdate, status) VALUES (%s, %s, %s, %s, %s, CURDATE(), DEFAULT, '
        'DEFAULT, DEFAULT)'
    ).format(table)

    saved = 0
    skipped = 0
    conn, cur = mysql_database.connect()
    try:
        while not queue.empty():
            # Refreshed for every queue item; a set keeps each membership
            # test O(1) instead of scanning a list.
            existing_urls = _existing_urls(cur, table)
            data = queue.get()
            for i, content in enumerate(data["content"]):
                url = data["url"][i]
                if url in existing_urls:
                    skipped += 1
                    continue
                try:
                    cur.execute(
                        insert_sql,
                        (content, url, data["uid"][i], data["pid"][i],
                         data["mid"][i]))
                    logging.info('saved pid: {}'.format(data["pid"][i]))
                    saved += 1
                    cur.connection.commit()
                except Exception as e:
                    logging.info(
                        '\nunable to insert pid {} into table. {}'.format(
                            data["pid"][i], e))
                else:
                    # Remember the URL so a duplicate later in the same
                    # queue item is skipped rather than inserted again.
                    existing_urls.add(url)

        logging.info('\nsaved: {}; skipped: {}'.format(saved, skipped))
        cur.execute('SELECT COUNT(*) FROM {}'.format(table))
        no_of_rows = str(cur.fetchone()[0])
        logging.info('\nno. of rows in database: {}'.format(no_of_rows))
    finally:
        # Release the connection even when a query or queue read fails.
        mysql_database.disconnect(conn, cur)
    return