def scrape(id):
    """Scrape up to 10 https hyperlinks from the page with the given id.

    Fetches the page record, marks it as scraping, downloads the page
    HTML, extracts at most 10 links whose href starts with "https",
    replaces any previously saved links for the page, then clears the
    scraping flag.

    Returns a success banner string, or None if any step failed (the
    exception is printed, preserving the original best-effort contract).
    """
    try:
        page_rows = DB.pages().fetch(id)
        if not page_rows:
            # Meaningful error instead of the original bare `raise Exception`;
            # still lands in the except-block below, so callers see no change.
            raise ValueError('page with id %r not found' % (id,))
        address = page_rows[0][0]
        DB().pages().update(id, 'True')  # mark is_scraping = true
        response = requests.get(address)
        soup = BeautifulSoup(response.text, features='html.parser')
        # Keep only absolute https links, capped at 10.
        links = [a['href'] for a in soup.find_all('a', href=True)
                 if a['href'].startswith('https')]
        links = links[:10]
        DB().links().delete(id)  # drop links saved by a previous run
        for url in links:
            DB().links().insert(url, id)
        DB().pages().update(id, 'False')  # mark is_scraping = false
        return '===============Successfully scraped================'
    except Exception as e:
        # Preserve original behaviour: log and swallow, returning None.
        print(e)
def spider_scrap(page_id):
    """Scrape the page identified by page_id and store up to 10 https links.

    Raises:
        ValueError: if page_id is not present in the pages table.
    """
    # Validate the id against the known pages (set lookup instead of the
    # original O(n) list scan).
    valid_ids = {row[0] for row in DB().pages().select()}
    if page_id not in valid_ids:
        raise ValueError('page_id not valid')
    url = DB().pages().fetch_url(page_id)
    # Flag the page as being scraped while we work on it.
    DB().pages().update_id_true(page_id)
    # Fetch the HTML content at the page url.
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    # Keep only absolute https links; at most 10 are stored.
    links_list = [a['href'] for a in soup.find_all('a', href=True)
                  if a['href'].startswith('https')]
    # Consistency fix: use an instance (DB()) like the rest of this
    # function, instead of the original unbound DB.links() calls.
    DB().links().delete(page_id)  # remove links from any previous scrape
    for url in links_list[:10]:
        DB().links().insert(page_id, url)
    # Scrape finished: clear the in-progress flag.
    DB().pages().update_id_false(page_id)
def setUp(s):
    """Recreate a fresh database file before each test."""
    # Start from a clean slate: drop any db file left by a prior run.
    if os.path.isfile(dbPath):
        os.remove(dbPath)
    # If a previous test attached a DB instance, clear its tables first.
    if hasattr(s, 'db'):
        s.db.dropAll()
    database = DB({'db': dbPath})
    database.createDb()
    s.db = database
def __init__(self, bot_token, admin_id, engine_uri, oc_host, mtproto_proxy, base_dir, log_level='INFO'):
    """Wire up the Telegram updater, database handle and logging."""
    self.updater = Updater(bot_token, use_context=True)
    self.dispatcher = self.updater.dispatcher
    # Maps user_id -> callback awaiting that user's next input.
    self.input_dispatcher = {}
    self.db = DB(engine_uri)
    self.admin_id = admin_id
    self.oc_host = oc_host
    self.mtproto_proxy = mtproto_proxy
    self.base_dir = base_dir
    # Translate the textual level into the logging constant; an unknown
    # level raises KeyError, exactly like the original inline lookup.
    level_by_name = {
        'INFO': logging.INFO,
        'DEBUG': logging.DEBUG,
        'ERROR': logging.ERROR,
    }
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=level_by_name[log_level])
def scrape(start_index):
    # NOTE(review): Python 2 code (print statements). `start_index` is
    # accepted but never used -- TODO confirm whether it was meant to
    # resume a partially completed run.
    db = DB()
    nips = NipsETL(db)
    google = GoogleETL(db)
    arxiv = ArxivETL(db)
    titles = db.all('nips_papers')
    print "found %s nips_papers" % len(titles)
    # Seed the nips_papers table if it does not yet hold all papers.
    if len(titles) < NUM_NIPS_17_PAPERS:
        print "fetching..."
        response = nips.extract()
        titles = nips.transform(response)
        nips.load(titles)
    all_nips_papers_missing_abstracts = db.all_nips_papers_missing_abstracts()
    print "found %i nips papers missing abstracts" % len(
        all_nips_papers_missing_abstracts)
    # For each paper without an abstract: search Google, and if a result
    # points at an abstract page, pull the abstract from arXiv.
    for record in all_nips_papers_missing_abstracts:
        print "fetching #%d: %s" % (record['id'], record['title'])
        try:
            google_response = google.extract(record["title"])
        except RateLimitError:
            # Google started throttling us; stop and keep what we have.
            break
        search_result = google.transform(record['id'], google_response)
        google.load(search_result)
        if search_result["abstract_url"]:
            print "found search result!"
            arxiv_response = arxiv.extract(search_result["abstract_url"])
            abstract = arxiv.transform(arxiv_response)
            arxiv.load(record["id"], abstract)
    db.to_md("abstracts.md")
def test_column_reference():
    """A column name longer than 64 characters must survive a round trip."""
    long_column = 'column_which_has_a_really_long_name_longer_than_sixty_four_characters'
    db = DB('sqlite:///:memory:', main_tbl_name="test")
    db.add_record({'id': 1, long_column: 0})
    db.commit()
    first_row = list(db.retrieve_records())[0]
    assert long_column in first_row.keys()
def _dump_db(self, file_path: str):
    """Persist the current database to file_path via pickle.

    Snapshots the in-memory tables into a fresh DB object, registers it
    under the current database name, and pickles it to disk.

    Returns:
        0 on success (original convention).
    """
    new_db = DB(self.currentDB)
    new_db.tables = self.tables
    self.db[self.currentDB] = new_db
    # Context manager guarantees the handle is closed even if pickling
    # raises; the original leaked the open file on error.
    with open(file_path, 'wb') as f:
        pickle.dump(self.db[self.currentDB], f)
    return 0
def setUp(cls):
    """Build a fresh database seeded with one user and one test entry."""
    # Remove any database file left behind by an earlier run.
    if os.path.isfile(dbPath):
        os.remove(dbPath)
    # The game receives this DB instance through its constructor.
    database = DB({'db': dbPath})
    database.createDb()
    database.populateInfo('Gino')
    test_id = database.addTest('Gino', 'function ciao() { return "Ciao" }')
    database.updateUserInfo('Gino', {"selectedTest": test_id})
    cls.db = database
    cls.testId = test_id
def execute_create_db(self, d):
    """Handle CREATE DATABASE.

    Example::

        CREATE DATABASE testdb;
        d = {'name': 'testdb'}

    Registers a new DB under ``d['name']`` and returns 0.

    Raises:
        Exception: if a database with that name already exists. (Type
            kept as Exception for existing callers; the original raised
            it with an empty, useless message.)
    """
    name = d['name']
    if name not in self.db:  # idiomatic `not in` instead of `not x in y`
        self.db[name] = DB(name)
        return 0
    raise Exception('database {!r} already exists'.format(name))
def scrape(id):
    """Scrape up to 10 absolute links from the page with the given id.

    Marks the page as scraping, downloads its HTML, extracts the first
    10 hrefs that are absolute URLs, and replaces the page's stored
    links.

    NOTE(review): unlike the sibling implementations in this project,
    this version never resets is_scraping to 'False' afterwards -- TODO
    confirm whether that is intentional.
    """
    DB.pages().update('True', id)  # flag is_scraping = true
    url = DB().pages().fetch(id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    # Bug fix: the original substring test (`"http" in href`) also
    # matched relative links that merely contained "http" somewhere;
    # startswith keeps only absolute URLs.
    ext_links = [a['href'] for a in a_soup if a['href'].startswith('http')]
    DB.links().delete(id)  # clear links from a previous scrape
    for link in ext_links[:10]:
        DB.links().insert(link, id)
def run(self, cmd, args, user):
    """Dispatch a chat command (help / login / apply) for a Slack user.

    NOTE(review): return shapes are inconsistent -- "help" and the
    fallback branch return a bare string while "login"/"apply" return a
    (message, blocks) tuple. Confirm what the caller expects.
    """
    slack_id = user
    # Per-user persistence keyed by the Slack id.
    db = DB(slack_id)
    print("\n")
    print(cmd)
    print("\n")
    print(args)
    if cmd == "help":
        ret = "\n".join((
            "Available commands:",
            "help: Prints the list of available commands",
            "login: User login, required before any other action",
            "apply: Apply for leave",
        ))
        return ret
    elif cmd == "login":
        print(args)
        # args is expected to be "<user_id> <password>" separated by a
        # single space.
        user_id, user_pass = args.split(' ')
        # print('setting user details')
        db.greythr_user_id = user_id
        db.greythr_password = user_pass
        db.freeze()
        return "Login successful!", None
    elif cmd == "apply":
        print('here')
        # args is expected to be "<start-day> <end-day>"; month and year
        # are hard-coded to Dec 2019.
        start, end = args.split(' ')
        start = f'{start} Dec 2019'
        end = f'{end} Dec 2019'
        print(start)
        print(end)
        print(db.greythr_user_id)
        print(db.greythr_password)
        # userid, passwd = 'T12546', '@123456789'
        # userid, passwd = 'S12667', 'Dynamic@@123'
        # login T12546 @123456789
        # apply ‘18 Dec 2019’ ‘19 Dec 2019’
        # res = asyncio.run(apply(db.greythr_user_id, db.greythr_password, start, end))
        # NOTE(review): the hard-coded credentials below ignore the
        # stored login -- a security risk and almost certainly a debug
        # leftover; confirm and restore the commented call above.
        res = asyncio.run(apply('T12546', '@123456789', start, end))
        return res, [{
            "type": "section",
            "text": {
                "type": "mrkdwn",
                "text": "dsf"
            }
        }]
    else:
        ret = "Command not available!"
        return ret
def __init__(self):
    """Initialise resources, the database, timers and the main window."""
    # Resource lookup root for images/sounds.
    pyglet.resource.path = ['./res']
    pyglet.resource.reindex()
    self.db = DB('localhost', 3306, 'fisica', 'qwe123iop', 'fisica')
    # Primary screen dimensions, used below to centre the window.
    screen = pyglet.window.get_platform().get_default_display().get_screens()[0]
    self.MW = screen.width
    self.MH = screen.height
    pyglet.clock.schedule(self.timer)
    self.activateSuck()
    self.window = Frame(self, 400, 400, False, visible=False)
    centre_x = int((self.MW - self.window.width) / 2)
    centre_y = int((self.MH - self.window.height) / 2)
    self.window.set_location(centre_x, centre_y)
    self.window.setScene(AppLauncher())
    self.window.set_visible(True)
def main():
    """Fetch PTT posts, upsert them into the DB, then render charts."""
    db = DB()
    db.create()
    posts = GetPttPost(2)
    for post in posts:
        title = post['title']
        if db.get(title).fetchall():
            # Already stored: refresh the mutable fields.
            row_id = db.get_id(title).fetchall()[0][0]
            db.update(row_id, post['url'], post['author'],
                      post['date'], post['push'])
        else:
            # New post: strip board tags/prefixes before NLP analysis.
            cleaned = title.replace('Re: ', '').replace('[新聞] ', '')
            cleaned = cleaned.replace('[爆卦] ', '').replace('[問卦] ', '')
            cleaned = cleaned.replace('[協尋]', '')
            analysis = analyze.nlp(cleaned)
            db.store(title, post['url'], post['author'],
                     post['date'], post['push'], analysis)
    # Two separate get_all() calls, as in the original (the result may be
    # a one-shot cursor).
    chartHandler.handle_data(db.get_all())
    chartHandler.handle_push(db.get_all())
def spider(page_id):
    """Scrape the url registered under page_id and store up to 10 links.

    Looks up the page's url, flags the page as scraping, collects the
    absolute (http/https) hrefs from the fetched document, replaces the
    page's previously stored links with at most 10 of them, then clears
    the scraping flag.

    Raises:
        ValueError: if page_id is not a non-zero int, or no url exists
            for it. (Bug fix: the original *returned* a ValueError
            instance in the not-found case instead of raising it.)
    """
    if not isinstance(page_id, int) or page_id == 0:
        raise ValueError('Page Id is not valid')
    get_url = DB.pages().get_url(page_id)
    if get_url is None:
        # Fixed: was `return ValueError(...)`, which silently handed the
        # caller an exception object instead of signalling an error.
        raise ValueError('Page Id not found')
    url = get_url[0]
    # Mark the page as being scraped while we work.
    DB.pages().update_by_id(True, page_id)
    res = requests.get(url)
    soup = BeautifulSoup(res.text, 'html.parser')
    # Absolute links only (http/https prefixes).
    all_links = [a['href'] for a in soup.find_all('a', href=True)
                 if a['href'].startswith('http')]
    # Replace any links previously stored for this page.
    DB.links().delete_by_page_id(page_id)
    for link in all_links[:10]:
        Links(DB().connect()).insert(page_id, link)
    # Scrape complete: clear the flag.
    DB.pages().update_by_id(False, page_id)
def setUp(self) -> None:
    """Open the connection details used by each test."""
    connection = DB().connection_details()
    self.exec = connection
def test_persist(self):
    """Subscriptions must survive closing and reopening the database."""
    alice_feeds, bob_feeds, charlie_feeds = get_mock_feeds()
    self.subscribe_user_to_feeds('Alice', alice_feeds)
    # Reopen the DB to prove the data was persisted, not just cached.
    self.db.close()
    TestDB.db = DB()
    persisted = self.db.get_feeds_by_subscriber('Alice')
    self.assertItemsEqual(persisted, alice_feeds)
def setUp(self) -> None:
    """Create a fresh server connection for each test."""
    server_connection = DB().serv_conn()
    self.DB = server_connection
# Show examples of how you would use ALL your implementations here
from src.db import DB
from src.spider import spider_scrap
from celery import Celery
from decouple import config

# Bug fix: this instantiation was commented out in the original, so every
# `db.` call below raised NameError.
db = DB()
db.connect()
db.new_connect()
db.setup()
db.seed()

dd = DB.new_connect()

# Pages interface examples.
pages = DB.pages()
# pages.fetch_url(2)
print(pages.fetch_url(2))
print(pages.select())
print(pages.find(2))
# print(pages.update_id(1))

# Links interface examples.
links = DB.links()
print(links.insert(1, 'www.goggle.com'))
print(links.delete(1))
print(links.select(1))

# Celery wiring kept for reference:
# app = Celery('main', broker=config('CELERY_BROKER'), backend=config('CELERY_BACKEND'))
#
# @app.task
# def scrap_url():
#     return spider_scrap(1)

# spider_scrap(1)
def setUpClass(cls):
    """Create the shared DB handle once for the whole test class."""
    database = DB()
    cls.db = database
def __init__(self):
    """Connect to the modeling database, preferring localhost.

    Falls back to the 10.20.2.26 host when the local connection fails.
    """
    try:
        self.db = DB("mysql", "localhost", "root", "123456", "modeling")
    except Exception:
        # Bug fix: the original bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit; narrowed to Exception.
        self.db = DB("mysql", "10.20.2.26", "root", "123456", "modeling")
def test_update_id_false(self):
    """update_id_false should clear the is_scraping flag for page 1."""
    DB().setup()
    DB().seed()
    expected = (1, 'https://www.facebook.com', False)
    updated_row = self.pages.update_id_false(1)
    self.assertEqual(updated_row[:3], expected)
def test_pages(self):
    """The pages interface must be reachable from a DB instance."""
    pages_interface = DB().pages()
    self.assertIsNotNone(pages_interface)
def setUp(self):
    """Give each test its own DB instance."""
    database = DB()
    self.db = database
def test_links(self):
    """The links interface must be reachable from a DB instance."""
    links_interface = DB().links()
    self.assertIsNotNone(links_interface)
def setUp(self) -> None:
    """Open the server connection used by the tests."""
    connection = DB().server_conn()
    self.exec = connection
def test_setup(self):
    """setup() should create the database tables and return None."""
    outcome = DB().setup()
    self.assertIsNone(outcome)
def test_connect(self):
    """connect() should hand back a usable (non-None) connection."""
    connection = DB().connect()
    self.assertIsNotNone(connection)
def test_insert(self):
    """insert() should add a link row for page 2 and return None."""
    DB().setup()
    # Consistency fix: sibling tests call seed() on an instance
    # (DB().seed()); the original `DB.seed()` invoked it unbound.
    DB().seed()
    self.assertEqual(self.links.insert(2, 'https://www.wikipedia.com'), None)
def test_seed(self):
    """seed() should populate the freshly created tables and return None."""
    seeded = DB().seed()
    self.assertIsNone(seeded)
# Flask entry point: wires the app, the MySQL-backed DB and a USER
# session helper, and declares the HTTP routes.
import flask
from flask import Flask
from flask import url_for
from mysql.connector import IntegrityError
from src.user import USER
from src.position import Position
from src import connection_info
from src.db import DB

app = Flask(__name__)
app.secret_key = b'G\xd3\x95iW9\x90\x93M\xf0Aa/XUU'  # key used to sign session cookies
db = DB(host=connection_info.DB_HOST, db_name=connection_info.DB_NAME, table_name=connection_info.TABLE_NAME, user_name=connection_info.DB_USER_NAME, user_pwd=connection_info.DB_USER_PASS_WORD)
user = USER()


@app.route('/', methods=['POST', 'GET'])
def index():
    """Serve the landing page."""
    return flask.render_template('index.html')


@app.route('/connection', methods=['POST', 'GET'])
def connection():
    """Handle the connection form (city, minimum distance, device uuid)."""
    city = flask.request.form['ville']
    min_distance = flask.request.form['min_distance']
    uuid = flask.request.form["uuid"]
    # NOTE(review): this handler appears to continue beyond this excerpt.