Code Example #1
 def test_find_by_id(self):
     ''' Test selection of specific data from pages table by id '''
     DB.setup()
     DB.seed()
     value = self.exec.find_by_id(1)
     self.assertIsNotNone(value)
     self.assertEqual(type(value), tuple)
Code Example #2
 def test_get_url(self):
     ''' Test selection of specific url by id'''
     DB.setup()
     DB.seed()
     value = self.exec.get_url(1)
     self.assertIsNotNone(value)
     self.assertEqual(type(value), tuple)
Code Example #3
def scrape(id):
    '''Fetches the page record with the provided page id,
    raises an exception if no page with that id is found,
    updates the page's is_scraping attribute to true,
    fetches the HTML content at the page url using requests,
    parses the fetched HTML to extract hyperlinks (maximum 10),
    deletes any links previously saved for the page,
    saves the newly extracted links to the links table for the page,
    and updates the page's is_scraping attribute back to false.
    '''
    try:
        the_url = DB.pages().fetch(id)
        if len(the_url) == 0:
            raise Exception(f'No page found with id {id}')
        the_url = the_url[0]
        address = the_url[0]
        DB().pages().update(id, 'True')
        web_request = requests.get(address)
        soup = BeautifulSoup(web_request.text, features='html.parser')
        list_of_links = []
        for link in soup.find_all('a', href=True):
            links = link['href']
            if re.search("^https", links):
                list_of_links.append(links)
        linksy = list_of_links[:10]
        DB().links().delete(id)
        for url in linksy:
            DB().links().insert(url, id)
        DB().pages().update(id, 'False')
        return '===============Successfully scraped================'
    except Exception as e:
        print(e)
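
The scrape function above relies on the small DB facade used throughout these examples. As a rough sketch only, it could be driven as below; the DB.setup()/DB.seed() calls are borrowed from the test snippets in Code Examples #1 and #2, and the page id 1 is an assumption rather than something taken from this example's source.

# Hypothetical driver for Code Example #3; setup/seed and the id are assumptions.
if __name__ == '__main__':
    DB.setup()
    DB.seed()
    print(scrape(1))  # prints the success banner when the seeded url is reachable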
Code Example #4
    def __init__(self,
                 bot_token,
                 admin_id,
                 engine_uri,
                 oc_host,
                 mtproto_proxy,
                 base_dir,
                 log_level='INFO'):
        self.updater = Updater(bot_token, use_context=True)
        self.dispatcher = self.updater.dispatcher
        # maps user_id -> callback_function for pending input
        self.input_dispatcher = {}

        self.db = DB(engine_uri)

        self.admin_id = admin_id
        self.oc_host = oc_host
        self.mtproto_proxy = mtproto_proxy
        self.base_dir = base_dir

        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level={
                'INFO': logging.INFO,
                'DEBUG': logging.DEBUG,
                'ERROR': logging.ERROR,
            }[log_level])
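
The excerpt above builds the Updater but stops before the bot is started. Under the python-telegram-bot 12/13 API implied by use_context=True, a polling bot is normally started with start_polling() and idle(); the run method below is a hypothetical sketch under that assumption, not part of the project excerpt.

    def run(self):
        # Hypothetical start-up helper: start_polling()/idle() are the standard
        # python-telegram-bot v12/13 calls for a long-running polling bot.
        self.updater.start_polling()
        self.updater.idle()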
Code Example #5
File: conftest.py  Project: dakotalillie/marathon-api
def app_fixture():
    app = create_app(db_name="marathon_test")
    app.config["TESTING"] = True
    with app.app_context():
        DB.create_all()
        yield app
        DB.session.close()
        DB.drop_all()
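
In the source project this function is presumably registered as a pytest fixture (the decorator is not shown in the excerpt). A minimal sketch of a test that consumes it, assuming it is exposed under the fixture name app:

# Hypothetical companion test; assumes app_fixture is registered via something
# like @pytest.fixture(name="app") in conftest.py.
def test_app_runs_in_testing_mode(app):
    assert app.config["TESTING"] is True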
Code Example #6
File: core.py  Project: raririn/SlowDB
 def _dump_db(self, file_path: str):
     new_db = DB(self.currentDB)
     new_db.tables = self.tables
     self.db[self.currentDB] = new_db
     with open(file_path, 'wb') as f:
         pickle.dump(self.db[self.currentDB], f)
     return 0
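
Only the dump side is shown in the excerpt. A matching load helper might look like the sketch below, assuming the same pickle format; the _load_db name and the name parameter are hypothetical.

 def _load_db(self, file_path: str, name: str):
     # Hypothetical counterpart to _dump_db: restores a pickled DB object
     # and registers it under the given name.
     with open(file_path, 'rb') as f:
         self.db[name] = pickle.load(f)
     return 0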
Code Example #7
 def test_select_by_id(self):
     ''' Test selection of specific data from links table by id '''
     DB.setup()
     DB.seed()
     self.exec.insert(1, 'https://www.google.com/')
     value = self.exec.select_by_id(1)
     self.assertIsNotNone(value)
     self.assertEqual(type(value), tuple)
Code Example #8
def web_scraper(page_id):
    """This function accepts the id,checks if it is within the list of ids in the database, and
    scrapes only 10 links on that particular link page"""
    all_ids = Pages(DB.connect()).select_id()
    new_all_id = [pid[0] for pid in all_ids]

    if page_id not in new_all_id:
        raise TypeError('Id does not exist.')

    else:
        url = Pages(DB.connect()).select_url(page_id)
        DB.pages().update(True, page_id)
        value = requests.get(url)
        soup = BeautifulSoup(value.text, 'html.parser')

        list_urls = []
        for link in soup.find_all('a', href=True):
            if link['href'].startswith('https'):
                list_urls.append(link['href'])

        new_list_urls = list_urls[:10]
        DB.links().delete_by_page_id(page_id)

        for item in new_list_urls:
            Links(DB.connect()).insert(page_id, item)

        DB.pages().update(False, page_id)
Code Example #9
def scraping_function(id):
    """
  This function implements the web scraper that inserts into the liks table.
  :param
  id(int): The id at which the url to be scraped is retrieved.
  :return:
  None: Returns None
  :raises:
  TypeError: Raises a TypeError
  """
    try:
        # retrieves the url from the pages table
        url = DB.pages().fetch(id)
        DB.pages().update(True, id)

        link_list = []
        r = requests.get(url[0])
        # scrapes the url for hyperlinks
        soup = BeautifulSoup(r.text, features='html.parser')
        for link in soup.find_all('a', href=True):
            if 'https' in link['href']:
                link_list.append(link['href'])
        links = link_list[:10]
        DB.links().delete(id)
        for i in links:
            DB.links().insert(id, i)
        DB.pages().update(False, id)
        return None
    except TypeError:
        raise TypeError('Id not found in Pages Table')
Code Example #10
def spider_scrap(page_id):
    '''Receives a page_id and inserts the scraped links into the links table.'''

    page_ids = [i[0] for i in DB().pages().select()]
    if page_id in page_ids:
        url = DB().pages().fetch_url(page_id)
    else:
        raise ValueError('page_id not valid')

    #update is_scraping to true
    DB().pages().update_id_true(page_id)

    #fetch the html content at the page url
    page = requests.get(url[0])

    # parse the html content to extract at most 10 hyperlinks
    soup = BeautifulSoup(page.text, features='html.parser')
    links_list = []
    for link in soup.find_all('a', href=True):
        links = link['href']
        if re.search("^https", links):
            links_list.append(links)
    link_url = links_list[:10]

    DB.links().delete(page_id)

    # save the newly extracted links to the database for the page
    for url in link_url:
        DB.links().insert(page_id, url)

    DB().pages().update_id_false(page_id)


# print(spider_scrap(1))
Code Example #11
 def setUp(s):
     if os.path.isfile(dbPath):
         os.remove(dbPath)
     if hasattr(s, 'db'):
         s.db.dropAll()
     s.db = DB({'db': dbPath})
     s.db.createDb()
Code Example #12
class TestDb(unittest.TestCase):
    '''Tests the DB class defined in __init__.py.'''
    def setUp(self):
        '''function that sets up for testing '''
        self.db = DB()

    def test_connect(self):
        '''function that tests the connect function'''
        connection_object = self.db.connect()
        self.assertIsNotNone(connection_object)

    def test_new_connect(self):
        '''function that tests the new_connect function'''
        connection_object = self.db.new_connect()
        self.assertIsNotNone(connection_object)

    def test_setup(self):
        '''function that tests the setup function'''
        self.assertEqual(self.db.setup(), None)
        cursor = self.db.new_connect().cursor()
        query = cursor.execute('SELECT url FROM pages WHERE id=1 ')
        self.assertEqual(query, None)

    def test_seed(self):
        '''function that tests the seed function'''
        self.db.setup()
        seed = self.db.seed()
        self.assertIsNone(seed)

    def tearDown(self):
        self.db = None
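
To run this test case directly, the usual unittest entry point can be appended to the module (a standard idiom, not shown in the excerpt):

if __name__ == '__main__':
    unittest.main()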
Code Example #13
File: tests.py  Project: abmyii/downloader_scraper
def test_column_reference():
    k = 'column_which_has_a_really_long_name_longer_than_sixty_four_characters'

    db = DB('sqlite:///:memory:', main_tbl_name="test")
    db.add_record({'id': 1, k: 0})
    db.commit()

    assert k in list(db.retrieve_records())[0].keys()
Code Example #14
 def setUp(cls):
     if os.path.isfile(dbPath):
         os.remove(dbPath)
     #game gets an instance of db in the constructor
     cls.db = DB({'db': dbPath})
     cls.db.createDb()
     cls.db.populateInfo('Gino')
     cls.testId = cls.db.addTest('Gino',
                                 'function ciao() { return "Ciao" }')
     cls.db.updateUserInfo('Gino', {"selectedTest": cls.testId})
Code Example #15
File: main.py  Project: zerkh/nips-scraper
def scrape(start_index):
    db = DB()
    nips = NipsETL(db)
    google = GoogleETL(db)
    arxiv = ArxivETL(db)

    titles = db.all('nips_papers')
    print "found %s nips_papers" % len(titles)
    if len(titles) < NUM_NIPS_17_PAPERS:
        print "fetching..."
        response = nips.extract()
        titles = nips.transform(response)
        nips.load(titles)

    all_nips_papers_missing_abstracts = db.all_nips_papers_missing_abstracts()
    print "found %i nips papers missing abstracts" % len(
        all_nips_papers_missing_abstracts)

    for record in all_nips_papers_missing_abstracts:
        print "fetching #%d: %s" % (record['id'], record['title'])
        try:
            google_response = google.extract(record["title"])
        except RateLimitError:
            break
        search_result = google.transform(record['id'], google_response)
        google.load(search_result)

        if search_result["abstract_url"]:
            print "found search result!"
            arxiv_response = arxiv.extract(search_result["abstract_url"])
            abstract = arxiv.transform(arxiv_response)
            arxiv.load(record["id"], abstract)

    db.to_md("abstracts.md")
Code Example #16
File: core.py  Project: raririn/SlowDB
    def execute_create_db(self, d):
        '''
        CREATE DATABASE testdb;

        d = {
             'name': 'testdb',
        }
        '''
        if d['name'] not in self.db:
            self.db[d['name']] = DB(d['name'])
            return 0
        else:
            raise Exception(f"Database {d['name']} already exists")
Code Example #17
File: utils.py  Project: 01CodeLT/twitter-bot
    def cache(key, value = False, expiry = False):
        try:
            #Open data or write
            if value == False:
                #Get db cache
                cache = DB.selectOne("SELECT * FROM cache WHERE key = ?;", (key,))

                # Treat the entry as expired when it has no expiry or the expiry is past
                if (not cache['expiry'] or
                        datetime.strptime(cache['expiry'], "%Y-%m-%d %H:%M:%S.%f") < datetime.now()):
                    return None

                return cache['value']
            else:
                DB.execute("INSERT OR REPLACE INTO cache(key, value, expiry) VALUES(?, ?, ?);", (
                        key, value, str(datetime.now() + timedelta(minutes=expiry)),
                    )
                )
        except Exception as e:
            print(e)
            return None
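
A rough usage sketch for the cache helper above: the Utils class name and the key are assumptions, and only the (key, value, expiry) signature comes from the excerpt, with expiry in minutes as implied by timedelta(minutes=expiry).

# Hypothetical usage; assumes cache() is a static method on a Utils class.
Utils.cache('latest_mentions', value='42', expiry=15)   # store for 15 minutes
mentions = Utils.cache('latest_mentions')                # value, or None once expired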
Code Example #18
File: spider.py  Project: ooakhu/web_scraper
def scrape(id):
    DB.pages().update('True', id)
    url = DB().pages().fetch(id)
    page = requests.get(url[0])
    soup = BeautifulSoup(page.text, features='html.parser')
    a_soup = soup.find_all('a', href=True)
    ext_links = [
        link.get("href") for link in a_soup if "http" in link.get("href")
    ]
    new_links = ext_links[:10]
    DB.links().delete(id)
    for i in new_links:
        DB.links().insert(i, id)
Code Example #19
 def __init__(self):
     
     pyglet.resource.path = ['./res']
     pyglet.resource.reindex()
     
     self.db = DB('localhost', 3306, 'fisica', 'qwe123iop', 'fisica')
     
     platform = pyglet.window.get_platform()
     display = platform.get_default_display()
     self.MW = display.get_screens()[0].width
     self.MH = display.get_screens()[0].height
     
     pyglet.clock.schedule(self.timer)
     self.activateSuck()
     self.window = Frame(self, 400, 400, False, visible=False)
     self.window.set_location(int((self.MW-self.window.width)/2), int((self.MH-self.window.height)/2))
     self.window.setScene(AppLauncher())
     self.window.set_visible(True)
Code Example #20
File: commands.py  Project: duke79/leavebot
    def run(self, cmd, args, user):
        slack_id = user
        db = DB(slack_id)

        print("\n")
        print(cmd)
        print("\n")
        print(args)
        if cmd == "help":
            ret = "\n".join((
                "Available commands:",
                "help: Prints the list of available commands",
                "login: User login, required before any other action",
                "apply: Apply for leave",
            ))
            return ret
        elif cmd == "login":
            print(args)
            user_id, user_pass = args.split(' ')
            # print('setting user details')
            db.greythr_user_id = user_id
            db.greythr_password = user_pass
            db.freeze()
            return "Login successful!", None
        elif cmd == "apply":
            print('here')
            start, end = args.split(' ')
            start = f'{start} Dec 2019'
            end = f'{end} Dec 2019'
            print(start)
            print(end)
            print(db.greythr_user_id)
            print(db.greythr_password)
            # userid, passwd = 'T12546', '@123456789'
            # userid, passwd = 'S12667', 'Dynamic@@123'

            # login T12546 @123456789
            # apply ‘18 Dec 2019’ ‘19 Dec 2019’
            # res = asyncio.run(apply(db.greythr_user_id, db.greythr_password, start, end))
            res = asyncio.run(apply('T12546', '@123456789', start, end))
            return res, [{
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "dsf"
                }
            }]

        else:
            ret = "Command not available!"
            return ret
Code Example #21
File: spider.py  Project: Resa-Obamwonyi/spider-pyapp
def spider(page_id):
    ''' Takes a page id, selects the url linked to page id and runs the scraper
      Scraper takes url and returns a list of urls scraped,
      a maximum of 10 links are inserted into the database '''

    if type(page_id) != int or page_id == 0:
        raise ValueError('Page Id is not valid')

    get_url = DB.pages().get_url(page_id)

    if get_url is None:
        raise ValueError('Page Id not found')

    else:
        url = get_url[0]
        all_links = []

        # set is_scraping to True where id == page_id
        DB.pages().update_by_id(True, page_id)

        res = requests.get(url)
        soup = BeautifulSoup(res.text, 'html.parser')

        for link in soup.find_all('a', href=True):

            if link['href'].startswith('http'):
                all_links.append(link['href'])

        # if links were already saved for this page id, delete them before re-inserting
        DB.links().delete_by_page_id(page_id)

        for link in all_links[:10]:
            # Insert each link into the links table
            Links(DB().connect()).insert(page_id, link)

        # set is_scraping to False where id == page_id
        DB.pages().update_by_id(False, page_id)
Code Example #22
 def test_setup(self):
     self.assertEqual(DB.setup(), None)
Code Example #23
 def test_seed(self):
     self.assertEqual(DB.seed(), None)
Code Example #24
 def test_links(self):
     self.assertIsNotNone(DB.links())
Code Example #25
 def test_pages(self):
     self.assertIsNotNone(DB.pages())
Code Example #26
 def setUp(self) -> None:
     # set up the Pages class
     self.exec = Pages(DB.connect())
Code Example #27
 def test_update(self):
     # Test the update method of Pages class in pages.py
     DB.seed()
     self.assertIsNone(self.exec.update(False, 1))
Code Example #28
 def test_find_url(self):
     # Test the find_url method of Pages class in pages.py
     DB.seed()
     self.assertIsNotNone(self.exec.find_url(1))
Code Example #29
 def test_find(self):
     # Test the find method of Pages class in pages.py
     DB.seed()
     result = self.exec.find(2)
     self.assertIsNotNone(result)
Code Example #30
 def setUp(self) -> None:
     self.DB = DB().serv_conn()