def SetURL():
    """Return the short link for the URL submitted in the request form.

    Reads the ``url`` form field, validates it against an http(s) URL
    pattern, and then either reuses the key already stored for that URL or
    generates and stores a new one.

    Returns:
        The rendered ``index.html`` page: plain when no URL was submitted,
        with ``error=True`` and ``urlVal`` when the URL is invalid, or with
        ``short`` set to the short link on success. When a database
        operation fails, a redirect to the ``routes.Error`` page instead.
    """
    # Get the url
    url = request.form['url']
    if not url:
        # A view must always return a response; a bare ``return`` (None)
        # is rejected by the framework, so show the empty form again.
        return render_template("index.html")

    # Define valid url regex
    url_regex = re.compile(
        r'^(?:http)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)

    # Validate the url
    if not url_regex.match(url):
        return render_template("index.html", error=True, urlVal=url)

    # See if there is already a key for the url. The connection is opened
    # inside the ``try`` so that a failure to connect is reported through
    # the same error page as a failed query.
    try:
        with engine.connect() as con:
            sql = text("SELECT * FROM url WHERE url = :url")
            # Bind parameters are passed as a dict: the keyword form
            # (``execute(sql, url=url)``) was removed in SQLAlchemy 2.0.
            res = con.execute(sql, {"url": url}).fetchall()
    except SQLAlchemyError as e:
        return redirect(
            url_for('routes.Error',
                    title='Error: Unhandled error',
                    msg=type(e)))
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that SystemExit
        # and KeyboardInterrupt are not swallowed.
        return redirect(
            url_for('routes.Error', title='Error: Unhandled error'))

    # If there is a key display that link
    if res:
        return render_template(
            "index.html", short=f'{request.url_root}{res[0].key}')

    # Generate a new key
    key = URL.GenerateKey()
    try:
        # Insert the KVP into the database
        kvp = URL(key, url)
        db.session.add(kvp)
        db.session.commit()
    except Exception:
        # Discard the failed transaction so the session is usable again.
        db.session.rollback()
        return redirect(
            url_for('routes.Error', title='Error: Unhandled error'))

    # Display the new link from the key
    return render_template("index.html", short=f'{request.url_root}{key}')
def testPopNextURLAndMarkAsVisitedHandlesCount(self):
    """Check the order in which PopNextURLAndMarkAsVisited returns URLs.

    Two URLs are stored with different ``links_to`` values; the test
    expects the one with the larger value (google, 1000) on the first pop
    and the one with the smaller value (microsoft, 500) on the second.
    """
    # Populate the test database, keeping the original insertion order.
    session = self.database_handler.CreateSession()
    fixtures = (
        ('http://www.microsoft.com/', 500),
        ('http://www.google.com/', 1000),
    )
    for address, link_count in fixtures:
        record = URL(address, 1)
        record.links_to = link_count
        session.add(record)
    session.commit()

    worker = CrawlerThread(self.database_handler, None, self.url_lock)

    # First pop.
    first_popped = worker.PopNextURLAndMarkAsVisited()
    self.assertEqual('http://www.google.com/', first_popped)

    # Second pop.
    second_popped = worker.PopNextURLAndMarkAsVisited()
    self.assertEqual('http://www.microsoft.com/', second_popped)
def testHandleHtmlResourceIncrementsLinksTo(self):
    """Check that HandleHtmlResource bumps ``links_to`` for a linked URL.

    A URL row is stored with ``links_to`` set to 1000, then an HTML
    resource containing one anchor to that same URL is handled. The test
    expects exactly one row for the URL afterwards, with ``links_to``
    equal to 1001.
    """
    target = 'http://www.google.com/'

    # Populate the test database.
    session = self.database_handler.CreateSession()
    stored = URL(target, 1)
    stored.links_to = 1000
    session.add(stored)
    session.commit()

    # Build the fake HTML resource; the handler is given a file-like
    # object carrying a ``url`` attribute.
    markup = textwrap.dedent(""" <a href='http://www.google.com/'>Google</a> """)
    resource = StringIO.StringIO(markup)
    resource.url = 'http://www.test.com'

    # Handle the HTML resource.
    worker = CrawlerThread(self.database_handler, None, self.url_lock)
    worker.HandleHtmlResource(resource)

    # Inspect the stored row through the session opened above.
    matches = session.query(URL).filter(URL.url == target)
    self.assertEqual(1, matches.count())
    refreshed = matches.first()
    self.assertEqual(1001, refreshed.links_to)