def populate(file):
    """Parse *file* line by line and insert hashed URL rows into the DB.

    Each non-comment line is stripped of surrounding whitespace and single
    quotes, hashed into a (domain, url) pair via ``util``, and committed
    one row at a time.

    Args:
        file: path to a text file containing one URL per line; lines
            starting with ``#`` are treated as comments and skipped.
    """
    pp.pprint("parsing file %s" % file)
    url_objects = []
    # 'with' guarantees the handle is closed on every exit path
    # (the original opened the file and never closed it).
    with open(file, "r") as file_reader:
        for line in file_reader:
            # Guard clause: skip comment lines early instead of nesting.
            if line.startswith("#"):
                continue
            pp.pprint(line)
            # strip() already removes the trailing newline and other
            # whitespace; a second strip drops wrapping single quotes.
            line = line.strip().strip("'")
            pp.pprint(line)
            url = URL(util.hash_domain(line), util.hash_url(line))
            # Keep only lines the hash helpers could actually parse.
            if url.hash_domain != "" and url.hash_url != "":
                url_objects.append(url)
                db_session.add(url)
                # Commit per row on purpose: bulk_save_objects failed on
                # the large data set (see TODO below).
                db_session.commit()
    pp.pprint(url_objects)
    """
    TODO: this doesn't work with the large data set, perhaps there is a max
    without any errors? Will create a SQL script to insert manually into DB
    try:
        db_session.bulk_save_objects(url_objects)
        db_session.commit()
    except exc.IntegrityError:
        db_session.rollback()
    """
    results = URL.query.all()
    pp.pprint("Inserted %d rows" % len(results))
def get(self, url):
    """Look up *url* and report whether it is known.

    Returns:
        dict: ``{'exists': bool}`` — True when the hashed URL is found
        within the rows matching its hashed domain.

    Aborts with HTTP 400 when the URL cannot be hashed into a domain
    (missing or malformed input).
    """
    # First, query for the hashed domain.
    hash_raw_domain = util.hash_domain(url)
    # PEP 8: compare to None with 'is', never '==' (was '== None').
    if hash_raw_domain is None or hash_raw_domain == "":
        abort(
            400,
            description=
            "BadRequest: The URL supplied is either missing or incorrectly formatted"
        )
    domain_resultset = URL.query.filter(
        URL.hash_domain == hash_raw_domain).all()
    hash_raw_url = util.hash_url(url)
    # Second, search for the hashed url in the resultset.
    exists = util.in_domain(domain_resultset, hash_raw_url)
    return {'exists': exists}
""" pp = pprint.PrettyPrinter(indent=4) def populate(): db_session.add(url1) db_session.add(url2) db_session.add(url3) db_session.add(url4) db_session.add(url5) db_session.commit() results = URL.query.all() pp.pprint(results) if __name__ == '__main__': url1 = URL(util.hash_domain('https://googo.com/gmail'), util.hash_url('https://googo.com/gmail')) url2 = URL(util.hash_domain('https://docs.googo.com/spreadsheets/u/01/'), util.hash_url('https://docs.googo.com/spreadsheets/u/01/')) url3 = URL(util.hash_domain('https://docs.googo.com/document/u/0/1'), util.hash_url('https://docs.googo.com/document/u/0/1')) url4 = URL(util.hash_domain('https://www.appa.com/mac/index.html'), util.hash_url('https://www.appa.com/mac/index.html')) url5 = URL(util.hash_domain('https://www.appa.com/ipad/stuff.htm'), util.hash_url('https://www.appa.com/ipad/stuff.htm')) populate()
# Expected parse results and fixture URLs for the parsing tests.
# NOTE(review): test_raw_url1..5 and test_domain1..4 appear to be defined
# earlier in this file (outside this chunk) — confirm before moving code.
test_domain5 = "www2.stuffandthings.co.au"
# Expected URL-part results for each raw test URL.
test_parse_url1 = "https://google.com/gmail/"
test_parse_url2 = "https://go.google.com/somethingelse"
test_parse_url3 = "http://stuffandthings.co.au/somethingelse/anotherthing"
test_parse_url4 = "http://www.stuffandthings.co.au/somethingelse/anotherthing"
test_parse_url5 = "http://www.stuffandthings.co.au:8080/somethingelse/anotherthing/yaya.html"
# Expected flattened-path results (scheme/port fused with host, no '://').
test_parse_path1 = "httpsgoogle.com/gmail"
test_parse_path2 = "httpsgo.google.com/somethingelse"
test_parse_path3 = "httpstuffandthings.co.au/somethingelse/anotherthing"
test_parse_path4 = "httpwww.stuffandthings.co.au/somethingelse/anotherthing"
test_parse_path5 = "http8080www.stuffandthings.co.au/somethingelse/anotherthing/yaya.html"
# URL objects built from the raw fixtures, collected for list-based tests.
test_list = []
url1 = URL(util.hash_domain(test_raw_url1), util.hash_url(test_raw_url1))
url2 = URL(util.hash_domain(test_raw_url2), util.hash_url(test_raw_url2))
url3 = URL(util.hash_domain(test_raw_url3), util.hash_url(test_raw_url3))
url4 = URL(util.hash_domain(test_raw_url4), util.hash_url(test_raw_url4))
url5 = URL(util.hash_domain(test_raw_url5), util.hash_url(test_raw_url5))
test_list.append(url1)
test_list.append(url2)
test_list.append(url3)
test_list.append(url4)
test_list.append(url5)
# Domain parsing tests
test_parse_domain(test_raw_url1, test_domain1)
test_parse_domain(test_raw_url2, test_domain2)
test_parse_domain(test_raw_url3, test_domain3)
    # NOTE(review): this is the tail of a test function whose 'def' line is
    # above this chunk (the bare 'return' below requires a function scope).
    db_session.add(url4)
    db_session.add(url5)
    db_session.commit()
    # url4 and url5 share a domain, so filtering on url4's domain hash
    # should match exactly those two rows.
    results = URL.query.filter(URL.hash_domain == url4.hash_domain)
    if (results.count() != 2):
        print("test_compare Failed. Wanted 2 results, got %d" % results.count())
        return
    print("test_compare passed")
    pp.pprint(results[0])
    pp.pprint(results[1])


if __name__ == '__main__':
    # Build five fixtures: three distinct domains, with appa3.com repeated
    # twice so test_compare can assert a two-row domain match.
    url1 = URL(util.hash_domain('https://googo.com/gmail/1/2/3/4'),
               util.hash_url('https://googo.com/gmail/1/2/3/4'))
    url2 = URL(
        util.hash_domain('https://docs.googollll.com/spreadsheets/u/04/'),
        util.hash_url('https://docs.googollll.com/spreadsheets/u/04/'))
    url3 = URL(util.hash_domain('https://docs.googollll.com/document/u/1/1'),
               util.hash_url('https://docs.googollll.com/document/u/1/1'))
    url4 = URL(util.hash_domain('https://www.appa3.com/mac/index.html'),
               util.hash_url('https://www.appa3.com/mac/index.html'))
    url5 = URL(util.hash_domain('https://www.appa3.com/ipad/stuff.htm'),
               util.hash_url('https://www.appa3.com/ipad/stuff.htm'))
    # Run the suite, then remove the rows it inserted.
    test_insert()
    test_compare()
    cleanup()