Ejemplo n.º 1
0
def populate(file):
    """Parse *file* and insert one URL row per non-comment line.

    Each line is stripped of whitespace and surrounding single quotes,
    hashed into a (domain, url) pair, and committed to the database one
    row at a time so one bad line cannot roll back earlier inserts.

    :param file: path to a text file of URLs; lines starting with '#'
                 are treated as comments and skipped.
    """
    pp.pprint("parsing file %s" % file)
    url_objects = []
    # Context manager guarantees the handle is closed even on error
    # (the original leaked the open file).
    with open(file, "r") as file_reader:
        for line in file_reader:
            if not line.startswith("#"):
                pp.pprint(line)
                # strip() already removes the trailing newline, so a
                # separate strip("\n") is redundant; also drop quotes.
                line = line.strip().strip("'")
                pp.pprint(line)
                url = URL(util.hash_domain(line), util.hash_url(line))
                # Only persist rows where both hashes were produced.
                if url.hash_domain != "" and url.hash_url != "":
                    url_objects.append(url)
                    db_session.add(url)
                db_session.commit()

    pp.pprint(url_objects)
    """
    TODO: this doesn't work with the large data set, perhaps there is a max without any errors?
    Will create a SQL script to insert manually into DB

    try:
        db_session.bulk_save_objects(url_objects)
        db_session.commit()
    except exc.IntegrityError:
        db_session.rollback()
    """
    results = URL.query.all()

    pp.pprint("Inserted %d rows" % len(results))
Ejemplo n.º 2
0
 def get(self, url):
     """Answer whether *url* is already known, keyed by hashed domain.

     :param url: raw URL string supplied by the client.
     :returns: ``{'exists': bool}`` JSON-serializable dict.
     :raises: aborts with HTTP 400 when the URL cannot be hashed.
     """
     # First, query for the hashed domain
     hash_raw_domain = util.hash_domain(url)
     # PEP 8: identity test with 'is None', never '== None'.
     if hash_raw_domain is None or hash_raw_domain == "":
         abort(
             400,
             description=
             "BadRequest: The URL supplied is either missing or incorrectly formatted"
         )
     domain_resultset = URL.query.filter(
         URL.hash_domain == hash_raw_domain).all()
     hash_raw_url = util.hash_url(url)
     # Second, search for the hashed url in the resultset
     exists = util.in_domain(domain_resultset, hash_raw_url)
     return {'exists': exists}
Ejemplo n.º 3
0
"""
# Shared pretty-printer used for all debug output in this example.
pp = pprint.PrettyPrinter(indent=4)


def populate():
    """Insert the five sample URL rows, then pretty-print the table contents."""
    # url1..url5 are bound at module level by the __main__ block below.
    for sample in (url1, url2, url3, url4, url5):
        db_session.add(sample)

    db_session.commit()
    rows = URL.query.all()
    pp.pprint(rows)


if __name__ == '__main__':

    # Sample URLs covering two distinct domains with multiple paths each.
    _samples = [
        'https://googo.com/gmail',
        'https://docs.googo.com/spreadsheets/u/01/',
        'https://docs.googo.com/document/u/0/1',
        'https://www.appa.com/mac/index.html',
        'https://www.appa.com/ipad/stuff.htm',
    ]
    # populate() reads these module-level names, so all five stay bound.
    url1, url2, url3, url4, url5 = [
        URL(util.hash_domain(s), util.hash_url(s)) for s in _samples
    ]

    populate()
Ejemplo n.º 4
0
    # Expected parsed domain for the fifth sample (www2 prefix, .co.au TLD).
    test_domain5 = "www2.stuffandthings.co.au"

    # URLs exercising full-URL parsing: subdomains, http vs https,
    # a www prefix, and an explicit :8080 port.
    test_parse_url1 = "https://google.com/gmail/"
    test_parse_url2 = "https://go.google.com/somethingelse"
    test_parse_url3 = "http://stuffandthings.co.au/somethingelse/anotherthing"
    test_parse_url4 = "http://www.stuffandthings.co.au/somethingelse/anotherthing"
    test_parse_url5 = "http://www.stuffandthings.co.au:8080/somethingelse/anotherthing/yaya.html"

    # Expected path-normalised forms of the URLs above — scheme separators
    # and the port appear stripped/collapsed; presumably these are the
    # outputs of the parser under test (TODO confirm against util).
    test_parse_path1 = "httpsgoogle.com/gmail"
    test_parse_path2 = "httpsgo.google.com/somethingelse"
    test_parse_path3 = "httpstuffandthings.co.au/somethingelse/anotherthing"
    test_parse_path4 = "httpwww.stuffandthings.co.au/somethingelse/anotherthing"
    test_parse_path5 = "http8080www.stuffandthings.co.au/somethingelse/anotherthing/yaya.html"

    # Build URL objects from the raw sample URLs (test_raw_url1..5 are
    # defined earlier, outside this excerpt).
    test_list = []
    url1 = URL(util.hash_domain(test_raw_url1), util.hash_url(test_raw_url1))
    url2 = URL(util.hash_domain(test_raw_url2), util.hash_url(test_raw_url2))
    url3 = URL(util.hash_domain(test_raw_url3), util.hash_url(test_raw_url3))
    url4 = URL(util.hash_domain(test_raw_url4), util.hash_url(test_raw_url4))
    url5 = URL(util.hash_domain(test_raw_url5), util.hash_url(test_raw_url5))

    test_list.append(url1)
    test_list.append(url2)
    test_list.append(url3)
    test_list.append(url4)
    test_list.append(url5)

    # Domain parsing tests
    test_parse_domain(test_raw_url1, test_domain1)
    test_parse_domain(test_raw_url2, test_domain2)
    test_parse_domain(test_raw_url3, test_domain3)
Ejemplo n.º 5
0
    # (Tail of test_compare — the def line is outside this excerpt.)
    db_session.add(url4)
    db_session.add(url5)
    db_session.commit()
    # url4 and url5 share a hashed domain, so filtering on url4's
    # hash_domain should match exactly those two rows.
    results = URL.query.filter(URL.hash_domain == url4.hash_domain)
    if (results.count() != 2):
        print("test_compare Failed. Wanted 2 results, got %d" %
              results.count())
        return
    print("test_compare passed")
    pp.pprint(results[0])
    pp.pprint(results[1])


if __name__ == '__main__':

    # Sample URLs: url4 and url5 share a domain, which test_compare
    # relies on when counting rows per hashed domain.
    _sample_urls = [
        'https://googo.com/gmail/1/2/3/4',
        'https://docs.googollll.com/spreadsheets/u/04/',
        'https://docs.googollll.com/document/u/1/1',
        'https://www.appa3.com/mac/index.html',
        'https://www.appa3.com/ipad/stuff.htm',
    ]
    # The test helpers read these module-level names directly,
    # so all five stay bound.
    url1, url2, url3, url4, url5 = [
        URL(util.hash_domain(s), util.hash_url(s)) for s in _sample_urls
    ]

    test_insert()
    test_compare()
    cleanup()