Example #1
import base64
import time

# Database, Parser, connect_to_tor, PATH and TORBUNDLEHEADER are defined
# elsewhere in the project.
def crawl(urloc: str) -> tuple[str, list]:
    db      = Database(PATH)
    parser  = Parser()
    session = connect_to_tor()

    # check whether this URL is already in the database
    try:
        urlindb = db.isCrawled(urloc)
        if len(urlindb) > 0:
            # URL already crawled, nothing new to return
            return urloc, []
    except Exception as e:
        print(e)


    try:
        try:
            # fetch the page through the Tor session
            r = session.get(urloc, headers=TORBUNDLEHEADER, timeout=20)
            r.raise_for_status()

        except Exception as err:
            # request failed: store the error message for this URL
            insert_data = {
                "protocol" : "Error",
                "url"      : urloc,
                "data"     : base64.b64encode(str(err).encode()),
                "lastvisit": int(time.time()),
            }

            try:
                db.insert(insert_data)
            except Exception as e:
                # URL is already in the DB: refresh the stored record instead
                if "UNIQUE constraint failed" in str(e):
                    try:
                        db.update(insert_data)
                    except Exception:
                        pass
            # no links to follow on a failed request
            return urloc, []
        else:
            # request succeeded: extract links and store the page
            urls        = parser.urlExtractor(urloc, r.text)
            protocol    = urloc.split("://")[0]
            insert_data = {
                "protocol" : protocol,
                "url"      : urloc,
                "data"     : base64.b64encode(r.content),
                "lastvisit": int(time.time()),
            }
            
            try:
                db.insert(insert_data)
            except Exception as e:
                # URL is already in the DB: refresh the stored record instead
                if "UNIQUE constraint failed" in str(e):
                    try:
                        db.update(insert_data)
                    except Exception:
                        pass

            retUrls = []
            for key, value in urls.items():
                # follow only http/https links
                if key in ("http", "https"):
                    for url in value:
                        # keep only .onion hosts
                        tld = parser.tldExtractor(url)
                        if tld == "onion":
                            retUrls.append(url)
            return urloc, retUrls

    except Exception:
        # any unexpected failure: return the URL with no new links
        return urloc, []
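
For context, here is a minimal sketch of how crawl() might be driven. The run_crawler() name, the seed list, the visited set, and the breadth-first frontier are illustrative assumptions and not part of the original code.

from collections import deque

def run_crawler(seed_urls):
    # hypothetical driver loop: feeds crawl() from a breadth-first frontier
    frontier = deque(seed_urls)   # URLs waiting to be crawled
    visited  = set()              # URLs already handed to crawl()

    while frontier:
        url = frontier.popleft()
        if url in visited:
            continue
        visited.add(url)

        # crawl() returns the URL it processed plus any new .onion links
        _, new_urls = crawl(url)
        for new_url in new_urls:
            if new_url not in visited:
                frontier.append(new_url)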