def scrape_category(first_category_page, site_url="http://books.toscrape.com"):
    """Scrape every book reachable from a category's paginated listing.

    Starting at ``first_category_page``, collect the book URLs of each
    listing page, following the "next page" link until none is returned.
    Each collected book is then passed, together with ``site_url``, to
    ``scrape_a_book_and_hydrate_csv``. Progress is printed along the way.

    Args:
        first_category_page: URL of the first listing page of the category.
        site_url: Site root forwarded unchanged to the per-book scraper.
    """
    current_page = first_category_page
    books_per_page = []

    # The first page is always processed, even if its URL is falsy; the loop
    # only stops once the "next page" lookup yields a falsy value.
    while True:
        ctext("Category page book list", "yellow")
        page_books = get_books_url_from_category_url(current_page)
        pprint(page_books)

        ctext("Next category page", "yellow")
        current_page = get_category_next_page(current_page)
        print(current_page)

        books_per_page.append(page_books)
        if not current_page:
            break

    # Flatten the per-page lists into one list, keeping page order.
    books_to_scrape = [book for page in books_per_page for book in page]

    ctext("Full Book list from category to scrape :", "green")
    pprint(books_to_scrape)
    ctext(f"there are {len(books_to_scrape)} books to scrape", 'green')

    for book_url in books_to_scrape:
        scrape_a_book_and_hydrate_csv(book_url, site_url)
    ctext("Done")
def find_subdomains(script):
    """Find subdomains of the target domain in ``script`` and record new ones.

    ``script`` is converted with ``str()`` and searched for host names ending
    in ``"." + args.u``. A match may carry a leftover encoding prefix
    (URL-encoding, a backslash-x escape or a backslash-u escape); the prefix
    is stripped and the cleaned host name is appended to
    ``SUBDOMAINS_ENUMERATED`` if it is not already there.

    Afterwards, if the number of distinct recorded subdomains differs from
    the number of distinct entries in ``SITES_VISITED``, ``find_scripts`` is
    called for every recorded subdomain.

    Args:
        script: Any object whose ``str()`` form should be searched.
    """
    # Escape the target domain so its dots match literally, not "any char".
    pattern = r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(args.u)

    for subdomain in re.findall(pattern, str(script)):
        if "%" in subdomain:
            # URL-encoded prefix. Collapse double encoding (%25 -> %) first,
            # then drop the two hex digits that follow the last '%'.
            while "%25" in subdomain:
                subdomain = subdomain.replace("%25", "%")
            parsed_subdomain = subdomain.split("%")[-1][2:]
        elif "\\x" in subdomain:
            # \xNN escape: drop the two hex digits after it.
            ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        elif "\\u" in subdomain:
            # \uNNNN escape: drop the four hex digits after it.
            ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\u")[-1][4:]
        else:
            parsed_subdomain = subdomain

        if parsed_subdomain not in SUBDOMAINS_ENUMERATED:
            if args.v:
                ctext("[+] " + parsed_subdomain, "green")
            # Store the cleaned name: the membership test above uses it, and
            # the raw match (with its encoding prefix) is not a valid host.
            SUBDOMAINS_ENUMERATED.append(parsed_subdomain)

    # If our total subdomains discovered is not the same length as our sites
    # visited, scan the rest of our subdomains.
    if len(set(SUBDOMAINS_ENUMERATED)) != len(set(SITES_VISITED)):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)
def find_subdomains(script):
    """
    Once we have our list of javascript code, we must find all subdomains in
    the code. As such, we compare it to a regex and then sort for the various
    exceptions one might expect to find.

    Each match ending in ``"." + args.u`` has any leading encoding residue
    (URL-encoding, backslash-x or backslash-u escape) stripped, and the
    cleaned host name is appended to ``SUBDOMAINS_ENUMERATED`` when it is not
    already present. Finally, when the count of distinct recorded subdomains
    differs from the count of distinct ``SITES_VISITED`` entries,
    ``find_scripts`` is called on every recorded subdomain.

    Args:
        script: Object whose ``str()`` representation is searched.
    """
    # re.escape keeps the dots of the target domain from matching any char.
    subdomain_regex = re.findall(
        r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(args.u),
        str(script))

    for subdomain in subdomain_regex:
        # If the subdomain is preceded by URL encoding, we remove it.
        if "%" in subdomain:
            # Sort for double URL encoding.
            while "%25" in subdomain:
                subdomain = subdomain.replace("%25", "%")
            # Skip the two hex digits following the last '%'.
            parsed_subdomain = subdomain.split("%")[-1][2:]
        # If the subdomain is preceded by \x escape sequence, remove it.
        elif "\\x" in subdomain:
            ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        # If the subdomain is preceded by \u unicode sequence, remove it.
        elif "\\u" in subdomain:
            ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\u")[-1][4:]
        # Otherwise proceed as normal.
        else:
            parsed_subdomain = subdomain

        if parsed_subdomain in SUBDOMAINS_ENUMERATED:
            continue
        if args.v:
            ctext("[+] " + parsed_subdomain, "green")
        # Record the cleaned host name, i.e. the same value the duplicate
        # check uses; the raw match may still carry its encoding prefix.
        SUBDOMAINS_ENUMERATED.append(parsed_subdomain)

    # If our total subdomains discovered is not the same length as our sites
    # visited, scan the rest of our subdomains.
    if len(set(SUBDOMAINS_ENUMERATED)) != len(set(SITES_VISITED)):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)
def find_subdomains(script):
    """Collect subdomains of the target domain found in ``script``.

    The ``str()`` form of ``script`` is searched for host names ending in
    ``"." + args.u``. Encoding residue captured in front of a match
    (URL-encoding including double encoding, a backslash-x escape or a
    backslash-u escape) is removed, and each cleaned host name not yet in
    ``SUBDOMAINS_ENUMERATED`` is appended to it (and printed when ``args.v``
    is set).

    When the number of distinct recorded subdomains differs from the number
    of distinct ``SITES_VISITED`` entries, ``find_scripts`` is called for
    every recorded subdomain.

    Args:
        script: Object whose ``str()`` representation is searched.
    """
    # Escape the domain so "." in it is matched literally.
    pattern = r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(args.u)

    for subdomain in re.findall(pattern, str(script)):
        if "%" in subdomain:
            # URL-encoded prefix: undo double encoding (%25 -> %) so that
            # exactly two hex digits remain to be skipped after the '%'.
            while "%25" in subdomain:
                subdomain = subdomain.replace("%25", "%")
            parsed_subdomain = subdomain.split("%")[-1][2:]
        elif "\\x" in subdomain:
            # \xNN escape sequence: skip its two hex digits.
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        elif "\\u" in subdomain:
            # \uNNNN escape sequence: skip its four hex digits.
            parsed_subdomain = subdomain.split("\\u")[-1][4:]
        else:
            # Otherwise proceed as normal.
            parsed_subdomain = subdomain

        # Single "record if new" step shared by every branch above.
        if parsed_subdomain not in SUBDOMAINS_ENUMERATED:
            if args.v:
                ctext(parsed_subdomain, "green", "black")
            SUBDOMAINS_ENUMERATED.append(parsed_subdomain)

    # If our total subdomains discovered is not the same length as our sites
    # visited, scan the rest of our subdomains.
    if len(set(SUBDOMAINS_ENUMERATED)) != len(set(SITES_VISITED)):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)
def ascii_banner():
    """Print the SUBSCRAPER ASCII-art banner in red, then the results heading.

    Every art line is emitted with ``ctext(line, "red")``; the final
    "Subdomains Found:" heading is emitted with ``ctext``'s default colour.
    """
    # Backslashes that belong to the artwork are written as "\\" so the
    # literals contain no invalid escape sequences (a SyntaxWarning on
    # Python 3.12+). The printed text is unchanged.
    banner_lines = (
        " `. ___",
        " __,' __`. _..----....____",
        " __...--.'``;. ,. ;``--..__ .' ,-._ _.-'",
        " _..-''-------' `' `' `' O ``-''._ (,;') _,'",
        ",'________________ \\`-._`-','",
        " `._ ```````````------...___ '-.._'-:",
        " ```--.._ ,. ````--...__\\-.",
        " `.--. `-` ____ | |`",
        " `. `. ,'`````. ; ;`",
        # NOTE(review): this line was previously spelled with \' which Python
        # reads as a plain apostrophe, so no backslash is printed here. Kept
        # as-is to preserve output; confirm whether the art wanted one.
        " `._`. __________ `. '__/`",
        " `-:._____/______/___/____`. \\ `",
        " SUBSCRAPER | `._ `. \\",
        " SUBSCRAPER `._________`-. `. `.___",
        " SUBSCRAPER v1.0.0 `------'`",
    )
    for line in banner_lines:
        ctext(line, "red")
    ctext("\nSubdomains Found:\n")
def WhichLineSide(x, y, x0, y0, x1, y1):
    """Return ``sign`` of the orientation determinant of three points.

    The determinant is that of the rows ``(x, y, 1)``, ``(x0, y0, 1)`` and
    ``(x1, y1, 1)``; it is zero when the point ``(x, y)`` is collinear with
    ``(x0, y0)`` and ``(x1, y1)``. ``sign`` is defined outside this block;
    presumably it maps the value to -1/0/1 -- confirm against its definition.
    """
    # Expanded 3x3 determinant (see docstring).
    det = x * y0 + y * x1 + x0 * y1 - x1 * y0 - y1 * x - x0 * y
    return sign(det)


# Color names accepted by Figure.__init__.
colors = [
    'black', 'grey', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan',
    'white'
]

if __name__ == '__main__':
    print('This is a module!')

# NOTE(review): this call is laid out at module level (runs on import), which
# is what the trailing comment describes -- confirm against the original
# indentation.
ctext(
    ''
)  # For some reason colors in windows terminal don't work until I've called ctext() at least once, so here it is, unfortunately


class Figure:
    """Base figure that stores a validated color name."""

    def __init__(self, color='white'):
        """Store ``color``; raise ValueError if it is not listed in ``colors``."""
        if color not in colors:
            raise ValueError(
                '\'color\' must be one of these values: \'white\', \'red\', \'green\', \'blue\', \'yellow\', \'magenta\', \'cyan\', \'grey\', \'black\'.'
            )
        self.color = color


class Rectangle(Figure):
    """Figure subclass constructed from x, y, w, h plus color and fill."""

    def __init__(self, x, y, w, h, color='white', fill=False):
        # Only the color is handled in the code visible here; x, y, w, h and
        # fill are not used in this portion (the class may continue beyond
        # what is shown -- TODO confirm).
        Figure.__init__(self, color)
def errout(self):
    """Print this object's ``str()`` form via ``ctext`` in red on black."""
    message = str(self)
    ctext(message, text="red", bg="black")