def scrape_category(first_category_page, site_url="http://books.toscrape.com"):
    """Scrape every book listed in a category, following its pagination.

    Starting from ``first_category_page``, the book list of each category
    page is fetched with ``get_books_url_from_category_url`` and the next
    page is looked up with ``get_category_next_page`` until that lookup
    returns a falsy value. Every collected book is then handed to
    ``scrape_a_book_and_hydrate_csv`` together with ``site_url``.
    Progress is printed along the way.

    Args:
        first_category_page: First page of the category to walk.
        site_url: Passed unchanged to ``scrape_a_book_and_hydrate_csv``.
    """
    current_page = first_category_page
    books_per_page = []

    # The first page is always processed, even if it is falsy; the loop only
    # stops once the "next page" lookup comes back empty.
    has_more_pages = True
    while has_more_pages:
        ctext("Category page book list", "yellow")
        page_books = get_books_url_from_category_url(current_page)
        pprint(page_books)
        ctext("Next category page", "yellow")
        current_page = get_category_next_page(current_page)
        print(current_page)
        books_per_page.append(page_books)
        has_more_pages = bool(current_page)

    # Flatten the per-page lists into a single list, keeping page order.
    books_to_scrape = [
        book for page_books in books_per_page for book in page_books
    ]
    ctext("Full Book list from category to scrape :", "green")
    pprint(books_to_scrape)
    ctext(f"there are {len(books_to_scrape)} books to scrape", 'green')

    for book_url in books_to_scrape:
        scrape_a_book_and_hydrate_csv(book_url, site_url)

    ctext("Done")
Example #2
0
def find_subdomains(script):
    """Find subdomains of the target domain in script text and record new ones.

    Every regex match in ``str(script)`` is cleaned of a leading URL-encoded
    byte, backslash-x escape or backslash-u escape. The cleaned name is
    appended to ``SUBDOMAINS_ENUMERATED`` when it is not already present.
    Afterwards, if the number of distinct enumerated subdomains differs from
    the number of distinct entries in ``SITES_VISITED``, ``find_scripts`` is
    called for every enumerated subdomain.

    Args:
        script: Object whose ``str()`` form is searched for subdomains.
    """
    # re.escape keeps the dots of the target domain literal; unescaped, the
    # "." in e.g. "example.com" would match any character.
    # NOTE(review): "9-_" inside the character class is a range (it covers
    # ':' through '_', including the backslash), not three literals. It is
    # left unchanged here so the set of matched strings stays the same.
    subdomain_regex = re.findall(
        r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(args.u),
        str(script))
    for subdomain in subdomain_regex:
        # If the subdomain is preceded by URL encoding, remove it.
        if "%" in subdomain:
            # Collapse double URL encoding ("%25" is an encoded "%") first.
            while "%25" in subdomain:
                subdomain = subdomain.replace("%25", "%")
            # Drop the two hex digits that follow the last "%".
            parsed_subdomain = subdomain.split("%")[-1][2:]
        # If the subdomain is preceded by a \x escape sequence, remove it.
        elif "\\x" in subdomain:
            ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        # If the subdomain is preceded by a \u unicode sequence, remove it.
        elif "\\u" in subdomain:
            ctext("[+] " + subdomain, "red")
            # A \u escape carries four hex digits.
            parsed_subdomain = subdomain.split("\\u")[-1][4:]
        # Otherwise proceed as normal.
        else:
            parsed_subdomain = subdomain
        if parsed_subdomain not in SUBDOMAINS_ENUMERATED:
            if args.v:
                ctext("[+] " + parsed_subdomain, "green")
            # Store the cleaned name: the membership test above is made on
            # it, so storing the raw match would defeat de-duplication and
            # leave escape prefixes in the list passed to find_scripts.
            SUBDOMAINS_ENUMERATED.append(parsed_subdomain)

    # If our total subdomains discovered is not the same length as our sites
    # visited, scan the rest of our subdomains.
    if len(set(SUBDOMAINS_ENUMERATED)) != len(set(SITES_VISITED)):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)
Example #3
0
def find_subdomains(script):
    """Find subdomains of the target domain in script text and record new ones.

    Once we have our list of javascript code, we must find all subdomains in
    the code. Every regex match in ``str(script)`` is cleaned of a leading
    URL-encoded byte, backslash-x escape or backslash-u escape, and the
    cleaned name is appended to ``SUBDOMAINS_ENUMERATED`` when it is not
    already present. Afterwards, if the number of distinct enumerated
    subdomains differs from the number of distinct entries in
    ``SITES_VISITED``, ``find_scripts`` is called for every enumerated
    subdomain.

    Args:
        script: Object whose ``str()`` form is searched for subdomains.
    """
    # re.escape keeps the dots of the target domain literal; unescaped, the
    # "." in e.g. "example.com" would match any character.
    # NOTE(review): "9-_" inside the character class is a range (it covers
    # ':' through '_', including the backslash), not three literals. It is
    # left unchanged here so the set of matched strings stays the same.
    subdomain_regex = re.findall(
        r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(args.u),
        str(script))
    for subdomain in subdomain_regex:
        # If the subdomain is preceded by URL encoding, remove it.
        if "%" in subdomain:
            # Collapse double URL encoding ("%25" is an encoded "%") first.
            while "%25" in subdomain:
                subdomain = subdomain.replace("%25", "%")
            # Drop the two hex digits that follow the last "%".
            parsed_subdomain = subdomain.split("%")[-1][2:]
        # If the subdomain is preceded by a \x escape sequence, remove it.
        elif "\\x" in subdomain:
            ctext("[+] " + subdomain, "red")
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        # If the subdomain is preceded by a \u unicode sequence, remove it.
        elif "\\u" in subdomain:
            ctext("[+] " + subdomain, "red")
            # A \u escape carries four hex digits.
            parsed_subdomain = subdomain.split("\\u")[-1][4:]
        # Otherwise proceed as normal.
        else:
            parsed_subdomain = subdomain
        if parsed_subdomain not in SUBDOMAINS_ENUMERATED:
            if args.v:
                ctext("[+] " + parsed_subdomain, "green")
            # Store the cleaned name: the membership test above is made on
            # it, so storing the raw match would defeat de-duplication and
            # leave escape prefixes in the list passed to find_scripts.
            SUBDOMAINS_ENUMERATED.append(parsed_subdomain)

    # If our total subdomains discovered is not the same length as our sites
    # visited, scan the rest of our subdomains.
    if len(set(SUBDOMAINS_ENUMERATED)) != len(set(SITES_VISITED)):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)
Example #4
0
def find_subdomains(script):
    """Find subdomains of the target domain in script text and record new ones.

    Every regex match in ``str(script)`` is cleaned of a leading URL-encoded
    byte or backslash-x escape, and the cleaned name is appended to
    ``SUBDOMAINS_ENUMERATED`` when it is not already present. Afterwards, if
    the number of distinct enumerated subdomains differs from the number of
    distinct entries in ``SITES_VISITED``, ``find_scripts`` is called for
    every enumerated subdomain.

    Args:
        script: Object whose ``str()`` form is searched for subdomains.
    """
    # re.escape keeps the dots of the target domain literal; unescaped, the
    # "." in e.g. "example.com" would match any character.
    # NOTE(review): "9-_" inside the character class is a range (it covers
    # ':' through '_', including the backslash), not three literals. It is
    # left unchanged here so the set of matched strings stays the same.
    subdomain_regex = re.findall(
        r"[%\\]?[a-zA-Z0-9][a-zA-Z0-9-_.]*\." + re.escape(args.u),
        str(script))
    for subdomain in subdomain_regex:
        if "%" in subdomain:
            # If the subdomain is preceded by URL encoding, remove it
            # (the "%" and the two hex digits after it).
            parsed_subdomain = subdomain.split("%")[-1][2:]
        elif "\\x" in subdomain:
            # If the subdomain is preceded by a \x escape sequence, remove it.
            parsed_subdomain = subdomain.split("\\x")[-1][2:]
        else:
            # Otherwise proceed as normal.
            parsed_subdomain = subdomain
        # Single record step shared by all three cases above.
        if parsed_subdomain not in SUBDOMAINS_ENUMERATED:
            if args.v:
                ctext(parsed_subdomain, "green", "black")
            SUBDOMAINS_ENUMERATED.append(parsed_subdomain)

    # If our total subdomains discovered is not the same length as our sites
    # visited, scan the rest of our subdomains.
    if len(set(SUBDOMAINS_ENUMERATED)) != len(set(SITES_VISITED)):
        for site in SUBDOMAINS_ENUMERATED:
            find_scripts(site)
Example #5
0
def ascii_banner():
    """Print the SUBSCRAPER ASCII-art banner in red, then the results header.

    Each art row is sent to ``ctext`` with the color ``"red"``; the final
    "Subdomains Found:" header is sent without a color argument.
    """
    # Every backslash in the art is written as "\\". The previous literals
    # relied on invalid escape sequences (a SyntaxWarning on recent Python
    # versions), and one row used "\'", which is a valid escape that dropped
    # the backslash and shifted that row of the drawing one column left.
    banner_rows = (
        "                      `. ___",
        "                    __,' __`.                _..----....____",
        "        __...--.'``;.   ,.   ;``--..__     .'    ,-._    _.-'",
        "  _..-''-------'   `'   `'   `'     O ``-''._   (,;') _,'",
        ",'________________                          \\`-._`-','",
        " `._              ```````````------...___   '-.._'-:",
        "    ```--.._      ,.                     ````--...__\\-.",
        "            `.--. `-`                       ____    |  |`",
        "              `. `.                       ,'`````.  ;  ;`",
        "                `._`.        __________   `.      \\'__/`",
        "                   `-:._____/______/___/____`.     \\  `",
        "         SUBSCRAPER            |       `._    `.    \\",
        "         SUBSCRAPER            `._________`-.   `.   `.___",
        "         SUBSCRAPER                v1.0.0         `------'`",
    )
    for row in banner_rows:
        ctext(row, "red")
    ctext("\nSubdomains Found:\n")
Example #6
0
def WhichLineSide(x, y, x0, y0, x1, y1):
    """Return ``sign`` of the determinant of (x, y), (x0, y0) and (x1, y1).

    The value passed to ``sign`` is the determinant of the 3x3 matrix whose
    rows are (x, y, 1), (x0, y0, 1) and (x1, y1, 1). It is zero when the
    point (x, y) lies on the line through the other two points, and has
    opposite signs on the two sides of that line.
    """
    # Same term order and grouping as a single left-to-right expression.
    positive_terms = x * y0 + y * x1 + x0 * y1
    determinant = positive_terms - x1 * y0 - y1 * x - x0 * y
    return sign(determinant)


# Color names accepted by Figure; Figure.__init__ rejects anything else.
colors = [
    'black', 'grey', 'red', 'green', 'yellow', 'blue', 'magenta', 'cyan',
    'white'
]

# When this file is executed directly it only prints a notice.
if __name__ == '__main__':
    print('This is a module!')

# This call is outside the guard above, so it runs on import as well as on
# direct execution.
ctext(
    ''
)  # For some reason colors in the Windows terminal don't work until ctext() has been called at least once, so here it is, unfortunately


class Figure:
    """Base figure that stores a validated color name."""

    def __init__(self, color='white'):
        """Store ``color`` after checking it against the module-level ``colors``.

        Raises:
            ValueError: If ``color`` is not one of the entries in ``colors``.
        """
        is_known_color = color in colors
        if not is_known_color:
            message = (
                "'color' must be one of these values: 'white', 'red', "
                "'green', 'blue', 'yellow', 'magenta', 'cyan', 'grey', "
                "'black'."
            )
            raise ValueError(message)
        self.color = color


class Rectangle(Figure):
    """A Figure constructed from x, y, w, h, a color and a fill flag."""

    # NOTE(review): in the visible portion x, y, w, h and fill are accepted
    # but not used; the definition may continue beyond this excerpt - confirm.
    def __init__(self, x, y, w, h, color='white', fill=False):
        # Delegate to Figure so the color is validated and stored there.
        Figure.__init__(self, color)
Example #7
0
 def errout(self):
     """Print this object's ``str()`` form via ``ctext`` as red text on black."""
     message = str(self)
     ctext(message, text="red", bg="black")