def main():
    print_banner()
    root_url = get_args()
    links = []
    tree = sitemap_tree_for_homepage(root_url)
    for page in tree.all_pages():
        links.append(page.url)

    hc = hashek.Hashek()
    print("\nGathered %d links from %s" % (len(links), root_url))

    errors_fixes = []
    for i, link in enumerate(links):
        text = extract_text_from_link(link)
        print("Checking link: %s" % link)
        suggestions_dict = hc.check_text(text)
        errors_fixes.append({link: suggestions_dict})
    hc.close()
def ask_for_url():
    print("What URL do you want to search?")
    url = input('>')
    print("Do you want to search more than one URL?")
    answer = input('>').lower()
    if answer.startswith('y'):
        # Check sitemap of URL
        tree = sitemap_tree_for_homepage(url)
        # for page in tree.all_pages():
        #     print(page)
        # Tell user how many pages are found
        number_of_pages_found = len(list(tree.all_pages()))
        print(f'A total of {number_of_pages_found} pages were found.')
        # Allow user to specify how many pages to scrape
        print("How many pages do you want to scrape?")
        pages_to_scrape = int(input('>'))
        pages_scraped = []
        for page in tree.all_pages():
            pages_scraped.append(page.url)
            if len(pages_scraped) == pages_to_scrape:
                break
        return pages_scraped
    else:
        return [url]
def generate_sitemap(domain):
    # Generate the sitemap
    tree = sitemap_tree_for_homepage(domain)
    # Initialise the list of links
    links = []
    # Iterate through all URLs found by the sitemap generator
    for page in tree.all_pages():
        url = page.url
        # Some sites will not have the domain name in front of the URL, so add it in
        if url[0] == '/':
            url = domain + url
        # This is the structure of the db entry;
        # needs work to improve search functionality
        link_entry = {'url': url, 'domain': 'https://' + strip_domain(domain)}
        # Add this to the list of links needing to be appended
        links.append(link_entry)
    # Write the links to a file (one for each domain)
    write_to_file(domain, links)
def get_urls(url):
    tree = sitemap_tree_for_homepage(url)
    urls = []
    for page in tree.all_pages():
        urls.append(page.url)
    return urls
def get_sitemap_tree(common_list):
    """Collect all page URLs from the sitemap of every link in common_list."""
    sitemap_tree = []
    for link in common_list:
        web_client = _RequestsWebClient()
        tree = sitemap_tree_for_homepage(link, web_client)
        for page in tree.all_pages():
            sitemap_tree.append(page.url)
    return sitemap_tree
def fetch_sitemap_pages_for_media_id(db: DatabaseHandler, media_id: int) -> None:
    """Fetch and store all pages (news stories or not) from media's sitemap tree."""

    media = db.find_by_id(table='media', object_id=media_id)
    if not media:
        raise Exception("Unable to find media with ID {}".format(media_id))

    media_url = media['url']

    log.info("Fetching sitemap pages for media ID {} ({})...".format(media_id, media_url))
    web_client = _SitemapWebClient()
    sitemaps = sitemap_tree_for_homepage(homepage_url=media_url, web_client=web_client)
    log.info("Fetched pages for media ID {} ({}).".format(media_id, media_url))

    log.info("Storing sitemap pages for media ID {} ({})...".format(media_id, media_url))

    insert_counter = 0
    for page in sitemaps.all_pages():
        db.query("""
            INSERT INTO media_sitemap_pages (
                media_id, url, last_modified, change_frequency, priority,
                news_title, news_publish_date
            ) VALUES (
                %(media_id)s, %(url)s, %(last_modified)s, %(change_frequency)s,
                %(priority)s, %(news_title)s, %(news_publish_date)s
            )
            ON CONFLICT (url) DO NOTHING
        """, {
            'media_id': media_id,
            'url': page.url,
            'last_modified': page.last_modified,
            'change_frequency': page.change_frequency.value if page.change_frequency is not None else None,
            'priority': page.priority,
            'news_title': page.news_story.title if page.news_story is not None else None,
            'news_publish_date': page.news_story.publish_date if page.news_story is not None else None,
        })

        insert_counter += 1
        if insert_counter % 1000 == 0:
            log.info("Inserted {} URLs...".format(insert_counter))

    log.info("Done storing {} sitemap pages for media ID {} ({}).".format(insert_counter, media_id, media_url))
def import_domains():
    with open('ru_domains.txt') as file_data:
        for item in file_data:
            domain = item.split(';')[0]
            try:
                req = requests.get('https://' + domain, timeout=3)
                if req.status_code == requests.codes.ok:
                    sitemap_tree = sitemap_tree_for_homepage('https://' + domain)
                    for page in sitemap_tree.all_pages():
                        print(page.url)
            except requests.exceptions.RequestException:
                continue
def add_links_from_sitemap_xml(self):
    if self.sitemap_xml_processor is None:
        return
    assert self.website.main_page_url in self.website.url_nodes
    root_page = self.website.main_page_url.strip('/')
    tree = sitemap_tree_for_homepage(root_page)
    cnt = 0
    useful = 0
    for page in tree.all_pages():
        cnt += 1
        weight = self.sitemap_xml_processor(page.url)
        if weight > TLinkInfo.MINIMAL_LINK_WEIGHT:
            if page.url not in self.pages_to_process:
                useful += 1
                link_info = TLinkInfo(TClickEngine.sitemap_xml, self.website.main_page_url, page.url, anchor_text="")
                link_info.weight = weight
                self.add_link_wrapper(link_info)
    self.logger.info("processed {} links from {}/sitemap.xml found {} useful links".format(cnt, root_page, useful))
def main(req: func.HttpRequest) -> func.HttpResponse:
    name = req.params.get('url')
    now = datetime.now()
    if not name:
        try:
            req_body = req.get_json()
        except ValueError:
            pass
        else:
            name = req_body.get('url')

    logging.info('Python HTTP trigger function processed a request.')

    if not name:
        return func.HttpResponse("Please pass a url")

    try:
        tree = sitemap_tree_for_homepage(name)
        cur_time = now + timedelta(minutes=3)
        output = ""
        count = 0
        for page in tree.all_pages():
            # Stop collecting URLs after roughly three minutes
            if datetime.now() > cur_time:
                break
            # Pull the URL out of the page's string representation
            m = re.search('url=(.+?), ', str(page))
            if m:
                found = m.group(1)
                if str(found) not in output:
                    output += "\"" + str(found) + "\","
                    count += 1
        output = "[" + output[:-1] + "]"
        print(output)
        print(count)
    except Exception:
        output = "Something went wrong"

    return func.HttpResponse(output)
def search_site(site, keywords, data):
    try:
        tree = sitemap_tree_for_homepage(site)
        counter = 0
        for page in tree.all_pages():
            page_text = ''
            for keyword in keywords:
                if not data.get(page.url):
                    data[page.url] = {}
                # Skip keywords that already have a stored match count for this page
                if data[page.url].get(keyword, -1) >= 0:
                    continue
                if not page_text:
                    # Only fetch the page text if at least one keyword still needs counting
                    print('getting page text')
                    page_text = requests.get(page.url).text
                print("Searching on %s for %s" % (page.url, keyword))
                data = store_match_count(data, page.url, page_text, keyword)
                counter += 1
                # Periodically persist progress
                if counter >= 20:
                    save_data(data)
                    counter = 0
    except Exception:
        print("failed to search on %s" % site)
    return data
two_factor = ("--two_factor" in args)

print("Logging in")
if two_factor:
    print("Two Factor authentication required")
    print("Not Implemented")
    exit()
login_info = r.login(acc_info[0], acc_info[1])
print("Successfully Logged in")

root = 'https://robinhood.com/'
collections = []
stock_hs = set()
tree = sitemap_tree_for_homepage('https://robinhood.com/sitemap.xml')
pages = tree.all_pages()
urls = [p.url.replace(root, "") for p in pages]
for url in urls:
    if 'collections' in url:
        collections.append(url.replace('collections/', ''))
    elif 'stocks' in url:
        stock_hs.add(url.replace('stocks/', ''))
print(len(collections))
print(len(stock_hs))

print("Updating list of stocks from collections set")
print(stock_hs)
for col in collections:
    print(col)
'''
Parse sitemap and write it to a CSV file.
'''
import csv
import argparse

from usp.tree import sitemap_tree_for_homepage


def write_csv(tree, csv_file):
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerow(['url', 'priority', 'last_modified'])
        for line in tree.all_pages():
            writer.writerow([line.url, str(line.priority), str(line.last_modified)])


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Generate sitemap csv from given url")
    parser.add_argument('--url', default="http://www.freshdirect.com/")
    parser.add_argument('--csv', default="sitemap.csv")
    args = parser.parse_args()
    print(args)
    tree = sitemap_tree_for_homepage(args.url)
    write_csv(tree, args.csv)
from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage('https://hostingspell.com/')
print(tree)
def pages_from_sitemap(page_url: str) -> list:
    tree = sitemap_tree_for_homepage(page_url)
    return [page.url for page in tree.all_pages()]
from usp.tree import sitemap_tree_for_homepage
import re

tree = sitemap_tree_for_homepage('https://documents.polycom.com/')
for page in tree.all_pages():
    # page is a SitemapPage object, so search its string representation for URLs
    data = re.findall(r"(https:\/\/.+)", str(page))
    for url in data:
        print(url)
from usp.tree import sitemap_tree_for_homepage
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", dest="output_path", required=False,
                        default="downloaded_sitemap_urls.txt")
    parser.add_argument("urls", nargs="*")
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()
    if len(args.urls) > 0:
        url = args.urls[0]
    else:
        url = "http://sokirko.info"
    print("download all sitemaps from {}".format(url))
    tree = sitemap_tree_for_homepage(url)
    urls = list(tree.all_pages())
    print("write {} urls to {}".format(len(urls), args.output_path))
    with open(args.output_path, "w") as outp:
        for u in urls:
            outp.write("{}\n".format(u.url))
def dealer_urls(car_make: str, model: str, zip_code: int, dist_range: int = 100,
                min_stars: int = 4, prices_arg='full'):
    api = API
    url = 'https://maps.googleapis.com/maps/api/place/textsearch/json?'
    geoloc = Nominatim(user_agent="PriceScraper")
    try:
        lat = geoloc.geocode({'postalcode': zip_code})[1][0]
        long = geoloc.geocode({'postalcode': zip_code})[1][1]
    except Exception:
        raise ValueError(str(zip_code) + ' is not a valid zip code, try again with an existing zip code')

    # Find dealerships for the given make near the zip code via the Places text search API
    r = requests.get(url + 'query=' + car_make + '+Dealerships&location=' + str(lat) + ',' + str(long) +
                     '&radius=' + str(dist_range) + '&key=' + api)
    ids = [res['place_id'] for res in r.json()['results']]
    print('Found ' + str(len(ids)) + ' matching ' + car_make + ' dealerships within ' +
          str(dist_range) + ' miles of ' + str(zip_code))

    url2 = 'https://maps.googleapis.com/maps/api/place/details/json?'
    print('Getting urls for matching ' + car_make + ' Dealers....')
    url_list = []
    for place_id in ids:
        try:
            request = requests.get(url2 + 'place_id=' + place_id +
                                   '&fields=name,rating,website' + '&key=' + api).json()
            print([request['result']['name'], request['result']['rating'],
                   request['result']['website'].split('/')[2]])
            url_list.append([request['result']['name'], request['result']['rating'],
                             request['result']['website'].split('/')[2]])
        except Exception:
            pass

    url_df = pd.DataFrame(url_list, columns=['Dealership Name', 'Rating', 'URL'])
    url_df = url_df[url_df['Rating'] >= min_stars]
    if len(url_df) == 0:
        raise ValueError('Too few rows remaining after filter. '
                         'Try changing the minimum star rating for dealerships')

    # Collect inventory-related URLs from each dealership's sitemap
    sitemap = []
    for row in range(len(url_df)):
        url = url_df.iloc[row]['URL']
        if 'https' not in url:
            tree = sitemap_tree_for_homepage('https://' + url.replace('http://', ''))
        else:
            tree = sitemap_tree_for_homepage(url)
        try:
            with timeout(400, exception=RuntimeError):
                each = []
                for page in tree.all_pages():
                    if model.lower() in page.url.lower() and (
                            'new' in page.url.lower() or 'inventory' in page.url.lower()):
                        each.append(page.url)
                sitemap.append(each)
        except RuntimeError:
            sitemap.append([])

    url_df['Sitemap'] = sitemap
    url_df = url_df[url_df['Sitemap'].str.len() > 0]

    # A 'full' report keeps three prices (MSRP plus two discounts); otherwise keep only one
    prices_index = (3 if prices_arg == 'full' else 1)
    prices = []
    for i in range(len(url_df)):
        site = url_df.iloc[i]['Sitemap']
        for url in site:
            page = requests.get(url)
            soup = bs4.BeautifulSoup(page.content, 'html.parser')
            try:
                name = soup.find_all(text=re.compile('[0-9]{4} ' + car_make + ' ' + model + ' [A-Z|a-z]{1,10}'))
                price = [int(str(x).strip('$').replace(',', ''))
                         for x in soup.find_all(text=re.compile(r'^\$[0-9]{2}\,[0-9]{3}'))]
                if len(price) == 0:
                    break
                if len(name[0]) > 120:
                    # The first match is too long to be a vehicle title, so look for a shorter one
                    j = 1
                    while j < len(name):
                        if len(name[j]) > 120:
                            j += 1
                        else:
                            prices.append([url_df.iloc[i]['Dealership Name'], name[j],
                                           sorted(price[:prices_index]), url])
                            print('Getting price for a ' + name[j])
                            break
                    prices.append([url_df.iloc[i]['Dealership Name'], car_make + ' ' + model,
                                   sorted(price[:prices_index]), url])
                else:
                    print('Getting price for a ' + name[0])
                    prices.append([url_df.iloc[i]['Dealership Name'], name[0],
                                   sorted(price[:prices_index]), url])
            except Exception as e:
                print(e)

    prices_dat = pd.DataFrame(prices, columns=['Dealership Name', 'Model', 'Prices', 'URL'])
    if prices_arg == 'full':
        prices_dat = prices_dat[prices_dat['Prices'].str.len() > 2]
        prices_dat['Prices_MSRP'] = [x[2] for x in prices_dat['Prices']]
        prices_dat['Prices_First_Discount'] = [x[1] for x in prices_dat['Prices']]
        prices_dat['Prices_Final_Discount'] = [x[0] for x in prices_dat['Prices']]
        prices_dat = prices_dat.drop('Prices', axis=1)
    else:
        prices_dat['Prices_MSRP'] = [x[0] for x in prices_dat['Prices']]
        prices_dat = prices_dat.drop('Prices', axis=1)

    prices_dat.to_csv(os.path.join(os.path.abspath('.'), 'interface/static/interface/user_files/') +
                      car_make + '_' + model + '_' + 'within_' + str(dist_range) + '_miles_of_' +
                      str(zip_code) + '_' + '_prices_' + prices_arg + '.csv', index=False)
class TimeOutException(Exception):
    pass


def alarm_handler(signum, frame):
    print("ALARM signal received")
    raise TimeOutException()


docID = 0
urllist = []
recipes_dict = {}

tree = sitemap_tree_for_homepage("https://www.bbcgoodfood.com/")
for page in tree.all_pages():
    url = page.url
    if "https://www.bbcgoodfood.com/recipes" in url:
        urllist.append(page.url)

with open('url.txt', 'w') as fp:
    for url in urllist:
        fp.write(url + "\n")

signal.signal(signal.SIGALRM, alarm_handler)

with open('14_02_20.txt', 'w') as fp:
    for url in urllist:
        signal.alarm(8)
        print(url)