def main():
    if len(sys.argv) != 2:
        print("Invalid command line arguments.")
        print("Usage: python3 diagnose.py <WEBSITE_URL>")
        exit()
    url = sys.argv[1]
    FILEPATH_PREFIX = "data/"
    FILEPATH_TEXT_SUFFIX_CLEAN = "_clean.txt"
    FILEPATH_TEXT_SUFFIX_BLOCK = "_block.txt"
    FILEPATH_IMAGE_SUFFIX_CLEAN = "_clean.png"
    FILEPATH_IMAGE_SUFFIX_BLOCK = "_block.png"
    txt_clean = FILEPATH_PREFIX + url + FILEPATH_TEXT_SUFFIX_CLEAN
    txt_block = FILEPATH_PREFIX + url + FILEPATH_TEXT_SUFFIX_BLOCK
    img_clean = FILEPATH_PREFIX + url + FILEPATH_IMAGE_SUFFIX_CLEAN
    img_block = FILEPATH_PREFIX + url + FILEPATH_IMAGE_SUFFIX_BLOCK
    website_clean = Website(txt_clean, img_clean, "clean")
    website_block = Website(txt_block, img_block, "block")
    pair = WebsitePair(website_clean, website_block)
    # @TODO This if-else section can be expanded as future faults become detectable.
    if page_is_blank(website_clean, website_block, pair):
        print("Page is blank!")
    else:
        print("No faults detected.")
def setUp(self):
    self.single_plan = Plan('Single', 49, 1)
    self.plus_plan = Plan('Plus', 99, 3)
    self.website_1 = Website('https://google.com')
    self.website_2 = Website('https://google.com')
    self.customer_1 = Customer('customer_1', '123456789', '*****@*****.**')
def serializeWebsite(self, website):
    if "sitemap" in website:
        return Website(website['homepage'], website['input_dict'],
                       website['lastmod'], sitemap=website['sitemap'])
    return Website(website['homepage'], website['input_dict'], website['lastmod'])
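# A minimal sketch of the dict shape serializeWebsite appears to expect,
# inferred from the keys accessed above; the concrete values and the
# `serializer` receiver are illustrative assumptions.
example_record = {
    'homepage': 'https://example.com',
    'input_dict': {},
    'lastmod': '2021-01-01',
    'sitemap': 'https://example.com/sitemap.xml',  # optional key
}
# website = serializer.serializeWebsite(example_record)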
def process_single_website(website_url):
    """Processes a single website and exports to csv string."""
    txt_clean = FILEPATH_PREFIX + website_url + FILEPATH_TEXT_SUFFIX_CLEAN
    txt_block = FILEPATH_PREFIX + website_url + FILEPATH_TEXT_SUFFIX_BLOCK
    img_clean = FILEPATH_PREFIX + website_url + FILEPATH_IMAGE_SUFFIX_CLEAN
    img_block = FILEPATH_PREFIX + website_url + FILEPATH_IMAGE_SUFFIX_BLOCK
    website_clean = Website(txt_clean, img_clean, "clean")
    website_block = Website(txt_block, img_block, "block")
    pair = WebsitePair(website_clean, website_block)
    print(get_csv_header(website_clean, website_block, pair))
    print(get_csv_string(website_clean, website_block, pair))
def main():
    # Initialize different plans
    single_plan = Plan('Single', 49, 1)
    plus_plan = Plan('Plus', 99, 3)
    infinite_plan = Plan('Infinite', 249, -1)

    # Initialize multiple websites
    website_1 = Website('https://website_1.com')
    website_2 = Website('https://website_2.com')
    website_3 = Website('https://website_3.com')
    website_4 = Website('https://website_4.com')

    # Initialize multiple customers
    customer_1 = Customer('customer_1', '123456789', '*****@*****.**')
    customer_2 = Customer('customer_2', '123456789', '*****@*****.**')
    customer_3 = Customer('customer_3', '123456789', '*****@*****.**')

    # customer_1 subscribes to single_plan
    customer_1.add_subscription(single_plan)
    print("{} has subscribed for {} plan".format(
        customer_1, customer_1.subscription.plan))

    # customer_1 adds one website
    customer_1.add_website(website_1)
    print("{} has added website {} as per the {} plan".format(
        customer_1, customer_1.websites, customer_1.subscription.plan))

    # customer_1 cannot add another website on single_plan
    customer_1.add_website(website_2)
    print("{} can't add website {} as per the {} plan".format(
        customer_1, website_2, customer_1.subscription.plan))

    # customer_1 can change plan from single_plan to plus_plan
    customer_1.change_plan(plus_plan)
    print("{} has changed his current plan {} to {} plan".format(
        customer_1, single_plan, customer_1.subscription.plan))

    # customer_2 subscribes to infinite_plan
    customer_2.add_subscription(infinite_plan)

    # customer_2 can add multiple websites
    customer_2.add_website(website_1)
    customer_2.add_website(website_2)
    customer_2.add_website(website_3)
    customer_2.add_website(website_4)
    print("{} has added four websites {} under infinite plan".format(
        customer_2, customer_2.websites))
def main():
    logger.info("Cartriage v5.0")
    parser = argparse.ArgumentParser(
        description="Retrieves information from printers.")
    parser.add_argument(
        "l", type=open, metavar="printers",
        help="Text file containing printer IP addresses, one per line.")
    parser.add_argument("o", metavar="output",
                        help="Filename for resulting HTML page.")
    parser.add_argument("-v", action="store_true", help="Enable verbose mode.")
    try:
        args = parser.parse_args()
        if args.v:
            logger.info("Enabled verbose mode")
            logger.setLevel(logging.DEBUG)
            logger.debug(args)
        startTime = time.time()
        scanned, successfullyScanned, printers = runScan(args.l)
        elapsedTime = "%d seconds" % (time.time() - startTime)
        site = Website(scanned, successfullyScanned, printers, elapsedTime)
        with open(args.o, "w") as output:
            output.write(str(site))
        logger.info("Done! Results available in file: %s" % args.o)
        sys.exit(0)
    except IOError as e:
        logger.error(str(e))
        sys.exit(1)
def startup(self):
    """
    Some stuff that should get called after everything is loaded.
    """
    self.env.seishub.startup()
    self.nw_tree.startup()
    # Connect some slots.
    QtCore.QObject.connect(
        self.nw_tree.nw_select_model,
        QtCore.SIGNAL("selectionChanged(QItemSelection, QItemSelection)"),
        self.waveforms.waveform_scene.add_channel)
    web = Website(env=self.env)
    web.startup()
    # Add a WebView to later display the map.
    with open(os.path.join(self.env.temp_res_dir, 'map.html')) as map_file:
        html = map_file.read()
    self.env.web.setHtml(html)
    self.picks.update()
    css_url = QtCore.QUrl.fromLocalFile(os.path.abspath(self.env.css))
    server = '%s/manage/seismology/stations' % self.env.seishub_server
    url = QtCore.QUrl(server)
    url.setUserName(self.env.seishub_user)
    url.setPassword(self.env.seishub_password)
    # Might work with some Qt version...
    self.env.station_browser.page().settings().setUserStyleSheetUrl(css_url)
    self.env.station_browser.load(url)
    self.env.station_browser.page().settings().setUserStyleSheetUrl(css_url)
def go(self):
    self.work_pages(self.site)
    self.session.add(Website(
        url=self.site,
        title='',
        domain=self.site,
        pages_count=self.pages_count,
        HTML_version=0.0))
    self.session.commit()
def get_website(self, url: str, check_interval: int):
    """
    Instantiates a Website instance and safely returns it, or None on failure.

    PARAMETERS:
        url: String, e.g. http://google.fr
        check_interval: Positive integer in seconds; ping refresh frequency,
            e.g. 30 equates to a check every 30 seconds.

    RETURNS:
        Website instance or None.
    """
    try:
        website = Website(url=url, check_interval=check_interval)
    except Exception:
        print(
            "I wasn't able to connect with that URL.\n"
            "Please revise it, including 'http://'"
            " or 'https://' as appropriate."
        )
        return None
    return website
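# A minimal usage sketch for get_website, assuming a `monitor` object that
# exposes the method above; the URL and interval are illustrative.
site = monitor.get_website("https://example.com", check_interval=30)
if site is None:
    # Construction failed (e.g. unreachable URL), so there is nothing to track.
    pass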
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    super().__init__(scope, id, **kwargs)
    ticker = dynamodb.Attribute(
        name='Ticker',
        type=dynamodb.AttributeType.STRING,
    )
    date = dynamodb.Attribute(
        name='Date',
        type=dynamodb.AttributeType.STRING,
    )
    table = dynamodb.Table(
        self, 'StockHistory',
        partition_key=ticker,
        sort_key=date,
        billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
        removal_policy=core.RemovalPolicy.DESTROY,
        point_in_time_recovery=True,
    )
    index_name = 'Date-index'
    table.add_global_secondary_index(
        index_name=index_name,
        partition_key=date,
        sort_key=ticker,
        projection_type=dynamodb.ProjectionType.INCLUDE,
        non_key_attributes=['Name'])
    Importer(self, 'Importer', table=table)
    restapi = RestApi(self, 'Api', table=table, index_name=index_name)
    Website(self, 'Website', api=restapi.api)
def home():
    if request.method == 'POST':
        Website(request.form['url'])
    Website.check_all()
    return render_template(
        "home.html",
        pages=Website.all,
        length=len(Website.all)
    )
def test_fix_link(link, hostname, scheme, result):
    mock_parsed_url = Mock()
    mock_parsed_url.hostname = hostname
    mock_parsed_url.scheme = scheme
    mock_parsed_url.netloc = hostname
    website = Website('seed_url')
    assert website.fix_link(link, mock_parsed_url) == (result, hostname)
def getWebsites(self):
    websites = dict()
    with open('data1/websites.json') as data_file:
        websitesData = json.load(data_file)['websites']
    for website in websitesData:
        websites[website['id']] = Website(website)
    return websites
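# A hypothetical sketch of the structure getWebsites reads from
# data1/websites.json: a top-level "websites" list whose entries at least
# carry an "id" key. The "url" field is an illustrative assumption.
example_data = {
    "websites": [
        {"id": "site-1", "url": "https://example.com"},
        {"id": "site-2", "url": "https://example.org"},
    ]
}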
def test_scrape(monkeypatch, page_content, links, to_visit):
    mock_response = Mock()
    mock_response.text = page_content
    mock_response.status_code = 200
    monkeypatch.setattr('website.requests.get', lambda x: mock_response)
    website = Website('http://hostname/url')
    website.scrape()
    # pages are 'http://hostname/url', 'http://hostname/new-url',
    # 'https://hostname/', 'http://hostname/', 'https://hostname/new-url'
    assert len(website.pages) == 5
def test_scrape_url(monkeypatch, page_content, links, to_visit):
    mock_response = Mock()
    mock_response.text = page_content
    mock_response.status_code = 200
    monkeypatch.setattr('website.requests.get', lambda x: mock_response)
    website = Website('http://hostname/url')
    # Simulate visiting the page.
    url, _ = website.to_visit.popitem()
    website.scrape_url(url)
    assert website.to_visit == OrderedDict((key, None) for key in to_visit)
    assert website.pages[url].links == links
def process_manifest():
    """Processes all websites in the manifest."""
    m = manifest.MANIFEST
    for i, entry in enumerate(m):
        txt_clean = FILEPATH_PREFIX + entry[0] + FILEPATH_TEXT_SUFFIX_CLEAN
        txt_block = FILEPATH_PREFIX + entry[0] + FILEPATH_TEXT_SUFFIX_BLOCK
        img_clean = FILEPATH_PREFIX + entry[0] + FILEPATH_IMAGE_SUFFIX_CLEAN
        img_block = FILEPATH_PREFIX + entry[0] + FILEPATH_IMAGE_SUFFIX_BLOCK
        website_clean = Website(txt_clean, img_clean, "clean")
        website_block = Website(txt_block, img_block, "block")
        pair = WebsitePair(website_clean, website_block)
        if i == 0:
            print(get_csv_header(website_clean, website_block, pair))
        print(get_csv_string(website_clean, website_block, pair))
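# A hypothetical sketch of manifest.MANIFEST, inferred only from entry[0]
# being spliced into the data/ file paths above; any further fields per
# entry are unknown and omitted here.
MANIFEST = [
    ("example.com",),
    ("example.org",),
]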
def generate_text(self, sites_file, search_limit, keep_to_sites):
    with open(sites_file, "r") as f:
        for site in f.readlines():
            site = site.strip()  # readlines() keeps the trailing newline
            sys.stderr.write("Working on: " + site + '\n')
            ws = Website(home_page=site, search_limit=search_limit,
                         keep_to_site=keep_to_sites)
            self.sites.append(ws)
    for site in self.sites:
        for link in site.links:
            self.text += site.get_page_text(link)
def __init__(self, url, keywords=None, searchPageLimit=2,
             websitesJsonFile="websites.json", isInitialCrawl=False):
    # initialize class attributes
    self.baseUrl = url
    self.keywords = keywords
    self.articleLinks = []
    self.articleCount = 0
    self.searchPageLimit = searchPageLimit
    self.websitesJsonFile = websitesJsonFile
    self.isInitialCrawl = isInitialCrawl
    # instantiate a Website object to interact with the website to be crawled
    try:
        self.website = Website(url, websitesJsonFile=self.websitesJsonFile)
    # raise exception if there is an error connecting to the website
    except WebsiteFailedToInitialize:
        raise WebsiteFailedToInitialize(url)
    # open the json file containing websites and their attributes
    with open(self.websitesJsonFile) as data_file:
        self.websites = json.load(data_file)
    # set the searchQuery attribute to the appropriate search query structure
    # in the websites json file
    for website, attributes in self.websites.items():
        if website in self.baseUrl:
            self.searchQuery = attributes["searchQuery"]
            self.nextPageType = attributes["nextPage"]
    # populate the exceptions attribute list with websites whose article urls
    # need to be manually crawled
    self.exceptions = [
        "https://www.ourmidland.com/",
        "https://www.lakecountystar.com/",
        "https://www.northernexpress.com/",
        "https://www.manisteenews.com/"
    ]
    print("\r" + bcolors.OKGREEN + "[+]" + bcolors.ENDC + " Crawling " +
          self.baseUrl + "..." + bcolors.ENDC, end="")
    sys.stdout.flush()
    # start crawling
    self.crawl()
    print("\r" + bcolors.OKGREEN + "[+]" + bcolors.ENDC + " Crawled " +
          self.baseUrl + ": " + bcolors.OKGREEN +
          str(len(self.articleLinks)) + " URLs retrieved" + bcolors.ENDC)
def read_file(filename):
    """
    Reads a file and returns a list of Website objects
    """
    lines = []
    with open(filename) as file:
        for line in file:
            url, interval = line.split(' ')
            interval = int(interval)
            website = Website(url, interval)
            lines.append(website)
    return lines
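# A minimal sketch of the input format read_file expects, inferred from the
# line.split(' ') above: one "url interval" pair per line. The file name and
# values are illustrative.
#
#     https://example.com 30
#     https://example.org 60
#
# websites = read_file("websites.txt")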
def getSites(self):
    global conn
    global cur
    cur.execute("SELECT * FROM sites")
    sitesData = cur.fetchall()
    allSiteObjs = []
    for site in sitesData:
        siteObj = Website(site['id'], site['name'], site['url'],
                          site['searchUrl'], site['resultListing'],
                          site['resultUrl'], site['absoluteUrl'],
                          site['pageTitle'], site['pageBody'])
        allSiteObjs.append(siteObj)
    return allSiteObjs
def test_find_links(page_content, hostname, scheme, links, to_visit):
    mock_parsed_url = Mock()
    mock_parsed_url.hostname = hostname
    mock_parsed_url.scheme = scheme
    mock_parsed_url.netloc = hostname
    website = Website('http://hostname/url')
    # Simulate visiting the page.
    website.to_visit.popitem()
    page = Page('a_url')
    bs = BeautifulSoup(page_content, 'html.parser')
    website.find_links(page, bs, mock_parsed_url)
    assert page.links == links
    assert website.to_visit == OrderedDict((key, None) for key in to_visit)
def run_website():
    website = Website()

    @website.route('/')
    def index():
        return 200, 'users list'

    @website.route('/users/([0-9]+)')
    def user(user_id):
        if user_id not in ['1', '2']:
            return 404, ''
        return 200, f'user {user_id}'

    website.run(_ADDRESS)
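# Expected responses for the routes above, assuming _ADDRESS points at the
# running server; the paths shown are illustrative.
#
#   GET /           -> 200, 'users list'
#   GET /users/1    -> 200, 'user 1'
#   GET /users/99   -> 404, ''   (only ids '1' and '2' exist)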
def __init__(self):
    self.w = Website()
    self.root = Tk()
    self.root.title("Auto site - Enter the fields")
    self.my_font = tkFont.Font(family="Helvetica", size=11)
    self.frame = Frame(self.root, height=800, width=800,
                       padx=50, pady=10)
    self.frame.pack()
    self.fields()
    self.buttons()
    self.root.mainloop()
def test_stocks():
    # Read in all SP500 and NASDAQ info.
    nasdaq = READ("nasdaq.txt", ".")
    sp100 = READ("sp500.txt")
    for stock in nasdaq.splitlines():
        print("NASDAQ SEARCH: " + stock)
        url = wikipedia.wiki_search(stock)
        webpage = Website(url)
        webpage.set_directory("./wikipedia/")
        html = webpage.get_html()
        xml = webpage.get_xml()
        websites.append(webpage)
    for stock in sp100.splitlines():
        print("EVALUATING STOCK SP500 " + stock)
        wikipedia.wiki_search(stock)
def test_city():
    cities = []
    for city in URL_CITY_ARRAY:
        location = Website(city)
        location.set_directory('./wikipedia/')
        html = location.get_html()
        cities.append(location)
    """
    cities = []
    fact_book = []
    #for country in CIA_FACT_BOOK:
    #    CIA.cia_indexer(Website(country))
    for city in URL_CITY_ARRAY:
        cities.append(Website(city))
    for city in cities:
        wikipedia.wiki_study_city(city)
    """
    return
def analyse_URL(jsonData):
    """
    Decide whether a website is phishing using its keywords and a Google
    search based on those.

    Parameters
    ----------
    jsonData: contains site data
    """
    ws = Website(json=jsonData)
    print(datetime.now().strftime("%H:%M:%S.%f") + "-- building vector")
    # build feature vector
    feat_vec_temp = {}
    feat_vect_site = build_feat_vec.feature_vector(extractor, ws)
    feat_vec_temp[0] = feat_vect_site
    feat_vect = DataFrame(feat_vec_temp)
    feat_vect = feat_vect.transpose().fillna(0)
    # prediction using gradient boosting
    exp = "238"
    features = feat_vect.columns
    print(datetime.now().strftime("%H:%M:%S.%f") +
          "-- vector done, start gradient boosting:")
    scoregb, predictiongb = _predict_gb(1, feat_vect, features, exp)
    gb_results = scoregb, predictiongb
    print(datetime.now().strftime("%H:%M:%S.%f") + "-- gradient done")
    global keep_track
    if keep_track:
        if gb_results[1] == 1:
            JSONtoFile(jsonData, True, jsonData['siteid'])
        else:
            JSONtoFile(jsonData, False, jsonData['siteid'])
    return gb_results, jsonData['jspageid'], jsonData['siteid']
def target_analyse(data):
    json_data = {'jspageid': data['jspageid']}
    json_data['siteid'] = data['siteid']
    ws = Website(json=data)
    target_identity = identify_target(ws)
    mld = '.'.join(split_mld_ps(data['landurl']))
    if mld == target_identity[0]:
        json_data['falsePositive'] = True
    else:
        json_data['falsePositive'] = False
    json_data['target'] = target_identity[0]
    json_data['otherTargets'] = target_identity[1]
    # print('Identified Target: ' + target_identity[0] +
    #       "\t/ other potential targets: " + str(target_identity[1]))
    return json_data
def add_site(self, url):
    """
    Enables the user to add a website.

    Parameters:
        url (str): Url of the website.

    Returns:
        Object: Website object with details of the website.
    """
    self.check_auth()
    subscription = database['subscriptions'].get(f'{self.email}')
    check_key(subscription, 'user has no subscription')
    subscription.check_sub()
    site_limit = subscription.plan.limit
    if site_limit and (site_limit == len(subscription.websites)):
        raise ValueError(
            f'Current plan can only allow {site_limit} website(s)')
    new_site = Website(self, url)
    subscription.websites[f'{new_site.id}'].append(new_site)
    subscription.save()
    return new_site
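# A minimal usage sketch, assuming an authenticated account object (`user`)
# exposing add_site as above; the URL is illustrative. Once the plan's site
# limit is reached, add_site raises ValueError.
# site = user.add_site('https://example.com')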
def crawler(self, classifier: webclassifier.Classifier):
    if len(self.website_nodes) == 0:
        print("There are no more websites that need to be crawled.")
        return False
    new_website_nodes = []
    for website_node in self.website_nodes:
        for url in website_node.website.soup.find_all('a'):
            try:
                url.attrs['href']
            except KeyError:
                continue
            # A crawled website need not be crawled again.
            if url.attrs['href'] in classifier.url:
                print('This website has already been added to the classifier.')
                continue
            # Decide which websites can be stored.
            try:
                new_website_node = WebsiteNode(
                    Website(url.attrs['href'], get_soup(url.attrs['href'])))
            except Exception:
                print('Invalid url ', url['href'])
            else:
                classifier.add_website(new_website_node.website)
                classifier.cal()
                new_website_node.website.relevance = -1
                for seed in classifier.seed_websites:
                    rel = classifier.calculate_web_similarity_by_text(
                        seed, new_website_node.website)
                    new_website_node.website.relevance = max(
                        new_website_node.website.relevance, rel)
                if new_website_node.website.relevance > self.threshold:
                    print(url.attrs['href'],
                          'is relevant, relevance is %s'
                          % new_website_node.website.relevance)
                    website_node.child.append(new_website_node)
                    new_website_nodes.append(new_website_node)
    classifier.cal()
    self.website_nodes = new_website_nodes
    return True
def addWebsite(self):
    """
    Add a website to the user's data
    :return:
    """
    print(msg.website_add_welc)
    name = input(msg.website_add_name)
    while True:
        url = input(msg.website_add_url)
        if self.checkUrl(url):
            break
        print(msg.website_url_inc)
    while True:
        checkInterval = input(msg.website_add_check)
        try:
            checkInterval = int(checkInterval)
            break
        except ValueError:
            print(msg.website_add_check_inc)
    newWebsite = Website(name=name, url=url, checkInterval=checkInterval)
    self.mySites[name] = newWebsite
    # update the stored data about the user
    data_utils.updateUser(self)
    return