def run(self):
    # sync top level domains with mozilla if the user is root
    if os.geteuid() == 0:
        update_tld_names()
    else:
        print("Not running as root, you are going to need those privs to nmap properly")
        sys.exit(-1)
    # try to resolve ip
    if self._isHostname:
        try:
            self._ip = socket.gethostbyname(self._target)
        except:
            print("== Error resolving IP, check that the hostname resolves:")
            print(sys.exc_info())
            sys.exit(-1)
    else:
        self._ip = self._target
    # Iterate through plugins which require a hostname to be passed
    if self._isHostname:
        for plugin in self._hostnamePlugins:
            plugin.run(self._target)
    # Iterate through the remaining plugins with an IP
    for plugin in self._plugins:
        plugin.run(self._ip)

def init(self):
    if url_normalize is None:
        raise MissingDependencyError("url-normalize")
    url_version = pkg_resources.get_distribution("url-normalize").version
    if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and self.default_scheme is not None:
        raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                         "Get at least version '1.4.1'." % url_version)
    if get_tld is None:
        raise MissingDependencyError("tld")
    try:
        update_tld_names()
    except tld.exceptions.TldIOError:
        self.logger.info("Could not update TLD names cache.")
    if self.domain_whitelist != '':
        self._domain_whitelist.extend(self.domain_whitelist.split(','))
    if self.substitutions != '':
        temp = self.substitutions.split(';')
        if len(temp) % 2 != 0:
            raise InvalidArgument('substitutions', got=self.substitutions,
                                  expected="even number of ; separated strings")
        for i in range(int(len(temp) / 2)):
            self._substitutions.append([temp[2 * i], temp[2 * i + 1]])
    if not ClassificationType.is_valid(self.classification_type):
        self.classification_type = 'unknown'
    if self.default_scheme is not None:
        self.url_kwargs = {'default_scheme': self.default_scheme}
    else:
        self.url_kwargs = {}

def main():
    # Setup
    update_tld_names()  # Forces the script to sync the tld names with the latest version from Mozilla
    get_info()  # Populate 'data' with objects
    subreddits = ['worldnews', 'news']
    # threads = []  # praw is not thread-safe
    reddit = praw.Reddit('SourceInfoBot', user_agent='SourceInfoBot v1.0')
    if reddit is None:
        log(ErrType.error, "Unable to connect to Reddit.")
        sys.exit()
    else:
        log(ErrType.info, "Connected to Reddit.")
        print("Connected to Reddit.")
    subs = "+".join(subreddits)
    for submission in reddit.subreddit(subs).stream.submissions():
        comment = proc_submission(submission)
        if comment is None:
            continue
        else:
            submission.reply(comment)

def init(self):
    if url_normalize is None:
        raise ValueError("Could not import 'url-normalize'. Please install it.")
    url_version = pkg_resources.get_distribution("url-normalize").version
    if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and hasattr(self.parameters, 'default_scheme'):
        raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                         "Get at least version '1.4.1'." % url_version)
    if get_tld is None:
        raise ValueError("Could not import 'tld'. Please install it.")
    try:
        update_tld_names()
    except tld.exceptions.TldIOError:
        self.logger.info("Could not update TLD names cache.")
    self.domain_whitelist = []
    if getattr(self.parameters, "domain_whitelist", '') != '':
        self.domain_whitelist.extend(self.parameters.domain_whitelist.split(','))
    self.substitutions = []
    if getattr(self.parameters, "substitutions", '') != '':
        temp = self.parameters.substitutions.split(';')
        if len(temp) % 2 != 0:
            raise InvalidArgument('substitutions', got=self.parameters.substitutions,
                                  expected="even number of ; separated strings")
        for i in range(int(len(temp) / 2)):
            self.substitutions.append([temp[2 * i], temp[2 * i + 1]])
    self.classification_type = getattr(self.parameters, "classification_type", "unknown")
    if not ClassificationType.is_valid(self.classification_type):
        self.classification_type = 'unknown'
    if hasattr(self.parameters, 'default_scheme'):
        self.url_kwargs = {'default_scheme': self.parameters.default_scheme}
    else:
        self.url_kwargs = {}

def run(self):
    # sync top level domains with mozilla if the user is root
    if os.geteuid() == 0:
        update_tld_names()
    else:
        print "Not running as root, you are going to need those privs to nmap properly"
        sys.exit(-1)
    # try to resolve ip
    if self._isHostname:
        try:
            self._ip = socket.gethostbyname(self._target)
        except:
            print "== Error resolving IP, check that the hostname resolves:"
            print sys.exc_info()
            sys.exit(-1)
    else:
        self._ip = self._target
    # Iterate through plugins which require a hostname to be passed
    if self._isHostname:
        for plugin in self._hostnamePlugins:
            plugin.run(self._target)
    # Iterate through the remaining plugins with an IP
    for plugin in self._plugins:
        plugin.run(self._ip)

def initialize(self):
    self.count = 0
    links = self.frame.get_new(OneAdityan1MonishppSkanade1UnProcessedLink)
    if len(links) > 0:
        print "Resuming from the previous state."
        self.download_links(links)
        update_tld_names()
    else:
        l = Adityan1MonishppSkanade1Link("http://www.ics.uci.edu/")
        print l.full_url
        update_tld_names()
        self.frame.add(l)

def sort_domains(image_obj):
    # Bucket image paths by the top-level domain of their source URL.
    update_tld_names()
    image_obj_ret = {}
    for path in image_obj:
        url = image_obj[path]
        try:
            tld = get_tld(url)
            if tld not in image_obj_ret:
                image_obj_ret[tld] = {}
            image_obj_ret[tld][url] = path
        except:
            # Skip URLs whose TLD cannot be extracted.
            pass
    return image_obj_ret

def main():
    args = parser.parse_args()
    update_tld_names()
    if not congifLogger(args.logFile, args.logLevel):
        print '\nPermission denied: %s' % args.logFile
        print 'Please make sure you have the permission to save the log file!\n'
    elif args.testSelf:
        Crawler(args).selfTesting(args)
    else:
        crawler = Crawler(args)
        printProgress = PrintProgress(crawler)
        printProgress.start()
        crawler.start()
        printProgress.printSpendingTime()

def test_1_update_tld_names(self):
    """Test updating the tld names (re-fetch mozilla source)."""
    res = update_tld_names(fail_silently=True)
    self.assertTrue(res)
    return res

def init(self):
    if url_normalize is None:
        raise ValueError("Could not import 'url-normalize'. Please install it.")
    if get_tld is None:
        raise ValueError("Could not import 'tld'. Please install it.")
    update_tld_names()
    self.domain_whitelist = []
    if getattr(self.parameters, "domain_whitelist", '') != '':
        self.domain_whitelist.extend(self.parameters.domain_whitelist.split(','))
    self.substitutions = []
    if getattr(self.parameters, "substitutions", '') != '':
        temp = self.parameters.substitutions.split(';')
        if len(temp) % 2 != 0:
            raise InvalidArgument('substitutions', got=self.parameters.substitutions,
                                  expected="even number of ; separated strings")
        for i in range(int(len(temp) / 2)):
            self.substitutions.append([temp[2 * i], temp[2 * i + 1]])
    self.classification_type = getattr(self.parameters, "classification_type", "unknown")
    if not ClassificationType.is_valid(self.classification_type):
        self.classification_type = 'unknown'

def main():
    args = parser.parse_args()
    update_tld_names()
    # Initialize the unvisited_urls.
    unvisited_url = deque()
    with io.open(args.domain_seeds, 'r+') as fp:
        urlList = fp.readlines()
        for url in urlList:
            if len(unvisited_url) < args.max_domain_seeds:
                unvisited_url.append(url)
    print 'We have got %d domain feeds.' % len(unvisited_url)
    if not config_logger(args.logFile, args.logLevel):
        print '\nPermission denied: %s' % args.logFile
        print 'Please make sure you have the permission to save the log file!\n'
    else:
        crawler = Crawler(args)
        print_progress = PrintProgress(crawler)
        print_progress.start()
        while len(unvisited_url) > 0:
            url = unvisited_url.popleft()
            crawler.crawl(url)
        print_progress.print_spending_time()

from datetime import datetime
from utcdate import UtcDate
from spamhandling import check_if_spam_json
from globalvars import GlobalVars
from datahandling import load_files, filter_auto_ignored_posts
from metasmoke import Metasmoke
from deletionwatcher import DeletionWatcher
import json
import time
import requests
# noinspection PyPackageRequirements
from tld.utils import update_tld_names, TldIOError
from helpers import log

try:
    update_tld_names()
except TldIOError as ioerr:
    with open('errorLogs.txt', 'a') as errlogs:
        if "permission denied:" in str(ioerr).lower():
            if "/usr/local/lib/python2.7/dist-packages/" in str(ioerr):
                errlogs.write("WARNING: Cannot update TLD names, due to `tld` being system-wide installed and not "
                              "user-level installed. Skipping TLD names update. \n")
            if "/home/" in str(ioerr) and ".local/lib/python2.7/site-packages/tld/" in str(ioerr):
                errlogs.write("WARNING: Cannot read/write to user-space `tld` installation, check permissions on the "
                              "path. Skipping TLD names update. \n")

import urllib2, sys, re, csv, json, lxml, lxml.html
from lxml.html.clean import Cleaner
import MySQLdb, sys
from tld import get_tld
from tld.utils import update_tld_names

update_tld_names()
reload(sys)
sys.setdefaultencoding('utf8')


def safeCrawl():
    i = 0
    seed = []
    db = MySQLdb.connect(host='127.0.0.1', db='jcbraunDB', user='******', passwd='3312crystal')
    cursor = db.cursor()
    outLinks = []
    if n == 0:
        execString = ("SELECT URL, Domain FROM safeSeed WHERE crawled=0;")
        cursor.execute(execString)
        seedx = cursor.fetchall()
    else:
        execString = ("SELECT URLTo FROM safeOutboundLinks WHERE lvl=%i;" % (n))
        cursor.execute(execString)
        seedx = cursor.fetchall()
    print seedx
    for row in seedx:
        print ("NEW PAGE")
        i = i + 1
        try:
            url = row[0]

def seed(db):
    # establish cursor, update tld data
    cursor = db.cursor()
    update_tld_names()
    domain = ""
    # insert sites from seed and safeSeed csv files
    with open('seed.csv', 'rb') as csvfile:
        seedReader = csv.reader(csvfile, delimiter=',')
        for link in seedReader:
            link = link[0]
            if get_tld(link, fail_silently=True) != None:
                print "ADDING %s TO SPAM SEED... \n" % link
                domain = get_tld(link, fail_silently=True)
                try:
                    execString = ("INSERT IGNORE INTO seed(Domain, URL, URLSource, Crawled) VALUES ('%s', '%s', 'list', '0');" % (domain, link))
                    cursor.execute(execString)
                    db.commit()
                except:
                    print ("FAILED TO EXECUTE SQL QUERY: %s" % execString)
    with open('safeSeed.csv', 'rb') as csvfile:
        seedReader = csv.reader(csvfile, delimiter=',')
        for link in seedReader:
            link = link[0]
            if get_tld(link, fail_silently=True) != None:
                print "ADDING %s TO SAFE SEED... \n" % link
                domain = get_tld(link, fail_silently=True)
                try:
                    execString = ("INSERT IGNORE INTO safeSeed(Domain, URL, URLSource, Crawled) VALUES ('%s', '%s', 'list', '0');" % (domain, link))
                    cursor.execute(execString)
                    db.commit()
                except:
                    print ("FAILED TO EXECUTE SQL QUERY: %s" % execString)
    try:
        # get the whitelist from the sql server
        execString = ("SELECT Domain FROM WhiteList;")
        cursor.execute(execString)
        wl = list(cursor)
        # use a file user.json in this directory to log into Gmail and pull down spam
        flow = flow_from_clientsecrets('user.json', scope='https://www.googleapis.com/auth/gmail.readonly')
        http = httplib2.Http()
        # load cached OAuth credentials from the gmail.storage file
        storage = Storage('gmail.storage')
        credentials = storage.get()
        if credentials is None or credentials.invalid:
            credentials = run(flow, storage, http=http)
        http = credentials.authorize(http)
        gmail_service = build('gmail', 'v1', http=http)
        spamMsgs = gmail_service.users().messages().list(userId='me', labelIds='SPAM').execute()
        execString = ""
        i = 0
    except:
        print ("Unable to read spam email. You need user.json gmail credentials in this directory.")
    for spam in spamMsgs['messages']:
        i = i + 1
        try:
            print spam
            messageId = (spam['id'])
            message = gmail_service.users().messages().get(id=messageId, userId='me').execute()
            stringe = (message['payload']['body'])
            for part in message['payload']['parts']:
                content = part['body']['data']
                content = base64.urlsafe_b64decode(content.encode('ascii'))
                for url in re.findall('''http["'](.[^"']+)["']''', content):
                    try:
                        domainTo = (url.split("/"))[2]
                        if ((domain + "/") in wl):
                            print ("Whitelisted \n")
                            bad = 0
                        else:
                            bad = 1
                            execString = ("INSERT IGNORE INTO seed (Domain, URL, URLSource, crawled) VALUES ('%s', '%s', 'list', 0);" % (domain, url))
                            cursor.execute(execString)
                    except:
                        print "Failed to add this piece of spam"
                content = db.escape_string(content)
                execString = ("INSERT INTO Content (Lvl, Content, Domain, URL, CopySource) VALUES ('0', '%s', '%i', '%s', 'email');" % (content, i, str(messageId)))
                cursor.execute(execString)
                db.commit()
        except Exception as e:
            print ("Failed to load email: %s" % execString)
            print (type(e))
            print (e.args)
    db.close()

def update_tld():
    """ Update tld """
    update_tld_names()

def main():
    ensure_dependencies()
    update_tld_names()
    start_spider()

def emailSeed(db):
    # establish cursor, update tld data
    cursor = db.cursor()
    update_tld_names()
    domain = ""
    spamMsgs = {}
    try:
        # get the whitelist from the sql server
        execString = ("SELECT Domain FROM WhiteList;")
        cursor.execute(execString)
        wl = list(cursor)
    except:
        print ("Couldn't read whitelist")
    try:
        # use a file user.json in this directory to log into Gmail and pull down spam
        CLIENT_SECRET_FILE = 'user.json'
        OAUTH_SCOPE = 'https://www.googleapis.com/auth/gmail.readonly'
        STORAGE = Storage('gmail.storage')
        flow = flow_from_clientsecrets(CLIENT_SECRET_FILE, scope=OAUTH_SCOPE)
        http = httplib2.Http()
        credentials = STORAGE.get()
        if credentials is None or credentials.invalid:
            credentials = run(flow, STORAGE, http=http)
        http = credentials.authorize(http)
        gmail_service = build('gmail', 'v1', http=http)
        spamMsgs = gmail_service.users().messages().list(userId='me', labelIds='SPAM').execute()
        execString = ""
        i = 0
    except Exception as e:
        print ("Unable to access spam email. You need user.json gmail credentials in this directory.")
        print (type(e))
        print (e.args)
    for spam in spamMsgs['messages']:
        i = i + 1
        try:
            # get messages
            messageId = (spam['id'])
            message = gmail_service.users().messages().get(id=messageId, userId='me').execute()
            stringe = (message['payload']['body'])
            # add each message part to the database
            for part in message['payload']['parts']:
                print part
                content = part['body']['data']
                content = base64.urlsafe_b64decode(content.encode('ascii'))
                for url in re.findall('''http["'](.[^"']+)["']''', content):
                    try:
                        # set bad if in whitelist
                        domainTo = (url.split("/"))[2]
                        if ((domain + "/") in wl):
                            print ("Whitelisted \n")
                            bad = 0
                        else:
                            bad = 1
                            execString = ("INSERT IGNORE INTO seed (Domain, URL, URLSource, crawled) VALUES ('%s', '%s', 'list', 0);" % (domain, url))
                            cursor.execute(execString)
                    except:
                        print "Failed to add this piece of spam"
                content = db.escape_string(content)
                execString = ("INSERT IGNORE INTO Content (Lvl, Content, Domain, URL, CopySource) VALUES ('0', '%s', '%i', '%s', 'email');" % (content, i, str(messageId)))
                cursor.execute(execString)
                db.commit()
        except Exception as e:
            print ("Failed to load email with SQL query: %s" % execString)
            print (type(e))
            print (e.args)
    db.close()

def __init__(self, url=None, update_TLD=False):
    # self.url = url
    if update_TLD:
        update_tld_names()