Example #1
	def run(self):
	
		# sync top level domains with mozilla if the user is root
		if os.geteuid() == 0:
			update_tld_names()
		else:
			print("Not running as root, you are going to need those privs to nmap properly")
			sys.exit(-1)
		
		# try to resolve ip
		if self._isHostname:
			try:
				self._ip = socket.gethostbyname(self._target)
			except socket.gaierror:
				print("== Error resolving IP; check that the hostname resolves:")
				print(sys.exc_info())
				sys.exit(-1)
		else:
			self._ip = self._target
		
		# Iterate through plugins which require a hostname to be passed	
		if self._isHostname:
			for plugin in self._hostnamePlugins:
				plugin.run(self._target)
		
		# Iterate through the remaining plugins with an IP
		for plugin in self._plugins:
			plugin.run(self._ip)
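For reference, socket.gethostbyname raises socket.gaierror on a failed lookup, so the resolution step above can be factored into a small helper. A minimal sketch (the resolve_or_exit name is hypothetical):

import socket
import sys

def resolve_or_exit(target):
    # Return the IP for a hostname; exit with an error message on failure.
    try:
        return socket.gethostbyname(target)
    except socket.gaierror as exc:
        print("== Error resolving IP; check that the hostname resolves: %s" % exc)
        sys.exit(-1)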
Example #2
    def init(self):
        if url_normalize is None:
            raise MissingDependencyError("url-normalize")
        url_version = pkg_resources.get_distribution("url-normalize").version
        if (tuple(int(v) for v in url_version.split('.')) < (1, 4, 1)
                and self.default_scheme is not None):
            raise ValueError(
                "Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                "Get at least version '1.4.1'." % url_version)
        if get_tld is None:
            raise MissingDependencyError("tld")
        try:
            update_tld_names()
        except tld.exceptions.TldIOError:
            self.logger.info("Could not update TLD names cache.")
        if self.domain_whitelist != '':
            self._domain_whitelist.extend(self.domain_whitelist.split(','))
        if self.substitutions != '':
            temp = self.substitutions.split(';')
            if len(temp) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.substitutions,
                    expected="even number of ; separated strings")
            for i in range(int(len(temp) / 2)):
                self._substitutions.append([temp[2 * i], temp[2 * i + 1]])
        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'

        if self.default_scheme is not None:
            self.url_kwargs = {'default_scheme': self.default_scheme}
        else:
            self.url_kwargs = {}
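The tuple comparison above assumes every version component is numeric and would break on a release string like '1.4.1b1'. pkg_resources (already used here) also provides parse_version for robust comparisons; a sketch of the same gate, not the module's actual code:

import pkg_resources

url_version = pkg_resources.get_distribution("url-normalize").version
if pkg_resources.parse_version(url_version) < pkg_resources.parse_version("1.4.1"):
    raise ValueError("'url-normalize' >= 1.4.1 is required for 'default_scheme', got %r." % url_version)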
Example #3
def main():
    # Setup
    update_tld_names()  # force a sync of the TLD names with the latest version from Mozilla
    get_info() # Populate 'data' with objects
    

    subreddits = ['worldnews', 'news']
    # threads = []  (PRAW is not thread-safe)

    reddit = praw.Reddit('SourceInfoBot', user_agent='SourceInfoBot v1.0')
    
    if reddit is None:
        log(ErrType.error, "Unable to connect to Reddit.")
        sys.exit()
    else:
        log(ErrType.info, "Connected to Reddit.")
        print("Connected to Reddit.")

    subs = "+".join(subreddits)

    for submission in reddit.subreddit(subs).stream.submissions():
        comment = proc_submission(submission)
        if comment is None:
            continue
        else:
            submission.reply(comment)
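Since praw.Reddit is a class, the constructor returns an instance or raises; it never returns None, so the check above is dead code. A hedged sketch of catching construction failure instead, reusing the example's log helper and assuming only that errors surface as exceptions:

try:
    reddit = praw.Reddit('SourceInfoBot', user_agent='SourceInfoBot v1.0')
except Exception:
    log(ErrType.error, "Unable to connect to Reddit.")
    sys.exit()
log(ErrType.info, "Connected to Reddit.")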
Example #4
    def init(self):
        if url_normalize is None:
            raise ValueError("Could not import 'url-normalize'. Please install it.")
        url_version = pkg_resources.get_distribution("url-normalize").version
        if tuple(int(v) for v in url_version.split('.')) < (1, 4, 1) and hasattr(self.parameters, 'default_scheme'):
            raise ValueError("Parameter 'default_scheme' given but 'url-normalize' version %r does not support it. "
                             "Get at least version '1.4.1'." % url_version)
        if get_tld is None:
            raise ValueError("Could not import 'tld'. Please install it.")
        try:
            update_tld_names()
        except tld.exceptions.TldIOError:
            self.logger.info("Could not update TLD names cache.")
        self.domain_whitelist = []
        if getattr(self.parameters, "domain_whitelist", '') != '':
            self.domain_whitelist.extend(self.parameters.domain_whitelist.split(','))
        self.substitutions = []
        if getattr(self.parameters, "substitutions", '') != '':
            temp = self.parameters.substitutions.split(';')
            if len(temp) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.parameters.substitutions,
                    expected="even number of ; separeted strings")
            for i in range(int(len(temp) / 2)):
                self.substitutions.append([temp[2 * i], temp[2 * i + 1]])
        self.classification_type = getattr(self.parameters, "classification_type", "unknown")
        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'

        if hasattr(self.parameters, 'default_scheme'):
            self.url_kwargs = {'default_scheme': self.parameters.default_scheme}
        else:
            self.url_kwargs = {}
Example #5
    def run(self):

        # sync top level domains with mozilla if the user is root
        if os.geteuid() == 0:
            update_tld_names()
        else:
            print("Not running as root; you need root privileges for nmap to work properly")
            sys.exit(-1)

        # try to resolve ip
        if self._isHostname:
            try:
                self._ip = socket.gethostbyname(self._target)
            except socket.gaierror:
                print("== Error resolving IP; check that the hostname resolves:")
                print(sys.exc_info())
                sys.exit(-1)
        else:
            self._ip = self._target

        # Iterate through plugins which require a hostname to be passed
        if self._isHostname:
            for plugin in self._hostnamePlugins:
                plugin.run(self._target)

        # Iterate through the remaining plugins with an IP
        for plugin in self._plugins:
            plugin.run(self._ip)
Example #6
    def initialize(self):
        self.count = 0
        links = self.frame.get_new(OneAdityan1MonishppSkanade1UnProcessedLink)
        if len(links) > 0:
            print("Resuming from the previous state.")
            self.download_links(links)
            update_tld_names()
        else:
            link = Adityan1MonishppSkanade1Link("http://www.ics.uci.edu/")
            print(link.full_url)
            update_tld_names()
            self.frame.add(link)
Example #7
def sort_domains(image_obj):
    update_tld_names()
    image_obj_ret = {}
    for path in image_obj:
        url = image_obj[path]
        try:
            domain = get_tld(url)
        except Exception:
            # skip URLs whose TLD cannot be determined
            continue
        if domain not in image_obj_ret:
            image_obj_ret[domain] = {}
        image_obj_ret[domain][url] = path
    return image_obj_ret
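The try/except can be dropped entirely with the library's own fail_silently flag, which makes get_tld return None instead of raising (the same flag appears in Examples 9 and 15); a sketch:

from tld import get_tld
from tld.utils import update_tld_names

def sort_domains(image_obj):
    update_tld_names()
    image_obj_ret = {}
    for path, url in image_obj.items():
        domain = get_tld(url, fail_silently=True)
        if domain is None:
            continue  # URL has no recognizable TLD
        image_obj_ret.setdefault(domain, {})[url] = path
    return image_obj_ret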
Example #8
def main():
    args = parser.parse_args()
    update_tld_names()
    if not congifLogger(args.logFile, args.logLevel):
        print('\nPermission denied: %s' % args.logFile)
        print('Please make sure you have the permission to save the log file!\n')
    elif args.testSelf:
        Crawler(args).selfTesting(args)
    else:
        crawler = Crawler(args)
        printProgress = PrintProgress(crawler)
        printProgress.start()
        crawler.start()
        printProgress.printSpendingTime()
Example #9
    def test_1_update_tld_names(self):
        """
        Test updating the tld names (re-fetch mozilla source).
        """
        res = update_tld_names(fail_silently=True)
        self.assertTrue(res)
        return res
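The fail_silently=True flag used here presumably suppresses TldIOError and signals failure through the return value instead; the assertion above only confirms that the success case returns a truthy result. A minimal non-test sketch under that assumption:

from tld.utils import update_tld_names

if not update_tld_names(fail_silently=True):
    print("TLD names update failed; continuing with the cached list.")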
Example #11
    def init(self):
        if url_normalize is None:
            raise ValueError("Could not import 'url-normalize'. Please install it.")
        if get_tld is None:
            raise ValueError("Could not import 'tld'. Please install it.")
        update_tld_names()
        self.domain_whitelist = []
        if getattr(self.parameters, "domain_whitelist", '') != '':
            self.domain_whitelist.extend(self.parameters.domain_whitelist.split(','))
        self.substitutions = []
        if getattr(self.parameters, "substitutions", '') != '':
            temp = self.parameters.substitutions.split(';')
            if len(temp) % 2 != 0:
                raise InvalidArgument(
                    'substitutions',
                    got=self.parameters.substitutions,
                    expected="even number of ; separated strings")
            for i in range(int(len(temp) / 2)):
                self.substitutions.append([temp[2 * i], temp[2 * i + 1]])
        self.classification_type = getattr(self.parameters, "classification_type", "unknown")
        if not ClassificationType.is_valid(self.classification_type):
            self.classification_type = 'unknown'
Example #12
def main():
    args = parser.parse_args()
    update_tld_names()
    # Initialize the unvisited URLs.
    unvisited_url = deque()
    with io.open(args.domain_seeds, 'r+') as fp:
        urlList = fp.readlines()
        for url in urlList:
            if len(unvisited_url) < args.max_domain_seeds:
                unvisited_url.append(url)
        print('We have got %d domain seeds.' % len(unvisited_url))

    if not config_logger(args.logFile, args.logLevel):
        print('\nPermission denied: %s' % args.logFile)
        print('Please make sure you have the permission to save the log file!\n')
    else:
        crawler = Crawler(args)
        print_progress = PrintProgress(crawler)
        print_progress.start()
        while len(unvisited_url) > 0:
            url = unvisited_url.popleft()
            crawler.crawl(url)
        print_progress.print_spending_time()
Example #13
from datetime import datetime
from utcdate import UtcDate
from spamhandling import check_if_spam_json
from globalvars import GlobalVars
from datahandling import load_files, filter_auto_ignored_posts
from metasmoke import Metasmoke
from deletionwatcher import DeletionWatcher
import json
import time
import requests
# noinspection PyPackageRequirements
from tld.utils import update_tld_names, TldIOError
from helpers import log

try:
    update_tld_names()
except TldIOError as ioerr:
    with open('errorLogs.txt', 'a') as errlogs:
        if "permission denied:" in str(ioerr).lower():
            if "/usr/local/lib/python2.7/dist-packages/" in str(ioerr):
                errlogs.write(
                    "WARNING: Cannot update TLD names, due to `tld` being system-wide installed and not "
                    "user-level installed.  Skipping TLD names update. \n")

            if "/home/" in str(
                    ioerr
            ) and ".local/lib/python2.7/site-packages/tld/" in str(ioerr):
                errlogs.write(
                    "WARNING: Cannot read/write to user-space `tld` installation, check permissions on the "
                    "path.  Skipping TLD names update. \n")
Example #14
import urllib2, sys, re, csv, json, lxml, lxml.html
from lxml.html.clean import Cleaner
import MySQLdb, sys
from tld import get_tld
from tld.utils import update_tld_names
update_tld_names()
reload(sys)
sys.setdefaultencoding('utf8')

def safeCrawl():
	i=0
	seed = []
	db = MySQLdb.connect(host='127.0.0.1',db='jcbraunDB',user='******',passwd='3312crystal')
	cursor = db.cursor()
	outLinks = []
	if n == 0:
		execString = ("SELECT URL, Domain FROM safeSeed WHERE crawled=0;") 
		cursor.execute(execString)
		seedx = cursor.fetchall()
		
	else:
		execString = ("SELECT URLTo FROM safeOutboundLinks WHERE lvl=%i;" % (n)) 
		cursor.execute(execString)
		seedx = cursor.fetchall()
		print(seedx)

	for row in seedx:
		print ("NEW PAGE")
		i = i+1
		try:
			url = row[0]
Example #15
def seed(db):
	#establish cursor, update tld data
	cursor = db.cursor()
	update_tld_names()
	domain = ""
	#insert sites from seed and safeSeed csv files
	with open('seed.csv', 'rb') as csvfile:
		seedReader = csv.reader(csvfile, delimiter=',')
		for link in seedReader:
			link = link[0]
			if get_tld(link, fail_silently=True) is not None:
				print("ADDING %s TO SPAM SEED... \n" % link)
				domain = get_tld(link, fail_silently=True)
			try:
				execString = ("INSERT IGNORE INTO seed(Domain, URL, URLSource, Crawled) VALUES ('%s', '%s', 'list', '0');" %(domain, link)) 
				cursor.execute(execString)
				db.commit()
			except:
				print ("FAILED TO EXECUTE SQL QUERY: %s" %execString)
				
	with open('safeSeed.csv', 'rb') as csvfile:
		seedReader = csv.reader(csvfile, delimiter=',')
		for link in seedReader:
			link = link[0]
			if get_tld(link, fail_silently=True) is not None:
				print("ADDING %s TO SAFE SEED... \n" % link)
				domain = get_tld(link, fail_silently=True)
			try:
				execString = ("INSERT IGNORE INTO safeSeed(Domain, URL, URLSource, Crawled) VALUES ('%s', '%s', 'list', '0');" %(domain, link)) 
				cursor.execute(execString)
				db.commit()
			except:
				print ("FAILED TO EXECUTE SQL QUERY: %s" %execString)			
		
	try:	
		#get the whitelist from the sql server
		execString = ("SELECT Domain FROM WhiteList;") 
		cursor.execute(execString)
		wl = list(cursor)
		
		#use a file user.json in this directory to log into Gmail and pull down spam
		flow = flow_from_clientsecrets('user.json', scope='https://www.googleapis.com/auth/gmail.readonly')
		http = httplib2.Http()
		STORAGE = Storage('gmail.storage')
		credentials = STORAGE.get()
		if credentials is None or credentials.invalid:
			credentials = run(flow, STORAGE, http=http)
		http = credentials.authorize(http)
		gmail_service = build('gmail', 'v1', http=http)
		spamMsgs = gmail_service.users().messages().list(userId='me', labelIds='SPAM').execute()
		execString = "" 
		i=0
		
	except:
		print("Unable to read spam email. You need user.json gmail credentials in this directory.")

	for spam in spamMsgs.get('messages', []):
		i = i+1
		try:
			print(spam)
			messageId = spam['id']
			message = gmail_service.users().messages().get(id=messageId, userId='me').execute()
			stringe = message['payload']['body']
			for part in message['payload']['parts']:
				content = part['body']['data']
				content = base64.urlsafe_b64decode(content.encode('ascii'))
				for url in re.findall('''http["'](.[^"']+)["']''', content):
					try:
						domainTo = (url.split("/"))[2]
						if (domain + "/") in wl:
							print("Whitelisted \n")
							bad = 0
						else:
							bad = 1
						execString = ("INSERT IGNORE INTO seed (Domain, URL, URLSource, crawled) VALUES ('%s', '%s', 'list', 0);" % (domain, url))
						cursor.execute(execString)
					except:
						print("Failed to add this piece of spam")
				content = db.escape_string(content)
				execString = ("INSERT INTO Content (Lvl, Content, Domain, URL, CopySource) VALUES ('0', '%s', '%i', '%s', 'email');" % (content, i, str(messageId)))
				cursor.execute(execString)
				db.commit()
		except Exception as e:
			print("Failed to load email: %s" % execString)
			print(type(e))
			print(e.args)
		
	db.close()
Example #16
def update_tld():
    """
    Update the TLD names cache (re-fetch the Mozilla source list).
    """
    update_tld_names()
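For context, a typical call site pairs the cache update with a lookup, using only calls that appear elsewhere in these examples; the expected output is an assumption based on the library's documented behavior:

from tld import get_tld
from tld.utils import update_tld_names

update_tld_names()                          # re-fetch the Mozilla suffix list
print(get_tld("http://www.example.co.uk"))  # expected: 'co.uk'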
Example #17
def main():
    ensure_dependencies()
    update_tld_names()
    start_spider()
Example #19
def emailSeed(db):
	#establish cursor, update tld data
	cursor = db.cursor()
	update_tld_names()
	domain = ""
	spamMsgs={}

	try:	
		#get the whitelist from the sql server
		execString = ("SELECT Domain FROM WhiteList;") 
		cursor.execute(execString)
		wl = list(cursor)
	except:
		print ("Couldn't read whitelist")
		
	try:
		#use a file user.json in this directory to log into Gmail and pull down spam
		CLIENT_SECRET_FILE = 'user.json'
		OAUTH_SCOPE = 'https://www.googleapis.com/auth/gmail.readonly'
		STORAGE = Storage('gmail.storage')
		flow = flow_from_clientsecrets(CLIENT_SECRET_FILE, scope=OAUTH_SCOPE)
		http = httplib2.Http()
		credentials = STORAGE.get()
		if credentials is None or credentials.invalid:
			credentials = run(flow, STORAGE, http=http)
		http = credentials.authorize(http)
		gmail_service = build('gmail', 'v1', http=http)
		spamMsgs = gmail_service.users().messages().list(userId='me', labelIds='SPAM').execute()
		execString = ""
		i=0
		
	except Exception as e: 
		print ("Unable to access spam email. You need user.json gmail credentials in this directory.")
		print (type(e))
		print (e.args)
		
	for spam in spamMsgs.get('messages', []):
		i = i+1
		try:
			#get messages
			messageId = spam['id']
			message = gmail_service.users().messages().get(id=messageId, userId='me').execute()
			stringe = message['payload']['body']
				
			#add each message part to the database
			for part in message['payload']['parts']:
				print(part)
				content = part['body']['data']
				content = base64.urlsafe_b64decode(content.encode('ascii'))
				for url in re.findall('''http["'](.[^"']+)["']''', content):
					try:
						#set bad if in whitelist
						domainTo = (url.split("/"))[2]
						if (domain + "/") in wl:
							print("Whitelisted \n")
							bad = 0
						else:
							bad = 1
						execString = ("INSERT IGNORE INTO seed (Domain, URL, URLSource, crawled) VALUES ('%s', '%s', 'list', 0);" % (domain, url))
						cursor.execute(execString)
					except:
						print("Failed to add this piece of spam")
				content = db.escape_string(content)
				execString = ("INSERT IGNORE INTO Content (Lvl, Content, Domain, URL, CopySource) VALUES ('0', '%s', '%i', '%s', 'email');" % (content, i, str(messageId))) 
				cursor.execute(execString)
				db.commit()
		except Exception as e:
			print ("Failed to load email with SQL query: %s" %execString)	
			print (type(e))
			print (e.args)
	
	db.close()
Example #20
    def __init__(self, url=None, update_TLD=False):
        # self.url = url
        if update_TLD:
            update_tld_names()
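A hedged usage sketch for this constructor; the enclosing class name UrlInfo is hypothetical, and only the url and update_TLD parameters come from the example:

info = UrlInfo(url="http://example.com", update_TLD=True)  # refreshes the TLD name cache once at construction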