def get_crawlers(configuration, section):
    """
    parse the config section for crawlers
    * recognizes only known, implemented crawlers (matched by class name)
    * reads the config leniently, giving users more freedom in how they write it
    :param configuration: RawConfigParser
    :param section: string
    :return: list
    """
    crawlers = []

    for crawler_class in Crawler.__subclasses__():
        crawler_class_name = crawler_class.__name__

        if not configuration.has_option(section, crawler_class_name):
            continue  # skip crawler if not configured

        crawler_config = configuration.get(section, crawler_class_name)
        if not crawler_config or crawler_config.lower() == "false":
            continue  # skip crawler if not configured or disabled

        crawler_uris = []

        # mimic old behaviours for bool values
        if crawler_config.lower() == "true":
            if crawler_class == Pr0gramm:
                crawler_config = "static"
            elif crawler_class == SoupIO:
                crawler_config = "everyone"

        crawler_sites = [url_quote_plus(site_stripped) for site_stripped in
                         [site.strip() for site in crawler_config.split(",")]  # trim sites
                         if site_stripped]  # filter stripped list for valid values

        if not crawler_sites:
            continue  # skip crawler if no valid sites configured

        logger.info("found configured Crawler: %s = %s" % (crawler_class_name, repr(crawler_sites)))

        if crawler_class == Reddit:
            crawler_uris = ["http://www.reddit.com/r/%s" % site for site in crawler_sites]
        elif crawler_class == NineGag:
            crawler_uris = ["http://9gag.com/%s" % site for site in crawler_sites]
        elif crawler_class == Pr0gramm:
            crawler_uris = ["http://pr0gramm.com/static/%s" % site for site in crawler_sites]
        elif crawler_class == SoupIO:
            crawler_uris = [("http://www.soup.io/%s" if site in ["everyone"]  # public site
                             else "http://%s.soup.io") % site  # user site
                            for site in crawler_sites]
        elif crawler_class == Instagram:
            crawler_uris = ["http://instagram.com/%s" % site for site in crawler_sites]
        elif crawler_class == Fourchan:
            crawler_uris = ["http://boards.4chan.org/%s/" % site for site in crawler_sites]
        elif crawler_class == Giphy:
            crawler_uris = ["http://api.giphy.com/v1/gifs/search?q=%s" % site for site in crawler_sites]
        elif crawler_class == Bildschirmarbeiter:
            crawler_uris = ["http://www.bildschirmarbeiter.com/plugs/category/%s/P120/" % site
                            for site in crawler_sites]

        crawlers += [crawler_class(crawler_uri) for crawler_uri in crawler_uris]

    return crawlers
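# A minimal usage sketch for get_crawlers(), assuming the crawler classes above are
# importable in this module; the section name and option values here are illustrative,
# not taken from the project's shipped config.
from configparser import RawConfigParser

config = RawConfigParser()
config.read_string("""
[crawlers]
Reddit = pics, earthporn
SoupIO = true
NineGag = false
""")

crawlers = get_crawlers(config, "crawlers")
# -> Reddit crawlers for /r/pics and /r/earthporn, plus the SoupIO "everyone" feed;
#    NineGag is skipped because it is explicitly disabled.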
def generate_files():
    maltego_folder = sys.argv[2]

    import os
    import inspect
    from lxml import etree as ET
    from crawler import Crawler

    def change_jar(file_name_jar, file_name_union):
        # todo: this keeps the already existing files, changing only the ones present in
        # file_name_union - should this behaviour be kept?
        from subprocess import Popen, PIPE

        p = Popen(['jar', 'uf', file_name_jar, file_name_union], stdout=PIPE, cwd=maltego_folder)
        p.stdout.read()
        assert p.wait() == 0

        # delete file_name_union, since it is no longer needed
        import shutil
        shutil.rmtree(os.path.join(maltego_folder, file_name_union))

    ###
    # read xml config
    maltego_config = ET.parse(folder_this_file + '/config.xml').getroot()

    entity_icon_dict = {i.attrib['name']: i.text
                        for i in maltego_config.find('entity_icons').findall('entity')}

    ###
    # entities
    entities_names = []

    dir_save_files = maltego_folder + '/com/paterva/maltego/entities/common/'
    os.makedirs(dir_save_files)
    if not os.path.exists(dir_save_files + 'spyck'):
        os.makedirs(dir_save_files + 'spyck')

    me = lib_files.MaltegoEntity(dir_save_files)

    # arbitrary entities
    me.new_entity_info_from_entity('info', 'Phrase')
    me.new_entity_info_from_entity('info_list', 'OsiModelGolden')

    # crawler populators
    crawler_populator = []
    for current_crawler in Crawler.__subclasses__():
        # getfullargspec replaces the deprecated getargspec; .args is unchanged
        harvest_args = inspect.getfullargspec(current_crawler.harvest_debug).args
        if len(harvest_args) == 1:
            continue

        # skip crawlers that require an entity_* argument
        if any(i[:7] == 'entity_' for i in harvest_args):
            continue

        # todo: this needs to be generalized! some crawler populator may take more than
        # a single crawler_param
        me.new_entity_crawler_populator(current_crawler.name(), 'Objects', harvest_args[1])

        crawler_populator.append(current_crawler)

    # entities
    for current_xml in os.listdir(folder_spyck + '/entities/'):
        crawler_root = ET.parse(folder_spyck + '/entities/' + current_xml).getroot()
        current_xml = current_xml[:-4]

        entities_names.append(current_xml)

        column_name, column_type = (crawler_root.find('column').find('name').text,
                                    crawler_root.find('column').find('type').text)

        me.new_entity_entity(current_xml, entity_icon_dict[current_xml], column_name, column_type)

    # layer
    me.save_layer()

    # put the files into the Maltego .jar
    change_jar('com-paterva-maltego-entities-common.jar', 'com')

    ###
    # crawlers
    dir_save_files = maltego_folder + '/com/paterva/maltego/transforms/standard/'
    os.makedirs(dir_save_files)
    os.makedirs(dir_save_files + '/local')

    mt = lib_files.MaltegoTransform('/usr/bin/python3', folder_spyck, dir_save_files)

    # arbitrary get_info_all crawler
    mt.new_transform('get_info_all', list(entities_names), 'get_info_all')
    mt.new_transform('unpack_list', ['info_list'], 'unpack_list')

    # actual crawlers
    for current_crawler in Crawler.__subclasses__():
        if current_crawler in crawler_populator:
            continue

        mt.new_transform(current_crawler.name(),
                         [i[10:] for i in inspect.getfullargspec(current_crawler.harvest_debug).args
                          if i[:7] == 'entity_'],
                         'execute_crawler {}'.format(current_crawler.name()))

    # crawler populator
    for current_crawler in crawler_populator:
        mt.new_transform(current_crawler.name(), [current_crawler.name()],
                         'execute_crawler {}'.format(current_crawler.name()))

    # layer
    mt.save_layer()

    # put the files into the Maltego .jar
    change_jar('com-paterva-maltego-transforms-standard.jar', 'com')
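# For reference, change_jar() shells out to the JDK's `jar` tool; `jar uf <jarfile> <path>`
# updates (adds or replaces) the entries under <path> inside an existing jar, resolved
# relative to the working directory. A minimal standalone sketch of the same step, using
# subprocess.run instead of Popen (the folder path is a hypothetical example):
import subprocess

subprocess.run(['jar', 'uf', 'com-paterva-maltego-entities-common.jar', 'com'],
               cwd='/path/to/maltego/folder',  # hypothetical Maltego install folder
               check=True)  # raises CalledProcessError on a non-zero exit code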
def get_crawlers(configuration, section):
    """
    parse the config section for crawlers
    * recognizes only known, implemented crawlers (matched by class name)
    * reads the config leniently, giving users more freedom in how they write it
    :param configuration: RawConfigParser
    :param section: string
    :return: (crawlers, factors)
    """
    crawlers = {}
    factors = {}

    for crawler_class in Crawler.__subclasses__():
        crawler_class_name = crawler_class.__name__

        if not configuration.has_option(section, crawler_class_name):
            continue  # skip crawler if not configured

        crawler_config = configuration.get(section, crawler_class_name)
        if not crawler_config or crawler_config.lower() == "false":
            continue  # skip crawler if not configured or disabled

        crawler_uris = {}

        # mimic old behaviours for bool values
        if crawler_config.lower() == "true":
            if crawler_class == SoupIO:
                crawler_config = "everyone"

        crawler_sites_and_factors = [site_stripped for site_stripped in
                                     [site.strip() for site in crawler_config.split(",")]  # trim sites
                                     if site_stripped]  # filter stripped list for valid values

        if not crawler_sites_and_factors:
            continue  # skip crawler if no valid sites configured

        crawler_sites = []
        factors[crawler_class_name] = {}

        # separate site and factor
        for factor_pair in crawler_sites_and_factors:
            if factor_separator not in factor_pair:
                # no factor configured
                crawler_sites.append(url_quote_plus(factor_pair))
                continue

            factor_pair_parts = [part.strip() for part in factor_pair.split(factor_separator)]
            if not factor_pair_parts or not len(factor_pair_parts) == 2:
                continue

            site = url_quote_plus(factor_pair_parts[0])
            crawler_sites.append(site)

            try:
                factor = float(factor_pair_parts[1])
            except ValueError:
                continue  # ignore a non-numeric factor, but keep the site

            if site not in factors[crawler_class_name] and 0 < factor <= 10:
                factors[crawler_class_name][site] = factor

        logger.info("found configured Crawler: %s = %s Factors: %s" % (
            crawler_class_name, repr(crawler_sites), repr(factors[crawler_class_name])))

        if crawler_class == Reddit:
            crawler_uris = {site: "https://www.reddit.com/r/%s" % site for site in crawler_sites}
        elif crawler_class == NineGag:
            crawler_uris = {site: "https://9gag.com/%s" % site for site in crawler_sites}
        elif crawler_class == Pr0gramm:
            crawler_uris = {crawler_sites[0]: "https://pr0gramm.com/api/items/get"}
        elif crawler_class == SoupIO:
            crawler_uris = {site: ("http://www.soup.io/%s" if site in ["everyone"]  # public site
                                   else "http://%s.soup.io") % site  # user site
                            for site in crawler_sites}
        elif crawler_class == Instagram:
            crawler_uris = {site: "https://instagram.com/%s" % site for site in crawler_sites}
        elif crawler_class == Fourchan:
            crawler_uris = {site: "https://boards.4chan.org/%s/" % site for site in crawler_sites}
        elif crawler_class == Giphy:
            crawler_uris = {site: "https://api.giphy.com/v1/gifs/search?q=%s" % site for site in crawler_sites}

        if crawler_class_name not in crawlers:
            crawlers[crawler_class_name] = {}
        crawlers[crawler_class_name] = {site: crawler_class(crawler_uris[site], site)
                                        for site in crawler_uris}

    return crawlers, factors
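# A minimal sketch of the factor syntax this version parses. factor_separator is a
# module-level constant whose actual value is project configuration; "*" below is only
# an illustration. A site may optionally carry a weighting factor in (0, 10]:
#
#   [crawlers]
#   Reddit = pics*2, earthporn*0.5, aww
#
# With factor_separator == "*", that yields
#   crawlers["Reddit"] -> {"pics": <Reddit>, "earthporn": <Reddit>, "aww": <Reddit>}
#   factors["Reddit"]  -> {"pics": 2.0, "earthporn": 0.5}   # "aww" gets no factor entry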
def __init__(self, trigger=True):
    import os
    path_spyck = os.path.dirname(__file__)

    self.con = sqlite3.connect(path_spyck + '/mydatabase.db', check_same_thread=False)
    self.c = self.con.cursor()

    ###
    # create/update the database

    # table main_trigger: used to store trigger configuration and scheduling data
    self.execute('CREATE TABLE IF NOT EXISTS main_trigger('
                 'crawler TEXT,'
                 'infos TEXT'
                 ');')

    # get the crawlers ready for use and update the main_trigger table
    Crawler.db = self
    for cls in Crawler.__subclasses__():
        setattr(self, 'crawler_' + cls.name(), cls())

        if cls.trigger.__code__ != Crawler.trigger.__code__:
            if len(self.execute("SELECT * FROM main_trigger WHERE crawler=?", (cls.name(),)).fetchall()) == 0:
                self.execute('INSERT INTO main_trigger (crawler) VALUES (?)', (cls.name(),))

    # create the entity tables based on the XML files
    import xml.etree.ElementTree as ET
    for current_xml in os.listdir(path_spyck + '/entities/'):
        # parse relative to path_spyck, matching the listdir above
        xml_root = ET.parse(path_spyck + '/entities/' + current_xml).getroot()

        columns = [(column.find('name').text, column.find('type').text)
                   for column in xml_root.findall('column')]

        entity_name = current_xml[:-4]

        self.execute(
            'CREATE TABLE IF NOT EXISTS {}('
            'id INTEGER PRIMARY KEY AUTOINCREMENT,'
            '{}'
            ');'.format('entity_' + entity_name, ','.join([i[0] + ' ' + i[1] for i in columns]))
        )

        self.execute(
            'CREATE TABLE IF NOT EXISTS {}('
            'id INTEGER,'
            'FOREIGN KEY(id) REFERENCES {}(id)'
            ');'.format('entity_' + entity_name + '_crawler', 'entity_' + entity_name)
        )

    # update the entity_##name_crawler tables according to the crawlers that require a given entity
    entity_list = [i[0] for i in self.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()
                   if i[0][:6] == 'entity' and i[0][-7:] != 'crawler']
    crawlers_names = [i.name() for i in Crawler.__subclasses__()]

    for i in entity_list:
        for i2 in [i3[0] for i3 in
                   self.execute("SELECT name FROM sqlite_master WHERE sql LIKE '%{}_id INTEGER%'".format(i)).fetchall()]:
            if i2 in crawlers_names:
                try:
                    self.execute('ALTER TABLE {} ADD COLUMN {} INTEGER DEFAULT 0;'.format(i + '_crawler', i2))
                except sqlite3.OperationalError:
                    # column already exists
                    pass

    # table main_arbitrary: allows setting arbitrary values
    self.execute('CREATE TABLE IF NOT EXISTS main_arbitrary('
                 'entity_id INTEGER,'
                 'entity_name TEXT,'
                 'column_name TEXT,'
                 'column_value TEXT,'
                 'column_set_integer INTEGER DEFAULT 0'
                 ');')

    # run the triggerable crawlers, if configured to do so
    if trigger:
        start_triggers()

    # save the changes to the database
    self.commit()
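# A minimal sketch of the entity XML layout that __init__ above expects under
# entities/ - one <column> element per table column, each with <name> and <type>
# children; the file name, minus its ".xml" suffix, becomes the entity name.
# The "person" entity and its columns are hypothetical examples:
import xml.etree.ElementTree as ET

person_xml = """
<entity>
    <column><name>name</name><type>TEXT</type></column>
    <column><name>age</name><type>INTEGER</type></column>
</entity>
"""

xml_root = ET.fromstring(person_xml)
columns = [(column.find('name').text, column.find('type').text)
           for column in xml_root.findall('column')]
# -> [('name', 'TEXT'), ('age', 'INTEGER')], i.e. the column list used to build
#    the "entity_person" table.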
def get_entity_row_info(self, entity_id, entity_name, get_tables_secondary=True):
    crawler_list_success = self.crawler_list_success(entity_id, entity_name)
    crawler_list_success_cls = [i for i in Crawler.__subclasses__() if i.name() in crawler_list_success]
    crawler_list_success_cls = [i for i in crawler_list_success_cls
                                if 'entity_' + entity_name in inspect.getfullargspec(i.harvest_debug).args]

    ###
    # collect info from the entity table and from the crawlers' main tables
    fieldnames = self.select_column_and_value(
        'SELECT * FROM entity_{} '.format(entity_name) +
        ' '.join(['INNER JOIN {} ON {}.entity_{}_id == {}'.format(i.name(), i.name(), entity_name, entity_id)
                  for i in crawler_list_success_cls]) +
        ' WHERE entity_{}.id == {}'.format(entity_name, entity_id),
        discard=['id', 'entity_{}_id'.format(entity_name)]
    )

    ###
    # collect info from the secondary tables of the crawlers that succeeded
    if get_tables_secondary:
        def add_referenced_value(origin, to_add):
            if current_rule['table'] not in origin:
                origin[current_rule['table']] = []

            origin[current_rule['table']].append(to_add)

        def get_deep_fieldnames():
            # list the items that act as references, according to current_rule
            deep = fieldnames[cls.name() + '_' + current_rule['reference'][0]]

            for deeping in current_rule['reference'][1:]:
                deep = [t[deeping] for t in deep if deeping in t]
                deep = [tt for t in deep for tt in t]

            return deep

        for cls in crawler_list_success_cls:
            # walk the list of rules for reading the secondary tables
            for current_rule in cls.read_my_secondary_tables():
                current_table_name = current_rule['table']
                current_table_name_full = cls.name() + '_' + current_table_name

                # collect the table's info
                infos = self.select_column_and_value_many(
                    'SELECT * FROM {} WHERE {}.entity_{}_id == {}'.format(
                        current_table_name_full, current_table_name_full, entity_name, entity_id))

                if 'reference' not in current_rule:
                    # if the table is not referenced, add its data to the root of fieldnames
                    fieldnames[current_table_name_full] = infos
                else:
                    # if the table is referenced, its values must be added under their respective reference
                    for a in get_deep_fieldnames():
                        for b in infos:
                            if a['reference'] == b['reference_' + current_rule['reference'][-1]]:
                                add_referenced_value(a, b)

    ###
    # call the macro_at_data method of the crawlers that succeeded
    for cls in crawler_list_success_cls:
        for i in cls.macro_at_data():
            fieldnames[i['column_name']] = i['how'](fieldnames)

    ###
    # list the entities that reference me
    # todo: this still needs to be done
    # for that, I may need to create a table to do this job
    # whenever a row references an entity, it will have to write to that table
    # it will have the columns "id", "name of the table where it was referenced"
    # and "name of the column where the entity id was referenced"

    ###
    # collect info from the main_arbitrary table
    def get_value_typed(j):
        if j['column_set_integer'] and j['column_value'] is not None:
            return int(j['column_value'])
        else:
            return j['column_value']

    fieldnames.update({i['column_name']: get_value_typed(i)
                       for i in self.select_column_and_value_many(
                           'SELECT * FROM main_arbitrary WHERE entity_id=? and entity_name=?',
                           (entity_id, entity_name))})

    ###
    # delete the now-unneeded values from fieldnames, such as reference
    # todo

    return fieldnames
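# A minimal sketch of the rule shape that read_my_secondary_tables() appears to yield
# for get_entity_row_info() above, inferred from the keys the code reads; the table
# names are hypothetical. A rule without 'reference' lands at the root of fieldnames;
# a rule with a 'reference' chain is nested under the matching rows of earlier tables.
example_rules = [
    {'table': 'posts'},                            # stored as fieldnames['<crawler>_posts']
    {'table': 'comments', 'reference': ['posts']}  # each comment is appended under the post
                                                   # whose 'reference' equals the comment's
                                                   # 'reference_posts' value
]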