def run_command():
    # Create the argument parser and add the command option.
    parser = argparse.ArgumentParser()
    parser.add_argument("command", help="the command to run")

    # Read the requested command and the configuration.
    command = parser.parse_args().command
    conf = common.config()

    # Look up the command in the configuration.
    command_obj = None
    for comm in conf['commands']:
        if comm == command:
            command_obj = conf['commands'][comm]
            break

    if not command_obj:
        # The command was not found, so raise an error.
        raise CommandNotFound()
    else:
        # Load the command's module from its source file and run it.
        filename = command_obj['filename']
        with open(filename) as source:
            mod = imp.load_module(command, source, filename, ('', 'r', imp.PY_SOURCE))
        mod.main()
def __init__(self, config=common.config()):
    self.config = config
    self.train_lable = None
    self.train_data = None
    self.test_label = None
    self.test_data = None
    self.model = None
    self.cluster_info_dict = {}
    self.predict_result_dict = {}
def __init__(self, host, port, headers=None, use_ssl=True, use_urllib=False):
    self.host = host
    self.port = port
    # Create a fresh dict per instance instead of sharing a mutable default.
    self.headers = headers if headers is not None else {}
    self.use_ssl = use_ssl
    self.use_urllib = use_urllib
    self.log = logging.getLogger(__name__)
    FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
    logging.basicConfig(filename=config('DEBUG_LOGFILE'),
                        level=logging.DEBUG,
                        format=FORMAT)
    try:
        # Check that the link is well formed before visiting it.
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError) as e:
        # If an error occurs, invalidate the article.
        logger.warning('Error while fetching the article', exc_info=False)

    # If the article has no body, it is also invalid.
    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


# Check the links and resolve them against the host with regular expressions.
def _build_link(host, link):
    if is_well_formed_link.match(link):
        # The link looks like https://example.com/hello
        return link
    elif is_root_path.match(link):
        # The link looks like /some-text
        return '{}{}'.format(host, link)
    else:
        return '{}/{}'.format(host, link)


if __name__ == '__main__':
    # List the configured news site names and scrape each one.
    news_site_choices = list(config('config.yaml')['news_sites'].keys())
    for choice in news_site_choices:
        _news_scraper(choice)
def __init__(self, news_site_uid, url):
    self._config = config()['news_sites'][news_site_uid]
    self._queries = self._config['queries']
    self._html = None
    self._url = url
    self._visit(url)
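# The `_visit` method called above is not shown in this snippet. A minimal,
# self-contained sketch of such a helper, assuming the page is fetched with
# `requests` and parsed with `lxml.html` (the pattern the other snippets
# follow); the class name is a placeholder and the behaviour is an assumption,
# not the original implementation:
import lxml.html as html
import requests


class _VisitSketch:
    def _visit(self, url):
        # Fetch the page and raise on HTTP errors so callers notice failures.
        response = requests.get(url)
        response.raise_for_status()
        # Keep the parsed document so XPath queries can be run against it.
        self._html = html.fromstring(response.text)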
#!/usr/bin/env python
import yara
import re
import sys
import json
import os
import argparse
import time

import common
from check_base64 import extract_base64_strings
from checkcryptonote import is_valid_wallet

# load config file
config = common.config()

# load needed params
samples_dir = config['samples_dir']
rules_dir = config['rules_dir']


def MoneroWallet(sha256, *base64list):
    # base64list = extract_base64_strings(sha256)
    regex_monero = r"(4[0-9AB][0-9a-zA-Z]{93,104})"
    sample_path = samples_dir + sha256
    Monero_rule = yara.compile(filepath=rules_dir + './monerowallet.yara')
    matches = Monero_rule.match(sample_path)
    if matches != []:
        filtered_matches = []
        for match in matches[0].strings:
            wallet_addr = re.search(regex_monero, str(match[2]))
            if not wallet_addr.group(0).islower() and not wallet_addr.group(
import datetime
import logging
import os

import lxml.html as html
import requests
import yaml

# Utilities
from common import config

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HOME_URL = str(config()["larepublica"]["url"])
XPATH_LINK_TO_ARTICLE = str(config()["larepublica"]["links"])
XPATH_TITLE = str(config()["larepublica"]["titulo"])
XPATH_SUMMARY = str(config()["larepublica"]["resumen"])
XPATH_BODY = str(config()["larepublica"]["cuerpo"])


def error_handler():
    """ docstring """
    pass


def parse_notice(link, today):
    """ docstring """
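# The `config()` helper imported from `common` above is used throughout these
# snippets but its source is not included. A minimal sketch, assuming it loads
# and caches a config.yaml file next to the code; this covers only the
# no-argument form used here (other snippets pass a filename or key to
# `config()`, which this sketch does not handle). The details are an
# assumption, not the original implementation:
import yaml

_cached_config = None


def config():
    """Load config.yaml once and return the parsed dictionary."""
    global _cached_config
    if _cached_config is None:
        with open('config.yaml', mode='r') as f:
            _cached_config = yaml.safe_load(f)
    return _cached_config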
def start_analysis(self, options, monitor):
    """Start the analysis by uploading all required files.
    @param options: the task options
    @param monitor: identifier of the monitor to be used.
    """
    log.info("Starting analysis on guest (id=%s, ip=%s)",
             self.machine.label, self.machine.addr)

    self.options = options
    self.timeout = options["timeout"] + config("cuckoo:timeouts:critical")

    # Wait for the agent to come alive.
    self.wait_available()

    # Could be beautified a bit, but basically we have to perform the
    # same check here as we did in wait_available().
    if db.guest_get_status(self.task_id) != "starting":
        return

    # Check whether this is the new Agent or the old one (by looking at
    # the status code of the index page).
    r = self.get("/", do_raise=False)
    if r.status_code != 200:
        log.critical(
            "While trying to determine the Agent version that your VM is "
            "running we retrieved an unexpected HTTP status code: %s. If "
            "this is a false positive, please report this issue to the "
            "Cuckoo Developers. HTTP response headers: %s",
            r.status_code, json.dumps(dict(r.headers)),
        )
        db.guest_set_status(self.task_id, "failed")
        return

    try:
        status = r.json()
        version = status.get("version")
        features = status.get("features", [])
    except:
        log.critical(
            "We were unable to detect either the Old or New Agent in the "
            "Guest VM, are you sure you have set it up correctly? Please "
            "go through the documentation once more and otherwise inform "
            "the Cuckoo Developers of your issue.")
        db.guest_set_status(self.task_id, "failed")
        return

    log.info("Guest is running Cuckoo Agent %s (id=%s, ip=%s)",
             version, self.machine.label, self.machine.addr)

    # Pin the Agent to our IP address so that it is not accessible by
    # other Virtual Machines etc.
    if "pinning" in features:
        self.get("/pinning")

    # Obtain the environment variables.
    self.query_environ()

    # Upload the analyzer.
    self.upload_analyzer(monitor)

    # Pass along the analysis.conf file.
    self.add_config(options)

    # Allow Auxiliary modules to prepare the Guest.
    self.aux.callback("prepare_guest")

    # If the target is a file, upload it to the guest.
    if options["category"] == "file" or options["category"] == "archive":
        data = {
            "filepath": os.path.join(
                self.determine_temp_path(), options["file_name"]),
        }
        files = {
            "file": ("sample.bin", open(options["target"], "rb")),
        }
        self.post("/store", files=files, data=data)

    if "execpy" in features:
        data = {
            "filepath": "%s/analyzer.py" % self.analyzer_path,
            "async": "yes",
            "cwd": self.analyzer_path,
        }
        self.post("/execpy", data=data)
    else:
        # Execute the analyzer that we just uploaded.
        data = {
            "command": "C:\\Python27\\pythonw.exe %s\\analyzer.py" % self.analyzer_path,
            "async": "yes",
            "cwd": self.analyzer_path,
        }
        self.post("/execute", data=data)
def __init__(self, config=common.config()):
    self.config = config
def articles_and_categories_extraction(host, article_url, iterator):
    '''
    Extract the article at `article_url` and return its content as a
    dictionary, together with the article's category.
    '''
    # XPath queries for every field we want to extract from the article page.
    queries = config()['news_sites'][iterator]['queries']
    fields = ['title', 'subtitle', 'content', 'images', 'category_long',
              'tags', 'author', 'publication_date', 'categories']

    data = {}
    # Initialised up front so the function can still return if the request fails.
    category = ''

    try:
        logger.info(f'Extracting article and category content from {article_url}')
        article_page = requests.get(article_url)

        if article_page.status_code == 200:
            home = article_page.content.decode('utf-8')
            parsed = html.fromstring(home)

            # Extract every field; a field that raises ValueError is logged
            # and stored as None.
            extracted = {}
            for field in fields:
                try:
                    extracted[field] = replacer(parsed.xpath(queries[field]))
                except ValueError:
                    logger.warning(f'there is no {field}')
                    extracted[field] = None

            data = {
                'title': extracted['title'],
                'subtitle': extracted['subtitle'],
                'body': extracted['content'],
                'images': extracted['images'],
                'category_long': extracted['category_long'],
                'tags': extracted['tags'],
                'author': extracted['author'],
                'publication_date': extracted['publication_date'],
                'news_url': article_url,
                'host': host,
            }
            category = "".join(extracted['categories'] or [])
        else:
            logger.warning(f'{article_url}: {article_page.status_code}')
            raise ValueError('Error.')
    except (HTTPError, MaxRetryError):
        logger.warning('Error while fetching article', exc_info=False)

    return data, category.capitalize()
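# `replacer` is applied to every extracted field above but is not defined in
# this snippet. A minimal sketch of a plausible implementation, assuming it
# joins the list returned by `parsed.xpath()` into one string and strips stray
# whitespace; the exact behaviour is an assumption, not the original helper:
def replacer(xpath_result):
    """Join an XPath result list into a single cleaned-up string."""
    text = ' '.join(str(item) for item in xpath_result)
    return text.replace('\n', ' ').replace('\r', ' ').strip()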
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info("Beginning scraper for {}".format(host))

    homepage = news.HomePage(news_site_uid, host)
    for link in homepage.article_list:
        print(link)
def v_main():
    knn = KnnClassifer(common.config())
    knn.knn()
if __name__ == '__main__':
    data['categories'] = []
    data['articles'] = []
    articles = []
    articles_recovered = []
    categories_recovered = []

    # Reload any URLs and categories scraped on previous runs.
    if os.path.isfile('urls.txt'):
        articles_recovered = recover_text_file('urls.txt')
    if os.path.isfile('categories.txt'):
        categories_recovered = recover_text_file('categories.txt')

    articles_to_scrape = []
    categories = []

    for i in range(6):
        host = config()['news_sites'][i]['url']
        logger.info(f'Beginning scraper for {host}')

        categories_urls = categories_urls_extraction(host, i)
        articles_links = articles_urls_extraction(host, categories_urls, i)

        for article in articles_links:
            if article not in articles_recovered:
                articles_to_scrape.append(article)
                articles, category = articles_and_categories_extraction(host, article, i)
                data['articles'].append(articles)
                categories.append(category)

    categories = list(set(categories))
    for category in categories:
        if category not in categories_recovered:
            data['categories'].append({'categories': category})
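# `recover_text_file` is used above to reload previously scraped URLs and
# categories, but it is not defined in this snippet. A minimal sketch,
# assuming it simply returns the stripped lines of the file as a list; an
# assumption, not the original helper:
def recover_text_file(path):
    """Return the non-empty, stripped lines of a text file as a list."""
    with open(path, mode='r', encoding='utf-8') as f:
        return [line.strip() for line in f if line.strip()]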
        writer = csv.writer(f)
        writer.writerow(csv_headers)

        for i in range(0, len(articles_title)):
            row = [category_id, articles_title[i], articles_price[i].strip(),
                   articles_link[i], articles_image[i]]
            writer.writerow(row)


def _find_article_info_in_page(news_site_uid, category_id, page):
    page = pages.HomePage(news_site_uid, category_id, page)
    return page.articles


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    retail_site_choices = list(config()['retail_sites'].keys())
    parser.add_argument('retail_site',
                        help='The retail site that you want to scrape',
                        type=str,
                        choices=retail_site_choices)
    parser.add_argument('category_id',
                        help='The category of articles that you want to scrape',
                        type=str)
    parser.add_argument('num_pages',
                        help='The number of pages in the selected category',
                        type=int)

    args = parser.parse_args()
    _prices_scraper(args.retail_site, args.category_id, args.num_pages)
        return link
    elif is_root_path.match(link):
        if news_site_uid == 'elpais':
            # Drop the trailing '/<section>/' segment from the host before joining.
            host = host.rstrip('/')
            if host.endswith(news_section_uid):
                host = host[:-len(news_section_uid)]
            host = host.rstrip('/')
            return '{}{}'.format(host, link)
        else:
            return '{}{}'.format(host, link)
    else:
        if news_site_uid == 'elpais':
            # Remove the trailing '/america/' segment before building the URL.
            if link.endswith('/america/'):
                link = link[:-len('/america/')]
            return '{host}/{uri}'.format(host=host, uri=link)
        else:
            return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_site_choices = list(config()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='the news site that you want to scrape',
                        type=str,
                        choices=news_site_choices)
    args = parser.parse_args()

    news_section_choices = list(
        config()['news_sites'][args.news_site]['queries'].keys())
    news_section_uid = input(
        'choose a section: {}\n'.format(news_section_choices))

    _news_scraper(args.news_site, news_section_uid)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Beginning scrape for {host}')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return '{}{}'.format(host, link)
    else:
        return '{host}/{url}'.format(host=host, url=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Build the list of sites available for scraping.
    new_site_choices = list(config()["news_sites"].keys())

    # Command-line arguments supplied by the user.
    parser.add_argument('news_sites',
                        help='The sites to scrape',
                        type=str,
                        choices=new_site_choices)
    parser.add_argument('Time',
                        help='Date from which to scrape, e.g. 2020/05/01',
                        type=str)

    args = parser.parse_args()
    _news_scraper(args.news_sites, args.Time)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
import os

import pandas as pd

import connections as conne
from common import config

source = conne.database()
parameters = config()['destination']

# Pull the 2010 finance facts together with their date, organization,
# scenario and account descriptions.
sqlmaxtl = (
    'SELECT DimD.FullDateAlternateKey, DimO.OrganizationName, Ds.ScenarioName, '
    'DiA.AccountDescription, Facf.Amount '
    'FROM FactFinance Facf '
    'INNER JOIN DimDate DimD ON Facf.DateKey = DimD.DateKey '
    'INNER JOIN DimOrganization DimO ON Facf.OrganizationKey = DimO.OrganizationKey '
    'INNER JOIN DimScenario Ds ON Facf.ScenarioKey = Ds.ScenarioKey '
    'INNER JOIN DimAccount DiA ON Facf.AccountKey = DiA.AccountKey '
    'WHERE YEAR(DimD.FullDateAlternateKey) = 2010'
)

df = source.__execute__(sqlmaxtl)

# print(type(df))
# print(df.columns)
# print(df.head())
# print(df.shape)
# print(df.ndim)
# print(df.tail())
# print(df.dtypes)

print(parameters['path'])
df.to_csv(parameters['path'])
def save_articles(news_site_uid, articles):
    now = datetime.datetime.now()

    # Use the article's public attributes as the CSV column headers.
    csv_headers = list(
        filter(lambda prop: not prop.startswith('_'), dir(articles[0])))

    out_file_name = '{news_site_uid}_{datetime}_articles.csv'.format(
        news_site_uid=news_site_uid,
        datetime=now.strftime('%Y_%m_%d'))

    with open(out_file_name, mode='w+') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)

        for article in articles:
            row = [str(getattr(article, prop)) for prop in csv_headers]
            writer.writerow(row)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_options = list(config()['sports_news_sites'].keys())
    parser.add_argument('sports_news_site',
                        help='Select the sports news site for scraping',
                        type=str,
                        choices=news_options)

    args = parser.parse_args()
    sports_news_scraper(args.sports_news_site)
def __init__(self, news_site_uid, url):
    self._config = config()["news_sites"][news_site_uid]
    self.queries = self._config["queries"]
    self._url = url
    self._html = None
    self._visit(url)
def __init__(self, config=common.config()):
    self.config = config
    self.db_engine = DB_Engine(self.config.db_string)
    self.filename = self.config.pickle_records_file
import argparse
import logging

from common import config
import news_page_objects as news

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _news_scrapper(_news_site_uid):
    logging.info('Start::..')
    host = config()['news_site'][_news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))

    homepage = news.HomePage(_news_site_uid, host)
    for link in homepage.article_links:
        print(link)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    logging.info('Middle::.. {}'.format(list(config()['news_site'])))
    new_sites_choices = list(config()['news_site'].keys())
    parser.add_argument('news_site',
                        help='The news site that you want to scrape',
                        type=str,
                        choices=new_sites_choices)

    args = parser.parse_args()
    _news_scrapper(args.news_site)
    elif opt == '--use_ram':
        pref['use_ram'] = arg
    elif opt == '--gpgpu':
        pref['gpgpu'] = True
    elif opt in ('-v', '--verbose'):
        pref['verbose'] = True
    elif opt == '--tmp':
        pref['tmp'] = arg
    else:
        print('Option: "', opt, '" is not defined.')


if __name__ == "__main__":
    pref = {}
    init(pref)

    cfg = config()
    cfg.setDir({
        'ws': pref['root'],
        'app': os.path.join(pref['root'], 'MyApp'),
        'ref': pref['refdir'],
        'pref': pref['paramdir'],
        'sample': os.path.join(pref['root'], 'Sample'),
        'tmp': pref['tmp'],
        'test': os.path.join(pref['root'], 'test'),
        'out': pref['outdir']
    })
    cfg.makeDirs()

    app = apprun(cfg)

    pref['reference'] = os.path.join(pref['refdir'], pref['reference'])
    if pref['verbose']:
        print(pref)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))

    homepage = news.HomePage(host)
    links = homepage.article_links
    homepage.save_articles(news_site_uid, links)
import pandas as pd
import pyodbc as conn_d

from common import config

parameters = config()['source']


class database:
    """Connection class to source and target databases"""

    def __init__(self):
        self.driver_source = parameters['driver_source']
        self.database_source = parameters['database_source']
        self.host_source = parameters['host_source']
        self.port_distination = parameters['port_distination']
        self.user_source = parameters['user_source']
        self.password_source = parameters['password_source']
        self.trusted_connection = parameters['trusted_connection']

    def __source__connect__(self):
        self.conn_source = conn_d.connect(
            Driver=self.driver_source,
            Server=self.host_source,
            Database=self.database_source,
            user=self.user_source,
            Trusted_Connection=self.trusted_connection,
            password=self.password_source)
        self.cur_des = self.conn_source.cursor()

    def __disconnect_source__(self):
        self.conn_source.close()
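# The ETL snippet elsewhere in this collection calls `source.__execute__(query)`
# and expects a pandas DataFrame back, but the `database` class above does not
# define that method. A minimal sketch of such a method, assuming it runs the
# query over the pyodbc connection and loads the result with `pandas.read_sql`;
# the subclass name is a placeholder and the behaviour is an assumption, not
# the original implementation:
class database_with_execute(database):
    def __execute__(self, query):
        """Run `query` against the source database and return a DataFrame."""
        self.__source__connect__()
        try:
            # pandas can consume a DB-API connection directly.
            return pd.read_sql(query, self.conn_source)
        finally:
            self.__disconnect_source__()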
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article. There is no body.')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return f'{host}{link}'
    else:
        return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Python 3 returns an iterator here, so convert it to a list.
    news_site_choices = list(config()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='The news site that you want to scrape',
                        type=str,
                        choices=news_site_choices)

    args = parser.parse_args()
    _news_scrapper(args.news_site)
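# `is_well_formed_link` and `is_root_path` are used by the `_build_link`
# helpers in these snippets but never defined here. A minimal sketch of the
# two compiled patterns, assuming the first matches absolute URLs such as
# https://example.com/hello and the second matches root-relative paths such
# as /some-text; the exact expressions are an assumption:
import re

is_well_formed_link = re.compile(r'^https?://.+/.+$')  # absolute URL with a path
is_root_path = re.compile(r'^/.+$')                    # path relative to the host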
def __init__(self, url):
    self._config = config()['amazon']
    self._queries = self._config['products']
    self._html = None
    self._visit(url)
    self.url = url
def _news_scraper(news_site_uid):
    paths = config()['news_sites'][news_site_uid]
    host = paths['url']
    logging.info(f'Beginning scraper for {host}')

    try:
        response = requests.get(paths['url'])
        if response.status_code == 200:
            logger.info('Parsing url...')
            final_articles = []
            home = response.content.decode('utf-8')
            parsed = html.fromstring(home)
            links_to_news = parsed.xpath(
                paths['queries']['XPATH_HOMEPAGE_LINKS_TO_ARTICLES'])
            good_links = _fix_links(links_to_news, host)

            for i, link in enumerate(good_links):
                try:
                    article_response = requests.get(link, timeout=6)
                    article = article_response.content.decode('utf-8')
                    article_parsed = html.fromstring(article)
                    article_elements = {}

                    title = article_parsed.xpath(paths['queries']['XPATH_TITLE'])
                    if len(title):
                        article_elements['title'] = title[0]
                    else:
                        article_elements['title'] = None

                    body = article_parsed.xpath(paths['queries']['XPATH_BODY'])
                    p_elements = []
                    for text in body:
                        if str(text)[0] in [',', '.', ' ']:
                            p_elements.append(str(text))
                        else:
                            p_elements.append(' ' + str(text))
                    body = ''.join(p_elements)
                    if len(body):
                        article_elements['body'] = body
                    else:
                        article_elements['body'] = None

                    date = article_parsed.xpath(paths['queries']['XPATH_DATE'])
                    if len(date):
                        article_elements['date'] = date[0]
                    else:
                        article_elements['date'] = None

                    author = article_parsed.xpath(paths['queries']['XPATH_AUTHOR'])
                    if len(author):
                        article_elements['author'] = author[0]
                    else:
                        article_elements['author'] = None

                    article_elements['url'] = link
                    final_articles.append(article_elements)
                    logger.info(f'Article {i+1}/{len(good_links)} scraped!')
                except Exception as e:
                    print(e)

            return final_articles
        else:
            print(f'Error. Status code {response.status_code}')
    except ValueError as ve:
        print(ve)
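# `_fix_links` is called above but not defined in this snippet. A minimal
# sketch, assuming it turns the raw hrefs from the homepage into absolute,
# de-duplicated URLs on the same host; the name comes from the call above and
# the behaviour is an assumption, not the original helper:
def _fix_links(links, host):
    """Return absolute, de-duplicated article URLs built against `host`."""
    good_links = []
    for link in links:
        if link.startswith('http'):
            url = link
        elif link.startswith('/'):
            url = host.rstrip('/') + link
        else:
            url = host.rstrip('/') + '/' + link
        if url not in good_links:
            good_links.append(url)
    return good_links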
def __init__(self, config=common.config()):
    self.__config = config
def _news_scraper(news_site):
    host = config()['news_sites'][news_site]['url']
    logging.info('Beginning scraper for {}'.format(host))
    logging.info('Finding links in homepage...')
    try:
        article = news.ArticlePage(news_sites_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_url.match(link):
        return link
    elif is_root_path.match(link):
        return '{}{}'.format(host, link)
    else:
        return '{host}/{url}'.format(host=host, url=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_sites_choices = list(config()['news_sites'].keys())
    parser.add_argument('news_sites',
                        help='the news site to scrape',
                        type=str,
                        choices=news_sites_choices)

    args = parser.parse_args()
    _news_scrapper(args.news_sites)
        articles.append(article)

    _save_articles(news_site_id, articles)


def _save_articles(news_site_id, articles):
    # now = datetime.now().strftime("%Y_%m_%d")
    # output_filename = f"{news_site_id}_{now}_articles.csv"
    output_filename = f"{news_site_id}.csv"
    csv_headers = list(
        filter(lambda property: not property.startswith("_"), dir(articles[0])))

    with open(output_filename, mode="w+") as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)

        for article in articles:
            row = [str(getattr(article, prop)) for prop in csv_headers]
            writer.writerow(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    news_sites_choices = list(config()["news_sites"].keys())
    parser.add_argument('news_site',
                        help="The news site to be scraped",
                        type=str,
                        choices=news_sites_choices)

    args = parser.parse_args()
    _news_scraper(args.news_site)
    try:
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return '{}{}'.format(host, link)
    else:
        return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == "__main__":
    news_sites_choices = list(config()['news_sites'].keys())

    parser = argparse.ArgumentParser()
    parser.add_argument('news_site',
                        help='The news site you want to scrape',
                        type=str,
                        choices=news_sites_choices)

    args = parser.parse_args()
    _new_scraper(args.news_site)