Example #1
def run_command():
	# create the argument parser
	parser = argparse.ArgumentParser()

	# add command option
	parser.add_argument("command", help="the command to run")

	# get the command
	command = parser.parse_args().command

	# get the config 
	conf = common.config()

	# look up the command in the config
	command_obj = None
	for comm in conf['commands']:
		if comm == command:
			command_obj = conf['commands'][comm]
			break

	# if command not found, show error
	if not command_obj:
		raise CommandNotFound()
	else:
		# run the command
		filename = command_obj['filename']
		mod = imp.load_module(command, open(filename), filename,
							  ('', 'r', imp.PY_SOURCE))
		mod.main()
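The imp module used above has been deprecated since Python 3.4; a minimal sketch of the same dynamic load with importlib, assuming the same command and filename variables, might look like this:

import importlib.util

# Hypothetical equivalent of the imp.load_module call above.
spec = importlib.util.spec_from_file_location(command, filename)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
mod.main()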
Example #2
 def __init__(self, config=common.config()):
     self.config = config
     self.train_label = None
     self.train_data = None
     self.test_label = None
     self.test_data = None
     self.model = None
     self.cluster_info_dict = {}
     self.predict_result_dict = {}
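A default of common.config() is evaluated once, when __init__ is defined; if the intent is to read the config at instance creation time, the usual pattern (a sketch, assuming that intent) is:

 def __init__(self, config=None):
     # Hypothetical variant: defer the common.config() call to instance
     # creation instead of evaluating it once at function definition time.
     self.config = config if config is not None else common.config()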
Example #3
 def __init__(self, host, port, headers=None, use_ssl=True, use_urllib=False):
     self.host = host
     self.port = port
     self.headers = headers if headers is not None else {}
     self.use_ssl = use_ssl
     self.use_urllib = use_urllib
     self.log = logging.getLogger(__name__)
     FORMAT = '%(asctime)s [%(levelname)s] %(message)s'
     logging.basicConfig(filename=config('DEBUG_LOGFILE'),
                         level=logging.DEBUG, format=FORMAT)
        article = news.ArticlePage(news_site_uid, _build_link(
            host, link))  # check that the links are well formed
    except (HTTPError, MaxRetryError) as e:
        # if an error occurs, invalidate the article
        logger.warning('Error while fetching the article', exc_info=False)

    # if the article has no body, it is invalid
    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


# Check the links and validate them against the regular expressions
def _build_link(host, link):
    if is_well_formed_link.match(
            link):  # e.g. a full link like https://example.com/hello
        return link
    elif is_root_path.match(link):  # e.g. a root path like /some-text
        return '{}{}'.format(host, link)
    else:
        return '{}/{}'.format(host, link)


if __name__ == '__main__':

    # list the site names
    news_site_choices = list(config('config.yaml')['news_sites'].keys())
    for choice in news_site_choices:
        _news_scraper(choice)
Example #5
 def __init__(self, news_site_uid, url):
     self._config = config()['news_sites'][news_site_uid]
     self._queries = self._config['queries']
     self._html = None
     self._url = url
     self._visit(url)
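Nearly every snippet on this page calls a common.config() helper that is not shown; a plausible minimal implementation (an assumption based on how the snippets index into it, and on the config.yaml referenced elsewhere on this page) caches a parsed YAML file:

# common.py -- hypothetical sketch of the shared config() helper
import yaml

__config = None


def config():
    global __config
    if not __config:
        # parse config.yaml once and cache the result for later calls
        with open('config.yaml', mode='r') as f:
            __config = yaml.safe_load(f)
    return __config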
#!/usr/bin/env python
import yara
import re
import sys
import json
import os
import argparse
import time
import common
from check_base64 import extract_base64_strings
from checkcryptonote import is_valid_wallet

#load config file
config = common.config()

#load needed params
samples_dir = config['samples_dir']
rules_dir = config['rules_dir']


def MoneroWallet(sha256, *base64list):
    #base64list = extract_base64_strings(sha256)
    regex_monero = r"(4[0-9AB][0-9a-zA-Z]{93,104})"
    sample_path = samples_dir + sha256
    Monero_rule = yara.compile(filepath=rules_dir + './monerowallet.yara')
    matches = Monero_rule.match(sample_path)
    if matches != []:
        filtered_matches = []
        for match in matches[0].strings:
            wallet_addr = re.search(regex_monero, str(match[2]))
            if not wallet_addr.group(0).islower() and not wallet_addr.group(
import datetime
import logging
import os

import lxml.html as html
import requests
import yaml
# Utilities
from common import config

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

HOME_URL = str(config()["larepublica"]["url"])
XPATH_LINK_TO_ARTICLE = str(config()["larepublica"]["links"])
XPATH_TITLE = str(config()["larepublica"]["titulo"])
XPATH_SUMMARY = str(config()["larepublica"]["resumen"])
XPATH_BODY = str(config()["larepublica"]["cuerpo"])


def error_handler():
    """
    docstring
    """
    pass


def parse_notice(link, today):
    """
    docstring
    """
Example #8
    def start_analysis(self, options, monitor):
        """Start the analysis by uploading all required files.
        @param options: the task options
        @param monitor: identifier of the monitor to be used.
        """
        log.info("Starting analysis on guest (id=%s, ip=%s)",
                 self.machine.label, self.machine.addr)

        self.options = options
        self.timeout = options["timeout"] + config("cuckoo:timeouts:critical")

        # Wait for the agent to come alive.
        self.wait_available()

        # Could be beautified a bit, but basically we have to perform the
        # same check here as we did in wait_available().
        if db.guest_get_status(self.task_id) != "starting":
            return

        # Check whether this is the new Agent or the old one (by looking at
        # the status code of the index page).
        r = self.get("/", do_raise=False)

        if r.status_code != 200:
            log.critical(
                "While trying to determine the Agent version that your VM is "
                "running we retrieved an unexpected HTTP status code: %s. If "
                "this is a false positive, please report this issue to the "
                "Cuckoo Developers. HTTP response headers: %s",
                r.status_code,
                json.dumps(dict(r.headers)),
            )
            db.guest_set_status(self.task_id, "failed")
            return

        try:
            status = r.json()
            version = status.get("version")
            features = status.get("features", [])
        except:
            log.critical(
                "We were unable to detect either the Old or New Agent in the "
                "Guest VM, are you sure you have set it up correctly? Please "
                "go through the documentation once more and otherwise inform "
                "the Cuckoo Developers of your issue.")
            db.guest_set_status(self.task_id, "failed")
            return

        log.info("Guest is running Cuckoo Agent %s (id=%s, ip=%s)", version,
                 self.machine.label, self.machine.addr)

        # Pin the Agent to our IP address so that it is not accessible by
        # other Virtual Machines etc.
        if "pinning" in features:
            self.get("/pinning")

        # Obtain the environment variables.
        self.query_environ()

        # Upload the analyzer.
        self.upload_analyzer(monitor)

        # Pass along the analysis.conf file.
        self.add_config(options)

        # Allow Auxiliary modules to prepare the Guest.
        self.aux.callback("prepare_guest")

        # If the target is a file, upload it to the guest.
        if options["category"] == "file" or options["category"] == "archive":
            data = {
                "filepath":
                os.path.join(self.determine_temp_path(), options["file_name"]),
            }
            files = {
                "file": ("sample.bin", open(options["target"], "rb")),
            }
            self.post("/store", files=files, data=data)

        if "execpy" in features:
            data = {
                "filepath": "%s/analyzer.py" % self.analyzer_path,
                "async": "yes",
                "cwd": self.analyzer_path,
            }
            self.post("/execpy", data=data)
        else:
            # Execute the analyzer that we just uploaded.
            data = {
                "command": "C:\\Python27\\pythonw.exe %s\\analyzer.py" %
                self.analyzer_path,
                "async": "yes",
                "cwd": self.analyzer_path,
            }
            self.post("/execute", data=data)
Example #9
 def __init__(self, config=common.config()):
     self.config = config
def articles_and_categories_extraction(host, article_url, iterator):
    '''Extract the article at article_url into a dictionary and return it
    together with the article's category.'''
    # Variables definition
    title_query = config()['news_sites'][iterator]['queries']['title']
    subtitle_query = config()['news_sites'][iterator]['queries']['subtitle']
    body_query = config()['news_sites'][iterator]['queries']['content']
    images_query = config()['news_sites'][iterator]['queries']['images']
    category_long_query = config()['news_sites'][iterator]['queries']['category_long']
    tags_query = config()['news_sites'][iterator]['queries']['tags']
    author_query = config()['news_sites'][iterator]['queries']['author']
    publication_date_query = config()['news_sites'][iterator]['queries']['publication_date']
    categories_query = config()['news_sites'][iterator]['queries']['categories']
    data = {}
    try:
        logger.info(f'Extracting article and category content from {article_url}')
        # Requesting info from the categories list
        article_page = requests.get(article_url)
        if article_page.status_code == 200:
            home = article_page.content.decode('utf-8')
            parsed = html.fromstring(home)
            # Extracting the content for each article
            try:
                title = parsed.xpath(title_query)
                title = replacer(title)
            except ValueError as e:
                logger.warning(f'there is no title')
                title = None
            try:
                subtitle = parsed.xpath(subtitle_query)
                subtitle = replacer(subtitle)
            except ValueError as e:
                logger.warning('There is no subtitle')
                subtitle = None
            try:
                body = parsed.xpath(body_query)
                body = replacer(body)
            except ValueError as e:
                logger.warning(f'there is no body')
                body = None
            try:
                category_long = parsed.xpath(category_long_query)
                category_long = replacer(category_long)
            except ValueError as e:
                logger.warning(f'there is no category')
                category_long = None
            try:
                tags = parsed.xpath(tags_query)
                tags = replacer(tags)
            except ValueError as e:
                logger.warning(f'there is no tags')
                tags = None
            try:
                author = parsed.xpath(author_query)
                author = replacer(author)
            except ValueError as e:
                logger.warning(f'there is no author')
                author = None
            try:
                categories = parsed.xpath(categories_query)
                categories = replacer(categories)
            except ValueError as e:
                logger.warning(f'there are no categories')
                categories = None
            try:
                images = parsed.xpath(images_query)
                images = replacer(images)
            except ValueError as e:
                logger.warning(f'there are no images')
                images = None
            try:
                publication_date = parsed.xpath(publication_date_query)
                publication_date = replacer(publication_date)
            except ValueError as e:
                logger.warning(f'there is no publication date')
                publication_date = None
            
            data = {
                'title': title,
                'subtitle': subtitle,
                'body': body,
                'images': images,
                'category_long': category_long,
                'tags': tags,
                'author': author,
                'publication_date': publication_date,
                'news_url': article_url,
                'host': host
            }
            category = "".join(categories)
        else:
            logger.warning(f'{article_url}: {article_page.status_code}')
            raise ValueError(f'{article_url} returned status code {article_page.status_code}')
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching article', exc_info=False)
    
    return data, category.capitalize()
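The nine nearly identical try/except blocks above could be collapsed into a small helper; a hedged sketch (the safe_xpath name is an invention, and it assumes replacer raises ValueError for missing fields, as the original handlers do):

def safe_xpath(parsed, query, field_name):
    # Hypothetical helper mirroring the per-field try/except blocks above:
    # evaluate the query, clean it with replacer(), and fall back to None.
    try:
        return replacer(parsed.xpath(query))
    except ValueError:
        logger.warning('There is no %s', field_name)
        return None

Each field then reduces to a single call such as title = safe_xpath(parsed, title_query, 'title').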
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info("Beginning scraper for {}".format(host))
    homepage = news.HomePage(news_site_uid, host)
    for link in homepage.article_list:
        print(link)
Example #12
def v_main():
    knn = KnnClassifer(common.config())
    knn.knn()
if __name__ == '__main__':
    data['categories'] = []
    data['articles'] = []
    articles = []
    articles_recovered = []
    categories_recovered = []
    if os.path.isfile('urls.txt'):
        articles_recovered = recover_text_file('urls.txt')
    if os.path.isfile('categories.txt'):
        categories_recovered = recover_text_file('categories.txt')
    articles_to_scrape = []
    categories = []
    

    for i in range(6):
        host = config()['news_sites'][i]['url']
        logger.info(f'Beginning scraper for {host}')
        categories_urls = categories_urls_extraction(host, i)
        articles_links = articles_urls_extraction(host, categories_urls, i)
        for article in articles_links:
            if article not in articles_recovered:
                articles_to_scrape.append(article)
                articles, category = articles_and_categories_extraction(host, article, i)
                data['articles'].append(articles)
                categories.append(category)
        
    categories = list(set(categories))
    for category in categories:
        if category not in categories_recovered:
            data['categories'].append({'categories': category})
        
Example #14
        writer = csv.writer(f)
        writer.writerow(csv_headers)

        for i in range(len(articles_title)):
            row = [category_id, articles_title[i], articles_price[i].strip(),
                   articles_link[i], articles_image[i]]
            writer.writerow(row)

def _find_article_info_in_page(news_site_uid, category_id, page):
    page = pages.HomePage(news_site_uid, category_id, page)

    return page.articles

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    retail_site_choices = list(config()['retail_sites'].keys())
    parser.add_argument('retail_site',
                        help='The retail site that you want to scrape',
                        type=str,
                        choices=retail_site_choices)

    parser.add_argument('category_id',
                        help='The category of articles that you want to scrape',
                        type=str)

    parser.add_argument('num_pages',
                        help='The number of pages in the selected category',
                        type=int)

    args = parser.parse_args()
    _prices_scraper(args.retail_site, args.category_id, args.num_pages)
Example #15
        return link
    elif is_root_path.match(link):
        if news_site_uid == 'elpais':
            # str.rstrip strips characters, not a suffix, so drop the trailing
            # section segment from the host explicitly
            host = host.rstrip('/')
            if host.endswith(news_section_uid):
                host = host[:-len(news_section_uid)].rstrip('/')
            return '{}{}'.format(host, link)
        else:
            return '{}{}'.format(host, link)
    else:
        if news_site_uid == 'elpais':
            # drop a trailing '/america/' segment if present (rstrip would
            # strip characters instead)
            if link.endswith('/america/'):
                link = link[:-len('/america/')]
            return '{host}/{uri}'.format(host=host, uri=link)
        else:
            return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_site_choices = list(config()['news_sites'].keys())
    parser.add_argument('news_site',
                        help='the news site that you want to scrape',
                        type=str,
                        choices=news_site_choices)

    args = parser.parse_args()
    news_section_choices = list(
        config()['news_sites'][args.news_site]['queries'].keys())
    news_section_uid = input(
        'choose a section: {}\n'.format(news_section_choices))
    _news_scraper(args.news_site, news_section_uid)
Example #16
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info(f'Beginning scrape for {host}')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return '{}{}'.format(host, link)
    else:
        return '{host}/{url}'.format(host=host, url=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    new_site_choices = list(
        config()["news_sites"].keys())  #Obtiene lista de opciones para scraper

    parser.add_argument('news_sites',
                        help='The sites to scrape',
                        type=str,
                        choices=new_site_choices
                        )  # add the user-supplied arguments

    parser.add_argument(
        'Time',
        help='The date to scrape from, e.g. 2020/05/01',
        type=str)  # add the user-supplied arguments
    args = parser.parse_args()
    _news_scraper(args.news_sites, args.Time)
Example #18
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']

    logging.info('Beginning scraper for {}'.format(host))
Example #19
import connections as conne
import pandas as pd
from common import config
import os

source = conne.database()
parameters = config()['destination']

sqlmaxtl = """
SELECT DimD.FullDateAlternateKey, DimO.OrganizationName, Ds.ScenarioName,
       DiA.AccountDescription, Facf.Amount
FROM FactFinance Facf
INNER JOIN DimDate DimD ON Facf.DateKey = DimD.DateKey
INNER JOIN DimOrganization DimO ON Facf.OrganizationKey = DimO.OrganizationKey
INNER JOIN DimScenario Ds ON Facf.ScenarioKey = Ds.ScenarioKey
INNER JOIN DimAccount DiA ON Facf.AccountKey = DiA.AccountKey
WHERE YEAR(DimD.FullDateAlternateKey) = 2010
"""

df = source.__execute__(sqlmaxtl)

#print(type(df))
#print(df.columns)
#print(df.head())
#print(df.shape)
#print(df.ndim)
#print(df.tail())
#print(df.dtypes)

print(parameters['path'])

df.to_csv(parameters['path'])
Example #20

def save_articles(news_site_uid, articles):
    now = datetime.datetime.now()
    csv_headers = list(
        filter(lambda prop: not prop.startswith('_'),
               dir(articles[0])))
    out_file_name = '{news_site_uid}_{datetime}_articles.csv'.format(
        news_site_uid=news_site_uid, datetime=now.strftime('%Y_%m_%d'))

    with open(out_file_name, mode='w+') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)

        for article in articles:
            row = [str(getattr(article, prop)) for prop in csv_headers]
            writer.writerow(row)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_options = list(config()['sports_news_sites'].keys())
    parser.add_argument('sports_news_site',
                        help='Select the sports news site for scraping',
                        type=str,
                        choices=news_options)

    args = parser.parse_args()
    sports_news_scraper(args.sports_news_site)
 def __init__(self, news_site_uid, url):
     self._config = config()["news_sites"][news_site_uid]
     self.queries = self._config["queries"]
     self._url = url
     self._html = None
     self._visit(url)
Example #22
 def __init__(self, config=common.config()):
     self.config = config
     self.db_engine = DB_Engine(self.config.db_string)
     self.filename = self.config.pickle_records_file
Example #23
import argparse
import logging
from common import config
import news_page_objects as news
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def _news_scrapper(_news_site_uid):
    logging.info('Start::..')
    host = config()['news_site'][_news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(_news_site_uid, host)

    for link in homepage.article_links:
        print(link)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    logging.info('Middle::.. {}'.format(list(config()['news_site'])))
    new_sites_choices = list(config()['news_site'].keys())
    parser.add_argument('news_site',
                        help='The news sites that you want to scrape',
                        type=str,
                        choices=new_sites_choices)

    args = parser.parse_args()
    _news_scrapper(args.news_site)
Example #24
        elif opt == '--use_ram':
            pref['use_ram'] = arg
        elif opt == '--gpgpu':
            pref['gpgpu'] = True
        elif opt in ('-v', '--verbose'):
            pref['verbose'] = True
        elif opt == '--tmp':
            pref['tmp'] = arg
        else:
            print('Option: "', opt, '" is not defined.')


if __name__ == "__main__":
    pref = {}
    init(pref)
    cfg = config()
    cfg.setDir({
        'ws': pref['root'],
        'app': os.path.join(pref['root'], 'MyApp'),
        'ref': pref['refdir'],
        'pref': pref['paramdir'],
        'sample': os.path.join(pref['root'], 'Sample'),
        'tmp': pref['tmp'],
        'test': os.path.join(pref['root'], 'test'),
        'out': pref['outdir']
    })
    cfg.makeDirs()
    app = apprun(cfg)
    pref['reference'] = os.path.join(pref['refdir'], pref['reference'])
    if pref['verbose']:
        print(pref)
Example #25
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(host)
    links = homepage.article_links
    homepage.save_articles(news_site_uid, links)
Example #26
import pandas as pd
import pyodbc as conn_d
from common import config

parameters = config()['source']


class database:
    """ Connection class to source and target databases """
    def __init__(self):
        self.driver_source = parameters['driver_source']
        self.database_source = parameters['database_source']
        self.host_source = parameters['host_source']
        self.port_distination = parameters['port_distination']
        self.user_source = parameters['user_source']
        self.password_source = parameters['password_source']
        self.trusted_connection = parameters['trusted_connection']

    def __source__connect__(self):
        self.conn_source = conn_d.connect(
            Driver=self.driver_source,
            Server=self.host_source,
            Database=self.database_source,
            user=self.user_source,
            Trusted_Connection=self.trusted_connection,
            password=self.password_source)
        self.cur_des = self.conn_source.cursor()

    def __disconnect_source__(self):
        self.conn_source.close()
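Example #19 calls source.__execute__(sqlmaxtl), which this class does not define; a hedged sketch of how it might look with pandas (an assumption, not shown in the source):

    def __execute__(self, query):
        # Hypothetical sketch: open the source connection, read the query
        # result into a pandas DataFrame, then close the connection.
        self.__source__connect__()
        df = pd.read_sql(query, self.conn_source)
        self.__disconnect_source__()
        return df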
Example #27
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article. There is no body.')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return f'{host}{link}'
    else:
        return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    news_site_choices = list(config()['news_sites'].keys())  # Python 3 returns an iterator, so convert it to a list
    parser.add_argument(
        'news_site',
        help='The news site that you want to scrape',
        type=str,
        choices=news_site_choices)

    args = parser.parse_args()
    _news_scrapper(args.news_site)
 def __init__(self, url):
     self._config = config()['amazon']
     self._queries = self._config['products']
     self._html = None
     self._visit(url)
     self.url = url
Example #29
def _news_scraper(news_site_uid):
    paths = config()['news_sites'][news_site_uid]
    host = paths['url']
    logging.info(f'Beginning scraper for {host}')

    try:
        response = requests.get(paths['url'])
        if response.status_code == 200:
            logger.info(f'Parsing url...')
            final_articles = []

            home = response.content.decode('utf-8')
            parsed = html.fromstring(home)
            links_to_news = parsed.xpath(
                paths['queries']['XPATH_HOMEPAGE_LINKS_TO_ARTICLES'])

            good_links = _fix_links(links_to_news, host)

            for i, link in enumerate(good_links):
                try:
                    article_response = requests.get(link, timeout=6)
                    article = article_response.content.decode('utf-8')
                    article_parsed = html.fromstring(article)
                    article_elements = {}

                    title = article_parsed.xpath(
                        paths['queries']['XPATH_TITLE'])
                    if len(title):
                        article_elements['title'] = title[0]
                    else:
                        article_elements['title'] = None

                    body = article_parsed.xpath(paths['queries']['XPATH_BODY'])
                    p_elements = []
                    for text in body:
                        if str(text)[0] in [',', '.', ' ']:
                            p_elements.append(str(text))
                        else:
                            p_elements.append(' ' + str(text))

                    body = ''.join(p_elements)
                    if len(body):
                        article_elements['body'] = body
                    else:
                        article_elements['body'] = None

                    date = article_parsed.xpath(paths['queries']['XPATH_DATE'])
                    if len(date):
                        article_elements['date'] = date[0]
                    else:
                        article_elements['date'] = None

                    author = article_parsed.xpath(
                        paths['queries']['XPATH_AUTHOR'])
                    if len(author):
                        article_elements['author'] = author[0]
                    else:
                        article_elements['author'] = None

                    article_elements['url'] = link
                    final_articles.append(article_elements)

                    logger.info(f'Article {i+1}/{len(good_links)} scraped!')
                except Exception as e:
                    print(e)

            return final_articles

        else:
            print(f'Error. Status code {response.status_code}')

    except ValueError as ve:
        print(ve)
Example #30
 def __init__(self, config=common.config()):
     self.config = config
     self.db_engine = DB_Engine(self.config.db_string)
     self.filename = self.config.pickle_records_file
Example #31
 def __init__(self, config=common.config()):
     self.__config = config
def _news_scraper(news_site):
    host = config()['news_sites'][news_site]['url']

    logging.info('Beginning scraper for {}'.format(host))
    logging.info('Finding links in homepage...')
Example #33
	try:
		article = news.ArticlePage(news_sites_uid, _build_link(host, link))
	except (HTTPError, MaxRetryError) as e:
		logger.warning('Error while fetching the article', exc_info=False)

	if article and not article.body:
		logger.warning('Invalid article')
		return None

	return article

def _build_link(host, link):
	if is_well_formed_url.match(link):
		return link
	elif is_root_path.match(link):
		return '{}{}'.format(host, link)
	else:
		return '{host}/{url}'.format(host=host, url=link)

if __name__ == '__main__':
	parser = argparse.ArgumentParser()

	news_sites_choices = list(config()['news_sites'].keys())
	parser.add_argument('news_sites', help='the news site to scrape',
						type=str,
						choices=news_sites_choices)

	args = parser.parse_args()
	_news_scrapper(args.news_sites)

Example #34
            articles.append(article)

    _save_articles(news_site_id, articles)


def _save_articles(news_site_id, articles):
    #now = datetime.now().strftime("%Y_%m_%d")
    #output_filename = f"{news_site_id}_{now}_articles.csv"
    output_filename = f"{news_site_id}.csv"

    csv_headers = list(filter(lambda property: not property.startswith("_"), dir(articles[0])))
    with open(output_filename, mode="w+") as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)
        for article in articles:
            row = [str(getattr(article, prop)) for prop in csv_headers]
            writer.writerow(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    news_sites_choices = list(config()["news_sites"].keys())
    parser.add_argument('news_site',
                        help="The news site to be scraped",
                        type=str,
                        choices=news_sites_choices)

    args = parser.parse_args()
    _news_scraper(args.news_site)
Example #35
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError) as e:
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article. There is no body')
        return None

    return article


def _build_link(host, link):
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return '{}{}'.format(host, link)
    else:
        return '{host}/{uri}'.format(host=host, uri=link)


if __name__ == "__main__":
    news_sites_choices = list(config()['news_sites'].keys())
    parser = argparse.ArgumentParser()
    parser.add_argument('news_site',
                        help='The news site you want to scrape',
                        type=str,
                        choices=news_sites_choices)

    args = parser.parse_args()
    _new_scraper(args.news_site)
Example #36
def v_main():
    knn = KnnClassifer(common.config())
    knn.knn()