Beispiel #1
0
#!/usr/bin/python
# -*- coding: utf-8 -*-
from utils.translation import ugettext as _

from utils.level import Level
from utils.code import Code
from seo.containers.ratings import TextAuditRating
from seo.containers.seo_document import _getMandatoryBlockTokens
from utils.median_distribution import getMedianDistributionInfo

DISPLAY = False
from utils.logger import LoggerFactory
app_logger = LoggerFactory.getInstance('app')


class MandatoryBlockLengthRating(object):

    RATING_NAME_TEMPLATE = u'%s-LENGTH-SCORE'

    def __init__(self, mandatoryField, seoLibrary, textSeoDocument):
        self.mandatoryField = mandatoryField
        self.seoLibrary = seoLibrary
        self.textSeoDocument = textSeoDocument
        self.RATING_NAME = MandatoryBlockLengthRating.RATING_NAME_TEMPLATE % self.mandatoryField.upper(
        ).replace(u'TOKENS', u'')

    def _getScore(self):

        lenList = []
        for seoDocument in self.seoLibrary.seoDocuments:
            tokens = _getMandatoryBlockTokens(seoDocument,
Beispiel #2
0
 def __init__(self):
     """Attach a daily logger, named after this module, to the instance."""
     # Base-class initialisation is not performed here; the calls below
     # were already commented out in the original code.
     #   BaseDAO.__init__(self)
     #   super(BaseDAO, self).__init__()
     log_dir = PathMgr.get_log_path()
     self.logger = LoggerFactory.create_daily_logger(__name__, log_dir)
Beispiel #3
0
from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from seo.containers.seo_document import DataDocument
from bs4.element import NavigableString
from utils.concurrence.urllib3_pool_factory import Urllib3PoolFactory
from urllib.parse import urlparse
from data_mining.web_pages.scrapers.readability import Readability
from utils.concurrence.request_factory import RequestFactory

# Optional dependency: fall back to ``magic = None`` when the module cannot be
# loaded (the original comment points at Windows as the case where this
# happens).  ``except Exception`` keeps that best-effort behaviour for any
# load failure while no longer swallowing KeyboardInterrupt / SystemExit the
# way a bare ``except:`` does.
try:
    import magic
except Exception:
    magic = None  # Windows

from utils.logger import LoggerFactory
app_download_logger = LoggerFactory.getInstance('downloader')


class DownloadException(Exception):
    """Exception whose string form is always the same fixed message."""

    def __str__(self):
        # Constructor arguments are not included in the text: the message is
        # constant regardless of how the exception was created.
        message = 'Download url error'
        return message


class UserAgent(object):
    """Named User-Agent strings, exposed as plain class attributes."""
    # Desktop browser identification strings.
    chrome = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    firefox = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1'
    safari = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A'
    # Self-identifying bot string.
    seologies = 'Seologies/0.1 Bot'
    # Minimal generic string; NOTE(review): presumably a fallback for hosts
    # that reject the other values -- confirm against the callers.
    old = 'Mozilla/11.0'

    filter        string     Controls turning on or off the duplicate content filter. "1": Turns on duplicate content filter. 
    sort          string     The sort expression to apply to the results.
 
"""
import urllib
import json
import time

from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from utils.concurrence.urllib3_pool_factory import Urllib3PoolFactory
from urllib3.util.retry import Retry

from utils.logger import LoggerFactory
from data_mining.search_engines.google.base_search import BaseSearchEngine
app_download_logger = LoggerFactory.getInstance('downloader')
app_error_logger = LoggerFactory.getInstance('app')


class GoogleSearchEngine(BaseSearchEngine):

    URL_TEMPLATE = 'https://www.googleapis.com/customsearch/v1?key=%s&cx=%s'
    CACHE_PATH = '/googleSearchEngine'

    def __init__(self,
                 query,
                 language='es',
                 country='ES',
                 googleHost='google.es',
                 dateRestrict=None,
                 max_results=settings.SEO_TERMS_DOWNLOAD_LIMIT,
Beispiel #5
0
        message = str(sys_error)
        status_code = 500

    error_info = traceback.format_exc()
    log_all_exceptions(logger)

    # This should be skipped depending on config
    if (CFG_OBJ.get('DEBUG') == True):
        print(error_info)
    return make_response(jsonify(message=message), status_code)


# Point every entry of ``default_exceptions`` at ``system_error_handler``.
# Each item is registered under two keys, in this order: the whole
# ``(key, value)`` item tuple, then the item's first element.
# NOTE(review): the tuple-keyed entry looks unintentional -- confirm that
# nothing looks handlers up by the whole item before dropping it.
for error in default_exceptions.items():
    for spec_key in (error, error[0]):
        app.error_handler_spec[None][spec_key] = system_error_handler

if __name__ == '__main__':
    # Build the logger from the configured logging type and path.
    log_type, log_path = CFG_OBJ.get('LOGGING.TYPE'), CFG_OBJ.get('LOGGING.PATH')
    logger = LoggerFactory.create(log_type, log_path)

    # Frame the timestamped start-up message between two separator lines.
    separator = '*' * 80
    banner = "\n" + separator
    logger.log(banner)
    logger.log("Application Starting :: {}".format(datetime.now()))
    logger.log(banner)

    # Start the application with the run options taken from the config object.
    app.run(**CFG_OBJ.FLASK_RUN_OPTS)
Beispiel #6
0
 def logger(self):
     """Return a daily logger named after this module.

     The logger is requested from ``LoggerFactory`` on every call, using the
     log path that ``PathMgr`` reports for ``'realtime'``.
     """
     realtime_log_path = PathMgr.get_log_path('realtime')
     return LoggerFactory.create_daily_logger(__name__, realtime_log_path)
Beispiel #7
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import urllib3
import random
from config import settings
from utils.persistence.file_storage_factory import FileStorageFactory
from multiprocessing import RLock
from utils.logger import LoggerFactory

app_logger = LoggerFactory.getInstance('proxy')


class ProxyBase(object):
    '''
    Base proxy. Must implement:
        getProxies()
        _saveProxies()
    '''

    proxy_basic_auth = None
    COUNTER = 0
    SELECTED = 0
    LOCK = RLock()

    def __init__(self):
        """Load the proxy list, then run the invalidated-proxy recovery step."""
        # getProxies() is one of the methods the class docstring lists as
        # "must implement".
        self.proxies = self.getProxies()
        # Called after self.proxies is set.
        # NOTE(review): presumably re-enables proxies that were previously
        # marked invalid -- the method body is not visible here, confirm.
        self.recoverInvalidatedProxies()

    def getNextProxy(self):
        with ProxyBase.LOCK:
Beispiel #8
0
    else:
        message = str(sys_error)
        status_code = 500

    error_info = traceback.format_exc()
    log_all_exceptions(logger)

    # This should be skipped depending on config
    if(CFG_OBJ.get('DEBUG') == True):
        print(error_info)
    return make_response(jsonify(message=message), status_code)

# Point every entry of ``default_exceptions`` at ``system_error_handler``.
# Each item is registered under two keys, in this order: the whole
# ``(key, value)`` item tuple, then the item's first element.
# NOTE(review): the tuple-keyed entry looks unintentional -- confirm that
# nothing looks handlers up by the whole item before dropping it.
for error in default_exceptions.items():
    for spec_key in (error, error[0]):
        app.error_handler_spec[None][spec_key] = system_error_handler

if __name__ == '__main__':
    # Build the logger from the configured logging type and path.
    log_type, log_path = CFG_OBJ.get('LOGGING.TYPE'), CFG_OBJ.get('LOGGING.PATH')
    logger = LoggerFactory.create(log_type, log_path)

    # Frame the timestamped start-up message between two separator lines.
    separator = '*' * 80
    banner = "\n" + separator
    logger.log(banner)
    logger.log("Application Starting :: {}".format(datetime.now()))
    logger.log(banner)

    # Start the application with the run options taken from the config object.
    app.run(**CFG_OBJ.FLASK_RUN_OPTS)
import tempfile
import zlib
import hashlib

try:
    from .utils.six import cPickle as pickle
except ImportError:
    import pickle

from utils.persistence.utils.base import DEFAULT_TIMEOUT
from utils.persistence.utils.filebased import FileBasedCache
from utils.persistence.utils.move import file_move_safe
from utils.persistence.utils.encoding import force_bytes

from utils.logger import LoggerFactory
app_cache_logger = LoggerFactory.getInstance('SeoAppCache')


def keyFunction(key, key_prefix, version):
    """
    Build the storage key for ``key``.

    The raw key is converted with ``force_bytes`` and replaced by the hex
    MD5 digest of those bytes; the result is returned as
    ``<key_prefix>:<version>:<digest>``.
    """
    raw_bytes = force_bytes(key)
    digest = hashlib.md5(raw_bytes).hexdigest()
    return '%s:%s:%s' % (key_prefix, version, digest)


class FileStorage(FileBasedCache):
Beispiel #10
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import re
import math
from bs4 import BeautifulSoup
from bs4 import Comment
from bs4.element import NavigableString
from utils.logger import LoggerFactory
from data_mining.web_pages import scrapping_rules as rules
from data_mining.web_pages.scrapers.base import ScraperBase

SIGMA_MULTIPLIER = 1
SEPARATOR = ' '

app_logger = LoggerFactory.getInstance('SeoAppScrapper')


def cleanHtml(rawHtml):
    """Return ``rawHtml`` with every ``rules.REPLACE_DOT_REGEX`` match replaced by ``". "``."""
    # Two further substitutions were already commented out in the original
    # and remain disabled:
    #   re.sub(rules.REPLACE_BR_REGEX, "</p><p>", rawHtml)
    #   re.sub(re.compile("<script.*>.*</script>"), "", rawHtml)
    return re.sub(rules.REPLACE_DOT_REGEX, ". ", rawHtml)

def removeEmptyTags(soup):
    """Call ``extract()`` on every tag of ``soup`` whose ``contents`` is empty.

    :param soup: object exposing ``find_all()``; each returned tag must offer
        ``contents`` and ``extract()``.  NOTE(review): presumably a
        BeautifulSoup tree, going by this module's imports -- confirm.
    """
    # Collect first and extract afterwards, so every tag is inspected before
    # any of them is detached.
    empty_tags = [tag for tag in soup.find_all() if not tag.contents]
    # Plain loop: extract() is called for its side effect only, so no
    # throwaway list of results is built.
    for tag in empty_tags:
        tag.extract()