Example #1
def prepare_one_file_for_index(document, limit_domain):
    import gridfs
    from langdetect import detect, lang_detect_exception
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    # extract_content_from_pdf, extract_words_from_url and utf8len are helper
    # functions assumed to be defined elsewhere in the indexer module.

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(limit_domain, mongodb_client)
    fs = gridfs.GridFS(mongodb_database)
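    # The crawled file's raw bytes are stored in GridFS; fetch them by the stored id.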
    out = fs.get(document.get('content').get('binary'))
    content = out.read()

    mongodb_client.close()

    content_hash = document.get('content').get('hashes').get('text')
    url_hash = document.get('_id')
    url = document.get('url')
    url_decoded = urls.decode(url)
    url_length = len(url)
    is_file = True
    file_type = document.get('file_type')
    filename = urls.get_filename(url_decoded)
    depth = document.get('depth')
    pagerank = document.get('pagerank')

    body_text = extract_content_from_pdf(content)

    # Reduce size of body_text for database
    while utf8len(body_text) > 800000:
        body_text = body_text[:-10000]

    if body_text is None or len(body_text) < 500:
        return None

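    # Detect the language on at most 10 000 characters; for longer texts sample a window from the middle.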
    max_length_detection = 10000
    body_text_length = len(body_text)

    try:
        if body_text_length < max_length_detection:
            language = detect(body_text)
        else:
            half = body_text_length // 2
            window = max_length_detection // 2
            language = detect(body_text[half - window:half + window])
    except lang_detect_exception.LangDetectException as e:
        # Fallback language
        language = 'cs'

    title = filename

    description = ""
    keywords = ""
    important_headlines = ""
    url_words = ' '.join(extract_words_from_url(url_decoded, limit_domain))

    row = (url_hash, url, url_decoded, url_words, title, language, keywords,
           description, important_headlines, body_text, content_hash, depth,
           is_file, file_type, pagerank, url_length)

    return row
Example #2
def feeder_task(crawler_settings, seed, batch_size, delay_between_feeding,
                task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    from upol_search_engine.upol_crawler.core import feeder

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)
    regex = urls.generate_regex(crawler_settings.get('limit_domain'))

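    # Drop any database left over from a previous crawl of this domain.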
    mongodb.drop_database(
        urls.domain_replace_dots(crawler_settings.get('limit_domain')))

    # Init database
    mongodb.init(database)

    feeder.load_seed(seed, database, regex, crawler_settings.get('max_depth'),
                     crawler_settings.get('blacklist'))

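    # Queue the root URL of every blacklisted domain; ignore_blacklist=True lets these seeds through the blacklist check.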
    blacklist = crawler_settings.get('blacklist')

    for blacklisted_domain in blacklist:
        crawl_url_task.delay('http://' + blacklisted_domain,
                             crawler_settings.get('max_depth'),
                             crawler_settings,
                             ignore_blacklist=True)

    sleeping = False
    number_of_waiting = 0
    number_of_added_links = 0

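    # Alternate between feeding the crawler a batch of URLs and sleeping; finish once
    # feeder.sleep_crawler has reported a waiting crawler at least twice.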
    while True:
        if sleeping is False:

            feeder.feed_crawler(database, crawler_settings, batch_size)

            sleeping = True
        else:
            mongodb.update_crawler_progress(client, database, task_id)

            number_of_waiting = feeder.sleep_crawler(database,
                                                     number_of_waiting,
                                                     delay_between_feeding)

            if number_of_waiting >= 2:
                break

            sleeping = False

    mongodb.update_crawler_progress(client, database, task_id)
    client.close()
Example #3
def calculate_pagerank_task(crawler_settings, task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.upol_crawler.core import pagerank

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)

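    # PageRank runs in three reported stages: building the link graph, calculating the ranks and uploading them to the database.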
    mongodb.update_pagerank_progress(client, task_id, 'building_graph')
    graph = pagerank.build_graph(database)

    mongodb.update_pagerank_progress(client, task_id, 'calculation')
    graph_pagerank = pagerank.calculate_pagerank(graph, database)

    mongodb.update_pagerank_progress(client, task_id, 'uploading')
    pagerank.insert_pagerank_db(graph_pagerank, database)

    client.close()
Example #4
def crawl_url(url, depth, crawler_settings, ignore_blacklist=False):
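    # mongodb, limiter, requests, get_page and log are expected to come from module-level imports of the crawler package.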
    try:
        client = mongodb.create_client()
        database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                        client)

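        # Respect the per-server crawl frequency; if the URL cannot be crawled yet, flag it for recrawl and stop.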
        allowed = limiter.is_crawl_allowed(
            url, database, crawler_settings.get('frequency_per_server'))

        if not allowed:
            mongodb.set_url_for_recrawl(database, url)
            client.close()
            return

        url, original_url, redirected, response = get_page(
            url, crawler_settings.get('connect_max_timeout'),
            crawler_settings.get('read_max_timeout'))
    except requests.exceptions.ReadTimeout as e:
        # Also removes the URL from the queue and marks it as timed out
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ReadTimeout: {0}'.format(url))
    except requests.exceptions.ConnectionError as e:
        # Also removes the URL from the queue and marks it as timed out
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ConnectionError: {0}'.format(url))
    except requests.exceptions.ChunkedEncodingError as e:
        # Also removes the URL from the queue and marks it as timed out
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ChunkedEncodingError: {0}'.format(url))
    except Exception as e:
        mongodb.delete_url(database, url)
        log.exception('Exception: {0}'.format(url))
        client.close()
        raise
    else:
        _handle_response(database, url, original_url, redirected, response,
                         depth, crawler_settings.get('max_depth'),
                         crawler_settings.get('limit_domain'),
                         crawler_settings.get('blacklist'), ignore_blacklist)

    client.close()
Example #5
def datamining():

    def return_time_or_none(field):
        if field is None:
            return None
        else:
            return field.replace(tzinfo=None)

    def timedelta_to_string(timedelta):
        seconds = timedelta.total_seconds()

        return '{:.0f}h {:.0f}m'.format(seconds // 3600, seconds % 3600 // 60)

    def get_number_or_zero(number):
        if number is None:
            return 0
        else:
            return number

    def get_number_or_na(number):
        if number is None:
            return "N/A"
        else:
            return number

    def thousands_separator(number):
        return '{:,}'.format(number).replace(',', ' ')

    mongodb_client = mongodb.create_client()

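    # Map internal stage identifiers to the Czech labels returned to the front end.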
    stages = {'finished': 'Naplánováno',
              'killed': 'Selhalo',
              'loading': 'Načítání',
              'crawler': 'Skenování',
              'indexer': 'Indexování',
              'pagerank': 'Pagerank'}

    time = datetime.now()

    stats = mongodb.get_latest_stats(mongodb_client)

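    # No run has been recorded yet: return "N/A" placeholders and zeroed charts.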
    if stats is None:
        target_domain = "N/A"
        next_time_start = "N/A"
        stage_delta_time = "N/A"
        total_delta_time = "N/A"
        stage = stages.get('finished')
        crawler_queue_labels = ['Mimo frontu', 'Ve frontě', 'Navštíveno']
        crawler_queue_values = [0, 0, 0]
        crawler_progress_labels = ['Stránka', 'Alias', 'Soubor', 'Nevalidní', 'Časový limit']
        crawler_progress_values = [0, 0, 0, 0, 0]
        pagerank_graph_deltatime = "N/A"
        pagerank_calculation_deltatime = "N/A"
        pagerank_uploading_deltatime = "N/A"
        indexer_progress = "N/A"
        number_of_domains = "N/A"
        number_of_servers = "N/A"
        number_of_urls = "N/A"
    else:
        target_domain = stats.get('limit_domain')
        result_db = stats.get('progress').get('result')
        stage_db = stats.get('progress').get('stage')

        if result_db == 'running':
            stage = stages.get(stage_db)
        else:
            stage = stages.get(result_db)

        start_time_db = return_time_or_none(stats.get('progress').get('start'))
        end_time_db = return_time_or_none(stats.get('progress').get('end'))

        crawler_start_time_db = return_time_or_none(stats.get('crawler').get('start'))
        crawler_end_time_db = return_time_or_none(stats.get('crawler').get('end'))

        pagerank_start_time_db = return_time_or_none(stats.get('pagerank').get('start'))
        pagerank_end_time_db = return_time_or_none(stats.get('pagerank').get('end'))

        indexer_start_time_db = return_time_or_none(stats.get('indexer').get('start'))
        indexer_end_time_db = return_time_or_none(stats.get('indexer').get('end'))

        run_every_n_days = next_start_each_n_days()
        time_of_next_start = start_time_db + timedelta(days=run_every_n_days)
        next_time_start = timedelta_to_string(time_of_next_start - time)

        if start_time_db is None:
            stage_delta_time = "N/A"
        elif crawler_start_time_db is None:
            stage_delta_time = timedelta_to_string(time - start_time_db)
        elif crawler_end_time_db is None:
            stage_delta_time = timedelta_to_string(time - crawler_start_time_db)
        elif pagerank_end_time_db is None:
            stage_delta_time = timedelta_to_string(time - pagerank_start_time_db)
        elif indexer_end_time_db is None:
            stage_delta_time = timedelta_to_string(time - indexer_start_time_db)
        else:
            # All sub-tasks have finished; no stage is currently running.
            stage_delta_time = "N/A"

        if end_time_db is None:
            total_delta_time = time - start_time_db
        else:
            total_delta_time = end_time_db - start_time_db
            stage_delta_time = "N/A"

        total_delta_time = timedelta_to_string(total_delta_time)

        if crawler_start_time_db is None:
            crawler_queue_labels = ['Mimo frontu', 'Ve frontě', 'Navštíveno']
            crawler_queue_values = [0, 0, 0]
            crawler_progress_labels = ['Stránka', 'Alias', 'Soubor', 'Nevalidní', 'Časový limit']
            crawler_progress_values = [0, 0, 0, 0, 0]
            number_of_domains = "N/A"
            number_of_servers = "N/A"
            number_of_urls = "N/A"
        else:
            crawler_progress_db = stats.get('crawler').get('progress')

            crawler_queue_labels = ['Mimo frontu', 'Ve frontě', 'Navštíveno']

            visited = get_number_or_zero(crawler_progress_db.get('urls_visited'))
            queued = get_number_or_zero(crawler_progress_db.get('urls_queued'))
            not_queued = get_number_or_zero(crawler_progress_db.get('urls_not_queued'))

            crawler_queue_values = [not_queued, queued, visited]

            crawler_progress_labels = ['Stránka', 'Alias', 'Soubor', 'Nevalidní', 'Časový limit']

            timeout = get_number_or_zero(crawler_progress_db.get('timeout_count'))
            invalid = get_number_or_zero(crawler_progress_db.get('invalid_count'))
            files = get_number_or_zero(crawler_progress_db.get('files_count'))
            aliases = get_number_or_zero(crawler_progress_db.get('aliases_count'))
            pages = visited - timeout - invalid - files - aliases
            number_of_domains = get_number_or_zero(crawler_progress_db.get('number_of_domains'))
            number_of_servers = get_number_or_zero(crawler_progress_db.get('number_of_servers'))
            number_of_urls = thousands_separator(get_number_or_zero(crawler_progress_db.get('urls_count')))

            crawler_progress_values = [pages, aliases, files, invalid, timeout]

        pagerank_progress_db = stats.get('pagerank').get('progress')

        if pagerank_progress_db is None:
            pagerank_graph_deltatime = "N/A"
            pagerank_calculation_deltatime = "N/A"
            pagerank_uploading_deltatime = "N/A"
        else:
            pagerank_graph_starttime = return_time_or_none(pagerank_progress_db.get('building_graph'))
            pagerank_calculation_starttime = return_time_or_none(pagerank_progress_db.get('calculation'))
            pagerank_uploading_starttime = return_time_or_none(pagerank_progress_db.get('uploading'))

            if pagerank_calculation_starttime is not None:
                pagerank_graph_deltatime = timedelta_to_string(pagerank_calculation_starttime - pagerank_start_time_db)
            else:
                pagerank_graph_deltatime = timedelta_to_string(time - pagerank_start_time_db)

            if pagerank_uploading_starttime is not None:
                pagerank_calculation_deltatime = timedelta_to_string(pagerank_uploading_starttime - pagerank_calculation_starttime)
            elif pagerank_calculation_starttime is None:
                pagerank_calculation_deltatime = "N/A"
            else:
                pagerank_calculation_deltatime = timedelta_to_string(time - pagerank_calculation_starttime)

            if pagerank_end_time_db is not None:
                pagerank_uploading_deltatime = timedelta_to_string(pagerank_end_time_db - pagerank_uploading_starttime)
            elif pagerank_uploading_starttime is None:
                pagerank_uploading_deltatime = "N/A"
            else:
                pagerank_uploading_deltatime = timedelta_to_string(time - pagerank_uploading_starttime)

        indexer_progress_db = stats.get('indexer').get('progress')

        if indexer_progress_db is None:
            indexer_progress = 0
        else:
            indexer_progress = thousands_separator(get_number_or_zero(indexer_progress_db.get('progress')))

    return jsonify(target_domain=target_domain,
                   stage=stage,
                   stage_delta_time=stage_delta_time,
                   total_delta_time=total_delta_time,
                   next_time_start=next_time_start,
                   crawler_progress_labels=crawler_progress_labels,
                   crawler_progress_values=crawler_progress_values,
                   crawler_queue_labels=crawler_queue_labels,
                   crawler_queue_values=crawler_queue_values,
                   indexer_progress=indexer_progress,
                   pagerank_graph_deltatime=pagerank_graph_deltatime,
                   pagerank_calculation_deltatime=pagerank_calculation_deltatime,
                   pagerank_uploading_deltatime=pagerank_uploading_deltatime,
                   number_of_domains=number_of_domains,
                   number_of_servers=number_of_servers,
                   number_of_urls=number_of_urls)
Example #6
def indexer_task(crawler_settings, indexer_settings, task_id):
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    import locale
    from celery.result import AsyncResult
    from celery.states import PENDING, STARTED, RECEIVED, SUCCESS
    import time

    locale.setlocale(locale.LC_ALL, 'cs_CZ.utf-8')

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)
    mongodb_batch_size = indexer_settings.get('batch_size')

    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    # Reset and (re)initialise the temporary PostgreSQL tables for this run
    postgresql.reset_and_init_db(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_table_name,
                                 postgresql_metadata_table_name)

    tasks_list = []

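    # Dispatch one indexing sub-task per representative document, batch by batch,
    # until MongoDB has no more documents to index.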
    while True:
        document_batch = mongodb.get_batch_of_ids_for_indexer(
            mongodb_database,
            mongodb_batch_size)

        document_batch = list(document_batch)

        if len(document_batch) == 0:
            break

        document_ids = []

        for document in document_batch:
            document_ids.append(document.get('representative'))

        if len(document_ids) > 0:
            mongodb.set_documents_as_indexed(mongodb_database, document_ids)
            for document_id in document_ids:
                tasks_list.append(index_document_task.delay(document_id,
                                                            task_id,
                                                            crawler_settings,
                                                            indexer_settings))

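    # Poll the spawned tasks every 10 seconds until none of them is pending, received or started.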
    waiting = True

    while waiting:
        n_of_running = 0

        for task in tasks_list:
            state = AsyncResult(task.task_id).status

            if state in (PENDING, STARTED, RECEIVED):
                n_of_running += 1

        if n_of_running == 0:
            waiting = False
            for task in tasks_list:
                state = AsyncResult(task.task_id).status
                if state != SUCCESS:
                    print(state)

        time.sleep(10)

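    # Swap the freshly built tables into production and create the search and microformat indexes.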
    postgresql.change_table_to_production(postgresql_client,
                                          postgresql_cursor,
                                          postgresql_table_name,
                                          postgresql_table_name_production)

    postgresql.create_psql_index(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_table_name_production,
                                 'search_index',
                                 'search_idx')

    postgresql.change_table_to_production(postgresql_client,
                                          postgresql_cursor,
                                          postgresql_metadata_table_name,
                                          postgresql_metadata_table_name_production)

    postgresql.create_psql_index(postgresql_client,
                                 postgresql_cursor,
                                 postgresql_metadata_table_name_production,
                                 'microformat_index',
                                 'microformat_idx')

    postgresql_cursor.close()
    postgresql_client.close()
    mongodb_client.close()
Example #7
def index_document_task(document_id, task_id,
                        crawler_settings, indexer_settings):
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    from upol_search_engine.upol_indexer import indexer
    from celery.utils.log import get_task_logger
    from upol_search_engine.upol_indexer import microformat
    from psycopg2 import IntegrityError
    import json
    import hashlib

    log = get_task_logger(__name__)

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)
    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get('metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    try:
        document = mongodb.get_document_by_id(mongodb_database, document_id)

        indexed_rows = []
        copied_rows = []

        does_production_exists = postgresql.test_if_table_exists(
            postgresql_client,
            postgresql_cursor,
            postgresql_table_name_production)

        try:
            is_file = document.get('file')

            # Metadata: extract microformat metadata from HTML pages and store it in the metadata table
            if not is_file:
                soup = indexer.get_soup_from_document(document)
                metadata = microformat.find_microformat_on_page(soup)

                if metadata is not None:
                    parsed_metadata = microformat.parse_json(metadata)
                    metadata_hash = hashlib.sha1(
                        json.dumps(parsed_metadata,
                                   sort_keys=True).encode('utf-8')).hexdigest()

                    if microformat.validate_json_schema(parsed_metadata):

                        parsed_metadata, metadata_type = microformat.prepare_metadata_for_insert(parsed_metadata)

                        try:
                            postgresql.insert_microformat(postgresql_client,
                                                          postgresql_cursor,
                                                          json.dumps(parsed_metadata),
                                                          metadata_hash,
                                                          metadata_type,
                                                          postgresql_metadata_table_name)
                        except IntegrityError as e:
                            log.info('METADATA duplicity: {}'.format(
                                parsed_metadata))
                    else:
                        log.info('METADATA not valid: {}'.format(
                            document.get('url')))

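            # If the page is already in the production table with an unchanged content hash,
            # copy the existing row; otherwise (re)index the document.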
            if does_production_exists:
                url_hash = document.get('_id')
                content_hash = document.get('content').get('hashes').get('text')

                production_document = postgresql.get_document_by_hash(
                    postgresql_client,
                    postgresql_cursor,
                    url_hash,
                    postgresql_table_name_production)
            else:
                production_document = None

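            # Column 10 of the production row is the content hash (matching the row tuple built by the indexer, cf. Example #1).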
            if (production_document is None) or (production_document[10] != content_hash):
                if is_file:
                    log.info('INDEXER: Indexing document (file).')

                    # Index only pdf this time
                    if document.get('file_type') == 'pdf':
                        try:
                            row = indexer.prepare_one_file_for_index(
                                document, crawler_settings.get('limit_domain'))
                        except Exception as e:
                            log.exception('Exception: {0}'.format(document.get('url')))
                            row = None
                    else:
                        row = None
                else:
                    log.info('INDEXER: Indexing document.')
                    row = indexer.prepare_one_document_for_index(
                        document,
                        soup,
                        crawler_settings.get('limit_domain'))

                if row is not None:
                    indexed_rows.append(row)
            else:
                if is_file:
                    log.info('INDEXER: Copying document (file).')
                else:
                    log.info('INDEXER: Copying document.')

                copied_rows.append(production_document)

                postgresql.copy_row_from_table_to_table(
                    postgresql_client,
                    postgresql_cursor,
                    url_hash,
                    postgresql_table_name_production,
                    postgresql_table_name)
        except Exception as e:
            log.exception('Exception: {0}'.format(document.get('url')))

        if len(indexed_rows) > 0:
            postgresql.insert_rows_into_index(postgresql_client,
                                              postgresql_cursor,
                                              indexed_rows,
                                              postgresql_table_name)

        mongodb.update_indexer_progress(
            mongodb_client, task_id, len(indexed_rows) + len(copied_rows))
    except Exception as e:
        log.exception('Exception: INDEXER TASK POSSIBLE FAILURE')
    finally:
        postgresql_cursor.close()
        postgresql_client.close()
        mongodb_client.close()
def main_task(self):
    """Main task of the project"""

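    # Presumably a bound Celery task (bind=True): self.request.id identifies this run.
    # SoftTimeLimitExceeded comes from celery.exceptions; settings, urls, mongodb,
    # crawler_tasks and indexer_tasks are expected to be module-level imports.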
    try:
        blacklist = urls.load_urls_from_file(settings.blacklist_path)
        seed = urls.load_urls_from_file(settings.seed_path)

        crawler_settings = {
            'limit_domain':
            settings.CONFIG.get('Crawler', 'limit_domain'),
            'max_depth':
            settings.CONFIG.getint('Crawler', 'max_depth'),
            'connect_max_timeout':
            settings.CONFIG.getfloat('Crawler', 'connect_max_timeout'),
            'read_max_timeout':
            settings.CONFIG.getint('Crawler', 'read_max_timeout'),
            'frequency_per_server':
            settings.CONFIG.getfloat('Crawler', 'frequency_per_server'),
            'blacklist':
            blacklist
        }

        indexer_settings = {
            'batch_size':
            settings.CONFIG.getint('Indexer', 'batch_size'),
            'table_name':
            settings.CONFIG.get('General', 'postgresql_table_name_tmp'),
            'table_name_production':
            settings.CONFIG.get('General', 'postgresql_table_name'),
            'metadata_table_name':
            settings.CONFIG.get('General',
                                'postgresql_metadata_table_name_tmp'),
            'metadata_table_name_production':
            settings.CONFIG.get('General', 'postgresql_metadata_table_name')
        }

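        # Run the pipeline sequentially (crawler -> pagerank -> indexer) and record
        # each sub-task's start and finish in MongoDB.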
        mongodb_client = mongodb.create_client()

        task_id = self.request.id

        mongodb.insert_engine_start(mongodb_client, task_id, crawler_settings)

        mongodb.insert_sub_task_start(mongodb_client, task_id, "crawler")

        crawler_tasks.feeder_task(crawler_settings=crawler_settings,
                                  seed=seed,
                                  batch_size=settings.CONFIG.getint(
                                      'Crawler', 'batch_size'),
                                  delay_between_feeding=settings.CONFIG.getint(
                                      'Crawler', 'delay_between_feeding'),
                                  task_id=task_id)

        mongodb.insert_sub_task_finish(mongodb_client, task_id, "crawler",
                                       "finished")

        mongodb.insert_sub_task_start(mongodb_client, task_id, "pagerank")

        crawler_tasks.calculate_pagerank_task(crawler_settings, task_id)

        mongodb.insert_sub_task_finish(mongodb_client, task_id, "pagerank",
                                       "finished")

        mongodb.insert_sub_task_start(mongodb_client, task_id, "indexer")

        indexer_tasks.indexer_task(crawler_settings, indexer_settings, task_id)

        mongodb.insert_sub_task_finish(mongodb_client, task_id, "indexer",
                                       "finished")

        mongodb.insert_engine_finish(mongodb_client, task_id, "finished")

        mongodb_client.close()
    except SoftTimeLimitExceeded:
        mongodb.insert_engine_finish(mongodb_client, task_id, "killed")

        mongodb_client.close()