def prepare_one_file_for_index(document, limit_domain):
    """Build one index row for a crawled file document (PDF).

    Loads the stored binary from GridFS, extracts its text, detects the
    language and assembles the tuple inserted into the PostgreSQL index.
    Returns None when the document has no usable text (empty/too short).
    """
    import gridfs
    from upol_search_engine.db import mongodb

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(limit_domain, mongodb_client)

    # Fetch the raw file bytes stored by the crawler.
    fs = gridfs.GridFS(mongodb_database)
    out = fs.get(document.get('content').get('binary'))
    content = out.read()
    mongodb_client.close()

    content_hash = document.get('content').get('hashes').get('text')
    url_hash = document.get('_id')
    url = document.get('url')
    url_decoded = urls.decode(url)
    url_length = len(url)
    is_file = True
    file_type = document.get('file_type')
    filename = urls.get_filename(url_decoded)
    depth = document.get('depth')
    pagerank = document.get('pagerank')

    body_text = extract_content_from_pdf(content)

    # Skip documents with no useful extracted text. This check must come
    # BEFORE the truncation loop: a None result from the extractor would
    # otherwise crash inside utf8len().
    if (body_text is None) or (len(body_text) < 500):
        return None

    # Reduce size of body_text for database
    while utf8len(body_text) > 800000:
        body_text = body_text[:-10000]

    # Detect language on a bounded sample; for long texts sample a window
    # from the middle of the document.
    max_length_detection = 10000
    body_text_length = len(body_text)

    try:
        if body_text_length < max_length_detection:
            language = detect(body_text)
        else:
            half = body_text_length / 2
            language = detect(
                body_text[int(half - max_length_detection / 2):
                          int(half + max_length_detection / 2)])
    except lang_detect_exception.LangDetectException:
        # Fallback language
        language = 'cs'

    # Files carry no HTML metadata; the filename serves as title.
    title = filename
    description = ""
    keywords = ""
    important_headlines = ""
    url_words = ' '.join(extract_words_from_url(url_decoded, limit_domain))

    row = (url_hash, url, url_decoded, url_words, title, language, keywords,
           description, important_headlines, body_text, content_hash, depth,
           is_file, file_type, pagerank, url_length)

    return row
def feeder_task(crawler_settings, seed, batch_size, delay_between_feeding,
                task_id):
    """Initialize the crawl database, seed it, and feed URLs to workers.

    Drops any previous database for the limit domain, loads the seed list,
    pre-dispatches blacklisted domains (so they are recorded but ignored),
    then loops feeding batches to the crawler until the queue stays empty
    for two consecutive waiting rounds.
    """
    from upol_search_engine.db import mongodb
    from upol_search_engine.utils import urls
    from upol_search_engine.upol_crawler.core import feeder

    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)
    regex = urls.generate_regex(crawler_settings.get('limit_domain'))

    # Start every run from a clean database.
    mongodb.drop_database(
        urls.domain_replace_dots(crawler_settings.get('limit_domain')))

    # Init database
    mongodb.init(database)

    feeder.load_seed(seed, database, regex,
                     crawler_settings.get('max_depth'),
                     crawler_settings.get('blacklist'))

    # Dispatch blacklisted domains with ignore_blacklist=True so they are
    # visited once and marked, instead of being re-queued forever.
    blacklist = crawler_settings.get('blacklist')
    for blacklisted_domain in blacklist:
        crawl_url_task.delay('http://' + blacklisted_domain,
                             crawler_settings.get('max_depth'),
                             crawler_settings,
                             ignore_blacklist=True)

    sleeping = False
    number_of_waiting = 0

    # Alternate between feeding a batch and waiting; two consecutive
    # waiting rounds with nothing to feed means the crawl is done.
    while True:
        if sleeping is False:
            feeder.feed_crawler(database, crawler_settings, batch_size)
            sleeping = True
        else:
            mongodb.update_crawler_progress(client, database, task_id)
            number_of_waiting = feeder.sleep_crawler(database,
                                                     number_of_waiting,
                                                     delay_between_feeding)
            if number_of_waiting >= 2:
                break
            sleeping = False

    mongodb.update_crawler_progress(client, database, task_id)
    client.close()
def calculate_pagerank_task(crawler_settings, task_id):
    """Run the three pagerank phases (build, calculate, upload) for a crawl.

    Progress of each phase is reported to MongoDB so the dashboard can
    display per-phase timings.
    """
    from upol_search_engine.db import mongodb
    from upol_search_engine.upol_crawler.core import pagerank

    db_client = mongodb.create_client()
    db = mongodb.get_database(crawler_settings.get('limit_domain'), db_client)

    # Phase 1: build the link graph from crawled documents.
    mongodb.update_pagerank_progress(db_client, task_id, 'building_graph')
    link_graph = pagerank.build_graph(db)

    # Phase 2: compute pagerank over the graph.
    mongodb.update_pagerank_progress(db_client, task_id, 'calculation')
    scores = pagerank.calculate_pagerank(link_graph, db)

    # Phase 3: persist the scores back into the database.
    mongodb.update_pagerank_progress(db_client, task_id, 'uploading')
    pagerank.insert_pagerank_db(scores, db)

    db_client.close()
def crawl_url(url, depth, crawler_settings, ignore_blacklist=False):
    """Fetch one URL (rate-limited) and hand the response to the parser.

    Timeouts and connection errors mark the URL as timeouted; any other
    exception deletes the URL and re-raises. The MongoDB client is always
    closed — the original code leaked it on every timeout path because
    only the success path and the generic-exception path closed it.
    """
    client = mongodb.create_client()
    database = mongodb.get_database(crawler_settings.get('limit_domain'),
                                    client)
    try:
        # Respect the per-server frequency limit; postpone if not allowed.
        allowed = limiter.is_crawl_allowed(
            url, database, crawler_settings.get('frequency_per_server'))

        if not allowed:
            mongodb.set_url_for_recrawl(database, url)
            return

        url, original_url, redirected, response = get_page(
            url,
            crawler_settings.get('connect_max_timeout'),
            crawler_settings.get('read_max_timeout'))
    except requests.exceptions.ReadTimeout:
        # It also remove url from queue and set it as timeouted
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ReadTimeout: {0}'.format(url))
    except requests.exceptions.ConnectionError:
        # It also remove url from queue and set it as timeouted
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ConnectionError: {0}'.format(url))
    except requests.exceptions.ChunkedEncodingError:
        # It also remove url from queue and set it as timeouted
        mongodb.set_timeout_url(database, url)
        log.warning('(Timeout) - ChunkedEncodingError: {0}'.format(url))
    except Exception:
        # Unexpected failure: drop the URL and propagate to Celery.
        mongodb.delete_url(database, url)
        log.exception('Exception: {0}'.format(url))
        raise
    else:
        _handle_response(database, url, original_url, redirected, response,
                         depth,
                         crawler_settings.get('max_depth'),
                         crawler_settings.get('limit_domain'),
                         crawler_settings.get('blacklist'),
                         ignore_blacklist)
    finally:
        # Single close point for every exit path (fix for client leak).
        client.close()
def datamining():
    # Assemble the JSON payload for the status dashboard: current pipeline
    # stage, stage/total timings, and crawler/pagerank/indexer progress.
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-mangled source; confirm branch structure against VCS.

    def return_time_or_none(field):
        # Strip tzinfo so arithmetic with the naive datetime.now() works.
        if field is None:
            return None
        else:
            return field.replace(tzinfo=None)

    def timedelta_to_string(timedelta):
        # Format a timedelta as e.g. "3h 25m".
        seconds = timedelta.total_seconds()
        return '{:.0f}h {:.0f}m'.format(seconds // 3600, seconds % 3600 // 60)

    def get_number_or_zero(number):
        if number is None:
            return 0
        else:
            return number

    def get_number_or_na(number):
        # NOTE(review): helper appears unused in this function.
        if number is None:
            return "N/A"
        else:
            return number

    def thousands_separator(number):
        # 1234567 -> '1 234 567'
        return '{:,}'.format(number).replace(',', ' ')

    mongodb_client = mongodb.create_client()

    # Czech display labels for each pipeline stage / terminal result.
    stages = {'finished': 'Naplánováno',
              'killed': 'Selhalo',
              'loading': 'Načítání',
              'crawler': 'Skenování',
              'indexer': 'Indexování',
              'pagerank': 'Pagerank'}

    time = datetime.now()

    stats = mongodb.get_latest_stats(mongodb_client)

    if stats is None:
        # No run recorded yet — emit placeholder values for every field.
        target_domain = "N/A"
        next_time_start = "N/A"
        stage_delta_time = "N/A"
        total_delta_time = "N/A"
        stage = stages.get('finished')
        crawler_queue_labels = ['Mimo frontu', 'Ve frontě', 'Navštíveno']
        crawler_queue_values = [0, 0, 0]
        crawler_progress_labels = ['Stránka', 'Alias', 'Soubor',
                                   'Nevalidní', 'Časový limit']
        crawler_progress_values = [0, 0, 0, 0, 0]
        pagerank_graph_deltatime = "N/A"
        pagerank_calculation_deltatime = "N/A"
        pagerank_uploading_deltatime = "N/A"
        indexer_progress = "N/A"
        number_of_domains = "N/A"
        number_of_servers = "N/A"
        number_of_urls = "N/A"
    else:
        target_domain = stats.get('limit_domain')
        result_db = stats.get('progress').get('result')
        stage_db = stats.get('progress').get('stage')

        # While running, show the current stage; otherwise the final result.
        if result_db == 'running':
            stage = stages.get(stage_db)
        else:
            stage = stages.get(result_db)

        start_time_db = return_time_or_none(stats.get('progress').get('start'))
        end_time_db = return_time_or_none(stats.get('progress').get('end'))
        crawler_start_time_db = return_time_or_none(
            stats.get('crawler').get('start'))
        crawler_end_time_db = return_time_or_none(
            stats.get('crawler').get('end'))
        pagerank_start_time_db = return_time_or_none(
            stats.get('pagerank').get('start'))
        pagerank_end_time_db = return_time_or_none(
            stats.get('pagerank').get('end'))
        indexer_start_time_db = return_time_or_none(
            stats.get('indexer').get('start'))
        indexer_end_time_db = return_time_or_none(
            stats.get('indexer').get('end'))

        run_every_n_days = next_start_each_n_days()
        time_of_next_start = start_time_db + timedelta(days=run_every_n_days)
        next_time_start = timedelta_to_string(time_of_next_start - time)

        # Elapsed time of the currently active stage: pick the start time of
        # the deepest stage that has begun but not yet finished.
        if start_time_db is None:
            stage_delta_time = "N/A"
        else:
            if crawler_start_time_db is None:
                stage_delta_time = timedelta_to_string(time - start_time_db)
            else:
                if crawler_end_time_db is None:
                    stage_delta_time = timedelta_to_string(
                        time - crawler_start_time_db)
                else:
                    if pagerank_end_time_db is None:
                        stage_delta_time = timedelta_to_string(
                            time - pagerank_start_time_db)
                    else:
                        # NOTE(review): if indexer has finished but the run
                        # has no end time yet, stage_delta_time stays unset
                        # here — possible NameError window; confirm.
                        if indexer_end_time_db is None:
                            stage_delta_time = timedelta_to_string(
                                time - indexer_start_time_db)

        # Total runtime: ongoing runs measure against now; finished runs
        # also blank out the per-stage timer.
        if end_time_db is None:
            total_delta_time = time - start_time_db
        else:
            total_delta_time = end_time_db - start_time_db
            stage_delta_time = "N/A"

        total_delta_time = timedelta_to_string(total_delta_time)

        if crawler_start_time_db is None:
            # Crawler has not started: zeroed charts and N/A counters.
            crawler_queue_labels = ['Mimo frontu', 'Ve frontě', 'Navštíveno']
            crawler_queue_values = [0, 0, 0]
            crawler_progress_labels = ['Stránka', 'Alias', 'Soubor',
                                       'Nevalidní', 'Časový limit']
            crawler_progress_values = [0, 0, 0, 0, 0]
            number_of_domains = "N/A"
            number_of_servers = "N/A"
            number_of_urls = "N/A"
        else:
            crawler_progress_db = stats.get('crawler').get('progress')

            crawler_queue_labels = ['Mimo frontu', 'Ve frontě', 'Navštíveno']
            visited = get_number_or_zero(
                crawler_progress_db.get('urls_visited'))
            queued = get_number_or_zero(
                crawler_progress_db.get('urls_queued'))
            not_queued = get_number_or_zero(
                crawler_progress_db.get('urls_not_queued'))
            crawler_queue_values = [not_queued, queued, visited]

            crawler_progress_labels = ['Stránka', 'Alias', 'Soubor',
                                       'Nevalidní', 'Časový limit']
            timeout = get_number_or_zero(
                crawler_progress_db.get('timeout_count'))
            invalid = get_number_or_zero(
                crawler_progress_db.get('invalid_count'))
            files = get_number_or_zero(
                crawler_progress_db.get('files_count'))
            aliases = get_number_or_zero(
                crawler_progress_db.get('aliases_count'))
            # Plain pages = everything visited minus the special categories.
            pages = visited - timeout - invalid - files - aliases
            number_of_domains = get_number_or_zero(
                crawler_progress_db.get('number_of_domains'))
            number_of_servers = get_number_or_zero(
                crawler_progress_db.get('number_of_servers'))
            number_of_urls = thousands_separator(get_number_or_zero(
                crawler_progress_db.get('urls_count')))
            crawler_progress_values = [pages, aliases, files, invalid, timeout]

        pagerank_progress_db = stats.get('pagerank').get('progress')

        if pagerank_progress_db is None:
            pagerank_graph_deltatime = "N/A"
            pagerank_calculation_deltatime = "N/A"
            pagerank_uploading_deltatime = "N/A"
        else:
            # Each phase duration = next phase's start - this phase's start;
            # an unfinished phase measures against "now" instead.
            pagerank_graph_starttime = return_time_or_none(
                pagerank_progress_db.get('building_graph'))
            pagerank_calculation_starttime = return_time_or_none(
                pagerank_progress_db.get('calculation'))
            pagerank_uploading_starttime = return_time_or_none(
                pagerank_progress_db.get('uploading'))

            if pagerank_calculation_starttime is not None:
                pagerank_graph_deltatime = timedelta_to_string(
                    pagerank_calculation_starttime - pagerank_start_time_db)
            else:
                pagerank_graph_deltatime = timedelta_to_string(
                    time - pagerank_start_time_db)

            if pagerank_uploading_starttime is not None:
                pagerank_calculation_deltatime = timedelta_to_string(
                    pagerank_uploading_starttime -
                    pagerank_calculation_starttime)
            else:
                if pagerank_calculation_starttime is None:
                    pagerank_calculation_deltatime = "N/A"
                else:
                    pagerank_calculation_deltatime = timedelta_to_string(
                        time - pagerank_calculation_starttime)

            if pagerank_end_time_db is not None:
                pagerank_uploading_deltatime = timedelta_to_string(
                    pagerank_end_time_db - pagerank_uploading_starttime)
            else:
                if pagerank_uploading_starttime is None:
                    pagerank_uploading_deltatime = "N/A"
                else:
                    pagerank_uploading_deltatime = timedelta_to_string(
                        time - pagerank_uploading_starttime)

        indexer_progress_db = stats.get('indexer').get('progress')

        if indexer_progress_db is None:
            indexer_progress = 0
        else:
            indexer_progress = thousands_separator(get_number_or_zero(
                indexer_progress_db.get('progress')))

    return jsonify(target_domain=target_domain,
                   stage=stage,
                   stage_delta_time=stage_delta_time,
                   total_delta_time=total_delta_time,
                   next_time_start=next_time_start,
                   crawler_progress_labels=crawler_progress_labels,
                   crawler_progress_values=crawler_progress_values,
                   crawler_queue_labels=crawler_queue_labels,
                   crawler_queue_values=crawler_queue_values,
                   indexer_progress=indexer_progress,
                   pagerank_graph_deltatime=pagerank_graph_deltatime,
                   pagerank_calculation_deltatime=pagerank_calculation_deltatime,
                   pagerank_uploading_deltatime=pagerank_uploading_deltatime,
                   number_of_domains=number_of_domains,
                   number_of_servers=number_of_servers,
                   number_of_urls=number_of_urls)
def indexer_task(crawler_settings, indexer_settings, task_id):
    """Index all crawled documents into PostgreSQL via Celery subtasks.

    Resets the temporary index tables, fans out one index_document_task per
    representative document, waits for all subtasks to leave the active
    states, then swaps the temporary tables into production and builds the
    search indexes.
    """
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    import locale
    from celery.result import AsyncResult
    from celery.states import PENDING, STARTED, RECEIVED, SUCCESS
    import time

    # Czech collation for the full-text index.
    locale.setlocale(locale.LC_ALL, 'cs_CZ.utf-8')

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)
    mongodb_batch_size = indexer_settings.get('batch_size')

    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    # Always start from clean temporary tables.
    postgresql.reset_and_init_db(postgresql_client, postgresql_cursor,
                                 postgresql_table_name,
                                 postgresql_metadata_table_name)

    tasks_list = []

    # Claim batches of unindexed documents and dispatch one subtask each.
    while True:
        document_batch = list(mongodb.get_batch_of_ids_for_indexer(
            mongodb_database, mongodb_batch_size))

        if not document_batch:
            break

        document_ids = [document.get('representative')
                        for document in document_batch]

        # Mark as indexed up front so the next batch query skips them.
        mongodb.set_documents_as_indexed(mongodb_database, document_ids)

        for document_id in document_ids:
            tasks_list.append(index_document_task.delay(
                document_id, task_id, crawler_settings, indexer_settings))

    # Poll until no subtask remains in an active state.
    active_states = (PENDING, STARTED, RECEIVED)

    while True:
        n_of_running = sum(
            1 for task in tasks_list
            if AsyncResult(task.task_id).status in active_states)

        if n_of_running == 0:
            # Report any subtask that did not finish successfully.
            for task in tasks_list:
                state = AsyncResult(task.task_id).status
                if state != SUCCESS:
                    print(state)
            break

        time.sleep(10)

    # Swap temporary tables into production and build the search indexes.
    postgresql.change_table_to_production(postgresql_client,
                                          postgresql_cursor,
                                          postgresql_table_name,
                                          postgresql_table_name_production)
    postgresql.create_psql_index(postgresql_client, postgresql_cursor,
                                 postgresql_table_name_production,
                                 'search_index', 'search_idx')
    postgresql.change_table_to_production(
        postgresql_client, postgresql_cursor,
        postgresql_metadata_table_name,
        postgresql_metadata_table_name_production)
    postgresql.create_psql_index(postgresql_client, postgresql_cursor,
                                 postgresql_metadata_table_name_production,
                                 'microformat_index', 'microformat_idx')

    postgresql_cursor.close()
    postgresql_client.close()
    mongodb_client.close()
def index_document_task(document_id, task_id, crawler_settings,
                        indexer_settings):
    """Index a single document: extract metadata, then index or copy it.

    Pages additionally have their microformat metadata extracted and
    stored. A document whose content hash matches the production row is
    copied instead of re-indexed. All failures are logged; resources are
    always released.
    """
    from upol_search_engine.db import mongodb
    from upol_search_engine.db import postgresql
    from upol_search_engine.upol_indexer import indexer
    from celery.utils.log import get_task_logger
    from upol_search_engine.upol_indexer import microformat
    from psycopg2 import IntegrityError
    import json
    import hashlib

    log = get_task_logger(__name__)

    mongodb_client = mongodb.create_client()
    mongodb_database = mongodb.get_database(
        crawler_settings.get('limit_domain'), mongodb_client)

    postgresql_client = postgresql.create_client()
    postgresql_cursor = postgresql_client.cursor()
    postgresql_table_name = indexer_settings.get('table_name')
    postgresql_table_name_production = indexer_settings.get(
        'table_name_production')
    postgresql_metadata_table_name = indexer_settings.get(
        'metadata_table_name')
    # Fix: key was misspelled 'metadatatable_name_production' (missing
    # underscore), which always returned None; other tasks in this file
    # use 'metadata_table_name_production'.
    postgresql_metadata_table_name_production = indexer_settings.get(
        'metadata_table_name_production')

    try:
        document = mongodb.get_document_by_id(mongodb_database, document_id)
        indexed_rows = []
        copied_rows = []

        does_production_exists = postgresql.test_if_table_exists(
            postgresql_client, postgresql_cursor,
            postgresql_table_name_production)

        try:
            is_file = document.get('file')

            # Metadata (pages only): extract, validate and store any
            # microformat JSON found on the page.
            if not is_file:
                soup = indexer.get_soup_from_document(document)
                metadata = microformat.find_microformat_on_page(soup)
                if metadata is not None:
                    parsed_metadata = microformat.parse_json(metadata)
                    # Stable hash for de-duplication of identical metadata.
                    metadata_hash = hashlib.sha1(
                        json.dumps(parsed_metadata,
                                   sort_keys=True).encode('utf-8')).hexdigest()
                    if microformat.validate_json_schema(parsed_metadata):
                        parsed_metadata, metadata_type = \
                            microformat.prepare_metadata_for_insert(
                                parsed_metadata)
                        try:
                            postgresql.insert_microformat(
                                postgresql_client, postgresql_cursor,
                                json.dumps(parsed_metadata), metadata_hash,
                                metadata_type, postgresql_metadata_table_name)
                        except IntegrityError:
                            log.info('METADATA duplicity: {}'.format(
                                parsed_metadata))
                    else:
                        log.info('METADATA not valid: {}'.format(
                            document.get('url')))

            # Look up the previously indexed row (if production exists).
            if does_production_exists:
                url_hash = document.get('_id')
                content_hash = document.get('content').get('hashes').get(
                    'text')
                production_document = postgresql.get_document_by_hash(
                    postgresql_client, postgresql_cursor, url_hash,
                    postgresql_table_name_production)
            else:
                production_document = None

            # Re-index when new or changed; otherwise copy the old row.
            if (production_document is None) or \
                    (production_document[10] != content_hash):
                if is_file:
                    log.info('INDEXER: Indexing document (file).')
                    # Index only pdf this time
                    if document.get('file_type') == 'pdf':
                        try:
                            row = indexer.prepare_one_file_for_index(
                                document,
                                crawler_settings.get('limit_domain'))
                        except Exception:
                            log.exception('Exception: {0}'.format(
                                document.get('url')))
                            row = None
                    else:
                        row = None
                else:
                    log.info('INDEXER: Indexing document.')
                    row = indexer.prepare_one_document_for_index(
                        document, soup, crawler_settings.get('limit_domain'))

                if row is not None:
                    indexed_rows.append(row)
            else:
                if is_file:
                    log.info('INDEXER: Coping document (file).')
                else:
                    log.info('INDEXER: Coping document.')

                copied_rows.append(production_document)
                postgresql.copy_row_from_table_to_table(
                    postgresql_client, postgresql_cursor, url_hash,
                    postgresql_table_name_production, postgresql_table_name)
        except Exception:
            log.exception('Exception: {0}'.format(document.get('url')))

        if len(indexed_rows) > 0:
            postgresql.insert_rows_into_index(postgresql_client,
                                              postgresql_cursor,
                                              indexed_rows,
                                              postgresql_table_name)

        mongodb.update_indexer_progress(
            mongodb_client, task_id, len(indexed_rows) + len(copied_rows))
    except Exception:
        log.exception('Exception: INDEXER TASK POSSIBLE FAILURE')
    finally:
        postgresql_cursor.close()
        postgresql_client.close()
        mongodb_client.close()
def main_task(self):
    """Main task of the project.

    Orchestrates one full engine run: crawl, pagerank, index. Records
    start/finish of each sub-task in MongoDB. On SoftTimeLimitExceeded the
    run is marked as killed — guarded so the handler cannot raise a
    NameError when the limit fires before the client/task_id exist.
    """
    mongodb_client = None
    task_id = None

    try:
        blacklist = urls.load_urls_from_file(settings.blacklist_path)
        seed = urls.load_urls_from_file(settings.seed_path)

        crawler_settings = {
            'limit_domain': settings.CONFIG.get('Crawler', 'limit_domain'),
            'max_depth': settings.CONFIG.getint('Crawler', 'max_depth'),
            'connect_max_timeout': settings.CONFIG.getfloat(
                'Crawler', 'connect_max_timeout'),
            'read_max_timeout': settings.CONFIG.getint(
                'Crawler', 'read_max_timeout'),
            'frequency_per_server': settings.CONFIG.getfloat(
                'Crawler', 'frequency_per_server'),
            'blacklist': blacklist
        }

        indexer_settings = {
            'batch_size': settings.CONFIG.getint('Indexer', 'batch_size'),
            'table_name': settings.CONFIG.get(
                'General', 'postgresql_table_name_tmp'),
            'table_name_production': settings.CONFIG.get(
                'General', 'postgresql_table_name'),
            'metadata_table_name': settings.CONFIG.get(
                'General', 'postgresql_metadata_table_name_tmp'),
            'metadata_table_name_production': settings.CONFIG.get(
                'General', 'postgresql_metadata_table_name')
        }

        mongodb_client = mongodb.create_client()
        task_id = self.request.id

        mongodb.insert_engine_start(mongodb_client, task_id,
                                    crawler_settings)

        # Stage 1: crawl.
        mongodb.insert_sub_task_start(mongodb_client, task_id, "crawler")
        crawler_tasks.feeder_task(
            crawler_settings=crawler_settings,
            seed=seed,
            batch_size=settings.CONFIG.getint('Crawler', 'batch_size'),
            delay_between_feeding=settings.CONFIG.getint(
                'Crawler', 'delay_between_feeding'),
            task_id=task_id)
        mongodb.insert_sub_task_finish(mongodb_client, task_id, "crawler",
                                       "finished")

        # Stage 2: pagerank.
        mongodb.insert_sub_task_start(mongodb_client, task_id, "pagerank")
        crawler_tasks.calculate_pagerank_task(crawler_settings, task_id)
        mongodb.insert_sub_task_finish(mongodb_client, task_id, "pagerank",
                                       "finished")

        # Stage 3: index.
        mongodb.insert_sub_task_start(mongodb_client, task_id, "indexer")
        indexer_tasks.indexer_task(crawler_settings, indexer_settings,
                                   task_id)
        mongodb.insert_sub_task_finish(mongodb_client, task_id, "indexer",
                                       "finished")

        mongodb.insert_engine_finish(mongodb_client, task_id, "finished")
        mongodb_client.close()
    except SoftTimeLimitExceeded:
        # The limit can fire before the client/task_id were created.
        if mongodb_client is not None and task_id is not None:
            mongodb.insert_engine_finish(mongodb_client, task_id, "killed")
        if mongodb_client is not None:
            mongodb_client.close()