def get_content_chunk():
    """
    Hand out metadata for the first number_of_chunks crawled chunks.

    Each chunk that is handed out is also inserted into the index builder
    relation with task 'building' for the requesting host.

    :return: JSON list of {'chunk_id', 'host'} dictionaries, empty list if
             there are no crawled chunks
    """
    crawled = db_manager.get_first_n_crawled_chunks(number_of_chunks)
    print(crawled)

    # Nothing crawled yet: answer with an empty list
    if len(crawled) == 0:
        return jsonify([])

    metadata = []
    for record in crawled:
        metadata.append({
            'chunk_id': record['chunk_id'],
            'host': record['c_host'],
        })
        # Register the chunk with the requesting index builder
        db_manager.operate_on_index_builder_relation(
            'INSERT',
            record['chunk_id'],
            host=request.remote_addr,
            task='building')

    return jsonify(metadata)
def set_content_chunk_state():
    """
    Set content chunk state.

    Only the 'crawled' state is handled: every link of the chunk and the
    crawler's task for that chunk are switched to 'crawled'.

    :return: 201 if successful, 400 if fail
    """
    message = request.get_json()

    # Any state other than 'crawled' is rejected up front
    if message['state'] != "crawled":
        return jsonify({'message': 'There is no state available'}), 400

    chunk_id = message['chunk_id']

    # Mark every link that belongs to the chunk as crawled
    for entry in db_manager.get_links_for_chunk_id(chunk_id=chunk_id):
        db_manager.operate_on_link_relation(
            'UPDATE_STATE', link=entry['link'], state='crawled')

    # Mark the crawler's task for this chunk as crawled
    db_manager.operate_on_crawler_relation(
        'UPDATE_TASK', chunk_id=chunk_id, task='crawled')

    return jsonify({'message': 'Successfully updated state to crawled'}), 201
def set_health():
    """
    Store the health status reported for a component; called by watchdogs

    The request body must carry the keys 'host' and 'status'.

    :return: None
    """
    report = request.get_json()
    host, status = report['host'], report['status']

    # Persist the reported health in the host relation
    db_manager.operate_on_host_relation('UPDATE_HEALTH',
                                        host=host,
                                        health=status)
def set_component_state():
    """
    Set component state

    The requesting host (request.remote_addr) is moved to the state given in
    the request body. Accepted states are 'online', 'waiting', 'error' and
    'paused'.

    :return: 201 if success, 400 if fail
    """
    accepted_states = ('online', 'waiting', 'error', 'paused')

    message = request.get_json()
    # A missing body or a missing 'state' key counts as a failed request
    state = message.get('state') if isinstance(message, dict) else None

    if state in accepted_states:
        db_manager.operate_on_host_relation('UPDATE_STATE',
                                            host=request.remote_addr,
                                            state=state)
        response = {
            'message': 'Successfully set state to {0}'.format(state)
        }
        return jsonify(response), 201

    # Unknown state: report the failure with 400, as documented
    # (this branch previously answered 201).
    response = {'message': 'Failed to set state'}
    return jsonify(response), 400
def add_links():
    """
    Insert the links from the request body into the link relation

    :return: 201 if successful, 200 if no links to add
    """
    payload = request.get_json()
    new_links = payload['links']

    # Empty list: nothing to insert
    if new_links == []:
        return jsonify({'message': 'There are no links to add'}), 200

    for link in new_links:
        db_manager.operate_on_link_relation('INSERT', link=link)

    return jsonify({'message': 'Successfully added links to database'}), 201
def main():
    """
    Start one WatchDog per batch of up to 10 hosts from the host relation.

    WatchDogs are numbered from 1 in batch order and are only started once
    all of them have been created.

    :return: None
    """
    batch_size = 10
    dm_address = '0.0.0.0:5000'

    host_records = db_manager.get_relation('host')
    total = len(host_records)

    watchdogs = []
    # Walk the hosts in steps of batch_size; the last batch may be shorter
    for number, start in enumerate(range(0, total, batch_size), start=1):
        stop = min(start + batch_size, total)
        batch = [host_records[pos]['host'] for pos in range(start, stop)]
        watchdogs.append(WatchDog(number, dm_address, batch))

    for watchdog in watchdogs:
        watchdog.start()
def distribute_index_servers():
    """
    Spread all index servers over number_of_rows rows

    Every row first receives the same number of servers; the servers left
    over are then handed out one by one to the first rows.

    :return: List of {'current_index', 'row_num', 'row'} dictionaries, empty
             list if there are no index servers
    """
    servers = db_manager.get_all_index_servers()
    total = len(servers)
    per_row = int(total / number_of_rows)

    rows = []
    if total <= 0:
        return rows

    cursor = 0  # position of the next unassigned server
    for row_num in range(1, number_of_rows + 1):
        hosts = [servers[cursor + offset]['host'] for offset in range(per_row)]
        cursor += per_row
        rows.append({'current_index': 0, 'row_num': row_num, 'row': hosts})

    # Hand the remaining servers to the first rows, one each
    leftover = total - cursor
    for row in rows[:leftover]:
        row['row'].append(servers[cursor]['host'])
        cursor += 1

    return rows
def get_relation(relation_name):
    """
    Return every record of the named relation as JSON

    :param relation_name: Relation name
    :return: JSON response built from db_manager.get_relation()
    """
    return jsonify(db_manager.get_relation(relation_name))
def assign_index_chunk(rows, chunk_id):
    """
    Assign chunk id to one Index Server in every row, in round-robin order

    For each row the server at the row's 'current_index' receives the chunk
    and the row's 'current_index' is advanced (wrapping back to 0 after the
    last server). Rows are updated in place.

    :param rows: Array of index servers representing rows; each entry is a
                 dictionary with the keys 'current_index', 'row_num', 'row'
    :param chunk_id: Chunk ID to assign
    :return: None
    """
    for row in rows:
        servers = row['row']

        # A row without servers (e.g. fewer index servers than rows) has
        # nobody to take the chunk; skip it instead of raising IndexError.
        if not servers:
            continue

        current_index = row['current_index']
        print('current_index = {0}'.format(current_index))

        # Advance the cursor; if it passes the last element, reset to 0
        next_index = current_index + 1
        print('len = {0}'.format(len(servers)))
        if next_index >= len(servers):
            next_index = 0
        row['current_index'] = next_index

        db_manager.operate_on_index_server_relation(
            'INSERT',
            row=row['row_num'],
            chunk_id=chunk_id,
            host=servers[current_index])
        print(servers[current_index] + '\n')
def get_chunks():
    """
    Look up the chunk hosts assigned to the requesting index server

    The requester is identified by request.remote_addr.

    :return: JSON of whatever db_manager.get_chunk_hosts_for_index_servers()
             yields for the requester
    """
    return jsonify(
        db_manager.get_chunk_hosts_for_index_servers(request.remote_addr))
def get_unpropagated_chunks():
    """
    List the ids of the first number_of_chunks built chunks

    :return: JSON object with the chunk ids under the key 'chunks'
    """
    built = db_manager.get_first_n_built_chunk_ids(number_of_chunks)
    return jsonify(chunks=[record['chunk_id'] for record in built])
def set_index_chunk_state():
    """
    Set the given state for index chunk

    Only the 'built' state is handled: the index builder task of the
    requesting host is updated and the chunk is then assigned to the index
    server rows.

    :return: 201 if successful, 400 if fail
    """
    message = request.get_json()

    # Any state other than 'built' is rejected up front
    if message['state'] != "built":
        return jsonify({'message': 'There is no state available'}), 400

    # Record that the requesting index builder finished this chunk
    db_manager.operate_on_index_builder_relation(
        'UPDATE_TASK',
        chunk_id=message['chunk_id'],
        host=request.remote_addr,
        task=message['state'])

    # With the index built, hand the chunk over to the index servers
    assign_index_chunk(rows, message['chunk_id'])

    return jsonify({'message': 'Successfully updated state to built'}), 201
def set_link_state():
    """
    Set link state

    - 'crawled': the link is marked as crawled.
    - 'error': the link is marked as failed and a replacement pending link
      is moved to 'crawling' and attached to the failed link's chunk id.

    :return: 201 if success, a new pending link if the link errored,
             400 if the state is not recognised
    """
    message = request.get_json()
    state = message['state']

    if state == 'crawled':
        db_manager.operate_on_link_relation('UPDATE_STATE', message['link'],
                                            state='crawled')
        response = {'message': 'Successfully set state to crawled'}
        return jsonify(response), 201

    if state == 'error':
        # Update state to error
        db_manager.operate_on_link_relation('UPDATE_STATE', message['link'],
                                            state='error')

        # Find the chunk id for that link
        # NOTE(review): assumes the link is known to the database; an empty
        # result raises IndexError here - confirm against DatabaseManager.
        chunk_id = db_manager.get_chunk_id_for_link(
            message['link'])[0]['chunk_id']

        # Get a new pending link
        # NOTE(review): assumes at least one pending link exists - confirm.
        new_link = db_manager.get_first_n_pending_links(1)[0]['link']

        # Update link state to 'crawling'
        db_manager.operate_on_link_relation('UPDATE_STATE', link=new_link,
                                            state='crawling')

        # Update link's chunk id
        db_manager.operate_on_link_relation('UPDATE_CHUNK_ID', link=new_link,
                                            chunk_id=chunk_id)

        return jsonify(link=new_link)

    # Unrecognised state: answer with 400 rather than 201, matching the
    # other state endpoints.
    response = {'message': 'Failed to set state'}
    return jsonify(response), 400
def get_map():
    """
    Build a map of the index servers and the chunks assigned to them

    The map holds one list per row (number_of_rows in total). A server is
    only listed when a chunk lookup for its host returns something.

    :return: List of rows, each a list of {'host', 'chunk_id'} dictionaries
    """
    index_servers = db_manager.get_all_index_servers()
    grid = [[] for _ in range(number_of_rows)]

    for server in index_servers:
        assignment = db_manager.get_chunk_ids_for_index_server(
            host=server['host'])

        # Servers without any assignment are left out of the map
        if len(assignment) == 0:
            continue

        # Rows are numbered from 1, the map is indexed from 0
        grid[assignment['row'] - 1].append({
            'host': assignment['host'],
            'chunk_id': assignment['chunk_ids']
        })

    return grid
def test_chunk_operation():
    """
    Exercise INSERT and DELETE on the chunk relation and print the verdicts.

    :return: None
    """
    dbm = DatabaseManager()
    initial_count = dbm.get_relation_length('chunk')

    def report(passed, operation):
        outcome = 'PASSED' if passed else 'FAILED'
        print('> {0} | {1} | Chunk relation'.format(outcome, operation))

    # INSERT: the new chunk must be the last record
    dbm.operate_on_chunk_relation('INSERT', chunk_id='101c')
    report(dbm.get_relation('chunk')[-1]['id'] == '101c', 'INSERT')

    # DELETE: the relation must be back to its initial length
    dbm.operate_on_chunk_relation('DELETE', chunk_id='101c')
    report(dbm.get_relation_length('chunk') == initial_count, 'DELETE')
def test_get_first_n_pending_links():
    """
    Check that get_first_n_pending_links(1) yields exactly one record after
    a link has been inserted, and print the verdict.

    :return: None
    """
    dbm = DatabaseManager()

    # Arrange: make sure there is a link to find
    dbm.operate_on_link_relation('INSERT', link='test_link')

    # Act and assert
    found = dbm.get_first_n_pending_links(1)
    outcome = 'PASSED' if len(found) == 1 else 'FAILED'
    print('> {0} | get_first_n_pending_links()'.format(outcome))

    # Tidy up the link inserted above
    dbm.operate_on_link_relation('DELETE', link='test_link')
def get_links():
    """
    Hand number_of_links pending links to the requesting Crawler

    The links are only handed out as a complete batch. A new chunk id is
    generated for the batch, recorded in the chunk and crawler relations,
    and every link is moved to 'crawling' under that chunk id.

    :return: JSON object with the links under the key 'links'; the list is
             empty when there are not enough pending links
    """
    pending = db_manager.get_first_n_pending_links(number_of_links)

    # An incomplete batch is not handed out
    if len(pending) != number_of_links:
        return jsonify(links=[])

    chunk_id = generate_chunk_id()

    # Register the chunk and the crawler working on it
    # (the crawler relation's default task is 'crawling')
    db_manager.operate_on_chunk_relation('INSERT', chunk_id=chunk_id)
    db_manager.operate_on_crawler_relation('INSERT',
                                           host=request.remote_addr,
                                           chunk_id=chunk_id)

    links = []
    for record in pending:
        link = record['link']
        links.append(link)

        # Move the link to 'crawling' and attach it to the new chunk
        db_manager.operate_on_link_relation('UPDATE_STATE',
                                            link=link,
                                            state='crawling')
        db_manager.operate_on_link_relation('UPDATE_CHUNK_ID',
                                            link=link,
                                            chunk_id=chunk_id)

    return jsonify(links=links)
def test_initial_setup():
    """
    Verify the record counts of the initial setup and print the verdicts.

    Expected counts:
        - 25 links
        - 5 chunk ids
        - 1 Crawler
        - 1 Index Builder
        - 10 Index Servers

    :return: None
    """
    dbm = DatabaseManager()

    # (how to fetch the records, expected count, label); lambdas keep every
    # lookup lazy so it runs right before its own verdict is printed
    checks = (
        (lambda: dbm.get_relation('link'), 25, '25 Links'),
        (lambda: dbm.get_relation('chunk'), 5, '5 Chunk IDs'),
        (lambda: dbm.get_all_crawlers(), 1, '1 Crawler'),
        (lambda: dbm.get_all_index_builders(), 1, '1 Index Builder'),
        (lambda: dbm.get_all_index_servers(), 10, '10 Index Servers'),
    )

    for fetch, expected, label in checks:
        outcome = 'PASSED' if len(fetch()) == expected else 'FAILED'
        print("> {0} | {1}".format(outcome, label))
else: print("> FAILED | 1 Index Builder") if len(db_manager.get_all_index_servers()) == 10: print("> PASSED | 10 Index Servers") else: print("> FAILED | 10 Index Servers") if __name__ == '__main__': """ Basic operations """ test_link_operation() test_chunk_operation() test_host_operation() test_crawler_operation() test_index_builder_operation() """ Helper functions """ test_get_relation_for_chunk_id() test_get_first_n_pending_links() test_get_first_n_crawled_chunk_ids() test_get_first_n_built_chunk_ids() test_initial_setup() db_manager = DatabaseManager() # results = db_manager.find_chunk_ids_for_index_servers('3.0.0.1') results = db_manager.get_relation('index_server') pprint(results)
def test_host_operation():
    """
    Exercise INSERT, UPDATE_STATE, UPDATE_HEALTH and DELETE on the host
    relation and print the verdicts.

    :return: None
    """
    dbm = DatabaseManager()
    test_host = '101.101.101.101:101'
    initial_count = dbm.get_relation_length('host')

    def report(passed, operation):
        outcome = 'PASSED' if passed else 'FAILED'
        print('> {0} | {1} | Host relation'.format(outcome, operation))

    def newest():
        # Last record of the host relation, read fresh on every call
        return dbm.get_relation('host')[-1]

    # INSERT: a new host is expected to start out 'offline'
    dbm.operate_on_host_relation('INSERT', host=test_host, type='Crawler')
    report(newest()['host'] == test_host and newest()['state'] == 'offline',
           'INSERT')

    # UPDATE STATE
    dbm.operate_on_host_relation('UPDATE_STATE', host=test_host,
                                 state='online')
    report(newest()['state'] == 'online', 'UPDATE_STATE')

    # UPDATE HEALTH
    dbm.operate_on_host_relation('UPDATE_HEALTH', host=test_host,
                                 health='healthy')
    report(newest()['health'] == 'healthy', 'UPDATE_HEALTH')

    # DELETE: the relation must be back to its initial length
    dbm.operate_on_host_relation('DELETE', host=test_host)
    report(dbm.get_relation_length('host') == initial_count, 'DELETE')
def test_get_first_n_built_chunk_ids():
    """
    Check that get_first_n_built_chunk_ids(1) yields exactly one record
    after a chunk with task 'built' has been inserted, and print the verdict.

    :return: None
    """
    dbm = DatabaseManager()
    chunk_id = '101c'
    host = '101.101.101.101:101'

    # Arrange: a chunk, a host and an index builder entry marked 'built'
    dbm.operate_on_chunk_relation('INSERT', chunk_id=chunk_id)
    dbm.operate_on_host_relation('INSERT', host=host, type='Test Server')
    dbm.operate_on_index_builder_relation('INSERT', chunk_id=chunk_id,
                                          host=host, task='built')

    # Act and assert
    found = dbm.get_first_n_built_chunk_ids(1)
    outcome = 'PASSED' if len(found) == 1 else 'FAILED'
    print('> {0} | get_first_n_built_chunk_ids()'.format(outcome))

    # Tidy up everything inserted above
    dbm.operate_on_chunk_relation('DELETE', chunk_id=chunk_id)
    dbm.operate_on_host_relation('DELETE', host=host)
    dbm.operate_on_index_builder_relation('DELETE', chunk_id=chunk_id)
def test_get_relation_for_chunk_id():
    """
    Check that get_relation_for_chunk_id('crawler', ...) yields exactly one
    record for a freshly inserted crawler entry, and print the verdict.

    :return: None
    """
    dbm = DatabaseManager()
    chunk_id = '101c'
    host = '101.101.101.101:101'

    # Arrange: a chunk, a host and a crawler entry linking the two
    dbm.operate_on_chunk_relation('INSERT', chunk_id=chunk_id)
    dbm.operate_on_host_relation('INSERT', host=host, type='Test Server')
    dbm.operate_on_crawler_relation('INSERT', chunk_id=chunk_id, host=host)

    # Act and assert
    found = dbm.get_relation_for_chunk_id('crawler', chunk_id=chunk_id)
    outcome = 'PASSED' if len(found) == 1 else 'FAILED'
    print('> {0} | get_relation_for_chunk_id()'.format(outcome))

    # Tidy up everything inserted above
    dbm.operate_on_chunk_relation('DELETE', chunk_id=chunk_id)
    dbm.operate_on_host_relation('DELETE', host=host)
    dbm.operate_on_crawler_relation('DELETE', chunk_id=chunk_id)
def test_index_server_operation():
    """
    Exercise INSERT, UPDATE_ROW, UPDATE_CHUNK_ID, UPDATE_HOST and DELETE on
    the index server relation and print the verdicts.

    :return: None
    """
    dbm = DatabaseManager()
    first_host = '101.101.101.101:101'
    second_host = '101.101.101.101:102'
    initial_count = dbm.get_relation_length('index_server')

    def report(passed, operation):
        outcome = 'PASSED' if passed else 'FAILED'
        print('> {0} | {1} | Index Server relation'.format(outcome, operation))

    def newest():
        # Last record of the index server relation, read fresh on every call
        return dbm.get_relation('index_server')[-1]

    # Temporary chunks and hosts the index server records can refer to
    dbm.operate_on_chunk_relation('INSERT', chunk_id='101c')
    dbm.operate_on_chunk_relation('INSERT', chunk_id='102c')
    dbm.operate_on_host_relation('INSERT', host=first_host,
                                 type='Index Server')
    dbm.operate_on_host_relation('INSERT', host=second_host,
                                 type='Index Server')

    # INSERT
    dbm.operate_on_index_server_relation('INSERT', row=101, chunk_id='101c',
                                         host=first_host)
    report(newest()['chunk_id'] == '101c' and newest()['row'] == 101,
           'INSERT')

    # UPDATE ROW
    dbm.operate_on_index_server_relation('UPDATE_ROW', row=102,
                                         chunk_id='101c', host=first_host)
    report(newest()['row'] == 102, 'UPDATE_ROW')

    # UPDATE CHUNK ID
    dbm.operate_on_index_server_relation('UPDATE_CHUNK_ID', chunk_id='102c',
                                         row=102, host=first_host)
    report(newest()['chunk_id'] == '102c', 'UPDATE_CHUNK_ID')

    # UPDATE HOST
    dbm.operate_on_index_server_relation('UPDATE_HOST', host=second_host,
                                         row=102, chunk_id='102c')
    report(newest()['is_host'] == second_host, 'UPDATE_HOST')

    # DELETE: the relation must be back to its initial length
    dbm.operate_on_index_server_relation('DELETE', row=102, chunk_id='102c',
                                         host=second_host)
    report(dbm.get_relation_length('index_server') == initial_count, 'DELETE')

    # Remove the temporary chunks and hosts again
    dbm.operate_on_chunk_relation('DELETE', chunk_id='101c')
    dbm.operate_on_chunk_relation('DELETE', chunk_id='102c')
    dbm.operate_on_host_relation('DELETE', host=first_host)
    dbm.operate_on_host_relation('DELETE', host=second_host)
def test_index_builder_operation():
    """
    Exercise INSERT, UPDATE_HOST, UPDATE_TASK and DELETE on the index
    builder relation and print the verdicts.

    :return: None
    """
    dbm = DatabaseManager()
    first_host = '101.101.101.101:101'
    second_host = '101.101.101.101:102'
    initial_count = dbm.get_relation_length('index_builder')

    def report(passed, operation):
        outcome = 'PASSED' if passed else 'FAILED'
        print('> {0} | {1} | Index Builder relation'.format(outcome,
                                                            operation))

    def newest():
        # Last record of the index builder relation, read fresh on every call
        return dbm.get_relation('index_builder')[-1]

    # Temporary chunk and hosts the index builder record can refer to
    dbm.operate_on_chunk_relation('INSERT', chunk_id='101c')
    dbm.operate_on_host_relation('INSERT', host=first_host,
                                 type='Index Builder')
    dbm.operate_on_host_relation('INSERT', host=second_host,
                                 type='Index Builder')

    # INSERT: without an explicit task the record is expected to be 'building'
    dbm.operate_on_index_builder_relation('INSERT', chunk_id='101c',
                                          host=first_host)
    report(newest()['chunk_id'] == '101c' and newest()['ib_task'] == 'building',
           'INSERT')

    # UPDATE HOST
    dbm.operate_on_index_builder_relation('UPDATE_HOST', chunk_id='101c',
                                          host=second_host)
    report(newest()['ib_host'] == second_host, 'UPDATE_HOST')

    # UPDATE TASK
    dbm.operate_on_index_builder_relation('UPDATE_TASK', chunk_id='101c',
                                          task='built')
    report(newest()['ib_task'] == 'built', 'UPDATE_TASK')

    # DELETE: the relation must be back to its initial length
    dbm.operate_on_index_builder_relation('DELETE', chunk_id='101c')
    report(dbm.get_relation_length('index_builder') == initial_count,
           'DELETE')

    # Remove the temporary chunk and hosts again
    dbm.operate_on_chunk_relation('DELETE', chunk_id='101c')
    dbm.operate_on_host_relation('DELETE', host=first_host)
    dbm.operate_on_host_relation('DELETE', host=second_host)
def test_link_operation():
    """
    Exercise INSERT, UPDATE_STATE, UPDATE_CHUNK_ID and DELETE on the link
    relation and print the verdicts.

    :return: None
    """
    dbm = DatabaseManager()
    test_link = 'https://www.example_101.com'
    initial_count = dbm.get_relation_length('link')

    def report(passed, operation):
        outcome = 'PASSED' if passed else 'FAILED'
        print('> {0} | {1} | Link relation'.format(outcome, operation))

    def newest():
        # Last record of the link relation, read fresh on every call
        return dbm.get_relation('link')[-1]

    # INSERT: a new link is expected to start out 'pending'
    dbm.operate_on_link_relation('INSERT', link=test_link)
    report(newest()['link'] == test_link and newest()['state'] == 'pending',
           'INSERT')

    # UPDATE STATE
    dbm.operate_on_link_relation('UPDATE_STATE', link=test_link,
                                 state='crawling')
    report(newest()['state'] == 'crawling', 'UPDATE_STATE')

    # UPDATE CHUNK ID: needs a temporary chunk to point the link at
    dbm.operate_on_chunk_relation('INSERT', chunk_id='101c')
    dbm.operate_on_link_relation('UPDATE_CHUNK_ID', link=test_link,
                                 chunk_id='101c')
    report(newest()['chunk_id'] == '101c', 'UPDATE_CHUNK_ID')
    dbm.operate_on_chunk_relation('DELETE', chunk_id='101c')

    # DELETE: the relation must be back to its initial length
    dbm.operate_on_link_relation('DELETE', link=test_link)
    report(dbm.get_relation_length('link') == initial_count, 'DELETE')
""" test_link_operation() test_chunk_operation() test_host_operation() test_crawler_operation() test_index_builder_operation() """ Helper functions """ test_get_relation_for_chunk_id() test_get_first_n_pending_links() test_get_first_n_crawled_chunk_ids() test_get_first_n_built_chunk_ids() db_manager = DatabaseManager() # pprint(db_manager.get_relation_length('link')) # # links_to_add = read_file(filename="/Users/hoanhan/ds-class/final-project-dev/mgmt/test/links.txt") # for link in links_to_add: # db_manager.operate_on_link_relation('INSERT', link=link) # pprint(links_to_add) # pprint(db_manager.get_relation('link')) # results = db_manager.find_chunk_ids_for_index_servers('3.0.0.1') # pprint(db_manager.get_relation('index_server')) # pprint(db_manager.get_all_index_servers())
#! /usr/bin/env python3

"""
    collection_interface.py - Collect stats in database and display it on a webpage
    Author:
        - Hoanh An ([email protected])
    Date: 12/3/2017
"""

from mgmt.src.database_manager import DatabaseManager
from flask import Flask, jsonify, request, render_template

app = Flask(__name__)
db_manager = DatabaseManager()


@app.route('/', methods=['GET'])
def get_overview():
    """
    Render the overview page from all relations of all chunks

    :return: The rendered 'overview.html' template, fed with the result of
             db_manager.get_all_relations_for_all_chunks() as 'data'
    """
    return render_template(
        'overview.html',
        data=db_manager.get_all_relations_for_all_chunks())


if __name__ == '__main__':
    # Run the Flask development server with its default settings
    app.run()