def fetch_random_incident_object_ids(
        number_of_incidents: int) -> Tuple[List[str], dict]:
    """ Fetches random incident ObjectIds.

    :param number_of_incidents: The number of unique incident ObjectIds to fetch.
    :return: A list of ObjectIds and a dictionary mapping each ObjectId to its ward.
    """
    db = next(get_db())
    incidents = set()
    object_ids_with_wards = dict()
    while len(incidents) < number_of_incidents:
        # $sample draws 50000 random documents per round, so the final set
        # may overshoot number_of_incidents by up to one batch
        incidents_cur = db['incidents'].aggregate([
            {'$sample': {'size': 50000}},
            {'$project': {'_id': 1, 'ward': 1}}
        ])
        for incident in incidents_cur:
            object_ids_with_wards[incident.get('_id')] = incident.get('ward')
            incidents.add(incident.get('_id'))
    return list(incidents), object_ids_with_wards
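# All of the MongoDB helpers in this module obtain a handle via
# next(get_db()), which implies a generator-style dependency. The provider is
# not shown in this section; below is a minimal hypothetical sketch, assuming
# the settings.MONGO_HOST / MONGO_USER / MONGO_PASSWORD attributes mutated in
# the connection tests further down, plus an assumed settings.MONGO_DB name.
from typing import Iterator

from pymongo import MongoClient
from pymongo.database import Database


def get_db() -> Iterator[Database]:
    client = MongoClient(host=settings.MONGO_HOST,
                         username=settings.MONGO_USER,
                         password=settings.MONGO_PASSWORD)
    try:
        yield client[settings.MONGO_DB]
    finally:
        client.close()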
def create_constraints() -> None:
    """Creates the required constraints in the database.

    :return: None
    """
    # Get database connection
    graph_db = next(get_db())
    graph_db.run(
        "CREATE CONSTRAINT AuthorNameConstraint IF NOT EXISTS ON (n:Author) "
        "ASSERT n.name IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT ArticleTitleConstraint IF NOT EXISTS ON (n:Article) "
        "ASSERT (n.title, n.year) IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT InproceedingsTitleConstraint IF NOT EXISTS ON (n:Inproceedings) "
        "ASSERT (n.title, n.year) IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT IncollectionTitleConstraint IF NOT EXISTS ON (n:Incollection) "
        "ASSERT (n.title, n.year) IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT JournalTitleConstraint IF NOT EXISTS ON (n:Journal) "
        "ASSERT n.title IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT ConferenceConstraint IF NOT EXISTS ON (n:Conference) "
        "ASSERT n.title IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT BookTitleConstraint IF NOT EXISTS ON (n:Book) "
        "ASSERT n.title IS UNIQUE")
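# The graph-side functions (create_constraints, clean_database,
# create_indices, seed_database) call run/delete_all/auto on the connection
# and use the create_nodes/create_relationships bulk loaders, which matches
# py2neo's Graph API, so they presumably live in a separate module with their
# own get_db. A minimal sketch under that assumption, with hypothetical
# NEO4J_* settings:
from typing import Iterator

from py2neo import Graph


def get_db() -> Iterator[Graph]:
    yield Graph(settings.NEO4J_URI,
                auth=(settings.NEO4J_USER, settings.NEO4J_PASSWORD))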
def import_abandoned_vehicles(input_file: str) -> None:
    """ Import the requests for abandoned vehicles into the database.

    :param input_file: The file from which to load the requests for abandoned vehicles.
    """
    print("Getting requests for abandoned vehicles")
    db = next(get_db())
    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'license_plate', 'vehicle_make_model',
        'vehicle_color', 'current_activity', 'most_recent_action',
        'days_of_report_as_parked', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'ssa', 'latitude', 'longitude', 'geo_location',
        'historical_wards_03_15', 'zip_codes', 'community_areas',
        'census_tracts', 'wards'
    ]
    input_df = __dataframe_normalization__(input_df, 'ABANDONED_VEHICLE')
    df_docs = input_df.to_dict(orient='records')
    # Drop None-valued fields so they are not stored in the documents
    docs = [{k: v for k, v in df_doc.items() if v is not None}
            for df_doc in df_docs]
    db['incidents'].insert_many(docs)
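# __dataframe_normalization__ is shared by all of the importers but not shown
# in this section. A hypothetical sketch of what it plausibly does, given that
# every importer passes a request-type tag and that create_indexes below
# builds a 2dsphere index on geo_location (the real helper may differ):
def __dataframe_normalization__(df: pd.DataFrame,
                                request_type: str) -> pd.DataFrame:
    # Tag every row with a canonical request type
    df['type_of_service_request'] = request_type
    # Rebuild geo_location as a GeoJSON Point so the 2dsphere index applies
    df['geo_location'] = df.apply(
        lambda row: {'type': 'Point',
                     'coordinates': [row['longitude'], row['latitude']]}
        if row['longitude'] is not None and row['latitude'] is not None
        else None,
        axis=1)
    return df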
def clean_database() -> None:
    """Cleans up the existing database.

    :return: None
    """
    # Get database connection
    graph_db = next(get_db())
    graph_db.delete_all()
def import_alley_lights_out_or_street_lights_all_out(input_file: str,
                                                     street_lights: bool):
    """ Import the requests for alley lights out or street lights all out
    (the processing is identical for both) into the database.

    :param input_file: The file from which to load the requests for lights incidents.
    :param street_lights: Whether the method is called for street lights or not.
    """
    db = next(get_db())
    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'latitude', 'longitude', 'geo_location',
        'historical_wards_03_15', 'zip_codes', 'community_areas',
        'census_tracts', 'wards'
    ]
    if street_lights:
        print("Getting requests for street lights all out")
        input_df = __dataframe_normalization__(input_df, 'STREET_ALL_LIGHTS')
    else:
        print("Getting requests for alley lights out")
        input_df = __dataframe_normalization__(input_df, 'ALLEY_LIGHTS')
    df_docs = input_df.to_dict(orient='records')
    # Drop None-valued fields so they are not stored in the documents
    docs = [{k: v for k, v in df_doc.items() if v is not None}
            for df_doc in df_docs]
    db['incidents'].insert_many(docs)
def import_rodent_baiting(input_file: str) -> None:
    """ Import the requests for rodent baiting into the database.

    :param input_file: The file from which to load the requests for rodent baiting.
    """
    print("Getting requests for rodent baiting")
    db = next(get_db())
    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'number_of_premises_baited',
        'number_of_premises_w_garbage', 'number_of_premises_w_rats',
        'current_activity', 'most_recent_action', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'latitude', 'longitude', 'geo_location',
        'historical_wards_03_15', 'zip_codes', 'community_areas',
        'census_tracts', 'wards'
    ]
    input_df = __dataframe_normalization__(input_df, 'RODENT_BAITING')
    df_docs = input_df.to_dict(orient='records')
    docs = [{k: v for k, v in df_doc.items() if v is not None}
            for df_doc in df_docs]
    db['incidents'].insert_many(docs)
def test_get_conn_fails_to_connect_with_wrong_host(client: TestClient) -> None:
    settings.MONGO_HOST = 'FAKE_HOST'
    db_connection = next(get_db())
    cur = db_connection['test_collection'].find({})
    with pytest.raises(errors.ServerSelectionTimeoutError):
        list(cur)
def test_get_conn_fails_to_connect_with_wrong_user(client: TestClient) -> None:
    settings.MONGO_USER = '******'
    settings.MONGO_PASSWORD = '******'
    db_connection = next(get_db())
    cur = db_connection['test_collection'].find({})
    with pytest.raises(errors.OperationFailure):
        list(cur)
def create_indexes() -> None:
    """Creates the required indexes on the incidents and citizens collections."""
    db = next(get_db())
    db['incidents'].create_index([('type_of_service_request', pymongo.ASCENDING)])
    db['incidents'].create_index([('creation_date', pymongo.ASCENDING)])
    # Compound index for queries filtering on both request type and date
    db['incidents'].create_index([('type_of_service_request', pymongo.ASCENDING),
                                  ('creation_date', pymongo.ASCENDING)])
    # 2dsphere index for geospatial queries on the GeoJSON geo_location field
    db['incidents'].create_index([('geo_location', pymongo.GEOSPHERE)])
    db['incidents'].create_index([('total_votes', pymongo.ASCENDING)])
    db['citizens'].create_index([('total_votes', pymongo.ASCENDING)])
    db['citizens'].create_index([('total_wards', pymongo.ASCENDING)])
    db['citizens'].create_index([('telephone_number', pymongo.ASCENDING)])
    db['citizens'].create_index([('name', pymongo.ASCENDING)])
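# A quick way to verify that the compound index above is picked for the
# common "type of request within a date range" query shape; hypothetical
# usage, assuming creation_date is stored in a sortable format:
def explain_incident_query() -> dict:
    db = next(get_db())
    return db['incidents'].find({
        'type_of_service_request': 'ABANDONED_VEHICLE',
        'creation_date': {'$gte': '2018-01-01'}
    }).explain()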
def create_up_votes() -> None:
    """ Routine that casts citizen votes on random incidents in order to
    populate the database with up-vote data.

    :return: None
    """
    incident_ids, object_ids_with_wards = fetch_random_incident_object_ids(
        number_of_incidents=2000000)
    citizens = create_rng_citizens(number_of_citizens=NUMBER_OF_CITIZENS)
    votes_list = list_random_chunks(elements_list=incident_ids,
                                    number_of_chunks=NUMBER_OF_CITIZENS)

    # Assign votes to citizens; each citizen gets one chunk of incident ids
    for citizen, votes in zip(citizens, votes_list):
        wards = set()
        for vote in votes:
            ward = object_ids_with_wards.get(vote)
            if ward:
                wards.add(ward)
        citizen.update({
            'voted_incidents': list(votes),
            'total_votes': len(votes),
            'wards': list(wards),
            'total_wards': len(wards)
        })

    db = next(get_db())
    db['citizens'].insert_many(citizens)

    # Fetch citizen data
    citizens_docs = db['citizens'].find({})

    # Create a dictionary that associates incident ObjectIds with the
    # ObjectIds of the citizens who voted for them
    votes_per_incident = dict()
    for citizen_doc in citizens_docs:
        for voted_incident in citizen_doc['voted_incidents']:
            votes_per_incident.setdefault(voted_incident,
                                          []).append(citizen_doc['_id'])

    # Assign votes to incidents
    for incident_id, citizens_ids in votes_per_incident.items():
        db['incidents'].update_many({'_id': incident_id},
                                    {'$set': {
                                        'total_votes': len(citizens_ids),
                                        'voted_by': citizens_ids
                                    }},
                                    upsert=False)
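# create_rng_citizens and list_random_chunks are not shown in this section.
# Hypothetical sketches consistent with how create_up_votes uses them:
# citizens are plain dicts (later extended with the vote fields), and the
# incident ids are shuffled and split into one chunk per citizen.
import random
from typing import Any, List


def create_rng_citizens(number_of_citizens: int) -> List[dict]:
    # Field names follow the citizens indexes created in create_indexes
    return [{'name': f'citizen_{i}', 'telephone_number': f'555{i:07d}'}
            for i in range(number_of_citizens)]


def list_random_chunks(elements_list: List[Any],
                       number_of_chunks: int) -> List[List[Any]]:
    shuffled = elements_list[:]
    random.shuffle(shuffled)
    # Split into number_of_chunks contiguous slices of near-equal size
    chunk_size, remainder = divmod(len(shuffled), number_of_chunks)
    chunks, start = [], 0
    for i in range(number_of_chunks):
        end = start + chunk_size + (1 if i < remainder else 0)
        chunks.append(shuffled[start:end])
        start = end
    return chunks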
def create_indices() -> None:
    """Creates required indices at the database.

    :return: None
    """
    # Get database connection
    graph_db = next(get_db())
    index_statements = [
        "CREATE INDEX AuthorNameIndex IF NOT EXISTS FOR (t:Author) ON (t.name)",
        "CREATE INDEX ArticleTitleYearIndex IF NOT EXISTS FOR (t:Article) ON (t.title, t.year)",
        "CREATE INDEX InproceedingsTitleYearIndex IF NOT EXISTS FOR (t:Inproceedings) ON (t.title, t.year)",
        "CREATE INDEX IncollectionTitleYearIndex IF NOT EXISTS FOR (t:Incollection) ON (t.title, t.year)",
        "CREATE INDEX ArticleTitleIndex IF NOT EXISTS FOR (t:Article) ON (t.title)",
        "CREATE INDEX InproceedingsTitleIndex IF NOT EXISTS FOR (t:Inproceedings) ON (t.title)",
        "CREATE INDEX IncollectionTitleIndex IF NOT EXISTS FOR (t:Incollection) ON (t.title)",
        "CREATE INDEX ArticleYearIndex IF NOT EXISTS FOR (t:Article) ON (t.year)",
        "CREATE INDEX InproceedingsYearIndex IF NOT EXISTS FOR (t:Inproceedings) ON (t.year)",
        "CREATE INDEX IncollectionYearIndex IF NOT EXISTS FOR (t:Incollection) ON (t.year)",
        "CREATE INDEX JournalTitleIndex IF NOT EXISTS FOR (t:Journal) ON (t.title)",
        "CREATE INDEX ConferenceIndex IF NOT EXISTS FOR (t:Conference) ON (t.title)",
        "CREATE INDEX BookTitleIndex IF NOT EXISTS FOR (t:Book) ON (t.title)",
    ]
    for statement in index_statements:
        graph_db.run(statement)
def import_street_lights_one_out(input_file: str):
    """ Import the requests for street lights one out into the database.

    :param input_file: The file from which to load the requests for lights incidents.
    """
    print("Getting requests for street lights one out")
    db = next(get_db())
    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'latitude', 'longitude', 'geo_location'
    ]
    input_df = __dataframe_normalization__(input_df, 'STREET_ONE_LIGHT')
    df_docs = input_df.to_dict(orient='records')
    docs = [{k: v for k, v in df_doc.items() if v is not None}
            for df_doc in df_docs]
    db['incidents'].insert_many(docs)
def import_sanitation_complaints(input_file: str) -> None:
    """ Import the requests for sanitation code complaints into the database.

    :param input_file: The file from which to load the requests for sanitation code complaints.
    """
    print("Getting requests for sanitation code complaints")
    db = next(get_db())
    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'nature_of_code_violation',
        'street_address', 'zip_code', 'x_coordinate', 'y_coordinate', 'ward',
        'police_district', 'community_area', 'latitude', 'longitude',
        'geo_location', 'historical_wards_03_15', 'zip_codes',
        'community_areas', 'census_tracts', 'wards'
    ]
    input_df = __dataframe_normalization__(input_df, 'SANITATION_VIOLATION')
    df_docs = input_df.to_dict(orient='records')
    docs = [{k: v for k, v in df_doc.items() if v is not None}
            for df_doc in df_docs]
    db['incidents'].insert_many(docs)
def test_get_conn_(client: TestClient) -> None:
    db_connection = next(get_db())
    cur = db_connection['test_collection'].find({})
    assert list(cur) == []
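# The client fixture used by these tests is not shown. A minimal
# conftest-style sketch, assuming a FastAPI application object (the import
# path is hypothetical); a real fixture should also restore the settings
# mutated by the two failure tests above:
from typing import Iterator

import pytest
from fastapi.testclient import TestClient

from app.main import app  # hypothetical application module


@pytest.fixture()
def client() -> Iterator[TestClient]:
    with TestClient(app) as test_client:
        yield test_client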
def seed_database() -> None:
    """Populates the database with the extracted data.

    :return: None
    """
    # Get database connection
    graph_db = next(get_db())
    batch_size = 5000

    authors_iter = iter(authors_data)
    article_iter = iter(article_data)
    inproceedings_iter = iter(inproceedings_data)
    incollection_iter = iter(incollection_data)
    journal_iter = iter(journal_data)
    conference_iter = iter(conference_data)
    book_iter = iter(book_data)
    authors_articles_relations_iter = iter(authors_article_relations_data)
    authors_inproceedings_relations_iter = iter(
        authors_inproceedings_relations_data)
    authors_incollection_relations_iter = iter(
        authors_incollection_relations_data)
    article_journal_relations_iter = iter(article_journal_relations_data)
    inproceedings_conference_relations_iter = iter(
        inproceedings_conference_relations_data)
    incollection_book_relations_iter = iter(incollection_book_relations_data)

    # Insert all nodes first
    while True:
        authors_data_batch = list(islice(authors_iter, batch_size))
        articles_data_batch = list(islice(article_iter, batch_size))
        inproceedings_data_batch = list(islice(inproceedings_iter, batch_size))
        incollection_data_batch = list(islice(incollection_iter, batch_size))
        journal_data_batch = list(islice(journal_iter, batch_size))
        conference_data_batch = list(islice(conference_iter, batch_size))
        book_data_batch = list(islice(book_iter, batch_size))

        if authors_data_batch:
            create_nodes(graph_db.auto(), data=authors_data_batch,
                         labels={"Author"})
        if articles_data_batch:
            create_nodes(graph_db.auto(), data=articles_data_batch,
                         labels={"Article"})
        if inproceedings_data_batch:
            create_nodes(graph_db.auto(), data=inproceedings_data_batch,
                         labels={"Inproceedings"})
        if incollection_data_batch:
            create_nodes(graph_db.auto(), data=incollection_data_batch,
                         labels={"Incollection"})
        if journal_data_batch:
            create_nodes(graph_db.auto(), data=journal_data_batch,
                         labels={"Journal"})
        if conference_data_batch:
            create_nodes(graph_db.auto(), data=conference_data_batch,
                         labels={"Conference"})
        if book_data_batch:
            create_nodes(graph_db.auto(), data=book_data_batch,
                         labels={"Book"})

        if not any([
                authors_data_batch, articles_data_batch,
                inproceedings_data_batch, incollection_data_batch,
                journal_data_batch, conference_data_batch, book_data_batch
        ]):
            break

    # Continue with the relationships
    while True:
        authors_articles_relations_data_batch = list(
            islice(authors_articles_relations_iter, batch_size))
        authors_inproceedings_relations_data_batch = list(
            islice(authors_inproceedings_relations_iter, batch_size))
        authors_incollection_relations_data_batch = list(
            islice(authors_incollection_relations_iter, batch_size))
        article_journal_relations_data_batch = list(
            islice(article_journal_relations_iter, batch_size))
        inproceedings_conference_relations_data_batch = list(
            islice(inproceedings_conference_relations_iter, batch_size))
        incollection_book_relations_data_batch = list(
            islice(incollection_book_relations_iter, batch_size))

        if authors_articles_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 authors_articles_relations_data_batch,
                                 "CONTRIBUTED",
                                 start_node_key=("Author", "name"),
                                 end_node_key=("Article", "title", "year"))
        if authors_inproceedings_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 authors_inproceedings_relations_data_batch,
                                 "CONTRIBUTED",
                                 start_node_key=("Author", "name"),
                                 end_node_key=("Inproceedings", "title", "year"))
        if authors_incollection_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 authors_incollection_relations_data_batch,
                                 "CONTRIBUTED",
                                 start_node_key=("Author", "name"),
                                 end_node_key=("Incollection", "title", "year"))
        if article_journal_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 article_journal_relations_data_batch,
                                 "PUBLISHED",
                                 start_node_key=("Article", "title", "year"),
                                 end_node_key=("Journal", "title"))
        if inproceedings_conference_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 inproceedings_conference_relations_data_batch,
                                 "PUBLISHED",
                                 start_node_key=("Inproceedings", "title", "year"),
                                 end_node_key=("Conference", "title"))
        if incollection_book_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 incollection_book_relations_data_batch,
                                 "PUBLISHED",
                                 start_node_key=("Incollection", "title", "year"),
                                 end_node_key=("Book", "title"))

        if not any([
                authors_articles_relations_data_batch,
                authors_inproceedings_relations_data_batch,
                authors_incollection_relations_data_batch,
                article_journal_relations_data_batch,
                inproceedings_conference_relations_data_batch,
                incollection_book_relations_data_batch
        ]):
            break