def fetch_random_incident_object_ids(
        number_of_incidents: int) -> Tuple[List[ObjectId], dict]:
    """ Fetches random incident ObjectIds.

    :param number_of_incidents: The total number of unique incident ids to fetch.
    :return: A list of ObjectIds and a dictionary mapping each ObjectId to its ward.
    """
    db = next(get_db())
    incidents = set()
    object_ids_with_wards = dict()
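    # Repeatedly $sample large batches; the set de-duplicates ids across
    # iterations, so the loop ends once enough unique ids have accumulated.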
    while len(incidents) < number_of_incidents:
        incidents_cur = db['incidents'].aggregate([{
            '$sample': {
                'size': 50000
            }
        }, {
            '$project': {
                '_id': 1,
                'ward': 1
            }
        }])
        for incident in incidents_cur:
            object_ids_with_wards.update(
                {incident.get('_id'): incident.get('ward')})
            incidents.add(incident.get('_id'))

    return list(incidents), object_ids_with_wards
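Every MongoDB snippet here obtains its handle via next(get_db()). That generator is not part of these examples; a minimal sketch of what it could look like, assuming pymongo and a hypothetical settings module holding the connection details:

import pymongo


def get_db():
    """Hypothetical sketch: yield a database handle, closing the client after use."""
    # The settings.MONGO_* names are assumptions, not part of the snippets above.
    client = pymongo.MongoClient(host=settings.MONGO_HOST,
                                 username=settings.MONGO_USER,
                                 password=settings.MONGO_PASSWORD)
    try:
        yield client[settings.MONGO_DB]
    finally:
        client.close()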
def create_constraints() -> None:
    """Creates required constraints in the database
    :return:
    """
    # Get database connection
    graph_db = next(get_db())
    graph_db.run(
        "CREATE CONSTRAINT AuthorNameConstraint IF NOT EXISTS ON (n:Author) ASSERT n.name IS UNIQUE"
    )
    graph_db.run(
        "CREATE CONSTRAINT ArticleTitleConstraint IF NOT EXISTS ON (n:Article) ASSERT (n.title, n.year) "
        "IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT InproceedingsTitleConstraint IF NOT EXISTS ON (n:Inproceedings) "
        "ASSERT (n.title, n.year) IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT IncollectionTitleConstraint IF NOT EXISTS ON (n:Incollection) "
        "ASSERT (n.title, n.year) IS UNIQUE")
    graph_db.run(
        "CREATE CONSTRAINT JournalTitleConstraint IF NOT EXISTS ON (n:Journal) ASSERT n.title IS UNIQUE"
    )
    graph_db.run(
        "CREATE CONSTRAINT ConferenceConstraint IF NOT EXISTS ON (n:Conference) ASSERT n.title IS UNIQUE"
    )
    graph_db.run(
        "CREATE CONSTRAINT BookTitleConstraint IF NOT EXISTS ON (n:Book) ASSERT n.title IS UNIQUE"
    )
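The ON ... ASSERT spelling above is the Neo4j 4.x constraint syntax, which Neo4j 5 removed. A sketch of the first constraint in the newer FOR ... REQUIRE form, in case the database is ever upgraded:

def create_constraints_neo4j5() -> None:
    """Hypothetical Neo4j 5 variant of the Author constraint above."""
    graph_db = next(get_db())
    graph_db.run(
        "CREATE CONSTRAINT AuthorNameConstraint IF NOT EXISTS "
        "FOR (n:Author) REQUIRE n.name IS UNIQUE")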
def import_abandoned_vehicles(input_file: str) -> None:
    """ Import the requests for abandoned vehicles to the database.

    :param input_file: The file from which to load the requests for abandoned vehicles.
    """
    print("Getting requests for abandoned vehicles")
    db = next(get_db())

    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'license_plate', 'vehicle_make_model',
        'vehicle_color', 'current_activity', 'most_recent_action',
        'days_of_report_as_parked', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'ssa', 'latitude', 'longitude', 'geo_location',
        'historical_wards_03_15', 'zip_codes', 'community_areas',
        'census_tracts', 'wards'
    ]
    input_df = __dataframe_normalization__(input_df, 'ABANDONED_VEHICLE')
    df_docs = input_df.to_dict(orient='records')
    docs = []
    for df_doc in df_docs:
        docs.append({k: v for k, v in df_doc.items() if v is not None})
    db['incidents'].insert_many(docs)
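Every importer pipes its frame through __dataframe_normalization__, which is not among these examples. A plausible minimal sketch, assuming its job is to stamp the request type and parse the date columns (both assumptions):

import pandas as pd


def __dataframe_normalization__(input_df: pd.DataFrame,
                                request_type: str) -> pd.DataFrame:
    """Hypothetical sketch of the helper the importers above assume."""
    input_df = input_df.copy()
    # Stamp a stable, enum-like service request type.
    input_df['type_of_service_request'] = request_type
    for col in ('creation_date', 'completion_date'):
        parsed = pd.to_datetime(input_df[col], errors='coerce')
        # Keep missing dates as None so the insert-time filter drops them.
        input_df[col] = parsed.astype(object).where(parsed.notna(), None)
    return input_df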
def clean_database() -> None:
    """Cleans up existing database
    :return: None
    """
    # Get database connection
    graph_db = next(get_db())
    graph_db.delete_all()
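The graph snippets (clean_database, create_constraints, create_indices, seed_database) assume a get_db() that yields a py2neo Graph rather than a pymongo database; a hedged sketch, again with hypothetical settings names:

from py2neo import Graph


def get_db():
    """Hypothetical graph-side counterpart of the MongoDB get_db() sketch."""
    # The settings.NEO4J_* names are assumptions.
    yield Graph(settings.NEO4J_URI,
                auth=(settings.NEO4J_USER, settings.NEO4J_PASSWORD))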
def import_alley_lights_out_or_street_lights_all_out(input_file: str,
                                                     street_lights: bool):
    """ Import the requests for alley lights out or street lights all out (works the same for both of them) to the
    database.

    :param input_file: The file from which to load the requests for lights incidents.
    :param street_lights: True when importing street lights all out, False for alley lights out.
    """
    db = next(get_db())

    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'latitude', 'longitude', 'geo_location',
        'historical_wards_03_15', 'zip_codes', 'community_areas',
        'census_tracts', 'wards'
    ]
    if street_lights:
        print("Getting requests for street lights all out")
        input_df = __dataframe_normalization__(input_df, 'STREET_ALL_LIGHTS')
    else:
        print("Getting requests for alley lights out")
        input_df = __dataframe_normalization__(input_df, 'ALLEY_LIGHTS')
    df_docs = input_df.to_dict(orient='records')
    docs = []
    for df_doc in df_docs:
        docs.append({k: v for k, v in df_doc.items() if v is not None})
    db['incidents'].insert_many(docs)
def import_rodent_baiting(input_file: str) -> None:
    """ Import the requests for rodent baiting to the database.

    :param input_file: The file from which to load the requests for rodent baiting.
    """
    print("Getting requests for rodent baiting")
    db = next(get_db())

    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})

    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'number_of_premises_baited',
        'number_of_premises_w_garbage', 'number_of_premises_w_rats',
        'current_activity', 'most_recent_action', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'latitude', 'longitude', 'geo_location',
        'historical_wards_03_15', 'zip_codes', 'community_areas',
        'census_tracts', 'wards'
    ]
    input_df = __dataframe_normalization__(input_df, 'RODENT_BAITING')
    df_docs = input_df.to_dict(orient='records')
    docs = []
    for df_doc in df_docs:
        docs.append({k: v for k, v in df_doc.items() if v is not None})
    db['incidents'].insert_many(docs)
def test_get_conn_fails_to_connect_with_wrong_host(client: TestClient) -> None:
    """Connecting to a bogus host should raise a server selection timeout."""
    settings.MONGO_HOST = 'FAKE_HOST'

    db_connection = next(get_db())
    cur = db_connection['test_collection'].find({})
    with pytest.raises(errors.ServerSelectionTimeoutError):
        list(cur)
def test_get_conn_fails_to_connect_with_wrong_user(client: TestClient) -> None:
    """Connecting with wrong credentials should raise an authentication failure."""
    settings.MONGO_USER = '******'
    settings.MONGO_PASSWORD = '******'

    db_connection = next(get_db())
    cur = db_connection['test_collection'].find({})
    with pytest.raises(errors.OperationFailure):
        list(cur)
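Both failure tests overwrite module-level settings and never restore them; a hypothetical autouse fixture like the one below would keep the mutation from leaking into later tests:

import pytest


@pytest.fixture(autouse=True)
def _restore_mongo_settings():
    """Hypothetical fixture: snapshot and restore the mutated settings."""
    saved = (settings.MONGO_HOST, settings.MONGO_USER, settings.MONGO_PASSWORD)
    yield
    settings.MONGO_HOST, settings.MONGO_USER, settings.MONGO_PASSWORD = saved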
def create_indexes() -> None:
    """ Creates the required indexes on the incidents and citizens collections.

    :return: None
    """
    db = next(get_db())
    db['incidents'].create_index([('type_of_service_request',
                                   pymongo.ASCENDING)])
    db['incidents'].create_index([('creation_date', pymongo.ASCENDING)])
    db['incidents'].create_index([('type_of_service_request',
                                   pymongo.ASCENDING),
                                  ('creation_date', pymongo.ASCENDING)])
    db['incidents'].create_index([('geo_location', pymongo.GEOSPHERE)])
    db['incidents'].create_index([('total_votes', pymongo.ASCENDING)])

    db['citizens'].create_index([('total_votes', pymongo.ASCENDING)])
    db['citizens'].create_index([('total_wards', pymongo.ASCENDING)])
    db['citizens'].create_index([('telephone_number', pymongo.ASCENDING)])
    db['citizens'].create_index([('name', pymongo.ASCENDING)])
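The GEOSPHERE index is what enables geospatial lookups on geo_location. A hedged example of a query it supports, assuming geo_location holds GeoJSON points (the raw portal column may need converting first):

def find_incidents_near(db, longitude: float, latitude: float,
                        max_distance_m: int = 500):
    """Return incidents within max_distance_m metres of a point."""
    return db['incidents'].find({
        'geo_location': {
            '$near': {
                '$geometry': {'type': 'Point',
                              'coordinates': [longitude, latitude]},
                '$maxDistance': max_distance_m,
            }
        }
    })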
def create_up_votes() -> None:
    """ Routine that casts citizen vote to random incidents in order to populate the database with up-votes data

    :return: None
    """
    incident_ids, object_ids_with_wards = fetch_random_incident_object_ids(
        number_of_incidents=2000000)
    citizens = create_rng_citizens(number_of_citizens=NUMBER_OF_CITIZENS)
    votes_list = list_random_chunks(elements_list=incident_ids,
                                    number_of_chunks=NUMBER_OF_CITIZENS)

    # Assign votes to citizens
    for citizen, citizen_votes in zip(citizens, votes_list):
        wards = set()
        for vote in citizen_votes:
            ward = object_ids_with_wards.get(vote)
            if ward:
                wards.add(ward)

        citizen.update({
            'voted_incidents': list(citizen_votes),
            'total_votes': len(citizen_votes),
            'wards': list(wards),
            'total_wards': len(wards),
        })

    db = next(get_db())
    db['citizens'].insert_many(citizens)

    # Fetch citizen data
    citizens_docs = db['citizens'].find({})

    # Create a dictionary that associates Incident ObjectIds with Citizen ObjectIds
    votes_per_incident = dict()
    for citizen_doc in citizens_docs:
        for voted_incident in citizen_doc['voted_incidents']:
            votes_per_incident.setdefault(voted_incident,
                                          []).append(citizen_doc['_id'])

    # Assign votes to incidents
    for incident_id, citizens_ids in votes_per_incident.items():
        db['incidents'].update_many(
            {'_id': incident_id},
            {'$set': {
                'total_votes': len(citizens_ids),
                'voted_by': citizens_ids,
            }},
            upsert=False)
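create_up_votes depends on two helpers that are not among these examples. Minimal hypothetical sketches inferred from the call sites (document fields are assumptions):

import random


def create_rng_citizens(number_of_citizens: int) -> list:
    """Hypothetical sketch: generate bare-bones citizen documents."""
    return [{'name': f'citizen_{i}',
             'telephone_number': f'+1555{i:07d}'}
            for i in range(number_of_citizens)]


def list_random_chunks(elements_list: list, number_of_chunks: int) -> list:
    """Hypothetical sketch: shuffle, then deal the elements into N chunks."""
    shuffled = random.sample(elements_list, len(elements_list))
    return [shuffled[i::number_of_chunks] for i in range(number_of_chunks)]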
def create_indices() -> None:
    """Creates required indices at the database
    :return: None
    """
    # Get database connection
    graph_db = next(get_db())
    graph_db.run(
        "CREATE INDEX AuthorNameIndex IF NOT EXISTS FOR (t:Author) ON (t.name)"
    )
    graph_db.run(
        "CREATE INDEX ArticleTitleYearIndex IF NOT EXISTS FOR (t:Article) ON (t.title, t.year)"
    )
    graph_db.run(
        "CREATE INDEX InproceedingsTitleYearIndex IF NOT EXISTS FOR (t:Inproceedings) ON (t.title, t.year)"
    )
    graph_db.run(
        "CREATE INDEX IncollectionTitleYearIndex IF NOT EXISTS FOR (t:Incollection) ON (t.title, t.year)"
    )
    graph_db.run(
        "CREATE INDEX ArticleTitleIndex IF NOT EXISTS FOR (t:Article) ON (t.title)"
    )
    graph_db.run(
        "CREATE INDEX InproceedingsTitleIndex IF NOT EXISTS FOR (t:Inproceedings) ON (t.title)"
    )
    graph_db.run(
        "CREATE INDEX IncollectionTitleIndex IF NOT EXISTS FOR (t:Incollection) ON (t.title)"
    )
    graph_db.run(
        "CREATE INDEX ArticleYearIndex IF NOT EXISTS FOR (t:Article) ON (t.year)"
    )
    graph_db.run(
        "CREATE INDEX InproceedingsYearIndex IF NOT EXISTS FOR (t:Inproceedings) ON (t.year)"
    )
    graph_db.run(
        "CREATE INDEX IncollectionYearIndex IF NOT EXISTS FOR (t:Incollection) ON (t.year)"
    )
    graph_db.run(
        "CREATE INDEX JournalTitleIndex IF NOT EXISTS FOR (t:Journal) ON (t.title)"
    )
    graph_db.run(
        "CREATE INDEX ConferenceIndex IF NOT EXISTS FOR (t:Conference) ON (t.title)"
    )
    graph_db.run(
        "CREATE INDEX BookTitleIndex IF NOT EXISTS FOR (t:Book) ON (t.title)")
def import_street_lights_one_out(input_file: str) -> None:
    """ Import the requests for street lights one out to the database.

    :param input_file: The file from which to load the requests for lights incidents.
    """
    print("Getting requests for street lights one out")
    db = next(get_db())

    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})
    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'street_address', 'zip_code',
        'x_coordinate', 'y_coordinate', 'ward', 'police_district',
        'community_area', 'latitude', 'longitude', 'geo_location'
    ]
    input_df = __dataframe_normalization__(input_df, 'STREET_ONE_LIGHT')
    df_docs = input_df.to_dict(orient='records')
    docs = []
    for df_doc in df_docs:
        docs.append({k: v for k, v in df_doc.items() if v is not None})
    db['incidents'].insert_many(docs)
def import_sanitation_complaints(input_file: str) -> None:
    """ Import the requests for sanitation code complaints to the database.

    :param input_file: The file from which to load the requests for sanitation code complaints.
    """
    print("Getting requests for sanitation code complaints")
    db = next(get_db())

    input_df = pd.read_csv(input_file, sep=',').replace({np.nan: None})

    input_df.columns = [
        'creation_date', 'status', 'completion_date', 'service_request_number',
        'type_of_service_request', 'nature_of_code_violation',
        'street_address', 'zip_code', 'x_coordinate', 'y_coordinate', 'ward',
        'police_district', 'community_area', 'latitude', 'longitude',
        'geo_location', 'historical_wards_03_15', 'zip_codes',
        'community_areas', 'census_tracts', 'wards'
    ]
    input_df = __dataframe_normalization__(input_df, 'SANITATION_VIOLATION')
    df_docs = input_df.to_dict(orient='records')
    docs = []
    for df_doc in df_docs:
        docs.append({k: v for k, v in df_doc.items() if v is not None})
    db['incidents'].insert_many(docs)
def test_get_conn_(client: TestClient) -> None:
    """A fresh connection should find an empty test collection."""
    db_connection = next(get_db())
    cur = db_connection['test_collection'].find({})
    assert list(cur) == []
def seed_database() -> None:
    """Populates the database with the extracted data
    :return: None
    """
    # Get database connection
    graph_db = next(get_db())

    batch_size = 5000
    authors_iter = iter(authors_data)
    article_iter = iter(article_data)
    inproceedings_iter = iter(inproceedings_data)
    incollection_iter = iter(incollection_data)

    journal_iter = iter(journal_data)
    conference_iter = iter(conference_data)
    book_iter = iter(book_data)

    authors_articles_relations_iter = iter(authors_article_relations_data)
    authors_inproceedings_relations_iter = iter(
        authors_inproceedings_relations_data)
    authors_incollection_relations_iter = iter(
        authors_incollection_relations_data)

    article_journal_relations_iter = iter(article_journal_relations_data)
    inproceedings_conference_relations_iter = iter(
        inproceedings_conference_relations_data)
    incollection_book_relations_iter = iter(incollection_book_relations_data)

    # Insert all nodes first
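    # islice drains each iterator in fixed-size batches; a round in which
    # every batch comes back empty means all iterators are exhausted.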
    while True:
        authors_data_batch = list(islice(authors_iter, batch_size))
        articles_data_batch = list(islice(article_iter, batch_size))
        inproceedings_data_batch = list(islice(inproceedings_iter, batch_size))
        incollection_data_batch = list(islice(incollection_iter, batch_size))

        journal_data_batch = list(islice(journal_iter, batch_size))
        conference_data_batch = list(islice(conference_iter, batch_size))
        book_data_batch = list(islice(book_iter, batch_size))

        if authors_data_batch:
            create_nodes(graph_db.auto(),
                         data=authors_data_batch,
                         labels={"Author"})
        if articles_data_batch:
            create_nodes(graph_db.auto(),
                         data=articles_data_batch,
                         labels={"Article"})
        if inproceedings_data_batch:
            create_nodes(graph_db.auto(),
                         data=inproceedings_data_batch,
                         labels={"Inproceedings"})
        if incollection_data_batch:
            create_nodes(graph_db.auto(),
                         data=incollection_data_batch,
                         labels={"Incollection"})
        if journal_data_batch:
            create_nodes(graph_db.auto(),
                         data=journal_data_batch,
                         labels={"Journal"})
        if conference_data_batch:
            create_nodes(graph_db.auto(),
                         data=conference_data_batch,
                         labels={"Conference"})
        if book_data_batch:
            create_nodes(graph_db.auto(),
                         data=book_data_batch,
                         labels={"Book"})

        if not any([
                authors_data_batch, articles_data_batch,
                inproceedings_data_batch, incollection_data_batch,
                journal_data_batch, conference_data_batch, book_data_batch
        ]):
            break

    # Continue with the relationships
    while True:
        authors_articles_relations_data_batch = list(
            islice(authors_articles_relations_iter, batch_size))
        authors_inproceedings_relations_data_batch = list(
            islice(authors_inproceedings_relations_iter, batch_size))
        authors_incollection_relations_data_batch = list(
            islice(authors_incollection_relations_iter, batch_size))

        article_journal_relations_data_batch = list(
            islice(article_journal_relations_iter, batch_size))
        inproceedings_conference_relations_data_batch = list(
            islice(inproceedings_conference_relations_iter, batch_size))
        incollection_book_relations_data_batch = list(
            islice(incollection_book_relations_iter, batch_size))

        if authors_articles_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 authors_articles_relations_data_batch,
                                 "CONTRIBUTED",
                                 start_node_key=("Author", "name"),
                                 end_node_key=("Article", "title", "year"))
        if authors_inproceedings_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 authors_inproceedings_relations_data_batch,
                                 "CONTRIBUTED",
                                 start_node_key=("Author", "name"),
                                 end_node_key=("Inproceedings", "title",
                                               "year"))
        if authors_incollection_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 authors_incollection_relations_data_batch,
                                 "CONTRIBUTED",
                                 start_node_key=("Author", "name"),
                                 end_node_key=("Incollection", "title",
                                               "year"))
        if article_journal_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 article_journal_relations_data_batch,
                                 "PUBLISHED",
                                 start_node_key=("Article", "title", "year"),
                                 end_node_key=("Journal", "title"))
        if inproceedings_conference_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 inproceedings_conference_relations_data_batch,
                                 "PUBLISHED",
                                 start_node_key=("Inproceedings", "title",
                                                 "year"),
                                 end_node_key=("Conference", "title"))
        if incollection_book_relations_data_batch:
            create_relationships(graph_db.auto(),
                                 incollection_book_relations_data_batch,
                                 "PUBLISHED",
                                 start_node_key=("Incollection", "title",
                                                 "year"),
                                 end_node_key=("Book", "title"))

        if not any([
                authors_articles_relations_data_batch,
                authors_inproceedings_relations_data_batch,
                authors_incollection_relations_data_batch,
                article_journal_relations_data_batch,
                inproceedings_conference_relations_data_batch,
                incollection_book_relations_data_batch
        ]):
            break
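Taken together, the graph snippets imply a natural load order. A hypothetical driver, assuming all four functions live in one module:

if __name__ == '__main__':
    clean_database()      # wipe any previous load
    create_constraints()  # uniqueness rules before inserting nodes
    create_indices()      # indexes for the lookups seeding relies on
    seed_database()       # nodes first, then relationships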