def clear_data(c):
    protected_files = [".gitkeep", INPUT_DATA_FILENAME]
    data_path = get_data_path()
    files_to_be_deleted = [
        (f, join(data_path, f))
        for f in os.listdir(data_path)
        if isfile(join(data_path, f)) and f not in protected_files
    ]

    for file_name, file_path in files_to_be_deleted:
        print(f"DELETING {file_name}")
        os.remove(file_path)


def write_tfidf_term_document_matrix_to_file(
    preprocessed_file_name,
    word_list_file_name,
    term_document_matrix_filename,
    vectorizer_filename,
):
    df1 = pd.read_csv(join(get_data_path(), preprocessed_file_name),
                      encoding="utf-8")
    df1 = df1[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]]

    vectorizer, term_document_matrix, word_list = create_tfidf_term_document_matrix(
        df1)

    with open(join(get_data_path(), term_document_matrix_filename),
              "wb") as output_file:
        pickle.dump(term_document_matrix, output_file)

    with open(join(get_data_path(), word_list_file_name), "wb") as output_file:
        pickle.dump(word_list, output_file)

    with open(join(get_data_path(), vectorizer_filename), "wb") as output_file:
        pickle.dump(vectorizer, output_file)
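

# create_tfidf_term_document_matrix is imported from a module not shown in this
# excerpt; a minimal sketch of a typical implementation (assumption), built on
# scikit-learn's TfidfVectorizer:
from sklearn.feature_extraction.text import TfidfVectorizer


def create_tfidf_term_document_matrix(df):
    vectorizer = TfidfVectorizer()
    # rows are documents (IATI descriptions), columns are terms
    term_document_matrix = vectorizer.fit_transform(
        df[DESCRIPTION_COLUMN_NAME].astype(str))
    # use get_feature_names() on scikit-learn < 1.0
    word_list = list(vectorizer.get_feature_names_out())
    return vectorizer, term_document_matrix, word_list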


app = Flask(__name__)
health = HealthCheck(app, "/healthcheck")
app.config["OPENAPI_URL_PREFIX"] = "/openapi"
app.config["OPENAPI_JSON_PATH"] = "openapi.json"
app.config["OPENAPI_REDOC_PATH"] = "/doc/"
app.config["OPENAPI_SWAGGER_UI_PATH"] = "/swagger/"
app.config["OPENAPI_SWAGGER_UI_VERSION"] = "3.23.11"
app.config["OPENAPI_VERSION"] = "3.0.2"
openapi = Api(app)

if environment == "production":
    app.secret_key = os.getenv("APP_SECRET_KEY")
else:
    app.secret_key = "".join(random.choice(string.ascii_lowercase) for i in range(10))

with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
    vectorizer = pickle.load(_file)

with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME), "rb") as _file:
    term_document_matrix = pickle.load(_file)

with open(join(get_data_path(), WORD2VECMODEL_FILENAME), "rb") as _file:
    # Note: this file is written elsewhere with gensim's model.save(), so
    # Word2Vec.load() is the matching loader; plain pickle.load can break for
    # larger models whose arrays gensim stores in separate .npy files.
    word_to_vec_model = pickle.load(_file)

with open(join(get_data_path(), WORD2VECAVG_FILENAME), "rb") as _file:
    word_to_vec_document_average = pickle.load(_file)

processed_iati_records = pd.read_csv(
    join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
)
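

# The API routes are not part of this excerpt; the endpoint below is a
# hypothetical sketch (not the original API) of how the loaded artifacts could
# back a TF-IDF similarity search. It assumes IATI_IDENTIFIER_COLUMN_NAME is
# imported here and that matrix rows align with processed_iati_records; a real
# endpoint would preprocess the query the same way the corpus was preprocessed.
from flask import jsonify, request
from sklearn.metrics.pairwise import cosine_similarity


@app.route("/tfidf-search")
def tfidf_search():
    query = request.args.get("q", "")
    query_vector = vectorizer.transform([query])
    scores = cosine_similarity(query_vector, term_document_matrix)[0]
    top = scores.argsort()[::-1][:10]
    results = processed_iati_records.iloc[top][IATI_IDENTIFIER_COLUMN_NAME]
    return jsonify({"results": results.tolist(),
                    "scores": scores[top].tolist()})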


def kmeans_clustering(
    term_document_matrix,
    term_dataframe,
    minimum_number_of_clusters,
    maximum_number_of_clusters,
    increment,
):
    """
    Args:
        term_document_matrix: scipy sparse matrix of term document matrix
        term_dataframe: mapping between terms and iati records
        minimum_number_of_clusters: minimum number of clusters
        maximum_number_of_clusters: maximum number of clusters
        increment: int of value increments between minimum_number_of_clusters and maximum_number_of_clusters

    Returns:
        A dictionary mapping 'number of clusters':'within cluster sum of squares'
    """
    start = time.time()

    # Check dimensions reduced to n_components
    # print(term_document_matrix.shape)

    result_dict = {}

    # Test with n clusters (n_jobs=-1 uses all processors; note the n_jobs
    # argument was removed from KMeans in scikit-learn 1.0, so drop it there)
    for n_clust in range(minimum_number_of_clusters,
                         maximum_number_of_clusters, increment):
        km = KMeans(n_clusters=n_clust, n_jobs=-1)
        start = time.time()
        clusters = km.fit(term_document_matrix)
        result_dict[n_clust] = clusters.inertia_
        end = time.time()
        print("{0} clusters: time elapsed: {1} seconds".format(
            n_clust, end - start))

        # within cluster sum of squares
        print("{0} clusters: within cluster ss: {1}".format(
            n_clust, clusters.inertia_))
        append_to_csv(
            join(get_data_path(),
                 CLUSTERING_SUM_OF_SQUARE_COMPARISON_FILENAME),
            [n_clust, clusters.inertia_],
        )

        term_dataframe.insert(term_dataframe.shape[1],
                              "cluster{0}".format(n_clust), clusters.labels_)
        # Write cluster assigned to each iati.identifier out to csv
        term_dataframe[[
            "iati.identifier", "cluster{0}".format(n_clust)
        ]].to_csv(
            join(
                get_data_path(),
                ACTIVITY_CLUSTER_ASSIGNMENT_FILENAME_CONVENTION.format(
                    n_clust),
            ),
            encoding="utf-8",
            index=False,
        )

        with open(
                join(get_data_path(),
                     CLUSTER_CENTROIDS_FILENAME_CONVENTION.format(n_clust)),
                "wb",
        ) as out:
            pickle.dump(clusters.cluster_centers_, out)

    return result_dict


# The original "def" line for this helper is missing from the excerpt; the
# name below is assumed
def get_total_number_of_records_clustered(clusters):
    return np.unique(clusters.labels_, return_counts=True)[1].sum()
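

# A minimal usage sketch (assumption: matplotlib is available) for choosing the
# number of clusters from the returned inertia values via the "elbow" method:
def plot_elbow(result_dict):
    import matplotlib.pyplot as plt

    cluster_counts = sorted(result_dict)
    plt.plot(cluster_counts, [result_dict[k] for k in cluster_counts], marker="o")
    plt.xlabel("number of clusters")
    plt.ylabel("within-cluster sum of squares")
    plt.savefig("elbow.png")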


def get_number_of_iterations_of_kmeans(clusters):
    return clusters.n_iter_


def get_number_of_records_by_organisation_by_cluster(term_dataframe, n_clust):
    return term_dataframe.groupby(
        ["participating.org..Implementing.",
         "cluster{0}".format(n_clust)]).size()


if __name__ == "__main__":
    # Import iati.identifier csv for records included in doc-term matrix
    term_dataframe = pd.read_csv(join(get_data_path(),
                                      PROCESSED_RECORDS_FILENAME),
                                 encoding="utf-8")
    term_dataframe = term_dataframe[["iati.identifier"]]

    # Import Pickle file (need both IATI dataframe and term document matrix to be read in for this script)
    with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME),
              "rb") as _file:
        term_document_matrix = pickle.load(_file)

    minimum_number_of_clusters = 10
    maximum_number_of_clusters = 15
    increment = 2

    # Apply SVD to TDM
    svd_term_document_matrix = apply_svd(term_document_matrix)

    # Run k-means across the configured range of cluster counts
    result_dict = kmeans_clustering(
        svd_term_document_matrix,
        term_dataframe,
        minimum_number_of_clusters,
        maximum_number_of_clusters,
        increment,
    )


def gather_top_results(refined_res, grouping_column, number_of_results_per_org):
    # The start of this function is missing from the excerpt; the lines below
    # are an assumed reconstruction. They rely on refined_res already carrying
    # a "myorder" organisation-ranking column and a "cosine_sim" score column.
    start_time = time.time()
    top_project_results = refined_res.groupby(grouping_column).head(
        number_of_results_per_org)

    # Order by top organisation and within each top organisation the top projects
    top_project_results = top_project_results.sort_values(
        ["myorder", "cosine_sim"], ascending=[True, False])

    top_project_results = top_project_results.drop(["myorder"], axis=1)

    print("limited after {} seconds".format(time.time() - start_time))

    return top_project_results


if __name__ == "__main__":

    full_df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME),
                          encoding="utf-8")

    cosine_res_df = pd.read_csv(join(get_data_path(), COSINE_FILENAME),
                                encoding="utf-8")

    refined_res = process_results(cosine_res_df, full_df, 100)

    refined_res = preprocessing_initial_text_clean(refined_res,
                                                   ORG_ID_COLUMN_NAME)

    refined_res = remove_white_space(refined_res, ORG_ID_COLUMN_NAME)
    refined_res = remove_white_space(refined_res, DESCRIPTION_COLUMN_NAME)

    # top results per reporting organisation
    number_of_results_per_org = 10  # value not shown in the excerpt; assumed
    top_project_results = gather_top_results(refined_res, ORG_ID_COLUMN_NAME,
                                             number_of_results_per_org)

def results_per_corpus_df(input_df, w2v_model, dim_size=300):
    results_arr = []
    # report progress every 100,000 records
    progress = set(range(10**5, 10**6, 10**5))
    for index, row in input_df.iterrows():
        results_arr.append(
            average_per_doc(row[DESCRIPTION_COLUMN_NAME], w2v_model, dim_size))
        if index in progress:
            print("processed {0} records".format(index))
    return np.array(results_arr)
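

# average_per_doc is imported from a project module not shown in this excerpt;
# a minimal sketch of a typical implementation (assumption): average the word
# vectors of the tokens that appear in the model's vocabulary.
def average_per_doc(text, w2v_model, dim_size):
    tokens = [t for t in str(text).split(" ") if t in w2v_model.wv]
    if not tokens:
        return np.zeros(dim_size)
    return np.mean([w2v_model.wv[t] for t in tokens], axis=0)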


if __name__ == "__main__":

    df1 = pd.read_csv(join(get_data_path(), PROCESSED_RECORDS_FILENAME),
                      encoding="utf-8")

    df1 = df1[[DESCRIPTION_COLUMN_NAME]]

    model = Word2Vec.load(join(get_data_path(), WORD2VECMODEL_FILENAME))

    # This takes a while
    start = time.time()
    results = results_per_corpus_df(df1, model, 300)

    with open(join(get_data_path(), WORD2VECAVG_FILENAME),
              "wb") as output_file:
        pickle.dump(results, output_file)

    print("average array created and saved in {0} seconds".format(time.time() -


import time
from os.path import join

import pandas as pd
from gensim.models import Word2Vec

# Package imports with a fallback for running the file directly; the package
# name ("src") is assumed
try:
    from src.utils import get_data_path
    from src.constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD2VECMODEL_FILENAME,
        DESCRIPTION_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD2VECMODEL_FILENAME,
        DESCRIPTION_COLUMN_NAME,
    )


def build_w2v_model(input_df, dim_size):
    vectorlist = [
        row[DESCRIPTION_COLUMN_NAME].split(" ")
        for index, row in input_df.iterrows()
    ]
    # gensim < 4.0 API: on gensim >= 4.0 the "size" argument is "vector_size"
    return Word2Vec(vectorlist, min_count=20, size=dim_size, workers=4)


if __name__ == "__main__":

    df1 = pd.read_csv(join(get_data_path(), PROCESSED_RECORDS_FILENAME),
                      encoding="utf-8")

    start = time.time()
    model = build_w2v_model(df1, 300)

    model.save(join(get_data_path(), WORD2VECMODEL_FILENAME))

    print("saved w2v model in {0} seconds".format(time.time() - start))


if __name__ == "__main__":

    start = time.time()

    query = """Despite impressive improvements in Vietnam's development and
    health status over the past decade, gains have not been equitable and significant unmet
    health needs remain. Poor and marginalized populations continue to disproportionally
    suffer from preventable illnesses while those in wealthier socioeconomic groups
    continue to enjoy greater health and longer life expectancy. Social Marketing for
    Improved Rural Health will include 3 main components: i) social marketing of SafeWat
    household water treatment solution and promotion of safer hygiene behaviors; ii) Good
    health, Great life and iii) behavior change communication to address non-supply side
    barriers to healthier behaviors."""

    # get clean data for embeddings
    clean_df = pd.read_csv(join(get_data_path(), PROCESSED_RECORDS_FILENAME),
                           encoding="utf-8")
    # preprocessing on both query and raw IATI description data
    query_df = preprocess_query_text(query)

    # unpickling
    with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME),
              "rb") as _file:
        term_document_matrix = pickle.load(_file)

    with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
        vectorizer = pickle.load(_file)

    with open(join(get_data_path(), WORD_LIST_FILENAME), "rb") as _file:
        word_list = pickle.load(_file)


import pickle
from os.path import join

import pandas as pd
from gensim.models import Word2Vec

# Imports reconstructed from the names used below; preprocess_query_text and
# average_per_doc come from project modules not shown in this excerpt
from utils import get_data_path
from constants import (
    PROCESSED_RECORDS_FILENAME,
    WORD2VECMODEL_FILENAME,
    WORD2VECAVG_FILENAME,
    DESCRIPTION_COLUMN_NAME,
)

if __name__ == "__main__":

    query = """Despite impressive improvements in Vietnam's development and
        health status over the past decade, gains have not been equitable and significant unmet
        health needs remain. Poor and marginalized populations continue to disproportionally
        suffer from preventable illnesses while those in wealthier socioeconomic groups
        continue to enjoy greater health and longer life expectancy. Social Marketing for
        Improved Rural Health will include 3 main components: i) social marketing of SafeWat
        household water treatment solution and promotion of safer hygiene behaviors; ii) Good
        health, Great life and iii) behavior change communication to address non-supply side
        barriers to healthier behaviors."""

    model = Word2Vec.load(join(get_data_path(), WORD2VECMODEL_FILENAME))

    iati_records = pd.read_csv(join(get_data_path(),
                                    PROCESSED_RECORDS_FILENAME),
                               encoding="utf-8")

    query_df = preprocess_query_text(query)

    if not query_df.empty:

        with open(join(get_data_path(), WORD2VECAVG_FILENAME), "rb") as _file:
            full_arr = pickle.load(_file)

        query_average = average_per_doc(
            str(query_df[DESCRIPTION_COLUMN_NAME][0]), model,
            300).reshape(1, -1)
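
        # A hedged continuation sketch (assumption): rank IATI records by the
        # cosine similarity between the query average and the pre-computed
        # per-document word2vec averages.
        from sklearn.metrics.pairwise import cosine_similarity

        iati_records["cosine_sim"] = cosine_similarity(query_average, full_arr)[0]
        print(iati_records.sort_values("cosine_sim", ascending=False).head(10))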


def preprocess_pipeline(df):
    df = df[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]]

    # preprocessing
    df = preprocessing_initial_text_clean(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_nonenglish_words_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stopwords_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stem(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_empty_text_remove(df, DESCRIPTION_COLUMN_NAME)
    return df
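

# preprocess_query_text (used by the query scripts in this collection) is
# defined in a module not shown here; a minimal sketch (assumption): wrap the
# query string in a one-row dataframe and reuse the same pipeline.
def preprocess_query_text(query):
    query_df = pd.DataFrame({
        IATI_IDENTIFIER_COLUMN_NAME: ["query"],
        DESCRIPTION_COLUMN_NAME: [query],
    })
    return preprocess_pipeline(query_df)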


if __name__ == "__main__":
    start = time.time()

    # To import full dataset
    df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME),
                     encoding="utf-8")

    df = preprocess_pipeline(df)

    # write out df with reduced records
    df.to_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME),
        index=False,
        encoding="utf-8",
    )

    end = time.time()

    print("completed in {0} seconds".format(end - start))


if __name__ == "__main__":

    query = """Despite impressive improvements in Vietnam's development and
    health status over the past decade, gains have not been equitable and significant unmet
    health needs remain. Poor and marginalized populations continue to disproportionally
    suffer from preventable illnesses while those in wealthier socioeconomic groups
    continue to enjoy greater health and longer life expectancy. Social Marketing for
    Improved Rural Health will include 3 main components: i) social marketing of SafeWat
    household water treatment solution and promotion of safer hygiene behaviors; ii) Good
    health, Great life and iii) behavior change communication to address non-supply side
    barriers to healthier behaviors."""

    # Or uncomment below if you wish to test input text at runtime
    # query = input("Please enter search text:\n")

    query_df = preprocess_query_text(query)

    if not query_df.empty:
        with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
            vectorizer = pickle.load(_file)
        query_vector = vectorize_input_text(query_df, vectorizer)

        with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME),
                  "rb") as _file:
            term_document_matrix = pickle.load(_file)

        iati_records = pd.read_csv(join(get_data_path(),
                                        PROCESSED_RECORDS_FILENAME),
                                   encoding="utf-8")

        iati_records = iati_records[[IATI_IDENTIFIER_COLUMN_NAME]]
        start_time = time.time()
        outDF = get_cosine_similarity(query_vector, term_document_matrix,
                                      iati_records)
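
        # A hedged continuation sketch (assumption): report timing and show the
        # ten most similar records, assuming get_cosine_similarity returns a
        # dataframe with a "cosine_sim" column.
        print("similarity computed after {} seconds".format(time.time() - start_time))
        print(outDF.sort_values("cosine_sim", ascending=False).head(10))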