def clear_data(c):
    protected_files = [".gitkeep", INPUT_DATA_FILENAME]
    data_path = get_data_path()
    files_to_be_deleted = [
        (f, join(data_path, f))
        for f in os.listdir(data_path)
        if isfile(join(data_path, f)) and f not in protected_files
    ]
    for file_name, file_path in files_to_be_deleted:
        print(f"DELETING {file_name}")
        os.remove(file_path)
def write_tfidf_term_document_matrix_to_file(
    preprocessed_file_name,
    word_list_file_name,
    term_document_matrix_filename,
    vectorizer_filename,
):
    df1 = pd.read_csv(join(get_data_path(), preprocessed_file_name), encoding="utf-8")
    df1 = df1[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]]
    vectorizer, term_document_matrix, word_list = create_tfidf_term_document_matrix(df1)

    with open(join(get_data_path(), term_document_matrix_filename), "wb") as output_file:
        pickle.dump(term_document_matrix, output_file)
    with open(join(get_data_path(), word_list_file_name), "wb") as output_file:
        pickle.dump(word_list, output_file)
    with open(join(get_data_path(), vectorizer_filename), "wb") as output_file:
        pickle.dump(vectorizer, output_file)
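# create_tfidf_term_document_matrix is defined elsewhere in the project; a minimal
# sketch of what it could look like, assuming scikit-learn's TfidfVectorizer over the
# cleaned description column. The vectorizer settings are illustrative, not the
# project's actual configuration.
from sklearn.feature_extraction.text import TfidfVectorizer


def create_tfidf_term_document_matrix(df):
    vectorizer = TfidfVectorizer()
    # Rows follow the order of the input dataframe, columns are the vocabulary terms
    term_document_matrix = vectorizer.fit_transform(df[DESCRIPTION_COLUMN_NAME])
    # get_feature_names_out() on scikit-learn >= 1.0 (get_feature_names() on older releases)
    word_list = vectorizer.get_feature_names_out()
    return vectorizer, term_document_matrix, word_list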
app = Flask(__name__)
health = HealthCheck(app, "/healthcheck")

app.config["OPENAPI_URL_PREFIX"] = "/openapi"
app.config["OPENAPI_JSON_PATH"] = "openapi.json"
app.config["OPENAPI_REDOC_PATH"] = "/doc/"
app.config["OPENAPI_SWAGGER_UI_PATH"] = "/swagger/"
app.config["OPENAPI_SWAGGER_UI_VERSION"] = "3.23.11"
app.config["OPENAPI_VERSION"] = "3.0.2"
openapi = Api(app)

if environment == "production":
    app.secret_key = os.getenv("APP_SECRET_KEY")
else:
    app.secret_key = "".join(random.choice(string.ascii_lowercase) for i in range(10))

with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
    vectorizer = pickle.load(_file)
with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME), "rb") as _file:
    term_document_matrix = pickle.load(_file)
with open(join(get_data_path(), WORD2VECMODEL_FILENAME), "rb") as _file:
    word_to_vec_model = pickle.load(_file)
with open(join(get_data_path(), WORD2VECAVG_FILENAME), "rb") as _file:
    word_to_vec_document_average = pickle.load(_file)

processed_iati_records = pd.read_csv(
    join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
)
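# Illustrative only: a plain Flask route showing how the unpickled artefacts above
# could back a simple search endpoint. The route, query parameter, and the helpers it
# calls (preprocess_query_text, vectorize_input_text, get_cosine_similarity) mirror
# the batch scripts below, but how the real API wires them up is an assumption here.
from flask import jsonify, request


@app.route("/search")
def search():
    query = request.args.get("query", "")
    query_df = preprocess_query_text(query)
    if query_df.empty:
        return jsonify([])
    query_vector = vectorize_input_text(query_df, vectorizer)
    results = get_cosine_similarity(query_vector, term_document_matrix, processed_iati_records)
    return jsonify(results.head(10).to_dict(orient="records"))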
def kmeans_clustering(
    term_document_matrix,
    term_dataframe,
    minimum_number_of_clusters,
    maximum_number_of_clusters,
    increment,
):
    """
    Args:
        term_document_matrix: scipy sparse matrix of the term document matrix
        term_dataframe: mapping between terms and IATI records
        minimum_number_of_clusters: minimum number of clusters
        maximum_number_of_clusters: maximum number of clusters
        increment: step size between minimum_number_of_clusters and
            maximum_number_of_clusters

    Returns:
        A dictionary mapping 'number of clusters' to 'within cluster sum of squares'
    """
    start = time.time()
    # Check dimensions reduced to n_components
    # print(term_document_matrix.shape)
    result_dict = {}
    # Test with n clusters (n_jobs=-1 means max processes spawned)
    for n_clust in range(minimum_number_of_clusters, maximum_number_of_clusters, increment):
        km = KMeans(n_clusters=n_clust, n_jobs=-1)
        start = time.time()
        clusters = km.fit(term_document_matrix)
        result_dict[n_clust] = clusters.inertia_
        end = time.time()
        print("{0} clusters: time elapsed: {1} seconds".format(n_clust, end - start))
        # Within cluster sum of squares
        print("{0} clusters: within cluster ss: {1}".format(n_clust, clusters.inertia_))
        append_to_csv(
            join(get_data_path(), CLUSTERING_SUM_OF_SQUARE_COMPARISON_FILENAME),
            [n_clust, clusters.inertia_],
        )
        term_dataframe.insert(
            term_dataframe.shape[1], "cluster{0}".format(n_clust), clusters.labels_
        )
        # Write cluster assigned to each iati.identifier out to csv
        term_dataframe[["iati.identifier", "cluster{0}".format(n_clust)]].to_csv(
            join(
                get_data_path(),
                ACTIVITY_CLUSTER_ASSIGNMENT_FILENAME_CONVENTION.format(n_clust),
            ),
            encoding="utf-8",
            index=False,
        )
        with open(
            join(get_data_path(), CLUSTER_CENTROIDS_FILENAME_CONVENTION.format(n_clust)),
            "wb",
        ) as out:
            pickle.dump(clusters.cluster_centers_, out)
    return result_dict
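# Illustrative only: the dictionary returned by kmeans_clustering can be inspected
# for an "elbow" in the within-cluster sum of squares. matplotlib is an assumption
# here (the project itself writes the same figures to
# CLUSTERING_SUM_OF_SQUARE_COMPARISON_FILENAME); svd_term_document_matrix and
# term_dataframe are built in the __main__ block below.
import matplotlib.pyplot as plt

wcss = kmeans_clustering(svd_term_document_matrix, term_dataframe, 10, 15, 2)
plt.plot(list(wcss.keys()), list(wcss.values()), marker="o")
plt.xlabel("number of clusters")
plt.ylabel("within-cluster sum of squares")
plt.show()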
    return np.unique(clusters.labels_, return_counts=True)[1].sum()


def get_number_of_iterations_of_kmeans(clusters):
    return clusters.n_iter_


def get_number_of_records_by_organisation_by_cluster(term_dataframe, n_clust):
    return term_dataframe.groupby(
        ["participating.org..Implementing.", "cluster{0}".format(n_clust)]
    ).size()


if __name__ == "__main__":
    # Import iati.identifier csv for records included in the doc-term matrix
    term_dataframe = pd.read_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
    )
    term_dataframe = term_dataframe[["iati.identifier"]]

    # Import pickle file (both the IATI dataframe and the term document matrix
    # need to be read in for this script)
    with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME), "rb") as _file:
        term_document_matrix = pickle.load(_file)

    minimum_number_of_clusters = 10
    maximum_number_of_clusters = 15
    increment = 2

    # Apply SVD to the TDM
    svd_term_document_matrix = apply_svd(term_document_matrix)
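# apply_svd is not shown in this snippet; a minimal sketch of the idea, assuming
# scikit-learn's TruncatedSVD to reduce the sparse TF-IDF matrix before clustering.
# The number of components is illustrative.
from sklearn.decomposition import TruncatedSVD


def apply_svd(term_document_matrix, n_components=100):
    svd = TruncatedSVD(n_components=n_components, random_state=0)
    # Returns a dense (n_documents, n_components) array
    return svd.fit_transform(term_document_matrix)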
        number_of_results_per_org)
    # Order by top organisation and, within each top organisation, the top projects
    top_project_results = top_project_results.sort_values(
        ["myorder", "cosine_sim"], ascending=[True, False]
    )
    top_project_results = top_project_results.drop(["myorder"], axis=1)
    print("limited after {} seconds".format(time.time() - start_time))
    return top_project_results


if __name__ == "__main__":
    full_df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME), encoding="utf-8")
    cosine_res_df = pd.read_csv(join(get_data_path(), COSINE_FILENAME), encoding="utf-8")
    refined_res = process_results(cosine_res_df, full_df, 100)
    refined_res = preprocessing_initial_text_clean(refined_res, ORG_ID_COLUMN_NAME)
    refined_res = remove_white_space(refined_res, ORG_ID_COLUMN_NAME)
    refined_res = remove_white_space(refined_res, DESCRIPTION_COLUMN_NAME)

    # Top results per reporting organisation
    top_project_results = gather_top_results(refined_res, ORG_ID_COLUMN_NAME,
def results_per_corpus_df(input_df, w2v_model, dim_size=300):
    results_arr = []
    progress = set(range(10**5, 10**6, 10**5))
    for index, row in input_df.iterrows():
        results_arr.append(
            average_per_doc(row[DESCRIPTION_COLUMN_NAME], w2v_model, dim_size)
        )
        if index in progress:
            print("processed {0} records".format(index))
    return np.array(results_arr)


if __name__ == "__main__":
    df1 = pd.read_csv(join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8")
    df1 = df1[[DESCRIPTION_COLUMN_NAME]]
    model = Word2Vec.load(join(get_data_path(), WORD2VECMODEL_FILENAME))

    # This takes a while
    start = time.time()
    results = results_per_corpus_df(df1, model, 300)
    with open(join(get_data_path(), WORD2VECAVG_FILENAME), "wb") as output_file:
        pickle.dump(results, output_file)
    print("average array created and saved in {0} seconds".format(time.time() - start))
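# average_per_doc is defined elsewhere in the project; a minimal sketch of the idea,
# assuming a gensim Word2Vec model whose vectors are accessed via w2v_model.wv.
# Tokens missing from the vocabulary are skipped, and a document with no known
# tokens falls back to a zero vector.
def average_per_doc(text, w2v_model, dim_size):
    tokens = str(text).split(" ")
    vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
    if not vectors:
        return np.zeros(dim_size)
    return np.mean(vectors, axis=0)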
        DESCRIPTION_COLUMN_NAME,
    )
except ModuleNotFoundError:
    from utils import get_data_path
    from constants import (
        PROCESSED_RECORDS_FILENAME,
        WORD2VECMODEL_FILENAME,
        DESCRIPTION_COLUMN_NAME,
    )


def build_w2v_model(input_df, dim_size):
    vectorlist = [
        row[DESCRIPTION_COLUMN_NAME].split(" ")
        for index, row in input_df.iterrows()
    ]
    return Word2Vec(vectorlist, min_count=20, size=dim_size, workers=4)


if __name__ == "__main__":
    df1 = pd.read_csv(join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8")
    start = time.time()
    model = build_w2v_model(df1, 300)
    model.save(join(get_data_path(), WORD2VECMODEL_FILENAME))
    print("saved w2v model in {0} seconds".format(time.time() - start))
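# Illustrative only: once saved, the model can be reloaded and queried for nearest
# neighbours in the embedding space. The example token is an assumption (the
# descriptions are stemmed upstream, so real vocabulary entries are stemmed forms).
model = Word2Vec.load(join(get_data_path(), WORD2VECMODEL_FILENAME))
print(model.wv.most_similar("health", topn=5))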
if __name__ == "__main__":
    start = time.time()
    query = """Despite impressive improvements in Vietnam's development and
    health status over the past decade, gains have not been equitable and
    significant unmet health needs remain. Poor and marginalized populations
    continue to disproportionally suffer from preventable illnesses while those
    in wealthier socioeconomic groups continue to enjoy greater health and
    longer life expectancy. Social Marketing for Improved Rural Health will
    include 3 main components: i) social marketing of SafeWat household water
    treatment solution and promotion of safer hygiene behaviors; ii) Good
    health, Great life and iii) behavior change communication to address
    non-supply side barriers to healthier behaviors."""

    # Get clean data for embeddings
    clean_df = pd.read_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
    )

    # Preprocessing on both query and raw IATI description data
    query_df = preprocess_query_text(query)

    # Unpickling
    with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME), "rb") as _file:
        term_document_matrix = pickle.load(_file)
    with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
        vectorizer = pickle.load(_file)
    with open(join(get_data_path(), WORD_LIST_FILENAME), "rb") as _file:
        word_list = pickle.load(_file)
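# preprocess_query_text is defined elsewhere; a minimal sketch of the idea, assuming
# it wraps the free-text query in a one-row dataframe and reuses the same cleaning
# helpers that are applied to the IATI descriptions in the preprocessing pipeline.
# The exact set and order of steps is an assumption.
def preprocess_query_text(query):
    query_df = pd.DataFrame({DESCRIPTION_COLUMN_NAME: [query]})
    query_df = preprocessing_initial_text_clean(query_df, DESCRIPTION_COLUMN_NAME)
    query_df = preprocessing_stopwords_remove(query_df, DESCRIPTION_COLUMN_NAME)
    query_df = preprocessing_stem(query_df, DESCRIPTION_COLUMN_NAME)
    query_df = preprocessing_empty_text_remove(query_df, DESCRIPTION_COLUMN_NAME)
    return query_df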
    DESCRIPTION_COLUMN_NAME,
)

if __name__ == "__main__":
    query = """Despite impressive improvements in Vietnam's development and
    health status over the past decade, gains have not been equitable and
    significant unmet health needs remain. Poor and marginalized populations
    continue to disproportionally suffer from preventable illnesses while those
    in wealthier socioeconomic groups continue to enjoy greater health and
    longer life expectancy. Social Marketing for Improved Rural Health will
    include 3 main components: i) social marketing of SafeWat household water
    treatment solution and promotion of safer hygiene behaviors; ii) Good
    health, Great life and iii) behavior change communication to address
    non-supply side barriers to healthier behaviors."""

    model = Word2Vec.load(join(get_data_path(), WORD2VECMODEL_FILENAME))
    iati_records = pd.read_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
    )
    query_df = preprocess_query_text(query)

    if not query_df.empty:
        with open(join(get_data_path(), WORD2VECAVG_FILENAME), "rb") as _file:
            full_arr = pickle.load(_file)
        query_average = average_per_doc(
            str(query_df[DESCRIPTION_COLUMN_NAME][0]), model, 300
        ).reshape(1, -1)
    df = df[[IATI_IDENTIFIER_COLUMN_NAME, DESCRIPTION_COLUMN_NAME]]

    # Preprocessing
    df = preprocessing_initial_text_clean(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_nonenglish_words_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stopwords_remove(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_stem(df, DESCRIPTION_COLUMN_NAME)
    df = preprocessing_empty_text_remove(df, DESCRIPTION_COLUMN_NAME)
    return df


if __name__ == "__main__":
    start = time.time()
    # Import the full dataset
    df = pd.read_csv(join(get_data_path(), INPUT_DATA_FILENAME), encoding="utf-8")
    df = preprocess_pipeline(df)

    # Write out the dataframe with reduced records
    df.to_csv(
        join(get_data_path(), PROCESSED_RECORDS_FILENAME),
        index=False,
        encoding="utf-8",
    )
    end = time.time()
    print("completed in {0} seconds".format(end - start))
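# The preprocessing_* helpers are defined elsewhere in the project; a minimal sketch
# of one of them, assuming NLTK's English stopword list (nltk.download("stopwords")
# must have been run) and a whitespace-tokenised description column, rewritten in place.
from nltk.corpus import stopwords


def preprocessing_stopwords_remove(df, column_name):
    stops = set(stopwords.words("english"))
    df[column_name] = df[column_name].apply(
        lambda text: " ".join(word for word in str(text).split() if word not in stops)
    )
    return df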
    health status over the past decade, gains have not been equitable and
    significant unmet health needs remain. Poor and marginalized populations
    continue to disproportionally suffer from preventable illnesses while those
    in wealthier socioeconomic groups continue to enjoy greater health and
    longer life expectancy. Social Marketing for Improved Rural Health will
    include 3 main components: i) social marketing of SafeWat household water
    treatment solution and promotion of safer hygiene behaviors; ii) Good
    health, Great life and iii) behavior change communication to address
    non-supply side barriers to healthier behaviors."""

    # Or uncomment below if you wish to test input text at runtime
    # query = input("Please enter search text:\n")

    query_df = preprocess_query_text(query)

    if not query_df.empty:
        with open(join(get_data_path(), VECTORIZER_FILENAME), "rb") as _file:
            vectorizer = pickle.load(_file)
        query_vector = vectorize_input_text(query_df, vectorizer)

        with open(join(get_data_path(), TERM_DOCUMENT_MATRIX_FILENAME), "rb") as _file:
            term_document_matrix = pickle.load(_file)

        iati_records = pd.read_csv(
            join(get_data_path(), PROCESSED_RECORDS_FILENAME), encoding="utf-8"
        )
        iati_records = iati_records[[IATI_IDENTIFIER_COLUMN_NAME]]

        start_time = time.time()
        outDF = get_cosine_similarity(query_vector, term_document_matrix, iati_records)
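# get_cosine_similarity is defined elsewhere; a minimal sketch of the idea, assuming
# scikit-learn's cosine_similarity between the query vector and every row of the
# term document matrix, with scores attached to the record identifiers. The
# "cosine_sim" column name mirrors the one used when ranking results above.
from sklearn.metrics.pairwise import cosine_similarity


def get_cosine_similarity(query_vector, term_document_matrix, iati_records):
    scores = cosine_similarity(query_vector, term_document_matrix).flatten()
    results = iati_records.copy()
    results["cosine_sim"] = scores
    return results.sort_values("cosine_sim", ascending=False)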