def test_get_excluded_document_types():
    """The exclusions YAML must contain exactly this list of document types.

    Guards against accidental edits to
    document_types_excluded_from_the_topic_taxonomy.yml: any addition,
    removal or reordering of the configured types fails this test.
    """
    expected_doc_types = [
        'about',
        'about_our_services',
        'access_and_opening',
        'business_support_finder',
        'coming_soon',
        'complaints_procedure',
        'completed_transaction',
        'contact',
        'corporate_report',
        'dfid_research_output',
        'equality_and_diversity',
        'field_of_operation',
        'finder',
        'finder_email_signup',
        'gone',
        'help_page',
        'hmrc_manual_section',
        'homepage',
        'html_publication',
        'licence_finder',
        'mainstream_browse_page',
        'manual_section',
        'media_enquiries',
        'membership',
        'ministerial_role',
        'need',
        'organisation',
        'our_energy_use',
        'our_governance',
        'person',
        'personal_information_charter',
        'placeholder_ministerial_role',
        'placeholder_person',
        'placeholder_policy_area',
        'placeholder_topical_event',
        'placeholder_world_location_news_page',
        'policy_area',
        'publication_scheme',
        'redirect',
        'search',
        'service_manual_guide',
        'service_manual_homepage',
        'service_manual_service_standard',
        'service_manual_service_toolkit',
        'service_manual_topic',
        'service_standard_report',
        'services_and_information',
        'social_media_use',
        'special_route',
        'staff_update',
        'taxon',
        'topic',
        'topical_event',
        'topical_event_about_page',
        'travel_advice',
        'travel_advice_index',
        'uk_market_conformity_assessment_body',
        'working_group',
        'world_location',
        'worldwide_organisation'
    ]

    configured_doc_types = read_config_yaml(
        "document_types_excluded_from_the_topic_taxonomy.yml"
    )['document_types']

    assert configured_doc_types == expected_doc_types
def test_make_weighted_network(structural_edges_fixture,
                               functional_edges_fixture,
                               weighted_network_fixture):
    """
    Tests:
    1) No edges from structural and functional network are lost in merge
    2) Edges are unique
    3) Every edge has a weight
    """
    config = read_config_yaml("preprocessing-config.yml")

    weighted_network = make_weighted_network_from_structural_and_functional(
        structural_edges_fixture, functional_edges_fixture,
        config['structural_edge_weight']).reset_index(drop=True)

    # Key each input edge by concatenating its source and destination ids,
    # so edges can be compared across the two input frames and the merge.
    for fixture in (structural_edges_fixture, functional_edges_fixture):
        fixture['edges'] = (fixture['source_content_id']
                            + fixture['destination_content_id'])

    input_edge_keys = set(structural_edges_fixture['edges']) | set(
        functional_edges_fixture['edges'])
    merged_edge_keys = set(weighted_network['source_content_id']
                           + weighted_network['destination_content_id'])

    # 1) the merge preserves exactly the union of input edges
    assert input_edge_keys == merged_edge_keys
    # 2) one row per unique edge
    assert weighted_network.shape[0] == len(merged_edge_keys)
    # 3) no missing weights
    assert pd.notna(weighted_network['weight']).all()
def test_return_data_frame():
    """Tests that Edgeweight Extractor instantiates and runs query and
    result is unique set of edges and counts"""
    exclusions = read_config_yaml(
        "document_types_excluded_from_the_topic_taxonomy.yml"
    )['document_types']
    instance = EdgeWeightExtractor(
        'src/data_preprocessing/query_content_id_edge_weights.sql',
        blocklisted_document_types=exclusions,
        weight_threshold=10)
    instance.create_df()

    # -1 for display.max_colwidth was deprecated in pandas 1.0 and raises
    # ValueError in pandas >= 2.0; None is the supported "no truncation" value.
    pd.set_option('display.max_colwidth', None)

    # An edge is uniquely identified by the concatenated source+destination ids.
    instance.df['unique_edges'] = (instance.df['source_content_id']
                                   + instance.df['destination_content_id'])

    assert instance.df.shape[0] > 1
    # every (source, destination) pair appears exactly once
    assert len(set(instance.df['unique_edges'])) == instance.df.shape[0]
    # Series.min() takes no numeric_only argument (that keyword was a
    # deprecated no-op for Series reductions); 'weight' is a single numeric
    # column, so call min() directly.
    assert instance.df['weight'].min() > 0
def test_weight_summing():
    """Merged edge weights equal functional weight plus structural weight.

    Both edges ("1"->"2" and "2"->"1") appear in the structural and the
    functional inputs, so the merged network must contain exactly those two
    edges, each weighted with its functional weight plus the configured
    structural_edge_weight.
    """
    config = read_config_yaml("preprocessing-config.yml")
    functional = pd.DataFrame({
        "source_content_id": ["1", "2"],
        "destination_content_id": ["2", "1"],
        "weight": [10, 10]
    })
    structural = pd.DataFrame({
        "source_content_id": ["1", "2"],
        "destination_content_id": ["2", "1"]
    })
    weighted_network = make_weighted_network_from_structural_and_functional(
        structural, functional, config['structural_edge_weight'])
    # (removed leftover debug print of the functional frame)
    assert weighted_network.shape[0] == 2
    assert weighted_network['weight'].sum() == functional['weight'].sum() + (
        config['structural_edge_weight'] * 2)
            workers=workers, callbacks=[callbacks], iter=iter)

    def save_model(self, embeddings_filepath: str, model_file_path: str) -> None:
        """Persist the trained node2vec artefacts to disk.

        :param embeddings_filepath: destination path for the node embeddings,
            written in word2vec text format via gensim's
            ``wv.save_word2vec_format``.
        :param model_file_path: destination path for the full model object
            (saved with the model's own ``save``).
        """
        self.logger.info(f'saving embeddings to {embeddings_filepath}')
        self.model.wv.save_word2vec_format(embeddings_filepath)
        self.logger.info(f'saving model to {model_file_path}')
        self.model.save(model_file_path)


if __name__ == "__main__":
    # our module is being executed as a program
    data_dir = safe_getenv('DATA_DIR')
    preprocessing_config = read_config_yaml("preprocessing-config.yml")
    node2vec_config = read_config_yaml("node2vec-config.yml")

    # All input/output files live under DATA_DIR; filenames come from config.
    network_input_filename = os.path.join(
        data_dir, preprocessing_config['network_filename'])
    model_filename = os.path.join(
        data_dir, preprocessing_config['model_filename'])
    node_embeddings_filename = os.path.join(
        data_dir, preprocessing_config['embeddings_filename'])

    module_logger = logging.getLogger('train_node2_vec_model')

    module_logger.info(f'reading in {network_input_filename}')
    # Force content ids to object (string) dtype so numeric-looking ids are
    # not silently converted to numbers.
    edges = pd.read_csv(network_input_filename,
                        dtype={'source_content_id': object,
                               'destination_content_id': object})

    node2vec_model = N2VModel()

    node2vec_model.create_graph(edges, node2vec_config['weighted_graph'])
import pymongo
from tqdm import tqdm
import pickle
from src.utils.miscellaneous import read_config_yaml, safe_getenv
from src.utils import text_preprocessing as tp

# bs4 emits UserWarnings on some of the markup it parses; keep output quiet.
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')
# Register tqdm's pandas integration (enables progress_apply etc.).
tqdm.pandas()

# For each supported link type, the key under a content item's
# "expanded_links" object where links of that type are stored.
KEYS_FOR_LINK_TYPES = {
    "related": "ordered_related_items",
    "collection": "documents"
}

# Exclusion lists maintained as YAML config:
# document types excluded from the topic taxonomy, plus pages that must not
# act as link sources or link targets respectively.
BLOCKLIST_DOCUMENT_TYPES = read_config_yaml(
    "document_types_excluded_from_the_topic_taxonomy.yml")['document_types']
EXCLUDED_SOURCE_CONTENT = read_config_yaml(
    "source_exclusions_that_are_not_linked_from.yml")
EXCLUDED_TARGET_CONTENT = read_config_yaml(
    "target_exclusions_that_are_not_linked_to.yml")

# Field projections (presumably passed to pymongo queries — confirm at call
# site): fetch only each link's base_path/content_id and the parent
# document's content_id.
RELATED_LINKS_PROJECTION = {
    "expanded_links.ordered_related_items.base_path": 1,
    "expanded_links.ordered_related_items.content_id": 1,
    "content_id": 1
}
COLLECTION_LINKS_PROJECTION = {
    "expanded_links.documents.base_path": 1,
    "expanded_links.documents.content_id": 1,
                self.blocklisted_document_types)
        ])

    def create_df(self):
        """Run the edge-list query and store the result as ``self.df``.

        Executes ``self.query_edge_list`` through ``self.client`` with the
        prepared ``self.query_config`` and materialises the result as a
        pandas DataFrame.
        """
        self.logger.info(f'running query from {self.query_path}')
        self.df = self.client.query(
            self.query_edge_list, job_config=self.query_config).to_dataframe()

    def extract_df_to_csv(self, file_path: str) -> None:
        """Write the queried edge list to CSV, without the index column."""
        self.df.to_csv(file_path, index=False)


if __name__ == "__main__":
    data_dir = safe_getenv('DATA_DIR')
    preprocessing_config = read_config_yaml("preprocessing-config.yml")
    functional_edges_output_filename = os.path.join(
        data_dir, preprocessing_config["functional_edges_filename"])

    module_logger = logging.getLogger('make_functional_edges_and_weights')

    # Document types that must not take part in the taxonomy network.
    blocklisted_document_types = read_config_yaml(
        "document_types_excluded_from_the_topic_taxonomy.yml"
    )['document_types']
    weight_threshold = preprocessing_config['weight_threshold']
    module_logger.info(f'Functional weight threshold is >= {weight_threshold}')

    # End of the query date window: to_days_ago days before today, formatted
    # YYYYMMDD.
    to_date = (
        datetime.today() -
        timedelta(preprocessing_config['to_days_ago'])).strftime('%Y%m%d')
        return json.load(content_id_to_base_path_mapping_file)


def get_content_ids_to_page_views_mapper(df):
    """
    Transform BigQuery dataframe to a dictionary where keys are content_ids
    and the values are pageviews.

    :param df: DataFrame with a 'content_id' column plus one page-views
        column (assumed one row per content_id — TODO confirm with caller).
    :return: dict mapping content_id to its page-views value.
    """
    return df.set_index('content_id').T.to_dict('records')[0]


if __name__ == '__main__':
    data_dir = safe_getenv('DATA_DIR')
    node2vec_cfg = read_config_yaml("node2vec-config.yml")

    # Inputs produced by earlier pipeline steps, all under DATA_DIR.
    model_filename = \
        os.path.join(data_dir, node2vec_cfg['model_filename'])
    eligible_source_content_ids_filename = \
        os.path.join(data_dir, 'eligible_source_content_ids.pkl')
    eligible_target_content_ids_filename = \
        os.path.join(data_dir, 'eligible_target_content_ids.pkl')
    content_id_base_path_mapping_filename = \
        os.path.join(data_dir, 'content_id_base_path_mapping.json')

    # Outputs: full predictions plus a top-100 subset.
    related_links_filename = os.path.join(
        data_dir, node2vec_cfg["predictions_filename"])
    related_links_100_filename = os.path.join(
        data_dir, node2vec_cfg["top_100_predictions_filename"])

    logging.config.fileConfig('src/logging.conf')
        sort=False)

    # Deduplicate edges, summing structural and functional edge weights
    all_edges = all_edges.groupby(
        ['source_content_id', 'destination_content_id'],
        as_index=False).aggregate(sum)
    # Keep only the edge identity and combined weight columns, with a clean
    # 0..n-1 index.
    all_edges = all_edges[[
        'source_content_id', 'destination_content_id', 'weight'
    ]].reset_index(drop=True)
    return all_edges


if __name__ == "__main__":
    # our module is being executed as a program
    data_dir = safe_getenv('DATA_DIR')
    preprocessing_config = read_config_yaml("preprocessing-config.yml")

    # Inputs (functional + structural edge lists) and the merged-network
    # output all live under DATA_DIR; filenames come from config.
    functional_edges_input_filename = os.path.join(
        data_dir, preprocessing_config["functional_edges_filename"])
    structural_edges_input_filename = os.path.join(
        data_dir, preprocessing_config["structural_edges_filename"])
    network_output_filename = os.path.join(
        data_dir, preprocessing_config["network_filename"])
    structural_edge_weight = preprocessing_config['structural_edge_weight']

    module_logger = logging.getLogger('making_network')

    module_logger.info(f'reading {structural_edges_input_filename}')
    structural_edges_df = pd.read_csv(structural_edges_input_filename)
    module_logger.info(