Example 1
0
def test_get_excluded_document_types():
    """The exclusions YAML lists exactly the expected document types, in order."""
    expected = [
        'about', 'about_our_services',
        'access_and_opening', 'business_support_finder',
        'coming_soon', 'complaints_procedure',
        'completed_transaction', 'contact',
        'corporate_report', 'dfid_research_output',
        'equality_and_diversity', 'field_of_operation',
        'finder', 'finder_email_signup',
        'gone', 'help_page',
        'hmrc_manual_section', 'homepage',
        'html_publication', 'licence_finder',
        'mainstream_browse_page', 'manual_section',
        'media_enquiries', 'membership',
        'ministerial_role', 'need',
        'organisation', 'our_energy_use',
        'our_governance', 'person',
        'personal_information_charter', 'placeholder_ministerial_role',
        'placeholder_person', 'placeholder_policy_area',
        'placeholder_topical_event', 'placeholder_world_location_news_page',
        'policy_area', 'publication_scheme',
        'redirect', 'search',
        'service_manual_guide', 'service_manual_homepage',
        'service_manual_service_standard', 'service_manual_service_toolkit',
        'service_manual_topic', 'service_standard_report',
        'services_and_information', 'social_media_use',
        'special_route', 'staff_update',
        'taxon', 'topic',
        'topical_event', 'topical_event_about_page',
        'travel_advice', 'travel_advice_index',
        'uk_market_conformity_assessment_body', 'working_group',
        'world_location', 'worldwide_organisation'
    ]

    actual = read_config_yaml(
        "document_types_excluded_from_the_topic_taxonomy.yml"
    )['document_types']

    assert actual == expected
def test_make_weighted_network(structural_edges_fixture,
                               functional_edges_fixture,
                               weighted_network_fixture):
    """
    Tests:
    1) No edges from structural and functional network are lost in merge
    2) Edges are unique
    3) Every edge has a weight
    """

    config = read_config_yaml("preprocessing-config.yml")
    weighted_network = make_weighted_network_from_structural_and_functional(
        structural_edges_fixture, functional_edges_fixture,
        config['structural_edge_weight']).reset_index(drop=True)

    # Tag each input edge with a concatenated source+destination key so the
    # merged output can be compared against the union of the inputs.
    for fixture in (structural_edges_fixture, functional_edges_fixture):
        fixture['edges'] = (fixture['source_content_id'] +
                            fixture['destination_content_id'])

    input_edges = set(structural_edges_fixture['edges']) | set(
        functional_edges_fixture['edges'])

    merged_edges = set(weighted_network['source_content_id'] +
                       weighted_network['destination_content_id'])

    # 1) nothing lost in the merge
    assert input_edges == merged_edges

    # 2) one row per unique edge
    assert len(merged_edges) == weighted_network.shape[0]

    # 3) every edge carries a weight
    assert weighted_network['weight'].notna().all()
def test_return_data_frame():
    """EdgeWeightExtractor instantiates, runs its query, and the result is a
    unique set of edges with positive counts.

    Checks that:
    1) the query returns more than one row
    2) every (source, destination) pair appears exactly once
    3) all edge weights are positive
    """
    exclusions = read_config_yaml(
        "document_types_excluded_from_the_topic_taxonomy.yml"
    )['document_types']
    instance = EdgeWeightExtractor(
        'src/data_preprocessing/query_content_id_edge_weights.sql',
        blocklisted_document_types=exclusions,
        weight_threshold=10)

    instance.create_df()
    # -1 was deprecated in pandas 1.0 and removed later; None is the
    # supported way to disable column-width truncation.
    pd.set_option('display.max_colwidth', None)

    # Concatenated ids give one key per directed edge.
    instance.df['unique_edges'] = instance.df[
        'source_content_id'] + instance.df['destination_content_id']
    assert instance.df.shape[0] > 1
    assert len(set(instance.df['unique_edges'])) == instance.df.shape[0]
    # Series.min does not take numeric_only; 'weight' is already numeric.
    assert instance.df['weight'].min() > 0
def test_weight_summing():
    """Duplicated edges are merged with functional + structural weights summed.

    Two reciprocal edges appear in both networks, so the merged network must
    contain exactly two edges whose total weight is the functional total plus
    one structural_edge_weight per structural edge.
    """
    config = read_config_yaml("preprocessing-config.yml")

    functional = pd.DataFrame({
        "source_content_id": ["1", "2"],
        "destination_content_id": ["2", "1"],
        "weight": [10, 10]
    })

    structural = pd.DataFrame({
        "source_content_id": ["1", "2"],
        "destination_content_id": ["2", "1"]
    })

    weighted_network = make_weighted_network_from_structural_and_functional(
        structural, functional, config['structural_edge_weight'])

    # (stray debug print removed)
    assert weighted_network.shape[0] == 2
    assert weighted_network['weight'].sum() == functional['weight'].sum() + (
        config['structural_edge_weight'] * 2)
                                       workers=workers,
                                       callbacks=[callbacks],
                                       iter=iter)

    def save_model(self, embeddings_filepath, model_file_path):
        """Persist the trained embeddings and the model itself to disk.

        :param embeddings_filepath: destination for word2vec-format embeddings
        :param model_file_path: destination for the saved model
        """
        log = self.logger

        log.info(f'saving  embeddings to {embeddings_filepath}')
        self.model.wv.save_word2vec_format(embeddings_filepath)

        log.info(f'saving model to {model_file_path}')
        self.model.save(model_file_path)


if __name__ == "__main__":  # our module is being executed as a program

    # All file locations come from config, rooted at DATA_DIR.
    data_dir = safe_getenv('DATA_DIR')
    preprocessing_config = read_config_yaml("preprocessing-config.yml")
    node2vec_config = read_config_yaml("node2vec-config.yml")

    network_input_filename = os.path.join(data_dir, preprocessing_config['network_filename'])
    model_filename = os.path.join(data_dir, preprocessing_config['model_filename'])
    node_embeddings_filename = os.path.join(data_dir, preprocessing_config['embeddings_filename'])

    module_logger = logging.getLogger('train_node2_vec_model')

    module_logger.info(f'reading in {network_input_filename}')
    # dtype=object keeps content ids as strings so numeric-looking ids are
    # not coerced to numbers by the CSV parser.
    edges = pd.read_csv(network_input_filename, dtype={'source_content_id': object, 'destination_content_id': object})

    node2vec_model = N2VModel()

    node2vec_model.create_graph(edges, node2vec_config['weighted_graph'])
import pymongo
from tqdm import tqdm
import pickle
from src.utils.miscellaneous import read_config_yaml, safe_getenv
from src.utils import text_preprocessing as tp

# bs4 raises UserWarnings for markup quirks that are deliberately ignored here.
warnings.filterwarnings('ignore', category=UserWarning, module='bs4')

# Enables pandas progress_apply with a tqdm progress bar.
tqdm.pandas()

# Maps a link type to the key under expanded_links that holds its items.
KEYS_FOR_LINK_TYPES = {
    "related": "ordered_related_items",
    "collection": "documents"
}

# Document types excluded from the topic taxonomy, loaded from config.
BLOCKLIST_DOCUMENT_TYPES = read_config_yaml(
    "document_types_excluded_from_the_topic_taxonomy.yml")['document_types']

# Config-driven exclusions: content not to be linked from / linked to.
EXCLUDED_SOURCE_CONTENT = read_config_yaml(
    "source_exclusions_that_are_not_linked_from.yml")
EXCLUDED_TARGET_CONTENT = read_config_yaml(
    "target_exclusions_that_are_not_linked_to.yml")

# Query projection restricting related-link documents to the fields used.
RELATED_LINKS_PROJECTION = {
    "expanded_links.ordered_related_items.base_path": 1,
    "expanded_links.ordered_related_items.content_id": 1,
    "content_id": 1
}

COLLECTION_LINKS_PROJECTION = {
    "expanded_links.documents.base_path": 1,
    "expanded_links.documents.content_id": 1,
                                         self.blocklisted_document_types)
        ])

    def create_df(self):
        """Execute the configured edge-list query and keep the results on self.df."""
        self.logger.info(f'running query from {self.query_path}')
        query_job = self.client.query(self.query_edge_list,
                                      job_config=self.query_config)
        self.df = query_job.to_dataframe()

    def extract_df_to_csv(self, file_path):
        """Write the extracted dataframe to ``file_path`` as CSV, without the index."""
        self.df.to_csv(path_or_buf=file_path, index=False)


if __name__ == "__main__":
    data_dir = safe_getenv('DATA_DIR')

    preprocessing_config = read_config_yaml("preprocessing-config.yml")
    functional_edges_output_filename = os.path.join(
        data_dir, preprocessing_config["functional_edges_filename"])

    module_logger = logging.getLogger('make_functional_edges_and_weights')
    # Document types that must not contribute functional edges.
    blocklisted_document_types = read_config_yaml(
        "document_types_excluded_from_the_topic_taxonomy.yml"
    )['document_types']

    # Minimum weight for an edge to be kept.
    weight_threshold = preprocessing_config['weight_threshold']

    module_logger.info(f'Functional weight threshold is >= {weight_threshold}')

    # End of the query window: 'to_days_ago' days before today, as YYYYMMDD.
    to_date = (
        datetime.today() -
        timedelta(preprocessing_config['to_days_ago'])).strftime('%Y%m%d')
        return json.load(content_id_to_base_path_mapping_file)


def get_content_ids_to_page_views_mapper(df):
    """Convert a BigQuery results dataframe to a content_id -> pageviews dict.

    :param df: dataframe with a 'content_id' column plus one value column
    :return: dict keyed by content_id
    """
    by_content_id = df.set_index('content_id')
    # The first record of the transpose is the (single) value column, keyed
    # by content_id.
    return by_content_id.T.to_dict('records')[0]


if __name__ == '__main__':

    data_dir = safe_getenv('DATA_DIR')
    node2vec_cfg = read_config_yaml("node2vec-config.yml")

    # Trained model produced by the training step.
    model_filename = \
        os.path.join(data_dir, node2vec_cfg['model_filename'])
    # Pickled content-id collections (presumably link-eligibility lists —
    # confirm against the producing step).
    eligible_source_content_ids_filename = \
        os.path.join(data_dir, 'eligible_source_content_ids.pkl')
    eligible_target_content_ids_filename = \
        os.path.join(data_dir, 'eligible_target_content_ids.pkl')
    # JSON mapping of content_id to base_path.
    content_id_base_path_mapping_filename = \
        os.path.join(data_dir, 'content_id_base_path_mapping.json')
    # Output locations for predicted related links.
    related_links_filename = os.path.join(data_dir,
                                          node2vec_cfg["predictions_filename"])
    related_links_100_filename = os.path.join(
        data_dir, node2vec_cfg["top_100_predictions_filename"])

    logging.config.fileConfig('src/logging.conf')
                          sort=False)

    # Deduplicate edges, summing structural and functional edge weights
    all_edges = all_edges.groupby(
        ['source_content_id', 'destination_content_id'],
        as_index=False).aggregate(sum)
    all_edges = all_edges[[
        'source_content_id', 'destination_content_id', 'weight'
    ]].reset_index(drop=True)
    return all_edges


if __name__ == "__main__":  # our module is being executed as a program

    data_dir = safe_getenv('DATA_DIR')
    preprocessing_config = read_config_yaml("preprocessing-config.yml")

    # Input edge lists and the merged-network output location, all from config.
    functional_edges_input_filename = os.path.join(
        data_dir, preprocessing_config["functional_edges_filename"])
    structural_edges_input_filename = os.path.join(
        data_dir, preprocessing_config["structural_edges_filename"])
    network_output_filename = os.path.join(
        data_dir, preprocessing_config["network_filename"])

    # Weight assigned to each structural edge when merging with functional
    # (pageview-derived) edges.
    structural_edge_weight = preprocessing_config['structural_edge_weight']

    module_logger = logging.getLogger('making_network')

    module_logger.info(f'reading {structural_edges_input_filename}')
    structural_edges_df = pd.read_csv(structural_edges_input_filename)
    module_logger.info(