Example 1
import os

import pandas as pd

# sql_statements, conn, psql_user and psql_password are assumed to come
# from the surrounding module in the source repo
def create_map_dfs_in_file_structure():
    starting_points = pd.read_csv('subgraphs/starting_points.csv')

    query_temp_table = sql_statements.create_limited_links_temp()
    query_subgraphs = sql_statements.recursive_subgraphs()
    db_conn = conn((psql_user, psql_password))
    db_conn.simple_execute(query_temp_table)

    columns = ['from_url_id', 'to_url_id', 'link_path', 'depth']

    for starting_point in starting_points.starting_points:
        print(f'working on {starting_point}')
        var_dict = {'starting_point': starting_point}
        subgraph = db_conn.query_for_df_w_vars(query_subgraphs, var_dict,
                                               columns)
        # a subgraph with fewer than two edges is not worth writing out
        if len(subgraph.from_url_id.values) < 2:
            print('skipped')
            continue
        depth = max(subgraph.depth)
        breadth = len(subgraph.from_url_id.values)
        directory = f'subgraphs/depth_{depth}/breadth_{breadth}/'
        os.makedirs(directory, exist_ok=True)
        print(f'{directory}starting_point_{starting_point}.csv')
        subgraph.to_csv(f'{directory}starting_point_{starting_point}.csv')
    db_conn.commit()
    db_conn.close()
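Every example here leans on the same small conn wrapper, whose implementation is never shown but whose interface can be inferred from the calls throughout these snippets. A minimal sketch of what it might look like, assuming psycopg2 and a hypothetical database name (both are guesses, not confirmed by the source):

import psycopg2
import pandas as pd

class conn:
    """Hypothetical wrapper matching the calls in these examples."""

    def __init__(self, credentials):
        user, password = credentials
        # 'scraper' is a placeholder database name, not from the source
        self.connection = psycopg2.connect(dbname='scraper', user=user,
                                           password=password)

    def simple_execute(self, query_str):
        with self.connection.cursor() as cur:
            cur.execute(query_str)

    def query_for_all(self, query_str):
        with self.connection.cursor() as cur:
            cur.execute(query_str)
            return cur.fetchall()

    def query_for_all_w_vars(self, query_str, var_dict):
        with self.connection.cursor() as cur:
            cur.execute(query_str, var_dict)
            return cur.fetchall()

    def query_for_df_w_vars(self, query_str, var_dict, columns):
        rows = self.query_for_all_w_vars(query_str, var_dict)
        return pd.DataFrame(rows, columns=columns)

    def insert_into_db_with_vars(self, var_dict, query_str):
        with self.connection.cursor() as cur:
            cur.execute(query_str, var_dict)
        self.connection.commit()

    def commit(self):
        self.connection.commit()

    def close(self):
        self.connection.close()

The point of a wrapper like this is that every query goes through psycopg2's %(name)s parameter binding rather than string formatting, which is consistent with the var_dict usage in the examples.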
Example 2
def check_url_already_seen(url):
    sql_statement = sql_statements.check_url_already_seen()
    var_dict = {'url_raw': url, 'linked': True}
    db_conn = conn((psql_user, psql_password))
    match_count = db_conn.query_for_all_w_vars(sql_statement, var_dict)
    return match_count[0][0] > 0
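The statement returned by sql_statements.check_url_already_seen() is not shown. With psycopg2's named-placeholder style it would plausibly look like this (the urls table and its columns are assumptions):

def check_url_already_seen():
    # hypothetical: the %(url_raw)s / %(linked)s placeholders line up
    # with the keys in var_dict above
    return """
        SELECT COUNT(*)
        FROM urls
        WHERE url_raw = %(url_raw)s
          AND linked = %(linked)s;
    """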
Example 3
def create_node_edge_csvs():
    node_query = sql_statements.all_nodes()
    edge_query = sql_statements.all_edges()
    db_conn = conn((psql_user, psql_password))
    nodes = db_conn.query_for_all(node_query)
    edges = db_conn.query_for_all(edge_query)
    # query_for_all returns plain rows, so wrap them before writing csvs
    pd.DataFrame(nodes).to_csv('data/nodes.csv')
    pd.DataFrame(edges).to_csv('data/edges.csv')
Example 4
def bulk_check_url_already_seen():
    '''
    Had to stop the initial load early; used this to cut down on the
    URLs sent to threading.
    '''
    sql_statement = sql_statements.bulk_check_url_already_seen()
    var_dict = {'linked': True}
    db_conn = conn((psql_user, psql_password))
    seen = db_conn.query_for_all_w_vars(sql_statement, var_dict)
    return {x[0] for x in seen}
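Returning a set makes the follow-up membership tests O(1) instead of a per-URL database round trip. A plausible call site, with a hypothetical worker and URL list (neither appears in the source):

from concurrent.futures import ThreadPoolExecutor

def process_url(url):
    # hypothetical stand-in for the real scraping worker
    print(f'fetching {url}')

candidate_urls = ['http://example.com/a', 'http://example.com/b']  # hypothetical

seen = bulk_check_url_already_seen()
# only dispatch URLs the crawler has not already recorded
todo = [url for url in candidate_urls if url not in seen]
with ThreadPoolExecutor(max_workers=8) as pool:
    list(pool.map(process_url, todo))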
Example 5
def create_starting_points():
    query_starting_points = sql_statements.starting_points()
    db_conn = conn((psql_user, psql_password))
    results = db_conn.query_for_all(query_starting_points)
    starting_points = [record[0] for record in results]
    pd.DataFrame(data=starting_points,
                 columns=['starting_points']).to_csv(
                     'subgraphs/starting_points.csv')
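sql_statements.starting_points() is not shown either. Given that Example 1 feeds each value back as the seed of a recursive subgraph query, it plausibly selects distinct link sources; a sketch under that assumption (table and column names are guesses):

def starting_points():
    # hypothetical: every URL that appears as a link source can seed a subgraph
    return """
        SELECT DISTINCT from_url_id
        FROM links;
    """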
Example 6
def write_to_tables(url, links, root, html_page):

    # add the originating link's html
    sql_statement = sql_statements.update_urls_table_from()

    description = get_description(html_page)

    if root is None:
        root = 'not_available'
    if description is None:
        description = 'not_available'

    var_dict = {'url_raw': url, 'html_raw': html_page,
                'site_description': description, 'linked': True,
                'root_url': root}

    if links is not None and len(links) > 0:
        # append the insert for each outbound link
        sql_temp, var_dict_addition = sql_statements.update_urls_table_tos(links)
        sql_statement += sql_temp
        var_dict.update(var_dict_addition)

    db_conn = conn((psql_user, psql_password))
    db_conn.insert_into_db_with_vars(var_dict, sql_statement)
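The interesting part of this function is that sql_statements.update_urls_table_tos(links) has to generate both SQL text and its parameters, one numbered placeholder per outbound link, so every value still goes through the driver's binding. A sketch of how it might do that (the links table and its columns are assumptions):

def update_urls_table_tos(links):
    # hypothetical: one numbered %(to_url_N)s placeholder per link
    values, var_dict_addition = [], {}
    for i, link in enumerate(links):
        key = f'to_url_{i}'
        values.append(f'(%(url_raw)s, %({key})s)')
        var_dict_addition[key] = link
    sql_temp = ('\nINSERT INTO links (from_url, to_url) VALUES '
                + ', '.join(values) + ';')
    return sql_temp, var_dict_addition

Concatenating this onto the update statement works because psycopg2 will execute multiple semicolon-separated statements in a single execute call.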
Example 7
def get_url_layer():
    sql_statement = sql_statements.get_url_layer()
    db_conn = conn((psql_user, psql_password))
    links = db_conn.query_for_all(sql_statement)
    return [link[0] for link in links]