def create_map_dfs_in_file_structure():
    starting_points = pd.read_csv('subgraphs/starting_points.csv')
    query_temp_table = sql_statements.create_limited_links_temp()
    query_subgraphs = sql_statements.recursive_subgraphs()
    db_conn = conn((psql_user, psql_password))
    # build the limited-links temp table that the recursive query reads from
    db_conn.simple_execute(query_temp_table)
    columns = ['from_url_id', 'to_url_id', 'link_path', 'depth']
    for starting_point in starting_points.starting_points:
        print(f'working on {starting_point}')
        var_dict = {'starting_point': starting_point}
        subgraph = db_conn.query_for_df_w_vars(query_subgraphs, var_dict, columns)
        # skip trivial subgraphs with fewer than two edges
        if len(subgraph.from_url_id.values) < 2:
            print('skipped')
            continue
        depth = max(subgraph.depth)
        breadth = len(subgraph.from_url_id.values)
        # bucket each subgraph CSV by its depth and breadth
        directory = f'subgraphs/depth_{depth}/breadth_{breadth}/'
        if not os.path.exists(directory):
            os.makedirs(directory)
        print(f'{directory}starting_point_{starting_point}.csv')
        subgraph.to_csv(f'{directory}starting_point_{starting_point}.csv')
    db_conn.commit()
    db_conn.close()
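# A minimal sketch of the recursive CTE that sql_statements.recursive_subgraphs()
# could return. The limited_links table name and its columns are assumptions for
# illustration; only the %(starting_point)s placeholder and the four output
# columns are taken from the function above.
EXAMPLE_RECURSIVE_SUBGRAPHS = """
WITH RECURSIVE subgraph AS (
    SELECT from_url_id,
           to_url_id,
           ARRAY[from_url_id, to_url_id] AS link_path,
           1 AS depth
      FROM limited_links
     WHERE from_url_id = %(starting_point)s
    UNION ALL
    SELECT ll.from_url_id,
           ll.to_url_id,
           s.link_path || ll.to_url_id,
           s.depth + 1
      FROM limited_links ll
      JOIN subgraph s ON ll.from_url_id = s.to_url_id
     WHERE NOT ll.to_url_id = ANY(s.link_path)  -- stop on cycles
)
SELECT from_url_id, to_url_id, link_path, depth FROM subgraph;
"""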
def check_url_already_seen(url):
    sql_statement = sql_statements.check_url_already_seen()
    var_dict = {'url_raw': url, 'linked': True}
    db_conn = conn((psql_user, psql_password))
    match_count = db_conn.query_for_all_w_vars(sql_statement, var_dict)
    return match_count[0][0] > 0
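# One plausible shape for sql_statements.check_url_already_seen(), assuming a
# urls table with url_raw and linked columns (both names are assumptions). The
# pyformat placeholders match the var_dict keys used above.
EXAMPLE_CHECK_URL_ALREADY_SEEN = """
SELECT COUNT(*)
  FROM urls
 WHERE url_raw = %(url_raw)s
   AND linked = %(linked)s;
"""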
def create_node_edge_csvs():
    node_query = sql_statements.all_nodes()
    edge_query = sql_statements.all_edges()
    db_conn = conn((psql_user, psql_password))
    # query_for_all returns raw record tuples, so wrap them in DataFrames
    # before writing
    nodes = pd.DataFrame(db_conn.query_for_all(node_query))
    edges = pd.DataFrame(db_conn.query_for_all(edge_query))
    nodes.to_csv('data/nodes.csv')
    edges.to_csv('data/edges.csv')
def bulk_check_url_already_seen():
    '''
    Had to stop the initial load early; used this to cut down on the URLs
    sent to threading.
    '''
    sql_statement = sql_statements.bulk_check_url_already_seen()
    var_dict = {'linked': True}
    db_conn = conn((psql_user, psql_password))
    seen = db_conn.query_for_all_w_vars(sql_statement, var_dict)
    # flatten the single-column records into a set for O(1) membership checks
    seen = {record[0] for record in seen}
    return seen
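# Example use: pull the seen set once, then filter candidates locally instead
# of issuing one check_url_already_seen() query per URL. candidate_urls is a
# hypothetical list of URLs queued for crawling.
#
#     seen = bulk_check_url_already_seen()
#     unseen = [url for url in candidate_urls if url not in seen]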
def create_starting_points():
    query_starting_points = sql_statements.starting_points()
    db_conn = conn((psql_user, psql_password))
    results = db_conn.query_for_all(query_starting_points)
    starting_points = [record[0] for record in results]
    pd.DataFrame(data=starting_points, columns=['starting_points']).to_csv(
        'subgraphs/starting_points.csv')
def write_to_tables(url, links, root, html_page):
    # add the originating page's html
    sql_statement = sql_statements.update_urls_table_from()
    description = get_description(html_page)
    if root is None:
        root = 'not_available'
    if description is None:
        description = 'not_available'
    var_dict = {'url_raw': url,
                'html_raw': html_page,
                'site_description': description,
                'linked': True,
                'root_url': root}
    if links is not None and len(links) > 0:
        # append the outbound links to the same statement so the page and its
        # links are written together
        sql_temp, var_dict_addition = sql_statements.update_urls_table_tos(links)
        sql_statement += sql_temp
        var_dict.update(var_dict_addition)
    db_conn = conn((psql_user, psql_password))
    db_conn.insert_into_db_with_vars(var_dict, sql_statement)
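# A minimal sketch of what sql_statements.update_urls_table_tos(links) might
# build: one parametrized row per outbound link plus the matching var_dict
# entries. The links table, its columns, and the to_url_{i} key scheme are
# assumptions for illustration only.
def example_update_urls_table_tos(links):
    fragments, var_dict_addition = [], {}
    for i, link in enumerate(links):
        key = f'to_url_{i}'
        fragments.append(f'(%(url_raw)s, %({key})s)')
        var_dict_addition[key] = link
    sql_temp = ('\nINSERT INTO links (from_url, to_url)\nVALUES '
                + ', '.join(fragments) + ';')
    return sql_temp, var_dict_addition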
def get_url_layer():
    sql_statement = sql_statements.get_url_layer()
    db_conn = conn((psql_user, psql_password))
    links = db_conn.query_for_all(sql_statement)
    links = [link[0] for link in links]
    return links
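# A rough, assumed calling order for these helpers (not a documented pipeline):
# crawl pages into the database with write_to_tables(), then derive the graph
# artifacts once the load is done.
if __name__ == '__main__':
    create_starting_points()            # seed URLs -> subgraphs/starting_points.csv
    create_map_dfs_in_file_structure()  # one CSV per subgraph, bucketed by depth/breadth
    create_node_edge_csvs()             # full graph -> data/nodes.csv and data/edges.csv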