### NOTE(review): the line below holds a whole script fragment collapsed onto
### one physical line. As stored, only `from bs4 import BeautifulSoup` is code;
### everything after the first `###` is parsed by Python as a comment.
### TODO confirm whether the line breaks were lost in the file itself or only
### in extraction before relying on any of it.
###
### Read as the script it appears to have been, the text does the following,
### in order:
###   * loads `config` via parserfunctions.load_config(), derives
###     `homepages_dir` from `pubshort`, and compiles `pattern` into
###     `link_pattern`;
###   * unpacks conn, cur, mysql_table_name and mysql_log_name from
###     parserfunctions.create_mysql_conn(config);
###   * calls parserfunctions.create_success_dir(pubshort, homepages_dir,
###     move_on_success);
###   * gets `file_list` and `file_list_len` from
###     parserfunctions.get_file_list(pubshort, homepages_dir);
###   * loops over `file_list`, printing an "Opening file ..." progress message
###     with a hand-maintained 1-based counter `i`, then rebinds the per-file
###     working variables to None;
###   * ends on a `try:` whose body is not present on this line.
### `parserfunctions`, `re`, `pubshort`, `pattern` and `move_on_success` are not
### defined on this line; presumably they come from elsewhere in the file.
from bs4 import BeautifulSoup ### Grab the information from our configuration file config = parserfunctions.load_config() homepages_dir = parserfunctions.homepages_dir(pubshort) link_pattern = re.compile(pattern) ### Establish our MySQL Connection (for logging, etc.) conn, cur, mysql_table_name, mysql_log_name = parserfunctions.create_mysql_conn( config) ### Create directory for success, if appropriate parserfunctions.create_success_dir(pubshort, homepages_dir, move_on_success) ### Get list of files to parse file_list, file_list_len = parserfunctions.get_file_list( pubshort, homepages_dir) i = 1 ### For each desktop homepage for homepage in file_list: print("Opening file %s (%s of %s for %s)" % (homepage, i, file_list_len, pubshort)) i += 1 ### Reset key variables curr_time, curr_time_utc, document_data, document_soup, document_soup_on_page, insert_statements, is_pop, link, message, mostviewed_linklist, on_page, on_page_link_list, on_page_link_list_tmp, pop_rank, pop_top_5_links, seriousness = [ None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None ] try:
### NOTE(review): the line below holds a whole script fragment collapsed onto
### one physical line. Because it begins with `###`, Python parses the entire
### line as a comment and nothing on it executes as stored.
### TODO confirm whether the line breaks were lost in the file itself or only
### in extraction before relying on any of it.
###
### Read as the script it appears to have been, the text does the following,
### in order:
###   * loads `config` via parserfunctions.load_config(), derives
###     `homepages_dir` from `pubshort`, and compiles `pattern` into
###     `link_pattern`;
###   * unpacks conn, cur, mysql_table_name and mysql_log_name from
###     parserfunctions.create_mysql_conn(config);
###   * calls parserfunctions.create_success_dir(pubshort, homepages_dir,
###     move_on_success);
###   * under `if process_desktop is not None:`, gets `file_list` and
###     `file_list_len` from parserfunctions.get_file_list(pubshort,
###     homepages_dir);
###   * loops over `file_list`, printing an "Opening file ..." progress message
###     with a hand-maintained 1-based counter `i`, then rebinds the per-file
###     working variables (including is_pro, layout, pro_rank, pro_top_5_links
###     and prominence_linklist) to None;
###   * inside `try:`, sets curr_time and curr_time_utc from
###     parserfunctions.get_curr_time(homepage, pub_tz) and document_data from
###     parserfunctions.open_data_file(homepage);
###   * ends on a bare `except:` whose handler body is not present on this
###     line. A bare `except:` also catches SystemExit and KeyboardInterrupt.
### `parserfunctions`, `re`, `pubshort`, `pattern`, `move_on_success`,
### `process_desktop` and `pub_tz` are not defined on this line; presumably
### they come from elsewhere in the file.
### Grab the information from our configuration file config = parserfunctions.load_config() homepages_dir = parserfunctions.homepages_dir(pubshort) link_pattern = re.compile(pattern) ### Establish our MySQL Connection (for logging, etc.) conn, cur, mysql_table_name, mysql_log_name = parserfunctions.create_mysql_conn(config) ### Create directory for success, if appropriate parserfunctions.create_success_dir(pubshort, homepages_dir, move_on_success) ########### Parse Desktop Pages if process_desktop is not None: ### Get list of files to parse file_list, file_list_len = parserfunctions.get_file_list(pubshort, homepages_dir) i = 1 ### For each desktop homepage for homepage in file_list: print("Opening file %s (%s of %s for %s)" % (homepage, i, file_list_len, pubshort)) i += 1 ### Reset key variables curr_time, curr_time_utc, document_data, document_soup, document_soup_on_page, insert_statements, is_pop, is_pro, layout, link, message, mostviewed_linklist, on_page, on_page_link_list, on_page_link_list_tmp, pop_rank, pop_top_5_links, pro_rank, pro_top_5_links, prominence_linklist, seriousness = [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None] try: ### Get data from file curr_time, curr_time_utc = parserfunctions.get_curr_time(homepage, pub_tz) document_data = parserfunctions.open_data_file(homepage) except: