########### Set variables

### Set publication-specific variables

# Short name for the publication
pubshort = "cpd"

# Do we need to get the list of most viewed items from another page?
# (None = No, anything else = Yes)
pubmv_external = None

# Pattern for defining what is and what is not a news item.
# Written as a raw string so the regex escape `\d` is passed to `re` verbatim
# instead of relying on Python leaving an unrecognized string escape alone
# (which emits a SyntaxWarning on recent Python versions).
# NOTE(review): the "." before "html" is unescaped, so it matches any
# character, not just a literal dot -- left as-is to preserve matching.
pattern = r"/(\d+)/(\d+)/(.+).html?|/news/article/"

# Timezone the publication is in
pub_tz = "US/Eastern"

# Do we want to move files on success? (None = No, anything else = Yes)
move_on_success = None

# Directory for storing successful files (as subdirectory of data directory)
success_dir = "success/"

########### Load libraries
import parserfunctions
import re
from bs4 import BeautifulSoup

### Grab the information from our configuration file
config = parserfunctions.load_config()
homepages_dir = parserfunctions.homepages_dir(pubshort)
link_pattern = re.compile(pattern)

### Establish our MySQL Connection (for logging, etc.)
conn, cur, mysql_table_name, mysql_log_name = parserfunctions.create_mysql_conn(
    config)

### Create directory for success, if appropriate
parserfunctions.create_success_dir(pubshort, homepages_dir, move_on_success)

### Get list of files to parse
file_list, file_list_len = parserfunctions.get_file_list(
    pubshort, homepages_dir)

# Presumably a 1-based counter for the file-processing loop that follows
# outside this view -- confirm against the rest of the script.
i = 1
### Set publication-specific variables

# Short name for the publication
pubshort = "tst"

# Do we need to get the list of most viewed items from another page?
# (None = No, anything else = Yes)
pubmv_external = None

# Link pattern for actual articles.
# Written as a raw string so the regex escapes (`\d`, `\.`) are passed to
# `re` verbatim instead of relying on Python leaving unrecognized string
# escapes alone (which emits a SyntaxWarning on recent Python versions).
# The alternatives are split across adjacent literals for readability only;
# they concatenate to the same single pattern string.
pattern = (
    r"/(.+)/(\d+)_(.+).html?"
    r"|/(.+)/(\d+)/(\d+)/"
    r"|projects\.(.+\.com)/(\d+)/"
    r"|video\.(.+\.com)/(\d+)"
    r"|/(.+)/photogalleries/(.+)(\d+)"
    r"|/gamecenter/"
)

# Timezone the publication is in
pub_tz = "US/Pacific"

# Do we want to process the desktop pages? (None = No, anything else = Yes)
process_desktop = 1

# Do we want to move files on success? (None = No, anything else = Yes)
move_on_success = None

# Directory for storing successful files (as subdirectory of data directory)
success_dir = "success/"

########### Load libraries
import parserfunctions
import re
from bs4 import BeautifulSoup

### Grab the information from our configuration file
config = parserfunctions.load_config()
homepages_dir = parserfunctions.homepages_dir(pubshort)
link_pattern = re.compile(pattern)

### Establish our MySQL Connection (for logging, etc.)
conn, cur, mysql_table_name, mysql_log_name = parserfunctions.create_mysql_conn(config)

### Create directory for success, if appropriate
parserfunctions.create_success_dir(pubshort, homepages_dir, move_on_success)

########### Parse Desktop Pages
if process_desktop is not None:
    ### Get list of files to parse
    file_list, file_list_len = parserfunctions.get_file_list(pubshort, homepages_dir)

    # Presumably a 1-based counter for the file-processing loop that follows
    # outside this view -- confirm against the rest of the script.
    i = 1