def prep_firefox(args, logger):
    """Checks if a supported version of firefox is installed.

    Exits with an error message if it isn't. Finally creates a directory to
    store firefox profiles.
    """
    utils.installed('firefox', logger)
    logger.info("Checking Firefox version compatibility")
    fh = open("firefox_template/extensions/[email protected]/" +
              "install.rdf", "r")
    data = fh.read()
    fh.close()
    match = re.search(r'maxVersion="((\d+\.\d+)\.?.*)"', data)
    if match:
        crawler_ff_version = float(match.group(2))
        logger.debug("Expected maximum firefox version: %s" %
                     crawler_ff_version)
        ff_version_output = subprocess.Popen(
            ["firefox", "--version"],
            stdout=subprocess.PIPE).communicate()[0]
        match = re.search(r'Mozilla Firefox (\d+\.\d+).*', ff_version_output)
        if match:
            system_ff_version = float(match.group(1))
            logger.debug("System's firefox version: %s" % system_ff_version)
            if system_ff_version > crawler_ff_version:
                logger.critical(
                    ("Crawler only supports Firefox up to %.1f. \n" +
                     "The crawler extension needs to be updated. \n" +
                     "Updating the maxVersion in the install.rdf file in \n" +
                     "[email protected] to the system firefox version \n" +
                     "might work. \nExiting.") % crawler_ff_version)
                exit(1)
    # Create tmp directory for storing firefox profiles.
    profile_dir = os.path.join(utils.default_tmp_location(),
                               config.PROFILE_DIR)
    if not os.path.exists(profile_dir):
        logger.info("Creating directory for firefox profiles")
        os.makedirs(profile_dir)
    else:
        logger.error("Firefox profile directory already exists. Something's " +
                     "wrong. Please file a bug.")
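# Note on the maxVersion parsing in prep_firefox() above. The sample value
# "45.0.2" is illustrative only, not taken from the crawler's install.rdf:
#
#     re.search(r'maxVersion="((\d+\.\d+)\.?.*)"', 'em:maxVersion="45.0.2"')
#
#     group(1) -> '45.0.2'  (full version string)
#     group(2) -> '45.0'    (major.minor part that is compared as a float)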
def setup_args():
    """Creates all the arguments for the code along with the defaults.

    Returns the parsed arguments for use.
    """
    # Get a default for log directory.
    default_log_dir = os.path.join(utils.default_tmp_location(),
                                   config.LOG_DIR)
    description = ('This script instruments a browser to collect information' +
                   ' about a web page.')
    usage = 'usage: %prog [options]'
    parser = optparse.OptionParser(description=description, usage=usage,
                                   version='%prog 0.1')
    parser.add_option(
        '-x', '--no-xvfb', action='store_true', default=False,
        help='Give this option to turn XVFB (X Virtual Frame Buffer) \n' +
             'off. It is useful for debugging. XVFB is enabled by \n' +
             'default. Crawler uses XVFB default screen 0 and display:99.')
    # Output Directories
    parser.add_option(
        '--screenshot-dir',
        help='Base directory for storing screenshots. The crawler \n' +
             'determines the directory and filename based on this \n' +
             'option combined with the "screenshot" specification in \n' +
             'the visit request in the input file. \n' +
             'Directory to save the file in: \n' +
             '------------------------------ \n' +
             'The optional --screenshot-dir argument specifies the \n' +
             'base directory to store all the screenshots in. The path \n' +
             'in the input file is appended to this base dir path to form \n' +
             'the full directory path. Any directories that do not \n' +
             'already exist in the path will be automatically created. \n' +
             'For example, if the crawler is started with: \n' +
             '    python run.py --screenshot-dir /hdfs/pic \n' +
             'and a visit has the entry: \n' +
             '    "screenshot": "nchachra/recrawling/pic1.png" \n' +
             'the file is stored as: \n' +
             '    /hdfs/pic/nchachra/recrawling/pic1.png \n' +
             'Alternatively, if the --screenshot-dir argument is missing, \n' +
             'the file is stored in the current directory as: \n' +
             '    ./nchachra/recrawling/pic1.png \n' +
             'If the visit specified the path as: \n' +
             '    "screenshot": "/nchachra/recrawling/pic1.png" \n' +
             'and the --screenshot-dir argument is missing, the file \n' +
             'is stored as /nchachra/recrawling/pic1.png \n' +
             'If both the --screenshot-dir argument and the per-visit \n' +
             'screenshot path are missing: \n' +
             '    "screenshot": "" \n' +
             'the file is stored in the current directory. \n' +
             'Filename to save the file with: \n' +
             '------------------------------- \n' +
             'If the full path formed above ends with ".png", the \n' +
             'file is stored with the user-specified name. For example, \n' +
             'for the path /hdfs/pic/nchachra/recrawling/pic1.png, \n' +
             'the file is stored as pic1.png. \n' +
             'If the file extension is not ".png", the path is assumed \n' +
             'to be all directories. For example, if the .png is \n' +
             'omitted above: /hdfs/pic/nchachra/recrawling/pic1, the \n' +
             'file is stored in the pic1 directory as <md5>.png \n' +
             'If a filename is not specified, the file is named with \n' +
             'its md5 hash. \n\n' +
             'The mapping of id->filename can be found \n' +
             'in the optional visit-chain file. Note that it is the \n' +
             'user\'s responsibility to maintain unique url-ids for \n' +
             'unique id->filename mappings.')
    parser.add_option(
        '--dom-dir',
        help='Directory for storing DOMs. Works like the --screenshot-dir \n' +
             'argument, except the file extensions are .html')
    parser.add_option(
        '--visit-chain-dir',
        help='Directory for storing the visit chains. A visit chain \n' +
             'consists of all the URLs encountered in the visit, \n' +
             'their headers, server addresses, dom files and \n' +
             'file mapping, the node the URL was crawled from, \n' +
             'the timestamp and the proxy used. The argument works like \n' +
             'the --screenshot-dir argument, except the file extension is \n' +
             '.json. \n' +
             'NOTE: If valid screenshot/dom filenames are not provided \n' +
             'in the input files, then the files will be saved as their \n' +
             '<md5>.extension. If visit chains are not being saved, the \n' +
             'mapping of feature:filename will be lost.')
    parser.add_option(
        '-i', '--input-file', action='append',
        help='Input file/directory containing URLs. Either specify \n' +
             'any number of input files or a single input directory \n' +
             'containing the input files. For example: \n' +
             '    python run.py -i input1.json -i input2.json \n' +
             'or \n' +
             '    python run.py -i /path/to/input/directory \n')
    parser.add_option(
        '-n', '--num_browser', type=int,
        help='Maximum number of browser instances to run in parallel. \n' +
             'A single browser instance visits only a single URL at a \n' +
             'time.')
    parser.add_option(
        '--ext-start-port', default=4000, type=int,
        help='This script communicates with the Firefox extension \n' +
             'over TCP sockets. <num-browser> ports, starting from \n' +
             'this one, will be used if Firefox is used. \n' +
             'Default: %default')
    parser.add_option(
        '--restart-browser', action='store_true', default=False,
        help='Giving this argument forces a browser restart for every \n' +
             'visit. While this will provide sanity, the browser \n' +
             'typically takes up to 5 seconds to be set up, so for \n' +
             'efficiency this option is disabled by default.')
    parser.add_option(
        '-b', '--browser', choices=['Firefox'], default='Firefox',
        help='Browser to be used. Default: %default')
    parser.add_option(
        '--proxy-file',
        help='Proxy file containing lists of the form \n' +
             '[host, port, type]. See proxy_sample.json for an example. \n' +
             'The crawler does not set up access to the proxy; that must \n' +
             'be done by the user. One can also directly specify the proxy \n' +
             'ip and port in the input file, on a per-url basis, in which \n' +
             'case the input file options are given preference.')
    parser.add_option(
        '--proxy-scheme', choices=['round-robin'], default='round-robin',
        help='If the proxy information is not specified in the input \n' +
             'for every URL, a general policy can be used. \n' +
             'Default: %default')
    parser.add_option(
        '-v', '--verbosity', default='WARNING',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Anything at this log level and above will be logged. \n' +
             'CRITICAL > INFO. Default: %default')
    parser.add_option(
        '--log-dir', default=default_log_dir,
        help='Logs are stored in this directory. \n' +
             'Default: %default')
    parser.add_option(
        '--suppress-stdout', action='store_true', default=False,
        help='By default the logs are shown on stdout and sent to \n' +
             'the log files. Use this option to suppress output on stdout.')
    parser.add_option(
        '--tags-file',
        help='JSON file with dictionaries of tags: \n' +
             '{ \n' +
             '    "tag_name_1": { \n' +
             '        "threshold": some_int, \n' +
             '        "regexes": [regex1, regex2, ...] \n' +
             '    } \n' +
             '} \n' +
             'See sample_tags.json for an example. <tag_name> is applied \n' +
             'if <threshold> number of <regexes> match. \n' +
             'Note that only a single tagging file can be supplied. As \n' +
             'long as this argument is used, all pages will be tagged, \n' +
             'including URLs for which the user has not requested DOMs.')
    '''
    parser.add_option(
        '-r', '--report-recipient',
        help='Email address to send the crawl summary to. The summary is \n' +
             'also saved in the log folder.')
    parser.add_option(
        '--proxy-url',
        help='Same as --proxy-file except the file will be fetched from \n' +
             'the URL when the crawler reboots. This is ideal if a service \n' +
             'maintains a list of available proxies that is refreshed \n' +
             'frequently.')
    parser.add_option(
        '-e', '--email_errors',
        help='Email the errors that cause the crawler to crash to this \n' +
             'recipient. The crash summary is also saved in the log folder.')
    '''
    (options, args) = parser.parse_args()
    if options.tags_file:
        options.tags_file = open(options.tags_file)
    return options
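# Illustrative sketch (hypothetical helper, not part of the crawler) of the
# screenshot path resolution described in the --screenshot-dir help text
# above: the per-visit path is joined to the optional base directory, and a
# path that does not end in ".png" is treated as a directory with the file
# named by its md5 hash.
def resolve_screenshot_path(base_dir, visit_path, md5):
    full_path = os.path.join(base_dir or '.', visit_path)
    if not full_path.endswith('.png'):
        # No user-chosen filename; fall back to <md5>.png inside the path.
        full_path = os.path.join(full_path, md5 + '.png')
    return full_path

# Example (matches the --screenshot-dir help text):
#   resolve_screenshot_path('/hdfs/pic', 'nchachra/recrawling/pic1.png', 'abc')
#     -> '/hdfs/pic/nchachra/recrawling/pic1.png'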
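# Minimal usage sketch (assumed entry point, not from the source): parse the
# options, configure an illustrative logger, and run the Firefox preparation
# step. The real script presumably also handles XVFB, input files, and the
# browser pool elsewhere; the logger name below is hypothetical.
if __name__ == '__main__':
    import logging

    options = setup_args()
    logging.basicConfig(level=getattr(logging, options.verbosity))
    logger = logging.getLogger('crawler')  # hypothetical logger name
    prep_firefox(options, logger)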