Example #1
0
def prep_firefox(args, logger):
    """Checks if a supported version of firefox is installed. Exits with 
    error message if it isn't. Finally creates a directory to store
    firefox profiles.
    """
    utils.installed('firefox', logger)
    logger.info("Checking Firefox version compatibility")
    with open("firefox_template/extensions/[email protected]/"
              "install.rdf", "r") as fh:
        data = fh.read()
    match = re.search(r'maxVersion="((\d+\.\d+)\.?.*)"', data)
    if not match:
        logger.critical(
            "Could not find maxVersion in install.rdf. Exiting.")
        exit(1)
    crawler_ff_version = float(match.group(2))
    logger.debug("Expected maximum firefox version: %s" %
                 crawler_ff_version)
    ff_version_output = subprocess.Popen(
        ["firefox", "--version"], stdout=subprocess.PIPE).communicate()[0]
    match = re.search(r'Mozilla Firefox (\d+\.\d+).*', ff_version_output)
    if not match:
        logger.critical(
            "Could not determine the system firefox version. Exiting.")
        exit(1)
    system_ff_version = float(match.group(1))
    logger.debug("System's firefox version: %s" % system_ff_version)
    if system_ff_version > crawler_ff_version:
        logger.critical(
            ("Crawler only supports Firefox up to \n" +
             "%.1f. The crawler extension needs to be updated. \n" +
             "Updating the maxVersion in install.rdf file in \n" +
             "[email protected] to the system firefox version \n" +
             "might work. \nExiting.") % crawler_ff_version)
        exit(1)
    # Create tmp directory for storing firefox profiles.
    profile_dir = os.path.join(utils.default_tmp_location(),
                               config.PROFILE_DIR)
    if not os.path.exists(profile_dir):
        logger.info("Creating directory for firefox profiles")
        os.makedirs(profile_dir)
    else:
        logger.error("Firefox profile directory already exists. Something's" +
                     "wrong. Please file a bug.")
Example #2
0
def prep_firefox(args, logger):
    """Checks if a supported version of firefox is installed. Exits with 
    error message if it isn't. Finally creates a directory to store
    firefox profiles.
    """
    utils.installed('firefox', logger)
    logger.info("Checking Firefox version compatibility")
    fh = open("firefox_template/extensions/[email protected]/" +
                  "install.rdf", "r")
    data = fh.read()
    fh.close()
    match = re.search(r'maxVersion="((\d+\.\d+)\.?.*)"', data)
    if not match:
        logger.critical(
            "Could not find maxVersion in install.rdf. Exiting.")
        exit(1)
    crawler_ff_version = float(match.group(2))
    logger.debug("Expected maximum firefox version: %s"
                     % crawler_ff_version)
    ff_version_output = subprocess.Popen(["firefox", "--version"], 
                                    stdout=subprocess.PIPE).communicate()[0]
    match = re.search('Mozilla Firefox (\d+\.\d+).*', ff_version_output)
    if match:
        system_ff_version = float(match.group(1))
        logger.debug("System's firefox version: %s" % system_ff_version)
    if system_ff_version > crawler_ff_version:
        logger.critical(("Crawler only supports Firefox up to \n" +
                   "%.1f. The crawler extension needs to be updated. \n"+ 
                   "Updating the maxVersion in install.rdf file in \n" +
                   "[email protected] to the system firefox version \n" +
                   "might work. \nExiting.") % crawler_ff_version)
        exit(1)
    # Create tmp directory for storing firefox profiles.
    profile_dir = os.path.join(utils.default_tmp_location(), 
                               config.PROFILE_DIR)
    if not os.path.exists(profile_dir):
        logger.info("Creating directory for firefox profiles")
        os.makedirs(profile_dir)
    else:
        logger.error("Firefox profile directory already exists. Something's"+
                     "wrong. Please file a bug.")
Example #3
0
def setup_args():
    """Creates all the arguments for the code alongwith the defaults. Returns
    the parsed arguments for use.
    """
    # Get a default for log directory.
    default_log_dir = os.path.join(utils.default_tmp_location(),
                                   config.LOG_DIR)
    description = ('This script instruments a browser to collect information' +
                   ' about a web page.')
    usage = 'usage: %prog [options]'
    parser = optparse.OptionParser(description=description,
                                   usage=usage,
                                   version='%prog 0.1')
    parser.add_option(
        '-x',
        '--no-xvfb',
        action='store_true',
        default=False,
        help='Give this option to turn XVFB (X Virtual Frame Buffer) \n' +
        'off. It is useful for debugging. XVFB is enabled by \n' +
        'default. Crawler uses XVFB default screen 0 and display:99.')
    # Output Directories
    parser.add_option(
        '--screenshot-dir',
        help='Base directory for storing screenshots. The crawler \n' +
        'determines the directory and filename based on this \n' +
        'option combined with the "screenshot" specification in \n' +
        'visit request in the input file. \n' +
        'Directory to save file in: \n' + '------------------------- \n' +
        'The optional --screenshot-dir argument specifies the \n' +
        'base directory to store all the screenshots in. The path \n' +
        'in input file is appended to this base dir path to form \n' +
        'the full directory path. Any directories that do not \n' +
        'already exist in the path will be automatically created. \n' +
        'For example, if the crawler is started with: \n' +
        '    python run.py --screenshot-dir /hdfs/pic \n' +
        'and a visit has entry: \n' +
        '    "screenshot": "nchachra/recrawling/pic1.png" \n' +
        'the file is stored as: \n' +
        '     /hdfs/pic/nchachra/recrawling/pic1.png \n' +
        'Alternatively, if --screenshot-dir argument is missing, \n' +
        'the file is stored in the current directory as, \n' +
        '    ./nchachra/recrawling/pic1.png \n' +
        'If the visit specified the path as: \n' +
        '    "screenshot": "/nchachra/recawling/pic1.png" \n' +
        'and the --screenshot-dir argument is missing, the file \n' +
        'is stored as /nchachra/recrawling/pic1.png \n' +
        'If both the --screenshot-dir argument, and per visit \n' +
        'screenshot file is missing: \n' + '    "screenshot":"" \n' +
        'the file is stored in the current directory. \n' +
        'Filename to save file with: \n' + '--------------------------- \n' +
        'If the full-path formed above ends with ".png", the \n' +
        'file is stored with the user-specified name. For example \n' +
        'for the path /hdfs/pic/nchachra/recrawling/pic1.png, \n ' +
        'the file is stored as: pic1.png. \n' +
        'If the file extension is not ".png", the path is assumed \n' +
        'to be all directories. For example, if the .png is \n' +
        'omitted above: /hdfs/pic/nchachra/recrawling/pic1, the \n' +
        'file is stored in pic1 directory as <md5>.png \n' +
        'If a filename is not specified, the file is named with \n' +
        'its md5 hash. \n\n' + 'The mapping of id->filename can be found \n' +
        'in the optional visit-chain file. Note that it is the \n' +
        'user\'s responsibility to maintain unique url-ids for \n' +
        'unique id->filename mappings.')
    parser.add_option(
        '--dom-dir',
        help='Directory for storing DOMs. Works like --screenshot-dir \n' +
        'argument, except the file extensions are .html')
    parser.add_option(
        '--visit-chain-dir',
        help='Directory for storing the visit chains. A visit chain \n' +
        'consists of all the URLs encountered in the visit, \n' +
        'their headers, server addresses, dom files and \n' +
        'file mapping, the node the URL was crawled from, \n' +
        'the timestamp and proxy used. The argument works like \n' +
        '--screenshot-dir argument, except the file extension is \n' +
        '.json. \n ' +
        'NOTE: If valid screenshot/dom filenames are not provided \n' +
        'in the input files, then the files will be saved as their\n' +
        ' md5.extension. If visit chains are not being saved, the \n' +
        'mapping of feature:filename will be lost.')
    parser.add_option(
        '-i',
        '--input-file',
        action='append',
        help='Input file/directory containing URLs. Either specify \n' +
        'any number of input files or a single input directory \n' +
        'containing the input files. For example: \n' +
        '    python run.py -i input1.json -i input2.json \n' + 'or \n' +
        '    python run.py -i /path/to/input/directory \n')
    parser.add_option(
        '-n',
        '--num_browser',
        type=int,
        help='Maximum number of browser instances to run in parallel. \n' +
        'A single browser instance visits only a single URL at a \n' + 'time.')
    parser.add_option(
        '--ext-start-port',
        default=4000,
        type=int,
        help='This script communicates with Firefox extension \n' +
        'over TCP sockets. <num-browser> number of ports, \n' +
        'starting from this one will be used, if Firefox is used.\n' +
        'Default value is %default.')
    parser.add_option(
        '--restart-browser',
        action='store_true',
        default=False,
        help='Giving this argument forces a browser restart for every \n' +
        'visit. While this gives each visit a fresh browser, setup \n' +
        'typically takes up to 5 seconds, so for efficiency this \n' +
        'option is disabled by default.')
    parser.add_option('-b',
                      '--browser',
                      choices=['Firefox'],
                      help='Browser to be used. Default: %default.',
                      default='Firefox')
    parser.add_option(
        '--proxy-file',
        help='Proxy file contains lists of the form \n' +
        '[host, port, type]. See proxy_sample.json for an example. The \n' +
        'crawler does not set up access to the proxy; that must be \n' +
        'done by the user. One can also directly specify the proxy IP \n' +
        'and port in the input file, on a per-URL basis, in which case \n' +
        'the input file options take precedence.')
    parser.add_option(
        '--proxy-scheme',
        choices=['round-robin'],
        default='round-robin',
        help='If the proxy information is not specified in the input \n' +
        'for every URL, a general policy can be used. \n' +
        'Default: %default.')
    parser.add_option(
        '-v',
        '--verbosity',
        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Anything at this log level and above will be logged. \n' +
        'CRITICAL > INFO. Default: %default.',
        default='WARNING')
    parser.add_option('--log-dir',
                      default=default_log_dir,
                      help='Logs are stored in this directory. \n' +
                      'Default: %default.')
    parser.add_option(
        '--suppress-stdout',
        action='store_true',
        default=False,
        help="By default the logs are shown on stdout and sent to the \n" +
        "the log files. Use this option to suppress output on stdout.")
    parser.add_option(
        '--tags-file',
        help="JSON file with dictionaries of tags:\n" + '{ \n' +
        '    "tag_name_1": { \n' +
        '                       "threshold": some_int, \n' +
        '                       "regexes": [regex1, regex2,...]\n' +
        '                  }\n' + '}\n' +
        'See sample_tags.json for example. <tag_name> is applied \n' +
        'if <threshold> number of <regexes> match.\n' +
        'Note that only a single tagging file can be supplied. As \n' +
        'long as this argument is used, all pages will be tagged. \n' +
        'This includes URLs for which the user has not requested DOMs.')
    '''
    parser.add_option('-r', '--report-recipient', 
            help='Email address to send crawl summary. The summary is \n' +
            'also saved in the log folder.')
    parser.add_option('--proxy-url', help='Same as --proxy-file except the \
            file will be fetched from the URL when the crawler reboots. This is\
            ideal if a service maintains a list of available proxies that is \
            refreshed frequently.')
    parser.add_option('-e', '--email_errors', help='Email the errors that \
            cause the crawler to crash to this recipient. The crash summary is\
            also saved in log folder.')
    '''
    (options, args) = parser.parse_args()
    if options.tags_file:
        options.tags_file = open(options.tags_file)
    return options
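The --screenshot-dir help above spells out a path-resolution scheme: the
optional base directory is joined with the per-visit path, and the filename
falls back to an md5-based name when the path does not end in ".png". Below is
a minimal sketch of that scheme using a hypothetical resolve_screenshot_path
helper that is not part of the crawler; the md5 value is a placeholder:

import os


def resolve_screenshot_path(visit_path, base_dir=None, md5_hex="<md5>"):
    # Hypothetical helper mirroring the rules described in the
    # --screenshot-dir help; not part of the crawler itself.
    full_path = os.path.join(base_dir, visit_path) if base_dir else visit_path
    if full_path.endswith('.png'):
        return full_path  # user-specified filename is kept
    return os.path.join(full_path, md5_hex + '.png')  # md5-named file


# The examples from the help text:
print(resolve_screenshot_path("nchachra/recrawling/pic1.png", "/hdfs/pic"))
# -> /hdfs/pic/nchachra/recrawling/pic1.png
print(resolve_screenshot_path("nchachra/recrawling/pic1", "/hdfs/pic"))
# -> /hdfs/pic/nchachra/recrawling/pic1/<md5>.png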
Example #4
0
def setup_args():
    """Creates all the arguments for the code alongwith the defaults. Returns
    the parsed arguments for use.
    """
    # Get a default for log directory.
    default_log_dir = os.path.join(utils.default_tmp_location(), 
                                   config.LOG_DIR)
    description = ('This script instruments a browser to collect information' +
                    ' about a web page.')
    usage = 'usage: %prog [options]'
    parser = optparse.OptionParser(description=description, usage=usage,
            version='%prog 0.1')
    parser.add_option('-x', '--no-xvfb', action='store_true', 
            default=False,
            help='Give this option to turn XVFB (X Virtual Frame Buffer) \n' + 
                 'off. It is useful for debugging. XVFB is enabled by \n' +
                 'default. Crawler uses XVFB default screen 0 and display:99.')
    # Output Directories
    parser.add_option('--screenshot-dir',
            help='Base directory for storing screenshots. The crawler \n' + 
                 'determines the directory and filename based on this \n' +
                 'option combined with the "screenshot" specification in \n' +
                 'visit request in the input file. \n' +
                 'Directory to save file in: \n' +
                 '------------------------- \n' +
                 'The optional --screenshot-dir argument specifies the \n' +
                 'base directory to store all the screenshots in. The path \n'+
                 'in input file is appended to this base dir path to form \n' +
                 'the full directory path. Any directories that do not \n' +
                 'already exist in the path will be automatically created. \n'+
                 'For example, if the crawler is started with: \n' +
                 '    python run.py --screenshot-dir /hdfs/pic \n' + 
                 'and a visit has entry: \n' +
                 '    "screenshot": "nchachra/recrawling/pic1.png" \n' +
                 'the file is stored as: \n' +
                 '     /hdfs/pic/nchachra/recrawling/pic1.png \n' +
                 'Alternatively, if --screenshot-dir argument is missing, \n' +
                 'the file is stored in the current directory as, \n' +
                 '    ./nchachra/recrawling/pic1.png \n' +
                 'If the visit specified the path as: \n' +
                 '    "screenshot": "/nchachra/recawling/pic1.png" \n' +
                 'and the --screenshot-dir argument is missing, the file \n' +
                 'is stored as /nchachra/recrawling/pic1.png \n' +
                 'If both the --screenshot-dir argument, and per visit \n' +
                 'screenshot file is missing: \n' +
                 '    "screenshot":"" \n' +
                 'the file is stored in the current directory. \n' +
                 'Filename to save file with: \n' +
                 '--------------------------- \n' +
                 'If the full-path formed above ends with ".png", the \n' +
                 'file is stored with the user-specified name. For example \n'+
                 'for the path /hdfs/pic/nchachra/recrawling/pic1.png, \n ' +
                 'the file is stored as: pic1.png. \n' +
                 'If the file extension is not ".png", the path is assumed \n'+
                 'to be all directories. For example, if the .png is \n' +
                 'omitted above: /hdfs/pic/nchachra/recrawling/pic1, the \n'+
                 'file is stored in pic1 directory as <md5>.png \n' +
                 'If a filename is not specified, the file is named with \n' +
                 'its md5 hash. \n\n' +
                 'The mapping of id->filename can be found \n' +
                 'in the optional visit-chain file. Note that it is the \n' +
                 'user\'s responsibility to maintain unique url-ids for \n' +
                 'unique id->filename mappings.'
                )
    parser.add_option('--dom-dir',  
            help='Directory for storing DOMs. Works like --screenshot-dir \n' +
                 'argument, except the file extensions are .html')
    parser.add_option('--visit-chain-dir', 
            help='Directory for storing the visit chains. A visit chain \n' +
                 'consists of all the URLs encountered in the visit, \n' +
                 'their headers, server addresses, dom files and \n' +
                 'file mapping, the node the URL was crawled from, \n' +
                 'the timestamp and proxy used. The argument works like \n' +
                 '--screenshot-dir argument, except the file extension is \n' +
                 '.json. \n ' +
                 'NOTE: If valid screenshot/dom filenames are not provided \n'+
                 'in the input files, then the files will be saved as their\n'+
                 ' md5.extension. If visit chains are not being saved, the \n'+
                 'mapping of feature:filename will be lost.')
    parser.add_option('-i', '--input-file', action='append',
            help='Input file/directory containing URLs. Either specify \n'+
                 'any number of input files or a single input directory \n'+
                 'containing the input files. For example: \n' +
                 '    python run.py -i input1.json -i input2.json \n' +
                 'or \n' +
                 '    python run.py -i /path/to/input/directory \n')
    parser.add_option('-n', '--num_browser', type=int, 
            help='Maximum number of browser instances to run in parallel. \n'+
                 'A single browser instance visits only a single URL at a \n'+
                 'time.')
    parser.add_option('--ext-start-port', default=4000, type=int, 
            help='This script communicates with Firefox extension \n' +
                 'over TCP sockets. <num-browser> number of ports, \n' +
                 'starting from this one will be used, if Firefox is used.\n'+
                 'Default value is %default.')
    parser.add_option('--restart-browser', action='store_true', 
            default=False,
            help='Giving this argument forces a browser restart for every \n' +
                 'visit. While this gives each visit a fresh browser, setup \n' +
                 'typically takes up to 5 seconds, so for efficiency this \n' +
                 'option is disabled by default.')
    parser.add_option('-b', '--browser', choices=['Firefox'],
            help='Browser to be used. Default: %default.', default='Firefox')
    parser.add_option('--proxy-file', 
            help='Proxy file contains lists of the form \n' +
            '[host, port, type]. See proxy_sample.json for an example. The \n' +
            'crawler does not set up access to the proxy; that must be \n' +
            'done by the user. One can also directly specify the proxy IP \n' +
            'and port in the input file, on a per-URL basis, in which case \n' +
            'the input file options take precedence.')
    parser.add_option('--proxy-scheme', choices=['round-robin'], 
            default='round-robin', 
            help='If the proxy information is not specified in the input \n' +
            'for every URL, a general policy can be used. \n' + 
            'Default: %default.')
    parser.add_option('-v', '--verbosity', 
            choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
            help='Anything at this log level and above will be logged. \n' +
            'CRITICAL > INFO. Default: %default.', default='WARNING')
    parser.add_option('--log-dir', default=default_log_dir,
            help='Logs are stored in this directory. \n' +
            'Default: %default.')
    parser.add_option('--suppress-stdout', action='store_true', 
            default=False, 
            help="By default the logs are shown on stdout and sent to the \n"+
            "the log files. Use this option to suppress output on stdout.")
    parser.add_option('--tags-file', 
            help="JSON file with dictionaries of tags:\n" +
            '{ \n' +
            '    "tag_name_1": { \n' +
            '                       "threshold": some_int, \n' +
            '                       "regexes": [regex1, regex2,...]\n' +
            '                  }\n'+
            '}\n' +
            'See sample_tags.json for example. <tag_name> is applied \n' +
            'if <threshold> number of <regexes> match.\n' +
            'Note that only a single tagging file can be supplied. As \n'+
            'long as this argument is used, all pages will be tagged. \n' +
            'This includes URLs for which the user has not requested DOMs.')
    '''
    parser.add_option('-r', '--report-recipient', 
            help='Email address to send crawl summary. The summary is \n' +
            'also saved in the log folder.')
    parser.add_option('--proxy-url', help='Same as --proxy-file except the \
            file will be fetched from the URL when the crawler reboots. This is\
            ideal if a service maintains a list of available proxies that is \
            refreshed frequently.')
    parser.add_option('-e', '--email_errors', help='Email the errors that \
            cause the crawler to crash to this recipient. The crash summary is\
            also saved in log folder.')
    '''
    (options, args) = parser.parse_args()
    if options.tags_file:
        options.tags_file = open(options.tags_file)
    return options
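For context, one plausible way the two functions shown in these examples fit
together; the logging setup and entry-point wiring below are assumptions,
since run.py's actual main() is not part of these excerpts:

import logging

if __name__ == '__main__':
    # Assumed wiring: parse options, build a logger at the requested
    # verbosity, then run the Firefox preparation step.
    options = setup_args()
    logging.basicConfig(level=getattr(logging, options.verbosity))
    logger = logging.getLogger('crawler')
    prep_firefox(options, logger)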