Example #1
0
def run(options):
    """Crawl a set of URLs and optionally write a JUnit build report.

    Database settings come from the ``db`` section of the YAML file named by
    ``options.database`` or, when that option is unset, from the
    ``SEO_DB_HOSTNAME`` / ``SEO_DB_USERNAME`` / ``SEO_DB_PASSWORD`` /
    ``SEO_DB_DATABASE`` environment variables.

    The URLs to crawl are taken from the first of these options that is set:

    * ``options.file``     -- text file with one URL per line
    * ``options.base_url`` -- a single URL
    * ``options.yaml``     -- YAML file with a ``seocrawlerurls`` list
    * ``options.run_id``   -- resume from the gzipped JSON save file
      ``seocrawler/jobs/<run_id>.gz`` located next to this module

    Args:
        options: parsed command-line options. Besides the attributes above
            this reads ``internal``, ``delay``, ``user_agent``, ``limit``
            (all forwarded to ``crawl``) and ``output`` (path of the report
            file to write, if any).

    Raises:
        Exception: when resuming and the save file does not exist or holds
            no data.
    """
    if options.database:
        with open(options.database, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects; only acceptable for a trusted local
            # config file -- consider yaml.safe_load.
            # An empty file loads as None, hence the fallback to {}.
            env = yaml.load(f) or {}
        db_conf = env.get('db', {})
    else:
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Open the database connection. charset='utf8' is set together with
    # use_unicode=True so the connection encoding is explicit rather than
    # whatever the client library defaults to.
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
        passwd=db_conf.get('pass'), db=db_conf.get('name'), use_unicode=True,
        charset='utf8')

    urls = []
    url_associations = {}
    processed_urls = {}
    run_id = None
    if options.file:
        with open(options.file, 'r') as f:
            # Skip blank lines (e.g. a trailing newline) so no empty URL is
            # handed to the crawler.
            urls = [line.strip() for line in f if line.strip()]
    elif options.base_url:
        urls = [options.base_url]
    elif options.yaml:
        with open(options.yaml, 'r') as f:
            # NOTE(review): same yaml.load caveat as for the config file.
            url_yaml = yaml.load(f) or {}
        urls = url_yaml.get('seocrawlerurls', [])
    elif options.run_id:
        save_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)),
            'seocrawler', 'jobs', options.run_id + '.gz')
        if not os.path.exists(save_file):
            raise Exception('Save directory %s was not found' % save_file)

        with gzip.open(save_file, 'r') as f:
            data = json.loads(f.read())

        if not data:
            raise Exception('No save data found')

        urls = data.get('urls', [])
        url_associations = data.get('associations', {})
        run_id = options.run_id

        # Map address -> id for the URLs already stored for this run.
        cur = db.cursor()
        try:
            cur.execute('SELECT id, address FROM crawl_urls WHERE run_id = %s',
                (options.run_id,))
            processed_urls = {address: url_id
                              for url_id, address in cur.fetchall()}
        finally:
            cur.close()

    # crawl() returns the run id that the report below is built for.
    run_id = crawl(urls, db, options.internal, options.delay,
        options.user_agent, url_associations, run_id, processed_urls,
        limit=options.limit)

    if options.output:
        with open(options.output, 'w') as f:
            f.write(report(db, 'build', 'junit', run_id))
Example #2
0
def run(options):
    """Build the SEO report for a crawl run, then upload, write or print it.

    Database settings come from the ``db`` section of the YAML file named by
    ``options.database`` or, when that option is unset, from the
    ``SEO_DB_HOSTNAME`` / ``SEO_DB_USERNAME`` / ``SEO_DB_PASSWORD`` /
    ``SEO_DB_DATABASE`` environment variables.

    The report covers ``options.run_id`` or, when that is unset, the id
    returned by ``seoreporter.fetch_latest_run_id``. It is produced by
    ``seoreporter.report`` using ``options.type`` and ``options.format``.

    * With ``options.upload`` the report is uploaded to Google Docs using the
      ``gdata`` credentials (``user`` / ``pass``) from the config file, and
      the resulting document link is printed.
    * With ``options.output`` the report is written to that path; otherwise
      it is printed to standard output.

    Args:
        options: parsed command-line options; reads ``database``, ``run_id``,
            ``upload``, ``type``, ``format`` and ``output``.

    Raises:
        Exception: if ``options.upload`` is set without ``options.database``
            (the Google credentials are only read from that file), or if no
            Google Docs client could be created.
    """
    if options.database:
        with open(options.database, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects; only acceptable for a trusted local
            # config file -- consider yaml.safe_load.
            # An empty file loads as None, hence the fallback to {}.
            env = yaml.load(f) or {}
        db_conf = env.get('db', {})
    else:
        env = {}
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Fail early with a clear message instead of an opaque open(None) error
    # when the upload credentials cannot be located.
    if options.upload and not options.database:
        raise Exception('Uploading to Google Docs requires a database '
                        'config file with a gdata section.')

    # Open the database connection
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
        passwd=db_conf.get('pass'), db=db_conf.get('name'), use_unicode=True,
        charset='utf8')

    if options.run_id:
        run_id = options.run_id
    else:
        # get the latest run_id
        run_id = seoreporter.fetch_latest_run_id(db)

    # Build the report once and reuse it for the upload and the local output.
    report_text = seoreporter.report(db, options.type, options.format, run_id)

    # Optionally upload to google docs
    gd_client = None
    if options.upload:
        def gd_login(email, password):
            """Return a DocsClient logged in with the given credentials."""
            client = gdata.docs.client.DocsClient(source="Recluse SEO Suite")
            client.api_version = "3"
            client.ssl = True
            client.ClientLogin(email, password, client.source)
            return client

        def gd_upload(report, title, client, filename='tmp.xls'):
            """Write ``report`` to ``filename`` and upload it as ``title``."""
            with open(filename, 'w') as f:
                f.write(report)
            newResource = gdata.docs.data.Resource(filename, title)
            media = gdata.data.MediaSource()
            # The MIME type follows options.format.
            # NOTE(review): for any other format no file handle is attached
            # before the upload -- confirm whether that is intended.
            if options.format == 'xls':
                media.SetFileHandle(filename, 'application/vnd.ms-excel')
            elif options.format == 'csv':
                media.SetFileHandle(filename, 'text/csv')
            newDocument = client.CreateResource(
                newResource,
                create_uri=gdata.docs.client.RESOURCE_UPLOAD_URI,
                media=media)
            return newDocument

        # The credentials live in the same config file parsed above.
        login = env.get('gdata', {})
        gd_client = gd_login(login.get('user', None), login.get('pass', None))
        if not gd_client:
            raise Exception('Cannot connect to Google Docs.')

        entry = gd_upload(
            report_text,
            'seoreporter - %s - %s' % (
                options.type,
                datetime.datetime.today().strftime('%Y/%m/%d')),
            gd_client,
            # Passing None here would override the default temp file name
            # and make open() fail, so fall back explicitly.
            options.output or 'tmp.xls'
            )
        print("Uploaded to Google Docs. URL is:")
        print(entry.GetAlternateLink().href)

    if options.output:
        with open(options.output, 'w') as f:
            f.write(report_text)
    else:
        print(report_text)
Example #3
0
def run(options):
    """Crawl a set of URLs and optionally write a JUnit build report.

    Database settings come from the ``db`` section of the YAML file named by
    ``options.database`` or, when that option is unset, from the
    ``SEO_DB_HOSTNAME`` / ``SEO_DB_USERNAME`` / ``SEO_DB_PASSWORD`` /
    ``SEO_DB_DATABASE`` environment variables. The same YAML file may carry
    a ``moz`` section whose ``accessid`` / ``secretkey`` are used when
    ``options.moz_accessid`` / ``options.moz_secretkey`` are not given.

    The URLs to crawl are taken from the first of these options that is set:

    * ``options.file``     -- text file with one URL per line
    * ``options.base_url`` -- a single URL
    * ``options.yaml``     -- YAML file with a ``seocrawlerurls`` list
    * ``options.run_id``   -- resume from the gzipped JSON save file
      ``seocrawler/jobs/<run_id>.gz`` located next to this module

    Args:
        options: parsed command-line options. Besides the attributes above
            this reads ``internal``, ``delay``, ``user_agent``, ``limit``
            (all forwarded to ``crawl``) and ``output`` (path of the report
            file to write, if any).

    Raises:
        Exception: when resuming and the save file does not exist or holds
            no data.
    """
    # Default to "no Moz settings" so the lookups further down work even
    # when the configuration comes from environment variables.
    moz_conf = {}
    if options.database:
        with open(options.database, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects; only acceptable for a trusted local
            # config file -- consider yaml.safe_load.
            # An empty file loads as None, hence the fallback to {}.
            env = yaml.load(f) or {}
        db_conf = env.get('db', {})
        moz_conf = env.get('moz', {})
    else:
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Open the database connection
    db = MySQLdb.connect(host=db_conf.get('host'),
                         user=db_conf.get('user'),
                         passwd=db_conf.get('pass'),
                         db=db_conf.get('name'),
                         use_unicode=True,
                         charset='utf8')

    urls = []
    url_associations = {}
    processed_urls = {}
    run_id = None
    if options.file:
        with open(options.file, 'r') as f:
            # Skip blank lines (e.g. a trailing newline) so no empty URL is
            # handed to the crawler.
            urls = [line.strip() for line in f if line.strip()]
    elif options.base_url:
        urls = [
            options.base_url,
        ]
    elif options.yaml:
        with open(options.yaml, 'r') as f:
            # NOTE(review): same yaml.load caveat as for the config file.
            url_yaml = yaml.load(f) or {}
        urls = url_yaml.get('seocrawlerurls', [])
    elif options.run_id:
        save_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'seocrawler', 'jobs', options.run_id + '.gz')
        if not os.path.exists(save_file):
            raise Exception('Save directory %s was not found' % save_file)

        with gzip.open(save_file, 'r') as f:
            data = json.loads(f.read())

        if not data:
            raise Exception('No save data found')

        urls = data.get('urls', [])
        url_associations = data.get('associations', {})
        run_id = options.run_id

        # Map address -> id for the URLs already stored for this run.
        cur = db.cursor()
        try:
            cur.execute('SELECT id, address FROM crawl_urls WHERE run_id = %s',
                        (options.run_id, ))
            processed_urls = {address: url_id
                              for url_id, address in cur.fetchall()}
        finally:
            cur.close()

    # Get Moz options: command-line values win over the config file.
    moz_accessid = options.moz_accessid or moz_conf.get('accessid', None)
    moz_secretkey = options.moz_secretkey or moz_conf.get('secretkey', None)

    # crawl() returns the run id that the report below is built for.
    run_id = crawl(urls,
                   db,
                   options.internal,
                   options.delay,
                   options.user_agent,
                   url_associations,
                   run_id,
                   processed_urls,
                   limit=options.limit,
                   moz_accessid=moz_accessid,
                   moz_secretkey=moz_secretkey)

    if options.output:
        with open(options.output, 'w') as f:
            f.write(report(db, 'build', 'junit', run_id))
Example #4
0
def run(options):
    """Build the SEO report for a crawl run, then upload, write or print it.

    Database settings come from the ``db`` section of the YAML file named by
    ``options.database`` or, when that option is unset, from the
    ``SEO_DB_HOSTNAME`` / ``SEO_DB_USERNAME`` / ``SEO_DB_PASSWORD`` /
    ``SEO_DB_DATABASE`` environment variables.

    The report covers ``options.run_id`` or, when that is unset, the id
    returned by ``seoreporter.fetch_latest_run_id``. It is produced by
    ``seoreporter.report`` using ``options.type`` and ``options.format``.

    * With ``options.upload`` the report is uploaded to Google Docs using the
      ``gdata`` credentials (``user`` / ``pass``) from the config file, and
      the resulting document link is printed.
    * With ``options.output`` the report is written to that path; otherwise
      it is printed to standard output.

    Args:
        options: parsed command-line options; reads ``database``, ``run_id``,
            ``upload``, ``type``, ``format`` and ``output``.

    Raises:
        Exception: if ``options.upload`` is set without ``options.database``
            (the Google credentials are only read from that file), or if no
            Google Docs client could be created.
    """
    if options.database:
        with open(options.database, 'r') as f:
            # NOTE(review): yaml.load without an explicit Loader can build
            # arbitrary Python objects; only acceptable for a trusted local
            # config file -- consider yaml.safe_load.
            # An empty file loads as None, hence the fallback to {}.
            env = yaml.load(f) or {}
        db_conf = env.get('db', {})
    else:
        env = {}
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Fail early with a clear message instead of an opaque open(None) error
    # when the upload credentials cannot be located.
    if options.upload and not options.database:
        raise Exception('Uploading to Google Docs requires a database '
                        'config file with a gdata section.')

    # Open the database connection. charset='utf8' is set together with
    # use_unicode=True so the connection encoding is explicit rather than
    # whatever the client library defaults to.
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
        passwd=db_conf.get('pass'), db=db_conf.get('name'), use_unicode=True,
        charset='utf8')

    if options.run_id:
        run_id = options.run_id
    else:
        # get the latest run_id
        run_id = seoreporter.fetch_latest_run_id(db)

    # Build the report once and reuse it for the upload and the local output.
    report_text = seoreporter.report(db, options.type, options.format, run_id)

    # Optionally upload to google docs
    gd_client = None
    if options.upload:
        def gd_login(email, password):
            """Return a DocsClient logged in with the given credentials."""
            client = gdata.docs.client.DocsClient(source="Recluse SEO Suite")
            client.api_version = "3"
            client.ssl = True
            client.ClientLogin(email, password, client.source)
            return client

        def gd_upload(report, title, client, filename='tmp.xls'):
            """Write ``report`` to ``filename`` and upload it as ``title``."""
            with open(filename, 'w') as f:
                f.write(report)
            newResource = gdata.docs.data.Resource(filename, title)
            media = gdata.data.MediaSource()
            # The MIME type follows options.format.
            # NOTE(review): for any other format no file handle is attached
            # before the upload -- confirm whether that is intended.
            if options.format == 'xls':
                media.SetFileHandle(filename, 'application/vnd.ms-excel')
            elif options.format == 'csv':
                media.SetFileHandle(filename, 'text/csv')
            newDocument = client.CreateResource(
                newResource,
                create_uri=gdata.docs.client.RESOURCE_UPLOAD_URI,
                media=media)
            return newDocument

        # The credentials live in the same config file parsed above.
        login = env.get('gdata', {})
        gd_client = gd_login(login.get('user', None), login.get('pass', None))
        if not gd_client:
            raise Exception('Cannot connect to Google Docs.')

        entry = gd_upload(
            report_text,
            'seoreporter - %s - %s' % (
                options.type,
                datetime.datetime.today().strftime('%Y/%m/%d')),
            gd_client,
            # Passing None here would override the default temp file name
            # and make open() fail, so fall back explicitly.
            options.output or 'tmp.xls'
            )
        print("Uploaded to Google Docs. URL is:")
        print(entry.GetAlternateLink().href)

    if options.output:
        with open(options.output, 'w') as f:
            f.write(report_text)
    else:
        print(report_text)