def run(options):
    if options.database:
        # Load database settings from the YAML config file
        with open(options.database, 'r') as f:
            env = yaml.load(f)
        db_conf = env.get('db', {})
    else:
        # Fall back to environment variables
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Initialize the database connection
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
                         passwd=db_conf.get('pass'), db=db_conf.get('name'),
                         use_unicode=True)

    urls = []
    url_associations = {}
    processed_urls = {}
    run_id = None

    if options.file:
        # One URL per line
        with open(options.file, 'r') as f:
            urls = [url.strip() for url in f.readlines()]
    elif options.base_url:
        urls = [options.base_url]
    elif options.yaml:
        with open(options.yaml, 'r') as f:
            url_yaml = yaml.load(f)
        urls = url_yaml.get('seocrawlerurls', [])
    elif options.run_id:
        # Resume a previously saved run from its gzipped job file
        save_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'seocrawler', 'jobs', options.run_id + '.gz')
        if not os.path.exists(save_file):
            raise Exception('Save file %s was not found' % save_file)
        with gzip.open(save_file, 'r') as f:
            content = f.read()
        data = json.loads(content)
        if not data:
            raise Exception('No save data found')
        urls = data.get('urls', [])
        url_associations = data.get('associations', {})

        # Skip URLs that were already processed in the saved run
        cur = db.cursor()
        run_id = options.run_id
        cur.execute('SELECT id, address FROM crawl_urls WHERE run_id = %s',
                    (options.run_id,))
        processed_urls = dict([(row[1], row[0]) for row in cur.fetchall()])

    run_id = crawl(urls, db, options.internal, options.delay, options.user_agent,
                   url_associations, run_id, processed_urls, limit=options.limit)

    if options.output:
        # Write a JUnit-style build report for the completed run
        with open(options.output, 'w') as f:
            f.write(report(db, 'build', 'junit', run_id))
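# A minimal sketch of how the crawler run() above might be invoked from the
# command line. The flag names and defaults below are assumptions inferred from
# the option attributes the function reads (options.database, options.file,
# options.base_url, options.yaml, options.run_id, options.internal,
# options.delay, options.user_agent, options.limit, options.output); they are
# not a confirmed CLI for this project.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Crawl a set of URLs and record SEO data.')
    parser.add_argument('-d', '--database', help='YAML file with a "db" section of connection settings')
    parser.add_argument('-f', '--file', help='file containing one URL per line')
    parser.add_argument('-u', '--base-url', dest='base_url', help='single URL to start crawling from')
    parser.add_argument('-y', '--yaml', help='YAML file with a "seocrawlerurls" list')
    parser.add_argument('-r', '--run-id', dest='run_id', help='resume a previously saved run')
    parser.add_argument('-i', '--internal', action='store_true', help='follow internal links')
    parser.add_argument('--delay', type=float, default=0, help='delay between requests, passed through to crawl()')
    parser.add_argument('--user-agent', dest='user_agent', default='seocrawler', help='User-Agent header to send')
    parser.add_argument('--limit', type=int, default=0, help='maximum number of URLs to process')
    parser.add_argument('-o', '--output', help='write a JUnit-style report to this path')

    run(parser.parse_args())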
def run(options):
    if options.database:
        # Load database settings from the YAML config file
        with open(options.database, 'r') as f:
            env = yaml.load(f)
        db_conf = env.get('db', {})
    else:
        # Fall back to environment variables
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Initialize the database connection
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
                         passwd=db_conf.get('pass'), db=db_conf.get('name'),
                         use_unicode=True, charset='utf8')

    if options.run_id:
        run_id = options.run_id
    else:
        # Default to the most recent run
        run_id = seoreporter.fetch_latest_run_id(db)

    # Optionally upload the report to Google Docs
    gd_client = None
    if options.upload:
        def gd_login(email, password):
            client = gdata.docs.client.DocsClient(source="Recluse SEO Suite")
            client.api_version = "3"
            client.ssl = True
            client.ClientLogin(email, password, client.source)
            return client

        def gd_upload(report, title, client, filename='tmp.xls'):
            # Write the report to a local file, then upload it as a new resource
            with open(filename, 'w') as f:
                f.write(report)
            newResource = gdata.docs.data.Resource(filename, title)
            media = gdata.data.MediaSource()
            if options.format == 'xls':
                media.SetFileHandle(filename, 'application/vnd.ms-excel')
                # media.SetFileHandle(filename, 'application/vnd.google-apps.spreadsheet')
            elif options.format == 'csv':
                media.SetFileHandle(filename, 'text/csv')
            newDocument = client.CreateResource(newResource,
                                                create_uri=gdata.docs.client.RESOURCE_UPLOAD_URI,
                                                media=media)
            return newDocument

        with open(options.database, 'r') as f:
            login = yaml.load(f).get('gdata', {})
        gd_client = gd_login(login.get('user', None), login.get('pass', None))
        if not gd_client:
            raise Exception('Cannot connect to Google Docs.')

        entry = gd_upload(
            seoreporter.report(db, options.type, options.format, run_id),
            'seoreporter - %s - %s' % (options.type, datetime.datetime.today().strftime('%Y/%m/%d')),
            gd_client,
            options.output or 'tmp.xls'  # fall back to the default temp file when no output path is given
        )
        print "Uploaded to Google Docs. URL is:"
        print entry.GetAlternateLink().href

    if options.output:
        with open(options.output, 'w') as f:
            f.write(seoreporter.report(db, options.type, options.format, run_id))
    else:
        print seoreporter.report(db, options.type, options.format, run_id)
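# A minimal sketch of a command-line entry point for the reporter run() above.
# The flag names and defaults are assumptions based on the options the function
# reads (options.database, options.run_id, options.type, options.format,
# options.output, options.upload); the 'build'/'junit' defaults come from the
# values the crawler itself passes to report().
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Report on a previously recorded crawl run.')
    parser.add_argument('-d', '--database', help='YAML file with "db" (and, for uploads, "gdata") settings')
    parser.add_argument('-r', '--run-id', dest='run_id', help='crawl run to report on; defaults to the latest run')
    parser.add_argument('-t', '--type', default='build', help='report type, e.g. build')
    parser.add_argument('-f', '--format', default='junit', help='output format, e.g. junit, csv or xls')
    parser.add_argument('-o', '--output', help='write the report to this file instead of printing it')
    parser.add_argument('-u', '--upload', action='store_true', help='also upload the report to Google Docs')

    run(parser.parse_args())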
def run(options):
    if options.database:
        # Load database and Moz settings from the YAML config file
        with open(options.database, 'r') as f:
            env = yaml.load(f)
        db_conf = env.get('db', {})
        moz_conf = env.get('moz', {})
    else:
        # Fall back to environment variables
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }
        # No YAML config, so there is no Moz section to read from
        moz_conf = {}

    # Initialize the database connection
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
                         passwd=db_conf.get('pass'), db=db_conf.get('name'),
                         use_unicode=True, charset='utf8')

    urls = []
    url_associations = {}
    processed_urls = {}
    run_id = None

    if options.file:
        # One URL per line
        with open(options.file, 'r') as f:
            urls = [url.strip() for url in f.readlines()]
    elif options.base_url:
        urls = [options.base_url]
    elif options.yaml:
        with open(options.yaml, 'r') as f:
            url_yaml = yaml.load(f)
        urls = url_yaml.get('seocrawlerurls', [])
    elif options.run_id:
        # Resume a previously saved run from its gzipped job file
        save_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 'seocrawler', 'jobs', options.run_id + '.gz')
        if not os.path.exists(save_file):
            raise Exception('Save file %s was not found' % save_file)
        with gzip.open(save_file, 'r') as f:
            content = f.read()
        data = json.loads(content)
        if not data:
            raise Exception('No save data found')
        urls = data.get('urls', [])
        url_associations = data.get('associations', {})

        # Skip URLs that were already processed in the saved run
        cur = db.cursor()
        run_id = options.run_id
        cur.execute('SELECT id, address FROM crawl_urls WHERE run_id = %s',
                    (options.run_id,))
        processed_urls = dict([(row[1], row[0]) for row in cur.fetchall()])

    # Get Moz API credentials, preferring command-line options over the config file
    moz_accessid = options.moz_accessid or moz_conf.get('accessid', None)
    moz_secretkey = options.moz_secretkey or moz_conf.get('secretkey', None)

    run_id = crawl(urls, db, options.internal, options.delay, options.user_agent,
                   url_associations, run_id, processed_urls, limit=options.limit,
                   moz_accessid=moz_accessid, moz_secretkey=moz_secretkey)

    if options.output:
        # Write a JUnit-style build report for the completed run
        with open(options.output, 'w') as f:
            f.write(report(db, 'build', 'junit', run_id))
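# A minimal sketch of the YAML layout that options.database is expected to point
# at. The section and key names ("db", "moz", and the "gdata" block read by the
# reporter) come from the .get() calls in the run() functions here; the values
# are placeholders, not real credentials.
import yaml

example_config = {
    'db': {
        'host': 'localhost',
        'user': 'seo',
        'pass': 'secret',
        'name': 'seo_suite',
    },
    'moz': {
        'accessid': 'mozscape-access-id',
        'secretkey': 'mozscape-secret-key',
    },
    'gdata': {
        'user': 'someone@example.com',
        'pass': 'application-password',
    },
}

with open('database.yml', 'w') as f:
    yaml.safe_dump(example_config, f, default_flow_style=False)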
def run(options):
    if options.database:
        # Load database settings from the YAML config file
        with open(options.database, 'r') as f:
            env = yaml.load(f)
        db_conf = env.get('db', {})
    else:
        # Fall back to environment variables
        db_conf = {
            'host': os.environ.get('SEO_DB_HOSTNAME'),
            'user': os.environ.get('SEO_DB_USERNAME'),
            'pass': os.environ.get('SEO_DB_PASSWORD'),
            'name': os.environ.get('SEO_DB_DATABASE'),
        }

    # Initialize the database connection
    db = MySQLdb.connect(host=db_conf.get('host'), user=db_conf.get('user'),
                         passwd=db_conf.get('pass'), db=db_conf.get('name'),
                         use_unicode=True)

    if options.run_id:
        run_id = options.run_id
    else:
        # Default to the most recent run
        run_id = seoreporter.fetch_latest_run_id(db)

    # Optionally upload the report to Google Docs
    gd_client = None
    if options.upload:
        def gd_login(email, password):
            client = gdata.docs.client.DocsClient(source="Recluse SEO Suite")
            client.api_version = "3"
            client.ssl = True
            client.ClientLogin(email, password, client.source)
            return client

        def gd_upload(report, title, client, filename='tmp.xls'):
            # Write the report to a local file, then upload it as a new resource
            with open(filename, 'w') as f:
                f.write(report)
            newResource = gdata.docs.data.Resource(filename, title)
            media = gdata.data.MediaSource()
            if options.format == 'xls':
                media.SetFileHandle(filename, 'application/vnd.ms-excel')
                # media.SetFileHandle(filename, 'application/vnd.google-apps.spreadsheet')
            elif options.format == 'csv':
                media.SetFileHandle(filename, 'text/csv')
            newDocument = client.CreateResource(newResource,
                                                create_uri=gdata.docs.client.RESOURCE_UPLOAD_URI,
                                                media=media)
            return newDocument

        with open(options.database, 'r') as f:
            login = yaml.load(f).get('gdata', {})
        gd_client = gd_login(login.get('user', None), login.get('pass', None))
        if not gd_client:
            raise Exception('Cannot connect to Google Docs.')

        entry = gd_upload(
            seoreporter.report(db, options.type, options.format, run_id),
            'seoreporter - %s - %s' % (options.type, datetime.datetime.today().strftime('%Y/%m/%d')),
            gd_client,
            options.output or 'tmp.xls'  # fall back to the default temp file when no output path is given
        )
        print "Uploaded to Google Docs. URL is:"
        print entry.GetAlternateLink().href

    if options.output:
        with open(options.output, 'w') as f:
            f.write(seoreporter.report(db, options.type, options.format, run_id))
    else:
        print seoreporter.report(db, options.type, options.format, run_id)
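# A minimal sketch of driving the reporter run() above programmatically, without
# a YAML config file, by relying on the SEO_DB_* environment-variable fallback.
# The variable names come from the os.environ.get() calls; the values and the
# Namespace fields are placeholders chosen to match the attributes run() reads,
# with 'build'/'junit' taken from the values used elsewhere in this codebase.
import os
from argparse import Namespace

os.environ['SEO_DB_HOSTNAME'] = 'localhost'
os.environ['SEO_DB_USERNAME'] = 'seo'
os.environ['SEO_DB_PASSWORD'] = 'secret'
os.environ['SEO_DB_DATABASE'] = 'seo_suite'

run(Namespace(database=None, run_id=None, type='build', format='junit',
              output=None, upload=False))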