import numpy  # used by check_img()

from ptl import get  # ptl driver/requests wrapper (see the skeleton at the bottom)


def add_audit(domain):
    get.dgs('https://www.semrush.com/projects/')
    # ADD DOMAIN
    get.dec('s-btn -xs -primary js-add-project')
    get.dsk(domain, 'js-input-domain')
    get.dsk(domain, 'js-input-name')
    get.dec('s-btn -s -success js-save-pr')
    while not get.de('setup', clas='data-action'):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup button')
    get.dec('setup', clas='data-action')
    while 'Audit' not in get.det('s-btn__text', index=-1):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup audit button')
    get.dec('s-btn__text', index=-1)
    counter = 0
    max_loop = 20
    try:
        # WAIT FOR PROGRESS BAR
        while not get.de('s-widget__progress-title'):
            get.sleep(get.randint(3, 7))
            # approximate elapsed time (each sleep is 3-7s)
            get.log('> Waiting for progress bar ' + str(counter * 3) + 's')
            if counter == max_loop:
                get.log('> Timed out waiting for progress bar')
                break
            error_btn = get.de('s-btn -danger -xs')
            if error_btn:
                error_btn.click()
            counter += 1
    except:
        get.pe()
    return get_project_id(get.DR.current_url), counter < max_loop
def soups(rows, folder, t_id):
    for row in rows:
        try:
            get.log('> GET ' + row['URL'])
            rq = get.get(row['URL'])
            folders = row['URL'].split('//')[1].split('/')[:-1]
            for i, segment in enumerate(folders):
                if i == 0:  # skip the domain itself
                    continue
                row['FOLDER LEVEL ' + str(i)] = segment
            row['DEPTH'] = len(folders)
            row['FINAL URL'] = rq.url
            row['FINAL URL LENGTH'] = len(rq.url)
            row['HEADERS'] = rq.headers
            row['CONTENT TYPE'] = rq.headers.get('Content-Type', '')
            #row['CONTENT LENGTH'] = rq.headers.get('Content-Length', '')
            #row['SERVER'] = rq.headers.get('Server', '')
            row['DATE'] = rq.headers.get('Date', '')
            #row['ENCODE'] = rq.encoding
            row['RESPONSE TIME (MS)'] = rq.elapsed.total_seconds() * 1000
            row['REDIRECT TYPE'] = 'NONE' if not rq.is_redirect else 'PERMANENT' if rq.is_permanent_redirect else 'TEMPORARY'
            row['REDIRECT'] = rq.is_redirect
            row['STATUS'] = rq.reason
            row['STATUS CODE'] = rq.status_code
            row['HTTP/HTTPS'] = 'HTTPS' if rq.url.startswith('https') else 'HTTP'
            row['SOUP'] = get.bs(rq.content, 'html.parser')
            check('title', row)
            check('h1', row)
            check('h2', row)
            check_meta(row)
            check_img(row)
            check_canonical_link(row)
        except:
            get.pe()
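# Usage sketch (illustrative, not part of the original script): soups()
# enriches each row dict in place, so a caller only needs to seed every row
# with a 'URL' key before building a DataFrame from the result:
#
#   rows = [{'URL': u} for u in ['https://example.com/', 'https://example.com/blog/post']]
#   soups(rows, None, None)  # the 'folder' and 't_id' parameters are unused here
#   df = get.DataFrame(rows)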
def check(tag, row):
    _tag = None
    try:
        for i, _tag in enumerate(row['SOUP'].findAll(tag)):
            key = tag.upper()
            if tag.startswith('h'):  # number repeated heading tags: H1-1, H1-2, ...
                key = key + '-' + str(i + 1)
            row[key] = _tag.text
            _len = len(row[key])
            row[key + ' LENGTH'] = _len
            row[key + ' STATUS'] = check_length_status(_len, tag)
    except:
        get.pe(str(_tag))
def get_data_from_table(index=0):
    tbl = get.be('backgrid', 'table', index=index)
    if not tbl:
        get.log("> There is no table data")
        return
    cols = [e.text.strip() for e in tbl.thead.findAll('th')][1:]  # column names
    #get.log('> Columns: ' + str(cols))
    rows = []
    for tr in tbl.tbody.findAll('tr'):
        try:
            row = {}
            tds = tr.findAll('td')[1:]
            for i, td in enumerate(tds):
                if cols[i] == 'Domain Name':  # domain name column
                    row[cols[i]] = td.text
                elif cols[i] == 'Overlap':
                    if td.a:  # overlap column
                        overlap = td.a.div['style'].replace('width: ', '').replace('%', '')
                        row[cols[i]] = float(overlap)
                    elif td.i:  # rank change column
                        text = td.text.split('(')
                        row['Rank'] = text[0]
                        row['Rank Change'] = text[1].replace(')', '') if len(text) > 1 else ''
                    else:
                        row[cols[i]] = -1
                elif cols[i] == 'Keyword':
                    keyword = get.bec(td)
                    row[cols[i]] = keyword[0].text
                    row[cols[i] + ' URL'] = keyword[1].text if len(keyword) > 1 else ''
                elif cols[i] == 'Ad Timeline':
                    keyword = get.bec(td)
                    row[cols[i]] = keyword[0].text
                elif len(cols[i]) > 0:  # other columns
                    if '(' in td.text:
                        text = td.text.split('(')
                        row['Rank'] = int(text[0])
                        text[1] = text[1].replace(')', '')
                        row[cols[i]] = int(text[1]) if text[1].isdigit() else 0
                    else:
                        row[cols[i]] = get.s2f(td.text)
        except:
            get.pe()
        rows.append(row)
    return get.DataFrame(rows)
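# Usage sketch (illustrative): get_data_from_table() parses the currently
# loaded "backgrid" table into a DataFrame, or returns None when the table is
# missing. A hypothetical call site, reusing get.save() as in the main block:
#
#   df = get_data_from_table(index=0)
#   if df is not None:
#       get.save(df, PROJECT + '/' + get.START_TIME + '/table')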
def check_canonical_link(row):
    link = None
    try:
        for link in row['SOUP'].findAll('link'):
            if not link.has_attr('rel'):
                continue
            for rel in link['rel']:
                if rel.lower() == 'canonical':
                    key = rel.upper()
                    row[key] = link['href']
                    row[key + ' LENGTH'] = len(row[key])
                    break
        if 'CANONICAL' not in row:
            row['CANONICAL'] = 'None'
            row['CANONICAL LENGTH'] = ''
    except:
        get.pe(str(link))
def check_dp(df, output, col):
    try:
        if col in df.keys():
            dps = list(df[df.duplicated(col)].groupby(col))
            if len(dps) > 0:
                for row_text, _dp in dps:
                    # re-query so the count includes the first occurrence, not just the duplicates
                    _dp = df[df[col] == row_text]
                    row = {}
                    row['TAG'] = col
                    row['DUPLICATION TEXT'] = row_text
                    row['NUMBER OF DUPLICATION'] = len(_dp)
                    row['URL'] = '#\n' + '\n'.join(list(_dp['URL']))
                    output.append(row)
    except:
        get.pe(col)
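# Usage sketch (illustrative): check_dp() appends one summary row per group of
# duplicated values in `col`. Run it over the tag columns produced by soups():
#
#   output = []
#   for col in ['TITLE', 'META DESCRIPTION', 'H1-1']:
#       check_dp(df, output, col)
#   duplications = get.DataFrame(output)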
def check_meta(row):
    meta = None
    try:
        for meta in row['SOUP'].findAll('meta'):
            if not meta.has_attr('name') or not meta.has_attr('content'):
                continue
            key = meta['name'].lower()
            if key != 'description' and key != 'keywords':
                continue
            key = 'META ' + key.upper()
            row[key] = meta['content']
            _len = len(row[key])
            row[key + ' LENGTH'] = _len
            #if key == 'META KEYWORDS':
            #    row[key + ' STATUS'] = check_length_status(_len, 'keywords')
            if key == 'META DESCRIPTION':  # compare against the prefixed key set above
                row[key + ' STATUS'] = check_length_status(_len, 'description')
    except:
        get.pe(str(meta))
def check_img(row):
    try:
        imgs = row['SOUP'].findAll('img')
        imgs_alt_tag = [img['alt'] for img in imgs if img.has_attr('alt')]
        row['NUMBER OF IMAGES'] = len(imgs)
        row['NUMBER OF IMAGES WITH ALT-TAG'] = len(imgs_alt_tag)
        row['NUMBER OF IMAGES WITHOUT ALT-TAG'] = len(imgs) - len(imgs_alt_tag)
        alt_tags = numpy.array([len(alt) for alt in imgs_alt_tag])
        if len(alt_tags) > 0:
            row['ALT-TAG MINIMUM LENGTH'] = alt_tags.min()
            row['ALT-TAG AVERAGE LENGTH'] = alt_tags.mean()
            row['ALT-TAG MAXIMUM LENGTH'] = alt_tags.max()
            df = get.DataFrame(imgs_alt_tag)
            row['NUMBER OF ALT-TAGS THAT ARE DUPLICATED'] = len(df[df.duplicated()])
    except:
        get.pe()
def delete_project(pid, domain):
    # DELETE DOMAIN FROM PROJECTS LIST.
    # IF THERE ARE 5 PROJECTS AND THE DOMAIN IS AT #5, THE WEBDRIVER HEIGHT
    # HAS TO BE > 1100, OR ELSE THE DELETE BUTTON IS NOT VISIBLE AND
    # THEREFORE NOT CLICKABLE.
    try:
        get.log('> Deleting project: ' + domain + ', PID=' + pid)
        get.dgs('https://www.semrush.com/projects/')
        div = get.de('s-project js-project-' + pid + ' ')
        while not div:
            get.sleep(get.randint(3, 7))
            get.dgs('https://www.semrush.com/projects/')
            div = get.de('s-project js-project-' + pid + ' ')
        div.find_element_by_class_name('sr-infomenu').click()
        content = div.find_element_by_class_name('sr-infomenu-content')
        content.find_elements_by_tag_name('li')[1].click()  # second menu item triggers deletion
        get.dsk(domain, 'Project name', 'placeholder')
        get.dec('s-btn -s -danger js-remove')
        get.log('> Deleted project: ' + domain + ', PID=' + pid)
        return True
    except:
        get.pe('Cannot delete the project: ' + domain + ' > PID=' + pid)
        return False
try:
    get.log('> FILTERS: ' + str(FILTERS))
    get.setup(debug=True, driver=True)
    URL = 'https://www.semrush.com/'
    login(URL)
    error_times = 0
    error_allow = 5
    error_domain = None
    for i, domain in enumerate(DOMAINS):
        get.log('> Progress: ' + str(i + 1) + '/' + str(len(DOMAINS)) + ': ' + domain)
        pid, success = add_audit(domain)
        if success:
            get.log('> Add audit succeeded')
            get.save(get_errors(pid), PROJECT + '/' + get.START_TIME + '/' + domain)
        else:
            # If the audit failed, put the domain back on the DOMAINS list,
            # but give up after error_allow consecutive failures on the same domain.
            if not error_domain:
                error_domain = domain
            elif error_domain == domain:
                error_times += 1
            else:
                error_times = 0
            if error_times < error_allow:
                get.log('> Add ' + domain + ' back to project list ' + str(error_times) + ' times')
                DOMAINS.append(domain)
                error_domain = domain
        if not delete_project(pid, domain):
            get.log('> Fatal error encountered.')
            break
except:
    get.pe()
get.quit(PROJECT)
# Basic ptl script skeleton:
from ptl import get

get.setup(debug=True, driver=True)
try:
    pass  # your code goes here
except:
    get.pe()
get.quit()