def soups(rows, folder, t_id):
    """Fetch each row's URL and enrich the row dict in place with response
    metadata, then run the tag/meta/image/canonical checks on it.

    Parameters
    ----------
    rows : iterable of dict
        Each dict must carry a 'URL' key; dicts are mutated in place with
        folder levels, final URL, headers, status, timing and a parsed
        'SOUP' (BeautifulSoup) entry.
    folder, t_id :
        Unused here; kept for interface compatibility with callers.
    """
    for row in rows:
        try:
            get.log('> GET ' + row['URL'])
            rq = get.get(row['URL'])
            # Path segments between scheme and the last component become
            # FOLDER LEVEL 1..n; index 0 is the host itself, so skip it.
            folders = row['URL'].split('//')[1].split('/')[:-1]
            for i, segment in enumerate(folders):
                if i == 0:
                    continue
                row['FOLDER LEVEL ' + str(i)] = segment
            row['DEPTH'] = len(folders)
            row['FINAL URL'] = rq.url
            row['FINAL URL LENGTH'] = len(rq.url)
            row['HEADERS'] = rq.headers
            # Headers are optional — use .get() so a missing Date (or
            # Content-Type) no longer aborts the whole row via the except.
            row['CONTENT TYPE'] = rq.headers.get('Content-Type', '')
            row['DATE'] = rq.headers.get('Date', '')
            # total_seconds() — .microseconds alone drops whole seconds,
            # under-reporting any response slower than 1s.
            row['RESPONSE TIME (MS)'] = rq.elapsed.total_seconds() * 1000
            if not rq.is_redirect:
                row['REDIRECT TYPE'] = 'NONE'
            elif rq.is_permanent_redirect:
                row['REDIRECT TYPE'] = 'PERMANENT'
            else:
                row['REDIRECT TYPE'] = 'TEMPORARY'
            row['REDIRECT'] = rq.is_redirect
            row['STATUS'] = rq.reason
            row['STATUS CODE'] = rq.status_code
            row['HTTP/HTTPS'] = 'HTTPS' if 'https' in rq.url[:10] else 'HTTP'
            row['SOUP'] = get.bs(rq.content, 'html.parser')
            check('title', row)
            check('h1', row)
            check('h2', row)
            check_meta(row)
            check_img(row)
            check_canonical_link(row)
        except Exception:
            # Best-effort per row: log the error and continue with the next.
            get.pe()
def login(url):
    """Open *url* and sign in to SEMRush.

    Credentials are read from 'SEMRush_account.txt': first line is the
    email, second line is the password.
    """
    get.dgs(url)
    get.log('> Login')
    creds = get.f2l('SEMRush_account.txt')
    # Open the login popup, fill in the credentials, submit.
    get.dec('header__navigation-login')
    get.dsk(creds[0], 'email', 'name')
    get.dsk(creds[1], 'password', 'name')
    get.dec('auth-popup__submit', clas='data-test')
    # Give the post-login page time to load.
    get.sleep(5)
def get_data_from_table(index=0):
    """Scrape the index-th 'backgrid' table on the current page.

    Parameters
    ----------
    index : int
        Which 'backgrid' table to read when several are present.

    Returns
    -------
    DataFrame of one dict per table row, or None when no table exists.
    Column-specific parsing: 'Domain Name' stays text, 'Overlap' becomes
    a float percentage, 'Keyword' splits into text + URL, values like
    'rank(change)' split into 'Rank' / 'Rank Change' fields, and any
    other numeric cell goes through get.s2f.
    """
    tbl = get.be('backgrid', 'table', index=index)
    if not tbl:
        get.log("> There is no table data")
        return
    # First <th>/<td> is a checkbox/selector column — skip it.
    cols = [e.text.strip() for e in tbl.thead.findAll('th')][1:]
    rows = []
    for tr in tbl.tbody.findAll('tr'):
        # Create the row dict OUTSIDE the try: previously a failure before
        # `row = {}` could append a stale row (or NameError on the first
        # iteration) because the append runs unconditionally below.
        row = {}
        try:
            tds = tr.findAll('td')[1:]
            for i, td in enumerate(tds):
                if cols[i] == 'Domain Name':  # domain name column
                    row[cols[i]] = td.text
                elif cols[i] == 'Overlap':
                    if td.a:  # overlap column: percentage encoded in style width
                        overlap = td.a.div['style'].replace('width: ',
                                                            '').replace('%', '')
                        row[cols[i]] = float(overlap)
                    else:
                        if td.i:  # rank change column, e.g. "12(+3)"
                            text = td.text.split('(')
                            row['Rank'] = text[0]
                            row['Rank Change'] = text[1].replace(
                                ')', '') if len(text) > 1 else ''
                        else:
                            row[cols[i]] = -1
                elif cols[i] == 'Keyword':
                    keyword = get.bec(td)
                    row[cols[i]] = keyword[0].text
                    row[cols[i] + ' URL'] = keyword[1].text if len(
                        keyword) > 1 else ''
                elif cols[i] == 'Ad Timeline':
                    keyword = get.bec(td)
                    row[cols[i]] = keyword[0].text
                elif len(cols[i]) > 0:  # any other named column
                    if '(' in td.text:  # "rank(change)" style value
                        text = td.text.split('(')
                        row['Rank'] = int(text[0])
                        text[1] = text[1].replace(')', '')
                        row[cols[i]] = int(
                            text[1]) if text[1].isdigit() else 0
                    else:
                        row[cols[i]] = get.s2f(td.text)
        except Exception:
            # Keep whatever was parsed so far; log and move on.
            get.pe()
        rows.append(row)
    return get.DataFrame(rows)
def add_audit(domain):
    """Create a SEMRush project for *domain* and start a site audit.

    Polls (with randomized sleeps) for the setup button, the audit setup
    button, and finally the audit progress bar, clicking through any
    error button that appears.

    Returns
    -------
    (pid, success) : tuple
        pid is the new project id parsed from the current URL; success is
        False when the progress bar never appeared within the polling
        budget (max_loop iterations).
    """
    get.dgs('https://www.semrush.com/projects/')
    # Add the domain as a new project.
    get.dec('s-btn -xs -primary js-add-project')
    get.dsk(domain, 'js-input-domain')
    get.dsk(domain, 'js-input-name')
    get.dec('s-btn -s -success js-save-pr')
    while not get.de('setup', clas='data-action'):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup button')
    get.dec('setup', clas='data-action')
    while 'Audit' not in get.det('s-btn__text', index=-1):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup audit button')
    get.dec('s-btn__text', index=-1)
    counter = 0
    max_loop = 20
    try:
        # Wait for the audit progress bar, bounded by max_loop polls.
        while not get.de('s-widget__progress-title'):
            get.sleep(get.randint(3, 7))
            get.log('> Waiting for progress bar ' + str(counter * 3) + 's')
            if counter == max_loop:
                # Was an empty '> ' message — say what actually happened.
                get.log('> Gave up waiting for progress bar')
                break
            # SEMRush sometimes shows an error button mid-setup; dismiss it.
            error_btn = get.de('s-btn -danger -xs')
            if error_btn:
                error_btn.click()
            counter += 1
    except Exception:
        get.pe()
    return get_project_id(get.DR.current_url), counter < max_loop
def delete_project(pid, domain):
    """Delete project *pid* (*domain*) from the SEMRush projects list.

    NOTE(review): if there are 5 projects and the domain is at slot #5,
    the webdriver window height must be > 1100 px or the delete button is
    not visible and therefore not clickable.

    Returns
    -------
    bool
        True on success, False on any failure (previously the failure
        path fell through and implicitly returned None).
    """
    try:
        get.log('> Deleting project: ' + domain + ', PID=' + pid)
        get.dgs('https://www.semrush.com/projects/')
        # Keep reloading the projects page until the project tile appears.
        div = get.de('s-project js-project-' + pid + ' ')
        while not div:
            get.sleep(get.randint(3, 7))
            get.dgs('https://www.semrush.com/projects/')
            div = get.de('s-project js-project-' + pid + ' ')
        # Open the tile's info menu and pick the second item (delete).
        div.find_element_by_class_name('sr-infomenu').click()
        content = div.find_element_by_class_name('sr-infomenu-content')
        content.find_elements_by_tag_name('li')[1].click()
        # The confirmation dialog requires re-typing the project name.
        get.dsk(domain, 'Project name', 'placeholder')
        get.dec('s-btn -s -danger js-remove')
        get.log('> Deleted project: ' + domain + ', PID=' + pid)
        return True
    except Exception:
        get.pe('Can not delete the project: ' + domain + ' > PID=' + pid)
        return False
account = get.f2l('SEMRush_account.txt') get.dec('header__navigation-login') get.dsk(account[0], 'email', 'name') get.dsk(account[1], 'password', 'name') get.dec('auth-popup__submit', clas='data-test') get.sleep(5) try: PROJECT = 'SEMRushErrorsReport' FILTERS = set(get.f2l('filters.txt')) DOMAINS = get.f2l('domains.txt') DOMAINS = [dm[dm.index('//') + 2:] if '//' in dm else dm for dm in DOMAINS] DOMAINS = [dm[:dm.index('/')] if '/' in dm else dm for dm in DOMAINS] DOMAINS = [dm for dm in DOMAINS if len(dm) > 3] get.log('> DOMAINS: ' + str(DOMAINS)) get.log('> FILTERS: ' + str(FILTERS)) get.setup(debug=True, driver=True) URL = 'https://www.semrush.com/' login(URL) error_times = 0 error_allow = 5 error_domain = None for i, domain in enumerate(DOMAINS): get.log('> Progress: ' + str(i + 1) + '/' + str(len(DOMAINS)) + ': ' + domain) pid, success = add_audit(domain) if success: get.log('> Add audit successed') get.save(get_errors(pid), PROJECT + '/' + get.START_TIME + '/' + domain)
get.pe(col) def reformat_dp(dp, rows_text): dp = get.DataFrame(dp).sort_values(['TAG', 'NUMBER OF DUPLICATION'], ascending=False) rs = get.DataFrame() for tag in rows_text: rs = get.concat([rs, dp[dp['TAG'] == tag]]) return rs if __name__ == '__main__': try: sites = [line for line in get.f2l(get.argv[1]) if len(line) > 0] get.log('> All sites:' + str(sites)) for site in sites: URL = site DOMAIN = get.dmn(URL) DOWNLOADED = [] try: robots = get.get(URL + 'robots.txt') if not robots: get.log('> robots.txt not found') DOWNLOADED.append({ 'URL': URL + 'robots.txt', 'TITLE': 'Missing Robots.txt' }) else: DOWNLOADED.append({ 'URL': URL + 'robots.txt',
career = 'Career Link 1' df_input = get.df('input.csv')[[people, career]] scripts = [{ 'domain': get.dmn(page[0]), 'people': page[0], 'career': check_career(page[1]) } for k, page in df_input.iterrows()] if len(get.sys.argv) == 2: index = int(get.sys.argv[1]) scripts = scripts[index:index + 1] for i, script in enumerate(scripts): try: module = importlib.import_module("scripts." + script['domain']) root = get.dmr(script['people']) file_name = get.fi(i, 4) + '_' + script['domain'] get.log('> File name:\t' + file_name) get.log('> Run script:\tscripts.' + script['domain']) get.log('> GET data') if get.DEBUG: data_path = 'raw/' + get.os.listdir( 'raw')[-1] + '/' + file_name + 'P.csv' get.log('> Load data from: ' + data_path) df = get.df(data_path) rows = [] for k, row in df.iterrows(): rows.append({ 'URL': row['URL'], 'SOUP': get.bs(row['SOUP'], 'html.parser') }) else: get.log('> Load data from: ' + script['people'])