Example #1
0
def soups(rows, folder, t_id):
    """Fetch each row's URL and enrich the row dict in place with response
    metadata (folder depth, redirects, status, headers, timing) and a parsed
    BeautifulSoup tree, then run the sibling check_* validators on it.

    Args:
        rows: iterable of dicts, each with a 'URL' key; mutated in place.
        folder: unused in this body — the original shadowed it with the loop
            variable (renamed to `segment` here). TODO confirm the parameter
            is still needed by callers.
        t_id: unused here — presumably a thread/task id; verify against callers.
    """
    for row in rows:
        try:
            get.log('> GET ' + row['URL'])
            rq = get.get(row['URL'])
            # Path segments between scheme and the final path component;
            # index 0 is the host, so it is skipped below.
            segments = row['URL'].split('//')[1].split('/')[:-1]
            for i, segment in enumerate(segments):
                if i == 0:
                    continue
                row['FOLDER LEVEL ' + str(i)] = segment
            row['DEPTH'] = len(segments)
            row['FINAL URL'] = rq.url
            row['FINAL URL LENGTH'] = len(rq.url)
            row['HEADERS'] = rq.headers
            row['CONTENT TYPE'] = rq.headers.get('Content-Type', '')
            row['DATE'] = rq.headers['Date']
            row['RESPONSE TIME (MS)'] = rq.elapsed.microseconds / 1000
            # NOTE(review): 'TEMPERARY' is misspelled but kept verbatim —
            # downstream reports may match on this exact value.
            row['REDIRECT TYPE'] = 'NONE' if not rq.is_redirect else 'PERMANENT' if rq.is_permanent_redirect else 'TEMPERARY'
            row['REDIRECT'] = rq.is_redirect
            row['STATUS'] = rq.reason
            row['STATUS CODE'] = rq.status_code
            row['HTTP/HTTPS'] = 'HTTPS' if 'https' in rq.url[:10] else 'HTTP'
            row['SOUP'] = get.bs(rq.content, 'html.parser')
            check('title', row)
            check('h1', row)
            check('h2', row)
            check_meta(row)
            check_img(row)
            check_canonical_link(row)
        except Exception:  # was a bare except; log and continue with next row
            get.pe()
def login(url):
    """Log in to SEMRush through the browser driver.

    Reads the credentials from 'SEMRush_account.txt' (line 0 = email,
    line 1 = password), opens the login popup, fills and submits the form,
    then sleeps 5s for the session to establish.

    NOTE(review): the `get.*` helper semantics below are inferred from
    their names and arguments — confirm against the `get` module.

    Args:
        url: page to open before logging in (the SEMRush root URL).
    """
    get.dgs(url)  # presumably: drive the browser to `url`
    get.log('> Login')
    account = get.f2l('SEMRush_account.txt')  # file-to-lines: [email, password]
    get.dec('header__navigation-login')  # click element by class
    get.dsk(account[0], 'email', 'name')  # send keys to input located by name
    get.dsk(account[1], 'password', 'name')
    get.dec('auth-popup__submit', clas='data-test')
    get.sleep(5)  # crude wait for the post-login redirect
Example #3
0
 def get_data_from_table(index=0):
     """Parse the index-th 'backgrid' HTML table into a DataFrame.

     The first <th>/<td> of every row is skipped (leading index/checkbox
     column). Column-specific parsing:
       * 'Domain Name' -> raw cell text
       * 'Overlap'     -> float percentage taken from the bar <div>'s
                          'width: N%' style; or a Rank / Rank Change pair
                          when the cell holds an <i>; otherwise -1
       * 'Keyword'     -> first child's text, plus '<col> URL' from the
                          second child when present
       * 'Ad Timeline' -> first child's text
       * other named   -> 'rank (change)' split into Rank/<col>, or a
                          numeric value via get.s2f

     Returns:
         get.DataFrame of parsed rows, or None when the table is absent.
     """
     tbl = get.be('backgrid', 'table', index=index)
     if not tbl:
         get.log("> There is no table data")
         return
     cols = [e.text.strip() for e in tbl.thead.findAll('th')][1:]  #column names
     rows = []
     for tr in tbl.tbody.findAll('tr'):
         row = {}
         try:
             for i, td in enumerate(tr.findAll('td')[1:]):
                 col = cols[i]
                 if col == 'Domain Name':  #domain name column
                     row[col] = td.text
                 elif col == 'Overlap':
                     if td.a:  #overlap column: percentage lives in the width style
                         overlap = td.a.div['style'].replace('width: ', '').replace('%', '')
                         row[col] = float(overlap)
                     elif td.i:  # rank change rendered in this column
                         text = td.text.split('(')
                         row['Rank'] = text[0]
                         row['Rank Change'] = text[1].replace(')', '') if len(text) > 1 else ''
                     else:
                         row[col] = -1
                 elif col == 'Keyword':
                     keyword = get.bec(td)
                     row[col] = keyword[0].text
                     row[col + ' URL'] = keyword[1].text if len(keyword) > 1 else ''
                 elif col == 'Ad Timeline':
                     keyword = get.bec(td)
                     row[col] = keyword[0].text
                 elif len(col) > 0:  #other named column
                     if '(' in td.text:  # 'rank (change)' form
                         text = td.text.split('(')
                         row['Rank'] = int(text[0])
                         change = text[1].replace(')', '')
                         row[col] = int(change) if change.isdigit() else 0
                     else:
                         row[col] = get.s2f(td.text)
         except Exception:  # was a bare except; keep the partially-parsed row
             get.pe()
         rows.append(row)
     return get.DataFrame(rows)
def add_audit(domain):
    """Create a SEMRush project for `domain` and start its site audit.

    Drives the projects page: adds a project (domain doubles as the
    project name), clicks through the audit setup, then polls until the
    audit progress bar appears or the poll limit is hit.

    NOTE(review): `get.*` helper semantics are inferred from names and
    CSS-class arguments — confirm against the `get` module.

    Returns:
        (project_id, started): project id parsed from the driver's current
        URL, and True when the progress bar appeared within `max_loop` polls.
    """
    get.dgs('https://www.semrush.com/projects/')
    #ADD DOMAIN
    get.dec('s-btn -xs -primary js-add-project')  # open the add-project dialog
    get.dsk(domain, 'js-input-domain')
    get.dsk(domain, 'js-input-name')  # project name = domain
    get.dec('s-btn -s -success js-save-pr')  # save the new project
    # Poll (3-7s jitter) until the 'setup' action button renders.
    while not get.de('setup', clas='data-action'):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup button')
    get.dec('setup', clas='data-action')
    # Poll until the last 's-btn__text' button's label mentions 'Audit'.
    while 'Audit' not in get.det('s-btn__text', index=-1):
        get.sleep(get.randint(3, 7))
        get.log('> Waiting for setup audit button')
    get.dec('s-btn__text', index=-1)
    counter = 0
    max_loop = 20  # give up after 20 polls (3-7s each)
    try:  #WAIT FOR PROGRESS BAR
        while not get.de('s-widget__progress-title'):
            get.sleep(get.randint(3, 7))
            get.log('> Waiting for progress bar ' + str(counter * 3) + 's')
            if counter == max_loop:
                get.log('> ')
                break
            # Dismiss any error popup that would block the audit start.
            error_btn = get.de('s-btn -danger -xs')
            if error_btn:
                error_btn.click()
            counter += 1
    except:
        get.pe()
    return get_project_id(get.DR.current_url), counter < max_loop
def delete_project(pid, domain):
    """Delete SEMRush project `pid` (for `domain`) from the projects list.

    Returns True on success. On failure the error is logged and the
    function falls through to a fresh login attempt (see note below),
    returning None implicitly.

    NOTE(review): `get.*` helper semantics are inferred from names —
    confirm against the `get` module.
    """
    #DELETE DOMAIN FROM PROJECTS LIST
    #IF THERE ARE 5 PROJECT AND THE DOMAIN IS AT #5 THEN
    #WEBDRIVER HEIGHT HAS TO >1100 OR ELSE THE DELETE BUTTON IS NOT VISIBLE = NOT CLICKABLE
    try:
        get.log('> Deleting project: ' + domain + ', PID=' + pid)
        get.dgs('https://www.semrush.com/projects/')
        # The trailing space is part of the class string being matched.
        div = get.de('s-project js-project-' + pid + ' ')
        while not div:  # reload until the project card renders
            get.sleep(get.randint(3, 7))
            get.dgs('https://www.semrush.com/projects/')
            div = get.de('s-project js-project-' + pid + ' ')
        div.find_element_by_class_name('sr-infomenu').click()  # open card menu
        content = div.find_element_by_class_name('sr-infomenu-content')
        # Second menu item — presumably 'Delete'; TODO confirm in the UI.
        content.find_elements_by_tag_name('li')[1].click()
        get.dsk(domain, 'Project name', 'placeholder')  # type into confirm field
        get.dec('s-btn -s -danger js-remove')
        get.log('> Deleted project: ' + domain + ', PID=' + pid)
        return True
    except:
        get.pe('Can not delete the project: ' + domain + ' > PID=' + pid)
    # Reached only after a failure: re-authenticate — presumably a dropped
    # session is the expected failure cause. TODO confirm this fallback.
    account = get.f2l('SEMRush_account.txt')
    get.dec('header__navigation-login')
    get.dsk(account[0], 'email', 'name')
    get.dsk(account[1], 'password', 'name')
    get.dec('auth-popup__submit', clas='data-test')
    get.sleep(5)


try:
    PROJECT = 'SEMRushErrorsReport'
    FILTERS = set(get.f2l('filters.txt'))
    DOMAINS = get.f2l('domains.txt')
    DOMAINS = [dm[dm.index('//') + 2:] if '//' in dm else dm for dm in DOMAINS]
    DOMAINS = [dm[:dm.index('/')] if '/' in dm else dm for dm in DOMAINS]
    DOMAINS = [dm for dm in DOMAINS if len(dm) > 3]
    get.log('> DOMAINS: ' + str(DOMAINS))
    get.log('> FILTERS: ' + str(FILTERS))
    get.setup(debug=True, driver=True)
    URL = 'https://www.semrush.com/'
    login(URL)
    error_times = 0
    error_allow = 5
    error_domain = None
    for i, domain in enumerate(DOMAINS):
        get.log('> Progress: ' + str(i + 1) + '/' + str(len(DOMAINS)) + ': ' +
                domain)
        pid, success = add_audit(domain)
        if success:
            get.log('> Add audit successed')
            get.save(get_errors(pid),
                     PROJECT + '/' + get.START_TIME + '/' + domain)
Example #7
0
        get.pe(col)


def reformat_dp(dp, rows_text):
    """Sort duplicate-content rows and regroup them in `rows_text` tag order.

    Args:
        dp: rows (dicts or DataFrame-compatible) with 'TAG' and
            'NUMBER OF DUPLICATION' columns.
        rows_text: iterable of tag names giving the desired group order.

    Returns:
        get.DataFrame with rows grouped by tag in `rows_text` order; within
        the whole frame, sorted by (TAG, NUMBER OF DUPLICATION) descending.
    """
    ordered = get.DataFrame(dp).sort_values(['TAG', 'NUMBER OF DUPLICATION'],
                                            ascending=False)
    # Collect the per-tag slices and concatenate ONCE: growing the frame
    # with concat inside the loop copies the accumulated data every
    # iteration (quadratic). Assumes get.concat wraps pandas.concat —
    # TODO confirm.
    parts = [ordered[ordered['TAG'] == tag] for tag in rows_text]
    if not parts:
        return get.DataFrame()  # no tags requested -> empty frame, as before
    return get.concat(parts)


if __name__ == '__main__':
    try:
        sites = [line for line in get.f2l(get.argv[1]) if len(line) > 0]
        get.log('> All sites:' + str(sites))
        for site in sites:
            URL = site
            DOMAIN = get.dmn(URL)
            DOWNLOADED = []
            try:
                robots = get.get(URL + 'robots.txt')
                if not robots:
                    get.log('> robots.txt not found')
                    DOWNLOADED.append({
                        'URL': URL + 'robots.txt',
                        'TITLE': 'Missing Robots.txt'
                    })
                else:
                    DOWNLOADED.append({
                        'URL': URL + 'robots.txt',
Example #8
0
 career = 'Career Link 1'
 df_input = get.df('input.csv')[[people, career]]
 scripts = [{
     'domain': get.dmn(page[0]),
     'people': page[0],
     'career': check_career(page[1])
 } for k, page in df_input.iterrows()]
 if len(get.sys.argv) == 2:
     index = int(get.sys.argv[1])
     scripts = scripts[index:index + 1]
 for i, script in enumerate(scripts):
     try:
         module = importlib.import_module("scripts." + script['domain'])
         root = get.dmr(script['people'])
         file_name = get.fi(i, 4) + '_' + script['domain']
         get.log('> File name:\t' + file_name)
         get.log('> Run script:\tscripts.' + script['domain'])
         get.log('> GET data')
         if get.DEBUG:
             data_path = 'raw/' + get.os.listdir(
                 'raw')[-1] + '/' + file_name + 'P.csv'
             get.log('> Load data from: ' + data_path)
             df = get.df(data_path)
             rows = []
             for k, row in df.iterrows():
                 rows.append({
                     'URL': row['URL'],
                     'SOUP': get.bs(row['SOUP'], 'html.parser')
                 })
         else:
             get.log('> Load data from: ' + script['people'])