Example #1
def actions(action_sets,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        actions = []
        for row in action_sets:
            driver.get(row[3])
            time.sleep(8)
            actions_soup = BeautifulSoup(driver.page_source,"lxml")
            bowl = actions_soup.find_all('table', attrs={'class': 'xisCas-actionstoc'})
            for spoon in bowl:
                sip = spoon.find_all('tr')
                for swallow in sip:
                    if swallow.find('a'):
                        # second <td> holds the action description
                        temp = swallow.find('td').find_next_sibling('td').text.strip()
                        actions.append([row[0], row[1],
                                        swallow.find('td').text.strip(),
                                        ' '.join(temp.split()),
                                        swallow.find('a').get('href').strip()])
        driver.close()
        #keep the list of links for actions in actions.csv
        header=["Product","ActionSet","Action","Action_Describe","Action_Link"]
        mywriter(pathhead,header,actions,'actions')
        return actions
    else:
        actions = myreader(pathhead,'actions',header='drop')
        return actions
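All of these crawlers persist their results through myreader and mywriter from common.commons, which the examples do not show. A minimal sketch of what they might look like, assuming plain CSV persistence under pathhead (behavior inferred from the call sites, not confirmed by the source):

import csv
import os

def mywriter(pathhead, header, rows, name):
    # hypothetical sketch: write header + rows to <pathhead><name>.csv
    os.makedirs(pathhead, exist_ok=True)
    with open(os.path.join(pathhead, name + '.csv'), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

def myreader(pathhead, name, header='keep'):
    # hypothetical sketch: read rows back; header='drop' skips the header row
    with open(os.path.join(pathhead, name + '.csv'), newline='') as f:
        rows = list(csv.reader(f))
    return rows[1:] if header == 'drop' else rows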
Example #2
def action_sets(products,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        action_sets = []
        for row in products:
            driver.get(row[1])
            time.sleep(10)
            action_soup = BeautifulSoup(driver.page_source,"lxml")
            bowl = action_soup.find_all('tr')
            for spoon in bowl:
                sip = spoon.find_all('td')
                # three-cell rows hold: linked cell | name | description
                # (the anchor check guards against rows without a link)
                if len(sip) == 3 and sip[0].find('a'):
                    action_sets.append([row[0], sip[1].text.strip(),
                                        ' '.join(sip[2].text.split()),
                                        sip[0].find('a').get('href').strip(),
                                        ' '.join(sip[0].text.split())])
        driver.close()
        #keep the list of links for actions in action_sets.csv
        header=["Product","ActionSet","ActionSet_Describe","ActionSet_Link","ActionSet_LinkText"]
        mywriter(pathhead,header,action_sets,'action_sets')
        return action_sets
    else:
        action_sets = myreader(pathhead,'action_sets',header='drop')
        return action_sets
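For reference, a made-up fragment in the shape the len(sip) == 3 branch above expects (the field order matches the columns collected):

from bs4 import BeautifulSoup

html = ('<table><tr><td><a href="/doc/setX">setX doc</a></td>'
        '<td>setX</td><td>Runs X</td></tr></table>')
tds = BeautifulSoup(html, 'lxml').find_all('td')
print([tds[1].text, ' '.join(tds[2].text.split()),
       tds[0].find('a').get('href'), ' '.join(tds[0].text.split())])
# ['setX', 'Runs X', '/doc/setX', 'setX doc']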
Example #3
def products(url,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source,"lxml")
        driver.close()
        #print(soup)

        # Build the product list
        bowl = soup.find_all('div', attrs={'class': 'xisDoc-toc_1 ng-scope'})
        #printlist(bowl)
        products = []
        for spoon in bowl:
            products.append([spoon.text.strip(), spoon.find('a').get('href')])
        #printlist(products)
        #keep the list of links for actions in products.csv
        header=["Product","Product_Link"]
        mywriter(pathhead,header,products,'products')
        return products
    else:
        products = myreader(pathhead,'products',header='drop')
        return products
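Taken together, the three crawlers above chain output to input: products feeds action_sets, which feeds actions. A minimal usage sketch (the URL is a placeholder, not a real endpoint):

START_URL = 'https://documentation.sas.com/...'  # placeholder; substitute the real page

product_rows = products(START_URL)       # [Product, Product_Link]
set_rows = action_sets(product_rows)     # visits row[1], the product link
action_rows = actions(set_rows)          # visits row[3], the action-set link

# later runs can skip the crawl and reload the saved CSVs:
action_rows = actions([], reload='Yes')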
Example #4
def procs_plus(procs,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        # start with a copy of procs and add columns (avoid mutating the caller's list):
        procs_plus = [list(row) for row in procs]
        # helper: return the link as-is if absolute, else prepend the stump url
        def check_addstump(link, stump):
            link = link.strip()
            if link.startswith('http'):
                return link
            else:
                return stump + link
        # cycle through procedure links, check for overview and contrasted links: Collect = Product | Procedure | Procedure_Short | Procedure_Link | Overview_Link | Compared_Link
        comp_stump='https://documentation.sas.com'
        #procs_plus = procs_plus[393:397] #subset for testing
        #procs_plus = procs_plus[290:296] #subset for testing
        driver = webdriver.Safari()
        for row in procs_plus:
            driver.get(row[3])
            time.sleep(10)
            proc_soup = BeautifulSoup(driver.page_source,"lxml")
            # Overview link: take the first anchor whose text and href both match
            for proc_link in proc_soup.find_all('a'):
                if ("Overview" in proc_link.text) and proc_link.get('href'):
                    if "overview" in proc_link.get('href'):
                        row.append(check_addstump(proc_link.get('href'), comp_stump))
                        break  # only the first match; extra appends would shift columns
            if len(row) != 5:
                row.append('')
            # Contrasted/Compared link: same approach
            comps = ["Contrasted", "Compared"]
            for proc_link in proc_soup.find_all('a'):
                if any(comp in proc_link.text for comp in comps) and proc_link.get('href'):
                    row.append(check_addstump(proc_link.get('href'), comp_stump))
                    break  # only the first match
            if len(row) != 6:
                row.append('')
        driver.quit()
        #keep the list of links for products and procedures in procs_plus.csv
        header=["Product","Procedure","Procedure_Short","Procedure_Link","Overview_Link","Compared_Link"]
        mywriter(pathhead,header,procs_plus,'procs_plus')
        return procs_plus
    else:
        procs_plus = myreader(pathhead,'procs_plus',header='drop')
        return procs_plus
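The check_addstump helper simply distinguishes absolute links from site-relative ones; a standalone copy to illustrate:

def check_addstump(link, stump):
    # prefix relative links with the site stump; pass absolute links through
    link = link.strip()
    return link if link.startswith('http') else stump + link

stump = 'https://documentation.sas.com'
print(check_addstump(' /doc/overview.htm ', stump))
# https://documentation.sas.com/doc/overview.htm
print(check_addstump('https://example.com/x', stump))
# https://example.com/x  (unchanged)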
Example #5
def viya_procs(url, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/viya_procs/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, "lxml")
        driver.close()
        #print(soup)

        # Build the collect list: Product | Procedure | Procedure_Short | Procedure_Link
        bowl = soup.find_all(
            ['h2', 'p'], attrs={'class': ['xisDoc-title', 'xisDoc-paragraph']})
        products = []
        viya_procs = []
        for spoon in bowl:
            if spoon.name == 'h2' and "SAS Products" not in spoon.text:
                products.append(spoon.text.strip())
            if spoon.name == 'p' and products:
                block = spoon.find('a')
                if block:
                    link = block.get('href')
                    proc = ' '.join(block.text.split())
                    # TEMPLATE shows up as "TEMPLATE:" because its entry has
                    # multiple links, so drop the colon before taking the first word
                    proc_short = proc.replace(': ', ' ')
                    proc_short = proc_short.split(' ', 1)[0]
                    viya_procs.append(
                        [products[-1], proc, proc_short,
                         link.strip()])
        #keep the list of links for products and procedures in procs.csv
        header = ["Product", "Procedure", "Procedure_Short", "Procedure_Link"]
        mywriter(pathhead, header, viya_procs, 'viya_procs')
        return viya_procs
    else:
        viya_procs = myreader(pathhead, 'viya_procs', header='drop')
        return viya_procs
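The two-step proc_short derivation exists for entries like TEMPLATE, whose link text carries a trailing colon; a quick illustration with made-up inputs:

for proc in ['TEMPLATE: Creating Output', 'FREQ Procedure']:
    proc_short = proc.replace(': ', ' ').split(' ', 1)[0]
    print(proc, '->', proc_short)
# TEMPLATE: Creating Output -> TEMPLATE
# FREQ Procedure -> FREQ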
Example #6
def procs(url,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source,"lxml")
        driver.close()
        #print(soup)

        # Build the collect list: Product | Procedure | Procedure_Short | Procedure_Link
        bowl = soup.find_all(['h2', 'p'], attrs={'class': ['xisDoc-title', 'xisDoc-paragraph']})
        procs = []
        product = []
        for spoon in bowl:
            #print('line - ', spoon)
            if spoon.name=='h2' and "SAS Products" not in spoon.text:
                product.append(spoon.text.strip())
            if spoon.name=='p' and product:
                block = spoon.find('a')
                if block:
                    link = block.get('href')
                    proc = ' '.join(block.text.split())
                    # TEMPLATE shows up as "TEMPLATE:" because its entry has multiple links
                    proc_short = proc.replace(': ', ' ')
                    proc_short = proc_short.split(' ', 1)[0]
                    procs.append([product[-1], proc, proc_short, link.strip()])
        # remove the few cases where a "procedure" is really another product
        # (as in "includes contents of product ..."); filter into a new list
        # rather than deleting while iterating, which would skip rows
        procs = [item for item in procs if item[1] not in product]
        #keep the list of links for products and procedures in procs.csv
        header=["Product","Procedure","Procedure_Short","Procedure_Link"]
        mywriter(pathhead,header,procs,'procs')
        return procs
    else:
        procs = myreader(pathhead,'procs',header='drop')
        return procs
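The filter at the end of Example #6 (and the one in Example #7 below) is written as a comprehension because deleting from a list while enumerating it skips the element that slides into the freed slot:

items = ['a', 'x', 'x', 'b']
for i, item in enumerate(items):
    if item == 'x':
        del items[i]
print(items)  # ['a', 'x', 'b'] -- the second 'x' survives

items = ['a', 'x', 'x', 'b']
print([item for item in items if item != 'x'])  # ['a', 'b'] -- correct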
Example #7
    return inputlist  # tail of node_name(); the rest of its body is truncated here


# process procs_linked - data clean, create node_name for unique occurrences of PROC
procs_linked = myreader('crawlers/procs_by_product/', 'procs_linked')
header = procs_linked.pop(0)

# The "TEMPLATE: <type of template>" rows break the dot .svg build, since ':' is interpreted as a port
for row in procs_linked:
    if 'TEMPLATE:' in row[1]:
        row[1] = row[1].replace(':', ' -')

# add node_name column for the short proc name in [2]; feed the first three columns [:3] Product, Proc, Proc_Short
procs_linked = node_name(procs_linked, header, 'PROC_', 3, 2)
print(header)
mywriter('process/processed_data/', header, procs_linked, 'procs_linked')

# process viya actionsets - remove actionsets with no actions, add node_name to actions and action_sets
actions = myreader('crawlers/actions_by_product/', 'actions')
action_sets = myreader('crawlers/actions_by_product/', 'action_sets')

a_header = actions.pop(0)
as_header = action_sets.pop(0)

# collect the action sets that actually contain actions
used_actionsets = {action[1] for action in actions}

# drop action sets that have no actions; filter into a new list rather than
# deleting while enumerating, which would skip entries
action_sets = [s for s in action_sets if s[1] in used_actionsets]
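Only the final return inputlist of node_name survives at the top of this example. A hypothetical reconstruction, inferred solely from the call site node_name(procs_linked, header, 'PROC_', 3, 2) and the surrounding comments (keep the first three columns, build a node name from the short name in column 2), could look like this:

def node_name(inputlist, header, prefix, keep, idx):
    # hypothetical: trim each row to its first `keep` columns, then append a
    # node_name column built from prefix + the short name in column `idx`,
    # suffixing duplicates so every node name is unique
    header[:] = header[:keep] + ['node_name']
    seen = {}
    for row in inputlist:
        base = prefix + row[idx].replace(' ', '_')
        seen[base] = seen.get(base, 0) + 1
        row[:] = row[:keep] + [base if seen[base] == 1 else f'{base}_{seen[base]}']
    return inputlist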