Code Example #1
def actions(action_sets,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        actions = []
        for row in action_sets:
            driver.get(row[3])
            time.sleep(8)
            actions_soup = BeautifulSoup(driver.page_source,"lxml")
            bowl = actions_soup.find_all('table', class_='xisCas-actionstoc')
            for spoon in bowl:
                sip = spoon.find_all('tr')
                for swallow in sip:
                    if swallow.find('a'):
                        # first <td> is the action name, its sibling <td> the description
                        temp = swallow.find('td').find_next_sibling('td').text.strip()
                        actions.append([row[0],
                                        row[1],
                                        swallow.find('td').text.strip(),
                                        ' '.join(temp.split()),
                                        swallow.find('a').get('href').strip()])
        driver.close()
        #keep the list of links for actions in actions.csv
        header=["Product","ActionSet","Action","Action_Describe","Action_Link"]
        mywriter(pathhead,header,actions,'actions')
        return actions
    else:
        actions = myreader(pathhead,'actions',header='drop')
        return actions
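
# A minimal usage sketch (inputs are made up): each row fed to actions() is
# expected to look like a row of action_sets() output below, i.e.
# [Product, ActionSet, ActionSet_Describe, ActionSet_Link, ...], since the
# function reads row[0], row[1] and fetches row[3].
sample_sets = [['SAS Viya', 'cardinality', 'Explore data cardinality',
                'https://example.com/actionset']]   # placeholder URL
all_actions = actions(sample_sets)                  # crawls each ActionSet_Link
cached = actions(sample_sets, reload='Yes')         # re-reads actions.csv instead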
Code Example #2
def action_sets(products,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        action_sets = []
        for row in products:
            driver.get(row[1])
            time.sleep(10)
            action_soup = BeautifulSoup(driver.page_source,"lxml")
            bowl = action_soup.find_all('tr')
            for spoon in bowl:
                sip = spoon.find_all('td')
                # rows with exactly three cells are action-set entries:
                # cell 0 = link, cell 1 = name, cell 2 = description
                if len(sip) == 3:
                    action_sets.append([row[0],
                                        sip[1].text.strip(),
                                        ' '.join(sip[2].text.split()),
                                        sip[0].find('a').get('href').strip(),
                                        ' '.join(sip[0].text.split())])
        driver.close()
        #keep the list of links for actions in action_sets.csv
        header=["Product","ActionSet","ActionSet_Describe","ActionSet_Link","ActionSet_LinkText"]
        mywriter(pathhead,header,action_sets,'action_sets')
        return action_sets
    else:
        action_sets = myreader(pathhead,'action_sets',header='drop')
        return action_sets
Code Example #3
def products(url,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/actions_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source,"lxml")
        driver.close()
        #print(soup)

        # Build the product list
        bowl = soup.find_all('div', class_='xisDoc-toc_1 ng-scope')
        #printlist(bowl)
        products = []
        for spoon in bowl:
            products.append([spoon.text,spoon.find('a').get('href')])
        #printlist(products)
        #keep the list of links for actions in products.csv
        header=["Product","Product_Link"]
        mywriter(pathhead,header,products,'products')
        return products
    else:
        products = myreader(pathhead,'products',header='drop')
        return products
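
# How the three crawlers chain together (a sketch; the start URL is a
# placeholder): products() scrapes the product index, its rows feed
# action_sets() (row[1] = Product_Link), whose rows feed actions()
# (row[3] = ActionSet_Link).
start_url = 'https://documentation.sas.com/...'   # placeholder, not a real page
product_rows = products(start_url)
set_rows = action_sets(product_rows)
action_rows = actions(set_rows)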
Code Example #4
def procs_plus(procs,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        # start with procs and add columns:
        procs_plus = procs
        # helper: return the link unchanged if absolute, else prefix the stump URL
        def check_addstump(link, stump):
            link = link.strip()
            if link.startswith('http'):
                return link
            return stump + link
        # cycle through procedure links, check for overview and contrasted links: Collect = Product | Procedure | Procedure_Short | Procedure_Link | Overview_Link | Compared_Link
        comp_stump='https://documentation.sas.com'
        #procs_plus = procs_plus[393:397] #subset for testing
        #procs_plus = procs_plus[290:296] #subset for testing
        driver = webdriver.Safari()
        for row in procs_plus:
            driver.get(row[3])
            time.sleep(10)
            proc_soup = BeautifulSoup(driver.page_source,"lxml")
            for proc_link in proc_soup.find_all('a'):
                if ("Overview" in proc_link.text) and proc_link.get('href'):
                    if "overview" in proc_link.get('href'):
                        row.append(check_addstump(proc_link.get('href'),comp_stump))
            if len(row) != 5:
                row.append('')
            for proc_link in proc_soup.find_all('a'):
                comps = ["Contrasted", "Compared"]
                if any(comp in proc_link.text for comp in comps) and proc_link.get('href'):
                    row.append(check_addstump(proc_link.get('href'), comp_stump))
            if len(row) != 6:
                row.append('')
        driver.quit()
        #keep the list of links for products and procedures in procs_plus.csv
        header=["Product","Procedure","Procedure_Short","Procedure_Link","Overview_Link","Compared_Link"]
        mywriter(pathhead,header,procs_plus,'procs_plus')
        return procs_plus
    else:
        procs_plus = myreader(pathhead,'procs_plus',header='drop')
        return procs_plus
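
# check_addstump in action (illustrative values): relative hrefs get the
# documentation host prefixed, absolute ones pass through unchanged.
#   check_addstump('/doc/proc/overview.htm', 'https://documentation.sas.com')
#     -> 'https://documentation.sas.com/doc/proc/overview.htm'
#   check_addstump('https://example.com/x', 'https://documentation.sas.com')
#     -> 'https://example.com/x'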
Code Example #5
def viya_procs(url, reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/viya_procs/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, "lxml")
        driver.close()
        #print(soup)

        # Build the collect list: Product | Procedure | Procedure_Short | Procedure_Link
        bowl = soup.find_all(
            ['h2', 'p'], attrs={'class': ['xisDoc-title', 'xisDoc-paragraph']})
        products = []
        viya_procs = []
        for spoon in bowl:
            if spoon.name == 'h2' and "SAS Products" not in spoon.text:
                products.append(spoon.text.strip())
            if spoon.name == 'p' and products:
                block = spoon.find('a')
                if block:
                    link = block.get('href')
                    proc = ' '.join(block.text.split())
                    # 'TEMPLATE' scrapes as 'TEMPLATE:' because its entry has
                    # multiple links, so drop the colon before taking the first word
                    proc_short = proc.replace(': ', ' ')
                    proc_short = proc_short.split(' ', 1)[0]
                    viya_procs.append([products[-1], proc, proc_short, link.strip()])
        #keep the list of links for products and procedures in procs.csv
        header = ["Product", "Procedure", "Procedure_Short", "Procedure_Link"]
        mywriter(pathhead, header, viya_procs, 'viya_procs')
        return viya_procs
    else:
        viya_procs = myreader(pathhead, 'viya_procs', header='drop')
        return viya_procs
Code Example #6
def procs(url,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        driver = webdriver.Safari()
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source,"lxml")
        driver.close()
        #print(soup)

        # Build the collect list: Product | Procedure | Procedure_Short | Procedure_Link
        bowl = soup.find_all(['h2','p'], attrs={'class': ['xisDoc-title','xisDoc-paragraph']})
        procs = []
        product = []
        for spoon in bowl:
            #print('line - ', spoon)
            if spoon.name=='h2' and "SAS Products" not in spoon.text:
                product.append(spoon.text.strip())
            if spoon.name=='p' and product:
                block = spoon.find('a')
                if block:
                    link = block.get('href')
                    proc = ' '.join(block.text.split())
                proc_short = proc.replace(': ',' ') # 'TEMPLATE' scrapes as 'TEMPLATE:' because its entry has multiple links
                    proc_short = proc_short.split(' ',1)[0]
                    procs.append([product[-1], proc, proc_short, link.strip()])
        # remove the few cases where a product entry actually lists another
        # product (not a proc), as in "includes contents of product ..."
        # (filter with a comprehension; del inside enumerate() skips the next row)
        procs = [item for item in procs if item[1] not in product]
        #keep the list of links for products and procedures in procs.csv
        header=["Product","Procedure","Procedure_Short","Procedure_Link"]
        mywriter(pathhead,header,procs,'procs')
        return procs
    else:
        procs = myreader(pathhead,'procs',header='drop')
        return procs
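
# The PROC pipeline (a sketch; the index URL is a placeholder): procs() scrapes
# the product/procedure index, procs_plus() above adds the Overview_Link and
# Compared_Link columns, and procs_linked() below resolves which other PROCs
# those pages mention.
procs_url = 'https://documentation.sas.com/...'   # placeholder, not a real page
proc_rows = procs(procs_url)
plus_rows = procs_plus(proc_rows)
# linked_rows = procs_linked(plus_rows)           # see Code Example #7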
Code Example #7
def procs_linked(procs_plus,reload='No'):
    from selenium import webdriver
    from bs4 import BeautifulSoup
    import time
    import csv
    import re
    from common.commons import myreader, mywriter
    pathhead = 'crawlers/procs_by_product/'
    if reload == 'No':
        # start with procs_plus and add columns:
        procs_linked = procs_plus
        #build a list of procedures
        procedures = []
        for row in procs_linked:
            if row[2] not in procedures:
                procedures.append(row[2])
        #keep the list of links for products and procedures in procs_linked.csv
        header=["Product","Procedure","Procedure_Short","Procedure_Link","Overview_Link","Compared_Link",'Compared_PROCS']
        with open(pathhead + "procs_linked.csv", "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(header)
        driver = webdriver.Safari()
        for row in procs_linked:
            row.append('')
            regex = r"\b[A-Z][A-Z]+\b"
            compared_procs = []
            if row[5]: # get compared PROCs
                driver.get(row[5])
                time.sleep(10)
                comp_soup = BeautifulSoup(driver.page_source,"lxml")
                for comp_link in comp_soup.find_all('p'):
                    for match in re.finditer(regex, comp_link.text):
                        m = match.group()
                        # keep matches not already found, present in the full
                        # PROC list, and not the current proc itself
                        if (m not in compared_procs) and (m in procedures) and (m != row[2]):
                            compared_procs.append(m)
                            row[6] = m
                            with open(pathhead + "procs_linked.csv", "a", newline="") as f:
                                writer = csv.writer(f)
                                writer.writerow(row)
            if row[4]: # get overview PROCs - only keep ones not already covered in compared
                driver.get(row[4])
                time.sleep(10)
                comp_soup = BeautifulSoup(driver.page_source,"lxml")
                for comp_link in comp_soup.find_all('p'):
                    for match in re.finditer(regex, comp_link.text):
                        m = match.group()
                        # keep matches not already found, present in the full
                        # PROC list, and not the current proc itself
                        if (m not in compared_procs) and (m in procedures) and (m != row[2]):
                            compared_procs.append(m)
                            row[6] = m
                            with open(pathhead + "procs_linked.csv", "a", newline="") as f:
                                writer = csv.writer(f)
                                writer.writerow(row)
            if not compared_procs:
                row[6]=''
                with open(pathhead + "procs_linked.csv", "a", newline="") as f:
                    writer = csv.writer(f)
                    writer.writerow(row)
        driver.quit()
        return procs_linked
    else:
        procs_linked = myreader(pathhead,'procs_linked',header='drop')
        return procs_linked
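
# What the all-caps regex picks up (standalone illustration with made-up text):
# \b[A-Z][A-Z]+\b matches runs of two or more capital letters, which is how
# PROC names are spotted; matches are then filtered against the full
# `procedures` list, since words like 'PROC' itself also match.
import re
sample = "PROC FREQ is compared with the MEANS and SUMMARY procedures."
print(re.findall(r"\b[A-Z][A-Z]+\b", sample))
# -> ['PROC', 'FREQ', 'MEANS', 'SUMMARY']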
Code Example #8
from graphviz import Digraph
import csv
import itertools
from common.commons import myreader

# read in csv
prodclus_input = myreader('process/processed_data/',
                          'product_clusters',
                          header='drop')
procs_linked_input = myreader('process/processed_data/',
                              'procs_linked',
                              header='drop')
prodedge_input = myreader('process/manual_inputs/',
                          'product_edges_manual',
                          header='drop')
action_sets_input = myreader('process/processed_data/',
                             'action_sets',
                             header='drop')
actions_input = myreader('process/processed_data/', 'actions', header='drop')


# function for cluster (subgraph) naming with iteration
def subs(name, label, color):
    name = Digraph(name, comment=label)
    name.attr(label=label,
              style="filled",
              fillcolor=color,
              tooltip="This Tooltip (Cluster)")
    return name
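
# A usage sketch for subs() (names are illustrative): Graphviz only renders a
# subgraph as a visual cluster when its name starts with "cluster".
viya = subs('cluster_viya', 'SAS Viya', 'lightblue')
viya.node('PROC_FREQ0', label='FREQ')
parent = Digraph(comment='SAS', strict=True)
parent.subgraph(viya)   # attach the filled cluster to the parent graph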

Code Example #9
from graphviz import Digraph
import csv
import itertools
from common.commons import myreader



# read in csv
prodclus = myreader('process/processed_data/','product_clusters',header='drop')
collect = myreader('process/processed_data/','procs_linked',header='drop')
prodedge = myreader('process/manual_inputs/','product_edges_manual',header='drop')

# function for cluster (subgraph) naming with iteration
def subs(name,label,color):
    name = Digraph(name,comment=label)
    name.attr(label=label,style="filled",fillcolor=color,tooltip="This Tooltip (Cluster)")
    return name

# function for escaping & in URLs so the SVG works in a browser
def url_escape(url):
    url_esc = {"&": "&amp;"}  # "&" must be written as "&amp;" in valid SVG/XML
    return "".join(url_esc.get(c, c) for c in url)




# create dot graph with graphviz
dot = Digraph(comment='SAS', strict=True)
# alternative: splines='ortho'
dot.attr(rankdir='LR', splines='polyline')  # other options: compound='True', nodesep='0.1', ranksep='.02', ratio='compress'
Code Example #10
def node_name(inputlist, header, prefix, keepcoln, labelcoln):
    # (excerpt begins mid-function; the signature is reconstructed from the
    # call below, and steps 1-3, which build `groups`, are not shown)
    node_list = []
    for group in groups:
        for i, row in enumerate(group):
            row.append(prefix + row[labelcoln] + str(i))
            node_list.append(row)
    # 4 - merge node_list back into inputlist to add the Node_Name column
    for row in inputlist:
        for node in node_list:
            if row[:keepcoln] == node[:keepcoln]:
                row.append(node[-1])
    header.append('Node_Name')
    return inputlist
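
# A worked illustration with made-up rows: given prefix='PROC_', keepcoln=3,
# labelcoln=2, a group row ['Base SAS', 'FREQ Procedure', 'FREQ', link] gets
# node name 'PROC_' + 'FREQ' + '0' = 'PROC_FREQ0' (label plus its index within
# the group), and that name is copied onto every matching inputlist row.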


# process procs_linked - clean the data, create node_name for unique occurrences of PROC
procs_linked = myreader('crawlers/procs_by_product/', 'procs_linked')
header = procs_linked.pop(0)

# Rows like 'TEMPLATE: type of template' break creation of dot.svg, because ':' in a node name is interpreted by Graphviz as a port separator
for row in procs_linked:
    if 'TEMPLATE:' in row[1]:
        row[1] = row[1].replace(':', ' -')

# add a node_name column for the short proc name in [2]; match on the first three columns [:3] Product, Proc, Proc_Short
procs_linked = node_name(procs_linked, header, 'PROC_', 3, 2)
print(header)
mywriter('process/processed_data/', header, procs_linked, 'procs_linked')

# process viya actionsets - remove actionsets with no actions, add node_name to actions and action_sets
actions = myreader('crawlers/actions_by_product/', 'actions')
action_sets = myreader('crawlers/actions_by_product/', 'action_sets')
Code Example #11
from graphviz import Digraph
import csv
import itertools
from common.commons import myreader

# read in csv
collect = myreader('process/processed_data/', 'procs_linked', header='drop')

# iterate by groups
groups = []
uniquekey = []
collect = sorted(collect, key=lambda coll: coll[0])
for k, g in itertools.groupby(collect, lambda coll: coll[0]):
    groups.append(list(g))
    uniquekey.append(k)
#print(uniquekey)
#print(groups[0])

#create dot graph with graphviz
dot = Digraph(comment='SAS Procedures', strict=True)
dot.attr(rankdir='LR')


# function for cluster (subgraph) naming with iteration
def subs(name, label):
    name = Digraph(name, comment=label)
    name.attr(label=label,
              style="filled",
              fillcolor="lightgrey",
              tooltip="This Tooltip (Cluster)")
    return name
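
# A sketch of how the grouping plausibly continues (assumed, not part of the
# excerpt): one cluster per product group, one node per short PROC name.
for key, group in zip(uniquekey, groups):
    cluster = subs('cluster_' + key, key)   # "cluster" prefix required by Graphviz
    for row in group:
        cluster.node(row[2])                # row[2] = Procedure_Short
    dot.subgraph(cluster)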