Beispiel #1
0
def get_collection(doc_collection):
    """Interactively collect scraped web pages into ``doc_collection``.

    Prompts for URLs until the user enters ``stop``. Each URL is passed to
    ``web_scraper``; the title and body it returns are appended to
    ``doc_collection`` as a ``[title, body]`` pair. A URL that cannot be
    scraped is reported and skipped.

    Args:
        doc_collection: List that is extended in place.

    Returns:
        The same ``doc_collection`` object that was passed in.
    """
    while True:
        web_page = input('Enter URL: ')
        if web_page == 'stop':
            break
        try:
            title, body = web_scraper(web_page)
        except Exception:
            # Not a bare ``except``: KeyboardInterrupt and SystemExit must
            # still be able to end this otherwise endless loop.
            print('Error: Invalid Webpage Entry')
        else:
            doc_collection.append([title, body])
    return doc_collection
Beispiel #2
0
 def wiki_scraper(self, event):
     """Scrape the site named in the entry widget and show its text.

     Reads the current entry value, calls ``self.init()`` and clears the
     entry. If the value was non-empty, it is passed to ``web_scraper``; the
     scraper's ``txt`` is stored as both ``ori_text`` and ``pro_text``,
     handed to ``update_text`` and then to ``lang_detector``.

     Args:
         event: Unused; presumably the GUI event that triggered this
             handler -- confirm against the widget binding.
     """
     site = self.entry.get()
     self.init()
     self.entry.delete(0, "end")
     if not site:
         return

     self.ori_text = web_scraper(site).txt
     self.pro_text = self.ori_text
     self.update_text(self.pro_text)
     self.lang_detector(self.ori_text)
     # Parenthesised single-argument form prints the same text on Python 2
     # and Python 3; the bare print statement is a SyntaxError on Python 3.
     print('Scraping Complete')
Beispiel #3
0
    def __init__(self, dblp_filename):
        """Create the data-source interfaces and the empty caches.

        Args:
            dblp_filename: Passed straight to ``xml_processor.extractor``;
                presumably the path of a DBLP XML file -- confirm against
                ``xml_processor``.
        """
        # Two data sources: the local extractor and the dblp.org scraper.
        self.__raw_interface = xml_processor.extractor(dblp_filename)
        self.__web_interface = web_scraper.web_scraper('https://dblp.org')

        # XML-derived data starts unset.
        self._confs_xml = self._papers_xml = self._authors_xml = None

        # Lookup tables start unset as well.
        self._series_ids = self._conf_series_ids_to_names = None
        self._author_id_lookup = self._disambiguation_ids = None

        # Two separate (not shared) dictionaries for built objects.
        self._conf_objects, self._author_objects = {}, {}
def _is_yes(answer):
    """Return True when ``answer`` is 'y' or 'yes', ignoring case."""
    return answer.lower() in ('y', 'yes')


def _option_or_prompt(option_value, prompt, color=None):
    """Return ``option_value`` if it is truthy, otherwise ask the user.

    When ``color`` is given it is printed before the prompt and
    ``Fore.RESET`` is printed after the answer has been read.
    """
    if option_value:
        return option_value
    if color is None:
        return input(prompt)
    print(color)
    answer = input(prompt)
    print(Fore.RESET)
    return answer


def _send_email_step(email_answer, aws_accounts_question, aws_account,
                     aws_account_number, interactive):
    """Show the email banner and send the email if the user agreed."""
    print(Fore.YELLOW)
    banner(" Send an Email ", "*")
    print(Fore.RESET)
    if _is_yes(email_answer):
        send_email(aws_accounts_question, aws_account, aws_account_number, interactive)
    else:
        print(Fore.YELLOW)
        banner("Not sending an email.")
        print(Fore.RESET)


def _write_confluence_step(options, confluence_answer, html, pageid, title):
    """Show the Confluence banner and write ``html`` to the page if wanted.

    Credentials given on the command line take precedence and trigger the
    write regardless of ``confluence_answer``.
    """
    print(Fore.CYAN)
    banner("* Write to Confluence *", "*")
    print(Fore.RESET)
    if options.user and options.password:
        auth = (options.user, options.password)
    elif _is_yes(confluence_answer):
        auth = authenticate()
    else:
        print(Fore.CYAN)
        banner("Okay. Not writing to confluence.")
        print(Fore.RESET)
        return
    write_data_to_confluence(auth, html, pageid, title)


def main():
    """List S3 buckets for one or all AWS accounts, then email/publish.

    Every setting is taken from the parsed command-line options when present
    and otherwise requested interactively. Depending on the answer to the
    "one or all accounts" question, the buckets of a single named account or
    of every account in the account list file are listed and converted to an
    HTML table. Finally an email is optionally sent and the HTML is
    optionally written to a Confluence page.
    """
    # Get the arguments and reference them as options
    options = arguments()
    # Display the welcome banner
    welcomebanner()

    html = options.html or ''

    update_account_list = _option_or_prompt(
        options.accountlist, "Update the account list (y/n): ", Fore.YELLOW)
    if _is_yes(update_account_list):
        web_scraper(options)

    aws_accounts_question = _option_or_prompt(
        options.all_accounts, "List S3 buckets in one or all accounts: ", Fore.YELLOW)
    interactive = 1 if aws_accounts_question.lower() == "one" else 0

    pageid = options.pageid or 223215673  # Main S3 Buckets Page
    title = options.title or 'AWS List Buckets - Test'

    show_details = _option_or_prompt(
        options.verbose, "Display bucket names (y/n): ", Fore.YELLOW)
    email_answer = _option_or_prompt(
        options.send_email, "Send an email (y/n): ", Fore.YELLOW)
    confluence_answer = _option_or_prompt(
        options.write_confluence, "Write the list to confluence (y/n): ", Fore.CYAN)

    if interactive == 1:
        # Single account: ask which one, then list, convert, email, publish.
        aws_account = _option_or_prompt(
            options.account_name,
            "Enter the name of the AWS account you'll be working in: ",
            Fore.YELLOW)
        search_bucket = _option_or_prompt(options.search, "Search for a bucket (y/n): ")

        aws_account_number = aws_accounts_to_account_numbers(aws_account)

        today, aws_env_list, output_file, output_file_name, fieldnames = initialize(interactive, aws_account)
        print(Fore.CYAN)
        # NOTE(review): the original built a "Working in AWS account" message
        # here but never displayed it; the unused assignment was dropped.
        output_file = list_s3_buckets(aws_account, aws_account_number, interactive, show_details, search_bucket)
        htmlfile, htmlfile_name, remove_htmlfile = convert_csv_to_html_table(output_file, today, interactive, aws_account)
        print(Fore.YELLOW)

        _send_email_step(email_answer, aws_accounts_question, aws_account,
                         aws_account_number, interactive)

        # Use a separate name for the handle so the path in ``htmlfile`` is
        # not replaced by a closed file object.
        with open(htmlfile, 'r', encoding='utf-8') as html_stream:
            html = html_stream.read()

        _write_confluence_step(options, confluence_answer, html, pageid, title)

    else:
        aws_account = 'all'
        today, aws_env_list, output_file, output_file_name, fieldnames = initialize(interactive, aws_account)

        # Find an S3 bucket
        search_bucket = _option_or_prompt(options.search, "Search for a bucket (y/n): ")

        find_bucket = ''
        if _is_yes(search_bucket):
            if options.bucket_name:
                find_bucket = options.bucket_name
            else:
                banner("Enter 'none' if you don't want to search for a bucket.")
                find_bucket = input("Enter a bucket name to find in the accounts: ")

        # An empty name (search declined, or nothing typed) means "no search",
        # exactly like an explicit 'none'.
        if find_bucket and find_bucket.lower() != 'none':
            search_bucket = find_bucket.lower()
        else:
            search_bucket = None
            print("OK. Not searching for a bucket name")

        # Start the combined CSV with just a header row.
        with open(output_file, mode='w+') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames, delimiter=',', lineterminator='\n')
            writer.writeheader()

        # Stays None when the account list contains no data rows.
        aws_account_number = None
        with open(aws_env_list, 'r') as aws_envs:
            csv_reader = csv.reader(aws_envs, delimiter=',')
            next(csv_reader, None)  # skip the header row, if any
            for aws_accounts in csv_reader:
                aws_account = str(aws_accounts[0])
                aws_account_number = str(aws_accounts[1])
                print("\n")
                print("Working in AWS Account: ", aws_account)
                print(Fore.RESET + "-----------------------------------------------")
                # NOTE(review): the single-account branch passes
                # (..., interactive, show_details, search_bucket) but this
                # call omits show_details -- confirm against the signature
                # of list_s3_buckets.
                output_file = list_s3_buckets(aws_account, aws_account_number, interactive, search_bucket)
                htmlfile, htmlfile_name, remove_htmlfile = convert_csv_to_html_table(output_file, today, interactive, aws_account)
                print(Fore.RESET + "-----------------------------------------------")

        _send_email_step(email_answer, aws_accounts_question, aws_account,
                         aws_account_number, interactive)

        # NOTE(review): in this branch ``html`` is only ever options.html (or
        # ''); the generated HTML files are not read back -- confirm intent.
        _write_confluence_step(options, confluence_answer, html, pageid, title)

    # Cleanup of the generated files is currently disabled:
    #remove_file(output_file, output_file_name)
    #remove_file(remove_htmlfile, htmlfile_name)
    endbanner()
import csv
import re

from web_scraper import web_scraper

# Expand abbreviations in columns 0 and 3 of the training CSV and write the
# result to Abb/mli_train_v1.csv. ``web_scraper()`` is used as a mapping of
# abbreviation -> expansion (it is iterated with ``.items()``).
abb = web_scraper()

# Entries whose expansion contains ';' are skipped. For the rest, compile one
# pattern per abbreviation up front that matches it only as a whole
# whitespace-delimited token, mirroring the ``key in text.split()`` guard
# below. A plain ``str.replace`` would also rewrite the abbreviation inside
# longer words.
patterns = [
    (key, value, re.compile(r'(?<!\S)' + re.escape(key) + r'(?!\S)'))
    for key, value in abb.items()
    if ';' not in value
]

count = 0  # number of (row, abbreviation) pairs that were expanded
with open('../mli_train_v1.csv', 'r') as file1, open('Abb/mli_train_v1.csv',
                                                     'w',
                                                     newline='') as file2:
    reader = csv.reader(file1, delimiter=',')
    writer = csv.writer(file2, delimiter=',')

    for row in reader:
        for key, value, pattern in patterns:
            if key in row[0].split() or key in row[3].split():
                count += 1
                # A callable replacement keeps backslashes in ``value``
                # literal instead of treating them as regex escapes.
                row[0] = pattern.sub(lambda _match: value, row[0])
                row[3] = pattern.sub(lambda _match: value, row[3])
        writer.writerow(row)

# Both files are already closed by the ``with`` statement above.
print(count)
Beispiel #6
0
from aws_textract import aws_textract
from web_scraper import web_scraper

# Arguments handed to ``aws_textract.main``; presumably a bucket name and an
# image key -- confirm against aws_textract.
LABEL_SOURCE = "wine-analyzer"
LABEL_IMAGE = "hoya.jpg"

awstextract = aws_textract()
scraper = web_scraper()

# Scan the label first, then look up wine information for the scan result.
scanned_label = awstextract.main(LABEL_SOURCE, LABEL_IMAGE)
wine_info = scraper.get_wine_info(scanned_label)
print(wine_info)