Example #1
import analytics_functions
import boto3
from fabric import Connection
import os
import time

### Run this file to send commands to the namenode that launch the tfidf.py and pearson.py scripts

fil = open('./analytics_generated_items/namenode_ip_and_key', 'r')
namenode_ip_and_key = fil.read()
fil.close()

namenode_ip, key_pair = namenode_ip_and_key.split('\n')
### Get connected to namenode and start running commands ###

input(
    'Press Enter to get the Pearson Correlation output score (wait around a minute or so): '
)
c = analytics_functions.theconnector(namenode_ip, key_pair)
c.run('cd spark_scripts && python3 pearson.py')

# input('Press Enter to run the TFIDF script, results will arrive shortly in a file named tfidf_results')
# c.run('cd spark_scripts && python3 tfidf.py')
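The helper analytics_functions.theconnector is used throughout these examples but never shown. A minimal sketch of what it presumably looks like, assuming it wraps fabric.Connection (which Example #1 imports) and that the .pem key named after the key pair sits in the working directory; this is a hypothetical implementation, not the original:

from fabric import Connection


def theconnector(host_ip, key_pair):
    # Open an SSH connection to the node as the default Ubuntu user,
    # authenticating with the local <key_pair>.pem file.
    return Connection(
        host=host_ip,
        user='ubuntu',
        connect_kwargs={'key_filename': '{}.pem'.format(key_pair)},
    )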
Example #2
import shutil
import sys

sys.path.append('../')
import analytics_functions

sys.path.append('../hadoop')
import scaling

shutil.copy('../hadoop/{}.pem'.format(scaling.key_pair), './')

# change tfidf_output to testcopy
bash_file = open("get_tfidf.sh", 'w')
bash_file.write('scp -i {}.pem -r ubuntu@{}:tfidf_output ./'.format(
    scaling.key_pair, scaling.namenode_ip))
bash_file.close()

c = analytics_functions.theconnector(scaling.namenode_ip, scaling.key_pair)

# c.run('cd tfidf_output && ls')

print('now getting the TFIDF (this will take 8-10 mins)')
c.run('python3 tfidf.py')
print('downloading the TFIDF results, this will take a while')

print(
    'now getting the Pearson Correlation (this takes ~1 min and prints to the console)'
)
c.run('export PYSPARK_PYTHON=/usr/bin/python3 && python3 pearson.py')

# #test copying
# c.run('mkdir ./testcopy')
# c.run('cp ./tfidf_output/part-00099-5200a268-b05a-403d-b56d-c9d1b2558fd6-c000.csv ./testcopy/')
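Note that in Example #2 the get_tfidf.sh script is written but never actually executed within the snippet; a short sketch of invoking it from Python to pull the results down, assuming the script and the .pem key are in the current directory:

import subprocess

# Run the generated scp command to copy the tfidf_output directory from the namenode.
subprocess.run(['bash', 'get_tfidf.sh'], check=True)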
Example #3



print("Waiting for instances  to start up")
time.sleep(60)


# ---------------------------------- update the packages ------------------------------------------- >


for instance_ip in all_node_ips:
    success = False
    while not success:
        try:
            c = analytics_functions.theconnector(instance_ip, key_pair)
            c.sudo('apt-get update')
            success = True

        except Exception:
            # connection or update failed; wait a bit and retry
            print('something went wrong, retrying in a moment')
            time.sleep(10)


# ------------------------------------------- reboot ---------------------------------------------------- >

try:
    ec2.reboot_instances(InstanceIds=all_node_ids, DryRun=True)
except ClientError as e:
    # a DryRunOperation error means we do have permission to reboot
    if 'DryRunOperation' not in str(e):
        raise
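The snippet is cut off at the dry-run check; the usual boto3 pattern continues with the real reboot call, sketched here under the assumption that the same all_node_ids list is used:

# Hypothetical continuation: perform the actual reboot once the dry run has confirmed permissions.
try:
    ec2.reboot_instances(InstanceIds=all_node_ids, DryRun=False)
    print('Rebooted all nodes')
except ClientError as e:
    print('Reboot failed:', e)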
Example #4
print("Waiting for instances to start up")
time.sleep(120)

# ---------------------------------- update the packages on the new data nodes------------------------------------------- >

print(
    "------------------------- Updating the packages on the new data nodes --------------------------------------"
)

# update the packages only on the new nodes
for instance_ip in new_node_ips:
    success = False
    tryfactor = 0
    while not success:
        try:
            c = analytics_functions.theconnector(instance_ip, key_pair)
            c.sudo('apt-get -y update')
            success = True

        except Exception:
            # connection or update failed; wait a bit and retry
            print('something went wrong, retrying in a moment')
            tryfactor += 1
            if tryfactor == 10:
                print(
                    'It has been {} attempts; something went horribly wrong. Press Ctrl+C to exit and try again.'
                    .format(tryfactor))
            time.sleep(10)

# ------------------------------------------- reboot ----------------------------------------------------
print(
    "------------------------------------------- Rebooting ----------------------------------------------------"
)
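Examples #3 and #4 repeat the same connect-and-apt-update retry loop; a small refactoring sketch of that pattern as a helper, hypothetical and assuming theconnector behaves as used above:

import time

import analytics_functions


def update_with_retry(instance_ip, key_pair, max_tries=10, wait_seconds=10):
    # Keep trying the connect + apt-get update sequence until it succeeds
    # or the retry budget is exhausted.
    for attempt in range(1, max_tries + 1):
        try:
            c = analytics_functions.theconnector(instance_ip, key_pair)
            c.sudo('apt-get -y update')
            return True
        except Exception:
            print('attempt {} failed, retrying in a moment'.format(attempt))
            time.sleep(wait_seconds)
    return False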