def send_config(config_path, token_file, server_address="https://api.neuralet.io"):
    """
    Send a config to the server to start a new job.

    :param config_path: Path of the .json config file
    :param token_file: Path of the token file
    :param server_address: Server address
    :return: The id of the created job on success, otherwise None
    """
    config_json = json_reader(config_path)
    url = server_address + "/api/v1/model/train/"
    token = token_reader(token_file)
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {token}",
    }
    print(f"Waiting for {url} ...")
    r = requests.post(url, headers=headers, data=config_json)
    if r.status_code == 200:
        job_id = r.json()["job_id"]
        print(f"The job is successfully initiated, job_id= {job_id}")
        return job_id
    else:
        print(f"ERROR! ({r.status_code})")
        return None
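# A minimal usage sketch for send_config. The file paths below are hypothetical,
# and it assumes json_reader, token_reader, and requests are importable in the
# same module as the function above.
if __name__ == "__main__":
    job_id = send_config("configs/train_config.json", "secrets/token.txt")
    if job_id is not None:
        print(f"Track the training job with id {job_id}")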
def read_train(filename, stem=False, bigram=False):
    """Build per-class token counts and document counts from the training file."""
    global doc_data, cnt, size_voc
    cnt = 0
    for i in range(5):
        class_dict[i][0] = Counter()
    for doc in utils.json_reader(filename):
        # Tokenize on word characters and apostrophes (instead of nltk.word_tokenize
        # or a plain split).
        txt = re.findall(r"[\w']+", doc["text"].lower())
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
        txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        vocab.update(txt)
        cnt += 1
        class_dict[int(doc["stars"]) - 1][0].update(txt)  # per-class token counts
        class_dict[int(doc["stars"]) - 1][2] += 1         # per-class document count
        doc_data.append([doc["stars"], Counter(txt)])
    for i in range(5):
        class_dict[i][1] = sum(class_dict[i][0].values())  # total tokens in class i
        print(class_dict[i][1], class_dict[i][2])
    print("vocab")
    print(len(vocab))
    size_voc = len(vocab)
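# Sketch of the module-level state read_train and read_t rely on. The names are
# taken from the function bodies; the exact initialization in the original module
# is an assumption.
from collections import Counter

vocab = Counter()                                    # global vocabulary (a set would also work with .update)
doc_data = []                                        # one [stars, Counter(tokens)] entry per document
class_dict = [[Counter(), 0, 0] for _ in range(5)]   # per class: [token counts, token total, doc count]
cnt = 0
size_voc = 0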
def read_t(filename, stem=False, bigram=False):
    """Read a test/validation file into doc_data as [stars, Counter(tokens)] pairs."""
    global doc_data
    for doc in utils.json_reader(filename):
        txt = re.findall(r"[\w']+", doc["text"].lower())
        if stem:
            txt = utils.getStemmedDocuments(" ".join(txt))
        txt = [item for item in txt if not item.isdigit()]
        if bigram:
            txt = list(nltk.bigrams(txt))
        doc_data.append([doc["stars"], Counter(txt)])
def set_model(train_file, model_file, preprocess_type='None', feature='None'):
    # Skip training if a pickled model already exists.
    if os.path.exists(model_file):
        return
    docs = utility.json_reader(train_file)
    stars = np.zeros(5)
    category_count = np.zeros(5)
    class_frequency = {}
    count = 0
    all_words = []
    for doc in docs:
        count += 1
        if count % 1000 == 0:
            print(count)
        words = text_processing(doc['text'], preprocess_type)
        words = feature_engg(words, feature)
        star = int(doc['stars'])
        stars[star - 1] += 1
        category_count[star - 1] += len(words)
        for word in words:
            if word not in class_frequency:
                all_words.append(word)
                class_frequency[word] = np.ones(5)  # Laplace smoothing: start counts at 1
            class_frequency[word][star - 1] += 1
    m = count
    vocab_size = len(all_words)
    category_count += vocab_size  # add |V| to each denominator for smoothing
    for word in all_words:
        class_frequency[word] = np.log(class_frequency[word] / category_count)
    phai_y = np.log(stars / m)  # log class priors
    parameters = [class_frequency, phai_y, category_count]
    # Persist the parameters for later use by get_prediction.
    with open(model_file, 'wb') as obj_writer:
        pickle.dump(parameters, obj_writer)
    print('done')
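# A small sketch of reading the pickled parameters back. This mirrors what the
# get_the_model helper used by get_prediction presumably does; that helper's
# actual implementation is not shown in this listing.
import pickle

def load_model(model_file):
    with open(model_file, 'rb') as obj_reader:
        class_frequency, phai_y, category_count = pickle.load(obj_reader)
    return class_frequency, phai_y, category_count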
def data_loader(filename):
    start = time.time()
    X, Y = [], []
    data_gen = json_reader(filename)
    for sample in data_gen:
        review = sample['text']
        stars = sample['stars']
        X.append(review)
        Y.append(stars)
        # Uncomment to cap the sample size while debugging:
        # if len(Y) == 5000:
        #     break
    df = pd.DataFrame({'text': X, 'stars': Y})
    print('Time taken = {}'.format(time.time() - start))
    return df
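# Hypothetical usage of data_loader. The file name is an assumption; json_reader
# is expected to yield one review dict (with 'text' and 'stars') per record.
df = data_loader('dataset/train.json')
print(df['stars'].value_counts())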
def post_reports_to_slack():
    url = "https://hooks.slack.com/services/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"
    # Add report file name and address here
    test_report_file = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '.report.json'))
    summary_json = utils.json_reader(test_report_file)
    summary = str(summary_json["summary"])
    print(summary)
    # Red attachment bar if anything failed, green otherwise.
    if 'failed' in summary:
        bar_color = "#ff0000"
    else:
        bar_color = "#36a64f"
    try:
        slack_message = {
            'blocks': [{
                'type': 'section',
                'text': {
                    'type': 'mrkdwn',
                    'text': ':bomb:* Test Automation Result:*'
                }
            }],
            "attachments": [{
                "color": bar_color,
                "title": "Test Report",
                "text": summary
            }]
        }
        json_params_encoded = json.dumps(slack_message)
        requests.post(url=url,
                      data=json_params_encoded,
                      headers={"Content-type": "application/json"})
    except Exception as e:
        print(e)
def get_prediction(test_file, model_file, mode='None', preprocess_type='None', feature='None'):
    parameters = get_the_model(model_file)
    count = 0
    prob_dict = parameters[0]       # per-word log P(word | class)
    phai_y = parameters[1]          # log class priors
    category_count = parameters[2]  # smoothed per-class token counts
    print(len(prob_dict))
    docs = utility.json_reader(test_file)
    prediction = []
    original = []
    for doc in docs:
        if count % 100000 == 0:
            print("iter:", count)
        count += 1
        if mode == 'b1':
            # Baseline 1: random rating between 1 and 5.
            prediction.append(randint(1, 5))
        elif mode == 'b2':
            # Baseline 2: always predict the class with the largest category_count.
            prediction.append(np.argmax(category_count) + 1)
        elif mode == 'a':
            # Naive Bayes: log prior plus the sum of per-word log likelihoods.
            words = text_processing(doc['text'], preprocess_type)
            words = feature_engg(words, feature)
            sum_of_logs = phai_y
            for word in words:
                if word not in prob_dict:
                    # Unseen word: fall back to the smoothed default probability.
                    sum_of_logs = np.add(sum_of_logs, np.log(1 / category_count))
                else:
                    sum_of_logs = np.add(sum_of_logs, prob_dict[word])
            prediction.append(np.argmax(sum_of_logs) + 1)
        original.append(int(doc['stars']))
    return prediction, original
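# Putting set_model and get_prediction together. The file paths and the accuracy
# calculation below are illustrative, not taken from the original script.
set_model('data/train.json', 'model.pkl', preprocess_type='None', feature='None')
prediction, original = get_prediction('data/test.json', 'model.pkl', mode='a')
accuracy = sum(p == o for p, o in zip(prediction, original)) / len(original)
print('Accuracy: {:.2%}'.format(accuracy))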
def main(train, test):
    # Per-label vocabularies built from the training data
    # (index 0 -> 1 star, ..., index 4 -> 5 stars).
    vocab_list = [{}, {}, {}, {}, {}]
    vocabulary = {}
    # Parallel structures for the optional bigram variant (left commented out below).
    vocab_list_bigrams = [{}, {}, {}, {}, {}]
    vocabulary_bigrams = {}
    # Count of each label, and of words/bigrams per label, in the training data.
    label_count = np.zeros(5)
    label_word_count = np.zeros(5)
    label_bigram_count = np.zeros(5)
    start1 = time.time()

    ##############################################################################
    # Training
    train_iter = ut.json_reader(train)
    i1 = 0
    for element in train_iter:
        i1 += 1
        if i1 % 1000 == 0:
            print("Training: ", i1 / 1000)
        label_count[int(element["stars"]) - 1] += 1
        # Switch these lines for stemming:
        stemmed = element["text"].split()
        # stemmed = ut.getStemmedDocuments(element["text"])
        # Bigram variant:
        # bigramlist = list(map(''.join, nltk.bigrams(stemmed)))
        # label_bigram_count[int(element["stars"]) - 1] += len(bigramlist)
        # stemmed.extend(bigramlist)
        label_word_count[int(element["stars"]) - 1] += len(stemmed)
        for x in stemmed:
            word = x.strip(string.punctuation)
            if word == "":
                continue
            if word in vocab_list[int(element["stars"]) - 1]:
                vocab_list[int(element["stars"]) - 1][word] += 1
            else:
                vocab_list[int(element["stars"]) - 1][word] = 1
            vocabulary[word] = 1
        # (The same loop over bigramlist, filling vocab_list_bigrams and
        #  vocabulary_bigrams, applies for the bigram variant.)
    ##############################################################################

    end1 = time.time()
    print("Training done, Time taken(mins)", int(end1 - start1) / 60)

    # Class priors. Normalizing by the actual number of training documents; the
    # original divided by a global TRAINSIZE constant, which only rescales the
    # priors and does not change the argmax.
    prior = label_count / label_count.sum()

    actual_value = []
    predicted_value = []
    random_prediction = []
    start2 = time.time()

    ##############################################################################
    # Testing
    i2 = 0
    test_iter = ut.json_reader(test)
    for test_element in test_iter:
        i2 += 1
        if i2 % 1000 == 0:
            print("Testing: ", i2 / 1000)
        # Random baseline: a rating drawn uniformly from 1-5
        # (random.randint is inclusive on both ends, so the upper bound is 5, not 6).
        random_prediction.append(random.randint(1, 5))
        actual_value.append(int(test_element["stars"]))
        test_list = test_element["text"].split()
        # test_list = ut.getStemmedDocuments(test_element["text"])
        # Bigram variant:
        # test_list.extend(list(map(''.join, nltk.bigrams(test_list))))
        results = []
        for i in range(5):
            py = prior[i]
            logr = 0
            for x in test_list:
                word = x.strip(string.punctuation)
                if word == "":
                    continue
                if word in vocab_list[i]:
                    # Laplace-smoothed likelihood of the word under class i.
                    probability = (vocab_list[i][word] + 1) / (label_word_count[i] + len(vocabulary))
                    logr += math.log(probability)
                else:
                    logr += math.log(1 / (label_word_count[i] + len(vocabulary)))
            results.append(logr + math.log(py))
        predicted_value.append(results.index(max(results)) + 1)
    ##############################################################################

    # Majority baseline: always predict the most frequent training label.
    major = list(label_count).index(max(label_count)) + 1
    correct = 0
    correct_random = 0
    correct_major = 0
    # confusion = np.zeros((5, 5))
    # calc_f1_score = np.zeros(5)
    for i in range(len(predicted_value)):
        if predicted_value[i] == actual_value[i]:
            correct += 1
        if random_prediction[i] == actual_value[i]:
            correct_random += 1
        if major == actual_value[i]:
            correct_major += 1
        # confusion[predicted_value[i] - 1][actual_value[i] - 1] += 1
    # row_sum = np.sum(confusion, axis=1)
    # column_sum = np.sum(confusion, axis=0)
    # for i in range(5):
    #     precision = confusion[i][i] / row_sum[i]
    #     recall = confusion[i][i] / column_sum[i]
    #     calc_f1_score[i] = 2 * ((precision * recall) / (precision + recall))

    end2 = time.time()
    print("Testing done, Time taken(mins)", int(end2 - start2) / 60)
    print("Accuracy using Naive Bayes: ", int(correct / len(actual_value) * 100), "%")
    print("Accuracy using Random prediction: ", int(correct_random / len(actual_value) * 100), "%")
    print("Accuracy using Majority prediction: ", int(correct_major / len(actual_value) * 100), "%")
def main():
    # TODO: add logging
    # TODO: check the existence of input file(s)
    # TODO: read multiple csv files or call this script multiple times with given path to csv
    csv_path_taxi_trips = sys.argv[1]
    dir_root = os.path.dirname(os.path.realpath(__file__))

    # config json
    config_json_path = os.path.join(dir_root, "config.json")
    config_json = json_reader(config_json_path)
    if not config_json:
        print("no config json found or empty {}".format(config_json_path))
        return 1
    if 'db' not in config_json:
        print("db config not found or empty")
        return 1

    # database settings from config
    db_settings = config_json['db']
    db_hostname = db_settings['hostname']
    db_name = db_settings['db_name']
    db_schema = db_settings['schema']
    db_username = db_settings['username']

    # TODO: better to store DDLs separately and run before this job
    ddl_taxi_trips = """
        CREATE TABLE IF NOT EXISTS {0}.taxi_trips (
            vendor_id INTEGER,
            lpep_pickup_datetime TIMESTAMP,
            lpep_dropoff_datetime TIMESTAMP,
            store_and_fwd_flag VARCHAR(1),
            ratecode_id INTEGER,
            pulocation_id INTEGER,
            dolocation_id INTEGER,
            passenger_count INTEGER,
            trip_distance DECIMAL,
            fare_amount DECIMAL,
            extra DECIMAL,
            mta_tax DECIMAL,
            tip_amount DECIMAL,
            tolls_amount DECIMAL,
            ehail_fee DECIMAL,
            improvement_surcharge DECIMAL,
            total_amount DECIMAL,
            payment_type INTEGER,
            trip_type INTEGER,
            congestion_surcharge DECIMAL,
            taxi_type VARCHAR(50)
        );
    """.format(db_schema)

    # creating a connection to db
    try:
        conn = get_connection(hostname=db_hostname, db=db_name, username=db_username)
    except Exception:
        print("Error in creating a connection.")
        return 1

    # for dev env
    conn.autocommit = True

    with conn.cursor() as cursor:
        # Create a taxi_trips table if not exists
        try:
            execute_ddl(cursor, ddl_taxi_trips)
        except Exception:
            print("Error in creating taxi_trips table")
            # return 1

    # Insert the rows from taxi trips csv by reading using generator
    # TODO: insert only new rows (taxi_type, YYYY-MM)
    try:
        insert_taxi_trips(
            conn,
            iter(rows_from_a_csv_file(csv_path_taxi_trips, skip_first_line=True)),
            page_size=1000)
    except Exception:
        print("Error in inserting the data into taxi_trips table")
        return 1

    return 0
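# The database settings block above assumes a config.json of roughly this shape;
# the values are placeholders, not taken from the original repository.
EXAMPLE_CONFIG = {
    "db": {
        "hostname": "localhost",
        "db_name": "taxi",
        "schema": "public",
        "username": "etl_user",
    }
}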
import sys
from datetime import datetime
import os.path
import ipdb

sys.path.append('../src/')
from web3 import Web3, HTTPProvider, TestRPCProvider, KeepAliveRPCProvider
from solc import compile_source
from web3.contract import ConciseContract
import etherscan.accounts as accounts
from utils import json_reader, parallel_dict_update, write_pickle, read_pickle

api_key = '3JS9BXYFNNGNX17WKANJMU63R6BQKJW5WE'

config = json_reader('config.json')
LR_ABI = config['LR_ABI']
LEDGER_ABI = config['LEDGER_ABI']
LEDGER_ADDRESS = config['LEDGER_ADDRESS']
node = config['node']
DECIMALS = 1e18


def timestamp_converter(timeStamp):
    timeStamp = int(timeStamp)
    return datetime.fromtimestamp(timeStamp).strftime('%Y-%m-%d %H:%M:%S')


def get_contract_time(contract_adress, api_key):
    api = accounts.Account(address=contract_adress, api_key=api_key)
    # Assumption: the original (cut off mid-expression here) indexes the first
    # internal transaction returned by Etherscan and converts its 'timeStamp' field.
    time = api.get_transaction_page(page=1, offset=1, internal=True)[0]['timeStamp']
    return timestamp_converter(time)
import pandas as pd

from utils.scraper import *
from utils.classifier import *
from utils.db_controller import *
from utils.json_reader import *

print('Process started... "Scraping profiles"')
scraper = Scraper()
print('Process ended... "Scraping profiles"')

print('Process started... "Reading JSON files"')
reader = json_reader()
posts, comments = reader.get_df()
print('Process ended... "Reading JSON files"')

print('Process started... "Classifying comments"')
classif = classifier(comments.copy())
comments = classif.get_df()
print('Process ended... "Classifying comments"')

print('Process started... "Inserting into DB"')
db = db_controller(posts, comments)
db.insert_into()
print('Process ended... "Inserting into DB"')

print('Cleaning the JSON files.')
reader.clean_files()
from clean.master import MasterCleaner
from utils import json_reader, simple_reader, simple_writer

if __name__ == "__main__":
    config = json_reader('cleaner_config.json')
    cleaner = MasterCleaner(config)

    target_data = simple_reader('dataset/test_data.txt')
    corpus = list()
    for line in target_data:
        new_line = cleaner.cleaning(line)
        corpus.append(new_line)
    simple_writer('output/test_data_cleaned.txt', corpus)

    # faster approach
    """
    writer = codecs.open('output/test_data_cleaned.txt', 'w', encoding='utf-8')
    target_data = simple_reader('dataset/test_data.txt')
    for line in target_data:
        new_line = cleaner.cleaning(line)
        writer.write(new_line + '\n')
    writer.close()
    """
def main():
    # TODO: check the existence of input file(s)
    csv_path_taxi_zone = sys.argv[1]
    dir_root = os.path.dirname(os.path.realpath(__file__))

    # config json
    config_json_path = os.path.join(dir_root, "config.json")
    config_json = json_reader(config_json_path)
    if not config_json:
        print("no config json found or empty {}".format(config_json_path))
        return 1
    if 'db' not in config_json:
        print("db config not found or empty")
        return 1

    # database settings from config
    db_settings = config_json['db']
    db_hostname = db_settings['hostname']
    db_name = db_settings['db_name']
    db_schema = db_settings['schema']
    db_username = db_settings['username']

    # TODO: better to store DDLs separately and run before this job
    ddl_taxi_zone = """
        CREATE TABLE IF NOT EXISTS {0}.taxi_zone (
            location_id INTEGER,
            borough VARCHAR(255),
            zone VARCHAR(255),
            service_zone VARCHAR(255)
        );
    """.format(db_schema)

    # creating a connection to db
    try:
        conn = get_connection(hostname=db_hostname, db=db_name, username=db_username)
    except Exception:
        print("Error in creating a connection.")
        return 1

    # for dev env
    conn.autocommit = True

    with conn.cursor() as cursor:
        # Create a taxi_zone table if not exists
        try:
            execute_ddl(cursor, ddl_taxi_zone)
        except Exception:
            print("Error in creating taxi_zone table")
            # return 1

    # Insert into taxi_zone dictionary
    # TODO: insert only new rows
    try:
        insert_taxi_zone(
            conn,
            iter(rows_from_a_csv_file(csv_path_taxi_zone, skip_first_line=True)),
            page_size=1000)
    except Exception:
        print("Error in inserting the data into taxi_zone table")
        return 1

    return 0