def main(remoteSavePath):
    output = {}

    for file in listdir('results'):
        if isfile(join('results', file)):
            s3.upload('results', remoteSavePath, file)

            if file == 'config.json':
                output['config'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'div.html':
                output['visualization'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase_multi-words.txt':
                output['multi-words'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase_single-word.txt':
                output['single-word'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'AutoPhrase.txt':
                output['autophrase'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'segmentation.model':
                output['model'] = s3.generate_downloads(remoteSavePath, file)
            elif file == 'token_mapping.txt':
                output['token-mapping'] = s3.generate_downloads(remoteSavePath, file)
            else:
                output['misc'] = s3.generate_downloads(remoteSavePath, file)

    return output
def lambda_handler(event, context):
    output = dict()

    uid = event['uid']
    awsPath = event['s3FolderName'] + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + event['s3FolderName'] + '/ML/classification/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    # download config to local folder
    fname_config = 'config.json'
    try:
        s3.downloadToDisk(fname_config, localSavePath, awsPath)

        # merge stored config values into the incoming event
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
            for key in data.keys():
                if key not in event.keys():
                    event[key] = data[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(event, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uid'] = uid
    except:
        raise ValueError('This session ID is invalid!')

    # download unlabeled data to local folder
    fname_unlabeled = 'testing.csv'
    try:
        s3.downloadToDisk(fname_unlabeled, localSavePath, awsPath)
    except:
        raise ValueError('You\'re requesting ' + fname_unlabeled
                         + ' file, and it\'s not found in your remote directory! '
                         'It is likely that you have not yet performed step 1 -- '
                         'split the dataset into training and predicting set, '
                         'or you have provided the wrong sessionID.')

    # download pickled model to local folder
    fname_pickle = 'pipeline.pickle'
    try:
        s3.downloadToDisk(fname_pickle, localSavePath, awsPath)
    except:
        raise ValueError('You\'re requesting ' + fname_pickle
                         + ' file, and it\'s not found in your remote directory! '
                         'It is likely that you have not yet performed step 2 -- '
                         'model training, or you have provided the wrong sessionID.')

    classification = Classification(awsPath, localSavePath)
    output['predicting'] = classification.predict()
    output['div_category'] = classification.plot()

    return output
def save_remote_output(localSavePath, remoteSavePath, fname):
    """
    Zip the local 'img' folder into fname, upload the archive to S3,
    and return its download URL.

    :param localSavePath: local directory that contains the 'img' folder
    :param remoteSavePath: remote S3 folder to upload to
    :param fname: name of the zip archive to create
    :return: download URL of the uploaded archive
    """
    zipf = zipfile.ZipFile(os.path.join(localSavePath, fname), 'w',
                           zipfile.ZIP_DEFLATED)
    zipdir(os.path.join(localSavePath, 'img'), zipf)
    zipf.close()

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
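# Note: zipdir is called above (and again in the Reddit collector below) but is
# not defined in this section. A minimal sketch of such a helper, assuming it
# simply walks the folder and writes every file into the already-open ZipFile:
import os
import zipfile


def zipdir(path, zipf):
    """Recursively add every file under `path` to the open ZipFile `zipf`."""
    for root, dirs, files in os.walk(path):
        for file in files:
            full_path = os.path.join(root, file)
            # store entries relative to the parent of the zipped folder
            zipf.write(full_path,
                       os.path.relpath(full_path, os.path.join(path, '..')))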
def metrics(self):
    report = np.array(
        metrics.precision_recall_fscore_support(self.target, self.predicted,
                                                labels=self.labels)).T
    avg_report = list(
        metrics.precision_recall_fscore_support(self.target, self.predicted,
                                                average='weighted'))
    avg_report.insert(0, 'AVG')

    # save metrics report
    fname_metrics = 'classification_report.csv'
    with open(self.localSavePath + fname_metrics, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['label', 'precision', 'recall', 'f1-score', 'support'])
        for i in range(len(report)):
            writer.writerow([self.labels[i],
                             round(report[i][0], 4),
                             round(report[i][1], 4),
                             round(report[i][2], 4),
                             round(report[i][3], 4)])
        writer.writerow(avg_report)
    s3.upload(self.localSavePath, self.awsPath, fname_metrics)

    return {'metrics': s3.generate_downloads(self.awsPath, fname_metrics)}
def plot(self):
    y_pred_dict = Counter(self.predicted)
    labels = []
    values = []
    for i in y_pred_dict.keys():
        labels.append("class: " + str(i))
        values.append(y_pred_dict[i])
    trace = go.Pie(labels=labels, values=values, textinfo='label')
    div_category = plot([trace], output_type='div', image='png',
                        auto_open=False, image_filename='plot_img')

    fname_div_category = 'div_category.html'
    with open(self.localSavePath + fname_div_category, "w") as f:
        f.write(div_category)
    s3.upload(self.localSavePath, self.awsPath, fname_div_category)

    return s3.generate_downloads(self.awsPath, fname_div_category)
def lambda_handler(event, context):
    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localSavePath = os.path.join('/tmp', event['sessionID'], event['screen_name'])
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    auth = tweepy.OAuthHandler(event['consumer_key'], event['consumer_secret'])
    auth.set_access_token(event['access_token'], event['access_token_secret'])
    api = tweepy.API(auth)

    tweets = []
    for status in tweepy.Cursor(api.user_timeline,
                                screen_name=event['screen_name'],
                                count=100,
                                tweet_mode="extended").items():
        tweets.append([
            status._json['id'],
            status._json['full_text'].encode('utf-8', 'ignore').decode()
        ])

    if len(tweets) > 0:
        fname = event['screen_name'] + '_tweets.txt'
        with open(os.path.join(localSavePath, fname), 'w',
                  encoding='utf-8', newline='') as f:
            header = ['id', 'text']
            writer = csv.writer(f, delimiter=",")
            writer.writerow(header)
            for row in tweets:
                writer.writerow(row)
        s3.upload(localSavePath, awsPath, fname)

        return {'url': s3.generate_downloads(awsPath, fname)}
    else:
        raise ValueError('This user\'s timeline (screen_name: '
                         + event['screen_name']
                         + ') is empty. There is nothing to analyze!')
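# Hypothetical test event for the handler above; the keys mirror those read in
# the code, and the credential values below are placeholders, not real values.
event = {
    'sessionID': 'session123',
    'screen_name': 'some_user',
    'consumer_key': '<consumer_key>',
    'consumer_secret': '<consumer_secret>',
    'access_token': '<access_token>',
    'access_token_secret': '<access_token_secret>',
}
# print(lambda_handler(event, None))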
def save_remote_output(localSavePath, remoteSavePath, fname, output_data):
    """
    Save in-memory output to a local file first, then upload it to the remote
    S3 bucket.

    :param localSavePath: local save directory
    :param remoteSavePath: remote save directory (S3 folder)
    :param fname: filename without extension
    :param output_data: the actual data
    :return: url of the file saved in the S3 bucket
    """
    # dict to json
    if isinstance(output_data, dict):
        fname += '.json'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            json.dump(output_data, f)

    # # dataframe to csv
    # elif isinstance(output_data, pd.DataFrame):
    #     fname += '.csv'
    #     output_data.to_csv(fname)

    # string to html
    elif isinstance(output_data, str):
        fname += '.html'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            f.write(output_data)

    # list of lists/tuples to csv
    elif isinstance(output_data, list) \
            and isinstance(output_data[0], (list, tuple)):
        fname += '.csv'
        with open(os.path.join(localSavePath, fname), 'w', newline='',
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in output_data:
                try:
                    writer.writerow(row)
                except UnicodeEncodeError as e:
                    print(e)

    # generator to graph file
    elif isinstance(output_data, types.GeneratorType):
        if fname == 'gephi':
            fname += '.gml'
        elif fname == 'pajek':
            fname += '.net'
        else:
            fname += '.unknown'
        with open(os.path.join(localSavePath, fname), 'w', newline='') as f:
            for line in output_data:
                f.write(line + '\n')

    # otherwise pickle the object
    else:
        fname += '.pickle'
        with open(os.path.join(localSavePath, fname), 'wb') as f:
            pickle.dump(output_data, f)

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
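# Hypothetical usage of the dispatcher above, illustrating the dict branch;
# the paths and data below are placeholders, not values from the project.
url = save_remote_output(localSavePath='/tmp/session123/NLP/',
                         remoteSavePath='session123/NLP/',
                         fname='sentiment',
                         output_data={'positive': 42, 'negative': 7})
# the returned value is the S3 download URL of sentiment.json
print(url)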
# loop through the ids and store their comments
for url, id in zip(urls, ids):
    url = "https://www.reddit.com" + url
    try:
        submission = reddit.submission(url=url)
        if not bfs(submission, id, comments_folder):
            # bfs reported a lack of disk space: zip whatever has been
            # collected so far
            zipf = zipfile.ZipFile(temp_dir + fname_zip, 'w',
                                   zipfile.ZIP_DEFLATED)
            zipdir(comments_folder + '/', zipf)
            zipf.close()

            # upload this zip to the corresponding S3 folder
            s3.upload(temp_dir, args.remoteReadPath, fname_zip)
            url = s3.generate_downloads(args.remoteReadPath, fname_zip)

            # delete the local files
            d.deletedir('/tmp')

            # send out email notification
            n.notification(args.email, case=1, filename=args.remoteReadPath,
                           links=url, sessionURL=args.sessionURL)
            raise SystemExit('Lack of disk space')
    except Exception:
        # skip submissions that cannot be resolved from the url
        pass

# success and send email notification
# zip goes here
def classify(self, model):
    # these three classifiers expose decision_function instead of predict_proba
    margin_models = ('Perceptron', 'SGD', 'passiveAggressive')

    # build the text-classification pipeline for the requested model
    if model == 'NaiveBayes':
        clf = MultinomialNB()
    elif model == 'Perceptron':
        clf = Perceptron()
    elif model == 'SGD':
        clf = SGDClassifier()
    elif model == 'RandomForest':
        clf = RandomForestClassifier(n_estimators=100)
    elif model == 'KNN':
        clf = KNeighborsClassifier(n_neighbors=10)
    elif model == 'passiveAggressive':
        clf = PassiveAggressiveClassifier(n_iter=50)

    text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', clf)])

    # 10 fold cross validation
    self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)

    # fit the model
    text_clf.fit(self.data, self.target)
    if model in margin_models:
        y_score = text_clf.decision_function(self.data)
    else:
        y_score = text_clf.predict_proba(self.data)

    # get 10 fold cross validation accuracy score
    fold_scores = cross_val_score(text_clf, self.data, self.target, cv=10)
    fname_folds = 'accuracy_score.csv'
    with open(self.localSavePath + fname_folds, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5',
                         'fold_6', 'fold_7', 'fold_8', 'fold_9', 'fold_10'])
        writer.writerow(['%.4f' % elem for elem in fold_scores])
    s3.upload(self.localSavePath, self.awsPath, fname_folds)
    accuracy_url = s3.generate_downloads(self.awsPath, fname_folds)

    # pickle the Pipeline for future use
    fname_pickle = 'classification_pipeline.pickle'
    with open(self.localSavePath + fname_pickle, 'wb') as f:
        pickle.dump(text_clf, f)
    s3.upload(self.localSavePath, self.awsPath, fname_pickle)
    pickle_url = s3.generate_downloads(self.awsPath, fname_pickle)

    # plotting the ROC curve
    self.labels = text_clf.classes_
    y = label_binarize(self.target, classes=self.labels)

    # binary class
    if len(self.labels) <= 2:
        if model in margin_models:
            fpr, tpr, _ = roc_curve(y[:, 0], y_score)
        else:
            # one-hot encode the targets so they line up with predict_proba output
            y = []
            for label in self.target:
                item = []
                for i in range(len(text_clf.classes_)):
                    if label == text_clf.classes_[i]:
                        item.append(1)
                    else:
                        item.append(0)
                y.append(item)
            y = np.array(y)
            fpr, tpr, _ = roc_curve(y.ravel(), y_score.ravel())

        roc_auc = auc(fpr, tpr)
        trace = go.Scatter(
            x=fpr,
            y=tpr,
            name='ROC curve (area =' + str(roc_auc) + ' )',
            line=dict(color='deeppink', width=4)
        )
        data = [trace]

    # multiclasses
    else:
        fpr = {}
        tpr = {}
        roc_auc = {}
        for i in range(len(self.labels)):
            fpr[self.labels[i]], tpr[self.labels[i]], _ = \
                roc_curve(y[:, i], y_score[:, i])
            roc_auc[self.labels[i]] = auc(fpr[self.labels[i]], tpr[self.labels[i]])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        # First aggregate all false positive rates
        all_fpr = np.unique(np.concatenate(
            [fpr[self.labels[i]] for i in range(len(self.labels))]))

        # Then interpolate all ROC curves at these points
        mean_tpr = np.zeros_like(all_fpr)
        for i in range(len(self.labels)):
            mean_tpr += interp(all_fpr, fpr[self.labels[i]], tpr[self.labels[i]])

        # Finally average it and compute AUC
        mean_tpr /= len(self.labels)
        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # plotting
        trace0 = go.Scatter(
            x=fpr['micro'],
            y=tpr['micro'],
            name='micro-average ROC curve (area =' + str(roc_auc["micro"]) + ' )',
            line=dict(color='deeppink', width=4)
        )
        trace1 = go.Scatter(
            x=fpr['macro'],
            y=tpr['macro'],
            name='macro-average ROC curve (area =' + str(roc_auc["macro"]) + ' )',
            line=dict(color='navy', width=4)
        )
        data = [trace0, trace1]
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for i, color in zip(range(len(self.labels)), colors):
            trace = go.Scatter(
                x=fpr[self.labels[i]],
                y=tpr[self.labels[i]],
                name='ROC curve of class {0} (area = {1:0.2f})'.format(
                    self.labels[i], roc_auc[self.labels[i]]),
                line=dict(color=color, width=4, dash='dash')
            )
            data.append(trace)

    layout = dict(title=model + ' model ROC curve',
                  xaxis=dict(title='False Positive Rate'),
                  yaxis=dict(title='True Positive Rate'))
    fig = dict(data=data, layout=layout)
    div = plot(fig, output_type='div', image='png', auto_open=False,
               image_filename='plot_img')

    # print the graph file
    fname_div = 'div.html'
    with open(self.localSavePath + fname_div, 'w') as f:
        f.write(div)
    s3.upload(self.localSavePath, self.awsPath, fname_div)
    div_url = s3.generate_downloads(self.awsPath, fname_div)

    return {'accuracy': accuracy_url,
            'pickle': pickle_url,
            'div': div_url}
if not os.path.exists(localSavePath):
    os.makedirs(localSavePath)
if not os.path.exists(localReadPath):
    os.makedirs(localReadPath)

fname_config = 'config.json'
if s3.checkExist(awsPath, fname_config):
    s3.downloadToDisk(fname_config, localSavePath, awsPath)

    # merge the command-line arguments into the stored config
    with open(localSavePath + fname_config, "r") as fp:
        data = json.load(fp)
        for key in vars(args).keys():
            if key not in data.keys():
                data[key] = vars(args)[key]
    with open(localSavePath + fname_config, "w") as f:
        json.dump(data, f)
    s3.upload(localSavePath, awsPath, fname_config)
    output['config'] = s3.generate_downloads(awsPath, fname_config)
    output['uuid'] = uid
else:
    raise ValueError('This session ID is invalid!')

# download the labeled data from s3 to tmp and run the classification
classification = Classification(awsPath, localSavePath, localReadPath,
                                args.remoteReadPath, args.labeledFilename)
output.update(classification.classify(args.model))
output.update(classification.metrics())

d.deletedir('/tmp')
n.notification(args.email, case=3, filename=awsPath)
def predict(self):
    # load classification model
    pkl_model = os.path.join(self.localSavePath, 'pipeline.pickle')
    with open(pkl_model, 'rb') as f:
        text_clf = pickle.load(f)

    # load text set
    data = []
    try:
        with open(self.localSavePath + 'testing.csv', 'r',
                  encoding='utf-8', errors="ignore") as f:
            reader = list(csv.reader(f))
            for row in reader[1:]:
                try:
                    data.extend(row)
                except Exception:
                    pass
    except:
        with open(self.localSavePath + 'testing.csv', 'r',
                  encoding='ISO-8859-1', errors="ignore") as f:
            reader = list(csv.reader(f))
            for row in reader[1:]:
                try:
                    data.extend(row)
                except Exception:
                    pass

    # predict using trained model
    self.predicted = text_clf.predict(data)

    # save result
    fname = 'predicting.csv'
    try:
        with open(self.localSavePath + fname, 'w', newline="",
                  encoding='utf-8', errors="ignore") as f:
            writer = csv.writer(f)
            writer.writerow(['text', 'category'])
            for i in range(len(data)):
                try:
                    writer.writerow([data[i], self.predicted[i]])
                except:
                    pass
    except:
        with open(self.localSavePath + fname, 'w', newline="",
                  encoding='ISO-8859-1', errors="ignore") as f:
            writer = csv.writer(f)
            writer.writerow(['text', 'category'])
            for i in range(len(data)):
                try:
                    writer.writerow([data[i], self.predicted[i]])
                except:
                    pass

    s3.upload(self.localSavePath, self.awsPath, fname)
    return s3.generate_downloads(self.awsPath, fname)
def split(self, ratio):
    training_set = list(
        random.sample(self.corpus, int(len(self.corpus) * ratio / 100)))
    testing_set = [item for item in self.corpus if item not in training_set]

    # plot a pie chart of the split
    labels = ['training set data points', 'unlabeled data points']
    values = [len(training_set), len(testing_set)]
    trace = go.Pie(labels=labels, values=values, textinfo='value')
    div_split = plot([trace], output_type='div', image='png',
                     auto_open=False, image_filename='plot_img')
    fname_div_split = 'div_split.html'
    with open(self.localSavePath + fname_div_split, "w") as f:
        f.write(div_split)
    s3.upload(self.localSavePath, self.awsPath, fname_div_split)
    div_url = s3.generate_downloads(self.awsPath, fname_div_split)

    # save the training set
    fname1 = 'TRAINING_' + self.filename
    try:
        with open(self.localSavePath + fname1, 'w', newline="",
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['text', 'category'])
            for row in training_set:
                try:
                    writer.writerow([row])
                except UnicodeDecodeError:
                    pass
    except:
        with open(self.localSavePath + fname1, 'w', newline="",
                  encoding='ISO-8859-1') as f:
            writer = csv.writer(f)
            writer.writerow(['text', 'category'])
            for row in training_set:
                try:
                    writer.writerow([row])
                except UnicodeDecodeError:
                    pass
    s3.upload(self.localSavePath, self.awsPath, fname1)
    training_url = s3.generate_downloads(self.awsPath, fname1)

    # save the unlabeled set
    fname2 = 'UNLABELED_' + self.filename
    try:
        with open(self.localSavePath + fname2, 'w', newline="",
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['text'])
            for row in testing_set:
                try:
                    writer.writerow([row])
                except UnicodeDecodeError:
                    pass
    except:
        with open(self.localSavePath + fname2, 'w', newline="",
                  encoding='ISO-8859-1') as f:
            writer = csv.writer(f)
            writer.writerow(['text'])
            for row in testing_set:
                try:
                    writer.writerow([row])
                except UnicodeDecodeError:
                    pass
    s3.upload(self.localSavePath, self.awsPath, fname2)
    unlabeled_url = s3.generate_downloads(self.awsPath, fname2)

    return {
        'div': div_url,
        'training': training_url,
        'testing': unlabeled_url
    }
def calc_tweet_personality(sessionID, screen_name, profile_img):
    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))
    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load predictive models, one per Big Five trait
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait + ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except:
            pass

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    for tweet in open(tweet_file_path, "r", encoding="utf-8"):
        if re.match(r'^(RT)', tweet) or tweet == '\n' \
                or tweet == '' or tweet == ' ':
            continue

        # remove links starting with "http"
        tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
        # remove links with no http (probably unnecessary)
        tweet = re.sub(
            r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
            " ", tweet)
        # remove mentions
        tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)',
                       " ", tweet)
        # hashtags are removed by CountVectorizer

        filteredTweets.append(tweet)
        word_count += len(tweet.split())

    if len(filteredTweets) == 0:
        print("Not enough tweets for prediction.")
        return None

    # transform the tweets into embeddings using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except:
        print("Not enough tweets for prediction.")
        return None

    # predict using saved models; scores range from 0 to 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    # derive the Jungian (MBTI-style) type from the trait scores
    jung = ""
    if scores["E"] > 3:
        jung = "E"
    else:
        jung = "I"
    if scores["O"] > 3:
        jung = jung + "N"
    else:
        jung = jung + "S"
    if scores["A"] > 3:
        jung = jung + "F"
    else:
        jung = jung + "T"
    if scores["C"] > 3:
        jung = jung + "J"
    else:
        jung = jung + "P"
    scores["jung"] = jung

    # assemble the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [
            {'name': 'Openness', 'percentile': scores['O'] / 5},
            {'name': 'Conscientiousness', 'percentile': scores['C'] / 5},
            {'name': 'Extraversion', 'percentile': scores['E'] / 5},
            {'name': 'Agreeableness', 'percentile': scores['A'] / 5},
            {'name': 'Emotional range', 'percentile': scores['N'] / 5}
        ]
    }

    # save to json and upload to s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'),
              'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete localPath files
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath, screen_name + '_twitPersonality.json'))

    return result