def main(remoteSavePath):
    """Upload every regular file found in ``results`` and collect download links.

    Each file is passed to ``s3.upload`` and then to ``s3.generate_downloads``.
    Well-known result files are stored under a dedicated key of the returned
    dict; any other file is stored under ``'misc'`` (a later unknown file
    replaces an earlier one under that key).

    :param remoteSavePath: remote folder handed to the ``s3`` helper
    :return: dict mapping an output label to the value returned by
        ``s3.generate_downloads``
    """
    # file name -> key used in the returned dict
    labels_by_filename = {
        'config.json': 'config',
        'div.html': 'visualization',
        'AutoPhrase_multi-words.txt': 'multi-words',
        'AutoPhrase_single-word.txt': 'single-word',
        'AutoPhrase.txt': 'autophrase',
        'segmentation.model': 'model',
        'token_mapping.txt': 'token-mapping',
    }

    output = {}
    for entry in listdir('results'):
        # sub-directories are ignored
        if not isfile(join('results', entry)):
            continue
        s3.upload('results', remoteSavePath, entry)
        label = labels_by_filename.get(entry, 'misc')
        output[label] = s3.generate_downloads(remoteSavePath, entry)
    return output
def lambda_handler(event, context):
    """Run the prediction step of the classification workflow.

    The handler merges the stored ``config.json`` of the session with the
    incoming event, downloads ``testing.csv`` and ``pipeline.pickle`` into a
    local working folder, then delegates to ``Classification.predict`` and
    ``Classification.plot``.

    :param event: dict that must contain ``uid`` and ``s3FolderName``; keys
        present in the stored config but missing from the event are copied
        into it (the event is modified in place)
    :param context: Lambda context object (unused)
    :return: dict with the keys ``config``, ``uid``, ``predicting`` and
        ``div_category``
    :raises ValueError: when the config, the unlabeled data or the pickled
        model cannot be fetched for this session
    """
    output = dict()

    uid = event['uid']
    awsPath = event['s3FolderName'] + '/ML/classification/' + uid + '/'
    localSavePath = ('/tmp/' + event['s3FolderName']
                     + '/ML/classification/' + uid + '/')
    os.makedirs(localSavePath, exist_ok=True)

    # download config to local folder and merge it with the event
    fname_config = 'config.json'
    try:
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
        # values supplied with the event win over the stored ones
        for key in data:
            if key not in event:
                event[key] = data[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(event, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uid'] = uid
    except Exception as err:
        # any failure here is reported as an unknown session, but the
        # original cause stays attached for debugging
        raise ValueError('This session ID is invalid!') from err

    # download unlabeled data to local folder
    fname_unlabeled = 'testing.csv'
    try:
        s3.downloadToDisk(fname_unlabeled, localSavePath, awsPath)
    except Exception as err:
        raise ValueError(
            'You\'re requesting ' + fname_unlabeled
            + ' file, and it\'s not found in your remote directory! '
            'It is likely that you have not yet performed step 1 -- split '
            'the dataset into training and predicting set, or you have '
            'provided the wrong sessionID.') from err

    # download pickle model to local folder
    fname_pickle = 'pipeline.pickle'
    try:
        s3.downloadToDisk(fname_pickle, localSavePath, awsPath)
    except Exception as err:
        raise ValueError(
            'You\'re requesting ' + fname_pickle
            + ' file, and it\'s not found in your remote directory! '
            'It is likely that you have not yet performed step 2 -- model '
            'training, or you have provided the wrong sessionID.') from err

    classification = Classification(awsPath, localSavePath)
    output['predicting'] = classification.predict()
    output['div_category'] = classification.plot()

    return output
def lambda_handler(event, context):
    """Collect Crimson monitor data for the previous day and upload it.

    The collection window runs from two days ago to yesterday, both formatted
    as ``YYYY-MM-DD``.  The file name returned by
    ``collect_crimson_monitor_data`` is uploaded from the local working folder
    to the ``macroscope-paho-covid`` bucket under ``input/crimson``.

    :param event: Lambda event payload (unused)
    :param context: Lambda context object (unused)
    :return: None
    """
    # create local path
    localPath = os.path.join('/tmp', 'crimson')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    today = date.today()
    window_start = (today - timedelta(days=2)).strftime("%Y-%m-%d")
    window_end = (today - timedelta(days=1)).strftime("%Y-%m-%d")

    fname = collect_crimson_monitor_data(window_start, window_end, localPath)
    s3.upload("macroscope-paho-covid", localPath, "input/crimson", fname)

    return None
def lambda_handler(event, context):
    """Run the Crimson sentiment collection for the previous day and upload it.

    The window runs from two days ago to yesterday (``YYYY-MM-DD``).  Every
    file name returned by ``crimson_sentiment`` is uploaded from the local
    working folder to the ``macroscope-paho-covid`` bucket under
    ``sentiment``.

    :param event: Lambda event payload (unused)
    :param context: Lambda context object (unused)
    :return: None
    """
    # create local path
    localPath = os.path.join('/tmp', 'sentiment')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # collect top sources and plot
    today = date.today()
    window_start = (today - timedelta(days=2)).strftime("%Y-%m-%d")
    window_end = (today - timedelta(days=1)).strftime("%Y-%m-%d")
    generated_files = crimson_sentiment(window_start, window_end, localPath)

    for generated in generated_files:
        s3.upload("macroscope-paho-covid", localPath, "sentiment", generated)

    return None
def save_remote_output(localSavePath, remoteSavePath, fname):
    """Zip the local ``img`` folder, upload the archive and return its link.

    :param localSavePath: local folder that contains the ``img`` directory and
        receives the archive
    :param remoteSavePath: remote folder handed to the ``s3`` helper
    :param fname: file name of the zip archive to create
    :return: value returned by ``s3.generate_downloads`` for the archive
    """
    archive_path = os.path.join(localSavePath, fname)
    # the context manager closes the archive even when zipdir raises, so no
    # file handle is leaked and the zip is finalized before the upload
    with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipdir(os.path.join(localSavePath, 'img'), zipf)

    s3.upload(localSavePath, remoteSavePath, fname)
    return s3.generate_downloads(remoteSavePath, fname)
def plot(self):
    """Render a pie chart of the predicted classes and publish it.

    Counts the values in ``self.predicted``, renders one slice per class as
    an HTML ``div``, writes it to ``div_category.html`` in
    ``self.localSavePath`` and uploads that file.

    :return: value returned by ``s3.generate_downloads`` for the HTML file
    """
    class_counts = Counter(self.predicted)
    labels = ["class: " + str(category) for category in class_counts]
    values = list(class_counts.values())

    trace = go.Pie(labels=labels, values=values, textinfo='label')
    # ``plot`` here is the module-level plotting function, not this method
    div_category = plot([trace],
                        output_type='div',
                        image='png',
                        auto_open=False,
                        image_filename='plot_img')

    fname_div_category = 'div_category.html'
    with open(self.localSavePath + fname_div_category, "w") as html_file:
        html_file.write(div_category)
    s3.upload(self.localSavePath, self.awsPath, fname_div_category)

    return s3.generate_downloads(self.awsPath, fname_div_category)
def lambda_handler(event, context):
    """Send a user's collected tweets to the personality-insights endpoint.

    Downloads ``<screen_name>_tweets.txt`` for the session, joins the ``text``
    column into one paragraph, posts it to the remote service and, on HTTP
    200, stores the JSON answer as ``<screen_name>_personality.json`` both
    locally and remotely.

    :param event: dict with ``sessionID``, ``screen_name`` and ``apikey``
    :param context: Lambda context object (unused)
    :return: ``{'personality': <service response as dict>}``
    :raises ValueError: when the timeline cannot be downloaded, or with the
        response text when the service answers with a non-200 status
    """
    screen_name = event['screen_name']
    awsPath = os.path.join(event['sessionID'], screen_name)
    localPath = os.path.join('/tmp', event['sessionID'], screen_name)
    os.makedirs(localPath, exist_ok=True)

    fname_tweets = screen_name + '_tweets.txt'
    try:
        s3.downloadToDisk(fname_tweets, localPath, awsPath)
    except Exception as err:
        raise ValueError(
            'Cannot find the timeline in the remote storage!') from err

    headers = {'Content-Type': 'text/plain', 'Accept': 'application/json'}

    # concatenate the text field to be a paragraph; empty cells are dropped
    # and values are coerced to str so the join cannot fail on NaN
    df = pd.read_csv(os.path.join(localPath, fname_tweets))
    tweets = df['text'].dropna().astype(str).tolist()
    body = '. '.join(tweets).encode('utf-8', 'ignore')

    r = requests.post(
        'https://gateway.watsonplatform.net/personality-insights/api/v3/profile?version=2017-10-13&consumption_preferences=true&raw_scores=true',
        headers=headers,
        data=body,
        auth=('apikey', event['apikey']),
        timeout=300)

    if r.status_code != 200:
        raise ValueError(r.text)

    data = {'personality': r.json()}
    fname_personality = screen_name + '_personality.json'
    with open(os.path.join(localPath, fname_personality), 'w') as outfile:
        json.dump(data, outfile)
    s3.upload(localPath, awsPath, fname_personality)

    return data
def metrics(self):
    """Write the per-label and weighted-average classification report.

    Precision, recall, f-score and support are computed from ``self.target``
    and ``self.predicted`` for every entry of ``self.labels``, followed by one
    ``AVG`` row holding the weighted averages.  The report is saved as
    ``classification_report.csv`` in ``self.localSavePath`` and uploaded.

    :return: ``{'metrics': <value returned by s3.generate_downloads>}``
    """
    # one row per label: precision, recall, f-score, support
    per_label = np.array(
        metrics.precision_recall_fscore_support(self.target,
                                                self.predicted,
                                                labels=self.labels)).T
    weighted = metrics.precision_recall_fscore_support(self.target,
                                                       self.predicted,
                                                       average='weighted')
    avg_report = ['AVG'] + list(weighted)

    # save metrics report
    fname_metrics = 'classification_report.csv'
    with open(self.localSavePath + fname_metrics, 'w', newline="") as out:
        writer = csv.writer(out)
        writer.writerow(['label', 'precision', 'recall', 'f1-score', 'support'])
        for label, scores in zip(self.labels, per_label):
            writer.writerow([label] + [round(scores[col], 4) for col in range(4)])
        writer.writerow(avg_report)
    s3.upload(self.localSavePath, self.awsPath, fname_metrics)

    return {'metrics': s3.generate_downloads(self.awsPath, fname_metrics)}
def lambda_handler(event, context):
    """Extract hashtags from the CSV named by an S3 event and publish results.

    The object referenced by the first record of the event is downloaded and
    loaded into a dataframe.  ``extract_hashtag`` produces the frequency
    table; its first ten rows are plotted as a bar chart.  The full table
    (CSV) and the chart (HTML) are uploaded to the ``macroscope-paho-covid``
    bucket under ``hashtags``.

    :param event: S3 notification event; only ``Records[0]`` is read
    :param context: Lambda context object (unused)
    :return: None
    """
    # create local path
    localPath = os.path.join('/tmp', 'hashtag')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download triggered file
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = unquote_plus(s3_record['object']['key'])
    # everything before the last "/" is the folder, the rest the file name
    remotePath, _, filename = key.rpartition("/")
    s3.downloadToDisk(bucket, filename, localPath, remotePath)

    # load to dataframe and extract hashtag
    df = pd.read_csv(os.path.join(localPath, filename))
    hashtag_table = extract_hashtag(df)

    # plot bar chart (frequency chart)
    stem = filename.split(".")[0]
    top_tags = hashtag_table['hashtags'].values.tolist()[:10]
    top_counts = hashtag_table['Freq'].values.tolist()[:10]
    title = 'Top 10 prevalent hashtags (' + stem + ')'
    div = plot.plot_bar_chart(top_tags, top_counts, title)

    # save result and write back to s3
    csv_name = stem + "_extracted_hashtag.csv"
    hashtag_table.to_csv(os.path.join(localPath, csv_name), index=False)
    s3.upload("macroscope-paho-covid", localPath, "hashtags", csv_name)

    html_name = stem + "_extracted_hashtag_frequency.html"
    with open(os.path.join(localPath, html_name), 'w') as html_file:
        html_file.write(div)
    s3.upload("macroscope-paho-covid", localPath, "hashtags", html_name)

    return None
def lambda_handler(event, context):
    """Download a user's timeline and store it as ``<screen_name>_tweets.txt``.

    The timeline is fetched through ``tweepy`` with the credentials supplied
    in the event.  Tweet id and full text are written as a two-column CSV
    (header ``id,text``) which is then uploaded.

    :param event: dict with ``sessionID``, ``screen_name``, ``consumer_key``,
        ``consumer_secret``, ``access_token`` and ``access_token_secret``
    :param context: Lambda context object (unused)
    :return: ``{'url': <value returned by s3.generate_downloads>}``
    :raises ValueError: when the timeline contains no tweets
    """
    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localSavePath = os.path.join('/tmp', event['sessionID'],
                                 event['screen_name'])
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    auth = tweepy.OAuthHandler(event['consumer_key'], event['consumer_secret'])
    auth.set_access_token(event['access_token'], event['access_token_secret'])
    api = tweepy.API(auth)

    timeline = tweepy.Cursor(api.user_timeline,
                             screen_name=event['screen_name'],
                             count=100,
                             tweet_mode="extended").items()
    tweets = [[
        status._json['id'],
        status._json['full_text'].encode('utf-8', 'ignore').decode()
    ] for status in timeline]

    if not tweets:
        raise ValueError('This user\'s timeline (screen_name: ' +
                         event['screen_name'] +
                         ') is empty. There is nothing to analyze!')

    fname = event['screen_name'] + '_tweets.txt'
    with open(os.path.join(localSavePath, fname),
              'w',
              encoding='utf-8',
              newline='') as out:
        writer = csv.writer(out, delimiter=",")
        writer.writerow(['id', 'text'])
        writer.writerows(tweets)

    s3.upload(localSavePath, awsPath, fname)
    return {'url': s3.generate_downloads(awsPath, fname)}
def related_queries(keywords, language, localPath):
    """Fetch Google Trends related queries per keyword and publish them.

    Keywords are processed in batches whose cumulative length stays below 50
    characters.  For every batch and every timeframe (1 day, 7 days, 30 days)
    the top and rising related queries are requested.  For each keyword with
    data, one CSV per timeframe and one HTML page with side-by-side bar
    charts are written to ``localPath`` and uploaded to the
    ``macroscope-paho-covid`` bucket under ``related_queries``.

    The ``keywords`` list passed in is not modified.

    :param keywords: list of keyword strings
    :param language: ``'spanish'`` (case-insensitive) selects the Spanish
        client, anything else the default one
    :param localPath: existing local folder for the generated files
    :return: None
    """
    if language.lower() == 'spanish':
        # NOTE(review): 'sp-SP' does not look like a standard locale tag
        # (Spanish is usually 'es-ES'); left untouched, confirm with pytrends.
        pytrend = TrendReq(hl='sp-SP')
    else:
        pytrend = TrendReq()

    timeframes = {'now 1-d': '1day', 'now 7-d': '7days', 'today 1-m': '30days'}

    # work on a copy so the caller's list is left untouched
    remaining = list(keywords)

    # there is a limit on 100 characters for keywords, so break them into
    # multiple requests
    # NOTE(review): Google Trends may also cap the number of terms per
    # request; only the character budget is enforced here.
    while remaining:
        keywords_split = []
        character_len = 0
        for kk in remaining:
            character_len += len(kk)
            # Always accept the first keyword of a batch, even when it alone
            # exceeds the budget; otherwise the batch would be empty, nothing
            # would be consumed and the loop would never terminate.
            if keywords_split and character_len >= 50:
                break
            keywords_split.append(kk)
        remaining = remaining[len(keywords_split):]

        indices = {keyword: [] for keyword in keywords_split}
        counts = {keyword: [] for keyword in keywords_split}
        subtitles = {keyword: [] for keyword in keywords_split}
        title = {}

        for timeframe, period in timeframes.items():
            pytrend.build_payload(kw_list=keywords_split, timeframe=timeframe)
            df_queries = pytrend.related_queries()

            for keyword in keywords_split:
                df_top = df_queries[keyword]['top']
                df_rising = df_queries[keyword]['rising']
                if df_top is None or df_rising is None:
                    continue

                # plot bar chart side by side
                indices[keyword].append([df_top["query"].tolist()[:10],
                                         df_rising["query"].tolist()[:10]])
                counts[keyword].append([df_top["value"].tolist()[:10],
                                        df_rising["value"].tolist()[:10]])
                title[keyword] = ("Google Trends Queries related to keyword: "
                                  + keyword)
                subtitles[keyword].append(
                    ["top related query(" + period + ")",
                     "rising related query(" + period + ")"])

                # save csv
                df_top.rename(columns={'query': 'top related query'},
                              inplace=True)
                df_rising.rename(columns={'query': 'rising related query'},
                                 inplace=True)
                result = pd.concat([df_top, df_rising], axis=1)
                csv_name = (keyword.replace(" ", "_") + "_" + period
                            + "_related_queries.csv")
                result.to_csv(os.path.join(localPath, csv_name), index=False)
                s3.upload("macroscope-paho-covid", localPath,
                          "related_queries", csv_name)

        for keyword in keywords_split:
            # a keyword without data in any timeframe has no title and
            # nothing to plot; skipping it avoids a KeyError
            if keyword not in title:
                continue
            div = plot.plot_multiple_bar_chart(indices[keyword],
                                               counts[keyword],
                                               title[keyword],
                                               subtitles[keyword])
            html_name = keyword.replace(" ", "_") + "_related_queries.html"
            with open(os.path.join(localPath, html_name), 'w') as f:
                f.write(div)
            s3.upload("macroscope-paho-covid", localPath, "related_queries",
                      html_name)

    return None
def predict(self):
    """Classify the rows of ``testing.csv`` with the pickled pipeline.

    Loads ``pipeline.pickle`` from ``self.localSavePath``, reads every cell
    below the header row of ``testing.csv``, predicts a category per cell and
    stores the result in ``self.predicted``.  The pairs are written to
    ``predicting.csv`` (header ``text,category``) and uploaded.

    Both files are handled as UTF-8 with ``errors="ignore"``: undecodable
    bytes and unencodable characters are dropped instead of raising.  Because
    of that setting the former ISO-8859-1 retry branches could never be
    reached by a codec error and have been removed.

    :return: value returned by ``s3.generate_downloads`` for
        ``predicting.csv``
    """
    # load classification model
    # NOTE(security): pickle.load runs arbitrary code from the file; the
    # pickle must only ever come from a trusted training step.
    pkl_model = os.path.join(self.localSavePath, 'pipeline.pickle')
    with open(pkl_model, 'rb') as f:
        text_clf = pickle.load(f)

    # load text set: every cell after the header row is one document
    with open(self.localSavePath + 'testing.csv', 'r', encoding='utf-8',
              errors="ignore") as f:
        rows = list(csv.reader(f))
    data = [cell for row in rows[1:] for cell in row]

    # predict using trained model
    self.predicted = text_clf.predict(data)

    # save result
    fname = 'predicting.csv'
    with open(self.localSavePath + fname, 'w', newline="", encoding='utf-8',
              errors="ignore") as f:
        writer = csv.writer(f)
        writer.writerow(['text', 'category'])
        for text, category in zip(data, self.predicted):
            try:
                writer.writerow([text, category])
            except (csv.Error, UnicodeError):
                # best effort: a row that cannot be serialized is skipped
                continue

    s3.upload(self.localSavePath, self.awsPath, fname)
    return s3.generate_downloads(self.awsPath, fname)
# fetch the collected timeline for this screen name
try:
    s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
except Exception as err:
    # keep the original failure attached so the real cause is not lost
    raise ValueError('Cannot find the timeline in the remote storage!') from err

# calculate brand personality
model = MultiLabelClassificationModel('roberta',
                                      'checkpoint-17315-epoch-5',
                                      num_labels=5,
                                      args={
                                          "reprocess_input_data": True,
                                          'use_cached_eval_features': False
                                      },
                                      use_cuda=False)
df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
new_df = multiple_sentences(df, model)

# per-sentence results
fname_sentences = screen_name + '_utku_personality_sentences.csv'
new_df.to_csv(os.path.join(localPath, fname_sentences), index=False)
s3.upload(localPath, awsPath, fname_sentences)

# get the average score
mean_metrics = average(new_df)
fname_average = screen_name + '_utku_personality_average.json'
with open(os.path.join(localPath, fname_average), 'w') as f:
    json.dump(mean_metrics, f)
s3.upload(localPath, awsPath, fname_average)

# push notification email
notification(toaddr=params['email'], sessionURL=params['sessionURL'])
def lambda_handler(event, context):
    """Collect the 2020 tweets of a fixed list of accounts and upload them.

    For every screen name the timeline is read through ``tweepy`` using the
    credentials found in the environment variables ``consumer_key``,
    ``consumer_secret``, ``access_token`` and ``access_token_secret``.
    Tweets created in the target year are written to
    ``<screen_name>_tweets.csv`` and uploaded to the ``macroscope-paho-covid``
    bucket under ``input/twitter``.  Accounts without matching tweets
    produce no file.

    Tweets newer than the target year are skipped, and the first tweet older
    than the target year (or without a usable ``created_at``) ends the scan
    of that account.  This assumes the timeline is returned newest first --
    TODO confirm.  Previously the scan stopped at the first tweet that was
    not from 2020, so nothing was collected once newer tweets existed.

    :param event: Lambda event payload (unused)
    :param context: Lambda context object (unused)
    :return: None
    """
    target_year = "2020"

    screen_names = [
        "msalnacion", "gisbarbados", "MFABelize", "MinSaludBolivia",
        "minsaude", "GovCanHealth", "caymangovt", "ministeriosalud",
        "MinSaludCol", "msaludcr", "GoDomRep", "Salud_Ec", "ars_guyane",
        "MsppOfficiel", "themohwgovjm", "GobiernoMX", "msaludpy",
        "PeruPaisDigital", "skngov", "MOH_TT", "MSPUruguay", "USAGov"
    ]

    # create local path
    localPath = os.path.join('/tmp', 'tweets')
    os.makedirs(localPath, exist_ok=True)

    # collect timeline
    auth = tweepy.OAuthHandler(os.environ['consumer_key'],
                               os.environ['consumer_secret'])
    auth.set_access_token(os.environ['access_token'],
                          os.environ['access_token_secret'])
    api = tweepy.API(auth)

    header = [
        "created_at", "id", "id_str", "full_text", "truncated",
        "display_text_range", "source", "in_reply_to_status_id",
        "in_reply_to_status_id_str", "in_reply_to_user_id",
        "in_reply_to_user_id_str", "in_reply_to_screen_name",
        "is_quote_status", "retweet_count", "favorite_count", "favorited",
        "retweeted", "possibly_sensitive", "lang"
    ]

    for screen_name in screen_names:
        tweets = []
        for status in tweepy.Cursor(api.user_timeline,
                                    screen_name=screen_name,
                                    count=200,
                                    tweet_mode="extended").items():
            record = status._json
            # the year is read from the last four characters of created_at
            year = str(record.get("created_at", ""))[-4:]
            if year != target_year:
                if year.isdigit() and int(year) > int(target_year):
                    # newer than the target year: keep paging
                    continue
                # older than the target year, or no usable date: stop
                break

            tweet = []
            for key in header:
                if key not in record:
                    tweet.append("NA")
                elif key == 'full_text':
                    # drop characters that cannot be encoded as UTF-8
                    tweet.append(record[key].encode('utf-8',
                                                    'ignore').decode())
                else:
                    tweet.append(record[key])
            tweets.append(tweet)

        if tweets:
            fname = screen_name + '_tweets.csv'
            with open(os.path.join(localPath, fname),
                      'w',
                      encoding='utf-8',
                      newline='') as f:
                writer = csv.writer(f, delimiter=",")
                writer.writerow(header)
                writer.writerows(tweets)
            s3.upload("macroscope-paho-covid", localPath, "input/twitter",
                      fname)

        # short pause between accounts
        time.sleep(2)

    return None
def save_remote_output(localSavePath, remoteSavePath, fname, output_data):
    """
    save output in memory first to local file, then upload to remote S3 bucket

    The file format is chosen from the type of ``output_data``:

    * ``dict`` -> ``<fname>.json``
    * ``str`` -> ``<fname>.html``
    * non-empty ``list`` whose first item is a list or tuple -> ``<fname>.csv``
    * generator -> ``<fname>.gml`` when fname is ``gephi``, ``<fname>.net``
      when fname is ``pajek``, otherwise ``<fname>.unknown``; one item per
      line
    * anything else (including an empty list) -> ``<fname>.pickle``

    :param localSavePath: local saved file
    :param remoteSavePath: remote save file path
    :param fname: filename (without extension)
    :param output_data: the actual data
    :return: url of the file saved in S3 bucket
    """
    # json
    if isinstance(output_data, dict):
        fname += '.json'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            json.dump(output_data, f)

    # string to html
    elif isinstance(output_data, str):
        fname += '.html'
        with open(os.path.join(localSavePath, fname), 'w') as f:
            f.write(output_data)

    # list(list) to csv; the emptiness check keeps output_data[0] from
    # raising IndexError on an empty list, which is pickled instead
    elif (isinstance(output_data, list) and output_data
          and isinstance(output_data[0], (list, tuple))):
        fname += '.csv'
        with open(os.path.join(localSavePath, fname), 'w', newline='',
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            for row in output_data:
                try:
                    writer.writerow(row)
                except UnicodeEncodeError as e:
                    # best effort: report the row and carry on
                    print(e)

    # generator
    elif isinstance(output_data, types.GeneratorType):
        if fname == 'gephi':
            fname += '.gml'
        elif fname == 'pajek':
            fname += '.net'
        else:
            fname += '.unknown'

        with open(os.path.join(localSavePath, fname), 'w', newline='') as f:
            for line in output_data:
                f.write(line + '\n')

    # else pickle the object
    else:
        fname += '.pickle'
        with open(os.path.join(localSavePath, fname), 'wb') as f:
            pickle.dump(output_data, f)

    s3.upload(localSavePath, remoteSavePath, fname)
    url = s3.generate_downloads(remoteSavePath, fname)

    return url
def calc_tweet_personality(sessionID, screen_name, profile_img):
    """Predict Big-Five personality scores from a user's collected tweets.

    Loads the fastText word vectors and the five per-trait models stored next
    to this module, downloads ``<screen_name>_tweets.txt`` for the session,
    strips retweets, links and mentions, embeds the remaining lines and
    predicts one score per trait.  The result is written to
    ``<screen_name>_twitPersonality.json``, uploaded, and the local copies of
    both files are removed.

    The local working folder is ``$SESSIONDIR/collection/<sessionID>``.

    :param sessionID: session identifier, used for local and remote paths
    :param screen_name: account whose timeline is analysed
    :param profile_img: copied unchanged into the result
    :return: dict with ``screen_name``, ``profile_img`` and ``personality``
    :raises ValueError: when the timeline cannot be downloaded, or when no
        usable tweets remain or they cannot be embedded
    """
    # load embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))
    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load predictive models
    traits = ["O", "C", "E", "A", "N"]
    models = {
        trait: joblib.load(curr_path + "/models/model_" + trait + ".pkl")
        for trait in traits
    }

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    # exist_ok also covers another worker creating the folder concurrently
    os.makedirs(localPath, exist_ok=True)

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except Exception as err:
        raise ValueError(
            'Cannot find the timeline in the remote storage!') from err

    # patterns are compiled once instead of once per line
    retweet_re = re.compile(r'^(RT)')
    # links starting with "http"
    http_link_re = re.compile(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)')
    # links with no http (probably unnecessary)
    bare_link_re = re.compile(
        r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)'
    )
    mention_re = re.compile(
        r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    with open(tweet_file_path, "r", encoding="utf-8") as tweet_file:
        for tweet in tweet_file:
            if retweet_re.match(tweet) or tweet in ('\n', '', ' '):
                continue
            tweet = http_link_re.sub("", tweet)
            tweet = bare_link_re.sub(" ", tweet)
            tweet = mention_re.sub(" ", tweet)
            # hashtags are removed by countvectorizer
            filteredTweets.append(tweet)
            word_count += len(tweet.split())

    if not filteredTweets:
        raise ValueError("Not enough tweets for prediction.")

    # now we can process the tweets using the embeddings
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(
            wordDictionary, 3, filteredTweets, "conc")
    except Exception as err:
        # previously only printed, after which the undefined
        # tweetEmbeddings caused a NameError below
        raise ValueError("Not enough tweets for prediction.") from err

    # predict using saved models
    # range is 0 ~ 5
    scores = {}
    for trait in traits:
        preds = models[trait].predict(tweetEmbeddings)
        # keep only the first five characters of the mean (e.g. "3.141")
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    # four-letter type derived from the trait scores (threshold 3)
    jung = "E" if scores["E"] > 3 else "I"
    jung += "N" if scores["O"] > 3 else "S"
    jung += "F" if scores["A"] > 3 else "T"
    jung += "J" if scores["C"] > 3 else "P"
    scores["jung"] = jung

    # sort the output
    trait_names = [('Openness', 'O'), ('Conscientiousness', 'C'),
                   ('Extraversion', 'E'), ('Agreeableness', 'A'),
                   ('Emotional range', 'N')]
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [{
            'name': name,
            'percentile': scores[trait] / 5
        } for name, trait in trait_names]
    }

    # save to json and upload to s3 bucket
    fname_result = screen_name + '_twitPersonality.json'
    with open(os.path.join(localPath, fname_result), 'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, fname_result)

    # delete localPath files; a file that is already gone is not an error
    for leftover in (screen_name + '_tweets.txt', fname_result):
        try:
            os.remove(os.path.join(localPath, leftover))
        except OSError:
            pass

    print(s3.generate_downloads(awsPath, fname_result))

    return result
def classify(self, model):
    """Train the selected text classifier and publish its evaluation files.

    A ``CountVectorizer`` -> ``TfidfTransformer`` -> classifier pipeline is
    evaluated with 10-fold cross validation (predictions are stored in
    ``self.predicted``), then fitted on the whole of ``self.data`` /
    ``self.target``.  The fitted classes are stored in ``self.labels``.
    Three files are written to ``self.localSavePath`` and uploaded:

    * ``accuracy_score.csv`` -- accuracy of each of the 10 folds
    * ``classification_pipeline.pickle`` -- the fitted pipeline
    * ``div.html`` -- ROC curve(s); a single curve for up to two classes,
      otherwise micro/macro averages plus one curve per class

    :param model: one of ``'NaiveBayes'``, ``'Perceptron'``, ``'SGD'``,
        ``'RandomForest'``, ``'KNN'``, ``'passiveAggressive'``
    :return: dict with the keys ``accuracy``, ``pickle`` and ``div`` holding
        the values returned by ``s3.generate_downloads``
    :raises ValueError: when ``model`` is not one of the supported names
    """
    # model name -> (classifier factory, pipeline method that yields the
    # scores used for the ROC curve); factories keep unused classifiers
    # from being constructed
    classifiers = {
        'NaiveBayes': (lambda: MultinomialNB(), 'predict_proba'),
        'Perceptron': (lambda: Perceptron(), 'decision_function'),
        'SGD': (lambda: SGDClassifier(), 'decision_function'),
        'RandomForest': (lambda: RandomForestClassifier(n_estimators=100),
                         'predict_proba'),
        'KNN': (lambda: KNeighborsClassifier(n_neighbors=10),
                'predict_proba'),
        # NOTE(review): newer scikit-learn releases may expect max_iter
        # instead of n_iter -- confirm against the pinned version.
        'passiveAggressive': (lambda: PassiveAggressiveClassifier(n_iter=50),
                              'decision_function'),
    }
    if model not in classifiers:
        # previously an unknown name surfaced later as an unbound text_clf
        raise ValueError('Unknown classification model: ' + str(model))
    build_classifier, score_method = classifiers[model]

    text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', build_classifier())])
    # 10 fold cross validation
    self.predicted = cross_val_predict(text_clf, self.data, self.target, cv=10)
    # fit the model
    text_clf.fit(self.data, self.target)
    y_score = getattr(text_clf, score_method)(self.data)

    # get 10 fold cross validation accuracy score
    fold_scores = cross_val_score(text_clf, self.data, self.target, cv=10)

    fname_folds = 'accuracy_score.csv'
    with open(self.localSavePath + fname_folds, 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5',
                         'fold_6', 'fold_7', 'fold_8', 'fold_9', 'fold_10'])
        writer.writerow(['%.4f' % elem for elem in fold_scores])
    s3.upload(self.localSavePath, self.awsPath, fname_folds)
    accuracy_url = s3.generate_downloads(self.awsPath, fname_folds)

    # pickle the Pipeline for future use
    fname_pickle = 'classification_pipeline.pickle'
    with open(self.localSavePath + fname_pickle, 'wb') as f:
        pickle.dump(text_clf, f)
    s3.upload(self.localSavePath, self.awsPath, fname_pickle)
    pickle_url = s3.generate_downloads(self.awsPath, fname_pickle)

    # plotting the roc curve
    self.labels = text_clf.classes_
    y = label_binarize(self.target, classes=self.labels)

    # binary class
    if len(self.labels) <= 2:
        if score_method == 'decision_function':
            # one score per sample, compared with the single label column
            fpr, tpr, _ = roc_curve(y[:, 0], y_score)
        else:
            # one probability column per class: build a matching one-hot
            # matrix and compare both flattened
            y = np.array([[1 if label == cls else 0
                           for cls in text_clf.classes_]
                          for label in self.target])
            fpr, tpr, _ = roc_curve(y.ravel(), y_score.ravel())

        roc_auc = auc(fpr, tpr)
        trace = go.Scatter(
            x=fpr,
            y=tpr,
            name='ROC curve (area =' + str(roc_auc) + ' )',
            line=dict(color=('deeppink'), width=4)
        )
        data = [trace]

    # multiclasses
    else:
        fpr = {}
        tpr = {}
        roc_auc = {}
        for i, label in enumerate(self.labels):
            fpr[label], tpr[label], _ = roc_curve(y[:, i], y_score[:, i])
            roc_auc[label] = auc(fpr[label], tpr[label])

        # Compute micro-average ROC curve and ROC area
        fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), y_score.ravel())
        roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

        # First aggregate all false positive rates
        all_fpr = np.unique(
            np.concatenate([fpr[label] for label in self.labels]))

        # Then interpolate all ROC curves at this points
        mean_tpr = np.zeros_like(all_fpr)
        for label in self.labels:
            mean_tpr += interp(all_fpr, fpr[label], tpr[label])

        # Finally average it and compute AUC
        mean_tpr /= len(self.labels)

        fpr["macro"] = all_fpr
        tpr["macro"] = mean_tpr
        roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

        # plotting
        trace0 = go.Scatter(
            x=fpr['micro'],
            y=tpr['micro'],
            name='micro-average ROC curve (area =' + str(roc_auc["micro"]) + ' )',
            line=dict(color=('deeppink'), width=4)
        )
        trace1 = go.Scatter(
            x=fpr['macro'],
            y=tpr['macro'],
            name='macro-average ROC curve (area =' + str(roc_auc["macro"]) + ' )',
            line=dict(color=('navy'), width=4)
        )
        data = [trace0, trace1]

        # per-class curves, cycling through three colors
        colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
        for label, color in zip(self.labels, colors):
            trace = go.Scatter(
                x=fpr[label],
                y=tpr[label],
                name='ROC curve of class {0} (area = {1:0.2f})'.format(
                    label, roc_auc[label]),
                line=dict(color=(color), width=4, dash='dash')
            )
            data.append(trace)

    layout = dict(title=model + ' model ROC curve',
                  xaxis=dict(title='False Positive Rate'),
                  yaxis=dict(title='True Positive Rate'),
                  )

    fig = dict(data=data, layout=layout)
    div = plot(fig, output_type='div', image='png', auto_open=False,
               image_filename='plot_img')

    # print the graph file
    fname_div = 'div.html'
    with open(self.localSavePath + fname_div, 'w') as f:
        f.write(div)
    s3.upload(self.localSavePath, self.awsPath, fname_div)
    div_url = s3.generate_downloads(self.awsPath, fname_div)

    return {'accuracy': accuracy_url, 'pickle': pickle_url, 'div': div_url}
client_secret="***REMOVED***") # loop through the id and store their comments for url, id in zip(urls, ids): url = "https://www.reddit.com" + url try: submission = reddit.submission(url=url) if not bfs(submission, id, comments_folder): # zip goes here zipf = zipfile.ZipFile(temp_dir + fname_zip, 'w', zipfile.ZIP_DEFLATED) zipdir(comments_folder + '/', zipf) zipf.close() # upload this zip to the s3 corresponding folder s3.upload(temp_dir, args.remoteReadPath, fname_zip) url = s3.generate_downloads(args.remoteReadPath, fname_zip) # delete the files d.deletedir('/tmp') # send out email notification n.notification(args.email, case=1, filename=args.remoteReadPath, links=url, sessionURL=args.sessionURL) exit(code='Lack of disk space') except: # escape those can't be found in url pass # success and send email notification
# make sure both local working folders exist
os.makedirs(localSavePath, exist_ok=True)
os.makedirs(localReadPath, exist_ok=True)

# the session is only valid when its config already exists remotely
fname_config = 'config.json'
if not s3.checkExist(awsPath, fname_config):
    raise ValueError('This session ID is invalid!')

# merge the stored config with the command line arguments; values already
# stored in the config win over the arguments
s3.downloadToDisk(fname_config, localSavePath, awsPath)
with open(localSavePath + fname_config, "r") as fp:
    data = json.load(fp)
for key, value in vars(args).items():
    if key not in data:
        data[key] = value
with open(localSavePath + fname_config, "w") as f:
    json.dump(data, f)
s3.upload(localSavePath, awsPath, fname_config)
output['config'] = s3.generate_downloads(awsPath, fname_config)
output['uuid'] = uid

# download the labeled data from s3 to tmp
classification = Classification(awsPath, localSavePath, localReadPath,
                                args.remoteReadPath, args.labeledFilename)
output.update(classification.classify(args.model))
output.update(classification.metrics())

# clean up the local working area
d.deletedir('/tmp')
# command line interface: every option is mandatory
parser = argparse.ArgumentParser(description="processing...")
for flag in ('--remoteReadPath', '--ratio', '--s3FolderName', '--email'):
    parser.add_argument(flag, required=True)
args = parser.parse_args()

# arranging the paths: a fresh uid identifies this session
uid = str(uuid.uuid4())
awsPath = args.s3FolderName + '/ML/classification/' + uid + '/'
localSavePath = '/tmp/' + args.s3FolderName + '/ML/classification/' + uid + '/'
localReadPath = '/tmp/' + args.s3FolderName + '/' + uid + '/'
for folder in (localSavePath, localReadPath):
    if not os.path.exists(folder):
        os.makedirs(folder)

# store the arguments as the session config and publish it
fname = 'config.json'
with open(localSavePath + fname, "w") as f:
    json.dump(vars(args), f)
s3.upload(localSavePath, awsPath, fname)
output['config'] = s3.generate_downloads(awsPath, fname)
output['uuid'] = uid

# split the corpus by the requested ratio
classification = Classification(awsPath, localSavePath, localReadPath,
                                args.remoteReadPath)
output.update(classification.split(int(args.ratio)))

# clean up and notify the user
d.deletedir('/tmp')
n.notification(args.email, case=3, filename=awsPath)
def split(self, ratio):
    """Randomly split ``self.corpus`` into a training and an unlabeled set.

    ``ratio`` percent of the rows (rounded down) are drawn at random for the
    training set; all remaining rows form the unlabeled set.  Rows are
    selected by position, so duplicated rows are kept and the two sets always
    add up to the size of the corpus.  A pie chart of the split and both
    sets are written to ``self.localSavePath`` and uploaded:

    * ``div_split.html``
    * ``TRAINING_<self.filename>`` with header ``text,category``
    * ``UNLABELED_<self.filename>`` with header ``text``

    Files are written as UTF-8; a row that cannot be encoded is skipped.

    :param ratio: percentage (0-100) of rows that go into the training set
    :return: dict with the keys ``div``, ``training`` and ``testing`` holding
        the values returned by ``s3.generate_downloads``
    """
    corpus = list(self.corpus)
    sample_size = int(len(corpus) * ratio / 100)
    # Sampling positions instead of values avoids the quadratic
    # "item not in training_set" scan and keeps duplicate rows, which the
    # value-based test silently dropped from the unlabeled set.
    picked = random.sample(range(len(corpus)), sample_size)
    picked_positions = set(picked)
    training_set = [corpus[i] for i in picked]
    testing_set = [item for i, item in enumerate(corpus)
                   if i not in picked_positions]

    # plot a pie chart of the split
    labels = ['training set data points', 'unlabeled data points']
    values = [len(training_set), len(testing_set)]
    trace = go.Pie(labels=labels, values=values, textinfo='value')
    div_split = plot([trace],
                     output_type='div',
                     image='png',
                     auto_open=False,
                     image_filename='plot_img')
    fname_div_split = 'div_split.html'
    with open(self.localSavePath + fname_div_split, "w") as f:
        f.write(div_split)
    s3.upload(self.localSavePath, self.awsPath, fname_div_split)
    div_url = s3.generate_downloads(self.awsPath, fname_div_split)

    def write_single_column(fname, header, rows):
        # Write one row per item below the header, upload, return the link.
        with open(self.localSavePath + fname, 'w', newline="",
                  encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(header)
            for row in rows:
                try:
                    writer.writerow([row])
                except UnicodeEncodeError:
                    # Writing raises Encode (not Decode) errors.  The old
                    # ISO-8859-1 retry could only fail on the same row, so
                    # the row is skipped instead.
                    continue
        s3.upload(self.localSavePath, self.awsPath, fname)
        return s3.generate_downloads(self.awsPath, fname)

    training_url = write_single_column('TRAINING_' + self.filename,
                                       ['text', 'category'], training_set)
    unlabeled_url = write_single_column('UNLABELED_' + self.filename,
                                        ['text'], testing_set)

    return {
        'div': div_url,
        'training': training_url,
        'testing': unlabeled_url
    }
def extract_frequent_phrases(df, hashtag, date_marker, localPath):
    """Count frequent words, bigrams and trigrams in tweets using a hashtag.

    Rows of *df* whose ``'Contents'`` column matches ``"#" + hashtag`` are
    tokenised with ``tokenize_no_stop(big_string(...))`` and counted with
    ``FreqDist``. The full frequency table of each n-gram size is written to
    a CSV file in *localPath* and uploaded with ``s3.upload``.

    Args:
        df: DataFrame with a ``'Contents'`` text column.
        hashtag: Hashtag text without the leading ``#``. It is passed to
            ``Series.str.contains`` unchanged, i.e. as a pattern.
        date_marker: Label embedded in the output file names.
        localPath: Existing local directory for the CSV files.

    Returns:
        ``(indices, counts)``: two lists of three lists each (words, bigrams,
        trigrams) holding the ten most common entries and their counts.
        Tuple entries are joined with a single space.
    """
    # filter df by hashtag
    new_df = df[df['Contents'].str.contains("#" + hashtag, na=False)]

    # Tokenise once and reuse the result for all three distributions.
    # Assumes tokenize_no_stop/big_string are deterministic -- the original
    # recomputed the very same expression three times.
    tokens = list(tokenize_no_stop(big_string(new_df['Contents'].values)))
    most_common = FreqDist(tokens)
    most_common_bigrams = FreqDist(ngram(tokens, 2))
    most_common_trigrams = FreqDist(ngram(tokens, 3))

    indices = []
    counts = []
    for dist in (most_common, most_common_bigrams, most_common_trigrams):
        top_ten = dist.most_common(10)
        indices.append([
            ' '.join(entry) if isinstance(entry, tuple) else entry
            for entry, _ in top_ten
        ])
        counts.append([occurrences for _, occurrences in top_ten])

    # upload to s3
    outputs = (
        ('word', 'words', most_common),
        ('bigram', 'bigrams', most_common_bigrams),
        ('trigram', 'trigrams', most_common_trigrams),
    )
    for column, suffix, dist in outputs:
        fname = (hashtag + "_" + date_marker + "_extracted_frequent_" +
                 suffix + ".csv")
        # newline="" is what the csv module requires of the file object, and
        # the explicit encoding matches the other CSV writers in this file.
        with open(os.path.join(localPath, fname), "w", newline="",
                  encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow([column, 'count'])
            writer.writerows(dist.most_common())
        s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
                  fname)

    return indices, counts
from collections import deque

# Breadth-first walk over the submission's comment tree: every comment is
# flattened into one row, and its replies are appended to the queue.
# deque.popleft() is O(1); list.pop(0) shifted the whole queue on every step.
comment_queue = deque(submission.comments[:])  # Seed with top-level
comments_no_order = [[
    'author', 'body', 'created_utc', 'id', 'link_id', 'parent_id', 'score',
    'subreddit_display_name', 'subreddit_name_prefixed', 'subreddit_id'
]]
while comment_queue:
    comment = comment_queue.popleft()
    comments_no_order.append([
        str(comment.author), comment.body, comment.created_utc, comment.id,
        comment.link_id, comment.parent_id, comment.score,
        comment.subreddit.display_name, comment.subreddit_name_prefixed,
        comment.subreddit_id
    ])
    comment_queue.extend(comment.replies)

# save to csv (written to the current working directory; no folder is
# created here)
csv_name = id + '.csv'
with open(csv_name, 'w', newline="", encoding='utf-8') as f:
    writer = csv.writer(f, delimiter=',')
    writer.writerows(comments_no_order)

try:
    # push to s3 bucket
    s3.upload('', 'Comment/' + sub + '/' + folder_id + '/', csv_name)
finally:
    # delete local file, also when the upload raised
    remove(csv_name)
def lambda_handler(event, context):
    """Build the frequent-phrase charts for the S3 object that fired the event.

    The triggering object and the most recently modified objects listed under
    the same prefix are downloaded to ``/tmp/frequent_phrases`` and read as
    CSV. For each tracked hashtag, ``extract_frequent_phrases`` is run over
    three windows (the triggering file, the 7 newest files, the 30 newest
    files), the results are plotted with ``plot.plot_multiple_bar_chart`` and
    the resulting HTML is uploaded with ``s3.upload``.

    Args:
        event: S3 notification; only ``event['Records'][0]['s3']`` is read.
        context: Unused.

    Returns:
        None
    """
    # create local path
    localPath = os.path.join('/tmp', 'frequent_phrases')
    os.makedirs(localPath, exist_ok=True)

    # locate the triggered file
    s3_record = event['Records'][0]['s3']
    bucket = s3_record['bucket']['name']
    key = unquote_plus(s3_record['object']['key'])
    remotePath, _, today = key.rpartition("/")

    files = s3.listFiles(bucket, remotePath)
    sorted_files = sorted(files,
                          key=lambda file: file['LastModified'],
                          reverse=True)
    recent_names = [
        file['Key'].split("/")[-1] for file in sorted_files[:30]
    ]

    # The window frames do not depend on the hashtag, so each file is
    # downloaded and parsed once instead of once per hashtag and per window.
    frames = {}

    def load(fname):
        """Download *fname* on first use and return its parsed DataFrame."""
        if fname not in frames:
            s3.downloadToDisk(bucket, fname, localPath, remotePath)
            frames[fname] = pd.read_csv(os.path.join(localPath, fname))
        return frames[fname]

    def load_latest(count):
        """Concatenate the frames of the *count* most recent files."""
        return pd.concat([load(fname) for fname in recent_names[:count]],
                         axis=0,
                         ignore_index=True)

    # Assumes extract_frequent_phrases does not modify the frame it is
    # given, so the same frame can be shared by every hashtag.
    windows = [
        ("1day", load(today)),
        ("7days", load_latest(7)),
        ("30days", load_latest(30)),
    ]
    frames.clear()  # the per-file frames are no longer needed

    hashtags = ["COVID19", "coronavirus", "COVID_19"]
    for hashtag in hashtags:
        indices_row = []
        counts_row = []
        legends_row = []

        for date_marker, window_df in windows:
            indices, counts = extract_frequent_phrases(
                window_df, hashtag, date_marker, localPath)
            indices_row.append(indices)
            counts_row.append(counts)
            # "trigram(" deliberately has no space: it reproduces the
            # original legend text exactly.
            legends_row.append([
                "word (" + date_marker + ")",
                "bigram (" + date_marker + ")",
                "trigram(" + date_marker + ")",
            ])

        # Plot and save
        title = ("Most prevalent 10 frequent words and phrases used in #" +
                 hashtag + " tweets")
        div = plot.plot_multiple_bar_chart(indices_row, counts_row, title,
                                           legends_row)
        html_name = hashtag + "_extracted_frequent_phrases.html"
        with open(os.path.join(localPath, html_name), 'w') as f:
            f.write(div)
        s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
                  html_name)

    return None
def _keyword_batches(keywords, max_chars=50, max_terms=5):
    """Yield consecutive keyword batches small enough for one request.

    A batch is closed as soon as adding the next keyword would bring its
    total character count to *max_chars* or more, or its size above
    *max_terms*. Every batch holds at least one keyword, so a single
    over-long keyword is sent on its own instead of stalling the caller.
    """
    batch = []
    used_chars = 0
    for keyword in keywords:
        batch_is_full = (used_chars + len(keyword) >= max_chars
                         or len(batch) >= max_terms)
        if batch and batch_is_full:
            yield batch
            batch = []
            used_chars = 0
        batch.append(keyword)
        used_chars += len(keyword)
    if batch:
        yield batch


def interest_by_region(keywords, language, localPath):
    """Plot and export Google Trends interest-by-region for each keyword.

    For every keyword an HTML map (from ``plot.plot_geograph``) and a CSV of
    the per-country values are written to *localPath* and uploaded with
    ``s3.upload`` under ``interest_by_region``. The time frame runs from
    2020-03-01 to today.

    Args:
        keywords: List of search terms. The list is no longer modified; the
            original implementation emptied the caller's list.
        language: ``'spanish'`` (case-insensitive) selects the Spanish
            ``TrendReq`` configuration; anything else uses the default.
        localPath: Existing local directory for the output files.

    Returns:
        None
    """
    country_code = pd.read_csv("tableconvert_csv_j8hnfj.csv", quotechar="\"")

    if language.lower() == 'spanish':
        # NOTE(review): 'sp-SP' does not look like a standard locale code
        # ('es-ES' would be Spanish/Spain) -- confirm before changing, since
        # it may affect the language of the returned region names.
        pytrend = TrendReq(hl='sp-SP')
    else:
        pytrend = TrendReq()

    today = date.today()
    march = "2020-03-01"
    timeframe = march + " " + today.strftime("%Y-%m-%d")

    # there is a limit on the characters (and number of terms) accepted per
    # request, so the keywords are sent in several batches
    for keywords_split in _keyword_batches(list(keywords)):
        pytrend.build_payload(kw_list=keywords_split, timeframe=timeframe)
        df_regions = pytrend.interest_by_region(inc_geo_code=True)
        df_regions['country'] = df_regions.index
        df_regions = pd.merge(df_regions,
                              country_code,
                              left_on="geoCode",
                              right_on="Alpha-2 code",
                              how="left")

        for keyword in keywords_split:
            geo_name_df = df_regions[[
                'country', keyword, 'Alpha-2 code', 'Alpha-3 code',
                'Numeric code', 'Latitude (average)', 'Longitude (average)'
            ]]
            file_stem = keyword.replace(" ", "_") + "_interest_by_region"

            title = ("Google Trends Interest by Region related to keyword: " +
                     keyword + " (Since March 2020)")
            div = plot.plot_geograph(geo_name_df, keyword, title)
            with open(os.path.join(localPath, file_stem + ".html"),
                      'w') as f:
                f.write(div)
            s3.upload("macroscope-paho-covid", localPath,
                      "interest_by_region", file_stem + ".html")

            # save csv
            geo_name_df.to_csv(os.path.join(localPath, file_stem + ".csv"),
                               index=False)
            s3.upload("macroscope-paho-covid", localPath,
                      "interest_by_region", file_stem + ".csv")

    return None