def __init__(self, awsPath, localSavePath, localReadPath, remoteReadPath, filename):
    self.localSavePath = localSavePath
    self.awsPath = awsPath

    # download remote social media data into a temp folder, then load it as CSV
    s3.downloadToDisk(filename=filename, localpath=localReadPath, remotepath=remoteReadPath)

    # fall back to ISO-8859-1 when the file is not valid UTF-8
    Array = []
    try:
        with open(localReadPath + filename, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)
    except UnicodeDecodeError:
        with open(localReadPath + filename, 'r', encoding='ISO-8859-1') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)

    # first row is the header; keep only well-formed (data, target) pairs
    self.data = []
    self.target = []
    for a in Array[1:]:
        if len(a) == 2:
            self.data.append(a[0])
            self.target.append(a[1])
def main(remoteReadPath, column):
    filename = remoteReadPath.split('/')[-2] + '.csv'
    s3.downloadToDisk(filename=filename, localpath='data/', remotepath=remoteReadPath)

    # fall back to ISO-8859-1 when the file is not valid UTF-8; note that
    # opening with errors="ignore" would make the fallback unreachable
    Array = []
    try:
        with open('data/' + filename, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)
    except UnicodeDecodeError:
        with open('data/' + filename, 'r', encoding='ISO-8859-1') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)

    df = pd.DataFrame(Array[1:], columns=Array[0])
    df[df[column] != ''][column].dropna().astype('str').to_csv('data/raw_train.txt', index=False)

    return None
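# A minimal usage sketch of main (the remote path here is hypothetical): the
# remote folder name becomes the CSV filename, and the non-empty values of the
# chosen column are written to data/raw_train.txt for downstream training.
#
#   main('sessionID/GraphQL/twitter-Tweet/trump/', column='text')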
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'hashtag')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # download the triggered file
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])
    filename = key.split("/")[-1]
    s3.downloadToDisk(bucket, filename, localPath, remotePath)

    # load to dataframe
    df = pd.read_csv(os.path.join(localPath, filename))

    # extract hashtags (renamed from `hash`, which shadows the builtin)
    hashtag_df = extract_hashtag(df)

    # plot bar chart (frequency chart)
    index = hashtag_df['hashtags'].values.tolist()[:10]
    counts = hashtag_df['Freq'].values.tolist()[:10]
    title = 'Top 10 prevalent hashtags (' + filename.split(".")[0] + ')'
    div = plot.plot_bar_chart(index, counts, title)

    # save results and write them back to s3
    hash_filename = filename.split(".")[0]
    hashtag_df.to_csv(os.path.join(localPath, hash_filename + "_extracted_hashtag.csv"), index=False)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag.csv")

    with open(os.path.join(localPath, hash_filename + "_extracted_hashtag_frequency.html"), 'w') as f:
        f.write(div)
    s3.upload("macroscope-paho-covid", localPath, "hashtags",
              hash_filename + "_extracted_hashtag_frequency.html")

    return None
def lambda_handler(event, context):
    awsPath = os.path.join(event['sessionID'], event['screen_name'])
    localPath = os.path.join('/tmp', event['sessionID'], event['screen_name'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)
    screen_name = event['screen_name']

    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except Exception:
        raise ValueError('Cannot find the timeline in the remote storage!')

    headers = {'Content-Type': 'text/plain', 'Accept': 'application/json'}

    # concatenate the text field into a single paragraph
    df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
    tweets = df['text'].tolist()
    body = '. '.join(tweets).encode('utf-8', 'ignore')

    r = requests.post(
        'https://gateway.watsonplatform.net/personality-insights/api/v3/profile'
        '?version=2017-10-13&consumption_preferences=true&raw_scores=true',
        headers=headers,
        data=body,
        auth=('apikey', event['apikey']),
        timeout=300)

    if r.status_code == 200:
        data = {'personality': r.json()}
        with open(os.path.join(localPath, screen_name + '_personality.json'), 'w') as outfile:
            json.dump(data, outfile)
        s3.upload(localPath, awsPath, screen_name + '_personality.json')
        return data
    else:
        raise ValueError(r.text)
def lambda_handler(event, context):
    output = dict()
    uid = event['uid']

    awsPath = event['s3FolderName'] + '/ML/classification/' + uid + '/'
    localSavePath = '/tmp/' + event['s3FolderName'] + '/ML/classification/' + uid + '/'
    if not os.path.exists(localSavePath):
        os.makedirs(localSavePath)

    # download config to local folder
    fname_config = 'config.json'
    try:
        s3.downloadToDisk(fname_config, localSavePath, awsPath)
        with open(localSavePath + fname_config, "r") as fp:
            data = json.load(fp)
        # merge stored config into the event without overwriting event values
        for key in data.keys():
            if key not in event.keys():
                event[key] = data[key]
        with open(localSavePath + fname_config, "w") as f:
            json.dump(event, f)
        s3.upload(localSavePath, awsPath, fname_config)
        output['config'] = s3.generate_downloads(awsPath, fname_config)
        output['uid'] = uid
    except Exception:
        raise ValueError('This session ID is invalid!')

    # download unlabeled data to local folder
    fname_unlabeled = 'testing.csv'
    try:
        s3.downloadToDisk(fname_unlabeled, localSavePath, awsPath)
    except Exception:
        raise ValueError(
            'You\'re requesting the ' + fname_unlabeled + ' file, and it\'s not found in your remote directory! '
            'It is likely that you have not yet performed step 1 -- splitting the dataset into '
            'training and predicting sets -- or you have provided the wrong sessionID.')

    # download the pickled model to local folder
    fname_pickle = 'pipeline.pickle'
    try:
        s3.downloadToDisk(fname_pickle, localSavePath, awsPath)
    except Exception:
        raise ValueError(
            'You\'re requesting the ' + fname_pickle + ' file, and it\'s not found in your remote directory! '
            'It is likely that you have not yet performed step 2 -- model training -- '
            'or you have provided the wrong sessionID.')

    classification = Classification(awsPath, localSavePath)
    output['predicting'] = classification.predict()
    output['div_category'] = classification.plot()

    return output
def get_remote_input(remoteReadPath, filename, localReadPath):
    """
    Download an input file from the s3 bucket to a local location,
    then load it into a pandas dataframe.

    :param remoteReadPath: remote path in s3 where the data is stored
    :param filename: name of the file to download
    :param localReadPath: local location to store the data, usually in /tmp
    :return: df: dataframe that contains the complete input file
    """
    s3.downloadToDisk(filename, localReadPath, remoteReadPath)

    # quick fix for decoding errors: sometimes the data is encoded in
    # ISO-8859-1. The UTF-8 read is strict (no errors="ignore"), otherwise
    # the fallback branch would never be reached.
    # Array = 2D nested list holding column and row data
    Array = []
    try:
        with open(os.path.join(localReadPath, filename), 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)
    except UnicodeDecodeError:
        with open(os.path.join(localReadPath, filename), 'r', encoding='ISO-8859-1') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)

    # load to pandas dataframe
    df = pd.DataFrame(Array[1:], columns=Array[0])

    return df
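# A minimal usage sketch of get_remote_input (the paths and dataset name here
# are hypothetical): the returned dataframe takes its columns from the CSV
# header row, so downstream code can select columns by name.
#
#   df = get_remote_input('sessionID/GraphQL/twitter-Tweet/trump/', 'trump.csv', '/tmp/')
#   print(df.columns)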
# load url and id
temp_dir = '/tmp/' + args.s3FolderName + '/' + uid + '/'
if not os.path.exists(temp_dir):
    os.makedirs(temp_dir)

# configure the output directory
# save it in download/temp/xxx-xxxxxxxxxxxxx-xxxxx/aww-comments
file = args.remoteReadPath.split('/')[-2]
comments_folder = temp_dir + file + '-comments/'
if not os.path.exists(comments_folder):
    os.makedirs(comments_folder)
fname_zip = file + '.zip'

s3.downloadToDisk(filename=file + '.csv', localpath=temp_dir, remotepath=args.remoteReadPath)

# fall back to ISO-8859-1 when the file is not valid UTF-8
Array = []
try:
    with open(temp_dir + file + '.csv', 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            Array.append(row)
except UnicodeDecodeError:
    with open(temp_dir + file + '.csv', 'r', encoding='ISO-8859-1') as f:
        reader = csv.reader(f)
        for row in reader:
            Array.append(row)
def lambda_handler(event, context):
    localPath = os.path.join('/tmp', event['sessionID'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # default the algorithm to IBM-Personality for compatibility with the old version
    if 'algorithm' not in event.keys():
        event['algorithm'] = 'IBM-Personality'

    comparison_table = [[]]

    # download and read personality scores
    if event['algorithm'] == 'IBM-Personality':
        comparison_table = [[
            'screen_name', 'Personality_Openness',
            'Personality_Conscientiousness', 'Personality_Extraversion',
            'Personality_Agreeableness', 'Personality_Emotional_Range',
            'Needs_Challenge', 'Needs_Closeness', 'Needs_Curiosity',
            'Needs_Excitement', 'Needs_Harmony', 'Needs_Ideal',
            'Needs_Liberty', 'Needs_Love', 'Needs_Practicality',
            'Needs_Self_Expression', 'Needs_Stability', 'Needs_Structure',
            'Values_Conservation', 'Values_Openness', 'Values_Hedonism',
            'Values_Self_Enhancement', 'Values_Self_Transcendence'
        ]]
        for screen_name in event['screen_names']:
            awsPath = os.path.join(event['sessionID'], screen_name)
            try:
                s3.downloadToDisk(screen_name + '_personality.json', localPath, awsPath)
            except Exception:
                raise ValueError('Cannot find the personality in the remote storage!')
            with open(os.path.join(localPath, screen_name + '_personality.json'), 'r') as f:
                data = json.load(f)['personality']
            user_info = [screen_name]
            for p in data['personality']:
                user_info.append(p['percentile'])
            for p in data['needs']:
                user_info.append(p['percentile'])
            for p in data['values']:
                user_info.append(p['percentile'])
            comparison_table.append(user_info)

    elif event['algorithm'] == 'TwitPersonality':
        comparison_table = [[
            'screen_name', 'Personality_Openness',
            'Personality_Conscientiousness', 'Personality_Extraversion',
            'Personality_Agreeableness', 'Personality_Emotional_Range'
        ]]
        for screen_name in event['screen_names']:
            awsPath = os.path.join(event['sessionID'], screen_name)
            try:
                s3.downloadToDisk(screen_name + '_twitPersonality.json', localPath, awsPath)
            except Exception:
                raise ValueError('Cannot find the personality in the remote storage!')
            with open(os.path.join(localPath, screen_name + '_twitPersonality.json'), 'r') as f:
                data = json.load(f)['personality']
            user_info = [screen_name]
            for p in data['personality']:
                user_info.append(p['percentile'])
            comparison_table.append(user_info)

    elif event['algorithm'] == 'Pamuksuz-Personality':
        comparison_table = [[
            'screen_name', 'sophistication', 'excitement', 'sincerity',
            'competence', 'ruggedness'
        ]]
        for screen_name in event['screen_names']:
            awsPath = os.path.join(event['sessionID'], screen_name)
            try:
                s3.downloadToDisk(screen_name + '_utku_personality_average.json', localPath, awsPath)
            except Exception:
                raise ValueError('Cannot find the personality in the remote storage!')
            with open(os.path.join(localPath, screen_name + '_utku_personality_average.json'), 'r') as f:
                data = json.load(f)
            comparison_table.append([
                screen_name, data['sophistication'], data['excitement'],
                data['sincerity'], data['competence'], data['ruggedness']
            ])

    # compute pairwise correlations (cosine similarity between score vectors)
    event['screen_names'].insert(0, 'Correlation')
    correlation_matrix = [event['screen_names']]
    correlation_matrix_no_legends = []
    for i in range(1, len(comparison_table)):
        row = [comparison_table[i][0]]
        row_no_legends = []
        vector_a = comparison_table[i][1:]
        for j in range(1, len(comparison_table)):
            vector_b = comparison_table[j][1:]
            similarity = cos_sim(vector_a, vector_b)
            row.append(similarity)
            row_no_legends.append(similarity)
        correlation_matrix.append(row)
        correlation_matrix_no_legends.append(row_no_legends)

    return {
        'comparison_table': comparison_table,
        'correlation_matrix': correlation_matrix,
        'correlation_matrix_no_legends': correlation_matrix_no_legends
    }
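# The cos_sim helper called above is imported from elsewhere in this repo. As
# a reference, here is a minimal sketch of what it is assumed to compute --
# the cosine similarity of two equal-length numeric vectors -- using numpy:
import numpy as np

def cos_sim(vector_a, vector_b):
    # cosine similarity = (a . b) / (|a| * |b|)
    a = np.asarray(vector_a, dtype=float)
    b = np.asarray(vector_b, dtype=float)
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))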
# arrange the paths
uid = args.uuid

# check that this awsPath exists; if it does not, exit with an error
awsPath = args.s3FolderName + '/ML/classification/' + uid + '/'
localSavePath = '/tmp/' + args.s3FolderName + '/ML/classification/' + uid + '/'
localReadPath = '/tmp/' + args.s3FolderName + '/'
if not os.path.exists(localSavePath):
    os.makedirs(localSavePath)
if not os.path.exists(localReadPath):
    os.makedirs(localReadPath)

fname_config = 'config.json'
if s3.checkExist(awsPath, fname_config):
    s3.downloadToDisk(fname_config, localSavePath, awsPath)
    with open(localSavePath + fname_config, "r") as fp:
        data = json.load(fp)
    # merge CLI arguments into the stored config without overwriting it
    for key in vars(args).keys():
        if key not in data.keys():
            data[key] = vars(args)[key]
    with open(localSavePath + fname_config, "w") as f:
        json.dump(data, f)
    s3.upload(localSavePath, awsPath, fname_config)
    output['config'] = s3.generate_downloads(awsPath, fname_config)
    output['uuid'] = uid
else:
    raise ValueError('This session ID is invalid!')
def lambda_handler(event, context):
    awsUserPath = os.path.join(event['sessionID'], event['user_screen_name'])
    awsBrandPath = os.path.join(event['sessionID'], event['brand_screen_name'])
    localPath = os.path.join('/tmp', event['sessionID'])
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # default the algorithm to IBM-Watson for compatibility with the old version
    if 'algorithm' not in event.keys():
        event['algorithm'] = 'IBM-Watson'

    # vectors used to calculate the similarity score
    vector_a = []
    vector_b = []

    # download and read personality scores
    if event['algorithm'] == 'IBM-Watson':
        try:
            s3.downloadToDisk(event['user_screen_name'] + '_personality.json', localPath, awsUserPath)
            s3.downloadToDisk(event['brand_screen_name'] + '_personality.json', localPath, awsBrandPath)

            # open the json files and read in the values
            with open(os.path.join(localPath, event['user_screen_name'] + '_personality.json'), 'r') as f:
                user_data = json.load(f)['personality']
            with open(os.path.join(localPath, event['brand_screen_name'] + '_personality.json'), 'r') as f:
                brand_data = json.load(f)['personality']

            if event['option'] == 'personality_sim_score':
                for p in user_data['personality']:
                    vector_a.append(p['percentile'])
                for p in brand_data['personality']:
                    vector_b.append(p['percentile'])
            elif event['option'] == 'needs_sim_score':
                for p in user_data['needs']:
                    vector_a.append(p['percentile'])
                for p in brand_data['needs']:
                    vector_b.append(p['percentile'])
            elif event['option'] == 'values_sim_score':
                for p in user_data['values']:
                    vector_a.append(p['percentile'])
                for p in brand_data['values']:
                    vector_b.append(p['percentile'])
            elif event['option'] == 'consumption_sim_score':
                for p in user_data['consumption_preferences']:
                    for c in p['consumption_preferences']:
                        vector_a.append(c['score'])
                for p in brand_data['consumption_preferences']:
                    for c in p['consumption_preferences']:
                        vector_b.append(c['score'])
        except Exception:
            raise ValueError('Cannot find the timeline in the remote storage!')

    elif event['algorithm'] == 'TwitPersonality':
        try:
            s3.downloadToDisk(event['user_screen_name'] + '_twitPersonality.json', localPath, awsUserPath)
            s3.downloadToDisk(event['brand_screen_name'] + '_twitPersonality.json', localPath, awsBrandPath)

            # open the json files and read in the values
            with open(os.path.join(localPath, event['user_screen_name'] + '_twitPersonality.json'), 'r') as f:
                user_data = json.load(f)['personality']
            with open(os.path.join(localPath, event['brand_screen_name'] + '_twitPersonality.json'), 'r') as f:
                brand_data = json.load(f)['personality']

            if event['option'] == 'personality_sim_score':
                for p in user_data['personality']:
                    vector_a.append(p['percentile'])
                for p in brand_data['personality']:
                    vector_b.append(p['percentile'])
        except Exception:
            raise ValueError('Cannot find the timeline in the remote storage!')

    elif event['algorithm'] == 'Pamuksuz-Personality':
        try:
            s3.downloadToDisk(event['user_screen_name'] + '_utku_personality_average.json', localPath, awsUserPath)
            s3.downloadToDisk(event['brand_screen_name'] + '_utku_personality_average.json', localPath, awsBrandPath)

            # open the json files and read in the values
            with open(os.path.join(localPath, event['user_screen_name'] + '_utku_personality_average.json'), 'r') as f:
                user_data = json.load(f)
            with open(os.path.join(localPath, event['brand_screen_name'] + '_utku_personality_average.json'), 'r') as f:
                brand_data = json.load(f)

            for metric in user_data.keys():
                vector_a.append(user_data[metric])
                vector_b.append(brand_data[metric])
        except Exception:
            raise ValueError('Cannot find the timeline in the remote storage!')

    try:
        return {'sim_score': cos_sim(vector_a, vector_b)}
    except Exception:
        raise ValueError('Cannot calculate the cosine similarity of these two vectors!')
parser.add_argument('--s3FolderName', required=True)
parser.add_argument('--email', required=True)
args = parser.parse_args()

uid = args.uuid
awsPath = args.s3FolderName + '/ML/classification/' + uid + '/'
localSavePath = '/tmp/' + args.s3FolderName + '/ML/classification/' + uid + '/'
if not os.path.exists(localSavePath):
    os.makedirs(localSavePath)

# download config to local folder
fname_config = 'config.json'
if s3.checkExist(awsPath, fname_config):
    s3.downloadToDisk(fname_config, localSavePath, awsPath)
    with open(localSavePath + fname_config, "r") as fp:
        data = json.load(fp)
    # merge CLI arguments into the stored config without overwriting it
    for key in vars(args).keys():
        if key not in data.keys():
            data[key] = vars(args)[key]
    with open(localSavePath + fname_config, "w") as f:
        json.dump(data, f)
    s3.upload(localSavePath, awsPath, fname_config)
    output['config'] = s3.generate_downloads(awsPath, fname_config)
    output['uuid'] = uid
else:
    raise ValueError('This session ID is invalid!')
def lambda_handler(event, context):
    # create local path
    localPath = os.path.join('/tmp', 'frequent_phrases')
    if not os.path.exists(localPath):
        os.makedirs(localPath)

    # locate the triggered file and list the remote folder, newest file first
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = unquote_plus(event['Records'][0]['s3']['object']['key'])
    remotePath = "/".join(key.split("/")[:-1])
    files = s3.listFiles(bucket, remotePath)
    sorted_files = sorted(files, key=lambda file: file['LastModified'], reverse=True)

    hashtags = ["COVID19", "coronavirus", "COVID_19"]
    date_markers = ["1day", "7days", "30days"]

    for hashtag in hashtags:
        indices_row = []
        counts_row = []
        legends_row = []

        for date_marker in date_markers:
            if date_marker == "1day":
                today = key.split("/")[-1]
                s3.downloadToDisk(bucket, today, localPath, remotePath)
                df_today = pd.read_csv(os.path.join(localPath, today))
                indices, counts = extract_frequent_phrases(df_today, hashtag, date_marker, localPath)
                legends = ["word (1day)", "bigram (1day)", "trigram (1day)"]
            elif date_marker == "7days":
                # concatenate the 7 most recent daily files
                last_7_days_files = sorted_files[:7]
                last_7_days_list = []
                for file in last_7_days_files:
                    fname = file['Key'].split("/")[-1]
                    s3.downloadToDisk(bucket, fname, localPath, remotePath)
                    last_7_days_list.append(pd.read_csv(os.path.join(localPath, fname)))
                last_7_days_df = pd.concat(last_7_days_list, axis=0, ignore_index=True)
                indices, counts = extract_frequent_phrases(last_7_days_df, hashtag, date_marker, localPath)
                legends = ["word (7days)", "bigram (7days)", "trigram (7days)"]
            elif date_marker == "30days":
                # concatenate the 30 most recent daily files
                last_30_days_files = sorted_files[:30]
                last_30_days_list = []
                for file in last_30_days_files:
                    fname = file['Key'].split("/")[-1]
                    s3.downloadToDisk(bucket, fname, localPath, remotePath)
                    last_30_days_list.append(pd.read_csv(os.path.join(localPath, fname)))
                last_30_days_df = pd.concat(last_30_days_list, axis=0, ignore_index=True)
                indices, counts = extract_frequent_phrases(last_30_days_df, hashtag, date_marker, localPath)
                legends = ["word (30days)", "bigram (30days)", "trigram (30days)"]
            else:
                break

            indices_row.append(indices)
            counts_row.append(counts)
            legends_row.append(legends)

        # plot and save
        title = "Most prevalent 10 frequent words and phrases used in #" + hashtag + " tweets"
        div = plot.plot_multiple_bar_chart(indices_row, counts_row, title, legends_row)
        with open(os.path.join(localPath, hashtag + "_extracted_frequent_phrases.html"), 'w') as f:
            f.write(div)
        s3.upload("macroscope-paho-covid", localPath, "frequent_phrases",
                  hashtag + "_extracted_frequent_phrases.html")

    return None
def __init__(self, awsPath, localSavePath, localReadPath, remoteReadPath):
    self.localSavePath = localSavePath
    self.awsPath = awsPath

    # download remote social media data into a temp folder, then load it as CSV
    filename = remoteReadPath.split('/')[-2] + '.csv'
    self.filename = filename  # save it so the split function can reuse this name
    s3.downloadToDisk(filename=filename, localpath=localReadPath, remotepath=remoteReadPath)

    # fall back to ISO-8859-1 when the file is not valid UTF-8
    Array = []
    try:
        with open(localReadPath + filename, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)
    except UnicodeDecodeError:
        with open(localReadPath + filename, 'r', encoding='ISO-8859-1') as f:
            reader = csv.reader(f)
            for row in reader:
                Array.append(row)

    df = pandas.DataFrame(Array[1:], columns=Array[0])

    # remoteReadPath always follows the format sessionID/folderID/datasetName/
    # example: 'local/GraphQL/twitter-Tweet/trump/' =>
    #   ['local', 'GraphQL', 'twitter-Tweet', 'trump', '']
    source = remoteReadPath.split('/')[2]

    # pick the text column that matches the data source, keeping only the
    # unique non-empty rows
    if (source == 'twitter-Tweet') and ('text' in Array[0]):
        self.corpus = list(set(df[df['text'] != '']['text'].dropna().astype('str').tolist()))
    elif (source == 'twitter-Stream') and ('_source.text' in Array[0]):
        self.corpus = list(set(df[df['_source.text'] != '']['_source.text'].dropna().astype('str').tolist()))
    # find the unique content in crimson hexagon
    elif (source == 'crimson-Hexagon') and ('contents' in Array[0]):
        self.corpus = list(set(df[df['contents'] != '']['contents'].dropna().astype('str').tolist()))
    # find the unique title in reddit posts
    elif (source == 'reddit-Search' or source == 'reddit-Post') and 'title' in Array[0]:
        self.corpus = list(set(df[df['title'] != '']['title'].dropna().astype('str').tolist()))
    elif source == 'reddit-Historical-Post' and '_source.title' in Array[0]:
        self.corpus = list(set(df[df['_source.title'] != '']['_source.title'].dropna().astype('str').tolist()))
    # find the unique body in reddit comments
    elif (source == 'reddit-Comment' or source == 'reddit-Historical-Comment') and 'body' in Array[0]:
        self.corpus = list(set(df[df['body'] != '']['body'].dropna().astype('str').tolist()))
    # TODO: switch reddit comment to elasticsearch endpoint
    # elif source == 'reddit-Historical-Comment' and '_source.body' in Array[0]:
    #     self.corpus = list(set(df[df['_source.body'] != '']['_source.body'].dropna().astype('str').tolist()))

    # strip http links from the corpus
    self.corpus = [re.sub(r"http\S+", "", text) for text in self.corpus]
def calc_tweet_personality(sessionID, screen_name, profile_img):
    # load the fastText embedding dataset
    curr_path = os.path.dirname(os.path.abspath(__file__))
    dataset_path = curr_path + "/fastText/wiki-news-300d-1M.vec"
    wordDictionary = dsu.parseFastText(dataset_path)

    # load the predictive models, one per Big Five trait
    models = {}
    for trait in ["O", "C", "E", "A", "N"]:
        models[trait] = joblib.load(curr_path + "/models/model_" + trait + ".pkl")

    # read tweets
    awsPath = os.path.join(sessionID, screen_name)
    sessionDir = os.environ['SESSIONDIR']
    localPath = os.path.join(sessionDir + '/collection', sessionID)
    if not os.path.exists(localPath):
        try:
            os.makedirs(localPath)
        except OSError:
            # another worker may have created it concurrently
            pass
    try:
        s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
    except Exception:
        raise ValueError('Cannot find the timeline in the remote storage!')

    # process the tweets
    tweet_file_path = os.path.join(localPath, screen_name + '_tweets.txt')
    filteredTweets = []
    word_count = 0
    with open(tweet_file_path, "r", encoding="utf-8") as tweet_file:
        for tweet in tweet_file:
            if re.match(r'^(RT)', tweet) or tweet == '\n' or tweet == '' or tweet == ' ':
                continue
            # remove links starting with "http"
            tweet = re.sub(r'((http)([^\s]*)(\s|$))|((http)([^\s]*)$)', "", tweet)
            # remove links with no http (probably unnecessary)
            tweet = re.sub(
                r'(\s([^\s]*)\.([^\s]*)\/([^\s]*)\s)|(^([^\s]*)\.([^\s]*)\/([^\s]*)(\s|$))|(\s([^\s]*)\.([^\s]*)\/([^\s]*)$)',
                " ", tweet)
            # remove mentions
            tweet = re.sub(r'(\s(@)([^\s]*)\s)|((^@)([^\s]*)(\s|$))|(@([^\s]*)$)', " ", tweet)
            # hashtags are removed by countvectorizer
            filteredTweets.append(tweet)
            word_count += len(tweet.split())

    if len(filteredTweets) == 0:
        print("Not enough tweets for prediction.")
        return None

    # now we can process the tweets using embeddings.transformTextForTesting
    try:
        tweetEmbeddings = embeddings.transformTextForTesting(wordDictionary, 3, filteredTweets, "conc")
    except Exception:
        print("Not enough tweets for prediction.")
        return None

    # predict using the saved models; scores range from 0 to 5
    scores = {}
    for trait in ["O", "C", "E", "A", "N"]:
        model = models[trait]
        preds = model.predict(tweetEmbeddings)
        # truncate the mean prediction to at most 5 characters (e.g. "3.141")
        scores[trait] = float(str(np.mean(np.array(preds)))[0:5])

    # map the Big Five scores onto a Jungian (MBTI-style) type string
    jung = "E" if scores["E"] > 3 else "I"
    jung += "N" if scores["O"] > 3 else "S"
    jung += "F" if scores["A"] > 3 else "T"
    jung += "J" if scores["C"] > 3 else "P"
    scores["jung"] = jung

    # assemble the output
    result = {}
    result['screen_name'] = screen_name
    result['profile_img'] = profile_img
    result['personality'] = {
        "word_count": word_count,
        "processed_language": "en",
        'personality': [{
            'name': 'Openness',
            'percentile': scores['O'] / 5
        }, {
            'name': 'Conscientiousness',
            'percentile': scores['C'] / 5
        }, {
            'name': 'Extraversion',
            'percentile': scores['E'] / 5
        }, {
            'name': 'Agreeableness',
            'percentile': scores['A'] / 5
        }, {
            'name': 'Emotional range',
            'percentile': scores['N'] / 5
        }]
    }

    # save to json and upload to the s3 bucket
    with open(os.path.join(localPath, screen_name + '_twitPersonality.json'), 'w') as outfile:
        json.dump(result, outfile)
    s3.upload(localPath, awsPath, screen_name + '_twitPersonality.json')

    # delete the local copies
    try:
        os.remove(os.path.join(localPath, screen_name + '_tweets.txt'))
        os.remove(os.path.join(localPath, screen_name + '_twitPersonality.json'))
    except OSError:
        # already deleted!
        pass

    print(s3.generate_downloads(awsPath, screen_name + '_twitPersonality.json'))

    return result
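# A hypothetical invocation of calc_tweet_personality (the session ID, screen
# name, and image URL below are placeholders). The returned dict nests the five
# trait percentiles, each the 0-5 model score divided by 5:
#
#   result = calc_tweet_personality('session123', 'some_user', 'https://example.com/avatar.jpg')
#   for trait in result['personality']['personality']:
#       print(trait['name'], trait['percentile'])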
parsed, unknown = parser.parse_known_args()
# accept any extra "--key value" arguments without failing
for arg in unknown:
    if arg.startswith("--"):
        parser.add_argument(arg, required=False)
params = vars(parser.parse_args())

awsPath = os.path.join(params['sessionID'], params['screen_name'])
localPath = os.path.join('/tmp', params['sessionID'], params['screen_name'])
if not os.path.exists(localPath):
    os.makedirs(localPath)
screen_name = params['screen_name']

try:
    s3.downloadToDisk(screen_name + '_tweets.txt', localPath, awsPath)
except Exception:
    raise ValueError('Cannot find the timeline in the remote storage!')

# calculate brand personality
model = MultiLabelClassificationModel('roberta',
                                      'checkpoint-17315-epoch-5',
                                      num_labels=5,
                                      args={
                                          "reprocess_input_data": True,
                                          'use_cached_eval_features': False
                                      },
                                      use_cuda=False)
df = pd.read_csv(os.path.join(localPath, screen_name + '_tweets.txt'))
new_df = multiple_sentences(df, model)
fname_sentences = screen_name + '_utku_personality_sentences.csv'