def process_query_results(query_results, base_map):
    in_shape_ids = []
    incoming_flow = {}
    out_shape_ids = []
    outgoing_flow = {}
    for itinerary in query_results:
        origin_id = Utils.convert_id(itinerary[0])
        destination_id = Utils.convert_id(itinerary[1])
        weight = itinerary[2]
        origin_ids = []
        destination_ids = []
        # we look only at shapes that are to be rendered
        # (both ends must be in the shape dict, otherwise the lookups below would fail)
        if origin_id in base_map.shape_dict and destination_id in base_map.shape_dict:
            shape_origin = base_map.shape_dict[origin_id]
            origin_ids.append(origin_id)
            shape_destination = base_map.shape_dict[destination_id]
            destination_ids.append(destination_id)
        # We build a dictionary of outgoing traffic
        for origin_id in origin_ids:
            if origin_id not in out_shape_ids:
                out_shape_ids.append(origin_id)
                outgoing_flow[shape_origin] = []
            outgoing_flow[shape_origin].append((shape_destination, weight))
        # We build a dictionary of incoming traffic
        for destination_id in destination_ids:
            if destination_id not in in_shape_ids:
                in_shape_ids.append(destination_id)
                incoming_flow[shape_destination] = []
            incoming_flow[shape_destination].append((shape_origin, weight))
    return outgoing_flow, incoming_flow

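# A minimal, self-contained sketch (not part of the project code) of the grouping the function
# above performs: (origin, destination, weight) itineraries folded into per-origin and
# per-destination dictionaries, here with dict.setdefault instead of the id lists used above.
# The toy tuples are made up for illustration.
toy_itineraries = [('A', 'B', 5), ('A', 'C', 2), ('C', 'B', 7)]
outgoing, incoming = {}, {}
for origin, destination, weight in toy_itineraries:
    outgoing.setdefault(origin, []).append((destination, weight))
    incoming.setdefault(destination, []).append((origin, weight))
print(outgoing)  # {'A': [('B', 5), ('C', 2)], 'C': [('B', 7)]}
print(incoming)  # {'B': [('A', 5), ('C', 7)], 'C': [('A', 2)]}
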
def get_shape_coords(self):
    shape_zone = self.shapefile.shape(self.shape_id)
    points = [(i[0], i[1]) for i in shape_zone.points]
    x_center, y_center = Utils.calculate_centroid(points)
    center = (x_center, y_center)
    max_bound, min_bound = Utils.calculate_boundaries(points)
    return (points, center, max_bound, min_bound)

def find_max_coords(self):
    all_max_bound = []
    all_min_bound = []
    shape_dict = self.shape_dict
    for zone_id in shape_dict:
        zone_shape = shape_dict[zone_id]
        max_bound_zone = zone_shape.max_bound
        min_bound_zone = zone_shape.min_bound
        all_max_bound.append(max_bound_zone)
        all_min_bound.append(min_bound_zone)
    map_max_bound, unused_max = Utils.calculate_boundaries(all_max_bound)
    unused_min, map_min_bound = Utils.calculate_boundaries(all_min_bound)
    return (map_max_bound, map_min_bound)

def get_user_articles(self, user_id):
    '''
    INPUT:
    user_id - (int) a user id
    user_item - (pandas dataframe) matrix of users by articles:
                1's when a user has interacted with an article, 0 otherwise

    OUTPUT:
    article_ids - (list) a list of the article ids seen by the user
    article_names - (list) a list of article names associated with the list of article ids
                    (this is identified by the doc_full_name column in df_content)

    Description:
    Provides a list of the article_ids and article titles that have been seen by a user
    '''
    user_row = np.where(self.user_item.index == user_id)[0][0]
    user_articles = np.where(self.user_item.iloc[user_row] == 1)[0]
    article_ids = []
    for article in user_articles:
        article_id = self.user_item.iloc[:, article].name
        article_ids.append(str(article_id))  # to match the expected str type as output
    article_names = Utils.get_article_names(article_ids, self.interactions_df, 'title')
    return article_ids, article_names  # return the ids and names

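# Illustrative, self-contained sketch (not from the original module) of the lookup that
# get_user_articles relies on: locate the user's row in a user-item matrix, then collect the
# column names (article ids) where the value is 1. The toy DataFrame and ids are assumptions.
import numpy as np
import pandas as pd

toy_user_item = pd.DataFrame([[1, 0, 1], [0, 1, 0]],
                             index=[10, 20],                 # user ids
                             columns=[100.0, 101.0, 102.0])  # article ids
user_row = np.where(toy_user_item.index == 10)[0][0]
seen_cols = np.where(toy_user_item.iloc[user_row] == 1)[0]
seen_article_ids = [str(toy_user_item.iloc[:, c].name) for c in seen_cols]
print(seen_article_ids)  # ['100.0', '102.0']
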
def project_shape_coords(self, projection):
    shape_zone = self.shapefile.shape(self.shape_id)
    points = [projection.apply_projection([i[0], i[1]]) for i in shape_zone.points]
    points = [projection.apply_translation([i[0], i[1]]) for i in points]
    self.points = points
    x_center, y_center = Utils.calculate_centroid(points)
    self.center = (x_center, y_center)
    max_bound, min_bound = Utils.calculate_boundaries(points)
    self.max_bound = max_bound
    self.min_bound = min_bound

def make_user_user_recommendations(self, user_id, num_recommendations=10):
    '''
    INPUT:
    user_id - (int) a user id
    num_recommendations - (int) the number of recommendations you want for the user

    OUTPUT:
    recommended_articles - (list) a list of recommendations for the user, by article title

    Description:
    Loops through the users based on closeness to the input user_id.
    For each user, finds articles that user has seen but the input user hasn't,
    and provides them as recs. Does this until num_recommendations are found.

    Notes:
    * Choose the users that have the most total article interactions
      before choosing those with fewer article interactions.
    * Choose the articles with the most total interactions
      before choosing those with fewer total interactions.
    '''
    recs = []
    neighbors_df = self.get_top_sorted_users(user_id)
    user_articles_id, user_articles_names = self.get_user_articles(user_id)
    for neighbor in neighbors_df.index:
        neighbor_articles_id, neighbor_articles_names = self.get_user_articles(neighbor)
        sorted_neighbor_article_ids = Utils.get_top_articles_df(neighbor_articles_id, self.interactions_df)
        sorted_neighbor_article_ids = sorted_neighbor_article_ids.index.values
        article_not_read = np.setdiff1d(sorted_neighbor_article_ids, user_articles_id, assume_unique=True)
        article_not_read = [str(i) for i in article_not_read]
        recs = np.unique(np.concatenate([article_not_read, recs], axis=0))
        if len(recs) >= num_recommendations:
            break
    if len(recs) >= num_recommendations:
        recs = recs[:num_recommendations]
    recommended_articles = Utils.get_article_names(recs, self.interactions_df, 'title')
    return recommended_articles

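# Illustrative sketch (not project code) of the accumulation step used in the loop above:
# a neighbor's articles minus the ones the target user already read, merged into the running
# recommendation list with np.unique. All ids below are made up.
import numpy as np

user_articles_id = ['100.0', '102.0']
neighbor_articles_id = np.array(['100.0', '103.0', '104.0'])
recs = np.array(['105.0'])

article_not_read = np.setdiff1d(neighbor_articles_id, user_articles_id, assume_unique=True)
recs = np.unique(np.concatenate([article_not_read, recs], axis=0))
print(recs)  # ['103.0' '104.0' '105.0']
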
def make_SVD_recommendations(self, user_id, num_recommendations=10):
    # reconstruct the predicted user-item matrix from the SVD factors
    preds = np.around(np.dot(np.dot(self.u_matrix, self.s_matrix), self.vt_matrix))
    # keep only the row of the requested user
    user_row = np.where(self.user_item.index == user_id)[0][0]
    user_preds = preds[user_row]
    # pick the articles with the highest predicted interactions for that user
    articles_idx = user_preds.argsort()[-num_recommendations:][::-1]
    rec_ids = self.user_item.columns[articles_idx]
    recommended_articles = Utils.get_article_names(rec_ids, self.interactions_df, 'title')
    recommended_articles = recommended_articles[:num_recommendations]
    return recommended_articles

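# make_SVD_recommendations assumes self.u_matrix, self.s_matrix and self.vt_matrix already
# exist. A minimal sketch, under the assumption that they come from numpy's SVD of a dense
# user-item matrix truncated to k latent features; the toy matrix and k are illustrative only.
import numpy as np

user_item_values = np.array([[1., 0., 1.],
                             [0., 1., 0.],
                             [1., 1., 0.]])
u, s, vt = np.linalg.svd(user_item_values)
k = 2  # number of latent features kept
u_matrix, s_matrix, vt_matrix = u[:, :k], np.diag(s[:k]), vt[:k, :]
preds = np.around(np.dot(np.dot(u_matrix, s_matrix), vt_matrix))
print(preds.shape)  # (3, 3) reconstructed user-item predictions
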
def process_query_results(query_results_dict, map_item):
    processed_query_results_dict = {}
    # we find the min and max passengers for the whole year
    min_passenger = 999999999
    max_passenger = 0
    for query_date in query_results_dict:
        temp_min, temp_max = Utils.compute_min_max_passengers(query_results_dict[query_date], 2)
        if temp_min < min_passenger:
            min_passenger = temp_min
        if temp_max > max_passenger:
            max_passenger = temp_max
    # we transform the query_results_dict to use instances of the PointOnMap class
    for query_date in query_results_dict:
        query_result = query_results_dict[query_date]
        processed_query_results_dict[query_date] = []
        for itinerary in query_result:
            processed_itinerary = []
            zone_id_origin = Utils.convert_id(itinerary[0])
            zone_id_destination = Utils.convert_id(itinerary[1])
            if zone_id_origin == zone_id_destination:
                color = (141, 91, 67)
            else:
                color = (135, 162, 34)
            weight = compute_weight(map_item[0], itinerary[2], max_passenger)
            shape_origin = map_item[1].shape_dict[zone_id_origin]
            coords = shape_origin.center
            point_to_render = classfile.PointOnMap(coords, weight, color)
            processed_itinerary.append(point_to_render)
            shape_dest = map_item[1].shape_dict[zone_id_destination]
            target_coords = shape_dest.center
            processed_itinerary.append(target_coords)
            processed_itinerary.append(weight)
            processed_query_results_dict[query_date].append(processed_itinerary)
    return processed_query_results_dict, min_passenger, max_passenger

def render_single_map(flow_dict, flow_dir, base_map, file_name, zone_shape):
    map_rendered = base_map.map_file.copy()
    zone_name = find_names(zone_shape, base_map)
    zone_id = Utils.convert_id(zone_shape.shape_id, inverse=True)
    map_title = '{}_{}_{}_{}_{}'.format(file_name[0], zone_id, zone_name, flow_dir, file_name[1])
    trips_list = flow_dict[zone_shape]
    min_passenger, max_passenger = Utils.compute_min_max_passengers(trips_list, 1)
    colors = []
    for linked_zone in trips_list:
        shape_to_color = linked_zone[0]
        if shape_to_color.shape_id != zone_shape.shape_id:
            weight = linked_zone[1]
            render_color = compute_color(weight, min_passenger, max_passenger)
            shape_to_color.color_fill = render_color
            if render_color not in colors:
                colors.append(render_color)
            shape_to_color.fill_in_shape(map_rendered)
            # we draw again the boundaries of the shape after filling it in
            pts = np.array(shape_to_color.points, np.int32)
            cv2.polylines(map_rendered, [pts], True, (255, 255, 255), 1, cv2.LINE_AA)
    # outline the focused shape
    zone_shape.color_line = [95, 240, 255]
    zone_shape.line_thick = 3
    pts = np.array(zone_shape.points, np.int32)
    cv2.polylines(map_rendered, [pts], True, zone_shape.color_line,
                  zone_shape.line_thick, cv2.LINE_AA)
    # display the legend
    display_specific_text(map_rendered, zone_id, zone_name, flow_dir,
                          min_passenger, max_passenger, colors)
    # save the image
    cv2.imwrite('{}.png'.format(map_title), map_rendered)

def create_test_and_train_user_item(self):
    '''
    INPUT:
    self.interactions_df - the full interactions dataframe; the first 70% of its rows
                           are used for training and the last 30% for testing

    OUTPUT:
    user_item_train - a user-item matrix of the training dataframe
                      (unique users for each row and unique articles for each column)
    user_item_test - a user-item matrix of the testing dataframe
                     (unique users for each row and unique articles for each column)
    '''
    num_interactions = len(self.interactions_df)
    len_train = int(70 * num_interactions / 100)  # 70% of the df for train
    len_test = num_interactions - len_train  # 30% of the df for test
    df_train = self.interactions_df.head(len_train)
    df_test = self.interactions_df.tail(len_test)
    # we reuse the create_user_item_matrix we defined earlier
    user_item_train = Utils.create_user_item_matrix(df_train)
    user_item_test = Utils.create_user_item_matrix(df_test)
    return (user_item_train, user_item_test)

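# Self-contained sketch (illustrative only) of the 70/30 head/tail split performed above,
# applied to a made-up interactions dataframe.
import pandas as pd

toy_interactions = pd.DataFrame({'user_id': range(10), 'article_id': range(10)})
num_interactions = len(toy_interactions)
len_train = int(70 * num_interactions / 100)
df_train = toy_interactions.head(len_train)
df_test = toy_interactions.tail(num_interactions - len_train)
print(len(df_train), len(df_test))  # 7 3
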
def NLP_processing(self, text):
    # remove NaN from text
    text = Utils.remove_NaN(text, 0)
    # initialize count vectorizer object
    vect = CountVectorizer(lowercase=False, tokenizer=self.tokenize)
    # get counts of each token (word) in text data
    X = vect.fit_transform(text)
    # initialize tf-idf transformer object
    transformer = TfidfTransformer(smooth_idf=False)
    # use counts from count vectorizer results to compute tf-idf values
    tfidf = transformer.fit_transform(X)
    return tfidf

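# Self-contained sketch (not the original pipeline) of the same two-step tf-idf approach:
# CountVectorizer for raw token counts, then TfidfTransformer on top of those counts.
# The toy corpus and the default tokenizer are assumptions for illustration.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

toy_corpus = ['data science for everyone', 'science of data pipelines']
vect = CountVectorizer(lowercase=False)
counts = vect.fit_transform(toy_corpus)
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(counts)
print(tfidf.shape)  # (2 documents, number of distinct tokens)
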
def make_content_user_recommendations(self, _id, num_recommendations=10):
    '''
    INPUT:
    _id, the id of the user we want recommended articles for
    self.content_similarity_matrix, the similarity matrix of the articles,
        by default cosine matrix computed separately
    self.interactions_df, the dataframe with the interactions of users with articles
    self.article_content_df - the df containing details about the articles
    num_recommendations, the number of recommendations expected as an output, by default 10

    OUTPUT:
    recommended_articles, a list of recommended articles, given by name
    '''
    # get the articles a user read
    user_articles_id, user_articles_names = self.get_user_articles(_id)
    # filter out the articles that are not in the df of article details
    user_articles_id = [float(i) for i in user_articles_id]
    user_articles = self.article_content_df[
        self.article_content_df['article_id'].isin(user_articles_id)]['article_id'].values
    # sort the articles_id per number of interactions
    user_article_inter_dict = {}
    for article in user_articles:
        interact = len(self.interactions_df[(self.interactions_df['user_id'] == _id)
                                            & (self.interactions_df['article_id'] == article)])
        article_title = self.interactions_df[
            self.interactions_df['article_id'] == article]['title'].values[0]
        user_article_inter_dict[article] = {'num_interactions': interact, 'title': article_title}
    top_user_articles_df = pd.DataFrame.from_dict(user_article_inter_dict, orient='index')
    top_user_articles_df = top_user_articles_df.sort_values(by='num_interactions', ascending=False)
    # find similar articles in order
    recommended_articles = []
    for article in top_user_articles_df.index:
        articles_sim = self.find_similar_articles(article)
        unread_articles = np.setdiff1d(articles_sim, top_user_articles_df.index, assume_unique=True)
        for unread_article in unread_articles:
            if unread_article not in recommended_articles:
                recommended_articles.append(unread_article)
        if len(recommended_articles) > num_recommendations:
            break
    recommended_articles = recommended_articles[:num_recommendations]
    recommended_articles = Utils.get_article_names(recommended_articles,
                                                   self.article_content_df, 'doc_full_name')
    return recommended_articles

def make_content_article_recommendations(self, _id, num_recommendations=10):
    '''
    INPUT:
    _id, the id of the article we want similar articles for
    self.content_similarity_matrix, the similarity matrix of the articles,
        by default cosine matrix computed separately
    self.interactions_df, the dataframe with the interactions of users with articles
    self.article_content_df - the df containing details about the articles
    num_recommendations, the number of recommendations expected as an output, by default 10

    OUTPUT:
    recommended_articles, a list of similar articles, given by name
    '''
    recommended_articles = self.find_similar_articles(_id)
    recommended_articles = recommended_articles[:num_recommendations]
    recommended_articles = Utils.get_article_names(recommended_articles,
                                                   self.article_content_df, 'doc_full_name')
    return recommended_articles

def __init__(self, interactions_df):
    self.interactions_df = interactions_df
    self.article_ids = self.interactions_df['article_id'].unique()
    self.top_articles_df = Utils.get_top_articles_df(self.article_ids, self.interactions_df)

def process_query_arg(render_animation_dict):
    period = render_animation_dict['period']
    query_dict = render_animation_dict['query_dict']
    database = render_animation_dict['database']
    specific_weekdays = query_dict['specific_weekdays']
    date = query_dict['date']
    aggregate_period = render_animation_dict['aggregate_period']
    weekdays = render_animation_dict['weekdays']
    query_results_dict = {}

    if aggregate_period is False and query_dict['date'] == 'loop_through_period':
        # in this case we want the result for each day of the period provided
        # if we have the flag loop_through_period in the query dict, it means the period
        # set for the query is multiple dates
        daterange = pd.date_range(period[0], period[1])
        # we run queries for each date in the daterange specified
        for single_date in daterange:
            date = pd.to_datetime(single_date)
            if specific_weekdays == 'on_specific_weekdays':
                # we check if the date of the daterange matches the weekday(s) we target
                if date.dayofweek in weekdays:
                    single_date = date.date().strftime('%Y-%m-%d')
                    query_dict['date'] = single_date
                    query = prepare_sql_query(query_dict)
                    query_results = Utils.make_sql_query(query, database)
                    query_results_dict[query_dict['date']] = query_results
                else:
                    # if a date in the range is not among the weekdays we want, we skip it
                    continue
            else:
                single_date = date.date().strftime('%Y-%m-%d')
                query_dict['date'] = single_date
                query = prepare_sql_query(query_dict)
                query_results = Utils.make_sql_query(query, database)
                query_results_dict[query_dict['date']] = query_results

    elif aggregate_period is True and query_dict['date'] == 'loop_through_period':
        # in this case, we want to aggregate the results (sum) per week
        daterange = pd.date_range(period[0], period[1])
        start_date = pd.to_datetime(period[0])
        end_date = pd.to_datetime(period[1])
        # let's build a list of all intervals we will want to aggregate the data for
        all_aggr_init = []
        start = start_date
        end = end_date
        # we add one list of dates per week to the list of all intervals
        i = 0
        for date in daterange:
            # we handle separately the first date of the period
            if i == 0:
                curr_week = [start.date().strftime('%Y-%m-%d')]
            if date != start_date and date != end_date:
                start_week_number = start.isocalendar()[1]
                date_week_number = date.isocalendar()[1]
                if date_week_number == start_week_number:
                    curr_week.append(date.date().strftime('%Y-%m-%d'))
                    i += 1
                else:
                    start = date
                    all_aggr_init.append(curr_week)
                    i = 0
        # we handle separately the last date of the period
        if curr_week not in all_aggr_init:
            curr_week.append(end_date.date().strftime('%Y-%m-%d'))
            all_aggr_init.append(curr_week)
        else:
            curr_week = [end_date.date().strftime('%Y-%m-%d')]
            all_aggr_init.append(curr_week)
        # now we keep only the first and last item of each interval
        all_aggr = []
        for interval in all_aggr_init:
            interval_new = [interval[0], interval[-1]]
            all_aggr.append(interval_new)
        # we now query for each interval
        for interval in all_aggr:
            query_dict['date'] = interval
            query = prepare_sql_query(query_dict)
            query_results = Utils.make_sql_query(query, database)
            query_results_dict[query_dict['date'][0]] = query_results

    else:
        # we have a single date to render for, so nothing to aggregate!
        # just in case we check that there is no mismatch between the single day and the
        # argument containing specific weekdays restrictions if any
        if specific_weekdays == 'on_specific_weekdays':
            # we check if the date matches the weekday(s) we target
            date = pd.to_datetime(query_dict['date'])
            if date.dayofweek in weekdays:
                query = prepare_sql_query(query_dict)
                query_results = Utils.make_sql_query(query, database)
                query_results_dict[query_dict['date']] = query_results
            else:
                print("The date selected does not match the weekday(s) indicated. "
                      "Please select either an interval ('time_granularity': 'period') "
                      "or a valid weekday(s) list.")
        else:
            query = prepare_sql_query(query_dict)
            query_results = Utils.make_sql_query(query, database)
            query_results_dict[query_dict['date']] = query_results

    return query_results_dict

import os
import argparse

from utility import Utils

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Downloading ncodes for imagenet dataset')
    dir_path = os.path.realpath(os.path.dirname(__file__))
    path_to_save = os.path.realpath(os.path.dirname(__file__))
    optional = parser.add_argument_group('optional arguments')
    optional.add_argument('-nc_dir', '--ncodes_dir', default=path_to_save,
                          help='path to saved ncodes data')
    optional.add_argument('-url_data', '--with_url_data', default=False,
                          help='flag whether to download url data')
    args = parser.parse_args()

    util_obj = Utils()
    # download ncodes data csv
    util_obj.download_ncodes_image_net(path_to_save=args.ncodes_dir)
    # download imagenet urls csv only when requested
    args.with_url_data = eval(str(args.with_url_data))
    if args.with_url_data:
        util_obj.download_image_net_urls(path_to_save=args.ncodes_dir)

import pandas as pd

from classfile import RecommendationEngine
from utility import Utils

# import and clean data sources
interactions_df = pd.read_csv('data/user-item-interactions.csv')
article_content_df = pd.read_csv('data/articles_community.csv')
del interactions_df['Unnamed: 0']
del article_content_df['Unnamed: 0']
email_encoded = Utils.email_mapper(interactions_df['email'])
del interactions_df['email']
interactions_df['user_id'] = email_encoded

# create a matrix of user-article interactions
user_item = Utils.create_user_item_matrix(interactions_df)

# create an instance of the Recommendation Engine that can be used for multiple situations
rec_engine = RecommendationEngine(interactions_df, article_content_df, user_item)

# test the code for a few situations (expected returned output of 10 article titles)
# recommendations for an article
_id_type = 'article'
_id = 10
recommended_articles = rec_engine.make_recommendations(_id, _id_type)
print('Test article')
print(
    'The following articles are recommended based on your query for {} id {}:'.
    format(_id_type, _id))

    'weekdays': [],
    'aggregate_period': False
}
query_dict = build_query_dict(render_heat_map_dict)
if query_dict['date'] == 'loop_through_period':
    # if we have the flag loop_through_period in the query dict, it means the period
    # set for the query is multiple dates, therefore we want the query to return an
    # average on a time interval, and not on a single date
    period = render_heat_map_dict['period']
    daterange = pd.date_range(period[0], period[1])
    query_dict['date'] = period
query = prepare_sql_query(query_dict)
query_results = Utils.make_sql_query(query, database)

for single_map, base_map, projection in base_maps:
    # we process the query results
    outgoing_flow, incoming_flow = process_query_results(query_results, base_map)
    print('Rendering {}...'.format(single_map))
    if single_map == 'total':
        if time_granularity == 'weekdays_vs_weekends':
            file_name = ['NYC', '2018_diff_WD_WE']
        else:
            file_name = ['NYC', '2018']
    else:
        if time_granularity == 'weekdays_vs_weekends':
            file_name = ['{}'.format(single_map), '2018_diff_WD_WE']
        else:

    default=True,
    help='download in parallel or sequentially')
optional.add_argument(
    '-v', '--verbose',
    default=True,
    help='bool representing whether to display ncode level download stats')
optional.add_argument(
    '-b', '--batch_size',
    default=None,
    help='number of images to download in parallel, if parallel is TRUE')
args = parser.parse_args()

args.parallel = eval(str(args.parallel))
args.with_annotation = eval(str(args.with_annotation))
args.verbose = eval(str(args.verbose))
args.batch_size = eval(str(args.batch_size))

# starting the download process
util_obj = Utils()
util_obj.subset_ncodes_to_download(ncodes_data_path)
util_obj.download_partial_imagenet_dataset(
    path_to_url_dataset=args.url_data,
    path_to_annotations=args.annotations_dir,
    path_to_save_dataset=args.save_dir,
    only_annotations=args.with_annotation,
    parallel=args.parallel,
    verbose=args.verbose,
    batch_size_=args.batch_size)

class User:
    username, fname, lname = '', '', ''
    email = ''

    def __init__(self, username, fname, lname, *args, **kwargs):
        self.username = username
        self.fname, self.lname = fname, lname

    @classmethod
    def setEmail(cls, email):
        cls.email = f'{email}@g.net'
        return cls.email

    def setFullName(self):
        return f'{self.fname} {self.lname}'

    def __str__(self):
        return f'Hi {self.username}\nEmail:{self.setEmail(self.username)}\nFull Name:{self.setFullName()}'


usr = User(username='******', fname='John', lname='@Doe')
setmail = usr.setEmail(email='John')

assert Utils.remove_punctuation(usr.setEmail('john')) == 'johngnet'
assert Utils.remove_dollar_white_space(",12") == 12.0
assert Utils.remove_dollar_white_space("$123") == 123.0
assert Utils.remove_comma_and_spaces(" , ,1234") == 1234.0
my_str = "Hello!!!, he said ---and went."
assert Utils.remove_punctuation(my_str) == "Hello he said and went"
print('Assertion Test Complete')