def find_similar_locations_for_given_model(self, location_id, k, model):
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    location_key = loc_id_key_map[location_id]
    # concatenated, normalised frame over all locations for the given model
    primary_df = self.create_concatenated_and_normalised_data_frame_for_model(model, True)
    most_similar_locations = []
    primary_loc_data_frame = primary_df[primary_df["location"] == location_id]
    for id in loc_id_key_map:
        if id != location_id:
            arg_loc_key = loc_id_key_map[id]
            arg_data_frame = primary_df[primary_df["location"] == id]
            loc_similarity = self.get_distance_measure_and_similarity_for_data_frames(
                arg_loc_key, primary_loc_data_frame, arg_data_frame)
            # keep only the k closest locations by weighted distance
            most_similar_locations_len = len(most_similar_locations)
            if most_similar_locations_len < k:
                most_similar_locations.append(loc_similarity)
                most_similar_locations = sorted(most_similar_locations, key=lambda location: location.weighted_distance)
            elif most_similar_locations[k - 1].weighted_distance > loc_similarity.weighted_distance:
                most_similar_locations = most_similar_locations[:k - 1]
                most_similar_locations.append(loc_similarity)
                most_similar_locations = sorted(most_similar_locations, key=lambda location: location.weighted_distance)
    print("Input location is {0}".format(location_key))
    self.display_result_for_model(most_similar_locations)
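# The sort-and-slice loop above maintains a running top-k of locations by
# weighted_distance. A minimal equivalent sketch using heapq.nsmallest; the
# LocationSimilarity namedtuple below is hypothetical -- the real result class
# only needs to expose a comparable weighted_distance attribute, as the method
# above assumes.
import heapq
from collections import namedtuple

LocationSimilarity = namedtuple("LocationSimilarity", ["location_key", "weighted_distance"])

def top_k_similar(candidates, k):
    # candidates: iterable of objects exposing a weighted_distance attribute
    return heapq.nsmallest(k, candidates, key=lambda c: c.weighted_distance)

# Usage sketch (dummy values):
# results = [LocationSimilarity("loc_a", 0.42), LocationSimilarity("loc_b", 0.17)]
# print(top_k_similar(results, 1))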
def reduce_dimensions_given_Location_Model(self, input_param, model, entity_id, k):
    """
    Gives 5 related locations for a given model and location id after performing
    dimensionality reduction to k latent semantics

    Parameters
    ----------
    input_param : int
        Reduction algorithm given by the user: 1. PCA 2. SVD 3. LDA
    model :
        model given by user
    k : int
        Number of latent semantics to which the matrix has to be reduced (given by user)
    entity_id :
        Location id given by the user

    Returns
    -------
    reduced_dimensions, post_projection_vectors, loc_id_key_map
        Gives 5 related locations for a given model and location id
    """
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    vector_space = self.create_concatenated_and_normalized_data_frame_for_a_location_model(entity_id, input_param, model)
    (reduced_dimensions, VT) = self.reduction_method[input_param](vector_space, k)
    post_projection_vectors = self.project_data_onto_new_dimensions(entity_id, len(loc_id_key_map), VT, 4, model, input_param)
    return reduced_dimensions, post_projection_vectors, loc_id_key_map
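# self.reduction_method maps the user's choice (1 = PCA, 2 = SVD, 3 = LDA) to a
# reduction routine returning (reduced_data, VT), which is the contract the
# callers above rely on. The project's own implementations are not shown in this
# snippet; the scikit-learn based mapping below is only an assumed sketch of
# that contract.
from sklearn.decomposition import PCA, TruncatedSVD, LatentDirichletAllocation

def _pca(vector_space, k):
    model = PCA(n_components=k)
    reduced = model.fit_transform(vector_space)
    return reduced, model.components_            # components_ has shape (k, n_features)

def _svd(vector_space, k):
    model = TruncatedSVD(n_components=k)
    reduced = model.fit_transform(vector_space)
    return reduced, model.components_

def _lda(vector_space, k):
    model = LatentDirichletAllocation(n_components=k)
    reduced = model.fit_transform(vector_space)  # LDA expects non-negative input
    return reduced, model.components_

reduction_method = {1: _pca, 2: _svd, 3: _lda}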
def create_concatenated_and_normalised_data_frame_for_model(self, model, input_option):
    """
    Concatenate data frames across all locations for a given model

    Parameters
    ----------
    model :
        model given by user
    input_option : int
        Type of reduction algorithm: 1. PCA 2. SVD 3. LDA

    Returns
    -------
    primary_df :
        For PCA and LDA, the normalised data frame; for SVD, the raw data frame
        with all locations for the given model
    """
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    primary_df = None
    for id in loc_id_key_map:
        loc_key = loc_id_key_map[id]
        file_name = self.get_file_name_from_input(loc_key, model)
        if primary_df is None:
            primary_df = self.get_data_frame(file_name)
            primary_df.insert(1, "locationId", value=id)
        else:
            data_frame_to_add = self.get_data_frame(file_name)
            data_frame_to_add.insert(1, "locationId", value=id)
            primary_df = pd.concat([primary_df, data_frame_to_add], axis=0, sort=False)
    # if not SVD, then normalise
    return primary_df if input_option == 2 else self.normalise_methodformodel[input_option](primary_df)
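# self.normalise_methodformodel is not shown in this snippet. A common choice for
# these visual-descriptor frames is per-column min-max scaling of the feature
# columns, leaving the image id (column 0) and the inserted "locationId"
# (column 1) untouched. This is an assumed sketch, not the project's actual
# normalisation routine.
import pandas as pd

def min_max_normalise(primary_df: pd.DataFrame) -> pd.DataFrame:
    df = primary_df.copy()
    feature_cols = df.columns[2:]                              # skip image id and locationId
    features = df[feature_cols]
    span = (features.max() - features.min()).replace(0, 1)     # avoid division by zero
    df[feature_cols] = (features - features.min()) / span
    return df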
def visualize_with_ids(self, image_id_loc):
    loc_id_key_map = DBUtils.create_location_id_key_map(self.database_ops)
    image_list = []
    for i in image_id_loc:
        location_key = loc_id_key_map[i['loc']]
        image_list.append(self.img_path + location_key + "/" + str(i['imageId']) + "." + self.format)
    image_viewer = ImageViewerMain()
    image_viewer.start_image_viewer(image_list)
def prepare_file_list(self, image_indexes, obj_index):
    loc_id_key_map = DBUtils.create_location_id_key_map(self.database_ops)
    file_list = []
    for image_index in image_indexes:
        image_tuple = obj_index.iloc[image_index]
        location_id = image_tuple["location"]
        location_key = loc_id_key_map[location_id]
        image_id = image_tuple[0]
        file_list.append(self.img_path + location_key + "/" + str(image_id) + "." + self.format)
    return file_list
def create_concatenated_and_normalized_data_frame_for_a_location(self, location_id, input_option, model=None):
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    location_key = loc_id_key_map[int(location_id)]
    primary_data_frames_by_model = pd.DataFrame()
    for model in self.get_visual_model_types():
        file_name = self.get_file_name_from_input(location_key, model.name)
        data_frame_to_add = self.get_data_frame(file_name)
        data_frame_to_add.drop(data_frame_to_add.columns[0], axis=1, inplace=True)
        primary_data_frames_by_model = pd.concat([primary_data_frames_by_model, data_frame_to_add],
                                                 ignore_index=True, axis=1, sort=False)
    # if not SVD, then normalise
    return primary_data_frames_by_model if input_option == 2 else self.normalise_method[input_option](primary_data_frames_by_model)
def create_concatenated_and_normalised_data_frame_for_model(self, model, normalise=False):
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    primary_df = None
    for id in loc_id_key_map:
        loc_key = loc_id_key_map[id]
        file_name = self.get_file_name_from_input(loc_key, model)
        if primary_df is None:
            primary_df = self.get_data_frame(file_name)
            primary_df.insert(1, "location", value=id)
        else:
            data_frame_to_add = self.get_data_frame(file_name)
            data_frame_to_add.insert(1, "location", value=id)
            primary_df = pd.concat([primary_df, data_frame_to_add], axis=0, ignore_index=True, sort=False)
    return primary_df if not normalise else self.normalise_data_frame(primary_df)
def process_desctxt_files(self):
    text_processor = TextFileProcessor(self._base_path)
    xml_processor = XmlFileProcessor(self._base_path)
    xml_processor.parse_xml(self._devset_topics, self.process_devset_topics_xml)
    queries = text_processor.process_text_file(
        self._textTermsPerUserFile, self.process_text_terms_per_user)
    queries += text_processor.process_text_file(
        self._textTermsPerImageFile, self.process_text_terms_per_image)
    queries += text_processor.process_text_file(
        self._textTermsPerPOIwFolderNamesFile, self.process_text_terms_per_POI,
        DBUtils.create_location_key_id_map(self._database_operations))
    self._database_operations.executeWriteQueries(queries)
def find_similar_locations_for_all_models(self, location_id, k):
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    location_key = loc_id_key_map[location_id]
    primary_data_frames_by_model = {}
    most_similar_locations = []
    models = self.get_visual_model_types()
    for model in models:
        file_name = self.get_file_name_from_input(location_key, model.name)
        primary_data_frames_by_model[model.name] = self.get_normalised_data_frame(file_name)
    for id in loc_id_key_map:
        arg_loc_key = loc_id_key_map[id]
        loc_similarity_for_models = []
        total_distance_for_all_models = 0
        if id != location_id:
            for model in models:
                primary_df = primary_data_frames_by_model[model.name]
                arg_data_frame = self.get_normalised_data_frame(self.get_file_name_from_input(arg_loc_key, model.name))
                loc_similarity_for_model = self.get_distance_measure_for_data_frames(arg_loc_key, primary_df, arg_data_frame)
                loc_similarity_for_model.model = model.name
                loc_similarity_for_models.append(loc_similarity_for_model)
                # normalise each model's contribution by its feature dimensionality
                similarity_contribution = loc_similarity_for_model.weighted_distance / model.dimensions
                total_distance_for_all_models += similarity_contribution
            # keep only the k closest locations by total distance
            most_similar_locations_count = len(most_similar_locations)
            if most_similar_locations_count < k:
                most_similar_locations.append(TotalLocationSimilarity(arg_loc_key, total_distance_for_all_models, loc_similarity_for_models))
                most_similar_locations = sorted(most_similar_locations, key=lambda location: location.distance)
            elif most_similar_locations[k - 1].distance > total_distance_for_all_models:
                most_similar_locations = most_similar_locations[:k - 1]
                most_similar_locations.append(TotalLocationSimilarity(arg_loc_key, total_distance_for_all_models, loc_similarity_for_models))
                most_similar_locations = sorted(most_similar_locations, key=lambda location: location.distance)
    print("Input location is {0}".format(location_key))
    self.display_result_for_all_models(most_similar_locations)
    return None
def create_concatenated_and_normalized_data_frame_for_a_location_model(self, location_id, input_option, model):
    """
    Get data frame for a given location for a given model

    Parameters
    ----------
    model :
        model given by user
    location_id : int
        Location id given by user

    Returns
    -------
    data_frame_to_add :
        Data frame for a given location for a given model
    """
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    location_key = loc_id_key_map[int(location_id)]
    file_name = self.get_file_name_from_input(location_key, model)
    data_frame_to_add = self.get_data_frame(file_name)
    data_frame_to_add.drop(data_frame_to_add.columns[0], axis=1, inplace=True)
    return data_frame_to_add if input_option == 2 else self.normalise_method[input_option](data_frame_to_add)
def find_similar_locations(self, location_id, model, k):
    get_terms_query = "select term, {1} from termsPerLocation where locationId = \"{0}\"".format(
        location_id, model)
    source_word_dict = {}
    get_terms_query_result = self._database_operations.executeSelectQuery(get_terms_query)
    conversion_func = self.get_conversion_func(model)
    for item in get_terms_query_result:
        source_word_dict[item[0]] = conversion_func(item[1])
    join_query_result = "select te.locationId,te.term,te.{0}, te1.locationId, te1.term, te1.{0} from " \
        "(select te2.locationId, te2.term, te2.{0} from termsPerLocation te2 where locationId <> {1}) te " \
        "LEFT JOIN (select locationId, term, {0} from termsPerLocation where locationId = {1}) te1 " \
        "on te1.term=te.term;".format(model, location_id)
    result = self._database_operations.executeSelectQuery(join_query_result)
    result = self.process_text_result_sets(result, k, source_word_dict,
                                           self.get_conversion_func(model))
    location_map = DBUtils.create_location_id_key_map(self._database_operations)
    self.display_location_result(result, location_map)
def reduce_dimensions_givenmodel(self, input_option, model, k, count):
    """
    Gives 5 related images and locations for a given model and image id after
    performing dimensionality reduction

    Parameters
    ----------
    input_option : int
        Reduction algorithm given by the user: 1. PCA 2. SVD 3. LDA
    model :
        model given by user
    k : int
        Number of latent semantics to which the matrix has to be reduced (given by user)
    count : int
        5, as given in task 3

    Returns
    -------
    Prints 5 related images and 5 related locations for a given model and image id
    """
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    input_method = int(input_option)
    count = int(count)
    vector_space = self.create_concatenated_and_normalised_data_frame_for_model(model, input_method)
    vector_space = vector_space.rename(columns={vector_space.columns[0]: "image"})
    # sort based on location and image
    vector_space = vector_space.sort_values(['locationId', 'image'], ascending=[True, True])
    vector_space.reset_index(drop=True, inplace=True)
    (latent_semantics_matrix, VT) = self.reduction_method[input_method](vector_space.iloc[:, 2:], k)
    latent_semantics = pd.DataFrame(latent_semantics_matrix)
    print("Latent semantics are")
    print(pd.DataFrame(VT))
    latent_semantics.reset_index(drop=True, inplace=True)
    reduced_space = pd.concat([vector_space.iloc[:, :2], latent_semantics], axis=1)
    print("Enter Image ID to search")
    image_id = int(input())
    (image_matrix, location_matrix) = DistanceUtils.find_similar_images_locations_for_given_model(
        image_id, k, model, reduced_space, count, loc_id_key_map)
    df_loc_id_key_map = pd.DataFrame(list(loc_id_key_map.items()), columns=['locationId', 'locationKey'])
    image_matrix = pd.DataFrame(pd.merge(image_matrix, df_loc_id_key_map, on='locationId', how='left'))
    location_matrix = pd.DataFrame(pd.merge(location_matrix, df_loc_id_key_map, on='locationId', how='left'))
    print("5 related Images are")
    print(image_matrix.loc[:, ['image', 'locationKey', 'dist']])
    print("5 related locations are")
    print(location_matrix.loc[:, ['locationKey', 'dist', 'locationId']])
def get_location_location_similarity_matrix_and_reduce(self, k):
    """
    Creates a location-location similarity matrix based on cosine similarity and reduces it

    Parameters
    ----------
    k : int
        number of dimensions to be reduced to

    Returns
    -------
    void
    """
    (vector_space, object_index_dict, term_index_dict) = self.get_vector_space(3, "TF_IDF", True)
    np_vector_space = np.array(vector_space)
    distance_matrix = distance.cdist(np_vector_space, np_vector_space, metric='cosine')
    # convert cosine distance to cosine similarity
    distance_matrix = 1 - distance_matrix
    (reduced_dimensions, projection_direction) = self.reduction_method[2](distance_matrix, k)
    location_index = DBUtils.create_location_id_key_map(self._database_operations)
    self.display_topics(projection_direction, location_index)
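# 1 - cdist(X, X, 'cosine') is exactly the pairwise cosine similarity matrix used
# above. A small sketch checking that against scikit-learn, assuming each row of
# the vector space is one location's TF_IDF vector (as get_vector_space(3,
# "TF_IDF", True) is used above); the data here is dummy.
import numpy as np
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity

X = np.random.rand(5, 20)                                   # 5 locations, 20 terms (dummy data)
sim_from_cdist = 1 - distance.cdist(X, X, metric='cosine')
sim_from_sklearn = cosine_similarity(X)
assert np.allclose(sim_from_cdist, sim_from_sklearn)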
import requests
from lxml import etree
import time
from dbUtils import DBUtils
from multiprocessing import Pool

# Request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36',
    'Connection': 'close'
}

# Create the database utility instance
db = DBUtils()

# List holding each page's data
# list_page_info = []

# Flag: whether this is the first crawl
flag_first_access = True

# Names of the basic attributes
list_basic_attribute_name = []


def get_sub_links(url):
    try:
        res = requests.get(url, headers=headers, timeout=2)
    except (
        requests.Timeout,
        requests.ConnectTimeout,
from pymongo import MongoClient
from telethon import TelegramClient
from quart import Quart
from dbUtils import DBUtils    # assumed local module path for the project's DBUtils
from gsheets import GSheets
from configparser import ConfigParser
from datetime import datetime, timedelta

config = ConfigParser()
config.read('conf.ini')

API_ID = config['CONF']['API_ID']
API_HASH = config['CONF']['API_HASH']
PHONE_NUMBER = config['CONF']['PHONE_NUMBER_IN_INTERNATIONAL_FORMAT']
BOT_TOKEN = config['CONF']['BOT_TOKEN']
DB_URL = config['CONF']['DB_URL']

mongoClient = MongoClient(DB_URL)
db = mongoClient.telegramDB
dbUtils = DBUtils(db)
sheets = GSheets(db)

# Telethon clients
client = TelegramClient(f'quart_{PHONE_NUMBER}', API_ID, API_HASH)
bot = TelegramClient('Bot', API_ID, API_HASH)
client.parse_mode = 'html'  # <- Render things nicely
bot.parse_mode = 'html'
botObject = None
clientObject = None
phone = None

# Quart app
app = Quart(__name__)
app.secret_key = 'CHANGE THIS TO SOMETHING SECRET'
logged_in = True
def get_shapes():
    return dumps(list(DBUtils().get_collection_obj("LOCAL", "shapes", "shape").find({})))
def reduce_dimensions(self, input_param, data_option, entity_id, k):
    loc_id_key_map = DBUtils.create_location_id_key_map(self._db_operations)
    vector_space = self.data_load_option[data_option](entity_id, input_param)
    (reduced_dimensions, VT) = self.reduction_method[input_param](vector_space, k)
    post_projection_vectors = self.project_data_onto_new_dimensions(entity_id, len(loc_id_key_map), VT, 5, None, input_param)  # model=None
    return reduced_dimensions, post_projection_vectors, loc_id_key_map
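# project_data_onto_new_dimensions is not shown in this snippet. If VT is the
# (k x n_features) component matrix returned by the reduction routines, new
# feature vectors are typically projected onto the same latent space with a
# matrix product against VT transposed. Assumed sketch only; for PCA the column
# means used during fitting would also need to be subtracted first.
import numpy as np

def project_onto_latent_space(feature_matrix: np.ndarray, VT: np.ndarray) -> np.ndarray:
    # feature_matrix: (n_objects x n_features), VT: (k x n_features)
    return feature_matrix @ VT.T                 # result: (n_objects x k)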
def get_shape_defination():
    shape = request.args.get("shape")
    if shape is not None:
        return dumps(DBUtils().get_collection_obj("LOCAL", "shapes", "shape").find_one({"id": shape}))
    else:
        return "shape is a compulsory param", 404
def analyse_data():
    db = DBUtils()
    # Connect to the database
    db.db_connect()
    # Load the selected fields from the database
    db_query_data = db.db_get_info({
        '_id': 0,
        '建筑面积': 1,
        '房屋总价': 1,
        '装修情况': 1,
        '行政区域': 1
    })
    db.db_close()
    # Convert the query result into a pandas DataFrame
    dataSet = pd.DataFrame(list(db_query_data))
    # print(dataSet)
    # Strip the unit suffix from the floor-area field
    dataSet['建筑面积'] = dataSet['建筑面积'].str[:-1]
    # Convert the DataFrame into a numpy array
    dataArr = np.array(dataSet)
    # District column
    x_district = dataArr[:, 2]
    # Split the data set by district
    data_address1 = dataArr[x_district == '鼓楼', :]
    # Decoration column
    x1_decoration = data_address1[:, 3]
    # Split the data set by decoration type
    dataset_address1_decoration = data_address1[x1_decoration == '毛坯', :]
    dataset_address2_decoration = data_address1[x1_decoration == '简装', :]
    dataset_address3_decoration = data_address1[x1_decoration == '精装', :]
    dataset_address4_decoration = data_address1[x1_decoration == '其他', :]
    # print(data_address1)
    # Area and price columns
    x1_area_d = dataset_address1_decoration[:, 0].astype('float')
    y1_price_d = dataset_address1_decoration[:, 1].astype('float')
    x2_area_d = dataset_address2_decoration[:, 0].astype('float')
    y2_price_d = dataset_address2_decoration[:, 1].astype('float')
    x3_area_d = dataset_address3_decoration[:, 0].astype('float')
    y3_price_d = dataset_address3_decoration[:, 1].astype('float')
    x4_area_d = dataset_address4_decoration[:, 0].astype('float')
    y4_price_d = dataset_address4_decoration[:, 1].astype('float')
    # Sort each subset by area
    x1_sort = np.sort(x1_area_d)
    index = np.argsort(x1_area_d)
    y1_sort = [y1_price_d[i] for i in index]
    x2_sort = np.sort(x2_area_d)
    index = np.argsort(x2_area_d)
    y2_sort = [y2_price_d[i] for i in index]
    x3_sort = np.sort(x3_area_d)
    index = np.argsort(x3_area_d)
    y3_sort = [y3_price_d[i] for i in index]
    x4_sort = np.sort(x4_area_d)
    index = np.argsort(x4_area_d)
    y4_sort = [y4_price_d[i] for i in index]
    # print(x1_sort)
    # Plotting
    ax1 = plt.subplot2grid(shape=(2, 2), loc=(0, 0))
    ax1.scatter(x1_sort, y1_sort, s=10)
    ax1.set_ylabel('总价(万元)')
    ax1.set_xlabel('面积(平方米)')
    ax1.set_title('鼓楼区二手房-毛坯-建筑面积与房屋总价格关系图')
    ax2 = plt.subplot2grid(shape=(2, 2), loc=(0, 1))
    ax2.scatter(x2_sort, y2_sort, s=10)
    ax2.set_ylabel('总价(万元)')
    ax2.set_xlabel('面积(平方米)')
    ax2.set_title('鼓楼区二手房-简装-建筑面积与房屋总价格关系图')
    ax3 = plt.subplot2grid(shape=(2, 2), loc=(1, 0))
    ax3.scatter(x3_sort, y3_sort, s=10)
    ax3.set_ylabel('总价(万元)')
    ax3.set_xlabel('面积(平方米)')
    ax3.set_title('鼓楼区二手房-精装-建筑面积与房屋总价格关系图')
    ax4 = plt.subplot2grid(shape=(2, 2), loc=(1, 1))
    ax4.scatter(x4_sort, y4_sort, s=10)
    ax4.set_ylabel('总价(万元)')
    ax4.set_xlabel('面积(平方米)')
    ax4.set_title('鼓楼区二手房-其他-建筑面积与房屋总价格关系图')
    plt.rcParams['font.sans-serif'] = ['SimHei']   # display Chinese labels correctly
    plt.rcParams['axes.unicode_minus'] = False     # display the minus sign correctly
    plt.subplots_adjust(hspace=0.6, wspace=0.5)
    plt.show()
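# The manual numpy splits by decoration type above can also be expressed with a
# pandas groupby over the same columns loaded from the database. Sketch only,
# assuming the dataSet frame built above (with '建筑面积' already stripped of its
# unit suffix and castable to float); it returns one (area, price) frame per
# decoration type rather than plotting.
import pandas as pd

def area_price_by_decoration(dataSet: pd.DataFrame, district: str = '鼓楼'):
    df = dataSet[dataSet['行政区域'] == district].copy()
    df['建筑面积'] = df['建筑面积'].astype('float')
    df['房屋总价'] = df['房屋总价'].astype('float')
    # one (area, price) frame per decoration type, sorted by area
    return {
        decoration: group.sort_values('建筑面积')[['建筑面积', '房屋总价']]
        for decoration, group in df.groupby('装修情况')
    }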