def format_dataframe(df: DataFrame) -> Tuple[DataFrame, DataFrame, DataFrame]:
    """Create unique descriptor vector ids and derive the T5 mapping table,
    the T6 table and the descriptor-based duplicates.

    Args:
        df (DataFrame): dataframe with descriptor features and values
            ("fp_feat", "fp_val", "fold_id"), as well as the input
            compound id.

    Returns:
        Tuple[DataFrame, DataFrame, DataFrame]: T5 mapping table, T6 table,
        descriptor-based duplicates.
    """
    # Identify duplicated fingerprints by assigning one descriptor vector id
    # per unique (fp_feat, fp_val, fold_id) combination.
    df["descriptor_vector_id"] = df.groupby(["fp_feat", "fp_val",
                                             "fold_id"]).ngroup()
    # Extract the mapping table before duplicate checking.
    df_T5 = df[["input_compound_id", "fold_id", "descriptor_vector_id"]]
    # Sort now, as T6 and the duplicates need sorted dataframes.
    df = df.sort_values("descriptor_vector_id")
    # Duplicate removal based on the descriptor vector id is sufficient,
    # because it implies a unique 'fp_feat', 'fp_val', 'fold_id' combination.
    df_T6 = df.drop_duplicates("descriptor_vector_id")[[
        "descriptor_vector_id", "fp_feat", "fp_val", "fold_id"
    ]]
    is_duplicated = df.duplicated(["descriptor_vector_id"], keep=False)
    df_duplicates = df.loc[is_duplicated, [
        "input_compound_id",
        "canonical_smiles",
        "fp_feat",
        "fp_val",
        "fold_id",
        "descriptor_vector_id",
    ], ]
    return df_T5, df_T6, df_duplicates
def find_similar_test(query_str, str_list): # df_temp=df similarity = [] for index in range(len(str_list)): # 文本相似度————汉明距离 if len(str_list[index]) - len(query_str) > 0: hanming = [] for i in range(len(str_list[index]) - len(query_str) + 1): str_comp = str_list[index][i:i + len(query_str)] han = sum(el1 == el2 for el1, el2 in zip(str_comp, query_str)) hanming.append(han) sim = numpy.max(hanming) else: hanming = [] for i in range(len(query_str) - len(str_list[index]) + 1): str_comp = query_str[i:i + len(str_list[index])] han = sum(el1 == el2 for el1, el2 in zip(str_comp, str_list[index])) hanming.append(han) sim = numpy.max(hanming) similarity.append(sim) # similarity是相似度列表 比较两种方法的正确程度可能可以从similarity入手? df_str = {"similarity": similarity, "str": str_list} str_with_sim = DataFrame(df_str) #print str_with_sim # 可以设置head里面的数字来控制最接近的n个值 根据相似度距离来对dataframe进行排序, str_with_sim = str_with_sim.drop_duplicates().sort_values( by=['similarity'], ascending=False) # str_with_sim = str_with_sim[str_with_sim.similarity == max(similarity)].drop_duplicates() print(str_with_sim)
def get_nodes_edges(self, graph):
    """Extract the nodes and de-duplicated edges of the largest connected
    component of *graph*.

    Args:
        graph: A networkx graph.

    Returns:
        Tuple of (nodes, sub_graph_edges): the node set of the largest
        connected component and a DataFrame of its edges with duplicate
        rows removed.
    """
    components = sorted(nx.connected_components(graph), key=len, reverse=True)
    nodes = components[0]
    edges = np.array(graph.edges())
    # np.isin does not accept a set as test_elements (it would be treated as
    # a 0-d object array and match nothing), so materialise it as a list.
    node_list = list(nodes)
    # Keep only edges whose BOTH endpoints lie in the largest component.
    # (The previous `==` between the two masks also kept edges where both
    # endpoints were *outside* the component.)
    in_component = np.isin(edges[:, 0], node_list) & np.isin(
        edges[:, 1], node_list)
    sub_graph_edges = DataFrame(edges[in_component])
    sub_graph_edges.drop_duplicates(inplace=True)
    return nodes, sub_graph_edges
def get_count():
    """Count distinct users, viewed items and categories in train_set_mini.

    Side effect: adds a ``viewed_item_id_list`` column (the parsed version
    of ``viewed_item_id``) to the global train_set_mini dataframe.

    Returns:
        Tuple of (user_count, item_count, cate_count).
    """
    user_count = len(train_set_mini['user_id'].drop_duplicates())
    cate_count = len(train_set_mini['cate_list'].drop_duplicates())
    # "[id1,id2,...]" -> ["id1", "id2", ...]: strip brackets, split on commas.
    train_set_mini['viewed_item_id_list'] = train_set_mini.viewed_item_id.apply(
        lambda s: s[1:-1].split(','))
    collected_ids = []
    for id_list in train_set_mini.viewed_item_id_list:
        collected_ids.extend(id_list)
    item_count = len(DataFrame(collected_ids).drop_duplicates())
    return user_count, item_count, cate_count
def __getitem__(self, index):
    """Load one sample's ROI features from disk and return
    (features, boxes, query, question_id, product_id), with features and
    normalised boxes zero-padded to self.max_area_boxes rows."""
    entry = self.entries[index]
    # Features are stored one .npy file per entry, keyed by entry['index'].
    features = np.load(
        os.path.join(self.data_dir, self.name,
                     str(entry['index']) + '.npy'))
    boxes = entry['boxes']
    if self.ROI_select == 0:
        # Simple truncation: keep the first max_area_boxes ROIs as given.
        features = features[:self.max_area_boxes]
        boxes = boxes[:self.max_area_boxes]
    elif self.ROI_select == 1:
        # For hard samples only area-based filtering was tried; it is not
        # used for the highest-score selection.
        # Truncate to the TOP boxes ranked by area.
        if boxes.shape[0] > self.max_area_boxes:
            area_count = []
            for index, box in enumerate(boxes):
                # Box layout assumed (x1, y1, x2, y2) — area = w * h.
                area = (box[2] - box[0]) * (box[3] - box[1])
                area_count.append(area)
            data = DataFrame({
                "index": range(len(area_count)),
                'area': area_count
            })
            # Keep rows whose area rank falls in the top max_area_boxes.
            data = data.loc[data['area'].rank() > len(area_count) -
                            self.max_area_boxes]
            # NOTE(review): ties in area are collapsed here, which can
            # leave fewer than max_area_boxes rows — confirm intended.
            data = data.drop_duplicates(subset=['area'])
            data_index_2 = data['index'].tolist()
            features = features[data_index_2]
            boxes = boxes[data_index_2]
    # Zero-pad the feature rows up to the fixed max_area_boxes count.
    features = np.pad(features,
                      ((0, self.max_area_boxes - features.shape[0]), (0, 0)),
                      mode='constant',
                      constant_values=0)
    img_h, img_w = entry['image_h'], entry['image_w']
    # Copy before normalising so the cached entry['boxes'] is not mutated.
    boxes = boxes.copy()
    # Normalise box coordinates to [0, 1] by image width/height.
    boxes[:, (0, 2)] /= img_w
    boxes[:, (1, 3)] /= img_h
    boxes = np.pad(boxes,
                   ((0, self.max_area_boxes - boxes.shape[0]), (0, 0)),
                   mode='constant',
                   constant_values=0)
    query = entry['query']
    question_id = entry['query_id']
    product_id = entry['product_id']
    return features, boxes, query, question_id, product_id
def _drop_duplicates(df: DataFrame, cols: Sequence[str]) -> DataFrame: """Drop duplicates and then sort the DataFrame. Args: df: DataFrame to have duplicates removed. cols: Columns for use in removing duplicates and for sorting. Returns: A DataFrame with duplicates removed (only the last duplicate is kept). The DataFrame is sorted according to the columns provided. """ df = df.drop_duplicates(subset=cols, keep="last") return df.sort_values(by=cols, ignore_index=True)
def get_idList(self, bw_id=None):
    """Read the bw_id column from self.filename and return it as a list.

    When self.temp is set, duplicate ids are dropped keeping the last
    occurrence (original order preserved). When bw_id is given (as a
    string), only the ids after it are returned, so a crawl can resume
    from a checkpoint.
    """
    with open(self.filename, 'r', encoding='utf-8') as fh:
        ids = [record['bw_id'] for record in csv.DictReader(fh)]
    if self.temp:
        # Forwarded posts can cause repeated crawling of the same id.
        # A set() would reorder and break checkpoint resume, so drop
        # duplicates via pandas, which keeps the remaining order stable.
        frame = DataFrame(ids, columns=['bw_id'])
        ids = frame.drop_duplicates(keep='last')['bw_id'].tolist()
    if bw_id:
        # bw_id must be a string to match the csv-read values.
        ids = ids[ids.index(bw_id) + 1:]
    return ids
def quChong(inputFileName):
    """De-duplicate the station records parsed from inputFileName.

    Loads the records via getLocation, wraps them in a DataFrame with the
    fixed airodump-style column set, drops duplicate rows, then removes
    the row stored at index 0.
    """
    columns = [
        'Station MAC', 'First time seen', 'Last time seen', 'Power',
        ' packets', 'BSSID', 'Probed ESSIDs', 'location'
    ]
    records = getLocation(inputFileName)
    deduped = DataFrame(records, columns=columns).drop_duplicates()
    # Row 0 is discarded — presumably the header/garbage line of the
    # parsed capture; verify against getLocation's output format.
    return deduped.drop(0)
def label_encode(df):
    """Replace object-dtype columns of df (restricted to main_columns)
    with integer label codes, persisting the value->code mapping to the
    encoding_guide table in information.sqlite."""
    sql_manager = SqlManager("information.sqlite")
    for column in main_columns:
        if str(df[column].dtype) != "object":
            continue
        encoder = preprocessing.LabelEncoder()
        codes = encoder.fit_transform(df[column])
        mapping = DataFrame({"main": df[column].copy()})
        df[column] = codes
        mapping["encode"] = df[column].copy()
        mapping = mapping.drop_duplicates()
        # NOTE(review): if_exists="replace" re-creates the table on every
        # loop iteration, so only the last encoded column's mapping
        # survives — confirm whether "append" was intended.
        mapping.to_sql(name="encoding_guide",
                       con=sql_manager.conn,
                       if_exists="replace",
                       index=False)
def label_encode(column):
    """Label-encode the values of one column of the ``information`` table.

    Reads every value of ``column`` from information.sqlite, maps nominal
    values to integer codes (integer columns pass through unchanged),
    appends the value->code mapping to the ``encoding_guide`` table and
    returns the encoded values.

    Args:
        column: Name of the column in the ``information`` table.

    Returns:
        The encoded values, aligned with the rows read.
    """
    sql_manager = SqlManager("information.sqlite")
    column_value = sql_manager.crs.execute(
        'select {} from information '.format(column)).fetchall()
    labels = [x[0] for x in list(column_value)]
    # Integer columns are already numeric codes; leave them untouched.
    # isinstance replaces the fragile `type(labels[0]) == int` check and
    # the guard avoids an IndexError on an empty result set.
    if labels and isinstance(labels[0], int):
        label_encoded = labels
    else:
        le = preprocessing.LabelEncoder()
        label_encoded = le.fit_transform(labels)
    # "column" is a scalar, so pandas broadcasts it to every row; the old
    # hand-built col_list was never used and has been removed.
    # (Column name "Lable" is kept as-is for database compatibility.)
    df = DataFrame({"Lable": labels, "encode": label_encoded,
                    "column": column})
    df = df.drop_duplicates()
    df.to_sql(name="encoding_guide", con=sql_manager.conn,
              if_exists="append")
    return label_encoded
class MainApp(QMainWindow, ui):
    """Main window of a small KMeans clustering GUI: load a training CSV,
    drop unwanted columns, train and save a model, run it on new data,
    and display summary statistics. Styling comes from CSS theme files."""

    def __init__(self):
        super().__init__()
        QMainWindow.__init__(self)
        self.setupUi(self)
        self.HandleButtons()
        self.InitUI()
        # Store dataset state on the instance.
        self.data_train = DataFrame()    # training dataset
        self.data_test = DataFrame()     # dataset the model is run on
        self.columnsRemove = []          # columns the user chose to drop
        self.data_cleaned = DataFrame()  # test data after column removal
        self.train = True                # True while working on training data

    def InitUI(self):
        """Set initial widget state and apply the default stylesheet."""
        self.tabWidget.tabBar().setVisible(False)
        # Disable column removal until a training dataset is loaded.
        self.listWidget_data_train.setEnabled(False)
        style = open('./themes/default.css', 'r')
        style = style.read()
        self.setStyleSheet(style)

    def HandleButtons(self):
        """Wire every button to its handler."""
        self.button_data_train.clicked.connect(self.HandleTrainBrowse)
        self.button_data_test.clicked.connect(self.HandleRunBrowse)
        self.button_drop.clicked.connect(self.RemoveColumn)
        self.button_drop_2.clicked.connect(self.RemoveColumn)
        self.button_train.clicked.connect(self.TrainModel)
        self.button_run.clicked.connect(self.RunModel)
        self.pushButton.clicked.connect(self.Open_Create)
        self.pushButton_2.clicked.connect(self.Open_Run)
        self.pushButton_3.clicked.connect(self.Open_Summary)
        self.pushButton_4.clicked.connect(self.open_Settings)
        self.button_model.clicked.connect(self.HandleModelBrowse)
        self.button_summary.clicked.connect(self.Summary)
        self.button_darkblue.clicked.connect(self.Apply_DarkBlue_Style)
        self.button_darkorange.clicked.connect(self.Apply_DarkOrange_Style)
        self.button_dark.clicked.connect(self.Apply_QDark_Style)
        self.button_darkgray.clicked.connect(self.Apply_DarkGray_Style)

    def GetLocation(self, operation: str, filter: str, caption: str) -> str:
        '''
        Get a file location via a dialog, either to save or to open a file.
        Returns the stripped path, or '' when the dialog is cancelled.
        '''
        if operation == 'open':
            return QFileDialog.getOpenFileName(self,
                                               caption=caption,
                                               directory='.',
                                               filter=filter)[0].strip()
        elif operation == 'save':
            return QFileDialog.getSaveFileName(self,
                                               caption=caption,
                                               directory='.',
                                               filter=filter)[0].strip()

    def HandleTrainBrowse(self):
        """Browse for a training CSV and populate the training column list."""
        ## Browse the filesystem and pick the CSV to train from.
        save_location: str = self.GetLocation(operation='open',
                                              caption="Open",
                                              filter="CSV Files(*.csv)")
        print(save_location)
        if (save_location != ''):
            self.lineEdit_data_train.setText(str(save_location))
            # Display the columns in the listWidget.
            self.data_train = pd.read_csv(self.lineEdit_data_train.text())
            cols = self.data_train.columns.values.tolist()
            print(cols)
            self.listWidget_data_train.addItems(cols)
            self.listWidget_data_train.setEnabled(True)
            self.button_drop.setEnabled(True)
            self.train = True

    def HandleModelBrowse(self):
        """Browse for a previously saved .joblib model file."""
        self.model_location = self.GetLocation(operation='open',
                                               caption="Open",
                                               filter="JobLib Files(*.joblib)")
        if (self.model_location != ''):
            self.lineEdit_model.setText(str(self.model_location))

    def HandleRunBrowse(self):
        """Browse for the CSV the trained model should be run on."""
        ## Browse the filesystem and pick the CSV to run the model on.
        data_location = self.GetLocation(operation='open',
                                         caption="Open",
                                         filter="CSV Files(*.csv)")
        if data_location != '':
            self.lineEdit_data_test.setText(str(data_location))
            # Display the columns in the listWidget.
            self.data_test = pd.read_csv(self.lineEdit_data_test.text())
            cols = self.data_test.columns.values.tolist()
            print(cols)
            self.listWidget_data_test.addItems(cols)
            self.listWidget_data_test.setEnabled(True)
            self.button_drop_2.setEnabled(True)
            self.train = False

    def RemoveColumn(self):
        """Remove the selected columns from the list widget and dataframe,
        after user confirmation."""
        if (self.train):
            items = self.listWidget_data_train.selectedItems()
            # NOTE(review): `list` shadows the builtin within this method.
            list = self.listWidget_data_train
            data = self.data_train
        else:
            items = self.listWidget_data_test.selectedItems()
            list = self.listWidget_data_test
            data = self.data_test
        if items is None:
            return
        reply = QMessageBox.question(
            self, "Drop",
            "Remove`{0}'?".format(' '.join(map(lambda item: item.text(),
                                               items))),
            QMessageBox.Yes | QMessageBox.No, QMessageBox.No)
        if reply == QMessageBox.Yes:
            for item in items:
                row = list.row(item)
                item = list.takeItem(row)
                self.columnsRemove.append(item.text())
                del item
            # Drop from the dataframe; in-place only while training.
            # NOTE(review): with inplace=True, drop() returns None, so
            # data_cleaned is None in training mode — confirm intended.
            self.data_cleaned = data.drop(columns=self.columnsRemove,
                                          inplace=self.train)

    def TrainModel(self):
        """Clean the training data, choose k via a silhouette elbow,
        fit KMeans and save the model to a user-chosen .joblib file."""
        print(self.data_train.columns)
        self.listWidget_data_train.clear()
        self.columnsRemove.clear()
        save_location = self.GetLocation(operation='save',
                                         caption="Save as",
                                         filter="JobLib Files(*.joblib)")
        if save_location != '':
            print(save_location, 'model train start')
            # Train the model.
            self.data_train.dropna(inplace=True)
            self.data_train.drop_duplicates(inplace=True)
            X = pd.get_dummies(self.data_train)
            kmeans = KMeans(init='k-means++',
                            max_iter=300,
                            n_init=10,
                            random_state=4)
            scaler = MinMaxScaler()
            scaled_features = scaler.fit_transform(X)
            visualizer = KElbowVisualizer(kmeans,
                                          k=(4, 12),
                                          metric='silhouette',
                                          timings=False)
            visualizer.fit(X)
            # Fall back to 3 clusters when no elbow is detected.
            if (not visualizer.elbow_value_):
                clusterValue = 3
            else:
                clusterValue = visualizer.elbow_value_
            kmeans = KMeans(max_iter=300,
                            n_init=10,
                            random_state=4,
                            n_clusters=clusterValue)
            print(clusterValue)
            kmeans.fit(scaled_features)
            # Save the fitted model.
            dump(kmeans, save_location + '.joblib')
            print('model train done')

    def RunModel(self):
        """Load the saved model, predict clusters for the cleaned data and
        write the labelled rows to output.csv."""
        print(self.data_cleaned.columns)
        self.listWidget_data_test.clear()
        self.model = load(self.model_location)
        self.columnsRemove.clear()
        self.data_cleaned.dropna(inplace=True)
        self.data_cleaned.drop_duplicates(inplace=True)
        X = pd.get_dummies(self.data_cleaned)
        scaler = MinMaxScaler()
        scaled_features = scaler.fit_transform(X)
        y_means = self.model.predict(scaled_features)
        self.data_cleaned['Cluster'] = y_means
        self.data_cleaned.to_csv('output.csv')

    def Summary(self):
        """Display describe() statistics of a chosen CSV in the table widget."""
        data_location = self.GetLocation('open', 'CSV Files(*.csv)', 'Open')
        if data_location != '':
            self.lineEdit_summary.setText(data_location)
            df = pd.read_csv(data_location)
            summary_df = df.describe()
            # Row count.
            row = summary_df.shape[0]
            self.tableWidget.setRowCount(row)
            # Column count.
            column = summary_df.shape[1]
            self.tableWidget.setColumnCount(column)
            self.tableWidget.setHorizontalHeaderLabels(
                summary_df.columns.values.tolist())
            self.tableWidget.setVerticalHeaderLabels(
                summary_df.index.values.tolist())
            print(row, column)
            for i in range(row):
                for j in range(column):
                    self.tableWidget.setItem(
                        i, j,
                        QTableWidgetItem(str(summary_df.iloc[i, j])))
            self.tableWidget.resizeColumnsToContents()
            self.tableWidget.resizeRowsToContents()
            self.tableWidget.setEnabled(True)

    ################################################
    ###### UI changes methods
    def Open_Create(self):
        self.tabWidget.setCurrentIndex(0)

    def Open_Run(self):
        self.tabWidget.setCurrentIndex(3)

    def Open_Summary(self):
        self.tabWidget.setCurrentIndex(2)

    def open_Settings(self):
        self.tabWidget.setCurrentIndex(1)

    ################################################
    ###### App themes ####
    def Apply_DarkOrange_Style(self):
        style = open('./themes/darkorange.css', 'r')
        style = style.read()
        self.setStyleSheet(style)

    def Apply_QDark_Style(self):
        style = open('themes/qdark.css', 'r')
        style = style.read()
        self.setStyleSheet(style)

    def Apply_DarkGray_Style(self):
        style = open('themes/qdarkgray.css', 'r')
        style = style.read()
        self.setStyleSheet(style)

    def Apply_DarkBlue_Style(self):
        style = open('./themes/darkblue.css', 'r')
        style = style.read()
        self.setStyleSheet(style)
# from .items import PcautoPowerSpider Item connection = pymongo.MongoClient('192.168.1.94', 27017) db = connection["newcar"] collection = db["pcauto_tmp"] model_data = collection.find({}, { "carid": 1, "brandname": 1, "factoryname": 1, "familyname": 1, "brandid": 1, "_id": 0 }) car_msg_list = list(model_data) car_msg_df = DataFrame(car_msg_list) car_msg_df_new = car_msg_df.drop_duplicates('carid') class PcautoPowerSpider(scrapy.Spider): name = 'pcauto_power_minBtPrice' allowed_domains = ['pcauto.com'] # start_urls = ['https://price.pcauto.com.cn/price/api/v1/serialgroup/serial_group_bt_data/r3-m85355'] @classmethod def update_settings(cls, settings): settings.setdict(getattr( cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False) else 'custom_settings', None) or {}, priority='spider') def __init__(self, **kwargs):
def _prepare_dataframes(
    forecast_df: DataFrame,
    truth_df: DataFrame,
    percentiles: Optional[List[float]] = None,
    experiment: Optional[str] = None,
) -> Tuple[DataFrame, DataFrame]:
    """Prepare dataframes for conversion to cubes by: 1) checking
    that the expected columns are present, 2) checking the percentiles
    are as expected, 3) removing duplicates from the forecast and truth,
    4) finding the sites common to both the forecast and truth dataframes
    and 5) replacing and supplementing the truth dataframe with
    information from the forecast dataframe. Note that the fifth step
    will also ensure that a row containing a NaN for the ob_value is
    inserted for any missing observations.

    Args:
        forecast_df:
            DataFrame expected to contain the following columns: forecast,
            blend_time, forecast_period, forecast_reference_time, time,
            wmo_id, percentile, diagnostic, latitude, longitude, altitude,
            period, height, cf_name, units and experiment. Any other
            columns are ignored.
        truth_df:
            DataFrame expected to contain the following columns: ob_value,
            time, wmo_id, diagnostic, latitude, longitude and altitude.
            Any other columns are ignored.
        percentiles:
            The set of percentiles to be used for estimating EMOS
            coefficients.
        experiment:
            A value within the experiment column to select from the
            forecast table.

    Returns:
        A sanitised version of the forecasts and truth dataframes that
        are ready for conversion to cubes.
    """
    _dataframe_column_check(forecast_df, FORECAST_DATAFRAME_COLUMNS)
    _dataframe_column_check(truth_df, TRUTH_DATAFRAME_COLUMNS)

    # Filter to select only one experiment.
    if experiment:
        forecast_df = forecast_df.loc[forecast_df["experiment"] == experiment]

    if forecast_df["experiment"].nunique() > 1:
        unique_exps = forecast_df["experiment"].unique()
        msg = (
            "More than one value for the experiment column found in the "
            f"forecast dataframe. Values for experiment column {unique_exps}")
        raise ValueError(msg)

    # Extract the required percentiles.
    if percentiles:
        indices = [
            np.isclose(forecast_df["percentile"], float(p))
            for p in percentiles
        ]
        forecast_df = forecast_df[np.logical_or.reduce(indices)]

    # Check the percentiles can be considered to be equally spaced quantiles.
    _quantile_check(forecast_df)

    # Remove forecast duplicates (the last occurrence wins).
    forecast_df = forecast_df.drop_duplicates(
        subset=[
            "diagnostic", "forecast_period", "percentile", "time", "wmo_id"
        ],
        keep="last",
    )
    # Sort to ensure a consistent ordering after removing duplicates.
    forecast_df.sort_values(
        by=["blend_time", "percentile", "wmo_id"],
        inplace=True,
        ignore_index=True,
    )

    # Remove truth duplicates (the last occurrence wins).
    truth_cols = ["diagnostic", "time", "wmo_id"]
    truth_df = truth_df.drop_duplicates(
        subset=truth_cols,
        keep="last",
    )
    # Sort to ensure a consistent ordering after removing duplicates.
    truth_df.sort_values(
        by=truth_cols,
        inplace=True,
        ignore_index=True,
    )

    # Find the common set of WMO IDs and restrict both tables to it.
    common_wmo_ids = sorted(
        set(forecast_df["wmo_id"].unique()).intersection(
            truth_df["wmo_id"].unique()))
    forecast_df = forecast_df[forecast_df["wmo_id"].isin(common_wmo_ids)]
    truth_df = truth_df[truth_df["wmo_id"].isin(common_wmo_ids)]

    # Ensure time in forecasts is present in truths.
    forecast_df = forecast_df[forecast_df["time"].isin(
        truth_df["time"].unique())]
    # Ensure time in truths is present in forecasts.
    truth_df = truth_df[truth_df["time"].isin(forecast_df["time"].unique())]

    # Site metadata will be re-sourced from the forecast dataframe, so the
    # truth copies are dropped before the merge below.
    truth_df = truth_df.drop(columns=["altitude", "latitude", "longitude"])

    # Identify columns to copy onto the truth_df from the forecast_df.
    forecast_subset = forecast_df[[
        "wmo_id",
        "latitude",
        "longitude",
        "altitude",
        "period",
        "height",
        "cf_name",
        "units",
        "time",
        "diagnostic",
    ]].drop_duplicates()

    # Use "right" to fill in any missing observations in the truth dataframe
    # and retain the order from the forecast_subset.
    truth_df = truth_df.merge(forecast_subset,
                              on=["wmo_id", "time", "diagnostic"],
                              how="right")
    return forecast_df, truth_df
# from .items import PcautoPowerSpider Item connection = pymongo.MongoClient('192.168.1.94', 27017) db = connection["newcar"] collection = db["pcauto_tmp"] model_data = collection.find({}, { "familyid": 1, "brandname": 1, "factoryname": 1, "familyname": 1, "brandid": 1, "_id": 0 }) car_msg_list = list(model_data) car_msg_df = DataFrame(car_msg_list) car_msg_df_new = car_msg_df.drop_duplicates('familyid') class PcautoPowerSpider(scrapy.Spider): name = 'pcauto_power' allowed_domains = ['pcauto.com'] # start_urls = ['http://pcauto.com/'] @classmethod def update_settings(cls, settings): settings.setdict(getattr( cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False) else 'custom_settings', None) or {}, priority='spider') def __init__(self, **kwargs):
db = connection["dasouche"]
collection_city = db["dasouche_city"]
collection_modellist = db["dasouche_modellist"]

# Build a city-name -> city-id lookup from MongoDB.
model_city = collection_city.find({}, {"cityName": 1, "cityId": 1, "_id": 0})
car_city_list = list(model_city)
city_dic = {data["cityName"]: data["cityId"] for data in car_city_list}
print(city_dic)

model_data = collection_modellist.find({}, {
    "brandName": 1,
    "modelCode": 1,
    "year": 1,
    "_id": 0
})
# Keep one row per unique modelCode.
car_msg_df = DataFrame(list(model_data)).drop_duplicates('modelCode')
num = car_msg_df.drop_duplicates('modelCode')['modelCode'].count()
print(num)

for index, car in car_msg_df.iterrows():
    url_list = list()
    month_now = datetime.now().month
    year_now = datetime.now().year
    # Walk each year from the year before first registration up to now.
    for year in range(car["year"] - 1, year_now + 1):
        # For the current year use the previous month, otherwise this month.
        month = month_now - 1 if year == year_now else month_now
        for city_n, city_i in city_dic.items():
            registerDate = str(year) + "-" + str(month)
            # Estimated mileage: near-zero for the current year, otherwise
            # 2 per elapsed year (presumably units of 10,000 km — verify).
            mile = 0.1 if year == year_now else (2 * (year_now - year))
            meta = {
                "model": car["modelCode"],
                "registerDate": registerDate,
                "city_n": city_n,
    "regDate": 1,
    "mile": 1,
    "_id": 0
})

# car_msg_list1 = list(model_data1)[5:500]
# car_msg_df1 = DataFrame(car_msg_list1)
# car_msg_df_new1 = car_msg_df1.drop_duplicates('salesdescid').dropna(axis=0, how='any')
# sid_list = [str(sid).replace('.0', '') for sid in car_msg_df_new1["salesdescid"].values]
# print(sid_list)

car_msg_list3 = list(model_data3)
car_msg_df3 = DataFrame(car_msg_list3)
# Distinct salesdescid values, normalised to plain strings (strip the
# trailing '.0' that floats pick up in pandas).
sid_list = [
    str(sid).replace('.0', '')
    for sid in car_msg_df3.drop_duplicates('salesdescid')["salesdescid"].values
]
print(len(sid_list))

car_msg_list2 = list(model_data2)
car_msg_df2 = DataFrame(car_msg_list2)
# Restrict table 2 to the ids present in table 3, then re-derive the
# distinct normalised id list from that intersection.
car_msg_df_new2 = car_msg_df2[car_msg_df2['salesdescid'].isin(sid_list)]
sid_list2 = [
    str(sid).replace('.0', '')
    for sid in car_msg_df_new2.drop_duplicates(
        'salesdescid')["salesdescid"].values
]
print(car_msg_df_new2["salesdescid"].count())
print(len(sid_list2))

msg_df_new2 = car_msg_df2[car_msg_df2['salesdescid'].isin(sid_list2)]
msg_df_new3 = car_msg_df3[car_msg_df3['salesdescid'].isin(sid_list2)]
db = connection["chexiu"]
collection = db["chexiu_car"]
# Pull the previously scraped vehicle records from MongoDB so the spider
# can iterate over the distinct vehicle ids.
model_data = collection.find({}, {
    "vehicle_id": 1,
    "vehicle": 1,
    "brandname": 1,
    "brand_id": 1,
    "familyname": 1,
    "family_id": 1,
    "factoryname": 1,
    '_id': 0
})
car_msg_list = list(model_data)
car_msg_df = DataFrame(car_msg_list)
# Keep one row per unique vehicle_id.
car_msg_df_new = car_msg_df.drop_duplicates('vehicle_id')


class ChexiuspiderSpider(scrapy.Spider):
    name = 'chexiuSpider'
    allowed_domains = ['chexiu.com']
    start_urls = [
        'https://sz.chexiu.com/index.php?r=site/api/depList&isshowall=1'
    ]

    @classmethod
    def update_settings(cls, settings):
        # Use custom_debug_settings when the spider is flagged is_debug,
        # otherwise fall back to custom_settings.
        settings.setdict(getattr(
            cls, 'custom_debug_settings' if getattr(cls, 'is_debug', False)
            else 'custom_settings', None) or {},
# 获取要跑那块数据 data_f = '2020-05-02' data_0 = '2020-06-23' date1 = time.strptime(data_0, '%Y-%m-%d') now_d = time.strftime('%Y-%m-%d', time.localtime()) date2 = time.strptime(now_d, '%Y-%m-%d') start_d = datetime(date1[0], date1[1], date1[2]) end_d = datetime(date2[0], date2[1], date2[2]) part_num = (end_d - start_d).days % 28 # part_num = 28 if part_num == 0 else part_num print(part_num+1) # # 挑选当天的数据id # car_msg_df_new = car_msg_df[car_msg_df["part"] == part_num+1] car_msg_df_new = car_msg_df.drop_duplicates('salesdescid') print(car_msg_df_new["salesdescid"].count()) # # # 更改表名字 connection2 = pymongo.MongoClient('192.168.2.149', 27017) local_time = time.strftime('%Y-%m-%d', time.localtime()) print(local_time) db2 = connection2['che300'] collection2 = db2['che300_21_price'] # count = collection2.count() # if count: # print(count) # name = 'che300_21_price_' + str(part_num) + '_' + str(getYesterday())
collection.rename(name)
collection3.remove()
# Reset the progress marker collection to the next id window.
data_list = []
# data = {"start_num": 0, "end_num": 100}
data = {"start_num": end_num, "end_num": end_num + 100}
data_list.append(data)
collection3.insert(data_list)

collection1 = db3["che300_split"]
model_data1 = collection1.find({}, {"brandname": 1, "brandid": 1,
                                    "familyid": 1, "salesdescid": 1,
                                    "min_reg_year": 1, "max_reg_year": 1,
                                    "part": 1, "_id": 0})
car_msg_list1 = list(model_data1)
car_msg_df = DataFrame(car_msg_list1)
# Distinct, fully populated rows keyed by salesdescid.
car_msg_df_new = car_msg_df.drop_duplicates('salesdescid').dropna(axis=0, how='any')
print(car_msg_df_new["salesdescid"].count())

db2 = connection2["che300"]
collection2 = db2["che300_queue"]
model_data2 = collection2.find({}, {"salesdescid": 1, "_id": 0})
# start_num = 0
# end_num = 500
# Take the current [start_num, end_num) window of queued ids, normalised
# to plain strings (strip the trailing '.0' from float-typed ids).
car_msg_list2 = list(model_data2)
sid_list = list()
for i in car_msg_list2[start_num:end_num]:
    sid_list.append(str(i["salesdescid"]).replace('.0', ''))
# print(sid_list)

partnerId = ['douyin', 'escsh', 'yanchebang', 'jhhz', 'ynhcj', 'chexiaopang']
uri1 = f'mongodb://192.168.2.149:{settings["MONGODB_PORT"]}/' # connection = pymongo.MongoClient(uri1, unicode_decode_error_handler='ignore') connection = pymongo.MongoClient(uri1) db = connection['che300'] collection = db['che300_price_daily'] # collection = collection.with_options(codec_options=bson.CodecOptions(unicode_decode_error_handler="ignore")) uri2 = f'mongodb://192.168.1.92:{settings["MONGODB_PORT"]}/' connection2 = pymongo.MongoClient(uri2) db2 = connection2[settings['MONGODB_DB']] collection2 = db2['che300_41city_url'] model_data = collection.find({}, {"url": 1, "_id": 0}) car_msg_df = DataFrame(list(model_data)) car_msg_df = car_msg_df.drop_duplicates('url') have_num = car_msg_df["url"].count() print(f"现有数据量:{have_num}") model_data2 = collection2.find({}, {"url": 1, "_id": 0}) car_msg_df2 = DataFrame(list(model_data2)).drop_duplicates('url') all_num = car_msg_df2["url"].count() print(f"总共数据量:{all_num}") df_a_filter = car_msg_df2[~car_msg_df2['url'].isin(car_msg_df['url'])] miss_num = df_a_filter["url"].count() print(f"缺少数据量:{miss_num}") miss_sid_list = [ 1373449, 1373450, 1373451, 1373455, 1400877, 1400879, 1400880, 1400976, 1401128, 1401129, 1401130, 1401131, 1401134, 1401137, 1401138, 1401141,