def read_file(self):
    """Parse the CSV at ``self.file_path`` and emit one record per row.

    Every row is scanned column by column.  Cells under the headers
    ``用户ID``, ``服务ID``, ``时间``, ``行为``, ``内容``, ``转推``, ``喜欢`` and
    ``回复`` fill the matching field of the record; other columns are
    ignored.  Each record is printed and, when ``self.flag_insert == "1"``,
    passed to ``self.input_database``.

    ``时间`` handling:

    * if ``self.match_timestamp(cell)`` returns a truthy value, it is
      converted with ``int()`` and used as the epoch timestamp;
    * otherwise a cell containing ``上午`` or ``下午`` is parsed with the
      pattern ``%H:%M - %Y年%m月%d日`` and interpreted as a 12-hour clock
      (``下午12:30`` is 12:30, ``上午12:05`` is 00:05);
    * any other cell leaves the time fields at their defaults.

    A NaN ``时间`` or ``内容`` cell appends the row to ``self.nan_list``;
    the record is still emitted with that field left at its default.
    Collected NaN rows are printed after the last row.

    Returns:
        None.

    Raises:
        ValueError: if a ``上午``/``下午`` cell does not match the pattern.
    """
    data = pd.read_csv(self.file_path, encoding='utf-8')  # load the data
    print("data:")
    for i in range(len(data)):
        print("i:", i)
        user_id = ""  # user ID
        service_id = ""  # service ID
        date_time = ""  # formatted time string
        time_stamp = ""  # epoch timestamp
        activity = ""  # activity type
        content = ""  # text content
        key_words = []  # keywords extracted from the content
        retweet = 0  # retweet count
        like = 0  # like count
        reply_num = 0  # reply count
        row = data.iloc[i]  # the current data row
        for j in data.columns:
            j = str(j)
            element = str(row[j])
            if j == "用户ID":
                user_id = element
                print(j, ":", user_id)
            elif j == "服务ID":
                service_id = element
                print(j, ":", service_id)
            elif j == "时间":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                matched = self.match_timestamp(element)
                if matched:
                    print("10位数字的时间戳")
                    time_stamp = int(matched)
                    # Render the epoch value as a local-time string.
                    date_time = time.strftime(
                        "%Y-%m-%d %H:%M:%S", time.localtime(time_stamp))
                    print("date_time:", date_time,
                          ", timestamp:", time_stamp)
                else:
                    meridiem = next(
                        (m for m in ("上午", "下午") if m in element), None)
                    if meridiem is not None:
                        clock = element.replace(meridiem, "").strip()
                        print("时间:", clock)
                        dt = datetime.datetime.strptime(
                            clock, "%H:%M - %Y年%m月%d日")
                        # 12-hour clock: only 1-11 p.m. move forward by
                        # twelve hours, and 12:xx a.m. is 00:xx.  Adding
                        # 12 h unconditionally pushed 12:xx p.m. into
                        # the next day.
                        if meridiem == "下午" and dt.hour < 12:
                            dt = dt.replace(hour=dt.hour + 12)
                        elif meridiem == "上午" and dt.hour == 12:
                            dt = dt.replace(hour=0)
                        date_time = str(dt)
                        time_stamp = int(time.mktime(dt.timetuple()))
                        print("date_time:", date_time,
                              ", timestamp:", time_stamp)
            elif j == "行为":
                activity = element
                print(j, ":", activity)
            elif j == "内容":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                element = element.replace("Twitter", "")
                element = element.replace("的", "")
                element = element.replace(' “@', "")
                content = element
                print(j, ":", content)
                # No activity seen so far for this row: derive it from
                # whether the user ID occurs in the content.
                if activity == "":
                    activity = "Post" if user_id in content else "Reply"
                    print("行为:", activity)
                temp_keywords = dpt4.main(element)  # returns a list
                print("keywords:", temp_keywords)
                key_words = key_words + temp_keywords
            elif j == "转推":
                retweet = element
                print(j, ":", retweet)
            elif j == "喜欢":
                like = element
                print(j, ":", like)
            elif j == "回复":
                reply_num = element
                print(j, ":", reply_num)
        _id = self.get_next_counter()
        print("_id:", _id)
        # Assemble the record for this row.
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "activity": activity,
            "内容": content,
            "keywords": key_words,
            "转推": retweet,
            "喜欢": like,
            "回复": reply_num,
            "_id": _id
        }
        print("row_input_list:", insert_text)
        # Insert into the database only when enabled.
        if self.flag_insert == "1":
            self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for var_nan in self.nan_list:
            print("NaN row:", var_nan)
        print("空值的个数:", len(self.nan_list))
def read_file(self):
    """Read the CSV at ``self.file_path`` and build one record per row.

    Cells under the headers ``用户ID``, ``服务ID``, ``时间`` and
    ``内容``/``title`` populate the record; ``行为`` cells are only printed.
    A ``时间`` cell is passed through ``self.format_datetime`` and the
    result is parsed as ``%Y-%m-%d %H:%M:%S`` to obtain the timestamp.
    Content cells are sent to ``dpt4.main`` and the returned list is
    appended to the record's keywords.

    A NaN ``时间`` or content cell appends the row to ``self.nan_list``
    and leaves the corresponding field at its default.  Each record is
    printed and, when ``self.flag_insert == "1"``, passed to
    ``self.input_database``.  Returns ``None``.
    """
    frame = pd.read_csv(self.file_path, encoding='utf-8')
    print("data:")
    column_names = [str(name) for name in frame.columns]
    for idx in range(len(frame)):
        print("i:", idx)
        # Per-row defaults; a field stays "" when its column is absent/NaN.
        user_id = service_id = date_time = time_stamp = content = ""
        key_words = []
        current_row = frame.iloc[idx]
        for column in column_names:
            cell = str(current_row[column])
            if column == "用户ID":
                user_id = cell
                print(column, ":", user_id)
            elif column == "服务ID":
                service_id = cell
                print(column, ":", service_id)
            elif column == "时间":
                if cell == "nan":
                    self.nan_list.append(current_row)
                    continue
                print(column, ":", cell)
                # Normalise the raw time text, then derive the epoch value.
                date_time = self.format_datetime(cell)
                print("格式化时间date_time:", date_time)
                parsed = time.strptime(date_time, "%Y-%m-%d %H:%M:%S")
                time_stamp = int(time.mktime(parsed))
                print("timestamp:", time_stamp)
            elif column == "行为":
                print(column, ":", cell)
            elif column in ("内容", "title"):
                if cell == "nan":
                    self.nan_list.append(current_row)
                    continue
                content = cell
                print(column, ":", content)
                extracted = dpt4.main(cell)  # returns a list
                print("keywords:", extracted)
                key_words = key_words + extracted
        # Assemble the record for this row.
        _id = self.get_next_counter()
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "内容": content,
            "keywords": key_words,
            "_id": _id,
        }
        print("row_input_list:", insert_text)
        # Insert into the database only when enabled.
        if self.flag_insert == "1":
            self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for nan_row in self.nan_list:
            print("NaN row:", nan_row)
        print("空值的个数:", len(self.nan_list))
def read_file(self):
    """Parse the CSV at ``self.file_path`` and emit one record per row.

    Cells under ``用户ID``, ``服务ID``, ``时间``, ``行为`` and
    ``内容``/``repository`` populate the record.  ``title`` cells only
    contribute keywords and ``标记`` cells are only printed; neither is
    stored in the record.  Text from ``内容``, ``title`` and ``repository``
    is sent to ``dpt4.main`` and the returned lists are concatenated into
    the record's keywords.

    ``时间`` handling:

    * ``Z`` is removed and ``T`` replaced by a space; if the result
      matches ``%Y-%m-%d %H:%M:%S`` it is stored unchanged;
    * otherwise the original cell is stripped of an English weekday
      prefix (``"Mon, "`` ... ``"Sun, "``) and of ``GMT`` and parsed as
      ``%Y-%m-%d``; the cleaned date string is what gets stored.

    A NaN ``时间`` cell appends the row to ``self.nan_list`` and leaves
    the time fields at their defaults.  Each record is printed and, when
    ``self.flag_insert == "1"``, passed to ``self.input_database``.

    Returns:
        None.

    Raises:
        ValueError: if a ``时间`` cell matches neither format.
    """
    data = pd.read_csv(self.file_path, encoding='utf-8')  # load the data
    print("data:")
    weekday_prefixes = (
        "Mon, ", "Tue, ", "Wed, ", "Thu, ", "Fri, ", "Sat, ", "Sun, ")
    for i in range(len(data)):
        print("i:", i)
        user_id = ""  # user ID
        service_id = ""  # service ID
        date_time = ""  # formatted time string
        time_stamp = ""  # epoch timestamp
        activity = ""  # activity type
        content = ""  # text content
        key_words = []  # keywords extracted from the text columns
        row = data.iloc[i]  # the current data row
        for j in data.columns:
            j = str(j)
            element = str(row[j])
            if j == "用户ID":
                user_id = element
                print(j, ":", user_id)
            elif j == "服务ID":
                service_id = element
                print(j, ":", service_id)
            elif j == "时间":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                raw_time = element
                element = element.replace("Z", "").replace("T", " ")
                date_time = element
                try:
                    time_array = time.strptime(element, "%Y-%m-%d %H:%M:%S")
                except ValueError:
                    # Clean the ORIGINAL text: once "T" has been turned
                    # into a space, "Tue, "/"Thu, " no longer match and
                    # "GMT" degrades to "GM " with a trailing blank that
                    # strptime rejects.
                    cleaned = raw_time.replace("GMT", "")
                    for prefix in weekday_prefixes:
                        cleaned = cleaned.replace(prefix, "")
                    cleaned = cleaned.replace("Z", "")
                    cleaned = cleaned.replace("T", " ").strip()
                    time_array = time.strptime(cleaned, "%Y-%m-%d")
                    date_time = cleaned
                print(j, ":", date_time)
                timestamp = int(time.mktime(time_array))
                print("timestamp:", timestamp, " type:", type(timestamp))
                time_stamp = timestamp
            elif j == "行为":
                activity = element
                print(j, ":", activity)
            elif j in ("内容", "repository"):
                content = element
                print(j, ":", content)
                temp_keywords = dpt4.main(element)  # returns a list
                print("keywords:", temp_keywords)
                key_words = key_words + temp_keywords
            elif j == "title":
                print(j, ":", element)
                temp_keywords = dpt4.main(element)  # returns a list
                print("keywords:", temp_keywords)
                key_words = key_words + temp_keywords
            elif j == "标记":
                print(j, ":", element)
        # Assemble the record for this row.
        _id = self.get_next_counter()
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "activity": activity,
            "内容": content,
            "keywords": key_words,
            "_id": _id
        }
        print("row_input_list:", insert_text)
        # Insert into the database only when enabled.
        if self.flag_insert == "1":
            self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for var_nan in self.nan_list:
            print("NaN row:", var_nan)
        print("空值的个数:", len(self.nan_list))
def read_file(self):
    """Read the CSV at ``self.file_path`` and build one record per row.

    Cells under ``用户ID``, ``服务ID``, ``时间``, ``行为``, ``内容`` and
    ``title``/``Title`` populate the record.  A ``时间`` cell is passed
    through ``self.match_date_time``; the result is parsed as
    ``%Y年%m月%d日`` to obtain the timestamp, which is then rendered back
    as a ``%Y-%m-%d %H:%M:%S`` local-time string.  Content and title text
    is sent to ``dpt4.main`` and the returned items extend the record's
    keywords.

    A NaN ``时间`` cell appends the row to ``self.nan_list`` and leaves
    the time fields at their defaults.  Each record is printed and, when
    ``self.flag_insert == "1"``, passed to ``self.input_database``.
    Returns ``None``.
    """
    frame = pd.read_csv(self.file_path, encoding='utf-8')
    print("data:")
    column_names = [str(name) for name in frame.columns]
    for idx in range(len(frame)):
        print("i:", idx)
        # Per-row defaults; a field stays "" when its column is absent.
        user_id = service_id = date_time = time_stamp = ""
        activity = content = title = ""
        key_words = []
        current_row = frame.iloc[idx]
        for column in column_names:
            cell = str(current_row[column])
            if column == "用户ID":
                user_id = cell
                print(column, ":", user_id)
            elif column == "服务ID":
                service_id = cell
                print(column, ":", service_id)
            elif column == "时间":
                if cell == "nan":
                    self.nan_list.append(current_row)
                    continue
                date_text = self.match_date_time(cell)
                print(column, ":", date_text)
                # Date text -> epoch seconds -> formatted local time.
                time_stamp = int(
                    time.mktime(time.strptime(date_text, "%Y年%m月%d日")))
                date_time = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(time_stamp))
                print("date_time:", date_time, ", timestamp:", time_stamp)
            elif column == "行为":
                activity = cell
                print(column, ":", activity)
            elif column == "内容":
                content = cell
                print(column, ":", content)
                extracted = dpt4.main(cell)  # returns a list
                print("keywords:", extracted)
                key_words.extend(extracted)
            elif column in ("title", "Title"):
                title = cell
                print(column, ":", title)
                extracted = dpt4.main(cell)  # returns a list
                print("keywords:", extracted)
                key_words.extend(extracted)
        # Assemble the record for this row.
        _id = self.get_next_counter()
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "activity": activity,
            "内容": content,
            "keywords": key_words,
            "_id": _id,
            "title": title,
        }
        print("row_input_list:", insert_text)
        # Insert into the database only when enabled.
        if self.flag_insert == "1":
            self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for nan_row in self.nan_list:
            print("NaN row:", nan_row)
        print("空值的个数:", len(self.nan_list))
def read_file(self):
    """Parse the CSV at ``self.file_path`` and emit one record per row.

    Cells under ``用户ID``, ``服务ID``, ``时间``, ``title`` and
    ``content_time`` populate the record.  Title and content text is sent
    to ``dpt4.main`` and the returned items are added to the keywords.

    ``时间`` handling: the cell is passed through ``self.match_timestamp``
    and then ``self.format_datetime``.  If the result contains ``am`` or
    ``pm`` the marker is removed, the rest is parsed as
    ``%d %m %Y, %H:%M`` and interpreted as a 12-hour clock (``12:15pm`` is
    12:15, ``12:05am`` is 00:05).  A result without either marker leaves
    the time fields untouched.

    ``content_time`` handling: the cell is split on ``"_"``; the first
    piece is the content and the second is a ``%Y-%m-%d %H:%M:%S`` time
    that overrides the time fields.

    A NaN ``时间`` or ``content_time`` cell appends the row to
    ``self.nan_list`` and is otherwise skipped.  Each record is printed
    and, when ``self.flag_insert == "1"``, passed to
    ``self.input_database``.

    Returns:
        None.

    Raises:
        ValueError: if a time string does not match its expected format.
        IndexError: if a ``content_time`` cell contains no ``"_"``.
    """
    data = pd.read_csv(self.file_path, encoding='utf-8')  # load the data
    print("data:")
    for i in range(len(data)):
        print("i:", i)
        user_id = ""  # user ID
        service_id = ""  # service ID
        date_time = ""  # formatted time string
        time_stamp = ""  # epoch timestamp
        title = ""  # title
        content = ""  # text content
        key_words = []  # keywords extracted from title/content
        row = data.iloc[i]  # the current data row
        for j in data.columns:
            j = str(j)
            element = str(row[j])
            if j == "用户ID":
                user_id = element
                print(j, ":", user_id)
            elif j == "服务ID":
                service_id = element
                print(j, ":", service_id)
            elif j == "时间":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                element = self.match_timestamp(element)
                element = self.format_datetime(element)
                print(j, ":", element)
                meridiem = next(
                    (m for m in ("am", "pm") if m in element), None)
                if meridiem is not None:
                    clock = element.replace(meridiem, "").strip()
                    dt = datetime.datetime.strptime(
                        clock, "%d %m %Y, %H:%M")
                    # 12-hour clock: only 1-11 p.m. move forward by twelve
                    # hours, and 12:xx a.m. is 00:xx.  Adding 12 h
                    # unconditionally pushed 12:xx p.m. into the next day.
                    if meridiem == "pm" and dt.hour < 12:
                        dt = dt.replace(hour=dt.hour + 12)
                    elif meridiem == "am" and dt.hour == 12:
                        dt = dt.replace(hour=0)
                    date_time = str(dt)
                    print("时间:", date_time)
                    time_stamp = int(time.mktime(dt.timetuple()))
                    print("timestamp:", time_stamp)
            elif j == "title":
                title = element
                print(j, ":", title)
                temp_keywords = dpt4.main(title)  # returns a list
                key_words += temp_keywords
                print("keywords:", temp_keywords)
            elif j == "content_time":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                # NOTE(review): content that itself contains "_" breaks
                # this split — confirm the upstream format.
                list_ele = element.split("_")
                content = list_ele[0]
                date_time = list_ele[1]
                print("内容:", content)
                print("时间:", date_time)
                time_array = time.strptime(
                    str(date_time), "%Y-%m-%d %H:%M:%S")
                time_stamp = int(time.mktime(time_array))
                print("timestamp:", time_stamp)
                temp_keywords = dpt4.main(content)  # returns a list
                print("keywords:", temp_keywords)
                key_words = key_words + temp_keywords
        # Assemble the record for this row.
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "内容": content,
            "title": title,
            "keywords": key_words
        }
        print("row_input_list:", insert_text)
        # Insert into the database only when enabled.
        if self.flag_insert == "1":
            self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for var_nan in self.nan_list:
            print("NaN row:", var_nan)
        print("空值的个数:", len(self.nan_list))
def read_file(self):
    """Parse the CSV at ``self.file_path`` and insert one record per row.

    Two columns are recognised:

    * ``ATT`` — the cell is split on ``","``; pieces 0-3 are taken as
      user ID, service ID, content and time.  The content is sent to
      ``dpt4.main`` for keywords.  The time has ``T`` replaced by a space
      and ``+00:00`` removed and is parsed as ``%Y-%m-%d %H:%M:%S``.
    * ``时间`` — a cell containing ``上午`` or ``下午`` is parsed with
      ``%H:%M - %Y年%m月%d日`` and interpreted as a 12-hour clock
      (``下午12:30`` is 12:30, ``上午12:05`` is 00:05).  Both the formatted
      time and the timestamp are stored for morning and afternoon values
      alike.  A cell without either marker leaves the time fields
      untouched.

    A NaN ``时间`` cell appends the row to ``self.nan_list`` and is
    otherwise skipped.  Every record is printed and passed to
    ``self.input_database`` unconditionally.

    Returns:
        None.

    Raises:
        ValueError: if a time string does not match its expected format.
    """
    data = pd.read_csv(self.file_path, encoding='utf-8')  # load the data
    print("data:")
    for i in range(len(data)):
        print("i:", i)
        user_id = ""  # user ID
        service_id = ""  # service ID
        date_time = ""  # formatted time string
        time_stamp = ""  # epoch timestamp
        content = ""  # text content
        key_words = []  # keywords extracted from the content
        row = data.iloc[i]  # the current data row
        for j in data.columns:
            j = str(j)
            element = str(row[j])
            if j == "ATT":
                for item, piece in enumerate(element.split(",")):
                    print("item:", item, "element:", piece)
                    if item == 0:
                        user_id = piece
                    elif item == 1:
                        service_id = piece
                    elif item == 2:
                        content = piece
                        temp_keywords = dpt4.main(content)  # returns a list
                        print("keywords:", temp_keywords)
                        key_words = key_words + temp_keywords
                    elif item == 3:
                        iso_text = piece.replace("T", " ")
                        iso_text = iso_text.replace("+00:00", "")
                        dt = datetime.datetime.strptime(
                            iso_text, "%Y-%m-%d %H:%M:%S")
                        date_time = str(dt)
                        print("时间:", date_time)
                        time_stamp = int(time.mktime(dt.timetuple()))
                        print("timestamp:", time_stamp)
            elif j == "时间":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                print(j, ":", element)
                meridiem = next(
                    (m for m in ("上午", "下午") if m in element), None)
                if meridiem is not None:
                    clock = element.replace(meridiem, "").strip()
                    dt = datetime.datetime.strptime(
                        clock, "%H:%M - %Y年%m月%d日")
                    # 12-hour clock: only 1-11 p.m. move forward by twelve
                    # hours, and 12:xx a.m. is 00:xx.  Adding 12 h
                    # unconditionally pushed 12:xx p.m. into the next day.
                    if meridiem == "下午" and dt.hour < 12:
                        dt = dt.replace(hour=dt.hour + 12)
                    elif meridiem == "上午" and dt.hour == 12:
                        dt = dt.replace(hour=0)
                    # Set for both markers; the afternoon path used to
                    # leave the formatted time empty.
                    date_time = str(dt)
                    print("时间:", date_time)
                    time_stamp = int(time.mktime(dt.timetuple()))
                    print("timestamp:", time_stamp)
        # Assemble the record for this row.
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "内容": content,
            "keywords": key_words
        }
        print("row_input_list:", insert_text)
        # Insert into the database (not gated by a flag in this reader).
        self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for var_nan in self.nan_list:
            print("NaN row:", var_nan)
        print("空值的个数:", len(self.nan_list))
def read_file(self):
    """Parse the CSV at ``self.file_path`` and emit one record per row.

    Cells under ``用户ID``, ``服务ID``, ``timestamp`` and
    ``内容``/``title`` populate the record.

    ``timestamp`` handling: the cell is converted to an integer epoch
    value (a float rendering such as ``"1525312345.0"`` is truncated to
    its integer part) and rendered as a ``%Y-%m-%d %H:%M:%S`` local-time
    string.

    ``内容``/``title`` handling: the substrings ``"Idan Adar"``, ``"的"``
    and ``' “@'`` are removed, the result is stored as the content (a
    later column overwrites an earlier one) and sent to ``dpt4.main``;
    the returned lists are concatenated into the record's keywords.

    A NaN ``timestamp`` cell appends the row to ``self.nan_list`` and
    leaves the time fields at their defaults.  Each record is printed
    and, when ``self.flag_insert == "1"``, passed to
    ``self.input_database``.

    Returns:
        None.

    Raises:
        ValueError: if a ``timestamp`` cell is not numeric.
    """
    data = pd.read_csv(self.file_path, encoding='utf-8')  # load the data
    print("data:")
    for i in range(len(data)):
        print("i:", i)
        user_id = ""  # user ID
        service_id = ""  # service ID
        date_time = ""  # formatted time string
        time_stamp = ""  # epoch timestamp
        content = ""  # text content
        key_words = []  # keywords extracted from the content
        row = data.iloc[i]  # the current data row
        for j in data.columns:
            j = str(j)
            element = str(row[j])
            if j == "用户ID":
                user_id = element
                print(j, ":", user_id)
            elif j == "服务ID":
                service_id = element
                print(j, ":", service_id)
            elif j == "timestamp":
                if element == "nan":
                    self.nan_list.append(row)
                    continue
                print("element:", element, ", type:", type(element))
                # Integer text converts exactly; float text is truncated.
                # Deleting ".0" textually turned "1525312345.05" into
                # 15253123455.
                try:
                    time_stamp = int(element)
                except ValueError:
                    time_stamp = int(float(element))
                # Render the epoch value as a local-time string.
                date_time = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(time_stamp))
                print("date_time:", date_time, ", timestamp:", time_stamp)
            elif j in ("内容", "title"):
                element = element.replace("Idan Adar", "")
                element = element.replace("的", "")
                element = element.replace(' “@', "")
                content = element
                print(j, ":", content)
                temp_keywords = dpt4.main(element)  # returns a list
                print("keywords:", temp_keywords)
                key_words = key_words + temp_keywords
        _id = self.get_next_counter()
        print("_id:", _id)
        # Assemble the record for this row.
        insert_text = {
            "uid": self.collection_name,
            "用户ID": user_id,
            "服务ID": service_id,
            "时间": date_time,
            "timestamp": time_stamp,
            "内容": content,
            "keywords": key_words,
            "_id": _id
        }
        print("row_input_list:", insert_text)
        # Insert into the database only when enabled.
        if self.flag_insert == "1":
            self.input_database(insert_text)
        print()
    # Report the rows that contained NaN cells.
    if self.nan_list:
        for var_nan in self.nan_list:
            print("NaN row:", var_nan)
        print("空值的个数:", len(self.nan_list))