def excel2txt(
    path=r'C:\Users\chend\Desktop\dataset\doubanbook-wanghui\doubanbook-wanghui.xlsx',
):
    """Load the Douban-book Excel workbook and store its five tables via ``dbop``.

    Every sheet is read with all values as strings. The first five sheets, in
    workbook order, are passed to ``handleUser``, ``handleBook``,
    ``handleUser_book``, ``handleBook_labels`` and ``handleUser_book_score``
    (the last one also receives the raw user-book sheet). The shapes of the
    five resulting tables are printed and each table is stored with
    ``dbop.store`` under the names ``user``, ``book``, ``book_labels``,
    ``user_book`` and ``user_book_score``.

    Args:
        path: Location of the ``.xlsx`` workbook. Defaults to the path that
            was previously hard-coded, so ``excel2txt()`` behaves as before.

    Raises:
        IndexError: If the workbook contains fewer than five sheets.
    """
    # sheet_name=None makes read_excel return a mapping of every sheet; only
    # the sheet order is used below, so the sheet names are discarded.
    sheets = list(
        pd.read_excel(path, header=0, sheet_name=None, dtype='str').values()
    )
    if len(sheets) < 5:
        # Same exception type positional indexing would raise, but reported
        # before any sheet is processed and with an explanatory message.
        raise IndexError(
            f'expected at least 5 sheets in {path!r}, found {len(sheets)}'
        )
    user_sheet, book_sheet, user_book_sheet, labels_sheet, score_sheet = sheets[:5]

    user = handleUser(user_sheet)
    book = handleBook(book_sheet)
    user_book = handleUser_book(user_book_sheet)
    book_labels = handleBook_labels(labels_sheet)
    # The score handler needs the raw user-book sheet alongside the score sheet.
    user_book_score = handleUser_book_score(score_sheet, user_book_sheet)

    print('-----------------------------------------------------------------------------')
    print(user.shape,book.shape,user_book.shape,book_labels.shape,user_book_score.shape)
    print('-----------------------------------------------------------------------------')

    dbop.store(user, 'user')
    dbop.store(book, 'book')
    dbop.store(book_labels, 'book_labels')
    dbop.store(user_book, 'user_book')
    dbop.store(user_book_score, 'user_book_score')
booksid = [] booklabels = [] # booklabelsdf = pd.DataFrame(columns=['bookid','booklabels']) for line in txt: line = eval(line) booksid.append(line['图书id']) booklabels.append(line['图书标签']) book_labels = pd.DataFrame(zip(booksid, booklabels), columns=['bookid', 'booklabels']) # 去除没有标签的图书 book_labels = delNolabelBook(book_labels) # 对book_labels个格式进行调整 book_labels = regularTable(book_labels) # 显示所有列与行 # pd.set_option('display.max_columns', None) # pd.set_option('display.max_rows', None) # 清洗标签,只保留四个一下汉字,且化繁体为简体 book_labels = cleanData(book_labels) # print(book_labels) return book_labels if __name__ == '__main__': book_labels = read_txt() print(book_labels.shape) dbop.store(book_labels, 'book_labels')
txt = f.readlines() usesid = [] booksid = [] booksscore = [] for line in txt: line = eval(line) usesid.append(line['用户id']) booksid.append(line['图书id']) booksscore.append(line['图书评分']) user_book_score = pd.DataFrame(zip(usesid, booksid, booksscore), columns=['userid', 'booksid', 'booksscore']) # 显示所有列与行 # pd.set_option('display.max_columns', None) # pd.set_option('display.max_rows', None) # 去除没有读过书的的用户,没给评分 user_book_score = delNoitemInfo(user_book_score) # 表格结构调整 user_book_score = regularTable(user_book_score) return user_book_score if __name__ == "__main__": txt_path = r'C:\Users\chend\Desktop\dataset\doubanbook-lijia1\user-score.txt' user_book_score = read_txt() print(user_book_score.shape) dbop.store(user_book_score, 'user_book_score')
return user_book def read_txt(): path = r'C:\Users\chend\Desktop\dataset\doubanbook-lijia\user-book.txt' with open(path, encoding='utf-8-sig') as f: txt = f.readlines() usesid = [] booksid = [] for line in txt: line = eval(line) usesid.append(line['用户id']) booksid.append(line['图书id']) user_book = pd.DataFrame(zip(usesid, booksid), columns=['userid', 'booksid']) # 删除没读过书的用户 user_book = delNoReadBook(user_book) #user_booK格式调整 user_book = regularTable(user_book) return user_book if __name__ == "__main__": user_book = read_txt() print(user_book.shape) dbop.store(user_book, 'user_book')
# booktable.append(book) return book if __name__ == '__main__': path = r'C:\Users\chend\Desktop\dataset\doubanbook-lijia\book.txt' # 从txt文件中读入DataFrame with open(path, encoding='utf-8') as f: txt = f.readlines() # print(txt) booksid = [] booksname = [] booksauthor = [] for line in txt: line = eval(line) booksid.append(line['图书id']) booksname.append(line['图书名称']) booksauthor.append(line['图书作者']) book = pd.DataFrame(zip(booksid, booksname, booksauthor), columns=['bookid', 'bookname', 'bookauthor']) # print(book) book = addcover(book) print( '--------------------------------------------------------------------------------------' ) # pd.set_option('display.max_columns', None) # 存入数据库book表 print(book.shape) dbop.store(book, 'book')