Beispiel #1
0
def excel2txt():
    """Convert Wang Hui's Excel workbook into tables and store them.

    All sheets of the workbook are read with every cell as a string. The
    first five sheets (by position) are passed to their ``handle*`` helpers,
    the shapes of the five results are printed between two separator lines,
    and each result is handed to ``dbop.store`` under its table name.
    """
    workbook_path = r'C:\Users\chend\Desktop\dataset\doubanbook-wanghui\doubanbook-wanghui.xlsx'
    # sheet_name=None makes read_excel return a mapping with one entry per
    # sheet; only the values are kept so sheets are addressed by position.
    sheets = [*pd.read_excel(workbook_path, header=0, sheet_name=None, dtype='str').values()]

    user = handleUser(sheets[0])
    book = handleBook(sheets[1])
    user_book = handleUser_book(sheets[2])
    book_labels = handleBook_labels(sheets[3])
    # The score handler receives the score sheet together with the raw
    # user-book sheet.
    user_book_score = handleUser_book_score(sheets[4], sheets[2])

    separator = '-----------------------------------------------------------------------------'
    print(separator)
    print(*(table.shape for table in (user, book, user_book, book_labels, user_book_score)))
    print(separator)

    # Persist every table; the store order differs from the build order
    # above and is kept as is.
    for table, table_name in (
        (user, 'user'),
        (book, 'book'),
        (book_labels, 'book_labels'),
        (user_book, 'user_book'),
        (user_book_score, 'user_book_score'),
    ):
        dbop.store(table, table_name)
Beispiel #2
0
    # NOTE(review): this is the tail of a function whose header is not visible
    # here; `txt` is bound above this excerpt. The loop below treats each
    # element of `txt` as a string holding a Python expression.
    booksid = []
    booklabels = []

    # booklabelsdf = pd.DataFrame(columns=['bookid','booklabels'])
    for line in txt:
        # NOTE(review): eval() executes whatever expression the line holds;
        # only safe if the input file is trusted.
        line = eval(line)
        booksid.append(line['图书id'])
        booklabels.append(line['图书标签'])
    # One row per parsed line: (book id, book labels).
    book_labels = pd.DataFrame(zip(booksid, booklabels),
                               columns=['bookid', 'booklabels'])
    # Remove books that have no labels.
    book_labels = delNolabelBook(book_labels)

    # Adjust the format of book_labels.
    book_labels = regularTable(book_labels)

    # Show all columns and rows (display options, currently disabled).
    # pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    # Clean the labels: keep only labels of up to about four Chinese
    # characters and convert Traditional Chinese to Simplified Chinese.
    # NOTE(review): the exact length bound is taken from the original comment,
    # not from visible code; confirm against cleanData.
    book_labels = cleanData(book_labels)
    # print(book_labels)
    return book_labels


if __name__ == '__main__':
    # Script entry point: build the book-labels table via read_txt(), print
    # its shape, and hand it to dbop.store under the name 'book_labels'.
    book_labels = read_txt()
    print(book_labels.shape)
    dbop.store(book_labels, 'book_labels')
        # NOTE(review): this excerpt starts inside a `with` body of a function
        # whose header is not visible here; `f` is the file object opened
        # above this excerpt.
        txt = f.readlines()
    usesid = []
    booksid = []
    booksscore = []
    for line in txt:
        # NOTE(review): eval() executes whatever expression the line holds;
        # only safe if the input file is trusted.
        line = eval(line)
        usesid.append(line['用户id'])
        booksid.append(line['图书id'])
        booksscore.append(line['图书评分'])
    # One row per parsed line: (user id, book id, book score).
    user_book_score = pd.DataFrame(zip(usesid, booksid, booksscore),
                                   columns=['userid', 'booksid', 'booksscore'])

    # Show all columns and rows (display options, currently disabled).
    # pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    # Remove users who have not read any book / have not given a score.
    user_book_score = delNoitemInfo(user_book_score)

    # Adjust the table structure.
    user_book_score = regularTable(user_book_score)

    return user_book_score


if __name__ == "__main__":
    # Script entry point: build the user/book/score table via read_txt(),
    # print its shape, and hand it to dbop.store under 'user_book_score'.
    # NOTE(review): txt_path is not passed to read_txt(); presumably
    # read_txt() reads it as a module-level global — confirm against its
    # definition before renaming or removing it.
    txt_path = r'C:\Users\chend\Desktop\dataset\doubanbook-lijia1\user-score.txt'
    user_book_score = read_txt()
    print(user_book_score.shape)
    dbop.store(user_book_score, 'user_book_score')
Beispiel #4
0
    return user_book


def read_txt(path=r'C:\Users\chend\Desktop\dataset\doubanbook-lijia\user-book.txt'):
    """Load the user-to-book records from a text file into a DataFrame.

    Each non-blank line of the file is evaluated as a Python expression and
    must yield a mapping that has the keys '用户id' (user id) and '图书id'
    (book id).

    Args:
        path: Location of the text file. Defaults to the dataset path that
            was previously hard-coded, so existing ``read_txt()`` calls keep
            working unchanged.

    Returns:
        The table with columns ``userid`` and ``booksid`` after it has been
        passed through ``delNoReadBook`` and ``regularTable``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If a record lacks one of the expected keys.
    """
    rows = []
    # 'utf-8-sig' transparently drops a leading byte-order mark, if any.
    with open(path, encoding='utf-8-sig') as f:
        for line in f:
            # A blank or whitespace-only line would make eval() raise
            # SyntaxError; such lines carry no record, so skip them.
            if not line.strip():
                continue
            # SECURITY NOTE(review): eval() executes whatever expression the
            # line contains. If this file can ever come from an untrusted
            # source, switch to ast.literal_eval; left as eval() here to keep
            # the accepted input format unchanged.
            record = eval(line)
            rows.append((record['用户id'], record['图书id']))
    user_book = pd.DataFrame(rows, columns=['userid', 'booksid'])

    # Drop users who have not read any book.
    user_book = delNoReadBook(user_book)

    # Adjust the format of user_book.
    user_book = regularTable(user_book)

    return user_book


if __name__ == "__main__":
    # Script entry point: build the user-book table via read_txt(), print
    # its shape, and hand it to dbop.store under the name 'user_book'.

    user_book = read_txt()
    print(user_book.shape)
    dbop.store(user_book, 'user_book')
Beispiel #5
0
    # booktable.append(book)
    return book


if __name__ == '__main__':
    # Script entry point: read the book records from a text file, build the
    # book table, pass it through addcover(), and store it.

    path = r'C:\Users\chend\Desktop\dataset\doubanbook-lijia\book.txt'
    # Read every line of the text file; each line is parsed below.
    with open(path, encoding='utf-8') as f:
        txt = f.readlines()

    # Build one (id, name, author) tuple per line.
    # NOTE(review): eval() executes whatever expression the line holds; only
    # safe if the input file is trusted.
    rows = []
    for raw_line in txt:
        record = eval(raw_line)
        rows.append((record['图书id'], record['图书名称'], record['图书作者']))
    book = pd.DataFrame(rows, columns=['bookid', 'bookname', 'bookauthor'])

    book = addcover(book)
    print('--------------------------------------------------------------------------------------')
    # Store the result in the database table 'book'.
    print(book.shape)
    dbop.store(book, 'book')