Example no. 1
def step_six():
    """
    Convert the data into SQL statements.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        for book in books:
            bk = Book(book)
            tools.write("to_sql//all.sql", bk.to_sql())
Example no. 2
def combine():
    """
    Merge the books from all tags into a single file.
    :return:
    """
    books = []
    for tag in tags:
        path = "..//DoubanData//books//" + tag + ".txt"
        one_tag = tool.read(path)
        for one in one_tag:
            books.append(one)
    for book in books:
        tool.write("data//all.txt", book)
Example no. 3
def step_five():
    """
    Deduplicate: put the records into a set and back into a list for a simple de-dup.
    :return:
    """
    for tag in tags:
        print(tag)
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        print(len(books))
        books = list(set(books))
        print(len(books))
        tools.truncatefile(path)
        for book in books:
            tools.write(path, book)
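Round-tripping through a set removes duplicates but also loses the original line order. If order matters, dict.fromkeys() gives the same de-duplication while keeping the first occurrence of each record in place; a small alternative sketch:

def dedupe_keep_order(lines):
    # dict preserves insertion order (Python 3.7+), so this drops duplicates
    # while keeping the first occurrence of each record where it was.
    return list(dict.fromkeys(lines))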
Example no. 4
def step_three():
    """
    Normally every book has 17 fields, so books whose field count is not 17 are treated as outliers; here they are simply discarded.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        urls = tools.read(path)
        out = []
        for url in urls:
            lis = url.split(',')
            if len(lis) == 17:
                out.append(url)
        tools.truncatefile(path)
        for url in out:
            tools.write(path, url)
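Note that the 17-field check also silently drops otherwise valid records whose own fields contain commas (for example a book title with a comma in it), because split(',') then produces extra columns:

row = "A Title, With A Comma," + ",".join(str(i) for i in range(16))
print(len(row.split(',')))  # 18, not 17, so step_three() would discard this book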
Example no. 5
def step_two():
    """
    If the last column is not the tag after crawling, run this function to append the tag to the end of each record.
    :return:
    """
    for tag in tags:
        lis = []
        path = "books//" + tag + ".txt"
        urls_list = tools.read(path)
        for url in urls_list:
            out = url.split(',')
            out.append(tag)
            output = ','.join(out)
            lis.append(output)
        tools.truncatefile(path)
        for li in lis:
            tools.write(path, li)
Example no. 6
def step_seven():
    """
    While inserting into the database, some books failed because the name field was too long,
    so run this step before converting to SQL statements: discard books whose name is longer than 60 characters.
    :return:
    """
    for tag in tags:
        new_books = []
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        for book in books:
            lis = book.split(',')
            name = lis[0]
            if len(name) > 60:
                continue
            new_books.append(book)
        tools.truncatefile(path)
        for book in new_books:
            tools.write(path, book)
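Dropping the whole record because the name is too long loses data; if it fits the schema, an alternative is to truncate the name to the 60-character limit instead. A sketch of that variant (the byte-length note is an assumption about the column definition):

def clip_name(book, limit=60):
    # Keep the record but truncate the name field to the column limit
    # instead of discarding the whole book.
    # Note: len() and slicing count characters; if the database column limit
    # is measured in bytes, compare len(name.encode('utf-8')) instead.
    lis = book.split(',')
    lis[0] = lis[0][:limit]
    return ','.join(lis)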
Example no. 7
def transform():
    """
    Convert the txt files into a csv file, randomly sampling 200 records per tag; if a tag has fewer than 200 records, take all of them.
    :return:
    """
    tool.truncatefile("test.csv")
    name = [
        'click', 'name', 'author', 'img', 'price', 'publish_time', 'score',
        'judge', 'rec_most', 'rec_more', 'rec_normal', 'rec_bad',
        'rec_morebad', 'readed', 'reading', 'readup', 'mess', 'tag'
    ]
    data = []
    for tag in tags:
        path = "..//DoubanData//books//" + tag + ".txt"
        one_tag = tool.read(path)
        needed = 200
        if len(one_tag) < needed:
            needed = len(one_tag)
        books = random.sample(one_tag, needed)
        for book in books:
            clicked = 0
            one = book.split(',')
            if float(one[5]) > 9.5 and int(one[6]) > 1000:
                clicked = 1
            if int(one[6]) > 100000:
                clicked = 1
            if float(one[5]) > 7 and int(one[6]) > 1000:
                clicked = 1
            if clicked == 0:
                rd = random.randint(0, 15)
                if rd > 10:
                    clicked = 1
            else:
                rd = random.randint(0, 15)
                if rd > 10:
                    clicked = 0
            data.append([
                clicked, one[0], one[1], one[2], one[3], one[4], one[5],
                one[6], one[7], one[8], one[9], one[10], one[11], one[12],
                one[13], one[14], one[15], one[16]
            ])
    test = pd.DataFrame(columns=name, data=data)
    test.to_csv("test.csv", index=None)
Example no. 8
def step_one():
    """
    Clean the data: the crawler sometimes fails, and on failure only the url is written to the file.
    Failed records are always shorter than 50 characters, while successful records are longer than 50.
    :return:
    """
    count = 0
    for tag in tags:
        path = "books//" + tag + ".txt"
        old_books = tools.read(path)
        new_books = []
        for old_book in old_books:
            if len(old_book) > 50:
                new_books.append(old_book)
        tools.truncatefile(path)
        for new_book in new_books:
            count += 1
            tools.write(path, new_book)

    print(count)
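The cleaning functions are numbered step_one through step_seven; step_two is only needed when the crawler did not write the tag column, and step_seven's docstring says it should run before the SQL conversion in step_six. A hedged driver sketch with that assumed ordering:

if __name__ == "__main__":
    step_one()    # drop failed crawls
    step_two()    # only if the tag column is missing from the raw files
    step_three()  # drop records that do not have exactly 17 fields
    step_four()   # keep only the numeric part of the price
    step_five()   # de-duplicate
    step_seven()  # drop books with names longer than 60 characters
    step_six()    # convert the cleaned records to SQL statements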
Example no. 9
def step_four():
    """
    Normalize the price field: some prices are not in RMB, so keep only the numeric part of the price to make it easier to store in the database.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        out = []
        for book in books:
            lis = book.split(',')
            price = lis[3]
            # print(re.findall(r"\d+\.?\d*", price))
            try:
                lis[3] = re.findall(r"\d+\.?\d*", price)[0]
                book = ','.join(lis)
                out.append(book)
            except Exception as e:
                print(e, book)
            # lis[6] = int(lis[7]) + int(lis[8]) + int(lis[9]) + int(lis[10]) + int(lis[11])
            # lis[6] = str(lis[6])
        tools.truncatefile(path)
        for book in out:
            tools.write(path, book)
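For reference, the pattern \d+\.?\d* keeps the first run of digits with an optional decimal part; price strings with no digits at all make the [0] lookup raise IndexError, which is what lands in the except branch above:

import re

for price in ["CNY 45.00", "USD29.99", "39.80元", "免费"]:
    print(price, re.findall(r"\d+\.?\d*", price))
# CNY 45.00 ['45.00']
# USD29.99 ['29.99']
# 39.80元 ['39.80']
# 免费 []   <- findall(...)[0] raises IndexError, caught by step_four()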
Example no. 10
import DoubanData.tools as tools
from DoubanData.books import Book
import re

tags = tools.read("tags.txt")
# tags = ["test"]


def step_one():
    """
    Clean the data: the crawler sometimes fails, and on failure only the url is written to the file.
    Failed records are always shorter than 50 characters, while successful records are longer than 50.
    :return:
    """
    count = 0
    for tag in tags:
        path = "books//" + tag + ".txt"
        old_books = tools.read(path)
        new_books = []
        for old_book in old_books:
            if len(old_book) > 50:
                new_books.append(old_book)
        tools.truncatefile(path)
        for new_book in new_books:
            count += 1
            tools.write(path, new_book)

    print(count)


def step_two():
Example no. 11
import random

import DoubanData.tools as tool
import numpy as np
import pandas as pd

tags = tool.read("..//DoubanData//tags.txt")


def combine():
    """
    Merge the books from all tags into a single file.
    :return:
    """
    books = []
    for tag in tags:
        path = "..//DoubanData//books//" + tag + ".txt"
        one_tag = tool.read(path)
        for one in one_tag:
            books.append(one)
    for book in books:
        tool.write("data//all.txt", book)


def transform():
    """
    Convert the txt files into a csv file, randomly sampling 200 records per tag; if a tag has fewer than 200 records, take all of them.
    :return:
    """
    tool.truncatefile("test.csv")
    name = [