Example #1

# Assumed imports for this example; `tool`/`tools` (file I/O helpers) and
# `tags` (the list of crawled categories) are project-local and not shown.
import pandas as pd
from sklearn.model_selection import train_test_split
def feature_transform():
    # Drop the free-text columns from the crawled data and write the
    # model-ready file; truncatefile resets train.csv before the rewrite.
    tool.truncatefile("train.csv")
    df1 = pd.read_csv("test.csv")
    df2 = df1.drop(
        labels=['name', 'author', 'img', 'publish_time', 'mess', 'tag'],
        axis=1)
    df2.to_csv("train.csv", index=None)
def split_data():
    data = pd.read_csv("train.csv")
    # 80/20 split; a fixed random_state keeps the split reproducible.
    train, test = train_test_split(data, test_size=0.2, random_state=20)
    tool.truncatefile("train_data.csv")
    tool.truncatefile("test_data.csv")
    # index=None (not the string "None", which is truthy) suppresses the index column.
    train.to_csv("train_data.csv", index=None)
    test.to_csv("test_data.csv", index=None)
    return train, test
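
For context, a minimal driver for the two steps above might look like the sketch below; the function names and csv layout come from the snippets above, while `main` itself is a hypothetical entry point, not part of the original module.

def main():
    feature_transform()          # test.csv -> train.csv, text columns dropped
    train, test = split_data()   # 80/20 split with a fixed seed
    print(len(train), len(test))

if __name__ == "__main__":
    main()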
def step_five():
    """
    Deduplicate: a simple de-dup that puts the lines into a set and back into a list.
    :return:
    """
    for tag in tags:
        print(tag)
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        print(len(books))
        books = list(set(books))
        print(len(books))
        tools.truncatefile(path)
        for book in books:
            tools.write(path, book)
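
Note that list(set(books)) also discards the original file order. If order matters, a minimal order-preserving alternative (assuming Python 3.7+, where dicts keep insertion order) is:

def dedup_keep_order(lines):
    # dict.fromkeys keeps the first occurrence of each line, in order.
    return list(dict.fromkeys(lines))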
def step_three():
    """
    Normally each book has 17 fields, so any record whose field count is not 17 is treated as an outlier; here such records are simply discarded.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        urls = tools.read(path)
        out = []
        for url in urls:
            lis = url.split(',')
            if len(lis) == 17:
                out.append(url)
        tools.truncatefile(path)
        for url in out:
            tools.write(path, url)
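
Splitting on ',' assumes no field itself contains a comma; if fields can be quoted, the csv module counts them correctly. A hedged sketch of the same 17-field check (assuming the lines are valid csv):

import csv
import io

def has_17_fields(line):
    # csv.reader respects quoted commas, unlike a plain str.split(',').
    return len(next(csv.reader(io.StringIO(line)))) == 17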
def step_two():
    """
    If the last column was not the tag at crawl time, run this function to append the tag to each record.
    :return:
    """
    for tag in tags:
        lis = []
        path = "books//" + tag + ".txt"
        urls_list = tools.read(path)
        for url in urls_list:
            out = url.split(',')
            out.append(tag)
            output = ','.join(out)
            lis.append(output)
        tools.truncatefile(path)
        for li in lis:
            tools.write(path, li)
def step_seven():
    """
    Some database inserts failed because the book-name field was too long,
    so run this step before generating the SQL statements: discard records
    whose name field is longer than 60 characters.
    :return:
    """
    for tag in tags:
        new_books = []
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        for book in books:
            lis = book.split(',')
            name = lis[0]
            if len(name) > 60:
                continue
            new_books.append(book)
        tools.truncatefile(path)
        for book in new_books:
            tools.write(path, book)
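
One caveat: len(name) counts characters, while a database limit such as VARCHAR(60) may count bytes depending on the column's charset. If the limit turns out to be byte-based, a sketch of the check would measure the encoded length instead (the 60-byte limit here is an assumption mirroring the character cutoff above):

def name_too_long(name, limit=60, encoding="utf-8"):
    # Multi-byte characters make the byte length exceed the character length.
    return len(name.encode(encoding)) > limit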
Example #7

# Assumed imports for this example; `tool`/`tools` and `tags` are
# project-local helpers, as in Example #1.
import random
import re

import pandas as pd
def transform():
    """
    Convert the txt files into a csv file, randomly sampling 200 records per tag; if a tag has fewer than 200 records, take all of them.
    :return:
    """
    tool.truncatefile("test.csv")
    name = [
        'click', 'name', 'author', 'img', 'price', 'publish_time', 'score',
        'judge', 'rec_most', 'rec_more', 'rec_normal', 'rec_bad',
        'rec_morebad', 'readed', 'reading', 'readup', 'mess', 'tag'
    ]
    data = []
    for tag in tags:
        path = "..//DoubanData//books//" + tag + ".txt"
        one_tag = tool.read(path)
        # Sample up to 200 records per tag; take everything if fewer exist.
        books = random.sample(one_tag, min(len(one_tag), 200))
        for book in books:
            one = book.split(',')
            # Heuristic click label from score (one[5]) and rating count (one[6]).
            clicked = 0
            if float(one[5]) > 9.5 and int(one[6]) > 1000:
                clicked = 1
            if int(one[6]) > 100000:
                clicked = 1
            # Note: this condition subsumes the score > 9.5 check above.
            if float(one[5]) > 7 and int(one[6]) > 1000:
                clicked = 1
            # Flip the label with probability 5/16 in either direction so the
            # labels are not a pure function of the thresholds.
            if random.randint(0, 15) > 10:
                clicked = 1 - clicked
            # `one` has exactly 17 fields after the earlier cleaning steps.
            data.append([clicked] + one)
    test = pd.DataFrame(columns=name, data=data)
    test.to_csv("test.csv", index=None)
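
The labelling above is a threshold heuristic with deliberate noise: random.randint(0, 15) > 10 holds for 5 of the 16 equally likely values, so roughly 31% (5/16) of labels are flipped in each direction. A quick standalone sanity check of that rate:

import random

flips = sum(random.randint(0, 15) > 10 for _ in range(100_000))
print(flips / 100_000)  # ~0.3125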
def step_one():
    """
    Clean the data: when a crawl failed, only the url was written to the file,
    so failed records are always shorter than 50 characters while successful
    ones are longer.
    :return:
    """
    count = 0
    for tag in tags:
        path = "books//" + tag + ".txt"
        old_books = tools.read(path)
        new_books = []
        for old_book in old_books:
            if len(old_book) > 50:
                new_books.append(old_book)
        tools.truncatefile(path)
        for new_book in new_books:
            count += 1
            tools.write(path, new_book)

    print(count)
def step_four():
    """
    Normalize the price field: some prices are not in RMB, so keep only the numeric part of the price, which makes it easier to store in the database.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        out = []
        for book in books:
            lis = book.split(',')
            price = lis[3]
            # Keep the first numeric run, e.g. 'USD 29.99' -> '29.99'.
            try:
                lis[3] = re.findall(r"\d+\.?\d*", price)[0]
                book = ','.join(lis)
                out.append(book)
            except Exception as e:
                print(e, book)
        tools.truncatefile(path)
        for book in out:
            tools.write(path, book)
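
To make the regex concrete, a few illustrative inputs (not taken from the dataset) and what re.findall(r"\d+\.?\d*", price)[0] keeps:

import re

for price in ["USD 29.99", "45.00元", "JPY 1200"]:
    # The first run of digits, with an optional decimal part, wins.
    print(re.findall(r"\d+\.?\d*", price)[0])  # 29.99, 45.00, 1200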