def feature_transform():
    # Rebuild train.csv from test.csv, keeping only the columns usable as features.
    tool.truncatefile("train.csv")
    df1 = pd.read_csv("test.csv")
    # Drop the text columns that the model cannot use directly.
    df2 = df1.drop(
        labels=['name', 'author', 'img', 'publish_time', 'mess', 'tag'],
        axis=1)
    df2.to_csv("train.csv", index=None)
def split_data():
    # Split the feature file into an 80/20 train/test split.
    data = pd.read_csv("train.csv")
    train, test = train_test_split(data, test_size=0.2, random_state=20)
    tool.truncatefile("train_data.csv")
    tool.truncatefile("test_data.csv")
    train.to_csv("train_data.csv", index=None)
    test.to_csv("test_data.csv", index=None)
    return train, test
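
# A minimal sketch of how the two functions above might be chained; run_csv_pipeline
# is a hypothetical helper and not part of the original project.
def run_csv_pipeline():
    # Rebuild train.csv from test.csv, then split it 80/20.
    feature_transform()
    train, test = split_data()
    print(train.shape, test.shape)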
def step_five():
    """
    Deduplicate: putting the list into a set and back into a list is a simple
    way to drop duplicate records.
    :return:
    """
    for tag in tags:
        print(tag)
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        print(len(books))
        books = list(set(books))
        print(len(books))
        tools.truncatefile(path)
        for book in books:
            tools.write(path, book)
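
# The step_* functions rely on a small `tools` helper module (read, truncatefile,
# write) that is not shown in this file. The sketch below is a hypothetical
# reconstruction based only on how those helpers are called; the real project's
# tools module may differ.
class _ToolsSketch:
    @staticmethod
    def read(path):
        # Return the file as a list of lines without trailing newlines.
        with open(path, encoding="utf-8") as f:
            return [line.rstrip("\n") for line in f]

    @staticmethod
    def truncatefile(path):
        # Empty the file, creating it if it does not exist.
        open(path, "w", encoding="utf-8").close()

    @staticmethod
    def write(path, line):
        # Append a single line to the file.
        with open(path, "a", encoding="utf-8") as f:
            f.write(line + "\n")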
def step_three():
    """
    Normally every book has 17 features, so treat any record whose field count
    is not 17 as an outlier; here such records are simply discarded.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        urls = tools.read(path)
        out = []
        for url in urls:
            lis = url.split(',')
            if len(lis) == 17:
                out.append(url)
        tools.truncatefile(path)
        for url in out:
            tools.write(path, url)
def step_two():
    """
    If the last column is not the tag after crawling, run this function to
    append the tag to the end of every record.
    :return:
    """
    for tag in tags:
        lis = []
        path = "books//" + tag + ".txt"
        urls_list = tools.read(path)
        for url in urls_list:
            out = url.split(',')
            out.append(tag)
            output = ','.join(out)
            lis.append(output)
        tools.truncatefile(path)
        for li in lis:
            tools.write(path, li)
def step_seven():
    """
    Some book name fields are too long and make the database insert fail,
    so run this step before converting the records into SQL statements:
    discard any record whose name field is longer than 60 characters.
    :return:
    """
    for tag in tags:
        new_books = []
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        for book in books:
            lis = book.split(',')
            name = lis[0]
            if len(name) > 60:
                continue
            new_books.append(book)
        tools.truncatefile(path)
        for book in new_books:
            tools.write(path, book)
def transform():
    """
    Convert the txt files into a csv file, randomly sampling 200 records per tag;
    if a tag has fewer than 200 records, take all of them.
    :return:
    """
    tool.truncatefile("test.csv")
    name = [
        'click', 'name', 'author', 'img', 'price', 'publish_time', 'score',
        'judge', 'rec_most', 'rec_more', 'rec_normal', 'rec_bad',
        'rec_morebad', 'readed', 'reading', 'readup', 'mess', 'tag'
    ]
    data = []
    for tag in tags:
        path = "..//DoubanData//books//" + tag + ".txt"
        one_tag = tool.read(path)
        needed = 200
        if len(one_tag) < needed:
            needed = len(one_tag)
        books = random.sample(one_tag, needed)
        for book in books:
            clicked = 0
            one = book.split(',')
            # Label a book as clicked when its score and rating count are high enough.
            if float(one[5]) > 9.5 and int(one[6]) > 1000:
                clicked = 1
            if int(one[6]) > 100000:
                clicked = 1
            if float(one[5]) > 7 and int(one[6]) > 1000:
                clicked = 1
            # Randomly flip part of the labels to add noise.
            if clicked == 0:
                rd = random.randint(0, 15)
                if rd > 10:
                    clicked = 1
            else:
                rd = random.randint(0, 15)
                if rd > 10:
                    clicked = 0
            data.append([
                clicked, one[0], one[1], one[2], one[3], one[4], one[5],
                one[6], one[7], one[8], one[9], one[10], one[11], one[12],
                one[13], one[14], one[15], one[16]
            ])
    test = pd.DataFrame(columns=name, data=data)
    test.to_csv("test.csv", index=None)
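
# The click label above is derived from the score (one[5]) and the rating count
# (one[6]) plus random noise. The same rule, pulled out into a standalone helper
# for readability; label_click is hypothetical and not part of the original code.
def label_click(score, judges):
    # A book counts as "clicked" if it is rated above 7 by more than 1000 readers,
    # or simply has more than 100000 ratings.
    clicked = 1 if (score > 7 and judges > 1000) or judges > 100000 else 0
    # Flip roughly 5/16 of the labels at random, matching the noise step above.
    if random.randint(0, 15) > 10:
        clicked = 1 - clicked
    return clicked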
def step_one():
    """
    Clean the data: the crawler sometimes fails, and for a failed request only
    the url is written to the file. Failed records are always shorter than 50
    characters, while successful records are all longer than 50.
    :return:
    """
    count = 0
    for tag in tags:
        path = "books//" + tag + ".txt"
        old_books = tools.read(path)
        new_books = []
        for old_book in old_books:
            if len(old_book) > 50:
                new_books.append(old_book)
        tools.truncatefile(path)
        for new_book in new_books:
            count += 1
            tools.write(path, new_book)
    print(count)
def step_four():
    """
    Normalise the price field: some prices are not in RMB, so keep only the
    numeric part of the price, which also makes it easier to store in the
    database.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        out = []
        for book in books:
            lis = book.split(',')
            price = lis[3]
            # print(re.findall(r"\d+\.?\d*", price))
            try:
                # Keep the first number found in the price string.
                lis[3] = re.findall(r"\d+\.?\d*", price)[0]
                book = ','.join(lis)
                out.append(book)
            except Exception as e:
                print(e, book)
        # lis[6] = int(lis[7]) + int(lis[8]) + int(lis[9]) + int(lis[10]) + int(lis[11])
        # lis[6] = str(lis[6])
        tools.truncatefile(path)
        for book in out:
            tools.write(path, book)
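
# A quick illustration of what the price regex in step_four keeps (the sample
# strings are made up; only the numeric part of the price survives). Assumes `re`
# is imported, as it already is for step_four.
assert re.findall(r"\d+\.?\d*", "USD 29.99") == ['29.99']
assert re.findall(r"\d+\.?\d*", "39.00元") == ['39.00']
assert re.findall(r"\d+\.?\d*", "免费") == []  # indexing [0] raises IndexError, caught above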