def step_six():
    """
    Convert the cleaned records into SQL statements.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        for book in books:
            bk = Book(book)
            tools.write("to_sql//all.sql", bk.to_sql())
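# A minimal, hypothetical sketch of what the Book wrapper used by step_six()
# might do; the real Book class is defined elsewhere in the project.
# Assumptions (not confirmed by this file): the record is the same
# comma-separated line the other steps operate on, and the table name "book"
# plus the positional column layout are placeholders.
class _BookSketch:
    def __init__(self, record):
        # Split the comma-separated record into its fields.
        self.fields = record.split(',')

    def to_sql(self):
        # Escape single quotes so the generated INSERT stays valid SQL.
        values = ", ".join("'" + f.replace("'", "''") + "'" for f in self.fields)
        return "INSERT INTO book VALUES (" + values + ");"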
def combine():
    """
    Merge the books from every tag into a single file.
    :return:
    """
    books = []
    for tag in tags:
        path = "..//DoubanData//books//" + tag + ".txt"
        one_tag = tool.read(path)
        for one in one_tag:
            books.append(one)
    for book in books:
        tool.write("data//all.txt", book)
def step_five():
    """
    Deduplicate: round-tripping the list through a set is a simple way
    to drop exact duplicates.
    :return:
    """
    for tag in tags:
        print(tag)
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        print(len(books))
        books = list(set(books))
        print(len(books))
        tools.truncatefile(path)
        for book in books:
            tools.write(path, book)
def step_three():
    """
    A normal record has 17 fields, so treat any record whose field count
    is not 17 as an outlier and simply discard it.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        urls = tools.read(path)
        out = []
        for url in urls:
            lis = url.split(',')
            if len(lis) == 17:
                out.append(url)
        tools.truncatefile(path)
        for url in out:
            tools.write(path, url)
def step_two():
    """
    If the crawler did not write the tag as the last column, run this function
    to append the tag to every record.
    :return:
    """
    for tag in tags:
        lis = []
        path = "books//" + tag + ".txt"
        urls_list = tools.read(path)
        for url in urls_list:
            out = url.split(',')
            out.append(tag)
            output = ','.join(out)
            lis.append(output)
        tools.truncatefile(path)
        for li in lis:
            tools.write(path, li)
def step_seven():
    """
    Some database inserts failed because the book title field was too long,
    so before converting records to SQL statements, discard any record
    whose title is longer than 60 characters.
    :return:
    """
    for tag in tags:
        new_books = []
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        for book in books:
            lis = book.split(',')
            name = lis[0]
            if len(name) > 60:
                continue
            new_books.append(book)
        tools.truncatefile(path)
        for book in new_books:
            tools.write(path, book)
def step_one():
    """
    Clean the data: when a crawl fails only the url is written to the file,
    so failed records are always shorter than 50 characters while successful
    ones are longer; keep only the records longer than 50 characters.
    :return:
    """
    count = 0
    for tag in tags:
        path = "books//" + tag + ".txt"
        old_books = tools.read(path)
        new_books = []
        for old_book in old_books:
            if len(old_book) > 50:
                new_books.append(old_book)
        tools.truncatefile(path)
        for new_book in new_books:
            count += 1
            tools.write(path, new_book)
    print(count)
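# A minimal sketch of the tools helpers these steps assume, inferred only from
# how they are called here (the real tools module may differ): read() returns
# one stripped line per record, write() appends a single line, and
# truncatefile() empties the file before it is rewritten.
def _sketch_read(path):
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f if line.strip()]

def _sketch_write(path, line):
    with open(path, "a", encoding="utf-8") as f:
        f.write(line + "\n")

def _sketch_truncatefile(path):
    # Opening in "w" mode and closing immediately empties the file.
    open(path, "w", encoding="utf-8").close()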
def step_four():
    """
    Normalize the price field: some prices are not in RMB, so keep only
    the numeric part of the price so it can be stored in the database.
    :return:
    """
    for tag in tags:
        path = "books//" + tag + ".txt"
        books = tools.read(path)
        out = []
        for book in books:
            lis = book.split(',')
            price = lis[3]
            # print(re.findall(r"\d+\.?\d*", price))
            try:
                lis[3] = re.findall(r"\d+\.?\d*", price)[0]
                book = ','.join(lis)
                out.append(book)
            except Exception as e:
                print(e, book)
            # lis[6] = int(lis[7]) + int(lis[8]) + int(lis[9]) + int(lis[10]) + int(lis[11])
            # lis[6] = str(lis[6])
        tools.truncatefile(path)
        for book in out:
            tools.write(path, book)
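# Hypothetical driver showing the run order implied by the step names and
# docstrings (step_seven is meant to run before step_six); the exact order is
# an assumption, and combine() is left out because it uses a different path
# layout. As a quick check of the price regex in step_four:
# re.findall(r"\d+\.?\d*", "USD 29.99")[0] yields "29.99".
if __name__ == "__main__":
    step_one()    # drop failed crawls (short lines)
    step_two()    # append the tag column if it is missing
    step_three()  # keep only 17-field records
    step_four()   # keep the numeric part of the price
    step_five()   # deduplicate
    step_seven()  # drop records whose title exceeds 60 characters
    step_six()    # convert the cleaned records to SQL statements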