def parseLine(line):
    """Split one tab-separated record into its five leading fields.

    The input columns are read in the order
    ``url, category, subcategory, title, content``. Any columns after the
    fifth are ignored, and surrounding whitespace (including a trailing
    newline) is stripped from every returned field.

    Args:
        line: A single text record whose fields are separated by tab
            characters.

    Returns:
        A tuple ``(category, url, title, content, subcategory)`` -- note
        that this order differs from the column order of the input.

    Raises:
        IndexError: If the record has fewer than five fields and
            ``Utils.error`` returns instead of aborting.
    """
    fields = line.split("\t")

    # Five fields correspond to four tab separators, which is what the
    # (unchanged) error message refers to.
    if len(fields) < 5:
        # NOTE(review): Utils.error is defined outside this block; it
        # presumably aborts or raises -- confirm. If it returns, the indexing
        # below fails with IndexError on the first missing field.
        Utils.error("len(pos)<4")

    url = fields[0].strip()
    category = fields[1].strip()
    subcategory = fields[2].strip()
    title = fields[3].strip()
    content = fields[4].strip()

    return category, url, title, content, subcategory