def chinese_train(cl):
    """Train classifier *cl* on every .xlsx workbook under TRAIN_DIR.

    Expected row layout: category (col 0), url (col 1), page text
    (col 2).  Rows with a missing/empty category, url or content, or
    whose content contains no Chinese characters, are skipped.
    NOTE(review): unlike the later chinese_train variant this one does
    not skip a header row -- confirm these training sheets are headerless.
    """
    for path in glob.glob(TRAIN_DIR + "*.xlsx"):
        print(path)
        excel_file = xlrd.open_workbook(path)
        # Public sheet_names() API instead of the private _sheet_names
        # attribute, which is an xlrd implementation detail.
        for sheet_name in excel_file.sheet_names():
            sheet = excel_file.sheet_by_name(sheet_name)
            for i in range(sheet.nrows):
                try:
                    cat = sheet.cell(colx=0, rowx=i).value
                    cat = str(cat).decode('utf-8', 'ignore')
                    cat = excel_to_json.purify_search_word(cat)
                    cat = cat.strip()
                    if not cat:
                        continue
                except IndexError:
                    # Row shorter than expected -- no category column.
                    continue
                try:
                    url = sheet.cell(colx=1, rowx=i).value
                    url = str(url).decode('utf-8', 'ignore')
                    url = url.strip()
                    if not url:
                        continue
                except IndexError:
                    continue
                print(url)
                try:
                    content = sheet.cell(colx=2, rowx=i).value
                    content = str(content).decode('utf-8', 'ignore')
                    content = excel_to_json.purify_search_word(content)
                    content = content.strip()
                    if not content:
                        continue
                except IndexError:
                    continue
                # BUG FIX: the original re-decoded `content`, which is
                # already a decoded unicode string; in Python 2 that
                # triggers an implicit ASCII encode and raises
                # UnicodeEncodeError on any Chinese text.  Use the
                # decoded string directly.
                body = content.replace('\x01', ' ')
                # Only train on rows that actually contain Chinese text.
                if not chinese.search(body):
                    continue
                cl.train(body, cat)
def excel_test(file_name):
    """Classify every data row of workbook *file_name* and print
    "<category>\\t<probability>\\t<url>" per row.

    Column layout: url (0), title (1), keywords (2), description (3),
    anchor text (4), paragraph text (5).  Row 0 is treated as a header
    and skipped.  Rows without a url or title are ignored; keywords,
    description and content are optional.

    NOTE(review): relies on a module-level classifier ``cl`` -- confirm
    it is initialised before this is called.
    """

    def read_cell(sheet, rowx, colx, purify=True):
        # Return the cell text as a stripped unicode string, optionally
        # run through purify_search_word; None when the column does not
        # exist in this row (xlrd raises IndexError).
        try:
            value = sheet.cell(colx=colx, rowx=rowx).value
        except IndexError:
            return None
        value = str(value).decode('utf-8', 'ignore')
        if purify:
            value = excel_to_json.purify_search_word(value)
        return value.strip()

    if not os.path.exists(file_name):
        print("file: %s not exists" % file_name)
        return
    excel_file = xlrd.open_workbook(file_name)
    for sheet_name in excel_file.sheet_names():
        print("Sheet name: %s" % sheet_name)
        sheet = excel_file.sheet_by_name(sheet_name)
        for i in range(sheet.nrows):
            if i == 0:  # header row
                continue
            url = read_cell(sheet, i, 0, purify=False)
            if not url:  # missing column or empty cell
                continue
            print(url)
            title = read_cell(sheet, i, 1)
            if not title:
                continue
            # Optional fields: a missing column leaves them None and
            # they are simply omitted from the item below.
            keywords = read_cell(sheet, i, 2)
            description = read_cell(sheet, i, 3)
            a_content = read_cell(sheet, i, 4)
            p_content = read_cell(sheet, i, 5)
            content = None
            if a_content is not None and p_content is not None:
                # Replace \x01 control chars (presumably a field
                # separator from the crawler -- TODO confirm).
                content = (a_content + p_content).replace('\x01', ' ')
            item = dict()
            item[Classifier.KEY_TITLE] = title
            if keywords is not None:
                item[Classifier.KEY_KEYWORDS] = keywords
            if description is not None:
                item[Classifier.KEY_DESCRIPTION] = description
            if content is not None:
                item[Classifier.KEY_CONTENT] = content
            cat, prob = cl.classify(item, default=Classifier.UNKNOWN_CATEGORY)
            print(u"\t".join([cat, str(prob), url]))
def chinese_train(cl):
    """Train classifier *cl* from the structured training workbooks.

    Iterates every workbook matching TRAIN_DIR + EXCEL_PREFIX.  Column
    layout: category (0), url (1), title (2), keywords (3),
    description (4), anchor text (5), paragraph text (6).  Row 0 is a
    header.  A row is only used when every column is present and the
    category, url and title are non-empty (matching the original's
    ``continue`` on any IndexError).
    """

    def read_cell(sheet, rowx, colx, purify=True):
        # Stripped unicode cell text (optionally purified); None when
        # the column is missing in this row (xlrd raises IndexError).
        try:
            value = sheet.cell(colx=colx, rowx=rowx).value
        except IndexError:
            return None
        value = str(value).decode('utf-8', 'ignore')
        if purify:
            value = excel_to_json.purify_search_word(value)
        return value.strip()

    for path in glob.glob(TRAIN_DIR + EXCEL_PREFIX):
        print(path)
        excel_file = xlrd.open_workbook(path)
        # Public sheet_names() API instead of the private _sheet_names
        # attribute, which is an xlrd implementation detail.
        for sheet_name in excel_file.sheet_names():
            sheet = excel_file.sheet_by_name(sheet_name)
            for i in range(sheet.nrows):
                if i == 0:  # header row
                    continue
                cat = read_cell(sheet, i, 0)
                if not cat:  # missing column or empty category
                    continue
                url = read_cell(sheet, i, 1, purify=False)
                if not url:
                    continue
                print(url)
                title = read_cell(sheet, i, 2)
                if not title:
                    continue
                # In this variant every remaining field is required:
                # a missing column skips the whole row.
                keywords = read_cell(sheet, i, 3)
                if keywords is None:
                    continue
                description = read_cell(sheet, i, 4)
                if description is None:
                    continue
                a_content = read_cell(sheet, i, 5)
                p_content = read_cell(sheet, i, 6)
                if a_content is None or p_content is None:
                    continue
                # Replace \x01 control chars (presumably a field
                # separator from the crawler -- TODO confirm).
                content = (a_content + p_content).replace('\x01', ' ')
                # All fields are guaranteed non-None here, so the
                # original's per-key None guards were dead code.
                item = {
                    Classifier.KEY_TITLE: title,
                    Classifier.KEY_KEYWORDS: keywords,
                    Classifier.KEY_DESCRIPTION: description,
                    Classifier.KEY_CONTENT: content,
                }
                cl.train(item, cat)
TEST_EXCEL_PATTERN): # print(file_name) excel_file = xlrd.open_workbook(file_name) for sheet_name in excel_file._sheet_names: sheet = excel_file.sheet_by_name(sheet_name) for i in range(sheet.nrows): cat = None url = None title = None keywords = None description = None content = None try: cat = sheet.cell(colx=0, rowx=i).value cat = str(cat).decode('utf-8', 'ignore') cat = excel_to_json.purify_search_word(cat) cat = cat.strip() if not cat: continue except IndexError: continue try: url = sheet.cell(colx=1, rowx=i).value url = str(url).decode('utf-8', 'ignore') url = url.strip() if not url: continue except IndexError: continue