# nltk.download("punkt") # nltk.download("maxnet_treebank_pos_tagger") #设置新添的下载的nltk依赖库路径 data.path.append(r"nltk_data") DIRECTNAME = 'Reuters' # print("establishing the INDEX...") # establishIndex.createIndex(DIRECTNAME) print("getting word list...") WORDLIST = getIndex.getWordList() print("getting index...") INDEX = getIndex.getIndex() print("loading the wordnet...") stemming.lemmatize_sentence("a", False) PATH = tools.projectpath + DIRECTNAME FILES = os.listdir(tools.reuterspath) FILENUM = len(FILES) LOOP = True print("=================Searching System=================") while LOOP: print("searching operation: ") print( "[1] Overall [2]TOP K [3]BOOL [4]Phrase [5]wildcard [6]synonyms [7]exit" ) print("your choice(int):") try:
def preProcess(filename):
    """Read a file and run its full text through the lemmatizer.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The value returned by ``stemming.lemmatize_sentence(content, False)``
        for the complete file content.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is closed even when read() fails;
    # the previous version left the file open.
    with open(filename, 'r') as handle:
        content = handle.read()
    return stemming.lemmatize_sentence(content, False)
# Interactive boolean-search loop: keeps prompting for queries until the user
# types EXIT, and prints at most K hits for each query.
K = 10  # upper bound on the number of hits printed per query

while LOOP:
    print("---------> Start search No. {0}".format(SEARCH_NUM))

    # Synonym retrieval is enabled only by an exact "Y" answer.
    print("Enable Synonym Retrieval?[Y]/[N] ", end='')
    choose = input()
    syn_FLAG = choose == "Y"

    print("input the query statement(EXIT to quit): ", end='')
    STATEMENT = input()
    if STATEMENT == "EXIT":
        break

    print("stemming...")
    INPUT_WORDS = stemming.lemmatize_sentence(STATEMENT, True)
    DOC_LIST = BoolSearchDel.bool_search(DOC_NUM, INPUT_WORDS, INDEX, syn_FLAG)
    print("Found {0} document(s) that matched query".format(len(DOC_LIST)))

    # Element 0 of each hit is printed as the document name, element 1 as
    # its score; only the first K hits are shown.
    for hit in DOC_LIST[:K]:
        print("doc name:{0}.html, score = {1}".format(hit[0], hit[1]))

    SEARCH_NUM += 1

print("ByeBye!")
def preProcess(filename):
    """Read a file, run its text through the lemmatizer and print the result.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        None. The value returned by ``stemming.lemmatize_sentence(content)``
        is written to standard output.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is closed even when read() fails;
    # the previous version left the file open.
    with open(filename, 'r') as handle:
        content = handle.read()
    words = stemming.lemmatize_sentence(content)
    print(words)