def _search_pharse_func_tester(pharse, doc_id):
    """Manual test helper: preprocess each token of *pharse*, run
    search_pharse against *doc_id*, and print the result to stdout."""
    engine = Token_Preprocessing_Engine()
    terms = [engine.process_token(tok) for tok in pharse.split()]
    send_stdout(search_pharse(terms, doc_id))
def process_query(query):
    """Split *query* on whitespace and normalize each token.

    When the global STEMMER flag is set, tokens are stemmed/lemmatized via
    Token_Preprocessing_Engine; otherwise they are simply lowercased.

    Returns:
        list[str]: the normalized query terms, in original order.
    """
    if STEMMER:
        # Initialize the stemmer (Lemmatizer) once, outside the token loop —
        # the original re-tested the STEMMER flag on every token.
        st = Token_Preprocessing_Engine()
        return [st.process_token(token) for token in query.split()]
    return [token.lower() for token in query.split()]
def main():
    """Build zone indexes for every file in [document dir] and write the
    sorted term -> posting lines to ZONE_INDEX_FILE in [index dir].

    Exits early (with a message on stdout) when the document directory is
    missing or the index file already exists.
    """
    global st
    # read arguments
    args = parse_arguments()
    # get filenames from the [document dir]
    try:
        doc_files = [f for f in listdir(args.doc_dir)
                     if isfile(join(args.doc_dir, f))]
    except FileNotFoundError:
        send_stdout('Error! No such file or directory "{}".'.format(
            args.doc_dir))
        return
    # refuse to clobber an existing zone-index file
    index_path = join(args.index_dir, ZONE_INDEX_FILE)
    if isfile(index_path):
        send_stdout('Error! Index file "{}" already exist.'.format(index_path))
        return
    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    # read directory -> read doc -> create zone indexes
    read_dir(args.doc_dir, doc_files)
    # write index to file; `with` guarantees the handle is closed even if a
    # write raises (the original leaked the handle in that case)
    with open(index_path, 'w') as f_out:
        for term in sorted(zone_index.keys()):
            f_out.write('{term} {posting}\n'.format(term=term,
                                                    posting=zone_index[term]))
def main():
    """Create language models for each document in [document_dir] and write
    the resulting LM_LMS structure to LM_NAME inside [output_dir].

    Usage: python3 create_lms.py [document_dir] [output_dir]
    Invalid filenames and unreadable documents are skipped with a warning.
    """
    global st
    # read arguments "% ./create_lms [document dir] [output_dir]"
    if len(sys.argv) != 3:
        send_stdout("Usage: python3 {} [document_dir] [output_dir]".format(
            sys.argv[0]))
        return
    # get filenames from the [document dir]
    try:
        DOC_DIR = sys.argv[1]
        docs = [f for f in listdir(DOC_DIR) if isfile(join(DOC_DIR, f))]
    except FileNotFoundError:
        send_stdout('Error! No such file or directory "{}".'.format(DOC_DIR))
        return
    # check whether the index file already exist in the [output_dir]
    LM_FILE = join(sys.argv[2], LM_NAME)
    if isfile(LM_FILE):
        send_stdout('Error! LM file "{}" already exist.'.format(LM_FILE))
        return
    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    skipped_docs = []           # docs whose read/MLE step raised
    invalid_filename_docs = []  # docs whose filename failed validation
    f_num = len(docs)
    for i, fname in enumerate(docs, start=1):
        success, docID = filename_validation(fname)
        if not success:
            invalid_filename_docs.append(fname)
            continue
        try:
            # read file, and create language models (calculate MLE)
            read_file(join(DOC_DIR, fname), docID)
        except Exception:
            skipped_docs.append(fname)
            continue
        # update progress bar (only after a successful read, as before)
        progress(i, f_num)
    send_stdout()
    # show invalid document name/format to stdout
    if invalid_filename_docs:
        send_stdout('Warning! Invalid document name format:')
        send_stdout('{}, Skipped.'.format(invalid_filename_docs))
    if skipped_docs:
        send_stdout('Warning! Cannot process the following doc(s):')
        send_stdout('{}, Skipped.'.format(skipped_docs))
    # write index to file; `with` closes the handle even if the write raises
    with open(LM_FILE, 'w') as f_out:
        f_out.write(str(LM_LMS))
def main():
    """Score every document against a boolean query over the TITLE and BODY
    zones and print the top-k (doc_id, score) pairs.

    Zone scoring: a title match contributes args.g, a body match
    contributes (1 - args.g).
    """
    global st, documents
    # read arguments
    args = parse_arguments()
    # query validation
    if not validate_query(args.q):
        send_stdout('Error! Invalided boolean query.')
        sys.exit()
    # open index file
    path = join(args.index_dir, ZONE_INDEX_FILE)
    try:
        f = open(path)
    except FileNotFoundError:
        send_stdout('Error! Zone index file "{}" does not exits.'.format(path))
        sys.exit()
    # read index; close the handle in `finally` — the original leaked it on
    # every sys.exit() error path and only closed it at the very end
    send_stdout("Reading zone index ...")
    try:
        read_index(f)
    except Exception as e:
        print(e)
        send_stdout('Error! Invalided zone index file format.')
        sys.exit()
    finally:
        f.close()
    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    # query preprocessing
    p_query = preprocessing_query(args.q)
    # parse query
    lisp_bool_query = str(searchExpr.parseString(p_query)[0])
    send_stdout("Pharsed Boolean Query: {}.".format(lisp_bool_query))
    # find document that satisfied the boolean query
    send_stdout("Searching and scoring ...")
    result = {}
    for doc_id in documents:
        score = 0
        if query_valuation(lisp_bool_query, doc_id, TITLE):
            score += args.g
        if query_valuation(lisp_bool_query, doc_id, BODY):
            score += 1 - args.g
        result[doc_id] = score
    # sort by score, descending, and print the top-k results
    k_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
    for d, s in k_result[:min(args.k, len(k_result))]:
        send_stdout('{id} \t {score}'.format(id=d, score=s))
def main():
    """Run a vector-space (cosine-similarity) query against the index and
    print the top-k document ids, optionally with their scores
    (args.score == 'y').
    """
    # read arguments
    args = parse_arguments()
    if args.score not in ('y', 'n'):
        send_stdout('Error! arg "scores" should be either y or n')
        sys.exit()
    # open index file
    path = join(args.path, INDEX_FILE)
    try:
        f = open(path)
    except FileNotFoundError:
        send_stdout('Error! Index file "{}" does not exits.'.format(path))
        sys.exit()
    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
        query = [st.process_token(t) for t in args.terms]
    else:
        query = [t.lower() for t in args.terms]
    # read index; the original used a bare `except:` which also swallowed
    # SystemExit/KeyboardInterrupt — narrow it to Exception, and close the
    # handle in `finally` so it is not leaked on the sys.exit() path
    try:
        read_index(f)
    except Exception:
        send_stdout('Error! Invalided index file format.')
        sys.exit()
    finally:
        f.close()
    # compute vector space scores and print the top-k, highest first
    score = cosine_score(query)
    k_score = sorted(score.items(), key=lambda x: x[1], reverse=True)
    for d, s in k_score[:min(args.k, len(k_score))]:
        if args.score == 'y':
            send_stdout('{id} \t {score}'.format(id=d, score=s))
        else:
            send_stdout('{id}'.format(id=d))
def main():
    """Build a positional index from every 'doc_<id>...' file in [dir] and
    write the sorted term -> index lines to INDEX_FILE.

    Usage: python indexer.py [dir]
    Files with an unexpected name or that fail to parse are skipped with a
    warning.
    """
    global st
    # read arguments
    if len(sys.argv) != 2:
        send_stdout("format: python {} [dir]".format(sys.argv[0]))
        return
    # get filenames from the [dir]
    path = sys.argv[1]
    try:
        files = [f for f in listdir(path) if isfile(join(path, f))]
    except FileNotFoundError:
        send_stdout('Error! No such file or directory "{}".'.format(path))
        return
    # check whether the index file already exist
    if isfile(INDEX_FILE):
        send_stdout('Error! Index file "{}" already exist.'.format(INDEX_FILE))
        return
    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    skipped_files = []
    f_num = len(files)
    for i, fname in enumerate(files, start=1):
        # filename validation: expect "doc_<docID>[_...]"
        finfo = fname.split(sep='_', maxsplit=2)
        if finfo[0] != 'doc':
            skipped_files.append(fname)
            continue
        try:
            # read file, and create indexes (bad docID also lands here)
            read_file(join(path, fname), int(finfo[1]))
        except Exception:
            skipped_files.append(fname)
            continue
        # update progress bar (only after a successful read, as before)
        progress(i, f_num)
    send_stdout()
    if skipped_files:
        send_stdout('Warning! Cannot index the following file(s):')
        send_stdout('{}, Skipped.'.format(skipped_files))
    # write index to file; `with` guarantees the handle is closed even if a
    # write raises (the original leaked the handle in that case)
    with open(INDEX_FILE, 'w') as f_out:
        for term in sorted(positional_index.keys()):
            f_out.write('{term} {index}\n'.format(
                term=term, index=positional_index[term]))
def main():
    """Evaluate a boolean query against the positional index and print the
    list of matching document ids.
    """
    global st, documents
    # read arguments
    args = parse_arguments()
    # query validation
    if not validate_query(args.query):
        send_stdout('Error! Invalided boolean query.')
        sys.exit()
    # open index file
    path = join(args.path, INDEX_FILE)
    try:
        f = open(path)
    except FileNotFoundError:
        send_stdout('Error! Index file "{}" does not exits.'.format(path))
        sys.exit()
    # read index; the original used a bare `except:` which also swallowed
    # SystemExit/KeyboardInterrupt — narrow it to Exception, and close the
    # handle in `finally` so it is not leaked on the sys.exit() path
    try:
        read_index(f)
    except Exception:
        send_stdout('Error! Invalided index file format.')
        sys.exit()
    finally:
        f.close()
    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    # query preprocessing
    p_query = preprocessing_query(args.query)
    # parse query into a lisp-style boolean expression
    lisp_bool_query = str(searchExpr.parseString(p_query)[0])
    send_stdout("Pharsed Boolean Query: {}.".format(lisp_bool_query))
    # find document that satisfied the boolean query
    result = [doc_id for doc_id in documents
              if query_valuation(lisp_bool_query, doc_id)]
    send_stdout("Documents: {}.".format(result))