def _search_pharse_func_tester(pharse, doc_id):
    terms = []
    t_st = Token_Preprocessing_Engine()
    for token in pharse.split():
        terms.append(t_st.process_token(token))
    result = search_pharse(terms, doc_id)
    send_stdout(result)
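# A minimal sketch of the phrase check exercised by the tester above; the
# module's real search_pharse() may differ. It assumes the global
# positional_index maps term -> {docID: [positions, ...]} (as built by
# read_index) and reports whether the processed terms occur at consecutive
# positions within doc_id.
def _search_pharse_sketch(terms, doc_id):
    if not terms:
        return False
    try:
        # positions where the first term could start the phrase
        starts = set(positional_index[terms[0]][doc_id])
        for offset, term in enumerate(terms[1:], start=1):
            next_pos = set(positional_index[term][doc_id])
            starts = {p for p in starts if p + offset in next_pos}
    except KeyError:
        # a term missing from the index or from this document
        return False
    return len(starts) > 0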
def main():
    global st
    # read arguments
    args = parse_arguments()
    # get filenames from the [document dir]
    try:
        doc_files = [
            f for f in listdir(args.doc_dir) if isfile(join(args.doc_dir, f))
        ]
    except FileNotFoundError as e:
        send_stdout('Error! No such file or directory "{}".'.format(
            args.doc_dir))
        return
    # check whether the index file for zone scoring already exists
    if isfile(join(args.index_dir, ZONE_INDEX_FILE)):
        send_stdout('Error! Index file "{}" already exists.'.format(
            join(args.index_dir, ZONE_INDEX_FILE)))
        return
    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    # read directory -> read docs -> create zone indexes
    read_dir(args.doc_dir, doc_files)
    # write index to file
    f_out = open(join(args.index_dir, ZONE_INDEX_FILE), 'w')
    for term in sorted(zone_index.keys()):
        f_out.write('{term} {posting}\n'.format(term=term,
                                                posting=zone_index[term]))
    f_out.close()
def read_index(f):
    global document_ids
    send_stdout('reading index ...')
    entries = f.readlines()
    for entry in entries:
        # each line has the form: <term> {docID: [position, ...], ...}
        term, index = entry.split(maxsplit=1)
        index = ast.literal_eval(index)
        positional_index[term] = index
        document_ids = document_ids | set(index.keys())
def rw_index(f):
    entries = f.readlines()
    for entry in entries:
        term, index = entry.split(maxsplit=1)
        index_out = []
        index = ast.literal_eval(index)
        for docID in sorted(index.keys()):
            pos = [str(p) for p in index[docID]]
            index_out.append('{0}:{1}'.format(docID, ','.join(pos)))
        # print one line per term: 'term \t docID:pos,pos;docID:pos,...'
        send_stdout('{0} \t {1} '.format(term, ';'.join(index_out)))
def print_lms(f):
    LM_txt = f.readline()
    LM_LMS = ast.literal_eval(LM_txt)
    for docID in sorted(LM_LMS.keys()):
        doc_TF = LM_LMS[docID]
        L = doc_TF[L_TOKEN]
        output = '{} \t '.format(docID)
        for term in doc_TF:
            # skip the reserved document-length key; only real terms get an MLE
            if term == L_TOKEN:
                continue
            # MLE(t|d) = tf(t,d) / L(d)
            output = output + term + ':' + str(doc_TF[term] / L) + ', '
        if output[-2:] == ', ':
            output = output[:-2]
        # FORMAT: 'doc_id \t term_1:term_1_MLE, term_2:term_2_MLE, ...'
        send_stdout(output)
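# A minimal sketch of the language-model file layout that print_lms() expects:
# a single Python literal mapping each docID to its term-frequency dict, with
# the document length stored under the reserved key L_TOKEN. The concrete
# value of L_TOKEN and the docIDs below are illustrative assumptions, not
# taken from the original data.
#
#   {1: {L_TOKEN: 5, 'cat': 2, 'dog': 3},
#    2: {L_TOKEN: 4, 'cat': 1, 'fish': 3}}
#
# With that input, print_lms() would emit MLE(t|d) = tf(t,d) / L(d) per line,
# e.g. '1 \t cat:0.4, dog:0.6'.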
def main():
    global st, documents
    # read arguments
    args = parse_arguments()
    # query validation
    if not validate_query(args.query):
        send_stdout('Error! Invalid boolean query.')
        sys.exit()
    # open index file
    try:
        path = join(args.path, INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Index file "{}" does not exist.'.format(path))
        sys.exit()
    # read index
    try:
        read_index(f)
    except Exception:
        send_stdout('Error! Invalid index file format.')
        sys.exit()
    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    # query preprocessing
    p_query = preprocessing_query(args.query)
    # parse query
    lisp_bool_query = str(searchExpr.parseString(p_query)[0])
    send_stdout("Parsed Boolean Query: {}.".format(lisp_bool_query))
    # find the documents that satisfy the boolean query
    result = []
    for doc_id in documents:
        if query_valuation(lisp_bool_query, doc_id):
            result.append(doc_id)
    send_stdout("Documents: {}.".format(result))
    f.close()
def main():
    # read arguments
    args = parse_arguments()
    if args.score not in ['y', 'n']:
        send_stdout('Error! arg "scores" should be either y or n')
        sys.exit()
    # open index file
    try:
        path = join(args.path, INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Index file "{}" does not exist.'.format(path))
        sys.exit()
    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
        query = [st.process_token(t) for t in args.terms]
    else:
        query = [t.lower() for t in args.terms]
    # read index
    try:
        read_index(f)
    except Exception:
        send_stdout('Error! Invalid index file format.')
        sys.exit()
    # compute vector space scores and report the top k documents
    score = cosine_score(query)
    k_score = sorted(score.items(), key=lambda x: x[1], reverse=True)
    for i in range(min(args.k, len(k_score))):
        d, s = k_score[i]
        if args.score == 'y':
            send_stdout('{id} \t {score}'.format(id=d, score=s))
        else:
            send_stdout('{id}'.format(id=d))
    f.close()
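# A minimal sketch of a cosine scorer, only to illustrate the cosine_score()
# call above; the module's real implementation may weight terms differently
# (e.g. tf-idf). It assumes the global positional_index maps
# term -> {docID: [positions, ...]} as built by read_index(), and it skips the
# query-vector norm, which does not affect the ranking.
import math
from collections import defaultdict

def _cosine_score_sketch(query):
    # dot product of the raw-tf query vector with each raw-tf document vector
    scores = defaultdict(float)
    for term in set(query):
        q_tf = query.count(term)
        for doc_id, positions in positional_index.get(term, {}).items():
            scores[doc_id] += q_tf * len(positions)
    # document vector lengths over every indexed term
    doc_norm = defaultdict(float)
    for term, postings in positional_index.items():
        for doc_id, positions in postings.items():
            doc_norm[doc_id] += len(positions) ** 2
    return {d: s / math.sqrt(doc_norm[d]) for d, s in scores.items()}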
def main():
    global LM_LMS
    # read arguments
    args = parse_arguments()
    # open language models file
    try:
        path = join(args.LM_DIR, LM_NAME)
        f = open(path)
    except (FileNotFoundError, NotADirectoryError) as e:
        send_stdout('Error! Language models file "{}" not found.'.format(path))
        return
    # read language models file
    send_stdout('Reading language models file ...')
    try:
        LM_txt = f.readline()
        LM_LMS = ast.literal_eval(LM_txt)
    except Exception as e:
        send_stdout('Error! Invalid language models file format "{}".'.format(path))
        f.close()
        return
    # tokenize the query and run the stemmer / Lemmatizer
    query_terms = process_query(args.query)
    # estimate the query likelihood per document
    likelihood = estimate_query_lh(query_terms)
    # output the top K documents by likelihood
    sorted_docIDs = sorted(likelihood, key=likelihood.get, reverse=True)
    k = min(len(sorted_docIDs), args.k)
    for idx in range(k):
        docID = sorted_docIDs[idx]
        lh = likelihood[docID]
        # FORMAT: doc_id_1 \t query_likelihood \n
        send_stdout('{} \t {}'.format(docID, lh))
    f.close()
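# A minimal sketch of the query-likelihood step called above; the module's
# real estimate_query_lh() may apply smoothing. This version uses the plain
# unigram MLE P(t|d) = tf(t,d) / L(d) from the global LM_LMS structure
# (docID -> {term: tf, ..., L_TOKEN: doc_length}), so any unseen query term
# drives a document's likelihood to zero.
def _estimate_query_lh_sketch(query_terms):
    likelihood = {}
    for doc_id, doc_tf in LM_LMS.items():
        length = doc_tf[L_TOKEN]
        p = 1.0
        for term in query_terms:
            # multiply the per-term MLE into the document's query likelihood
            p *= doc_tf.get(term, 0) / length
        likelihood[doc_id] = p
    return likelihood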
def main():
    # read arguments
    # % ./print_lms [language_models_location]
    if len(sys.argv) != 2:
        send_stdout("Usage: python3 {} [language_models_location]".format(
            sys.argv[0]))
        return
    # open language models file
    try:
        path = join(sys.argv[1], LM_NAME)
        f = open(path)
    except (FileNotFoundError, NotADirectoryError) as e:
        send_stdout('Error! Language models file "{}" not found.'.format(path))
        return
    # read language models file and print the MLE per term & document
    send_stdout('Reading language models file ...')
    try:
        print_lms(f)
    except Exception as e:
        send_stdout('Error! Invalid language models file format "{}".'.format(path))
    f.close()
def main():
    # read arguments
    if len(sys.argv) != 2:
        send_stdout("format: python {} [directory]".format(sys.argv[0]))
        return
    # open index file
    try:
        path = join(sys.argv[1], INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Index file "{}" not found.'.format(path))
        return
    # read index file
    try:
        rw_index(f)
    except Exception:
        send_stdout('Error! Invalid index file format "{}".'.format(path))
    f.close()
def read_dir(doc_dir, doc_files):
    skipped_files = []
    f_num = len(doc_files)
    for i in range(f_num):
        fname = doc_files[i]
        finfo = fname.split(sep='_', maxsplit=2)
        # filename validation
        if finfo[0] != 'doc':
            skipped_files.append(fname)
            continue
        try:
            # read file, and create indexes
            read_doc(join(doc_dir, fname), int(finfo[1]), finfo[2])
        except Exception as e:
            print(e)
            skipped_files.append(fname)
            continue
        # update progress bar
        progress(i + 1, f_num)
    # show skipped invalid docs
    send_stdout()
    if len(skipped_files) != 0:
        send_stdout('Warning! Cannot index the following file(s):')
        send_stdout('{}, Skipped.'.format(skipped_files))
def main():
    global st, documents
    # read arguments
    args = parse_arguments()
    # query validation
    if not validate_query(args.q):
        send_stdout('Error! Invalid boolean query.')
        sys.exit()
    # open index file
    try:
        path = join(args.index_dir, ZONE_INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Zone index file "{}" does not exist.'.format(path))
        sys.exit()
    # read index
    send_stdout("Reading zone index ...")
    try:
        read_index(f)
    except Exception as e:
        print(e)
        send_stdout('Error! Invalid zone index file format.')
        sys.exit()
    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    # query preprocessing
    p_query = preprocessing_query(args.q)
    # parse query
    lisp_bool_query = str(searchExpr.parseString(p_query)[0])
    send_stdout("Parsed Boolean Query: {}.".format(lisp_bool_query))
    # find and score the documents that satisfy the boolean query
    send_stdout("Searching and scoring ...")
    result = {}
    for doc_id in documents:
        score = 0
        if query_valuation(lisp_bool_query, doc_id, TITLE):
            score += 1 * args.g
        if query_valuation(lisp_bool_query, doc_id, BODY):
            score += 1 * (1 - args.g)
        result[doc_id] = score
    k_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
    for i in range(min(args.k, len(k_result))):
        d, s = k_result[i]
        send_stdout('{id} \t {score}'.format(id=d, score=s))
    f.close()
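# Worked example of the weighted zone score computed above (the value of g is
# illustrative): with g = 0.7, a document whose title satisfies the boolean
# query but whose body does not scores 1 * 0.7 + 0 * 0.3 = 0.7; a document
# matching only in the body scores 0.3; a document matching both zones
# scores 1.0.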
def main():
    global st
    # read arguments
    if len(sys.argv) != 2:
        send_stdout("format: python {} [dir]".format(sys.argv[0]))
        return
    # get filenames from the [dir]
    try:
        path = sys.argv[1]
        files = [f for f in listdir(path) if isfile(join(path, f))]
    except FileNotFoundError as e:
        send_stdout('Error! No such file or directory "{}".'.format(path))
        return
    # check whether the index file already exists
    if isfile(INDEX_FILE):
        send_stdout('Error! Index file "{}" already exists.'.format(INDEX_FILE))
        return
    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    skipped_files = []
    f_num = len(files)
    for i in range(f_num):
        fname = files[i]
        finfo = fname.split(sep='_', maxsplit=2)
        # filename validation
        if finfo[0] != 'doc':
            skipped_files.append(fname)
            continue
        try:
            # read file, and create indexes
            read_file(join(path, fname), int(finfo[1]))
        except Exception as e:
            skipped_files.append(fname)
            continue
        # update progress bar
        progress(i + 1, f_num)
    send_stdout()
    if len(skipped_files) != 0:
        send_stdout('Warning! Cannot index the following file(s):')
        send_stdout('{}, Skipped.'.format(skipped_files))
    # write index to file
    f_out = open(INDEX_FILE, 'w')
    for term in sorted(positional_index.keys()):
        f_out.write('{term} {index}\n'.format(term=term,
                                              index=positional_index[term]))
    f_out.close()
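# A minimal sketch of the per-file indexing step driven by the loop above; the
# module's real read_file() handles its own tokenization. It assumes
# whitespace-separated tokens and that the global positional_index maps
# term -> {docID: [positions, ...]}, matching the format written out below and
# read back by read_index().
def _index_doc_sketch(path, doc_id):
    with open(path) as doc:
        tokens = doc.read().split()
    for pos, token in enumerate(tokens):
        # normalize the token the same way the query side does
        term = st.process_token(token) if STEMMER else token.lower()
        positional_index.setdefault(term, {}).setdefault(doc_id, []).append(pos)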
def main():
    global st
    # read arguments: "% ./create_lms [document_dir] [output_dir]"
    if len(sys.argv) != 3:
        send_stdout("Usage: python3 {} [document_dir] [output_dir]".format(
            sys.argv[0]))
        return
    # get filenames from the [document dir]
    try:
        DOC_DIR = sys.argv[1]
        docs = [f for f in listdir(DOC_DIR) if isfile(join(DOC_DIR, f))]
    except FileNotFoundError as e:
        send_stdout('Error! No such file or directory "{}".'.format(DOC_DIR))
        return
    # check whether the LM file already exists in the [output_dir]
    LM_FILE = join(sys.argv[2], LM_NAME)
    if isfile(LM_FILE):
        send_stdout('Error! LM file "{}" already exists.'.format(LM_FILE))
        return
    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
    skipped_docs = []
    invalid_filename_docs = []
    f_num = len(docs)
    for i in range(f_num):
        fname = docs[i]
        success, docID = filename_validation(fname)
        if not success:
            invalid_filename_docs.append(fname)
            continue
        try:
            # read file, and create language models (calculate MLE)
            read_file(join(DOC_DIR, fname), docID)
        except Exception as e:
            skipped_docs.append(fname)
            continue
        # update progress bar
        progress(i + 1, f_num)
    send_stdout()
    # show invalid document names/formats on stdout
    if len(invalid_filename_docs) != 0:
        send_stdout('Warning! Invalid document name format:')
        send_stdout('{}, Skipped.'.format(invalid_filename_docs))
    if len(skipped_docs) != 0:
        send_stdout('Warning! Cannot process the following doc(s):')
        send_stdout('{}, Skipped.'.format(skipped_docs))
    # write the language models to file
    f_out = open(LM_FILE, 'w')
    f_out.write(str(LM_LMS))
    f_out.close()
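# A minimal sketch of the per-document step driven by the loop above; the
# module's real read_file() handles its own tokenization. It assumes
# whitespace-separated tokens and that the global LM_LMS maps
# docID -> {term: tf, ..., L_TOKEN: doc_length}, matching what print_lms()
# and the query-likelihood script read back.
def _build_lm_sketch(path, doc_id):
    with open(path) as doc:
        tokens = doc.read().split()
    if STEMMER:
        tokens = [st.process_token(t) for t in tokens]
    else:
        tokens = [t.lower() for t in tokens]
    # store the document length under the reserved key, then the raw tf counts
    doc_tf = {L_TOKEN: len(tokens)}
    for t in tokens:
        doc_tf[t] = doc_tf.get(t, 0) + 1
    LM_LMS[doc_id] = doc_tf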