def _search_pharse_func_tester(pharse, doc_id):
    terms = []
    t_st = Token_Preprocessing_Engine()
    for token in pharse.split():
        terms.append(t_st.process_token(token))
    result = search_pharse(terms, doc_id)
    send_stdout(result)
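# Hedged usage sketch of the tester above (assumes Token_Preprocessing_Engine
# and search_pharse are defined elsewhere in this module; the phrase and doc_id
# values are illustrative only):
#
#   _search_pharse_func_tester('information retrieval', 3)
#
# Each whitespace-separated token is normalized before the phrase lookup, and
# the posting returned by search_pharse is echoed via send_stdout.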
def main():
    global st
    # read arguments
    args = parse_arguments()

    # get filenames from the [document dir]
    try:
        doc_files = [
            f for f in listdir(args.doc_dir) if isfile(join(args.doc_dir, f))
        ]
    except FileNotFoundError as e:
        send_stdout('Error! No such file or directory "{}".'.format(
            args.doc_dir))
        return
    # check whether the index file for zone scoring already exists
    if isfile(join(args.index_dir, ZONE_INDEX_FILE)):
        send_stdout('Error! Index file "{}" already exists.'.format(
            join(args.index_dir, ZONE_INDEX_FILE)))
        return

    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    # read directory -> read doc -> create zone indexes
    read_dir(args.doc_dir, doc_files)

    # write index to file
    with open(join(args.index_dir, ZONE_INDEX_FILE), 'w') as f_out:
        for term in sorted(zone_index.keys()):
            f_out.write('{term} {posting}\n'.format(term=term,
                                                    posting=zone_index[term]))
def read_index(f):
    global document_ids
    send_stdout('reading index ...')
    entries = f.readlines()
    for entry in entries:
        term, index = entry.split(maxsplit=1)
        index = ast.literal_eval(index)
        positional_index[term] = index
        document_ids = document_ids | set(index.keys())
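# Illustrative line of the on-disk positional index consumed by read_index
# above (format inferred from the writer that emits '{term} {index}'):
#
#   retrieval {1: [4, 17], 3: [2]}
#
# ast.literal_eval turns the second field back into a dict mapping each docID
# to the list of term positions, and the docIDs are folded into document_ids.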
def rw_index(f):
    entries = f.readlines()
    for entry in entries:
        term, index = entry.split(maxsplit=1)
        index_out = []
        index = ast.literal_eval(index)
        for docID in sorted(index.keys()):
            pos = [str(p) for p in index[docID]]
            index_out.append('{0}:{1}'.format(docID, ','.join(pos)))
        send_stdout('{0} \t {1} '.format(term, ';'.join(index_out)))
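# For an index line like 'retrieval {1: [4, 17], 3: [2]}', rw_index prints the
# postings back out in a compact, tab-separated form:
#
#   retrieval \t 1:4,17;3:2
#
# i.e. 'docID:pos,pos,...' entries joined by ';' in ascending docID order.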
def print_lms(f):
    LM_txt = f.readline()
    LM_LMS = ast.literal_eval(LM_txt)
    for docID in sorted(LM_LMS.keys()):
        doc_TF = LM_LMS[docID]
        L = doc_TF[L_TOKEN]
        output = '{} \t '.format(docID)
        for term in doc_TF:
            # MLE(t|d) = tf(t,d) / L(d)
            output = output + term + ':' + str(doc_TF[term] / L) + ', '
        if output[-2:] == ', ':
            output = output[:-2]
        # FORMAT: 'doc_id \t term_1:term_1_MLE, term_2:term_2_MLE, ...'
        send_stdout(output)
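# Worked example of the MLE printed above: if document 7 has length L(d) = 10
# tokens and tf('cat', d) = 2, then MLE('cat'|d) = 2 / 10 = 0.2 and the output
# line for doc 7 contains 'cat:0.2' among its term:MLE pairs.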
def main():
    global st, documents

    # read arguments
    args = parse_arguments()

    # query validation
    if not validate_query(args.query):
        send_stdout('Error! Invalid boolean query.')
        sys.exit()

    # open index file
    try:
        path = join(args.path, INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Index file "{}" does not exist.'.format(path))
        sys.exit()

    # read index
    try:
        read_index(f)
    except Exception:
        send_stdout('Error! Invalid index file format.')
        sys.exit()

    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    # query preprocessing
    p_query = preprocessing_query(args.query)
    # parse query
    lisp_bool_query = str(searchExpr.parseString(p_query)[0])
    send_stdout("Pharsed Boolean Query: {}.".format(lisp_bool_query))

    # find documents that satisfy the boolean query
    result = []
    for doc_id in documents:
        if query_valuation(lisp_bool_query, doc_id):
            result.append(doc_id)
    send_stdout("Documents: {}.".format(result))

    f.close()
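# Hedged illustration of the parse step above. The exact prefix form depends on
# the searchExpr grammar (defined elsewhere), so this mapping is an assumed
# example rather than guaranteed output:
#
#   'cat AND (dog OR NOT fish)'  ->  '(and cat (or dog (not fish)))'
#
# query_valuation then evaluates the parsed expression against each docID, and
# the IDs for which it holds are collected into result.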
def main():
    # read arguments
    args = parse_arguments()
    if args.score not in ['y', 'n']:
        send_stdout('Error! Argument "score" should be either "y" or "n".')
        sys.exit()

    # open index file
    try:
        path = join(args.path, INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Index file "{}" does not exist.'.format(path))
        sys.exit()

    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()
        query = [st.process_token(t) for t in args.terms]
    else:
        query = [t.lower() for t in args.terms]

    # read index
    try:
        read_index(f)
    except Exception:
        send_stdout('Error! Invalid index file format.')
        sys.exit()

    # compute vector space scores
    score = cosine_score(query)
    k_score = sorted(score.items(), key=lambda x: x[1], reverse=True)
    for i in range(min(args.k, len(k_score))):
        d, s = k_score[i]
        if args.score == 'y':
            send_stdout('{id} \t {score}'.format(id=d, score=s))
        else:
            send_stdout('{id}'.format(id=d))

    f.close()
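# Hedged sketch of a cosine scorer, shown only to illustrate the scoring step
# used above. It is NOT the module's actual cosine_score (defined elsewhere);
# it is a minimal tf-idf variant over a positional index shaped like
# {term: {docID: [positions, ...]}}. The name cosine_score_sketch and its
# arguments are hypothetical.
import math
from collections import defaultdict


def cosine_score_sketch(query, index):
    docs = {d for posting in index.values() for d in posting}
    scores = defaultdict(float)
    norms = defaultdict(float)
    for term, posting in index.items():
        # idf = log10(N / df); terms with an empty posting contribute nothing
        idf = math.log10(len(docs) / len(posting)) if posting else 0.0
        for doc_id, positions in posting.items():
            w = (1 + math.log10(len(positions))) * idf  # log-tf * idf weight
            norms[doc_id] += w * w
            if term in query:
                scores[doc_id] += w * idf  # query-side weight approximated by idf
    # cosine normalization by the document vector length
    return {d: s / math.sqrt(norms[d]) for d, s in scores.items() if norms[d] > 0}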
def main():
    global LM_LMS
    # read arguments
    args = parse_arguments()

    # open language models file
    try:
        path = join(args.LM_DIR, LM_NAME)
        f = open(path)
    except (FileNotFoundError, NotADirectoryError) as e:
        send_stdout('Error! Language models file "{}" not found.'.format(path))
        return

    # read language models file
    send_stdout('Reading language models file ...')
    try:
        LM_txt = f.readline()
        LM_LMS = ast.literal_eval(LM_txt)
    except Exception as e:
        send_stdout('Error! Invalid language models file format "{}".'.format(path))
        f.close()
        return

    # Tokenize query and run stemmer / Lemmatizer
    query_terms = process_query(args.query)

    # Estimate query likelihood per document
    likelihood = estimate_query_lh(query_terms)

    # Output the top K documents by likelihood
    sorted_docIDs = sorted(likelihood, key=likelihood.get, reverse=True)
    k = min(len(sorted_docIDs), args.k)
    for idx in range(k):
        docID = sorted_docIDs[idx]
        lh = likelihood[docID]
        # FORMAT: doc_id_1 \t query_likelihood \n
        send_stdout('{} \t {}'.format(docID, lh))

    f.close()
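# Worked example of the likelihood ranking above, assuming estimate_query_lh
# scores each document with a unigram model, i.e. P(q|d) is the product of the
# per-term MLEs (any smoothing lives inside estimate_query_lh): for the query
# ['cat', 'dog'] and a document with MLE('cat'|d) = 0.2 and MLE('dog'|d) = 0.1,
# the query likelihood is 0.2 * 0.1 = 0.02. Documents are printed from highest
# to lowest likelihood, truncated to the top k.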
def main():
    # read arguments
    # % ./print_lms [language_models_location]
    if len(sys.argv) != 2:
        send_stdout("Usage: python3 {} [language_models_location]".format(
            sys.argv[0]))
        return
    # open language models file
    try:
        path = join(sys.argv[1], LM_NAME)
        f = open(path)
    except (FileNotFoundError, NotADirectoryError) as e:
        send_stdout(
            'Error! Language models file "{}" not found.'.format(path))
        return

    # read language models file and print the MLE per term & document
    send_stdout('Reading language models file ...')
    try:
        print_lms(f)
    except Exception as e:
        send_stdout('Error! Invalid language models file format "{}".'.format(path))

    f.close()
def main():
    # read arguments
    if len(sys.argv) != 2:
        send_stdout("format: python {} [directory]".format(sys.argv[0]))
        return
    # open index file
    try:
        path = join(sys.argv[1], INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Index file "{}" not found.'.format(path))
        return

    # read index file
    try:
        rw_index(f)
    except Exception:
        send_stdout('Error! Invalid index file format "{}".'.format(path))

    f.close()
def read_dir(doc_dir, doc_files):
    skipped_files = []
    f_num = len(doc_files)
    for i in range(f_num):
        fname = doc_files[i]
        finfo = fname.split(sep='_', maxsplit=2)
        # filename validation
        if finfo[0] != 'doc':
            skipped_files.append(fname)
            continue
        try:
            # read file, and create indexes
            read_doc(join(doc_dir, fname), int(finfo[1]), finfo[2])
        except Exception as e:
            print(e)
            skipped_files.append(fname)
            continue
        # update progress bar
        progress(i + 1, f_num)
    # show skipped invalid docs
    send_stdout()
    if len(skipped_files) != 0:
        send_stdout('Warning! Cannot index the following file(s):')
        send_stdout('{}, Skipped.'.format(skipped_files))
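# Document filename layout inferred from the split above (maxsplit=2):
#
#   doc_<docID>_<title>     e.g.  doc_12_Information_Retrieval.txt
#
# The first field must be the literal 'doc', the second an integer docID, and
# everything after the second underscore is passed along as the document title.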
def main():
    global st, documents

    # read arguments
    args = parse_arguments()

    # query validation
    if not validate_query(args.q):
        send_stdout('Error! Invalid boolean query.')
        sys.exit()

    # open index file
    try:
        path = join(args.index_dir, ZONE_INDEX_FILE)
        f = open(path)
    except FileNotFoundError as e:
        send_stdout('Error! Zone index file "{}" does not exist.'.format(path))
        sys.exit()

    # read index
    send_stdout("Reading zone index ...")
    try:
        read_index(f)
    except Exception as e:
        print(e)
        send_stdout('Error! Invalid zone index file format.')
        sys.exit()

    # initialize query stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    # query preprocessing
    p_query = preprocessing_query(args.q)
    # parse query
    lisp_bool_query = str(searchExpr.parseString(p_query)[0])
    send_stdout("Pharsed Boolean Query: {}.".format(lisp_bool_query))

    # find documents that satisfy the boolean query, scoring by zone
    send_stdout("Searching and scoring ...")
    result = {}
    for doc_id in documents:
        score = 0
        if query_valuation(lisp_bool_query, doc_id, TITLE):
            score += 1 * args.g
        if query_valuation(lisp_bool_query, doc_id, BODY):
            score += 1 * (1 - args.g)
        result[doc_id] = score
    k_result = sorted(result.items(), key=lambda x: x[1], reverse=True)
    for i in range(min(args.k, len(k_result))):
        d, s = k_result[i]
        send_stdout('{id} \t {score}'.format(id=d, score=s))

    f.close()
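# Worked example of the weighted zone scoring above with g = 0.7: a document
# whose title zone satisfies the query but whose body does not scores 0.7, a
# body-only match scores 0.3, a match in both zones scores 1.0, and no match
# scores 0. The top-k documents are then printed with their scores, highest
# first.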
def main():
    global st
    # read arguments
    if len(sys.argv) != 2:
        send_stdout("format: python {} [dir]".format(sys.argv[0]))
        return
    # get filenames from the [dir]
    try:
        path = sys.argv[1]
        files = [f for f in listdir(path) if isfile(join(path, f))]
    except FileNotFoundError as e:
        send_stdout('Error! No such file or directory "{}".'.format(path))
        return
    # check whether the index file already exists
    if isfile(INDEX_FILE):
        send_stdout('Error! Index file "{}" already exists.'.format(INDEX_FILE))
        return

    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    skipped_files = []
    f_num = len(files)
    for i in range(f_num):
        fname = files[i]
        finfo = fname.split(sep='_', maxsplit=2)
        # filename validation
        if finfo[0] != 'doc':
            skipped_files.append(fname)
            continue
        try:
            # read file, and create indexes
            read_file(join(path, fname), int(finfo[1]))
        except Exception as e:
            skipped_files.append(fname)
            continue
        # update progress bar
        progress(i + 1, f_num)

    send_stdout()
    if len(skipped_files) != 0:
        send_stdout('Warning! Cannot index the following file(s):')
        send_stdout('{}, Skipped.'.format(skipped_files))

    # write index to file
    with open(INDEX_FILE, 'w') as f_out:
        for term in sorted(positional_index.keys()):
            f_out.write('{term} {index}\n'.format(term=term,
                                                  index=positional_index[term]))
def main():
    global st

    # read arguments "% ./create_lms [document dir] [output_dir]"
    if len(sys.argv) != 3:
        send_stdout("Usage: python3 {} [document_dir] [output_dir]".format(
            sys.argv[0]))
        return
    # get filenames from the [document dir]
    try:
        DOC_DIR = sys.argv[1]
        docs = [f for f in listdir(DOC_DIR) if isfile(join(DOC_DIR, f))]
    except FileNotFoundError as e:
        send_stdout('Error! No such file or directory "{}".'.format(DOC_DIR))
        return
    # check whether the language models file already exists in the [output_dir]
    LM_FILE = join(sys.argv[2], LM_NAME)
    if isfile(LM_FILE):
        send_stdout('Error! LM file "{}" already exists.'.format(LM_FILE))
        return

    # initialize stemmer (Lemmatizer)
    if STEMMER:
        st = Token_Preprocessing_Engine()

    skipped_docs = []
    invalid_filename_docs = []
    f_num = len(docs)
    for i in range(f_num):
        fname = docs[i]
        success, docID = filename_validation(fname)
        if not success:
            invalid_filename_docs.append(fname)
            continue
        try:
            # read file, and create language models (calculate MLE)
            read_file(join(DOC_DIR, fname), docID)
        except Exception as e:
            skipped_docs.append(fname)
            continue
        # update progress bar
        progress(i + 1, f_num)

    send_stdout()
    # show invalid document name/format to stdout
    if len(invalid_filename_docs) != 0:
        send_stdout('Warning! Invalid document name format:')
        send_stdout('{}, Skipped.'.format(invalid_filename_docs))
    if len(skipped_docs) != 0:
        send_stdout('Warning! Cannot process the following doc(s):')
        send_stdout('{}, Skipped.'.format(skipped_docs))

    # write language models to file
    with open(LM_FILE, 'w') as f_out:
        f_out.write(str(LM_LMS))
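# Illustrative (inferred) shape of the LM_LMS structure written above: a dict
# keyed by docID whose values map each term to its raw count in that document,
# plus the document length stored under the special L_TOKEN key, e.g.
#
#   {12: {'cat': 2, 'dog': 1, <L_TOKEN>: 10}}
#
# print_lms later divides each count by the L_TOKEN entry to report per-term
# MLEs.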