Example no. 1
def load_index(self):
    # Prefer the preprocessed text index; fall back to the raw serialized one.
    # The load_index(...) calls below go to the module-level helper, not to this method.
    if os.path.exists(os.path.join(self.data_directory, 'ent2id.txt')):
        self.entity_dict = load_index(
            os.path.join(self.data_directory, 'ent2id.txt'))
        print("Load preprocessed entity index")
    elif os.path.exists(os.path.join(self.data_directory, 'ent2ids')):
        self.entity_dict = unserialize(
            os.path.join(self.data_directory, 'ent2ids'), form='json')
        print("Load raw entity index")
    else:
        print("Entity index does not exist")
        self.entity_dict = {}
    if os.path.exists(os.path.join(self.data_directory, 'relation2id.txt')):
        self.relation_dict = load_index(
            os.path.join(self.data_directory, 'relation2id.txt'))
        print("Load preprocessed relation index")
    elif os.path.exists(os.path.join(self.data_directory, 'relation2ids')):
        self.relation_dict = unserialize(
            os.path.join(self.data_directory, 'relation2ids'), form='json')
        print("Load raw relation index")
    else:
        print("Relation index does not exist")
        self.relation_dict = {}
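
The method above (and Example no. 2 below, which appears to come from the same project) delegates to a module-level `load_index` helper and an `unserialize` function, neither of which is reproduced on this page. A minimal sketch of what they plausibly do, assuming `ent2id.txt` stores one tab-separated `name<TAB>id` pair per line and that the raw `ent2ids` file is JSON:

import json


def load_index(path):
    # Hypothetical helper: read a "name<TAB>id" file into a dict mapping name -> int id.
    index = {}
    with open(path) as f:
        for line in f:
            name, idx = line.rstrip('\n').split('\t')
            index[name] = int(idx)
    return index


def unserialize(path, form='json'):
    # Hypothetical helper: load a serialized object; only the JSON case is sketched here.
    if form == 'json':
        with open(path) as f:
            return json.load(f)
    raise ValueError('unsupported format: {}'.format(form))
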
Example no. 2
def load_data(self):
    self.data_directory = os.path.join(self.root_directory, "data")
    self.entity_dict = load_index(
        os.path.join(self.data_directory, "ent2id.txt"))
    self.relation_dict = load_index(
        os.path.join(self.data_directory, "relation2id.txt"))
    self.facts_data = translate_facts(
        load_facts(os.path.join(self.data_directory, "train.txt")),
        self.entity_dict, self.relation_dict)
    self.test_support = translate_facts(
        load_facts(os.path.join(self.data_directory, "test_support.txt")),
        self.entity_dict, self.relation_dict)
    self.valid_support = translate_facts(
        load_facts(os.path.join(self.data_directory, "valid_support.txt")),
        self.entity_dict, self.relation_dict)
    self.test_eval = translate_facts(
        load_facts(os.path.join(self.data_directory, "test_eval.txt")),
        self.entity_dict, self.relation_dict)
    self.valid_eval = translate_facts(
        load_facts(os.path.join(self.data_directory, "valid_eval.txt")),
        self.entity_dict, self.relation_dict)
    # augment
    with open(os.path.join(self.data_directory, 'pagerank.txt')) as file:
        self.pagerank = [float(line.strip()) for line in file]
    if os.path.exists(os.path.join(self.data_directory, "fact_dist")):
        self.fact_dist = unserialize(
            os.path.join(self.data_directory, "fact_dist"))
    else:
        self.fact_dist = None
    if os.path.exists(os.path.join(self.data_directory, "train_graphs")):
        self.train_graphs = unserialize(
            os.path.join(self.data_directory, "train_graphs"))
    else:
        self.train_graphs = None
    # evaluate_graphs is mandatory: the assert fails if it is missing,
    # so no fallback branch is needed here.
    assert os.path.exists(
        os.path.join(self.data_directory, "evaluate_graphs"))
    print("Use evaluate graphs")
    self.evaluate_graphs = unserialize(
        os.path.join(self.data_directory, "evaluate_graphs"))
    if os.path.exists(os.path.join(self.data_directory, "rel2candidates")):
        self.rel2candidate = unserialize(
            os.path.join(self.data_directory, "rel2candidates"))
    else:
        self.rel2candidate = {}
    # self.rel2candidate = {self.relation_dict[key]: value for key, value in self.rel2candidate.items() if
    #                       key in self.relation_dict}
    self.id2entity = sorted(self.entity_dict.keys(), key=self.entity_dict.get)
    self.id2relation = sorted(self.relation_dict.keys(),
                              key=self.relation_dict.get)
    self.data_loaded = True
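
`load_facts` and `translate_facts` are likewise defined elsewhere in that project. A minimal sketch, assuming each line of `train.txt` holds a tab-separated (head, relation, tail) triple and that translation simply maps names to the integer ids built above:

def load_facts(path):
    # Hypothetical helper: one "head<TAB>relation<TAB>tail" triple per line.
    with open(path) as f:
        return [tuple(line.rstrip('\n').split('\t')) for line in f if line.strip()]


def translate_facts(facts, entity_dict, relation_dict):
    # Hypothetical helper: replace string names with the integer ids used elsewhere.
    return [(entity_dict[h], relation_dict[r], entity_dict[t])
            for h, r, t in facts]
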
Example no. 3
def main():
    # Get index file
    idx_desired = input('Which index file do you wish to query (press enter for default inverted_index.txt): ')

    if idx_desired == '':
        idx_desired = os.path.join(CACHE_DIR, INDEX_FILE)
    else:
        idx_desired = os.path.join(CACHE_DIR, idx_desired)

    index_filename = idx_desired.lower()

    # determine if query should be stemmed
    while True:
        should_stem = input('Is this a stemmed index? [Y]es/[N]o: ').lower()
        possible_values = ['y', 'yes', 'n', 'no']

        if should_stem in possible_values:
            break
        else:
            print('Please enter a correct response.')
    should_stem = should_stem in ('y', 'yes')  # convert the answer to a real boolean

    query_prompt = "Specify which type of query you are making [1] TF-IDF [2] BM25: "
    # for i, qtype in enumerate(QUERY_TYPES):
    #     query_prompt += f'[{i+1}] {qtype}\n'

    # determine how the documents should be ranked
    while True:
        try:
            desired_query_type = int(input(query_prompt)) - 1
            if 0 <= desired_query_type < len(QUERY_TYPES):
                break
        except ValueError as e:
            print(e)
        print(f'Please enter an integer between 1-{len(QUERY_TYPES)}.')

    # format the query appropriately
    original_query = input('Please enter your query (keep in mind that casing is irrelevant): ')
    tokenizer = nltk.RegexpTokenizer(indexer.TOKENIZING_REGEX)
    query_tokens = indexer.custom_tokenizer(original_query, tokenizer, stem_doc=should_stem)
    index = utils.load_index(input_file=index_filename)
    print(f'\nYour formatted query is: {" ".join(query_tokens)}')

    # rank documents
    ranked_docs: dict = rank_documents(query_tokens, index, QUERY_TYPES[desired_query_type])

    top15 = sorted(ranked_docs.items(), reverse=True, key=lambda x: x[1])[:15]
    i = 1
    URLs = []
    print()
    for docID, rank in top15:
        print(f'{i}.\tDoc {docID}\twith rank {round(rank,3)}')
        with open(os.path.join(CACHE_DIR, 'documents', str(docID), 'url'), mode='r') as f:
            URLs.append(f.read())
        i += 1
    print('\nView those documents in the browser by clicking on their corresponding URLs:\n')
    for i, url in enumerate(URLs):
        print(f'{i+1}.\t{url}')
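
`rank_documents`, `QUERY_TYPES`, `CACHE_DIR` and `INDEX_FILE` come from modules that are not shown here. Purely as an illustration, a minimal TF-IDF scorer along the lines of what `rank_documents` might do, assuming (this is an assumption; the real layout may differ) that the index maps each token to `(document_frequency, {docID: term_frequency})`:

import math


def rank_documents_tfidf(query_tokens, index, n_docs):
    # Hypothetical TF-IDF scorer: score(d) = sum over query terms of tf(t, d) * log(n_docs / df(t)).
    scores = {}
    for token in query_tokens:
        if token not in index:
            continue
        doc_frequency, postings = index[token]
        idf = math.log(n_docs / doc_frequency)
        for doc_id, term_frequency in postings.items():
            scores[doc_id] = scores.get(doc_id, 0.0) + term_frequency * idf
    return scores
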
Example no. 4
def run():
    args = init_params()

    print(
        f'\nCompression performed on {args.input_file} and stored in {args.output_file}'
    )

    unfiltered: dict = utils.load_index(args.input_file)

    table = {
        'unfiltered': {
            'tokens': {
                'number': len(unfiltered),
                'delta %': round(0.0, 2),
                'total %': round(0.0, 2)
            },
            'non-positional postings': {
                'number': sum([unfiltered[token][0] for token in unfiltered]),
                'delta %': round(0.0, 2),
                'total %': round(0.0, 2)
            }
        }
    }

    no_numbers: dict = remove_numbers(unfiltered.copy())
    table = update_table(table, 'unfiltered', 'no numbers', no_numbers)

    case_folding: dict = case_fold(no_numbers)
    table = update_table(table, 'no numbers', 'case folding', case_folding)

    remove30 = remove_stop_words(case_folding, stop_words[:30])
    table = update_table(table, 'case folding', '30 stop words', remove30)

    remove150 = remove_stop_words(case_folding, stop_words)
    table = update_table(table, '30 stop words', '150 stop words', remove150)

    final = remove150

    utils.save_index_to_disk(final, args.output_file)

    print('\nCompression Table:')
    print(display_table(table))
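
The helpers `remove_numbers`, `case_fold`, `remove_stop_words`, `update_table`, `display_table` and the `stop_words` list all live elsewhere in this project. A sketch of what `update_table` plausibly computes, given the table layout built above (the delta/total percentage semantics are an assumption):

def update_table(table, previous_step, step_name, index):
    # Hypothetical version of update_table: record counts for this compression step
    # plus the percentage change vs. the previous step (delta %) and vs. the
    # unfiltered index (total %).
    table[step_name] = {}
    for row, number in (('tokens', len(index)),
                        ('non-positional postings',
                         sum(entry[0] for entry in index.values()))):
        previous = table[previous_step][row]['number']
        original = table['unfiltered'][row]['number']
        table[step_name][row] = {
            'number': number,
            'delta %': round((number - previous) / previous * 100, 2),
            'total %': round((number - original) / original * 100, 2),
        }
    return table
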
Example no. 5
def merge_blocks_into_one_index(
        dir=BLOCK_DIR) -> Dict[str, Tuple[int, List[int]]]:
    '''
    Merge all block indices in BLOCK_DIR into a single inverted index.
    '''

    inverted_index = {}
    print('Merging all blocks')
    for block_file in tqdm(os.listdir(dir)):
        block = utils.load_index(os.path.join(dir, block_file))
        for token in block:
            postings: set = inverted_index.get(token, set())
            _, block_postings = block[token]
            postings.update(block_postings)
            inverted_index[token] = postings

    # Convert each token's postings set into a sorted list and store
    # (document_frequency, postings) pairs.
    for token in inverted_index:
        postings = sorted(inverted_index[token])
        inverted_index[token] = len(postings), postings

    return inverted_index
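
A short usage sketch for the function above, assuming the `utils.save_index_to_disk` helper seen in Example no. 4 and an output file name matching the default mentioned in Example no. 3 (both assumptions):

if __name__ == '__main__':
    merged = merge_blocks_into_one_index()
    # Persist the merged index; the file name is illustrative.
    utils.save_index_to_disk(merged, 'inverted_index.txt')
    print('Merged {} tokens into a single index'.format(len(merged)))
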
Example no. 6
                dt = val["dt"].strftime("%Y/%m/%d")
                tm = val["dt"].strftime("%H:%M:%S")
                dist = val["dist"]
                values.append((key, 1, dt, tm, dist))

        rows = db.addAttendanceMulti(values)
        print(f"{rows} inserted....")


u.file_check(DISTANCE_FILE, "recognize_multi.py", "No User exists. Add a user...")
u.file_check(LABELS_FILE, "recognize_multi.py", "User names not found...")
u.file_check(CAM_FILE, "recognize_multi.py", "Cam file not found...")
u.file_check(CONFIG_FILE, "recognize_multi.py", "Config file not found...")


annoy_object = u.load_index(DISTANCE_FILE)
print("[INFO] [recognize_multi.py] Distance file loaded...")

labels = u.read_data(LABELS_FILE)["labels"]
print("[INFO] [recognize_multi.py] Labels file loaded...")

cam_links = u.read_txtfile(CAM_FILE)
print("[INFO] [recognize_multi.py] cam file loaded...")

configs = eval(u.read_txtfile(CONFIG_FILE)[0])  # config stored as a Python literal; ast.literal_eval would be safer
TIMESTAMP = configs["time_stamp"]
dbConfig = configs["db"]
host, user, passwd, dbname = dbConfig["host"], dbConfig["user"], dbConfig["passwd"], dbConfig["db"]
print("[INFO] [recognize_multi.py] Config file loaded...")

detector = FaceDetectionSSD()
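
In this example `u.load_index` returns an approximate-nearest-neighbour index over face embeddings (the variable is even called `annoy_object`). A minimal sketch of such a loader, assuming the Annoy library and a 128-dimensional embedding size (both are assumptions):

from annoy import AnnoyIndex

EMBEDDING_DIM = 128  # assumed embedding size


def load_index(path, dim=EMBEDDING_DIM, metric='euclidean'):
    # Create an empty Annoy index with the expected dimensionality,
    # then memory-map the saved index from disk.
    index = AnnoyIndex(dim, metric)
    index.load(path)
    return index
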
Example no. 7
            print('\rindexing {}/{}'.format(i + 1, n_images), end='')
            sys.stdout.flush()
        print('')

        save_index(index, index_file)
        print('{} saved'.format(index_file))

    # ---------
    # RETRIEVAL
    # ---------

    vocabulary = load_data(vocabulary_file)

    print('loading index ...', end=' ')
    sys.stdout.flush()
    index = load_index(index_file)
    print('OK')

    idf = np.log(index['n'] / (index['df'] + 2**-23))
    idf2 = idf**2.0

    n_short_list = 100

    score = []

    query_list = [image_list[i] for i in range(0, 4 * N_QUERY, 4)]

    for fname in query_list:
        imfile = join(base_path, fname)

        # compute low-level features

Example no. 8

def ensure_query_file(qfile):
    if os.path.isfile(qfile):
        return

    with open(qfile, mode='w') as f:
        f.write("{}")  # because we want utils.load_json_from_disk to load an empty dict instead of raising an exception


if __name__ == '__main__':

    args = init_params()
    utils.ensure_dir_exists('output')
    ensure_query_file(args.output_file)

    if args.query_string is None or not isinstance(args.query_string, str):
        print("Please provide a query str with flag -q")
        exit()

    inv_index: Dict[str, Tuple[int, List[int]]] = utils.load_index(args.input_file)

    result: dict = exec_query(args.query_string, inv_index)
    x = result[args.query_string]
    print(f'<{args.query_string}> query was {x["message"]}: {x["frequency"]} hits found')

    queries: dict = utils.load_json_from_disk(args.output_file)

    del x['message']

    queries.update(result)

    utils.write_json_obj_2_disk(queries, args.output_file, indentation=4)
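
`exec_query` is defined elsewhere; from the way its result is consumed above, it returns a dict keyed by the query string with at least 'message' and 'frequency' entries. A single-term sketch under that assumption (the field values are illustrative):

def exec_query(query_string, inv_index):
    # Hypothetical single-term lookup against a (frequency, postings) inverted index.
    token = query_string.lower()
    if token in inv_index:
        frequency, postings = inv_index[token]
        return {query_string: {'message': 'successful',
                               'frequency': frequency,
                               'postings': postings}}
    return {query_string: {'message': 'unsuccessful',
                           'frequency': 0,
                           'postings': []}}
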
Example no. 9
import time
import os
import utils  # provides get_vocabulary, get_citations and load_index
from config import *
from tqdm import tqdm

if __name__ == "__main__":
    vocab = utils.get_vocabulary()  # loads vocabulary present in system
    citations = utils.get_citations()  # loads citation counts for doc_ids
    while True:
        print("Enter a word to search the index for:")
        x = input()

        if x in vocab:
            start_time = time.time()
            if os.path.exists("indexes/inverted_index_" + x + ".pbz2"):
                index = utils.load_index("indexes/inverted_index_" + x)
                loaded = x
            else:
                index = utils.load_index(
                    filename="indexes/inverted_index_" + x[0])
                loaded = x[0]
            end_time = time.time()
            print("Took {} seconds to load index {}".format(
                end_time - start_time, loaded))
            print(index[x]["doc_frequency"])  # print number of docs term is in
            for k in list(index[x]["doc_ids"].keys())[:10]:
                print(
                    k, index[x]["doc_ids"][k], citations[k]
                )  # print top 10 docs for term, how many times term in doc, and citations of doc
        else:
            print("Sorry, this word is not in the index. Try another.")

Example no. 10

def main():

    inverted_index = utils.load_index(indexer.INVERTED_INDEX_FILE)

    query_type = {
        'a': 'AND',
        'and': 'AND',
        'o': 'OR',
        'or': 'OR',
        'r': 'RANKED',
        'ranked': 'RANKED',
    }
    requested = ''
    while requested not in query_type:
        if requested != '':
            print(f'\n"{requested}" is not a valid type of search.')
        requested = input("Enter the type of search you want to perform ([a]nd, [o]r, [r]anked): ").lower()

    query_terms = nltk.regexp_tokenize(input("Please enter your query: "), indexer.TOKENIZING_REGEX)

    results = ''
    documentIDs = []
    if query_type[requested] == 'RANKED':

        postings = list(union_postings(query_terms, inverted_index).keys())
        ranking = {}
        L_avg = find_average_document_length()

        for docID in postings:
            ranking[docID] = round(compute_ranking_RSV_11_32(query_terms, docID, L_avg, inverted_index), 2)

        sorted_rankings = sorted(ranking.items(), reverse=True, key=lambda x: x[1])
        top10 = sorted_rankings[:10]

        if len(top10) == 0:
            results = '\nSorry, no documents match your query.'

        for i in range(len(top10)):
            results += f'\n{i+1}. \tDocument ID {top10[i][0]} \twith ranking {top10[i][1]}'

        documentIDs = [t[0] for t in top10]

    elif query_type[requested] == 'AND':

        postings = sorted(list(intersect_postings(query_terms, inverted_index)))
        print(f'\nThere are {len(postings)} documents that contain all query terms.')
        print('The first 10 are:')
        first10 = postings[:10]
        if len(first10) == 0:
            results = '\nSorry, no documents match your query.'

        for i in range(len(first10)):
            results += f'\n{i+1}. \tDocument ID {first10[i]}'

        documentIDs = first10

    elif query_type[requested] == 'OR':

        doc_vs_occurrence_tuples = union_postings(query_terms, inverted_index).items()
        postings = sorted(doc_vs_occurrence_tuples, reverse=True, key=lambda x: (x[1], -x[0]))  # sort first by number of query terms appearing in the document, then by ascending ID
        top10 = postings[:10]
        print(f'\nThere are {len(postings)} documents that contain at least one query term.')

        if len(top10) == 0:
            print('Sorry, no documents match your query.')

        for i in range(len(top10)):
            results += f'\n{i+1}. \tDocument ID {top10[i][0]} \tcontains {top10[i][1]} query terms'

        documentIDs = [t[0] for t in top10]

    else:
        raise AssertionError("Program flow should never reach this code block")

    print(results)

    if len(documentIDs) > 0:
        generated_html_path = generate_html(query_terms, query_type[requested], documentIDs)
        print("\nYou can view the contents of those documents in the browser if you open:")
        print(generated_html_path)
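
`compute_ranking_RSV_11_32` and `find_average_document_length` are not part of this listing. The name points at the RSV weighting given as equation (11.32) in Manning, Raghavan and Schütze's Introduction to Information Retrieval (a BM25-style score); a sketch under that assumption, which additionally assumes the index maps each term to `(document_frequency, {docID: term_frequency})` and that the collection size and per-document length are supplied by the caller:

import math

K1 = 1.5   # typical BM25 term-frequency saturation constant
B = 0.75   # typical BM25 length-normalisation constant


def compute_ranking_rsv(query_terms, doc_id, l_avg, inverted_index, n_docs, doc_length):
    # RSV_d = sum_t log(N / df_t) * ((k1 + 1) * tf_td)
    #         / (k1 * ((1 - b) + b * L_d / L_avg) + tf_td)
    score = 0.0
    for term in query_terms:
        if term not in inverted_index:
            continue
        df, postings = inverted_index[term]
        tf = postings.get(doc_id, 0)
        if tf == 0 or df == 0:
            continue
        idf = math.log(n_docs / df)
        score += idf * ((K1 + 1) * tf) / (K1 * ((1 - B) + B * doc_length / l_avg) + tf)
    return score
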