Exemple #1
0
def get_top_members(results, classes, x):
    """
    Pool the entries that `classes` maps to the leading tenth of `results`
    and pass the pool, together with `x`, to helper.get_top_k.

    results -- sequence of lookup keys; only the first int(0.1 * len(results))
               are used, so fewer than 10 results gives an empty pool
    classes -- mapping from a result key to an iterable of entries
    x       -- forwarded unchanged as the second argument of helper.get_top_k

    Results that have no entry in `classes` are skipped.
    """
    cutoff = int(0.1 * len(results))
    pooled = []
    for doc_id in results[:cutoff]:
        try:
            related = classes[doc_id]
        except KeyError:
            # No entry recorded for this result; leave it out of the pool.
            continue
        pooled.extend(related)
    return helper.get_top_k(pooled, x)
Exemple #2
0
def get_top_classes(results, classes, x):
    """
    Collect the value that `classes` holds for each of the leading tenth of
    `results` and pass the collection, together with `x`, to helper.get_top_k.

    results -- sequence of lookup keys; only the first int(0.1 * len(results))
               are used, so fewer than 10 results gives an empty collection
    classes -- mapping from a result key to a single value
    x       -- forwarded unchanged as the second argument of helper.get_top_k

    Results that have no entry in `classes` are skipped.
    """
    cutoff = int(0.1 * len(results))
    collected = []
    for doc_id in results[:cutoff]:
        try:
            label = classes[doc_id]
        except KeyError:
            # No value recorded for this result; skip it.
            continue
        collected.append(label)
    return helper.get_top_k(collected, x)
Exemple #3
0
def get_top_members(results, classes, x):
    """
    Pool the entries that `classes` maps to the leading tenth of `results`
    and pass the pool, together with `x`, to helper.get_top_k.

    results -- sequence of lookup keys; only the first int(0.1 * len(results))
               are used, so fewer than 10 results gives an empty pool
    classes -- mapping from a result key to an iterable of entries
    x       -- forwarded unchanged as the second argument of helper.get_top_k

    Results that have no entry in `classes` are skipped.
    """
    cutoff = int(0.1 * len(results))
    pooled = []
    for doc_id in results[:cutoff]:
        try:
            related = classes[doc_id]
        except KeyError:
            # No entry recorded for this result; leave it out of the pool.
            continue
        pooled.extend(related)
    return helper.get_top_k(pooled, x)
Exemple #4
0
def get_top_classes(results, classes, x):
    """
    Collect the value that `classes` holds for each of the leading tenth of
    `results` and pass the collection, together with `x`, to helper.get_top_k.

    results -- sequence of lookup keys; only the first int(0.1 * len(results))
               are used, so fewer than 10 results gives an empty collection
    classes -- mapping from a result key to a single value
    x       -- forwarded unchanged as the second argument of helper.get_top_k

    Results that have no entry in `classes` are skipped.
    """
    cutoff = int(0.1 * len(results))
    collected = []
    for doc_id in results[:cutoff]:
        try:
            label = classes[doc_id]
        except KeyError:
            # No value recorded for this result; skip it.
            continue
        collected.append(label)
    return helper.get_top_k(collected, x)
Exemple #5
0
def _split_pipe_list(text):
    """Split a '|'-delimited string into a list of whitespace-trimmed parts."""
    return [part.strip() for part in text.strip().split('|')]


def build(index_directory, dictionary_file, postings_file):
    """
    Index every regular file in `index_directory` and write the results to disk.

    Each file is parsed as XML. Child elements of the root are selected by
    their 'name' attribute:
      - 'Title' and 'Abstract' text is tokenized and indexed
      - 'IPC Class' and 'UPC Class' text is stored stripped, per document
      - 'Family Members' and 'Cited By' text is split on '|' into a list
        of stripped entries, per document
    Elements with no text, or with no 'name' attribute, are ignored instead
    of aborting the whole run.

    The document id used as key everywhere is remove_file_ext(file_name).

    index_directory -- directory holding the XML files to index
    dictionary_file -- passed through to write_index_to_disk
    postings_file   -- passed through to write_index_to_disk
    """
    files_to_index = [f for f in listdir(index_directory) if isfile(join(index_directory, f))]
    total_files = len(files_to_index)
    directory_prefix = format_directory_path(index_directory)

    index = []
    doc_lengths = {}
    doc_top_terms = {}
    IPC_class = {}
    UPC_class = {}
    family_members = {}
    cited_by = {}

    for counter, file_name in enumerate(files_to_index, 1):
        doc_id = remove_file_ext(file_name)

        # Read XML
        root = ET.parse(directory_prefix + file_name).getroot()
        tokens = []
        for child in root:
            text = child.text
            if text is None:
                # ElementTree reports an empty element's text as None;
                # there is nothing to tokenize or store for it.
                continue
            attr_name = child.attrib.get('name')
            if attr_name == 'Title' or attr_name == 'Abstract':
                tokens.extend(build_tokens(text))
            elif attr_name == 'IPC Class':
                IPC_class[doc_id] = text.strip()
            elif attr_name == 'UPC Class':
                UPC_class[doc_id] = text.strip()
            elif attr_name == 'Family Members':
                family_members[doc_id] = _split_pipe_list(text)
            elif attr_name == 'Cited By':
                cited_by[doc_id] = _split_pipe_list(text)

        tokens = helper.remove_stop_words(helper.filter_invalid_characters(tokens))
        doc_top_terms[doc_id] = helper.get_top_k(tokens, 10)

        # build tokens
        doc_lengths[doc_id] = get_doc_length(tokens)
        index.extend(add_doc_id_to_tokens(tokens, doc_id))

        if counter % 300 == 0:
            # Single-argument print(...) behaves the same as the print
            # statement on Python 2 and is also valid on Python 3.
            print('indexing ............... {}% completed'.format(round(float(counter) / total_files * 100, 2)))

    print('Writing index to disk...')
    index = sort_inverted_index(index)
    index = group_index(index)
    write_index_to_disk(index, dictionary_file, postings_file)
    write_meta_data_to_disk(doc_lengths, total_files, doc_top_terms, UPC_class, IPC_class, family_members, cited_by)
Exemple #6
0
def expand_query(results, doc_top_terms, inverted_index, meta_data):
    """
    Query expansion, aimed at the anomalous state of knowledge problem.

    Take the leading 10% of `results` (int(0.1 * len(results)) documents, so
    fewer than 10 results selects none). Gather the pre-indexed top terms of
    each of those documents from `doc_top_terms` into one pool, reduce the
    pool with helper.get_top_k(pool, 10), and run the outcome as a new query
    through execute_query.

    Returns whatever execute_query returns for the expanded query.
    """
    cutoff = int(0.1 * len(results))
    candidate_terms = [
        term
        for doc_id in results[:cutoff]
        for term in doc_top_terms[doc_id]
    ]
    expanded = helper.get_top_k(candidate_terms, 10)
    return execute_query([], expanded, [], inverted_index, meta_data)
Exemple #7
0
def expand_query(results, doc_top_terms, inverted_index, meta_data):
    """
    Query expansion, aimed at the anomalous state of knowledge problem.

    Take the leading 10% of `results` (int(0.1 * len(results)) documents, so
    fewer than 10 results selects none). Gather the pre-indexed top terms of
    each of those documents from `doc_top_terms` into one pool, reduce the
    pool with helper.get_top_k(pool, 10), and run the outcome as a new query
    through execute_query.

    Returns whatever execute_query returns for the expanded query.
    """
    cutoff = int(0.1 * len(results))
    candidate_terms = [
        term
        for doc_id in results[:cutoff]
        for term in doc_top_terms[doc_id]
    ]
    expanded = helper.get_top_k(candidate_terms, 10)
    return execute_query([], expanded, [], inverted_index, meta_data)