Example #1
def find_10_most_relevant(query, dictionary, postings, num_of_doc):
    '''
    Compute the cosine similarity between the query and each document,
    using tf-idf weights over the (term, frequency) pairs (lnc weighting
    for documents, ltc for the query).
    Compute a score for each document containing at least one of the
    query terms.
    Return (at most) the 10 most relevant document IDs, sorted by score.

    @param query - The query string: str
    @param dictionary - The dictionary containing the doc frequency of a
                        token: DefaultDict[str, Entry]
    @param postings - The postings dictionary containing a mapping of
                        doc ID to the weight for a given token: Posting
    @param num_of_doc - The number of documents indexed
    '''
    '''
    Get tokens (stemmed words in the query), terms (set of tokens), 
    and the dictionary of term frequency in the query: DefaultDict[str, int]
    '''
    tokens, terms, term_freq = get_term_freq(query)

    if phrasal_query:
        doc_candidate = intersection(terms, postings)
        doc_to_rank = verify(doc_candidate, tokens, postings)

    # Compute the cosine similarity between the query and each document:
    # weight each query term by tf * idf, then length-normalize the
    # resulting query vector
    query_weight = normalize([
        get_tf(freq) * get_idf(num_of_doc, dictionary[term].frequency)
        for (term, freq) in term_freq.items()
    ])

    # Compute the score for each document containing one of those
    # terms in the query.
    score = Counter()
    for ((term, _), q_weight) in zip(term_freq.items(), query_weight):
        if q_weight > 0:
            ''' get the postings lists of the term, update the score '''
            for doc_id, value in postings[term].items():
                if phrasal_query and (doc_id not in doc_to_rank):
                    continue
                score[doc_id] += q_weight * value.weight
    ''' rank and get result '''
    return [doc_id for (doc_id, _) in score.most_common(TOP_K)]
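
The helpers get_term_freq, get_tf, get_idf, and normalize are referenced above but not shown, and TOP_K is presumably a module-level constant (10). A minimal sketch of what they might look like, assuming logarithmic term frequency, a standard idf, cosine (length) normalization, and NLTK for tokenization and stemming; the exact implementations are assumptions, not the original code:

import math
from collections import defaultdict

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

TOP_K = 10  # assumed constant: how many document IDs to return


def get_tf(freq):
    # logarithmic term frequency: 1 + log10(tf), or 0 if the term is absent
    return 1 + math.log10(freq) if freq > 0 else 0


def get_idf(num_of_doc, doc_freq):
    # inverse document frequency: log10(N / df), or 0 for unseen terms
    return math.log10(num_of_doc / doc_freq) if doc_freq > 0 else 0


def normalize(weights):
    # cosine (length) normalization of a weight vector
    norm = math.sqrt(sum(w * w for w in weights))
    return [w / norm for w in weights] if norm > 0 else weights


def get_term_freq(query):
    # stem the query words and count how often each stemmed term occurs
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word.lower()) for word in word_tokenize(query)]
    term_freq = defaultdict(int)
    for token in tokens:
        term_freq[token] += 1
    return tokens, set(tokens), term_freq
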
Example #2
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')

    #reading the files
    corpus = PlaintextCorpusReader(in_dir, '.*')
    file_names_str = corpus.fileids()
    file_names = sorted(map(int, file_names_str))

    #Load corpus and generate the postings dictionary
    postings = defaultdict(dict)
    tokens = list()
    for docID in file_names:
        content = corpus.raw(str(docID))  # read file content
        content = preprocess(content)
        words = tokenize(content)  # tokenization: content -> words
        tokens = stemming(words)  # stemming

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)
        # count the occurrences of each token in the file
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len:
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1
            term_pos += 1
        '''
        Generate weighted (length-normalized) token frequencies.

        Update the postings dictionary: key -> token, value -> a dict mapping
        doc ID to its weighted token frequency.
        '''
        if phrasal_query:

            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = PhrasalToken(freq[0], freq[1], w_tf)
        else:

            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = Token(w_tf)
    '''
    Output dictionary and postings files

    - Dictionary file stores all the tokens, with their doc frequency, the offset
    in the postings file, and the size (in bytes) of their posting list.
    - Postings file stores, for each token, a mapping of document ID to its
    weighted term frequency.
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    #print(postings.items())
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            #print(value)
            '''
            len(value) := the document frequency of the token
                       := the number of documents in which the token appears
            offset := current writing position of the postings file
            size := the number of bytes written to the postings file for
                    this token's posting list
            '''
            offset = postings_file.tell()
            size = postings_file.write(pickle.dumps(value))
            dictionary[key] = Entry(len(value), offset, size)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(url_map, dictionary_file)
        pickle.dump(doc_id_map, dictionary_file)
        pickle.dump(pr_result, dictionary_file)
        pickle.dump(dictionary, dictionary_file)
        print("dictionary done")
Example #3
def execute_search(query, dictionary, postings, num_of_doc):
    '''
    Compute the cosine similarity between the query and each document,
    using tf-idf weights over the (term, frequency) pairs (lnc weighting
    for documents, ltc for the query).
    Compute a score for each document containing at least one of the
    query terms.
    Return a Counter mapping each matching document ID to its score.

    @param query - The query string: str
    @param dictionary - The dictionary containing the doc frequency of a
                        token: DefaultDict[str, Entry]
    @param postings - The postings dictionary containing a mapping of
                        doc ID to the weight for a given token: Posting
    @param num_of_doc - The number of documents indexed
    '''
    '''
    Get tokens (stemmed words in the query), terms (set of tokens), 
    and the dictionary of term frequency in the query: DefaultDict[str, int]
    '''

    if not boolean_query:
        if lesk_on:
            query = lesk(query)
            #print(query)
        if expand:
            query = expand_query(query)
            #print(query)

    tokens, terms, term_freq = get_term_freq(query)

    if phrasal_query:
        doc_candidate = intersection(terms, postings)
        doc_to_rank = verify(doc_candidate, tokens, postings)

    # Compute the cosine similarity between the query and each document:
    # weight each query term by tf * idf, then length-normalize the
    # resulting query vector
    query_weight = normalize([
        get_tf(freq) * get_idf(num_of_doc, dictionary[term].frequency)
        for (term, freq) in term_freq.items()
    ])

    # Compute the score for each document containing one of those
    # terms in the query.
    score = Counter()
    query_vector = {}
    for ((term, _), q_weight) in zip(term_freq.items(), query_weight):
        query_vector[term] = q_weight
        if q_weight > 0:
            ''' get the postings lists of the term, update the score '''
            for doc_id, value in postings[term].items():
                if phrasal_query and (doc_id not in doc_to_rank):
                    continue
                score[doc_id] += q_weight * value.weight
    if not boolean_query and prf_on:
        ''' rank and get result'''
        most_rel_docs = [
            doc_id for (doc_id, _) in score.most_common(K_MOST_RELEVANT)
        ]
        new_query = pseudo_rel_feedback(postings, dictionary, most_rel_docs,
                                        query_vector)
        ''' normalizing the new query '''
        norm = sqrt(sum([i * i for i in new_query.values()], 0))
        for term in new_query:
            new_query[term] = new_query[term] / norm

        score = Counter()
        for term in new_query:
            try:
                items = postings[term].items()
            except:
                continue
            for doc_id, freq in items:
                if phrasal_query and (doc_id not in doc_to_rank):
                    continue
                score[doc_id] += new_query[term] * value.weight
    return score
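
The pseudo_rel_feedback helper is not shown in these examples, and K_MOST_RELEVANT is presumably a module-level constant. A hedged, Rocchio-style sketch that is consistent with how it is called here (postings, dictionary, the top-ranked doc IDs, and the original query vector go in; a new term-to-weight mapping comes out). ALPHA and BETA are hypothetical tuning constants, not values from the original code:

from collections import defaultdict

ALPHA = 1.0   # weight of the original query terms (assumed constant)
BETA = 0.75   # weight of the centroid of the assumed-relevant documents


def pseudo_rel_feedback(postings, dictionary, most_rel_docs, query_vector):
    # Rocchio-style update: new_q = ALPHA * q + BETA * centroid(top-ranked docs)
    new_query = defaultdict(float)
    for term, weight in query_vector.items():
        new_query[term] += ALPHA * weight
    if not most_rel_docs:
        return new_query
    for term in dictionary:
        try:
            posting_list = postings[term]
        except KeyError:
            continue
        centroid = sum(posting_list[doc_id].weight
                       for doc_id in most_rel_docs
                       if doc_id in posting_list) / len(most_rel_docs)
        if centroid > 0:
            new_query[term] += BETA * centroid
    return new_query
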
Example #4
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')
    ''' Create a sorted list of the files inside the directory '''
    corpus = PlaintextCorpusReader(in_dir, r'.*\.txt')
    file_id_strs = corpus.fileids()
    # file_ids = sorted(convert2int(file_id_strs))
    ''' Load corpus and generate the postings dictionary '''
    postings = defaultdict(dict)
    tokens = list()
    docsInfo = defaultdict(dict)
    for fn_str in file_id_strs:
        content_raw = corpus.raw(fn_str)  # read file content
        title, anchor_text, content = splitContent(content_raw)
        words = tokenize(uk2us(content))  # tokenization: content -> words
        tokens = stemming(words, stopword=False)  # stemming

        fn = convert2int(fn_str)
        docsInfo[fn] = [title, anchor_text]

        print("processing: " + fn_str)

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)
        # count the occurrences of each token in the file
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len:
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1

            term_pos += 1
        '''
        Generate weighted (length-normalized) token frequencies.

        Update the postings dictionary: key -> token, value -> a dict mapping
        doc ID to its (frequency, weighted token frequency) pair.
        '''
        if phrasal_query:

            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][fn] = PhrasalToken(freq[0], freq[1], w_tf)
        else:

            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][fn] = Token(freq, w_tf)
    '''
    Output dictionary and postings files

    - Dictionary file stores all the tokens, with their doc frequency, the offset
    in the postings file, and the size (in bytes) of their posting list.
    - Postings file stores, for each token, a mapping of document ID to its
    (frequency, weighted term frequency) pair.
    '''

    # write postings file
    dictionary = defaultdict(Entry)
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            '''
            len(value) := the document frequency of the token
                       := the number of documents in which the token appears
            offset := current writing position of the postings file
            size := the number of bytes written to the postings file for
                    this token's posting list
            '''
            offset = postings_file.tell()
            size = postings_file.write(pickle.dumps(value))
            dictionary[key] = Entry(len(value), offset, size)
    print("postings done.")

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:
        pickle.dump(len(file_id_strs), dictionary_file)
        print("docs length: " + str(len(file_id_strs)))
        pickle.dump(docsInfo, dictionary_file)
        print(docsInfo)
        print("docsInfo done.")
        pickle.dump(dictionary, dictionary_file)
        print(dictionary)
        print("dictionary done.")
Example #5
def build_index(in_dir, out_dict, out_postings):
    """
    build index from documents stored in the input directory,
    then output the dictionary file and postings file
    """
    print('indexing...')
    ''' Read the csv file into a nested list '''
    # raise the csv field size limit as far as the platform allows
    maxInt = sys.maxsize
    while True:
        try:
            csv.field_size_limit(maxInt)
            break
        except OverflowError:
            maxInt = int(maxInt / 10)

    with open(in_dir, 'r', encoding='UTF-8') as csvfile:
        reader = csv.reader(csvfile)
        rows = [row for row in reader]

    rows.pop(0)  # drop the csv header row
    ''' Load corpus and generate the postings dictionary '''
    postings = defaultdict(dict)
    tokens = list()
    docsInfo = defaultdict(dict)
    # docs_to_terms = defaultdict(dict)

    print(str(len(rows)) + " rows in total. ")

    rowID = 1
    # map the newly assigned consecutive doc numbers back to the original docIDs
    consecutive_ids = defaultdict(dict)
    doc_num = 0
    for docID, _, content, date, court in rows:
        consecutive_ids[doc_num] = docID
        docID = doc_num
        doc_num += 1
        print("processing row: " + str(rowID))
        rowID += 1
        docsInfo[docID] = [date, court]
        words = tokenize(uk2us(content))  # tokenization: content -> words
        tokens = stemming(words, stopword=True, lemma=True)  # stemming
        # docs_to_terms[docID] = tokens

        if phrasal_query:
            token_len = defaultdict(list)
        else:
            token_len = defaultdict(int)
        # count the occurrences of each token in the file
        term_pos = 0
        for token in tokens:
            if phrasal_query:
                if token in token_len:
                    token_len[token][0] += 1
                    token_len[token][1].append(term_pos)
                else:
                    token_len[token] = [1, [term_pos]]
            else:
                token_len[token] += 1

            term_pos += 1
        '''
        Generate weighted (length-normalized) token frequencies.

        Update the postings dictionary: key -> token, value -> a dict mapping
        doc ID to its weighted token frequency.
        '''
        if phrasal_query:

            weighted_tokenfreq = normalize(
                [get_tf(y[0]) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = PhrasalToken(freq[1], w_tf)
        else:

            weighted_tokenfreq = normalize(
                [get_tf(y) for (x, y) in token_len.items()])

            for ((token, freq), w_tf) in zip(token_len.items(),
                                             weighted_tokenfreq):
                postings[token][docID] = Token(w_tf)
    '''
    Output dictionary and postings files

    - Dictionary file stores all the tokens, with their doc frequency and the
    offset of their posting list in the postings file.
    - Postings file stores, for each token, a mapping of document ID to its
    weighted term frequency.
    '''
    # write postings file
    dictionary = defaultdict(Entry)
    with open(out_postings, mode="wb") as postings_file:
        for key, value in postings.items():
            '''
            len(value) := the document frequency of the token
                       := the number of documents in which the token appears
            offset := current writing position of the postings file
            '''
            offset = postings_file.tell()
            pickle.dump(value, postings_file)
            dictionary[key] = Entry(len(value), offset)

    # write dictionary file
    with open(out_dict, mode="wb") as dictionary_file:

        pickle.dump(len(rows), dictionary_file)
        print("length done.")
        pickle.dump(consecutive_ids, dictionary_file)
        pickle.dump(docsInfo, dictionary_file)
        print("docsInfo done.")
        # pickle.dump(docs_to_terms, dictionary_file)
        # print("docs_to_terms done")
        pickle.dump(dictionary, dictionary_file)
        print("dictionary done")