    # (function head truncated in this listing; the tail below appears to merge
    # two per-term posting blocks and write the merged line for one term)
    d_block_1 = d_block_1.rstrip('\n')
    d_block = d_block_1 + "," + d_block_2
    mfw.write(term + "=" + d_block)


from datetime import datetime

from nltk.stem import PorterStemmer


# step 1: go through all the documents in batches of 1000 and find the tuples and unique terms
if __name__ == "__main__":
    startTime = datetime.now()

    stopwords_path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/stoplist.txt"
    with open(stopwords_path, 'r') as sf:
        stopList = set(sf.read().split())  # a set makes the stopword membership tests exact
    # print(stopList)

    # the stemmer object
    ps = PorterStemmer()
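    # e.g. ps.stem("running") returns "run"; the stems are what get indexed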

    path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/ap89_collection/"
    tupleFilePath = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/indexStemStop/tuples/"

    fileCount = 0
    uniqueTerms = {}
    documents = {}
    doc_close_tag = "</DOC>"
    doc_id_tag = "<DOCNO>"
    text_open_tag = "<TEXT>"
    text_close_tag = "</TEXT>"
    text = ""
    doc_count = 0
    uniqueTermCount = 0
    totalCF = 0
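
    # Illustrative sketch (an assumption, not the original loop, which did not
    # survive extraction): one way the "step 1" pass could walk the collection
    # files under `path` and collect each document's text using the tag
    # constants defined above.
    import os
    in_text = False
    for fileName in os.listdir(path):
        with open(path + fileName, 'r') as df:
            for line in df:
                if line.startswith(doc_id_tag):
                    doc_id = line.split()[1]
                elif line.startswith(text_open_tag):
                    in_text = True
                elif line.startswith(text_close_tag):
                    in_text = False
                elif line.startswith(doc_close_tag):
                    documents[doc_id] = text
                    doc_count += 1
                    text = ""
                elif in_text:
                    text += line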

# Example 2
from decimal import Decimal
from nltk.stem import PorterStemmer

            # (start of this example was truncated; this loop body evidently maps
            # a document number to its DOCNO, one "<num> <doc_id>" pair per line)
            doc_num = doc.split()[0]
            doc_id = doc.split()[1]
            doc_name_id_map[doc_num] = doc_id

    sum_doc_len = 0
    doc_len_file = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/document_length.txt"
    doc_len_map = {}
    with open(doc_len_file, 'r') as dl:
        for dl_line in dl:
            fields = dl_line.split()
            doc_id, doc_len = fields[0], int(fields[1])
            doc_len_map[doc_id] = doc_len
            sum_doc_len += doc_len
        avg_len_d = Decimal(sum_doc_len) / Decimal(no_of_docs)
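
    # Illustrative sketch (an assumption, not from the original): avg_len_d is
    # the quantity Okapi TF normalizes by,
    #   okapi_tf(w, d) = tf(w,d) / (tf(w,d) + 0.5 + 1.5 * len(d) / avg_len_d)
    def okapi_tf(tf, doc_len):
        return Decimal(tf) / (Decimal(tf) + Decimal("0.5") +
                              Decimal("1.5") * Decimal(doc_len) / avg_len_d)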

    # the stemmer object
    ps = PorterStemmer()

    # converting the query terms to their stems and removing the stopwords from the queries
    query_file = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/query_desc.51-100.short.txt"
    stopwords_path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/stoplist.txt"
    with open(stopwords_path, 'r') as sf:
        stopList = set(sf.read().split())  # a set makes the stopword membership tests exact
    with open(query_file, 'r') as f:
        lines = f.readlines()

    queries = {}  # stemmed, comma-separated query words keyed by query number
    word_dfw_map = {}  # term -> number of docs containing the term, i.e. df(w)
    term_id_tf_map = {}  # term -> {doc_id: tf}, i.e. TF(w, d) for each query word
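
    # Illustrative sketch (an assumption): how the maps above typically combine
    # in an Okapi BM25-style score for one query word w and document d; k1/b are
    # conventional defaults, and no_of_docs comes from the truncated setup.
    import math
    def bm25_w(w, d, k1=1.2, b=0.75):
        tf = term_id_tf_map[w].get(d, 0)
        dfw = word_dfw_map[w]
        K = k1 * ((1 - b) + b * doc_len_map[d] / float(avg_len_d))
        return math.log((no_of_docs + 0.5) / (dfw + 0.5)) * ((k1 + 1) * tf) / (K + tf)
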
    # (re-reads the same lengths file; doc_len_map is already filled above)
    with open(doc_len_file, 'r') as dl:
        for dl_line in dl:
            fields = dl_line.split()
            doc_len_map[fields[0]] = int(fields[1])

    # DEFINING CONSTANTS
    C = 1500
    indexParamsFile = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/IndexNoStopStemmed/indexParams.txt"
    with open(indexParamsFile, 'r') as ipf:
        param = ipf.readline()
    params = param.split("\t")
    V = int(params[1])
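
    # Illustrative sketch (an assumption): V read from indexParams looks like a
    # vocabulary size, the usual denominator term in Laplace-smoothed
    # language-model scoring, p_laplace(w|d) = (tf(w,d) + 1) / (len(d) + V).
    def p_laplace(tf, doc_len):
        return (tf + 1.0) / (doc_len + V)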

    # the stemmer object
    ps = PorterStemmer()

    # converting the query terms to their stems and removing the stopwords from the queries
    query_file = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/query_desc.51-100.short.txt"
    stopwords_path = "/Users/snehagaikwad/Documents/IR_data/AP_DATA/stoplist.txt"
    with open(stopwords_path, 'r') as sf:
        stopList = set(sf.read().split())  # a set makes the stopword membership tests exact
    with open(query_file, 'r') as f:
        lines = f.readlines()

    queries = {}  # stemmed, comma-separated query words keyed by query number
    term_doc_position_map = {}
    doc_list = []
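
    # Illustrative sketch (an assumption): term_doc_position_map presumably holds
    # term -> {doc_id: [positions]}; proximity scoring usually needs the smallest
    # window in a document covering one occurrence of every query term, e.g.:
    import itertools
    def min_span(terms, d):
        position_lists = [term_doc_position_map[t][d] for t in terms]
        # brute force over one position per term (fine for short posting lists)
        return min(max(combo) - min(combo)
                   for combo in itertools.product(*position_lists))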

    for query in lines:
        if query != "\n":