def index(self):
    for doi, title, abstract in self.col:
        if self.tokenizerType == '0':  # simple
            tokenizer = Tokenizer.SimpleTokenizer(title, abstract)
        else:  # better
            tokenizer = Tokenizer.BetterTokenizer(title, abstract)
        terms = tokenizer.getTerms()
        for term in terms:
            if term in self.term_map.keys():
                if doi in self.term_map[term].keys():
                    self.term_map[term][doi] += 1
                else:
                    self.term_map[term][doi] = 1
            else:
                term_freq_map = {doi: 1}  # key: docId, value: term_freq
                self.term_map[term] = term_freq_map
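# Tiny illustration (invented data, hypothetical helper name, not part of the real pipeline)
# of the structure index() builds above: each term maps to a postings map {docId: term_freq},
# so the document frequency of a term is simply the number of postings it holds.
def _demo_term_map():
    term_map = {"virus": {"doi-0001": 3, "doi-0002": 1}}
    assert len(term_map["virus"]) == 2           # document frequency of "virus"
    assert term_map["virus"]["doi-0001"] == 3    # term frequency of "virus" in doi-0001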
def index(self):
    self.N = 0
    if self.tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')
    for doi, title, abstract in CorpusReader.CorpusReader(self.collectionPath).readCorpus():
        self.N += 1
        tokenizer.changeText(title + " " + abstract)
        terms = tokenizer.getTerms()
        # first, we populate the dictionary postingsMaps with the term frequency {term: {docId: term_freq}}
        for term in terms:
            if term in self.postingsMaps.keys():
                if doi in self.postingsMaps[term].keys():
                    self.postingsMaps[term][doi] += 1
                else:
                    self.postingsMaps[term][doi] = 1
            else:
                self.postingsMaps[term] = {doi: 1}  # key: docId, value: term_freq

    # lnc (logarithmic term frequency, no document frequency, cosine normalization)
    # then, we modify postingsMaps from {term: {docId: term_freq}} to {term: (idf, {docId: weight})}
    # logarithmic term frequency
    self.postingsMaps = {
        term: (getIDFt(term, self.postingsMaps, self.N),
               {docId: getLogWeight(term, docId, self.postingsMaps)
                for docId in self.postingsMaps[term].keys()})
        for term in self.postingsMaps.keys()
    }
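# getIDFt() and getLogWeight() are called above but not defined in this section. A minimal
# sketch of what they are assumed to compute, based on the idf and logarithmic term-frequency
# comments; the real implementations in the repository may differ (e.g. cosine normalization).
import math

def getIDFt(term, postingsMaps, N):
    # idf_t = log10(N / df_t), with df_t = number of documents in which the term occurs
    return math.log10(N / len(postingsMaps[term]))

def getLogWeight(term, docId, postingsMaps):
    # w_(t,d) = 1 + log10(tf_(t,d)) when tf > 0, else 0
    tf = postingsMaps[term][docId]
    return 1 + math.log10(tf) if tf > 0 else 0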
def main(argv): """ Main script for the discipline's assignment 3. This script is responsable for calling the correct classes and for creating the data flow necessary for querying an existing index. :param argv: receives the arguments passed to the program during execution :type argv: list<str> """ HELP = """USAGE:\n python3 QueryIndex.py [-h] [-o outputFile] [-t tokenizer] [-r limitRAM] [-f feedback] [-s rocchioScope] [-c numChamps] [-l limit] <queryFile> <indexFolder> [a b g] \n OPTIONS: h - shows this help f - tells o - define output file's name t - define the tokenizer used for the program r - limit program execution to defined RAM capacity f - define the feedback used for the Rocchio algorithm s - define the number of retrieved documents considered for the Rocchio algorithm c - define the size of the champions list l - define the number of scores to return ARGUMENTS: outputFile - actual name for the output file tokenizer - must be 'simple' or 'complex' limitRAM - maximum RAM(in Gb) used in the indexing process queryFile - name of the file containing 1 or more queries indexFolder - name of the folder that contains the indexes a - alpha weight for the Rocchio algorithm b - beta weight for the Rocchio algorithm g - gamma weight for the Rocchio algorithm feedback - must be 'user' or 'pseudo' rocchioScope - number of retrieved documents considered for the Rocchio algorithm numChamps - size of the champions list limit - limit number of scores to return""" # default variables outputFile = "../queryResults/" tokenizer = "complex" maximumRAM = None feedback = None # None, pseudo or user rocchioWeights = [] # alpha, beta and gamma n = None # number of relevant docs (for feedback) k = 10000 # champions list size limit = 100 # number of scores try: opts, args = getopt.getopt(argv, "ho:t:r:f:c:s:l:") except getopt.GetoptError: print(HELP) return 1 if args == [] or (len(args) != 2 and len(args) != 4 and len(args) != 5): print(HELP) return 2 # verifies if any option was passed to the script for opt, arg in opts: if opt == '-h': print(HELP) return 3 elif opt == "-o": outputFile = arg elif opt == "-t": assert arg in ( "simple", "complex" ), "Tokenizer option must be either \"simple\" or \"complex\"." tokenizer = arg elif opt == "-r": maxM = psutil.virtual_memory().free if arg != "": maximumRAM = float(arg) * 1000000000 else: maximumRAM = maxM if maximumRAM > maxM: maximumRAM = maxM print( "Warning: Memory available is less than the asked value, maximumRAM set to " + str(int(maximumRAM / 1000000000)) + "Gb.") elif opt == "-f": assert arg in ( "user", "pseudo" ), "Feedback option must be either \"user\" or \"pseudo\"." 
            feedback = arg
        elif opt == "-c":
            assert int(arg) > 0, "Error: numChamps value must be a positive integer"
            k = int(arg)
        elif opt == "-s":
            assert int(arg) > 0, "Error: rocchioScope value must be a positive integer"
            n = int(arg)
        elif opt == "-l":
            assert int(arg) > 0, "Error: limit value must be a positive integer"
            limit = int(arg)

    if feedback:
        if feedback == "pseudo":
            assert len(args) == 4, "Error: if you want to use pseudo feedback, please insert alpha and beta as well"
            rocchioWeights.append(float(args[2]))
            rocchioWeights.append(float(args[3]))
        else:
            assert len(args) == 5, "Error: if you want to use user feedback, please insert alpha, beta and gamma as well"
            rocchioWeights.append(float(args[2]))
            rocchioWeights.append(float(args[3]))
            rocchioWeights.append(float(args[4]))

    # taking into account the chosen tokenizer, the respective data flow is created
    if tokenizer == "simple":
        assignment3(outputFile, Tokenizer.SimpleTokenizer(), maximumRAM, feedback, n, k, limit, args[0], args[1],
                    rocchioWeights)
    else:  # 'complex' = default tokenizer
        assignment3(outputFile, Tokenizer.ComplexTokenizer(), maximumRAM, feedback, n, k, limit, args[0], args[1],
                    rocchioWeights)
    return 0
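# The alpha/beta/gamma arguments parsed above feed a Rocchio relevance-feedback step. A minimal
# sketch of the standard Rocchio update those weights are assumed to parameterize; the real
# logic lives in assignment3 / the query pipeline and may differ. Vectors are plain
# {term: weight} dictionaries and the function name is purely illustrative.
def rocchio_update(query_vec, relevant_docs, nonrelevant_docs, alpha, beta, gamma=0.0):
    """q_new = alpha * q + beta * centroid(relevant) - gamma * centroid(nonrelevant)."""
    new_query = {term: alpha * w for term, w in query_vec.items()}
    for docs, weight in ((relevant_docs, beta), (nonrelevant_docs, -gamma)):
        if not docs:
            continue
        factor = weight / len(docs)
        for doc_vec in docs:
            for term, w in doc_vec.items():
                new_query[term] = new_query.get(term, 0.0) + factor * w
    return new_query

# With pseudo feedback only alpha and beta are supplied on the command line (gamma defaults
# to 0); with user feedback all three weights are used.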
def main(argv): """ Main script for the discipline's assignments 1 and 2. This script is responsable for calling the correct classes and for creating the data flow necessary for the index to be created and persisted. :param argv: receives the arguments passed to the program during execution :type argv: list<str> """ HELP = """USAGE:\n python3 CreateIndex.py [-h] [-p] [-w] [-o outputFolder] [-l limit] [-t tokenizer] [-r limitRAM] inputFolder\n OPTIONS: h - shows this help o - define output file's folder l - define limit for the number of lines to be processed in each input file t - define the tokenizer used for the program r - limit program execution to defined RAM capacity w - process weights of terms p - process positions of terms ARGUMENTS: outputFolder - actual name for the output folder limit - value for the number of lines limit tokenizer - must be simple(for the simple 2.1 tokenizer) or complex(for the more advanced 2.2 tokenizer) limitRAM - maximum RAM(in Gb) used in the indexing process inputFolder - name of the folder that contains the input files to be processed""" # default variables outputFolder = "index" limit = None tokenizer = "simple" maximumRAM = None weightCalc = False positionCalc = False fileLimit = float("inf") try: opts, args = getopt.getopt(argv, "wpho:t:l:r:f:") except getopt.GetoptError: print(HELP) return 1 if args == [] or len(args) != 1: print(HELP) return 2 # verifies if any option was passed to the script for opt, arg in opts: if opt == '-h': print(HELP) return 3 elif opt == "-o": outputFolder = arg elif opt == "-l": limit = int(arg) elif opt == "-f": fileLimit = float(arg) * 1000000000 elif opt == "-t": assert arg in ( "simple", "complex" ), "Tokenizer option must be either \"simple\" or \"complex\"." tokenizer = arg elif opt == "-w": weightCalc = True elif opt == "-p": positionCalc = True elif opt == "-r": maxM = psutil.virtual_memory().free if arg != "": maximumRAM = float(arg) * 1000000000 else: maximumRAM = maxM if maximumRAM > maxM: maximumRAM = maxM print( "Warning: Memory available is less than the asked value, maximumRAM set to " + str(int(maximumRAM / 1000000000)) + "Gb.") # taking in account the choosen tokenizer, the respective data flow is created if tokenizer == "simple": if maximumRAM is None: assignment1(Tokenizer.SimpleTokenizer(), outputFolder, args[0], limit, weightCalc, positionCalc, fileLimit) else: assignment2(Tokenizer.SimpleTokenizer(), outputFolder, args[0], limit, weightCalc, positionCalc, maximumRAM, fileLimit) else: # 'complex' = default tokenizer if maximumRAM is None: assignment1(Tokenizer.ComplexTokenizer(), outputFolder, args[0], limit, weightCalc, positionCalc, fileLimit) else: assignment2(Tokenizer.ComplexTokenizer(), outputFolder, args[0], limit, weightCalc, positionCalc, maximumRAM, fileLimit) return 0
def index(self):
    start_indexing = timeit.default_timer()
    self.N = 0
    if self.tokenizerType == '0':  # simple
        tokenizer = Tokenizer.SimpleTokenizer('')
    else:  # better
        tokenizer = Tokenizer.BetterTokenizer('')
    corpusReader = CorpusReader.CorpusReader(self.collectionPath)
    print('start memory available: {}%'.format(
        psutil.virtual_memory().available * 100 / psutil.virtual_memory().total))
    corpusReader.startReadingCorpus()
    nDicts = 0

    # ------------------------------------------ INDEX WITH TERM POSITIONS ------------------------------------------
    if self.withPositions:
        # ---------------------------------------------- INDEX BLOCKS -----------------------------------------------
        while True:
            # -------------------------------------------- Get Document ---------------------------------------------
            doc = corpusReader.readDoc()
            if doc == -1:  # last document
                if self.postingsMaps != {}:
                    self.writeIndexToBlockFileWithPositions('./dicts/dict' + str(nDicts))
                    print('available memory: {} %'.format(
                        psutil.virtual_memory().available * 100 / psutil.virtual_memory().total))
                    nDicts += 1
                    self.postingsMaps = {}  # clean dictionary
                break
            elif doc is None:
                continue

            (doi, title, abstract) = doc
            del doc
            self.N += 1

            # ------------------------------------------- Get Document Terms ----------------------------------------
            tokenizer.changeText(title + " " + abstract)
            del title
            del abstract
            terms, termPositions = tokenizer.getTerms(withPositions=True)
            tokenizer.changeText("")  # clean term memory from tokenizer

            # if available memory is running low, flush the current block to disk before indexing this document
            if (psutil.virtual_memory().available * 100 /
                    psutil.virtual_memory().total) <= 10 and self.postingsMaps != {}:
                self.writeIndexToBlockFileWithPositions('./dicts/dict' + str(nDicts))
                print('available memory: {} %'.format(
                    psutil.virtual_memory().available * 100 / psutil.virtual_memory().total))
                nDicts += 1
                self.postingsMaps = {}  # clean dictionary

            # populate the dictionary postingsMaps with the term positions {term: {docId: [termPositions]}}
            for termInd in range(len(terms)):
                if terms[termInd] not in self.postingsMaps.keys():
                    self.postingsMaps[terms[termInd]] = {doi: termPositions[termInd]}
                else:
                    # doi is never already present for this term, since each document is read only once
                    self.postingsMaps[terms[termInd]][doi] = termPositions[termInd]
            del terms
            del termPositions

        # ---------------------------------------- ENDED INDEXING BLOCKS --------------------------------------------
        stop_indexing = timeit.default_timer()
        print('indexing into blocks: {} minutes and {} seconds'.format(
            (stop_indexing - start_indexing) // 60, (stop_indexing - start_indexing) % 60))

        start = timeit.default_timer()
        if os.path.isfile("index"):
            os.remove("index")
        final_dict = open("index", "w")
        dict_names = ['./dicts/dict' + str(nDict) for nDict in range(nDicts)]

        # -------------------------------------------- MERGE INDEX BLOCKS -------------------------------------------
        print('merging dictionary phase and writing index to disk')
        temp_dicts = [open(dict_name, "r") for dict_name in dict_names]
        ntermsToDisk = 0
        while temp_dicts != []:
            # iterate over a copy, since exhausted block files are removed from the list
            for dict_file in list(temp_dicts):
                # ---------------------- Read first line of each file -----------------------------------------------
                line = dict_file.readline()
                if not line:
                    dict_file.close()
                    # delete dictionary block from disk
                    os.remove(dict_names[temp_dicts.index(dict_file)])
                    dict_names.remove(dict_names[temp_dicts.index(dict_file)])
                    temp_dicts.remove(dict_file)
                    continue

                # ------------------------ Save line info to memory -------------------------------------------------
                # line format: 'term', 'docid', 'pos1,pos2,pos3', 'docid', 'pos1,pos2,pos3', ...
                info = line.split('|')
                info.remove('\n')
                while '' in info:
                    info.remove('')
                term = info[0]
                docIds = info[1:][0::2]  # [docid, docid, ...]
                termPositions = [positions.split(',') for positions in info[1:][1::2]]  # [[pos1,pos2,pos3], ...]
                if term in self.postingsMaps.keys():
                    # merge the postings list for this term (each docId appears in only one block line,
                    # because a block is written to disk only after a document has been fully read)
                    self.postingsMaps[term].update(
                        {docIds[docInd]: termPositions[docInd] for docInd in range(len(docIds))})
                else:
                    self.postingsMaps.update(
                        {term: {docIds[docInd]: termPositions[docInd] for docInd in range(len(docIds))}})

            if self.postingsMaps != {}:
                # ------------------------- CALCULATE WEIGHTS AND WRITE ON FINAL INDEX ------------------------------
                # todo: verify that these functions (store calculations) work with this new postingsMaps structure
                # get the first element of the alphabetically sorted list of terms in memory
                minorTerm = sorted(self.postingsMaps.keys())[0]
                # write its information to the final dictionary
                final_dict.writelines([
                    minorTerm + ':' +                                                             # term:
                    str(getIDFt(minorTerm, self.postingsMaps, self.N)) + '|' +                    # idf|
                    '|'.join([
                        str(doc_id) + ':' +                                                       # doc_id:
                        str(getLogWeightPositions(minorTerm, doc_id, self.postingsMaps)) + ':' +  # term_weight:
                        ','.join([str(pos) for pos in positions])                                 # pos1,pos2,...
                        for doc_id, positions in self.postingsMaps[minorTerm].items()
                    ]) + '\n'
                ])
                ntermsToDisk += 1
                # remove it from memory
                del self.postingsMaps[minorTerm]

        # ---------------------------------------- ENDED MERGING INDEX BLOCKS ---------------------------------------
        del info
        del term
        del docIds
        del termPositions
        del minorTerm
        final_dict.close()
        stop = timeit.default_timer()
        print('merge and write of final dictionary: {} minutes and {} seconds'.format(
            (stop - start) // 60, (stop - start) % 60))

    # ----------------------------------------- INDEX WITHOUT TERM POSITIONS ----------------------------------------
    else:
        # ---------------------------------------------- INDEX BLOCKS -----------------------------------------------
        while True:
            doc = corpusReader.readDoc()
            if doc == -1:  # last document
                if self.postingsMaps != {}:
                    self.writeIndexToBlockFile('./dicts/dict' + str(nDicts))
                    print('available memory: {} %'.format(
                        psutil.virtual_memory().available * 100 / psutil.virtual_memory().total))
                    nDicts += 1
                    self.postingsMaps = {}  # clean dictionary
                break
            elif doc is None:
                continue

            (doi, title, abstract) = doc
            del doc
            self.N += 1

            # ------------------------------------------- Get Document Terms ----------------------------------------
            tokenizer.changeText(title + " " + abstract)
            del title
            del abstract
            terms = tokenizer.getTerms(withPositions=False)
            tokenizer.changeText("")  # clean term memory from tokenizer

            # populate the dictionary postingsMaps with the term frequency {term: {docId: term_freq}}
            for term in terms:
                # if available memory is running low, flush the current block to disk first
                if (psutil.virtual_memory().available * 100 /
                        psutil.virtual_memory().total) <= 10 and self.postingsMaps != {}:
                    self.writeIndexToBlockFile('./dicts/dict' + str(nDicts))
                    nDicts += 1
                    self.postingsMaps = {}  # clean dictionary
                if term in self.postingsMaps.keys():
                    if doi in self.postingsMaps[term].keys():
                        self.postingsMaps[term][doi] += 1
                    else:
                        self.postingsMaps[term][doi] = 1
                else:
                    self.postingsMaps[term] = {doi: 1}  # key: docId, value: term_freq

        # ---------------------------------------- ENDED INDEXING BLOCKS --------------------------------------------
        start = timeit.default_timer()
        if os.path.isfile("index"):
            os.remove("index")
        final_dict = open("index", "w")
        dict_names = ['./dicts/dict' + str(nDict) for nDict in range(nDicts)]

        # -------------------------------------------- MERGE INDEX BLOCKS -------------------------------------------
        temp_dicts = [open(dict_name, "r") for dict_name in dict_names]
        while temp_dicts != []:
            # iterate over a copy, since exhausted block files are removed from the list
            for dict_file in list(temp_dicts):
                # ---------------------- Read first line of each file -----------------------------------------------
                line = dict_file.readline()
                if not line:
                    dict_file.close()
                    # delete dictionary block from disk
                    os.remove(dict_names[temp_dicts.index(dict_file)])
                    dict_names.remove(dict_names[temp_dicts.index(dict_file)])
                    temp_dicts.remove(dict_file)
                    continue

                # ------------------------ Save line info to memory -------------------------------------------------
                # line format: 'term', 'docid', 'term_freq', 'docid', 'term_freq', ...
                info = line.split('|')
                info.remove('\n')
                while '' in info:
                    info.remove('')
                term = info[0]
                docIds = info[1:][0::2]     # [docid, docid, ...]
                termFreqs = info[1:][1::2]  # [term_freq, term_freq, ...]

                if term in self.postingsMaps.keys():
                    self.postingsMaps[term].update(
                        {docIds[docInd]: termFreqs[docInd] for docInd in range(len(docIds))})
                else:
                    self.postingsMaps.update(
                        {term: {docIds[docInd]: termFreqs[docInd] for docInd in range(len(docIds))}})

            if self.postingsMaps != {}:
                # ------------------------- CALCULATE WEIGHTS AND WRITE ON FINAL INDEX ------------------------------
                # lnc (logarithmic term frequency, no document frequency, cosine normalization)
                # todo: verify that these functions (store calculations) work with this new postingsMaps structure
                # get the first element of the alphabetically sorted list of terms in memory
                minorTerm = sorted(self.postingsMaps.keys())[0]
                # write its information to the final dictionary
                final_dict.writelines([
                    minorTerm + ':' +                                              # term:
                    str(getIDFt(minorTerm, self.postingsMaps, self.N)) + '|' +     # idf|
                    '|'.join([
                        str(doc_id) + ':' +                                        # doc_id:
                        str(getLogWeight(minorTerm, doc_id, self.postingsMaps))    # term_weight
                        for doc_id in self.postingsMaps[minorTerm].keys()
                    ]) + '\n'
                ])
                # remove it from memory
                del self.postingsMaps[minorTerm]

        # ---------------------------------------- ENDED MERGING INDEX BLOCKS ---------------------------------------
        del info
        del term
        del docIds
        del termFreqs
        del minorTerm
        final_dict.close()
        stop = timeit.default_timer()
        print('merge and write of final dictionary: {} minutes and {} seconds'.format(
            (stop - start) // 60, (stop - start) % 60))
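# A minimal sketch (illustrative only, the function name is hypothetical) of how one line of
# the final "index" file written above could be read back for the positional variant, whose
# format is: term:idf|doc_id:term_weight:pos1,pos2,...|doc_id:term_weight:pos1,pos2,...
# The query-side loader in the repository may parse it differently.
def parse_positional_index_line(line):
    term_part, *postings = line.rstrip('\n').split('|')
    term, idf = term_part.rsplit(':', 1)
    postings_map = {}
    for posting in postings:
        doc_id, weight, positions = posting.split(':')
        postings_map[doc_id] = (float(weight), [int(pos) for pos in positions.split(',')])
    return term, float(idf), postings_map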