def query(pl_list, tokens, isRankedBased):
    """Execute a query over the given posting lists and return the results
    formatted for the results page.

    In boolean mode, empty posting lists mean no document can match.
    In ranked mode the posting lists here come only from NOT and phrase
    inputs; term scores are assumed to have been accumulated elsewhere.
    """
    # Boolean retrieval with nothing to intersect cannot match any document.
    if not pl_list and not isRankedBased:
        return []

    results = intersect(pl_list)

    if isRankedBased:
        # Documents containing at least one query term receive a 0.5 bonus.
        for doc_id in results:
            if getNewsList()[doc_id].hasQueryTerm:
                getNewsList()[doc_id].score += 0.5
        finalizeResults()
        # Replace the raw intersection with the top-K ranked doc IDs.
        results = getResults()

    # Render every surviving document for the results page.
    return [
        getNewsList()[doc_id].structuredFormatResultsPage(tokens)
        for doc_id in results
    ]
def NOT(term_pl):
    """Return the complement of a term's posting list over the whole corpus.

    Produces every document ID in ``range(len(getNewsList()))`` that does
    not appear in ``term_pl``, in ascending order.

    The previous implementation merged the two sorted sequences by hand
    with manual pointer bookkeeping, which silently required ``term_pl``
    to be sorted and free of duplicates. A set-based membership test gives
    the same result for valid input, is O(N) instead of fragile pointer
    logic, and also behaves correctly for unsorted or duplicated input.
    """
    excluded = set(term_pl)
    return [d for d in range(len(getNewsList())) if d not in excluded]
def constructInvertedIndex():
    """Build the inverted index over the whole news corpus, then compute
    per-document normalization factors for three scoring schemes and
    persist the index, the Heaps-law samples, and the factors."""
    global dictionary
    # Root node uses a sentinel Persian key; presumably chosen to sort after
    # all real terms — TODO confirm against BTree's ordering.
    dictionary = BTree(Node("سسسسسس", 1, []))
    nodesList = []  # per-document dict of dictionary nodes touched by that doc
    docCounter = 0
    for news in getNewsList():
        nodes = {}  # used as an ordered set of nodes for this document
        position = 0
        for term in tokenize(normalize(news.content), check_finglish):
            if term != invalidToken:
                # Record the occurrence and remember the node for tf/idf work.
                nodes[dictionary.addOccurrence(term, news.id, position)] = True
                # NOTE(review): position advances only for indexed (valid)
                # tokens — reconstructed from collapsed source, confirm.
                position += 1
        nodesList.append(nodes)
        # Finalize term frequency for this document on every touched node.
        for node in nodes:
            node.cal_tf(news.id)
        docCounter += 1
        # Sample dictionary size every 20 documents for the Heaps-law study.
        if docCounter % 20 == 0:
            Laws.heap(getDictionary())
    # With all postings in place, compute idf for every dictionary node.
    calAllIdf(dictionary.root)
    i = 0
    for news in getNewsList():
        # calculate the documents' normalize factors for 3 scoring schemes
        nodes = nodesList[i]
        sum_of_squares_1 = 0
        sum_of_squares_2 = 0
        sum_of_squares_3 = 0
        for node in nodes.keys():
            # Scheme 1: (tf - 1) * idf — presumably getTf returns 1 + log(tf),
            # so this strips the additive 1; verify against getTf.
            sum_of_squares_1 += math.pow((getTf(news.id, node.postingsList) - 1) * node.idf, 2)
            # Scheme 2: raw tf weight only.
            sum_of_squares_2 += math.pow(getTf(news.id, node.postingsList), 2)
            # Scheme 3: classic tf * idf weight.
            sum_of_squares_3 += math.pow(getTf(news.id, node.postingsList) * node.idf, 2)
        # Euclidean (L2) document lengths, one list per scheme,
        # appended in corpus order so they are indexable by doc position.
        normalizationFactorsScheme1.append(math.sqrt(sum_of_squares_1))
        normalizationFactorsScheme2.append(math.sqrt(sum_of_squares_2))
        normalizationFactorsScheme3.append(math.sqrt(sum_of_squares_3))
        i += 1
    # Persist everything built above.
    Laws.storeHeapDataSet()
    storeDictionary(dictionary)
    storeNormFactors()
def cal_idf(node):
    """Compute and store the node's inverse document frequency:
    idf = log10(N / df), where N is the corpus size and df the node's
    document frequency."""
    total_docs = len(getNewsList())
    node.idf = math.log10(total_docs / node.frequency)
def storeHeapDataSet():
    """Persist the collected Heaps-law samples to a pickle file whose
    name encodes the current corpus size."""
    file_name = 'Laws/heap' + str(len(getNewsList())) + '.pickle'
    with open(file_name, 'wb') as handle:
        pickle.dump(heapDataSet, handle, protocol=pickle.HIGHEST_PROTOCOL)