def set_tokens(self):
    file_io = FileInOut()
    dictionary = file_io.readDic()
    M = len(dictionary)          # vocabulary size
    print("M: " + str(M))
    T = 755440                   # total number of tokens counted during indexing
    return T, M
class SimilarNews:
    def __init__(self):
        self.inOut = FileInOut()
        self.clusters = self.inOut.readClusters()
        self.g = Group()
        self.similarity = Similiarity()
        self.v, self.d = self.inOut.readDocsVector()

    def minusDocs(self, doc1, doc2):
        if not doc1:
            return []
        return list(set(doc1) - set(doc2))

    def findSimilarNews(self, query, no_news):
        queryProcess = QueryProcByCluster()
        docs, positions, nearestCentroids = queryProcess.processQueryByCluster(query, no_news)
        if not docs:
            return {0: []}, {0: []}, []
        docIds, positionsIds = self.g.out_group_of_file(docs, positions)
        relatedDocs = []
        for doc in docIds:
            t2 = Time(doc)
            # use the cluster the result document belongs to, otherwise fall back
            # to the second-nearest centroid
            if doc in self.clusters[nearestCentroids[0]]:
                cluster = self.clusters[nearestCentroids[0]]
            else:
                cluster = self.clusters[nearestCentroids[1]]
            similarities = []
            candidates = []
            for d in self.minusDocs(cluster, docs[0]):
                t1 = Time(d + 1)
                # only consider documents published within two days of the result
                if (t1.year == t2.year and t1.month == t2.month
                        and abs(t1.day - t2.day) < 3 and d not in relatedDocs):
                    index = self.d.index(d)
                    index_doc = self.d.index(doc)
                    similarities.append(
                        self.similarity.compute_similarity(self.v[index], self.v[index_doc]))
                    candidates.append(d)
            if similarities:
                maximum = np.max(similarities)
                # keep the document id of the most similar candidate
                relatedDocs.append(candidates[similarities.index(maximum)])
            if len(relatedDocs) == 5:
                break
        return docs, positions, relatedDocs
def set_cf_dictionary(self):
    file_io = FileInOut()
    postings = file_io.readPostingList()
    cfis = {}
    for i in range(len(postings) - 1):
        cfis[i] = 0
        # collection frequency of term i = total number of positions over all documents
        for j in range(len(postings[i]) - 1):
            cfis[i] += len(postings[i][j])
    # sort terms by collection frequency, highest first
    self.cfDic = sorted(cfis.items(), key=lambda item: item[1], reverse=True)
    cfis.clear()
class Kmean:
    def __init__(self):
        self.inOut = FileInOut()
        self.df = dict()
        v, d = self.inOut.readDocsVector()
        # build a dense documents x terms frame: one column per term id (1..38728)
        for i in range(1, 38729):
            for j in v:
                if i in j.keys():
                    self.df.setdefault(str(i), []).append(j[i])
                else:
                    self.df.setdefault(str(i), []).append(0)
        self.df = pd.DataFrame(self.df)
        self.df.index = d
        print('phase 1 completed')

    def similarity(self, centroids, k):
        # Euclidean distance from every document to each of the k centroids
        for i in range(k):
            d = (self.df.sub(centroids.iloc[i, :]) ** 2).sum(axis=1)
            self.df['distance_from_{}'.format(i)] = np.sqrt(d)
        centroid_distance_cols = ['distance_from_{}'.format(i) for i in range(k)]
        self.df['closest'] = self.df.loc[:, centroid_distance_cols].idxmin(axis=1)
        self.df['closest'] = self.df['closest'].map(lambda x: int(x.lstrip('distance_from_')))
        return self.df

    def updateCentroids(self, centroids, k):
        # each centroid becomes the mean of the documents currently assigned to it
        for i in range(k):
            centroids.iloc[i] = self.df.loc[self.df['closest'] == i,
                                            [str(l) for l in range(1, 38729)]].mean()
        return centroids

    def cluster(self, k):
        # initialise the centroids with k randomly sampled documents
        centroids = self.df.sample(n=k)
        centroids.index = range(k)
        self.similarity(centroids, k)
        print('sim1')
        a = 0
        while True:
            a += 1
            closest_centroids = self.df['closest'].copy(deep=True)
            centroids = self.updateCentroids(centroids, k)
            print('update')
            self.similarity(centroids, k)
            # stop when the assignment no longer changes or after 10 iterations
            if closest_centroids.equals(self.df['closest']) or a == 10:
                break
        dist = self.RSSmeasure(centroids, self.df)
        finalcenters = {str(i): {j + 1: list(centroids.loc[i, :])[j] for j in range(0, 38728)}
                        for i in range(k)}
        self.inOut.writeCentroids(finalcenters, k)
        finalCluster = {str(i): list(self.df.index[self.df['closest'] == i]) for i in range(k)}
        self.inOut.writeClusters(finalCluster, k)
        print(dist)
        return dist

    def RSSmeasure(self, centroids, df):
        # residual sum of squares: squared distance of every document to its assigned centroid
        dist = 0
        for i in range(len(self.df.index)):
            dist += self.df.iloc[i]['distance_from_' + str(int(self.df.iloc[i]['closest']))] ** 2
        return dist
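# Usage sketch (not part of the original code): one way Kmean.cluster could be driven to
# compare cluster counts by the RSS value it returns. It assumes the docs-vector files read
# by FileInOut already exist; the candidate k values and the helper name are illustrative only.
def _demo_choose_k():
    kmean = Kmean()
    rss_per_k = {}
    for k in (4, 6, 8):                      # hypothetical candidate cluster counts
        rss_per_k[k] = kmean.cluster(k)      # cluster() writes centroids/clusters and returns RSS
    best_k = min(rss_per_k, key=rss_per_k.get)
    print("lowest RSS with k =", best_k)
    return best_k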
class Classifier:
    def __init__(self, algorithm):
        self.train_data = Train_data()
        self.input = FileInOut()
        self.k = 5
        self.docVectorList, self.vectorsIds = self.input.readDocsVector()
        self.trainVectorList, self.trainvectorsIds = self.input.readTrainDocsVector()
        self.num_ov_results = 100
        self.gp = Group([7745])
        self.classes = self.KNN()
        # self.classes = self.NB()
        # self.classes = self.input.readClasses(algorithm)

    def KNN(self):
        classes = {
            "science": [], "cultureart": [], "politics": [], "economy": [],
            "social": [], "international": [], "sport": [], "multimedia": []
        }
        for key in classes.keys():
            classes[key].append([])
        print("number of documents to classify: " + str(len(self.docVectorList)))
        for j in range(len(self.docVectorList)):
            print("j: " + str(j))
            kbest = []
            for t in range(len(self.trainVectorList)):
                similarity = self.compute_similarity(self.docVectorList[j],
                                                     self.trainVectorList[t])
                if len(kbest) < self.k:
                    kbest.append([similarity,
                                  self.train_data.get_cat(self.trainvectorsIds[t])])
                else:
                    # replace the weakest of the current k neighbours if this one is closer
                    minimum = min(kbest, key=lambda x: x[0])
                    if similarity > minimum[0]:
                        kbest[kbest.index(minimum)] = [
                            similarity,
                            self.train_data.get_cat(self.trainvectorsIds[t])
                        ]
            # majority vote over the categories of the k nearest neighbours
            cat = [Counter(col).most_common(1)[0][0] for col in zip(*kbest)][1]
            fnum, id = self.gp.pack_id(self.vectorsIds[j])
            classes[cat][fnum].append(id)
        self.input.writeClasses(classes, "KNN")
        return classes

    def NB(self):
        classes = {
            "science": [], "cultureart": [], "politics": [], "economy": [],
            "social": [], "international": [], "sport": [], "multimedia": []
        }
        print("number of documents to classify: " + str(len(self.docVectorList)))
        class_tf, nci, tf_tid = self.get_classes_tf()
        for key in classes.keys():
            classes[key].append([])
        for j in range(len(self.docVectorList)):
            cat = self.determine_category(self.docVectorList[j], class_tf, nci, tf_tid)
            fnum, id = self.gp.pack_id(self.vectorsIds[j])
            classes[cat][fnum].append(id)
        self.input.writeClasses(classes, "NB")
        return classes

    def get_classes_tf(self):
        class_tf = {"science": 0, "cultureart": 0, "politics": 0, "economy": 0,
                    "social": 0, "international": 0, "sport": 0, "multimedia": 0}
        nci = {"science": 0, "cultureart": 0, "politics": 0, "economy": 0,
               "social": 0, "international": 0, "sport": 0, "multimedia": 0}
        tf_tid = {"science": {}, "cultureart": {}, "politics": {}, "economy": {},
                  "social": {}, "international": {}, "sport": {}, "multimedia": {}}
        for t in range(len(self.trainVectorList)):
            td_cat = self.train_data.get_cat(self.trainvectorsIds[t])
            # accumulate term frequencies per category
            for tid in self.trainVectorList[t].keys():
                if tf_tid[td_cat].get(tid, None) is None:
                    tf_tid[td_cat][tid] = self.trainVectorList[t][tid]
                else:
                    tf_tid[td_cat][tid] += self.trainVectorList[t][tid]
            nci[td_cat] += 1
            # add-one (Laplace) smoothing: alpha = 1 per distinct term
            class_tf[td_cat] += sum(self.trainVectorList[t].values()) + 1 * len(self.trainVectorList[t])
        print(tf_tid)
        return class_tf, nci, tf_tid

    def determine_category(self, docVector, class_tf, nci, tf_tid):
        c_score = {"science": 0, "cultureart": 0, "politics": 0, "economy": 0,
                   "social": 0, "international": 0, "sport": 0, "multimedia": 0}
        for cat in c_score.keys():
            # log prior (1000 training documents in total)
            c_score[cat] += math.log10(nci[cat] / 1000)
            for tid in docVector.keys():
                if tf_tid[cat].get(tid, None) is None:
                    c_score[cat] += math.log10(1 / class_tf[cat])
                else:
                    c_score[cat] += math.log10((tf_tid[cat][tid] + 1) / class_tf[cat])
        print("score for each class:")
        print(c_score)
        determined_cat = max(c_score.items(), key=operator.itemgetter(1))[0]
        return determined_cat

    def process_cat(self, query):
        cat_inq = find(query, "cat")
        category = cat_inq[0].split(":")[1]
        query = query.replace(cat_inq[0], '')
        q1 = QueryProc()
        notEliminated = query.replace("!", "")
        docList, indexList = q1.processQueryBySimilarity(notEliminated)
        doc_dic, index_dic = self.gp.grouping_by_file(docList, indexList)
        # keep only the documents that were classified into the requested category
        for key in doc_dic.keys():
            for docId in doc_dic[key]:
                if docId not in self.classes[category][key]:
                    to_remove = doc_dic[key].index(docId)
                    doc_dic[key].pop(to_remove)
                    index_dic[key].pop(to_remove)
        max_heap = self.make_heap(doc_dic, index_dic, query)
        return self.getKbest(max_heap, self.num_ov_results)

    def make_heap(self, doc_dic, index_dic, query):
        maxHeap = MaxHeap()
        queryVector = self.compute_query_wieght(Similiarity.get_query_termList(query))
        for key in doc_dic.keys():
            for i in range(len(doc_dic[key])):
                tot_did = self.gp.unpacking_index(doc_dic[key][i], key)
                k = self.vectorsIds.index(tot_did)
                similarity = self.compute_similarity(queryVector, self.docVectorList[k])
                if not Similiarity.is_similsrity_zero(similarity):
                    maxHeap.insert(DocNode(tot_did, index_dic[key][i], similarity))
        return maxHeap

    def getKbest(self, maxHeap, k):
        docList = []
        indexList = []
        for i in range(k):
            docNode = maxHeap.extractMax()
            if docNode is None:
                break
            docList.append(docNode.docId)
            indexList.append(docNode.indexList)
        doc_ids, indexes = self.gp.grouping_by_file(docList, indexList)
        return doc_ids, indexes

    def compute_similarity(self, query_vector, doc_vector):
        # cosine similarity between the query vector and a document vector
        sum = 0
        for term_id in query_vector.keys():
            sum += query_vector.get(term_id) * doc_vector.get(term_id, 0)
        similarity = sum / (self.get_size(query_vector) * self.get_size(doc_vector))
        return similarity

    def get_size(self, vector):
        # Euclidean length of a sparse vector
        tfs = vector.values()
        sum = 0
        for tf in tfs:
            sum += pow(tf, 2)
        return math.sqrt(sum)

    def compute_query_wieght(self, termsList):
        dictionary = self.input.readDic()
        docIDs = self.input.readDocID()
        vector = {}
        negative = []
        # collect the terms that follow a "!" as negated terms
        indices = [i for i, x in enumerate(termsList) if x == '!']
        indices.sort(reverse=True)
        for i in indices:
            negative.append(termsList.pop(i + 1))
            termsList.pop(i)
        for x in termsList:
            term_id = dictionary.index(x) + 1 if x in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = termsList.count(x)
                # tf-idf: (1 + log tf) * log(N / df)
                vector[term_id] = (1 + math.log10(tf)) * math.log10(
                    self.input.N / len(docIDs[term_id]))
        for y in negative:
            term_id = dictionary.index(y) if y in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = negative.count(y)
                value = (1 + math.log10(tf)) * math.log10(
                    self.input.N / len(docIDs[term_id]))
                vector[term_id] = -1 * value
        return vector
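# Illustration (added): the majority vote used in Classifier.KNN. Each entry of kbest is
# [similarity, category]; zipping the columns and taking the most common value of the second
# column yields the winning category. The toy neighbour values below are made up.
from collections import Counter

def _demo_knn_vote():
    kbest = [[0.82, "sport"], [0.75, "politics"], [0.71, "sport"],
             [0.64, "economy"], [0.60, "sport"]]
    # column 0 = similarities, column 1 = neighbour categories
    cat = [Counter(col).most_common(1)[0][0] for col in zip(*kbest)][1]
    assert cat == "sport"
    return cat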
class Index:
    def __init__(self):
        self.input = FileInOut()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.dictionary = dict()
        self.posting_list = np.array([dict() for j in range(150000)])
        self.dicIndex = 0
        self.docIndex = 0
        self.c = 0

    def Filter(self, string, substr):
        # replace stop words and punctuation tokens with the placeholder '**'
        return [tok if not any(sub == tok for sub in substr) else '**' for tok in string]

    def makeDic(self, value, j):
        if value not in self.dictionary.keys() and value != '**':
            if '\n' in value:
                pass
            else:
                # new term: add it to the dictionary and start its positional posting list
                self.dictionary[value] = 1
                self.input.writeDic([value])
                self.posting_list[self.dicIndex][self.docIndex] = [j]
                self.dicIndex += 1
        elif value in self.dictionary.keys() and value != '**':
            # known term: record position j for the current document
            term_index = list(self.dictionary.keys()).index(value)
            if self.docIndex in self.posting_list[term_index].keys():
                self.posting_list[term_index][self.docIndex].append(j)
            else:
                self.posting_list[term_index][self.docIndex] = [j]

    def indexData(self):
        for n in range(15):
            data = self.input.readData('ir-news-' + str(n) + '.csv')
            for d in data["content"]:
                print(self.docIndex)
                self.docIndex += 1
                d = self.cleanContent(d)
                d = self.wordFormer.normalize(d)
                tokens = self.wordFormer.tokenize(d)
                self.c += len(tokens)
                tokens = list(filter(lambda a: a != '\n', tokens))
                tokens = self.wordFormer.uniform(tokens)
                # postaged_tokens = self.wordFormer.posTagging(tokens)
                stemmed_tokens = self.wordFormer.stemmWords(tokens)
                lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
                lemmatized_tokens = self.Filter(
                    lemmatized_tokens,
                    self.constants.punctuations() + ['\"', '\"', '!', '', '\n']
                    + self.constants.StopWords())
                list(map(self.makeDic, lemmatized_tokens,
                         [i for i in range(0, len(lemmatized_tokens))]))
            print('doc' + str(n) + ': ' + str(self.docIndex))
        for i in range(0, len(self.posting_list)):
            self.input.writeDocID(self.posting_list[i])
            self.input.writePostingList([
                self.stringmaker(self.posting_list[i][key])
                for key in self.posting_list[i].keys()
            ])
        print('number of tokens')
        print(self.c)
        print(time.time())

    def getRelatedDocs(self, token):
        if token in self.dictionary:
            return self.posting_list[np.where(self.dictionary == token)][0]
        else:
            return {}

    def cleanContent(self, raw):
        # strip HTML tags and character entities from the raw news content
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        cleanText = re.sub(cleaner, ' ', raw)
        return cleanText

    def stringmaker(self, list):
        stri = ''
        for i in list:
            stri = stri + str(i) + ' '
        return stri
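# Illustration (added): what Index.cleanContent does before tokenizing. The sample string and
# helper name are made up; tags and character entities are simply replaced by spaces.
import re

def _demo_clean_content():
    raw = '<p>Breaking &amp; important news</p>'
    cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(cleaner, ' ', raw)   # tags and '&amp;' become spaces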
class QueryProcByCluster:
    def __init__(self):
        self.g = Group()
        self.input = FileInOut()
        self.classes = self.input.readClasses("KNN")
        # self.classes = self.input.readClasses("NB")
        self.docVectorList, self.vectorsIds = self.input.readDocsVector()
        self.wordFormer = FormWords()

    def find(self, s, pat):
        pat = r'(\w*%s\w*)' % pat  # Not thrilled about this line
        return re.findall(pat, s)

    def process_cat(self, query, num_ov_results):
        cat_inq = self.find(query, "cat")
        category = cat_inq[0].split(":")[1]
        query = query.replace(cat_inq[0], '')
        q1 = QueryProc()
        notEliminated = query.replace("!", "")
        docList, indexList = q1.processQueryBySimilarity(notEliminated)
        doc_dic, index_dic = self.g.grouping_by_file(docList, indexList)
        # drop results that were not classified into the requested category
        for key in doc_dic.keys():
            for docId in doc_dic[key]:
                if docId not in self.classes[category][key]:
                    to_remove = doc_dic[key].index(docId)
                    doc_dic[key].pop(to_remove)
                    index_dic[key].pop(to_remove)
        max_heap = self.make_heap(doc_dic, index_dic, query)
        return self.getKbest(max_heap, num_ov_results)

    def make_heap(self, doc_dic, index_dic, query):
        maxHeap = MaxHeap()
        simi = Similiarity()
        queryVector = self.compute_query_wieght(simi.get_query_termList(query))
        for key in doc_dic.keys():
            for i in range(len(doc_dic[key])):
                tot_did = self.g.unpacking_index(doc_dic[key][i], key)
                k = self.vectorsIds.index(tot_did)
                similarity = self.compute_similarity(queryVector, self.docVectorList[k])
                if not simi.is_similsrity_zero(similarity):
                    maxHeap.insert(DocNode(tot_did, index_dic[key][i], similarity))
        return maxHeap

    def getKbest(self, maxHeap, k):
        docList = []
        indexList = []
        for i in range(k):
            docNode = maxHeap.extractMax()
            if docNode is None:
                break
            docList.append(docNode.docId)
            indexList.append(docNode.indexList)
        doc_ids, indexes = self.g.grouping_by_file(docList, indexList)
        return doc_ids, indexes

    def compute_similarity(self, query_vector, doc_vector):
        # cosine similarity between the query vector and a document vector
        sum = 0
        for term_id in query_vector.keys():
            sum += query_vector.get(term_id) * doc_vector.get(term_id, 0)
        similarity = sum / (self.get_size(query_vector) * self.get_size(doc_vector))
        return similarity

    def get_size(self, vector):
        tfs = vector.values()
        sum = 0
        for tf in tfs:
            sum += pow(tf, 2)
        return math.sqrt(sum)

    def compute_query_wieght(self, termsList):
        dictionary = self.input.readDic()
        docIDs = self.input.readDocID()
        vector = {}
        negative = []
        indices = [i for i, x in enumerate(termsList) if x == '!']
        indices.sort(reverse=True)
        for i in indices:
            negative.append(termsList.pop(i + 1))
            termsList.pop(i)
        for x in termsList:
            term_id = dictionary.index(x) + 1 if x in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = termsList.count(x)
                # tf-idf: (1 + log tf) * log(N / df)
                vector[term_id] = (1 + math.log10(tf)) * math.log10(
                    self.input.N / len(docIDs[term_id]))
        for y in negative:
            term_id = dictionary.index(y) if y in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = negative.count(y)
                value = (1 + math.log10(tf)) * math.log10(
                    self.input.N / len(docIDs[term_id]))
                vector[term_id] = -1 * value
        return vector

    def nearestCentroids(self, query):
        centers, labels = self.input.readCenters()
        distances = []
        for center in centers:
            distances.append(self.compute_similarity(query, center))
        maximum = np.max(distances)
        nearestNeabeor = distances.index(maximum)
        # work on a copy so the original indices stay valid when looking for the runner-up
        distances2 = distances[:]
        distances2.remove(maximum)
        maximum2 = np.max(distances2)
        if maximum == maximum2:
            nearestNeabeor2 = distances.index(maximum2, nearestNeabeor + 1)
        else:
            nearestNeabeor2 = distances.index(maximum2)
        return [str(nearestNeabeor), str(nearestNeabeor2)]

    def mergeDocs(self, doc1, doc2, index1):
        if not doc1:
            return [], []
        # keep only the cluster members that also satisfy the query result (doc1)
        withOutQoute = list(set(doc2) - set(doc1))
        doc2 = list(set(doc2))
        for d in withOutQoute:
            doc2.remove(d)
        positions = []
        for d in doc2:
            positions.append(index1[doc1.index(d)])
        return doc2, positions

    def processQueryByCluster(self, query, num_ov_results):
        cat = False
        if "cat" in query:
            cat_inq = self.find(query, "cat:")
            category = cat_inq[0].split(":")[1]
            query = query.replace(cat_inq[0], '')
            cat = True
        q1 = QueryProc()
        docList, indexList = q1.processQueryBySimilarity(query)
        term_list = Similiarity.get_query_termList(query)
        queryVec = self.compute_query_wieght(term_list)
        if queryVec == {}:
            return {0: []}, {0: []}, []
        nearestCentroids = self.nearestCentroids(queryVec)
        print('nearest centroids:')
        print(nearestCentroids)
        # candidate documents = members of the two clusters closest to the query
        docs = []
        for neabors in nearestCentroids:
            docs = docs + list(self.input.readClusters()[neabors])
        print('doc list:')
        print(docList)
        mergeDocs, positions = self.mergeDocs(docList, docs, indexList)
        if not mergeDocs:
            return {0: []}, {0: []}, nearestCentroids
        doc_dic, index_dic = self.g.grouping_by_file(mergeDocs, positions)
        print('category filter: ' + str(cat))
        if cat:
            for key in doc_dic.keys():
                print("before removal: " + str(len(doc_dic[key])))
                print(doc_dic[key])
                print("category: " + category.lower())
                to_remove = []
                for t in range(len(doc_dic[key])):
                    # doc_dic[key][t] is a document id within file `key`
                    if doc_dic[key][t] not in self.classes[category.lower()][key]:
                        to_remove.append(t)
                to_remove.sort(reverse=True)
                for f in range(len(to_remove)):
                    index_dic[key].pop(to_remove[f])
                    doc_dic[key].pop(to_remove[f])
                print("after removal: " + str(len(doc_dic[key])))
            print(doc_dic)
        max_heap = self.make_heap(doc_dic, index_dic, query)
        kbest1, kbest2 = self.getKbest(max_heap, num_ov_results)
        return kbest1, kbest2, nearestCentroids
class Similiarity:
    def __init__(self):
        self.input = FileInOut()
        self.N = self.input.N
        # process_query/make_heap expect the document vectors to be loaded:
        # self.docVectorList, self.vectorsIds = self.input.readpDocsVector()

    def get_size(self, vector):
        # Euclidean length of a sparse vector
        tfs = vector.values()
        sum = 0
        for tf in tfs:
            sum += pow(tf, 2)
        return math.sqrt(sum)

    def compute_similarity(self, query_vector, doc_vector):
        # cosine similarity: dot product divided by the product of the vector lengths
        sum = 0
        for term_id in query_vector.keys():
            sum += query_vector.get(term_id) * doc_vector.get(term_id, 0)
        similarity = sum / (self.get_size(query_vector) * self.get_size(doc_vector))
        return similarity

    def get_index(self, doc_vectors, value):
        for x in doc_vectors:
            if x.docId == value:
                return doc_vectors.index(x)
        return -1

    @staticmethod
    def get_query_termList(query):
        wordFormer = FormWords()
        constants = ConstantVars()
        query = wordFormer.normalize(query)
        query_tokens = wordFormer.tokenize(query)
        for token in query_tokens:
            if token in constants.punctuations() or token in constants.StopWords():
                query_tokens.remove(token)
        query_tokens = wordFormer.uniform(query_tokens)
        # postaged_tokens = wordFormer.posTagging(query_tokens)
        stemmed_tokens = wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens)
        lemmatized_tokens = list(filter(lambda a: a != '"', lemmatized_tokens))
        return lemmatized_tokens

    def compute_query_wieght(self, termsList):
        dictionary = self.input.readDic()
        docIDs = self.input.readDocID()
        vector = {}
        negative = []
        indices = [i for i, x in enumerate(termsList) if x == '!']
        indices.sort(reverse=True)
        for i in indices:
            negative.append(termsList.pop(i + 1))
            termsList.pop(i)
        for x in termsList:
            term_id = dictionary.index(x) + 1 if x in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = termsList.count(x)
                # tf-idf: (1 + log tf) * log(N / df)
                vector[term_id] = (1 + math.log10(tf)) * math.log10(
                    self.N / len(docIDs[term_id]))
        for y in negative:
            term_id = dictionary.index(y) if y in dictionary else -1
            if term_id != -1 and vector.get(term_id) is None:
                tf = negative.count(y)
                value = (1 + math.log10(tf)) * math.log10(
                    self.N / len(docIDs[term_id]))
                vector[term_id] = -1 * value
        return vector

    def compute_docs_wieghts(self):
        docIDs = self.input.readDocID()
        postings = self.input.readPostingList()
        doc_vectors = []
        for i in range(115148):
            print(i)
            for j in range(len(docIDs[i]) - 1):
                index = self.get_index(doc_vectors, docIDs[i][j])
                if index == -1:
                    doc_vectors.append(DocumentVector(docIDs[i][j]))
                    index = self.get_index(doc_vectors, docIDs[i][j])
                doc_vectors[index].fill_vector(i + 1, len(postings[i][j]),
                                               len(docIDs[i]), self.N, docIDs[i])
        doc_vectors.sort(key=lambda x: int(x.docId))
        self.input.writepDocsVector(doc_vectors)
        return doc_vectors

    def process_query(self, query, k):
        q1 = QueryProc()
        notEliminated = query.replace("!", "")
        docList, indexList = q1.processQueryBySimilarity(notEliminated)
        max_heap = self.make_heap(docList, indexList, query)
        return self.getKbest(max_heap, k)

    def make_heap(self, docList, indexList, query):
        maxHeap = MaxHeap()
        queryVector = self.compute_query_wieght(self.get_query_termList(query))
        for i in range(len(docList)):
            if docList[i] == 7744:
                continue
            k = self.vectorsIds.index(docList[i])
            similarity = self.compute_similarity(queryVector, self.docVectorList[k])
            if not self.is_similsrity_zero(similarity):
                maxHeap.insert(DocNode(docList[i], indexList[i], similarity))
        return maxHeap

    @staticmethod
    def is_similsrity_zero(similarity):
        return similarity == 0.0

    def getKbest(self, maxHeap, k):
        simsum = 0
        docList = []
        indexList = []
        for i in range(k):
            docNode = maxHeap.extractMax()
            if docNode is None:
                break
            simsum += docNode.similarity
            docList.append(docNode.docId)
            indexList.append(docNode.indexList)
        return docList, indexList
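# Illustration (added): the cosine similarity used throughout (Similiarity, Classifier,
# QueryProcByCluster). Vectors are sparse dicts of term_id -> tf-idf weight; the toy weights
# and the helper name below are made up.
import math

def _demo_cosine():
    query_vector = {3: 1.2, 7: 0.5}
    doc_vector = {3: 0.8, 9: 2.0}
    dot = sum(query_vector[t] * doc_vector.get(t, 0) for t in query_vector)
    size = lambda v: math.sqrt(sum(w * w for w in v.values()))
    # same value Similiarity.compute_similarity would return for these vectors
    return dot / (size(query_vector) * size(doc_vector))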
class QueryProc:
    def __init__(self):
        self.input = FileInOut()
        self.Dic = self.input.readDic()
        self.DocID_file = self.input.readDocID()
        self.posting_file = self.input.readPostingList()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.relatedDocs = []
        self.notRelatedDocs = []
        self.relatedDocsPos = []
        self.notRelatedDocsPos = []
        self.notRelatedCounts = 0

    def initializing(self, query):
        print(query)
        query = self.wordFormer.normalize(query)
        print(query)
        query_tokens = self.wordFormer.tokenize(query)
        for token in query_tokens:
            if token in self.constants.punctuations() or token in self.constants.StopWords():
                query_tokens.remove(token)
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
        k = 0
        not_include = False
        order = False
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            print(token)
            # «...» delimits a phrase query; its tokens are collected in orderTokens
            if token == "«" and order == False:
                k += 1
                order = True
                continue
            if token == "»" and order == True:
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            # "!" marks the next token as a negated term
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
                continue  # a negated term must not also be added to the related lists
            self.relatedDocs.append(self.getRelatedSavedDocs(token))
            self.relatedDocsPos.append(self.getRelatedSavedpos(token))
        # union of the postings of all ordinary query terms
        related_result = []
        relatedPos = []
        for res in range(len(self.relatedDocs)):
            related_result = related_result + self.relatedDocs[res]
            relatedPos = relatedPos + self.relatedDocsPos[res]
        related_result = list(set(related_result))
        relatedPos = relatedPos[:len(related_result)]
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        relateds_and_not_unrelateds, related_position = self.notMerge(final_result, final_pos)
        return relateds_and_not_unrelateds, related_position

    def merge_common_docs(self, common_list, docList1, docList2, indexList1, indexList2):
        for doc in common_list:
            i1 = docList1.index(doc)
            i2 = docList2.index(doc)
            docList2.pop(i2)
            indexList1[i1] = indexList1[i1] + indexList2.pop(i2)
        indexList1 = indexList1 + indexList2
        docList1 = docList1 + docList2
        return indexList1, docList1

    def similarity_merge(self, docLists, indexLists):
        if len(docLists) == 0:
            return None, None
        docs = docLists.pop(0)
        indexes = list(filter(lambda n: n != [], indexLists.pop(0)))
        if len(docLists) == 0:
            return docs, indexes
        for doc in docLists:
            i = docLists.index(doc)
            doci = docLists.pop(i)
            dociPos = list(filter(lambda n: n != [], indexLists.pop(i)))
            common = list(set(doci) & set(docs))
            indexes, docs = self.merge_common_docs(common, docs, doci, indexes, dociPos)
        return docs, indexes

    def processQueryBySimilarity(self, query):
        print('query:')
        print(query)
        docList, indexList = self.initializing(query)
        return docList, indexList

    def processQuery(self, query):
        query = self.wordFormer.normalize(query)
        query_tokens = self.wordFormer.tokenize(query)
        for token in query_tokens:
            if token in self.constants.punctuations() or token in self.constants.StopWords():
                query_tokens.remove(token)
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
        i = 0
        k = 0
        not_include = False
        order = False
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            # "..." delimits a phrase query; its tokens are collected in orderTokens
            if token == "\"" and order == False:
                k += 1
                order = True
                continue
            if token == "\"" and order == True:
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            else:
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))
                i += 1
        # intersect the postings of all ordinary query terms
        related_result, relatedPos = self.merge(self.relatedDocs, i)
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        relateds_and_not_unrelateds, related_position = self.notMerge(final_result, final_pos)
        return relateds_and_not_unrelateds, related_position

    def merge(self, docs, leng):
        # intersect the sorted posting lists, concatenating the position lists of matches
        answer = []
        postingAns = []
        if leng == 0:
            return [], []
        elif leng == 1:
            return docs[0], self.relatedDocsPos[0]
        else:
            p2 = docs[0]
            postings2 = []
            for j in range(len(p2)):
                postings2.append(self.relatedDocsPos[0][j])
            i = 1
            while i < leng:
                p1 = docs[i]
                postings1 = []
                for j in range(len(p1)):
                    postings1.append(self.relatedDocsPos[i][j])
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        postingAns.append(postings1[0] + postings2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        postings1.remove(postings1[0])
                        postings2.remove(postings2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        postings1.remove(postings1[0])
                    else:
                        p2.remove(p2[0])
                        postings2.remove(postings2[0])
                p2 = answer
                postings2 = postingAns
        print('merged docs:')
        print(answer)
        print(postingAns)
        return answer, postingAns

    def finalMerge(self, docs, docPos, length):
        # intersect the ordinary-term result with each phrase-query result
        answer = []
        docPosAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0]), list(docPos[0])
        else:
            p2 = list(docs[0])
            docPos2 = list(docPos[0])
            i = 1
            while i < length:
                p1 = list(docs[i])
                docPos1 = list(docPos[i])
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        docPosAns.append(docPos1[0] + docPos2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        docPos1.remove(docPos1[0])
                        docPos2.remove(docPos2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        docPos1.remove(docPos1[0])
                    else:
                        p2.remove(p2[0])
                        docPos2.remove(docPos2[0])
                p2 = answer
                docPos2 = docPosAns
        return answer, docPosAns

    def notMerge(self, relatedDocs, relatedPos):
        # remove every document that contains a negated ("!") term
        answer = []
        postingAns = []
        if self.notRelatedCounts == 0:
            if len(relatedDocs) != 0:
                return relatedDocs, list(relatedPos)
            else:
                return [], []
        else:
            p1 = relatedDocs
            posting1 = relatedPos
            i = 0
            while i < self.notRelatedCounts:
                p2 = self.notRelatedDocs[i]
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                        p2.remove(p2[0])
                    elif p1[0] < p2[0]:
                        answer.append(p1[0])
                        postingAns.append(posting1[0])
                        posting1.remove(posting1[0])
                        p1.remove(p1[0])
                    else:
                        p2.remove(p2[0])
            for p in p1:
                answer.append(p)
            for posting in posting1:
                postingAns.append(posting)
            return answer, postingAns

    def phraseContainerDocs(self, pharase):
        # positional intersection: a document matches when the phrase tokens occur at
        # consecutive positions
        docs = []
        docsPos = []
        for p in pharase:
            docs.append(self.getRelatedSavedDocs(p))
            docsPos.append(self.getRelatedSavedpos(p))
        answer = []
        answer_posting = [[] for k in range(50)]
        length = len(docs)
        if length == 0:
            return [], []
        elif length == 1:
            return docs[0], docsPos[0]
        else:
            p2 = docs[0]
            posting2 = docsPos[0]
            i = 1
            while i < len(pharase):
                index = -1
                answer = []
                answer_posting = [[] for k in range(50)]
                p1 = docs[i]
                posting1 = docsPos[i]
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        for posting in posting2[0]:
                            if (posting + 1) in posting1[0]:
                                if p1[0] not in answer:
                                    answer.append(p1[0])
                                    index += 1
                                answer_posting[index].append(posting + 1)
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        posting1.remove(posting1[0])
                        posting2.remove(posting2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                    else:
                        p2.remove(p2[0])
                        posting2.remove(posting2[0])
                p2 = answer
                posting2 = answer_posting
        return answer, answer_posting

    def getRelatedSavedDocs(self, token):
        if token in self.Dic:
            posting = list(map(int, self.DocID_file[self.Dic.index(token)]))
            print(posting)
            return posting
        return []

    def getRelatedSavedpos(self, token):
        if token in self.Dic:
            posting = [list(map(int, self.posting_file[self.Dic.index(token)][j].split(' ')))
                       for j in range(len(self.posting_file[self.Dic.index(token)]))]
            return posting
        return []
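# Illustration (added): the positional-intersection idea behind QueryProc.phraseContainerDocs,
# shown on toy postings. A document matches a two-term phrase when some position of the first
# term is immediately followed by a position of the second term. The data and helper name are
# made up; the real code works on the dictionary and posting files written by Index.
def _demo_phrase_match():
    postings = {                      # term -> {doc_id: [positions]}
        'tehran': {1: [4, 17], 2: [3]},
        'derby':  {1: [5, 40], 3: [8]},
    }
    matches = {}
    common_docs = postings['tehran'].keys() & postings['derby'].keys()
    for doc in common_docs:
        hits = [p + 1 for p in postings['tehran'][doc] if p + 1 in postings['derby'][doc]]
        if hits:
            matches[doc] = hits       # positions of the second phrase token
    return matches                    # -> {1: [5]}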