class Client(): def __init__(self, host='192.168.0.107', port=7777, list_file='inputfiles-full.txt', freqs_file='wordlist', dataset_dir='/home/aelphy/Desktop/ir_project_dataset'): self.normalizer = Normalizer() self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.socket.connect((host, port)) self.document_freqs_list_filename = freqs_file self.document_list_filename = list_file self.dataset_dir = dataset_dir self.documents_freqs = {} self.document_identificators = {} self.identificator_documents = {} with open(self.document_list_filename) as f: for line in f: data = line.strip().split() index = int(data[0]) identifier = data[1] self.document_identificators[index] = identifier self.identificator_documents[identifier] = index self.documents_number = index with open(self.document_freqs_list_filename) as f: for line in f: data = line.strip().split() self.documents_freqs[self.identificator_documents[data[1]]] = int(data[0]) self.avgdl = sum(self.documents_freqs.values()) / float(self.documents_number) self.k1 = 2.0 self.b = 0.75 def process_query(self, query): terms = self.normalizer.normalize_line(query) if not terms: return set() inverted_index = {} term_doc_tf = {} for term in terms: self.send_message(term) term_doc_tf[term] = self.parse_message_array(self.recieve_message().split()) inverted_index[term] = term_doc_tf[term].keys() documents = self.merge(inverted_index, terms) if not documents: return set() return self.rank(documents, term_doc_tf, terms) def recieve_message(self): message = '' while not message.endswith('\n'): message += self.socket.recv(1024).decode('utf-8') return message.strip() def send_message(self, message): self.socket.send((message + '\n').encode('utf-8')) def parse_message_array(self, message_array): result = {} i = 0 while i <= len(message_array) - 1: result[int(message_array[i])] = int(message_array[i + 1]) i = i + 2 return result def merge(self, inverted_index, terms): result = set(inverted_index[terms[0]]) for i in range(1, len(terms)): result = result.intersection(set(inverted_index[terms[i]])) return result def rank(self, documents, term_doc_tf, terms): document_scores = {} document_freqs = {} for document in documents: document_scores[document] = self.score_document(document, term_doc_tf, terms) result = sorted(document_scores.items(), key=lambda x: x[1], reverse=True) best_match = result[0] best_match_file = open(os.path.join(self.dataset_dir, client.document_identificators[best_match[0]]), 'r') best_match_data = ' '.join(best_match_file.readlines()) for term in terms: best_match_data = best_match_data.replace(term, color.RED + term + color.END) print(best_match_data) return result def score_document(self, document, term_doc_tf, terms): result = 0 for term in terms: IDF = math.log((self.documents_number - len(term_doc_tf[term]) + 0.5) / (len(term_doc_tf[term]) + 0.5)) f = term_doc_tf[term][document] result += IDF * (f * (self.k1 + 1)) / (f + self.k1 * (1 - self.b + self.b * self.documents_freqs[document] / self.avgdl)) return result