import time
import struct
import gc
from operator import itemgetter


def ToBinary(doc_vector_file):
    """Convert a text doc-vector file ("termid:weight,..." per line) into a
    packed binary file plus an offset index for random access."""
    f_vector = open(doc_vector_file)
    f_bin = open(doc_vector_file + '.bin', 'wb')
    indexes = []
    count = 0
    start_time = time.clock()
    while True:
        content = f_vector.readline()
        if not content:
            break
        if content.endswith('\n'):
            content = content[:-1]
        items = content.split(',')
        term_list = []
        for item in items:
            data = item.split(':')
            term_list.append((int(data[0]), float(data[1])))
        # Remember the byte offset of this document's binary record.
        indexes.append(str(f_bin.tell()))
        f_bin.write(struct.pack('i', len(term_list)))
        for term in term_list:
            f_bin.write(struct.pack('i', term[0]))
            f_bin.write(struct.pack('f', term[1]))
        count += 1
        if count % 1000 == 0:
            print_speed(start_time, count)
    write_all_text(doc_vector_file + '.bin.index', ','.join(indexes))
    f_bin.close()
    f_vector.close()

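# A hedged sketch (hypothetical helper, not part of the module) showing how a
# document can be read back from the binary file written by ToBinary, given a
# byte offset taken from the .bin.index file:
def _read_doc_vector_sketch(f_bin, offset):
    f_bin.seek(offset)
    n = struct.unpack('i', f_bin.read(4))[0]  # number of (term, weight) pairs
    vec = []
    for _ in range(n):
        term_id = struct.unpack('i', f_bin.read(4))[0]
        weight = struct.unpack('f', f_bin.read(4))[0]
        vec.append((term_id, weight))
    return vec
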
def get_result_thread(self):
    """Consumer thread: drain the result queue and write names and contents
    to their files until every map worker has signalled exit."""
    f_name = open(self.name_file, 'w')
    f_content = open(self.content_file, 'w')
    count = 0
    start_time = time.clock()
    worker_count = 0
    while 1:
        result_list = self.result_queue.get()  # fetch a batch of results
        # A None item signals that one map worker has exited.
        if result_list is None:
            worker_count += 1
            if worker_count < self.map_worker_count:
                continue
            else:
                break
        for result in result_list:
            f_name.write('%s\n' % result[0])
            f_content.write('%s\n' % result[1])
            count += 1
        print_speed(start_time, count)
    f_name.close()
    f_content.close()
    print_speed(start_time, count)
    print '%s saved' % f_name.name
    print '%s saved' % f_content.name
    print '\nProgram finished'

def DivideByFreqThread(term_inverter_file, threshold):
    """Split an inverted-index file into a low-frequency and a high-frequency
    file by document count, keeping byte offsets for the high-frequency one."""
    print 'Splitting the inverted index file into two files by term frequency'
    f_content = open(term_inverter_file)
    f_low = open(term_inverter_file + '.low', 'w')
    f_high = open(term_inverter_file + '.high', 'w')
    i = 0
    start_time = time.clock()
    high_indexes = []
    while True:
        tmp = f_content.readline()
        if not tmp:
            break
        # Line format: term,count,... -> extract the count field.
        index = tmp.index(',')
        index2 = tmp.index(',', index + 1)
        count = int(tmp[index + 1:index2])
        if count < threshold:
            f_low.write(tmp)
        else:
            # Record "term:offset" so the postings line can be sought directly.
            high_indexes.append('%s:%s' % (tmp[:index], str(f_high.tell())))
            f_high.write(tmp)
        i += 1
        if i % 10000 == 0:
            print_speed(start_time, i)
    f_content.close()
    f_low.close()
    f_high.close()
    print_speed(start_time, i)
    print 'High/low-frequency terms separated; threshold: %d' % threshold
    print 'Saved %s' % f_low.name
    print 'Saved %s' % f_high.name
    write_all_text(f_high.name + '.index', ','.join(high_indexes))
    print 'Saved %s.index' % f_high.name

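# Hedged sketch (hypothetical helper) of consuming the "term:offset" entries
# written to the .high.index file, so a term's postings line can be read with
# a single seek into the .high file:
def _load_high_index_sketch(index_file):
    entries = open(index_file).read().split(',')
    # rsplit guards against a ':' appearing inside the term itself.
    return dict((term, int(offset))
                for term, offset in (e.rsplit(':', 1) for e in entries))
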
def ConvertDocVectorToTermVector(doc_vector_file, term_vector_file):
    """Invert the binary forward index (doc -> terms) into a text term index
    (term -> docs), scanning term ids in windows of `step` to bound memory."""
    f_vector = open(term_vector_file, 'w')
    f_index = open(term_vector_file + '.index', 'w')
    begin = 0
    step = 40000
    end = step
    max_term_id = 0
    while 1:
        print 'begin=%d,end=%d,max_term_id=%d' % (begin, end, max_term_id)
        start_time = time.clock()
        doc_id = 0
        term_dict = {}
        # One full pass over the forward index for this term-id window.
        f_doc = open(doc_vector_file, 'rb')
        while 1:
            try:
                count = readInt(f_doc)
            except:  # readInt raises at end of file
                break
            # Read one document's forward index.
            for i in range(count):
                term_id = readInt(f_doc)
                if term_id > max_term_id:
                    max_term_id = term_id
                weight = readFloat(f_doc)
                if begin <= term_id < end:
                    if term_id in term_dict:
                        term_dict[term_id].append((doc_id, weight))
                    else:
                        term_dict[term_id] = [(doc_id, weight)]
            doc_id += 1
            if doc_id % 1000 == 0:
                print_speed(start_time, doc_id)
        f_doc.close()
        if len(term_dict) > 0:
            term_list = term_dict.items()
            term_list.sort()
            for term in term_list:
                term_id = term[0]
                doc_list = ['%d:%f' % doc for doc in term[1]]
                f_index.write('%d:%d,' % (term_id, f_vector.tell()))
                f_vector.write('%d,%s\n' % (term_id, ','.join(doc_list)))
            del term_list
        del term_dict
        begin = end
        end = begin + step
        if begin > max_term_id:
            break
        gc.collect()
    f_vector.close()
    f_index.close()
    print 'Saved %s' % f_vector.name

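# readInt and readFloat are defined elsewhere in this module; minimal sketches
# consistent with the struct.pack('i') / struct.pack('f') records produced by
# ToBinary (the struct.error raised at EOF is what the bare except above
# relies on):
def _readInt_sketch(f):
    return struct.unpack('i', f.read(4))[0]


def _readFloat_sketch(f):
    return struct.unpack('f', f.read(4))[0]
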
def CreateDocVector(term_inverter_file, content_list_file, doc_vector_file):
    """Build weighted, normalized document vectors from the content lists,
    using document frequencies taken from the inverted index."""
    print 'Creating doc vectors...'
    term_dict = {}  # term -> (term id, number of documents containing it)
    f_inverter = open(term_inverter_file)
    i = 0
    while True:
        tmp = f_inverter.readline()
        if not tmp:
            break
        item = MergeReader.ToKeyValue(tmp)
        term_dict[item[0]] = (i, int(item[2]))
        i += 1
    f_inverter.close()
    print 'Total terms: %d' % len(term_dict)
    f_doc_vector = open(doc_vector_file, 'w')
    f_content = open(content_list_file)
    start_time = time.clock()
    count = 0
    indexes = []
    while True:
        tmp = f_content.readline()
        if not tmp:
            break
        count += 1
        term_list = Converter.ToTermList(tmp)
        # w_list holds the raw weights.
        w_list = [CalWeight(int(term[1]), term_dict[term[0]][1], N)
                  for term in term_list]
        # Normalize the vector.
        w_list = Normalize(w_list)
        term_id_list = [term_dict[term[0]][0] for term in term_list]
        # Re-encode w_list as "termid:weight" strings.
        w_list = ['%d:%f' % (term_id_list[i], w_list[i])
                  for i in range(len(w_list))]
        indexes.append(str(f_doc_vector.tell()))
        f_doc_vector.write(','.join(w_list))
        f_doc_vector.write('\n')
        if count % 10000 == 0:
            print_speed(start_time, count)
    print_speed(start_time, count)
    f_content.close()
    f_doc_vector.close()
    print 'Saved %s' % f_doc_vector.name
    write_all_text(doc_vector_file + '.index', ','.join(indexes))
    print 'Saved %s.index' % doc_vector_file

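# CalWeight, Normalize, and N are defined elsewhere; the sketches below are
# assumptions only -- a conventional TF-IDF weight and L2 normalization, not
# necessarily the module's actual formulas (N is taken to be the corpus size):
import math


def _CalWeight_sketch(tf, df, n_docs):
    # Term frequency times inverse document frequency.
    return tf * math.log(float(n_docs) / df)


def _Normalize_sketch(weights):
    norm = math.sqrt(sum(w * w for w in weights))
    return [w / norm for w in weights] if norm else weights
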
def CalDistance2(topic, searcher):
    """Score documents against a topic: for each topic term, accumulate
    document weight * topic weight over the term's posting list, then
    return the top 1000 documents."""
    doc_result = {}
    count = 0
    start_time = time.clock()
    print 'topic', topic.id, 'term count:', len(topic.term_dict)
    for term in topic.term_dict:
        doc_list = searcher.search(term)
        for doc in doc_list:
            # Accumulate this term's dot-product contribution.
            doc_result[doc[0]] = doc_result.get(doc[0], 0) + \
                float(doc[1]) * topic.term_dict[term]
        count += 1
        print_speed(start_time, count)
    print_speed(start_time, count)
    print ''
    doc_result = sorted(doc_result.iteritems(), key=itemgetter(1),
                        reverse=True)
    return doc_result[:1000]

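# Usage sketch for CalDistance2 (names hypothetical). It assumes `topic`
# carries an `id` and a term -> weight dict, and that searcher.search(term)
# returns (doc_id, weight) pairs, as the loop above indexes them:
#
#     top_docs = CalDistance2(topic, searcher)  # [(doc_id, score), ...]
#     best_doc, best_score = top_docs[0]
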
def CalDistance(topic, searcher):
    """Earlier variant: collects the candidate documents for a topic and
    reads back each document's term vector, but does not aggregate a score
    (CalDistance2 is the scoring version)."""
    doc_result = {}
    print len(topic.term_dict)
    count = 0
    start_time = time.clock()
    doc_list = []
    # Gather every document that contains at least one topic term.
    for term in topic.term_dict:
        doc_list.extend(searcher.search(term))
        count += 1
    count = 0
    doc_list = set(doc_list)
    print len(doc_list)
    start_time = time.clock()
    for docid in doc_list:
        term_list = searcher.searchByDocId(docid)
        count += 1
        if count % 1000 == 0:
            print_speed(start_time, count)
    print len(doc_list)

def RemoveLowWords(low_term_inverter_file, content_file):
    """Strip terms listed in the low-frequency inverted index from the
    forward index, writing a filtered file plus an offset index."""
    print 'Removing low-frequency terms from the forward index'
    file_low = open(low_term_inverter_file)
    low_term_list = []
    while True:
        tmp = file_low.readline()
        if not tmp:
            break
        index = tmp.index(',')
        low_term_list.append(tmp[:index])
    file_low.close()
    low_term_set = set(low_term_list)
    print 'Low-frequency term list loaded'
    file_content = open(content_file)
    file_content_high = open(content_file + '.high', 'w')
    file_index_list = []
    count = 0
    start_time = time.clock()
    while True:
        tmp = file_content.readline()
        if not tmp:
            break
        term_list = ContentFile.ToTermList(tmp[:-1])
        # Keep only terms that are not in the low-frequency set.
        term_list = ['%s:%s' % term for term in term_list
                     if term[0] not in low_term_set]
        file_index_list.append(str(file_content_high.tell()))
        file_content_high.write(','.join(term_list))
        file_content_high.write('\n')
        count += 1
        if count % 10000 == 0:
            print_speed(start_time, count)
    file_content.close()
    file_content_high.close()
    print_speed(start_time, count)
    print 'Saved %s' % file_content_high.name
    file_content_high_index = open(content_file + '.high.index', 'w')
    file_content_high_index.write(','.join(file_index_list))
    file_content_high_index.close()
    print 'Saved %s' % file_content_high_index.name

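# Hypothetical invocation (file names are assumptions); the .low file comes
# from DivideByFreqThread, and the output keeps only high-frequency terms:
#
#     RemoveLowWords('term_inverter.low', 'content_list')
#     # -> writes content_list.high and content_list.high.index
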
def ReduceSingleThread(content_file):
    """Single-threaded reduce: read the content file in batches, build a
    partial inverted index per batch, and save each as a temporary file."""
    print 'Creating inverted index...'
    worker = ReduceWorker()
    reader = IOWorker([content_file], 20000)
    count = 0
    save_count = 0
    start_time = time.clock()
    clear_dir('tmp')
    while 1:
        task_raw_data = reader.GetLines()
        data_count = len(task_raw_data)
        if data_count == 0:
            break
        doc_list = Converter.ToDocList(task_raw_data, count)
        count += data_count
        term_inverter_list = worker.run(doc_list)
        term_inverter_string = Converter.TermInverterToString(
            term_inverter_list)
        save_count += 1
        IOWorker.SaveText('tmp\\%d' % save_count, term_inverter_string)
        print_speed(start_time, count)
    print 'Save complete: %d temporary files' % save_count

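# Hypothetical invocation (file name is an assumption); each batch of 20000
# lines becomes one partial inverted index saved under tmp\:
#
#     ReduceSingleThread('content_list')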