import gc
import os
import struct
import time
from operator import itemgetter
# Project helpers such as print_speed, write_all_text, clear_dir, N,
# MergeReader, Converter, ContentFile, ReduceWorker and IOWorker are assumed
# to be defined elsewhere in this codebase; sketches of a few other missing
# helpers (readInt, readFloat, CalWeight, Normalize) are given below.

def ToBinary(doc_vector_file):
    f_vector = open(doc_vector_file)
    f_bin = open(doc_vector_file + '.bin', 'wb')
    content = True
    indexes = []
    count = 0
    start_time = time.clock()
    while content:
        content = f_vector.readline()
        if not content:
            break
        if content.endswith('\n'):
            content = content[:-1]
        items = content.split(',')
        term_list = []
        for item in items:
            data = item.split(':')
            term_list.append((int(data[0]), float(data[1])))
        indexes.append(str(f_bin.tell()))
        f_bin.write(struct.pack('i', len(term_list)))
        for term in term_list:
            f_bin.write(struct.pack('i', term[0]))
            f_bin.write(struct.pack('f', term[1]))

        count += 1
        if count % 1000 == 0:
            print_speed(start_time, count)
    write_all_text(doc_vector_file + '.bin.index', ','.join(indexes))
    f_bin.close()
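# ToBinary packs each record as an int count followed by (int, float) pairs.
# readInt/readFloat, used by ConvertDocVectorToTermVector below, are not shown
# in this listing; a minimal sketch matching that struct layout:
def readInt(f):
    data = f.read(4)
    if len(data) < 4:
        raise EOFError('end of file')
    return struct.unpack('i', data)[0]

def readFloat(f):
    data = f.read(4)
    if len(data) < 4:
        raise EOFError('end of file')
    return struct.unpack('f', data)[0]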
def get_result_thread(self):
    f_name = open(self.name_file, 'w')
    f_content = open(self.content_file, 'w')
    count = 0
    start_time = time.clock()
    worker_count = 0
    while 1:
        result_list = self.result_queue.get()  # one batch of results
        # check whether a map worker has exited (None is the exit sentinel)
        if result_list is None:
            worker_count += 1
            if worker_count < self.map_worker_count:
                continue
            else:
                break
        for result in result_list:
            f_name.write('%s\n' % result[0])
            f_content.write('%s\n' % result[1])
        count += 1
        print_speed(start_time, count)
    f_name.close()
    f_content.close()
    print_speed(start_time, count)
    print u'%s saved' % f_name.name
    print u'%s saved' % f_content.name
    print u'\nDone'
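# get_result_thread above counts one None sentinel per map worker before it
# stops. A minimal sketch of the matching worker-side shutdown protocol
# (map_worker and process_task are hypothetical names, not from this project):
def map_worker(task_queue, result_queue):
    while 1:
        task = task_queue.get()
        if task is None:                # poison pill: no more tasks
            result_queue.put(None)      # tell the collector this worker is done
            break
        result_queue.put([process_task(item) for item in task])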
def DivideByFreqThread(term_inverter_file, threshold):
    print u'Splitting the inverted-index file into two files by term frequency'
    f_content = open(term_inverter_file)
    f_low = open(term_inverter_file + '.low', 'w')
    f_high = open(term_inverter_file + '.high', 'w')
    i = 0
    start_time = time.clock()
    tmp = True
    high_indexes = []
    while tmp:
        tmp = f_content.readline()
        if not tmp:
            break
        index = tmp.index(',')
        index2 = tmp.index(',', index + 1)
        count = int(tmp[index + 1:index2])
        if count < threshold:
            f_low.write(tmp)
        else:
            high_indexes.append('%s:%s' % (tmp[:index], str(f_high.tell())))
            f_high.write(tmp)
        i += 1
        if i % 10000 == 0:
            print_speed(start_time, i)
    f_content.close()
    f_low.close()
    f_high.close()
    print_speed(start_time, i)
    print u'High/low frequency split done, threshold: %d' % threshold
    print u'Saved %s' % f_low.name
    print u'Saved %s' % f_high.name
    write_all_text(f_high.name + '.index', ','.join(high_indexes))
    print 'Saved %s.index' % f_high.name
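# The .high.index file written above maps each high-frequency term to its byte
# offset in the .high file. A minimal sketch of loading it back into a dict
# (load_high_index is a hypothetical name; format is "term:offset,term:offset,..."):
def load_high_index(index_file):
    index = {}
    for pair in open(index_file).read().split(','):
        if not pair:
            continue
        term, offset = pair.rsplit(':', 1)
        index[term] = int(offset)
    return index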
def ConvertDocVectorToTermVector(doc_vector_file, term_vector_file):
    f_vector = open(term_vector_file, 'w')
    f_index = open(term_vector_file + '.index', 'w')
    begin = 0
    step = 40000
    end = step
    max_term_id = 0
    term_dict = {}
    while 1:
        print 'begin=%d,end=%d,max_term_id=%d' % (begin, end, max_term_id)
        start_time = time.clock()
        doc_id = 0
        term_dict = {}
        # one full pass over the forward (document) index
        f_doc = open(doc_vector_file, 'rb')
        while 1:
            try:
                count = readInt(f_doc)
            except:
                break  # readInt failed: end of file reached
            # read one document's forward index
            for i in range(count):
                term_id = readInt(f_doc)
                if term_id > max_term_id: max_term_id = term_id
                weight = readFloat(f_doc)
                if term_id >= begin and term_id < end:
                    if term_id in term_dict:
                        term_dict[term_id].append((doc_id, weight))
                    else:
                        term_dict[term_id] = [(doc_id, weight)]
            doc_id += 1
            if doc_id % 1000 == 0:
                print_speed(start_time, doc_id)

        f_doc.close()
        if len(term_dict) > 0:
            term_list = term_dict.items()
            term_list.sort()
            for term in term_list:
                term_id = term[0]
                doc_list = ['%d:%f' % doc for doc in term[1]]
                f_index.write('%d:%d,' % (term_id, f_vector.tell()))
                f_vector.write('%d,%s\n' % (term_id, ','.join(doc_list)))
            del term_list
        del term_dict
        begin = end
        end = begin + step
        if begin > max_term_id:
            break
        gc.collect()
    f_vector.close()
    f_index.close()
    print u'Saved %s' % f_vector.name
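# The term-vector index maps each term_id to a byte offset in the term-vector
# file, allowing random access to one posting line without scanning the file.
# A minimal sketch (read_term_vector is a hypothetical name):
def read_term_vector(term_vector_file, offset):
    f = open(term_vector_file)
    f.seek(offset)
    line = f.readline().rstrip('\n')
    f.close()
    items = line.split(',')
    term_id = int(items[0])
    postings = [(int(p.split(':')[0]), float(p.split(':')[1])) for p in items[1:]]
    return term_id, postings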
def CreateDocVector(term_inverter_file, content_list_file, doc_vector_file):
    print u'Creating doc vectors...'
    term_dict = {}  # term -> (term_id, number of documents containing the term)
    tmp = True
    f_inverter = open(term_inverter_file)

    i = 0
    while tmp:
        tmp = f_inverter.readline()
        if not tmp:
            break
        item = MergeReader.ToKeyValue(tmp)
        term_dict[item[0]] = (i, int(item[2]))
        i += 1
    f_inverter.close()
    print u'Total number of terms: %d' % len(term_dict)

    f_doc_vector = open(doc_vector_file, 'w')
    f_content = open(content_list_file)
    tmp = True
    start_time = time.clock()
    count = 0
    indexes = []
    while tmp:
        tmp = f_content.readline()
        if not tmp:
            break
        count += 1
        term_list = Converter.ToTermList(tmp)
        # w_list holds the raw weight of each term
        w_list = [
            CalWeight(int(term[1]), term_dict[term[0]][1], N)
            for term in term_list
        ]
        # normalize
        w_list = Normalize(w_list)
        term_id_list = [term_dict[term[0]][0] for term in term_list]
        # rewrite w_list as "termid:weight" strings
        w_list = [
            '%d:%f' % (term_id_list[i], w_list[i]) for i in range(len(w_list))
        ]
        indexes.append(str(f_doc_vector.tell()))
        f_doc_vector.write(','.join(w_list))
        f_doc_vector.write('\n')
        if count % 10000 == 0:
            print_speed(start_time, count)
    print_speed(start_time, count)
    f_content.close()
    f_doc_vector.close()
    print 'Saved %s' % f_doc_vector.name
    write_all_text(doc_vector_file + '.index', ','.join(indexes))
    print 'Saved %s.index' % doc_vector_file
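# CalWeight and Normalize are used above but not defined in this listing.
# A plausible sketch, assuming a standard tf-idf weight and L2 (cosine)
# normalization; the real implementations may differ:
import math

def CalWeight(tf, df, n_docs):
    # term frequency times inverse document frequency
    return tf * math.log(float(n_docs) / df)

def Normalize(w_list):
    norm = math.sqrt(sum(w * w for w in w_list))
    if norm == 0:
        return w_list
    return [w / norm for w in w_list]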
def CalDistance2(topic, searcher):
    doc_result = {}
    count = 0
    start_time = time.clock()
    print 'topic', topic.id, u' term count:', len(topic.term_dict)
    for term in topic.term_dict:
        doc_list = searcher.search(term)
        # accumulate the dot product between the topic vector and each document
        for doc in doc_list:
            doc_result[doc[0]] = doc_result.get(
                doc[0], 0) + float(doc[1]) * topic.term_dict[term]
        count += 1
        print_speed(start_time, count)
    print_speed(start_time, count)
    print ''
    doc_result = sorted(doc_result.iteritems(),
                        key=itemgetter(1),
                        reverse=True)
    return doc_result[:1000]
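# A minimal sketch of the searcher interface assumed by CalDistance2, built on
# the .high/.high.index files from DivideByFreqThread. The posting-line layout
# ("term,count,docid:weight,...") is inferred from the parsing code above and
# is an assumption, as is the class name:
class InverterSearcher:
    def __init__(self, high_file, index):
        self.f = open(high_file)
        self.index = index  # term -> byte offset, e.g. from load_high_index

    def search(self, term):
        if term not in self.index:
            return []
        self.f.seek(self.index[term])
        items = self.f.readline().rstrip('\n').split(',')
        # items[0] is the term, items[1] its document frequency,
        # the remainder are "docid:weight" postings
        return [item.split(':') for item in items[2:]]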
def CalDistance(topic, searcher):
    doc_result = {}
    print len(topic.term_dict)
    count = 0
    start_time = time.clock()
    doc_list = []
    for term in topic.term_dict:
        doc_list.extend(searcher.search(term))
        count += 1
    count = 0
    doc_list = set(doc_list)
    print len(doc_list)
    start_time = time.clock()
    for docid in doc_list:
        # this variant only fetches each candidate document's term vector and
        # reports speed; it never scores the documents
        term_list = searcher.searchByDocId(docid)
        count += 1
        if count % 1000 == 0:
            print_speed(start_time, count)
    print len(doc_list)
def RemoveLowWords(low_term_inverter_file, content_file):
    print u'Removing low-frequency terms from the forward index'
    file_low = open(low_term_inverter_file)
    low_term_list = []
    tmp = True
    while tmp:
        tmp = file_low.readline()
        if not tmp:
            break
        index = tmp.index(',')
        low_term_list.append(tmp[:index])
    file_low.close()

    low_term_set = set(low_term_list)
    print u'Low-frequency term list loaded'
    file_content = open(content_file)
    file_content_high = open(content_file + '.high', 'w')
    file_index_list = []
    tmp = True
    count = 0
    start_time = time.clock()
    while tmp:
        tmp = file_content.readline()
        if not tmp:
            break
        term_list = ContentFile.ToTermList(tmp[:-1])
        term_list = [
            '%s:%s' % term for term in term_list if term[0] not in low_term_set
        ]
        file_index_list.append(str(file_content_high.tell()))
        file_content_high.write(','.join(term_list))
        file_content_high.write('\n')
        count += 1
        if count % 10000 == 0:
            print_speed(start_time, count)
    file_content.close()
    file_content_high.close()
    print_speed(start_time, count)
    print u'Saved %s' % file_content_high.name
    file_content_high_index = open(content_file + '.high.index', 'w')
    file_content_high_index.write(','.join(file_index_list))
    file_content_high_index.close()
    print u'Saved %s' % file_content_high_index.name
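# ContentFile.ToTermList is not shown in this listing. Judging by the
# '%s:%s' % term join above, it plausibly parses a "term:count,term:count,..."
# line into (term, count) tuples. A minimal sketch under that assumption:
def ToTermList(line):
    return [tuple(item.split(':')) for item in line.split(',') if item]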
def ReduceSingleThread(content_file):
    print u'Creating the inverted index...'
    worker = ReduceWorker()
    reader = IOWorker([content_file], 20000)
    doc_list = []
    count = 0
    save_count = 0
    start_time = time.clock()
    clear_dir('tmp')
    while 1:
        task_raw_data = reader.GetLines()
        data_count = len(task_raw_data)
        if data_count == 0:
            break
        doc_list = Converter.ToDocList(task_raw_data, count)
        count += data_count
        term_inverter_list = worker.run(doc_list)
        term_inverter_string = Converter.TermInverterToString(
            term_inverter_list)
        save_count += 1
        IOWorker.SaveText(os.path.join('tmp', str(save_count)), term_inverter_string)
        print_speed(start_time, count)
    print u'Done saving, %d temporary files in total' % save_count