def itertextindex(index_or_dirname, indexname, docnum_field):
    """Yield a header row, then one row per document from a Whoosh text index.

    `index_or_dirname` may be the path of an index directory (the index is
    opened read-only and closed when the generator is exhausted) or an
    already-open `whoosh.index.Index` (left open for the caller).  When
    `docnum_field` is not None, the Whoosh document number is prepended to
    the header and to every row under that field name.

    Raises ArgumentError if `index_or_dirname` is neither a string nor an
    index instance.
    """
    import whoosh.index

    if isinstance(index_or_dirname, string_types):
        # we open the index ourselves, so we are responsible for closing it
        index = whoosh.index.open_dir(index_or_dirname, indexname=indexname,
                                      readonly=True)
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        # hoist the stored field names; they were previously queried from the
        # schema up to three times
        fields = tuple(index.schema.stored_names())
        if docnum_field is None:
            yield fields
            for _, stored_fields_dict in index.reader().iter_docs():
                # build each row with a comprehension rather than
                # operator.itemgetter: itemgetter(*fields) returns a bare
                # scalar (not a 1-tuple) when there is exactly one stored
                # field, which would break row shape
                yield tuple(stored_fields_dict[f] for f in fields)
        else:
            yield (docnum_field,) + fields
            for docnum, stored_fields_dict in index.reader().iter_docs():
                yield (docnum,) + tuple(stored_fields_dict[f] for f in fields)
    finally:
        if needs_closing:
            # close the index only if we're the ones who opened it
            index.close()
def itertextindex(index_or_dirname, indexname, docnum_field):
    """Generate rows from a Whoosh text index: a header tuple first, then
    one tuple of stored field values per document.

    Accepts either an index directory path (opened read-only here and
    closed in the ``finally`` below) or an open `whoosh.index.Index`
    (ownership stays with the caller).  If `docnum_field` is given, the
    document number becomes the first column of every row.

    Raises ArgumentError for any other argument type.
    """
    import whoosh.index

    if isinstance(index_or_dirname, string_types):
        index = whoosh.index.open_dir(index_or_dirname, indexname=indexname,
                                      readonly=True)
        # opened here, so it must be closed here
        needs_closing = True
    elif isinstance(index_or_dirname, whoosh.index.Index):
        index = index_or_dirname
        needs_closing = False
    else:
        raise ArgumentError('expected string or index, found %r'
                            % index_or_dirname)

    try:
        # query the schema once instead of once per use
        fields = tuple(index.schema.stored_names())
        if docnum_field is None:
            header = fields
        else:
            header = (docnum_field,) + fields
        yield header

        for docnum, stored_fields_dict in index.reader().iter_docs():
            # a generator expression avoids operator.itemgetter, whose
            # single-field result is a scalar rather than a 1-tuple and
            # would therefore corrupt single-column rows
            row = tuple(stored_fields_dict[f] for f in fields)
            if docnum_field is None:
                yield row
            else:
                yield (docnum,) + row
    finally:
        if needs_closing:
            # only close an index this function opened itself
            index.close()
def key_terms(storage, schema): index = storage.open_index(schema=schema) ixreader = index.reader() searcher = index.searcher() docnums = [] KEY_LEN = 500 DOC_LEN = 1000 for id in xrange(DOC_LEN): docnums.append(id) #for id in ixreader.all_doc_ids(): # print id, terms = {} i = 0 for term, score in searcher.key_terms(docnums, content_field_name, KEY_LEN): terms[term] = i i += 1 print 'key_terms finished' ar = np.zeros((len(docnums), KEY_LEN)) for i in xrange(DOC_LEN): term_weights = ixreader.vector_as("weight", i, content_field_name) all_weight = 0 n = 0 for term, weight in term_weights: if term in terms: ar[i][terms[term]] = weight all_weight += weight n += 1 for j in xrange(KEY_LEN): ar[i][j] = ar[i][j] / weight u, s, v = lin.svd(ar, full_matrices=False) data = u[:, 0:100] print 'svd finished' k = KMeans(init='k-means++', n_init=10) k.fit(data) #centroids = k.cluster_centers_ labels = k.labels_ print 'kmeans finished' #af = AffinityPropagation(affinity="euclidean").fit(data) #cluster_centers_indices = af.cluster_centers_indices_ #labels = af.labels_ doc_arr = np.array(range(DOC_LEN)) for i in range(np.max(labels)): print 'group:', (i + 1) for doc_num in doc_arr[labels == i]: print ixreader.stored_fields(doc_num).get( 'id'), ixreader.stored_fields(doc_num).get('title').split( '|')[0] + '/', print '\n'
def key_terms(storage, schema): index = storage.open_index(schema=schema) ixreader = index.reader() searcher = index.searcher() docnums = [] KEY_LEN = 500 DOC_LEN = 1000 for id in xrange(DOC_LEN): docnums.append(id) #for id in ixreader.all_doc_ids(): # print id, terms = {} i = 0 for term,score in searcher.key_terms(docnums, content_field_name, KEY_LEN): terms[term] = i i += 1 print 'key_terms finished' ar = np.zeros( (len(docnums), KEY_LEN) ) for i in xrange(DOC_LEN): term_weights = ixreader.vector_as("weight", i, content_field_name) all_weight = 0 n = 0 for term,weight in term_weights: if term in terms: ar[i][terms[term]] = weight all_weight += weight n += 1 for j in xrange(KEY_LEN): ar[i][j] = ar[i][j]/weight u,s,v = lin.svd(ar, full_matrices=False) data = u[:,0:100] print 'svd finished' k = KMeans(init='k-means++', n_init=10) k.fit(data) #centroids = k.cluster_centers_ labels = k.labels_ print 'kmeans finished' #af = AffinityPropagation(affinity="euclidean").fit(data) #cluster_centers_indices = af.cluster_centers_indices_ #labels = af.labels_ doc_arr = np.array(range(DOC_LEN)) for i in range(np.max(labels)): print 'group:', (i+1) for doc_num in doc_arr[labels==i]: print ixreader.stored_fields(doc_num).get('id'), ixreader.stored_fields(doc_num).get('title').split('|')[0]+ '/', print '\n'