def get_scores(self, fav_doc):
    """
    Identify important terms from a liked tweet, determine similarity
    scores of these terms against all other tweets and return a dict
    of all scores.
    """
    bm25_scores = {}
    doc_vector = self.doc_vectors[fav_doc]
    # get the 3 most significant terms from the favorite tweet,
    # then find similar tweets via BM25
    top_terms = dict(sorted(doc_vector.vector.iteritems(),
                            key=itemgetter(1), reverse=True)[:3])
    for key, value in top_terms.iteritems():
        # reverse lookup: map the term id back to the actual term string
        actual_term = doc_vector.terms_dict.keys()[doc_vector.terms_dict.values().index(key)]
        # set up and run query against index
        query = self.queryparser.parse(actual_term)
        collector = TopScoreDocCollector.create(6, True)
        self.searcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
        for hit in hits:
            # skip if this is the favorite tweet being examined
            if hit.doc == fav_doc:
                continue
            if hit.doc not in bm25_scores:
                bm25_scores[hit.doc] = 0.0
            # the final score is weighted by the top term's significance
            bm25_scores[hit.doc] += value * hit.score
    return bm25_scores
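A minimal usage sketch; `recommender`, the favourite-tweet id, and the top-5 cut are all illustrative, only get_scores above is from the source:

    scores = recommender.get_scores(fav_doc=42)
    # rank candidate tweets by weighted BM25 score, highest first
    for doc_id, score in sorted(scores.items(), key=itemgetter(1), reverse=True)[:5]:
        print("%d %.3f" % (doc_id, score))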
def populate_frame(self, date_range, term_vector) -> pd.DataFrame:
    data_frame = pd.DataFrame(data=0, index=date_range, columns=term_vector)
    iterator = self.lucene_dictionary.getEntryIterator()
    for term in BytesRefIterator.cast_(iterator):
        term_as_string = term.utf8ToString()
        query = QueryParser("contents", self.analyzer).parse(term_as_string)
        # search directly; the searcher returns a TopDocs, never None
        hits = self.searcher.search(query, 1000)
        if not hits.scoreDocs:
            # no hit for this term
            continue
        print("Found hit: " + term_as_string)
        for hit in hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            doc_name = document.getField("doc_name")
            date = datetime.datetime.strptime(doc_name.stringValue(), '%m%d%y')
            current_value = data_frame.at[date, term_as_string]
            if np.isnan(current_value):
                current_value = 0
            # count one occurrence of this term on this date
            data_frame.at[date, term_as_string] = current_value + 1
    return data_frame
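A sketch of driving populate_frame, assuming documents were indexed with a doc_name field in the %m%d%y format the method parses; index_helper and the term list are hypothetical:

    date_range = pd.date_range(start='2017-01-01', end='2017-03-31', freq='D')
    terms = ['earnings', 'merger', 'dividend']  # illustrative term vector
    frame = index_helper.populate_frame(date_range, terms)
    print(frame.sum())  # per-term hit counts over the whole range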
def search(self, q, page=1, duplicates=False):
    query = self.parser.parse(q)
    if not duplicates:
        query = self.addDuplicatesQuery(query)

    perPage = 10
    start = (page - 1) * perPage

    results = TopScoreDocCollector.create(1000, True)
    self.searcher.search(query, results)

    highlighter = Highlighter(QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(40))

    docs = []
    for scoreDoc in results.topDocs(start, perPage).scoreDocs:
        doc = self.searcher.doc(scoreDoc.doc)
        tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
        highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
        docs.append({
            'title': doc['title'],
            'url': doc['url'],
            'duplicate': doc['duplicate'],
            'highlight': highlight,
        })

    del self.searcher

    totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))
    return totalPages, docs
def searchWithRequestAndQuery(cls, query, indexReader, taxoReader,
                              indexingParams, facetRequest):
    """
    Search an index with facets for a given query and facet requests.
    Returns a List<FacetResult>.
    """
    # prepare searcher to search against
    searcher = IndexSearcher(indexReader)

    # collect matching documents into a collector
    topDocsCollector = TopScoreDocCollector.create(10, True)
    if not indexingParams:
        indexingParams = FacetIndexingParams.DEFAULT

    # Faceted search parameters indicate which facets we are interested in;
    # add the facet request of interest to the search params
    facetRequests = Arrays.asList([facetRequest, ])
    facetSearchParams = FacetSearchParams(indexingParams, facetRequests)

    # ... and create a FacetsCollector to use in our faceted search
    facetsCollector = FacetsCollector.create(facetSearchParams, indexReader, taxoReader)

    # perform documents search and facets accumulation
    searcher.search(query, MultiCollector.wrap([topDocsCollector, facetsCollector]))
    print "\nFound %d Documents for query=%s" % (topDocsCollector.totalHits,
                                                 query.toString().encode('utf-8'))

    # obtain facet results and print them
    res = facetsCollector.getFacetResults()
    i = 0
    for facetResult in res:
        print "Result #%d has %d descendants" % (i, facetResult.getNumValidDescendants())
        print "Result #%d : %s" % (i, facetResult)
        i += 1

    return res
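A hedged call sketch for the old Lucene 4.x facet API used above; CountFacetRequest and CategoryPath belong to that same (long since replaced) API, and FacetExample, the readers, and the query are assumptions here:

    facet_request = CountFacetRequest(CategoryPath(['Author']), 10)
    results = FacetExample.searchWithRequestAndQuery(
        TermQuery(Term('contents', 'lucene')),
        indexReader, taxoReader, None, facet_request)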
def runQuery(self, structured_query, parameters, test_guid, max_results=MAX_RESULTS_RECALL):
    """
    Run the query; return a list of (score, metadata) tuples for the top docs.
    """
    if not structured_query or len(structured_query) == 0:
        return []

    self.last_query = structured_query
    query_text = self.rewriteQuery(structured_query["structured_query"], parameters, test_guid)

    try:
        query = self.query_parser(LuceneVersion.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
    except lucene.JavaError:
        # this deals with the "too many boolean clauses" exception
        print("Lucene exception:", sys.exc_info()[:2])
        print("Query:", query_text)
        return []

    structured_query["lucene_query"] = query_text

    if self.useExplainQuery:
        # TODO remove this completely, use DisjunctionMax
        # this should only exist until the lucene bulkScorer gives the same results
        hits = self.runQueryViaExplain(query, max_results)
    else:
        collector = TopScoreDocCollector.create(max_results, True)
        self.searcher.search(query, collector)
        hits = collector.topDocs().scoreDocs

    res = []

    # explain the query
    if self.logger:
        self.logger.logReport(query_text + "\n")
        if self.logger.full_citation_id in self.logger.citations_extra_info:
            max_explanations = len(hits)
        else:
            max_explanations = 1
        for index in range(max_explanations):
            # explain() expects a document id, not a rank position
            self.logger.logReport(self.searcher.explain(query, hits[index].doc))

    for hit in hits:
        doc = self.searcher.doc(hit.doc)
        metadata = json.loads(doc.get("metadata"))
        res.append((hit.score, metadata))

    if self.logger and self.logger.full_citation_id in self.logger.citations_extra_info:
        print(query_text, "\n", hits, "\n", res, "\n")

    del hits
    del query
    del query_text
    return res
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    Multifield: a different query string for each field, not the same
    words on different fields.
    :param q_string: query text searched against the 'text' field
    :param q_class: class label searched against the 'corpus_name' field
    :param feature_type: list of aggregation functions over the score list
    :param use_BM25: switch the searcher to BM25 similarity
    :return: retrieval scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """return sorted document+score by score"""
        def doc_score(hists):
            """yield the score of each hit
            (docID/doc/file_name/doc_name/text lookups were commented out; only the score is used)"""
            for h in hists:
                yield h.score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type)

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur: MUST means the clause must match,
    # SHOULD means it should (but need not) match
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval scores for each question-answer pair
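feature_type is a list of aggregation callables applied to the hit-score list, so a call might look like this (the globals version, analyzer, set_lucene_index, and hitsPerPage are assumed to be initialised elsewhere; the query strings are illustrative):

    features = lucene_retrieval_multifield('what causes ocean tides', 'physics',
                                           feature_type=[max, min, sum, len],
                                           use_BM25=True)

Note that, unlike the single-field variant below, this version does not guard against an empty hit list, so aggregators such as max would raise on a query with no hits.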
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string: query text searched against the 'text' field
    :param feature_type: list of aggregation functions over the score list
    :param use_BM25: switch the searcher to BM25 similarity
    :return: retrieval scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """return sorted document+score by score"""
        def doc_score(hists):
            """yield the score of each hit
            (docID/doc/file_name/doc_name/text lookups were commented out; only the score is used)"""
            for h in hists:
                yield h.score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions; guard against an empty hit list
        return map(lambda f: f(doc_score_list), feature_type) \
            if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via the escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hits
    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval scores for each question-answer pair
def doc_search(self, field, keywords, numHits):
    # build the query: a single-field parse, or a multi-field parse over
    # Title and Body when no specific field is requested
    if field != 'All':
        analyzer = StandardAnalyzer()
        parser = QueryParser(field, analyzer)
        query = parser.parse(keywords)
    else:
        analyzer = WhitespaceAnalyzer()
        parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
        query = MultiFieldQueryParser.parse(parser, keywords)

    hits = []
    try:
        collector = TopScoreDocCollector.create(numHits)
        self.lSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
    except RuntimeError:
        print "Scoring documents failed"

    self.hits = hits
    self.field = field
    return hits
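Example call against the (hypothetical) wrapper object that owns lSearcher, fetching the 50 best title matches:

    hits = wrapper.doc_search('Title', 'python lucene', 50)
    for hit in hits[:5]:
        print("%d %.3f" % (hit.doc, hit.score))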
def retrieve(self, query, field, hitsPerPage):
    # build query
    q_lucene = QueryParser(field, self.analyzer).parse(query)
    # run search
    collector = TopScoreDocCollector.create(hitsPerPage)
    self.searcher.search(q_lucene, collector)
    hits = collector.topDocs().scoreDocs
    # return (document, doc id) pairs for every hit
    return [(self.searcher.doc(hits[j].doc), hits[j].doc) for j in range(len(hits))]
def doc_search(self, keywords):
    analyzer = StandardAnalyzer()
    parser = QueryParser('Title', analyzer)
    query = parser.parse(keywords)

    hits = []
    try:
        collector = TopScoreDocCollector.create(3000)
        self.lSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
    except RuntimeError:
        print "Scoring documents failed"

    self.hits = hits
    return hits
def lucene_retrieval(q_string, use_BM25=False):
    """
    :param q_string: query text searched against the 'text' field
    :param use_BM25: switch the searcher to BM25 similarity
    :return: text of the retrieved document for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """return the text of the last hit
        (file_name/doc_name/score lookups were commented out; only the text is used)"""
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            text = doc.get("text")
        return text

    result = '_NONE_'
    # escape special characters via the escape function; skip empty strings
    # (pre-processing answers such as `none of the above` -> '' would error here)
    if q_string and q_string.strip():
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)
        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hits
        result = doc_text(hs)
        # reader.close()
    return result  # text of the retrieved document
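Usage sketch for this text-returning variant (same assumed globals as the other retrieval helpers); empty or whitespace-only strings fall through to the '_NONE_' sentinel:

    text = lucene_retrieval('photosynthesis produces oxygen', use_BM25=True)
    if text != '_NONE_':
        print(text[:200])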
def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
    """
    LOTS OF SWEET LUCENE
    """
    if not structured_query or len(structured_query) == 0:
        return []

    self.last_query = structured_query
    query_text = self.rewriteQuery(structured_query["structured_query"], ["text"])

    try:
        query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
    except lucene.JavaError:
        print("Lucene exception:", sys.exc_info()[:2])
        return None

    structured_query["lucene_query"] = query_text

    if self.useExplainQuery:
        # this should only exist until the lucene bulkScorer gives the same results
        hits = self.runQueryViaExplain(query, max_results)
    else:
        collector = TopScoreDocCollector.create(max_results, True)
        self.searcher.search(query, collector)
        hits = collector.topDocs().scoreDocs

    res = []
    for hit in hits:
        doc = self.searcher.doc(hit.doc)
        metadata = json.loads(doc.get("metadata"))
        res.append((hit.score, metadata))
    return res
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index into search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    # parser = MultiFieldQueryParser(fields, analyzer)
    # parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)

    rec_result = open('pylucene.runs', 'w')
    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        # q_lucene = MultiFieldQueryParser.parse(parser, querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        # queryObj = Query_Object(query, mongoObj, w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st-round filter
        candidates = PriorityQueue()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            # score = computeScore(queryObj, entityObj, mongoObj, w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from the priority queue, larger scores first
        rank = 0
        while candidates.empty() == False and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index into hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(rank) \
                + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
def _topCollector(self, start, stop, sortKeys):
    if stop <= start:
        # counting only, no documents needed
        return TotalHitCountSuperCollector() if self._multithreaded else TotalHitCountCollector()

    trackDocScores = True
    trackMaxScore = False
    docsScoredInOrder = True

    if sortKeys:
        sortFields = [
            self._sortField(fieldname=sortKey['sortBy'], sortDescending=sortKey['sortDescending'])
            for sortKey in sortKeys
        ]
        sort = Sort(sortFields)
    else:
        # relevance ranking
        return TopScoreDocSuperCollector(stop, docsScoredInOrder) if self._multithreaded \
            else TopScoreDocCollector.create(stop, docsScoredInOrder)

    if self._multithreaded:
        return TopFieldSuperCollector(sort, stop, trackDocScores, trackMaxScore, docsScoredInOrder)
    else:
        fillFields = False  # always True for multi-threading/sharding
        return TopFieldCollector.create(sort, stop, fillFields, trackDocScores,
                                        trackMaxScore, docsScoredInOrder)
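How the three collector choices play out, assuming a Meresco-style index object with _multithreaded set; the sortKeys dicts carry the sortBy/sortDescending keys read above:

    # counting only: stop <= start -> TotalHitCount(Super)Collector
    counter = index._topCollector(0, 0, sortKeys=None)
    # relevance-ranked page: no sort keys -> TopScoreDoc(Super)Collector
    ranked = index._topCollector(0, 10, sortKeys=None)
    # field-sorted page -> TopField(Super)Collector
    by_date = index._topCollector(0, 10, sortKeys=[{'sortBy': 'date', 'sortDescending': True}])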
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index into search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm, config)

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # search
    docDup = set()
    finalDup = {}
    for i in xrange(len(queries)):
        print 'process query %d' % (i)
        query = queries[i]
        querystr = stemSentence(query[3])

        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # find candidate results after 1st-round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            if d['title'] in docDup:
                finalDup[d['title']] = d
                continue
            docDup.add(d['title'])

        docDup.clear()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher1.doc(docID)
            title = d['title']
            if title in docDup:
                continue
            docDup.add(title)
            item = (mongoObj.conn_me).find_one({'title': title})
            if item is None:
                continue
            entitylist = item['entitylist'].split('|')
            for en_title in entitylist:
                if title == en_title:
                    continue
                t = Term('title', en_title)
                q = TermQuery(t)
                docs = searcher2.search(q, 2)
                if docs.totalHits <= 1:
                    continue
                docID2 = (docs.scoreDocs)[0].doc
                doc = searcher2.doc(docID2)
                finalDup[doc['title']] = doc

    print 'begin to clean index, there are %d dup records' % (len(finalDup))
    for title in finalDup:
        doc = finalDup[title]
        # title, name, value, category, skos_category, all_text, raw_name, raw_value, abstract
        name = doc['name']
        value = doc['value']
        category = doc['category']
        skos_category = doc['skos_category']
        all_text = doc['all_text']
        raw_name = doc['raw_name']
        raw_value = doc['raw_value']
        abstract = doc['abstract']
        print 'process ' + title
        t = Term('title', title)
        q = TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w, title, name, value, category, skos_category, all_text,
               raw_name, raw_value, abstract)

    # process remaining records
    # global batch, cnt_batch
    # if cnt_batch > 0:
    #     w.addDocuments(batch)
    #     cnt_batch = 0
    #     del batch[:]
    w.close()