def buildIndexes(self, testfiles, methods):
    """
    Build one set of per-paper retrieval indexes for every test file.

    For each GUID in [testfiles], an index writer is created for every
    index derived from [methods], and the BOWs of each of that paper's
    in-collection references (its precomputed metadata "outlinks") are
    added to those indexes.

    Args:
        testfiles: sized iterable of paper GUIDs to build indexes for.
        methods: method definitions, passed to getDictOfLuceneIndeces()
            to obtain the dict of index names to build.

    Papers whose SciDoc cannot be loaded are reported and skipped.
    """
    self.initializeIndexer()

    total = len(testfiles)
    for count, guid in enumerate(testfiles, 1):
        print("Building index: paper ",count,"/",total,":",guid)

        doc = cp.Corpus.loadSciDoc(guid)
        if not doc:
            print("Error loading SciDoc for", guid)
            continue

        indexNames = getDictOfLuceneIndeces(methods)
        fwriters = {}
        try:
            for indexName in indexNames:
                actual_dir = cp.Corpus.getRetrievalIndexPath(guid, indexName, full_corpus=False)
                fwriters[indexName] = self.createIndexWriter(actual_dir)

            # The in-collection references are taken from the precomputed
            # metadata "outlinks" rather than re-matching doc["references"],
            # so they agree with the set of resolvable citations.
            outlinks = cp.Corpus.getMetadataByGUID(guid)["outlinks"]
            for ref_guid in outlinks:
                # 9999 is passed as the maximum year for every reference.
                # TODO integrate per-index "max_year" handling into
                # addBOWsToIndex: when an index's options set max_year to
                # True, use the test paper's metadata year, else no limit.
                addBOWsToIndex(ref_guid, indexNames, 9999, fwriters)
        finally:
            # Always release the writers opened for this paper, even if
            # adding a reference failed part-way through.
            for indexName in fwriters:
                fwriters[indexName].close()
def buildGeneralIndex(self, exp, options):
    """
    Creates one index for each method and parameter, adding all files to each

    Args:
        exp: experiment dict. Reads "prebuild_general_indexes" (index
            definitions), "prebuild_bows" (to look up each index's
            "function_name" via its "bow_name") and, optionally,
            "index_max_year".
        options: dict; "max_files_to_process" caps how many papers are
            listed (default: the platform's largest native int).

    If self.use_celery is false, papers are added to the indexes in this
    process; otherwise one addToindexTask is queued per paper on the
    "add_to_index" queue.
    """
    print ("Building global index...")

    index_max_year = exp.get("index_max_year", None)
    if index_max_year is None:
        # The "%d" in the query below cannot format None. Fall back to 9999,
        # the same year buildIndexes passes to addBOWsToIndex.
        # NOTE(review): assumes 9999 means "no year limit" -- confirm.
        index_max_year = 9999

    indexNames = getDictOfLuceneIndeces(exp["prebuild_general_indexes"])
    for entry_name in indexNames:
        entry = indexNames[entry_name]
        entry["function_name"] = exp["prebuild_bows"][entry["bow_name"]]["function_name"]

    # sys.maxint only exists on Python 2; fall back to sys.maxsize elsewhere.
    unlimited = getattr(sys, "maxint", sys.maxsize)
    max_results = options.get("max_files_to_process", unlimited)

    ALL_GUIDS = cp.Corpus.listPapers("metadata.year:<=%d" % index_max_year, max_results=max_results)

    fwriters = {}
    try:
        for indexName in indexNames:
            actual_dir = cp.Corpus.getRetrievalIndexPath("ALL_GUIDS", indexName, full_corpus=True)
            fields = self.listFieldsToIndex(indexNames[indexName])
            self.createIndex(actual_dir, fields)
            fwriters[indexName] = self.createIndexWriter(actual_dir)

        print("Adding",len(ALL_GUIDS),"files:")

        if not self.use_celery:
            progress = ProgressIndicator(True, len(ALL_GUIDS), print_out=False)
            for guid in ALL_GUIDS:
                addBOWsToIndex(guid, indexNames, index_max_year, fwriters)
                progress.showProgressReport("Adding papers to index")
    finally:
        # Close the writers on every path: after local indexing, after a
        # failure, and -- in celery mode, where this process never writes --
        # before any task is queued, so they are not left open here.
        for indexName in fwriters:
            fwriters[indexName].close()

    if self.use_celery:
        print("Queueing up files for import...")
        for guid in ALL_GUIDS:
            addToindexTask.apply_async(
                args=[
                    guid,
                    indexNames,
                    index_max_year,
                ],
                queue="add_to_index",
            )