def loadAZLabels(annot_dir=""): """ Loads generated AZ labels from AZPrime output """ if annot_dir=="": annot_dir=cp.Corpus.paths.output papers=cp.Corpus.listPapers() print("Loading AZPrime labels...") progress=ProgressIndicator(True, len(papers),False) for guid in papers: filename=os.path.join(annot_dir, guid+".pred.txt") if os.path.exists(filename): doc=cp.Corpus.loadSciDoc(guid) f=file(filename, "r") lines=f.readlines() allsentences=[s for s in doc.allsentences if s.get("type","") == "s"] if len(lines) != len(allsentences): print("Number of tags mismatch! %d != %d -- %s" % (len(lines), len(allsentences), guid)) lines=["" for n in range(len(allsentences))] ## else: ## print("No mismatch! %d != %d -- %s" % (len(lines), len(doc.allsentences), guid)) for index,sent in enumerate(allsentences): sent["az"]=lines[index].strip() cp.Corpus.saveSciDoc(doc) else: print("Cannot find annotation file for guid %s" % guid) progress.showProgressReport("Loading labels -- %s" % guid)
def ownAZannot(export_annots=False):
    """ Annotates each sentence using own classifier """
    from minerva.az.az_cfc_classification import AZannotator

    annot=AZannotator("trained_az_classifier.pickle")

    papers=cp.Corpus.listPapers(max_results=sys.maxint)
    writer=AZPrimeWriter()
    writer.save_pos_tags=True
##    papers=papers[:1]
    progress=ProgressIndicator(True, len(papers), False)

    print("Producing annotations for SciDocs...")
    for guid in papers:
        doc=cp.Corpus.loadSciDoc(guid)
        annot.annotateDoc(doc)

        if export_annots:
            output_filename=os.path.join(cp.Corpus.paths.output, doc.metadata["guid"]+".annot.txt")
            output_file=open(output_filename, "w")
            for sentence in doc.allsentences:
                output_file.write(sentence.get("az","")+"\n")
            output_file.close()
        else:
            cp.Corpus.saveSciDoc(doc)

        progress.showProgressReport("Annotating -- %s" % guid)
def prebuildBOWsForTests(self, exp, options):
    """
        Generates BOWs for each document from its inlinks, stores them in a
        corpus cached file

        :param exp: experiment dict, including the prebuild_bows methods and their parameters
        :param options: options dict; "max_files_to_process" limits the number
            of files to process (simple parameter for debug) and
            "overwrite_existing_bows" controls whether BOWs are rebuilt even if
            they already exist
    """
    self.exp=exp
    self.options=options
    maxfiles=options.get("max_files_to_process",sys.maxint)

    if len(self.exp.get("rhetorical_annotations",[])) > 0:
        print("Loading AZ/CFC classifiers")
        cp.Corpus.loadAnnotators()

    print("Prebuilding BOWs for", min(len(cp.Corpus.ALL_FILES),maxfiles), "files...")
    numfiles=min(len(cp.Corpus.ALL_FILES),maxfiles)

    if self.use_celery:
        print("Queueing tasks...")
        tasks=[]
        for guid in cp.Corpus.ALL_FILES[:maxfiles]:
            for method_name in self.exp["prebuild_bows"]:
                run_annotators=self.exp.get("rhetorical_annotations",[]) if self.exp.get("run_rhetorical_annotators",False) else []
                tasks.append(prebuildBOWTask.apply_async(
                    args=[
                        method_name,
                        self.exp["prebuild_bows"][method_name]["parameters"],
                        self.exp["prebuild_bows"][method_name]["function_name"],
                        guid,
                        self.options["overwrite_existing_bows"],
                        run_annotators],
                    queue="prebuild_bows"))
    else:
        progress=ProgressIndicator(True, numfiles, False)
        for guid in cp.Corpus.ALL_FILES[:maxfiles]:
            for method_name in self.exp["prebuild_bows"]:
                run_annotators=self.exp.get("rhetorical_annotations",[]) if self.exp.get("run_rhetorical_annotators",False) else []
                prebuildMulti(
                    method_name,
                    self.exp["prebuild_bows"][method_name]["parameters"],
                    self.exp["prebuild_bows"][method_name]["function"],
                    None,
                    None,
                    guid,
                    self.options["overwrite_existing_bows"],
                    run_annotators
                )
            progress.showProgressReport("Building BOWs")
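# Hedged sketch of the inputs prebuildBOWsForTests() reads (only the keys used
# above are shown; the method name "az_annotated", its parameters, the prebuild
# function and the "pipeline" instance are hypothetical):
#
#   exp={
#       "prebuild_bows":{
#           "az_annotated":{"parameters":[1],
#                           "function":prebuildAZBOW,
#                           "function_name":"prebuildAZBOW"},
#       },
#       "rhetorical_annotations":["az"],
#       "run_rhetorical_annotators":True,
#   }
#   options={"max_files_to_process":100, "overwrite_existing_bows":False}
#   pipeline.prebuildBOWsForTests(exp, options)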
def aggregate_statistics(conditions=None, max_files=sys.maxint):
    """ Aggregates all counts from all documents in the collection """
    res = {
        "csc_type_counts": {},
        "az_counts": {},
        "num_sentences": [],
        "num_sections": [],
        "num_paragraphs": [],
        "per_zone_citations": {},
        "num_files": 0,
    }

    print("Listing files...")
    papers = cp.Corpus.listRecords(conditions, max_results=max_files, table="papers", field="_id")
    print("Aggregating statistics for %d SciDocs" % len(papers))
    progress = ProgressIndicator(True, len(papers), print_out=False)

    num_files = 0
    for guid in papers:
##        try:
##            stats=cp.Corpus.getStatistics(guid)
##        except:
        computeAnnotationStatistics(guid)
        try:
            stats = cp.Corpus.getStatistics(guid)
        except:
            continue

        for key in ["csc_type_counts", "az_counts", "per_zone_citations"]:
            for key2 in stats[key]:
                res[key][key2] = res[key].get(key2, 0) + stats[key][key2]

        for key in ["num_sentences", "num_sections", "num_paragraphs"]:
            res[key].append(stats[key])

        num_files += 1
        progress.showProgressReport("Aggregating statistics -- latest paper " + guid)

    if num_files == 0:
        print("No files found in db!")
        return

    for key in ["num_sentences", "num_sections", "num_paragraphs"]:
        res[key.replace("num", "avg")] = sum(res[key]) / float(num_files)

    res["num_files"] = num_files
    with open(os.path.join(cp.Corpus.paths.output, "stats.json"), "w") as f:
        json.dump(res, f)
def add_statistics_to_all_files(use_celery=False, conditions=None, max_files=sys.maxint):
    """ For each paper in the corpus, it computes and stores its statistics """
    print("Listing files...")
    papers = cp.Corpus.listPapers(conditions, max_results=max_files)
##    papers=cp.Corpus.listRecords(conditions, max_results=max_files, field="_id", table="papers")
    print("Computing statistics for %d SciDocs" % len(papers))
    progress = ProgressIndicator(True, len(papers), print_out=False)

    for guid in papers:
        if use_celery:
            computeAnnotationStatisticsTask.apply_async(args=[guid],
                                                        kwargs={},
                                                        queue="compute_statistics")
        else:
            computeAnnotationStatistics(guid)
        progress.showProgressReport("Computing statistics -- latest paper " + guid)
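# Minimal usage sketch (assumes the corpus has already been connected, e.g. via
# cp.Corpus.connectCorpus(); the condition string is hypothetical):
#
#   add_statistics_to_all_files(use_celery=False,
#                               conditions="metadata.collection_id:\"AAC\"")
#   aggregate_statistics(conditions="metadata.collection_id:\"AAC\"")
#
# This first stores per-document statistics, then writes the aggregated counts
# to stats.json in cp.Corpus.paths.output.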
def fix_citation_parent_aac():
    """ Renames the "parent" key of every citation to "parent_s" for all papers
        in the AAC collection, fixing badly imported PaperXML """
    from minerva.proc.results_logging import ProgressIndicator

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac")

    guids=cp.Corpus.listPapers("metadata.collection_id:\"AAC\"")
    progress=ProgressIndicator(True, len(guids), True)

    for guid in guids:
        doc=cp.Corpus.loadSciDoc(guid)
        for cit in doc.citations:
            if "parent" in cit:
                cit["parent_s"]=cit.pop("parent")
        cp.Corpus.saveSciDoc(doc)
        progress.showProgressReport("Fixing badly imported PaperXML")
def fix_authors_full_corpus():
    """ Removes the redundant "papers" key that authors in each metadata entry
        have but shouldn't """
    from minerva.proc.results_logging import ProgressIndicator

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus("g:\\nlp\\phd\\pmc_coresc")

    guids=cp.Corpus.listPapers()
    progress=ProgressIndicator(True, len(guids), True)

    for guid in guids:
        doc_meta=cp.Corpus.getMetadataByGUID(guid)
        for old_author in doc_meta.authors:
            old_author.pop("papers", None)
        cp.Corpus.updatePaper(doc_meta)
        progress.showProgressReport("Removing redundant author information")
def convertAllFilesAndAddToDB(self, ALL_INPUT_FILES, inputdir, import_options):
    """
        Loads each XML file, saves it as a SciDoc JSON file, adds its metadata
        to the database
    """
    progress=ProgressIndicator(True, self.num_files_to_process, dot_every_xitems=20)
    tasks=[]

    for fn in ALL_INPUT_FILES[FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO]:
        corpus_id=self.generate_corpus_id(fn)
        match=cp.Corpus.getMetadataByField("metadata.filename", os.path.basename(fn))
        if not match or import_options.get("reload_xml_if_doc_in_collection", False):
            if self.use_celery:
                match_id=match["guid"] if match else None
                tasks.append(importXMLTask.apply_async(
                    args=[
                        os.path.join(inputdir,fn),
                        corpus_id,
                        self.import_id,
                        self.collection_id,
                        import_options,
                        match_id
                    ],
                    queue="import_xml"
                ))
            else:
                # main loop over all files
                if not match:
                    try:
                        doc=convertXMLAndAddToCorpus(
                            os.path.join(inputdir,fn),
                            corpus_id,
                            self.import_id,
                            self.collection_id,
                            import_options
                        )
                    except ValueError:
                        logging.exception("ERROR: Couldn't convert %s" % fn)
                        continue

                progress.showProgressReport("Importing -- latest file %s" % fn)
def exportSciXML():
    """
        Exports all scidocs with the selected collection_id to AZPrime XML in
        the output dir of the corpus
    """
    papers=cp.Corpus.listPapers(max_results=sys.maxint)
    writer=AZPrimeWriter()
    writer.save_pos_tags=True
##    papers=papers[3894:]
    progress=ProgressIndicator(True, len(papers), False)

    print("Exporting SciXML files")
    for guid in papers:
        doc=cp.Corpus.loadSciDoc(guid)
        if len(doc.allsentences) < 1:
            continue
        writer.write(doc, os.path.join(cp.Corpus.paths.output, doc.metadata["guid"]+".pos.xml"))
        cp.Corpus.saveSciDoc(doc)
        progress.showProgressReport("Exporting -- %s" % guid)
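# Presumed round-trip with the external AZPrime tagger (its invocation is
# outside this module and not shown here): exportSciXML() writes one
# "<guid>.pos.xml" file per SciDoc into cp.Corpus.paths.output, AZPrime is
# expected to produce a matching "<guid>.pred.txt" with one label per sentence,
# and loadAZLabels() above reads those predictions back into the SciDocs.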
def listAllFiles(self, start_dir, file_mask):
    """ Creates an ALL_FILES list with relative paths from the start_dir """
    ALL_FILES=[]
    from minerva.proc.results_logging import ProgressIndicator
    progress=ProgressIndicator(True, 25000, False)

    for dirpath, dirnames, filenames in os.walk(start_dir):
        for filename in filenames:
            if fnmatch.fnmatch(filename, file_mask) and filename not in cp.Corpus.FILES_TO_IGNORE:
                fn=os.path.join(dirpath, filename)
                fn=fn.replace(start_dir,"")
                ALL_FILES.append(fn)
                progress.showProgressReport("listing")

    print("Total files:",len(ALL_FILES))
    return ALL_FILES
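# Usage sketch (the instance name and path are hypothetical): the returned
# paths are relative to start_dir, so they can be passed straight to
# convertAllFilesAndAddToDB().
#
#   all_input_files=importer.listAllFiles("g:\\nlp\\phd\\aac\\inputXML\\", "*.xml")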
def buildGeneralIndex(self, exp, options):
    """
        Creates one index for each method and parameter, adding all files to each
    """
    print("Building global index...")
    fwriters={}

    index_max_year=exp.get("index_max_year", None)

    indexNames=getDictOfLuceneIndeces(exp["prebuild_general_indexes"])
    for entry_name in indexNames:
        entry=indexNames[entry_name]
        entry["function_name"]=exp["prebuild_bows"][entry["bow_name"]]["function_name"]

    max_results=options.get("max_files_to_process", sys.maxint)
    ALL_GUIDS=cp.Corpus.listPapers("metadata.year:<=%d" % index_max_year, max_results=max_results)

    for indexName in indexNames:
        actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_GUIDS", indexName, full_corpus=True)
        fields=self.listFieldsToIndex(indexNames[indexName])
        self.createIndex(actual_dir, fields)
        fwriters[indexName]=self.createIndexWriter(actual_dir)

    print("Adding", len(ALL_GUIDS), "files:")

    if not self.use_celery:
##        widgets = ['Adding file: ', SimpleProgress(), ' ', Bar(), ' ', ETA()]
##        progress = ProgressBar(widgets=widgets, maxval=100).start()
        progress=ProgressIndicator(True, len(ALL_GUIDS), print_out=False)
        for guid in ALL_GUIDS:
            addBOWsToIndex(guid, indexNames, index_max_year, fwriters)
            progress.showProgressReport("Adding papers to index")
        for fwriter in fwriters:
            fwriters[fwriter].close()
    else:
        print("Queueing up files for import...")
        for guid in ALL_GUIDS:
            addToindexTask.apply_async(args=[
                guid,
                indexNames,
                index_max_year,
                ],
                queue="add_to_index")
def precomputeQueries(self, exp):
    """
        Precompute all queries for all annotated citation contexts

        :param exp: experiment dict with all options
        :type exp: dict
    """
    self.exp=exp
    print("Precomputing queries...")
    logger=ProgressIndicator(True, numitems=len(exp["test_files"])) # init all the logging/counting
    logger.numchunks=exp.get("numchunks",10)

    cp.Corpus.loadAnnotators()

    # convert nested dict to flat dict where each method includes its parameters in the name
    self.all_doc_methods=getDictOfTestingMethods(exp["doc_methods"])

    self.precomputed_queries=[]
    self.files_dict=OrderedDict()

##    if exp["full_corpus"]:
##        files_dict["ALL_FILES"]={}
##        files_dict["ALL_FILES"]["doc_methods"]=all_doc_methods
##        files_dict["ALL_FILES"]["tfidf_models"]=[]
##        for method in all_doc_methods:
##            actual_dir=cp.Corpus.getRetrievalIndexPath("ALL_FILES",all_doc_methods[method]["index_filename"],exp["full_corpus"])
##            files_dict["ALL_FILES"]["tfidf_models"].append({"method":method,"actual_dir":actual_dir})

    #===================================
    #   MAIN LOOP over all testing files
    #===================================
    for guid in exp["test_files"]:
        try:
            self.processOneFile(guid)
        except ValueError:
            print("Can't load SciDoc ", guid)
            continue

        logger.showProgressReport(guid) # prints out info on how it's going

    self.saveAllQueries()
    print("Precomputed queries saved.")
def updateInCollectionReferences(self, ALL_GUIDS, import_options={}):
    """
        For every guid, it matches its in-collection references, and its
        resolvable citations

        Args:
            ALL_GUIDS: list of guids
    """
    print("Finding resolvable references, populating database...")
    progress=ProgressIndicator(True, len(ALL_GUIDS), dot_every_xitems=100)
    tasks=[]

    for doc_id in ALL_GUIDS[FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO]:
        if self.use_celery:
            tasks.append(updateReferencesTask.apply_async(
                args=[doc_id, import_options],
                kwargs={},
                queue="update_references"
            ))
        else:
            doc_meta=updatePaperInCollectionReferences(doc_id, import_options)
            filename=doc_meta["filename"] if doc_meta else "<ERROR>"
            progress.showProgressReport("Updating references -- latest paper "+filename)
def reloadSciDocsOnly(self, conditions, inputdir, file_mask):
    """
        Iterates through the papers already in the collection given the
        condition. Tries to load their scidoc. If KeyError occurs, it loads
        the XML again
    """
##    filenames=cp.Corpus.SQLQuery("SELECT guid,metadata.filename FROM papers where %s limit 10000" % conditions)
    in_collection=[item["_source"] for item in cp.Corpus.unlimitedQuery(
        index="papers",
        doc_type="paper",
        _source=["metadata.corpus_id","metadata.filename","guid"],
        q=conditions
    )]

    print("Fixing broken SciDocs")
    print("Listing all loaded papers...")
    ALL_INPUT_FILES=self.loadListOrListAllFiles(inputdir, file_mask)

    files_to_process=[]
    files_hash={}
    for input_file in ALL_INPUT_FILES:
        corpus_id=self.generate_corpus_id(input_file)
        files_hash[corpus_id]=input_file

    print("Iterating over all papers trying to load them...")
    tasks=[]
    import_options={"reload_xml_if_doc_in_collection": True,}
    progress=ProgressIndicator(True, len(in_collection))

    for item in in_collection:
        corpus_id=self.generate_corpus_id(item["metadata"]["filename"])
        assert corpus_id==item["metadata"]["corpus_id"]

        try:
            doc=cp.Corpus.loadSciDoc(item["guid"])
        except KeyError:
            print("File %s is broken" % item["guid"])
            if self.use_celery:
                tasks.append(importXMLTask.apply_async(
                    args=[
                        os.path.join(cp.Corpus.paths.inputXML, files_hash[corpus_id]),
                        corpus_id,
                        self.import_id,
                        self.collection_id,
                        import_options
                    ],
                    kwargs={"existing_guid":item["guid"]},
                    queue="import_xml"
                ))
            else:
                files_to_process.append([files_hash[corpus_id], item["guid"]])

        progress.showProgressReport("Checking papers")

    if self.use_celery:
        return

    print("Processing all %s broken files..." % len(files_to_process))
    progress=ProgressIndicator(True, len(files_to_process))
    for fn in files_to_process:
        corpus_id=self.generate_corpus_id(fn[0])
        try:
            doc=convertXMLAndAddToCorpus(
                os.path.join(cp.Corpus.paths.inputXML, fn[0]),
                corpus_id,
                self.import_id,
                self.collection_id,
                import_options,
                existing_guid=fn[1],
            )
        except ValueError:
            logging.exception("ERROR: Couldn't convert %s" % fn)
            continue

        progress.showProgressReport("Importing -- latest file %s" % fn)