def ez_connect(corpus="AAC", es_config=None):
    """
    Simplifies connecting to the Corpus

    :param corpus: "AAC", "PMC_CSC" or None (connect without a corpus filter)
    :param es_config: optional config name passed to celery_app.set_config()
    :return: corpus instance
    """
    # global MINERVA_ELASTICSEARCH_ENDPOINT
    root_dir = ""
    if corpus == "AAC":
        root_dir = getRootDir("aac")
    elif corpus == "PMC_CSC":
        root_dir = getRootDir("pmc_coresc")
    elif corpus is None:
        root_dir = ""
    else:
        raise ValueError("Unknown corpus")

    cp.useElasticCorpus()
    if es_config:
        celery_app.MINERVA_ELASTICSEARCH_ENDPOINT = celery_app.set_config(
            es_config)

    cp.Corpus.connectCorpus(root_dir,
                            endpoint=celery_app.MINERVA_ELASTICSEARCH_ENDPOINT)
    if corpus:
        cp.Corpus.setCorpusFilter(corpus)
    return cp.Corpus

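# Usage sketch for ez_connect (assumptions: this module is importable as-is,
# the endpoint configured in multi.celery_app is reachable, and "aws-server"
# is a valid config name as used elsewhere in these scripts; the slice of
# listPapers() is purely illustrative):
#
#   corpus = ez_connect(corpus="AAC", es_config="aws-server")
#   print(corpus.listPapers()[:10])
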
def import_aac_corpus(endpoint, use_celery=True):
    """
    Do the importing of the AAC corpus
    """
    importer = CorpusImporter(reader=PaperXMLReader())
    importer.collection_id = "AAC"
    importer.import_id = "initial"
    importer.generate_corpus_id = getACL_corpus_id

    options = {
        # "list_missing_references": True,  # default: False
        # "convert_and_import_docs": False,  # default: True
    }

    ## corpus_import.FILES_TO_PROCESS_FROM=10222
    ## corpus_import.FILES_TO_PROCESS_TO=500
    ## importer.restartCollectionImport(options)

    cp.Corpus.matcher = AANReferenceMatcher(
        os.path.join(getRootDir("aan"), "release" + os.sep + "acl_full.txt"))

    importer.use_celery = use_celery
    importer.importCorpus(os.path.join(getRootDir("aac"), "inputXML"),
                          file_mask="*-paper.xml",
                          import_options=options)

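# Invocation sketch for import_aac_corpus, following the pattern used in
# main() further below (assumption: "aws-server" is a valid config name for
# multi.celery_app.set_config, as used elsewhere in these scripts):
#
#   from multi.celery_app import set_config
#   endpoint = set_config("aws-server")
#   import_aac_corpus(endpoint, use_celery=False)
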
def main():
    from multi.config import MINERVA_ELASTICSEARCH_ENDPOINT

    cp.useElasticCorpus()
    root_dir = getRootDir("aac")
    cp.Corpus.connectCorpus(root_dir, endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    ## experiment["test_files"]=["456f8c80-9807-46a9-8455-cd4a7e346f9d"]
    exp = Experiment(experiment, options, False)
    exp.run()

def main():
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(getRootDir("aac"),
                            endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    # fix_sentence_splitting_in_docs(cp.Corpus.listPapers())
    fix_stranded_citations_in_docs(cp.Corpus.listPapers())

    global num_removed_sent, num_papers_removed_sent
    print("Removed {} sentences from {} papers".format(
        num_removed_sent, num_papers_removed_sent))

def find_new_citations_in_aac():
    """
    Does another run through each AAC scidoc and tries to find citations
    that may have been missed

    :return:
    """
    from multi.celery_app import MINERVA_ELASTICSEARCH_ENDPOINT
    from tqdm import tqdm

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(getRootDir("aac"),
                            endpoint=MINERVA_ELASTICSEARCH_ENDPOINT)
    cp.Corpus.setCorpusFilter("AAC")

    total_found = 0
    total_could_match = 0
    docs_with_new_ones = 0
    existing_citations = 0

    counter = tqdm(cp.Corpus.listPapers())
    for guid in counter:
        counter.set_description(
            "{} docs_with_new_ones, {} total_found, {} total_could_match, {} existing_citations, "
            .format(docs_with_new_ones, total_found, total_could_match,
                    existing_citations))
        doc = cp.Corpus.loadSciDoc(guid)
        for sent in doc.allsentences:
            existing_citations += len(sent.get("citations", []))
            new_citations, citations_found = annotatePlainTextCitationsInSentence(
                sent, doc)
            if len(citations_found) > 0:
                # print(len(new_citations), ":", new_citations)
                total_found += len(citations_found)
                total_could_match += len(new_citations)
                docs_with_new_ones += 1
                # print("\n NEW CITATION:", sent["text"])
                # print(citations_found)
                # print()
            else:
                if len(sent.get("citations", [])) > 0:
                    # print("ALREADY ANNOTATED:", sent["text"], "\n")
                    pass
        cp.Corpus.saveSciDoc(doc)

    print("Total citations found: ", total_found)
    print("Total citations could match: ", total_could_match)
    print("Docs with new citations: ", docs_with_new_ones)
    print("Previously annotated citations: ", existing_citations)

def fix_citation_parent_aac():
    """
    Renames the "parent" field of each citation to "parent_s" in every AAC SciDoc.
    """
    from proc.results_logging import ProgressIndicator

    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(getRootDir("aac"))

    guids = cp.Corpus.listPapers({"match": {"metadata.collection_id": "AAC"}})
    progress = ProgressIndicator(True, len(guids), True)

    for guid in guids:
        doc = cp.Corpus.loadSciDoc(guid)
        for cit in doc.citations:
            if "parent" in cit:
                cit["parent_s"] = cit.pop("parent")
        cp.Corpus.saveSciDoc(doc)
        progress.showProgressReport("Fixing badly imported PaperXML")

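# Illustration of the field rename performed above (assumptions: citations are
# plain dicts, and the "id" key is only here to make the example readable; the
# "parent"/"parent_s" names come from the loop body):
#
#   cit = {"id": "cit1", "parent": "sent3"}
#   if "parent" in cit:
#       cit["parent_s"] = cit.pop("parent")
#   # cit == {"id": "cit1", "parent_s": "sent3"}
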
def main():
    from multi.celery_app import set_config

    endpoint = set_config("aws-server")
    cp.useElasticCorpus()
    cp.Corpus.connectCorpus(getRootDir("aac"), endpoint=endpoint)
    cp.Corpus.createAndInitializeDatabase()

    # import_aac_corpus(endpoint, use_celery=False)
    # fix_citation_parent_aac()

    ## import corpora as cp
    ## cp.useElasticCorpus()
    ## cp.Corpus.connectCorpus("g:\\nlp\\phd\\aac")
    ## print(cp.Corpus.listPapers("year:>2010")[:100])
    pass

from db.elastic_corpus import ElasticCorpus
import db.corpora as cp
from proc.general_utils import getRootDir
from multi.config import set_config

root_dir = getRootDir("aac")

cp.useElasticCorpus()
# cp.Corpus.connectCorpus(root_dir, endpoint=GCP_ENDPOINT)
cp.Corpus.connectCorpus(root_dir, endpoint=set_config("aws-server"))
print("")

# for index in ["scidocs", "papers", "venues", "cache", "authors", "links", "missing_references"]:
# for index in ["papers", "venues", "cache", "authors", "links", "missing_references"]:
#     if cp.Corpus.es.indices.exists(index):
#         cp.Corpus.deleteIndex(index)

cp.Corpus.createAndInitializeDatabase()

# if cp.Corpus.es.indices.exists("scidocs"):
#     cp.Corpus.deleteIndex("scidocs")

# settings = {
#     "number_of_shards": 5,
#     "number_of_replicas": 1
# }
# properties = {
#     "scidoc": {"type": "string", "index": "no", "store": True, "doc_values": False},
#     "guid": {"type": "string", "index": "not_analyzed", "store": True},