def matchReference(self, ref, doc=None):
    """
    Return the matching document metadata in the db for a reference,
    matching by known unique identifiers first, then falling back to
    the normalized title plus any shared author surname.

    :param ref: reference dict
    :param doc: SciDoc (optional). Only here to enable descendant classes to use
    :returns: metadata dict of the matching document, or None if no match
    """
    self.corpus.checkConnectedToDB()

    # Try matching by a unique identifier first: (key in ref, field in db).
    for ref_key, db_field in [("doi", "doi"),
                              ("pmid", "corpus_id"),
                              ("corpus_id", "corpus_id"),
                              ("guid", "guid")]:
        if ref.get(ref_key, "") not in ["", None]:
            doc_meta = self.corpus.getMetadataByField(db_field, ref[ref_key])
            if doc_meta:
                return doc_meta

    # If it can't be matched by id, fall back to title + surname matching.
    norm_title = normalizeTitle(ref["title"])
    if not isinstance(norm_title, unicode):
        norm_title = unicode(norm_title, errors="ignore")

    rows = self.corpus.listFieldByField("metadata", "norm_title", norm_title)
    for doc_meta in rows:
        # Essentially, if ANY surname matches, accept the title match.
        # .get() guards against records without a "surnames" key.
        for a1 in doc_meta.get("surnames", []):
            for a2 in ref.get("surnames", []):
                if a1 and a2 and a1.lower() == a2.lower():
                    return doc_meta
    return None
def loadPaperMetadata(self, newDocument, soup, filename):
    """
    Try to recover metadata for a paper from its XML header and from a
    companion BibTeXML file, storing everything in newDocument's metadata.

    :param newDocument: SciDoc to fill in
    :param soup: parsed XML of the paper
    :param filename: path of the "*-paper.xml" file; the BibTeXML metadata
        file name is derived from it
    """
    # NOTE(review): if soup has no <firstpageheader>, header is None and the
    # findChildren("author") call below will fail — confirm inputs always
    # contain a header.
    header = soup.find("firstpageheader")
    if header:
        title = header.find("title")
        if title:
            newDocument.metadata["title"] = title.text

    # Companion metadata file: "<name>-paper.xml" -> "<name>.xml"
    path, fname = os.path.split(filename)
    metafilename = re.sub(r"(.*)-paper.xml", r"\1.xml", fname, flags=re.IGNORECASE)
    metafilename = os.path.join(path, metafilename)

    self.bibtex_parser = BibTeXMLParser()
    # Initialize before the try: the original only set bib_data in the bare
    # except branch, so a BibliographyDataError left it unbound and the
    # "if bib_data:" test below raised NameError.
    bib_data = None
    try:
        bib_data = self.bibtex_parser.parse_file(metafilename)
    except BibliographyDataError as e:
        print(e)
    except:
        print("COULDN'T LOAD BIBTEXML FOR ", metafilename)

    if bib_data:
        # Copy every field of the first bibliography entry, normalizing
        # en-dashes (u+2013) to plain hyphens.
        entry = bib_data.entries[bib_data.entries.keys()[0]]
        for field in entry.fields:
            newDocument.metadata[field] = entry.fields[field].replace(u"\u2013", u"-")

    authors = []
    for a in header.findChildren("author"):
        authors.append(self.loadPaperMainAuthorXML(a))

    newDocument["metadata"]["authors"] = authors
    newDocument["metadata"]["surnames"] = [a["family"] for a in authors]
    newDocument["metadata"]["norm_title"] = normalizeTitle(newDocument["metadata"]["title"])
def convertXMLAndAddToCorpus(file_path, corpus_id, import_id, collection_id, import_options, xml_string=None, existing_guid=None):
    """
    Read the input XML and save a SciDoc.

    :param file_path: path of the XML file to convert
    :param corpus_id: identifier of the document within the corpus
    :param import_id: identifier of this import run
    :param collection_id: collection the document belongs to
    :param import_options: dict of options; "reload_xml_if_doc_in_collection"
        controls whether an already-stored document is re-imported
    :param xml_string: if given, parse this string instead of reading file_path
    :param existing_guid: guid of an already-stored document to update
    :returns: the converted SciDoc, or None when the document was skipped
    """
    update_existing = False
    if not existing_guid:
        # NOTE(review): assumes this field query yields the stored guid —
        # verify what getMetadataByField returns for "metadata.corpus_id".
        existing_guid = cp.Corpus.getMetadataByField("metadata.corpus_id", corpus_id)
    if existing_guid:
        if not import_options.get("reload_xml_if_doc_in_collection", False):
            print("Document %s is already in the collection. Ignoring." % corpus_id)
            return
        update_existing = True

    xml_reader = AutoXMLReader()
    doc = xml_reader.read(xml_string, file_path) if xml_string else xml_reader.readFile(file_path)

    doc.metadata["norm_title"] = normalizeTitle(doc.metadata["title"])

    if update_existing:
        doc.metadata["guid"] = existing_guid
    elif doc.metadata.get("guid", "") == "":
        doc.metadata["guid"] = cp.Corpus.generateGUID(doc.metadata)

    if doc.metadata.get("corpus_id", "") == "":
        doc.metadata["corpus_id"] = corpus_id

    cp.Corpus.saveSciDoc(doc)
    if not update_existing:
        addSciDocToDB(doc, import_id, collection_id)
    return doc
def addSciDocToDB(doc, import_id, collection_id):
    """
    Extend the metadata from a SciDoc and add it to the database.

    :param doc: SciDoc whose metadata to store
    :param import_id: identifier of this import run
    :param collection_id: collection the document belongs to
    """
    meta = deepcopy(doc["metadata"])

    if meta.get("corpus_id", "") == "":
        # Fall back to the PubMed id when no corpus_id was assigned.
        # (.get replaces the Python-2-only has_key with identical behavior.)
        meta["corpus_id"] = meta.get("pm_id", "")

    meta["norm_title"] = normalizeTitle(meta["title"])
    meta["numref"] = str(len(doc["references"]))
    meta["outlinks"] = []
    meta["inlinks"] = []

    # These counters are placeholders for later processing steps.
    # (The original also set num_citations=len(doc["citations"]) first,
    # a dead assignment immediately overwritten by 0 below — removed.)
    meta["num_in_collection_references"] = 0
    meta["num_references"] = len(doc["references"])
    meta["num_resolvable_citations"] = 0
    meta["num_citations"] = 0

    meta["import_id"] = import_id
    meta["collection_id"] = collection_id
    cp.Corpus.addPaper(meta, check_existing=False)