def get_data_from_xml_str(self, xml_str, filename):
    """Get the text_ref and text_content data out of an article XML string.

    Parameters
    ----------
    xml_str : str
        The XML document as text; it is encoded as utf8 before parsing.
    filename : str
        Only used in log messages to identify the document.

    Returns
    -------
    tuple or None
        ``(tr_datum, tc_datum)`` on success. ``tr_datum`` maps every name in
        ``self.tr_cols`` to the matching article id found in the XML (None
        when absent). ``tc_datum`` holds the 'pmcid', the 'text_type'
        (``texttypes.FULLTEXT``) and the zipped XML as 'content'.
        None is returned when the XML cannot be parsed or when it carries no
        non-empty 'pmc' article id.
    """
    try:
        tree = ET.XML(xml_str.encode('utf8'))
    except ET.ParseError:
        logger.info("Could not parse %s. Skipping.", filename)
        return None

    # Map each <article-id pub-id-type="..."> to its text. The text of an
    # empty element is None, so values must be checked before use.
    id_data = {
        e.get('pub-id-type'): e.text
        for e in tree.findall('.//article-id')
    }

    # A missing *or empty* 'pmc' id is unusable: without this check an empty
    # element would make the string concatenation below raise a TypeError.
    pmc = id_data.get('pmc')
    if not pmc:
        logger.info("Did not get a 'pmc' in %s.", filename)
        return None

    # Derive the pmcid from the pmc id when it is absent or empty.
    if not id_data.get('pmcid'):
        id_data['pmcid'] = 'PMC' + pmc

    if 'manuscript' in id_data:
        id_data['manuscript_id'] = id_data['manuscript']

    tr_datum = {k: id_data.get(k) for k in self.tr_cols}
    tc_datum = {
        'pmcid': id_data['pmcid'],
        'text_type': texttypes.FULLTEXT,
        'content': zip_string(xml_str),
    }
    return tr_datum, tc_datum
def upload_article(self, db, article_info):
    """Insert new text refs, and their abstracts, into the database.

    PMIDs reported by ``self.get_deleted_pmids()`` and PMIDs already present
    in ``db.TextRef`` are left out. For each remaining PMID a
    (pmid, pmcid, doi, pii) row is copied into 'text_ref'; for those with a
    non-blank abstract, a zipped abstract row is copied into 'text_content'
    once the new text_ref ids have been looked up.

    Parameters
    ----------
    db
        Database handle; must provide ``select_all``, ``get_values`` and
        ``TextRef`` as used below.
    article_info : dict
        Keyed by PMID; each value is a dict that is read with ``.get`` for
        'pmcid', 'doi', 'pii' and 'abstract'.

    Returns
    -------
    bool
        Always True.
    """
    deleted_pmids = self.get_deleted_pmids()
    logger.info("%d PMIDs in XML dataset" % len(article_info))

    # Work out which PMIDs are valid and not yet in the text_ref table.
    valid_pmids = set(article_info.keys()) - set(deleted_pmids)
    logger.info("%d valid PMIDs" % len(valid_pmids))

    known_refs = db.select_all(db.TextRef, db.TextRef.pmid.in_(valid_pmids))
    existing_pmids = set(db.get_values(known_refs, 'pmid'))
    logger.info(
        "%d valid PMIDs already in text_refs." % len(existing_pmids)
    )

    pmids_to_add = valid_pmids - existing_pmids
    logger.info("%d PMIDs to add to text_refs" % len(pmids_to_add))

    # Collect the text_ref rows and, separately, the abstract content that
    # will be attached to those rows once their ids are known.
    text_ref_records = []
    text_content_info = {}
    for pmid in pmids_to_add:
        info = article_info[pmid]
        raw_row = (
            pmid,
            info.get('pmcid'),
            self.fix_doi(info.get('doi')),
            info.get('pii'),
        )
        # Any falsy field (e.g. an empty string) is stored as None.
        text_ref_records.append(tuple(val or None for val in raw_row))

        abstract = info.get('abstract')
        # Skip missing, empty and whitespace-only abstracts.
        if not abstract or not abstract.strip():
            continue
        text_content_info[pmid] = (
            self.my_source,
            formats.TEXT,
            texttypes.ABSTRACT,
            zip_string(abstract),
        )

    self.copy_into_db(
        db,
        'text_ref',
        text_ref_records,
        ('pmid', 'pmcid', 'doi', 'pii',)
    )

    # Map each PMID that has content onto its text_ref id.
    content_pmids = list(text_content_info.keys())
    tref_list = db.select_all(
        'text_ref',
        db.TextRef.pmid.in_(content_pmids)
    )
    pmid_tr_dict = {
        ref_pmid: ref_id
        for (ref_pmid, ref_id) in db.get_values(tref_list, ['pmid', 'id'])
    }

    # Prefix each content tuple with its text_ref id; PMIDs for which no
    # text_ref id was found are dropped.
    text_content_records = [
        (pmid_tr_dict[pmid],) + content
        for pmid, content in text_content_info.items()
        if pmid in pmid_tr_dict
    ]

    self.copy_into_db(
        db,
        'text_content',
        text_content_records,
        cols=('text_ref_id', 'source', 'format', 'text_type', 'content',)
    )
    return True