Example #1
0
 def get_data_from_xml_str(self, xml_str, filename):
     """Extract text-ref and text-content data from an article XML string.

     Parameters
     ----------
     xml_str : str
         The XML document as text; it is utf8-encoded before parsing.
     filename : str
         Used only in log messages to identify the document.

     Returns
     -------
     tuple or None
         ``(tr_datum, tc_datum)``, where ``tr_datum`` maps each name in
         ``self.tr_cols`` to the matching article id (None when absent)
         and ``tc_datum`` holds the pmcid, the full-text text type and
         the result of ``zip_string(xml_str)``. None is returned when the
         XML cannot be parsed, when it has no 'pmc' article id, or when
         the 'pmc' id is empty and there is no 'pmcid' to use instead.
     """
     try:
         tree = ET.XML(xml_str.encode('utf8'))
     except ET.ParseError:
         logger.info("Could not parse %s. Skipping.", filename)
         return None

     # Map each pub-id-type attribute to the text of its <article-id>. If a
     # type occurs more than once, the last occurrence wins.
     id_data = {
         e.get('pub-id-type'): e.text
         for e in tree.findall('.//article-id')
         }
     if 'pmc' not in id_data:
         logger.info("Did not get a 'pmc' in %s.", filename)
         return None
     if 'pmcid' not in id_data:
         pmc = id_data['pmc']
         if not pmc:
             # An empty <article-id pub-id-type="pmc"/> has text None, and
             # 'PMC' + None would raise TypeError, so skip the document.
             logger.info("Got an empty 'pmc' in %s.", filename)
             return None
         id_data['pmcid'] = 'PMC' + pmc
     if 'manuscript' in id_data:
         id_data['manuscript_id'] = id_data['manuscript']

     tr_datum = {k: id_data.get(k) for k in self.tr_cols}
     tc_datum = {
         'pmcid': id_data['pmcid'],
         'text_type': texttypes.FULLTEXT,
         'content': zip_string(xml_str)
         }
     return tr_datum, tc_datum
Example #2
0
    def upload_article(self, db, article_info):
        """Load new text refs and their abstracts into the database.

        PMIDs returned by ``self.get_deleted_pmids()`` and PMIDs already
        found in the text_ref table are left out. Every remaining PMID gets
        a text_ref row; if its abstract has non-whitespace content, the
        abstract is zipped and added as text_content for that row.

        Parameters
        ----------
        db : object
            Database handle providing ``select_all``, ``get_values`` and
            the ``TextRef`` table.
        article_info : dict
            Per-PMID dicts read here for the keys 'pmcid', 'doi', 'pii'
            and 'abstract'.

        Returns
        -------
        bool
            Always True.
        """
        removed_pmids = self.get_deleted_pmids()

        logger.info("%d PMIDs in XML dataset" % len(article_info))
        valid_pmids = set(article_info) - set(removed_pmids)
        logger.info("%d valid PMIDs" % len(valid_pmids))

        known_refs = db.select_all(
            db.TextRef,
            db.TextRef.pmid.in_(valid_pmids)
        )
        existing_pmids = set(db.get_values(known_refs, 'pmid'))
        logger.info(
            "%d valid PMIDs already in text_refs." % len(existing_pmids)
        )

        new_pmids = valid_pmids - existing_pmids
        logger.info("%d PMIDs to add to text_refs" % len(new_pmids))

        # Collect the text_ref rows and, keyed by PMID, the abstract content
        # that will be attached once the new rows have ids.
        ref_rows = []
        abstract_content = {}
        for pmid in new_pmids:
            info = article_info[pmid]
            fields = (
                pmid,
                info.get('pmcid'),
                self.fix_doi(info.get('doi')),
                info.get('pii'),
            )
            # Falsy values (e.g. empty strings) are stored as None.
            ref_rows.append(tuple(field or None for field in fields))

            abstract = info.get('abstract')
            # Skip missing, empty or whitespace-only abstracts.
            if not abstract or not abstract.strip():
                continue
            zipped = zip_string(abstract)
            abstract_content[pmid] = (self.my_source, formats.TEXT,
                                      texttypes.ABSTRACT, zipped)

        self.copy_into_db(
            db,
            'text_ref',
            ref_rows,
            ('pmid', 'pmcid', 'doi', 'pii',)
        )

        # Look up the text_ref ids for the PMIDs that have content.
        refs_with_content = db.select_all(
            'text_ref',
            db.TextRef.pmid.in_(list(abstract_content))
        )
        id_by_pmid = dict(db.get_values(refs_with_content, ['pmid', 'id']))

        # Prefix each content tuple with its text_ref id; PMIDs without a
        # matching text_ref row are dropped.
        content_rows = [
            (id_by_pmid[pmid],) + content
            for pmid, content in abstract_content.items()
            if pmid in id_by_pmid
        ]

        self.copy_into_db(
            db,
            'text_content',
            content_rows,
            cols=('text_ref_id', 'source', 'format', 'text_type', 'content',)
        )
        return True