예제 #1
0
def run(medline_path, clean, start, end, PROCESSES):
    """Parse all MEDLINE files found under ``medline_path`` in a worker pool.

    The database name is read from the module-level global ``db``.  Per the
    original author's note, ``db`` is set in ``__main__`` because
    ``map_async`` hands ``_start_parser`` nothing but the file path.

    Args:
        medline_path: Directory walked recursively for files whose extension
            is ``.xml`` or ``.gz``.
        clean: If true, ``PubMedDB.create_tables(db)`` is called before
            ``PubMedDB.init(db)``.
        start: Start index into the sorted list of paths.
        end: End index (exclusive) into the sorted list of paths, or ``None``
            for "up to the last file".  Any other value is converted with
            ``int()``.
        PROCESSES: Number of worker processes for the pool.

    Raises:
        Any exception re-raised by ``result.get()``; the pool is terminated
        before the exception propagates.
    """
    if end is not None:
        end = int(end)

    if clean:
        PubMedDB.create_tables(db)

    PubMedDB.init(db)

    paths = []
    for root, _dirs, files in os.walk(medline_path):
        for filename in files:
            if os.path.splitext(filename)[-1] in (".xml", ".gz"):
                paths.append(os.path.join(root, filename))

    # Sorted order makes the start/end slice below deterministic.
    paths.sort()

    pool = Pool(processes=PROCESSES)
    # Single-argument form prints the same text as the former print statement
    # (including the doubled space) and is valid on Python 2 and 3.
    print("Initialized with  %s processes" % (PROCESSES,))
    try:
        # get() blocks until every file has been handled and re-raises the
        # first worker exception, if any.
        pool.map_async(_start_parser, paths[start:end]).get()
    except BaseException:
        # Stop outstanding work right away instead of leaking the workers.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    print("######################")
    print("###### Finished ######")
    print("######################")
예제 #2
0
def run(medline_path, clean, start, end, PROCESSES):
    """Load the MEDLINE files below ``medline_path`` using multiple processes.

    Uses the module-level global ``db`` as the database name.  According to
    the original author's comment, ``db`` has to be a global (assigned in
    ``__main__``) because ``map_async`` can only pass the file path to
    ``_start_parser``.

    Args:
        medline_path: Root directory; it is searched recursively for files
            with the extension ``.xml`` or ``.gz``.
        clean: When true, ``PubMedDB.create_tables(db)`` runs before
            ``PubMedDB.init(db)``.
        start: First index of the sorted path list to process.
        end: Index (exclusive) at which processing stops; ``None`` means no
            upper bound, any other value is passed through ``int()``.
        PROCESSES: Size of the worker pool.

    Raises:
        Any exception surfaced by ``result.get()``.  The pool is terminated
        first so no worker process is left behind.
    """
    if end is not None:
        end = int(end)

    if clean:
        PubMedDB.create_tables(db)

    PubMedDB.init(db)

    wanted_extensions = (".xml", ".gz")
    paths = []
    for root, _dirs, files in os.walk(medline_path):
        for filename in files:
            if os.path.splitext(filename)[-1] in wanted_extensions:
                paths.append(os.path.join(root, filename))

    # The slice below is only meaningful on a stable ordering.
    paths.sort()

    pool = Pool(processes=PROCESSES)
    # Same output as the former multi-item print statement (two spaces after
    # "with"), written so it also parses on Python 3.
    print("Initialized with  %s processes" % (PROCESSES,))
    try:
        result = pool.map_async(_start_parser, paths[start:end])
        # Blocks until all files are done; re-raises a worker failure.
        result.get()
    except BaseException:
        # Do not keep parsing the remaining files after a failure.
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # Reap the worker processes in both the success and the error case.
        pool.join()

    print("######################")
    print("###### Finished ######")
    print("######################")
예제 #3
0
    def run(self, medline_path, clean, start, end, PROCESSES):
        """Parse the not-yet-loaded MEDLINE files below ``medline_path``.

        Files are collected recursively, filtered through
        ``FilePreloadScreener.exclude_loaded_files``, sorted, and then cut
        down to ``[start:end]``.  That slice is what gets counted, what
        decides between single- and multi-process mode, and what is parsed
        in either mode.

        Args:
            medline_path: Directory walked recursively for ``.xml`` / ``.gz``
                files.
            clean: If true, ``PubMedDB.create_tables(self.db_engine)`` is
                called first.
            start: Start index into the sorted list of remaining paths.
            end: End index (exclusive), or ``None`` for no upper bound; any
                other value is converted with ``int()``.
            PROCESSES: Number of worker processes.  A pool is only created
                when this is greater than 1 and more than one file is
                selected.
        """
        from contextlib import closing

        if end is not None:
            end = int(end)

        if clean:
            PubMedDB.create_tables(self.db_engine)

        paths = []
        for root, _dirs, files in os.walk(medline_path):
            for filename in files:
                if os.path.splitext(filename)[-1] in (".xml", ".gz"):
                    paths.append(os.path.join(root, filename))

        # Don't reload what we've already got
        with FilePreloadScreener(paths, self.db_engine) as screener:
            paths = screener.exclude_loaded_files(paths)

        paths.sort()

        # Slice once so that both execution modes honour start/end; the
        # single-process branch used to ignore them and parse every file.
        selected = paths[start:end]

        print("Running for %d files" % (len(selected),))

        # Note carried over from the original author: the workers need the
        # module-level global `db` (set via "db = options.database" in
        # "__main__"), since map_async() can only pass the path to
        # _start_parser.

        if PROCESSES > 1 and len(selected) > 1:
            with closing(Pool(processes=PROCESSES)) as pool:
                print("Running multi-process with %d processes" % (PROCESSES,))
                # get() blocks until all files are parsed and re-raises any
                # worker exception.
                pool.map_async(_start_parser, selected).get()
            # closing() has called pool.close(); wait for the workers to exit.
            pool.join()
        else:
            # without multiprocessing:
            print("Running single process")
            for path in selected:
                _start_parser(path)

        print("######################")
        print("###### Finished ######")
        print("######################")
예제 #4
0
    def __init__(self, filepath, db_name_input='pubmed'):
        """Store the file path and open a connection and session on the database.

        Args:
            filepath: Path of the file this instance works on; kept as
                ``self.filepath``.
            db_name_input: Value handed to ``PubMedDB.init``; defaults to
                ``'pubmed'``.
        """
        # TODO make way to pass db name as well
        # PubMedDB.init yields a pair; only the first item (the engine) is used.
        engine, _unused_base = PubMedDB.init(db_name_input)

        self.filepath = filepath
        self.connection = engine.connect()

        # The factory is configured with the engine, while the session that is
        # actually created is bound to the connection opened above.
        make_session = sessionmaker(bind=engine)
        self.session = make_session(bind=self.connection)
예제 #5
0
def pmcid_dump(DBSession, pairs):
    """Insert a ``PubMedDB.PM_to_PMC`` row for every PMCID not stored yet.

    All PMCIDs already in the table are read once up front.  Each pair whose
    PMCID is unknown is written and committed in its own session; pairs whose
    PMCID is already known (from the table or from an earlier pair in the
    same call) are reported on stdout and skipped.

    Args:
        DBSession: Callable that returns a new session when called without
            arguments.
        pairs: Iterable of ``(pmid, pmcid)`` tuples.
    """
    query_session = DBSession()
    try:
        # A set gives O(1) membership tests in the loop below.
        known_pmcids = set(
            row[0]
            for row in query_session.query(PubMedDB.PM_to_PMC.pmcid).all())
    finally:
        # close must be *called*; the bare attribute access was a no-op.
        query_session.close()

    for pmid, pmcid in pairs:
        if pmcid in known_pmcids:
            print('{0} exist: pass'.format(pmcid))
            continue

        session = DBSession()
        try:
            session.add(PubMedDB.PM_to_PMC(pmcid=pmcid,
                                           fk_pmid=pmid,
                                           text_path=''))
            session.commit()
        finally:
            session.close()

        # Remember the new id so a repeated PMCID in `pairs` is not inserted twice.
        known_pmcids.add(pmcid)
예제 #6
0
    def _parse(self):
        _file = self.filepath
        """
        a = self.session.query(PubMedDB.XMLFile.xml_file_name).filter_by(xml_file_name = os.path.split(self.filepath)[-1])
        if a.all():
            print self.filepath, 'already in DB'
            return True
        """

        if os.path.splitext(_file)[-1] == ".gz":
            _file = gzip.open(_file, 'rb')

        # get an iterable
        context = etree.iterparse(_file, events=("start", "end"))
        # turn it into an iterator
        context = iter(context)

        # get the root element
        event, root = context.next()

        DBCitation = PubMedDB.Citation()
        DBJournal = PubMedDB.Journal()

        DBXMLFile = PubMedDB.XMLFile()
        DBXMLFile.xml_file_name = os.path.split(self.filepath)[-1]
        DBXMLFile.time_processed = datetime.datetime.now()  #time.localtime()

        loop_counter = 0  #to check for memory usage each X loops

        for event, elem in context:

            if event == "end":
                if elem.tag == "MedlineCitation" or elem.tag == "BookDocument":
                    loop_counter += 1
                    #catch KeyError in case there is no Owner or Status attribute before committing DBCitation
                    try:
                        DBCitation.citation_owner = elem.attrib["Owner"]
                    except:
                        pass
                    try:
                        DBCitation.citation_status = elem.attrib["Status"]
                    except:
                        pass
                    DBCitation.journals = [DBJournal]

                    pubmed_id = int(elem.find("PMID").text)
                    DBCitation.pmid = pubmed_id

                    try:
                        same_pmid = self.session.query(
                            PubMedDB.Citation).filter(
                                PubMedDB.Citation.pmid == pubmed_id).all()
                        # The following condition is only for incremental updates.
                        """
                        # Implementation that replaces the database entry with the new article from the XML file.
                        if same_pmid: # -> evt. any()
                            same_pmid = same_pmid[0]
                            warnings.warn('\nDoubled Citation found (%s).' % pubmed_id)
                            if not same_pmid.date_revised or same_pmid.date_revised < DBCitation.date_revised:
                                warnings.warn('\nReplace old Citation. Old Citation from %s, new citation from %s.' % (same_pmid.date_revised, DBCitation.date_revised) )
                                self.session.delete( same_pmid )
                                self.session.commit()
                                DBCitation.xml_files = [DBXMLFile] # adds an implicit add()
                                self.session.add( DBCitation )
                        """

                        # Keep database entry that is already saved in database and continue with the next PubMed-ID.
                        # Manually deleting entries is possible (with PGAdmin3 or via command-line), e.g.:
                        # DELETE FROM pubmed.tbl_medline_citation WHERE pmid = 25005691;
                        if same_pmid:
                            print "Article already in database - " + str(
                                same_pmid[0]
                            ) + "Continuing with next PubMed-ID"
                            DBCitation = PubMedDB.Citation()
                            DBJournal = PubMedDB.Journal()
                            elem.clear()
                            self.session.commit()
                            continue
                        else:
                            DBCitation.xml_files = [DBXMLFile
                                                    ]  # adds an implicit add()
                            self.session.add(DBCitation)

                        if loop_counter % 1000 == 0:
                            self.session.commit()

                    except (IntegrityError) as error:
                        warnings.warn("\nIntegrityError: " + str(error),
                                      Warning)
                        self.session.rollback()
                    except Exception as e:
                        warnings.warn("\nUnbekannter Fehler:" + str(e),
                                      Warning)
                        self.session.rollback()
                        raise

                    DBCitation = PubMedDB.Citation()
                    DBJournal = PubMedDB.Journal()
                    elem.clear()

                #Kersten: some dates are given in 3-letter code - use dictionary month_code for conversion to digits:
                if elem.tag == "DateCreated":
                    try:
                        date = datetime.date(int(elem.find("Year").text),
                                             int(elem.find("Month").text),
                                             int(elem.find("Day").text))
                    except:
                        date = datetime.date(
                            int(elem.find("Year").text),
                            int(month_code[elem.find("Month").text]),
                            int(elem.find("Day").text))
                    DBCitation.date_created = date

                if elem.tag == "DateCompleted":
                    try:
                        date = datetime.date(int(elem.find("Year").text),
                                             int(elem.find("Month").text),
                                             int(elem.find("Day").text))
                    except:
                        date = datetime.date(
                            int(elem.find("Year").text),
                            int(month_code[elem.find("Month").text]),
                            int(elem.find("Day").text))
                    DBCitation.date_completed = date

                if elem.tag == "DateRevised":
                    try:
                        date = datetime.date(int(elem.find("Year").text),
                                             int(elem.find("Month").text),
                                             int(elem.find("Day").text))
                    except:
                        date = datetime.date(
                            int(elem.find("Year").text),
                            int(month_code[elem.find("Month").text]),
                            int(elem.find("Day").text))
                    DBCitation.date_revised = date

                if elem.tag == "NumberOfReferences":
                    DBCitation.number_of_references = elem.text

                if elem.tag == "ISSN":
                    DBJournal.issn = elem.text
                    DBJournal.issn_type = elem.attrib['IssnType']

                if elem.tag == "JournalIssue" or elem.tag == "Book":
                    if elem.find("Volume") != None:
                        DBJournal.volume = elem.find("Volume").text
                    if elem.find("Issue") != None:
                        DBJournal.issue = elem.find("Issue").text

                    #ensure pub_date_year with boolean year:
                    year = False
                    for subelem in elem.find("PubDate"):
                        if subelem.tag == "MedlineDate":
                            if len(subelem.text) > 40:
                                DBJournal.medline_date = subelem.text[:37] + "..."
                            else:
                                DBJournal.medline_date = subelem.text
                        elif subelem.tag == "Year":
                            year = True
                            DBJournal.pub_date_year = subelem.text
                        elif subelem.tag == "Month":
                            if subelem.text in month_code:
                                DBJournal.pub_date_month = month_code[
                                    subelem.text]
                            else:
                                DBJournal.pub_date_month = subelem.text
                        elif subelem.tag == "Day":
                            DBJournal.pub_date_day = subelem.text

                    if not year:
                        try:
                            temp_year = DBJournal.medline_date[0:4]
                            DBJournal.pub_date_year = temp_year
                        except:
                            print _file, " not able to cast first 4 letters of medline_date ", temp_year

                #if there is the attribute ArticleDate, month and day are given
                if elem.tag == "ArticleDate":
                    DBJournal.pub_date_year = elem.find("Year").text
                    DBJournal.pub_date_month = elem.find("Month").text
                    DBJournal.pub_date_day = elem.find("Day").text

                if elem.tag == "Title":
                    """ ToDo """
                    pass

                if elem.tag == "Journal":
                    if elem.find("Title") != None:
                        DBJournal.title = elem.find("Title").text
                    if elem.find("ISOAbbreviation") != None:
                        DBJournal.iso_abbreviation = elem.find(
                            "ISOAbbreviation").text

                if elem.tag == "ArticleTitle" or elem.tag == "BookTitle":
                    DBCitation.article_title = elem.text
                if elem.tag == "MedlinePgn":
                    DBCitation.medline_pgn = elem.text

                if elem.tag == "AuthorList":
                    #catch KeyError in case there is no CompleteYN attribute before committing DBCitation
                    try:
                        DBCitation.article_author_list_comp_yn = elem.attrib[
                            "CompleteYN"]
                    except:
                        pass

                    DBCitation.authors = []
                    for author in elem:
                        DBAuthor = PubMedDB.Author()

                        if author.find("LastName") != None:
                            DBAuthor.last_name = author.find("LastName").text
                        # Forname is restricted to max 99 characters, but it seems like the None query did not always work - try-except-block
                        try:
                            if author.find("ForeName") != None and not len(
                                    author.find("ForeName").text) > 100:
                                DBAuthor.fore_name = author.find(
                                    "ForeName").text
                            elif author.find("ForeName") != None and len(
                                    author.find("ForeName").text) > 100:
                                DBAuthor.fore_name = author.find(
                                    "ForeName").text[0:97] + "..."
                        except:
                            pass
                        if author.find("Initials") != None:
                            DBAuthor.initials = author.find("Initials").text
                        if author.find("Suffix") != None and not len(
                                author.find("Suffix").text) > 20:
                            DBAuthor.suffix = author.find("Suffix").text
                        elif author.find("Suffix") != None and len(
                                author.find("Suffix").text) > 20:
                            DBAuthor.suffix = author.find(
                                "Suffix").text[0:17] + "..."
                        if author.find("CollectiveName") != None:
                            DBAuthor.collective_name = author.find(
                                "CollectiveName").text

                        DBCitation.authors.append(DBAuthor)

                if elem.tag == "PersonalNameSubjectList":
                    DBCitation.personal_names = []
                    for pname in elem:
                        DBPersonalName = PubMedDB.PersonalName()

                        if pname.find("LastName") != None:
                            DBPersonalName.last_name = pname.find(
                                "LastName").text
                        if pname.find("ForeName") != None:
                            DBPersonalName.fore_name = pname.find(
                                "ForeName").text
                        if pname.find("Initials") != None:
                            DBPersonalName.initials = pname.find(
                                "Initials").text
                        if pname.find("Suffix") != None:
                            DBPersonalName.suffix = pname.find("Suffix").text

                        DBCitation.personal_names.append(DBPersonalName)

                if elem.tag == "InvestigatorList":
                    DBCitation.investigators = []
                    for investigator in elem:
                        DBInvestigator = PubMedDB.Investigator()

                        if investigator.find("LastName") != None:
                            DBInvestigator.last_name = investigator.find(
                                "LastName").text
                        if investigator.find("ForeName") != None:
                            DBInvestigator.fore_name = investigator.find(
                                "ForeName").text
                        if investigator.find("Initials") != None:
                            DBInvestigator.initials = investigator.find(
                                "Initials").text
                        if investigator.find("Suffix") != None:
                            DBInvestigator.suffix = investigator.find(
                                "Suffix").text
                        if investigator.find("Affiliation") != None:
                            DBInvestigator.investigator_affiliation = investigator.find(
                                "Affiliation").text

                        DBCitation.investigators.append(DBInvestigator)

                if elem.tag == "SpaceFlightMission":
                    DBSpaceFlight = PubMedDB.SpaceFlight()
                    DBSpaceFlight.space_flight_mission = elem.text
                    DBCitation.space_flights = [DBSpaceFlight]

                if elem.tag == "GeneralNote":
                    DBCitation.notes = []
                    for subelem in elem:
                        DBNote = PubMedDB.Note()
                        DBNote.general_note_owner = elem.attrib["Owner"]
                        DBNote.general_note = subelem.text
                        DBCitation.notes.append(DBNote)

                if elem.tag == "ChemicalList":
                    DBCitation.chemicals = []
                    for chemical in elem:
                        DBChemical = PubMedDB.Chemical()

                        if chemical.find("RegistryNumber") != None:
                            DBChemical.registry_number = chemical.find(
                                "RegistryNumber").text
                        if chemical.find("NameOfSubstance") != None:
                            DBChemical.name_of_substance = chemical.find(
                                "NameOfSubstance").text
                            DBChemical.substance_ui = chemical.find(
                                "NameOfSubstance").attrib['UI']
                        DBCitation.chemicals.append(DBChemical)

                if elem.tag == "GeneSymbolList":
                    DBCitation.gene_symbols = []
                    for genes in elem:
                        DBGeneSymbol = PubMedDB.GeneSymbol()
                        if len(genes.text) < 40:
                            DBGeneSymbol.gene_symbol = genes.text
                        else:
                            DBGeneSymbol.gene_symbol = genes.text[:37] + '...'
                        DBCitation.gene_symbols.append(DBGeneSymbol)

                if elem.tag == "CommentsCorrectionsList":
                    DBCitation.comments = []
                    for comment in elem:
                        DBComment = PubMedDB.Comment()
                        comment_ref_type = comment.attrib['RefType']
                        comment_ref_source = comment.find('RefSource')
                        if comment_ref_source != None:
                            if len(comment_ref_source.text) < 255:
                                DBComment.ref_source = comment_ref_source.text
                            else:
                                DBComment.ref_source = comment_ref_source.text[
                                    0:251] + "..."
                        if comment_ref_type != None:
                            if len(comment_ref_type) < 22:
                                DBComment.ref_type = comment_ref_type
                            else:
                                DBComment.ref_type = comment_ref_type[
                                    0:18] + "..."
                        comment_pmid_version = comment.find('PMID')
                        if comment_pmid_version != None:
                            DBComment.pmid_version = comment_pmid_version.text
                        DBCitation.comments.append(DBComment)

                if elem.tag == "MedlineJournalInfo":
                    DBJournalInfo = PubMedDB.JournalInfo()
                    if elem.find("NlmUniqueID") != None:
                        DBJournalInfo.nlm_unique_id = elem.find(
                            "NlmUniqueID").text
                    if elem.find("Country") != None:
                        DBJournalInfo.country = elem.find("Country").text
                    """#MedlineTA is just a name for the journal as an abbreviation
                    Abstract with PubMed-ID 21625393 has no MedlineTA attributebut it has to be set in PostgreSQL, that is why "unknown" is inserted instead. There is just a <MedlineTA/> tag and the same information is given in  </JournalIssue> <Title>Biotechnology and bioprocess engineering : BBE</Title>, but this is not (yet) read in this parser -> line 173:
                    """
                    if elem.find("MedlineTA") != None and elem.find(
                            "MedlineTA").text == None:
                        DBJournalInfo.medline_ta = "unknown"
                    elif elem.find("MedlineTA") != None:
                        DBJournalInfo.medline_ta = elem.find("MedlineTA").text
                    DBCitation.journal_infos = [DBJournalInfo]

                if elem.tag == "CitationSubset":
                    DBCitation.citation_subsets = []
                    for subelem in elem:
                        DBCitationSubset = CitationSubset(subelem.text)
                        DBCitation.citation_subsets.append(DBCitationSubset)

                if elem.tag == "MeshHeadingList":
                    DBCitation.meshheadings = []
                    DBCitation.qualifiers = []
                    for mesh in elem:
                        DBMeSHHeading = PubMedDB.MeSHHeading()
                        mesh_desc = mesh.find("DescriptorName")
                        if mesh_desc != None:
                            DBMeSHHeading.descriptor_name = mesh_desc.text
                            DBMeSHHeading.descriptor_name_major_yn = mesh_desc.attrib[
                                'MajorTopicYN']
                            DBMeSHHeading.descriptor_ui = mesh_desc.attrib[
                                'UI']
                        if mesh.find("QualifierName") != None:
                            mesh_quals = mesh.findall("QualifierName")
                            for qual in mesh_quals:
                                DBQualifier = PubMedDB.Qualifier()
                                DBQualifier.descriptor_name = mesh_desc.text
                                DBQualifier.qualifier_name = qual.text
                                DBQualifier.qualifier_name_major_yn = qual.attrib[
                                    'MajorTopicYN']
                                DBQualifier.qualifier_ui = qual.attrib['UI']
                                DBCitation.qualifiers.append(DBQualifier)
                        DBCitation.meshheadings.append(DBMeSHHeading)

                if elem.tag == "GrantList":
                    #catch KeyError in case there is no CompleteYN attribute before committing DBCitation
                    try:
                        DBCitation.grant_list_complete_yn = elem.attrib[
                            "CompleteYN"]
                    except:
                        pass
                    DBCitation.grants = []
                    for grant in elem:
                        DBGrants = PubMedDB.Grant()

                        if grant.find("GrantID") != None:
                            DBGrants.grantid = grant.find("GrantID").text
                        if grant.find("Acronym") != None:
                            DBGrants.acronym = grant.find("Acronym").text
                        if grant.find("Agency") != None:
                            DBGrants.agency = grant.find("Agency").text
                        if grant.find("Country") != None:
                            DBGrants.country = grant.find("Country").text
                        DBCitation.grants.append(DBGrants)

                if elem.tag == "DataBankList":
                    #catch KeyError in case there is no CompleteYN attribute before committing DBCitation
                    try:
                        DBCitation.data_bank_list_complete_yn = elem.attrib[
                            "CompleteYN"]
                    except:
                        pass
                    DBCitation.accessions = []
                    DBCitation.databanks = []

                    for databank in elem:
                        DBDataBank = PubMedDB.DataBank()
                        DBDataBank.data_bank_name = databank.find(
                            "DataBankName").text
                        DBCitation.databanks.append(DBDataBank)

                        acc_numbers = databank.find("AccessionNumberList")
                        if acc_numbers != None:
                            for acc_number in acc_numbers:
                                DBAccession = PubMedDB.Accession()
                                DBAccession.data_bank_name = DBDataBank.data_bank_name
                                DBAccession.accession_number = acc_number.text
                                DBCitation.accessions.append(DBAccession)

                if elem.tag == "Language":
                    DBLanguage = PubMedDB.Language()
                    DBLanguage.language = elem.text
                    DBCitation.languages = [DBLanguage]

                if elem.tag == "PublicationTypeList":
                    DBCitation.publication_types = []
                    for subelem in elem:
                        DBPublicationType = PubMedDB.PublicationType()
                        DBPublicationType.publication_type = subelem.text
                        DBCitation.publication_types.append(DBPublicationType)

                if elem.tag == "Article":
                    #ToDo
                    """
                    for subelem in elem:
                        if subelem.tag == "Journal":
                            for sub_subelem in subelem:
                                pass
                        if subelem.tag == "JArticleTitle":
                            pass
                        if subelem.tag == "JPagination":
                            pass
                        if subelem.tag == "JLanguage":
                            pass
                        if subelem.tag == "JPublicationTypeList":
                            pass
                    """

                if elem.tag == "VernacularTitle":
                    DBCitation.vernacular_title = elem.tag

                if elem.tag == "OtherAbstract":
                    DBOtherAbstract = PubMedDB.OtherAbstract()
                    DBCitation.other_abstracts = []
                    for other in elem:
                        if other.tag == "AbstractText":
                            DBOtherAbstract.other_abstract = other.text
                    DBCitation.other_abstracts.append(DBOtherAbstract)

                if elem.tag == "OtherID":
                    DBCitation.other_ids = []
                    DBOtherID = PubMedDB.OtherID()
                    if len(elem.text) < 80:
                        DBOtherID.other_id = elem.text
                    else:
                        DBOtherID.other_id = elem.text[0:77] + "..."
                    DBOtherID.other_id_source = elem.attrib['Source']
                    DBCitation.other_ids.append(DBOtherID)

                # start Kersten: some abstracts contain another structure - code changed:
                # check for different labels: "OBJECTIVE", "CASE SUMMARY", ...
                # next 3 lines are unchanged
                if elem.tag == "Abstract":
                    DBAbstract = PubMedDB.Abstract()
                    DBCitation.abstracts = []
                    #prepare empty string for "normal" abstracts or "labelled" abstracts
                    temp_abstract_text = ""
                    #if there are multiple AbstractText-Tags:
                    if elem.find("AbstractText") != None and len(
                            elem.findall("AbstractText")) > 1:
                        for child_AbstractText in elem.getchildren():
                            # iteration over all labels is needed otherwise only "OBJECTIVE" would be pushed into database
                            # debug: check label
                            # [('NlmCategory', 'METHODS'), ('Label', 'CASE SUMMARY')]
                            # ...
                            # also checked for empty child-tags in this structure!
                            if child_AbstractText.tag == "AbstractText" and child_AbstractText.text != None:
                                #if child_AbstractText.tag == "AbstractText": # would give an error!
                                # no label - this case should not happen with multiple AbstractText-Tags:
                                if len(child_AbstractText.items()) == 0:
                                    temp_abstract_text += child_AbstractText.text + "\n"
                                # one label or the NlmCategory - first index has to be zero:
                                if len(child_AbstractText.items()) == 1:
                                    # filter for the wrong label "UNLABELLED" - usually contains the text "ABSTRACT: - not used:
                                    if child_AbstractText.items(
                                    )[0][1] == "UNLABELLED":
                                        temp_abstract_text += child_AbstractText.text + "\n"
                                    else:
                                        temp_abstract_text += child_AbstractText.items(
                                        )[0][
                                            1] + ":\n" + child_AbstractText.text + "\n"
                                # label and NlmCategory - take label - first index has to be one:
                                if len(child_AbstractText.items()) == 2:
                                    temp_abstract_text += child_AbstractText.items(
                                    )[1][1] + ":\n" + child_AbstractText.text + "\n"
                    # if there is only one AbstractText-Tag ("usually") - no labels used:
                    if elem.find("AbstractText") != None and len(
                            elem.findall("AbstractText")) == 1:
                        temp_abstract_text = elem.findtext("AbstractText")
                    # append abstract text for later pushing it into db:
                    DBAbstract.abstract_text = temp_abstract_text
                    # next 3 lines are unchanged - some abstract texts (few) contain the child-tag "CopyrightInformation" after all AbstractText-Tags:
                    if elem.find("CopyrightInformation") != None:
                        DBAbstract.copyright_information = elem.find(
                            "CopyrightInformation").text
                    DBCitation.abstracts.append(DBAbstract)
                # end Kersten - code changed
                """
                #old code:
                if elem.tag == "Abstract":
                    DBAbstract = PubMedDB.Abstract()
                    DBCitation.abstracts = []

                    if elem.find("AbstractText") != None:   DBAbstract.abstract_text = elem.find("AbstractText").text
                    if elem.find("CopyrightInformation") != None:   DBAbstract.copyright_information = elem.find("CopyrightInformation").text
                    DBCitation.abstracts.append(DBAbstract)
                """
                if elem.tag == "KeywordList":
                    #catch KeyError in case there is no Owner attribute before committing DBCitation
                    try:
                        DBCitation.keyword_list_owner = elem.attrib["Owner"]
                    except:
                        pass
                    DBCitation.keywords = []
                    all_keywords = []
                    for subelem in elem:
                        #some documents contain duplicate keywords which would lead to a key error - if-clause
                        if not subelem.text in all_keywords:
                            all_keywords.append(subelem.text)
                        else:
                            continue
                        DBKeyword = PubMedDB.Keyword()
                        DBKeyword.keyword = subelem.text
                        #catch KeyError in case there is no MajorTopicYN attribute before committing DBCitation
                        try:
                            DBKeyword.keyword_major_yn = subelem.attrib[
                                "MajorTopicYN"]
                        except:
                            pass
                        DBCitation.keywords.append(DBKeyword)

                if elem.tag == "Affiliation":
                    if len(elem.text) < 2000:
                        DBCitation.article_affiliation = elem.text
                    else:
                        DBCitation.article_affiliation = elem.text[
                            0:1996] + "..."

                if elem.tag == "SupplMeshList":
                    DBCitation.suppl_mesh_names = []
                    for suppl_mesh in elem:
                        DBSupplMeshName = PubMedDB.SupplMeshName()
                        if len(suppl_mesh.text) < 80:
                            DBSupplMeshName.suppl_mesh_name = suppl_mesh.text
                        else:
                            DBSupplMeshName.suppl_mesh_name = suppl_mesh.text[
                                0:76] + "..."
                        DBSupplMeshName.suppl_mesh_name_ui = suppl_mesh.attrib[
                            'UI']
                        DBSupplMeshName.suppl_mesh_name_type = suppl_mesh.attrib[
                            'Type']
                        DBCitation.suppl_mesh_names.append(DBSupplMeshName)

        self.session.commit()
        return True
예제 #7
0
 def __init__(self, filepath, db):
     """Store the file path and open a database session for this instance.

     filepath -- kept unchanged as ``self.filepath``.
     db       -- forwarded as-is to ``PubMedDB.init``.
     """
     # PubMedDB.init yields a two-item result; only the first item (the
     # engine) is needed here, the second one is intentionally ignored.
     db_engine, _unused_base = PubMedDB.init(db)
     session_factory = sessionmaker(bind=db_engine)
     self.filepath = filepath
     # Instantiate one session from the factory bound to the engine above.
     self.session = session_factory()
예제 #8
0
 def __init__(self, filepath,db):
     """Initialise the instance with an input path and a database session.

     filepath -- saved on the instance as ``self.filepath``.
     db       -- handed directly to ``PubMedDB.init``.
     """
     # The result of PubMedDB.init is unpacked into exactly two values;
     # the second one is not used by this constructor.
     bound_engine, _ignored = PubMedDB.init(db)
     make_session = sessionmaker(bind=bound_engine)
     self.filepath = filepath
     # Create the session from the factory built on the engine.
     self.session = make_session()
예제 #9
0
 def __init__(self, db_name_input):
     # Only make a single database connection pool
     db_engine, base = PubMedDB.init(db_name_input)
     self.db_name = db_name_input
     self.db_engine = db_engine