Python XMLDoc Examples

Programming Language: Python

Namespace/Package Name: patentdata.xmlparser

Class/Type: XMLDoc

Examples at hotexamples.com: 9

Python XMLDoc - 9 examples found. These are the top-rated real-world Python examples of patentdata.xmlparser.XMLDoc, extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

XMLDoc(9)

Frequently Used Methods

XMLDoc (9)

Example #1

Show file

File: publications.py Project: orgPatentRoot/patentdata-1

 def iter_xml(self):
     """ Yield an XMLDoc for every matching, non-empty file in the corpus.

     Each entry of ``first_level_files`` is expanded into its archive
     member names; members rejected by ``correct_file`` and members whose
     data reads back empty are skipped.
     """
     for archive in self.first_level_files:
         for member in self.get_archive_names(archive):
             if not self.correct_file(member):
                 continue
             contents = self.read_archive_file(archive, member)
             if not contents:
                 continue
             yield XMLDoc(contents)

Example #2

Show file

File: grants.py Project: orgPatentRoot/patentdata-1

    def read_archive_file(self, filename):
        """ Yield the individual XML documents held in a large zipped file.

            The zip file is looked up relative to ``self.path``. Each item
            yielded is a 3-tuple: the first two values produced by
            ``separated_xml_with_lines`` (presumably the start and end line
            of the document -- confirm against that helper) followed by an
            XMLDoc built from the document data.
        """
        archive_path = os.path.join(self.path, filename)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            for start, end, xml_data in separated_xml_with_lines(archive):
                yield start, end, XMLDoc(xml_data)

Example #3

Show file

File: publications.py Project: orgPatentRoot/patentdata-1

 def get_patentdoc(self, publication_number):
     """ Return a PatentDoc object for a given publication number.

     The lookup is best-effort: None is returned when ``search_files``
     does not supply both a filename and a name, or when locating,
     reading or converting the document raises an ordinary exception.

     :param publication_number: publication number handed to
         ``search_files``.
     :return: the result of ``XMLDoc(...).to_patentdoc()``, or None.
     """
     try:
         filename, name = self.search_files(publication_number)
         if filename and name:
             filedata = self.read_archive_file(filename, name)
             return XMLDoc(filedata).to_patentdoc()
     except Exception:
         # Deliberately not a bare ``except``: KeyboardInterrupt and
         # SystemExit must still propagate to the caller.
         return None
     return None

Example #4

Show file

File: publications.py Project: orgPatentRoot/patentdata-1

    def iter_filter_xml(self, classification, sample_size=None):
        """ Generator yielding XMLDocs for files that match a classification.

        :param classification: list in form
        ["G", "61", "K", "039", "00"]. If an entry has None or
        no entry, it and its remaining entries are not filtered.
        :param sample_size: forwarded, together with the classification,
        to ``get_records``.
        """
        matching = self.get_records(classification, sample_size)
        # Records whose file data comes back empty are skipped.
        for _, xml_data in self.iter_read(matching):
            if not xml_data:
                continue
            yield XMLDoc(xml_data)

Example #5

Show file

File: ops.py Project: orgPatentRoot/patentdata-1

 def get_doc(self, publication_number):
     """ Get XML for publication number.

     Requests the 'description' and 'claims' endpoints for the publication
     through ``self.registered_client``. If either request raises, a
     warning is issued and None is returned.

     :param publication_number: number wrapped in ``epo_ops.models.Epodoc``
         for both requests.
     :return: ``XMLDoc(description, claims)`` when both texts are
         non-empty, otherwise None.
     """
     try:
         description = self.registered_client.published_data(
             reference_type='publication',
             input=epo_ops.models.Epodoc(publication_number),
             endpoint='description').text
         claims = self.registered_client.published_data(
             reference_type='publication',
             input=epo_ops.models.Epodoc(publication_number),
             endpoint='claims').text
     except Exception:
         # Deliberately not a bare ``except``: KeyboardInterrupt and
         # SystemExit must still propagate to the caller.
         warnings.warn("Full text document not available")
         return None
     if description and claims:
         return XMLDoc(description, claims)
     return None

Example #6

Show file

File: grants.py Project: orgPatentRoot/patentdata-1

 def read_by_offset(self, filename, offset):
     """ Return an XMLDoc for the XML found at a line offset in a zip file.

     The zip file is looked up relative to ``self.path``; extracting the
     XML at ``offset`` is delegated to ``get_xml_by_line_offset``.
     """
     zip_path = os.path.join(self.path, filename)
     with zipfile.ZipFile(zip_path, 'r') as archive:
         xml_data = get_xml_by_line_offset(archive, offset)
         return XMLDoc(xml_data)

Example #7

Show file

File: patentcorpus.py Project: orgPatentRoot/patentdata-1

 def documents(self):
     """ Yield a PatentDoc for each entry the datasource reads from filelist. """
     reader = self.datasource.iter_read(self.filelist)
     for _unused, xml_data in reader:
         patentdoc = XMLDoc(xml_data).to_patentdoc()
         yield patentdoc

Example #8

Show file

File: publications.py Project: orgPatentRoot/patentdata-1

    def xmldoc_generator(
        self, classification=None,
        publication_numbers=None, sample_size=None
    ):
        """ Generator to return documents, optionally filtered.

        With no filter, every row of the ``files`` table is read (or a
        random selection of ``sample_size`` rows) and an XMLDoc is yielded
        for each row whose file data is non-empty.

        If publication_numbers is supplied as a list, the truthy results of
        ``get_patentdoc`` are yielded for those numbers; the list is
        randomly sampled down to ``sample_size`` when it is longer.

        If classification is supplied (of form ["G", "06"], length 1 to 5),
        the items produced by ``iter_filter_xml`` are yielded.

        The two filters are meant to be used one at a time; if both are
        given, publication number results are yielded first, followed by
        the classification results.
        """
        # No filter supplied: iterate through the whole datasource
        if not (classification or publication_numbers):
            if sample_size:
                sampled_query = (
                    "SELECT ROWID, filename, name FROM files"
                    " WHERE ROWID IN"
                    "(SELECT ROWID FROM files ORDER BY RANDOM() LIMIT ?)"
                )
                cursor = self.c.execute(sampled_query, (sample_size,))
            else:
                cursor = self.c.execute(
                    "SELECT ROWID, filename, name FROM files"
                )
            rows = cursor.fetchall()

            for _, xml_data in self.iter_read(rows):
                if not xml_data:
                    continue
                yield XMLDoc(xml_data)
            # Neither filter is set, so the branches below cannot apply
            return

        # A list of publication numbers was supplied
        if publication_numbers:
            oversized = sample_size and len(publication_numbers) > sample_size
            if oversized:
                # Randomly sample down to sample_size
                publication_numbers = random.sample(
                    publication_numbers, sample_size
                )
            # An alternate method would be a single
            # "... WHERE pub_no IN (?, ?, ...)" query over the files table
            # instead of one lookup per publication number.
            for number in publication_numbers:
                patentdoc = self.get_patentdoc(number)
                if patentdoc:
                    yield patentdoc

        # A classification was supplied
        if classification:
            matches = self.iter_filter_xml(classification, sample_size)
            for match in matches:
                yield match

Example #9

Show file

File: publications.py Project: orgPatentRoot/patentdata-1

 def get_classification(self, filedata):
     """ Return the patent classifications (a list of 5 items) for file data. """
     xmldoc = XMLDoc(filedata)
     return xmldoc.classifications()