def iter_xml(self):
    """Yield an XMLDoc for every accepted archive member that has data.

    Each entry of ``self.first_level_files`` is expanded into its member
    names with ``self.get_archive_names``; only names accepted by
    ``self.correct_file`` are read. Members whose read result is falsy
    (e.g. ``None`` or empty) are skipped.
    """
    for archive in self.first_level_files:
        for member in self.get_archive_names(archive):
            if not self.correct_file(member):
                continue
            contents = self.read_archive_file(archive, member)
            if not contents:
                continue
            yield XMLDoc(contents)
Example #2
0
    def read_archive_file(self, filename):
        """ Stream the individual documents held in a zipped XML file.

            The archive is opened from ``self.path`` joined with
            ``filename`` and stays open while the generator is consumed.

            Yields 3-tuples: the first two values produced by
            ``separated_xml_with_lines`` (presumably the start and end
            line of the document -- confirm against that helper) followed
            by an XMLDoc built from the document data.
        """
        archive_path = os.path.join(self.path, filename)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            for start, end, xml_text in separated_xml_with_lines(archive):
                yield start, end, XMLDoc(xml_text)
 def get_patentdoc(self, publication_number):
     """ Return a PatentDoc object for a given publication number.

     The archive file and member name are located with
     ``self.search_files``; the member is then read with
     ``self.read_archive_file`` and converted through
     ``XMLDoc(...).to_patentdoc()``.

     :param publication_number: publication number to look up.
     :return: the converted document, or None when the publication
         number cannot be located or any step of the lookup fails.
     """
     try:
         filename, name = self.search_files(publication_number)
         if not (filename and name):
             return None
         filedata = self.read_archive_file(filename, name)
         return XMLDoc(filedata).to_patentdoc()
     except Exception:
         # Deliberately best-effort: a failed lookup or parse yields None.
         # Only Exception is caught so that KeyboardInterrupt and
         # SystemExit still propagate to the caller.
         return None
    def iter_filter_xml(self, classification, sample_size=None):
        """ Generator yielding XMLDocs for records matching a classification.

        :param classification: list in form
        ["G", "61", "K", "039", "00"]. If an entry is None or
        missing, it and all entries after it are not filtered on.
        :param sample_size: passed through to ``self.get_records``
        together with the classification.

        Records whose file data is falsy are skipped.
        """
        matching = self.get_records(classification, sample_size)
        for _, xml_text in self.iter_read(matching):
            if not xml_text:
                continue
            yield XMLDoc(xml_text)
Example #5
0
 def get_doc(self, publication_number):
     """ Get XML for publication number.

     Requests the 'description' and 'claims' endpoints for the
     publication through ``self.registered_client.published_data``.

     :param publication_number: publication number, wrapped in an
         ``epo_ops.models.Epodoc`` for each request.
     :return: ``XMLDoc(description, claims)`` when both texts were
         retrieved and are non-empty, otherwise None. A failed request
         emits a warning instead of raising.
     """
     try:
         description = self.registered_client.published_data(
             reference_type='publication',
             input=epo_ops.models.Epodoc(publication_number),
             endpoint='description').text
         claims = self.registered_client.published_data(
             reference_type='publication',
             input=epo_ops.models.Epodoc(publication_number),
             endpoint='claims').text
     except Exception:
         # Best-effort fetch: any request failure means no full text.
         # Only Exception is caught so that KeyboardInterrupt and
         # SystemExit are not swallowed.
         warnings.warn("Full text document not available")
         description = claims = None
     if description and claims:
         return XMLDoc(description, claims)
     return None
Example #6
0
 def read_by_offset(self, filename, offset):
     """ Return an XMLDoc for the document at a line offset in a zip file.

     The archive is opened from ``self.path`` joined with ``filename``;
     the XML is extracted by ``get_xml_by_line_offset`` using ``offset``.
     """
     archive_path = os.path.join(self.path, filename)
     with zipfile.ZipFile(archive_path, 'r') as archive:
         xml_text = get_xml_by_line_offset(archive, offset)
         return XMLDoc(xml_text)
 def documents(self):
     """ Generator yielding a patent document per datasource entry.

     Reads ``self.filelist`` through ``self.datasource.iter_read`` and
     converts each entry's file data via ``XMLDoc(...).to_patentdoc()``.
     """
     reader = self.datasource.iter_read(self.filelist)
     for _index, xml_text in reader:
         xmldoc = XMLDoc(xml_text)
         yield xmldoc.to_patentdoc()
    def xmldoc_generator(
                            self, classification=None,
                            publication_numbers=None, sample_size=None
                            ):
        """ Generator to return documents from the datasource.

        With no filter, every row of the ``files`` table is read and an
        XMLDoc is yielded for each row that has file data.

        If classification is supplied results are limited to that
        classification (of form ["G", "06"], length 1 to 5) and come
        from ``self.iter_filter_xml``.

        If publication_numbers is supplied as list, results are limited
        to those publication numbers. These results are whatever
        ``self.get_patentdoc`` returns (they are not wrapped in XMLDoc
        here); numbers for which it returns a falsy value are skipped.

        The two filters are intended to be used one at a time. If both
        are supplied, the publication number results are yielded first,
        followed by the classification results.

        If sample_size is provided returned documents are limited to
        this integer: rows are sampled randomly in SQL, publication
        numbers are sampled with ``random.sample``, and the value is
        passed on to ``self.iter_filter_xml`` for classifications.
        """
        # If no filter parameters are passed iterate through whole datasource
        if not classification and not publication_numbers:
            if sample_size:
                query_string = (
                    "SELECT ROWID, filename, name FROM files"
                    " WHERE ROWID IN"
                    "(SELECT ROWID FROM files ORDER BY RANDOM() LIMIT ?)"
                    )
                records = self.c.execute(
                    query_string, (sample_size,)).fetchall()
            else:
                query_string = "SELECT ROWID, filename, name FROM files"
                records = self.c.execute(query_string).fetchall()

            for _, filedata in self.iter_read(records):
                if filedata:
                    yield XMLDoc(filedata)
            # Neither filter is set, so the branches below cannot apply.
            return

        # If a list of publication numbers are supplied
        if publication_numbers:
            if sample_size and len(publication_numbers) > sample_size:
                # Randomly sample down to sample_size; this rebinds the
                # local name and leaves the caller's list untouched.
                publication_numbers = random.sample(
                    publication_numbers, sample_size
                )
            for publication_number in publication_numbers:
                result = self.get_patentdoc(publication_number)
                if result:
                    yield result

        # If a classification is supplied
        if classification:
            for xmldoc in self.iter_filter_xml(classification, sample_size):
                yield xmldoc
 def get_classification(self, filedata):
     """ Return the patent classifications parsed from XML file data.

     The result is whatever ``XMLDoc(filedata).classifications()``
     produces.
     """
     xmldoc = XMLDoc(filedata)
     return xmldoc.classifications()