Beispiel #1
0
    def run(self):
        """
        Processes every XML file found in the configured document directory.

        Each directory entry whose extension is not '.xml' (compared
        case-insensitively) is reported on stdout and skipped. Every other
        entry is converted with utils.xml_file_to_dict and handed to
        __process_patent together with its document id, which is the file
        name without its extension. Once all entries have been handled, the
        indexer's serialize() method is called.
        """
        for filename in os.listdir(self.__doc_dir):
            doc_id, extension = os.path.splitext(filename)
            if extension.lower() != '.xml':
                # Single-argument call form prints the same text under both
                # Python 2 and Python 3.
                print('Ignoring file: {} Reason: Not an XML document.'
                      .format(filename))
                continue

            # os.path.join adds the separator only when it is missing, so the
            # directory may be configured with or without a trailing slash.
            full_path = os.path.join(self.__doc_dir, filename)
            patent_info = utils.xml_file_to_dict(full_path)
            self.__process_patent(doc_id, patent_info)

        self.__indexer.serialize()
 def run(self):
     """
     Builds a thesaurus from the words found in the XML documents.

     Every '.xml' entry (extension compared case-insensitively) in the
     configured document directory is converted with
     utils.xml_file_to_dict and passed to __process_patent; the words it
     returns are accumulated. Entries with any other extension are
     reported on stdout and skipped. The accumulated words are
     de-duplicated, filtered through extract_nouns_and_adjectives and
     handed to AltervistaThesaurus.build_thesaurus along with the
     configured output file.

     NOTE(review): the previous docstring described this as "downloading"
     the thesaurus; any download presumably happens inside
     build_thesaurus, which is not visible here -- confirm.
     """
     words = []
     for filename in os.listdir(self.__doc_dir):
         doc_id, extension = os.path.splitext(filename)
         if extension.lower() != '.xml':
             # Single-argument call form prints the same text under both
             # Python 2 and Python 3.
             print('Ignoring file: {} Reason: Not an XML document.'
                   .format(filename))
             continue
         # os.path.join adds the separator only when it is missing, so the
         # directory may be configured with or without a trailing slash.
         full_path = os.path.join(self.__doc_dir, filename)
         patent_info = utils.xml_file_to_dict(full_path)
         words += self.__process_patent(doc_id, patent_info)
     # Collapse duplicates so each distinct word is looked up only once.
     unique_words = list(set(words))
     nouns = extract_nouns_and_adjectives(unique_words)
     AltervistaThesaurus.build_thesaurus(nouns, self.__out_file)