Beispiel #1
0
    def convertAllFilesAndAddToDB(self, ALL_INPUT_FILES, inputdir, import_options):
        """
            Converts each input XML file and adds it to the corpus.

            A file whose basename already matches a "metadata.filename" entry in
            the corpus is skipped, unless
            import_options["reload_xml_if_doc_in_collection"] is truthy. In that
            case it is imported again and the guid of the existing entry is
            handed to the importer.

            If self.use_celery is set, one importXMLTask per file is queued on
            the "import_xml" queue; otherwise each file is converted in-process
            with convertXMLAndAddToCorpus.

            :param ALL_INPUT_FILES: list of file names, relative to inputdir. Only
                the slice [FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO] is processed.
            :param inputdir: directory the file names are joined onto
            :param import_options: dict of options forwarded to the importer
        """
        progress = ProgressIndicator(True, self.num_files_to_process, dot_every_xitems=20)
        tasks = []

        for fn in ALL_INPUT_FILES[FILES_TO_PROCESS_FROM:FILES_TO_PROCESS_TO]:
            corpus_id = self.generate_corpus_id(fn)
            match = cp.Corpus.getMetadataByField("metadata.filename", os.path.basename(fn))

            # Already in the collection and no reload requested: nothing to do.
            if match and not import_options.get("reload_xml_if_doc_in_collection", False):
                continue

            xml_path = os.path.join(inputdir, fn)

            if self.use_celery:
                tasks.append(importXMLTask.apply_async(
                    args=[
                        xml_path,
                        corpus_id,
                        self.import_id,
                        self.collection_id,
                        import_options,
                        match["guid"] if match else None,
                    ],
                    queue="import_xml",
                ))
                continue

            # In-process import. When the document already exists (reload case)
            # pass its guid along, mirroring what the celery branch does. The
            # keyword is only supplied when there is a match, so the call for
            # new files is exactly what it always was.
            extra_kwargs = {"existing_guid": match["guid"]} if match else {}
            try:
                convertXMLAndAddToCorpus(
                    xml_path,
                    corpus_id,
                    self.import_id,
                    self.collection_id,
                    import_options,
                    **extra_kwargs
                )
            except ValueError:
                # Log with traceback and move on to the next file.
                logging.exception("ERROR: Couldn't convert %s", fn)
                continue

            progress.showProgressReport("Importing -- latest file %s" % fn)
Beispiel #2
0
    def reloadSciDocsOnly(self, conditions, inputdir, file_mask):
        """
            Iterates through the papers already in the collection that match the
            given conditions and tries to load the SciDoc of each one. If
            loading raises KeyError, the paper is considered broken and its XML
            is imported again, reusing the existing guid.

            If self.use_celery is set, one importXMLTask per broken paper is
            queued on the "import_xml" queue and the method returns once all
            papers have been checked. Otherwise the broken papers are collected
            and re-converted in-process afterwards.

            A broken paper whose XML is not in the input file listing is logged
            and skipped.

            :param conditions: query string passed as q= to the "papers" index
            :param inputdir: directory that is listed to find the input files
            :param file_mask: file mask passed on to loadListOrListAllFiles
        """
        in_collection = [item["_source"] for item in cp.Corpus.unlimitedQuery(
            index="papers",
            doc_type="paper",
            _source=["metadata.corpus_id", "metadata.filename", "guid"],
            q=conditions
            )]

        print("Fixing broken SciDocs")
        print("Listing all loaded papers...")
        ALL_INPUT_FILES = self.loadListOrListAllFiles(inputdir, file_mask)

        # corpus_id -> input file name, to find the XML a stored paper came from
        files_hash = {}
        for input_file in ALL_INPUT_FILES:
            files_hash[self.generate_corpus_id(input_file)] = input_file

        print("Iterating over all papers trying to load them...")
        tasks = []
        files_to_process = []
        import_options = {"reload_xml_if_doc_in_collection": True}
        progress = ProgressIndicator(True, len(in_collection))

        for item in in_collection:
            guid = item["guid"]
            corpus_id = self.generate_corpus_id(item["metadata"]["filename"])
            assert corpus_id == item["metadata"]["corpus_id"], \
                "corpus_id derived from filename differs from stored corpus_id for %s" % guid

            try:
                cp.Corpus.loadSciDoc(guid)
            except KeyError:
                print("File %s is broken" % guid)
                xml_file = files_hash.get(corpus_id)
                if xml_file is None:
                    # No matching XML in the listing: report it and keep going,
                    # instead of raising a second KeyError from inside this
                    # handler and aborting the whole run.
                    logging.error(
                        "ERROR: No input file found for %s (corpus_id %s), can't reload it",
                        guid, corpus_id)
                elif self.use_celery:
                    # NOTE(review): the path is joined onto
                    # cp.Corpus.paths.inputXML, not inputdir, although the file
                    # list comes from inputdir -- confirm both are the same.
                    tasks.append(importXMLTask.apply_async(
                        args=[
                            os.path.join(cp.Corpus.paths.inputXML, xml_file),
                            corpus_id,
                            self.import_id,
                            self.collection_id,
                            import_options,
                        ],
                        kwargs={"existing_guid": guid},
                        queue="import_xml",
                    ))
                else:
                    files_to_process.append((xml_file, corpus_id, guid))

            progress.showProgressReport("Checking papers")

        if self.use_celery:
            # Everything was queued; the workers do the actual importing.
            return

        print("Processing all %s broken files..." % len(files_to_process))
        progress = ProgressIndicator(True, len(files_to_process))

        for xml_file, corpus_id, guid in files_to_process:
            try:
                convertXMLAndAddToCorpus(
                    os.path.join(cp.Corpus.paths.inputXML, xml_file),
                    corpus_id,
                    self.import_id,
                    self.collection_id,
                    import_options,
                    existing_guid=guid,
                    )
            except ValueError:
                # Log with traceback and move on to the next file.
                logging.exception("ERROR: Couldn't convert %s", xml_file)
                continue

            progress.showProgressReport("Importing -- latest file %s" % xml_file)