def main():
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        quit()

    with SessionContext(commit=True) as session:

        # Zero sentences
        print("Deleting all articles with zero sentences")
        res = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print(str(res.rowcount) + " articles deleted")

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme
        dupl = 0
        q = session.query(ArticleModel.url).filter(ArticleModel.url.like("https://%"))
        for r in q.all():
            url = re.sub(r"^https://", r"http://", r.url)
            # c = session.query(ArticleModel.url).filter(ArticleModel.url == url).count()
            res = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == url)
            )
            dupl += res.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))
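
# Entry point guard (sketch, assumed rather than taken from the original file:
# maintenance scripts of this kind are conventionally run directly from the
# command line).
if __name__ == "__main__":
    main()
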
def store(self, enclosing_session=None):
    """ Store an article in the database, inserting it or updating """
    with SessionContext(enclosing_session, commit=True) as session:
        if self._uuid is None:
            # Insert a new row
            self._uuid = str(uuid.uuid1())
            ar = ArticleRow(
                id=self._uuid,
                url=self._url,
                root_id=self._root_id,
                heading=self._heading,
                author=self._author,
                timestamp=self._timestamp,
                authority=self._authority,
                scraped=self._scraped,
                parsed=self._parsed,
                processed=self._processed,
                indexed=self._indexed,
                scr_module=self._scr_module,
                scr_class=self._scr_class,
                scr_version=self._scr_version,
                parser_version=self._parser_version,
                num_sentences=self._num_sentences,
                num_parsed=self._num_parsed,
                ambiguity=self._ambiguity,
                html=self._html,
                tree=self._tree,
                tokens=self._tokens,
            )
            # Delete any existing rows with the same URL
            session.execute(
                ArticleRow.table().delete().where(ArticleRow.url == self._url)
            )
            # Add the new row with a fresh UUID
            session.add(ar)
            # Store the word stems occurring in the article
            self._store_words(session)
            # Offload the new data from Python to PostgreSQL
            session.flush()
            return True

        # Update an already existing row by UUID
        ar = (
            session.query(ArticleRow)
            .filter(ArticleRow.id == self._uuid)
            .one_or_none()
        )
        if ar is None:
            # UUID not found: something is wrong here...
            return False

        # Update the columns
        # UUID is immutable
        ar.url = self._url
        ar.root_id = self._root_id
        ar.heading = self._heading
        ar.author = self._author
        ar.timestamp = self._timestamp
        ar.authority = self._authority
        ar.scraped = self._scraped
        ar.parsed = self._parsed
        ar.processed = self._processed
        ar.indexed = self._indexed
        ar.scr_module = self._scr_module
        ar.scr_class = self._scr_class
        ar.scr_version = self._scr_version
        ar.parser_version = self._parser_version
        ar.num_sentences = self._num_sentences
        ar.num_parsed = self._num_parsed
        ar.ambiguity = self._ambiguity
        ar.html = self._html
        ar.tree = self._tree
        ar.tokens = self._tokens
        # If the article has been parsed, update the index of word stems
        # (This may cause all stems for the article to be deleted, if
        # there are no successfully parsed sentences in the article)
        self._store_words(session)
        # Offload the new data from Python to PostgreSQL
        session.flush()
        return True
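
# Usage sketch (assumption, not part of the original module): store a batch of
# article objects inside one enclosing session so they share a single
# transaction and commit together. `store_all` and `articles` are hypothetical
# names; `articles` stands for any iterable of objects exposing the store()
# method above.
def store_all(articles):
    with SessionContext(commit=True) as session:
        for a in articles:
            a.store(enclosing_session=session)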