Beispiel #1
0
def main():
    """Remove unwanted rows from the article table.

    Reads the configuration file config/GreynirSimple.conf located under
    ``basepath``. If that raises ConfigError, the error is printed and the
    process exits with status 1.

    Then, inside one SessionContext opened with commit=True:

    * every article whose ``num_sentences`` is 0 is deleted, and
    * for each article whose URL starts with ``https://``, any article
      stored under the same URL with the ``http://`` scheme is deleted.

    The number of rows affected by each step is printed.
    """
    # Local import keeps this function self-contained regardless of
    # what the enclosing module happens to import at the top
    import sys

    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        # quit() is an interactive-shell convenience added by the site
        # module: it may be absent, and it would exit with status 0.
        # Signal the failure to the caller with a nonzero exit status.
        sys.exit(1)

    with SessionContext(commit=True) as session:

        # Zero sentences
        print("Deleting all articles with zero sentences")
        deleted = session.execute(
            ArticleModel.table().delete().where(ArticleModel.num_sentences == 0)
        )
        print("{0} articles deleted".format(deleted.rowcount))

        # Non-Icelandic
        # TODO: Implement me!

        # Duplicates
        # For each https article, check whether there is a corresponding
        # article URL with http URI scheme, and delete it if so
        dupl = 0
        https_urls = session.query(ArticleModel.url).filter(
            ArticleModel.url.like("https://%")
        )
        for row in https_urls.all():
            http_url = re.sub(r"^https://", r"http://", row.url)
            deleted = session.execute(
                ArticleModel.table().delete().where(ArticleModel.url == http_url)
            )
            # rowcount is the number of rows matched by this DELETE
            dupl += deleted.rowcount
        print("{0} duplicate URLs w. HTTP scheme removed".format(dupl))
Beispiel #2
0
    def store(self, enclosing_session=None):
        """ Write this article to the database as a new or updated row.

        If the article has no UUID yet, a fresh one is generated, any
        existing rows with the same URL are deleted, and a new row is
        added. Otherwise the row with the article's UUID is looked up
        and its columns are overwritten with the article's current values.

        enclosing_session is passed on to SessionContext, which is
        opened with commit=True.

        Returns True after an insert or update, or False if the article
        has a UUID but no row with that UUID is found.
        """
        # Columns copied between the article and its row, in order. Each
        # value lives on the article in the attribute of the same name
        # prefixed with an underscore (url -> self._url). The id column
        # is handled separately since the UUID is immutable.
        column_names = (
            "url",
            "root_id",
            "heading",
            "author",
            "timestamp",
            "authority",
            "scraped",
            "parsed",
            "processed",
            "indexed",
            "scr_module",
            "scr_class",
            "scr_version",
            "parser_version",
            "num_sentences",
            "num_parsed",
            "ambiguity",
            "html",
            "tree",
            "tokens",
        )
        with SessionContext(enclosing_session, commit=True) as session:
            if self._uuid is not None:
                # Existing article: fetch its row by UUID
                row = (
                    session.query(ArticleRow)
                    .filter(ArticleRow.id == self._uuid)
                    .one_or_none()
                )
                if row is None:
                    # UUID not found: something is wrong here...
                    return False
                # Overwrite every column except the id
                for name in column_names:
                    setattr(row, name, getattr(self, "_" + name))
            else:
                # New article: assign a fresh UUID and build its row
                self._uuid = str(uuid.uuid1())
                row = ArticleRow(
                    id=self._uuid,
                    **{name: getattr(self, "_" + name) for name in column_names}
                )
                # Remove any rows already stored under the same URL
                # before adding the new one
                stale_rows = ArticleRow.table().delete().where(
                    ArticleRow.url == self._url
                )
                session.execute(stale_rows)
                session.add(row)
            # Store the word stems occurring in the article (on update,
            # this may delete all stems for the article if it has no
            # successfully parsed sentences)
            self._store_words(session)
            # Offload the new data from Python to PostgreSQL
            session.flush()
            return True