Ejemplo n.º 1
0
def dump_tokens(limit):
    """ Print the tokens of parsed articles together with their matched terminals.

        Articles that have a stored tree are visited in timestamp order.
        If limit is None all of them are dumped, otherwise only the first
        limit articles. For every WORD token the stem returned by the
        terminal's descriptor is printed next to the word. """

    # One TerminalDescriptor per distinct terminal, created on first use
    descriptors = dict()
    with closing(BIN_Db.get_db()) as db, SessionContext(commit = True) as session:
        query = session.query(Article) \
            .filter(Article.tree != None) \
            .order_by(Article.timestamp)
        articles = query.all() if limit is None else query[0:limit]
        for article in articles:
            print("\nARTICLE\nHeading: '{0.heading}'\nURL: {0.url}\nTimestamp: {0.timestamp}".format(article))
            token_tree = TreeTokenList()
            token_tree.load(article.tree)
            for sent_no, tokens in token_tree.sentences():
                print("\nSentence {0}:".format(sent_no))
                # Stays True until the first WORD token of the sentence has been stemmed
                first_word = True
                for tok in tokens:
                    if tok.tokentype != "WORD":
                        # Non-word tokens are printed with their category instead of a stem
                        print("    {0.token} {0.cat} {0.terminal}".format(tok))
                        continue
                    # Drop the first and last character of the token text
                    # (presumably enclosing quotes - TODO confirm)
                    word = tok.token[1:-1]
                    terminal = tok.terminal
                    descriptor = descriptors.get(terminal)
                    if descriptor is None:
                        descriptor = descriptors[terminal] = TerminalDescriptor(terminal)
                    stem = descriptor.stem(db, word, first_word)
                    first_word = False
                    print("    {0} {1} {2}".format(word, stem, terminal))
Ejemplo n.º 2
0
Archivo: main.py Proyecto: busla/Reynir
def top_persons(limit=_TOP_PERSONS_LENGTH):
    """ Return a list of names and titles appearing recently in the news.

        The result is a list of dicts with the keys name, title, gender,
        url and uuid, sorted by name using the collation function
        provided by changedlocale(). At most limit names are returned. """
    # NOTE(review): this handle is never explicitly closed in this function -
    # confirm whether it needs to be released
    bindb = BIN_Db.get_db()
    # Maps each name to a (title, article_url, article_id, gender) tuple
    latest = dict()

    with SessionContext(commit=True) as session:

        # Newest articles first; go through up to 2 * N records
        rows = session.query(Person.name, Person.title, Person.article_url, Article.id) \
            .join(Article).join(Root) \
            .filter(Root.visible) \
            .order_by(desc(Article.timestamp))[0:limit * 2]

        for row in rows:
            known = latest.get(row.name)
            if known is not None and len(row.title) <= len(known[0]):
                # The name is already listed with a title that is at least as long
                continue
            latest[row.name] = (correct_spaces(row.title), row.article_url,
                                row.id, bindb.lookup_name_gender(row.name))
            if len(latest) >= limit:
                # We have as many names as were asked for: stop looking
                break

    with changedlocale() as strxfrm:
        # Turn the dictionary into a list of dicts, sorted by name
        persons = [
            dict(name=name, title=title, gender=gender, url=url, uuid=uuid)
            for name, (title, url, uuid, gender) in latest.items()
        ]
        persons.sort(key=lambda person: strxfrm(person["name"]))
        return persons
Ejemplo n.º 3
0
    def execute(self):
        """ Run the query held in the previously parsed tree.

            Returns False, after registering E_QUERY_NOT_PARSED through
            set_error(), if no tree has been parsed. Otherwise the tree is
            processed and True is returned if and only if self._error is
            still None afterwards. """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False

        # Clear the outcome of any previous execution
        self._error = self._qtype = None

        with closing(BIN_Db.get_db()) as db:
            # The tree has only one sentence; process it with this
            # module acting as the processor
            self._tree.process(self._session, _THIS_MODULE, db, query=self)

        succeeded = self._error is None
        return succeeded
Ejemplo n.º 4
0
    def go_single(self, url):
        """ Process the single article identified by url.

            Meant to be called by a process within a multiprocessing pool.
            If the article has a stored tree, every processor in
            self.processors is run on it. The article is then stamped as
            processed and the transaction is committed. If anything raises,
            the transaction is rolled back, a message is printed and the
            exception is re-raised. """

        print("Processing article {0}".format(url))
        sys.stdout.flush()

        with closing(self._db.session) as session:

            try:

                article = session.query(Article).filter_by(url=url).one_or_none()

                if article is not None:
                    if article.tree:
                        tree = Tree(url, article.authority)
                        tree.load(article.tree)

                        # All processors share a single BIN database handle
                        with closing(BIN_Db.get_db()) as bin_db:
                            for processor in self.processors:
                                tree.process(session, processor, bin_db)

                    # The timestamp is set even if the article has no tree
                    article.processed = datetime.utcnow()
                else:
                    print("Article not found in scraper database")

                # Nothing above raised: make the changes permanent
                session.commit()

            except Exception as exc:
                # Undo any partial changes before passing the exception on
                session.rollback()
                print(
                    "Exception in article {0}, transaction rolled back\nException: {1}"
                    .format(url, exc))
                raise

        sys.stdout.flush()
Ejemplo n.º 5
0
    def execute(self):
        """ Run the query held in the previously parsed tree.

            Returns False, after registering E_QUERY_NOT_PARSED through
            set_error(), if no tree has been parsed. Otherwise the sentence
            stored under index 1 of the tree is processed and True is
            returned if and only if self._error is still None afterwards. """
        if self._tree is None:
            self.set_error("E_QUERY_NOT_PARSED")
            return False

        # Clear the outcome of any previous execution
        self._error = self._qtype = None

        with closing(BIN_Db.get_db()) as db:

            # State handed to the sentence processor
            state = dict(
                session=self._session,
                processor=_THIS_MODULE,
                bin_db=db,
                query=self,
            )
            # Process the first and only sentence within the tree
            self._tree.process_sentence(state, self._tree[1])

        succeeded = self._error is None
        return succeeded
Ejemplo n.º 6
0
    def process(self, session, processor):
        """ Process the tree of an entire article.

            The optional processor hooks article_begin(state) and
            article_end(state) are called before and after the sentences.
            Every sentence tree in self.s is passed to process_sentence()
            together with a state dict that is shared between sentences. """

        # Look up the optional article-level hooks, if there is a processor at all
        if processor:
            article_begin = getattr(processor, "article_begin", None)
            article_end = getattr(processor, "article_end", None)
        else:
            article_begin = article_end = None

        with closing(BIN_Db.get_db()) as bin_db:

            # Running state that is kept between sentences
            state = dict(
                session=session,
                processor=processor,
                bin_db=bin_db,
                url=self.url,
                authority=self.authority,
            )
            if article_begin is not None:
                article_begin(state)
            # Process the (parsed) sentences in the article; the index is not needed
            for _index, sentence_tree in self.s.items():
                self.process_sentence(state, sentence_tree)
            if article_end is not None:
                article_end(state)
Ejemplo n.º 7
0
    def add_composite(stofn, ordfl):
        """ Register composite word forms made by prefixing existing BIN word forms.

            stofn has the form 'prefix-stem'. Every form found for the stem
            whose ordfl equals the given ordfl is added, with the prefix
            prepended to its stofn and ordmynd, to Meanings.DICT and
            Meanings.ROOT. Raises ConfigError unless stofn splits into
            exactly two parts on the hyphen. Called from the config file
            handler. """

        # Imported here rather than at module level
        # (presumably to avoid a circular import - TODO confirm)
        from bindb import BIN_Db

        assert stofn is not None
        assert ordfl is not None
        parts = stofn.split("-")
        if len(parts) != 2:
            raise ConfigError(
                "Composite word meaning must contain a single hyphen")
        prefix, stem = parts
        with closing(BIN_Db.get_db()) as db:
            # A falsy lookup result (None or empty) means there is nothing to add
            for form in db._forms(stem) or ():
                if form.ordfl != ordfl:
                    continue
                composite = (prefix + form.stofn, 0, ordfl, form.fl,
                             prefix + form.ordmynd, form.beyging)
                # Assumes missing keys yield a list (e.g. defaultdict) - TODO confirm
                Meanings.DICT[prefix + form.ordmynd].append(composite)
                Meanings.ROOT[prefix + form.stofn].append(composite)
Ejemplo n.º 8
0
    This module is written in Python 3

"""

from settings import Settings, ConfigError
from scraperdb import SessionContext, Person
from bindb import BIN_Db

try:
    # Read configuration file
    Settings.read("config/Reynir.conf")
except ConfigError as e:
    print("Configuration error: {0}".format(e))
    # Terminate with a failure status. The quit() builtin is an interactive
    # helper installed by the site module; it is not meant for programs
    # and exits with status 0 when called without an argument.
    raise SystemExit(1)

# Both the database session and the BIN handle are released when the block exits
with SessionContext(commit=True) as session, BIN_Db.get_db() as bdb:

    # Iterate through the persons whose gender is missing (NULL) or
    # recorded as 'hk', ordered by name and fetched in batches of 200
    q = session.query(Person) \
        .filter((Person.gender == None) | (Person.gender == 'hk')) \
        .order_by(Person.name) \
        .yield_per(200)

    # Last name that was printed; rows are ordered by name, so comparing
    # against it prints each distinct name only once
    lastname = ""

    for p in q:

        # Every row is updated, also when the name repeats
        p.gender = bdb.lookup_name_gender(p.name)
        if p.name != lastname:
            print("{0} {1}".format(p.gender, p.name))
            lastname = p.name
Ejemplo n.º 9
0
            total_tags - missing_tag_tnt,
            100.0 * (total_tags - missing_tag_tnt) / total_tags))
        print("Correct tags: {0:8} {1:6.2f}%".format(
            correct_tag_tnt, 100.0 * correct_tag_tnt / total_tags))
        print("Partial tags: {0:8} {1:6.2f}%".format(
            partial_tag_tnt + correct_tag_tnt,
            100.0 * (partial_tag_tnt + correct_tag_tnt) / total_tags))
        print("Partial prec: {0:8} {1:6.2f}%".format(
            "", 100.0 * (partial_tag_tnt + correct_tag_tnt) /
            (total_tags - missing_tag_tnt)))
        print("Precision:    {0:8} {1:6.2f}%".format(
            "", 100.0 * correct_tag_tnt / (total_tags - missing_tag_tnt)))
        print("\n-----------------------------------\n")


if __name__ == "__main__":

    try:
        # Read configuration file; the path components are passed separately
        # so that os.path.join inserts the platform's separator
        Settings.read(os.path.join(basepath, "config", "Reynir.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        # Terminate with a failure status. The quit() builtin is an interactive
        # helper installed by the site module; it is not meant for programs
        # and exits with status 0 when called without an argument.
        raise SystemExit(1)

    # This is always run as a main program
    try:
        with timeit("test_tagger()"):
            test_tagger()
    finally:
        # Runs whether or not the test raised
        BIN_Db.cleanup()
Ejemplo n.º 10
0
            sentence_stream = Article.sentence_stream(limit = TRAINING_SET, skip = TEST_SET)
            word_tag_stream = IFD_Tagset.word_tag_stream(sentence_stream)
            tnt_tagger.train(word_tag_stream)
    with timeit(f"Train TnT tagger on IFD training set"):
        # Get a sentence stream from parsed articles
        # Number of sentences, size of training set
        sample_ratio = 50
        word_tag_stream = IFD_Corpus().word_tag_stream(skip = lambda n: n % sample_ratio == 0)
        tnt_tagger.train(word_tag_stream)
    with timeit(f"Store TnT model trained on {tnt_tagger.count} sentences"):
        tnt_tagger.store(_TNT_MODEL_FILE)


if __name__ == "__main__":

    print("Welcome to the Greynir POS tagging trainer\n")

    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "Reynir.conf"))
    except ConfigError as e:
        print("Configuration error: {0}".format(e))
        # Terminate with a failure status. The quit() builtin is an interactive
        # helper installed by the site module; it is not meant for programs
        # and exits with status 0 when called without an argument.
        raise SystemExit(1)

    # This is always run as a main program
    try:
        with timeit("Training session"):
            train_tagger()
    finally:
        # Runs whether or not the training raised
        BIN_Db.cleanup()
Ejemplo n.º 11
0
"""

from settings import Settings, ConfigError
from scraperdb import SessionContext, Person
from bindb import BIN_Db

try:
    # Read configuration file
    Settings.read("config/Reynir.conf")
except ConfigError as e:
    print("Configuration error: {0}".format(e))
    # Terminate with a failure status. The quit() builtin is an interactive
    # helper installed by the site module; it is not meant for programs
    # and exits with status 0 when called without an argument.
    raise SystemExit(1)

with SessionContext(commit=True) as session:

    # NOTE(review): this handle is never explicitly released in this script -
    # confirm whether it should be closed or used as a context manager
    bdb = BIN_Db.get_db()

    # Iterate through the persons whose gender is missing (NULL) or
    # recorded as 'hk', ordered by name and fetched in batches of 200
    q = session.query(Person) \
        .filter((Person.gender == None) | (Person.gender == 'hk')) \
        .order_by(Person.name) \
        .yield_per(200)

    # Last name that was printed; rows are ordered by name, so comparing
    # against it prints each distinct name only once
    lastname = ""

    for p in q:

        # Every row is updated, also when the name repeats
        p.gender = bdb.lookup_name_gender(p.name)
        if p.name != lastname:
            print("{0} {1}".format(p.gender, p.name))
            lastname = p.name
Ejemplo n.º 12
0
"""

from settings import Settings, ConfigError
from db import SessionContext
from db.models import Person
from bindb import BIN_Db

try:
    # Read configuration file
    Settings.read("config/Reynir.conf")
except ConfigError as e:
    print("Configuration error: {0}".format(e))
    # Terminate with a failure status. The quit() builtin is an interactive
    # helper installed by the site module; it is not meant for programs
    # and exits with status 0 when called without an argument.
    raise SystemExit(1)

# Both the database session and the BIN handle are released when the block exits
with SessionContext(commit = True) as session, BIN_Db.get_db() as bdb:

    # Iterate through the persons whose gender is missing (NULL) or
    # recorded as 'hk', ordered by name and fetched in batches of 200
    q = session.query(Person) \
        .filter((Person.gender == None) | (Person.gender == 'hk')) \
        .order_by(Person.name) \
        .yield_per(200)

    # Last name that was printed; rows are ordered by name, so comparing
    # against it prints each distinct name only once
    lastname = ""

    for p in q:

        # Every row is updated, also when the name repeats
        p.gender = bdb.lookup_name_gender(p.name)
        if p.name != lastname:
            print("{0} {1}".format(p.gender, p.name))
            lastname = p.name