Example #1
0
def create_unigram_count_db(lang,
                            langmethod=lambda x: x,
                            db="sqlite:///:memory:"):
    """Count unigrams of one language column and store them in a table.

    Every row of the sentence table is read, the text of the selected
    language is passed through ``langmethod`` and split on whitespace, and
    the number of occurrences of each token is written to a freshly
    (re)created table named ``lang<lang>unigram``.

    Args:
        lang: Which column to read; 1 selects ``lang1``, 2 selects ``lang2``.
        langmethod: Callable applied to the raw text before it is split.
        db: Database URL handed to ``create_engine``.

    Raises:
        ValueError: If ``lang`` is neither 1 nor 2.
    """
    # Reject an unsupported language before touching the database; the
    # original code only failed later with an UnboundLocalError.
    if lang not in (1, 2):
        raise ValueError("lang must be 1 or 2, got {!r}".format(lang))

    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # unigram table
        tablename = 'lang{}unigram'.format(lang)
        Sentence = Tables().get_sentence_table()
        Unigram = Tables().get_unigram_table(tablename)
        # Recreate the table so counts from a previous run do not linger.
        Unigram.__table__.drop(engine, checkfirst=True)
        Unigram.__table__.create(engine)

        ngram_dic = collections.defaultdict(int)
        for item in session.query(Sentence):
            text = item.lang1 if lang == 1 else item.lang2
            for tpl in ngram(langmethod(text).split(), 1):
                ngram_dic[tpl] += 1

        # insert items (keys are unpacked as 1-tuples)
        for (first, ), count in ngram_dic.items():
            print(u"inserting {}: {}".format(first, count))
            session.add(Unigram(first=first, count=count))
        session.commit()
    finally:
        # Always release the connection, even when counting/inserting fails.
        session.close()
Example #2
0
def create_unigram_count_db(lang, langmethod=lambda x: x, db="sqlite:///:memory:"):
    """Build the per-language unigram count table from the sentence table.

    For each stored sentence the column chosen by ``lang`` is transformed
    with ``langmethod``, split on whitespace and counted token by token.
    The table ``lang<lang>unigram`` is dropped if present, created again,
    and filled with one row per distinct token.

    Args:
        lang: 1 to read ``lang1``, 2 to read ``lang2``.
        langmethod: Callable applied to the raw text before it is split.
        db: Database URL handed to ``create_engine``.

    Raises:
        ValueError: If ``lang`` is neither 1 nor 2.
    """
    # Fail fast on an unsupported language instead of hitting an
    # UnboundLocalError on the first sentence.
    if lang not in (1, 2):
        raise ValueError("lang must be 1 or 2, got {!r}".format(lang))

    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()
    try:
        # unigram table
        tablename = "lang{}unigram".format(lang)
        Sentence = Tables().get_sentence_table()
        Unigram = Tables().get_unigram_table(tablename)
        # Recreate the table so stale counts never survive a rerun.
        Unigram.__table__.drop(engine, checkfirst=True)
        Unigram.__table__.create(engine)

        ngram_dic = collections.defaultdict(int)
        for item in session.query(Sentence):
            raw = item.lang1 if lang == 1 else item.lang2
            tokens = langmethod(raw).split()
            for tpl in ngram(tokens, 1):
                ngram_dic[tpl] += 1

        # insert items (keys are unpacked as 1-tuples)
        for (first,), count in ngram_dic.items():
            print(u"inserting {}: {}".format(first, count))
            session.add(Unigram(first=first, count=count))
        session.commit()
    finally:
        # Release the connection whether or not the work succeeded.
        session.close()
Example #3
0
    def test_ngram_3(self):
        """Trigrams of padded sentences equal the hand-written expectations."""
        raw_texts = ["I am teacher", "I am", "I", ""]
        expected_per_text = [
            [
                ("</s>", "<s>", "I"),
                ("<s>", "I", "am"),
                ("I", "am", "teacher"),
                ("am", "teacher", "</s>"),
            ],
            [("</s>", "<s>", "I"), ("<s>", "I", "am"), ("I", "am", "</s>")],
            [("</s>", "<s>", "I"), ("<s>", "I", "</s>")],
            [("</s>", "<s>", "</s>")],
        ]

        for idx, raw in enumerate(raw_texts):
            # Pad with two leading markers and one trailing marker.
            padded = ["</s>", "<s>"] + raw.split() + ["</s>"]
            trigrams = list(ngram(padded, 3))
            self.assertEqual(trigrams, expected_per_text[idx])
Example #4
0
def _create_ngram_count_db(lang, langmethod=lambda x: x, n=3, db="sqilte:///:memory:"):
    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()

    Sentence = Tables().get_sentence_table()
    query = session.query(Sentence)

    ngram_dic = collections.defaultdict(float)
    for item in query:
        if lang == 1:
            sentences = langmethod(item.lang1).split()
        elif lang == 2:
            sentences = langmethod(item.lang2).split()
        sentences = ["</s>", "<s>"] + sentences + ["</s>"]
        ngrams = ngram(sentences, n)
        for tpl in ngrams:
            ngram_dic[tpl] += 1

    return ngram_dic
Example #5
0
    def test_ngram_3(self):
        """Check ``ngram(..., 3)`` against known trigram lists."""
        # Each case pairs an input text with the trigrams expected once the
        # text has been split and wrapped in the sentence markers.
        cases = [
            ("I am teacher", [("</s>", "<s>", "I"),
                              ("<s>", "I", "am"),
                              ("I", "am", "teacher"),
                              ("am", "teacher", "</s>")]),
            ("I am", [("</s>", "<s>", "I"),
                      ("<s>", "I", "am"),
                      ("I", "am", "</s>")]),
            ("I", [("</s>", "<s>", "I"),
                   ("<s>", "I", "</s>")]),
            ("", [("</s>", "<s>", "</s>")]),
        ]

        for text, expected in cases:
            tokens = ["</s>", "<s>"]
            tokens = tokens + text.split()
            tokens = tokens + ["</s>"]
            produced = ngram(tokens, 3)
            self.assertEqual(list(produced), expected)
Example #6
0
def _create_ngram_count_db(lang,
                           langmethod=lambda x: x,
                           n=3,
                           db="sqilte:///:memory:"):
    engine = create_engine(db)
    # create session
    Session = sessionmaker(bind=engine)
    session = Session()

    Sentence = Tables().get_sentence_table()
    query = session.query(Sentence)

    ngram_dic = collections.defaultdict(float)
    for item in query:
        if lang == 1:
            sentences = langmethod(item.lang1).split()
        elif lang == 2:
            sentences = langmethod(item.lang2).split()
        sentences = ["</s>", "<s>"] + sentences + ["</s>"]
        ngrams = ngram(sentences, n)
        for tpl in ngrams:
            ngram_dic[tpl] += 1

    return ngram_dic