Code Example #1
File: test_cleantext.py Project: lefnire/ml-tools
def test_md_split_1():
    doc = articles()[0]
    paras = CleanText(doc) \
        .markdown_split_paragraphs() \
        .value()
    assert len(paras) > 1
    print(paras)
Code Example #2
File: test_cleantext.py Project: lefnire/ml-tools
def test_md_split_all():
    docs = articles()
    paras = CleanText(docs)\
        .markdown_split_paragraphs()\
        .value()
    assert len(paras) > 0
    assert len(docs) < len(paras)
    print(paras)
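Note: both tests above rely on markdown_split_paragraphs() flattening every input document into its individual paragraphs, which is why test_md_split_all expects more paragraphs than documents. A minimal sketch of that kind of split, assuming paragraphs are delimited by blank lines (split_paragraphs is a hypothetical stand-in, not the library function):

import re

def split_paragraphs(docs):
    # Split each markdown document on runs of blank lines and flatten the result.
    paras = []
    for doc in docs:
        for p in re.split(r'\n\s*\n', doc):
            if p.strip():
                paras.append(p.strip())
    return paras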
Code Example #3
File: fixtures.py Project: marait123/gnothi
def gen_entries(self):
    try:
        # This generates article fixtures on the GPU container, which then become
        # available in /storage to the server container. So run tests on GPU first,
        # then on server. TODO decouple this!
        from ml_tools.fixtures import articles
    except ImportError:
        raise Exception("Can't generate entries from server container, must do from GPU container first.")
    entries = articles(group_by='paragraph')
    entries = Box({
        k: dict(text=v, paras=v.split('\n\n'))
        for k, v in entries.items()
    })
    self.save("entries", entries)
    return entries
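Box (from python-box) wraps the dict so entries support attribute-style access as well as normal key access, which is how later examples can write res.vr_0. A quick illustration:

from box import Box

entries = Box({'vr_0': dict(text='a\n\nb', paras=['a', 'b'])})
assert entries.vr_0.text == entries['vr_0']['text']
assert entries.vr_0.paras == ['a', 'b']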
Code Example #4
File: test_cleantext.py Project: lefnire/ml-tools
def test_normalize(fmt, coverage, mode):
    chain = CleanText(articles(fmt=fmt))
    if coverage == "basic":
        chain = chain.keywords(mode=mode)
    else:
        # Revisit this list as cleantext.py grows
        chain = chain\
            .unmark()\
            .strip_html()\
            .normalize_numbers()\
            .fix_punct()\
            .only_english()\
            .only_ascii()\
            .remove_apos()\
            .multiple_whitespace()\
            .keywords(mode=mode)
    clean = chain.join().value()
    assert len(chain.data.lemmas) > 10
    print(chain.data.lemmas[:5])
    assert len(clean) > 10
    print(clean[0])
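test_normalize takes fmt, coverage, and mode as arguments, so it is presumably driven by pytest parametrization. A plausible decorator stack is sketched below; only "basic" appears in the source, and the other parameter values are assumptions:

import pytest

@pytest.mark.parametrize("fmt", ["md", "txt"])           # assumed values
@pytest.mark.parametrize("coverage", ["basic", "full"])  # "basic" is from the source
@pytest.mark.parametrize("mode", ["fast", "full"])       # assumed values
def test_normalize(fmt, coverage, mode):
    ...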
Code Example #5
File: test_ae.py Project: lefnire/ml-tools
from ml_tools import Similars
from ml_tools.fixtures import articles
import numpy as np

corpus = articles()


def test_ae():
    chain = Similars(corpus).embed()
    vecs = chain.value()

    orig_cosines = chain.normalize().cosine().value()
    orig_cosines = np.argsort(orig_cosines, axis=1)

    dims = [400, 20]
    reduced = chain.autoencode(dims=dims).value()
    assert vecs.shape[0] == reduced.shape[0]
    assert reduced.shape[1] == dims[-1]

    # TODO do some comparison between original cosines & new cosines
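    # One hypothetical way to do that comparison, with plain numpy. Assumes
    # chain.cosine() returned distances, so ascending argsort puts nearest
    # neighbors first; the neighborhood size k is arbitrary.
    unit = reduced / np.linalg.norm(reduced, axis=1, keepdims=True)
    new_cosines = np.argsort(1 - unit @ unit.T, axis=1)
    k = 5
    overlap = np.mean([
        len(set(orig_cosines[i, :k]) & set(new_cosines[i, :k])) / k
        for i in range(len(orig_cosines))
    ])
    print(f"top-{k} neighbor overlap after autoencoding: {overlap:.2f}")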
Code Example #6
def test_group_none():
    res = articles()
    assert len(res) > 10
    assert type(res[0]) == str
    print(res[0])
Code Example #7
def test_group_paragraph():
    res = articles(group_by='paragraph')
    assert len(res.keys()) > 10
    assert type(res.vr_0) == str
Code Example #8
def test_group_article():
    res = articles(group_by='article')
    assert len(res.vr) > 10
    assert type(res.vr[0]) == str
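Taken together, the three tests above pin down the fixture's return shapes. Roughly (the vr keys come from the tests themselves; the placeholder contents are illustrative):

from ml_tools.fixtures import articles

flat = articles()                          # list of full article strings
by_para = articles(group_by='paragraph')   # Box: {'vr_0': '<paragraph>', 'vr_1': ..., ...}
by_art = articles(group_by='article')      # Box: {'vr': ['<para 0>', '<para 1>', ...], ...}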
Code Example #9
from box import Box
from ml_tools import CosineEstimator, Similars
from ml_tools.fixtures import articles
import numpy as np
import pandas as pd
import optuna

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--jobs', type=int, help='Number of threads', default=1)
parser.add_argument('--init',
                    action='store_true',
                    help='initialize starter trials')
args_p = parser.parse_args()

lhs = articles()
lhs = Similars(lhs).embed().cluster(algo='agglomorative').value()

rhs = np.load('/storage/libgen/testing.npy')  #, mmap_mode='r')
books = pd.read_feather('/storage/libgen/testing.df')

# don't use cook(.?book)?, it's matched by too many programming books
food_re = "gluten.?free|vegan|vegetarian"
# these should be really specific (think about edge-cases)
votes = Box(
    mine_up=r"(tensorflow|keras)",
    other_up=rf"({food_re}|republican)",
    mine_down=rf"({food_re})",
    other_down=r"(artificial|\bai\b|python|java|css|html|cbt|cognitive.?behav)"
)
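Downstream, these regexes presumably get matched against the book metadata to build up/down vote masks for CosineEstimator. A hypothetical sketch of that step; the column name and the mask construction are assumptions, not taken from the repo:

# Hypothetical: turn each vote regex into a boolean mask over the books dataframe.
txt = books.title.str.lower()  # assumed column; could also combine title + description
masks = Box({k: txt.str.contains(v, regex=True, na=False) for k, v in votes.items()})
print({k: int(m.sum()) for k, m in masks.items()})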