Code example #1

import html
import zipfile

from sklearn.model_selection import train_test_split

from ariadne.util import setup_logging

# download_file, write_sentence_documents and the PATH_DATASETS_SLS*
# constants are helpers defined elsewhere in the project.


def main():
    setup_logging()
    PATH_DATASETS_SLS.mkdir(exist_ok=True, parents=True)
    download_file(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip",
        PATH_DATASETS_SLS_ZIP,
    )

    sentences = []
    labels = []
    with zipfile.ZipFile(PATH_DATASETS_SLS_ZIP) as myzip:
        with myzip.open("sentiment labelled sentences/imdb_labelled.txt") as f:
            for line in f:
                line = line.decode("utf-8")
                text, label = line.strip().split("\t")
                text = html.unescape(text).strip()
                label = "positive" if label == "1" else "negative"
                sentences.append(text)
                labels.append(label)

    X_train, X_test, y_train, y_test = train_test_split(sentences,
                                                        labels,
                                                        test_size=0.2)

    write_sentence_documents(X_train, y_train,
                             PATH_DATASETS_SLS / "sls_imdb_labeled.xmi")
    write_sentence_documents(X_test,
                             y_test,
                             PATH_DATASETS_SLS / "sls_imdb_unlabeled.xmi",
                             labeled=False)


if __name__ == "__main__":
    main()
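
download_file is a project helper that is not shown in the listing. A minimal
sketch of what it plausibly does, assuming it skips the download when the
target already exists and streams the response to disk with requests (the
names and behavior here are assumptions, not the project's actual code):

import logging
from pathlib import Path

import requests


def download_file(url: str, target: Path):
    # Skip the download if the archive is already on disk.
    if target.exists():
        logging.info("File already exists, skipping download: %s", target)
        return
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(target, "wb") as f:
        # Stream in chunks so large archives never have to fit in memory.
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)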

Code example #2

import random
import tarfile

from ariadne.util import setup_logging

# download_file, read_data, write_sentence_documents and the PATH_DATASETS*
# constants are helpers defined elsewhere in the project.


def main():
    setup_logging()
    PATH_DATASETS.mkdir(exist_ok=True, parents=True)
    download_file(
        "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        PATH_DATASETS_IMDB)

    if not PATH_DATASETS_IMDB_EXTRACTED.exists():
        with tarfile.open(PATH_DATASETS_IMDB) as mytar:
            mytar.extractall(PATH_DATASETS_IMDB_EXTRACTED)

    positive = [(p, "positive")
                for p in (PATH_DATASETS_IMDB_TRAIN / "pos").iterdir()]
    negative = [(p, "negative")
                for p in (PATH_DATASETS_IMDB_TRAIN / "neg").iterdir()]
    unsup = [(p, "unsup")
             for p in (PATH_DATASETS_IMDB_TRAIN / "unsup").iterdir()]
    unsup = random.sample(unsup, 100)

    docs = random.sample(positive, 200) + random.sample(negative, 200)

    # A single shuffle already produces a uniformly random order.
    random.shuffle(docs)

    sentences_per_doc = 200
    for idx, i in enumerate(range(0, len(docs), sentences_per_doc)):
        chunk = docs[i:i + sentences_per_doc]
        sentences, labels = read_data(chunk)

        doc_name = PATH_DATASETS_IMDB_EXTRACTED / f"imdb_{idx}_labeled.xmi"
        write_sentence_documents(sentences, labels, doc_name)

    sentences, labels = read_data(unsup)
    write_sentence_documents(sentences,
                             labels,
                             PATH_DATASETS_IMDB_EXTRACTED /
                             "imdb_unlabeled.xmi",
                             labeled=False)


if __name__ == "__main__":
    main()
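
read_data is likewise a helper defined elsewhere. Given the (path, label)
pairs built above, a plausible sketch (an assumption, not the actual helper)
reads each review file, strips the <br /> markup that IMDB reviews contain,
and returns parallel lists of texts and labels:

import html


def read_data(items):
    texts, labels = [], []
    for path, label in items:
        text = path.read_text(encoding="utf-8")
        # IMDB reviews use <br /> as line breaks and contain HTML entities.
        text = html.unescape(text.replace("<br />", " ")).strip()
        texts.append(text)
        labels.append(label)
    return texts, labels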
Code example #3

from ariadne.contrib.jieba import JiebaSegmenter
from ariadne.contrib.nltk import NltkStemmer
from ariadne.contrib.sbert import SbertSentenceClassifier
from ariadne.contrib.sklearn import SklearnSentenceClassifier, SklearnMentionDetector
# from ariadne.contrib.spacy import SpacyNerClassifier, SpacyPosClassifier  # needed if the spaCy recommenders below are enabled
from ariadne.contrib.stringmatcher import LevenshteinStringMatcher
from ariadne.server import Server
from ariadne.util import setup_logging

setup_logging()

server = Server()
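
# Each recommender is registered under a name; that name becomes the path
# under which the server exposes it (e.g. http://localhost:40022/sbert once
# that recommender is enabled), which is the URL to enter as the remote
# endpoint in INCEpTION's recommender settings.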
# server.add_classifier("spacy_ner", SpacyNerClassifier("en"))
# server.add_classifier("spacy_pos", SpacyPosClassifier("en"))
# server.add_classifier("sklearn_sentence", SklearnSentenceClassifier())
# server.add_classifier("jieba", JiebaSegmenter())
# server.add_classifier("stemmer", NltkStemmer())
# server.add_classifier("leven", LevenshteinStringMatcher())
# server.add_classifier("sbert", SbertSentenceClassifier())

server.start(debug=True, port=40022)
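
All three listings call setup_logging() from ariadne.util before doing any
work. To trace what the scripts do without the full project installed, a
minimal stand-in with the same effect might look like this (an assumption
about the helper's behavior, not the library's actual code):

import logging


def setup_logging():
    # Log INFO and above to the console so downloads and requests are visible.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )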