# Prepare the UCI "Sentiment Labelled Sentences" dataset: download the archive,
# parse the IMDB portion, and write labeled/unlabeled XMI documents.
# setup_logging, download_file, write_sentence_documents and the PATH_* constants
# are shared helpers defined elsewhere in this repository.
import html
import zipfile

from sklearn.model_selection import train_test_split


def main():
    setup_logging()
    PATH_DATASETS_SLS.mkdir(exist_ok=True, parents=True)

    download_file(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment labelled sentences.zip",
        PATH_DATASETS_SLS_ZIP,
    )

    sentences = []
    labels = []

    # "imdb_labelled.txt" is the member name inside the UCI archive; the archive
    # contains one such file per source (imdb, amazon_cells, yelp).
    with zipfile.ZipFile(PATH_DATASETS_SLS_ZIP) as myzip:
        with myzip.open("sentiment labelled sentences/imdb_labelled.txt") as f:
            for line in f:
                text, label = line.decode("utf-8").strip().split("\t")
                text = html.unescape(text).strip()
                label = "positive" if label == "1" else "negative"
                sentences.append(text)
                labels.append(label)

    X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2)

    write_sentence_documents(X_train, y_train, PATH_DATASETS_SLS / "sls_imdb_labeled.xmi")
    write_sentence_documents(X_test, y_test, PATH_DATASETS_SLS / "sls_imdb_unlabeled.xmi", labeled=False)


if __name__ == "__main__":
    main()
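# download_file is a repository helper whose implementation is not shown here.
# A minimal sketch of what it is assumed to do (name, signature, and the
# skip-if-exists behavior are assumptions, not the repository's actual code):
# stream the URL to the target path so large archives are not held in memory.
import requests


def download_file_sketch(url, target):
    if target.exists():  # assume repeated runs should not re-download
        return
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(target, "wb") as f:
            for block in response.iter_content(chunk_size=8192):
                f.write(block)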
# Prepare the ACL IMDB movie review corpus: download and extract the tarball,
# sample a balanced labeled subset plus an unlabeled one, and write XMI documents.
import random
import tarfile


def main():
    setup_logging()
    PATH_DATASETS.mkdir(exist_ok=True, parents=True)

    download_file(
        "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        PATH_DATASETS_IMDB,
    )

    if not PATH_DATASETS_IMDB_EXTRACTED.exists():
        with tarfile.open(PATH_DATASETS_IMDB) as mytar:
            mytar.extractall(PATH_DATASETS_IMDB_EXTRACTED)

    positive = [(p, "positive") for p in (PATH_DATASETS_IMDB_TRAIN / "pos").iterdir()]
    negative = [(p, "negative") for p in (PATH_DATASETS_IMDB_TRAIN / "neg").iterdir()]
    unsup = [(p, "unsup") for p in (PATH_DATASETS_IMDB_TRAIN / "unsup").iterdir()]
    unsup = random.sample(unsup, 100)

    # Balanced sample of 200 positive and 200 negative reviews, in random order.
    docs = random.sample(positive, 200) + random.sample(negative, 200)
    random.shuffle(docs)

    # Write the labeled reviews in chunks, one XMI document per chunk.
    docs_per_file = 200
    for idx, start in enumerate(range(0, len(docs), docs_per_file)):
        chunk = docs[start:start + docs_per_file]
        sentences, labels = read_data(chunk)
        doc_name = PATH_DATASETS_IMDB_EXTRACTED / f"imdb_{idx}_labeled.xmi"
        write_sentence_documents(sentences, labels, doc_name)

    sentences, labels = read_data(unsup)
    write_sentence_documents(sentences, labels, PATH_DATASETS_IMDB_EXTRACTED / "imdb_unlabeled.xmi", labeled=False)


if __name__ == "__main__":
    main()
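# read_data is likewise a repository helper. A plausible sketch, assuming it maps
# the (path, label) pairs built above to parallel lists of review texts and
# labels; the actual helper may additionally split reviews into sentences.
import html


def read_data_sketch(items):
    texts, labels = [], []
    for path, label in items:
        text = path.read_text(encoding="utf-8")
        # ACL IMDB reviews embed "<br />" tags and HTML entities; clean lightly.
        text = html.unescape(text).replace("<br />", " ").strip()
        texts.append(text)
        labels.append(label)
    return texts, labels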
# Example server: register the recommenders to expose and start the Flask app.
from ariadne.contrib.jieba import JiebaSegmenter
from ariadne.contrib.nltk import NltkStemmer
from ariadne.contrib.sbert import SbertSentenceClassifier
from ariadne.contrib.sklearn import SklearnSentenceClassifier, SklearnMentionDetector
from ariadne.contrib.stringmatcher import LevenshteinStringMatcher
from ariadne.server import Server
from ariadne.util import setup_logging

setup_logging()

server = Server()

# Uncomment the classifiers to serve; the spaCy ones additionally need
# "from ariadne.contrib.spacy import SpacyNerClassifier, SpacyPosClassifier".
# server.add_classifier("spacy_ner", SpacyNerClassifier("en"))
# server.add_classifier("spacy_pos", SpacyPosClassifier("en"))
# server.add_classifier("sklearn_sentence", SklearnSentenceClassifier())
# server.add_classifier("jieba", JiebaSegmenter())
# server.add_classifier("stemmer", NltkStemmer())
# server.add_classifier("leven", LevenshteinStringMatcher())
# server.add_classifier("sbert", SbertSentenceClassifier())

server.start(debug=True, port=40022)
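# Each classifier registered above is served under its name, e.g.
# http://localhost:40022/sklearn_sentence, which INCEpTION can be pointed at as
# an external recommender. Assuming the external recommender API's /predict and
# /train routes below that base URL (an assumption about the route layout, not
# verified here), a minimal reachability check could look like this:
import requests

# Hypothetical smoke test: an empty body will be rejected, but any HTTP
# response proves the server is up and the route exists.
response = requests.post("http://localhost:40022/sklearn_sentence/predict", json={})
print(response.status_code)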