コード例 #1
0
ファイル: train_model.py プロジェクト: ghosthamlet/wikiclass
import pickle
import sys; sys.path.insert(0, ".")
import csv; csv.field_size_limit(sys.maxsize)

from wikiclass import assessments, languages
from wikiclass.models import RFTextModel
from wikiclass.features import WikitextAndInfonoise

# Build the train and test sets as (text content, assessment class) pairs
# from the tab-separated dataset.  Rows whose class is "A" are skipped; rows
# whose is_test column is "FALSE" go to the train set and every other row
# goes to the test set (which this script collects but does not use).
train_set = []
test_set = []
# The with-block guarantees the dataset file is closed, even if reading a
# row raises.
# NOTE(review): the csv docs recommend opening with newline=""; left as-is
# here so the parsed text stays identical to earlier runs.
with open("datasets/assessed_revisions.with_text.tsv") as input_file:
    for row in csv.DictReader(input_file, delimiter="\t"):
        if row['class'] == "A":
            continue
        if row['is_test'] == "FALSE":
            train_set.append((row['text'], row['class']))
        else:
            test_set.append((row['text'], row['class']))

# Train the model on the train set using the WP10 assessments and the
# English wikitext/infonoise feature extractor.
model = RFTextModel.train(
    train_set,
    assessments=assessments.WP10,
    feature_extractor=WikitextAndInfonoise(languages.get('English'))
)

# Persist the trained model; the with-block makes sure the output file is
# flushed and closed rather than left to interpreter shutdown.
with open("enwiki.rf_text.model", "wb") as model_file:
    model.to_file(model_file)
コード例 #2
0
ファイル: test_model.py プロジェクト: refeed/wikiclass
import pickle
import sys
sys.path.insert(0, ".")
import csv
csv.field_size_limit(sys.maxsize)

from wikiclass import assessments, languages
from wikiclass.models import RFTextModel
from wikiclass.features import WikitextAndInfonoise

# Build the test set as (text content, assessment class) pairs from the
# tab-separated dataset.  Rows whose class is "A" are skipped and only rows
# whose is_test column is "TRUE" are kept.  train_set is never filled here.
train_set = []
test_set = []
# The with-block guarantees the dataset file is closed, even if reading a
# row raises.
# NOTE(review): the csv docs recommend opening with newline=""; left as-is
# here so the parsed text stays identical to earlier runs.
with open("datasets/assessed_revisions.with_text.tsv") as input_file:
    for row in csv.DictReader(input_file, delimiter="\t"):
        if row['class'] == "A":
            continue
        if row['is_test'] == "TRUE":
            test_set.append((row['text'], row['class']))

# Load the previously saved model and close the file handle afterwards.
# NOTE(review): presumably from_file reads the whole file before returning
# and unpickles it -- confirm, and only load model files you trust.
with open("enwiki.rf_text.model", "rb") as model_file:
    model = RFTextModel.from_file(model_file)

# Run the model against the test set and show whatever test() reports.
results = model.test(test_set)

print(results)
コード例 #3
0
import pickle
import sys;sys.path.insert(0, ".")
from pprint import pprint

from wikiclass.models import RFTextModel

# Load the previously saved model; the with-block closes the file handle
# once loading has finished instead of leaking it.
# NOTE(review): presumably from_file reads the whole file before returning
# and unpickles it -- confirm, and only load model files you trust.
with open("enwiki.rf_text.model", "rb") as model_file:
    model = RFTextModel.from_file(model_file)

# Classifies a revision of an article based on wikitext alone
text = "An '''anachronism''' {{cite }}(from the [[Ancient Greek|Greek]] <ref ..."
assessment, probs = model.classify(text)

# Print predicted assessment class and probabilities for all classes.
pprint(("assessment", assessment))
pprint(("probs", probs))
コード例 #4
0
ファイル: train_model.py プロジェクト: refeed/wikiclass
import pickle
import sys
sys.path.insert(0, ".")
import csv
csv.field_size_limit(sys.maxsize)

from wikiclass import assessments, languages
from wikiclass.models import RFTextModel
from wikiclass.features import WikitextAndInfonoise

# Build the train and test sets as (text content, assessment class) pairs
# from the tab-separated dataset.  Rows whose class is "A" are skipped; rows
# whose is_test column is "FALSE" go to the train set and every other row
# goes to the test set (which this script collects but does not use).
train_set = []
test_set = []
# The with-block guarantees the dataset file is closed, even if reading a
# row raises.
# NOTE(review): the csv docs recommend opening with newline=""; left as-is
# here so the parsed text stays identical to earlier runs.
with open("datasets/assessed_revisions.with_text.tsv") as input_file:
    for row in csv.DictReader(input_file, delimiter="\t"):
        if row['class'] == "A":
            continue
        if row['is_test'] == "FALSE":
            train_set.append((row['text'], row['class']))
        else:
            test_set.append((row['text'], row['class']))

# Train the model on the train set using the WP10 assessments and the
# English wikitext/infonoise feature extractor.
model = RFTextModel.train(train_set,
                          assessments=assessments.WP10,
                          feature_extractor=WikitextAndInfonoise(
                              languages.get('English')))

# Persist the trained model; the with-block makes sure the output file is
# flushed and closed rather than left to interpreter shutdown.
with open("enwiki.rf_text.model", "wb") as model_file:
    model.to_file(model_file)