def loadReviews():
    """Populate the global documentbase/functionCollection from imdb62.txt.

    Each tab-separated line yields one document built from column 5 (the
    text) and column 1 (presumably the author/label id — confirm against
    the imdb62 format). Duplicate documents are stripped from the base.
    """
    global documentbase, functionCollection
    docs = []
    with open("imdb62.txt") as handle:
        for raw in handle:
            fields = raw.split('\t')
            docs.append(features.document(fields[5], fields[1]))
    documentbase = features.documentbase(docs).strippedDuplicates()
    functionCollection = features.documentFunctionCollection()
    documentbase.functionCollection = functionCollection
# Beispiel #2 (example separator left over from the original paste; the
# function it introduced is truncated below — its `def` header is missing)
            docs, [features.stanfordTreeDocumentFunction])
        functionCollection.getValues(docs, features.tokensDocumentFunction)
        functionCollection.getValues(docs, features.posDocumentFunction)
        functionCollection.getValues(docs, features.stDocumentDocumentFunction)
        for doc in docs:
            functionCollection.forgetDocument(doc)
    print("prepared %d documents" % len(documentbase.documents))


if __name__ == '__main__':
    import sys

    # Usage: prog <stanford_db> <tokens_db> <pos_db> <c_syntax_tree_db> <document files...>
    if len(sys.argv) < 6:
        print("usage: see ", sys.argv[0])
        sys.exit(1)
    stanford_db = sys.argv[1]
    tokens_db = sys.argv[2]
    pos_db = sys.argv[3]
    c_syntax_tree_db = sys.argv[4]
    documents = sys.argv[5:]
    functionCollection = features.documentFunctionCollection()

    def readfile(filename):
        """Return the full text content of *filename*."""
        with open(filename, 'rt') as f:
            return f.read()

    # BUG FIX: the original comprehension iterated `for f in documents` while
    # calling `readfile(d)` — `d` was undefined, raising a NameError at
    # runtime. The loop variable now matches the name actually used.
    documentbase = features.documentbase(
        [features.document(readfile(d)) for d in documents])
    documentbase.functionCollection = functionCollection
    prepareDocuments(stanford_db, tokens_db, pos_db, c_syntax_tree_db,
                     documentbase)
# Creates a KIM model for the PAN11 corpus, recovering from a previous run in
# which sklearn killed the process after the mining step.
import features
import tira
from c_syntax_tree import syntax_tree
import svm
import math
def esyntax_tree(label, children):
	"""Build a syntax_tree for *label*/*children* and mark it extendable."""
	tree = syntax_tree(label, children)
	tree.setExtendable()
	return tree
# PEP 8 (E731): named lambdas replaced with equivalent `def`s; the public
# names `est`, `st` and `recovered_trees` are unchanged.
def est(d):
	"""Shorthand for an extendable leaf tree with label *d*."""
	return esyntax_tree(d, [])

def st(d):
	"""Shorthand for a plain (non-extendable) leaf tree with label *d*."""
	return syntax_tree(d, [])

# Trees recovered by hand from the previous (killed) mining run.
recovered_trees = [
	est(21), est(65), est(32),
	esyntax_tree(74, [st(13)]),
	esyntax_tree(74, [syntax_tree(13, [est(40)])]),
	esyntax_tree(74, [syntax_tree(13, [st(40)])]),
	est(46), est(28), est(66), est(33), est(10), est(38), est(1), est(29),
	esyntax_tree(13, [st(40)]),
	est(13), est(53),
]
# Connect to the TIRA evaluation framework for the PAN11 large test dataset
# and prepare a working directory for intermediate results. The statement
# order matters: the working directory must exist before the corpus is loaded.
tiraInterface = tira.tiraInterface('pan11-authorship-attribution-test-dataset-large-2015-10-20','none','/tmp/output',features.documentFunctionCollection())
tiraInterface.prepareWorkingDirectory()
training_dataset,unknown_dataset=tiraInterface.loadCorpus()
with tiraInterface:
	# Feature vector built from the frequencies of the hand-recovered trees.
	treeFeature=tiraInterface.functionCollection.getFunction(features.syntaxTreeFrequencyFeature,tuple(recovered_trees))
	treeFeature.moveToMemory(training_dataset.documents)
	# NOTE(review): debugging aid kept for reference — printed documents whose
	# feature vector contained NaN values.
#	values=treeFeature.getValuev(training_dataset.documents)
#	for i,v in enumerate(values):
#		if any(math.isnan(x) for x in v):
#			print("this document: '%s' has vector %s" % (training_dataset.documents[i].text, repr(v)))
	# Train an SVM-backed document classifier on the tree-frequency feature.
	classifier=features.documentClassifier(training_dataset, treeFeature, svm.SVM)
	# Model serialization intentionally disabled; re-enable to persist the model.
	#with open(tiraInterface.model_kim,'wb') as f:
	#	f.write(classifier.dumps())
	#print("written model.")