return sourceLink def readAuthor(self, characters): if "".join(characters[0:5]) != "{\\it ": raise ValueError("an author link must start with \"{\\it \"") i = 4 while i < len(characters) and characters[i] != '}': i += 1 authorLink = zbMathTokenizer.Author("".join(characters[5:i])) del characters[:i+1] return authorLink def readBracedText(self, characters): if characters[0] != '(': raise ValueError("first char must be a opening brace if readBracedText funciton is called") i = 1 while i < len(characters) and characters[i] != ')': i += 1 bracedText = zbMathTokenizer.BracedText("".join(characters[1:i])) del characters[:i+1] return bracedText zbMathTokenizer.stopList = readFileLinewise2Array("/home/simon/Projekte/MIRS/ClassificationFramework/util/stoplist.txt") zbMathTokenizer.wnl = WordNetLemmatizer() """t = zbMathTokenizer() print " ".join(map(lambda x : x.strMapping(), t.tokenize("this is a test (TIAT) $asdfasdf$ {\\it Grubel, Bert} TIAT is cool [zbl asdfasdf 1000.1000]")))"""
# NOTE(review): reformatted from a whitespace-mangled single line. This fragment
# appears to be the interior of a triple-quoted (commented-out) copy of the
# training script that follows it — the next chunk opens with a closing """ —
# so the code tokens are left untouched; confirm against the full file.
# The loop continues reading "id;msc" lines from a file opened before this view.
for line in f:
    x = line.split(";")
    doc2msc[str(x[0])] = x[1].strip()
f.close()

# Binary classification target: MSC top-level class "81".
target_class = "81"

# MSC code per document id, None when the id has no assignment.
# NOTE(review): the membership test uses doc_id but the lookup uses
# str(doc_id); if document_ids are not strings these disagree — verify.
ordered_document_assignments = map(
    lambda doc_id: doc2msc[str(doc_id)] if doc_id in doc2msc else None,
    document_ids)

# 1 when the code starts with target_class, 0 otherwise, None when unknown.
ordered_document_labels = map(
    lambda lab: None if lab is None else (1 if lab[:len(target_class)] == target_class else 0),
    ordered_document_assignments)

# Row indices of the held-out test and training documents.
test_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/test_doc_ids"))
train_doc_ind = indexes_in_list(
    document_ids, readFileLinewise2Array("raw_data/train_doc_ids"))

mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz")

train_mat = mat[train_doc_ind, :]
# itemgetter(*indices) picks the labels for the training rows, in order.
train_labels = itemgetter(*train_doc_ind)(ordered_document_labels)

# Reduce the tf-idf space to 1000 latent dimensions, fit on training rows only.
svd = TruncatedSVD(n_components=1000)
svd.fit(train_mat)

test_mat = mat[test_doc_ind, :]
test_labels = itemgetter(*test_doc_ind)(ordered_document_labels)

clf = svm.LinearSVC()
clf.fit(svd.transform(train_mat), train_labels)
f.close()""" # cut test/train set out of corpus document_ids = json.load(open("derived_data/theorem_tdm_grouped_by_docs_doc_ids")) doc2msc = {} f = open("raw_data/doc2msc") for line in f: x = line.split(";") doc2msc[str(x[0])] = x[1].strip() f.close() target_class = "81" ordered_document_assignments = map(lambda doc_id: doc2msc[str(doc_id)] if doc_id in doc2msc else None, document_ids) ordered_document_labels = map(lambda lab: None if lab is None else (1 if lab[:len(target_class)] == target_class else 0), ordered_document_assignments) test_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/test_doc_ids")) train_doc_ind = indexes_in_list(document_ids, readFileLinewise2Array("raw_data/train_doc_ids")) mat = load_csr_matrix("derived_data/tfidf_theorem_tdm_grouped_by_docs.npz") train_mat = mat[train_doc_ind, :] train_labels = itemgetter(*train_doc_ind)(ordered_document_labels) svd = TruncatedSVD(n_components=1000) svd.fit(train_mat) test_mat = mat[test_doc_ind, :] test_labels = itemgetter(*test_doc_ind)(ordered_document_labels) clf = svm.LinearSVC() clf.fit(svd.transform(train_mat), train_labels)