def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    p = DocumentParser()
    f = open("raw_data/fulltext-corpus.json", "w")
    # Corpus header: relation name plus the number of attributes (tokens).
    f.write("{\"relation-name\":\"full-text-corpus\","
            + "\"num-attributes\":" + str(len(tokens2IndexMap)) + "}\n")
    for filepath in filepaths:
        doc = p.parse(filepath)
        # Only documents backed by zbMATH metadata go into the corpus.
        if "zbmath metadata" in doc.includedSources:
            f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
            f.flush()
    f.close()
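# For illustration, with a three-token index map the header line written
# above reads: {"relation-name":"full-text-corpus","num-attributes":3}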
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    p = DocumentParser()
    count = 0
    totalDocs = len(filenameFilepathsPairs)
    for filename, filepath in filenameFilepathsPairs:
        doc = p.parse(filepath)
        print str(count) + " / " + str(totalDocs)
        if "zbmath metadata" in doc.includedSources:
            # Serialize the document's token-index data map to one JSON
            # file per document.
            dataMap = doc.toDataMap(tokens2IndexMap)
            f = open(path.join(targetDir, filename + ".json"), "w")
            f.write(json.dumps(dataMap))
            f.close()
        count += 1
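# Usage sketch: the (filename, filepath) pairs can be built with the same
# helpers used further down in this module; the target directory
# "derived_data/data_maps" is an illustrative assumption.
# filenames, filepaths = get_filenames_and_filepaths("raw_data/ntcir_filenames")
# dumpDocumentDataMaps(tokens2IndexMap, zip(filenames, filepaths), "derived_data/data_maps")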
def buildWordCountDict(filepaths):
    p = DocumentParser()
    wordCounts = dict()
    count = 0
    total = len(filepaths)
    for filepath in filepaths:
        print str(count) + "/" + str(total)
        doc = p.parse(filepath)
        if "zbmath metadata" in doc.includedSources:
            # Count every token occurrence across the corpus.
            for token in doc.tokens:
                if token not in wordCounts:
                    wordCounts[token] = 0
                wordCounts[token] += 1
        count += 1
    return wordCounts
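# A minimal sketch of turning the counts into the tokens2IndexMap expected
# by documents2ArffJsonInstancesCorpus. Both the helper name and the
# min-count threshold of 5 are illustrative assumptions, not values taken
# from this project.
def buildTokens2IndexMap(filepaths, minCount=5):
    wordCounts = buildWordCountDict(filepaths)
    keptTokens = sorted(t for t, c in wordCounts.items() if c >= minCount)
    return dict((token, index) for index, token in enumerate(keptTokens))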
authorshipInsertStmt = """
    INSERT INTO authorship(document, rank, display_name, zbmath_id)
    VALUES (%(document_id)s, %(rank)s, %(display_name)s, %(zbmath_id)s)
"""

mscAssignmentInsertStmt = """
    INSERT INTO msc_assignment(document, msc, pos)
    VALUES (%(document_id)s, %(msc)s, %(pos)s)
"""

db = connect_to_db()
cursor = db.cursor()
warning_log = open("warning_log", "a")

p = DocumentParser()
# filepath = "raw_data/test_documents/07040005.xml"
# for filename in filesInDict("raw_data/test_documents", True):
for filename, filepath in zip(filenames, filepaths):
    sys.stdout.write("processing " + filename + "... ")
    # doc, tokenizedParagraphs, formulaDict = p.parseWithParagraphStructure(filename)
    doc, raw_paragraphs, formula_dict = p.parse_raw(filepath)

    # info for doc table:
    document_id = doc.arxiv_id()
    publication_date = doc.publication_date
    title = doc.title
    msc_cats = doc.zb_msc_cats
    main_msc_cat = None if len(doc.zb_msc_cats) == 0 else doc.zb_msc_cats[0][:2]
    authors = doc.authors
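    # A minimal sketch of writing the collected rows through the prepared
    # statements above. Assumptions: each entry of `authors` exposes
    # `display_name` and `zbmath_id` attributes, and committing once per
    # document is acceptable; neither is confirmed by the excerpt.
    for pos, msc in enumerate(msc_cats):
        cursor.execute(mscAssignmentInsertStmt,
                       {"document_id": document_id, "msc": msc, "pos": pos})
    for rank, author in enumerate(authors):
        cursor.execute(authorshipInsertStmt,
                       {"document_id": document_id, "rank": rank,
                        "display_name": author.display_name,
                        "zbmath_id": author.zbmath_id})
    db.commit()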
from time import time
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
from string import digits, ascii_letters
from os.path import isfile, join
import json

# get_dirpath, get_filenames_and_filepaths, load_csr_matrix and
# DocumentParser come from project-local modules.
dirpath = get_dirpath()
filenames, filepaths = get_filenames_and_filepaths("raw_data/ntcir_filenames")

tdm = load_csr_matrix("derived_data/zb_math_full_text_tdm2.npz")
translateMap = json.load(open("derived_data/zb_math_full_texts_tokens2IndexMap"))
row_number2fulltext_id_map = json.load(open("derived_data/row_number2fulltext_id_map.json"))

phrase = "theorem"
tokenizer = DocumentParser.TextTokenizer()
tokens = tokenizer.tokenize(phrase)
tokenIds = [translateMap[token] for token in tokens]

# Restrict the term-document matrix to the query tokens; a row of the
# sliced CSR matrix then has one stored entry per query token it contains.
m = tdm[:, tokenIds]

candidateInd = []
for i in range(len(m.indptr) - 1):
    # indptr[i + 1] - indptr[i] is the number of non-zero entries in row i
    if m.indptr[i + 1] - m.indptr[i] == len(tokenIds):
        candidateInd.append(i)
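# Follow-up sketch: map the matching row indices back to full-text ids via
# the row_number2fulltext_id_map loaded above. Assumption: JSON object keys
# are strings, hence the str() conversion before the lookup.
candidate_fulltext_ids = [row_number2fulltext_id_map[str(i)] for i in candidateInd]
print str(len(candidate_fulltext_ids)) + " documents contain all tokens of '" + phrase + "'"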