def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    p = DocumentParser()

    f = open("raw_data/fulltext-corpus.json", "w")
    f.write("{" + "relation-name\":\"full-text-corpus\"," +
            "num-attributes\":" + str(len(tokens2IndexMap)) + "}\n")

    for filepath in filepaths:
        doc = p.parse(filepath)
        if "zbmath metadata" in doc.includedSources:
            f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
            f.flush()
    f.close()
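
# Minimal alternative sketch (hypothetical helper, not part of the original
# module): build the corpus header line with json.dumps instead of
# hand-escaped quotes, so the quoting cannot drift out of sync. Assumes the
# corpus reader only needs the two fields written above.
def writeCorpusHeader(f, tokens2IndexMap):
    import json
    header = {"relation-name": "full-text-corpus",
              "num-attributes": len(tokens2IndexMap)}
    f.write(json.dumps(header) + "\n")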
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    p = DocumentParser()

    count = 0
    totalDocs = len(filenameFilepathsPairs)
    for filename, filepath in filenameFilepathsPairs:
        doc = p.parse(filepath)

        print str(count) + " / " + str(totalDocs)

        if "zbmath metadata" in doc.includedSources:
            dataMap = doc.toDataMap(tokens2IndexMap)

            f = open(path.join(targetDir, filename + ".json"), "w")
            f.write(json.dumps(dataMap))
            f.close()

        count += 1

def buildWordCountDict(filepaths):
    p = DocumentParser()

    wordCounts = dict()
    count = 0
    total = len(filepaths)
    for filepath in filepaths:
        print str(count) + "/" + str(total)
        doc = p.parse(filepath)

        if "zbmath metadata" in doc.includedSources:
            for token in doc.tokens:
                if token not in wordCounts:
                    wordCounts[token] = 0
                wordCounts[token] = wordCounts[token] + 1
        count += 1

    return wordCounts
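
# Minimal sketch of how the helpers above might be chained (hypothetical
# driver; `allFilepaths` and the cutoff of 5 occurrences are assumptions, not
# values taken from this module): count tokens, keep the frequent ones as the
# vocabulary, then write the ArffJson corpus.
def buildCorpusFromScratch(allFilepaths):
    wordCounts = buildWordCountDict(allFilepaths)
    vocabulary = sorted(t for t, n in wordCounts.items() if n >= 5)
    tokens2IndexMap = dict((t, i) for i, t in enumerate(vocabulary))
    documents2ArffJsonInstancesCorpus(allFilepaths, tokens2IndexMap)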
INSERT INTO authorship(document, rank, display_name, zbmath_id)
VALUES
(%(document_id)s, %(rank)s, %(display_name)s, %(zbmath_id)s)
"""

mscAssignmentInsertStmt = """
INSERT INTO msc_assignment(document, msc, pos)
VALUES
(%(document_id)s, %(msc)s, %(pos)s)
"""

db = connect_to_db()
cursor = db.cursor()
warning_log = open("warning_log", "a")

p = DocumentParser()
# filepath = "raw_data/test_documents/07040005.xml"
# for filename in filesInDict("raw_data/test_documents", True):
for filename, filepath in zip(filenames, filepaths):
    sys.stdout.write("processing " + filename + "... ")

    # doc, tokenizedParagraphs, formulaDict = p.parseWithParagraphStructure(filename)
    doc, raw_paragraphs, formula_dict = p.parse_raw(filepath)

    # info for doc table:
    document_id = doc.arxiv_id()
    publication_date = doc.publication_date
    title = doc.title
    msc_cats = doc.zb_msc_cats
    main_msc_cat = None if len(doc.zb_msc_cats) == 0 else doc.zb_msc_cats[0][:2]
    authors = doc.authors
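
    # Minimal sketch of how the MSC insert statement above might be executed
    # for this document, assuming pyformat parameter binding (as the
    # %(...)s placeholders suggest); authorship rows would be inserted
    # analogously, one row per author with its rank.
    for pos, msc in enumerate(msc_cats):
        cursor.execute(mscAssignmentInsertStmt, {
            "document_id": document_id,
            "msc": msc,
            "pos": pos,
        })
    db.commit()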
from time import time
from main.arffJson.ArffJsonCorpus import ArffJsonCorpus, ArffJsonDocument
from string import digits, ascii_letters
from os.path import isfile, join

dirpath = get_dirpath()
filenames, filepaths = get_filenames_and_filepaths("raw_data/ntcir_filenames")

tdm = load_csr_matrix("derived_data/zb_math_full_text_tdm2.npz")
translateMap = json.load(
    open("derived_data/zb_math_full_texts_tokens2IndexMap"))
row_number2fulltext_id_map = json.load(
    open("derived_data/row_number2fulltext_id_map.json"))

phrase = "theorem"
tokenizer = DocumentParser.TextTokenizer()
tokens = tokenizer.tokenize(phrase)
tokenIds = map(lambda token: translateMap[token], tokens)

# Slice the term-document matrix down to the query-token columns. In CSR
# format, m.indptr[i + 1] - m.indptr[i] is the number of stored (non-zero)
# entries in row i, so a row that stores exactly len(tokenIds) entries
# contains every query token and is a candidate match for the phrase.
m = tdm[:, tokenIds]

candidateInd = []
for i in range(len(m.indptr) - 1):
    if m.indptr[i + 1] - m.indptr[i] == len(tokenIds):
        candidateInd.append(i)
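
# Hypothetical follow-up (not in the original script): translate candidate
# row numbers back into full-text document ids, assuming the JSON map loaded
# above is keyed by the row number serialised as a string.
candidateFulltextIds = [row_number2fulltext_id_map[str(i)] for i in candidateInd]
print candidateFulltextIds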