import json
from os import path


def documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap):
    p = DocumentParser()

    with open("raw_data/fulltext-corpus.json", "w") as f:
        # Header line: a JSON object describing the corpus.
        f.write("{\"relation-name\":\"full-text-corpus\"," +
                "\"num-attributes\":" + str(len(tokens2IndexMap)) + "}\n")

        # One serialized document per line; only documents that carry
        # zbMATH metadata are included.
        for filepath in filepaths:
            doc = p.parse(filepath)
            if "zbmath metadata" in doc.includedSources:
                f.write(doc.toArffJsonDocument(tokens2IndexMap) + "\n")
                f.flush()  # keep the file current during long runs
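# Minimal sketch of reading the corpus back, assuming the format written
# above (a JSON header line, then one document per line) and that
# toArffJsonDocument emits valid JSON per line. readArffJsonInstancesCorpus
# is a hypothetical helper, not part of this module.
def readArffJsonInstancesCorpus(corpusPath="raw_data/fulltext-corpus.json"):
    with open(corpusPath) as f:
        header = json.loads(f.readline())
        print header["relation-name"] + ", " + str(header["num-attributes"]) + " attributes"
        for line in f:
            yield json.loads(line)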
def dumpDocumentDataMaps(tokens2IndexMap, filenameFilepathsPairs, targetDir):
    p = DocumentParser()

    totalDocs = len(filenameFilepathsPairs)
    for count, (filename, filepath) in enumerate(filenameFilepathsPairs):
        doc = p.parse(filepath)

        print str(count) + " / " + str(totalDocs)

        # Only documents with zbMATH metadata get a data map on disk.
        if "zbmath metadata" in doc.includedSources:
            dataMap = doc.toDataMap(tokens2IndexMap)

            with open(path.join(targetDir, filename + ".json"), "w") as f:
                f.write(json.dumps(dataMap))
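# Hypothetical call: derive a filename from each path and write one
# data-map JSON file per document (targetDir must already exist).
# pairs = [(path.splitext(path.basename(fp))[0], fp) for fp in filepaths]
# dumpDocumentDataMaps(tokens2IndexMap, pairs, "raw_data/data_maps")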
def buildWordCountDict(filepaths):
    p = DocumentParser()

    wordCounts = dict()
    total = len(filepaths)
    for count, filepath in enumerate(filepaths):
        print str(count) + " / " + str(total)
        doc = p.parse(filepath)

        # Count token occurrences over all documents with zbMATH metadata.
        if "zbmath metadata" in doc.includedSources:
            for token in doc.tokens:
                wordCounts[token] = wordCounts.get(token, 0) + 1

    return wordCounts
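
# Sketch of wiring these helpers together; the glob pattern and the
# frequency cutoff below are assumptions, not values from this project.
if __name__ == "__main__":
    import glob

    filepaths = glob.glob("raw_data/documents/*")

    # Count tokens once, keep the frequent ones, and give each a
    # stable feature index.
    wordCounts = buildWordCountDict(filepaths)
    frequentTokens = sorted(t for t, c in wordCounts.items() if c >= 5)
    tokens2IndexMap = dict((token, i) for i, token in enumerate(frequentTokens))

    # Serialize every zbMATH-backed document into one corpus file.
    documents2ArffJsonInstancesCorpus(filepaths, tokens2IndexMap)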