Beispiel #1
0
def findUtimateTexts(filename_save, num):
    """Build `num` random author subsets that together account for at
    least half of all posts; save each subset's sorted author list and
    extract those authors' texts into a numbered test file.

    filename_save -- base name for the extracted-text output files
    num           -- how many independent subsets to generate
    """
    worker = JSON.workOnJSON()
    # metadata entries are (author, {"textNumber": ...}) pairs
    metadata = worker.read_JSON_file(constants.location + "dataSave.json")
    totalPosts = 0
    authorDict = {}

    for entry in metadata:
        author = entry[0]
        stats = entry[1]
        totalPosts += stats["textNumber"]
        authorDict[author] = stats["textNumber"]

    #we now know the total number of posts
    for i in range(0, num):
        refNumber = i + 1
        authorList = authorDict.keys()  # fresh Python 2 list copy each round
        pickedAuthors = []
        pickedNumber = 0
        ran.seed()

        # draw authors without replacement until the picked authors
        # cover at least half of the total posts
        while 1:
            randInt = ran.randint(0, len(authorList) - 1)
            author = authorList[randInt]
            authorList.remove(author)
            pickedNumber += authorDict[author]
            pickedAuthors.append(author)
            if pickedNumber >= (totalPosts / 2):
                break

        pickedAuthors.sort()

        worker.save_JSON_file(constants.authors + "UltimateAuthors" + str(refNumber) + ".json", pickedAuthors)
        extractRandomAuthorTexts(constants.location + "newData.json",
                                 constants.authors + "UltimateAuthors" + str(refNumber) + ".json",
                                 filename_save + str(refNumber) + ".json")
Beispiel #2
0
def shortBogusCorpora(num):
    """Create `num` small corpora, each holding every post of one
    randomly drawn author plus two fixed 'Bogus' posts, and save one of
    the author's own posts separately as the matching test text."""
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(constants.corpora + "Many.json")

    # collect the distinct authors (dict used as a set, Python 2 style)
    authorDict = {}
    for entry in posts:
        authorDict[entry["user_id"]] = 1

    authorList = authorDict.keys()
    ran.seed()
    for i in range(0, num):
        index = i + 1

        # draw an author without replacement
        ranInt = ran.randint(0, len(authorList) - 1)
        author = authorList[ranInt]
        del authorList[ranInt]

        # every post this author wrote
        finalList = [entry for entry in posts if entry["user_id"] == author]

        # pick the post that will serve as the test text
        ranInt = ran.randint(0, len(finalList) - 1)
        randomEntry = finalList[ranInt]

        finalList.append({"user_id": "Bogus", "text": "hello", "post_id": "B1"})
        finalList.append({"user_id": "Bogus", "text": "Why, hello again!", "post_id": "B2"})
        worker.save_JSON_file(constants.corpora + "shortBogusCorpora" + str(index) + ".json", finalList)
        worker.save_JSON_file(constants.tests + "ShortBogusText" + str(index) + ".json", [randomEntry])
def makeTable(filename, foldername, givenNum):
    # Render a LaTeX table for each numbered result file 1..givenNum.
    # NOTE(review): a later `makeTable` with an extra `corpora` parameter
    # redefines this name, so this version is dead code unless called
    # before that redefinition runs — confirm which one is intended.
    getcontext().prec = 2  # Decimal precision used by the table formatting
    worker = JSON.workOnJSON()
    for index in range(1, givenNum + 1):
        # result files store tuples of (id, authorData, name, num)
        (id, authorData, name, num) = worker.read_JSON_file(constants.resultDir + filename + str(index) + ".json")
        placeToSave = constants.folderLocation + "report/tabeller/" + foldername + "/" + filename
        produceTable(id, authorData, placeToSave, num)
Beispiel #4
0
def singlePostCorpora():
    """Build a corpus containing exactly one randomly chosen post per
    author from newData.json and save it as singlePostCorpora.json."""
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(constants.location + "newData.json")
    authorDict = {}

    # group every post under its author
    for entry in posts:
        author = entry["user_id"]
        value = {"text": entry["text"], "post_id": entry["post_id"]}
        if author in authorDict:
            authorDict[author].append(value)
        else:
            authorDict[author] = [value]

    # seed once up front; the old per-author reseeding from the system
    # clock added nothing and wasted time
    ran.seed()
    finalTexts = []
    for author in authorDict.keys():
        textList = authorDict[author]
        entry = textList[ran.randint(0, len(textList) - 1)]
        finalTexts.append({"user_id": author, "text": entry["text"], "post_id": entry["post_id"]})

    worker.save_JSON_file(constants.corpora + "singlePostCorpora.json", finalTexts)
Beispiel #5
0
def runTest(compareDict, filename, name, num):
    # files to work on
    tempName = name.rpartition("/")[-1]
    
    # we load the comparisons
    if runTimeTest:
        startTime = time.time()
        print startTime

    (ngramLists, tg_dict) = makeNgram(filename)
    worker = JSON.workOnJSON()
    if runTimeTest:
        ngramTime = time.time() - startTime
        file = open(constants.results + "ngramTime.dat", "a")
        file.write(str(corpNumber)+  "\t" + str(ngramTime) + "\n")
        file.close()

    # the list of posts we want to compare to the corpus
    if runTimeTest:
        startTime = time.time()
        
    (id, authorData) = compareAuthors(ngramLists,  compareDict, tg_dict)   
    
    if runTimeTest:
        compareTime = time.time() - startTime
        file = open(constants.results + "workTime.dat", "a")
        file.write(str(corpNumber)+  "\t" + str(compareTime) + "\n")
        file.close()

    worker.save_JSON_file(constants.resultDir + tempName + str(num) + ".json", (id, authorData, name, num))            
def produceXtable():
    """Render the timing results (workTime.json / ngramTime.json) as
    LaTeX tables and matching GNUplot data files."""
    getcontext().prec = 4
    worker = JSON.workOnJSON()

    authorList = worker.read_JSON_file(constants.resultDir + "workTime.json")
    stringResult = StringIO()

    numElements = len(authorList)
    next = "\\\\ \n"
    line = "\\hline \n"
    stringResult.write("\\begin{center}\n")
    stringResult.write("\\begin{tabular}{|c|" + "c|" * numElements + "}\n")
    stringResult.write(line )

    # corpus sizes 100, 200, ..., 1200 used as both row and column keys
    keys =  [str(i * 100) for i in range(1, 13)]

    # header row (renamed loop vars: the old `time` shadowed the module)
    for colKey in keys:
        stringResult.write(" & " + str(colKey))

    stringResult.write(next)
    stringResult.write(line)

    for rowKey in keys:
        stringResult.write(str(rowKey))
        for colKey in keys:
            # unary + re-rounds the value to the current Decimal precision
            result = +Decimal(str(authorList[colKey][rowKey]))
            stringResult.write(" & " + str(result))

        stringResult.write(next)
        stringResult.write(line)

    stringResult.write("\\end{tabular}\n")
    stringResult.write("\\end{center}")

    # `with` closes the handle even if the write fails
    with open(constants.tableSave + "crossSave.tex", "w") as crossFile:
        crossFile.write(stringResult.getvalue())

    # plot each corpus size against its "1200" timing column
    # (renamed from `dict`, which shadowed the builtin)
    plotData = {}
    for key in keys:
        plotData[key] = authorList[key]["1200"]
    makeGNUplot("ultimateGNUPlot", plotData, keys)

    stringResult = StringIO()
    authorList = worker.read_JSON_file(constants.resultDir + "ngramTime.json")

    # split the n-gram timing table in two halves so it fits the page
    splitPoint = 6
    stringResult = printPartList(authorList, keys[:splitPoint], stringResult)
    stringResult.write("\n \n")
    stringResult = printPartList(authorList, keys[splitPoint:], stringResult)

    with open(constants.tableSave + "ngramTime.tex", "w") as ngramFile:
        ngramFile.write(stringResult.getvalue())

    #make dat table
    makeGNUplot("ngramGNUPlot", authorList, keys)
def fix(null, dir, files):
    """os.path.walk-style callback: rescale every stored count in each
    JSON file under `dir` by dividing by constants.testTimes.

    null  -- unused walk argument (kept for the callback signature)
    dir   -- directory prefix the files live in
    files -- list of file names to process
    """
    worker = JSON.workOnJSON()
    # locals renamed: the old `file` and `dict` shadowed builtins
    for fileName in files:
        path = dir + fileName
        resultDict = worker.read_JSON_file(path)

        # normalise raw counts into frequencies, in place
        for oAuthor in resultDict.keys():
            entry = resultDict[oAuthor]
            for iAuthor in entry.keys():
                entry[iAuthor] = float(entry[iAuthor]) / float(constants.testTimes)

        worker.save_JSON_file(path, resultDict)
def doUltimateTable():
    """Render the three UltimateTest result files as 'ultimate' tables
    against the newData corpus."""
    worker = JSON.workOnJSON()
    filename = "UltimateTest"
    folderName = "UltimateTest"
    corpora = "newData"
    givenNum = 3
    # (a dead `placeToSave = folderName + filename` assignment was
    # removed — it was unconditionally overwritten inside the loop)
    for index in range(1, givenNum + 1):
        # result files store tuples of (id, authorData, name, num)
        (id, authorData, name, num) = worker.read_JSON_file(constants.resultDir + filename + str(index) + ".json")
        placeToSave = constants.folderLocation + "report/tabeller/" + folderName + "/" + filename + str(index)
        (authorAttri, averageFMeasure, authorList, overall) = makeTableData(id, authorData, placeToSave, num)
        produceUltimateTables(authorAttri, averageFMeasure, authorList, id, authorData, placeToSave, num, overall, corpora)
def makeTable(filename, foldername, corpora, givenNum):
    """Render LaTeX tables for result files 1..givenNum of `filename`.

    ShortBogusText results each have their own corpus, so their corpus
    name gets the index appended; all other results share one corpus
    name.
    """
    getcontext().prec = 3
    worker = JSON.workOnJSON()
    baseDir = constants.folderLocation + "report/tabeller/" + foldername + "/"
    perIndexCorpora = filename.count("ShortBogusText")
    for index in range(1, givenNum + 1):
        resultPath = constants.resultDir + filename + str(index) + ".json"
        (id, authorData, name, num) = worker.read_JSON_file(resultPath)
        placeToSave = baseDir + filename
        (authorAttri, averageFMeasure, authorList, overall) = makeTableData(id, authorData, placeToSave, num)
        finalCorporaName = corpora + str(index) if perIndexCorpora else corpora
        produceTable(authorAttri, averageFMeasure, authorList, authorList, id, authorData, placeToSave, num, overall, finalCorporaName)
Beispiel #10
0
def extractRandomAuthorTexts(filename, author_filename, filename_save):
    """Copy every post from `filename` whose author appears in the author
    list stored in `author_filename`, saving the matching posts under
    constants.tests + filename_save."""
    worker = JSON.workOnJSON()
    authorList = worker.read_JSON_file(author_filename)
    # set lookup instead of list.count: O(1) membership test per post
    authorSet = set(authorList)

    posts = worker.read_JSON_file(filename)
    authorPosts = [entry for entry in posts if entry["user_id"] in authorSet]

    worker.save_JSON_file(constants.tests + filename_save, authorPosts)
def makeRandomTestTables(filename, corpora):
    # Render a table for one random-baseline result file.
    folderName = "RandomTest"
    worker = JSON.workOnJSON()
    placeToSave = folderName + filename  # dead value; overwritten below
    authorData = worker.read_JSON_file(constants.randomTest + filename + ".json")
    placeToSave = constants.folderLocation + "report/tabeller/" + folderName + "/" + filename
    (authorAttri, averageFMeasure, authorList, overall) = makeTableData({}, authorData, placeToSave, 1)

    # "Ultimate" result files get the special ultimate-table layout
    ultimate = None
    if filename.count("Ultimate"):
        ultimate = 1

    # NOTE(review): `id` below is the *builtin* function, not a result
    # id — probably meant to be {} like the makeTableData call above;
    # confirm against produceUltimateTables before changing.
    produceUltimateTables(authorAttri, averageFMeasure, authorData.keys(), id, authorData, placeToSave, -1, overall, corpora, ultimate)
Beispiel #12
0
def makeTimeTest():
    """Save growing prefixes of newData.json (100, 200, ..., 1300 posts,
    plus the full 1329) as timeTest<i>.json corpora for timing runs."""
    worker = JSON.workOnJSON()
    fromDirectory = constants.corpora + "newData.json"
    posts = worker.read_JSON_file(fromDirectory)

    def _savePrefix(count):
        # save the first count-1 posts under timeTest<count>.json
        # NOTE(review): the slice [0:count-1] drops one post per file —
        # looks like an off-by-one, kept to reproduce the existing
        # corpora exactly; confirm intent before changing.
        saveFile = constants.corpora + "timeTest" + str(count) + ".json"
        worker.save_JSON_file(saveFile, posts[0 : count - 1])

    # 100, 200, ..., 1300 — same sequence as the old while loop
    for i in range(100, 1400, 100):
        _savePrefix(i)

    # the full corpus as a final data point
    _savePrefix(1329)
Beispiel #13
0
def extractRandomText(filename, filename_save, num):
    """Draw `num` distinct random posts from the corpus `filename` and
    save each one (as a single-element list) to its own numbered test
    file filename_save<i>.json."""
    worker = JSON.workOnJSON()
    filename = constants.corpora + filename + ".json"
    filename_save = constants.tests + filename_save

    posts = worker.read_JSON_file(filename)

    ran.seed()
    # draw without replacement and save each pick straight away — the
    # old two-pass version built an intermediate list for no benefit;
    # the random draw sequence and output files are identical
    for index in range(1, num + 1):
        postNum = ran.randint(0, len(posts) - 1)
        picked = posts.pop(postNum)
        worker.save_JSON_file(filename_save + str(index) + ".json", [picked])
def getAuthorWrittenData(num, corpora):
    """Count how many posts each author of the given corpus wrote.

    Returns (listOfOne, writtenDict) where writtenDict maps
    author -> post count and listOfOne lists the authors with exactly
    `num` posts.
    """
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(constants.corpora + corpora + ".json")

    # tally posts per author (dict.get replaces the has_key branching)
    writtenDict = {}
    for entry in posts:
        authorName = entry["user_id"]
        writtenDict[authorName] = writtenDict.get(authorName, 0) + 1

    # authors with exactly `num` posts
    listOfOne = [authorName for authorName in writtenDict.keys()
                 if writtenDict[authorName] == num]

    return (listOfOne, writtenDict)
def randomTest(tests, corpora, save_file, times):
    print tests
    
    worker = JSON.workOnJSON()
    corpora = worker.read_JSON_file(constants.corpora + corpora + ".json")
    tests = worker.read_JSON_file(constants.tests + tests + ".json")
    tempDict = {}    
    permTestList = []
    permCorporaList = []
    for entry in tests:
        permTestList.append(entry["user_id"])
         
    for entry in corpora:
        permCorporaList.append(entry["user_id"])
        
    resultDict= {}
    corpora = copy.deepcopy(permCorporaList)
    tests = copy.deepcopy(permTestList)
    
    numAuthor = len(tests)
    authorsDone = 1
    for realAuthor in tests:
        for i in range(0, times):
            ran.seed()
            if i > 0 and i % 1000 == 0:
                print (float(i * authorsDone) / float(times * numAuthor)) * 100 , "percent done"
            ranInt = ran.randint(0, len(corpora) - 1)
            author = corpora[ranInt]
            
            if not resultDict.has_key(realAuthor) :
                resultDict[realAuthor] = {author: 1}
            elif not resultDict[realAuthor].has_key(author):
                resultDict[realAuthor][author] =1
            else:
                resultDict[realAuthor][author] += 1
        
        authorsDone += 1
    
    for oAuthor in resultDict.keys():
        entry = resultDict[oAuthor] 
        for iAuthor in entry.keys():
            entry[iAuthor] = float(entry[iAuthor]) / float(constants.testTimes)
                    
    worker.save_JSON_file(constants.randomTest + save_file + ".json", resultDict)
Beispiel #16
0
def AuthorTest(num, filename_test, corpora_name, foldername, filename_save):
    print "Test:", filename_test
    folder = constants.tableSave + foldername + "/"
    
    if (corpora_name != "newData" or corpora_name != "testData"):
        corpora_name = constants.corpora + corpora_name
    else:
        corpora_name = constants.location + corpora_name
      
    worker = JSON.workOnJSON()
    if num == 0:
        authorText = worker.read_JSON_file(constants.tests + filename_test + ".json")
        value = runTest(authorText, corpora_name + ".json", folder + filename_save, 0)
    else:
        for i in range(0, num):
            index = i + 1
            authorText = worker.read_JSON_file(constants.tests + filename_test + str(index) + ".json")
            corpora_final_name = corpora_name
            if filename_save.count("shortBogusText"):
                corpora_final_name = corpora_name + str(index)
            value = runTest(authorText, corpora_final_name + ".json", folder + filename_save, index)
Beispiel #17
0
def makeNgram(filename):
    """Group the posts in `filename` by author and build one n-gram
    model per author.

    Returns (newAuthorDict, tg_dict):
      newAuthorDict -- author name -> list of that author's text strings
      tg_dict       -- author name -> trained ngram.ngram model

    (The unused locals of the old version — authorNameDirec, num,
    newAuthorDictTemp, and the never-read authorWrittenDict post-id
    tracking — have been removed.)
    """
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(filename)
    authorDict = {}
    tg_dict = {}

    # bucket each post's text under its author
    for entry in posts:
        author = entry["user_id"]
        value = {"user_id": author, "text": entry["text"]}
        if author in authorDict:
            authorDict[author].append(value)
        else:
            authorDict[author] = [value]

    newAuthorDict = {}

    for authorName in authorDict.keys():
        listOfEntries = [entry["text"] for entry in authorDict[authorName]]
        newAuthorDict[authorName] = listOfEntries

        # the model also gets the author's texts as one joined corpus
        tg = ngram.ngram(listOfEntries)
        tg.corp = ''.join(listOfEntries)
        tg.newRemember()
        tg_dict[authorName] = tg

    return (newAuthorDict, tg_dict)
def getAuthorWithOverXPosts(data_file, metadata_file, number):
    """Save every text written by an author with at least `number` posts.

    data_file     -- JSON list of posts ({"user_id": ..., ...})
    metadata_file -- JSON list of (author, {"textNumber": ...}) pairs
    number        -- minimum number of texts an author must have
    """
    postFiles = "authorsWithOver"
    worker = JSON.workOnJSON()
    file_data = worker.read_JSON_file(data_file)
    file_metadata = worker.read_JSON_file(metadata_file)

    # I find the authors who have written over the needed number of
    # texts; a set makes the membership test below O(1) instead of
    # list.count's O(n) scan per post
    authorSet = set()
    for entry in file_metadata:
        authorName = entry[0]
        stats = entry[1]
        if stats["textNumber"] >= number:
            authorSet.add(authorName)

    # With the set of authors I now find all the texts they have written
    texts = [entry for entry in file_data if entry["user_id"] in authorSet]

    worker.save_JSON_file(postFiles + str(number) + ".json", texts)
Beispiel #19
0
def chooseAuthorsWithNumber(filename_save, number_of_posts, num):
    """Build a corpus of all authors whose post count lies inside the
    inclusive range number_of_posts = (low, high), then save the post
    lists of `num` randomly drawn such authors as individual test files.
    """
    worker = JSON.workOnJSON()
    posts = worker.read_JSON_file(constants.location + "newData.json")

    # author -> (post count, list of that author's posts)
    authorDict = {}
    for entry in posts:
        author = entry["user_id"]
        value = {"text": entry["text"], "user_id": author, "post_id": entry["post_id"]}
        if author in authorDict:
            (number, texts) = authorDict[author]
            texts.append(value)
            authorDict[author] = (number + 1, texts)
        else:
            authorDict[author] = (1, [value])

    authorList = []
    textList = []
    for author in authorDict.keys():
        number = authorDict[author][0]
        if number_of_posts[0] <= number <= number_of_posts[1]:
            authorList.append(author)
            textList.extend(authorDict[author][1])

    worker.save_JSON_file(constants.corpora + filename_save + ".json", textList)

    ran.seed()
    for i in range(0, num):
        index = i + 1
        # BUG FIX: the original tested the undefined name `authorKeys`
        # (NameError at runtime); the list being drawn from is authorList.
        if len(authorList) != 0:
            ranIndex = ran.randint(0, len(authorList) - 1)
            author = authorList[ranIndex]
            authorList.remove(author)
            # BUG FIX: the original used the undefined name `name` in the
            # test filename; filename_save is the only base name in scope
            # (NOTE(review): confirm the intended filename pattern).
            worker.save_JSON_file(
                constants.tests + "Author" + filename_save + "Post" + str(index) + ".json", authorDict[author][1]
            )
def produceStatisticalData(filename, filename_save):
    """Collect per-author text statistics (count, min/max/average text
    length) from the posts in `filename`, save them as JSON, and write a
    paginated LaTeX summary table to reportFile.tex.

    NOTE(review): a second `produceStatisticalData` defined later in
    this file shadows this one at import time — confirm which is live.
    """
    worker = JSON.workOnJSON()
    result = worker.read_JSON_file(filename)

    # accumulate per-author stats in one pass over the posts
    authorData = {}
    for entry in result:
        authorName = entry["user_id"]
        textLength = len(entry["text"])
        if authorName not in authorData:
            authorData[authorName] = {"textNumber": 1, "min": textLength, "max": textLength, "totalLength": textLength}
        else:
            authorEntry = authorData[authorName]
            authorEntry["textNumber"] += 1
            authorEntry["min"] = min(authorEntry["min"], textLength)
            authorEntry["max"] = max(authorEntry["max"], textLength)
            authorEntry["totalLength"] += textLength

    getcontext().prec = 2
    for authorName in authorData.keys():
        authorEntry = authorData[authorName]
        authorEntry["average"] = round(Decimal(authorEntry["totalLength"]) / Decimal(authorEntry["textNumber"]), 3)

    # corpus-wide totals
    length = 0
    numberTexts = 0
    minNumber = 1000000
    maxNumber = -1
    for key in authorData.keys():
        entry = authorData[key]
        length += entry["totalLength"]
        numberTexts += entry["textNumber"]
        minNumber = min(minNumber, entry["totalLength"])
        maxNumber = max(maxNumber, entry["totalLength"])

    keys = sortKeys(authorData.keys())
    worker.save_JSON_file(filename_save, authorData)

    header = "\\begin{tabular}{cccccc}\n Name & Number of Texts & Min Length& Max Length & Average Length\\\\\n"
    stringWriter = StringIO()
    stringWriter.write(header)

    count = 0
    endCount = 35  # rows per LaTeX page before starting a new tabular

    numberOnePost = 0
    for name in keys:
        entry = authorData[name]
        number = str(entry["textNumber"])
        if number == "1":
            numberOnePost += 1

        stringWriter.write(str(name[0:15]) + " & " + number + " & " + str(entry["min"]) + " & " +  str(entry["max"]) + " & " + str(entry["average"]) + "\\\\\n")

        # page break: close the tabular and start a fresh one
        if count == endCount:
            stringWriter.write("\\end{tabular}\n")
            stringWriter.write("\\newpage\n")
            stringWriter.write(header)
            count = 0

        count += 1

    # summary footer
    stringWriter.write("& & & & & \\\\ \n")
    stringWriter.write("Number of Authors & Number of Texts & Total Min & Total Max & Total Average \\\\ \n")
    stringWriter.write(str(len(authorData)) + " & " + str(numberTexts) + " & " + str(minNumber) + " & " + str(maxNumber) +  " & " + str(round(Decimal(length) / Decimal(numberTexts), 3)) + "\\\\ \n")
    oneAuthor = str(float(numberOnePost) / float(len(authorData)) * 100)
    stringWriter.write("\\multicolumn{5}{c}{Percentage of authors who have only written 1 post: " +  oneAuthor[:5] + " \\%}")
    stringWriter.write("\\end{tabular}\n")

    # `with` closes the handle even if the write fails
    with open(constants.tableSave + "reportFile.tex", "w") as reportFile:
        reportFile.write(stringWriter.getvalue())
def produceStatisticalData(filename, filename_save):
    worker = JSON.workOnJSON()
    result = worker.read_JSON_file(filename)
    
    authorData = {}
    for entry in result:
        authorName = entry["user_id"]
        if not authorData.has_key(authorName):
            textLength = len(entry["text"])
            authorData[authorName] = {"textNumber": 1, "min": textLength, "max": textLength, "totalLength": textLength}
        else:
            authorEntry = authorData[authorName]
            
            authorData[authorName]["textNumber"] += 1
            textLength = len(entry["text"])
            
            authorEntry["min"] = min(authorEntry["min"], textLength)
            authorEntry["max"] = max(authorEntry["max"], textLength)
            authorEntry["totalLength"] += textLength
    
    getcontext().prec = 2        
    for authorName in authorData.keys():
        authorEntry = authorData[authorName]
        authorEntry["average"] = round(Decimal(authorEntry["totalLength"]) / Decimal(authorEntry["textNumber"]), 3)
        
    length = 0
    numberTexts = 0
    minNumber = 1000000
    maxNumber = -1
    for key in authorData.keys():
        entry = authorData[key]
        length += entry["totalLength"]
        numberTexts += entry["textNumber"]
        minNumber = min(minNumber, entry["totalLength"])
        maxNumber = max(maxNumber, entry["totalLength"])
    
    print "Number of authors:", len(authorData)
    print "Length:", length
    print "Number of texts:", numberTexts
    print "Average:", str(round(Decimal(length) / Decimal(numberTexts), 3))
        
    authorData = sorted(authorData.iteritems(), key=itemgetter(1))

    worker.save_JSON_file(filename_save, authorData)

    FILE_TO_SAVE = open(constants.tableSave + "reportFile.tex","w")
    
 #   FILE_TO_SAVE.write("\\documentclass[letter, 12pt, english]{article}\n")
  #  FILE_TO_SAVE.write("\\begin{document}\n")

    FILE_TO_SAVE.write("\\begin{tabular}{cccccc}\n")
    FILE_TO_SAVE.write("Name & Number of Texts & Min & Max & Average\\\\\n")
    
    count = 0
    endCount = 35
    
    for entry in authorData:
        name = entry[0]
        entry = entry[1]
        number = str(entry["textNumber"])
       # if (number > 1 and number < 10):
       # number = "\\emph{" + number + "}
       # elif (number >= 10 and number < 100):
       #     number = "\\texttt{" + number + "}"
       # elif number >= 100:
       #     number = "\\texttt{\\emph{" + number + "}}"
        FILE_TO_SAVE.write(str(name[0:15]) + " & " + number + " & " + str(entry["min"]) + " & " +  str(entry["max"]) + " & " + str(entry["average"]) + "\\\\\n")
        
        if count == endCount:
            FILE_TO_SAVE.write("\\end{tabular}\n")
            FILE_TO_SAVE.write("\\newpage\n")
            FILE_TO_SAVE.write("\\begin{tabular}{cccccc}\n")
            FILE_TO_SAVE.write("Name & Number of Texts & Min & Max & Average\\\\\n")
            count = 0
        
        count += 1
        
    FILE_TO_SAVE.write("& & & & & \\\\ \n")
    FILE_TO_SAVE.write("Number of Authors & Number of Texts & Total Min & Total Max & Total Average \\\\ \n")
    FILE_TO_SAVE.write(str(len(authorData)) + " & " + str(numberTexts) + " & " + str(minNumber) + " & " + str(maxNumber) +  " & " + str(round(Decimal(length) / Decimal(numberTexts), 3)) + "\\\\ \n")
    FILE_TO_SAVE.write("\\end{tabular}\n")
   # FILE_TO_SAVE.write("\\end{document}\n")
    FILE_TO_SAVE.close()