def histoDiversity(theData, theType, outFile):
    """Write the draft and the final diversity histograms to ``outFile``.

    For every kept row the draft quotient is ``row[4] / row[3]`` and the
    final quotient is ``row[6] / row[5]``.  Each quotient is multiplied by
    100, rounded to an integer, and tallied; one histogram is then written
    for the draft tallies and one for the final tallies.

    Args:
        theData: iterable of indexable rows.  ``row[1]`` is the paper name;
            when ``theType`` is nonzero its last character is parsed as an
            integer type.  Columns 3-6 are read with ``float()`` (presumably
            word and unique-word counts -- confirm against the code that
            builds the rows).
        theType: type to keep; 0 keeps every row.
        outFile: writable text stream; it is also handed to
            ``Histogram.histoTheData``.

    Raises:
        ZeroDivisionError: if ``row[3]`` or ``row[5]`` is zero in a kept row.
        ValueError: if a name does not end in a digit while ``theType`` is
            nonzero, or a column cannot be parsed as a float.

    The header text reads the module-level ``inFileName``.
    """
    draftDict = defaultdict(int)
    finalDict = defaultdict(int)
    for row in theData:
        name = row[1]
        # The type of a paper is taken from the last character of its name.
        if (0 != theType) and (int(theType) != int(name[-1])):
            continue
        draftDiversity = float(row[4]) / float(row[3])
        finalDiversity = float(row[6]) / float(row[5])
        draftDict[int(round(100 * draftDiversity))] += 1
        finalDict[int(round(100 * finalDiversity))] += 1

    # Both reports share one layout; only the DRAFT/FINAL tag and the tally
    # dictionary differ.
    for which, tallies in (('DRAFT', draftDict), ('FINAL', finalDict)):
        headerInfo = '\nInput from file: %s\n' % (inFileName)
        # Append here: a plain assignment would throw away the file name line.
        headerInfo += 'Diversity is quotient of #uniq words and #words for ' + which + '\n'
        headerInfo += 'Column 1: This quotient times 100\n'
        headerInfo += 'Column 2: Percent of papers with this quotient\n'
        headerInfo += 'Column 3: Raw number of papers with this quotient\n'
        headerInfo += 'Column 4: The histogram\n'

        label = 'DIVERSITY FOR TYPE %d\n' % (theType)
        label += headerInfo

        if tallies:
            # The second return value (a short version) is not used here.
            histo, _ = Histogram.histoTheData(label, tallies, 1, 1, outFile)
            outFile.write('%s\n' % (histo))
        else:
            outFile.write('NO PAPERS FOR %s\n' % (label))
def histoWordCounts(theData, theType, which, outFile):
    """Histogram the per-paper word totals and write the result to ``outFile``.

    Sentence lengths are summed per paper name over the sentences that match
    ``theType`` and ``which``.  The histogram input maps each total to the
    number of papers having that total.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``,
            ``getName()`` and ``getSentenceLength()``.
        theType: type to keep; 0 keeps all types.
        which: only sentences whose ``getWhich()`` equals this are used.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    """
    wordsPerPaper = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        wrongType = (0 != theType) and (theType != sentence.getType())
        if wrongType or which != sentence.getWhich():
            continue
        paperName = sentence.getName()
        wordsPerPaper[paperName] += sentence.getSentenceLength()

    # Invert: total word count -> number of papers with that total.
    papersPerTotal = defaultdict(int)
    for _, totalWords in sorted(wordsPerPaper.items()):
        papersPerTotal[totalWords] += 1

    headerInfo = ''.join([
        '\nInput from file: %s\n' % (inFileName),
        'Column 1: # of words ' + which + '\n',
        'Column 2: percent of total\n',
        'Column 3: raw numbers of total\n',
        'Column 4: the histogram\n',
    ])

    label = 'WORD COUNTS FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if not papersPerTotal:
        outFile.write('NO SENTENCES FOR %s\n' % (label))
        return
    histo, _ = Histogram.histoTheData(label, papersPerTotal, 50, 50, outFile)
    outFile.write('%s\n' % (histo))
def histoInsertionsByPara(theData, theType, which, outFile):
    """Histogram unaligned sentences by their right-side paragraph number.

    A sentence is tallied when it matches ``theType`` and ``which`` and
    ``isAligned()`` is false; the tally key is ``getRightParaSub()``.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``,
            ``isAligned()`` and ``getRightParaSub()``.
        theType: type to keep; 0 keeps all types.
        which: only sentences whose ``getWhich()`` equals this are used.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    The label always says "INTO FINAL", whatever ``which`` is.
    """
    insertionsPerPara = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        if 0 != theType and theType != sentence.getType():
            continue
        if which != sentence.getWhich() or sentence.isAligned():
            continue
        insertionsPerPara[sentence.getRightParaSub()] += 1

    headerInfo = (
        '\nInput from file: %s\n' % (inFileName)
        + 'Column 1: paragraph numbers\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )

    label = 'INSERTIONS BY PARAGRAPH INTO FINAL FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if insertionsPerPara:
        histo, _ = Histogram.histoTheData(label, insertionsPerPara, 1, 1, outFile)
        outFile.write('%s\n' % (histo))
    else:
        outFile.write('NO INSERTIONS FOR %s\n' % (label))
def histoAlignmentFractionsByLevel(theData, theType, which, outFile):
    """Histogram sentences by their integer alignment level.

    Every sentence matching ``theType`` and ``which`` is tallied under
    ``int(getAlignmentLevel())``; a level of -1 is tallied under 999.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``,
            ``getName()`` and ``getAlignmentLevel()``.
        theType: type to keep; 0 keeps all types.
        which: only sentences whose ``getWhich()`` equals this are used.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    """
    countsByLevel = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        typeMismatch = (0 != theType) and (theType != sentence.getType())
        if typeMismatch:
            continue
        if which != sentence.getWhich():
            continue
        # The name is fetched but not used; the call is kept as-is.
        sentence.getName()
        rawLevel = int(sentence.getAlignmentLevel())
        # -1 is remapped to the 999 bucket (presumably "not aligned" -- confirm).
        levelKey = 999 if -1 == rawLevel else rawLevel
        countsByLevel[levelKey] += 1

    headerLines = [
        '\nInput from file: %s\n' % (inFileName),
        'Column 1: fraction aligned in ' + which + '\n',
        'Column 2: percent of total\n',
        'Column 3: raw numbers of total\n',
        'Column 4: the histogram\n',
    ]
    headerInfo = ''.join(headerLines)

    label = 'ALIGNMENTS FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if not countsByLevel:
        outFile.write('NO PAPERS FOR %s\n' % (label))
        return
    histo, _ = Histogram.histoTheData(label, countsByLevel, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoInsertionsByEditDistFrac(theData, theType, which, outFile):
    """Histogram unaligned sentences by the distance of the previous sentence.

    Sentences matching ``theType`` and ``which`` contribute their paper id
    (first whitespace-separated token of the key) to the paper count.  The
    unaligned ones are tallied under ``getPreviousDistance()``, with negative
    distances tallied under 899.

    Args:
        theData: mapping from string keys to objects offering ``getType()``,
            ``getWhich()``, ``isAligned()``, ``getPreviousDistance()``,
            ``getRightParaSub()`` and ``getRightSentSub()``.
        theType: type to keep; 0 keeps all types.
        which: only sentences whose ``getWhich()`` equals this are used.
            The initial-insertion counter is only updated for ``"FINAL"``.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    """
    papersSeen = set()
    initialInsertions = 0
    countsByPrevDist = defaultdict(int)

    for key, sentence in sorted(theData.items()):
        if 0 != theType and theType != sentence.getType():
            continue
        if which != sentence.getWhich():
            continue
        papersSeen.add(key.split()[0])
        if sentence.isAligned():
            continue

        prevDist = sentence.getPreviousDistance()
        if prevDist < 0:
            # 899 is the bucket for a negative previous distance (presumably
            # an insertion at the very beginning of a paper -- confirm).
            prevDist = 899
        # NOTE(review): this check is treated as independent of the negative
        # distance test above -- confirm that this is the intended nesting.
        if ("FINAL" == which
                and 0 == sentence.getRightParaSub()
                and 0 == sentence.getRightSentSub()):
            initialInsertions += 1
        countsByPrevDist[prevDist] += 1

    headerInfo = ''.join([
        "\nInput from file: %s\n" % (inFileName),
        "There were initial insertions in %3d of %3d papers\n"
        % (initialInsertions, len(papersSeen)),
        "Column 1: edit dist fracs before insertion\n",
        "Column 2: percent of total\n",
        "Column 3: raw numbers of total\n",
        "Column 4: the histogram\n",
    ])

    label = "INSERTIONS BY EDIT DIST FRAC OF PREVIOUS SENTENCE FOR TYPE %d " % (theType)
    print("%s %s" % (label, which))
    label += headerInfo

    if countsByPrevDist:
        histo, _ = Histogram.histoTheData(label, countsByPrevDist, 4, 4, outFile)
        outFile.write("%s\n" % (histo))
    else:
        outFile.write("NO INSERTIONS FOR %s\n" % (label))
def histoSentenceCounts(theData, theType, which, outFile):
    """Histogram the largest sentence subscript seen in each paper.

    For every sentence matching ``theType`` and ``which`` the sentence
    subscript is read (left side for ``"DRAFT"``, right side otherwise) and
    the maximum per paper name is kept.  The histogram input maps each
    maximum to the number of papers having it.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``,
            ``getName()``, ``getLeftSentSub()`` and ``getRightSentSub()``.
        theType: type to keep; 0 keeps all types.
        which: ``"DRAFT"`` selects the left subscript, anything else the
            right one; it is also matched against ``getWhich()``.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.

    NOTE(review): the tallied value is the maximum subscript, not a count of
    sentences; whether the two agree depends on how subscripts are numbered.
    """
    maxSentPerPaper = defaultdict(int)
    for _, sent in sorted(theData.items()):
        if (0 != theType) and (theType != sent.getType()):
            continue
        if which != sent.getWhich():
            continue
        name = sent.getName()
        if "DRAFT" == which:
            sentNum = int(sent.getLeftSentSub())
        else:
            sentNum = int(sent.getRightSentSub())
        # Reading the defaultdict entry creates it with 0, so every paper
        # that has a matching sentence gets an entry.
        if sentNum > maxSentPerPaper[name]:
            maxSentPerPaper[name] = sentNum

    # Invert: maximum subscript -> number of papers with that maximum.
    papersPerMax = defaultdict(int)
    for _, maxValue in sorted(maxSentPerPaper.items()):
        papersPerMax[maxValue] += 1

    headerInfo = "\nInput from file: %s\n" % (inFileName)
    # This report is about sentences; the column title used to say
    # "paragraphs", which belongs to the paragraph-count report.
    headerInfo += "Column 1: # of sentences " + which + "\n"
    headerInfo += "Column 2: percent of total\n"
    headerInfo += "Column 3: raw numbers of total\n"
    headerInfo += "Column 4: the histogram\n"

    label = "SENTENCE COUNTS FOR TYPE %d " % (theType)
    print("%s %s" % (label, which))
    label += headerInfo

    if papersPerMax:
        # The second return value (a short version) is not used here.
        histo, _ = Histogram.histoTheData(label, papersPerMax, 2, 2, outFile)
        outFile.write("%s\n" % (histo))
    else:
        outFile.write("NO SENTENCES FOR %s\n" % (label))
def histoDeletionsByPara(theData, theType, which, lastParaNumDict, theFilter, outFile):
    """Histogram unaligned sentences by their left-side paragraph number.

    A sentence is a candidate when it matches ``theType`` and ``which`` and
    ``isAligned()`` is false.  ``theFilter`` then narrows by position:
    ``'FIRST'`` keeps paragraph 0, ``'LAST'`` keeps the paper's last
    paragraph, ``'MIDDLE'`` keeps everything in between, and any other value
    keeps all candidates.

    Args:
        theData: mapping from string keys to objects offering ``getType()``,
            ``getWhich()``, ``isAligned()`` and ``getLeftParaSub()``.
        theType: type to keep; 0 keeps all types.
        which: only sentences whose ``getWhich()`` equals this are used.
        lastParaNumDict: mapping from ``'<doc> <which>'`` (``<doc>`` being
            the first token of the key) to that paper's last paragraph number.
        theFilter: position filter as described above.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    The label always says "FROM DRAFT", whatever ``which`` is.
    """
    deletionsPerPara = defaultdict(int)
    for key, sentence in sorted(theData.items()):
        if 0 != theType and theType != sentence.getType():
            continue
        if which != sentence.getWhich():
            continue
        if sentence.isAligned():
            continue

        paraNum = sentence.getLeftParaSub()
        # Looked up for every candidate, whatever the filter is.
        lastParaNum = lastParaNumDict[key.split()[0] + ' ' + which]
        isFirst = (0 == paraNum)
        isLast = (lastParaNum == paraNum)

        if 'FIRST' == theFilter:
            keep = isFirst
        elif 'LAST' == theFilter:
            keep = isLast
        elif 'MIDDLE' == theFilter:
            keep = not (isFirst or isLast)
        else:
            keep = True
        if keep:
            deletionsPerPara[paraNum] += 1

    headerInfo = (
        '\nInput from file: %s for %s\n' % (inFileName, theFilter)
        + 'Column 1: paragraph numbers\n'
        + 'Column 2: percent of total\n'
        + 'Column 3: raw numbers of total\n'
        + 'Column 4: the histogram\n'
    )

    label = 'DELETIONS BY PARAGRAPH FROM DRAFT FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if not deletionsPerPara:
        outFile.write('NO DELETIONS FOR %s\n' % (label))
        return
    histo, _ = Histogram.histoTheData(label, deletionsPerPara, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoParagraphCounts(theData, theType, which, outFile):
    """Histogram the largest paragraph subscript seen in each paper.

    For every sentence matching ``theType`` and ``which`` the paragraph
    subscript is read (left side for ``'DRAFT'``, right side otherwise) and
    the maximum per paper name is kept.  The histogram input maps each
    maximum to the number of papers having it.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``,
            ``getName()``, ``getLeftParaSub()`` and ``getRightParaSub()``.
        theType: type to keep; 0 keeps all types.
        which: ``'DRAFT'`` selects the left subscript, anything else the
            right one; it is also matched against ``getWhich()``.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    """
    useLeftSide = ('DRAFT' == which)
    maxParaPerPaper = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        if 0 != theType and theType != sentence.getType():
            continue
        if which != sentence.getWhich():
            continue
        paperName = sentence.getName()
        if useLeftSide:
            paraNum = int(sentence.getLeftParaSub())
        else:
            paraNum = int(sentence.getRightParaSub())
        # Missing papers start at 0, so the running maximum never drops below 0.
        maxParaPerPaper[paperName] = max(maxParaPerPaper[paperName], paraNum)

    # Invert: maximum subscript -> number of papers with that maximum.
    papersPerMax = defaultdict(int)
    for _, largest in sorted(maxParaPerPaper.items()):
        papersPerMax[largest] += 1

    headerInfo = ''.join([
        '\nInput from file: %s\n' % (inFileName),
        'Column 1: # of paragraphs ' + which + '\n',
        'Column 2: percent of total\n',
        'Column 3: raw numbers of total\n',
        'Column 4: the histogram\n',
    ])

    label = 'PARAGRAPH COUNTS FOR TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if not papersPerMax:
        outFile.write('NO PARAGRAPHS FOR %s\n' % (label))
        return
    histo, _ = Histogram.histoTheData(label, papersPerMax, 1, 1, outFile)
    outFile.write('%s\n' % (histo))
def histoAlignmentLevels(theData, theType, which, outFile):
    """Histogram the highest alignment level seen in each paper.

    For every sentence matching ``theType`` and ``which`` the value
    ``int(getAlignmentLevel())`` is read and the maximum per paper name is
    kept, starting from 0.  The histogram input maps each maximum to the
    number of papers having it.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``,
            ``getName()`` and ``getAlignmentLevel()``.
        theType: type to keep; 0 keeps all types.
        which: only sentences whose ``getWhich()`` equals this are used.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and prints the label to stdout.
    """
    topLevelPerPaper = defaultdict(int)
    for _, sentence in sorted(theData.items()):
        typeMismatch = (0 != theType) and (theType != sentence.getType())
        if typeMismatch or which != sentence.getWhich():
            continue
        paperName = sentence.getName()
        level = int(sentence.getAlignmentLevel())
        # Entries start at 0, so negative levels never replace the default.
        topLevelPerPaper[paperName] = max(topLevelPerPaper[paperName], level)

    # Invert: highest level -> number of papers with that highest level.
    papersPerLevel = defaultdict(int)
    for _, highest in sorted(topLevelPerPaper.items()):
        papersPerLevel[highest] += 1

    headerInfo = (
        "\nInput from file: %s\n" % (inFileName)
        + "Column 1: last alignment with changes\n"
        + "Column 2: percent of total\n"
        + "Column 3: raw numbers of total\n"
        + "Column 4: the histogram\n"
    )

    label = "LAST ALIGNMENT FOR TYPE %d " % (theType)
    print("%s %s" % (label, which))
    label += headerInfo

    if papersPerLevel:
        histo, _ = Histogram.histoTheData(label, papersPerLevel, 1, 1, outFile)
        outFile.write("%s\n" % (histo))
    else:
        outFile.write("NO ALIGNMENTS FOR %s\n" % (label))
def histoEditDistance(theData, theType, which, outFile):
    """Write an edit-distance histogram and, for type 0, save a plot.

    Text part: every sentence matching ``theType`` and ``which`` is tallied
    under ``int(round(getEditDistFracOfWorst() * 100.0))``; negative values
    are tallied under 999.  The histogram is written to ``outFile``.

    Plot part (only when ``theType`` is 0): the rounded values of all
    sentences matching ``which`` are collected per sentence type, plus once
    more under type 0 ("all"), with negative values stored as -5.  The
    series are drawn as one multi-series histogram that is saved under the
    name ``'EditDistHistograms' + which``.

    Args:
        theData: mapping whose values offer ``getType()``, ``getWhich()``
            and ``getEditDistFracOfWorst()``.
        theType: type to keep; 0 keeps all types and enables the plot.
        which: only sentences whose ``getWhich()`` equals this are used.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and ``plt`` and prints the label
    to stdout.
    """
    countsByDist = defaultdict(int)
    for _, sent in sorted(theData.items()):
        if (0 != theType) and (theType != sent.getType()):
            continue
        if which != sent.getWhich():
            continue
        distRounded = int(round(sent.getEditDistFracOfWorst() * 100.0))
        if distRounded < 0:
            # Negative fractions share the 999 bucket (presumably the
            # 'UNALIGN' row mentioned in the header -- confirm).
            distRounded = 999
        countsByDist[distRounded] += 1

    headerInfo = '\nInput from file: %s\n' % (inFileName)
    headerInfo += 'Column 1: % change in aligned sentences from draft to final\n'
    headerInfo += " 'UNALIGN' means deletions from draft, insertions into final\n"
    headerInfo += 'Column 2: % of sentences with that change\n'
    headerInfo += 'Column 3: raw numbers of sentences with that change\n'
    headerInfo += 'Column 4: the histogram\n'

    label = 'EDIT DISTANCE COMPARISONS FOR ' + which + ' OF TYPE %d ' % (theType)
    print('%s %s' % (label, which))
    label += headerInfo

    if countsByDist:
        # The second return value (a short version) is not used here.
        histo, _ = Histogram.histoTheData(label, countsByDist, 4, 4, outFile)
        outFile.write('%s\n' % (histo))
    else:
        outFile.write('NO EDIT DIST FRACS FOR %s\n' % (label))

    # The plot is only produced for the "all types" request.
    if 0 != theType:
        return

    # Every series 0..4 starts with a single 0 entry (presumably so that no
    # series handed to hist() is empty -- confirm).
    distsByType = defaultdict(list)
    for typeKey in range(0, 5):
        distsByType[typeKey] = [0]

    for _, sent in sorted(theData.items()):
        if which != sent.getWhich():
            continue
        thisType = sent.getType()
        distRounded = int(round(sent.getEditDistFracOfWorst() * 100.0))
        if distRounded < 0:
            distRounded = -5
        # Once under the sentence's own type, once under type 0 ("all").
        distsByType[thisType].append(distRounded)
        distsByType[0].append(distRounded)

    multiset = [values for _, values in sorted(distsByType.items())]

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        numBins = 25
        ax.hist(multiset, numBins, color=['green', 'red', 'blue', 'lime', 'orange'],
                label=['0', '1', '2', '3', '4'], alpha=0.8)
        ax.legend(prop={'size': 10})
        ax.set_title('Edit Distance Histograms ' + which)
        plt.savefig('EditDistHistograms' + which)
    finally:
        # pyplot keeps every figure alive until it is closed; without this
        # each call leaves one more open figure behind.
        plt.close(fig)
def histoDiversityByWordCount(theData, theType, whichLeft, whichRight, outFile):
    """Write two diversity histograms to ``outFile`` and show them as a plot.

    For every kept row the values in columns ``ALLDIVERSITY`` and
    ``NONSTOPDIVERSITY`` are read as floats, multiplied by 100 and rounded.
    One text histogram is written per column, and when at least one row was
    kept both value lists are shown together in one matplotlib histogram.

    Args:
        theData: iterable of indexable rows.  ``row[1]`` is the paper name;
            ``'.txt'`` is removed from it and, when ``theType`` is nonzero,
            its last character is parsed as an integer type.
        theType: type to keep; 0 keeps every row.
        whichLeft: tag used in the header of the first histogram.
        whichRight: tag used in the header of the second histogram.
        outFile: writable text stream; also handed to
            ``Histogram.histoTheData``.

    Reads the module-level ``inFileName`` and ``plt``, prints the list of
    rounded first-column values to stdout, and calls ``plt.show()``.
    """
    ######################################################################
    ## REMEMBER that general diversity is all and nonstopwords
    ## REMEMBER that student diversity is all for draft and final
    ## (the student layout uses columns 5 and 8 instead of 6 and 9)
    ALLDIVERSITY = 6
    NONSTOPDIVERSITY = 9

    allDict = defaultdict(int)
    nonstopDict = defaultdict(int)
    allRoundedList = []
    nonstopRoundedList = []
    for row in theData:
        name = row[1].replace('.txt', '')
        # The type of a paper is taken from the last character of its name.
        if (0 != theType) and (int(theType) != int(name[-1])):
            continue
        allDiversity = float(row[ALLDIVERSITY])
        nonstopDiversity = float(row[NONSTOPDIVERSITY])

        allRounded = round(100 * allDiversity)
        nonstopRounded = round(100 * nonstopDiversity)
        allRoundedList.append(allRounded)
        nonstopRoundedList.append(nonstopRounded)
        allDict[int(allRounded)] += 1
        nonstopDict[int(nonstopRounded)] += 1

    # Both reports share one layout; only the tag and the tallies differ.
    for which, tallies in ((whichLeft, allDict), (whichRight, nonstopDict)):
        headerInfo = '\nInput from file: %s\n' % (inFileName)
        # Append here: a plain assignment would throw away the file name line.
        headerInfo += 'Diversity is quotient of #uniq words and #words for ' + which + '\n'
        headerInfo += 'Column 1: This quotient times 100\n'
        headerInfo += 'Column 2: Percent of papers with this quotient\n'
        headerInfo += 'Column 3: Raw number of papers with this quotient\n'
        headerInfo += 'Column 4: The histogram\n'

        label = 'DIVERSITY FOR TYPE %d\n' % (theType)
        label += headerInfo

        if tallies:
            # The second return value (a short version) is not used here.
            histo, _ = Histogram.histoTheData(label, tallies, 1, 1, outFile)
            outFile.write('%s\n' % (histo))
        else:
            outFile.write('NO PAPERS FOR %s\n' % (label))

    print(allRoundedList)

    if allRoundedList:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        numBins = 50
        multiset = [allRoundedList, nonstopRoundedList]
        ax.hist(multiset, numBins, color=['green', 'red'], label=['All', 'Nonstop'], alpha=0.8)
        ax.legend(prop={'size': 10})
        ax.set_title('Allword versus Nonstopword diversity')
        plt.show()