def test_quoteBounds(self):
    # Smoke test: the baseline quote-bound tagging is inspected visually via the
    # printed output, so no real assertion is made at the end.

    # Sentence that matches the first regular expression rule:
    qs = baseline.quotationStart(self._corpus[0])
    qe = baseline.quotationEnd(self._corpus[0], qs)
    qb = baseline.quoteBounds(qs, qe)
    for i in range(len(qe)):
        print(self._corpus[0][i][0], "\t", qs[i], qe[i], qb[i])

    # Sentence that matches the second regular expression rule:
    qs = baseline.quotationStart(self._corpus[231])
    qe = baseline.quotationEnd(self._corpus[231], qs)
    qb = baseline.quoteBounds(qs, qe)
    for i in range(len(qe)):
        print(self._corpus[231][i][0], "\t", qs[i], qe[i], qb[i])

    self.assertTrue(True)
def testInterval(self):
    qs = baseline.quotationStart(self.corpus[0])
    qe = baseline.quotationEnd(self.corpus[0], qs)
    qb = baseline.quoteBounds(qs, qe)
    inte1 = wisinput.interval(qb, 0)
    resp1 = [(1, 10), (54, 55), (63, 65), (83, 84), (100, 128), (172, 181),
             (185, 186), (209, 211), (246, 249), (293, 294), (302, 329),
             (331, 426), (464, 466), (479, 493), (585, 634), (654, 699),
             (736, 737), (743, 772)]
    # [ print(k, v) for k, v in enumerate(qb) ]
    # [ print(e) for e in inte1 ]

    qs = baseline.quotationStart(self.corpus[231])
    qe = baseline.quotationEnd(self.corpus[231], qs)
    qb = baseline.quoteBounds(qs, qe)
    inte2 = wisinput.interval(qb, 0)
    resp2 = [(48, 51), (73, 88), (90, 123), (241, 244), (320, 333), (337, 390),
             (393, 420), (542, 560), (563, 568), (572, 686), (689, 715),
             (717, 721), (762, 769), (772, 786), (790, 840), (843, 852),
             (856, 922), (925, 959), (963, 1045), (1080, 1101), (1103, 1104),
             (1108, 1201), (1204, 1227), (1250, 1297), (1300, 1316),
             (1347, 1351), (1353, 1401), (1404, 1419), (1423, 1506),
             (1509, 1526), (1530, 1593)]
    # [ print(k, v) for k, v in enumerate(qb) ]
    # [ print(e) for e in inte2 ]

    self.assertEqual(inte1, resp1)
    self.assertEqual(inte2, resp2)
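# Illustrative sketch (not part of the test suite): judging by the expected values in
# testInterval above, wisinput.interval collapses per-token quote-bound markers into
# (start, end) token-index pairs. The helper below is a hypothetical, self-contained
# approximation that assumes truthy flags mark tokens inside a quote; the actual
# marker scheme produced by baseline.quoteBounds may differ.
def _intervalsFromFlags(flags):
    intervals, start = [], None
    for k, flag in enumerate(flags):
        if flag and start is None:
            start = k                          # open a new interval
        elif not flag and start is not None:
            intervals.append((start, k - 1))   # close it at the previous token
            start = None
    if start is not None:
        intervals.append((start, len(flags) - 1))
    return intervals

# Example: _intervalsFromFlags([0, 1, 1, 0, 1]) -> [(1, 2), (4, 4)]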
def createInput(fileName=None, createTest=False):
    corpus = globoquotes.load("GloboQuotes/corpus-globocom-cv.txt")
    test = globoquotes.load("GloboQuotes/corpus-globocom-test.txt")
    converter = verbspeech.Converter()

    if not fileName:
        fileName = FILE_NAME
    open(fileName, 'w').close()  # truncate any previous output

    pos = feature.pos(corpus + test, posIndex=1)
    columns = feature.columns(pos)

    if createTest:
        corpus = test

    for i in range(len(corpus)):
        s = corpus[i]
        qs = baseline.quotationStart(s)
        qe = baseline.quotationEnd(s, qs)
        qb = baseline.quoteBounds(qs, qe)
        converter.vsay(s, tokenIndex=0, posIndex=1)
        for k in range(len(s)):
            # token (0), POS (1), coref column (7) plus the baseline quote flags
            print(k, s[k][0].ljust(30), s[k][1].ljust(10), s[k][7].ljust(5),
                  qs[k], qe[k], qb[k])

        # Baseline: X
        print("Create bc...")
        bc = baseline.boundedChunk(s)
        print("Create vsn...")
        vsn = baseline.verbSpeechNeighb(s)
        print("Create fluc...")
        fluc = baseline.firstLetterUpperCase(s)
        print("Identifying quotes...")
        quotes = wisinput.interval(qb)
        print("Identifying coreferences...")
        coref, labels = wisinput.coref(s, quotes, corefIndex=7)
        print("Creating features...")
        feat = feature.create(s, quotes=quotes, coref=coref, posIndex=1,
                              corefIndex=7, quoteBounds=qb, bc=bc, vsn=vsn,
                              fluc=fluc)
        print("Binarizing features...")
        bfeat = feature.binary(columns, feat)

        # Answer: Y
        print("Output: Creating y...")
        qbA = [e[INDEX_QB] for e in s]  # annotated quote bounds (gold answer)
        print("Output: Identifying quotes...")
        quotesA = wisinput.interval(qbA)
        print("Output: Quotes =", len(quotesA))
        print("Output: Identifying coreferences...")
        corefA, labelsA = wisinput.corefAnnotated(s, quotes=quotesA,
                                                  corefIndex=7, gpqIndex=6)
        print("Output: Coref =", len(corefA))
        print("Output: Creating features...")
        featA = feature.create(s, quotes=quotesA, coref=corefA, posIndex=1,
                               corefIndex=7, quoteBounds=qbA, bc=bc, vsn=vsn,
                               fluc=fluc, dummy=False)
        print("Output: Binarizing features...")
        bfeatA = feature.binary(columns, featA)
        print("Output: bFeat =", len(bfeatA))

        with open(fileName, 'a', newline='') as csvfile:
            swriter = csv.writer(csvfile, delimiter=';')
            for p in range(len(bfeat)):
                for q in range(len(bfeat[p])):
                    swriter.writerow([i, "x"] + list(quotes[p]) + [labels[p][q]] + bfeat[p][q])
            for p in range(len(bfeatA)):
                for q in range(len(bfeatA[p])):
                    swriter.writerow([i, "y"] + list(quotesA[p]) + [labelsA[p][q]] + bfeatA[p][q])

    print("Done!")

    # k = 0
    # for e in quotes:
    #     print(e, coref[k], bfeat[k])
    #     k += 1
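# Hedged usage sketch: one way this module might be driven when run as a script.
# Only the createInput(fileName, createTest) signature above is taken from the code
# itself; the argument names --out/--test are illustrative assumptions, not an
# existing CLI of this project.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate WIS input features from the GloboQuotes corpus.")
    parser.add_argument("--out", default=None,
                        help="output CSV file (defaults to FILE_NAME)")
    parser.add_argument("--test", action="store_true",
                        help="build features for the test corpus instead of the CV corpus")
    args = parser.parse_args()

    createInput(fileName=args.out, createTest=args.test)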