def testInterval(self):
    """Check the intervals found by wisinput.interval on corpus sentence 3.

    Calls ``wisinput.interval(self.corpus[3], 4)`` and expects the single
    interval ``(85, 102)``.
    """
    inte1 = wisinput.interval(self.corpus[3], 4)
    resp1 = [(85, 102)]
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(inte1, resp1)
def testInterval2(self):
    """Check wisinput.interval on a small hand-written 8-token sentence.

    Calls ``wisinput.interval(s, 4)`` on the inline fixture and compares the
    result with ``[(85, 102)]``.
    """
    s = [
        ['que', 'INDP', 'ChildL1:Root5', '-', 0],
        ['os', 'ART', 'ChildL2:Root5', 0, '-'],
        ['relatores', 'N', 'ChildL2:Root5', 0, '-'],
        ['consideram', 'FIN', 'Root5', '-', '-'],
        ['existir', 'INF', 'ChildR1:Root5', '-', 0],
        ['em', 'PRP', 'ChildR1:Root5', '-', 0],
        ['esta', 'DET', 'ChildR1:Root5', '-', 0],
        ['área', 'N', 'ChildR1:Root5', '-', 0],
    ]
    inte1 = wisinput.interval(s, 4)
    # NOTE(review): the fixture has only 8 tokens, so an interval at
    # (85, 102) looks out of range for it - confirm the expected value.
    resp1 = [(85, 102)]
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(inte1, resp1)
def createInput(fileName=None, createTest=False):
    """Creates a CSV file with the result of the preprocessing step.

    For every sentence two groups of ';'-separated rows are written:
    rows tagged "x" carry the features built from the baseline quote bounds,
    rows tagged "y" carry the features built from the annotated column
    INDEX_QB. Each row is: sentence index, tag, the values of the quote
    interval, the label, followed by the binary feature values.

    Args:
        fileName: The CSV file that will be created (an existing file is
            overwritten). Falls back to INPUT_FILE when empty or None.
        createTest: If true, the preprocessing is applied to the test set
            instead of the training corpus.
    """
    corpus = globoquotes.load(GLOBOQUOTES_FILE)
    test = globoquotes.load(GLOBOQUOTES_TEST_FILE)
    converter = verbspeech.Converter()

    if not fileName:
        fileName = INPUT_FILE

    # Open (and truncate) the output a single time instead of truncating it
    # first and then reopening it in append mode for every sentence.
    with open(fileName, 'w', newline='') as csvfile:
        swriter = csv.writer(csvfile, delimiter=';')

        # The POS columns are always derived from both sets, regardless of
        # which one is preprocessed below.
        pos = feature.pos(corpus + test, posIndex=1)
        columns = feature.columns(pos)

        if createTest:
            corpus = test

        for i, s in enumerate(corpus):
            qs = baseline.quotationStart(s)
            qe = baseline.quotationEnd(s, qs)
            qb = baseline.quoteBounds(qs, qe)

            converter.vsay(s, tokenIndex=0, posIndex=1)

            # Baseline: X
            bc = baseline.boundedChunk(s)
            vsn = baseline.verbSpeechNeighb(s)
            fluc = baseline.firstLetterUpperCase(s)
            quotes = wisinput.interval(qb)
            coref, labels = wisinput.coref(s, quotes, corefIndex=7)
            feat = feature.create(s, quotes=quotes, coref=coref, posIndex=1, corefIndex=7,
                                  quoteBounds=qb, bc=bc, vsn=vsn, fluc=fluc)
            bfeat = feature.binary(columns, feat)

            # Answer: Y
            qbA = [e[INDEX_QB] for e in s]
            quotesA = wisinput.interval(qbA)
            corefA, labelsA = wisinput.corefAnnotated(s, quotes=quotesA, corefIndex=7, gpqIndex=6)
            featA = feature.create(s, quotes=quotesA, coref=corefA, posIndex=1, corefIndex=7,
                                   quoteBounds=qbA, bc=bc, vsn=vsn, fluc=fluc, dummy=False)
            bfeatA = feature.binary(columns, featA)

            # The "x" rows are written before the "y" rows of the same sentence.
            groups = (
                ("x", quotes, labels, bfeat),
                ("y", quotesA, labelsA, bfeatA),
            )
            for tag, spans, tags, binFeat in groups:
                for p, candidates in enumerate(binFeat):
                    for q, row in enumerate(candidates):
                        swriter.writerow([i, tag] + list(spans[p]) + [tags[p][q]] + row)
def testInterval3(self):
    """Check wisinput.interval on a long hand-written multi-sentence fixture.

    Calls ``wisinput.interval(s, 4)`` on the inline fixture and compares the
    result with ``[(85, 102)]``.
    """
    s = [
        ['A', 'ART', 'ChildL1:Root1', '0', '-'],
        ['agência_Nova_China', 'PROP', 'ChildL1:Root1', '0', '-'],
        ['informou', 'FIN', 'Root1', '-', '-'],
        ['que', 'S', 'ChildR1:Root1', '-', '0'],
        ['para', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['redigir', 'INF', 'ChildR1:Root1', '-', '0'],
        ['este', 'DET', 'ChildR1:Root1', '-', '0'],
        ['dicionário', 'N', 'ChildR1:Root1', '-', '0'],
        ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['34_470', 'NUM', 'ChildR1:Root1', '-', '0'],
        ['entradas', 'N', 'ChildR1:Root1', '-', '0'],
        ['em', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['língua', 'N', 'ChildR1:Root1', '-', '0'],
        ['chinesa', 'ADJ', 'ChildR1:Root1', '-', '0'],
        [',', '', 'ChildR1:Root1', '-', '0'],
        ['foi', 'FIN', 'ChildR1:Root1', '-', '0'],
        ['necessário', 'ADJ', 'ChildR1:Root1', '-', '0'],
        ['o', 'ART', 'ChildR1:Root1', '-', '0'],
        ['trabalho', 'N', 'ChildR1:Root1', '-', '0'],
        ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['300', 'NUM', 'ChildR1:Root1', '-', '0'],
        ['especialistas', 'N', 'ChildR1:Root1', '-', '0'],
        ['durante', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['três', 'NUM', 'ChildR1:Root1', '-', '0'],
        ['anos', 'N', 'ChildR1:Root1', '-', '0'],
        ['.', '', '-', '-', '-'],
        ['A', 'ART', 'ChildL1:Root2', '-', '1'],
        ['enciclopédia', 'N', 'ChildL1:Root2', '-', '1'],
        ['«', '', 'ChildL1:Root2', '-', '1'],
        ['é', 'FIN', 'ChildL1:Root2', '-', '1'],
        ['considerada', 'PCP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['primeiro', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['grande', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['instrumento', 'N', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['trabalho', 'N', 'ChildL1:Root2', '-', '1'],
        ['exaustivo', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['e', 'C', 'ChildL1:Root2', '-', '1'],
        ['sistemático', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['para', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['mestudo', 'N', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['marxismo-leninismo', 'N', 'ChildL1:Root2', '-', '1'],
        ['a', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['ser', 'INF', 'ChildL1:Root2', '-', '1'],
        ['publicado', 'PCP', 'ChildL1:Root2', '-', '1'],
        ['depois', 'ADV', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['nascimento', 'N', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['a', 'ART', 'ChildL1:Root2', '-', '1'],
        ['doutrina', 'N', 'ChildL1:Root2', '-', '1'],
        ['marxista', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['»', '', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['explicou', 'FIN', 'Root2', '-', '-'],
        ['a', 'ART', 'ChildR1:Root2', '1', '-'],
        ['agência', 'N', 'ChildR1:Root2', '1', '-'],
        ['.', '', '-', '-', '-'],
        ['A', 'ART', '-', '-', '-'],
        ['primeira', 'ADJ', '-', '-', '-'],
        ['edição', 'N', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['de', 'PRP', '-', '-', '-'],
        ['11', 'NUM', '-', '-', '-'],
        ['mil', 'N', '-', '-', '-'],
        ['ecxemplares', 'N', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['está', 'FIN', '-', '-', '-'],
        ['já', 'ADV', '-', '-', '-'],
        ['reservada', 'PCP', '-', '-', '-'],
        ['em', 'PRP', '-', '-', '-'],
        ['a', 'ART', '-', '-', '-'],
        ['sua', 'DET', '-', '-', '-'],
        ['totalidade', 'N', '-', '-', '-'],
        ['.', '', '-', '-', '-'],
        ['Cessar-fogo', 'N', '-', '-', '-'],
        ['violado', 'PCP', '-', '-', '-'],
        ['em', 'PRP', '-', '-', '-'],
        ['o', 'ART', '-', '-', '-'],
        ['Sara', 'PROP', '-', '-', '-'],
        ['Cessar-fogo', 'N', '-', '-', '-'],
        ['violado', 'PCP', '-', '-', '-'],
        ['em', 'PRP', '-', '-', '-'],
        ['o', 'ART', '-', '-', '-'],
        ['Sara', 'PROP', '-', '-', '-'],
        ['A', 'ART', 'ChildL1:Root3', '-', '-'],
        ['FRENTE_Polisário', 'PROP', 'ChildL1:Root3', '-', '-'],
        ['acusou', 'FIN', 'Root3', '-', '-'],
        ['ontem', 'ADV', 'ChildR1:Root3', '-', '-'],
        ['Marrocos', 'PROP', 'ChildR2:Root3', '-', '-'],
        ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['violar', 'INF', 'ChildR3:Root3', '-', '-'],
        ['por', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['a', 'ART', 'ChildR3:Root3', '-', '-'],
        ['terceira', 'ADJ', 'ChildR3:Root3', '-', '-'],
        ['vez', 'N', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['cessar-fogo', 'N', 'ChildR3:Root3', '-', '-'],
        ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['Sara_Ocidental', 'PROP', 'ChildR3:Root3', '-', '-'],
        ['a', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['enviar', 'INF', 'ChildR3:Root3', '-', '-'],
        ['aviões', 'N', 'ChildR3:Root3', '-', '-'],
        ['para', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['sobrevoar', 'INF', 'ChildR3:Root3', '-', '-'],
        ['a', 'ART', 'ChildR3:Root3', '-', '-'],
        ['povoação', 'N', 'ChildR3:Root3', '-', '-'],
        ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['Mijek', 'PROP', 'ChildR3:Root3', '-', '-'],
        [',', '', 'ChildR3:Root3', '-', '-'],
        ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['sudeste', 'N', 'ChildR3:Root3', '-', '-'],
        ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['território', 'N', 'ChildR3:Root3', '-', '-'],
        ['.', '', '-', '-', '-'],
        ['«', '', '-', '-', '-'],
        ['Para', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['possa', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['não', 'ADV', 'ChildL1:Root5', '-', '2'],
        ['responder', 'INF', 'ChildL1:Root5', '-', '2'],
        ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['as', 'ART', 'ChildL1:Root5', '-', '2'],
        ['violações', 'N', 'ChildL1:Root5', '-', '2'],
        ['marroquinas', 'ADJ', 'ChildL1:Root5', '-', '2'],
        [',', '', 'ChildL1:Root5', '-', '2'],
        ['a', 'ART', 'ChildL1:Root5', '2', '2'],
        ['parte', 'N', 'ChildL1:Root5', '2', '2'],
        ['sarauí', 'ADJ', 'ChildL1:Root5', '2', '2'],
        ['exige', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['a', 'ART', 'ChildL1:Root5', '-', '2'],
        ['comunidade', 'N', 'ChildL1:Root5', '-', '2'],
        ['internacional', 'ADJ', 'ChildL1:Root5', '-', '2'],
        ['lance', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['um', 'ART', 'ChildL1:Root5', '-', '2'],
        ['alerta', 'N', 'ChildL1:Root5', '-', '2'],
        ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['Marrocos', 'PROP', 'ChildL1:Root5', '-', '2'],
        ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['cesse', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['as', 'ART', 'ChildL1:Root5', '-', '2'],
        ['provocações', 'N', 'ChildL1:Root5', '-', '2'],
        ['e', 'C', 'ChildL1:Root5', '-', '2'],
        ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['se', 'PERS', 'ChildL1:Root5', '-', '2'],
        ['comnporte', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['de', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['forma', 'N', 'ChildL1:Root5', '-', '2'],
        ['responsável', 'ADJ', 'ChildL1:Root5', '-', '2'],
        [',', '', 'ChildL1:Root5', '-', '2'],
        ['respeitando', 'GER', 'ChildL1:Root5', '-', '2'],
        ['os', 'ART', 'ChildL1:Root5', '-', '2'],
        ['seus', 'DET', 'ChildL1:Root5', '-', '2'],
        ['compromissos', 'N', 'ChildL1:Root5', '-', '2'],
        ['»', '', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['declarou', 'FIN', 'Root5', '-', '-'],
        ['a', 'ART', 'ChildR1:Root5', '2', '-'],
        ['Polisário', 'PROP', 'ChildR1:Root5', '2', '-'],
        [',', '', 'ChildR1:Root5', '2', '-'],
        ['organização', 'N', 'ChildR1:Root5', '2', '-'],
        ['que', 'INDP', 'ChildR1:Root5', '2', '-'],
        ['luta', 'FIN', 'ChildR1:Root5', '2', '-'],
        ['por', 'PRP', 'ChildR1:Root5', '2', '-'],
        ['a', 'ART', 'ChildR1:Root5', '2', '-'],
        ['independência', 'N', 'ChildR1:Root5', '2', '-'],
        ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
        ['o', 'ART', 'ChildR1:Root5', '2', '-'],
        ['território', 'N', 'ChildR1:Root5', '2', '-'],
        ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
        ['o', 'ART', 'ChildR1:Root5', '2', '-'],
        ['Sara_Ocidental', 'PROP', 'ChildR1:Root5', '2', '-'],
        [',', '', 'ChildR1:Root5', '2', '-'],
        ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
        ['um', 'ART', 'ChildR2:Root5', '-', '-'],
        ['comunicado', 'N', 'ChildR2:Root5', '-', '-'],
        ['divulgado', 'PCP', 'ChildR2:Root5', '-', '-'],
        ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
        ['Argel', 'PROP', 'ChildR2:Root5', '-', '-'],
        ['.', '', '-', '-', '-'],
    ]
    inte1 = wisinput.interval(s, 4)
    # NOTE(review): every token at positions 85-102 of this fixture has '-'
    # in column 4, yet [(85, 102)] is the expected result - confirm the
    # expectation is intended for this fixture.
    resp1 = [(85, 102)]
    # assertEqual reports both values on failure, unlike assertTrue(a == b)
    self.assertEqual(inte1, resp1)
def test_corefAnnotated2(self):
    """Check wisinput.corefAnnotated on a long hand-written fixture.

    The quote intervals are taken from column 4 of the fixture via
    ``wisinput.interval``; ``wisinput.corefAnnotated`` is then expected to
    return ``[[4], [8]]`` with the labels ``['Child2', 'Child3']``.
    """
    s = [
        ['A', 'ART', 'ChildL1:Root1', '0', '-'],
        ['agência_Nova_China', 'PROP', 'ChildL1:Root1', '0', '-'],
        ['informou', 'FIN', 'Root1', '-', '-'],
        ['que', 'S', 'ChildR1:Root1', '-', '0'],
        ['para', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['redigir', 'INF', 'ChildR1:Root1', '-', '0'],
        ['este', 'DET', 'ChildR1:Root1', '-', '0'],
        ['dicionário', 'N', 'ChildR1:Root1', '-', '0'],
        ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['34_470', 'NUM', 'ChildR1:Root1', '-', '0'],
        ['entradas', 'N', 'ChildR1:Root1', '-', '0'],
        ['em', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['língua', 'N', 'ChildR1:Root1', '-', '0'],
        ['chinesa', 'ADJ', 'ChildR1:Root1', '-', '0'],
        [',', '', 'ChildR1:Root1', '-', '0'],
        ['foi', 'FIN', 'ChildR1:Root1', '-', '0'],
        ['necessário', 'ADJ', 'ChildR1:Root1', '-', '0'],
        ['o', 'ART', 'ChildR1:Root1', '-', '0'],
        ['trabalho', 'N', 'ChildR1:Root1', '-', '0'],
        ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['300', 'NUM', 'ChildR1:Root1', '-', '0'],
        ['especialistas', 'N', 'ChildR1:Root1', '-', '0'],
        ['durante', 'PRP', 'ChildR1:Root1', '-', '0'],
        ['três', 'NUM', 'ChildR1:Root1', '-', '0'],
        ['anos', 'N', 'ChildR1:Root1', '-', '0'],
        ['.', '', '-', '-', '-'],
        ['A', 'ART', 'ChildL1:Root2', '-', '1'],
        ['enciclopédia', 'N', 'ChildL1:Root2', '-', '1'],
        ['«', '', 'ChildL1:Root2', '-', '1'],
        ['é', 'FIN', 'ChildL1:Root2', '-', '1'],
        ['considerada', 'PCP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['primeiro', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['grande', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['instrumento', 'N', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['trabalho', 'N', 'ChildL1:Root2', '-', '1'],
        ['exaustivo', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['e', 'C', 'ChildL1:Root2', '-', '1'],
        ['sistemático', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['para', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['mestudo', 'N', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['marxismo-leninismo', 'N', 'ChildL1:Root2', '-', '1'],
        ['a', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['ser', 'INF', 'ChildL1:Root2', '-', '1'],
        ['publicado', 'PCP', 'ChildL1:Root2', '-', '1'],
        ['depois', 'ADV', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['o', 'ART', 'ChildL1:Root2', '-', '1'],
        ['nascimento', 'N', 'ChildL1:Root2', '-', '1'],
        ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
        ['a', 'ART', 'ChildL1:Root2', '-', '1'],
        ['doutrina', 'N', 'ChildL1:Root2', '-', '1'],
        ['marxista', 'ADJ', 'ChildL1:Root2', '-', '1'],
        ['»', '', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['explicou', 'FIN', 'Root2', '-', '-'],
        ['a', 'ART', 'ChildR1:Root2', '1', '-'],
        ['agência', 'N', 'ChildR1:Root2', '1', '-'],
        ['.', '', '-', '-', '-'],
        ['A', 'ART', '-', '-', '-'],
        ['primeira', 'ADJ', '-', '-', '-'],
        ['edição', 'N', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['de', 'PRP', '-', '-', '-'],
        ['11', 'NUM', '-', '-', '-'],
        ['mil', 'N', '-', '-', '-'],
        ['ecxemplares', 'N', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['está', 'FIN', '-', '-', '-'],
        ['já', 'ADV', '-', '-', '-'],
        ['reservada', 'PCP', '-', '-', '-'],
        ['em', 'PRP', '-', '-', '-'],
        ['a', 'ART', '-', '-', '-'],
        ['sua', 'DET', '-', '-', '-'],
        ['totalidade', 'N', '-', '-', '-'],
        ['.', '', '-', '-', '-'],
        ['Cessar-fogo', 'N', '-', '-', '-'],
        ['violado', 'PCP', '-', '-', '-'],
        ['em', 'PRP', '-', '-', '-'],
        ['o', 'ART', '-', '-', '-'],
        ['Sara', 'PROP', '-', '-', '-'],
        ['Cessar-fogo', 'N', '-', '-', '-'],
        ['violado', 'PCP', '-', '-', '-'],
        ['em', 'PRP', '-', '-', '-'],
        ['o', 'ART', '-', '-', '-'],
        ['Sara', 'PROP', '-', '-', '-'],
        ['A', 'ART', 'ChildL1:Root3', '-', '-'],
        ['FRENTE_Polisário', 'PROP', 'ChildL1:Root3', '-', '-'],
        ['acusou', 'FIN', 'Root3', '-', '-'],
        ['ontem', 'ADV', 'ChildR1:Root3', '-', '-'],
        ['Marrocos', 'PROP', 'ChildR2:Root3', '-', '-'],
        ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['violar', 'INF', 'ChildR3:Root3', '-', '-'],
        ['por', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['a', 'ART', 'ChildR3:Root3', '-', '-'],
        ['terceira', 'ADJ', 'ChildR3:Root3', '-', '-'],
        ['vez', 'N', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['cessar-fogo', 'N', 'ChildR3:Root3', '-', '-'],
        ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['Sara_Ocidental', 'PROP', 'ChildR3:Root3', '-', '-'],
        ['a', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['enviar', 'INF', 'ChildR3:Root3', '-', '-'],
        ['aviões', 'N', 'ChildR3:Root3', '-', '-'],
        ['para', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['sobrevoar', 'INF', 'ChildR3:Root3', '-', '-'],
        ['a', 'ART', 'ChildR3:Root3', '-', '-'],
        ['povoação', 'N', 'ChildR3:Root3', '-', '-'],
        ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['Mijek', 'PROP', 'ChildR3:Root3', '-', '-'],
        [',', '', 'ChildR3:Root3', '-', '-'],
        ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['sudeste', 'N', 'ChildR3:Root3', '-', '-'],
        ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
        ['o', 'ART', 'ChildR3:Root3', '-', '-'],
        ['território', 'N', 'ChildR3:Root3', '-', '-'],
        ['.', '', '-', '-', '-'],
        ['«', '', '-', '-', '-'],
        ['Para', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['possa', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['não', 'ADV', 'ChildL1:Root5', '-', '2'],
        ['responder', 'INF', 'ChildL1:Root5', '-', '2'],
        ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['as', 'ART', 'ChildL1:Root5', '-', '2'],
        ['violações', 'N', 'ChildL1:Root5', '-', '2'],
        ['marroquinas', 'ADJ', 'ChildL1:Root5', '-', '2'],
        [',', '', 'ChildL1:Root5', '-', '2'],
        ['a', 'ART', 'ChildL1:Root5', '-', '2'],
        ['parte', 'N', 'ChildL1:Root5', '-', '2'],
        ['sarauí', 'ADJ', 'ChildL1:Root5', '-', '2'],
        ['exige', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['a', 'ART', 'ChildL1:Root5', '-', '2'],
        ['comunidade', 'N', 'ChildL1:Root5', '-', '2'],
        ['internacional', 'ADJ', 'ChildL1:Root5', '-', '2'],
        ['lance', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['um', 'ART', 'ChildL1:Root5', '-', '2'],
        ['alerta', 'N', 'ChildL1:Root5', '-', '2'],
        ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['Marrocos', 'PROP', 'ChildL1:Root5', '-', '2'],
        ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['cesse', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['as', 'ART', 'ChildL1:Root5', '-', '2'],
        ['provocações', 'N', 'ChildL1:Root5', '-', '2'],
        ['e', 'C', 'ChildL1:Root5', '-', '2'],
        ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['que', 'S', 'ChildL1:Root5', '-', '2'],
        ['se', 'PERS', 'ChildL1:Root5', '-', '2'],
        ['comnporte', 'FIN', 'ChildL1:Root5', '-', '2'],
        ['de', 'PRP', 'ChildL1:Root5', '-', '2'],
        ['forma', 'N', 'ChildL1:Root5', '-', '2'],
        ['responsável', 'ADJ', 'ChildL1:Root5', '-', '2'],
        [',', '', 'ChildL1:Root5', '-', '2'],
        ['respeitando', 'GER', 'ChildL1:Root5', '-', '2'],
        ['os', 'ART', 'ChildL1:Root5', '-', '2'],
        ['seus', 'DET', 'ChildL1:Root5', '-', '2'],
        ['compromissos', 'N', 'ChildL1:Root5', '-', '2'],
        ['»', '', '-', '-', '-'],
        [',', '', '-', '-', '-'],
        ['declarou', 'FIN', 'Root5', '-', '-'],
        ['a', 'ART', 'ChildR1:Root5', '2', '-'],
        ['Polisário', 'PROP', 'ChildR1:Root5', '2', '-'],
        [',', '', 'ChildR1:Root5', '2', '-'],
        ['organização', 'N', 'ChildR1:Root5', '2', '-'],
        ['que', 'INDP', 'ChildR1:Root5', '2', '-'],
        ['luta', 'FIN', 'ChildR1:Root5', '2', '-'],
        ['por', 'PRP', 'ChildR1:Root5', '2', '-'],
        ['a', 'ART', 'ChildR1:Root5', '2', '-'],
        ['independência', 'N', 'ChildR1:Root5', '2', '-'],
        ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
        ['o', 'ART', 'ChildR1:Root5', '2', '-'],
        ['território', 'N', 'ChildR1:Root5', '2', '-'],
        ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
        ['o', 'ART', 'ChildR1:Root5', '2', '-'],
        ['Sara_Ocidental', 'PROP', 'ChildR1:Root5', '2', '-'],
        [',', '', 'ChildR1:Root5', '2', '-'],
        ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
        ['um', 'ART', 'ChildR2:Root5', '-', '-'],
        ['comunicado', 'N', 'ChildR2:Root5', '-', '-'],
        ['divulgado', 'PCP', 'ChildR2:Root5', '-', '-'],
        ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
        ['Argel', 'PROP', 'ChildR2:Root5', '-', '-'],
        ['.', '', '-', '-', '-'],
    ]
    answer = [[4], [8]]
    lAnswer = ['Child2', 'Child3']

    qbA = [e[4] for e in s]
    quotesA = wisinput.interval(qbA)
    print("quotes: ", quotesA)

    coref, labels = wisinput.corefAnnotated(s, quotes=quotesA, depIndex=2, corefIndex=3, quoteIndex=4)
    print("coref: ", coref)

    # Separate assertions so a failure says which of the two results is
    # wrong and shows the differing values.
    self.assertEqual(coref, answer)
    self.assertEqual(labels, lAnswer)
def createInput(fileName=None, createTest=False):
    """Creates a CSV file with the result of the preprocessing step.

    For every sentence two groups of ';'-separated rows are written:
    rows tagged "x" carry the features built from the candidates returned by
    wisinput.candidates, rows tagged "y" carry the features built from the
    annotated column INDEX_QB. Each row is: sentence index, tag, the values
    of the quote interval, the label, followed by the binary feature values.

    Args:
        fileName: The CSV file that will be created (an existing file is
            overwritten). Falls back to INPUT_FILE when empty or None.
        createTest: If true, the preprocessing is applied to the test set
            instead of the training corpus.
    """
    corpus = bosquequotes.load(BOSQUE_FILE)
    test = bosquequotes.load(BOSQUE_TEST_FILE)
    converter = verbspeech.Converter()

    if not fileName:
        fileName = INPUT_FILE

    # Open (and truncate) the output a single time instead of truncating it
    # first and then reopening it in append mode for every sentence.
    with open(fileName, 'w', newline='') as csvfile:
        swriter = csv.writer(csvfile, delimiter=';')

        # The POS columns are always derived from both sets, regardless of
        # which one is preprocessed below.
        pos = feature.pos(corpus + test, posIndex=1)
        columns = feature.columns(pos)

        if createTest:
            corpus = test

        for i, s in enumerate(corpus):
            qb = baseline.quoteBounds(s)

            converter.vsay(s, tokenIndex=0, posIndex=1)

            # Baseline: X
            bc = baseline.boundedChunk(s)
            vsn = baseline.verbSpeechNeighb(s)
            fluc = baseline.firstLetterUpperCase(s)
            quotes, coref, labels = wisinput.candidates(s, depIndex=2)
            feat = feature.create(s, quotes=quotes, coref=coref, posIndex=1, corefIndex=3,
                                  quoteBounds=qb, bc=bc, vsn=vsn, fluc=fluc)
            bfeat = feature.binary(columns, feat)

            # Answer: Y
            qbA = [e[INDEX_QB] for e in s]
            quotesA = wisinput.interval(qbA)
            corefA, labelsA = wisinput.corefAnnotated(s, quotes=quotesA, depIndex=2, corefIndex=3,
                                                      quoteIndex=4)
            # NOTE(review): this is printed once per sentence and is the only
            # progress message left active - confirm it is still wanted.
            print("Output: Creating features...")
            featA = feature.create(s, quotes=quotesA, coref=corefA, posIndex=1, corefIndex=3,
                                   quoteBounds=qbA, bc=bc, vsn=vsn, fluc=fluc, dummy=False)
            bfeatA = feature.binary(columns, featA)

            # The "x" rows are written before the "y" rows of the same sentence.
            groups = (
                ("x", quotes, labels, bfeat),
                ("y", quotesA, labelsA, bfeatA),
            )
            for tag, spans, tags, binFeat in groups:
                for p, candidates in enumerate(binFeat):
                    for q, row in enumerate(candidates):
                        swriter.writerow([i, tag] + list(spans[p]) + [tags[p][q]] + row)