コード例 #1
0
    def testInterval(self):
        """Check ``wisinput.interval`` on the element at index 3 of the corpus.

        The second positional argument passed to ``wisinput.interval`` is 4
        (presumably the index of the column to scan -- confirm against
        ``wisinput``).
        """
        inte1 = wisinput.interval(self.corpus[3], 4)

        resp1 = [(85, 102)]

        # assertEqual reports both values when they differ, whereas
        # assertTrue(a == b) would only say "False is not true".
        self.assertEqual(inte1, resp1)
コード例 #2
0
    def testInterval2(self):
        """Check ``wisinput.interval`` on a small hand-written token table.

        Every row of ``s`` has five fields; the call passes 4 as the second
        positional argument (presumably the index of the column to scan --
        confirm against ``wisinput``).
        """
        s = [['que', 'INDP', 'ChildL1:Root5', '-', 0],
             ['os', 'ART', 'ChildL2:Root5', 0, '-'],
             ['relatores', 'N', 'ChildL2:Root5', 0, '-'],
             ['consideram', 'FIN', 'Root5', '-', '-'],
             ['existir', 'INF', 'ChildR1:Root5', '-', 0],
             ['em', 'PRP', 'ChildR1:Root5', '-', 0],
             ['esta', 'DET', 'ChildR1:Root5', '-', 0],
             ['área', 'N', 'ChildR1:Root5', '-', 0]]

        inte1 = wisinput.interval(s, 4)

        # NOTE(review): (85, 102) lies outside this 8-row table; the expected
        # value looks copied from testInterval -- confirm the intended result.
        resp1 = [(85, 102)]

        # assertEqual reports both values when they differ, whereas
        # assertTrue(a == b) would only say "False is not true".
        self.assertEqual(inte1, resp1)
コード例 #3
0
def createInput(fileName=None, createTest=False):
    """Creates a CSV file with the result of the preprocessing step.

    For every sentence two groups of ';'-separated rows are written: rows
    tagged "x", built from the baseline quote bounds, and rows tagged "y",
    built from the annotated column ``INDEX_QB``. Each row starts with the
    sentence index and the tag, followed by the quote tuple, one label and
    the binary feature list.

    Args:
        fileName: The CSV file that will be created. Falls back to
            ``INPUT_FILE`` when empty or None.
        createTest: If the preprocessing will be applied to the test set
            instead of the main corpus.
    """
    corpus = globoquotes.load(GLOBOQUOTES_FILE)
    test = globoquotes.load(GLOBOQUOTES_TEST_FILE)
    converter = verbspeech.Converter()

    if not fileName:
        fileName = INPUT_FILE

    # Open (and thereby truncate) the output exactly once instead of
    # truncating it and then reopening it in append mode for every sentence.
    with open(fileName, 'w', newline='') as csvfile:
        swriter = csv.writer(csvfile, delimiter=';')

        # The feature columns are derived from both sets, regardless of
        # which one is written out below.
        pos = feature.pos(corpus + test, posIndex=1)
        columns = feature.columns(pos)

        if createTest:
            corpus = test

        for i, s in enumerate(corpus):
            qs = baseline.quotationStart(s)
            qe = baseline.quotationEnd(s, qs)
            qb = baseline.quoteBounds(qs, qe)

            converter.vsay(s, tokenIndex=0, posIndex=1)

            # Baseline: X
            bc = baseline.boundedChunk(s)
            vsn = baseline.verbSpeechNeighb(s)
            fluc = baseline.firstLetterUpperCase(s)

            # Quotes and coreference candidates from the baseline bounds.
            quotes = wisinput.interval(qb)
            coref, labels = wisinput.coref(s, quotes, corefIndex=7)

            feat = feature.create(s, quotes=quotes, coref=coref, posIndex=1,
                                  corefIndex=7, quoteBounds=qb, bc=bc,
                                  vsn=vsn, fluc=fluc)
            bfeat = feature.binary(columns, feat)

            # Answer: Y -- same pipeline, driven by the annotated column.
            qbA = [e[INDEX_QB] for e in s]
            quotesA = wisinput.interval(qbA)
            corefA, labelsA = wisinput.corefAnnotated(s, quotes=quotesA,
                                                      corefIndex=7,
                                                      gpqIndex=6)

            featA = feature.create(s, quotes=quotesA, coref=corefA,
                                   posIndex=1, corefIndex=7, quoteBounds=qbA,
                                   bc=bc, vsn=vsn, fluc=fluc, dummy=False)
            bfeatA = feature.binary(columns, featA)

            for p, group in enumerate(bfeat):
                for q, row in enumerate(group):
                    swriter.writerow([i, "x"] + list(quotes[p]) +
                                     [labels[p][q]] + row)

            for p, group in enumerate(bfeatA):
                for q, row in enumerate(group):
                    swriter.writerow([i, "y"] + list(quotesA[p]) +
                                     [labelsA[p][q]] + row)
コード例 #4
0
    def testInterval3(self):
        """Check ``wisinput.interval`` on a long hand-written token table.

        Every row of ``s`` has five fields; the call passes 4 as the second
        positional argument (presumably the index of the column to scan --
        confirm against ``wisinput``).
        """
        s = [['A', 'ART', 'ChildL1:Root1', '0', '-'],
             ['agência_Nova_China', 'PROP', 'ChildL1:Root1', '0', '-'],
             ['informou', 'FIN', 'Root1', '-', '-'],
             ['que', 'S', 'ChildR1:Root1', '-', '0'],
             ['para', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['redigir', 'INF', 'ChildR1:Root1', '-', '0'],
             ['este', 'DET', 'ChildR1:Root1', '-', '0'],
             ['dicionário', 'N', 'ChildR1:Root1', '-', '0'],
             ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['34_470', 'NUM', 'ChildR1:Root1', '-', '0'],
             ['entradas', 'N', 'ChildR1:Root1', '-', '0'],
             ['em', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['língua', 'N', 'ChildR1:Root1', '-', '0'],
             ['chinesa', 'ADJ', 'ChildR1:Root1', '-', '0'],
             [',', '', 'ChildR1:Root1', '-', '0'],
             ['foi', 'FIN', 'ChildR1:Root1', '-', '0'],
             ['necessário', 'ADJ', 'ChildR1:Root1', '-', '0'],
             ['o', 'ART', 'ChildR1:Root1', '-', '0'],
             ['trabalho', 'N', 'ChildR1:Root1', '-', '0'],
             ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['300', 'NUM', 'ChildR1:Root1', '-', '0'],
             ['especialistas', 'N', 'ChildR1:Root1', '-', '0'],
             ['durante', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['três', 'NUM', 'ChildR1:Root1', '-', '0'],
             ['anos', 'N', 'ChildR1:Root1', '-',
              '0'], ['.', '', '-', '-', '-'],
             ['A', 'ART', 'ChildL1:Root2', '-', '1'],
             ['enciclopédia', 'N', 'ChildL1:Root2', '-', '1'],
             ['«', '', 'ChildL1:Root2', '-', '1'],
             ['é', 'FIN', 'ChildL1:Root2', '-', '1'],
             ['considerada', 'PCP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['primeiro', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['grande', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['instrumento', 'N', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['trabalho', 'N', 'ChildL1:Root2', '-', '1'],
             ['exaustivo', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['e', 'C', 'ChildL1:Root2', '-', '1'],
             ['sistemático', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['para', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['mestudo', 'N', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['marxismo-leninismo', 'N', 'ChildL1:Root2', '-', '1'],
             ['a', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['ser', 'INF', 'ChildL1:Root2', '-', '1'],
             ['publicado', 'PCP', 'ChildL1:Root2', '-', '1'],
             ['depois', 'ADV', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['nascimento', 'N', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['a', 'ART', 'ChildL1:Root2', '-', '1'],
             ['doutrina', 'N', 'ChildL1:Root2', '-', '1'],
             ['marxista', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['»', '', '-', '-', '-'], [',', '', '-', '-', '-'],
             ['explicou', 'FIN', 'Root2', '-', '-'],
             ['a', 'ART', 'ChildR1:Root2', '1', '-'],
             ['agência', 'N', 'ChildR1:Root2', '1', '-'],
             ['.', '', '-', '-', '-'], ['A', 'ART', '-', '-', '-'],
             ['primeira', 'ADJ', '-', '-', '-'],
             ['edição', 'N', '-', '-', '-'], [',', '', '-', '-', '-'],
             ['de', 'PRP', '-', '-', '-'], ['11', 'NUM', '-', '-', '-'],
             ['mil', 'N', '-', '-', '-'], ['ecxemplares', 'N', '-', '-', '-'],
             [',', '', '-', '-', '-'], ['está', 'FIN', '-', '-', '-'],
             ['já', 'ADV', '-', '-', '-'], ['reservada', 'PCP', '-', '-', '-'],
             ['em', 'PRP', '-', '-', '-'], ['a', 'ART', '-', '-', '-'],
             ['sua', 'DET', '-', '-', '-'], ['totalidade', 'N', '-', '-', '-'],
             ['.', '', '-', '-', '-'], ['Cessar-fogo', 'N', '-', '-', '-'],
             ['violado', 'PCP', '-', '-', '-'], ['em', 'PRP', '-', '-', '-'],
             ['o', 'ART', '-', '-', '-'], ['Sara', 'PROP', '-', '-', '-'],
             ['Cessar-fogo', 'N', '-', '-', '-'],
             ['violado', 'PCP', '-', '-', '-'], ['em', 'PRP', '-', '-', '-'],
             ['o', 'ART', '-', '-', '-'], ['Sara', 'PROP', '-', '-', '-'],
             ['A', 'ART', 'ChildL1:Root3', '-', '-'],
             ['FRENTE_Polisário', 'PROP', 'ChildL1:Root3', '-', '-'],
             ['acusou', 'FIN', 'Root3', '-', '-'],
             ['ontem', 'ADV', 'ChildR1:Root3', '-', '-'],
             ['Marrocos', 'PROP', 'ChildR2:Root3', '-', '-'],
             ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['violar', 'INF', 'ChildR3:Root3', '-', '-'],
             ['por', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['a', 'ART', 'ChildR3:Root3', '-', '-'],
             ['terceira', 'ADJ', 'ChildR3:Root3', '-', '-'],
             ['vez', 'N', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['cessar-fogo', 'N', 'ChildR3:Root3', '-', '-'],
             ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['Sara_Ocidental', 'PROP', 'ChildR3:Root3', '-', '-'],
             ['a', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['enviar', 'INF', 'ChildR3:Root3', '-', '-'],
             ['aviões', 'N', 'ChildR3:Root3', '-', '-'],
             ['para', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['sobrevoar', 'INF', 'ChildR3:Root3', '-', '-'],
             ['a', 'ART', 'ChildR3:Root3', '-', '-'],
             ['povoação', 'N', 'ChildR3:Root3', '-', '-'],
             ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['Mijek', 'PROP', 'ChildR3:Root3', '-', '-'],
             [',', '', 'ChildR3:Root3', '-', '-'],
             ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['sudeste', 'N', 'ChildR3:Root3', '-', '-'],
             ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['território', 'N', 'ChildR3:Root3', '-', '-'],
             ['.', '', '-', '-', '-'], ['«', '', '-', '-', '-'],
             ['Para', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['possa', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['não', 'ADV', 'ChildL1:Root5', '-', '2'],
             ['responder', 'INF', 'ChildL1:Root5', '-', '2'],
             ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['as', 'ART', 'ChildL1:Root5', '-', '2'],
             ['violações', 'N', 'ChildL1:Root5', '-', '2'],
             ['marroquinas', 'ADJ', 'ChildL1:Root5', '-', '2'],
             [',', '', 'ChildL1:Root5', '-', '2'],
             ['a', 'ART', 'ChildL1:Root5', '2', '2'],
             ['parte', 'N', 'ChildL1:Root5', '2', '2'],
             ['sarauí', 'ADJ', 'ChildL1:Root5', '2', '2'],
             ['exige', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['a', 'ART', 'ChildL1:Root5', '-', '2'],
             ['comunidade', 'N', 'ChildL1:Root5', '-', '2'],
             ['internacional', 'ADJ', 'ChildL1:Root5', '-', '2'],
             ['lance', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['um', 'ART', 'ChildL1:Root5', '-', '2'],
             ['alerta', 'N', 'ChildL1:Root5', '-', '2'],
             ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['Marrocos', 'PROP', 'ChildL1:Root5', '-', '2'],
             ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['cesse', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['as', 'ART', 'ChildL1:Root5', '-', '2'],
             ['provocações', 'N', 'ChildL1:Root5', '-', '2'],
             ['e', 'C', 'ChildL1:Root5', '-', '2'],
             ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['se', 'PERS', 'ChildL1:Root5', '-', '2'],
             ['comnporte', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['de', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['forma', 'N', 'ChildL1:Root5', '-', '2'],
             ['responsável', 'ADJ', 'ChildL1:Root5', '-', '2'],
             [',', '', 'ChildL1:Root5', '-', '2'],
             ['respeitando', 'GER', 'ChildL1:Root5', '-', '2'],
             ['os', 'ART', 'ChildL1:Root5', '-', '2'],
             ['seus', 'DET', 'ChildL1:Root5', '-', '2'],
             ['compromissos', 'N', 'ChildL1:Root5', '-', '2'],
             ['»', '', '-', '-', '-'], [',', '', '-', '-', '-'],
             ['declarou', 'FIN', 'Root5', '-', '-'],
             ['a', 'ART', 'ChildR1:Root5', '2', '-'],
             ['Polisário', 'PROP', 'ChildR1:Root5', '2', '-'],
             [',', '', 'ChildR1:Root5', '2', '-'],
             ['organização', 'N', 'ChildR1:Root5', '2', '-'],
             ['que', 'INDP', 'ChildR1:Root5', '2', '-'],
             ['luta', 'FIN', 'ChildR1:Root5', '2', '-'],
             ['por', 'PRP', 'ChildR1:Root5', '2', '-'],
             ['a', 'ART', 'ChildR1:Root5', '2', '-'],
             ['independência', 'N', 'ChildR1:Root5', '2', '-'],
             ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
             ['o', 'ART', 'ChildR1:Root5', '2', '-'],
             ['território', 'N', 'ChildR1:Root5', '2', '-'],
             ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
             ['o', 'ART', 'ChildR1:Root5', '2', '-'],
             ['Sara_Ocidental', 'PROP', 'ChildR1:Root5', '2', '-'],
             [',', '', 'ChildR1:Root5', '2', '-'],
             ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
             ['um', 'ART', 'ChildR2:Root5', '-', '-'],
             ['comunicado', 'N', 'ChildR2:Root5', '-', '-'],
             ['divulgado', 'PCP', 'ChildR2:Root5', '-', '-'],
             ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
             ['Argel', 'PROP', 'ChildR2:Root5', '-', '-'],
             ['.', '', '-', '-', '-']]

        inte1 = wisinput.interval(s, 4)

        # NOTE(review): column 4 of the table carries three distinct labels
        # ('0', '1', '2') yet a single interval is expected -- confirm that
        # this value was not simply copied from testInterval.
        resp1 = [(85, 102)]

        # assertEqual reports both values when they differ, whereas
        # assertTrue(a == b) would only say "False is not true".
        self.assertEqual(inte1, resp1)
コード例 #5
0
    def test_corefAnnotated2(self):
        """Check ``wisinput.corefAnnotated`` on a hand-written token table.

        Column 4 of every row is extracted and turned into quote intervals
        with ``wisinput.interval``; ``corefAnnotated`` is then called with
        ``depIndex=2``, ``corefIndex=3`` and ``quoteIndex=4`` and both of its
        return values are compared with the expected ones.
        """
        s = [['A', 'ART', 'ChildL1:Root1', '0', '-'],
             ['agência_Nova_China', 'PROP', 'ChildL1:Root1', '0', '-'],
             ['informou', 'FIN', 'Root1', '-', '-'],
             ['que', 'S', 'ChildR1:Root1', '-', '0'],
             ['para', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['redigir', 'INF', 'ChildR1:Root1', '-', '0'],
             ['este', 'DET', 'ChildR1:Root1', '-', '0'],
             ['dicionário', 'N', 'ChildR1:Root1', '-', '0'],
             ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['34_470', 'NUM', 'ChildR1:Root1', '-', '0'],
             ['entradas', 'N', 'ChildR1:Root1', '-', '0'],
             ['em', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['língua', 'N', 'ChildR1:Root1', '-', '0'],
             ['chinesa', 'ADJ', 'ChildR1:Root1', '-', '0'],
             [',', '', 'ChildR1:Root1', '-', '0'],
             ['foi', 'FIN', 'ChildR1:Root1', '-', '0'],
             ['necessário', 'ADJ', 'ChildR1:Root1', '-', '0'],
             ['o', 'ART', 'ChildR1:Root1', '-', '0'],
             ['trabalho', 'N', 'ChildR1:Root1', '-', '0'],
             ['de', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['300', 'NUM', 'ChildR1:Root1', '-', '0'],
             ['especialistas', 'N', 'ChildR1:Root1', '-', '0'],
             ['durante', 'PRP', 'ChildR1:Root1', '-', '0'],
             ['três', 'NUM', 'ChildR1:Root1', '-', '0'],
             ['anos', 'N', 'ChildR1:Root1', '-',
              '0'], ['.', '', '-', '-', '-'],
             ['A', 'ART', 'ChildL1:Root2', '-', '1'],
             ['enciclopédia', 'N', 'ChildL1:Root2', '-', '1'],
             ['«', '', 'ChildL1:Root2', '-', '1'],
             ['é', 'FIN', 'ChildL1:Root2', '-', '1'],
             ['considerada', 'PCP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['primeiro', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['grande', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['instrumento', 'N', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['trabalho', 'N', 'ChildL1:Root2', '-', '1'],
             ['exaustivo', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['e', 'C', 'ChildL1:Root2', '-', '1'],
             ['sistemático', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['para', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['mestudo', 'N', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['marxismo-leninismo', 'N', 'ChildL1:Root2', '-', '1'],
             ['a', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['ser', 'INF', 'ChildL1:Root2', '-', '1'],
             ['publicado', 'PCP', 'ChildL1:Root2', '-', '1'],
             ['depois', 'ADV', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['o', 'ART', 'ChildL1:Root2', '-', '1'],
             ['nascimento', 'N', 'ChildL1:Root2', '-', '1'],
             ['de', 'PRP', 'ChildL1:Root2', '-', '1'],
             ['a', 'ART', 'ChildL1:Root2', '-', '1'],
             ['doutrina', 'N', 'ChildL1:Root2', '-', '1'],
             ['marxista', 'ADJ', 'ChildL1:Root2', '-', '1'],
             ['»', '', '-', '-', '-'], [',', '', '-', '-', '-'],
             ['explicou', 'FIN', 'Root2', '-', '-'],
             ['a', 'ART', 'ChildR1:Root2', '1', '-'],
             ['agência', 'N', 'ChildR1:Root2', '1', '-'],
             ['.', '', '-', '-', '-'], ['A', 'ART', '-', '-', '-'],
             ['primeira', 'ADJ', '-', '-', '-'],
             ['edição', 'N', '-', '-', '-'], [',', '', '-', '-', '-'],
             ['de', 'PRP', '-', '-', '-'], ['11', 'NUM', '-', '-', '-'],
             ['mil', 'N', '-', '-', '-'], ['ecxemplares', 'N', '-', '-', '-'],
             [',', '', '-', '-', '-'], ['está', 'FIN', '-', '-', '-'],
             ['já', 'ADV', '-', '-', '-'], ['reservada', 'PCP', '-', '-', '-'],
             ['em', 'PRP', '-', '-', '-'], ['a', 'ART', '-', '-', '-'],
             ['sua', 'DET', '-', '-', '-'], ['totalidade', 'N', '-', '-', '-'],
             ['.', '', '-', '-', '-'], ['Cessar-fogo', 'N', '-', '-', '-'],
             ['violado', 'PCP', '-', '-', '-'], ['em', 'PRP', '-', '-', '-'],
             ['o', 'ART', '-', '-', '-'], ['Sara', 'PROP', '-', '-', '-'],
             ['Cessar-fogo', 'N', '-', '-', '-'],
             ['violado', 'PCP', '-', '-', '-'], ['em', 'PRP', '-', '-', '-'],
             ['o', 'ART', '-', '-', '-'], ['Sara', 'PROP', '-', '-', '-'],
             ['A', 'ART', 'ChildL1:Root3', '-', '-'],
             ['FRENTE_Polisário', 'PROP', 'ChildL1:Root3', '-', '-'],
             ['acusou', 'FIN', 'Root3', '-', '-'],
             ['ontem', 'ADV', 'ChildR1:Root3', '-', '-'],
             ['Marrocos', 'PROP', 'ChildR2:Root3', '-', '-'],
             ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['violar', 'INF', 'ChildR3:Root3', '-', '-'],
             ['por', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['a', 'ART', 'ChildR3:Root3', '-', '-'],
             ['terceira', 'ADJ', 'ChildR3:Root3', '-', '-'],
             ['vez', 'N', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['cessar-fogo', 'N', 'ChildR3:Root3', '-', '-'],
             ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['Sara_Ocidental', 'PROP', 'ChildR3:Root3', '-', '-'],
             ['a', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['enviar', 'INF', 'ChildR3:Root3', '-', '-'],
             ['aviões', 'N', 'ChildR3:Root3', '-', '-'],
             ['para', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['sobrevoar', 'INF', 'ChildR3:Root3', '-', '-'],
             ['a', 'ART', 'ChildR3:Root3', '-', '-'],
             ['povoação', 'N', 'ChildR3:Root3', '-', '-'],
             ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['Mijek', 'PROP', 'ChildR3:Root3', '-', '-'],
             [',', '', 'ChildR3:Root3', '-', '-'],
             ['em', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['sudeste', 'N', 'ChildR3:Root3', '-', '-'],
             ['de', 'PRP', 'ChildR3:Root3', '-', '-'],
             ['o', 'ART', 'ChildR3:Root3', '-', '-'],
             ['território', 'N', 'ChildR3:Root3', '-', '-'],
             ['.', '', '-', '-', '-'], ['«', '', '-', '-', '-'],
             ['Para', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['possa', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['não', 'ADV', 'ChildL1:Root5', '-', '2'],
             ['responder', 'INF', 'ChildL1:Root5', '-', '2'],
             ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['as', 'ART', 'ChildL1:Root5', '-', '2'],
             ['violações', 'N', 'ChildL1:Root5', '-', '2'],
             ['marroquinas', 'ADJ', 'ChildL1:Root5', '-', '2'],
             [',', '', 'ChildL1:Root5', '-', '2'],
             ['a', 'ART', 'ChildL1:Root5', '-', '2'],
             ['parte', 'N', 'ChildL1:Root5', '-', '2'],
             ['sarauí', 'ADJ', 'ChildL1:Root5', '-', '2'],
             ['exige', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['a', 'ART', 'ChildL1:Root5', '-', '2'],
             ['comunidade', 'N', 'ChildL1:Root5', '-', '2'],
             ['internacional', 'ADJ', 'ChildL1:Root5', '-', '2'],
             ['lance', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['um', 'ART', 'ChildL1:Root5', '-', '2'],
             ['alerta', 'N', 'ChildL1:Root5', '-', '2'],
             ['a', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['Marrocos', 'PROP', 'ChildL1:Root5', '-', '2'],
             ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['cesse', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['as', 'ART', 'ChildL1:Root5', '-', '2'],
             ['provocações', 'N', 'ChildL1:Root5', '-', '2'],
             ['e', 'C', 'ChildL1:Root5', '-', '2'],
             ['para', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['que', 'S', 'ChildL1:Root5', '-', '2'],
             ['se', 'PERS', 'ChildL1:Root5', '-', '2'],
             ['comnporte', 'FIN', 'ChildL1:Root5', '-', '2'],
             ['de', 'PRP', 'ChildL1:Root5', '-', '2'],
             ['forma', 'N', 'ChildL1:Root5', '-', '2'],
             ['responsável', 'ADJ', 'ChildL1:Root5', '-', '2'],
             [',', '', 'ChildL1:Root5', '-', '2'],
             ['respeitando', 'GER', 'ChildL1:Root5', '-', '2'],
             ['os', 'ART', 'ChildL1:Root5', '-', '2'],
             ['seus', 'DET', 'ChildL1:Root5', '-', '2'],
             ['compromissos', 'N', 'ChildL1:Root5', '-', '2'],
             ['»', '', '-', '-', '-'], [',', '', '-', '-', '-'],
             ['declarou', 'FIN', 'Root5', '-', '-'],
             ['a', 'ART', 'ChildR1:Root5', '2', '-'],
             ['Polisário', 'PROP', 'ChildR1:Root5', '2', '-'],
             [',', '', 'ChildR1:Root5', '2', '-'],
             ['organização', 'N', 'ChildR1:Root5', '2', '-'],
             ['que', 'INDP', 'ChildR1:Root5', '2', '-'],
             ['luta', 'FIN', 'ChildR1:Root5', '2', '-'],
             ['por', 'PRP', 'ChildR1:Root5', '2', '-'],
             ['a', 'ART', 'ChildR1:Root5', '2', '-'],
             ['independência', 'N', 'ChildR1:Root5', '2', '-'],
             ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
             ['o', 'ART', 'ChildR1:Root5', '2', '-'],
             ['território', 'N', 'ChildR1:Root5', '2', '-'],
             ['de', 'PRP', 'ChildR1:Root5', '2', '-'],
             ['o', 'ART', 'ChildR1:Root5', '2', '-'],
             ['Sara_Ocidental', 'PROP', 'ChildR1:Root5', '2', '-'],
             [',', '', 'ChildR1:Root5', '2', '-'],
             ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
             ['um', 'ART', 'ChildR2:Root5', '-', '-'],
             ['comunicado', 'N', 'ChildR2:Root5', '-', '-'],
             ['divulgado', 'PCP', 'ChildR2:Root5', '-', '-'],
             ['em', 'PRP', 'ChildR2:Root5', '-', '-'],
             ['Argel', 'PROP', 'ChildR2:Root5', '-', '-'],
             ['.', '', '-', '-', '-']]

        answer = [[4], [8]]
        lAnswer = ['Child2', 'Child3']

        qbA = [e[4] for e in s]
        quotesA = wisinput.interval(qbA)

        print("quotes: ", quotesA)

        coref, labels = wisinput.corefAnnotated(s,
                                                quotes=quotesA,
                                                depIndex=2,
                                                corefIndex=3,
                                                quoteIndex=4)
        print("coref: ", coref)

        # Two separate assertEqual calls show which of the two results is
        # wrong and print both sides, unlike a combined assertTrue(... and ...).
        self.assertEqual(coref, answer)
        self.assertEqual(labels, lAnswer)
コード例 #6
0
ファイル: preprocessing.py プロジェクト: rafael2reis/iquotesx
def createInput(fileName=None, createTest=False):
    """Creates a CSV file with the result of the preprocessing step.

    For every sentence two groups of ';'-separated rows are written: rows
    tagged "x", built from the candidates returned by
    ``wisinput.candidates``, and rows tagged "y", built from the annotated
    column ``INDEX_QB``. Each row starts with the sentence index and the
    tag, followed by the quote tuple, one label and the binary feature list.

    Args:
        fileName: The CSV file that will be created. Falls back to
            ``INPUT_FILE`` when empty or None.
        createTest: If the preprocessing will be applied to the test set
            instead of the main corpus.
    """
    corpus = bosquequotes.load(BOSQUE_FILE)
    test = bosquequotes.load(BOSQUE_TEST_FILE)
    converter = verbspeech.Converter()

    if not fileName:
        fileName = INPUT_FILE

    # Open (and thereby truncate) the output exactly once instead of
    # truncating it and then reopening it in append mode for every sentence.
    with open(fileName, 'w', newline='') as csvfile:
        swriter = csv.writer(csvfile, delimiter=';')

        # The feature columns are derived from both sets, regardless of
        # which one is written out below.
        pos = feature.pos(corpus + test, posIndex=1)
        columns = feature.columns(pos)

        if createTest:
            corpus = test

        for i, s in enumerate(corpus):
            qb = baseline.quoteBounds(s)

            converter.vsay(s, tokenIndex=0, posIndex=1)

            # Baseline: X
            bc = baseline.boundedChunk(s)
            vsn = baseline.verbSpeechNeighb(s)
            fluc = baseline.firstLetterUpperCase(s)

            # Quotes, coreference candidates and their labels in one call.
            quotes, coref, labels = wisinput.candidates(s, depIndex=2)

            feat = feature.create(s,
                                  quotes=quotes,
                                  coref=coref,
                                  posIndex=1,
                                  corefIndex=3,
                                  quoteBounds=qb,
                                  bc=bc,
                                  vsn=vsn,
                                  fluc=fluc)
            bfeat = feature.binary(columns, feat)

            # Answer: Y -- driven by the annotated column.
            qbA = [e[INDEX_QB] for e in s]
            quotesA = wisinput.interval(qbA)
            corefA, labelsA = wisinput.corefAnnotated(s,
                                                      quotes=quotesA,
                                                      depIndex=2,
                                                      corefIndex=3,
                                                      quoteIndex=4)

            # NOTE(review): this is the only progress message left enabled;
            # it is kept so the function's console output is unchanged.
            print("Output: Creating features...")
            featA = feature.create(s, quotes=quotesA, coref=corefA,
                                   posIndex=1, corefIndex=3, quoteBounds=qbA,
                                   bc=bc, vsn=vsn, fluc=fluc, dummy=False)
            bfeatA = feature.binary(columns, featA)

            for p, group in enumerate(bfeat):
                for q, row in enumerate(group):
                    swriter.writerow([i, "x"] + list(quotes[p]) +
                                     [labels[p][q]] + row)

            for p, group in enumerate(bfeatA):
                for q, row in enumerate(group):
                    swriter.writerow([i, "y"] + list(quotesA[p]) +
                                     [labelsA[p][q]] + row)