Ejemplo n.º 1
0
def read_doc(doc, labels):
    """Pair every token of *doc* with a binary span label.

    Args:
        doc: Raw document text. It is stripped and then split into tokens
            by ``SpaceTokenizer`` (imported elsewhere in this file).
        labels: Span annotations as ``'|'``-separated groups. Each group
            holds whitespace-separated integers: the first is the start
            token index, the second the end token index (inclusive), and
            the third a flag. Only groups whose flag is non-zero mark
            their span. Any further integers in a group are ignored.

    Returns:
        A list of ``(token, label)`` tuples in document order, where
        ``label`` is the string ``'1'`` for tokens covered by a flagged
        span and ``'0'`` otherwise.

    Raises:
        ValueError: If a group contains a non-integer field.
        IndexError: If a non-empty group has fewer than three fields.
    """
    tokens = SpaceTokenizer().tokenize(doc.strip())
    token_count = len(tokens)
    res_labels = [0] * token_count

    for group in labels.strip().split('|'):
        fields = [int(value) for value in group.split()]
        if not fields:
            # An empty label string or a stray/trailing '|' yields an empty
            # group; treat it as "no span" instead of failing on fields[2].
            continue

        start, end, flag = fields[0], fields[1], fields[2]
        if flag == 0:
            continue

        # Clamp to the document so the slice assignment can never resize
        # the label list (end past the last token) or insert/shift entries
        # (negative start).
        lo = max(start, 0)
        hi = min(end + 1, token_count)
        if lo < hi:
            res_labels[lo:hi] = [1] * (hi - lo)

    return [(token, str(label)) for token, label in zip(tokens, res_labels)]