def read_doc(doc, labels):
    """Tokenize *doc* and tag every token as inside (1) or outside (0) a span.

    Args:
        doc: Raw document text. Leading/trailing whitespace is stripped
            before it is handed to ``SpaceTokenizer``.
        labels: Span annotations separated by ``|``. Each annotation is a
            whitespace-separated list of integers whose first three fields
            are ``start end flag``. ``start`` and ``end`` are inclusive token
            indices; the span is marked only when ``flag`` is non-zero.
            Empty annotations (an empty string, or a stray/trailing ``|``)
            are ignored.

    Returns:
        A list of ``(token, tag)`` tuples, one per token, where ``tag`` is
        the string ``"1"`` for tokens covered by a flagged span and ``"0"``
        otherwise.

    Raises:
        ValueError: If an annotation field is not an integer.
        IndexError: If a non-empty annotation has fewer than three fields.
    """
    # NOTE(review): SpaceTokenizer is presumably NLTK's, which splits on
    # single spaces only (unlike str.split()) - confirm the label indices
    # were produced against the same tokenization.
    tokens = SpaceTokenizer().tokenize(doc.strip())
    tags = [0] * len(tokens)

    for segment in labels.strip().split('|'):
        # Every field is converted, so malformed extra fields still fail loudly.
        fields = [int(field) for field in segment.split()]
        if not fields:
            # Nothing between separators: no span to apply.
            continue
        start, end, flag = fields[0], fields[1], fields[2]
        if flag == 0:
            continue
        # Clamp to the token range so an out-of-bounds span can neither
        # grow nor shift the tag list; ``end`` is inclusive.
        for idx in range(max(start, 0), min(end + 1, len(tokens))):
            tags[idx] = 1

    return [(token, str(tag)) for token, tag in zip(tokens, tags)]