def generate_sequential_data11(lst, lstv=None, pos="V"):
    """Build shifted index sequences for predicates and their arguments.

    Every file named in ``lst`` is read with ``read_conll2009_corpus``. For
    each word that is a ``Predicate`` and whose ``pos[0]`` equals ``pos``, one
    sequence is produced by feeding these strings to ``Mapping.add_value`` in
    order: ``form_lemma.sense`` for the predicate, ``head_label`` for every
    argument for which ``get_represented_form`` returns a value other than
    ``None``, and finally ``"EOS"``.

    NOTE(review): SOURCE contains a later function with the same name; if
    both live in the same module the later one replaces this one.

    Args:
        lst: Iterable of training corpus file names.
        lstv: Optional iterable of validation corpus file names.
        pos: Value compared against the first element of a predicate's
            ``pos`` attribute.

    Returns:
        ``(X, Y, Xv, Yv, mapping)``. ``X`` holds every training sequence
        without its last item and ``Y`` the same sequence without its first
        item. ``Xv`` and ``Yv`` are the validation counterparts, or ``None``
        when ``lstv`` is ``None``. ``mapping`` is the single ``Mapping``
        instance fed by both splits (training first, then validation).
    """
    mapping = Mapping()

    def load(file_names):
        # Read all files up front, exactly as the original did, so the
        # mapping is only touched after every file has been parsed.
        sentences = []
        for file_name in file_names:
            sentences.extend(read_conll2009_corpus(file_name))
        return sentences

    def encode(sentences):
        # One index sequence per matching predicate: predicate, args, EOS.
        sequences = []
        for sen in sentences:
            for w in sen:
                if not (isinstance(w, Predicate) and w.pos[0] == pos):
                    continue
                seq = [
                    mapping.add_value(w.form + "_" + w.lemma + "." + w.sense)
                ]
                for arg in w.arguments:
                    hn = get_represented_form(sen, arg)
                    if hn is not None:
                        seq.append(
                            mapping.add_value(hn + "_" + w.arguments[arg]))
                seq.append(mapping.add_value("EOS"))
                sequences.append(seq)
        return sequences

    train = encode(load(lst))
    X = [seq[:-1] for seq in train]
    Y = [seq[1:] for seq in train]

    Xv = None
    Yv = None
    if lstv is not None:
        valid = encode(load(lstv))
        Xv = [seq[:-1] for seq in valid]
        Yv = [seq[1:] for seq in valid]

    return X, Y, Xv, Yv, mapping
def generate_sequential_data21_getmap(lst, lstv=None, pos="V", data="form"):
    """Populate three ``Mapping`` objects from the corpora named in ``lst``.

    For each ``Predicate`` word whose ``pos[0]`` equals ``pos`` the following
    values are added:

    * first mapping: the predicate form, each argument head, ``"EOS"``
    * second mapping: ``"PRED"``, each argument label, ``"EOS"``
    * third mapping: ``form_PRED``, each ``head_label``, ``"EOS_EOS"``

    The argument head is ``sen[arg].form`` when ``data == "origin"`` and the
    result of ``get_represented_form(sen, arg)`` otherwise; arguments whose
    head is ``None`` are skipped.

    Args:
        lst: Iterable of corpus file names read with
            ``read_conll2009_corpus``.
        lstv: Accepted for signature compatibility; not used here.
        pos: Value compared against the first element of a predicate's
            ``pos`` attribute.
        data: ``"origin"`` to use the raw word form of an argument, anything
            else to use ``get_represented_form``.

    Returns:
        ``(mapX1, mapX2, mapY1)`` in the order described above.
    """
    sentences = []
    form_map = Mapping()
    label_map = Mapping()
    joint_map = Mapping()

    for path in lst:
        sentences.extend(read_conll2009_corpus(path))

    for sentence in sentences:
        for word in sentence:
            if not (isinstance(word, Predicate) and word.pos[0] == pos):
                continue

            form_map.add_value(word.form)
            label_map.add_value("PRED")
            joint_map.add_value(word.form + "_PRED")

            for position in word.arguments:
                if data == "origin":
                    head = sentence[position].form
                else:
                    head = get_represented_form(sentence, position)
                if head is None:
                    continue
                form_map.add_value(head)
                label_map.add_value(word.arguments[position])
                joint_map.add_value(head + "_" + word.arguments[position])

            form_map.add_value("EOS")
            label_map.add_value("EOS")
            joint_map.add_value("EOS_EOS")

    return form_map, label_map, joint_map
def generate_sequential_data11(lst, lstv=None, pos="V", data="form",
                               count=100000):
    """Build shifted index sequences for predicates and their arguments.

    The training corpus is assembled in two steps: the files ``lst[1:]`` are
    read and, when ``count`` is not ``None``, truncated to their first
    ``count - 1`` sentences; then every sentence of ``lst[0]`` is appended.

    For each word that is a ``Predicate`` and whose ``pos[0]`` equals ``pos``,
    one sequence is produced by feeding these strings to
    ``Mapping.add_value`` in order: ``form_PRED`` for the predicate,
    ``head_label`` for every argument with a head other than ``None``, and
    finally ``"EOS"``. The head is ``sen[arg].form`` when
    ``data == "origin"`` and ``get_represented_form(sen, arg)`` otherwise.

    Args:
        lst: Non-empty sequence of training corpus file names.
        lstv: Optional iterable of validation corpus file names.
        pos: Value compared against the first element of a predicate's
            ``pos`` attribute.
        data: ``"origin"`` to use the raw word form of an argument, anything
            else to use ``get_represented_form``.
        count: Integer cap applied to the sentences read from ``lst[1:]``
            (``count - 1`` are kept), or ``None`` for no cap.

    Returns:
        ``(X, Y, Xv, Yv, mapping)``. ``X`` holds every training sequence
        without its last item and ``Y`` the same sequence without its first
        item. ``Xv`` and ``Yv`` are the validation counterparts, or ``None``
        when ``lstv`` is ``None``. ``mapping`` is the single ``Mapping``
        instance fed by both splits (training first, then validation).

    Raises:
        IndexError: If ``lst`` is empty, because ``lst[0]`` is always read.
    """
    mapping = Mapping()

    def encode(sentences):
        # One index sequence per matching predicate: predicate, args, EOS.
        sequences = []
        for sen in sentences:
            for w in sen:
                if not (isinstance(w, Predicate) and w.pos[0] == pos):
                    continue
                seq = [mapping.add_value(w.form + "_PRED")]
                for arg in w.arguments:
                    if data == "origin":
                        hn = sen[arg].form
                    else:
                        hn = get_represented_form(sen, arg)
                    if hn is not None:
                        seq.append(
                            mapping.add_value(hn + "_" + w.arguments[arg]))
                seq.append(mapping.add_value("EOS"))
                sequences.append(seq)
        return sequences

    corpus = []
    for file_name in lst[1:]:
        corpus.extend(read_conll2009_corpus(file_name))
    if count is not None:
        # The original counter loop kept sentences while the 1-based counter
        # was < count, i.e. the first count - 1 of them (none for count <= 1).
        corpus = corpus[:max(count - 1, 0)]
    # The first file is never truncated and always goes last.
    corpus.extend(read_conll2009_corpus(lst[0]))

    train = encode(corpus)
    X = [seq[:-1] for seq in train]
    Y = [seq[1:] for seq in train]

    Xv = None
    Yv = None
    if lstv is not None:
        corpusv = []
        for file_name in lstv:
            corpusv.extend(read_conll2009_corpus(file_name))
        valid = encode(corpusv)
        Xv = [seq[:-1] for seq in valid]
        Yv = [seq[1:] for seq in valid]

    return X, Y, Xv, Yv, mapping
def generate_sequential_data21(corpus, lstv=None, pos="V", data="form",
                               mapX1=None, mapX2=None, mapY1=None):
    """Encode predicate/argument sequences as two inputs and one target.

    For each word in ``corpus`` that is a ``Predicate`` and whose ``pos[0]``
    equals ``pos``, a sequence of index triples is built: one triple for the
    predicate (``form``, ``"PRED"``, ``form_PRED``), one per argument whose
    head is not ``None`` (``head``, ``label``, ``head_label``) and a final
    one for (``"EOS"``, ``"EOS"``, ``"EOS_EOS"``). The head is
    ``sen[arg].form`` when ``data == "origin"`` and
    ``get_represented_form(sen, arg)`` otherwise.

    When ``mapX1`` is given, the three supplied mappings are queried with
    ``get_index``. When ``mapX1`` is ``None``, three fresh ``Mapping``
    objects are created (any ``mapX2``/``mapY1`` passed in are ignored) and
    filled with ``add_value``. The same mode is used for the validation data.

    The ``mapX1 is None`` path previously contained a leftover
    ``all_data.extend()`` call without arguments, which raised ``TypeError``
    for any non-empty corpus; it has been removed.

    Args:
        corpus: Iterable of already-parsed sentences (iterables of words).
        lstv: Optional iterable of validation file names, read with
            ``read_conll2009_corpus``.
        pos: Value compared against the first element of a predicate's
            ``pos`` attribute.
        data: ``"origin"`` to use the raw word form of an argument, anything
            else to use ``get_represented_form``.
        mapX1: Mapping for word forms, or ``None`` to build all three maps.
        mapX2: Mapping for labels (used only when ``mapX1`` is given).
        mapY1: Mapping for ``form_label`` targets (used only when ``mapX1``
            is given).

    Returns:
        ``(X1, X2, Y, Xv1, Xv2, Yv, mapX1, mapX2, mapY1)``. ``X1``/``X2``
        take the first/second triple component of every sequence item except
        the last; ``Y`` takes the third component of every item except the
        first. The ``v`` variants are ``None`` when ``lstv`` is ``None``.
    """
    build_maps = mapX1 is None
    if build_maps:
        mapX1 = Mapping()
        mapX2 = Mapping()
        mapY1 = Mapping()

    def index(mapping, key):
        # Resolved lazily so a missing mapping only fails when actually used.
        if build_maps:
            return mapping.add_value(key)
        return mapping.get_index(key)

    def encode(sentences):
        # One list of (x1, x2, y) triples per matching predicate.
        sequences = []
        for sen in sentences:
            for w in sen:
                if not (isinstance(w, Predicate) and w.pos[0] == pos):
                    continue
                seq = [(index(mapX1, w.form),
                        index(mapX2, "PRED"),
                        index(mapY1, w.form + "_PRED"))]
                for arg in w.arguments:
                    if data == "origin":
                        hn = sen[arg].form
                    else:
                        hn = get_represented_form(sen, arg)
                    if hn is not None:
                        seq.append(
                            (index(mapX1, hn),
                             index(mapX2, w.arguments[arg]),
                             index(mapY1, hn + "_" + w.arguments[arg])))
                seq.append((index(mapX1, "EOS"),
                            index(mapX2, "EOS"),
                            index(mapY1, "EOS_EOS")))
                sequences.append(seq)
        return sequences

    def split(sequences):
        # Inputs drop the last item; the target drops the first item.
        first = [[item[0] for item in seq[:-1]] for seq in sequences]
        second = [[item[1] for item in seq[:-1]] for seq in sequences]
        target = [[item[2] for item in seq[1:]] for seq in sequences]
        return first, second, target

    X1, X2, Y = split(encode(corpus))

    Xv1 = None
    Xv2 = None
    Yv = None
    if lstv is not None:
        corpusv = []
        for fn in lstv:
            corpusv.extend(read_conll2009_corpus(fn))
        Xv1, Xv2, Yv = split(encode(corpusv))

    return X1, X2, Y, Xv1, Xv2, Yv, mapX1, mapX2, mapY1
def readAll(self):
    """Read every file in ``self.input_file`` and return all sentences.

    Returns:
        One list holding the results of ``read_conll2009_corpus`` for each
        file, concatenated in iteration order; an empty list when
        ``self.input_file`` is empty.
    """
    txt = []
    for f in self.input_file:
        # Accumulate rather than rebind: rebinding kept only the last file.
        txt.extend(read_conll2009_corpus(f))
    return txt
words.append(l.strip()) if len(words) != 0: if end is None: if idx >= start: sens.append(words) else: if idx >= start: if idx < end: sens.append(words) for sen in sens: conll2009sen = read_conll2009_sentence(sen, read_label, use_gold=use_gold) txt.append(conll2009sen) return txt if __name__ == "__main__": lst = ["/home/quynh/working/Data/conll2009/train.conll2009.pp.txt"] reader = Conll2009BatchReader(1000, lst) count = 0 while True: txt = reader.next() if len(txt) == 0: break count+=len(txt) print (count) txt = read_conll2009_corpus("/home/quynh/working/Data/conll2009/train.conll2009.pp.txt") print (len(txt))