# Shared dependencies for the loaders in this file. `clean` is assumed to be the
# text-normalization helper from the surrounding preprocessing module
# (PreSumm-style repos keep it in a utils module); it is not defined here.
import codecs
import glob
import json
import os
import re
from os.path import join as pjoin

import tqdm
from vncorenlp import VnCoreNLP  # pip package `vncorenlp`, used by vn_format_to_json


def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    # SH changed: added encoding; was `json.load(open(p))`
    for sent in json.load(open(p, encoding='utf-8'))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if (tokens[0] == '@highlight') or (tokens[0] == '@songtitle'):
            flag = True
            # tgt.append(tokens[1:])  # SH added
            continue
        if flag:
            tgt.append(tokens)
            # flag = False
        else:
            if '@songtitle' not in tokens:  # SH added, to avoid empty tgts
                source.append(tokens)
            else:
                # an inline '@songtitle' splits the sentence: the prefix goes to
                # source, everything after the marker becomes the target title
                # (note: `ind - 1` also drops the token just before the marker)
                ind = tokens.index('@songtitle')
                source.append(tokens[:ind - 1])
                flag = True
                title = tokens[ind + 1:]
                tgt.append(title)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
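# A minimal usage sketch for the '@songtitle' loader above; the file name is
# hypothetical, and the json is assumed to follow the Stanford CoreNLP output
# layout ({"sentences": [{"tokens": [{"word": ...}, ...]}, ...]}). It assumes
# the variant defined directly above is the `load_json` in scope.
def _demo_load_song_json():
    src, tgt = load_json('song_story.json', lower=True)
    print('%d source sentences, %d target sentences' % (len(src), len(tgt)))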
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            # `not in` replaces the original `is not` comparisons: identity
            # checks on strings are not a reliable equality test in Python
            # (as in the original, the whitespace filter only runs when lower=True)
            tokens = [t.lower() for t in tokens
                      if t.lower() not in ('\n', '\r', ' ')]
        if len(tokens) == 0:
            continue
        if tokens[0] == '@highlight':
            flag = True
            continue
        if flag:
            tgt.append(tokens)
            # flag = False
        else:
            source.append(tokens)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
def vn_format_to_json(args):
    stories_dir = os.path.abspath(args.raw_path)
    tokenized_stories_dir = os.path.abspath(args.save_path)
    print("Preparing to tokenize %s to %s..." % (stories_dir, tokenized_stories_dir))
    stories = glob.glob(pjoin(args.raw_path, '*.txt'))
    annotator = VnCoreNLP("./vncorenlp/VnCoreNLP-1.1.1.jar", annotators="wseg",
                          max_heap_size='-Xmx500m')
    dataset = []
    for s in stories:
        tgt = []
        source = []
        flag = False
        # glob already yields paths under raw_path, so open `s` directly
        # (the original re-joined with stories_dir, duplicating the prefix)
        with open(s, encoding='utf-8') as f:
            for line in f:
                if line == '\n':
                    continue
                if line == '@highlight\n':
                    flag = True
                    continue
                tokens = annotator.tokenize(line)  # list of word-segmented sentences
                if flag:
                    tgt.extend(tokens)
                else:
                    # was `source = tokens`, which kept only the last body line
                    source.extend(tokens)
        dataset.append({"src": [clean(' '.join(sent)).split() for sent in source],
                        "tgt": [clean(' '.join(sent)).split() for sent in tgt]})
    print("Tokenizing %i files in %s" % (len(stories), stories_dir))
    print("VNCoreNLP Tokenizer has finished.")

    # 80/10/10 train/valid/test split, then write each corpus out in shards
    valid_test_ratio = 0.1
    all_size = len(dataset)
    test_sets = dataset[:int(all_size * valid_test_ratio)]
    valid_sets = dataset[int(all_size * valid_test_ratio):int(all_size * valid_test_ratio * 2)]
    train_sets = dataset[int(all_size * valid_test_ratio * 2):]
    corpora = {'train': train_sets, 'valid': valid_sets, 'test': test_sets}
    for corpus_type in ['train', 'valid', 'test']:
        p_ct = 0
        n_shards = (len(corpora[corpus_type]) + args.shard_size - 1) // args.shard_size
        for i in range(n_shards):
            split = corpora[corpus_type][i * args.shard_size:(i + 1) * args.shard_size]
            pt_file = pjoin(args.save_path, corpus_type + '.' + str(p_ct) + '.json')
            with codecs.open(pt_file, 'w', encoding='utf-8') as save:
                json.dump(split, save, ensure_ascii=False)
            p_ct += 1
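# A minimal driver sketch for vn_format_to_json, assuming only the argparse-style
# attributes the function reads (raw_path, save_path, shard_size); the paths and
# shard size here are illustrative, not fixed by the code above.
from types import SimpleNamespace

def _demo_vn_format_to_json():
    args = SimpleNamespace(raw_path='raw_stories', save_path='json_data', shard_size=2000)
    vn_format_to_json(args)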
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p, encoding='utf-8'))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            flag = True
            continue
        if flag:
            tgt.append(tokens)
            flag = False  # CNN-style: exactly one sentence follows each '@highlight'
        else:
            source.append(tokens)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))["sentences"]:
        tokens = [t["word"] for t in sent["tokens"]]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == "@highlight":
            flag = True
            tgt.append([])  # start a new target; following sentences accumulate into it
            continue
        if flag:
            tgt[-1].extend(tokens)
        else:
            source.append(tokens)
    source = [clean(" ".join(sent)).split() for sent in source]
    tgt = [clean(" ".join(sent)).split() for sent in tgt]
    return source, tgt
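# A synthetic round-trip sketch for the accumulating variant above: write a tiny
# CoreNLP-style file and load it back. Everything here (file name, tokens) is
# made up for illustration, the exact output also depends on `clean`, and the
# sketch assumes the variant directly above is the `load_json` in scope.
def _demo_accumulating_load():
    doc = {"sentences": [
        {"tokens": [{"word": "Hello"}, {"word": "world"}, {"word": "."}]},
        {"tokens": [{"word": "@highlight"}]},
        {"tokens": [{"word": "a"}, {"word": "greeting"}]},
    ]}
    with open('demo.json', 'w', encoding='utf-8') as f:
        json.dump(doc, f)
    src, tgt = load_json('demo.json', lower=True)
    # expected (modulo `clean`): src == [['hello', 'world', '.']], tgt == [['a', 'greeting']]
    print(src, tgt)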
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            flag = True
            continue
        if flag:
            tgt.append(tokens)
            # `flag = False` removed: in the CNN dataset an '@highlight' marker
            # accompanies each summary sentence, whereas this data uses only a
            # single marker, so everything after it belongs to the target
            # flag = False
        else:
            source.append(tokens)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
def load_json(p, lower=False):  # `lower` is accepted for API symmetry but unused here
    # crude sentence splitter: break "<word>. <Word>" onto separate lines
    regex = r"([\w][.])+?[ ]([\w])"
    to = r"\1\n\2"
    examples = []
    for data in tqdm.tqdm(json.load(open(p))):
        if data['type'] == "video":
            continue
        if len(data["content"]) == 0 or len(data['summary']) == 0:
            continue
        src_sentences = re.sub(regex, to, data["content"].strip()).split("\n")
        tgt_sentences = re.sub(regex, to, data['summary'].strip()).split("\n")
        src_tokens = [clean(sent).split(' ') for sent in src_sentences]
        tgt_tokens = [clean(sent).split(' ') for sent in tgt_sentences]
        examples.append({"src": src_tokens, "tgt": tgt_tokens})
    return examples
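# Quick check of the regex heuristic used above: it inserts a newline between
# "<word>." and the following word. It is a rough splitter, not a real sentence
# segmenter, so abbreviations like "U.S. officials" get split as well.
def _demo_regex_split():
    regex = r"([\w][.])+?[ ]([\w])"
    print(re.sub(regex, r"\1\n\2", "First sentence. Second sentence."))
    # -> First sentence.
    #    Second sentence.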
def load_jsonwiki(p, lower):
    # wiki-style format: '@summary' opens a target block, '@article' switches
    # back to source; both markers may share a sentence with their first tokens
    source = []
    tgt = []
    flag = False
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@summary':
            # no `continue` on purpose: the rest of this sentence is appended
            # to the new target below, minus the marker itself
            flag = True
            tgt.append([])
        if tokens[0] == '@article':
            flag = False
            source.append(tokens[1:])
            continue
        if flag:
            if tokens[0] != '@summary':
                tgt[-1].extend(tokens)
            else:
                tgt[-1].extend(tokens[1:])  # drop the '@summary' marker
        else:
            source.append(tokens)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    return source, tgt
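# Sketch of the marker layout load_jsonwiki expects, on a synthetic document;
# the file name and tokens are made up for illustration.
def _demo_load_jsonwiki():
    doc = {"sentences": [
        {"tokens": [{"word": "@summary"}, {"word": "short"}, {"word": "abstract"}]},
        {"tokens": [{"word": "@article"}, {"word": "body"}, {"word": "text"}]},
        {"tokens": [{"word": "more"}, {"word": "body"}]},
    ]}
    with open('demo_wiki.json', 'w', encoding='utf-8') as f:
        json.dump(doc, f)
    src, tgt = load_jsonwiki('demo_wiki.json', lower=True)
    # expected (modulo `clean`): src == [['body', 'text'], ['more', 'body']]
    #                            tgt == [['short', 'abstract']]
    print(src, tgt)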
def load_json(p, lower): """ Function to load json and create dataset Input: text as train; highlight as target """ source = [] tgt = [] flag = False for sent in json.load(open(p))['sentences']: tokens = [t['word'] for t in sent['tokens']] if (lower): tokens = [t.lower() for t in tokens] if (tokens[0] == '@highlight'): flag = True tgt.append([]) continue if (flag): tgt[-1].extend(tokens) else: source.append(tokens) source = [clean(' '.join(sent)).split() for sent in source] tgt = [clean(' '.join(sent)).split() for sent in tgt] return source, tgt
def load_json(p, lower):
    source = []
    tgt = []
    flag = False
    # pos_tag = []
    for sent in json.load(open(p))['sentences']:
        tokens = [t['word'] for t in sent['tokens']]
        # _pos_tag = [t['pos'] for t in sent['tokens']]
        if lower:
            tokens = [t.lower() for t in tokens]
        if tokens[0] == '@highlight':
            flag = True
            continue
        if flag:
            tgt.append(tokens)
            flag = False
        else:
            source.append(tokens)
            # pos_tag.append(_pos_tag)
    source = [clean(' '.join(sent)).split() for sent in source]
    tgt = [clean(' '.join(sent)).split() for sent in tgt]
    # assert len(' '.join([' '.join(i) for i in pos_tag]).split()) == \
    #        len(' '.join([' '.join(i) for i in source]).split())
    return source, tgt