import logging
import random
import re
from pathlib import Path

# Project-level helpers (get_sections, word_model, extract_sentence_words,
# clean_paragraph, split_sentences, wiki_utils) are assumed to be defined
# elsewhere in this repository and imported at module level.
logger = logging.getLogger(__name__)


def read_seg_file(path, word2vec, remove_preface_segment=True, ignore_list=False,
                  remove_special_tokens=False, return_as_sentences=False,
                  high_granularity=True, only_letters=False, max_token_num=0):
    data, targets = [], []
    all_sections = get_sections(path, high_granularity)
    for section in all_sections:
        sentences = section.split('\n')
        if not sentences:
            continue
        for sentence in sentences:
            # Each sentence line is expected to be "<loc>\t<ocr text>".
            tokens = sentence.split("\t")
            if len(tokens) != 2:
                continue
            loc, ocr = tokens
            sentence_words = ocr.split(" ")
            if len(sentence_words) >= 1:
                offset, token_num = 0, len(sentence_words)
                sent_data = []
                for i, word in enumerate(sentence_words):
                    word_embed = word_model(word, word2vec)
                    # loc_embed = location_model(loc, i, offset, token_num)
                    sent_data.append(word_embed)
                    offset += len(word)
                if max_token_num > 0:
                    sent_data = sent_data[:max_token_num]
                data.append(sent_data)
        # The last sentence of each section marks a segment boundary.
        if data:
            targets.append(len(data) - 1)
    return data, targets, path
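
# A hedged sketch of the per-sentence line format read_seg_file expects,
# inferred from the parsing above rather than from a format spec: each line
# carries a location field and the OCR text separated by a tab. The sample
# value and this helper's name are illustrative only, not part of the repo.
def _parse_seg_line(line):
    """Split one seg-file line into its location field and OCR tokens (illustrative helper)."""
    loc, ocr = line.split("\t")
    return loc, ocr.split(" ")
# _parse_seg_line("page_12_line_03\tthe quick brown fox")
# -> ("page_12_line_03", ["the", "quick", "brown", "fox"])
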
def read_wiki_file(path, word2vec, remove_preface_segment=True, ignore_list=False,
                   remove_special_tokens=False, return_as_sentences=False,
                   high_granularity=True, only_letters=False):
    data = []
    targets = []
    all_sections = get_sections(path, high_granularity)
    # Optionally drop the preface segment (everything before the first heading).
    required_sections = all_sections[1:] if remove_preface_segment and len(all_sections) > 0 else all_sections
    required_non_empty_sections = [section for section in required_sections
                                   if len(section) > 0 and section != "\n"]

    for section in required_non_empty_sections:
        sentences = section.split('\n')
        if not sentences:
            continue
        for sentence in sentences:
            is_list_sentence = wiki_utils.get_list_token() + "." == str(sentence)
            if ignore_list and is_list_sentence:
                continue
            if not return_as_sentences:
                sentence_words = extract_sentence_words(sentence, remove_special_tokens=remove_special_tokens)
                if len(sentence_words) >= 1:
                    data.append([word_model(word, word2vec) for word in sentence_words])
                else:
                    logger.info('Sentence in wikipedia file is empty')
            else:
                # For annotation: keep the sentence as raw text.
                if only_letters:
                    sentence = re.sub('[^a-zA-Z0-9 ]+', '', sentence)
                data.append(sentence)
        # The last sentence of each section marks a segment boundary.
        if data:
            targets.append(len(data) - 1)

    return data, targets, path
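
# Both readers above record targets as the index of the last sentence in each
# section. The helper below is my addition, not part of this repo; it shows how
# such targets can be expanded into a per-sentence 0/1 boundary vector.
def targets_to_boundaries(targets, num_sentences):
    """Map "last sentence of each section" indices to a binary boundary vector."""
    boundaries = [0] * num_sentences
    for idx in targets:
        boundaries[idx] = 1
    return boundaries
# targets_to_boundaries([1, 4], 5) -> [0, 1, 0, 0, 1]
# i.e. five sentences split into sections of two and three sentences.
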
def read_choi_file(path, word2vec, sent_bert_vec, train, return_w2v_tensors=True, manifesto=False):
    separator = '========' if manifesto else '=========='
    with Path(path).open('r') as f:
        raw_text = f.read()
    paragraphs = [clean_paragraph(p) for p in raw_text.strip().split(separator)
                  if len(p) > 5 and p != "\n"]
    if train:
        random.shuffle(paragraphs)

    targets = []
    new_text = []
    text = []
    last_paragraph_sentence_idx = 0
    for paragraph in paragraphs:
        if manifesto:
            sentences = split_sentences(paragraph, 0)
        else:
            sentences = [s for s in paragraph.split('\n') if len(s.split()) > 0]
        if not sentences:
            continue
        # Count the sentences in this paragraph; the segment boundary is placed
        # after the last of them.
        sentences_count = 0
        for sentence in sentences:
            words, sentence_str = extract_sentence_words(sentence)
            if len(words) == 0:
                continue
            sentences_count += 1
            text.append(words)
            if return_w2v_tensors:
                new_text.append([word_model(w, word2vec) for w in words])
            else:
                new_text.append(words)
        if sentences_count == 0:
            # No usable sentences in this paragraph; don't record a boundary.
            continue
        last_paragraph_sentence_idx += sentences_count
        targets.append(last_paragraph_sentence_idx - 1)

    return new_text, targets, path, sent_bert_vec
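
# A minimal usage sketch, assuming word2vec is a loaded gensim KeyedVectors
# model and that the Choi files sit under a directory of *.ref files; both the
# glob pattern and this helper's name are illustrative, not part of the repo.
def load_choi_split(root_dir, word2vec, train=False):
    """Read every Choi file under root_dir via read_choi_file (illustrative helper)."""
    texts, all_targets, paths = [], [], []
    for ref_path in sorted(Path(root_dir).glob('**/*.ref')):
        new_text, targets, file_path, _ = read_choi_file(str(ref_path), word2vec,
                                                         sent_bert_vec=None, train=train)
        texts.append(new_text)
        all_targets.append(targets)
        paths.append(file_path)
    return texts, all_targets, paths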