def segment(data):
    """Split every text in *data* into sentences and return them as one flat list.

    A single English ``DeepSegment`` model is created per call and reused for
    all items.

    :param data: iterable of raw text strings
    :return: list with the sentences of every input text, in input order
    """
    segmenter = DeepSegment('en')
    seg_text = []
    for text in data:
        # Segment each text exactly once: the previous version ran
        # ``segment_long`` twice per item and threw the first result away.
        seg_text.extend(segmenter.segment_long(text))
    return seg_text
class DeepSegmenter(BaseSegmenter):
    """
    Sentence segmenter backed by DeepSegment.

    DeepSegment was designed with ASR output in mind and uses BiLSTM + CRF for
    automatic sentence boundary detection. According to its authors it
    outperforms the standard libraries (spacy, nltk, corenlp ..) on imperfect
    text and performs similarly on perfectly punctuated text.

    Example:
        'I am Batman i live in gotham' -> ['I am Batman', 'i live in gotham']

    Details: https://github.com/notAI-tech/deepsegment

    :param lang_code: en - english (Trained on data from various sources);
        fr - french (Only Tatoeba data); it - italian (Only Tatoeba data)
    :type lang_code: str
    :param checkpoint_name: Name to be used as checkpoint
    :type checkpoint_name: str
    :param args: Additional positional arguments
    :param kwargs: Additional keyword arguments
    """

    def __init__(self, lang_code: str = 'en', checkpoint_name: str = None, *args, **kwargs):
        """Store the model selection; the model itself is built in :meth:`post_init`."""
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        """Import DeepSegment lazily and build the underlying segmenter."""
        from deepsegment import DeepSegment

        self._segmenter = DeepSegment(
            self.lang_code,
            checkpoint_name=self.checkpoint_name,
        )

    @single
    def segment(self, text: str, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: Raw text to be segmented
        :type text: str
        :param args: Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: One dict per sentence holding the sentence ``text``, its
            position in the input as ``offset`` and a constant ``weight`` of 1.0
        :rtype: List[Dict]
        """
        sentences = self._segmenter.segment_long(text)
        return [
            {'text': sentence, 'offset': position, 'weight': 1.0}
            for position, sentence in enumerate(sentences)
        ]
class DeepSegmenter(BaseSegmenter):
    """
    Sentence segmenter backed by DeepSegment.

    DeepSegment was designed with ASR output in mind and uses BiLSTM + CRF for
    automatic sentence boundary detection. According to its authors it
    significantly outperforms the standard libraries (spacy, nltk, corenlp ..)
    on imperfect text and performs similarly on perfectly punctuated text.

    Example:
        'I am Batman i live in gotham' -> ['I am Batman', 'i live in gotham']

    Details: https://github.com/notAI-tech/deepsegment
    """

    def __init__(self, lang_code: str = 'en', checkpoint_name: str = None, *args, **kwargs):
        """
        Store the model selection; the model itself is built in :meth:`post_init`.

        :param lang_code: en - english (Trained on data from various sources);
            fr - french (Only Tatoeba data); it - italian (Only Tatoeba data)
        :param checkpoint_name: forwarded to ``DeepSegment`` as ``checkpoint_name``
        :param args: forwarded to the base class
        :param kwargs: forwarded to the base class
        """
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        """Import DeepSegment lazily and build the underlying segmenter."""
        from deepsegment import DeepSegment

        self._segmenter = DeepSegment(
            self.lang_code,
            checkpoint_name=self.checkpoint_name,
        )

    def craft(self, text: str, doc_id: int, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: the raw text
        :param doc_id: the doc id (not used by this implementation)
        :return: one chunk dict per sentence holding the sentence ``text``, its
            position in the input as ``offset`` and a constant ``weight`` of 1.0
        """
        sentences = self._segmenter.segment_long(text)
        return [
            {'text': sentence, 'offset': position, 'weight': 1.0}
            for position, sentence in enumerate(sentences)
        ]
def main():
    """Re-segment the captions of playlist videos and write them back to MySQL.

    Every row of the ``video`` table whose ``Name`` starts with
    ``<playlist>_<digits>`` for a playlist in ``playlist_list`` gets its
    ``Caption`` split into sentences with DeepSegment and re-joined with
    ``', '``. The collected rows are passed through ``rectify`` and the
    resulting captions are stored with a parameterized ``UPDATE``.

    The cursor and the connection are closed even when a step fails; the
    transaction is only committed when every update succeeded.
    """
    segmenter = DeepSegment('en')

    # Build the playlist patterns once instead of once per row. The digit part
    # is a raw string so ``\d`` is not treated as an (invalid) string escape.
    # Playlist names are inserted verbatim, exactly as before.
    playlist_patterns = [
        re.compile(r'^' + playlist + r'_\d+') for playlist in playlist_list
    ]

    connect = pymysql.connect(**config)
    try:
        cursor = connect.cursor()
        try:
            cursor.execute('SELECT Id, Name, Caption FROM video')
            results = cursor.fetchall()

            new_results = []
            for result in tqdm(results):
                for pattern in playlist_patterns:
                    if pattern.search(result['Name']):
                        new_results.append({
                            'caption': ', '.join(
                                segmenter.segment_long(result['Caption'])),
                            'name': result['Name'],
                            'id': result['Id'],
                        })

            new_results = rectify(new_results)

            for new_result in tqdm(new_results):
                cursor.execute('UPDATE video SET Caption=%s WHERE Id=%s',
                               (new_result['caption'], new_result['id']))

            # Prints whatever ``cursor.execute`` returns for the SELECT.
            print(cursor.execute('SELECT * FROM video'))
            connect.commit()
        finally:
            cursor.close()
    finally:
        connect.close()
class Decoder:
    """Run a batch speech decoder through shell scripts and post-process its output.

    The decoder itself lives in shell scripts inside ``model_dir``; this class
    starts them, turns the resulting lattice into a CTM file, splits the
    transcript / CTM output per recording, aligns words with timings and groups
    them into sentences (DeepSegment when available, fixed five-word chunks
    otherwise).
    """

    def __init__(self, name: str, bit_rate: int, iteration: int = 1, max_active: int = 10000,
                 max_batch_size=50) -> None:
        """
        :param name: model name; its lower-cased form selects the model and result directories
        :param bit_rate: stored on the instance; not used by the methods of this class
        :param iteration: exported to the scripts as ``ITERATIONS``
        :param max_active: exported to the scripts as ``MAX_ACTIVE``
        :param max_batch_size: exported to the scripts as ``MAX_BATCH_SIZE``
        """
        super().__init__()
        self.name = name
        self.bit_rate = bit_rate
        self.segmenter = None
        self.use_feedback = False

        # Environment handed to the decoding scripts.
        self.env = os.environ.copy()
        self.env["ITERATIONS"] = str(iteration)
        self.env["MAX_ACTIVE"] = str(max_active)
        self.env["MAX_BATCH_SIZE"] = str(max_batch_size)

        self.model_dir = os.path.join(
            "/workspace/nvidia-examples/", name.lower())
        self.result_dir = os.path.join("/tmp/results/", name.lower())
        self.prep_command = "prepare_data.sh"
        self.batch_feedback_command = "run_benchmark.sh"
        self.batch_command = "run_benchmark_org.sh"
        self.last_run = None
        # Decoding lock
        self.batch_lk = Lock()
        self.model_trainings = 0

    @staticmethod
    def _run_logged(command, **popen_kwargs) -> None:
        """Run *command* with a piped stdin, wait for it and debug-log its output."""
        process = Popen(command, stdin=PIPE, **popen_kwargs)
        stdout, stderr = process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

    @staticmethod
    def _iter_utterances(path: str):
        """Yield ``(header, text)`` for every line of a transcript-style file.

        The first whitespace-separated field is the utterance id; the header is
        the part after its last ``_`` and before the following ``.``.
        """
        with open(path) as source:
            lines = source.readlines()
        for line in lines:
            utterance_id, text = line.split(maxsplit=1)
            yield utterance_id.split("_")[-1].split(".")[0], text

    def initialize(self) -> None:
        """Run the data preparation script inside the model directory."""
        self._run_logged(["/bin/bash", self.prep_command],
                         stderr=PIPE, cwd=self.model_dir)

    def init_segment(self):
        """Create the English DeepSegment model (imported lazily)."""
        from deepsegment import DeepSegment
        self.segmenter = DeepSegment("en", tf_serving=False)

    def extract_corpora(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        """Split the decoder output of one batch per recording and align it.

        Reads ``trans``, ``trans_int_combined`` and ``CTM.ctm`` from
        ``<result_dir>/<batch_id>/<iter_id>``, writes one file per recording
        header next to them, archives each transcript under
        ``/root/audio/batch<batch_id>`` and finally stores one
        ``<header>.json`` per recording there.

        :param batch_id: batch whose results are processed
        :param iter_id: iteration sub-directory to read from
        :return: mapping of recording header to its transcript dict, or ``{}``
            when reading / splitting the decoder output failed
        """
        run_dir = os.path.join(self.result_dir, str(batch_id), str(iter_id))
        audio_dir = "/root/audio/batch" + str(batch_id)

        try:
            # Word transcripts: one file per header plus an archived copy.
            transcript_repo: Dict[str, List[str]] = {}
            for header, trans in self._iter_utterances(os.path.join(run_dir, "trans")):
                with open(os.path.join(run_dir, "trans_" + header), "w") as single_trans:
                    single_trans.write(trans)
                with open(os.path.join(audio_dir, "tran_" + header + ".txt"), "w") as archived:
                    archived.write(trans)
                transcript_repo[header] = trans.split()

            # Integer (word id) transcripts.
            transcript_int_repo: Dict[str, List[int]] = {}
            for header, trans in self._iter_utterances(
                    os.path.join(run_dir, "trans_int_combined")):
                with open(os.path.join(run_dir, "trans_int_combined_" + header),
                          "w") as single_trans:
                    single_trans.write(trans)
                transcript_int_repo[header] = [int(x) for x in trans.split()]

            with open(os.path.join(run_dir, "CTM.ctm")) as ctm_file:
                convo = ctm_file.readlines()

            extraction: Dict[str, TextIOWrapper] = {}
            convo_repo: Dict[str, List] = {}
            try:
                for c in convo:
                    fields = c.split()
                    # Fourth and fifth CTM fields, kept as a (float, int) pair.
                    conv = [float(fields[3]), int(fields[4])]
                    header = fields[0].split(".", maxsplit=1)[0].split("_")[-1]
                    if header in extraction:
                        extraction[header].write(str(conv[0]) + " " + str(conv[1]))
                        convo_repo[header].append(conv)
                    else:
                        # NOTE(review): the first CTM row of a header only opens
                        # the per-header file; the row itself is neither written
                        # nor recorded. calculate_alignment appears to compensate
                        # by prepending a zero entry - confirm before changing.
                        extraction[header] = open(
                            os.path.join(run_dir, header + ".ctm"), "w")
                        convo_repo[header] = []
            finally:
                # Close the per-header files even when a CTM row is malformed.
                for handle in extraction.values():
                    handle.close()
        except Exception:
            # Best effort: a batch that cannot be read yields no corpora.
            logging.exception("Failed for batch %s", batch_id)
            return {}

        batch_out = {}
        for key in transcript_repo:
            # Every token is followed by a single space (including the last).
            transcript = "".join(tt + " " for tt in transcript_repo[key])
            alignment, duration = Decoder.calculate_alignment(
                transcript_repo[key], transcript_int_repo[key], convo_repo[key])
            logging.debug("Alignment complete for %s", key)

            sentences = []
            try:
                os.environ['CUDA_VISIBLE_DEVICES'] = '0'
                self.init_segment()
                sentences = self.segmenter.segment_long(transcript)
                use_lstm = True
            except Exception as e:
                # Segmenter unavailable: fall back to fixed five-word chunks.
                logging.error(e)
                use_lstm = False
                tokens = transcript.split()
                sentences = [tokens[start:start + 5]
                             for start in range(0, len(tokens), 5)]

            # NOTE(review): the original advanced this cursor with
            # ``w_dim += 0``, so every word is paired with ``alignment[0]``.
            # That looks unintended, but ``alignment`` can be shorter than the
            # word list (see calculate_alignment), so the behaviour is kept
            # until the intended indexing is confirmed.
            w_dim = 0
            aligned_sentences = list()
            for s_raw in sentences:
                sentence = s_raw.split() if use_lstm else s_raw
                sentence[-1] = sentence[-1] + "."
                last_widx = len(sentence) - 1
                aligned_sentence = list()
                for widx, word in enumerate(sentence):
                    word_obj = Word(word, alignment[w_dim])
                    if widx == last_widx:
                        word_obj.add_tag("is_punctuated", True)
                    aligned_sentence.append(word_obj)
                aligned_sentences.append(Sentence(aligned_sentence, 0))

            # A sentence's length is the timestamp of the next sentence's first
            # word; the last sentence gets the total duration.
            for current, following in zip(aligned_sentences, aligned_sentences[1:]):
                current.length = following.words[0].timestamp
            aligned_sentences[-1].length = duration

            transcript_out = {"duration": duration, "length": len(alignment),
                              "sentences": aligned_sentences, "complete": "1"}
            with open(os.path.join(audio_dir, key + ".json"), "w") as out_json:
                json.dump(transcript_out, out_json)
            batch_out[key] = transcript_out

            # Release GPU
            device = cuda.get_current_device()
            device.reset()
        return batch_out

    def decode_batch(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        """Decode one batch and return its extracted corpora.

        Runs the feedback benchmark script, falls back to the original script
        when the first transcript has fewer than two words, converts the
        aligned lattice to ``CTM.ctm`` and calls :meth:`extract_corpora`.
        The whole run holds ``batch_lk``; the lock is released even when a step
        raises.

        :param batch_id: batch to decode (audio in ``/root/audio/batch<batch_id>``)
        :param iter_id: iteration sub-directory holding the lattice / CTM
        :return: result of :meth:`extract_corpora` for this batch and iteration
        """
        with self.batch_lk:
            # set environment, start new shell
            # (``self.env`` itself is updated, as before; no copy is taken)
            batch_env = self.env
            batch_env["DATASET"] = os.path.join(
                "/root/audio/batch" + str(batch_id))

            self._run_logged(["/bin/bash", self.batch_feedback_command],
                             stderr=PIPE, env=batch_env, cwd=self.model_dir)

            # Use ``result_dir`` (lower-cased name) like the rest of the class.
            batch_results = os.path.join(self.result_dir, str(batch_id))
            with open(os.path.join(batch_results, "0", "trans")) as first_trans:
                num_words = len(first_trans.readlines()[0].split())

            # Fallback to original model if retrained model doesn't decode
            if num_words < 2:
                self.use_feedback = False
                shutil.rmtree(batch_results)
                self._run_logged(["/bin/bash", self.batch_command],
                                 stderr=PIPE, env=batch_env, cwd=self.model_dir)

            iter_dir = os.path.join(self.result_dir, str(batch_id), str(iter_id))
            self._run_logged(
                ["/usr/bin/gzip", "-d", os.path.join(iter_dir, "lat_aligned.gz")])

            ctm_file = os.path.join(iter_dir, "CTM.ctm")
            lattice_align_command: str = ""
            lattice_align_command += "/opt/kaldi/src/latbin/lattice-align-words-lexicon --partial-word-label=4324 " \
                                     "/workspace/models/aspire/data/lang_chain/phones/align_lexicon.int " \
                                     "/workspace/models/aspire/final.mdl"
            lattice_align_command += (" ark:" + os.path.join(iter_dir, "lat_aligned"))
            lattice_align_command += " ark:- | /opt/kaldi/src/latbin/lattice-1best ark:- ark:- | " \
                                     "/opt/kaldi/src/latbin/nbest-to-ctm ark:- "
            lattice_align_command += ctm_file
            # The pipeline needs a shell; every path component is built here.
            self._run_logged(lattice_align_command, shell=True)

            # Read from the same iteration directory the CTM was written to.
            corpora = self.extract_corpora(batch_id, iter_id)

            if self.last_run is not None:
                self.last_run += 1
            else:
                self.last_run = 0
            return corpora

    def clear_results(self) -> None:
        """Delete the result directory together with everything inside it.

        ``os.rmdir`` only removes empty directories and therefore failed as
        soon as a batch had written results; ``shutil.rmtree`` removes the
        whole tree. Nothing happens when the directory does not exist.
        """
        if os.path.exists(self.result_dir):
            shutil.rmtree(self.result_dir)

    def train_model(self, fb: "FeedbackAgent") -> None:
        """Start *fb* with the current training counter and the decoding lock.

        :param fb: feedback agent; receives ``iter`` and ``lk`` and is started
        """
        fb.iter = self.model_trainings
        fb.lk = self.batch_lk
        fb.start()
        self.model_trainings += 1

    @staticmethod
    def calculate_alignment(words: List[str], idx: List[int], lats: List[List]) -> Tuple[List, float]:
        """Pair words with running time offsets.

        ``lats`` holds ``[value, word_id]`` pairs. The values are accumulated
        into a running offset; a value of ``0.0`` is replaced by half of the
        next entry's value (or ``0.05`` when there is no usable next entry).
        Each output entry is ``[word, offset - value]`` using the entry's
        original value.

        NOTE: ``lats`` is modified in place when it is shorter than ``idx``
        (an entry ``[0.0, idx[0]]`` is prepended).
        NOTE(review): the number of processed entries is taken *before* that
        insertion, so the last entry is then not visited - confirm intent.

        :param words: transcript words
        :param idx: word ids, parallel to ``words``
        :param lats: ``[value, word_id]`` pairs, presumably CTM durations - TODO confirm
        :return: ``(alignment, offset)`` with the final accumulated offset
        :raises AssertionError: when ``words`` and ``idx`` differ in length
        """
        alignment = []
        len_words = len(words)
        len_idx = len(idx)
        len_lats = len(lats)
        assert len_words == len_idx

        if len_idx > len_lats:
            lats.insert(0, [0.0, idx[0]])
            if lats[0][0] == lats[1][0]:
                lats[1][0] = (lats[2][0] / 2)

        # Later duplicates of a word id overwrite earlier ones, as before.
        word_table: Dict[int, str] = dict(zip(idx, words))

        offset = 0.0
        for i in range(len_lats):
            original_lats = lats[i][0]
            if original_lats == 0.0:
                try:
                    original_lats = lats[i + 1][0] / 2
                except Exception:
                    # No usable successor: fall back to a small fixed step.
                    original_lats = 0.05
            offset += original_lats
            w = word_table[int(lats[i][1])]
            alignment.append([w, offset - lats[i][0]])
        return alignment, offset

    @staticmethod
    def fetch_transcript(batch_id: int, corpus_id: str) -> object:
        """Load the stored ``<corpus_id>.json`` transcript of a batch.

        :param batch_id: batch the recording belongs to
        :param corpus_id: recording header used as the JSON file name
        :return: the decoded JSON content
        """
        with open(os.path.join("/root/audio/batch" + str(batch_id),
                               corpus_id + ".json"), "r") as out_json:
            return json.load(out_json)
from deepsegment import DeepSegment

# Directory holding the Vietnamese model files (checkpoint, params, utils).
MODEL_DIR = '/Users/trinhgiang/PycharmProjects/deepsegment-2/vi'

# Load the model from explicit file paths instead of a language code.
segmenter = DeepSegment(
    checkpoint_path=MODEL_DIR + '/checkpoint',
    params_path=MODEL_DIR + '/params',
    utils_path=MODEL_DIR + '/utils',
)

# Sample input: several bullet-style phrases run together on one line.
sent = (
    'Nhân viên kinh doanh và chăm sóc khách hàng 10/2015 - 04/2016 '
    '- Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.. '
    '-Hướng dẫn khách hàng về thủ tục hồ sơ để đăng ký chữ ký số và các phần mềm của công ty '
    '- Giải đáp thắc mắc của khách hàng và chuyển yêu cầu của khách hàng cho bộ phận liên quan'
)

# Print the sentences found in the sample.
print(segmenter.segment_long(sent))
def window_segment(self, strings):
    """Segment *strings* into sentences with a freshly created English DeepSegment model.

    :param strings: text handed to ``DeepSegment.segment_long``
    :return: whatever ``segment_long`` returns for that text
    """
    return DeepSegment('en').segment_long(strings)
from deepsegment import DeepSegment

# 'en' is the default language, passed explicitly here.
segmenter = DeepSegment('en')

sample = 'I am Batman i live in gotham'

# Run the same sample through both entry points, ``segment`` first.
for split_sentences in (segmenter.segment, segmenter.segment_long):
    print(split_sentences(sample))
# ['I am Batman', 'i live in gotham']
# Give every test sample its own (initially empty) result slot, keyed by the
# sample's position rendered as a string.
index = 0
for test_data in test_datas:
    results[str(index)] = {}
    index += 1

# Run every sample through every model directory and record the sentences.
index = 0
for test_data in test_datas:
    print(test_data)
    per_sample = results[str(index)]

    # Pre-register every data set so its key exists before any model runs.
    for data_set in test_datasets:
        per_sample[data_set] = {}

    for data_set in test_datasets:
        print(data_set)
        # Each data set directory provides its own checkpoint/params/utils.
        f_checkpoint, f_params, f_utils = (
            data_set + "/checkpoint",
            data_set + "/params",
            data_set + "/utils",
        )
        segmenter = DeepSegment(
            lang_code=None,
            checkpoint_path=f_checkpoint,
            params_path=f_params,
            utils_path=f_utils,
            tf_serving=False,
            checkpoint_name=None,
        )
        res = segmenter.segment_long(test_data, n_window=i_window)
        per_sample[data_set] = res

    index += 1

################################################################################################################################################
print(results)