def transcript():
    """Segment the submitted transcript file and render the chart page.

    Non-POST requests are redirected to the ``index`` view.  For a POST the
    file named by the ``transcript_path`` form field is read, split into
    sentences with DeepSegment, and the subset returned by ``operational`` is
    counted against the total number of sentences.
    """
    if request.method != "POST":
        return redirect(url_for("index"))

    transcript_path = request.form.get('transcript_path')
    with open(transcript_path) as handle:
        raw_text = handle.read()

    sentences = DeepSegment('deepsegment_eng_v1/config.json').segment(raw_text)
    operational_sentences = operational(sentences)

    length_operational = len(operational_sentences)
    length_non_operational = len(sentences) - length_operational

    # A pie chart of operational vs non-operational problems used to be drawn
    # here (draw_figure); the template now receives the two counts directly.
    return render_template(
        "showgraph.html",
        length_operational=length_operational,
        length_non_operational=length_non_operational,
    )
def segment(data):
    """Split every text in *data* into sentences and return them as one flat list.

    :param data: iterable of raw text strings
    :return: list of sentences, in input order
    """
    segmenter = DeepSegment('en')
    seg_text = []
    for text in data:
        # Run the model once per text; the previous version called
        # segment_long() twice and threw the first result away.
        seg_text.extend(segmenter.segment_long(text))
    return seg_text
def main():
    """Segment and punctuate the first line of the input file.

    Only the first line of ``args.input`` is read.  Each sentence found by
    DeepSegment is passed through DeepCorrect and the top correction is
    written to ``args.output``, one sentence per line.
    """
    args = parse_args()

    with open(args.input, mode='r') as source:
        first_line = source.readline()

    segmenter = DeepSegment('en')
    corrector = DeepCorrect(args.params_path, args.checkpoint_path)

    with open(args.output, mode='w') as sink:
        for sentence in segmenter.segment(first_line):
            best = corrector.correct(sentence)[0]
            sink.write(best['sequence'] + '\n')
def dataPreProcessModel():
    """Create the punctuation corrector and sentence segmenter as module globals."""
    global corrector, segmenter

    print("Inside dataPreProcessModel")
    params_path = 'model_params/deeppunct_params_en'
    checkpoint_path = 'model_params/deeppunct_checkpoint_google_news'
    corrector = DeepCorrect(params_path, checkpoint_path)
    segmenter = DeepSegment('en')
def dataPreProcessModel():
    """Create the punctuation corrector and sentence segmenter as module globals.

    The DeepCorrect parameter and checkpoint files are loaded from fixed
    absolute paths on the author's machine.
    """
    global corrector, segmenter

    print("Inside dataPreProcessModel")
    params_path = '/Users/Amitgarg/Documents/SJSU/272-Ranjan/Smart-MOM/model_params/deeppunct_params_en'
    checkpoint_path = '/Users/Amitgarg/Documents/SJSU/272-Ranjan/Smart-MOM/model_params/deeppunct_checkpoint_google_news'
    corrector = DeepCorrect(params_path, checkpoint_path)
    segmenter = DeepSegment('en')
class DeepSegmenter(BaseSegmenter):
    """
    Sentence segmenter backed by DeepSegment (BiLSTM + CRF).

    DeepSegment targets ASR-style text that lacks punctuation, e.g.
    'I am Batman i live in gotham' -> ['I am Batman', 'i live in gotham'].

    Details: https://github.com/notAI-tech/deepsegment

    :param lang_code: en - english (Trained on data from various sources);
        fr - french (Only Tatoeba data);
        it - italian (Only Tatoeba data)
    :type lang_code: str
    :param checkpoint_name: Name to be used as checkpoint
    :type checkpoint_name: str
    :param args: Additional positional arguments
    :param kwargs: Additional keyword arguments
    """

    def __init__(self, lang_code: str = 'en', checkpoint_name: str = None, *args, **kwargs):
        """Store the model selection; the model itself is loaded in post_init."""
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        """Import DeepSegment lazily and build the underlying segmenter."""
        from deepsegment import DeepSegment

        self._segmenter = DeepSegment(self.lang_code, checkpoint_name=self.checkpoint_name)

    @single
    def segment(self, text: str, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: Raw text to be segmented
        :type text: str
        :param args: Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: One dict per sentence with its text, its position in the
            sentence sequence (``offset``) and a constant ``weight`` of 1.0
        :rtype: List[Dict]
        """
        sentences = self._segmenter.segment_long(text)
        return [
            dict(text=sentence, offset=position, weight=1.0)
            for position, sentence in enumerate(sentences)
        ]
def predict(self, sample_text, word_length, segment, verbose):
    """Extend a seed text with ``word_length`` predicted words.

    :param sample_text: seed text the generation starts from
    :param word_length: number of words to append
    :param segment: when equal to ``True`` the generated text is split into
        sentences with DeepSegment and the list is returned; otherwise the
        text is printed and returned as a string
    :param verbose: forwarded to the model's ``predict_classes``
    :return: list of sentences, or the generated string
    """
    generated = sample_text
    for _ in range(word_length):
        # Encode the text produced so far and pad it to the model input size.
        encoded = self.tokenizer.texts_to_sequences([generated])[0]
        padded = pad_sequences([encoded],
                               maxlen=self.maxSequenceLen - 1,
                               padding=self.padding_method)
        predicted = self.model.predict_classes(padded, verbose=verbose)

        # Map the predicted class index back to its word; a blank is used
        # when no vocabulary entry matches.
        next_word = next(
            (word for word, index in self.tokenizer.word_index.items()
             if index == predicted),
            " ",
        )
        generated += " " + next_word

    self.sample_text = generated

    if segment == True:  # noqa: E712 - equality (not truthiness) is the contract
        return DeepSegment('en').segment(self.sample_text)

    print(generated)
    return self.sample_text
def main():
    """Re-segment the captions of playlist videos and write them back.

    Every row of ``video`` whose ``Name`` starts with ``<playlist>_<digits>``
    (for any playlist in ``playlist_list``) gets its ``Caption`` split into
    sentences and re-joined with ``', '``.  The rows are passed through
    ``rectify`` and then written back with parameterized UPDATE statements.
    The cursor and connection are closed even when a step fails.
    """
    segmenter = DeepSegment('en')
    connect = pymysql.connect(**config)
    try:
        cursor = connect.cursor()
        try:
            cursor.execute('SELECT Id, Name, Caption FROM video')
            results = cursor.fetchall()

            new_results = []
            for result in tqdm(results):
                for playlist in playlist_list:
                    # Raw string: '\d' in a plain literal is an invalid escape.
                    if re.search(r'^' + playlist + r'_\d+', result['Name']):
                        new_results.append({
                            'caption': ', '.join(
                                segmenter.segment_long(result['Caption'])),
                            'name': result['Name'],
                            'id': result['Id'],
                        })

            new_results = rectify(new_results)
            for new_result in tqdm(new_results):
                cursor.execute('UPDATE video SET Caption=%s WHERE Id=%s',
                               (new_result['caption'], new_result['id']))

            # Prints the value returned by execute() for the full-table query.
            print(cursor.execute('SELECT * FROM video'))
            connect.commit()
        finally:
            cursor.close()
    finally:
        connect.close()
def processing(id):
    """Segment and punctuate one stored paragraph, then save the result.

    The DeepSegment and DeepCorrect models are cached as attributes on the
    ``globals`` object so they are loaded at most once.  Each model is checked
    and created independently; the earlier combined ``and`` check raised
    AttributeError when only one of the two was cached.

    :param id: primary key of the ``Paragraph`` to process
    """
    paragraph_object = Paragraph.objects.get(id=id)

    if not hasattr(globals, 'segmenter'):
        globals.segmenter = DeepSegment('en')
    if not hasattr(globals, 'corrector'):
        globals.corrector = DeepCorrect('deep_punc/deeppunct_params_en',
                                        'deep_punc/deeppunct_checkpoint_wikipedia')
    segmenter = globals.segmenter
    corrector = globals.corrector

    list_of_sentences = segmenter.segment(paragraph_object.original_text)
    # Take the top correction of each sentence and join them with spaces.
    paragraph = ' '.join(
        corrector.correct(sentence)[0]['sequence']
        for sentence in list_of_sentences
    )
    paragraph = paragraph.replace("\\", "")

    paragraph_object.processed_text = paragraph
    paragraph_object.processing = False
    paragraph_object.save()
class DeepSegmenter(BaseSegmenter):
    """
    Sentence crafter backed by DeepSegment (BiLSTM + CRF).

    DeepSegment targets ASR-style text that lacks punctuation, e.g.
    'I am Batman i live in gotham' -> ['I am Batman', 'i live in gotham'].

    Details: https://github.com/notAI-tech/deepsegment
    """

    def __init__(self, lang_code: str = 'en', checkpoint_name: str = None, *args, **kwargs):
        """
        :param lang_code: en - english (Trained on data from various sources);
            fr - french (Only Tatoeba data);
            it - italian (Only Tatoeba data)
        :param checkpoint_name: forwarded to DeepSegment as ``checkpoint_name``
        :param args: forwarded to the base class
        :param kwargs: forwarded to the base class
        """
        super().__init__(*args, **kwargs)
        self.lang_code = lang_code
        self.checkpoint_name = checkpoint_name

    def post_init(self):
        """Import DeepSegment lazily and build the underlying segmenter."""
        from deepsegment import DeepSegment

        self._segmenter = DeepSegment(self.lang_code, checkpoint_name=self.checkpoint_name)

    def craft(self, text: str, doc_id: int, *args, **kwargs) -> List[Dict]:
        """
        Split the text into sentences.

        :param text: the raw text
        :param doc_id: the doc id (not used by this method)
        :return: one chunk dict per sentence with its text, its position in
            the sentence sequence (``offset``) and a constant ``weight`` of 1.0
        """
        sentences = self._segmenter.segment_long(text)
        return [
            dict(text=sentence, offset=position, weight=1.0)
            for position, sentence in enumerate(sentences)
        ]
# https://github.com/bminixhofer/nnsplit # ============================================================================= if False: from nnsplit import NNSplit splitter = NNSplit("de") res = splitter.split([data]) # ============================================================================= # More advanced: Deepsegment: Does not support German # ============================================================================= if False: from deepsegment import DeepSegment # The default language is 'en' segmenter = DeepSegment('de') with open('data/start.txt', 'r') as myfile: data = myfile.read() segmenter.segment('I am Batman i live in gotham') # ============================================================================= # Huggingface tokenizer # ============================================================================= if False: from tokenizers.implementations import ByteLevelBPETokenizer from tokenizers.processors import BertProcessing from pathlib import Path
# Evaluate every model directory in `test_datasets` on every text in
# `test_datas`; results are stored as results[str(text index)][model dir].
# NOTE(review): loop nesting was reconstructed from a whitespace-collapsed
# source - confirm that `index` advances once per test text.

# First pass: create an empty result slot for each test text.
index = 0
for test_data in test_datas:
    results[str(index)] = {}
    index = index + 1

# Second pass: run each model on each test text.
index = 0
for test_data in test_datas:
    print(test_data)
    for data_set in test_datasets:
        results[str(index)][data_set] = {}
    for data_set in test_datasets:
        print(data_set)
        f_checkpoint = data_set + "/checkpoint"
        f_params = data_set + "/params"
        f_utils = data_set + "/utils"
        # A new DeepSegment instance is built for every (text, model) pair.
        segmenter = DeepSegment(lang_code=None, checkpoint_path=f_checkpoint, params_path=f_params, utils_path=f_utils, tf_serving=False, checkpoint_name=None)
        res = segmenter.segment_long(test_data, n_window=i_window)
        results[str(index)][data_set] = res
    index = index + 1

################################################################################################################################################
print(results)
continue else: yield from instances # srl tagger predictor = Predictor.from_path( "https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz", cuda_device=0) # segmentation model for splitting ill-formed utterances into well-formed sentences logging.disable(logging.WARNING) gpu_devices = tf.config.experimental.list_physical_devices('GPU') tf.config.experimental.set_memory_growth( gpu_devices[0], True) # do not take all GPU memory - tagger needs some too segmentation = DeepSegment('en') # utterances utterances_path = config.Dirs.data / 'training' / f'{CORPUS_NAME}_mlm.txt' params = Params.from_param2val(param2default) utterances = load_utterances_from_file(utterances_path) it = gen_instances() progress_bar = pyprind.ProgBar(len(utterances) // BATCH_SIZE, stream=1) num_no_verb = 0 num_only_verb = 0 lines = set() outer_loop = True while outer_loop:
import json
#from ibm_watson import ToneAnalyzerV3
from watson_developer_cloud import ToneAnalyzerV3
import paralleldots as pd
from deepsegment import DeepSegment
from get_emotions import get_emotion_counts_with_vader

segmenter = DeepSegment('en')

# NOTE(review): the API keys below are hard-coded in source.  They are left
# unchanged so behavior is preserved, but they should be rotated and loaded
# from configuration instead.
pd.set_api_key("yGZxjt2pV3Y3V0FizvQGCygybaLHGZRU0rvTNnSLlp8")
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    iam_apikey='0DWwlEM6RsPb0nnawbE3Rzbpmrg9OOLcLA5xJOel17wN',
    url='https://gateway-syd.watsonplatform.net/tone-analyzer/api'
)

# Disabled sample input, kept as a bare string literal (it has no effect).
"""
text = 'Team, I know that times are tough! Product '\
    'sales have been disappointing for the past three '\
    'quarters. We have a competitive product, but we '\
    'need to do a better job of selling it!'
"""

# Unpunctuated caption fragments used as sample input.  The entry
# 'alone half of these suicides were' had been split across two physical
# lines, which left an unterminated string literal; it is rejoined here.
li_B = [
    'since 1990 the number of gun deaths',
    'worldwide has reached six point five',
    'million three quarters of gun deaths',
    'occur in just 15 countries Latin America',
    'is home to some of the worlds most',
    'violent countries by murder rate El',
    'Salvador Venezuela and Guatemala are the',
    'top three countries for deaths caused by',
    'guns per population these Latin American',
    'countries are marred by corruption',
    'organized crime and a dysfunctional',
    'criminal justice system that further',
    'fuels the problem the availability of',
    'guns in the United States is another',
    'concern for these countries an estimated',
    '200,000 guns a year that were first sold',
    'in the United States are smuggled over',
    'the southern border and used in violent',
    'crimes in Latin America and the',
    'Caribbean in the United States the',
    'constitutional right to bear arms has',
    'led to looser regulations and easier',
    'access to firearms this contributes to',
    'the 30,000 men women and children who',
    'were killed with guns each year mass',
    'shootings attract their headlines but in',
    'fact these make up only 0.2% of gun',
    'deaths 60% of gun related deaths are in',
    'fact suicide',
    "America's suicide rate increased by 25",
    'percent between 1999 and 2015 of nearly',
    '45,000 taking their own lives in 2015',
    'alone half of these suicides were',
    "carried out with guns though guns aren't",
    'the most common method of suicide they',
    'are the most lethal other wealthy',
    'countries have far lower rates of gun',
    'violence in Japan if you want to own a',
    'gun you must pass a written exam and a',
    'shooting range test alongside a series',
    'of mental health drug in criminal record',
    'tests',
    'it has virtually eradicated gun crime',
    'after a mass shooting in 1996 Australia',
    'introduced an effective buyback scheme',
    'of firearms in the 20 years following',
    'the bag there was an accelerated decline',
    'in total gun deaths but in America the',
    'House of Representatives has not voted',
    'on a single measure to prevent gun',
    'violence and in some states such as',
    'Texas where students at public colleges',
    'can now carry concealed handguns the law',
    'has actually loosened easy access to',
    'firearms will continue to be the main',
    'driver of Americas gun debt',
]
import urllib.request
from bs4 import BeautifulSoup
from deepsegment import DeepSegment

# load things:
# Characters kept by preprocess_text(); everything else is dropped.
# NOTE(review): the digit "0" is absent from this set - confirm whether that
# is intentional (e.g. it matches how the fastText model was trained).
VALID_CHARS = set("abcdefghijklmnopqrstuvwxyz123456789. ")
nlp = spacy.load("en_core_web_md")
# Entity merging is added to the pipeline before noun-chunk merging.
merge_ncs = nlp.create_pipe("merge_noun_chunks")
merge_ents = nlp.create_pipe("merge_entities")
nlp.add_pipe(merge_ents)
nlp.add_pipe(merge_ncs)
# The fastText model file is looked up next to this source file.
model = fasttext.load_model(
    os.path.join(os.path.dirname(os.path.realpath(__file__)), "model_1000000.ftz"))
segmenter = DeepSegment("en")


########################
# Function definitions #
########################
def preprocess_text(text):
    """
    Simplify the text before fasttext processing.

    The text is lower-cased and every character not in VALID_CHARS is removed.
    """
    return "".join(c for c in text.lower() if c in VALID_CHARS)


#####################
# Class definitions #
#####################
class Decoder:
    """Drive the batch speech-decoding scripts and post-process their output.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source.  Spots where the original nesting could not be determined are
    marked below and should be checked against the upstream file.
    """

    def __init__(self, name: str, bit_rate: int, iteration: int = 1, max_active: int = 10000,
                 max_batch_size=50) -> None:
        """Store the model name and derive the script environment and directories.

        :param name: model name; its lower-cased form names the model and result directories
        :param bit_rate: stored on the instance as ``bit_rate``
        :param iteration: exported to the scripts as ``ITERATIONS``
        :param max_active: exported to the scripts as ``MAX_ACTIVE``
        :param max_batch_size: exported to the scripts as ``MAX_BATCH_SIZE``
        """
        super().__init__()
        self.name = name
        self.bit_rate = bit_rate
        self.segmenter = None
        self.use_feedback = False
        self.env = os.environ.copy()
        self.env["ITERATIONS"] = str(iteration)
        self.env["MAX_ACTIVE"] = str(max_active)
        self.env["MAX_BATCH_SIZE"] = str(max_batch_size)
        self.model_dir = os.path.join(
            "/workspace/nvidia-examples/", name.lower())
        self.result_dir = os.path.join("/tmp/results/", name.lower())
        self.prep_command = "prepare_data.sh"
        self.batch_feedback_command = "run_benchmark.sh"
        self.batch_command = "run_benchmark_org.sh"
        self.last_run = None
        # Decoding lock: serializes decode_batch calls.
        self.batch_lk = Lock()
        self.model_trainings = 0

    def initialize(self) -> None:
        """Run the data-preparation script in the model directory."""
        prep_process = Popen(["/bin/bash", self.prep_command],
                             stdin=PIPE, stderr=PIPE, cwd=self.model_dir)
        # stdout is not piped, so ``stdout`` is None here.
        stdout, stderr = prep_process.communicate()
        logging.debug(stdout)
        logging.debug(stderr)

    def init_segment(self):
        """Create the DeepSegment sentence segmenter (imported lazily)."""
        from deepsegment import DeepSegment
        self.segmenter = DeepSegment("en", tf_serving=False)

    def extract_corpora(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        """Split one batch's decoder output per recording and build aligned transcripts.

        Reads ``trans``, ``trans_int_combined`` and ``CTM.ctm`` from the batch
        result directory, writes one file per recording for each, computes
        word alignments, splits each transcript into sentences and dumps one
        JSON document per recording.

        :param batch_id: batch number, used in the result and archive paths
        :param iter_id: iteration number, used in the result path
        :return: mapping of recording id to its transcript dict; an empty
            dict when reading or splitting the decoder output fails
        """
        run_dir = os.path.join(self.result_dir, str(batch_id), str(iter_id))
        archive_dir = "/root/audio/batch" + str(batch_id)
        try:
            # Word transcripts: one line per recording, "<utterance id> <words...>".
            with open(os.path.join(run_dir, "trans")) as trans_file:
                transcripts = trans_file.readlines()
            transcript_repo: Dict[str, List[str]] = {}
            for t in transcripts:
                spl = t.split(maxsplit=1)
                header = spl[0].split("_")[-1].split(".")[0]
                trans = spl[1]
                with open(os.path.join(run_dir, "trans_" + header), "w") as single_trans:
                    single_trans.write(trans)
                with open(os.path.join(archive_dir, "tran_" + header + ".txt"), "w") as archived:
                    archived.write(trans)
                transcript_repo[header] = trans.split()

            # Integer (word-id) transcripts with the same layout.
            with open(os.path.join(run_dir, "trans_int_combined")) as trans_int_file:
                transcript_ints = trans_int_file.readlines()
            transcript_int_repo: Dict[str, List[int]] = {}
            for t in transcript_ints:
                spl = t.split(maxsplit=1)
                header = spl[0].split("_")[-1].split(".")[0]
                trans = spl[1]
                with open(os.path.join(run_dir, "trans_int_combined_" + header), "w") as single_trans:
                    single_trans.write(trans)
                transcript_int_repo[header] = [int(x) for x in trans.split()]

            # CTM lines: columns 3 and 4 are parsed as (float, int) and
            # grouped per recording.
            with open(os.path.join(run_dir, "CTM.ctm")) as cmt_file:
                convo = cmt_file.readlines()
            extraction: Dict[str, TextIOWrapper] = {}
            convo_repo: Dict[str, List] = {}
            try:
                for c in convo:
                    conv = c.split()[3:]
                    conv = [float(conv[0]), int(conv[1])]
                    meta = c.split()[0]
                    header = meta.split(".", maxsplit=1)[0].split("_")[-1]
                    if header in extraction:
                        extraction[header].write(str(conv[0]) + " " + str(conv[1]))
                        convo_repo[header].append(conv)
                    else:
                        # NOTE(review): the first CTM entry of each recording
                        # only opens the file; it is neither written nor
                        # stored.  Kept as in the original - confirm intent.
                        fd = os.path.join(run_dir, header + ".ctm")
                        extraction[header] = open(fd, "w")
                        convo_repo[header] = []
            finally:
                for handle in extraction.values():
                    handle.close()
        except Exception:
            # The original call passed batch_id without a placeholder, which
            # makes logging fail to format the record.
            logging.error("Failed for batch %s", batch_id)
            return {}

        batch_out = {}
        for key in transcript_repo.keys():
            transcript_tokens = transcript_repo[key]
            # Space-terminated tokens, i.e. the string ends with a space.
            transcript = "".join(tt + " " for tt in transcript_tokens)
            alignment, duration = Decoder.calculate_alignment(
                transcript_repo[key], transcript_int_repo[key], convo_repo[key])
            logging.debug("Alignment complete for %s", key)

            sentences = []
            try:
                os.environ['CUDA_VISIBLE_DEVICES'] = '0'
                self.init_segment()
                sentences = self.segmenter.segment_long(transcript)
                use_lstm = True
            except Exception as e:
                # Fallback when the segmenter fails: fixed groups of five words.
                logging.error(e)
                use_lstm = False
                tokens = transcript.split()
                for index in range(len(tokens)):
                    sentence_size = len(sentences)
                    if sentence_size < (int(index / 5) + 1):
                        sentences.append([])
                    word = tokens[int(index)]
                    sentences[int(index / 5)].append(word)

            w_dim = 0
            aligned_sentences = list()
            for s_raw in sentences:
                if use_lstm:
                    sentence = s_raw.split()
                else:
                    sentence = s_raw
                sentence[-1] = sentence[-1] + "."
                aligned_sentence = list()
                for widx, word in enumerate(sentence):
                    # NOTE(review): `w_dim += 0` never advances, so every word
                    # is paired with alignment[0]; the bare Word(...) call
                    # below is also discarded.  Both are kept unchanged
                    # because the intended indexing cannot be determined here.
                    w_dim += 0
                    Word(word, alignment[w_dim])
                    word_obj = Word(word, alignment[w_dim])
                    if widx == len(sentence) - 1:
                        word_obj.add_tag("is_punctuated", True)
                    aligned_sentence.append(word_obj)
                sentence_obj = Sentence(aligned_sentence, 0)
                aligned_sentences.append(sentence_obj)

            # Each sentence's length is the timestamp of the next sentence's
            # first word; the last sentence gets the total duration.
            for idx, _ in enumerate(aligned_sentences):
                if idx < (len(aligned_sentences) - 1):
                    aligned_sentences[idx].length = aligned_sentences[idx + 1].words[0].timestamp
            aligned_sentences[(len(aligned_sentences) - 1)].length = duration

            transcript_out = {"duration": duration, "length": len(alignment),
                              "sentences": aligned_sentences, "complete": "1"}
            with open(os.path.join(archive_dir, key + ".json"), "w") as out_json:
                json.dump(transcript_out, out_json)
            batch_out[key] = transcript_out

        # Release GPU
        # NOTE(review): placed after the loop; the collapsed source does not
        # show whether the reset ran once per recording instead - confirm.
        device = cuda.get_current_device()
        device.reset()
        return batch_out

    def decode_batch(self, batch_id: int, iter_id: int = 0) -> Dict[str, object]:
        """Decode one batch with the benchmark scripts and return its corpora.

        The whole operation runs under ``batch_lk``; the lock is released even
        when a step raises.

        :param batch_id: batch number; selects ``/root/audio/batch<batch_id>``
        :param iter_id: iteration number used for the lattice and CTM paths
        :return: result of ``extract_corpora(batch_id)``
        """
        with self.batch_lk:
            # set environment, start new shell
            batch_env = self.env
            batch_env["DATASET"] = os.path.join(
                "/root/audio/batch" + str(batch_id))
            prep_process = Popen(["/bin/bash", self.batch_feedback_command],
                                 stdin=PIPE, stderr=PIPE, env=batch_env, cwd=self.model_dir)
            stdout, stderr = prep_process.communicate()
            logging.debug(stdout)
            logging.debug(stderr)

            trans_path = os.path.join("/tmp/results", self.name, str(batch_id), "0", "trans")
            with open(trans_path) as trans_file:
                num_words = len(trans_file.readlines()[0].split())

            # Fallback to original model if retrained model doesn't decode
            if num_words < 2:
                self.use_feedback = False
                shutil.rmtree(os.path.join(
                    "/tmp/results", self.name, str(batch_id)))
                prep_process = Popen(["/bin/bash", self.batch_command],
                                     stdin=PIPE, stderr=PIPE, env=batch_env, cwd=self.model_dir)
                stdout, stderr = prep_process.communicate()
                logging.debug(stdout)
                logging.debug(stderr)

            # Unpack the aligned lattice produced by the decoder.
            prep_process = Popen(["/usr/bin/gzip", "-d", os.path.join(
                self.result_dir, str(batch_id), str(iter_id), "lat_aligned.gz")], stdin=PIPE)
            stdout, stderr = prep_process.communicate()
            logging.debug(stdout)
            logging.debug(stderr)

            ctm_file = os.path.join(self.result_dir, str(
                batch_id), str(iter_id), "CTM.ctm")
            # Shell pipeline: word-align the lattice, take the 1-best path
            # and convert it to CTM.
            lattice_align_command: str = ""
            lattice_align_command += "/opt/kaldi/src/latbin/lattice-align-words-lexicon --partial-word-label=4324 " \
                                     "/workspace/models/aspire/data/lang_chain/phones/align_lexicon.int " \
                                     "/workspace/models/aspire/final.mdl"
            lattice_align_command += (" ark:" + os.path.join(self.result_dir,
                                                             str(batch_id), str(iter_id), "lat_aligned"))
            lattice_align_command += " ark:- | /opt/kaldi/src/latbin/lattice-1best ark:- ark:- | " \
                                     "/opt/kaldi/src/latbin/nbest-to-ctm ark:- "
            lattice_align_command += ctm_file
            prep_process = Popen(lattice_align_command, stdin=PIPE, shell=True)
            stdout, stderr = prep_process.communicate()
            logging.debug(stdout)
            logging.debug(stderr)

            corpora = self.extract_corpora(batch_id)
            if self.last_run is not None:
                self.last_run += 1
            else:
                self.last_run = 0
            return corpora

    def clear_results(self) -> None:
        """Delete the result directory and everything in it, if it exists.

        ``os.rmdir`` only removes empty directories, so the previous version
        raised OSError once any batch output had been written.
        """
        if os.path.exists(self.result_dir):
            shutil.rmtree(self.result_dir)

    def train_model(self, fb: FeedbackAgent) -> None:
        """Start the feedback agent, handing it the training counter and the decoding lock."""
        fb.iter = self.model_trainings
        fb.lk = self.batch_lk
        fb.start()
        self.model_trainings += 1

    @staticmethod
    def calculate_alignment(words: List[str], idx: List[int], lats: List[List]) -> Tuple[List, float]:
        """Pair each lattice entry with its word and an accumulated time offset.

        :param words: transcript words
        :param idx: integer ids of the same words (must match ``words`` in length)
        :param lats: ``[value, word_id]`` pairs; may be modified in place
        :return: ``(alignment, offset)`` where alignment is a list of
            ``[word, accumulated offset before this entry]`` and offset is the
            final accumulated total
        """
        word_table: Dict[int, str] = dict()
        alignment = []
        len_words = len(words)
        len_idx = len(idx)
        len_lats = len(lats)
        assert len_words == len_idx
        if len_idx > len_lats:
            lats.insert(0, [0.0, idx[0]])
            # NOTE(review): nested under the insert by reconstruction; the
            # collapsed source does not show whether this check always ran.
            if lats[0][0] == lats[1][0]:
                lats[1][0] = (lats[2][0] / 2)
        for i in range(len_idx):
            wt_idx = idx[i]
            word_table[wt_idx] = words[i]
        offset = 0.0
        # len_lats was measured before the optional insert above.
        for i in range(len_lats):
            original_lats = lats[i][0]
            if original_lats == 0.0:
                # A zero entry borrows half of the next entry's value, or a
                # fixed 0.05 when there is no usable next entry.
                try:
                    next_lat = lats[i + 1][0]
                    original_lats = next_lat / 2
                except Exception:
                    original_lats = 0.05
            offset += original_lats
            lat_i = int(lats[i][1])
            w = word_table[lat_i]
            align = offset - lats[i][0]
            alignment.append([w, align])
        return alignment, offset

    @staticmethod
    def fetch_transcript(batch_id: int, corpus_id: str) -> object:
        """Load the JSON transcript previously written for one recording."""
        path = os.path.join("/root/audio/batch" + str(batch_id), corpus_id + ".json")
        with open(path, "r") as out_json:
            return json.load(out_json)
from deepsegment import DeepSegment

# Shared segmenter instance, created once at import time.
m = DeepSegment()


def predictor(x, batch_size=32):
    """Segment *x* with the shared DeepSegment model.

    :param x: input forwarded unchanged to ``DeepSegment.segment``
    :param batch_size: forwarded as the ``batch_size`` keyword
    :return: whatever ``DeepSegment.segment`` returns
    """
    segmented = m.segment(x, batch_size=batch_size)
    return segmented
# pip install deepsegment
from deepsegment import DeepSegment

# declaring segmenter object
segmenter = DeepSegment()

# applying segmentation (tokenization)
segmenter.segment('I am Batman, I live in Gotham')
# >>> ['I am Batman, I live in Gotham']

# performs well even without punctuation
segmenter.segment('I am Batman i liv in gotham')
# >>> ['I am Batman', 'i liv in gotham']
def post_init(self):
    """Import DeepSegment lazily and build the segmenter from the stored settings."""
    from deepsegment import DeepSegment

    language = self.lang_code
    checkpoint = self.checkpoint_name
    self._segmenter = DeepSegment(language, checkpoint_name=checkpoint)
def segmentsent(self, text):
    """Return *text* split into sentences by a freshly created English DeepSegment model."""
    return DeepSegment('en').segment(text)
def make_caption(video_dir, caption_dir, split_video_dir, frame_dir,
                 trash_dir_path, timecode_dir, video, pyscenedetect_threshold,
                 punct, classify_model, mode, tmp_annotation_dir):
    """Split one video into scenes and write caption annotations for the useful ones.

    Nothing is recomputed when ``<video>_annotation.json`` already exists in
    *tmp_annotation_dir*.  Otherwise the video is split (or a cached timecode
    pickle is loaded), each scene is classified, rejected scenes are moved to
    the trash directory, and caption data for the remaining scenes is
    collected and dumped as JSON.

    :param video_dir: directory containing ``<video>.mp4``
    :param caption_dir: directory containing ``<video>.en.vtt``
    :param split_video_dir: parent directory of the per-video scene clips
    :param frame_dir: parent directory handed to ``classify`` for frames
    :param trash_dir_path: parent directory for rejected clips
    :param timecode_dir: directory of cached ``<video>.pkl`` timecode dicts
    :param video: video name without extension
    :param pyscenedetect_threshold: forwarded to ``split_video``
    :param punct: ``'deepsegment'`` or ``'fastpunct'``; selects the segmenter
    :param classify_model: forwarded to ``classify``
    :param mode: forwarded to ``make_caption_data``
    :param tmp_annotation_dir: directory for the per-video annotation JSON
    :raises Exception: when *punct* is neither ``'deepsegment'`` nor ``'fastpunct'``
    """
    tmp_annotation_path = os.path.join(tmp_annotation_dir,
                                       video + '_annotation.json')
    if not os.path.exists(tmp_annotation_path):
        cv2.setNumThreads(1)
        print(f'{video} has been started.')
        video_name = video + ".mp4"
        video_path = os.path.join(video_dir, video_name)
        caption_path = os.path.join(caption_dir, (video + ".en.vtt"))
        video_elements_dir_path = os.path.join(split_video_dir, video)
        timecode_path = os.path.join(timecode_dir, video + ".pkl")
        trash_dir_path = os.path.join(trash_dir_path, video)
        os.makedirs(trash_dir_path, exist_ok=True)

        if os.path.exists(timecode_path):
            print(f'[splitting]: {video} has been started. (loading...)')
            timecode_dict = load_pickle(timecode_path)
            print(
                f'[splitting]: {video} has been done. (timecode_list has been loaded.)'
            )
        else:
            print(f'[splitting]: {video} has been started.')
            timecode_list = split_video(video_path, video_name,
                                        video_elements_dir_path,
                                        pyscenedetect_threshold)
            video_elements = sorted(os.listdir(video_elements_dir_path))
            # A count mismatch is reported but deliberately not fatal.
            try:
                assert len(timecode_list) == len(
                    video_elements
                ), f'video:{video} timecode_list:{len(timecode_list)} video_elements:{len(video_elements)}'
            except AssertionError as err:
                print('AssertionError:', err)
            print(f'[splitting]: {video} has been done.')

            timecode_dict = {}
            for i, video_element in enumerate(video_elements):
                video_element_path = os.path.join(video_elements_dir_path,
                                                  video_element)
                is_useful = classify(
                    video_elements_dir_path, video_element,
                    os.path.join(frame_dir, video_name, video_element),
                    classify_model)
                if is_useful:
                    # Key is the clip file name without its 4-char extension.
                    timecode_dict[video_element[:-4]] = timecode_list[i]
                else:
                    shutil.move(video_element_path, trash_dir_path)
            save_pickle(timecode_dict, timecode_path)

        # Use the function's own `punct` argument; the previous code read the
        # module-level `args.punct`, ignoring what the caller passed in.
        if punct == 'deepsegment':
            segmenter = DeepSegment('en')
        elif punct == 'fastpunct':
            segmenter = FastPunct('en')
        else:
            raise Exception(
                'You have probably chosen something other than fastpunct and deepsegement.'
            )

        annotation_dict = {}
        for i, useful_element in enumerate(timecode_dict.keys()):
            useful_element_path = os.path.join(video_elements_dir_path,
                                               useful_element + '.mp4')
            capture = cv2.VideoCapture(useful_element_path)
            try:
                fps = capture.get(cv2.CAP_PROP_FPS)
                frame_count = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            finally:
                # Free the capture handle before the clip may be moved.
                capture.release()
            duration = frame_count / fps
            annotation_data = make_caption_data(useful_element, caption_path,
                                                timecode_dict[useful_element],
                                                duration, fps, punct, mode,
                                                segmenter)
            if len(annotation_data) == 0:
                # The clip is trashed, but its entry is not removed from the
                # (already pickled) timecode dict.
                shutil.move(useful_element_path, trash_dir_path)
            else:
                annotation_dict.update(annotation_data)

        with open(tmp_annotation_path, 'w') as f:
            json.dump(annotation_dict, f)
    else:
        print(f'[caption]: {video} annotation is already exist.')
    print(f'[split/caption]: {video} has been done.')
from deepsegment import DeepSegment

# Load a locally trained Vietnamese model from explicit file paths.
segmenter = DeepSegment(
    checkpoint_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/checkpoint',
    params_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/params',
    utils_path='/Users/trinhgiang/PycharmProjects/deepsegment-2/vi/utils',
)

# print(segmenter.segment('Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.'))

# Sample text fed to segment_long below.
sent = 'Nhân viên kinh doanh và chăm sóc khách hàng 10/2015 - 04/2016 - Tư vấn cho khách hàng về chữ ký số và các phần mềm bảo hiểm xã hội điện tử, hóa đơn điện tử, chữ ký số ..vv.. -Hướng dẫn khách hàng về thủ tục hồ sơ để đăng ký chữ ký số và các phần mềm của công ty - Giải đáp thắc mắc của khách hàng và chuyển yêu cầu của khách hàng cho bộ phận liên quan'

print(segmenter.segment_long(sent))
import string
from util import *
from predefined import *

################################ Constants #######################################
N_WINDOW = 7
MODEL_PATH = "trained/altyazilar_not_456_senteces"
CHECKPOINT_PATH = MODEL_PATH + "/checkpoint"
PARAM_PATH = MODEL_PATH + "/params"
UTILS_PATH = MODEL_PATH + "/utils"
PREDEFINED_ENABLED = True

################################ Init DeepSegmenter ################################
segmenter = DeepSegment(
    lang_code=None,
    checkpoint_path=CHECKPOINT_PATH,
    params_path=PARAM_PATH,
    utils_path=UTILS_PATH,
    tf_serving=False,
    checkpoint_name=None,
)


################################ Init predefined ################################
def normalize_text(text):
    """Strip ASCII punctuation, collapse whitespace runs and lower-case via ``lower_tr``."""
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    single_spaced = ' '.join(without_punctuation.strip().split())
    return lower_tr(single_spaced)


PREDEFINED_LENGTH = len(PREDEFINED)
# Longest entries first (in-place sort), then a normalized copy in that order.
PREDEFINED.sort(key=len, reverse=True)
PREDEFINED_NORMALIZED = list(map(normalize_text, PREDEFINED))
def fn(test):
    """Build one presentation slide from a spoken sentence.

    The text is analyzed with TextRazor; the word lemmas are appended to
    ``keyword.txt`` and split into sentences with DeepSegment.  A Bing image
    search is run for the derived search term and the results are downloaded.
    Finally a slide is appended to ``testppt3.pptx``: a picture slide when an
    image-related lemma was found, a text slide otherwise.

    NOTE(review): block nesting was reconstructed from a whitespace-collapsed
    source and must be confirmed against the upstream file.  API keys are
    hard-coded in this function and should be rotated and moved to
    configuration.

    :param test: the input text to analyze
    """
    from deepsegment import DeepSegment
    segmenter=DeepSegment('en')
    import textrazor
    textrazor.api_key = "043e170ef41a6d297a508581225bd493943f3a9f831345fb71f86d64"
    client = textrazor.TextRazor(extractors=["words", "relations"])
    #client.set_do_cleanup_HTML(True)
    response = client.analyze(test)
    l=[]
    # Print noun phrases attached to a "sound" predicate.  The list built
    # here is discarded: `l` is reset immediately after the loop.
    for property in response.properties():
        for word in property.predicate_words:
            l.append(word.lemma)
            if word.lemma == "sound":
                for property_word in property.property_words:
                    for phrase in property_word.noun_phrases:
                        print (phrase)
                break
    l=[]
    flag=False
    # Collect every lemma; remember the last image-related lemma in `k`.
    for sentence in response.sentences():
        print(sentence.words)
        for word in sentence.words:
            if word.lemma=="image" or word.lemma=="picture" or word.lemma=="photo" or word.lemma=="show" or word.lemma=="see" or word.lemma=="display":
                k=word.lemma
                flag=True
            l.append(word.lemma)
    astring=""
    for i in l:
        astring+=i+" "
    # Append the lemma string to the keyword log.
    f=open("keyword.txt",'a')
    f.write(astring+"\n")
    f.close()
    alist=segmenter.segment(astring)
    print(alist)
    if(flag):
        # Search term: the lemmas from the first occurrence of `k` onwards.
        s=l.index(k)
        m=l[s:]
        t=""
        st=""
        for i in m:
            t+=i+" "
    else:
        # No image-related lemma: the search below still runs, with this
        # placeholder as its query.
        t="No image found"
        st=""
        for j in l:
            st+=j+" "
    text1=st
    text2=t
    print(t)
    response1=client.analyze(t)
    for noun in response1.noun_phrases():
        print(noun.words)
        for word in noun.words:
            print(word.lemma)
    from requests import exceptions
    import argparse
    import requests
    import cv2
    import os
    import time
    starttime=time.time();
    # set your Microsoft Cognitive Services API key along with (1) the
    # maximum number of results for a given search and (2) the group size
    # for results (maximum of 50 per request)
    API_KEY = "948886a19a794c428c53fcfa2aa0325b"
    MAX_RESULTS = 1
    GROUP_SIZE = 1
    # set the endpoint API URL
    URL = "https://api.cognitive.microsoft.com/bing/v7.0/images/search"
    # when attempting to download images from the web both the Python
    # programming language and the requests library have a number of
    # exceptions that can be thrown so let's build a list of them now
    # so we can filter on them
    EXCEPTIONS = set([IOError, FileNotFoundError,
        exceptions.RequestException, exceptions.HTTPError,
        exceptions.ConnectionError, exceptions.Timeout])
    # store the search term in a convenience variable then set the
    # headers and search parameters
    term = t
    headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
    params = {"q": term, "offset": 0, "count": GROUP_SIZE}
    # make the search
    print("[INFO] searching Bing API for '{}'".format(term))
    search = requests.get(URL, headers=headers, params=params)
    search.raise_for_status()
    # grab the results from the search, including the total number of
    # estimated results returned by the Bing API
    results = search.json()
    estNumResults = min(results["totalEstimatedMatches"], MAX_RESULTS)
    print("[INFO] {} total results for '{}'".format(estNumResults, term))
    # initialize the total number of images downloaded thus far
    total = 0
    for offset in range(0, estNumResults, GROUP_SIZE):
        # update the search parameters using the current offset, then
        # make the request to fetch the results
        print("[INFO] making request for group {}-{} of {}...".format(
            offset, offset + GROUP_SIZE, estNumResults))
        params["offset"] = offset
        search = requests.get(URL, headers=headers, params=params)
        search.raise_for_status()
        results = search.json()
        print("[INFO] saving images for group {}-{} of {}...".format(
            offset, offset + GROUP_SIZE, estNumResults))
        # loop over the results
        for v in results["value"]:
            # try to download the image
            try:
                # make a request to download the image
                print("[INFO] fetching: {}".format(v["contentUrl"]))
                r = requests.get(v["contentUrl"], timeout=30)
                # build the path to the output image
                ext = v["contentUrl"][v["contentUrl"].rfind("."):]
                p = os.path.sep.join([r"C:\Users\HP\Desktop\Projects\VIT Hack\SlideEZ-test", "{}{}".format(
                    str(total).zfill(8), ext)])
                print("The answer is")
                print(p)
                # write the image to disk
                f = open(p, "wb")
                f.write(r.content)
                f.close()
            # catch any errors that would prevent us from downloading the
            # image
            except Exception as e:
                # check to see if our exception is in our list of
                # exceptions to check for; exceptions of any other type
                # fall through to the image check below
                if type(e) in EXCEPTIONS:
                    print("[INFO] skipping: {}".format(v["contentUrl"]))
                    continue
            # try to load the image from disk
            image = cv2.imread(p)
            # if the image is `None` then we could not properly load the
            # image from disk (so it should be ignored)
            if image is None:
                print("[INFO] deleting: {}".format(p))
                os.remove(p)
                continue
            # update the counter
            total += 1
    endtime=time.time()-starttime
    print("Total time taken to search for the query is")
    print(endtime)
    from pptx import Presentation
    from pptx.util import Inches, Pt
    from pptx.enum.text import PP_ALIGN
    from PIL import Image
    from pptx.dml.color import RGBColor
    from pptx.enum.dml import MSO_THEME_COLOR
    presentation = "testppt3.pptx"
    prs = Presentation(presentation)
    # An empty deck first gets a title slide.
    if len(prs.slides)==0:
        title_slide_layout = prs.slide_layouts[0]
        slide = prs.slides.add_slide(title_slide_layout)
        background=slide.background
        fill=background.fill
        fill.gradient()
        fill.gradient_angle=40
        gradient_stops=fill.gradient_stops
        gradient_stop=gradient_stops[0]
        color=gradient_stop.color
        color.theme_color=MSO_THEME_COLOR.LIGHT_1
        title = slide.shapes.title
        subtitle = slide.placeholders[1]
        title.text = "Test"
        subtitle.text = "test"
        prs.save(presentation)
    if not flag:
        # Text slide: the title is the longest word (the last one on ties)
        # and each segmented sentence becomes a paragraph.
        text_slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(text_slide_layout)
        background=slide.background
        fill=background.fill
        fill.gradient()
        fill.gradient_angle=40
        gradient_stops=fill.gradient_stops
        gradient_stop=gradient_stops[0]
        color=gradient_stop.color
        color.theme_color=MSO_THEME_COLOR.LIGHT_1
        title = slide.shapes.title
        blist=[]
        for i in range(0,len(alist)):
            blist+=alist[i].split(" ")
        mx=0
        slide_t=""
        for j in blist:
            if(len(j)>=mx):
                mx=len(j)
                slide_t=j.title()
        title.text= slide_t
        content = slide.shapes.placeholders[1]
        tf = content.text_frame
        for i in alist:
            para=tf.add_paragraph()
            para.text=i
            para.level=1
        prs.save(presentation)
    else:
        # Picture slide.  `p` is the path assigned in the download loop; it
        # is unbound if that loop never reached the assignment.
        image_slide_layout = prs.slide_layouts[8]
        slide = prs.slides.add_slide(image_slide_layout)
        background=slide.background
        fill=background.fill
        fill.gradient()
        fill.gradient_angle=40
        gradient_stops=fill.gradient_stops
        gradient_stop=gradient_stops[0]
        color=gradient_stop.color
        color.theme_color=MSO_THEME_COLOR.LIGHT_1
        #title = slide.shapes.title
        #title.text="Sub2"
        content = slide.shapes.placeholders[1]
        im=Image.open(p)
        width,height= im.size
        content.height= height
        content.width= width
        content.insert_picture(p)
        content = slide.shapes.placeholders[0]
        tf = content.text_frame
        for i in alist:
            para=tf.add_paragraph()
            para.text=i
            para.level=1
            para.alignment=PP_ALIGN.CENTER
        #left = Inches(6)
        #top = Inches(3)
        #height = Inches(2)
        #pic = slide.shapes.add_picture(p, left, top, height=height)
        prs.save(presentation)
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 13 11:26:00 2020

@author: barth
"""
import pandas as pd
import re
from deepsegment import DeepSegment
from tqdm import tqdm

updating = True
segmenter = DeepSegment('en')
df = pd.read_pickle('JREdataframeUPDATED.pkl', )


def getPodNum(vidtitle):
    """Return the episode number that follows the first '#' in a video title.

    The first whitespace-delimited token after the '#' is taken, any '-'
    characters are removed, and the rest is converted to ``int``.
    """
    after_hash = vidtitle.split('#')[1]
    first_token = after_hash.split()[0]
    return int(first_token.replace('-', ''))


# Attach the episode number to every row and order the frame by it.
vidnums = list(map(getPodNum, df['Title']))
df['PodNum'] = vidnums
df = df.sort_values(['PodNum', 'Title']).reset_index(drop=True)
def window_segment(self, strings):
    """Split *strings* into sentences using DeepSegment's ``segment_long``.

    A new English model is created on every call.
    """
    return DeepSegment('en').segment_long(strings)
def init_segment(self):
    """Create the English DeepSegment model (``tf_serving=False``) and store it on the instance."""
    from deepsegment import DeepSegment

    model = DeepSegment("en", tf_serving=False)
    self.segmenter = model
def get_segmenter():
    """Return a newly constructed English DeepSegment model."""
    return DeepSegment('en')
from deepsegment import DeepSegment
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_watson.natural_language_understanding_v1 \
    import Features, EntitiesOptions, KeywordsOptions

# NOTE(review): the three API keys below are hard-coded in source; they
# should be rotated and loaded from configuration instead.
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    iam_apikey='oRW-WiI73HQMQxq0mVZnPJzN3UFwX4-9oD-XpjLjqUNi',
    url=
    'https://gateway-wdc.watsonplatform.net/natural-language-understanding/api'
)

# English sentence segmenter, created at import time.
segmenter = DeepSegment('en')

# `pd` and `ToneAnalyzerV3` are not imported in the visible lines -
# presumably imported earlier in the file; TODO confirm.
pd.set_api_key("Mf5Rgw0kBSWSNThFQxKYbEQvPgKgrexUKqPEPDMwGkM")
tone_analyzer = ToneAnalyzerV3(
    version='2017-09-21',
    iam_apikey='E8dobLcUUvh7NZU6MzpFv-GDUiIuEmOV43vQIWSNO0tE',
    url='https://gateway-wdc.watsonplatform.net/tone-analyzer/api')
import sys
import nltk
from youtube_transcript_api import YouTubeTranscriptApi
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from deepsegment import DeepSegment

# The default language is 'en'
segmenter = DeepSegment('en')

# Run the same sample through both entry points: segment() first, then
# segment_long().
for split in (segmenter.segment, segmenter.segment_long):
    print(split('I am Batman i live in gotham'))
# ['I am Batman', 'i live in gotham']