def punctuate(text):
    # Load the pre-trained INTERSPEECH-T-BRNN model and punctuate the input.
    p = Punctuator(
        "C:\\Users\\lm44\\Documents\\Code\\Python\\Sumit Backend\\functions\\INTERSPEECH-T-BRNN.pcl"
    )
    punctuated = p.punctuate(text)
    return punctuated
def punctuate_text(text): print("Performing Puntuation ... \n") p = Punctuator('models/punctuator1.pcl') new_text = p.punctuate(text) new_text = re.sub("[?:;,]", "", new_text) new_text = re.sub("\s\s+", " ", new_text) print("Original text is:\n") print(new_text) print("\n\n") return new_text
def correct(begin_of_path, text, language="English"):
    # text is currently a raw string
    words = text.split(" ")
    correct_words = []
    spell = SpellChecker()
    for word in words:
        correct_words.append(spell.correction(word))
    separator = ' '
    correct_text = separator.join(correct_words)
    path_to_model = os.path.join(begin_of_path, "data", "Demo-Europarl-EN.pcl")
    p = Punctuator(path_to_model)
    correct_text_with_punct = p.punctuate(correct_text)
    return correct_text_with_punct
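# A minimal usage sketch for correct(), assuming the Demo-Europarl-EN.pcl model
# lives under <base>/data/ as the function expects; the base path and sample
# sentence below are hypothetical, not taken from the original.
base = os.path.dirname(os.path.abspath(__file__))
print(correct(base, "ths is a smple sentnce without punctuation"))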
def punctuates(doo=True):
    # Read the raw transcription once, then punctuate it only when requested.
    with open("transcription.txt", "r") as f:
        text = f.read()
    if doo:
        from punctuator import Punctuator
        p = Punctuator('hel.pcl')
        punctuated = p.punctuate(text)
        print("Punctuating Done")
        return punctuated
    return text
def addPunctuation(text_file):
    # Load the pre-trained model.
    p = Punctuator('model.pcl')
    # Read unstructured text from the file.
    with open(text_file, "r") as fp:
        text = fp.read()
    # Punctuate the read text.
    sentences = p.punctuate(text)
    # Write the punctuated text into the output file.
    with open("notes.txt", "w") as otp_file:
        otp_file.write(sentences)
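# A possible call site, assuming model.pcl sits in the working directory; the
# input file name is hypothetical. Output always lands in notes.txt (hard-coded above).
addPunctuation("transcript.txt")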
def punctuate():
    global filename
    global PCL
    # Read the source text (path redacted).
    with open('****************', 'r') as t:
        source = t.read()

    timestamp = datetime.datetime.strptime(time.ctime(), "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Punctuating chunk')
    p = Punctuator('****************')
    punctuated = p.punctuate(source)

    timestamp = datetime.datetime.strptime(time.ctime(), "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Saving your file')
    # Reopen the file in write mode to save the punctuated text.
    with open('****************', 'w') as t:
        t.write(punctuated)

    timestamp = datetime.datetime.strptime(time.ctime(), "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Punctuation complete')
def test(request):
    video_id = "5v1B1R3lEO8"
    srt = YouTubeTranscriptApi.get_transcript(video_id)
    alltext = ""
    for item in srt:
        alltext = alltext + item['text'] + " "
    print(os.path.join(settings.BASE_DIR))
    file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
    p = Punctuator(file_)
    # Punctuate only the first 5000 characters of the transcript.
    punctuated = p.punctuate(alltext[:5000])
    specialtag = punctuated.split('.')
    # newpara = ""
    # for item in specialtag:
    #     newpara = newpara + item + ".<br>"
    # specialtag = alltext
    return render(request, 'test.html', {
        'foo': specialtag,
    })
def fix_text(text_list, is_saved):
    """
    Cleans, punctuates, neural-coreferences, and sentencizes the transcript.

    :param is_saved: True if a version of the fixed text is already saved in a file
    :param text_list: A list of strings; an 'unclean' transcript
    :return: A list of tokenized sentences (every sentence is a Doc object)
    """
    file_name = 'fixed.txt'
    if is_saved:
        with open(file_name, 'r') as fixed:
            fixed_text_list = fixed.readlines()
        fixed_text_list = [text.replace('\n', '') for text in fixed_text_list]
        fixed_text_list = [nlp(sentence) for sentence in fixed_text_list]
        return fixed_text_list
    else:
        fixed_text = ' '.join(text_list)  # convert the list into one string
        fixed_text = fixed_text.replace('  ', ' ')  # remove double spaces
        print('adding punctuation; please wait a few minutes...')
        punctuator = Punctuator('Demo-Europarl-EN.pcl')
        fixed_text = punctuator.punctuate(fixed_text)
        print('removing interjections; please wait a few more minutes...')
        fixed_text_doc = remove_tokens_by_pos(nlp(fixed_text), 'INTJ')
        print('performing neural coreferencing; please wait for several more minutes...')
        neuralcoref.add_to_pipe(nlp)
        fixed_text_doc = fixed_text_doc._.coref_resolved
        print('splitting the text into sentences; please keep waiting...')
        fixed_text_list = re.split(r'\.|\?|!', fixed_text_doc)
        with open(file_name, 'w') as fixed:
            for sentence in fixed_text_list:
                fixed.write(sentence + "\n")
        fixed_text_list = [nlp(sentence) for sentence in fixed_text_list]
        return fixed_text_list
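# A hedged example of driving fix_text(), assuming `nlp` is an already-loaded
# spaCy pipeline; the transcript list below is placeholder data.
raw_transcript = ["so today we are going to", "talk about neural networks"]
sentences = fix_text(raw_transcript, is_saved=False)  # first run: punctuate and cache
sentences = fix_text(raw_transcript, is_saved=True)   # later runs reload fixed.txt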
def summarize(vid_id):
    # Pull the transcript and join it into one unpunctuated string.
    subs = YouTubeTranscriptApi.get_transcript(vid_id)
    sentences = [i['text'] for i in subs]
    text = ' '.join(sentences)

    # Restore punctuation so the text can be split into sentences.
    p = Punctuator('INTERSPEECH-T-BRNN.pcl')
    text = p.punctuate(text)

    # Build a word-frequency table, ignoring stopwords.
    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text)
    freqTable = dict()
    for word in words:
        word = word.lower()
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    # Score each sentence by the frequencies of the words it contains.
    sentences = sent_tokenize(text)
    sentenceValue = dict()
    for sentence in sentences:
        for word, freq in freqTable.items():
            if word in sentence.lower():
                if sentence in sentenceValue:
                    sentenceValue[sentence] += freq
                else:
                    sentenceValue[sentence] = freq

    # Keep sentences scoring well above the average.
    sumValues = 0
    for sentence in sentenceValue:
        sumValues += sentenceValue[sentence]
    average = int(sumValues / len(sentenceValue))
    summary = ''
    for sentence in sentences:
        if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)):
            summary += " " + sentence
    return summary
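# Sketch of a call, with a placeholder video ID; assumes INTERSPEECH-T-BRNN.pcl
# and the NLTK punkt/stopwords data are already downloaded.
print(summarize("VIDEO_ID"))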
def main(fileName):
    fileName, fileExt = fileName.split('.')
    print(fileName, fileExt)
    # os.system(f"ffmpeg -i C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.{fileExt} -ab 160k -ac 2 -ar 44100 -vn C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.wav")
    # ipFile = ffmpeg.input(fileName + fileExt)
    # opFile = ffmpeg.output(ipFile, fileName + ".wav")

    # Extract the audio track as 16-bit PCM WAV.
    clip = AudioFileClip(f"{fileName}.{fileExt}")
    clip.write_audiofile(f"{fileName}.wav", codec='pcm_s16le')
    clip.close()

    with sf.SoundFile(f'{fileName}.wav') as f:
        audio_dur = len(f) / f.samplerate

    # Transcribe the audio in 25-second chunks.
    r = sr.Recognizer()
    text = ""
    rec_dur = 25
    with sr.AudioFile(f'{fileName}.wav') as source:
        for x in range(0, int(audio_dur / rec_dur)):
            audio = r.record(source, duration=rec_dur)
            try:
                new_txt = r.recognize_google(audio)
                text = text + new_txt + " "
            except (sr.UnknownValueError, sr.RequestError):
                pass
        # Transcribe whatever remains after the full 25-second chunks.
        audio = r.record(source, duration=audio_dur % rec_dur)
        try:
            new_txt = r.recognize_google(audio)
            text = text + new_txt
        except (sr.UnknownValueError, sr.RequestError):
            pass
    print("Done")

    # Restore punctuation, then fix grammar with LanguageTool.
    p = Punctuator('Demo-Europarl-EN.pcl')
    text = p.punctuate(text)
    tool = language_tool_python.LanguageTool('en-US')
    matches = tool.check(text)
    print(len(matches))
    for lab in range(len(matches)):
        print(lab)
        print(matches[lab].ruleId, matches[lab].replacements)
    text_new = tool.correct(text)
    print(text_new)

    nltk.download('punkt')
    nltk.download('stopwords')

    # Text summarization.
    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner())
    similarity_algorithm = BM25Plus()
    ranker = TextRank()
    ir = ClassicalIR()
    model = Summarizer(preprocessor, similarity_algorithm, ranker, ir)
    summarised_content = model.summarise(text_new, reduction_ratio=0.80, preserve_order=True)
    print("\n --- Summarized Text ---\n")
    print(construct_sentences_from_ranking(summarised_content))
    with open(f"{fileName}.txt", "w+") as file:
        file.write(construct_sentences_from_ranking(summarised_content))

    # Text keyword extraction.
    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner(skip_stemming=True))
    keyword_extractor = KeywordExtractor(preprocessor, ClassicalIR())
    keywords = keyword_extractor.extract_keywords(text, count=10, raw=False)
    print("\n --- Keywords ---\n")
    print(keywords)
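# Hypothetical entry point; the input file name is a placeholder, and the audio
# track is extracted from it before transcription.
main("lecture.mp4")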
from punctuator import Punctuator

# The same raw transcript is fed to both models so their outputs can be compared.
transcript = 'Uh Now last video we went through the first five steps AGADAP So you just kind of shown here of our uh relative combat power that weve chosen to do a penetration umh how weve laid out our forces arrayed them and uh set our phases and selected our leadership Okay so now we get our battle book page were gonna do our COA statement and sketch and we can drive on with our sketch Alright so heres the page out of the battle book and uh we need to do both our sketch and our statement Now for this Im gonna go ahead and and just start with uh the sketch Now uh we know that were gonna start out in our assembly area Alright so we can go ahead and draw that up here at the top of the page and I wanna alot leave uh quite a bit of room down here to show whats gonna happen for actions on the objective Remember the COA sketch isnt to scale uh Were gonna need uh some kind of minor departure We know thats gonna happen Alright and were gonna leave from the uh assembly area and were gonna move to an ORP right So we gotta put our ORP on here uh and since thats gonna be a movement theres gonna be an axis to get us there Alright and uh thinking through this theres probably some phase line here Alright because were gonna spend phase one in the assembly area phase two moving to the ORP and then phase three is all gonna happen in the ORP so theres probably a phase line here because were gonna act differently at that point'

output_file = open('output.txt', 'w')

p = Punctuator('Demo-Europarl-EN.pcl')
output_file.write('Demo-Europarl-EN.pcl\n\n')
output_file.write(p.punctuate(transcript))
# output_file.write(p.punctuate('this is a test sentence for part 1'))

p3 = Punctuator('INTERSPEECH-T-BRNN.pcl')
output_file.write('\n\nINTERSPEECH-T-BRNN.pcl\n\n')
output_file.write(p3.punctuate(transcript))
# output_file.write(p3.punctuate('this is a test sentence for part 3'))

output_file.close()
import sys

punctuatorSelected = input(
    'Which punctuator would you like to test: \n1. Demo-Europarl-EN.pcl \n2. INTERSPEECH-T-BRNN.pcl\n\n'
)
if punctuatorSelected == '1':
    p = Punctuator('Demo-Europarl-EN.pcl')
    pText = 'Demo-Europarl-EN'
elif punctuatorSelected == '2':
    p = Punctuator('INTERSPEECH-T-BRNN.pcl')
    pText = 'INTERSPEECH-T-BRNN'
else:
    sys.exit('Unknown selection; expected 1 or 2.')
print('\nYou have selected ', pText, '\n')

textfile = sys.argv[1]
with open(textfile, 'r') as file:
    data = file.read().replace('\n', ' ')
data = data.lower()

# Drop filler words before punctuating.
fillerwords = ['uh']
datawords = data.split()
processData = [word for word in datawords if word.lower() not in fillerwords]
finalText = ' '.join(processData)

# Punctuate once and reuse the result.
punctuated = p.punctuate(finalText)
print(punctuated)
with open('output.txt', 'w') as output_file:
    output_file.write(punctuated)
def punctuate_text(text):
    p = Punctuator('models/INTERSPEECH-T-BRNN.pcl')
    print(p.punctuate(text))
def punctuate_conversation(conversation, loc):
    # `loc` is the path to a pre-trained .pcl model file.
    p = Punctuator(loc)
    punctuated_converse = p.punctuate(conversation)
    return punctuated_converse
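# A minimal call, assuming a locally downloaded Demo-Europarl-EN.pcl; both the
# model path and the sample sentence are hypothetical.
print(punctuate_conversation("hello how are you i am fine", "models/Demo-Europarl-EN.pcl"))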
textD = "61 people passed away today. I am very tired, I want to sleep. I only slept for 1 hour last night." p = Punctuator('Demo-Europarl-EN.pcl') # NLP def nlp_parser(text): parser.setup(text) # parser.display_tree() parser.extract_noun_verb() def nummod_parser(text): nparser.setup(text) nparser.get_noun_count_pairs() # Speech to text recognizer = SR.Recognizer() with SR.Microphone() as source: print("Speak:") audio = recognizer.listen(source) try: text_result = recognizer.recognize_google(audio) + "." text_result_punctuated = p.punctuate(text_result) print("You said: " + text_result_punctuated) nlp_parser(text_result_punctuated) # mmod_parser(text_result_punctuated) except SR.UnknownValueError: print("Could not understand audio") except SR.RequestError as e: print("Could not request results; {0}".format(e))
def punctuator_src_test():
    loc = "./src/AI_files/punctuator/Demo-Europarl-EN.pcl"
    p = Punctuator(loc)
    punctuated_converse = p.punctuate("some one")
    return punctuated_converse
def form_valid(self, form):
    self.object = form.save(commit=False)
    video_id = self.object.body.split('?v=')[1].split("&")[0]
    self.object.vid_id = video_id
    print('Does he have it?')
    print(self.object.vid_id)

    # If a post for this video already exists, redirect to it instead of re-scraping.
    if Post.objects.filter(vid_id=self.object.vid_id).exists():
        yo = Post.objects.filter(vid_id=self.object.vid_id)[:1]
        print(yo[0].pk)
        print('REDIRECT!')
        return HttpResponseRedirect(
            reverse('article-detail', kwargs={'pk': str(yo[0].pk), "yt": video_id}))
        # return redirect('article-detail', post.pk post.vid_id+str(yo[0].pk)+'/'+str(video_id))

    data = scrape_url('http://youtube.com/watch?v=' + video_id)
    print(data.title)
    print(data.poster)

    # Build the transcript, inserting a timestamp roughly every 30 seconds.
    srt = YouTubeTranscriptApi.get_transcript(video_id)
    totalduration = 0
    alltext = ""
    for item in srt:
        go = item['start']
        if go - totalduration > 30:
            alltext = alltext + time.strftime('%H:%M:%S', time.gmtime(item['start'])) + item['text'] + " "
            totalduration = item['start']
        else:
            alltext = alltext + item['text'] + " "

    # Turn each HH:MM:SS timestamp into a clickable seek link.
    r = re.findall(r'\d{2}:\d{2}:\d{2}', alltext)
    for item in r:
        alltext = alltext.replace(
            item,
            "<br><a class='ytlink' href='#' type='button' onclick='seek(" +
            str(get_sec(item)) + ")'>" + item + "</a> </br>")
    print(alltext)

    file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
    p = Punctuator(file_)
    punctuated = p.punctuate(alltext)

    self.object.title = data.title
    self.object.title_tag = data.poster
    # totaltext = punctuated.split(".")
    # finaltext = ""
    # for item in totaltext:
    #     finaltext = finaltext + item + "." + "<br><br>"
    iframe = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/PjDw3azfZWI?enablejsapi=1" frameborder="0"></iframe>'''
    embed = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/''' + video_id + '''?enablejsapi=1" frameborder="0"></iframe>'''
    self.object.body = embed + "<br>" + alltext
    self.object.save()
    return HttpResponseRedirect(
        reverse('article-detail', kwargs={'pk': self.object.id, "yt": video_id}))
parts = line.split(" ") if len(parts) > 1: id = parts[0] val = " ".join(parts[1:]) return id, val with open(file, 'r') as fd: while True: line = fd.readline() if not line: break id, valstr = parse_line(line) val = json.loads(valstr) text = val.get("text") text = " ".join(text) text = text.replace("[Music]", " ").replace("\r", " ").replace("\n", " ") text = ' '.join(text.split()) sentences = sent_tokenize(text) punc_text = None punc_needed = True if len(sentences) > 2: punc_needed = False punc_text = text else: punc_text = punc.punctuate(text)
from punctuator import Punctuator
import sys

p = Punctuator('Demo-Europarl-EN.pcl')

textfile = sys.argv[1]
with open(textfile, 'r') as file:
    data = file.read().replace('\n', ' ')
data = data.lower()
print(p.punctuate(data))
import os

from punctuator import Punctuator

p = Punctuator('/home/erviewre/h3podcastbot/.punctuator/Demo-Europarl-EN.pcl')

directory = os.fsencode("/home/erviewre/h3podcastbot/raw_scripts")
file_count = len(os.listdir(directory))
progress = 1
for file in os.listdir(directory):
    print(str(progress) + "/" + str(file_count))
    progress += 1
    filename = os.fsdecode(file)
    with open('/home/erviewre/h3podcastbot/raw_scripts/' + filename, 'r') as open_file:
        data = open_file.read().replace('\n', ' ')
    with open('/home/erviewre/h3podcastbot/punctuated_scripts/' + filename, 'w') as punctuated_file:
        punctuated_file.write(p.punctuate(data))
def handlePunctuation(text):
    # Punctuator library; pre-trained models available at:
    # https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms
    p = Punctuator('Demo-Europarl-EN.pcl')
    return p.punctuate(text)
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel("../models/spellchecker_en.bin")

# Load any audio file of your choice.
speech, rate = librosa.load("../10mintest.mp3", sr=16000)
length = librosa.get_duration(y=speech, sr=16000)

# Split the audio into roughly 10-second chunks.
n_chunks = int(np.ceil(length / 10))
chunks = np.array_split(speech, n_chunks)

def transcriptor(chunks):
    string = ""
    for i in chunks:
        input_values = tokenizer(i, return_tensors='pt').input_values
        # Store logits (non-normalized predictions).
        logits = model(input_values).logits
        # Store predicted ids.
        predicted_ids = torch.argmax(logits, dim=-1)
        # Decode the audio to generate text.
        transcriptions = tokenizer.decode(predicted_ids[0])
        string += transcriptions + " "
    return string

text = transcriptor(chunks)
# print(text)
text = text.lower()
text = punctuator.punctuate(text)
text = corrector.FixFragment(text)
print(text)
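# The snippet above references `model`, `tokenizer`, and `punctuator` without
# defining them. A plausible setup sketch, assuming the Hugging Face wav2vec2
# base checkpoint and a locally downloaded .pcl model; all names and paths here
# are assumptions, not taken from the original.
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
from punctuator import Punctuator

tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
punctuator = Punctuator("../models/Demo-Europarl-EN.pcl")  # hypothetical path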
from punctuator import Punctuator
from pathlib import Path
import os

home = str(Path.home())
model_file = os.path.join(home, ".punctuator", "Demo-Europarl-EN.pcl")
print(f"model_file {model_file}")

# Load the model once and reuse it across calls.
p = Punctuator(model_file)
print("Loaded model in memory")
print(p.punctuate("some text"))
print(p.punctuate("some more text"))
print(p.punctuate("some more more text"))
print(p.punctuate("some many more text that needs punctuation"))