Ejemplo n.º 1
0
def punctuate(text):
    """Run ``text`` through the Punctuator model and return its output.

    The model file is loaded from a fixed absolute path on every call.
    """
    model_path = "C:\\Users\\lm44\\Documents\\Code\\Python\\Sumit Backend\\functions\\INTERSPEECH-T-BRNN.pcl"
    return Punctuator(model_path).punctuate(text)
Ejemplo n.º 2
0
def punctuate_text(text):
    """Punctuate ``text``, strip selected marks and return the result.

    Loads the model at ``models/punctuator1.pcl``, runs ``text`` through it,
    removes every ``?``, ``:``, ``;`` and ``,`` from the output and collapses
    each run of two or more whitespace characters into a single space. The
    cleaned text is printed and returned.
    """
    print("Performing Puntuation ... \n")
    p = Punctuator('models/punctuator1.pcl')
    new_text = p.punctuate(text)
    new_text = re.sub(r"[?:;,]", "", new_text)
    # Raw string: "\s" inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    new_text = re.sub(r"\s\s+", " ", new_text)
    # NOTE(review): the label says "Original text" but the processed text is
    # what gets printed - confirm which one was intended.
    print("Original text is:\n")
    print(new_text)
    print("\n\n")
    return new_text
def correct(begin_of_path, text, language="English"):
    """Spell-correct ``text`` word by word, then punctuate it.

    :param begin_of_path: directory that contains ``data/Demo-Europarl-EN.pcl``
    :param text: raw input string; it is split on single spaces
    :param language: kept for interface compatibility; not used by this function
    :return: the output of ``Punctuator.punctuate`` for the corrected text
    """
    spell = SpellChecker()
    correct_words = []
    for word in text.split(" "):
        suggestion = spell.correction(word)
        # correction() may return None when it has no candidate; keep the
        # original word so the join below cannot fail with a TypeError.
        correct_words.append(word if suggestion is None else suggestion)
    correct_text = ' '.join(correct_words)

    path_to_model = os.path.join(begin_of_path, "data", "Demo-Europarl-EN.pcl")
    p = Punctuator(path_to_model)
    return p.punctuate(correct_text)
Ejemplo n.º 4
0
def punctuates(doo=True):
    """Return the contents of ``transcription.txt``, optionally punctuated.

    :param doo: when it compares equal to ``True`` the text is run through the
        ``hel.pcl`` Punctuator model; otherwise the raw text is returned.
    :return: the punctuated or raw transcription text
    """
    # Read once, up front; the context manager closes the handle, which the
    # original code never did.
    with open("transcription.txt", "r") as transcript:
        text = transcript.read()

    # Equality (not truthiness) is kept on purpose so existing callers that
    # pass non-bool values see the same branch as before.
    if doo == True:  # noqa: E712
        # Imported lazily so the raw-text path works without the package.
        from punctuator import Punctuator

        p = Punctuator('hel.pcl')
        punctuated = p.punctuate(text)
        print("Punctuating Done")
        return punctuated

    return text
Ejemplo n.º 5
0
def addPunctuation(text_file):
    """Punctuate the contents of ``text_file`` and save them to ``notes.txt``.

    :param text_file: path of the file holding the unpunctuated text
    :return: None; the punctuated text is written to ``notes.txt`` in the
        current working directory, replacing any previous content.
    """
    # Load the pre-trained model.
    p = Punctuator('model.pcl')

    # Read the unstructured text; ``with`` closes the handle even on error.
    with open(text_file, "r") as fp:
        text = fp.read()

    sentences = p.punctuate(text)

    # Write the punctuated text out.
    with open("notes.txt", "w") as otp_file:
        otp_file.write(sentences)
Ejemplo n.º 6
0
def punctuate():
    """Punctuate a text file and write the result back, logging progress.

    Reads the source file, runs its content through the Punctuator model and
    writes the punctuated text to the path it was read from. A timestamped
    message is printed before punctuating, before saving and on completion.
    """
    global filename
    global PCL

    def _log(message):
        # Same "YYYY-MM-DD HH:MM:SS" rendering as before, without the
        # locale-sensitive time.ctime() -> strptime() round trip.
        timestamp = datetime.datetime.now().replace(microsecond=0)
        print(f'{timestamp} | {message}')

    with open('****************', 'r') as t:
        source = t.read()

    # Punctuate
    _log('Punctuating chunk')
    p = Punctuator('****************')
    _log('Saving your file')
    punctuated = p.punctuate(source)
    # The original called write() on the handle opened with mode 'r', which
    # always raises io.UnsupportedOperation; reopen the file for writing.
    # NOTE(review): this overwrites the source file - confirm that in-place
    # replacement (rather than appending) is what was intended.
    with open('****************', 'w') as out:
        out.write(punctuated)
    _log('Punctuation complete')
Ejemplo n.º 7
0
def test(request):
    """Render ``test.html`` with the punctuated transcript of a fixed video.

    The caption texts of the hard-coded video are concatenated (each followed
    by a space), the first 5000 characters are punctuated, and the result,
    split on ``.``, is handed to the template under the key ``foo``.
    """
    video_id = "5v1B1R3lEO8"
    captions = YouTubeTranscriptApi.get_transcript(video_id)
    alltext = "".join(caption['text'] + " " for caption in captions)

    print(os.path.join(settings.BASE_DIR))
    model_path = os.path.join(settings.BASE_DIR, 'model.pcl')
    punctuated = Punctuator(model_path).punctuate(alltext[:5000])

    context = {
        'foo': punctuated.split('.'),
    }
    return render(request, 'test.html', context)
Ejemplo n.º 8
0
def fix_text(text_list, is_saved):
    """
    Cleans, punctuates, neural coreferences, and sentencizes the transcript.

    When ``is_saved`` is true the sentences are read back from ``fixed.txt``
    (one per line) and ``text_list`` is ignored; otherwise the whole pipeline
    runs and its sentences are written to ``fixed.txt`` before being returned.

    :param is_saved: True if a version of the fixed text is already saved in a file
    :param text_list: A list of strings; an 'unclean' transcript
    :return: A list of tokenized sentences (every sentence is a Doc object)
    """
    file_name = 'fixed.txt'

    if is_saved:
        with open(file_name, 'r') as fixed:
            saved_lines = [line.replace('\n', '') for line in fixed.readlines()]
        return [nlp(sentence) for sentence in saved_lines]

    fixed_text = ' '.join(text_list)  # convert the list into one string
    # str.replace returns a new string; the original discarded the result, so
    # double spaces were never actually removed.
    fixed_text = fixed_text.replace('  ', ' ')

    print('adding punctuation; please wait a few minutes...')
    punctuator = Punctuator('Demo-Europarl-EN.pcl')
    fixed_text = punctuator.punctuate(fixed_text)

    print('removing interjections; please wait a few more minutes...')
    fixed_text_doc = remove_tokens_by_pos(nlp(fixed_text), 'INTJ')

    print(
        'performing neural coreferencing; please wait for several more minutes...'
    )
    # NOTE(review): neuralcoref is added to the pipeline after the document
    # above was created - confirm that coref_resolved is populated here.
    neuralcoref.add_to_pipe(nlp)
    fixed_text_doc = fixed_text_doc._.coref_resolved

    print('splitting the text into sentences; please keep waiting...')
    fixed_text_list = re.split(r'\.|\?|!', fixed_text_doc)

    with open(file_name, 'w') as fixed:
        for sentence in fixed_text_list:
            fixed.write(sentence + "\n")
    return [nlp(sentence) for sentence in fixed_text_list]
Ejemplo n.º 9
0
def summarize(vid_id):
    """Build an extractive summary of a YouTube video's transcript.

    The transcript is joined, punctuated, and split into sentences. Each
    sentence is scored by adding the frequency of every non-stop-word token
    that occurs in it (substring match on the lower-cased sentence). Sentences
    scoring above 1.2 times the truncated average score are concatenated, each
    preceded by a space.

    :param vid_id: id passed to ``YouTubeTranscriptApi.get_transcript``
    :return: the summary string; empty when no sentence could be scored
    """
    subs = YouTubeTranscriptApi.get_transcript(vid_id)
    text = ' '.join(i['text'] for i in subs)
    p = Punctuator('INTERSPEECH-T-BRNN.pcl')
    text = p.punctuate(text)

    stopWords = set(stopwords.words("english"))
    freqTable = {}
    for word in word_tokenize(text):
        word = word.lower()
        if word in stopWords:
            continue
        freqTable[word] = freqTable.get(word, 0) + 1

    sentences = sent_tokenize(text)
    sentenceValue = {}
    for sentence in sentences:
        lowered = sentence.lower()  # hoisted: it does not change per word
        for word, freq in freqTable.items():
            if word in lowered:
                sentenceValue[sentence] = sentenceValue.get(sentence, 0) + freq

    # Nothing scored (e.g. empty transcript): the average below would divide
    # by zero, so return an empty summary instead.
    if not sentenceValue:
        return ''

    average = int(sum(sentenceValue.values()) / len(sentenceValue))
    threshold = 1.2 * average
    return ''.join(
        " " + sentence
        for sentence in sentences
        if sentence in sentenceValue and sentenceValue[sentence] > threshold
    )
def main(fileName):
    """Transcribe, punctuate, grammar-correct and summarise a media file.

    Steps, in order: extract the audio track to ``<name>.wav``; transcribe it
    in 25-second windows with the Google recognizer; punctuate the transcript;
    run LanguageTool over it and print the matches; summarise the corrected
    text, print the summary and write it to ``<name>.txt``; finally extract
    and print ten keywords from the punctuated (uncorrected) text.

    :param fileName: path of the input media file, including its extension
    """
    # Split on the last dot only, so names such as "talk.part1.mp4" work.
    fileName, fileExt = fileName.rsplit('.', 1)
    print(fileName, fileExt)

    wav_path = f"{fileName}.wav"
    clip = AudioFileClip(f"{fileName}.{fileExt}")
    clip.write_audiofile(wav_path, codec='pcm_s16le')

    # The context manager closes the sound file once its length is known.
    with sf.SoundFile(wav_path) as f:
        audio_dur = len(f) / f.samplerate

    r = sr.Recognizer()
    rec_dur = 25
    full_windows = int(audio_dur / rec_dur)
    # Audio left over after the full windows. The original subtracted only
    # the window count, not the seconds those windows cover.
    remainder = audio_dur - full_windows * rec_dur
    pieces = []

    def _recognize(audio):
        # Best effort: a window that cannot be recognised is skipped, but the
        # failure is reported instead of being swallowed silently.
        try:
            pieces.append(r.recognize_google(audio))
        except Exception as exc:
            print(f"Skipping audio window: {exc!r}")

    with sr.AudioFile(wav_path) as source:
        for _ in range(full_windows):
            _recognize(r.record(source, duration=rec_dur))
        if remainder > 0:
            _recognize(r.record(source, duration=remainder))
        print("Done")

    # Join with a space so the last word of one window is not glued to the
    # first word of the next.
    text = " ".join(pieces)

    p = Punctuator('Demo-Europarl-EN.pcl')
    text = p.punctuate(text)

    tool = language_tool_python.LanguageTool('en-US')

    matches = tool.check(text)
    print(len(matches))

    for lab, match in enumerate(matches):
        print(lab)
        print(match.ruleId, match.replacements)

    text_new = tool.correct(text)

    print(text_new)

    nltk.download('punkt')
    nltk.download('stopwords')

    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner())
    similarity_algorithm = BM25Plus()
    ranker = TextRank()
    ir = ClassicalIR()

    # Text Summarization
    model = Summarizer(preprocessor, similarity_algorithm, ranker, ir)
    summarised_content = model.summarise(text_new,
                                         reduction_ratio=0.80,
                                         preserve_order=True)
    # Build the summary once and reuse it for both the console and the file.
    summary_text = construct_sentences_from_ranking(summarised_content)

    print("\n --- Summarized Text ---\n")
    print(summary_text)

    with open(f"{fileName}.txt", "w+") as file:
        file.write(summary_text)

    # Text Keyword Extraction
    preprocessor = TextPreProcessor(NLTKTokenizer(),
                                    NLTKCleaner(skip_stemming=True))
    keyword_extractor = KeywordExtractor(preprocessor, ClassicalIR())
    # NOTE(review): keywords come from `text`, not the grammar-corrected
    # `text_new` - confirm this is intentional.
    keywords = keyword_extractor.extract_keywords(text, count=10, raw=False)

    print("\n --- Keywords ---\n")
    print(keywords)
Ejemplo n.º 11
0
from punctuator import Punctuator

# Sample unpunctuated transcript fed to both models. It is defined once so
# the two runs are guaranteed to see exactly the same input.
sample_text = 'Uh Now last video we went through the first five steps AGADAP So you just kind of shown here of our uh relative combat power that weve chosen to do a penetration umh how weve laid out our forces arrayed them and uh set our phases and selected our leadership Okay so now we get our battle book page were gonna do our COA statement and sketch and we can drive on with our sketch Alright so heres the page out of the battle book and uh we need to do both our sketch and our statement Now for this Im gonna go ahead and and just start with uh the sketch Now uh we know that were gonna start out in our assembly area Alright so we can go ahead and draw that up here at the top of the page and I wanna alot leave uh quite a bit of room down here to show whats gonna happen for actions on the objective Remember the COA sketch isnt to scale uh Were gonna need uh some kind of minor departure We know thats gonna happen Alright and were gonna leave from the uh assembly area and were gonna move to an ORP right So we gotta put our ORP on here uh and since thats gonna be a movement theres gonna be an axis to get us there Alright and uh thinking through this theres probably some phase line here Alright because were gonna spend phase one in the assembly area phase two moving to the ORP and then phase three is all gonna happen in the ORP so theres probably a phase line here because were gonna act differently at that point'

p = Punctuator('Demo-Europarl-EN.pcl')

# The context manager flushes and closes output.txt even when loading or
# running the second model fails part-way through.
with open('output.txt', 'w') as output_file:
    # NOTE(review): this header spells the model name with an underscore
    # while the file loaded above uses a hyphen - confirm which is intended.
    output_file.write('Demo-Europarl_EN.pcl\n\n')
    output_file.write(p.punctuate(sample_text))

    p3 = Punctuator('INTERSPEECH-T-BRNN.pcl')
    output_file.write('\n\nINTERSPEECH-T-BRNN.pcl\n\n')
    output_file.write(p3.punctuate(sample_text))
Ejemplo n.º 12
0
import sys

# Menu choice -> (model file, display name).
PUNCTUATOR_CHOICES = {
    '1': ('Demo-Europarl-EN.pcl', 'Demo-Europarl-EN'),
    '2': ('INTERSPEECH-T-BRNN.pcl', 'INTERSPEECH-T-BRNN'),
}

punctuatorSelected = input(
    'Which punctuator would you like to test: \n1. Demo-Europarl-En.pcl \n2. INTERSPEECH-T-BRNN.pcl\n\n'
).strip()

# Any other answer used to fall through and crash later with a NameError on
# `pText`/`p`; stop early with a clear message instead.
if punctuatorSelected not in PUNCTUATOR_CHOICES:
    sys.exit(f'Unknown selection {punctuatorSelected!r}; expected 1 or 2.')

modelFile, pText = PUNCTUATOR_CHOICES[punctuatorSelected]
p = Punctuator(modelFile)

print('\nYou have selected ', pText, '\n')

textfile = sys.argv[1]

with open(textfile, 'r') as file:
    data = file.read().replace('\n', ' ')
    data = data.lower()

fillerwords = ['uh']
datawords = data.split()

# `data` is already lower-cased, so the words can be compared directly.
processData = [word for word in datawords if word not in fillerwords]
finalText = ' '.join(processData)

# Punctuate once and reuse the result for both the console and the file.
punctuatedText = p.punctuate(finalText)
print(punctuatedText)

# The context manager closes output.txt, which was previously left open.
with open('output.txt', 'w') as output_file:
    output_file.write(punctuatedText)
Ejemplo n.º 13
0
def punctuate_text(text):
    """Print the output of the INTERSPEECH-T-BRNN Punctuator model for ``text``."""
    model = Punctuator('models/INTERSPEECH-T-BRNN.pcl')
    result = model.punctuate(text)
    print(result)
Ejemplo n.º 14
0
def punctuate_conversation(conversation, loc):
    """Punctuate ``conversation`` with the model file found at ``loc``.

    :param conversation: text handed to ``Punctuator.punctuate``
    :param loc: path of the model file to load
    :return: whatever ``Punctuator.punctuate`` returns for ``conversation``
    """
    return Punctuator(loc).punctuate(conversation)
Ejemplo n.º 15
0
# Sample sentences; not referenced by the rest of the visible script.
textD = "61 people passed away today. I am very tired, I want to sleep. I only slept for 1 hour last night."

# Punctuation model, loaded once and used by the speech-to-text code below.
p = Punctuator('Demo-Europarl-EN.pcl')
# NLP
def nlp_parser(text):
    """Set up the module-level ``parser`` with ``text`` and extract noun/verb data."""
    parser.setup(text)
    parser.extract_noun_verb()

def nummod_parser(text):
    """Set up the module-level ``nparser`` with ``text`` and request its noun/count pairs."""
    nparser.setup(text)
    nparser.get_noun_count_pairs()

# Speech to text
# Capture one utterance from the default microphone.
recognizer = SR.Recognizer()
with SR.Microphone() as source:
    print("Speak:")
    audio = recognizer.listen(source)

# Recognise the utterance, punctuate it and hand it to the NLP parser.
try:
    text_result = recognizer.recognize_google(audio) + "."
    text_result_punctuated = p.punctuate(text_result)
    print(f"You said: {text_result_punctuated}")
    nlp_parser(text_result_punctuated)
except SR.UnknownValueError:
    print("Could not understand audio")
except SR.RequestError as err:
    print(f"Could not request results; {err}")

Ejemplo n.º 16
0
def punctuator_src_test():
    """Load the bundled Demo-Europarl-EN model and punctuate a fixed two-word sample.

    :return: the model's output for the text ``"some one"``
    """
    model = Punctuator("./src/AI_files/punctuator/Demo-Europarl-EN.pcl")
    return model.punctuate("some one")
Ejemplo n.º 17
0
    def form_valid(self, form):
        """Create a post from a submitted YouTube URL and redirect to it.

        The video id is taken from the ``?v=`` part of the submitted body. If
        a post with that id already exists the response redirects to it.
        Otherwise the video's metadata and transcript are fetched, a
        ``HH:MM:SS`` seek link is inserted whenever more than 30 seconds have
        passed since the previous one, and the post is saved with an embedded
        player followed by the transcript.

        :param form: the validated form; saved with ``commit=False``
        :return: a redirect to the ``article-detail`` page of the post
        """
        self.object = form.save(commit=False)
        # NOTE(review): raises IndexError when the body has no "?v=" part
        # (e.g. a short link) - confirm the form validates the URL shape.
        video_id = self.object.body.split('?v=')[1].split("&")[0]
        self.object.vid_id = video_id
        print('Does he have it?')
        print(self.object.vid_id)

        # A single query replaces exists() plus repeated indexing of a sliced
        # queryset, each of which hit the database again.
        existing = Post.objects.filter(vid_id=video_id).first()
        if existing is not None:
            print(existing.pk)
            print('REDIRECT!')
            return HttpResponseRedirect(
                reverse('article-detail',
                        kwargs={
                            'pk': str(existing.pk),
                            "yt": video_id
                        }))

        data = scrape_url('http://youtube.com/watch?v=' + video_id)
        print(data.title)
        print(data.poster)
        srt = YouTubeTranscriptApi.get_transcript(video_id)

        # Prefix a caption with its HH:MM:SS start time whenever more than
        # 30 seconds have passed since the last caption that got one.
        totalduration = 0
        parts = []
        for item in srt:
            if item['start'] - totalduration > 30:
                parts.append(
                    time.strftime('%H:%M:%S', time.gmtime(item['start'])))
                totalduration = item['start']
            parts.append(item['text'] + " ")
        alltext = "".join(parts)

        def _seek_link(match):
            # Wrap one timestamp in a link that seeks the embedded player.
            stamp = match.group(0)
            return ("<br><a class='ytlink' href='#' type='button' onclick='seek(" +
                    str(get_sec(stamp)) + ")'>" + stamp + "</a> </br>")

        # Raw string: "\d" in a plain literal is an invalid escape sequence.
        # One re.sub pass rewrites each occurrence exactly once; the previous
        # findall + str.replace loop wrapped a repeated timestamp twice.
        # NOTE(review): caption text is inserted into HTML unescaped - confirm
        # how the body is rendered.
        alltext = re.sub(
            r'(?:[0123456789]\d|2[0123456789]):(?:[0123456789]\d):(?:[0123456789]\d)',
            _seek_link, alltext)

        print(alltext)
        file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
        p = Punctuator(file_)
        # NOTE(review): the punctuated text is computed but never used; the
        # stored body below is built from `alltext`. Confirm whether this
        # model run can be removed or its result should be stored.
        punctuated = p.punctuate(alltext)

        self.object.title = data.title
        self.object.title_tag = data.poster
        embed = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/''' + video_id + '''?enablejsapi=1" frameborder="0"></iframe>'''
        self.object.body = embed + "<br>" + alltext

        self.object.save()
        return HttpResponseRedirect(
            reverse('article-detail',
                    kwargs={
                        'pk': self.object.id,
                        "yt": video_id
                    }))
Ejemplo n.º 18
0
    parts = line.split(" ")
    if len(parts) > 1:
        id = parts[0]
        val = " ".join(parts[1:])
        return id, val


with open(file, 'r') as fd:
    # Iterating the handle yields one line at a time until end of file.
    for line in fd:
        parsed = parse_line(line)
        # parse_line returns None for a line without a space-separated
        # payload (e.g. a blank line); skip it instead of failing on unpack.
        if parsed is None:
            continue

        id, valstr = parsed
        val = json.loads(valstr)
        text = val.get("text")
        text = " ".join(text)
        text = text.replace("[Music]", " ").replace("\r",
                                                    " ").replace("\n", " ")
        # Collapse every run of whitespace into a single space.
        text = ' '.join(text.split())
        sentences = sent_tokenize(text)

        # Text that already splits into more than two sentences is used
        # as-is; anything else is sent through the punctuator.
        if len(sentences) > 2:
            punc_needed = False
            punc_text = text
        else:
            punc_needed = True
            punc_text = punc.punctuate(text)
Ejemplo n.º 19
0
from punctuator import Punctuator
import sys

# Load the model, then punctuate the file named by the first CLI argument.
p = Punctuator('Demo-Europarl-EN.pcl')

textfile = sys.argv[1]

with open(textfile, 'r') as file:
    data = file.read()

# Flatten newlines to spaces and lower-case before punctuating.
data = data.replace('\n', ' ').lower()

print(p.punctuate(data))
Ejemplo n.º 20
0
from punctuator import Punctuator
p = Punctuator('/home/erviewre/h3podcastbot/.punctuator/Demo-Europarl-EN.pcl')

import os

# Punctuate every file in raw_scripts/ into punctuated_scripts/ under the
# same file name, printing "<n>/<total>" before each one.
directory = os.fsencode("/home/erviewre/h3podcastbot/raw_scripts")
entries = os.listdir(directory)
file_count = len(entries)

for progress, file in enumerate(entries, start=1):
    print(f"{progress}/{file_count}")
    filename = os.fsdecode(file)
    source_path = '/home/erviewre/h3podcastbot/raw_scripts/' + filename
    target_path = '/home/erviewre/h3podcastbot/punctuated_scripts/' + filename
    with open(source_path, 'r') as open_file:
        data = open_file.read().replace('\n', ' ')
        with open(target_path, 'w') as punctuated_file:
            punctuated_file.write(p.punctuate(data))
Ejemplo n.º 21
0
def handlePunctuation(text):
    """Return the Demo-Europarl-EN Punctuator model's output for ``text``."""
    # Punctuator library
    # https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms
    model = Punctuator('Demo-Europarl-EN.pcl')
    punctuated = model.punctuate(text)
    return punctuated
Ejemplo n.º 22
0
# Spell corrector, loaded with the English language model.
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel("../models/spellchecker_en.bin")

# load any audio file of your choice
speech, rate = librosa.load("../10mintest.mp3", sr=16000)
# `y` is passed by keyword: librosa 0.10 made get_duration's arguments
# keyword-only, so the positional form raises TypeError there.
lenght = librosa.get_duration(y=speech, sr=16000)
# One section per started 10 seconds of audio; the count is an integer.
n_chuncks = int(np.ceil(lenght / 10))
chuncks = np.array_split(speech, n_chuncks)


def transcriptor(chunks):
    """Transcribe each audio chunk and return the concatenated text.

    :param chunks: iterable of audio arrays, each passed to the module-level
        ``tokenizer`` and ``model``
    :return: the decoded transcriptions, each followed by a single space
    """
    pieces = []
    # Iterate the argument: the original looped over the module-level
    # ``chuncks`` and silently ignored whatever the caller passed in.
    for chunk in chunks:
        input_values = tokenizer(chunk, return_tensors='pt').input_values
        # Store logits (non-normalized predictions)
        logits = model(input_values).logits
        # Store predicted id's
        predicted_ids = torch.argmax(logits, dim=-1)
        # decode the audio to generate text
        pieces.append(tokenizer.decode(predicted_ids[0]) + " ")
    return "".join(pieces)


# Transcribe and lower-case, then punctuate and spell-correct the result.
text = transcriptor(chuncks).lower()
text = corrector.FixFragment(punctuator.punctuate(text))
print(text)
Ejemplo n.º 23
0
from punctuator import Punctuator
from pathlib import Path
import os

# Load the model from ~/.punctuator once, then punctuate several samples.
home = str(Path.home())
model_file = os.path.join(home, ".punctuator", "Demo-Europarl-EN.pcl")
print(f"model_file {model_file}")
p = Punctuator(model_file)
print("Loaded model in memory")

for sample in (
        "some text",
        "some more text",
        "some more more text",
        "some many more text that needs punctuation",
):
    print(p.punctuate(sample))