Example #1
from punctuator import Punctuator

def punctuate(text):
    p = Punctuator(
        "C:\\Users\\lm44\\Documents\\Code\\Python\\Sumit Backend\\functions\\INTERSPEECH-T-BRNN.pcl"
    )
    punctuated = p.punctuate(text)

    return punctuated
Example #2
import re
from punctuator import Punctuator

def punctuate_text(text):
    print("Performing Punctuation ... \n")
    p = Punctuator('models/punctuator1.pcl')
    new_text = p.punctuate(text)
    new_text = re.sub(r"[?:;,]", "", new_text)
    new_text = re.sub(r"\s\s+", " ", new_text)
    print("Punctuated text is:\n")
    print(new_text)
    print("\n\n")
    return new_text
Example #3
import os
from spellchecker import SpellChecker
from punctuator import Punctuator

def correct(begin_of_path, text, language="English"):
    # text is currently a raw string
    words = text.split(" ")
    correct_words = []
    spell = SpellChecker()
    for word in words:
        correct_words.append(spell.correction(word))
    separator = ' '
    correct_text = separator.join(correct_words)
    path_to_model = os.path.join(begin_of_path, "data", "Demo-Europarl-EN.pcl")
    p = Punctuator(path_to_model)
    correct_text_with_punct = p.punctuate(correct_text)
    return correct_text_with_punct
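A minimal call sketch for the helper above, assuming pyspellchecker is installed and the Demo-Europarl-EN.pcl model sits under <project root>/data; the root path and sample text are illustrative.

import os

# hypothetical project root; correct() expects the model at <root>/data/Demo-Europarl-EN.pcl
project_root = os.path.dirname(os.path.abspath(__file__))
print(correct(project_root, "ths is a smaple sentence with no punctuation"))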
Example #4
def punctuates(doo=True):
    if doo:
        with open("transcription.txt", "r") as f:
            text = f.read()
        from punctuator import Punctuator

        p = Punctuator('hel.pcl')
        punctuated = p.punctuate(text)
        print("Punctuating Done")
        return punctuated
    else:
        with open("transcription.txt", "r") as f:
            text = f.read()
        return text
Example #5
from punctuator import Punctuator

def addPunctuation(text_file):
    #load the pre-trained model
    p = Punctuator('model.pcl')

    #read unstructured text from the file
    fp = open(text_file, "r")
    text = fp.read()

    #punctuate the read text
    sentences = p.punctuate(text)
    fp.close()

    #write punctuated text into the file
    otp_file = open("notes.txt", "w")
    otp_file.write(sentences)
    otp_file.close()
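A one-line usage sketch for the function above; the input file name is illustrative, and the punctuated text is written to notes.txt as coded.

addPunctuation("transcript.txt")  # punctuated text ends up in notes.txt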
Example #6
import time
import datetime
from punctuator import Punctuator

def punctuate():
    global filename
    global PCL
    with open('****************', 'r') as t:
        source = t.read()
    # Punctuate
    timestamp = datetime.datetime.strptime(time.ctime(),
                                           "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Punctuating chunk')
    p = Punctuator('****************')
    timestamp = datetime.datetime.strptime(time.ctime(),
                                           "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Saving your file')
    # reopen the file for writing before saving the punctuated text
    with open('****************', 'w') as t:
        t.write(p.punctuate(source))
    timestamp = datetime.datetime.strptime(time.ctime(),
                                           "%a %b %d %H:%M:%S %Y")
    print(f'{timestamp} | Punctuation complete')
Example #7
def Transcription(videoFile):
    try:
        videoToWav(videoFile)

        r = sr.Recognizer()
        audio_clip = sr.AudioFile("extracted.wav")
        with audio_clip as source:
            audio_file = r.record(source)
        print("Please wait ...")
        resultTemp = r.recognize_google(audio_file, language="en-EN")
        punctuator = Punctuator('en')
        resultTemp = punctuator.punct([resultTemp], batch_size=32)
        empty = " "
        resultFinal3 = empty.join(resultTemp)

        print("Speech to text conversion successfull.")

        return resultFinal3
    except Exception as e:
        print("Attempt failed -- ", e)
Example #8
def test(request):
    video_id = "5v1B1R3lEO8"
    srt = YouTubeTranscriptApi.get_transcript(video_id)
    alltext = ""
    for item in srt:
        alltext = alltext + item['text'] + " "
    print(os.path.join(settings.BASE_DIR))
    file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
    p = Punctuator(file_)
    punctuated = p.punctuate(alltext[:5000])
    specialtag = punctuated.split('.')
    # newpara = ""
    # for item in specialtag:
    # 	newpara = newpara + item+".<br>"

    #specialtag = alltext

    return render(request, 'test.html', {
        'foo': specialtag,
    })
Example #9
def fix_text(text_list, is_saved):
    """
    Cleans, punctuates, neural coreferences, and sentencizes the transcript.
    :param is_saved: True if a version of the fixed text is already saved in a file
    :param text_list: A list of strings; an 'unclean' transcript
    :return: A list of tokenized sentences (every sentence is a Doc object)
    """
    file_name = 'fixed.txt'

    if is_saved:
        with open(file_name, 'r') as fixed:
            fixed_text_list = fixed.readlines()
        fixed_text_list = [text.replace('\n', '') for text in fixed_text_list]
        fixed_text_list = [nlp(sentence) for sentence in fixed_text_list]
        return fixed_text_list

    else:
        fixed_text = ' '.join(text_list)  # convert the list into one string
        fixed_text = fixed_text.replace('  ', ' ')  # remove double spaces

        print('adding punctuation; please wait a few minutes...')
        punctuator = Punctuator('Demo-Europarl-EN.pcl')
        fixed_text = punctuator.punctuate(fixed_text)

        print('removing interjections; please wait a few more minutes...')
        fixed_text_doc = remove_tokens_by_pos(nlp(fixed_text), 'INTJ')

        print(
            'performing neural coreferencing; please wait for several more minutes...'
        )
        neuralcoref.add_to_pipe(nlp)
        fixed_text_doc = fixed_text_doc._.coref_resolved

        print('splitting the text into sentences; please keep waiting...')
        fixed_text_list = re.split(r'\.|\?|!', fixed_text_doc)

        with open(file_name, 'w') as fixed:
            for sentence in fixed_text_list:
                fixed.write(sentence + "\n")
        fixed_text_list = [nlp(sentence) for sentence in fixed_text_list]
        return fixed_text_list
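fix_text calls a remove_tokens_by_pos helper that is not shown here; a minimal sketch of what it plausibly does, assuming the module-level spaCy nlp pipeline used elsewhere in this example (the name and signature come from the call above).

def remove_tokens_by_pos(doc, pos):
    # keep every token whose part-of-speech differs from `pos` (e.g. 'INTJ')
    # and re-parse the remaining text so a Doc object is returned
    return nlp(' '.join(token.text for token in doc if token.pos_ != pos))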
Example #10
    def get_punctuator_model(self) -> Punctuator:
        """Returns a punctuator model. It will reuse the same punctuator model when
        calling this method multiple times, i.e. a singleton.

        Returns:
            Punctuator: the punctuator model
        """
        if self._punctuator_model is None:
            print("Loading punctuator model..")
            model = str(Path("/app/chappy/punctuator/INTERSPEECH-T-BRNN.pcl").resolve())
            self._punctuator_model = Punctuator(model)  # use pretrained model
            print("Completed loading punctuator model.\n")
        return self._punctuator_model
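The method above is shown without its class; a minimal sketch of the surrounding container, assuming only that _punctuator_model starts out as None (the class name is illustrative).

class PunctuatorProvider:
    def __init__(self):
        # cache slot for the lazily loaded model
        self._punctuator_model = None

    # get_punctuator_model(), as defined above, loads the .pcl file on the
    # first call and returns the cached Punctuator instance afterwards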
Example #11
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from punctuator import Punctuator
from youtube_transcript_api import YouTubeTranscriptApi

def summarize(vid_id):
    subs = YouTubeTranscriptApi.get_transcript(vid_id)
    sentences = [i['text'] for i in subs]
    text = ' '.join(sentences)
    p = Punctuator('INTERSPEECH-T-BRNN.pcl')
    text = p.punctuate(text)

    stopWords = set(stopwords.words("english")) 
    words = word_tokenize(text) 
    freqTable = dict() 
    for word in words: 
        word = word.lower() 
        if word in stopWords: 
            continue
        if word in freqTable: 
            freqTable[word] += 1
        else: 
            freqTable[word] = 1
    sentences = sent_tokenize(text) 
    sentenceValue = dict() 
    for sentence in sentences: 
        for word, freq in freqTable.items(): 
            if word in sentence.lower(): 
                if sentence in sentenceValue: 
                    sentenceValue[sentence] += freq 
                else: 
                    sentenceValue[sentence] = freq 
    sumValues = 0
    for sentence in sentenceValue: 
        sumValues += sentenceValue[sentence] 
    average = int(sumValues / len(sentenceValue)) 
    summary = '' 
    for sentence in sentences: 
        if (sentence in sentenceValue) and (sentenceValue[sentence] > (1.2 * average)): 
            summary += " " + sentence 
    return summary
Example #12
def main():

    words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(
        EMBEDDING_FILE)

    punctuator = Punctuator(word_to_index, None)

    punctuator.load_model(MODEL_FILE)
    punctuator.load_weights(WEIGHTS_FILE)

    examples = [
        "good morning chairman who I saw and members of the committee it's my pleasure to be here today I'm Elizabeth Ackles director of the office of rate payer advocates and I appreciate the chance to present on oris key activities from 2017 I have a short presentation and I'm going to move through it really quickly because you've had a long morning already and be happy to answer any questions that you have"
    ]
    for example in examples:
        words = example.split()
        x = punctuator.create_live_data(words)
        for s in x:
            prediction = punctuator.predict(s)
            result = punctuator.add_punctuation(prediction, words)
            print(result)
Example #13
import json
from .word_analyzer import WordAnalyzer
from .script_analyzer import ScriptAnalyzer
from punctuator import Punctuator

target_video_id = "TLnUJzueBOQ"
cbc_kid = 'SuSTBXGiOsw'
comedy_central = 'fKSiol1uczc'
bbc_news = 'hFAROEKiHl8'

WA = WordAnalyzer()
SA = ScriptAnalyzer()
# Place model files at '~/.punctuator'. Download URL: https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms
# model list: Demo-Europarl-EN.pcl INTERSPEECH-T-BRNN-pre.pcl INTERSPEECH-T-BRNN.pcl
# (a portable path sketch follows this example)
P = Punctuator('Demo-Europarl-EN.pcl')


def analyzeAll(videoId):
    sa_result = json.loads(SA.analyzeScript(videoId))
    print('script analyze ok')
    punc_script = P.punctuate(sa_result['script'])  # script with punctuation marks added
    print('punctuator ok')
    wa_result = json.loads(WA.analyzeText(punc_script))
    print('word analyze ok')

    analyze_result = {}

    analyze_result['videoId'] = sa_result['videoId']
    analyze_result['script'] = punc_script
    analyze_result['totalWords'] = wa_result['Total_words']
    analyze_result['totalUniqueWords'] = wa_result['Total_unique_words']
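As noted in the comment above, the pretrained models are expected under ~/.punctuator; a small sketch for building that path portably, assuming the file names listed in the comment.

from pathlib import Path
from punctuator import Punctuator

model_file = Path.home() / ".punctuator" / "Demo-Europarl-EN.pcl"
P = Punctuator(str(model_file))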
Example #14
def main():
    """Train a model using lines of text contained in a file
    and evaluates the model.
    """

    #read glove vecs
    words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(
        EMBEDDING_FILE)
    #create word embedding matrix
    embedding_matrix = create_emb_matrix(word_to_index, word_to_vec_map)
    print('shape of embedding_matrix:', embedding_matrix.shape)

    #load training text from a file
    utterances = load_text_data(TEXT_FILE)
    print(utterances[0])

    #create an instance of Punctuator and create training data
    punctuator = Punctuator(word_to_index, None)
    X, Y = punctuator.create_training_data(utterances, False)

    #if a model already exists, load the model
    if os.path.isfile(MODEL_FILE):
        punctuator.load_model(MODEL_FILE)
    else:
        model = BidirectionalGruWithGru.create_model(
            input_shape=(X.shape[1], ),
            embedding_matrix=embedding_matrix,
            vocab_len=len(word_to_index),
            n_d1=128,
            n_d2=128,
            n_c=len(punctuator.labels))
        print(model.summary())
        punctuator.__model__ = model

    #if the model has been already trained, use the pre-trained weights
    if os.path.isfile(WEIGHTS_FILE):
        punctuator.load_weights(WEIGHTS_FILE)

    #shuffle the training data
    shuffle(X, Y)

    denom_Y = Y.swapaxes(0, 1).sum((0, 1))
    print('Summary of Y:', denom_Y)

    print('shape of X:', X.shape)
    print(X[0:10])
    print('shape of Y:', Y.shape)
    print(Y[0:10])

    #define optimizer and compile the model
    opt = Adam(lr=0.007, beta_1=0.9, beta_2=0.999, decay=0.01)
    punctuator.compile(opt,
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])

    #split the training data into training set, test set, and dev set
    t_size = int(X.shape[0] * 0.9)
    train_X, train_Y = X[:t_size], Y[:t_size]
    test_X, test_Y = X[t_size:-DEV_SIZE], Y[t_size:-DEV_SIZE]
    dev_X, dev_Y = X[-DEV_SIZE:], Y[-DEV_SIZE:]

    print(train_Y.swapaxes(0, 1).sum((0, 1)))
    print(test_Y.swapaxes(0, 1).sum((0, 1)))

    #train the model
    punctuator.fit([train_X], train_Y, batch_size=BATCH, epochs=EPOCH)
    punctuator.save_model(MODEL_FILE)
    punctuator.save_weights(WEIGHTS_FILE)

    #evaluate the model on the dev set (or the test set)
    for i, example in enumerate(dev_X):
        prediction = punctuator.predict(example)
        punctuator.check_result(prediction, dev_Y[i])

    #manually evaluate the model on an example
    examples = [
        "good morning chairman who I saw and members of the committee it's my pleasure to be here today I'm Elizabeth Ackles director of the office of rate payer advocates and I appreciate the chance to present on oris key activities from 2017 I have a short presentation and I'm going to move through it really quickly because you've had a long morning already and be happy to answer any questions that you have"
    ]
    for example in examples:
        words = example.split()
        x = punctuator.create_live_data(words)
        print(x)
        for s in x:
            print(s)
            prediction = punctuator.predict(s)
            result = punctuator.add_punctuation(prediction, words)
            print(result)
Example #15
import math
import datetime
import numpy as np
import config
import tensorflow as tf
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_jwt_extended import JWTManager
from punctuator import Punctuator
# FullTokenizer here is assumed to come from the bert-for-tf2 package
from bert.tokenization.bert_tokenization import FullTokenizer

app = Flask(__name__)
app.config.from_object(config.ProductionConfig)
app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
db = SQLAlchemy(app)
jwt = JWTManager(app)

punctuate_model_name = 'PT_Punctuator.pcl'
punctuate_model_directory = './punctuate_model/'
punctuate_model_path = punctuate_model_directory + punctuate_model_name
app.config['punctuate_model_path'] = punctuate_model_path
punctuator_model = Punctuator(app.config['punctuate_model_path'])

classifier_model_name = 'saved_model/my_model'
classifier_model_directory = './classifier_model/'
classifier_model_path = classifier_model_directory + classifier_model_name
app.config['classifier_model_path'] = classifier_model_path
classifier_model = tf.keras.models.load_model(
    app.config['classifier_model_path'])
vocab_path = classifier_model_directory + 'vocab.txt'
tokenizer = FullTokenizer(vocab_file=vocab_path)


def punctuateTextFile(file_name):
    with open(file_name, "r") as file:
        text_to_punctuate = file.read()
        text_to_punctuate = text_to_punctuate.lower()
Example #16
from punctuator import Punctuator

p = Punctuator('Models/model.pcl')


def auto_punctuation(text):
    return p.punctuate(text)
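A quick usage sketch for the wrapper above; the input string is illustrative.

print(auto_punctuation("hello everyone thanks for joining today we have a lot to cover"))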
Example #17
from punctuator import Punctuator

def handlePunctuation(text):
    #Punctuator library
    #https://drive.google.com/drive/folders/0B7BsN5f2F1fZQnFsbzJ3TWxxMms
    p = Punctuator('Demo-Europarl-EN.pcl')
    return p.punctuate(text)
Example #18
from transformers import pipeline
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import numpy as np
from punctuator import Punctuator
import jamspell

# load pre-trained model and tokenizer
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
punctuator = Punctuator("../models/INTERSPEECH-T-BRNN2.pcl")
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel("../models/spellchecker_en.bin")

# load any audio file of your choice
speech, rate = librosa.load("../10mintest.mp3", sr=16000)
length = librosa.get_duration(y=speech, sr=16000)
n_chunks = int(np.ceil(length / 10))
chunks = np.array_split(speech, n_chunks)


def transcriptor(chunks):
    string = ""
    for chunk in chunks:
        input_values = tokenizer(chunk, return_tensors='pt').input_values
        # Store logits (non-normalized predictions)
        logits = model(input_values).logits
        # Store predicted id's
        predicted_ids = torch.argmax(logits, dim=-1)
        # decode the audio to generate text
        string += tokenizer.batch_decode(predicted_ids)[0] + " "
    return string
Example #19
def main(fileName):
    fileName, fileExt = fileName.split('.')
    print(fileName, fileExt)
    # os.system(f"ffmpeg -i C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.{fileExt} -ab 160k -ac 2 -ar 44100 -vn C:\\Users\\Chinmay\\CSE\\pyCode\\Unysis\\UNA-Unisys-Natural-Assistant-main\\UNA-Unisys-Natural-Assistant-main\\{fileName}.wav")

    # ipFile = ffmpeg.input(fileName + fileExt)
    # opFile = ffmpeg.output(ipFile, fileName + ".wav")

    clip = AudioFileClip(f"{fileName}.{fileExt}")
    clip.write_audiofile(f"{fileName}.wav", codec='pcm_s16le')

    f = sf.SoundFile(f'{fileName}.wav')
    audio_dur = len(f) / f.samplerate

    r = sr.Recognizer()
    text = ""
    rec_dur = 25

    with sr.AudioFile(f'{fileName}.wav') as source:
        for x in range(0, int(audio_dur / rec_dur)):
            audio = r.record(source, duration=rec_dur)
            try:
                new_txt = r.recognize_google(audio)
                text = text + " " + new_txt
            except:
                pass

        # transcribe whatever is left after the full 25-second chunks
        audio = r.record(source,
                         duration=(audio_dur - rec_dur * int(audio_dur / rec_dur)))
        try:
            new_txt = r.recognize_google(audio)
            text = text + " " + new_txt
        except:
            pass

        print("Done")

    p = Punctuator('Demo-Europarl-EN.pcl')
    text = p.punctuate(text)

    tool = language_tool_python.LanguageTool('en-US')

    matches = tool.check(text)
    print(len(matches))

    for lab in range(len(matches)):
        print(lab)
        print(matches[lab].ruleId, matches[lab].replacements)

    text_new = tool.correct(text)

    print(text_new)

    nltk.download('punkt')
    nltk.download('stopwords')

    preprocessor = TextPreProcessor(NLTKTokenizer(), NLTKCleaner())
    similarity_algorithm = BM25Plus()
    ranker = TextRank()
    ir = ClassicalIR()

    # Text Summarization
    model = Summarizer(preprocessor, similarity_algorithm, ranker, ir)
    summarised_content = model.summarise(text_new,
                                         reduction_ratio=0.80,
                                         preserve_order=True)

    print("\n --- Summarized Text ---\n")
    print(construct_sentences_from_ranking(summarised_content))

    with open(f"{fileName}.txt", "w+") as file:
        file.write(construct_sentences_from_ranking(summarised_content))

    # Text Keyword Extraction
    preprocessor = TextPreProcessor(NLTKTokenizer(),
                                    NLTKCleaner(skip_stemming=True))
    keyword_extractor = KeywordExtractor(preprocessor, ClassicalIR())
    keywords = keyword_extractor.extract_keywords(text, count=10, raw=False)

    print("\n --- Keywords ---\n")
    print(keywords)
Example #20
from punctuator import Punctuator
p = Punctuator('/home/erviewre/h3podcastbot/.punctuator/Demo-Europarl-EN.pcl')

import os

directory = os.fsencode("/home/erviewre/h3podcastbot/raw_scripts")
file_count = len(os.listdir(directory))
progress = 1

for file in os.listdir(directory):
    print(str(progress) + "/" + str(file_count))
    progress += 1
    filename = os.fsdecode(file)
    with open('/home/erviewre/h3podcastbot/raw_scripts/' + filename,
              'r') as open_file:
        data = open_file.read().replace('\n', ' ')
        with open('/home/erviewre/h3podcastbot/punctuated_scripts/' + filename,
                  'w') as punctuated_file:
            punctuated_file.write(p.punctuate(data))
Example #21
from punctuator import Punctuator

def punctuate_text(text):
    p = Punctuator('models/INTERSPEECH-T-BRNN.pcl')
    print(p.punctuate(text))
Example #22
import os
import subprocess
import time
import logging
import uuid
from speech_recognizer import SpeechRecognizer
from punctuator import Punctuator
from number_utils.text2numbers import TextToNumbers

speech_recognizer = SpeechRecognizer()
punctuator = Punctuator(model_path="data/punctuator")
text2numbers = TextToNumbers()


class FileHandler:
    @staticmethod
    def get_recognized_text(blob):
        try:
            filename = str(uuid.uuid4())
            os.makedirs('./records', exist_ok=True)
            new_record_path = os.path.join('./records', filename + '.webm')
            blob.save(new_record_path)
            new_filename = filename + '.wav'
            converted_record_path = FileHandler.convert_to_wav(
                new_record_path, new_filename)
            response_models_result = FileHandler.get_models_result(
                converted_record_path)
            return 0, new_filename, response_models_result
        except Exception as e:
            logging.exception(e)
            return 1, None, str(e)
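FileHandler.convert_to_wav is referenced above but not shown; a plausible sketch using the subprocess module imported at the top, assuming an ffmpeg binary is on PATH (the signature is inferred from the call site).

    @staticmethod
    def convert_to_wav(record_path, new_filename):
        # transcode the uploaded .webm blob to 16 kHz mono WAV with ffmpeg
        converted_record_path = os.path.join('./records', new_filename)
        subprocess.run(
            ['ffmpeg', '-y', '-i', record_path,
             '-ar', '16000', '-ac', '1', converted_record_path],
            check=True)
        return converted_record_path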
Example #23
import json
import pprint
import string
from nltk.tokenize import sent_tokenize
import nltk
from punctuator import Punctuator

punc = Punctuator('model.pcl')
nltk.download('punkt')

file = 'captions.txt'


def parse_line(line):
    parts = line.split(" ")
    if len(parts) > 1:
        id = parts[0]
        val = " ".join(parts[1:])
        return id, val


with open(file, 'r') as fd:
    while True:
        line = fd.readline()
        if not line:
            break

        id, valstr = parse_line(line)
        val = json.loads(valstr)
        text = val.get("text")
        text = " ".join(text)
Example #24
from punctuator import Punctuator

def punctuate_conversation(conversation, loc):
    p = Punctuator(loc)
    punctuated_converse = p.punctuate(conversation)
    return punctuated_converse
Example #25
    def form_valid(self, form):
        self.object = form.save(commit=False)
        video_id = self.object.body.split('?v=')[1].split("&")[0]
        self.object.vid_id = video_id
        print('Does he have it?')
        print(self.object.vid_id)
        if (Post.objects.filter(vid_id=self.object.vid_id).exists()):
            yo = Post.objects.filter(vid_id=self.object.vid_id)[:1]
            print(yo[0].pk)
            print('REDIRECT!')
            return HttpResponseRedirect(
                reverse('article-detail',
                        kwargs={
                            'pk': str(yo[0].pk),
                            "yt": video_id
                        }))
            #return redirect('article-detail', post.pk post.vid_id+str(yo[0].pk)+'/'+str(video_id))

        data = scrape_url('http://youtube.com/watch?v=' + video_id)
        print(data.title)
        print(data.poster)
        srt = YouTubeTranscriptApi.get_transcript(video_id)
        totalduration = 0
        alltext = ""
        for item in srt:
            go = item['start']
            if (go - totalduration > 30):
                alltext = alltext + time.strftime(
                    '%H:%M:%S', time.gmtime(
                        item['start'])) + item['text'] + " "
                totalduration = item['start']
            else:
                alltext = alltext + item['text'] + " "

        r = re.findall(r'\d{2}:\d{2}:\d{2}', alltext)
        for item in r:
            # print(item)
            alltext = alltext.replace(
                item,
                "<br><a class='ytlink' href='#' type='button' onclick='seek(" +
                str(get_sec(item)) + ")'>" + item + "</a> </br>")

        print(alltext)
        file_ = os.path.join(settings.BASE_DIR, 'model.pcl')
        p = Punctuator(file_)
        punctuated = p.punctuate(alltext)

        self.object.title = data.title
        self.object.title_tag = data.poster
        # totaltext = punctuated.split(".")
        # finaltext = ""
        # for item in totaltext:
        # 	finaltext = finaltext + item +"."+"<br><br>"
        iframe = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/PjDw3azfZWI?enablejsapi=1" frameborder="0"></iframe>'''
        embed = '''<iframe id="player" type="text/html" width="560" height="315" src="http://www.youtube.com/embed/''' + video_id + '''?enablejsapi=1" frameborder="0"></iframe>'''
        self.object.body = embed + "<br>" + alltext

        self.object.save()
        return HttpResponseRedirect(
            reverse('article-detail',
                    kwargs={
                        'pk': self.object.id,
                        "yt": video_id
                    }))
Example #26
import os
import json
import argparse
from punctuator import Punctuator

parser = argparse.ArgumentParser()
parser.add_argument('-c', '--captions-path', type=str, required=True,
                    help='path to filtered captions')
parser.add_argument('-p', '--punctuator-model', type=str, required=True,
                    help='path to punctuator .pcl model')
parser.add_argument('-l', '--labelled-data', type=str, required=True,
                    help='path to labelled data json file')
parser.add_argument('-f', '--root-features', type=str, required=True,
                    help='directory with all the video features')
parser.add_argument('-s', '--save-path', type=str, required=True,
                    help='json file to save training data to')
args = parser.parse_args()

captions_path = args.captions_path
save_path = args.save_path

punc = Punctuator(args.punctuator_model)
captions = json.load(open(captions_path, 'r'))
labelled_data = json.load(open(args.labelled_data, 'r'))
vid_ids = os.listdir(args.root_features)

start = 0
if os.path.exists(save_path):
    train_data = json.load(open(save_path))
    print('starting from vid id', len(train_data))
    start = len(train_data)
else:
    train_data = {}


def timestamp_to_idx(time):
    return int(0.5 + time / 1.5)
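timestamp_to_idx appears to map a time in seconds onto features sampled every 1.5 seconds, with the + 0.5 giving round-to-nearest behaviour; a quick check of that reading (the interpretation is inferred from the arithmetic alone).

for t in (0.0, 0.7, 0.8, 1.5, 3.1):
    print(t, timestamp_to_idx(t))  # -> 0, 0, 1, 1, 2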
Example #27
from punctuator import Punctuator
import sys

p = Punctuator('Demo-Europarl-EN.pcl')

textfile = sys.argv[1]

with open(textfile, 'r') as file:
    data = file.read().replace('\n', ' ')
    data = data.lower()

print(p.punctuate(data))
Example #28
from punctuator import Punctuator

p = Punctuator('Demo-Europarl-EN.pcl')
output_file = open('output.txt', 'w')
output_file.write('Demo-Europarl-EN.pcl\n\n')
output_file.write(
    p.punctuate(
        'Uh Now last video we went through the first five steps AGADAP So you just kind of shown here of our uh relative combat power that weve chosen to do a penetration umh how weve laid out our forces arrayed them and uh set our phases and selected our leadership Okay so now we get our battle book page were gonna do our COA statement and sketch and we can drive on with our sketch Alright so heres the page out of the battle book and uh we need to do both our sketch and our statement Now for this Im gonna go ahead and and just start with uh the sketch Now uh we know that were gonna start out in our assembly area Alright so we can go ahead and draw that up here at the top of the page and I wanna alot leave uh quite a bit of room down here to show whats gonna happen for actions on the objective Remember the COA sketch isnt to scale uh Were gonna need uh some kind of minor departure We know thats gonna happen Alright and were gonna leave from the uh assembly area and were gonna move to an ORP right So we gotta put our ORP on here uh and since thats gonna be a movement theres gonna be an axis to get us there Alright and uh thinking through this theres probably some phase line here Alright because were gonna spend phase one in the assembly area phase two moving to the ORP and then phase three is all gonna happen in the ORP so theres probably a phase line here because were gonna act differently at that point'
    ))
#output_file.write(p.punctuate('this is a test sentence for part 1'))

p3 = Punctuator('INTERSPEECH-T-BRNN.pcl')
output_file.write('\n\nINTERSPEECH-T-BRNN.pcl\n\n')
output_file.write(
    p3.punctuate(
        'Uh Now last video we went through the first five steps AGADAP So you just kind of shown here of our uh relative combat power that weve chosen to do a penetration umh how weve laid out our forces arrayed them and uh set our phases and selected our leadership Okay so now we get our battle book page were gonna do our COA statement and sketch and we can drive on with our sketch Alright so heres the page out of the battle book and uh we need to do both our sketch and our statement Now for this Im gonna go ahead and and just start with uh the sketch Now uh we know that were gonna start out in our assembly area Alright so we can go ahead and draw that up here at the top of the page and I wanna alot leave uh quite a bit of room down here to show whats gonna happen for actions on the objective Remember the COA sketch isnt to scale uh Were gonna need uh some kind of minor departure We know thats gonna happen Alright and were gonna leave from the uh assembly area and were gonna move to an ORP right So we gotta put our ORP on here uh and since thats gonna be a movement theres gonna be an axis to get us there Alright and uh thinking through this theres probably some phase line here Alright because were gonna spend phase one in the assembly area phase two moving to the ORP and then phase three is all gonna happen in the ORP so theres probably a phase line here because were gonna act differently at that point'
    ))
#output_file.write(p3.punctuate('this is a test sentence for part 3'))

output_file.close()
Example #29
import os
import sys
import json
from pathlib import Path

import config

# init here is assumed to be colorama's init
from colorama import init

init(autoreset=True)

if config.settings.punct_correction_tool == "fastpunct":
    from fastpunct import FastPunct

    fastpunct = FastPunct("en")
elif config.settings.punct_correction_tool == "punctuator":
    from punctuator import Punctuator

    model_file = os.path.join(str(Path.home()), ".punctuator",
                              "Demo-Europarl-EN.pcl")
    punctuator_runner = Punctuator(model_file)


def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def _transformTime(srt_time, start_utc_time=0):
    """[summary]

    :return: [description]
    :rtype: [type]
    """
    minutes = srt_time.minutes
    seconds = srt_time.seconds + 60 * minutes
    milliseconds = srt_time.milliseconds + seconds * 1000 + start_utc_time
Example #30
def main():
    """Train a model using lines of text contained in a file
    and evaluates the model.
    """

    #read glove vecs
    #words, word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(EMBEDDING_FILE)
    #create word embedding matrix
    #embedding_matrix = create_emb_matrix(word_to_index, word_to_vec_map)
    embedding_matrix = None
    #print('shape of embedding_matrix:', embedding_matrix.shape)

    #load training text from a file
    utterances = load_text_data(TEXT_FILE)
    punctuator = Punctuator(None, None)
    X, Y = punctuator.create_training_data(utterances[:3], False)
    print(X.shape)
    print(X.shape[1])
    print(Y.shape)

    #if a model already exists, load the model
    if os.path.isfile(MODEL_FILE) and False:
        punctuator.load_model(MODEL_FILE)
    else:
        model = BidirectionalGruWithGru.create_model(
            input_shape=(X.shape[1], X.shape[2]),
            embedding_matrix=None,
            vocab_len=0,
            n_d1=128,
            n_d2=128,
            n_c=len(punctuator.labels))
        print(model.summary())
        punctuator.__model__ = model

    #if the model has been already trained, use the pre-trained weights
    if os.path.isfile(WEIGHTS_FILE):
        punctuator.load_weights(WEIGHTS_FILE)

    for i in range(100):
        shuffle(utterances)
        print(utterances[0])

        #create an instance of Punctuator and create training data
        X, Y = punctuator.create_training_data(utterances[:300000], False)

        #shuffle the training data
        shuffle(X, Y)

        denom_Y = Y.swapaxes(0, 1).sum((0, 1))
        print('Summary of Y:', denom_Y)

        print('shape of X:', X.shape)
        print(X[0:10])
        print('shape of Y:', Y.shape)
        print(Y[0:10])

        #define optimizer and compile the model
        opt = Adam(lr=0.007, beta_1=0.9, beta_2=0.999, decay=0.01)
        punctuator.compile(opt,
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])

        #split the training data into training set, test set, and dev set
        t_size = int(X.shape[0] * 0.9)
        train_X, train_Y = X[:t_size], Y[:t_size]
        test_X, test_Y = X[t_size:-DEV_SIZE], Y[t_size:-DEV_SIZE]
        dev_X, dev_Y = X[-DEV_SIZE:], Y[-DEV_SIZE:]

        print(train_Y.swapaxes(0, 1).sum((0, 1)))
        print(test_Y.swapaxes(0, 1).sum((0, 1)))

        #train the model
        punctuator.fit([train_X], train_Y, batch_size=BATCH, epochs=EPOCH)

    punctuator.save_model(MODEL_FILE)
    punctuator.save_weights(WEIGHTS_FILE)

    #evaluate the model on the dev set (or the test set)
    for i, example in enumerate(dev_X):
        prediction = punctuator.predict(example)
        punctuator.check_result(prediction, dev_Y[i])

    #manually evaluate the model on an example
    examples = [
        "good morning chairman who I saw and members of the committee it's my pleasure to be here today I'm Elizabeth Ackles director of the office of rate payer advocates and I appreciate the chance to present on oris key activities from 2017 I have a short presentation and I'm going to move through it really quickly because you've had a long morning already and be happy to answer any questions that you have",
        "this was a measure that first was introduced back in 1979 known as the International bill of rights for women it is the first and only international instrument that comprehensively addresses women's rights within political cultural economic social and family life",
        "I'm Elizabeth Neumann from the San Francisco Department on the status of women Sita is not just about naming equal rights for women and girls it provides a framework to identify and address inequality",
        "we have monitored the demographics of commissioners and board members in San Francisco to assess the equality of political opportunities and after a decade of reports women are now half of appointees but white men are still over-represented and Asian and Latina men and women are underrepresented",
        "when the city and county faced a 300 million dollar budget deficit in 2003 a gender analysis of budget cuts by city departments identified the disproportionate effect on women and particularly women of color in the proposed layoffs and reduction of services"
    ]
    for example in examples:
        words = example.split()
        x = punctuator.create_live_data(words)
        print(x)
        for s in x:
            print(s)
            prediction = punctuator.predict(s)
            result = punctuator.add_punctuation(prediction, words)
            print(result)