Example #1
def main():
    args = parser.parse_args()
    data_json = read_dataset(args.data)
    random.shuffle(data_json)

    training_set_ratio = 0.7
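    # int(x + 0.5) rounds the split point to the nearest integer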
    training_set_size = int(training_set_ratio * len(data_json) + 0.5)

    training_set = data_json[:training_set_size]
    test_set = data_json[training_set_size:]

    processor = TextProcessor()
    classifier = Classifier(processor)
    classifier.train(training_set)

    # round-trip check: a dumped classifier should reload to an identical dump
    print(classifier.dump() == Classifier.load(classifier.dump(),
                                               processor).dump())
Example #2
    def get_article(self, url):

        # initialize the UrlHandler
        urlhandler = UrlHandler()
        # fetch the web page and its encoding
        source_page, encoding = urlhandler.load_page(url)

        # initialize the parser, text processor, extractor, and formatter
        html_parser = Parser(source_page, encoding)
        text_processor = TextProcessor(self.language)
        article_extractor = ArticleExtractor(self.language)
        formatter = Formatter()

        # get the list of elements stripped of tags (raw_cleaned_elements)
        # and the list that keeps the tags (elements_as_string)
        raw_cleaned_elements, elements_as_string = html_parser.get_parsed_nodes()
        # the page title
        title = html_parser.get_title()

        # get the list of lemmatized texts
        stemmed_tag_elements = text_processor.iterate_over_texts(
            raw_cleaned_elements)
        # get the ranked list of elements
        best_nodes = article_extractor.find_best_node(stemmed_tag_elements)

        # for the first element of the ranked list, loop to find the matching
        # element with its tags (elements_as_string) and pass the match to the
        # formatter
        for text, element in zip(raw_cleaned_elements, elements_as_string):
            if best_nodes[0][0] == text:
                node_to_format = element

        # the formatter prepares the text for saving
        clean_text = formatter.format_article(node_to_format)

        # save to a text file
        with codecs.open('output.txt', 'w', 'utf-8') as out:
            out.write(title + '\n\n')
            for paragraph in clean_text:
                for line in paragraph:
                    out.write(line)
                    out.write('\n')
                out.write('\n')
Example #3
    def ImportLocalizedFiles(self):
        """If new localized files are available, import them.

    This method sifts through the available localized files, and imports each
    to the correct location under CONTENT_ROOT.
    """
        for locale in self.new_localizations:
            if os.path.isfile(self.GetLocalizedFilePath(locale)):
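                # create the destination directory, tolerating one that already
                # exists (note: other OSErrors are silently swallowed here too)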
                try:
                    os.mkdir(os.path.dirname(self.GetOriginalFilePath(locale)))
                except OSError:
                    pass
                in_path = self.GetLocalizedFilePath(locale)
                out_path = self.GetOriginalFilePath(locale)
                with codecs.open(in_path, 'r', 'UTF-8') as infile:
                    with codecs.open(out_path, 'w', 'UTF-8') as outfile:
                        temp = TextProcessor(html=infile.read())
                        outfile.write(temp.django)
        self.__available_localizations = []
        self.__locales = []
Example #4
class TestIsWord:

    tp = TextProcessor()

    def test_one_word_latin(self):
        assert self.tp.is_word('Test')

    def test_one_word_cyr(self):
        assert self.tp.is_word('Тест')

    def test_two_words(self):
        assert not self.tp.is_word('Test test')

    def test_word_with_num(self):
        assert self.tp.is_word('Test1')

    def test_word_with_underscore(self):
        assert self.tp.is_word('Test_test')

    def test_words_divided_by_symbol(self):
        assert not self.tp.is_word('Test*test')
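Taken together, these tests pin down a simple contract: a token is a word
exactly when it is one unbroken run of letters, digits, or underscores, Latin
or Cyrillic alike. A minimal sketch of an is_word that satisfies these
assertions (a hypothetical implementation, not necessarily the project's):

import re

class TextProcessor:
    # in Python 3, \w matches letters (including Cyrillic), digits, and '_'
    _WORD_RE = re.compile(r'\w+')

    def is_word(self, token):
        # the whole token must be a single \w+ run; spaces and symbols break it
        return bool(self._WORD_RE.fullmatch(token))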
Example #5
def main():
    args = parser.parse_args()
    full_data_json = read_dataset(args.data)

    # for n in range(30, len(full_data_json), 30):
    for n in [len(full_data_json)]:

        corrects = 0
        total = 0

        for _ in range(SAMPLES):

            random.shuffle(full_data_json)
            data_json = full_data_json[:n]

            training_set_ratio = 0.7
            training_set_size = int(training_set_ratio * len(data_json) + 0.5)

            training_set = data_json[:training_set_size]
            test_set = data_json[training_set_size:]

            processor = TextProcessor()
            classifier = Classifier(processor)
            classifier.train(training_set)

            for example in test_set:
                text = example['content']
                predicted_tag = classifier.classify(text)
                expected_tag = classifier.normalize_tag_label(example['tag'])
                if expected_tag in Classifier.IGNORE_TAGS:
                    continue
                if predicted_tag == expected_tag:
                    corrects += 1
                else:
                    # print('expected = {}, predicted = {}'.format(expected_tag, predicted_tag))
                    pass
                total += 1

    print('{} {}'.format(len(data_json), corrects / total))
Example #6
def main(argv=None):
    """
    Training.
    """

    ### parameters

    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_dir = FLAGS.train_dir  #'D:/Machine_Learning/datasets/YouTubeClips_2/YouTubeClips/'
    train_corpus = FLAGS.train_corpus  #'D:/Machine_Learning/datasets/video_corpus/video_corpus.csv'

    print("train_dir is =", train_dir)
    print("train_corpus =", train_corpus)

    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    ### training data preparation
    train_ds = CustomDataset(train_dir,
                             train_corpus,
                             device,
                             dictionary,
                             VOCAB_SIZE,
                             NUMBER_OF_WORDS,
                             INPUT_SIZE,
                             NUMBER_OF_FRAMES,
                             tsfm,
                             model=md.model_vgg)
    train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE)

    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model_seq_to_seq = Seq2Seq(encoder, decoder).to(device)
    model = model_seq_to_seq

    ### load the state_dict of model if model has been pretrained.
    if FLAGS.load_weights:
        print("there are weights to be loaded")
        model.load_state_dict(torch.load(FLAGS.load_weights))

    ### optimizer and loss function
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    #### Model Training

    import time
    print_freq = 1
    best_loss = np.inf
    for epoch in range(1, EPOCH + 1):
        model.train()
        epoch_loss = 0

        for step, (img, label) in enumerate(train_dl):

            time_1 = time.time()  ## timing

            X_1, X_2 = img  ### get inputs

            X_1 = X_1.to(device)  # Set device
            X_2 = X_2.to(device)  # Set device

            label = label.to(device)  # Set output device

            ### zero the parameter gradients
            optimizer.zero_grad()

            ### forward
            prediction = model(X_1, X_2)

            ### Optimize
            prediction = prediction.to(device)
            prediction = torch.squeeze(prediction, 0)
            label = torch.squeeze(label, 0)

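            # CrossEntropyLoss expects integer class indices, so convert each
            # one-hot label row to its argmax index first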
            new_label = torch.zeros([label.shape[0]])
            for l in range(label.shape[0]):
                new_label[l] = np.argmax(label[l].cpu())
            new_label = new_label.to(device)
            loss = criterion(prediction, new_label.long())

            # Backward prop.
            loss.backward()
            optimizer.step()

            ### print out statistics
            epoch_loss += loss.item()
            if step % print_freq == 0:
                print('epoch:', epoch, '\tstep:', step + 1, '/',
                      len(train_dl), '\ttrain loss:',
                      '{:.4f}'.format(loss.item()), '\ttime:',
                      '{:.4f}'.format((time.time() - time_1) * print_freq),
                      's')

        ### save best model
        if epoch_loss < best_loss:
            best_loss = epoch_loss

            model_name = ('MODEL_SEQ2SEQ'
                          f'VOCAB_SIZE_{VOCAB_SIZE}'
                          f'NUMBER_OF_WORDS_{NUMBER_OF_WORDS}'
                          f'HIDDEN_SIZE_{HIDDEN_SIZE}'
                          f'INPUT_SIZE_{INPUT_SIZE}'
                          f'NUMBER_OF_LAYERS_{NUMBER_OF_LAYERS}')
            torch.save(model.state_dict(), model_name + '.pth')

        print("The loss for this epoch is = :", epoch_loss / len(train_dl))
Example #7
def main(argv=None):
    """
    Training.
    """

    ### parameters

    LEARNING_RATE = FLAGS.LEARNING_RATE
    NUMBER_OF_FRAMES = FLAGS.NUMBER_OF_FRAMES
    BATCH_SIZE = FLAGS.BATCH_SIZE
    EPOCH = FLAGS.EPOCH
    TRAINING_DEVICE = FLAGS.TRAINING_DEVICE
    VOCAB_SIZE = FLAGS.VOCAB_SIZE
    NUMBER_OF_WORDS = FLAGS.NUMBER_OF_WORDS
    HIDDEN_SIZE = FLAGS.HIDDEN_SIZE
    INPUT_SIZE = FLAGS.INPUT_SIZE
    NUMBER_OF_LAYERS = FLAGS.NUMBER_OF_LAYERS
    tsfm = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    train_corpus = FLAGS.train_corpus
    utils = Utils()
    all_text = utils.output_text(train_corpus)
    text_processor = TextProcessor(freq_threshold=10)
    dictionary = text_processor.vocab_creator(all_text)

    ### Model definition
    encoder = Encoder_LSTM(input_size=INPUT_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS)
    decoder = Decoder_LSTM(input_size=VOCAB_SIZE,
                           hidden_size=HIDDEN_SIZE,
                           num_layers=NUMBER_OF_LAYERS,
                           number_of_words=NUMBER_OF_WORDS)
    model_seq_to_seq = Seq2Seq(encoder, decoder).to(device)
    model = model_seq_to_seq

    ### load the pretrained weights
    model.load_state_dict(torch.load(FLAGS.load_weights))

    #### Model Testing
    model.eval()
    from random import randint
    import matplotlib.pyplot as plt

    utils = Utils()

    video_path = FLAGS.video_file

    video_pre_data = utils.video_to_frames(video_path,
                                           frame_number=NUMBER_OF_FRAMES,
                                           device='cuda',
                                           INPUT_SIZE=INPUT_SIZE,
                                           model=md.model_vgg,
                                           transform=tsfm)

    X_2 = torch.zeros([NUMBER_OF_WORDS, VOCAB_SIZE])
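    # seed the decoder input one-hot matrix; by assumption index 2 is the
    # vocabulary's start-of-sequence token and index 1 its padding token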

    for i in range(NUMBER_OF_WORDS):
        if i == 0:
            X_2[i][2] = 1
        else:
            X_2[i][1] = 1

    input_data = video_pre_data.unsqueeze(0)

    final_sentence = []

    X_2 = X_2.unsqueeze(0)
    X_2 = X_2.to(device)
    input_data = input_data.to(device)

    for i in range(NUMBER_OF_WORDS - 1):
        with torch.no_grad():
            predicted = model(input_data, X_2)
            predicted = predicted.squeeze(0)

            final_sentence.append(
                next((key for key, value in dictionary.items()
                      if value == torch.argmax(predicted[i])), None))
            X_2[0][i + 1][torch.argmax(predicted[i])] = 1
            X_2[0][i + 1][1] = 0
    print(final_sentence)
Example #8
    def form_valid(self, form):
        text_processor = TextProcessor()
        form.instance.processed_text = text_processor.process(
            form.cleaned_data['origin_text'])
        return super().form_valid(form)
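This override runs on a valid submission: the raw text from the form is
processed and written onto the model instance before the parent class saves
it. A minimal sketch of a surrounding view, assuming a Django CreateView whose
ModelForm exposes origin_text (the view, model, and URL here are hypothetical):

from django.views.generic.edit import CreateView

class DocumentCreateView(CreateView):  # hypothetical name
    model = Document  # assumed model with origin_text / processed_text fields
    fields = ['origin_text']
    success_url = '/documents/'

    def form_valid(self, form):
        form.instance.processed_text = TextProcessor().process(
            form.cleaned_data['origin_text'])
        return super().form_valid(form)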
Example #9
def main():
    device = torch.device('cuda')

    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )

    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test set
    # TODO: fix this splitting sometimes failing when corpus size changes
    train_set, test_set = torch.utils.data.random_split(
        dataset, [
            int(len(dataset) * DATA_SPLIT),
            int(len(dataset) * (1.0 - DATA_SPLIT))
        ])
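    # note: int() truncates, so the two lengths can sum to less than
    # len(dataset), and random_split raises when the lengths don't add up
    # exactly (hence the TODO above)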

    # count number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1

    # get relative weights for classes
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum

    # reverse the weights since we're getting the inverse for the sampler
    class_count = list(reversed(class_count))
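    # e.g. counts [800, 200] -> frequencies [0.8, 0.2] -> reversed weights
    # [0.2, 0.8], so minority-class samples are drawn more often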

    # set weight for every sample
    weights = [class_count[int(x[1].item())] for x in train_set]

    # weighted sampler
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)

    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional layer
    N_FILTERS = 64

    # kernel sizes of the convolutional layers
    FILTER_SIZES = [2, 3]

    # dropout between the conv and dense layers
    DROPOUT = 0.5

    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)

    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12

    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)

        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word indices vector and corresponding labels
            x, labels = data

            # send to device
            x = x.to(device)
            labels = labels.to(device)

            # make predictions
            predictions = model(x).squeeze()

            # calculate loss
            loss = criterion(predictions, labels)

            # backpropagate and update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate
        with torch.no_grad():
            model.eval()

            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]

            for data in test_loader:
                x, label = data
                x = x.to(device)

                predictions = model(x).squeeze()

                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0

                    m[y][y_pred] += 1

                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1

            model.train()

            acc = correct / (correct + wrong)
            if acc > best_acc:
                best_acc = acc
                for file in glob.glob('models/state_*.pth'):
                    os.remove(file)
                torch.save(model.state_dict(), f'models/state_{epoch}.pth')

            print()
            print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
            print('[[TN, FP], [FN, TP]]')
            print(m)
            print()

    # put into evaluation mode
    model.eval()

    text_processor.do_standardize = True

    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
Example #10
    def get_text(self, row):
        guid = row['GUID']
        text_processor = TextProcessor(guid, self.file_dictionary)
        return text_processor.get_text()
Example #11
def process_file(file_id):
    file = File.objects.get(pk=file_id)
    print(file)
    try:
        origin_path = file.origin_file.path
    except ValueError:
        origin_path = None
    file.input_type = get_input_type(file)
    print(file.input_type)
    file.progress += 10
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()


    document = None
    if file.input_type == File.InputTypes.IMAGE:
        document = Document()
        text = image_to_text(origin_path)
    elif file.input_type == File.InputTypes.TEXTBOX:
        text = file.origin_text
    else:
        document = Document(origin_path, text_params)
        text = document.parse()
    # file.progress += 50
    file.progress += 20
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()

    text_processor = TextProcessor()
    processed_text = text_processor.process(text)
    # file.progress += 30
    file.progress += 20
    file.save()

    sleep(.5)
    file.progress += 10
    file.save()

    if file.input_type == File.InputTypes.TEXTBOX:
        file.processed_text = processed_text

    else:
        if document is None:
            raise ValueError('Error with document')
        output_name = get_output_field(file)
        document.change_text(processed_text)
        document.save(file.processed_file.storage.path(output_name))
        file.processed_file = output_name
    sleep(.5)
    file.progress = 100
    file.save()
    print(file)
Example #12
import os
import sys

currentdir = os.path.dirname(os.path.realpath(__file__))
parentdir = os.path.dirname(currentdir)
sys.path.append(parentdir + '/source')
sys.path.append(parentdir)

from text_processor import TextProcessor

# read the tweet CSV line by line; rows are processed as raw text,
# so no csv parsing is needed
csv_file = open('tweet_data/ShardiB2_2021-02-03 05_29_05.236426_tweets.csv',
                "r")

text_pro = TextProcessor()
x = open(
    "tweet_data/no_links or tickers_ShardiB2_2021-02-03 05_29_05.236426_tweets.csv",
    "w")

# loop through the rows
for row in csv_file:
    replaced_row = text_pro.replace_tickers_with_company_names(row)
    # truncate each row at the first link to strip URLs
    index = replaced_row.find("https")
    if index != -1:
        replaced_row = replaced_row[:index] + ", \n"
    x.writelines(replaced_row)

csv_file.close()
x.close()
Example #13
import nltk
from nltk.stem.lancaster import LancasterStemmer  
from nltk.corpus import stopwords, brown
from text_processor import TextProcessor
from keyword_retrieval import KeywordRetrieval
from datetime import datetime

if __name__ == "__main__":
    text = raw_input("Please input your text (It's better not to input more than 15 words)\n>> ")
    # case based ontology
    print "Case: [[ Ontology-based method]]\nText :%s" % text
    # create text processor to get keywords
    start = datetime.now()
    text_processor = TextProcessor(text = text)
    keywords = text_processor.get_keywords()
    if keywords:
        dos = {}
        print "Extracted ontology keywords:", keywords
        for word in keywords:
            kr = KeywordRetrieval(keyword=word)
            dos[word] = kr.get_result()
    else:
        print "The model does not extract any keywords"
    end = datetime.now()
    time_1 = end - start
    for word in keywords:
        if dos[word]:
            # sorted by name's length(similarity)
            for do in sorted(dos[word], key=lambda do: len(do.name)):
                # print do's information 
                print "_"*100
Example #14
    def assertHtmlDjango(self, html, django):
        from_html = TextProcessor(html=html)
        from_django = TextProcessor(django=django)
        self.assertEqual(html, from_django.html)
        self.assertEqual(django, from_html.django)
Example #15
        current_batch_input.append(batch_input)
        current_batch_output.append(batch_output)

        read_sequences_count += 1
        # We may finish without completing a batch; in that case this branch
        # never runs and the incomplete batch is dropped.
        if read_sequences_count == batch_size:
            result.append([ current_batch_input, current_batch_output ])
            current_batch_input = []
            current_batch_output = []
            read_sequences_count = 0
    
    return np.array(result)

#########################################################################

tf.logging.set_verbosity(tf.logging.ERROR)

text_processor = TextProcessor(data_dir, data_percentage)
text = text_processor.load_and_preprocess_text()

print('Creating computation graph...')
nn = NeuralNetwork()

summary_output_dir = '/output/e{}_b{}_rnn{}_rnnl{}_seq{}_lr{}_dp{}'.format(num_epochs, batch_size, rnn_size, rnn_layer_count, seq_length, learning_rate, data_percentage)
nn.build_model(text_processor.int_to_vocab, rnn_size, rnn_layer_count, summary_output_dir)
print('Computation graph created.')

print('Training...')
batches = get_batches(text_processor.int_text, batch_size, seq_length)
nn.train_model(batches, num_epochs, learning_rate, save_every, save_dir, test_every, prime_word, gen_length, text_processor, seq_length)
Example #16
embeddings = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')

model = TextCNN(
    embeddings=embeddings,
    n_filters=64,
    filter_sizes=[2, 3],
    dropout=0.0,
)

device = torch.device('cpu')
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

text_processing = TextProcessor(
    wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
    tokenizer=get_tokenizer('basic_english'),
    standardize=True,
    min_len=3,
)


@app.post('/game')
async def game(request: Request):
    q = request.form.get('q', None)

    if q is None:
        return HTTPResponse(status=400)

    tokens = text_processing.process(q)
    x = torch.unsqueeze(tokens, dim=0)

    pred = model(x)
Example #17
    np.random.seed(SEED)
    print('W2VEC embedding: %s' % (W2VEC_MODEL_FILE))
    print('Embedding Dimension: %d' % (EMBEDDING_DIM))
    print('Allowing embedding learning: %s' % (str(LEARN_EMBEDDINGS)))

    cf = configparser.ConfigParser()
    cf.read("file_path.properties")
    path = dict(cf.items("file_path"))
    dir_w2v = path['dir_w2v']
    dir_in = path['dir_in']

    word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(
        dir_w2v + W2VEC_MODEL_FILE, binary=False, unicode_errors="ignore")

    tp = TextProcessor()

    texts = list()
    tx_class = list()

    tmp = list()
    with open(POLITICS_FILE) as l_file:
        for line in l_file:
            tmp.append(line)
            tx_class.append('politics')

    texts += tp.text_process(tmp, text_only=True)

    tmp = list()
    with open(NON_POLITICS_FILE) as l_file:
        for line in l_file:
            tmp.append(line)
            tx_class.append('non-politics')  # mirrors the politics block above

    texts += tp.text_process(tmp, text_only=True)
Example #18
    def __worker(self, pipe, l_log):
        """The core of the STT program, this is the multiprocessed part

        Note:
            Multiprocessing will require a pipe between the parent and child subprocess.
            Since this is the case, the worker subprocess cannot access non-shared variables

        """

        l_log.debug("STT worker started")

        audio_processor = AudioProcessor()  # Create a new audio processing object
        # Remember that we can't load the text processor nltk model until the
        # nltk model is set from the client language
        text_processor = TextProcessor()
        # Create a new pocketsphinx decoder with the default configuration,
        # which is English
        config = Decoder.default_config()
        decoder = None
        nltk_model = None
        mutex_flags = {"keyphrases": {"use": False}}
        shutdown_flags = {"shutdown": False, "decoder": None}

        def send_json(pipe, to_send):
            """Internal worker method to send a json through the parent socket

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                to_send (:obj: dict): A dictionary to be sent to the parent socket

            """
            try:
                ret = self.__send_buffered(
                    pipe, to_send
                )  # Send the message passed by argument back to the parent process
                if not ret[0]:
                    l_log.error(
                        "Failed to send buffered message to the parent process! (err: %s)"
                        % ret[1])
            except Exception as err:
                l_log.error("Failed to send json! (err: %s)" % str(err))

        def send_error(pipe, error):
            """Internal worker method to send a json error through the parent socket

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                error (str): The string error message to send

            """
            send_json(pipe, {"error": error})

        def load_models(pipe, config, models):
            """Internal worker method to load the language model

            Note:
                Some languages take a long time to load. English is by far
                the fastest language to be loaded as a model.
            
            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                models (dict): The language and nltk models developed by the parent process
           
            Returns: (Decoder)
                The STT decoder object and the nltk model

            """

            language_model = models["language_model"]
            nltk_model = models["nltk_model"]

            if not (language_model.is_valid_model()
                    and nltk_model.is_valid_model()):
                l_log.error("The language model %s is invalid!" %
                            str(language_model.name))
                send_error(pipe, "Failed loading language model!")
                return

            # Load the model configurations into pocketsphinx
            config.set_string('-hmm', str(language_model.hmm))
            config.set_string('-lm', str(language_model.lm))
            config.set_string('-dict', str(language_model.dict))
            decoder = Decoder(config)

            send_json(
                pipe,
                {"success": True})  # Send a success message to the client

            l_log.debug("Set the language model to %s" %
                        str(language_model.name))

            return decoder, nltk_model  # Return the new decoder and nltk model

        def process_text(pipe, text, is_final, args):
            """Internal worker method to process the Speech To Text phrase

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                text (str): The spoken text to further process
                is_final (bool): Whether the text being processed is the final text; otherwise it's a partial result
                args (dict): Any other flags specifically required for a final or partial speech result
            """

            generate_keyphrases = mutex_flags["keyphrases"]["use"]
            keyphrases = []

            if generate_keyphrases:
                text_processor.generate_keyphrases(
                    text)  # Generate keyphrases from the given text
                keyphrases_list = text_processor.get_keyphrases()

                for keyphrase in keyphrases_list:
                    to_append_keyphrase = {
                        "score": keyphrase[0],
                        "keyphrase": keyphrase[1]
                    }
                    keyphrases.append(to_append_keyphrase)
            else:
                keyphrases = text  # Don't do any processing and just pass the text into the keyphrases

            # Generate the json to be sent back to the client
            hypothesis_results = args
            hypothesis_results["keyphrases"] = generate_keyphrases
            if is_final:
                hypothesis_results["hypothesis"] = keyphrases
            else:
                hypothesis_results["partial_hypothesis"] = keyphrases

            print(hypothesis_results)

            # Send the results back to the client
            send_json(pipe, hypothesis_results)

        def start_audio(pipe, decoder, args):
            """Internal worker method to start the audio processing chunk sequence

            Note:
                This must be called before the process_audio method or the STT engine will not process the audio chunks

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                decoder (Decoder): The pocketsphinx decoder to control the STT engine
                args (dict): All of the available arguments passed by the parent process

            """

            if decoder is None:
                l_log.error("Language model is not loaded")
                send_error(pipe, "Language model not loaded!")
                send_json(pipe, {"decoder": False})
                return

            l_log.debug("Starting the audio processing...")

            decoder.start_utt()  # Start the pocketsphinx listener

            # Tell the client that the decoder has successfully been loaded
            send_json(pipe, {"decoder": True})

        def process_audio(pipe, decoder, args):
            """Internal worker method to process an audio chunk

            Note:
                The audio chunk is expected to be in base64 format

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                decoder (Decoder): The pocketsphinx decoder to control the STT engine
                args (dict): All of the available arguments passed by the parent process

            """
            if decoder is None:
                l_log.error("Language model is not loaded")
                send_error(pipe, "Language model not loaded!")
                return

            l_log.debug("Processing audio chunk!")

            audio_chunk = args["audio"]  # Retrieve the audio data
            processed_wav = audio_processor.process_chunk(
                audio_chunk)  # Process the base64 wrapped audio data

            l_log.debug("Recognizing speech...")

            decoder.process_raw(
                processed_wav, False,
                False)  # Process the audio chunk through the STT engine

            hypothesis = decoder.hyp()  # Get pocketsphinx's hypothesis

            # Send back the results of the decoding
            if hypothesis is None:
                l_log.debug("Silence detected")
                send_json(pipe, {
                    "partial_silence": True,
                    "partial_hypothesis": None
                })
            else:
                hypothesis_results = {
                    "partial_silence": len(hypothesis.hypstr) == 0,
                }

                l_log.debug("Partial speech detected: %s" %
                            str(hypothesis.hypstr))
                process_text(pipe, hypothesis.hypstr, False,
                             hypothesis_results)

            l_log.debug("Done decoding speech from audio chunk!")

        def stop_audio(pipe, decoder, args):
            """Internal worker method to stop the audio processing chunk sequence

            Note:
                This must be called after the process_audio method or the STT engine will continue to listen for audio chunks

            Arguments:
                pipe (:obj: socket): The response pipe to send to the parent process
                decoder (Decoder): The pocketsphinx decoder to control the STT engine
                args (dict): All of the available arguments passed by the parent process

            """

            if decoder is None:
                l_log.error("Language model is not loaded")
                send_error(pipe, "Language model not loaded!")
                send_json({"decoder": False})
                return

            l_log.debug("Stopping the audio processing...")

            decoder.end_utt()  # Stop the pocketsphinx listener

            l_log.debug("Done recognizing speech!")

            hypothesis = decoder.hyp()  # Get pocketsphinx's hypothesis
            logmath = decoder.get_logmath()

            # Send back the results of the decoding
            if hypothesis is None:
                l_log.debug("Silence detected")
                send_json(pipe, {"silence": True, "hypothesis": None})
            else:
                hypothesis_results = {
                    "silence": len(hypothesis.hypstr) == 0,
                    "score": hypothesis.best_score,
                    "confidence": logmath.exp(hypothesis.prob)
                }

                l_log.debug("Speech detected: %s" % str(hypothesis.hypstr))
                process_text(pipe, hypothesis.hypstr, True, hypothesis_results)

        def shutdown_thread(self, l_log):
            """Worker method to handle the checking of a shutdown call

            Note:
                To reduce overhead, this thread will only be called every 100 milliseconds

            """
            while not shutdown_flags["shutdown"]:
                try:
                    if self._shutdown_event.is_set():
                        l_log.debug("Shutting down worker thread!")
                        shutdown_flags["shutdown"] = True  # Exit the main loop
                        if shutdown_flags["decoder"] is not None:
                            try:
                                shutdown_flags["decoder"].end_utt()
                            except Exception as err:
                                l_log.debug(
                                    "STT decoder object returned a non-zero status"
                                )
                        else:
                            l_log.warning(
                                "The decoder object is already None!")

                        break
                    sleep(0.1)
                except Exception as err:
                    l_log.error(
                        "Failed shutting down worker thread! (err: %s)" %
                        str(err))

        shutdown_t = Thread(target=shutdown_thread, args=(
            self,
            l_log,
        ))
        shutdown_t.daemon = True
        shutdown_t.start()

        p_out, p_in = pipe
        while not shutdown_flags["shutdown"]:
            try:
                try:
                    command = self.__get_buffered(
                        p_out)  # Wait for a command from the parent process
                    if "set_models" in command[
                            "exec"]:  # Check to see if our command is to
                        decoder, nltk_model = load_models(
                            p_out, config, command["args"])
                        text_processor.set_nltk_model(
                            nltk_model)  # Set the text processor nltk model
                        shutdown_flags["decoder"] = decoder
                    elif "start_audio" in command["exec"]:
                        start_audio(p_out, decoder, command["args"])
                    elif "process_audio" in command["exec"]:
                        process_audio(p_out, decoder, command["args"])
                    elif "stop_audio" in command["exec"]:
                        stop_audio(p_out, decoder, command["args"])
                    elif "set_keyphrases" in command["exec"]:
                        mutex_flags["keyphrases"] = command["args"]
                    else:
                        l_log.error("Invalid command %s" % str(command))
                        send_error(p_out, "Invalid command!")
                except (EOFError, IOError) as err:
                    continue
            except Exception as err:
                l_log.error(
                    "Failed recieving command from subprocess (id: %d) (err: %s)"
                    % (current_process().pid, str(err)))