class AbstractFrogChunker(ChunkParserI):
    """Base class for NLTK-style chunkers backed by the Frog Dutch NLP tool.

    Subclasses implement :meth:`parse`; the shared machinery here converts
    pre-tagged NLTK tokens into a FoLiA document, runs Frog over it, and
    rebuilds an NLTK tree from Frog's token-level output.
    """

    # Shared Frog instance; set per-instance in __init__.
    _frog = None

    def __init__(self, **kwargs):
        # parser/mwu/tok are disabled: the chunker only needs chunk labels,
        # and input arrives already tokenized as FoLiA XML (xmlIn=True).
        self._frog = Frog(FrogOptions(parser=False, mwu=False, tok=False, xmlIn=True, **kwargs))

    def __get_folia_doc__(self, tokens):
        """Build a single-sentence FoLiA document from (token, pos) pairs.

        Each POS tag is attached as a PosAnnotation in a 'custom' set so
        Frog can consume the pre-tagged input.
        """
        doc = folia.Document(id='nltk-sentence')
        folia_sent = doc.add(folia.Text)
        for tok, pos in tokens:
            word = folia_sent.add(folia.Word, tok)
            word.add(folia.PosAnnotation(None, set='custom', cls=pos))
        return doc

    def __create_tree__(self, tokens, key):
        """Run Frog on `tokens` and return a chunk tree built from field `key`.

        `key` names the Frog output field holding the IOB chunk tag
        (e.g. 'chunker' or 'ner' — depends on the subclass; not visible here).
        """
        _input = self.__get_folia_doc__(tokens)
        __output = self._frog.process(_input)
        for token in __output:
            # Strip CGN feature parentheses, e.g. 'N(soort,ev)' -> 'N'.
            token['pos'] = token['pos'].split('(')[0]
            # Map Frog's SPEC (special/proper-name-like) tags onto NNP so
            # conlltags2tree sees a conventional proper-noun tag.
            if token['pos'].startswith('SPEC'):
                token['pos'] = 'NNP'
        return conlltags2tree([(token['text'], token['pos'], token[key]) for token in __output])

    def parse(self, tokens):
        # Abstract: concrete chunkers must override.
        raise NotImplementedError()
def morph_counts_old_version(self, words):
    """Turn a word list into a Counter over all morphemes.

    Each word is sent to Frog individually (slow — one process() call per
    word); words that Frog does not split into multiple morphemes are
    excluded from the count.
    """
    print("len words: ")
    print(len(words))
    print("len unique words: ")
    print(len(set(words)))

    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    collected = []
    started = time.time()
    total = len(words)
    for position, word in enumerate(words, start=1):
        result = frog.process(word)
        raw_morph = result[0].get("morph")
        # Frog renders morphemes as "[a][b][c]"; strip '[' and split on ']'.
        pieces = raw_morph.replace('[', '').split(']')
        # Skip words that were not decomposed (no whole words in the count).
        if len(pieces) > 2:
            collected += pieces
        print(str(position) + " of " + str(total))

    print("Frog Processing Time:")
    print(self.format_time(time.time() - started))

    # Splitting on ']' leaves empty strings; drop them before counting.
    collected = list(filter(None, collected))
    return Counter(collected)
def morph_counts_new_version(self, words):
    """Turn a word list into a Counter over all morphemes.

    Unlike the old version, all words are joined into one string and sent
    to Frog in a single process() call. Words Frog does not decompose into
    multiple morphemes are excluded from the count.
    """
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    words_string = ' '.join(words)
    morphisms = []
    print_counter = 1
    t0 = time.time()
    print("Starting Frog Processing..")
    output = frog.process(words_string)
    print("Process time:")
    process_time = self.format_time(time.time() - t0)
    print(process_time)

    t1 = time.time()
    total_length = len(words)  # hoisted: loop-invariant
    # BUG FIX: was range(0, len(words) - 1), which silently skipped the
    # last word's morphemes.
    # NOTE(review): assumes Frog emits exactly one output token per input
    # word — TODO confirm for inputs Frog might re-tokenize.
    for i in range(total_length):
        morphisms_word = output[i].get("morph")
        # Frog renders morphemes as "[a][b][c]"; strip '[' and split on ']'.
        morphisms_word_list = morphisms_word.replace('[', '').split(']')
        # Currently whole (undecomposed) words are NOT counted.
        if len(morphisms_word_list) > 2:
            morphisms += morphisms_word_list
        print(str(print_counter) + " of " + str(total_length))
        print_counter += 1

    print("Process Time:")
    print(process_time)
    print("Getting Morphisms Time:")
    print(self.format_time(time.time() - t1))
    print("Total Time:")
    print(self.format_time(time.time() - t0))

    # Splitting on ']' leaves empty strings; drop them before counting.
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
class FrogTagger(TaggerI):
    """NLTK-style POS tagger backed by Frog.

    After :meth:`tag` has run, the auxiliary getters expose tag
    probabilities, lemmas, and morphological analyses for the last
    processed input.
    """

    def __init__(self, **kwargs):
        # Disable multiword recognition, which is performed by the chunker
        options = FrogOptions(parser=False, mwu=False, xmlIn=True, **kwargs)
        self.__frog = Frog(options)
        # BUG FIX: was never initialized, so the `is None` guards in the
        # getters could not fire — calling a getter before tag() raised
        # AttributeError instead of returning [].
        self.__output = None

    def tag(self, sentences):
        """Tag a list of word strings (or a ready FoLiA document).

        Returns (text, coarse_pos) pairs; the CGN feature parentheses are
        stripped, e.g. 'N(soort,ev)' -> 'N'.
        """
        if isinstance(sentences, list):
            # Wrap plain tokens into a one-sentence FoLiA document for
            # Frog's xmlIn mode.
            doc = folia.Document(id='nltk-sentence')
            folia_sent = doc.add(folia.Text)
            for sent in sentences:
                folia_sent.add(folia.Word, sent)
            _input = doc
        else:
            _input = sentences
        self.__output = self.__frog.process(_input)
        return [(token['text'], token['pos'].split('(')[0])
                for token in self.__output]

    def get_tag_probabilities(self):
        """(text, posprob) pairs for the last tagged input, or []."""
        if self.__output is None:
            return []
        return [(token['text'], token['posprob']) for token in self.__output]

    def get_lemmas(self):
        """(text, lemma) pairs for the last tagged input, or []."""
        if self.__output is None:
            return []
        return [(token['text'], token['lemma']) for token in self.__output]

    def get_morph(self):
        """(text, morph) pairs for the last tagged input, or []."""
        if self.__output is None:
            return []
        return [(token['text'], token['morph']) for token in self.__output]
def prep_nl(df, filename):
    """Tokenize, POS-tag, and lemmatize Dutch answers in `df` with Frog.

    For every answer column the cleaned tokens, lemmas, and POS tags are
    written into matching 'Tokenized*', 'Lemmatized*', and 'POS*' columns.
    Returns the mutated dataframe.
    """
    from frog import Frog, FrogOptions

    print("Tokenizing, POS tagging, and lemmatizing the Dutch data...")
    # Create 'frog' instance. Turn off various options to save time.
    frog = Frog(
        FrogOptions(parser=False, morph=False, chunking=False, ner=False))

    # Define set of possible answers (STAT_C files carry three answer columns).
    if "STAT_C" not in str(filename):
        answers = ['Answer']
    else:
        answers = ['Answer4a', 'Answer2aDec', 'Answer2aCaus']

    # Loop through answers
    for question_type in answers:
        for index in df.index:
            ans = df.loc[index, question_type]

            # Logging
            if index % 20 == 0:
                print(index, "/", df.index[-1], question_type[6:])

            # Remove numbers (raw strings: "\d"/"\w" are deprecated escapes)
            ans = re.sub(r"\d+", "", ans)

            # Remove tags in spelling-corrected data
            ans = ans.replace("_abbreviation", "")

            # Remove non-Dutch and illegible words (single pass over the
            # four disjoint word-tag patterns is equivalent to the
            # original sequential substitutions)
            ans = re.sub(r"\w+_(?:nonexistent|nonexisting|english|german)", "", ans)
            ans = re.sub(r"\?+_illegible", "", ans)

            # Preprocess the data with Frog
            ans_dict = frog.process(ans)

            tok_answer = []
            lem_answer = []
            pos_tags = []

            # Append outcomes to list, excluding punctuation ("LET()")
            for token in ans_dict:
                if token['pos'] != "LET()":
                    tok_answer.append(token['text'].lower())
                    lem_answer.append(token['lemma'])
                    pos_tags.append(token['pos'])

            # Fill in the dataframe
            df.at[index, 'Tokenized{}'.format(question_type[6:])] = tok_answer
            df.at[index, 'Lemmatized{}'.format(question_type[6:])] = lem_answer
            df.at[index, 'POS{}'.format(question_type[6:])] = pos_tags

    return df
def morph_counts_faster_version(self, words):
    """Turn a word list into a Counter over all morphemes, batching input.

    Words are joined and sent to Frog in batches of `batch_size` to bound
    per-call overhead. Words Frog does not decompose into multiple
    morphemes are excluded from the count.
    """
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0

    for i in range(0, len(words), batch_size):
        t0 = time.time()
        words_batch = words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0

        t1 = time.time()
        # BUG FIX: was range(0, len(words_batch) - 1), which silently
        # dropped the last word of every batch.
        # NOTE(review): assumes Frog emits one output token per input
        # word — TODO confirm for inputs Frog might re-tokenize.
        for j in range(len(words_batch)):
            morphisms_word = output[j].get("morph")
            # Frog renders morphemes as "[a][b][c]".
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            # Currently whole (undecomposed) words are NOT counted.
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1
        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time

    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")

    # Splitting on ']' leaves empty strings; drop them before counting.
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
def change_text_to_morphs(sentences, frog_merge=False, save=False, filename=None):
    """Convert each sentence to its space-joined Frog morpheme sequence.

    With frog_merge=True, "insertmergetoken" markers are interspersed
    between the morphemes of each word. When save is True the result list
    is pickled to `filename`. Returns the list of morpheme sentences.
    """
    morphSentences = []
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    # Progress is printed for every sentence (original behavior).
    for sentenceNumber, sentenceToBeProcessed in enumerate(sentences):
        print(sentenceNumber)
        print("of")
        print(len(sentences))
        sentenceToBeProcessed = sentenceToBeProcessed.replace("\n", " ")
        morphSentence = []
        output = frog.process(sentenceToBeProcessed)
        for token in output:
            morphisms_word = token.get("morph")
            # Frog renders morphemes as "[a][b][c]".
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            if frog_merge:
                morphisms_word_list = list(filter(None, morphisms_word_list))
                morphisms_word_list = intersperse(morphisms_word_list, "insertmergetoken")
            morphSentence += morphisms_word_list
        # Remove the empty strings left by splitting on ']'.
        morphSentence = list(filter(None, morphSentence))
        morphSentence = ' '.join(morphSentence)
        morphSentences.append(morphSentence)

    if save is True:
        # NOTE(review): filename=None with save=True raises TypeError here,
        # as in the original.
        with open(filename, 'wb') as outputfile:
            pickle.dump(morphSentences, outputfile)

    return morphSentences
def change_text_to_morphs(sentences, frog_merge=False, save=False, filename=None):
    """Convert each sentence to its space-joined Frog morpheme sequence.

    With frog_merge=True, "__add_merge__" markers are interspersed between
    the morphemes of each word. When save is True the result list is
    pickled to `filename`. Returns the list of morpheme sentences.
    """
    converted = []
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    total = len(sentences)
    for idx, raw_sentence in enumerate(sentences):
        # Print progress once every 1000 sentences.
        if idx % 1000 == 0:
            print(idx + 1)
            print("of")
            print(total)

        tokens = frog.process(raw_sentence.rstrip('\n'))

        pieces = []
        for token in tokens:
            # Frog renders morphemes as "[a][b][c]"; strip '[' and split on ']'.
            word_morphs = token.get("morph").replace('[', '').split(']')
            if frog_merge:
                word_morphs = list(filter(None, word_morphs))
                word_morphs = intersperse(word_morphs, "__add_merge__")
            pieces += word_morphs

        # Drop the empty strings left by splitting on ']', then join.
        converted.append(' '.join(filter(None, pieces)))

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(converted, outputfile)

    return converted
line_txt = re.sub(r"^'s ", "''s ", line_txt) # if re.search(r'"".+""', line_txt): # print(re.search(r'"".+""', line_txt).group()) txt_dict[pair][part][spkr] += line_txt + " " frog = Frog(FrogOptions(mwu=False, ner=False)) for pair in txt_dict: for part in txt_dict[pair]: with open("{}pos/{}/{}_{}.pos".format(ecsd_path, pair, pair, part), "w", encoding="utf-8") as g: for spkr in txt_dict[pair][part]: print(pair, part, spkr) text = txt_dict[pair][part][spkr] word_list = frog.process(text) s_counter = 0 w_counter = 0 for word in word_list: if word["index"] == "1": s_counter += 1 w_counter = 0 g.write( "< file id: {}_{} speaker id: {} sentence: {} >\n". format(pair, part, spkr, s_counter)) if "LET" in word["pos"] and word["text"] != "&": continue else: g.write("\t".join([ word["text"], word["pos"], word["lemma"], str(word["posprob"])
#!/usr/bin/env python3
"""Smoke-test script for the Frog python binding and its FoLiA output."""
from __future__ import print_function, unicode_literals

from frog import Frog, FrogOptions
import folia.main as folia

frog = Frog(FrogOptions(parser=True))
output = frog.process_raw("Dit is een test")
print("RAW OUTPUT=", output)
output = frog.process("Dit is nog een test.")
print("PARSED OUTPUT=", output)

frog = Frog(FrogOptions(parser=True, xmlout=True))
output = frog.process("Dit is een FoLiA test.")
assert isinstance(output, folia.Document)
# BUG FIX: the original called isinstance() with a non-type second argument
# (an int / a string), which raises TypeError — these were meant as
# equality checks.
assert len(output.data) == 1
# BUG FIX: output.data is a list and has no .select(); select on the first
# Text element instead. NOTE(review): assumes data[0] is the folia.Text
# root — confirm against the folia library.
assert next(output.data[0].select(folia.Sentence)).text() == "Dit is een FoLiA test."
# output is now no longer a string but an instance of folia.Document,
# provided by the FoLiA library

print("FOLIA OUTPUT=")
print(output.xmlstring())

print("Inspecting FoLiA output (example):")
for word in output.words():
    print(word.text() + " " + word.pos() + " " + word.lemma())
# NOTE(review): "Dit is een FoLiA test ." looks like 6 tokens, not 5 —
# confirm whether Frog merges any of them before trusting this assert.
assert len(output.words()) == 5
"""Ad-hoc demo: run Frog on a fixed Dutch sentence and dump the output."""
import pickle
import re

Processed_Sentence = " lezen optimaal liep europese unie gekregen spellen rugzak super allesinds boomhut ontwikkelende gemeenschappen vermeenigvuldigde getallen Vereenvoudigd. ....... is werken lopen een kleine test gewoon om te zien of het wel werkt."

frog = Frog(
    FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                mwu=False, chunking=False, ner=False, parser=False))

output = frog.process(Processed_Sentence)

print("")
print("RAW OUTPUT")
print(output)
print("length")
print(len(output))
# Show the first four token dicts individually.
for token in output[:4]:
    print(token)
print("")

first = output[0]
print("index: " + str(first.get("index")))
print("text: " + str(first.get("text")))
print("lemma: " + str(first.get("lemma")))
trials_dict[t]["s"] = s_plural with open(f_path + "SonarVar_test.csv", "w") as g: g.write("s_plural,item,prosodic_break,next_stress,next_sound,next_vowel_length,ort\n") with open(f_path + "sonar_plurals2.csv", "r") as f: for line_num, i_line in enumerate(f, 1): print("Processing line " + str(line_num)) i_list = i_line[:-1].split("\t") sentence = i_list[1].strip('" ') foll_context = i_list[2].strip('" ') for t in trials: grep_text = "(^{}|[ ']{})({}|{})(?=[ .,?!':;])".format(t[0].upper(), t[0], trials_dict[t]["en"][1:], trials_dict[t]["s"][1:]) if re.search(grep_text, sentence): line = re.sub(" & ", " en ", sentence) frog_output = frog.process(line) frog_output_clean = [i for i in frog_output if i["pos"] != "LET()"] w_indices = [] for num, b in enumerate(frog_output, 0): if b["text"].lower() in [trials_dict[t]["en"], trials_dict[t]["s"]] and b["pos"] == "N(soort,mv,basis)": w_indices.append(num) for w_index in w_indices: s_plural = "1" if frog_output[w_index]["text"][-1] == "s" else "0" clean_index = [w["index"] for w in frog_output_clean].index(frog_output[w_index]["index"]) if clean_index + 1 < len(frog_output_clean): next_word = frog_output_clean[clean_index + 1]["text"].lower() else: context_frog_output = frog.process(re.sub(" & ", " en ", foll_context)) context_frog_output_clean = [i for i in context_frog_output if i["pos"] != "LET()"] if len(context_frog_output_clean) > 0: next_word = context_frog_output_clean[0]["text"].lower()
def morph_counts_fastest_version(self, words):
    """Turn a word list into a Counter over all morphemes.

    Only unique words are sent to Frog (in batches); each word's morphemes
    are then weighted by that word's frequency in the original list. Words
    Frog does not decompose into multiple morphemes are excluded.
    """
    word_counts = Counter(
        word for word in toolz.concat(map(self.word_tokenizer, words)))
    print("words_counts: ")
    print(word_counts)
    print("")
    print("Unique number words: " + str(len(set(words))))
    print("Total number of words: " + str(len(words)))
    print("")

    unique_words = list(set(words))
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))
    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(unique_words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0

    for i in range(0, len(unique_words), batch_size):
        t0 = time.time()
        words_batch = unique_words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0

        t1 = time.time()
        # BUG FIX: was range(0, len(words_batch) - 1), which silently
        # dropped the last word of every batch.
        # NOTE(review): assumes Frog emits one output token per input
        # word — TODO confirm for inputs Frog might re-tokenize.
        for j in range(len(words_batch)):
            current_word = output[j].get("text")
            morphisms_word = output[j].get("morph")
            # Frog renders morphemes as "[a][b][c]".
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            current_word_count = word_counts[current_word]
            # Currently whole (undecomposed) words are NOT counted.
            if len(morphisms_word_list) > 2:
                # Weight the unique word's morphemes by its frequency.
                morphisms += morphisms_word_list * current_word_count
        print("batch" + " (batch_size: " + str(batch_size) + " words): " +
              str(print_batch_number) + " of " + str(total_batch_number))
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1
        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time

    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")

    # Remove the empty strings left by splitting on ']'.
    morphisms = list(filter(None, morphisms))
    # Make a counter of all morphisms
    morph_counts = Counter(morphisms)
    return morph_counts
def _cgn_pos_to_tags(pos):
    """Render a CGN POS string as bracket tags.

    E.g. 'N(soort,ev)' -> '<N><soort><ev>'; a featureless tag like
    'LET()' collapses to '<LET>'.
    """
    tagged = "<" + pos
    tagged = tagged.replace("(", "><")
    tagged = tagged.replace(")", ">")
    tagged = tagged.replace(",", "><")
    # Empty feature lists leave a stray '<>' pair; drop it.
    return tagged.replace("<>", "")


def change_text_to_lemma_POS(sentences, save=False, filename=None):
    """Convert each sentence to 'lemma **<POS><tags>**' token sequences.

    When save is True the result list is pickled to `filename`.
    Returns the list of converted sentences.
    """
    lemmapos_sentences = []
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=False, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    for j, sentenceToBeProcessed in enumerate(sentences):
        # Print progress once every 1000 sentences.
        if j % 1000 == 0:
            print(j + 1)
            print("of")
            print(len(sentences))
        sentenceToBeProcessed = sentenceToBeProcessed.rstrip('\n')
        output = frog.process(sentenceToBeProcessed)

        # Build "lemma **<POS><feat>...**" per token, then join once
        # (was quadratic string concatenation followed by [1:]).
        words = []
        for token in output:
            pos = _cgn_pos_to_tags(str(token.get("pos")))
            lemma = str(token.get("lemma"))
            words.append(lemma + " " + "**" + pos + "**")
        lemmapos_sentences.append(" ".join(words))

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(lemmapos_sentences, outputfile)

    return lemmapos_sentences
def main():
    """Evaluate Frog NER against gold-standard IOB files.

    Reads (token, tag) pairs per file, groups them into sentences and
    reference entities, runs Frog NER on each sentence, and prints
    macro-averaged precision/recall plus per-class micro-averages.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-s', '--nerset', type=str, help="NER FoLiA Set", action='store',
        default="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/namedentities.foliaset.ttl")
    parser.add_argument('-c', '--config', type=str,
                        help="Frog configuration", action='store',
                        required=True)
    parser.add_argument('--notexact', dest='exact', help="Loose evaluation",
                        action='store_false', default=True)
    parser.add_argument('files', nargs='+', help='bar help')
    args = parser.parse_args()

    frog = Frog(FrogOptions(ner=True, parser=False, xmlout=True), args.config)

    sentence = []
    entities = []
    precisions = []
    recalls = []
    entity = None
    entity_cls = None
    classeval = defaultdict(lambda: defaultdict(int))

    def flush_sentence():
        """Run Frog over the accumulated sentence and score its entities."""
        nonlocal sentence, entities
        if sentence:
            print("Processing: ", " ".join(sentence), file=sys.stderr)
            print(" Reference entities:", entities, file=sys.stderr)
            doc = frog.process(" ".join(sentence))
            precision, recall = evaluate(doc, entities, args.nerset,
                                         classeval, args.exact)
            print(" precision=", precision, " recall=", recall,
                  file=sys.stderr)
            if precision is not None:
                precisions.append(precision)
            if recall is not None:
                recalls.append(recall)
        sentence = []
        entities = []

    for filename in args.files:
        # extracttrain also works on test gold standard
        for token, tag in readdata(filename):
            if token is None:  # end of sentence
                if entity:
                    entities.append((" ".join(entity), entity_cls))
                entity = []
                flush_sentence()
            else:
                if tag[0] == 'B':
                    # Start of a new entity; close any open one first.
                    if entity:
                        entities.append((" ".join(entity), entity_cls))
                    entity = []
                    entity_cls = tag[2:]
                    entity.append(token)
                elif tag[0] == 'I':
                    entity.append(token)
                elif entity:
                    # O tag closes an open entity.
                    entities.append((" ".join(entity), entity_cls))
                    entity = []
                sentence.append(token)

    # BUG FIX: a trailing sentence (input not ending with a separator
    # line) was never evaluated.
    if entity:
        entities.append((" ".join(entity), entity_cls))
    flush_sentence()

    # BUG FIX: guard against division by zero when no sentences yielded
    # a precision/recall value.
    if precisions:
        print("overall precision (macroav):\t",
              sum(precisions) / len(precisions))
    else:
        print("overall precision (macroav):\tn/a")
    if recalls:
        print("overall recall (macroav):\t", sum(recalls) / len(recalls))
    else:
        print("overall recall (macroav):\tn/a")

    for cls, evaldata in classeval.items():
        try:
            print(cls + " precision (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fp']))
        except ZeroDivisionError:
            print(cls + " precision (microav):\tn/a")
        try:
            print(cls + " recall (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fn']))
        except ZeroDivisionError:
            print(cls + " recall (microav):\tn/a")
g.write( "s_plural,item,next_stress,next_sound,f1,f2,f3,f4,f5,f6,f7,f8,ort\n") for t in trials: print("Processing " + t) grep_text = "(^{}|[ ']{})({}|{})(?=[ .,?!':;])".format( t[0].upper(), t[0], trials_dict[t]["en"][1:], trials_dict[t]["s"][1:]) w_lines = subprocess.check_output( ["grep", "-P", grep_text, f_path + "OpenSubtitles2018_nl_raw.txt"], universal_newlines=True) w_lines_list = w_lines.split("\n") w_lines_list = list(set(w_lines_list)) w_lines_list = [i for i in w_lines_list if i != ""] for line in w_lines_list: line = re.sub(" & ", " en ", line) frog_output = frog.process(line) print(line) frog_output_clean = [i for i in frog_output if i["pos"] != "LET()"] lemmas = [i["lemma"] for i in frog_output_clean] if t not in lemmas: continue # get indices of matching words w_indices = [] for num, b in enumerate(frog_output_clean, 0): if b["text"].lower() in [ trials_dict[t]["en"], trials_dict[t]["s"] ] and b["pos"] == "N(soort,mv,basis)": w_indices.append(num) for w_index in w_indices: s_plural = "1" if frog_output_clean[w_index]["text"][ -1] == "s" else "0"