def _collect_colloc_context(colloc, i):
    """Return the two word forms before and after position *i* in *colloc*
    (a list of (word_form, lemma, pos) tuples), using "" past the edges."""
    w1 = str(colloc[i - 2][0]) if i - 2 >= 0 else ""
    w2 = str(colloc[i - 1][0]) if i - 1 >= 0 else ""
    w4 = str(colloc[i + 1][0]) if i + 1 <= len(colloc) - 1 else ""
    w5 = str(colloc[i + 2][0]) if i + 2 <= len(colloc) - 1 else ""
    return w1, w2, w4, w5


def _record_lemma_collocation(outdict, lemma, sent):
    """Count one occurrence of *lemma* together with its 5-word context
    *sent*, under the keys 'tíðni' (total freq) and 'orðstaða' (contexts)."""
    if lemma in outdict:
        entry = outdict[lemma]
        entry['orðstaða'][sent] = entry['orðstaða'].get(sent, 0) + 1
        entry['tíðni'] += 1
    else:
        outdict[lemma] = {'tíðni': 1, 'orðstaða': {sent: 1}}


def _write_lemma_collocations(outputfile, outdict):
    """Write candidates sorted by total frequency, each with its (at most)
    five most frequent context examples:  word : freq. [sent 1, sent 2...]"""
    candidates = sorted(outdict.items(),
                        key=lambda item: item[1]['tíðni'],
                        reverse=True)  # Sort candidates by total frequency
    for lemma, entry in candidates:
        freq = entry['tíðni']
        # Sort the sentence examples by their frequencies; keep the 5 most
        # frequent ones (this limit can be changed).
        sorted_sents = sorted(entry['orðstaða'].items(),
                              key=lambda item: item[1],
                              reverse=True)
        sents = [sent for sent, _ in sorted_sents[:5]]
        outputfile.write(lemma + ' : ' + str(freq) + '. ' + str(sents) + '\n')


def lemmas_collocations(database, IGC_folder, prop_names):
    """
    Iterate through the IGC/RMH corpus and collect lemmas that are found
    neither in the chosen lexicon database nor in the stop-word filters,
    together with 5-word collocations (two words of context on each side)
    and their frequencies. Writes a .freq file under uttak/<database>/.

    :param database: 'NMO' or 'BIN' -- which lexicon database to query.
    :param IGC_folder: corpus folder to scan recursively for XML files.
    :param prop_names: if False, proper names (POS 'n...s') are skipped.
    """
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    outdict = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni. 
    ============================================================
    """)
    xml_files = glob.glob(IGC_folder + '/**/*.xml', recursive=True)

    filebar = IncrementalBar('Framvinda', max=len(xml_files))
    for file in xml_files:
        colloc = []
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                for word in tree.iter():
                    if word.text is not None:
                        if word.attrib.get('lemma') is not None:
                            colloc.append((word.text,
                                           word.attrib.get('lemma'),
                                           word.attrib.get('type')))
                        elif word.text in punctuation:
                            # Punctuation keeps its slot so context windows
                            # line up, but carries no lemma/POS.
                            colloc.append((word.text, ' ', ' '))

                for i, (word_form, lemma, pos) in enumerate(colloc):
                    if prop_names == False:
                        if pos.startswith('n') and pos.endswith('s'):
                            continue  # Ignore proper names
                    if pos in pos_to_ignore:
                        continue
                    # Words starting or ending in a hyphen are likely OCR
                    # errors; ignore them.
                    if lemma[-1] == '-' or lemma[0] == '-':
                        continue
                    # Ignore anything containing a non-alphabetic,
                    # non-hyphen character.
                    if not all(ch.isalpha() or ch == '-' for ch in lemma):
                        continue
                    filter_query = SQLiteQuery(lemma,
                                               'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    if database == 'NMO':
                        table, cur = 'DCI_ELEMENT', dci.cursor
                    elif database == 'BIN':
                        table, cur = 'DIM_ELEMENT', dim.cursor
                    # Capitalized words are covered by the lowercase query.
                    query = SQLiteQuery(lemma, 'lemma', table, cursor=cur)
                    query_lower = SQLiteQuery(lemma.lower(), 'lemma', table,
                                              cursor=cur)
                    # Keep the word only if it is found neither in the
                    # database nor in the filters.
                    if not query.exists and not query_lower.exists:
                        if len(lemma) > 1:
                            w1, w2, w4, w5 = _collect_colloc_context(colloc, i)
                            sent = (w1 + ' ' + w2 + ' ' + word_form + ' ' +
                                    w4 + ' ' + w5)
                            _record_lemma_collocation(outdict, lemma, sent)
            except sqlite3.OperationalError:
                pass
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    # The output file name depends only on which corpus subset was scanned.
    if IGC_folder == "malheildir/RMH/":
        namefolder = "RMH"
    elif IGC_folder == "malheildir/RMH/CC_BY/":
        namefolder = "CC_BY"
    elif IGC_folder == "malheildir/RMH/MIM/":
        namefolder = "MIM"
    else:
        namefolder = IGC_folder.split("/")[3]

    outpath = f'uttak/{database}/{namefolder}_lemmur_med_orstodulyklum.freq'
    with open(outpath, mode='w+') as outputfile:
        _write_lemma_collocations(outputfile, outdict)

    print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_lemmur_med_orstodulyklum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
def texttype_freqs(database, folder, prop_names):
    """
    Used to collect lemmas by the types of text they appear in and sort
    them by frequency. Filters the RMH in order to retrieve the desired
    results. The script can be modified according to the user's need 
    and to fit another corpus.  

    :param database: 'NMO' or 'BIN' -- which lexicon database to query.
    :param folder: corpus folder to scan recursively for XML files.
    :param prop_names: if False, proper names (POS 'n...s') are skipped.
    """
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db') # Predefined stop-word list based on the RMH
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    xml_files = glob.glob(folder + '/**/*.xml', recursive=True)

    alltexttypes = []
    freqdic1 = {}   # lemma -> total frequency
    freqdic2 = {}   # (lemma, texttype) -> frequency
    filebar = IncrementalBar('Framvinda', max=len(xml_files))
    for file in xml_files:
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                root = tree.getroot()
                # Retrieve the texttype tag from the XML file header;
                # files without it raise IndexError and are skipped.
                texttype = root[0][2][0][0][0][0].text
                if texttype not in alltexttypes:
                    alltexttypes.append(texttype)  # Collect all unique texttypes
                for word in tree.iter():
                    pos = word.attrib.get('type')
                    # Guard against elements with a POS tag but no text,
                    # which previously raised an uncaught TypeError.
                    if pos is None or word.text is None:
                        continue
                    if prop_names == False:
                        if pos.startswith('n') and pos.endswith('s'):
                            continue  # Ignore proper names
                    if pos in pos_to_ignore:
                        continue
                    # Ignore all that are not alphabetic letters or hyphen
                    if not all(ch.isalpha() or ch == '-' for ch in word.text):
                        continue
                    # Ignore very short words, likely to be particles
                    if len(word.text) < 3:
                        continue
                    # Ignore words starting or ending with a hyphen
                    # (likely OCR errors)
                    if word.text[-1] == '-' or word.text[0] == '-':
                        continue
                    lemma = word.attrib.get('lemma')
                    if lemma is None:
                        continue
                    # Ignore stop words
                    filter_query = SQLiteQuery(lemma, 'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    if database == 'NMO':
                        table, cur = 'DCI_ELEMENT', dci.cursor
                    elif database == 'BIN':
                        table, cur = 'DIM_ELEMENT', dim.cursor
                    # Capitalized words are covered by the lowercase query.
                    query = SQLiteQuery(lemma, 'lemma', table, cursor=cur)
                    query_lower = SQLiteQuery(lemma.lower(), 'lemma', table,
                                              cursor=cur)
                    # If the word is not found in the DIM or the stopwords
                    if not query.exists and not query_lower.exists:
                        freqdic1[lemma] = freqdic1.get(lemma, 0) + 1
                        key = (lemma, texttype)
                        freqdic2[key] = freqdic2.get(key, 0) + 1
            except (IndexError, ET.ParseError):
                # Malformed file: skip it, but still advance the progress
                # bar below (the old code's `continue` skipped filebar.next()).
                pass
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    print("""
    ============================================================
    Flokkar tíðni eftir textagerðum. 
    ============================================================
    """)

    tempfinal = []
    bar1 = IncrementalBar('Framvinda', max=len(freqdic1))
    for lemma, total in sorted(freqdic1.items()):
        # Per-texttype frequencies, 0 for texttypes the lemma never had;
        # sorted so columns line up with the sorted header below.
        row = [lemma, total]
        for tt in sorted(alltexttypes):
            row.append(freqdic2.get((lemma, tt), 0))
        # The format of this list is [lemma, totalfreq, texttype_a freq, ...]
        tempfinal.append(row)
        bar1.next()
        sys.stdout.flush()
    bar1.finish()

    header = ['Lemma', 'Heildartíðni'] + sorted(alltexttypes)

    # The output file name depends only on which corpus subset was scanned.
    if folder == "malheildir/RMH/":
        namefolder = "RMH"
    elif folder == "malheildir/RMH/CC_BY/":
        namefolder = "CC_BY"
    elif folder == "malheildir/RMH/MIM/":
        namefolder = "MIM"
    else:
        namefolder = folder.split("/")[3]

    with open(f'uttak/{database}/{namefolder}_textagerdir.csv',
              mode='w+') as outputfile:
        csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header)
        csvwriter.writerows(tempfinal)

    print(f"""
    ============================================================
    Úttaksskjalið {namefolder}_textagerdir.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/
    ============================================================
        """)
def _collect_word_context(words, i):
    """Return the two tokens before and after position *i* in *words*,
    substituting "" where the window runs past either end of the list."""
    w1 = str(words[i - 2]) if i - 2 >= 0 else ""
    w2 = str(words[i - 1]) if i - 1 >= 0 else ""
    w4 = str(words[i + 1]) if i + 1 <= len(words) - 1 else ""
    w5 = str(words[i + 2]) if i + 2 <= len(words) - 1 else ""
    return w1, w2, w4, w5


def _record_word_collocation(outdict, w, sent):
    """Count one occurrence of candidate *w* with its 5-word context *sent*
    under the keys 'freq' (total) and 'colloc' (context -> count)."""
    if w in outdict:
        entry = outdict[w]
        entry['colloc'][sent] = entry['colloc'].get(sent, 0) + 1
        entry['freq'] += 1
    else:
        outdict[w] = {'freq': 1, 'colloc': {sent: 1}}


def user_defined_collocations(database, filterbase, corpus):
    """
    Iterates through the corpus and retrieves the words that do 
    not appear in the database. Collects 5 word collocations on
    every word, two words before and after the candidate word. 

    :param database: path to the user's lexicon SQLite database.
    :param filterbase: path to a stop-word database, or the string 'None'
        to skip filtering.
    :param corpus: folder scanned recursively for .txt files.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    # The literal string 'None' means no stop-word database was defined.
    filters = SQLDatabase(db_name=filterbase) if filterbase != 'None' else None

    outdict = {}

    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    filebar = IncrementalBar('Progress', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            words = content.read().split()
            for i, w in enumerate(words):
                # Words starting or ending in a hyphen are likely OCR
                # errors; ignore them.
                if w[-1] == '-' or w[0] == '-':
                    continue
                # Ignore anything containing a non-alphabetic, non-hyphen
                # character.
                if not all(ch.isalpha() or ch == '-' for ch in w):
                    continue
                # If a stopword database has been defined, filter the results.
                if filters is not None:
                    filter_query = SQLiteQuery(w,
                                               'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                # Parameters must be updated if the database format is
                # changed. Capitalized words are covered by the lowercase
                # query.
                query = SQLiteQuery(w, 'word', 'LEXICON_WORD',
                                    cursor=db.cursor)
                query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD',
                                          cursor=db.cursor)
                # Keep the word only if it is found neither in the database
                # nor in the filters.
                if not query.exists and not query_lower.exists:
                    if len(w) > 1:
                        w1, w2, w4, w5 = _collect_word_context(words, i)
                        sent = w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5
                        _record_word_collocation(outdict, w, sent)

        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    output_file = input("""
    ============================================================
    Please indicate what your output file should be called,
    followed by .freq

    Example: lexicon_collocations.freq
    ============================================================
    """)

    with open('output/user_defined/' + output_file, mode='w+') as outputfile:
        # Sort the candidates by their total frequencies
        candidates = sorted(outdict.items(),
                            key=lambda item: item[1]['freq'],
                            reverse=True)
        for key, entry in candidates:
            freq = entry['freq']
            # Sort the sentence examples by their frequencies and keep the
            # 5 most frequent ones (this limit can be changed).
            sorted_sents = sorted(entry['colloc'].items(),
                                  key=lambda item: item[1],
                                  reverse=True)
            sents = [sent for sent, _ in sorted_sents[:5]]
            # word: freq. [sent example 1, sent example 2...]
            outputfile.write(key + ' : ' + str(freq) + '. ' + str(sents) +
                             '\n')

    print(f"""
    ============================================================
    Output file {output_file} is ready and can be 
    found at the output/user_defined/ directory.
    ============================================================
    """)
# Beispiel #4
def user_defined_freqlist(database, filterbase, corpus):
    """
    Iterates through a user-defined corpus and compares
    the results to a user-defined database, filtering out
    stopwords if the user has defined a stopword database.
    Returns a frequency word list.

    :param database: path to the user's lexicon SQLite database.
    :param filterbase: path to a stop-word database, or 'n'/'N' to skip
        filtering.
    :param corpus: folder scanned recursively for .txt files.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    # 'n'/'N' means the user declined to provide a stop-word database.
    use_filters = filterbase not in ['n', 'N']
    filters = SQLDatabase(db_name=filterbase) if use_filters else None

    outdict = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    filebar = IncrementalBar('Framvinda', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            for w in content.read().split():
                # Words starting or ending in a hyphen are likely OCR
                # errors; ignore them.
                if w[-1] == '-' or w[0] == '-':
                    continue
                # Ignore anything containing a non-alphabetic, non-hyphen
                # character.
                if not all(ch.isalpha() or ch == '-' for ch in w):
                    continue
                # If a stopword database has been defined, filter the results.
                if use_filters:
                    filter_query = SQLiteQuery(w,
                                               'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                # Parameters must be updated if the database format is
                # changed. Capitalized words are covered by the lowercase
                # query.
                query = SQLiteQuery(w, 'word', 'LEXICON_WORD',
                                    cursor=db.cursor)
                query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD',
                                          cursor=db.cursor)
                if not query.exists and not query_lower.exists:
                    if len(w) > 1:
                        outdict[w] = outdict.get(w, 0) + 1
        filebar.next()
        sys.stdout.flush()
    filebar.finish()

    output_file = input("""
    ============================================================
    Skrifaðu það sem þú vilt að úttaksskjalið heiti með 
    endingunni .freq
 
    Dæmi: ordasafn.freq
    ============================================================
    """)

    with open('uttak/notendagogn/' + output_file, mode='w+') as outputfile:
        # Write the candidates sorted by frequency, most frequent first.
        for key, value in sorted(outdict.items(),
                                 key=lambda item: item[1],
                                 reverse=True):
            outputfile.write(key + ': ' + str(value) + '\n')

    print(f"""
    ============================================================
    Úttaksskjalið {output_file} er tilbúið og má finna í 
    undirmöppunni uttak/notendagogn/
    ============================================================
    """)