Exemple #1
0
def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    Write a frequency list of unknown lemmas with their attested word forms.

    Iterates through the IGC, collecting the lemmas that are not found in
    the selected database (neither as written nor lower-cased), how often
    each one occurs, and every distinct word form that appears alongside
    the lemma in the corpus. Useful for detecting whether a word only
    appears in certain context (e.g. fixed expressions) or whether a
    certain wordform never appears. Can be modified to fit the user's
    need.

    The result is written to
    uttak/<database>/<label>_lemmur_med_ordmyndum.freq, one
    "lemma: {...}" line per lemma, most frequent lemma first. The output
    directory is not created here and has to exist already.

    Args:
        database: 'NMO' or 'BIN'. Selects the database the lemmas are
            looked up in and names the output subdirectory.
        IGC_folder: Path string of the corpus folder. It is handed to
            IGCExtractor and also decides the output file name.
        prop_names: When equal to False, tokens whose POS tag starts
            with 'n' and ends with 's' (proper names) are skipped.

    Raises:
        ValueError: If database is neither 'NMO' nor 'BIN'.
    """
    lookup_tables = {'NMO': 'DCI_ELEMENT', 'BIN': 'DIM_ELEMENT'}
    if database not in lookup_tables:
        # An unknown name used to surface only as an UnboundLocalError on
        # the first candidate lemma; fail early with a clear message.
        raise ValueError(
            f"Unknown database {database!r}; expected 'NMO' or 'BIN'.")

    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db'
                          )  # Predefined stop-word list based on the RMH
    lookup_db = dci if database == 'NMO' else dim
    lookup_table = lookup_tables[database]
    pos_to_ignore = (
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    )  # The POS tags that should not be displayed in the results
    # Compared with == on purpose: only an explicit False (or 0) switches
    # the proper-name filter on; None keeps proper names, as before.
    skip_proper_names = prop_names == False  # noqa: E712
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)

    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            pos = word.pos
            lemma = word.lemma
            if (skip_proper_names and pos.startswith('n')
                    and pos.endswith('s')):  # Ignore proper names
                continue
            if pos in pos_to_ignore:
                continue
            # Ignore if not only letters or letters and hyphen
            if not all(ch.isalpha() or ch == '-' for ch in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            if SQLiteQuery(lemma,
                           'filter',
                           'FILTER_WORD_FORMS',
                           cursor=filters.cursor).exists:
                continue
            # Known lemma as written (capitalized words included) ...
            if SQLiteQuery(lemma,
                           'lemma',
                           lookup_table,
                           cursor=lookup_db.cursor).exists:
                continue
            # ... or known once lower-cased. The second lookup is only
            # needed when lower-casing actually changes the lemma.
            lowered = lemma.lower()
            if lowered != lemma and SQLiteQuery(
                    lowered, 'lemma', lookup_table,
                    cursor=lookup_db.cursor).exists:
                continue

            entry = freqdic.get(lemma)
            if entry is None:
                freqdic[lemma] = {
                    'tíðni': 1,
                    'orðmyndir': [word.word_form]
                }
            else:
                if word.word_form not in entry['orðmyndir']:
                    entry['orðmyndir'].append(word.word_form)
                entry['tíðni'] += 1
        except IndexError:
            # Best effort: a token that cannot be indexed is skipped
            # instead of aborting the whole corpus pass.
            continue
        except ET.ParseError:
            # NOTE(review): presumably raised by the XML layer behind the
            # extractor - confirm it can actually surface here.
            continue

    print("""
    ============================================================
    Flokkar orð eftir tíðni.
    ============================================================
    """)

    known_labels = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    label = known_labels.get(IGC_folder)
    if label is None:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            label = parts[3]
        else:
            # Shallower paths used to crash with IndexError (or give an
            # empty label) after the whole corpus had been processed;
            # fall back to the last named path component.
            named = [part for part in parts if part]
            label = named[-1] if named else ''

    # sorted() is stable, so lemmas with equal frequency keep the order in
    # which they were first seen in the corpus.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['tíðni'],
                    reverse=True)
    # The lemmas are Icelandic text: write UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open(f'uttak/{database}/{label}_lemmur_med_ordmyndum.freq',
              mode='w',
              encoding='utf-8') as outputfile:
        outputfile.writelines(f'{lemma}: {entry}\n'
                              for lemma, entry in ranked)

    # The MIM banner has always carried four trailing spaces after the
    # directory name; they are kept so console output stays unchanged.
    padding = '    ' if IGC_folder == "malheildir/RMH/MIM/" else ''
    # The banner names the file that was actually written (the fallback
    # branch used to print the database name instead of the folder name).
    print(f"""
    ============================================================
    Úttaksskjalið {label}_lemmur_med_ordmyndum.freq er tilbúið og 
    er að finna í undirmöppunni uttak/{database}/{padding}
    ============================================================
        """)
Exemple #2
0
def lemma_output(database, IGC_folder, prop_names):
    """
    Write a frequency list of corpus lemmas missing from a database.

    Iterates over the input corpus and collects the lemmas not found in
    the selected database (neither as written nor lower-cased), ordered
    by frequency. Each lemma also gets a breakdown of its occurrences by
    grammatical number, indicating if a noun only exists in either the
    singular or the plural form (and whether the automatic
    lemmatization/pos tagging is off). Can be altered for other databases
    or corpora.

    The result is written to output/<database>/<label>_lemma.freq, one
    "lemma: {...}" line per lemma, most frequent lemma first. The output
    directory is not created here and has to exist already.

    Args:
        database: 'DCI' or 'DIM'. Selects the database the lemmas are
            looked up in and names the output subdirectory.
        IGC_folder: Path string of the corpus folder. It is handed to
            IGCExtractor and also decides the output file name.
        prop_names: When equal to False, tokens whose POS tag starts
            with 'n' and ends with 's' (proper names) are skipped.

    Raises:
        ValueError: If database is neither 'DCI' nor 'DIM'.
    """
    lookup_tables = {'DCI': 'DCI_ELEMENT', 'DIM': 'DIM_ELEMENT'}
    if database not in lookup_tables:
        # An unknown name used to surface only as an UnboundLocalError on
        # the first candidate lemma; fail early with a clear message.
        raise ValueError(
            f"Unknown database {database!r}; expected 'DCI' or 'DIM'.")

    dci = SQLDatabase(db_name='databases/dci.db')
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    filters = SQLDatabase(db_name='databases/IGC_filters.db'
                          )  # Predefined stop-word list based on the IGC
    lookup_db = dci if database == 'DCI' else dim
    lookup_table = lookup_tables[database]
    pos_to_ignore = (
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    )  # The POS tags that should not be displayed in the results
    # Third character of a noun tag -> counter it feeds:
    # 'e' is singular (eintala), 'f' is plural (fleirtala).
    number_by_tag = {'e': 'sing', 'f': 'plur'}
    # Compared with == on purpose: only an explicit False (or 0) switches
    # the proper-name filter on; None keeps proper names, as before.
    skip_proper_names = prop_names == False  # noqa: E712
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    for word in IGC.extract(forms=False, lemmas=True, pos=True):
        try:
            pos = word.pos
            lemma = word.lemma
            if (skip_proper_names and pos.startswith('n')
                    and pos.endswith('s')):  # Ignore proper names
                continue
            if pos in pos_to_ignore:
                continue
            # Ignore if not only letters or letters and hyphen
            if not all(ch.isalpha() or ch == '-' for ch in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            if SQLiteQuery(lemma,
                           'filter',
                           'FILTER_WORD_FORMS',
                           cursor=filters.cursor).exists:
                continue
            # Known lemma as written (capitalized words included) ...
            if SQLiteQuery(lemma,
                           'lemma',
                           lookup_table,
                           cursor=lookup_db.cursor).exists:
                continue
            # ... or known once lower-cased. The second lookup is only
            # needed when lower-casing actually changes the lemma.
            lowered = lemma.lower()
            if lowered != lemma and SQLiteQuery(
                    lowered, 'lemma', lookup_table,
                    cursor=lookup_db.cursor).exists:
                continue

            # Work out the number bucket BEFORE touching any counter. A
            # noun tag too short to index raises IndexError here, so the
            # token is skipped as a whole and 'freq' always equals the
            # sum of the three number counters.
            if pos[0] == 'n':
                number = number_by_tag.get(pos[2], 'no_number')
            else:
                number = 'no_number'

            entry = freqdic.get(lemma)
            if entry is None:
                entry = freqdic[lemma] = {
                    'freq': 0,
                    'number': {
                        'sing': 0,
                        'plur': 0,
                        'no_number': 0
                    }
                }
            entry['freq'] += 1
            # Repeated nouns without a singular/plural tag are now counted
            # as 'no_number', just like their first occurrence always was.
            entry['number'][number] += 1
        except IndexError:
            # Best effort: a malformed token is skipped instead of
            # aborting the whole corpus pass.
            continue
        except ET.ParseError:
            # NOTE(review): presumably raised by the XML layer behind the
            # extractor - confirm it can actually surface here.
            continue

    print("""
    ============================================================
    Sorting candidate frequencies.
    ============================================================
    """)
    known_labels = {
        "corpora/IGC/": "IGC",
        "corpora/IGC/CC_BY/": "CC_BY",
        "corpora/IGC/TIC/": "TIC",
    }
    label = known_labels.get(IGC_folder)
    if label is None:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            label = parts[3]
        else:
            # Shallower paths used to crash with IndexError (or give an
            # empty label) after the whole corpus had been processed;
            # fall back to the last named path component.
            named = [part for part in parts if part]
            label = named[-1] if named else ''

    # sorted() is stable, so lemmas with equal frequency keep the order in
    # which they were first seen in the corpus.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['freq'],
                    reverse=True)
    # Lemmas may contain non-ASCII letters: write UTF-8 explicitly instead
    # of relying on the platform's default encoding.
    with open(f'output/{database}/{label}_lemma.freq',
              mode='w',
              encoding='utf-8') as outputfile:
        outputfile.writelines(f'{lemma}: {entry}\n'
                              for lemma, entry in ranked)

    print(f"""
    ============================================================
    Output file {label}_lemma.freq is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """)
Exemple #3
0
def wordform_output(IGC_folder, prop_names):
    """
    Write a frequency list of corpus word forms missing from the DIM.

    Iterates over the input corpus and collects the word forms not found
    in the DIM (neither as written nor lower-cased), ordered by
    frequency. Can be altered for other databases or corpora.

    The result is written to output/DIM/<label>_wordform.freq, one
    "word_form: count" line per word form, most frequent first. The
    output directory is not created here and has to exist already.

    Args:
        IGC_folder: Path string of the corpus folder. It is handed to
            IGCExtractor and also decides the output file name.
        prop_names: When equal to False, tokens whose POS tag starts
            with 'n' and ends with 's' (proper names) are skipped.
    """
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    filters = SQLDatabase(db_name='databases/IGC_filters.db'
                          )  # Predefined stop-word list based on the IGC
    pos_to_ignore = (
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    )  # The POS tags that should not be displayed in the results
    # Compared with == on purpose: only an explicit False (or 0) switches
    # the proper-name filter on; None keeps proper names, as before.
    skip_proper_names = prop_names == False  # noqa: E712
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)
    for word in IGC.extract(forms=True, lemmas=False, pos=True):
        try:
            pos = word.pos
            form = word.word_form
            if (skip_proper_names and pos.startswith('n')
                    and pos.endswith('s')):  # Ignore proper names
                continue
            if pos in pos_to_ignore:
                continue
            # Ignore if not only letters or letters and hyphen
            if not all(ch.isalpha() or ch == '-' for ch in form):
                continue
            if len(form) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (form[0], form[1], form[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            if SQLiteQuery(form,
                           'filter',
                           'FILTER_WORD_FORMS',
                           cursor=filters.cursor).exists:
                continue
            # Known word form as written (capitalized words included) ...
            if SQLiteQuery(form,
                           'word_form',
                           'DIM_ELEMENT',
                           cursor=dim.cursor).exists:
                continue
            # ... or known once lower-cased. The second lookup is only
            # needed when lower-casing actually changes the word form.
            lowered = form.lower()
            if lowered != form and SQLiteQuery(
                    lowered, 'word_form', 'DIM_ELEMENT',
                    cursor=dim.cursor).exists:
                continue

            freqdic[form] = freqdic.get(form, 0) + 1
        except IndexError:
            # Best effort: a token that cannot be indexed is skipped
            # instead of aborting the whole corpus pass.
            continue
        except ET.ParseError:
            # NOTE(review): presumably raised by the XML layer behind the
            # extractor - confirm it can actually surface here.
            continue
    print("""
    ============================================================
    Sorting candidate frequencies.
    ============================================================
    """)

    known_labels = {
        "corpora/IGC/": "IGC",
        "corpora/IGC/CC_BY/": "CC_BY",
        "corpora/IGC/TIC/": "TIC",
    }
    label = known_labels.get(IGC_folder)
    if label is None:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            label = parts[3]
        else:
            # Shallower paths used to crash with IndexError (or give an
            # empty label) after the whole corpus had been processed;
            # fall back to the last named path component.
            named = [part for part in parts if part]
            label = named[-1] if named else ''

    # sorted() is stable, so word forms with equal frequency keep the
    # order in which they were first seen in the corpus.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1],
                    reverse=True)
    # Word forms may contain non-ASCII letters: write UTF-8 explicitly
    # instead of relying on the platform's default encoding.
    with open('output/DIM/' + label + '_wordform.freq',
              mode='w',
              encoding='utf-8') as outputfile:
        outputfile.writelines(f'{form}: {count}\n'
                              for form, count in ranked)

    # The banner used to announce "<label>_wordforms.freq", a file that is
    # never written; it now names the real "<label>_wordform.freq".
    print(f"""
    ============================================================
    Output file {label}_wordform.freq is ready and can be 
    found in the output/DIM/ directory.
    ============================================================
        """)
def lemma_output(database, IGC_folder, prop_names):
    """
    Write a frequency list of corpus lemmas missing from a database.

    Iterates over the input corpus and collects the lemmas not found in
    the selected database (neither as written nor lower-cased), ordered
    by frequency. Each lemma also gets a breakdown of its occurrences by
    grammatical number ('tala'), indicating if a noun only exists in
    either the singular or the plural form (and whether the automatic
    lemmatization/pos tagging is off). Can be altered for other databases
    or corpora.

    The result is written to uttak/<database>/<label>_lemmur.freq, one
    "lemma: {...}" line per lemma, most frequent lemma first. The output
    directory is not created here and has to exist already.

    Args:
        database: 'NMO' or 'BIN'. Selects the database the lemmas are
            looked up in and names the output subdirectory.
        IGC_folder: Path string of the corpus folder. It is handed to
            IGCExtractor and also decides the output file name.
        prop_names: When equal to False, tokens whose POS tag starts
            with 'n' and ends with 's' (proper names) are skipped.

    Raises:
        ValueError: If database is neither 'NMO' nor 'BIN'.
    """
    lookup_tables = {'NMO': 'DCI_ELEMENT', 'BIN': 'DIM_ELEMENT'}
    if database not in lookup_tables:
        # An unknown name used to surface only as an UnboundLocalError on
        # the first candidate lemma; fail early with a clear message.
        raise ValueError(
            f"Unknown database {database!r}; expected 'NMO' or 'BIN'.")

    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db'
                          )  # Predefined stop-word list based on the RMH
    lookup_db = dci if database == 'NMO' else dim
    lookup_table = lookup_tables[database]
    pos_to_ignore = (
        'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'
    )  # The POS tags that should not be displayed in the results
    # Third character of a noun tag -> counter it feeds:
    # 'e' is singular (eintala), 'f' is plural (fleirtala).
    number_by_tag = {'e': 'eintala', 'f': 'fleirtala'}
    # Compared with == on purpose: only an explicit False (or 0) switches
    # the proper-name filter on; None keeps proper names, as before.
    skip_proper_names = prop_names == False  # noqa: E712
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    for word in RMH.extract(forms=False, lemmas=True, pos=True):
        try:
            pos = word.pos
            lemma = word.lemma
            if (skip_proper_names and pos.startswith('n')
                    and pos.endswith('s')):  # Ignore proper names
                continue
            if pos in pos_to_ignore:
                continue
            # Ignore if not only letters or letters and hyphen
            if not all(ch.isalpha() or ch == '-' for ch in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            if SQLiteQuery(lemma,
                           'filter',
                           'FILTER_WORD_FORMS',
                           cursor=filters.cursor).exists:
                continue
            # Known lemma as written (capitalized words included) ...
            if SQLiteQuery(lemma,
                           'lemma',
                           lookup_table,
                           cursor=lookup_db.cursor).exists:
                continue
            # ... or known once lower-cased. The second lookup is only
            # needed when lower-casing actually changes the lemma.
            lowered = lemma.lower()
            if lowered != lemma and SQLiteQuery(
                    lowered, 'lemma', lookup_table,
                    cursor=lookup_db.cursor).exists:
                continue

            # Work out the number bucket BEFORE touching any counter. A
            # noun tag too short to index raises IndexError here, so the
            # token is skipped as a whole and 'tíðni' always equals the
            # sum of the three 'tala' counters.
            if pos[0] == 'n':
                number = number_by_tag.get(pos[2], 'engin_tala')
            else:
                number = 'engin_tala'

            entry = freqdic.get(lemma)
            if entry is None:
                entry = freqdic[lemma] = {
                    'tíðni': 0,
                    'tala': {
                        'eintala': 0,
                        'fleirtala': 0,
                        'engin_tala': 0
                    }
                }
            entry['tíðni'] += 1
            # Repeated nouns without a singular/plural tag are now counted
            # as 'engin_tala', just like their first occurrence always was.
            entry['tala'][number] += 1
        except IndexError:
            # Best effort: a malformed token is skipped instead of
            # aborting the whole corpus pass.
            continue
        except ET.ParseError:
            # NOTE(review): presumably raised by the XML layer behind the
            # extractor - confirm it can actually surface here.
            continue

    print("""
    ============================================================
    Flokkar orð eftir tíðni.
    ============================================================
    """)
    known_labels = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    label = known_labels.get(IGC_folder)
    if label is None:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            label = parts[3]
        else:
            # Shallower paths used to crash with IndexError (or give an
            # empty label) after the whole corpus had been processed;
            # fall back to the last named path component.
            named = [part for part in parts if part]
            label = named[-1] if named else ''

    # sorted() is stable, so lemmas with equal frequency keep the order in
    # which they were first seen in the corpus.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['tíðni'],
                    reverse=True)
    # The lemmas are Icelandic text: write UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open(f'uttak/{database}/{label}_lemmur.freq',
              mode='w',
              encoding='utf-8') as outputfile:
        outputfile.writelines(f'{lemma}: {entry}\n'
                              for lemma, entry in ranked)

    # Every banner except the one for the full RMH has always carried four
    # trailing spaces after the directory name; they are kept so console
    # output stays unchanged.
    padding = '' if IGC_folder == "malheildir/RMH/" else '    '
    print(f"""
    ============================================================
    Úttaksskjalið {label}_lemmur.freq er tilbúið og er að finna í 
    undirmöppunni uttak/{database}/{padding}
    ============================================================
        """)
def _wordform_output_stem(IGC_folder):
    """
    Return the file-name prefix for the output of ``wordform_output``.

    The three predefined corpus folders map to fixed prefixes. For any
    other folder the fourth path component is used; when that component
    is missing or empty, the last non-empty component is used instead.

    Raises:
        ValueError: if the path contains no non-empty component.
    """
    known_folders = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    if IGC_folder in known_folders:
        return known_folders[IGC_folder]
    components = IGC_folder.split("/")
    if len(components) > 3 and components[3]:
        return components[3]
    named = [part for part in components if part]
    if not named:
        raise ValueError(
            f"Cannot derive an output file name from folder {IGC_folder!r}")
    return named[-1]


def wordform_output(IGC_folder, prop_names):
    """
    Count the corpus word forms that are missing from the DIM_ELEMENT
    table and write them, most frequent first, to
    ``uttak/BIN/<prefix>_ordmyndir.freq``.

    A token is skipped when its POS tag is in the ignore list, when its
    word form contains anything other than letters and hyphens, is
    shorter than three characters, has a hyphen in its first, second or
    last position, or is listed in the FILTER_WORD_FORMS table. The
    remaining forms are counted only if neither the form itself nor its
    lowercased variant exists in DIM_ELEMENT.

    Args:
        IGC_folder: Path string of the corpus folder to read. It also
            determines the prefix of the output file name.
        prop_names: When equal to ``False``, tokens whose POS tag starts
            with 'n' and ends with 's' (proper names) are skipped.

    Returns:
        None. The result is written to disk and progress banners are
        printed to standard output.

    Raises:
        ValueError: if no output file name can be derived from
            ``IGC_folder``.
    """
    # Resolve the output name before the corpus scan, so an unusable
    # folder path fails immediately rather than after all the counting.
    stem = _wordform_output_stem(IGC_folder)

    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db'
                          )  # Predefined stop-word list based on the RMH
    # The POS tags that should not be displayed in the results
    pos_to_ignore = {'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'}
    # Kept as an equality test (not ``not prop_names``) so that values
    # such as None keep their original meaning of "include proper names".
    skip_proper_names = prop_names == False  # noqa: E712
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
    ============================================================
    Les skjöl úr málheildinni.
    ============================================================
    """)
    for word in RMH.extract(forms=True, lemmas=False, pos=True):
        try:
            if (skip_proper_names and word.pos.startswith('n')
                    and word.pos.endswith('s')):  # Ignore proper names
                continue
            if word.pos in pos_to_ignore:
                continue
            form = word.word_form
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in form):
                continue
            if len(form) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (form[0], form[1], form[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(form,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            query = SQLiteQuery(form,
                                'word_form',
                                'DIM_ELEMENT',
                                cursor=dim.cursor)  # Capitalized words included
            query_lower = SQLiteQuery(form.lower(),
                                      'word_form',
                                      'DIM_ELEMENT',
                                      cursor=dim.cursor)
            # Count the form only if it is unknown in both spellings
            if not query.exists and not query_lower.exists:
                freqdic[form] = freqdic.get(form, 0) + 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
    ============================================================
    Flokkar orð eftir tíðni.
    ============================================================
    """)

    # sorted() is stable, so forms with equal counts keep first-seen order.
    by_frequency = sorted(freqdic.items(),
                          key=lambda item: item[1],
                          reverse=True)
    # Explicit UTF-8: the letter filter admits any Unicode letter, which a
    # locale-dependent default encoding may be unable to write.
    with open(f'uttak/BIN/{stem}_ordmyndir.freq', mode='w',
              encoding='utf-8') as outputfile:
        for form, count in by_frequency:
            outputfile.write(f'{form}: {count}\n')

    print(f"""
    ============================================================
    Úttaksskjalið {stem}_ordmyndir.freq er tilbúið og er að finna í 
    undirmöppunni uttak/BIN/
    ============================================================
        """)
Exemple #6
0
def _lemma_output_target(database, IGC_folder):
    """
    Return ``(path, banner)`` for the output of ``lemmabase_wordforms``.

    ``path`` is the output file below ``output/<database>/`` and
    ``banner`` is the completion message naming that file. The three
    predefined corpus folders map to fixed file-name prefixes. For any
    other folder the fourth path component is used; when that component
    is missing or empty, the last non-empty component is used instead.

    Raises:
        ValueError: if the path contains no non-empty component.
    """
    known_folders = {
        "corpora/IGC/": "IGC",
        "corpora/IGC/CC_BY/": "CC_BY",
        "corpora/IGC/TIC/": "TIC",
    }
    stem = known_folders.get(IGC_folder)
    if stem is not None:
        banner = f"""
    ============================================================
    Output file {stem}_lemma_plus_wordform.freq is ready and can be 
    found in the output/{database}/ directory.
    ============================================================
        """
    else:
        components = IGC_folder.split("/")
        if len(components) > 3 and components[3]:
            stem = components[3]
        else:
            named = [part for part in components if part]
            if not named:
                raise ValueError("Cannot derive an output file name from "
                                 f"folder {IGC_folder!r}")
            stem = named[-1]
        # The banner for non-predefined folders wraps after "is ready".
        banner = f"""
    ============================================================
    Output file {stem}_lemma_plus_wordform.freq is ready 
    and can be found in the output/{database}/ directory.
    ============================================================
        """
    return f'output/{database}/{stem}_lemma_plus_wordform.freq', banner


def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    Collect the corpus lemmas that are missing from the chosen database,
    with their frequency and every word form seen with them, and write
    them, most frequent first, to
    ``output/<database>/<prefix>_lemma_plus_wordform.freq``.

    Useful for detecting whether a word only appears in certain contexts
    (e.g. fixed expressions) or whether a certain word form never appears.

    A token is skipped when its POS tag is in the ignore list, when its
    lemma contains anything other than letters and hyphens, is shorter
    than three characters, has a hyphen in its first, second or last
    position, or is listed in the FILTER_WORD_FORMS table. The remaining
    lemmas are recorded only if neither the lemma itself nor its
    lowercased variant exists in the chosen database table.

    Args:
        database: 'DCI' or 'DIM'. Selects the table the lemmas are
            checked against and the output subdirectory.
        IGC_folder: Path string of the corpus folder to read. It also
            determines the prefix of the output file name.
        prop_names: When equal to ``False``, tokens whose POS tag starts
            with 'n' and ends with 's' (proper names) are skipped.

    Returns:
        None. The result is written to disk and progress banners are
        printed to standard output.

    Raises:
        ValueError: if ``database`` is not 'DCI' or 'DIM', or if no
            output file name can be derived from ``IGC_folder``.
    """
    lexicon_tables = {'DCI': 'DCI_ELEMENT', 'DIM': 'DIM_ELEMENT'}
    # Reject unsupported values up front; otherwise the lookup below would
    # be left unbound and fail at the first token that passes the filters.
    if database not in lexicon_tables:
        raise ValueError(
            f"database must be 'DCI' or 'DIM', got {database!r}")
    # Resolve the output target before the corpus scan, so an unusable
    # folder path fails immediately rather than after all the counting.
    output_path, ready_banner = _lemma_output_target(database, IGC_folder)

    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    dci = SQLDatabase(db_name='databases/dci.db')
    filters = SQLDatabase(db_name='databases/IGC_filters.db'
                          )  # Predefined stop-word list based on the IGC
    lexicon = dci if database == 'DCI' else dim
    lexicon_table = lexicon_tables[database]
    # The POS tags that should not be displayed in the results
    pos_to_ignore = {'e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au'}
    # Kept as an equality test (not ``not prop_names``) so that values
    # such as None keep their original meaning of "include proper names".
    skip_proper_names = prop_names == False  # noqa: E712
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print("""
    ============================================================
    Reading corpus files.
    ============================================================
    """)

    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            if (skip_proper_names and word.pos.startswith('n')
                    and word.pos.endswith('s')):  # Ignore proper names
                continue
            if word.pos in pos_to_ignore:
                continue
            lemma = word.lemma
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(lemma,
                                       'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            query = SQLiteQuery(lemma,
                                'lemma',
                                lexicon_table,
                                cursor=lexicon.cursor)  # Capitalized words included
            query_lower = SQLiteQuery(lemma.lower(),
                                      'lemma',
                                      lexicon_table,
                                      cursor=lexicon.cursor)
            if not query.exists and not query_lower.exists:
                entry = freqdic.get(lemma)
                if entry is None:
                    freqdic[lemma] = {
                        'freq': 1,
                        'wordforms': [word.word_form],
                    }
                else:
                    if word.word_form not in entry['wordforms']:
                        entry['wordforms'].append(word.word_form)
                    entry['freq'] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print("""
    ============================================================
    Sorting candidate frequencies.
    ============================================================
    """)

    # sorted() is stable, so lemmas with equal counts keep first-seen order.
    by_frequency = sorted(freqdic.items(),
                          key=lambda item: item[1]['freq'],
                          reverse=True)
    # Explicit UTF-8: the letter filter admits any Unicode letter, which a
    # locale-dependent default encoding may be unable to write.
    with open(output_path, mode='w', encoding='utf-8') as outputfile:
        for lemma, info in by_frequency:
            outputfile.write(f'{lemma}: {info}\n')

    print(ready_banner)