def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    List corpus lemmas missing from a lexicon, with counts and word forms.

    Iterates over the tokens yielded by ``IGCExtractor`` for ``IGC_folder``.
    Every lemma that passes the filters below and is found neither as-is nor
    lower-cased in the selected lexicon table is recorded together with its
    number of occurrences (``'tíðni'``) and the distinct word forms seen with
    it (``'orðmyndir'``). The result is written, most frequent lemma first,
    to ``uttak/<database>/<name>_lemmur_med_ordmyndum.freq`` as UTF-8, one
    ``lemma: {...}`` line per lemma.

    Args:
        database: ``'NMO'`` (table ``DCI_ELEMENT`` in
            ``gagnagrunnar/nmo.db``) or ``'BIN'`` (table ``DIM_ELEMENT`` in
            ``gagnagrunnar/bin_lemmur_ordmyndir.db``). Also used as the name
            of the output sub-directory.
        IGC_folder: ``/``-separated corpus folder. The three standard
            folders map to the names ``RMH``, ``CC_BY`` and ``MIM``; for any
            other folder the fourth path component is used, falling back to
            the last non-empty component when there is no usable fourth one.
        prop_names: When equal to ``False``, tokens whose tag starts with
            ``n`` and ends with ``s`` (proper names) are skipped.

    Raises:
        ValueError: If ``database`` is neither ``'NMO'`` nor ``'BIN'``.
    """
    lexicon_tables = {'NMO': 'DCI_ELEMENT', 'BIN': 'DIM_ELEMENT'}
    if database not in lexicon_tables:
        # Fail before any work is done instead of crashing mid-corpus on an
        # unassigned query variable.
        raise ValueError(
            f"Unknown database {database!r}; expected 'NMO' or 'BIN'.")
    lexicon_table = lexicon_tables[database]

    # Resolve the output name up front so a malformed folder string cannot
    # discard the results of a long corpus run.
    known_corpora = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    if IGC_folder in known_corpora:
        stem = known_corpora[IGC_folder]
    else:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            stem = parts[3]
        else:
            named = [part for part in parts if part]
            stem = named[-1] if named else ""

    rule = "============================================================"

    # Both lexicon databases are opened, as before; only the selected one is
    # queried.
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    lexicon = dci if database == 'NMO' else dim
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ('e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au')
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print(f" {rule} Les skjöl úr málheildinni. {rule} ")

    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            pos = word.pos
            # Equality (not truthiness) is kept on purpose so that e.g. None
            # does not switch the proper-name filter on.
            if prop_names == False and pos.startswith('n') and pos.endswith('s'):  # noqa: E712
                continue  # Ignore proper names
            if pos in pos_to_ignore:
                continue
            lemma = word.lemma
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(lemma, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(lemma, 'lemma', lexicon_table,
                                cursor=lexicon.cursor)
            query_lower = SQLiteQuery(lemma.lower(), 'lemma', lexicon_table,
                                      cursor=lexicon.cursor)
            if query.exists or query_lower.exists:
                continue
            entry = freqdic.get(lemma)
            if entry is None:
                freqdic[lemma] = {'tíðni': 1, 'orðmyndir': [word.word_form]}
            else:
                if word.word_form not in entry['orðmyndir']:
                    entry['orðmyndir'].append(word.word_form)
                entry['tíðni'] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print(f" {rule} Flokkar orð eftir tíðni. {rule} ")

    # sorted() is stable, so lemmas with equal counts keep first-seen order.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['tíðni'],
                    reverse=True)
    # Explicit UTF-8: lemmas may contain any Unicode letter, which a
    # locale-dependent default encoding might not be able to encode.
    with open(f'uttak/{database}/{stem}_lemmur_med_ordmyndum.freq',
              mode='w', encoding='utf-8') as outputfile:
        outputfile.writelines(f"{lemma}: {info}\n" for lemma, info in ranked)

    # The message names the file that was actually written.
    print(
        f" {rule} Úttaksskjalið {stem}_lemmur_med_ordmyndum.freq er tilbúið"
        f" og er að finna í undirmöppunni uttak/{database}/ {rule} "
    )
def lemma_output(database, IGC_folder, prop_names):
    """
    Rank corpus lemmas that are missing from a lexicon by frequency.

    Iterates over the tokens yielded by ``IGCExtractor`` for ``IGC_folder``.
    Every lemma that passes the filters below and is found neither as-is nor
    lower-cased in the selected lexicon table is counted. For each lemma the
    total count (``'freq'``) is kept together with a breakdown by grammatical
    number (``'number'``: ``'sing'``, ``'plur'``, ``'no_number'``), which
    shows whether a noun only occurs in the singular or the plural. The
    breakdown always sums to ``'freq'``. The result is written, most frequent
    lemma first, to ``output/<database>/<name>_lemma.freq`` as UTF-8, one
    ``lemma: {...}`` line per lemma.

    Args:
        database: ``'DCI'`` (table ``DCI_ELEMENT`` in ``databases/dci.db``)
            or ``'DIM'`` (table ``DIM_ELEMENT`` in
            ``databases/dim_lemmas_word_forms.db``). Also used as the name of
            the output sub-directory.
        IGC_folder: ``/``-separated corpus folder. The three standard
            folders map to the names ``IGC``, ``CC_BY`` and ``TIC``; for any
            other folder the fourth path component is used, falling back to
            the last non-empty component when there is no usable fourth one.
        prop_names: When equal to ``False``, tokens whose tag starts with
            ``n`` and ends with ``s`` (proper names) are skipped.

    Raises:
        ValueError: If ``database`` is neither ``'DCI'`` nor ``'DIM'``.
    """
    lexicon_tables = {'DCI': 'DCI_ELEMENT', 'DIM': 'DIM_ELEMENT'}
    if database not in lexicon_tables:
        # Fail before any work is done instead of crashing mid-corpus on an
        # unassigned query variable.
        raise ValueError(
            f"Unknown database {database!r}; expected 'DCI' or 'DIM'.")
    lexicon_table = lexicon_tables[database]

    # Resolve the output name up front so a malformed folder string cannot
    # discard the results of a long corpus run.
    known_corpora = {
        "corpora/IGC/": "IGC",
        "corpora/IGC/CC_BY/": "CC_BY",
        "corpora/IGC/TIC/": "TIC",
    }
    if IGC_folder in known_corpora:
        stem = known_corpora[IGC_folder]
    else:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            stem = parts[3]
        else:
            named = [part for part in parts if part]
            stem = named[-1] if named else ""

    rule = "============================================================"

    # Both lexicon databases are opened, as before; only the selected one is
    # queried.
    dci = SQLDatabase(db_name='databases/dci.db')
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    lexicon = dci if database == 'DCI' else dim
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ('e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au')
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print(f" {rule} Reading corpus files. {rule} ")

    for word in IGC.extract(forms=False, lemmas=True, pos=True):
        try:
            pos = word.pos
            # Equality (not truthiness) is kept on purpose so that e.g. None
            # does not switch the proper-name filter on.
            if prop_names == False and pos.startswith('n') and pos.endswith('s'):  # noqa: E712
                continue  # Ignore proper names
            if pos in pos_to_ignore:
                continue
            lemma = word.lemma
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(lemma, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(lemma, 'lemma', lexicon_table,
                                cursor=lexicon.cursor)
            query_lower = SQLiteQuery(lemma.lower(), 'lemma', lexicon_table,
                                      cursor=lexicon.cursor)
            # Only words not found in the lexicon or the stopwords remain
            if query.exists or query_lower.exists:
                continue

            # Classify the token BEFORE touching freqdic: a tag that is too
            # short raises IndexError here and the token is skipped as a
            # whole, for first and repeated occurrences alike.
            if pos[0] == 'n':  # the word is a noun
                number_marker = pos[2]
                if number_marker == 'e':  # singular (eintala)
                    number_key = 'sing'
                elif number_marker == 'f':  # plural (fleirtala)
                    number_key = 'plur'
                else:
                    # Nouns without number are counted on every occurrence,
                    # not only the first one.
                    number_key = 'no_number'
            else:
                # Necessary for proper names, words with no number
                number_key = 'no_number'

            entry = freqdic.get(lemma)
            if entry is None:
                entry = freqdic[lemma] = {
                    'freq': 0,
                    'number': {'sing': 0, 'plur': 0, 'no_number': 0},
                }
            entry['freq'] += 1
            entry['number'][number_key] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print(f" {rule} Sorting candidate frequencies. {rule} ")

    # sorted() is stable, so lemmas with equal counts keep first-seen order.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['freq'],
                    reverse=True)
    # Explicit UTF-8: lemmas may contain any Unicode letter, which a
    # locale-dependent default encoding might not be able to encode.
    with open(f'output/{database}/{stem}_lemma.freq',
              mode='w', encoding='utf-8') as outputfile:
        outputfile.writelines(f"{lemma}: {info}\n" for lemma, info in ranked)

    print(
        f" {rule} Output file {stem}_lemma.freq is ready and can be found"
        f" in the output/{database}/ directory. {rule} "
    )
def wordform_output(IGC_folder, prop_names):
    """
    Rank corpus word forms that are missing from the DIM table by frequency.

    Iterates over the tokens yielded by ``IGCExtractor`` for ``IGC_folder``.
    Every word form that passes the filters below and is found neither as-is
    nor lower-cased in the ``word_form`` column of table ``DIM_ELEMENT``
    (``databases/dim_lemmas_word_forms.db``) is counted. The result is
    written, most frequent form first, to ``output/DIM/<name>_wordform.freq``
    as UTF-8, one ``word_form: count`` line per form.

    Args:
        IGC_folder: ``/``-separated corpus folder. The three standard
            folders map to the names ``IGC``, ``CC_BY`` and ``TIC``; for any
            other folder the fourth path component is used, falling back to
            the last non-empty component when there is no usable fourth one.
        prop_names: When equal to ``False``, tokens whose tag starts with
            ``n`` and ends with ``s`` (proper names) are skipped.
    """
    # Resolve the output name up front so a malformed folder string cannot
    # discard the results of a long corpus run.
    known_corpora = {
        "corpora/IGC/": "IGC",
        "corpora/IGC/CC_BY/": "CC_BY",
        "corpora/IGC/TIC/": "TIC",
    }
    if IGC_folder in known_corpora:
        stem = known_corpora[IGC_folder]
    else:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            stem = parts[3]
        else:
            named = [part for part in parts if part]
            stem = named[-1] if named else ""

    rule = "============================================================"

    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ('e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au')
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print(f" {rule} Reading corpus files. {rule} ")

    for word in IGC.extract(forms=True, lemmas=False, pos=True):
        try:
            pos = word.pos
            # Equality (not truthiness) is kept on purpose so that e.g. None
            # does not switch the proper-name filter on.
            if prop_names == False and pos.startswith('n') and pos.endswith('s'):  # noqa: E712
                continue  # Ignore proper names
            if pos in pos_to_ignore:
                continue
            form = word.word_form
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in form):
                continue
            if len(form) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (form[0], form[1], form[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(form, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(form, 'word_form', 'DIM_ELEMENT',
                                cursor=dim.cursor)
            query_lower = SQLiteQuery(form.lower(), 'word_form',
                                      'DIM_ELEMENT', cursor=dim.cursor)
            # Only words not found in the DIM or the stopwords are counted
            if query.exists or query_lower.exists:
                continue
            freqdic[form] = freqdic.get(form, 0) + 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print(f" {rule} Sorting candidate frequencies. {rule} ")

    # sorted() is stable, so forms with equal counts keep first-seen order.
    ranked = sorted(freqdic.items(), key=lambda item: item[1], reverse=True)
    # Explicit UTF-8: word forms may contain any Unicode letter, which a
    # locale-dependent default encoding might not be able to encode.
    with open(f'output/DIM/{stem}_wordform.freq',
              mode='w', encoding='utf-8') as outputfile:
        outputfile.writelines(f"{form}: {count}\n" for form, count in ranked)

    # The message names the file that was actually written
    # ("_wordform.freq", not "_wordforms.freq").
    print(
        f" {rule} Output file {stem}_wordform.freq is ready and can be found"
        f" in the output/DIM/ directory. {rule} "
    )
def lemma_output(database, IGC_folder, prop_names):
    """
    Rank corpus lemmas that are missing from a lexicon by frequency.

    Iterates over the tokens yielded by ``IGCExtractor`` for ``IGC_folder``.
    Every lemma that passes the filters below and is found neither as-is nor
    lower-cased in the selected lexicon table is counted. For each lemma the
    total count (``'tíðni'``) is kept together with a breakdown by
    grammatical number (``'tala'``: ``'eintala'``, ``'fleirtala'``,
    ``'engin_tala'``), which shows whether a noun only occurs in the singular
    or the plural. The breakdown always sums to ``'tíðni'``. The result is
    written, most frequent lemma first, to
    ``uttak/<database>/<name>_lemmur.freq`` as UTF-8, one ``lemma: {...}``
    line per lemma.

    Args:
        database: ``'NMO'`` (table ``DCI_ELEMENT`` in
            ``gagnagrunnar/nmo.db``) or ``'BIN'`` (table ``DIM_ELEMENT`` in
            ``gagnagrunnar/bin_lemmur_ordmyndir.db``). Also used as the name
            of the output sub-directory.
        IGC_folder: ``/``-separated corpus folder. The three standard
            folders map to the names ``RMH``, ``CC_BY`` and ``MIM``; for any
            other folder the fourth path component is used, falling back to
            the last non-empty component when there is no usable fourth one.
        prop_names: When equal to ``False``, tokens whose tag starts with
            ``n`` and ends with ``s`` (proper names) are skipped.

    Raises:
        ValueError: If ``database`` is neither ``'NMO'`` nor ``'BIN'``.
    """
    lexicon_tables = {'NMO': 'DCI_ELEMENT', 'BIN': 'DIM_ELEMENT'}
    if database not in lexicon_tables:
        # Fail before any work is done instead of crashing mid-corpus on an
        # unassigned query variable.
        raise ValueError(
            f"Unknown database {database!r}; expected 'NMO' or 'BIN'.")
    lexicon_table = lexicon_tables[database]

    # Resolve the output name up front so a malformed folder string cannot
    # discard the results of a long corpus run.
    known_corpora = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    if IGC_folder in known_corpora:
        stem = known_corpora[IGC_folder]
    else:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            stem = parts[3]
        else:
            named = [part for part in parts if part]
            stem = named[-1] if named else ""

    rule = "============================================================"

    # Both lexicon databases are opened, as before; only the selected one is
    # queried.
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    lexicon = dci if database == 'NMO' else dim
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ('e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au')
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print(f" {rule} Les skjöl úr málheildinni. {rule} ")

    for word in RMH.extract(forms=False, lemmas=True, pos=True):
        try:
            pos = word.pos
            # Equality (not truthiness) is kept on purpose so that e.g. None
            # does not switch the proper-name filter on.
            if prop_names == False and pos.startswith('n') and pos.endswith('s'):  # noqa: E712
                continue  # Ignore proper names
            if pos in pos_to_ignore:
                continue
            lemma = word.lemma
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(lemma, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(lemma, 'lemma', lexicon_table,
                                cursor=lexicon.cursor)
            query_lower = SQLiteQuery(lemma.lower(), 'lemma', lexicon_table,
                                      cursor=lexicon.cursor)
            # Only words not found in the lexicon or the stopwords remain
            if query.exists or query_lower.exists:
                continue

            # Classify the token BEFORE touching freqdic: a tag that is too
            # short raises IndexError here and the token is skipped as a
            # whole, for first and repeated occurrences alike.
            if pos[0] == 'n':  # the word is a noun
                number_marker = pos[2]
                if number_marker == 'e':  # singular (eintala)
                    number_key = 'eintala'
                elif number_marker == 'f':  # plural (fleirtala)
                    number_key = 'fleirtala'
                else:
                    # Nouns without number are counted on every occurrence,
                    # not only the first one.
                    number_key = 'engin_tala'
            else:
                # Necessary for proper names, words with no number
                number_key = 'engin_tala'

            entry = freqdic.get(lemma)
            if entry is None:
                entry = freqdic[lemma] = {
                    'tíðni': 0,
                    'tala': {'eintala': 0, 'fleirtala': 0, 'engin_tala': 0},
                }
            entry['tíðni'] += 1
            entry['tala'][number_key] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print(f" {rule} Flokkar orð eftir tíðni. {rule} ")

    # sorted() is stable, so lemmas with equal counts keep first-seen order.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['tíðni'],
                    reverse=True)
    # Explicit UTF-8: lemmas may contain any Unicode letter, which a
    # locale-dependent default encoding might not be able to encode.
    with open(f'uttak/{database}/{stem}_lemmur.freq',
              mode='w', encoding='utf-8') as outputfile:
        outputfile.writelines(f"{lemma}: {info}\n" for lemma, info in ranked)

    print(
        f" {rule} Úttaksskjalið {stem}_lemmur.freq er tilbúið og er að finna"
        f" í undirmöppunni uttak/{database}/ {rule} "
    )
def wordform_output(IGC_folder, prop_names):
    """
    Rank corpus word forms that are missing from the BIN table by frequency.

    Iterates over the tokens yielded by ``IGCExtractor`` for ``IGC_folder``.
    Every word form that passes the filters below and is found neither as-is
    nor lower-cased in the ``word_form`` column of table ``DIM_ELEMENT``
    (``gagnagrunnar/bin_lemmur_ordmyndir.db``) is counted. The result is
    written, most frequent form first, to ``uttak/BIN/<name>_ordmyndir.freq``
    as UTF-8, one ``word_form: count`` line per form.

    Args:
        IGC_folder: ``/``-separated corpus folder. The three standard
            folders map to the names ``RMH``, ``CC_BY`` and ``MIM``; for any
            other folder the fourth path component is used, falling back to
            the last non-empty component when there is no usable fourth one.
        prop_names: When equal to ``False``, tokens whose tag starts with
            ``n`` and ends with ``s`` (proper names) are skipped.
    """
    # Resolve the output name up front so a malformed folder string cannot
    # discard the results of a long corpus run.
    known_corpora = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    if IGC_folder in known_corpora:
        stem = known_corpora[IGC_folder]
    else:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            stem = parts[3]
        else:
            named = [part for part in parts if part]
            stem = named[-1] if named else ""

    rule = "============================================================"

    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ('e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au')
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print(f" {rule} Les skjöl úr málheildinni. {rule} ")

    for word in RMH.extract(forms=True, lemmas=False, pos=True):
        try:
            pos = word.pos
            # Equality (not truthiness) is kept on purpose so that e.g. None
            # does not switch the proper-name filter on.
            if prop_names == False and pos.startswith('n') and pos.endswith('s'):  # noqa: E712
                continue  # Ignore proper names
            if pos in pos_to_ignore:
                continue
            form = word.word_form
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in form):
                continue
            if len(form) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (form[0], form[1], form[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(form, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(form, 'word_form', 'DIM_ELEMENT',
                                cursor=dim.cursor)
            query_lower = SQLiteQuery(form.lower(), 'word_form',
                                      'DIM_ELEMENT', cursor=dim.cursor)
            # Only words not found in the BIN or the stopwords are counted
            if query.exists or query_lower.exists:
                continue
            freqdic[form] = freqdic.get(form, 0) + 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print(f" {rule} Flokkar orð eftir tíðni. {rule} ")

    # sorted() is stable, so forms with equal counts keep first-seen order.
    ranked = sorted(freqdic.items(), key=lambda item: item[1], reverse=True)
    # Explicit UTF-8: word forms may contain any Unicode letter, which a
    # locale-dependent default encoding might not be able to encode.
    with open(f'uttak/BIN/{stem}_ordmyndir.freq',
              mode='w', encoding='utf-8') as outputfile:
        outputfile.writelines(f"{form}: {count}\n" for form, count in ranked)

    print(
        f" {rule} Úttaksskjalið {stem}_ordmyndir.freq er tilbúið og er að"
        f" finna í undirmöppunni uttak/BIN/ {rule} "
    )
def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    List corpus lemmas missing from a lexicon, with counts and word forms.

    Iterates over the tokens yielded by ``IGCExtractor`` for ``IGC_folder``.
    Every lemma that passes the filters below and is found neither as-is nor
    lower-cased in the selected lexicon table is recorded together with its
    number of occurrences (``'freq'``) and the distinct word forms seen with
    it (``'wordforms'``). Useful for spotting words that only occur in fixed
    expressions or word forms that never occur. The result is written, most
    frequent lemma first, to
    ``output/<database>/<name>_lemma_plus_wordform.freq`` as UTF-8, one
    ``lemma: {...}`` line per lemma.

    Args:
        database: ``'DCI'`` (table ``DCI_ELEMENT`` in ``databases/dci.db``)
            or ``'DIM'`` (table ``DIM_ELEMENT`` in
            ``databases/dim_lemmas_word_forms.db``). Also used as the name of
            the output sub-directory.
        IGC_folder: ``/``-separated corpus folder. The three standard
            folders map to the names ``IGC``, ``CC_BY`` and ``TIC``; for any
            other folder the fourth path component is used, falling back to
            the last non-empty component when there is no usable fourth one.
        prop_names: When equal to ``False``, tokens whose tag starts with
            ``n`` and ends with ``s`` (proper names) are skipped.

    Raises:
        ValueError: If ``database`` is neither ``'DCI'`` nor ``'DIM'``.
    """
    lexicon_tables = {'DCI': 'DCI_ELEMENT', 'DIM': 'DIM_ELEMENT'}
    if database not in lexicon_tables:
        # Fail before any work is done instead of crashing mid-corpus on an
        # unassigned query variable.
        raise ValueError(
            f"Unknown database {database!r}; expected 'DCI' or 'DIM'.")
    lexicon_table = lexicon_tables[database]

    # Resolve the output name up front so a malformed folder string cannot
    # discard the results of a long corpus run.
    known_corpora = {
        "corpora/IGC/": "IGC",
        "corpora/IGC/CC_BY/": "CC_BY",
        "corpora/IGC/TIC/": "TIC",
    }
    if IGC_folder in known_corpora:
        stem = known_corpora[IGC_folder]
    else:
        parts = IGC_folder.split("/")
        if len(parts) > 3 and parts[3]:
            stem = parts[3]
        else:
            named = [part for part in parts if part]
            stem = named[-1] if named else ""

    rule = "============================================================"

    # Both lexicon databases are opened, as before; only the selected one is
    # queried.
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    dci = SQLDatabase(db_name='databases/dci.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    lexicon = dci if database == 'DCI' else dim
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ('e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au')
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}

    print(f" {rule} Reading corpus files. {rule} ")

    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            pos = word.pos
            # Equality (not truthiness) is kept on purpose so that e.g. None
            # does not switch the proper-name filter on.
            if prop_names == False and pos.startswith('n') and pos.endswith('s'):  # noqa: E712
                continue  # Ignore proper names
            if pos in pos_to_ignore:
                continue
            lemma = word.lemma
            # Ignore if not only letters or letters and hyphen
            if not all(char.isalpha() or char == '-' for char in lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in (lemma[0], lemma[1], lemma[-1]):
                continue
            # Ignore unwanted words, such as names, foreign words,
            # stopwords, abbreviations
            filter_query = SQLiteQuery(lemma, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(lemma, 'lemma', lexicon_table,
                                cursor=lexicon.cursor)
            query_lower = SQLiteQuery(lemma.lower(), 'lemma', lexicon_table,
                                      cursor=lexicon.cursor)
            if query.exists or query_lower.exists:
                continue
            entry = freqdic.get(lemma)
            if entry is None:
                freqdic[lemma] = {'freq': 1, 'wordforms': [word.word_form]}
            else:
                if word.word_form not in entry['wordforms']:
                    entry['wordforms'].append(word.word_form)
                entry['freq'] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue

    print(f" {rule} Sorting candidate frequencies. {rule} ")

    # sorted() is stable, so lemmas with equal counts keep first-seen order.
    ranked = sorted(freqdic.items(),
                    key=lambda item: item[1]['freq'],
                    reverse=True)
    # Explicit UTF-8: lemmas may contain any Unicode letter, which a
    # locale-dependent default encoding might not be able to encode.
    with open(f'output/{database}/{stem}_lemma_plus_wordform.freq',
              mode='w', encoding='utf-8') as outputfile:
        outputfile.writelines(f"{lemma}: {info}\n" for lemma, info in ranked)

    # The message reports the directory that was actually written to
    # (output/<database>/, not always output/DIM/).
    print(
        f" {rule} Output file {stem}_lemma_plus_wordform.freq is ready and"
        f" can be found in the output/{database}/ directory. {rule} "
    )