def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    Iterate through the IGC and write out a list of lemmas with their
    frequencies together with every wordform that appears alongside each
    lemma in the corpus. Useful for detecting whether a word only appears
    in certain contexts (e.g. fixed expressions) or whether a certain
    wordform never appears. Can be modified to fit the user's need.

    Args:
        database: 'NMO' or 'BIN' -- the lexicon checked for each lemma.
        IGC_folder: corpus folder that IGCExtractor reads.
        prop_names: when False, proper names are skipped.

    Raises:
        ValueError: if `database` is neither 'NMO' nor 'BIN'.
    """
    # Fail fast on an unknown database: the old code only bound `query`
    # inside the NMO/BIN branches and crashed later with a NameError.
    if database not in ('NMO', 'BIN'):
        raise ValueError(f"Unknown database: {database!r}")
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
============================================================
Les skjöl úr málheildinni.
============================================================
""")
    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            if prop_names == False:
                # Proper names are tagged 'n...s'
                if word.pos.startswith('n') and word.pos.endswith('s'):
                    continue
            if word.pos in pos_to_ignore:
                continue
            # Only letters, or letters and a hyphen
            if not all(i.isalpha() or i == '-' for i in word.lemma):
                continue
            # Very short words are likely to be particles
            if len(word.lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:
                continue
            # Ignore unwanted words: names, foreign words, stopwords,
            # abbreviations
            filter_query = SQLiteQuery(word.lemma, 'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            cursor = dci.cursor if database == 'NMO' else dim.cursor
            table = 'DCI_ELEMENT' if database == 'NMO' else 'DIM_ELEMENT'
            # Capitalized words included
            query = SQLiteQuery(word.lemma, 'lemma', table, cursor=cursor)
            query_lower = SQLiteQuery(word.lemma.lower(), 'lemma', table,
                                      cursor=cursor)
            if not query.exists and not query_lower.exists:
                if word.lemma in freqdic:
                    if word.word_form not in freqdic[word.lemma]['orðmyndir']:
                        freqdic[word.lemma]['orðmyndir'].append(word.word_form)
                    freqdic[word.lemma]['tíðni'] += 1
                else:
                    freqdic[word.lemma] = {'tíðni': 1,
                                           'orðmyndir': [word.word_form]}
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
============================================================
Flokkar orð eftir tíðni.
============================================================
""")
    # Map the known corpus roots to an output-file prefix; any other
    # folder is named after its fourth path component (as before).
    prefixes = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    prefix = prefixes.get(IGC_folder)
    if prefix is None:
        prefix = IGC_folder.split("/")[3]
    filename = prefix + '_lemmur_med_ordmyndum.freq'
    with open(f'uttak/{database}/{filename}', mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(freqdic.items(),
                               key=lambda item: item[1]['tíðni'],
                               reverse=True)
        }
        for key, value in candidates.items():
            outputfile.write(key + ': ' + str(value) + '\n')
    # Bug fix: the old fallback branch printed the database name instead
    # of the folder-derived file name that was actually written.
    print(f"""
============================================================
Úttaksskjalið {filename} er tilbúið og er að
finna í undirmöppunni uttak/{database}/
============================================================
""")
def texttype_freqs(database, folder, prop_names):
    """
    Collect lemmas by the types of text they appear in and sort them by
    frequency, writing one CSV row per lemma:
    [lemma, total freq, texttype_a freq, texttype_b freq, ...].
    Filters the IGC in order to retrieve the desired results. The script
    can be modified according to the user's need and to fit another
    corpus.

    Args:
        database: 'DCI' or 'DIM' -- the lexicon checked for each lemma.
        folder: corpus root scanned recursively for .xml files.
        prop_names: when False, proper names are skipped.

    Raises:
        ValueError: if `database` is neither 'DCI' nor 'DIM'.
    """
    # Fail fast: the old code left `query` unbound for unknown databases.
    if database not in ('DCI', 'DIM'):
        raise ValueError(f"Unknown database: {database!r}")
    dci = SQLDatabase(db_name='databases/dci.db')
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    lexicon_cursor = dci.cursor if database == 'DCI' else dim.cursor
    table = database + '_ELEMENT'  # 'DCI_ELEMENT' or 'DIM_ELEMENT'
    print("""
============================================================
Reading corpus files.
============================================================
""")
    xml_files = glob.glob(folder + '/**/*.xml', recursive=True)
    alltexttypes = []
    freqdic1 = {}  # lemma -> total frequency
    freqdic2 = {}  # (lemma, texttype) -> frequency
    # The POS tags that should not be displayed in the results
    # (hoisted: the old code rebuilt this list for every file).
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    filebar = IncrementalBar('Progress', max=len(xml_files))
    for file in xml_files:
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                root = tree.getroot()
                # Retrieve the texttype tag from the XML file
                texttype = root[0][2][0][0][0][0].text
                if texttype not in alltexttypes:
                    alltexttypes.append(texttype)  # all unique texttypes
                for word in tree.iter():
                    pos = word.attrib.get('type')
                    if pos is None:
                        continue
                    if prop_names == False:
                        if pos.startswith('n') and pos.endswith('s'):
                            continue  # proper names
                    if pos in pos_to_ignore:
                        continue
                    # Robustness fix: a tagged element without text made
                    # the old code crash with an uncaught TypeError.
                    if word.text is None:
                        continue
                    # Only alphabetic letters or hyphen
                    if not all(i.isalpha() or i == '-' for i in word.text):
                        continue
                    # Very short words are likely to be particles
                    if len(word.text) < 3:
                        continue
                    # Leading/trailing hyphen: likely OCR errors
                    if word.text[0] == '-' or word.text[-1] == '-':
                        continue
                    lemma = word.attrib.get('lemma')
                    if lemma is None:
                        continue
                    # Ignore stop words
                    filter_query = SQLiteQuery(lemma, 'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    # Capitalized words included
                    query = SQLiteQuery(lemma, 'lemma', table,
                                        cursor=lexicon_cursor)
                    query_lower = SQLiteQuery(lemma.lower(), 'lemma', table,
                                              cursor=lexicon_cursor)
                    if not query.exists and not query_lower.exists:
                        # Not found in the lexicon nor in the stopwords
                        freqdic1[lemma] = freqdic1.get(lemma, 0) + 1
                        key = (lemma, texttype)
                        freqdic2[key] = freqdic2.get(key, 0) + 1
            except IndexError:
                continue  # malformed header; NB: bar not advanced (as before)
            except ET.ParseError:
                continue
        filebar.next()
        sys.stdout.flush()
    filebar.finish()
    print("""
============================================================
Sorting frequencies by text types.
============================================================
""")
    # Group the (lemma, texttype) counts per lemma once, instead of
    # scanning all of freqdic2 for every lemma (was accidentally O(n*m)).
    per_lemma = {}
    for (lem, ttype), count in freqdic2.items():
        per_lemma.setdefault(lem, {})[ttype] = count
    sorted_types = sorted(alltexttypes)
    # Row format: [lemma, totalfreq, texttype_a freq, texttype_b freq...]
    tempfinal = []
    bar1 = IncrementalBar('Progress', max=len(freqdic1))
    for key, value in sorted(freqdic1.items()):
        counts = per_lemma.get(key, {})
        tempfinal.append([key, value] + [counts.get(tt, 0)
                                         for tt in sorted_types])
        bar1.next()
        sys.stdout.flush()
    bar1.finish()
    header = ['Word', 'Total freq'] + sorted_types
    # Map the known corpus roots to an output-file prefix; any other
    # folder is named after its fourth path component (as before).
    if folder == "corpora/IGC/":
        nameprefix = "IGC"
    elif folder == "corpora/IGC/CC_BY/":
        nameprefix = "CC_BY"
    elif folder == "corpora/IGC/TIC/":
        nameprefix = "TIC"
    else:
        nameprefix = folder.split("/")[3]
    outname = nameprefix + "_texttypes.csv"
    with open(f'output/{database}/{outname}', mode='w+') as outputfile:
        csvwriter = csv.writer(outputfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
        csvwriter.writerow(header)
        csvwriter.writerows(tempfinal)
    print(f"""
============================================================
Output file {outname} is ready and can
be found in the output/{database}/ directory.
============================================================
""")
def wordform_output(IGC_folder, prop_names):
    """
    Iterate over the input corpus and write the word forms not found in
    the DIM, ordered by frequency. Can be altered for other databases or
    corpora.

    Args:
        IGC_folder: corpus folder that IGCExtractor reads.
        prop_names: when False, proper names are skipped.
    """
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
============================================================
Reading corpus files.
============================================================
""")
    for word in IGC.extract(forms=True, lemmas=False, pos=True):
        try:
            if prop_names == False:
                if word.pos.startswith('n') and word.pos.endswith('s'):
                    continue  # proper names
            if word.pos in pos_to_ignore:
                continue
            # Only letters, or letters and a hyphen
            if not all(i.isalpha() or i == '-' for i in word.word_form):
                continue
            if len(word.word_form) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in [word.word_form[0], word.word_form[1],
                       word.word_form[-1]]:
                continue
            # Ignore unwanted words: names, foreign words, stopwords,
            # abbreviations
            filter_query = SQLiteQuery(word.word_form, 'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(word.word_form, 'word_form', 'DIM_ELEMENT',
                                cursor=dim.cursor)
            query_lower = SQLiteQuery(word.word_form.lower(), 'word_form',
                                      'DIM_ELEMENT', cursor=dim.cursor)
            if not query.exists and not query_lower.exists:
                # Not found in the DIM nor in the stopwords
                freqdic[word.word_form] = freqdic.get(word.word_form, 0) + 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
============================================================
Sorting candidate frequencies.
============================================================
""")
    # Map the known corpus roots to an output-file prefix; any other
    # folder is named after its fourth path component (as before).
    if IGC_folder == "corpora/IGC/":
        prefix = "IGC"
    elif IGC_folder == "corpora/IGC/CC_BY/":
        prefix = "CC_BY"
    elif IGC_folder == "corpora/IGC/TIC/":
        prefix = "TIC"
    else:
        prefix = IGC_folder.split("/")[3]
    filename = prefix + '_wordform.freq'
    with open('output/DIM/' + filename, mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(freqdic.items(), key=lambda item: item[1],
                               reverse=True)
        }
        for key, value in candidates.items():
            outputfile.write(key + ': ' + str(value) + '\n')
    # Bug fix: the old messages advertised "..._wordforms.freq" although
    # the file actually written is "..._wordform.freq".
    print(f"""
============================================================
Output file {filename} is ready and can
be found in the output/DIM/ directory.
============================================================
""")
def lemma_output(database, IGC_folder, prop_names):
    """
    Iterate over the input corpus and write the lemmas not found in the
    input database, ordered by frequency. Also includes information on
    the number of nouns, indicating if a noun only exists in either the
    singular or the plural form (and whether the automatic
    lemmatization/pos tagging is off). Can be altered for other databases
    or corpora.

    Args:
        database: 'DCI' or 'DIM' -- the lexicon checked for each lemma.
        IGC_folder: corpus folder that IGCExtractor reads.
        prop_names: when False, proper names are skipped.

    Raises:
        ValueError: if `database` is neither 'DCI' nor 'DIM'.
    """
    # Fail fast: the old code left `query` unbound for unknown databases.
    if database not in ('DCI', 'DIM'):
        raise ValueError(f"Unknown database: {database!r}")
    dci = SQLDatabase(db_name='databases/dci.db')
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    lexicon_cursor = dci.cursor if database == 'DCI' else dim.cursor
    table = database + '_ELEMENT'  # 'DCI_ELEMENT' or 'DIM_ELEMENT'
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
============================================================
Reading corpus files.
============================================================
""")
    for word in IGC.extract(forms=False, lemmas=True, pos=True):
        try:
            if prop_names == False:
                if word.pos.startswith('n') and word.pos.endswith('s'):
                    continue  # proper names
            if word.pos in pos_to_ignore:
                continue
            # Only letters, or letters and a hyphen
            if not all(i.isalpha() or i == '-' for i in word.lemma):
                continue
            # Very short words are likely to be particles
            if len(word.lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:
                continue
            # Ignore unwanted words: names, foreign words, stopwords,
            # abbreviations
            filter_query = SQLiteQuery(word.lemma, 'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(word.lemma, 'lemma', table,
                                cursor=lexicon_cursor)
            query_lower = SQLiteQuery(word.lemma.lower(), 'lemma', table,
                                      cursor=lexicon_cursor)
            if query.exists or query_lower.exists:
                continue
            # Lemma not found in the lexicon nor in the stopwords:
            # count it, bucketing nouns by grammatical number.
            is_noun = word.pos[0] == 'n'
            if word.lemma not in freqdic:
                counts = {'sing': 0, 'plur': 0, 'no_number': 0}
                if is_noun and word.pos[2] == 'e':    # singular (eintala)
                    counts['sing'] = 1
                elif is_noun and word.pos[2] == 'f':  # plural (fleirtala)
                    counts['plur'] = 1
                else:
                    # Proper names, nouns without number, non-nouns
                    counts['no_number'] = 1
                freqdic[word.lemma] = {'freq': 1, 'number': counts}
            else:
                entry = freqdic[word.lemma]
                entry['freq'] += 1
                if is_noun:
                    if word.pos[2] == 'e':
                        entry['number']['sing'] += 1
                    elif word.pos[2] == 'f':
                        entry['number']['plur'] += 1
                    # NOTE: a repeated noun with an unmarked number is not
                    # counted in any bucket (mirrors original behaviour).
                else:
                    entry['number']['no_number'] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
============================================================
Sorting candidate frequencies.
============================================================
""")
    # Map the known corpus roots to an output-file prefix; any other
    # folder is named after its fourth path component (as before).
    if IGC_folder == "corpora/IGC/":
        prefix = "IGC"
    elif IGC_folder == "corpora/IGC/CC_BY/":
        prefix = "CC_BY"
    elif IGC_folder == "corpora/IGC/TIC/":
        prefix = "TIC"
    else:
        prefix = IGC_folder.split("/")[3]
    filename = prefix + '_lemma.freq'
    with open(f'output/{database}/{filename}', mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(freqdic.items(),
                               key=lambda item: item[1]['freq'],
                               reverse=True)
        }
        for key, value in candidates.items():
            outputfile.write(key + ': ' + str(value) + '\n')
    print(f"""
============================================================
Output file {filename} is ready and can
be found in the output/{database}/ directory.
============================================================
""")
def lemma_output(database, IGC_folder, prop_names):
    """
    Iterate over the input corpus and write the lemmas not found in the
    input database, ordered by frequency. Also includes information on
    the grammatical number (tala) of nouns, indicating if a noun only
    exists in either the singular or the plural form (and whether the
    automatic lemmatization/pos tagging is off). Can be altered for other
    databases or corpora.

    NOTE(review): this redefines `lemma_output` and shadows an earlier
    definition in this file -- confirm which version should survive.

    Args:
        database: 'NMO' or 'BIN' -- the lexicon checked for each lemma.
        IGC_folder: corpus folder that IGCExtractor reads.
        prop_names: when False, proper names are skipped.

    Raises:
        ValueError: if `database` is neither 'NMO' nor 'BIN'.
    """
    # Fail fast: the old code left `query` unbound for unknown databases.
    if database not in ('NMO', 'BIN'):
        raise ValueError(f"Unknown database: {database!r}")
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    lexicon_cursor = dci.cursor if database == 'NMO' else dim.cursor
    table = 'DCI_ELEMENT' if database == 'NMO' else 'DIM_ELEMENT'
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
============================================================
Les skjöl úr málheildinni.
============================================================
""")
    for word in RMH.extract(forms=False, lemmas=True, pos=True):
        try:
            if prop_names == False:
                if word.pos.startswith('n') and word.pos.endswith('s'):
                    continue  # proper names
            if word.pos in pos_to_ignore:
                continue
            # Only letters, or letters and a hyphen
            if not all(i.isalpha() or i == '-' for i in word.lemma):
                continue
            # Very short words are likely to be particles
            if len(word.lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:
                continue
            # Ignore unwanted words: names, foreign words, stopwords,
            # abbreviations
            filter_query = SQLiteQuery(word.lemma, 'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(word.lemma, 'lemma', table,
                                cursor=lexicon_cursor)
            query_lower = SQLiteQuery(word.lemma.lower(), 'lemma', table,
                                      cursor=lexicon_cursor)
            if query.exists or query_lower.exists:
                continue
            # Lemma not found in the lexicon nor in the stopwords:
            # count it, bucketing nouns by grammatical number.
            is_noun = word.pos[0] == 'n'
            if word.lemma not in freqdic:
                tala = {'eintala': 0, 'fleirtala': 0, 'engin_tala': 0}
                if is_noun and word.pos[2] == 'e':    # singular (eintala)
                    tala['eintala'] = 1
                elif is_noun and word.pos[2] == 'f':  # plural (fleirtala)
                    tala['fleirtala'] = 1
                else:
                    # Proper names, nouns without number, non-nouns
                    tala['engin_tala'] = 1
                freqdic[word.lemma] = {'tíðni': 1, 'tala': tala}
            else:
                entry = freqdic[word.lemma]
                entry['tíðni'] += 1
                if is_noun:
                    if word.pos[2] == 'e':
                        entry['tala']['eintala'] += 1
                    elif word.pos[2] == 'f':
                        entry['tala']['fleirtala'] += 1
                    # NOTE: a repeated noun with an unmarked number is not
                    # counted in any bucket (mirrors original behaviour).
                else:
                    entry['tala']['engin_tala'] += 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
============================================================
Flokkar orð eftir tíðni.
============================================================
""")
    # Map the known corpus roots to an output-file prefix; any other
    # folder is named after its fourth path component (as before).
    prefixes = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    prefix = prefixes.get(IGC_folder)
    if prefix is None:
        prefix = IGC_folder.split("/")[3]
    filename = prefix + '_lemmur.freq'
    with open(f'uttak/{database}/{filename}', mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(freqdic.items(),
                               key=lambda item: item[1]['tíðni'],
                               reverse=True)
        }
        for key, value in candidates.items():
            outputfile.write(key + ': ' + str(value) + '\n')
    print(f"""
============================================================
Úttaksskjalið {filename} er tilbúið og er að
finna í undirmöppunni uttak/{database}/
============================================================
""")
def user_defined_freqlist(database, filterbase, corpus):
    """
    Walk a user-defined corpus of .txt files and build a frequency list
    of the words that do not occur in the user-defined lexicon database,
    optionally dropping stopwords when a filter database is given. The
    result is written to output/user_defined/ under a name the user
    types in at the prompt.

    Args:
        database: path to the lexicon SQLite database.
        filterbase: path to a stopword database, or the string 'None'.
        corpus: root folder searched recursively for .txt files.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    use_filter = filterbase != 'None'
    if use_filter:
        filters = SQLDatabase(db_name=filterbase)
    outdict = {}
    print("""
============================================================
Reading corpus files.
============================================================
""")
    filebar = IncrementalBar('Progress', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            for w in content.read().split():
                # A leading or trailing hyphen is likely an OCR error.
                if w[0] == '-' or w[-1] == '-':
                    continue
                # Keep only alphabetic words (hyphens allowed inside).
                if not all(ch.isalpha() or ch == '-' for ch in w):
                    continue
                if use_filter:
                    # A stopword database was supplied: filter the results.
                    stop = SQLiteQuery(w, 'filter', 'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
                    if stop.exists:
                        continue
                    minlen = 3  # filtered path keeps words of length >= 3
                else:
                    minlen = 2  # unfiltered path keeps words of length > 1
                # Parameters must be updated if the database format changes.
                query = SQLiteQuery(w, 'word', 'LEXICON_WORD',
                                    cursor=db.cursor)
                query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD',
                                          cursor=db.cursor)
                if (not query.exists and not query_lower.exists
                        and len(w) >= minlen):
                    outdict[w] = outdict.get(w, 0) + 1
        filebar.next()
        sys.stdout.flush()
    filebar.finish()
    output_file = input("""
============================================================
Please indicate what your output file should be called,
followed by .freq
Example: lexicon_frequencylist.freq
============================================================
""")
    with open('output/user_defined/' + output_file, mode='w+') as outputfile:
        ranked = sorted(outdict.items(), key=lambda item: item[1],
                        reverse=True)
        for key, value in ranked:
            outputfile.write(key + ': ' + str(value) + '\n')
    print(f"""
============================================================
Output file {output_file} is ready and can
be found at the output/user_defined/ directory.
============================================================
""")
def wordform_output(IGC_folder, prop_names):
    """
    Iterate over the input corpus and write the word forms not found in
    the BIN, ordered by frequency. Can be altered for other databases or
    corpora.

    NOTE(review): this redefines `wordform_output` and shadows an earlier
    definition in this file -- confirm which version should survive.

    Args:
        IGC_folder: corpus folder that IGCExtractor reads.
        prop_names: when False, proper names are skipped.
    """
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    RMH = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
============================================================
Les skjöl úr málheildinni.
============================================================
""")
    for word in RMH.extract(forms=True, lemmas=False, pos=True):
        try:
            if prop_names == False:
                if word.pos.startswith('n') and word.pos.endswith('s'):
                    continue  # proper names
            if word.pos in pos_to_ignore:
                continue
            # Only letters, or letters and a hyphen
            if not all(i.isalpha() or i == '-' for i in word.word_form):
                continue
            if len(word.word_form) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in [word.word_form[0], word.word_form[1],
                       word.word_form[-1]]:
                continue
            # Ignore unwanted words: names, foreign words, stopwords,
            # abbreviations
            filter_query = SQLiteQuery(word.word_form, 'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included
            query = SQLiteQuery(word.word_form, 'word_form', 'DIM_ELEMENT',
                                cursor=dim.cursor)
            query_lower = SQLiteQuery(word.word_form.lower(), 'word_form',
                                      'DIM_ELEMENT', cursor=dim.cursor)
            if not query.exists and not query_lower.exists:
                # Not found in the BIN nor in the stopwords
                freqdic[word.word_form] = freqdic.get(word.word_form, 0) + 1
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
============================================================
Flokkar orð eftir tíðni.
============================================================
""")
    # Map the known corpus roots to an output-file prefix; any other
    # folder is named after its fourth path component (as before).
    prefixes = {
        "malheildir/RMH/": "RMH",
        "malheildir/RMH/CC_BY/": "CC_BY",
        "malheildir/RMH/MIM/": "MIM",
    }
    prefix = prefixes.get(IGC_folder)
    if prefix is None:
        prefix = IGC_folder.split("/")[3]
    filename = prefix + '_ordmyndir.freq'
    with open('uttak/BIN/' + filename, mode='w+') as outputfile:
        candidates = {
            k: v
            for k, v in sorted(freqdic.items(), key=lambda item: item[1],
                               reverse=True)
        }
        for key, value in candidates.items():
            outputfile.write(key + ': ' + str(value) + '\n')
    print(f"""
============================================================
Úttaksskjalið {filename} er tilbúið og er að
finna í undirmöppunni uttak/BIN/
============================================================
""")
def lemmabase_wordforms(database, IGC_folder, prop_names):
    """
    Iterate through the IGC, collecting every lemma that is missing from
    the chosen lexicon together with its corpus frequency and all word
    forms that appear alongside it in the corpus.

    Useful for detecting whether a word only appears in certain contexts
    (e.g. fixed expressions) or whether a certain wordform never appears.
    Can be modified to fit the user's needs.

    :param database: 'DCI' or 'DIM' — which lexicon to check lemmas against.
    :param IGC_folder: corpus folder; also selects the output file name.
    :param prop_names: if False, proper names (POS 'n...s') are skipped.
    :raises ValueError: if `database` is not 'DCI' or 'DIM'.
    """
    # Validate up front instead of failing with an unbound `query`
    # (NameError) on the first candidate word.
    if database not in ('DCI', 'DIM'):
        raise ValueError(
            f"Unknown database: {database!r} (expected 'DCI' or 'DIM')")
    dim = SQLDatabase(db_name='databases/dim_lemmas_word_forms.db')
    dci = SQLDatabase(db_name='databases/dci.db')
    # Predefined stop-word list based on the IGC
    filters = SQLDatabase(db_name='databases/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    if database == 'DCI':
        lexicon_cursor, table = dci.cursor, 'DCI_ELEMENT'
    else:
        lexicon_cursor, table = dim.cursor, 'DIM_ELEMENT'
    IGC = IGCExtractor(folder=str(IGC_folder))
    freqdic = {}
    print("""
============================================================
Reading corpus files.
============================================================
    """)
    for word in IGC.extract(forms=True, lemmas=True, pos=True):
        try:
            if not prop_names:
                # Ignore proper names
                if word.pos.startswith('n') and word.pos.endswith('s'):
                    continue
            if word.pos in pos_to_ignore:
                continue
            # Ignore if not only letters, or letters and hyphen
            if not all(ch.isalpha() or ch == '-' for ch in word.lemma):
                continue
            # Ignore very short words, likely to be particles
            if len(word.lemma) < 3:
                continue
            # Ignore words that start with '[anyLetter?]-' or end with '-'
            if '-' in [word.lemma[0], word.lemma[1], word.lemma[-1]]:
                continue
            # Ignore unwanted words: names, foreign words, stopwords,
            # abbreviations
            filter_query = SQLiteQuery(word.lemma, 'filter',
                                       'FILTER_WORD_FORMS',
                                       cursor=filters.cursor)
            if filter_query.exists:
                continue
            # Capitalized words included: check both the exact lemma and
            # its lowercased form against the lexicon.
            query = SQLiteQuery(word.lemma, 'lemma', table,
                                cursor=lexicon_cursor)
            query_lower = SQLiteQuery(word.lemma.lower(), 'lemma', table,
                                      cursor=lexicon_cursor)
            if query.exists or query_lower.exists:
                continue
            if word.lemma in freqdic:
                if word.word_form not in freqdic[word.lemma]['wordforms']:
                    freqdic[word.lemma]['wordforms'].append(word.word_form)
                freqdic[word.lemma]['freq'] += 1
            else:
                freqdic[word.lemma] = {'freq': 1,
                                       'wordforms': [word.word_form]}
        except IndexError:
            continue
        except ET.ParseError:
            continue
    print("""
============================================================
Sorting candidate frequencies.
============================================================
    """)
    # Map the known corpus folders to an output-file prefix; otherwise
    # derive the prefix from the fourth path component, as before.
    if IGC_folder == "corpora/IGC/":
        prefix = 'IGC'
    elif IGC_folder == "corpora/IGC/CC_BY/":
        prefix = 'CC_BY'
    elif IGC_folder == "corpora/IGC/TIC/":
        prefix = 'TIC'
    else:
        prefix = IGC_folder.split("/")[3]
    filename = f'{prefix}_lemma_plus_wordform.freq'
    with open(f'output/{database}/{filename}', mode='w+') as outputfile:
        # Sort the candidates by their total frequencies, descending
        for key, value in sorted(freqdic.items(),
                                 key=lambda item: item[1]['freq'],
                                 reverse=True):
            outputfile.write(key + ': ' + str(value) + '\n')
    # NOTE: the directory in this message now follows `database` instead
    # of being hardcoded to output/DIM/.
    print(f"""
============================================================
Output file {filename} is ready and can be found in
the output/{database}/ directory.
============================================================
    """)
def user_defined_collocations(database, filterbase, corpus):
    """
    Iterate through the corpus and retrieve the words that do not appear
    in the database. Collect 5-word collocations for every candidate
    word: two words of context before and after it. Results are written
    to a user-named file under uttak/notendagogn/.

    :param database: path to the SQLite lexicon database.
    :param filterbase: path to a stop-word database, or 'n'/'N' for none.
    :param corpus: root folder searched recursively for .txt files.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    # Optional stop-word database; 'n'/'N' means no filtering.
    filters = None
    if filterbase not in ['n', 'N']:
        filters = SQLDatabase(db_name=filterbase)
    outdict = {}
    print("""
============================================================
Les skjöl úr málheildinni.
============================================================
    """)
    filebar = IncrementalBar('Framvinda', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            words = content.read().split()
            for i, w in enumerate(words):
                # A word that starts or ends with a hyphen is likely an
                # OCR error — ignore it.
                if w[-1] == '-' or w[0] == '-':
                    continue
                # Only alphabetic characters and hyphens are allowed.
                if not all(ch.isalpha() or ch == '-' for ch in w):
                    continue
                # If a stop-word database has been defined, filter the
                # results through it.
                if filters is not None:
                    filter_query = SQLiteQuery(w, 'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                # Parameters must be updated if the database format is
                # changed. Capitalized words are covered by also querying
                # the lowercased form.
                query = SQLiteQuery(w, 'word', 'LEXICON_WORD',
                                    cursor=db.cursor)
                query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD',
                                          cursor=db.cursor)
                # Keep only words found neither in the database nor the
                # filters, and longer than one character.
                if query.exists or query_lower.exists:
                    continue
                if len(w) <= 1:
                    continue
                # Collect 2 words of context on each side of the candidate
                # (empty string at corpus-file edges).
                w1 = str(words[i - 2]) if i - 2 >= 0 else ""
                w2 = str(words[i - 1]) if i - 1 >= 0 else ""
                w4 = str(words[i + 1]) if i + 1 < len(words) else ""
                w5 = str(words[i + 2]) if i + 2 < len(words) else ""
                sent = w1 + ' ' + w2 + ' ' + w + ' ' + w4 + ' ' + w5
                if w in outdict:
                    sents = outdict[w]['orðstaða']
                    sents[sent] = sents.get(sent, 0) + 1
                    outdict[w]['tíðni'] += 1
                else:
                    outdict[w] = {'tíðni': 1, 'orðstaða': {sent: 1}}
        filebar.next()
        sys.stdout.flush()
    filebar.finish()
    output_file = input("""
============================================================
Skrifaðu það sem þú vilt að úttaksskjalið heiti með endingunni .freq

Dæmi: ordasafn_ordstodulyklar.freq
============================================================
    """)
    with open('uttak/notendagogn/' + output_file, mode='w+') as outputfile:
        # Sort the candidates by their total frequencies.
        for key, item in sorted(outdict.items(),
                                key=lambda item: item[1]['tíðni'],
                                reverse=True):
            freq = item['tíðni']
            # Sort the sentence examples by their frequencies and keep
            # the 5 most frequent ones (limit can be changed).
            sorted_sents = sorted(item['orðstaða'],
                                  key=item['orðstaða'].get,
                                  reverse=True)
            sents = sorted_sents[:5]
            # word : freq. [sent example 1, sent example 2, ...]
            outputfile.write(key + ' : ' + str(freq) + '. ' + str(sents)
                             + '\n')
    print(f"""
============================================================
Úttaksskjalið {output_file} er tilbúið og má finna í undirmöppunni
uttak/notendagogn/
============================================================
    """)
def lemmas_collocations(database, IGC_folder, prop_names):
    """
    Iterate through the IGC (XML files), collecting lemmas that are
    missing from the chosen lexicon together with their frequencies and
    5-word collocations (two words of context on each side of the word
    form). Results are written to uttak/<database>/.

    :param database: 'NMO' or 'BIN' — which lexicon to check lemmas against.
    :param IGC_folder: corpus folder searched recursively for .xml files;
        also selects the output file name.
    :param prop_names: if False, proper names (POS 'n...s') are skipped.
    :raises ValueError: if `database` is not 'NMO' or 'BIN'.
    """
    # Validate up front instead of failing with an unbound `query`
    # (NameError) on the first candidate word.
    if database not in ('NMO', 'BIN'):
        raise ValueError(
            f"Unknown database: {database!r} (expected 'NMO' or 'BIN')")
    dci = SQLDatabase(db_name='gagnagrunnar/nmo.db')
    dim = SQLDatabase(db_name='gagnagrunnar/bin_lemmur_ordmyndir.db')
    # Predefined stop-word list based on the RMH
    filters = SQLDatabase(db_name='gagnagrunnar/IGC_filters.db')
    # The POS tags that should not be displayed in the results
    pos_to_ignore = ['e', 'c', 'v', 'as', 'to', 'tp', 'ta', 'au']
    if database == 'NMO':
        lexicon_cursor, table = dci.cursor, 'DCI_ELEMENT'
    else:
        lexicon_cursor, table = dim.cursor, 'DIM_ELEMENT'
    outdict = {}
    print("""
============================================================
Les skjöl úr málheildinni.
============================================================
    """)
    xml_files = glob.glob(IGC_folder + '/**/*.xml', recursive=True)
    filebar = IncrementalBar('Framvinda', max=len(xml_files))
    for file in xml_files:
        colloc = []
        with open(file, 'r', encoding='utf-8') as content:
            try:
                tree = ET.parse(content)
                # Gather (word_form, lemma, pos) triples. Punctuation
                # keeps a slot so context windows stay aligned.
                for word in tree.iter():
                    if word.text is None:
                        continue
                    if word.attrib.get('lemma') is not None:
                        colloc.append((word.text, word.attrib.get('lemma'),
                                       word.attrib.get('type')))
                    elif word.text in punctuation:
                        colloc.append((word.text, ' ', ' '))
                for i, (word_form, lemma, pos) in enumerate(colloc):
                    if not prop_names:
                        # Ignore proper names
                        if pos.startswith('n') and pos.endswith('s'):
                            continue
                    if pos in pos_to_ignore:
                        continue
                    # A word that starts or ends with a hyphen is likely
                    # an OCR error — ignore it.
                    if lemma[-1] == '-' or lemma[0] == '-':
                        continue
                    # Only alphabetic characters and hyphens are allowed.
                    if not all(ch.isalpha() or ch == '-' for ch in lemma):
                        continue
                    # Skip names, foreign words, stopwords, abbreviations.
                    filter_query = SQLiteQuery(lemma, 'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                    # Capitalized words included: check the exact lemma
                    # and its lowercased form.
                    query = SQLiteQuery(lemma, 'lemma', table,
                                        cursor=lexicon_cursor)
                    query_lower = SQLiteQuery(lemma.lower(), 'lemma', table,
                                              cursor=lexicon_cursor)
                    if query.exists or query_lower.exists:
                        continue
                    if len(lemma) <= 1:
                        continue
                    # Collect 2 words of context on each side of the
                    # candidate (empty string at file edges).
                    w1 = str(colloc[i - 2][0]) if i - 2 >= 0 else ""
                    w2 = str(colloc[i - 1][0]) if i - 1 >= 0 else ""
                    w4 = str(colloc[i + 1][0]) if i + 1 < len(colloc) else ""
                    w5 = str(colloc[i + 2][0]) if i + 2 < len(colloc) else ""
                    sent = (w1 + ' ' + w2 + ' ' + word_form + ' ' + w4
                            + ' ' + w5)
                    if lemma in outdict:
                        sents = outdict[lemma]['orðstaða']
                        sents[sent] = sents.get(sent, 0) + 1
                        outdict[lemma]['tíðni'] += 1
                    else:
                        outdict[lemma] = {'tíðni': 1, 'orðstaða': {sent: 1}}
            except sqlite3.OperationalError:
                pass
            except ET.ParseError:
                # Skip malformed XML files instead of aborting the whole
                # run (consistent with the other corpus readers).
                pass
        filebar.next()
        sys.stdout.flush()
    filebar.finish()
    # Map the known corpus folders to an output-file prefix; otherwise
    # derive the prefix from the fourth path component, as before.
    if IGC_folder == "malheildir/RMH/":
        prefix = 'RMH'
    elif IGC_folder == "malheildir/RMH/CC_BY/":
        prefix = 'CC_BY'
    elif IGC_folder == "malheildir/RMH/MIM/":
        prefix = 'MIM'
    else:
        prefix = IGC_folder.split("/")[3]
    filename = f'{prefix}_lemmur_med_orstodulyklum.freq'
    with open(f'uttak/{database}/{filename}', mode='w+') as outputfile:
        # Sort the candidates by their total frequencies.
        for key, item in sorted(outdict.items(),
                                key=lambda item: item[1]['tíðni'],
                                reverse=True):
            freq = item['tíðni']
            # Sort the sentence examples by their frequencies and keep
            # the 5 most frequent ones (limit can be changed).
            sorted_sents = sorted(item['orðstaða'],
                                  key=item['orðstaða'].get,
                                  reverse=True)
            sents = sorted_sents[:5]
            # word : freq. [sent example 1, sent example 2, ...]
            outputfile.write(key + ' : ' + str(freq) + '. ' + str(sents)
                             + '\n')
    print(f"""
============================================================
Úttaksskjalið {filename} er tilbúið og er að finna í undirmöppunni
uttak/{database}/
============================================================
    """)
def user_defined_freqlist(database, filterbase, corpus):
    """
    Iterate through a user-defined corpus and compare the words to a
    user-defined database, filtering out stopwords if the user has
    defined a stopword database. Writes a frequency word list of the
    words missing from the database to uttak/notendagogn/.

    :param database: path to the SQLite lexicon database.
    :param filterbase: path to a stop-word database, or 'n'/'N' for none.
    :param corpus: root folder searched recursively for .txt files.
    """
    db = SQLDatabase(db_name=database)
    txt_files = glob.glob(corpus + '/**/*.txt', recursive=True)
    # Optional stop-word database; 'n'/'N' means no filtering.
    # (Removed stray debug print of `filterbase`.)
    filters = None
    if filterbase not in ['n', 'N']:
        filters = SQLDatabase(db_name=filterbase)
    outdict = {}
    print("""
============================================================
Les skjöl úr málheildinni.
============================================================
    """)
    filebar = IncrementalBar('Framvinda', max=len(txt_files))
    for file in txt_files:
        with open(file, 'r', encoding='utf-8') as content:
            words = content.read().split()
            for w in words:
                # A word that starts or ends with a hyphen is likely an
                # OCR error — ignore it.
                if w[-1] == '-' or w[0] == '-':
                    continue
                # Only alphabetic characters and hyphens are allowed.
                if not all(ch.isalpha() or ch == '-' for ch in w):
                    continue
                # If a stop-word database has been defined, filter the
                # results through it.
                if filters is not None:
                    filter_query = SQLiteQuery(w, 'filter',
                                               'FILTER_WORD_FORMS',
                                               cursor=filters.cursor)
                    if filter_query.exists:
                        continue
                # Parameters must be updated if the database format is
                # changed. Capitalized words are covered by also querying
                # the lowercased form.
                query = SQLiteQuery(w, 'word', 'LEXICON_WORD',
                                    cursor=db.cursor)
                query_lower = SQLiteQuery(w.lower(), 'word', 'LEXICON_WORD',
                                          cursor=db.cursor)
                if not query.exists and not query_lower.exists:
                    if len(w) > 1:
                        outdict[w] = outdict.get(w, 0) + 1
        filebar.next()
        sys.stdout.flush()
    filebar.finish()
    output_file = input("""
============================================================
Skrifaðu það sem þú vilt að úttaksskjalið heiti með endingunni .freq

Dæmi: ordasafn.freq
============================================================
    """)
    with open('uttak/notendagogn/' + output_file, mode='w+') as outputfile:
        # Sort candidates by frequency, descending.
        for key, value in sorted(outdict.items(),
                                 key=lambda item: item[1],
                                 reverse=True):
            outputfile.write(key + ': ' + str(value) + '\n')
    print(f"""
============================================================
Úttaksskjalið {output_file} er tilbúið og má finna í undirmöppunni
uttak/notendagogn/
============================================================
    """)