def get_frequency_dict(lang_code, lang_name):
	print_status("Creating frequency dictionaries...")
	frequency_dict = dict()

	# Initialize the spaCy tokenizer once, before walking the dataset
	module = importlib.import_module("spacy.lang." + lang_code)
	nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
	tokenizer = nlp.Defaults.create_tokenizer(nlp)

	# Load data
	for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
		if '.DS_Store' in files:
			files.remove('.DS_Store')
		for f in files:
			print(f)
			filepath = os.path.join(root, f)
			with open(filepath, 'rt', encoding='utf8') as file:
				text = file.read()

			# Clean XML tags
			cleantext = BeautifulSoup(text, "lxml").text

			# Count lowercased tokens, skipping punctuation/other symbols
			for word in tokenizer(cleantext):
				word = word.text.lower()
				if is_other(word):
					continue
				frequency_dict[word] = frequency_dict.get(word, 0) + 1

	return frequency_dict
def get_probability_dict(frequency_dict):
	print_status("Creating probability dictionaries...")
	# Add-one (Laplace) smoothing; the 'OOV' entry holds the probability
	# assigned to any word not seen in training
	smoothing_factor = 1
	nr_of_tokens = sum(frequency_dict.values())
	nr_of_distinct_words = len(frequency_dict)
	denominator = nr_of_tokens + smoothing_factor * nr_of_distinct_words

	probability_dict = dict()
	for k, v in frequency_dict.items():
		probability_dict[k] = (v + smoothing_factor) / denominator
	probability_dict['OOV'] = smoothing_factor / denominator
	return probability_dict
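# Illustrative usage of get_probability_dict (hypothetical counts): with
# frequencies {'hola': 2, 'adios': 1} there are 3 tokens and 2 distinct
# words, so every count is divided by 3 + 1 * 2 = 5 after add-one smoothing.
#
#     probs = get_probability_dict({'hola': 2, 'adios': 1})
#     probs['hola']   # (2 + 1) / 5 = 0.6
#     probs['adios']  # (1 + 1) / 5 = 0.4
#     probs['OOV']    #       1 / 5 = 0.2  (mass reserved for unseen words)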
def get_ngrams(self):
	# For model order n, collect the (n-1)-grams and n-grams of each
	# token's characters, repeated according to the token's frequency
	grams_by_len = {length: [] for length in range(1, 7)}
	print_status("Creating n-grams...")
	step = max(1, len(self.tokens_dict) // 10)  # progress roughly every 10%
	for j, (token, count) in enumerate(self.tokens_dict.items()):
		if isinstance(token, float):  # skip non-string keys (e.g. NaN)
			print(f"ERROR: unknown token {token}")
			continue
		chars = list(pad_sequence(str(token), pad_left=True, left_pad_symbol="<w>",
		                          pad_right=True, right_pad_symbol="</w>", n=self.n))
		for ngram in everygrams(chars, max_len=self.n):
			if len(ngram) in (self.n - 1, self.n):
				grams_by_len[len(ngram)].extend([ngram] * count)
		if j % step == 0:
			print(f"token {j} of {len(self.tokens_dict)}")
	# Concatenate grouped by length: unigrams, bigrams, ..., sixgrams
	return [gram for length in range(1, 7) for gram in grams_by_len[length]]
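# Illustrative example of the padding step above (hypothetical token "sol",
# standard nltk.util calls): for n=2 the token is wrapped in <w>/</w> word
# boundary symbols before the character n-grams are extracted.
#
#     from nltk.util import pad_sequence, everygrams
#     chars = list(pad_sequence("sol", pad_left=True, left_pad_symbol="<w>",
#                               pad_right=True, right_pad_symbol="</w>", n=2))
#     # chars == ['<w>', 's', 'o', 'l', '</w>']
#     sorted(ng for ng in everygrams(chars, max_len=2) if len(ng) == 2)
#     # -> [('<w>', 's'), ('l', '</w>'), ('o', 'l'), ('s', 'o')]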
def get_tokenized_sentences(lang_code, lang_name):
	tokenizedFile = []

	# Initialize tokenizer
	module = importlib.import_module("spacy.lang." + lang_code)
	nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
	tokenizer = nlp.Defaults.create_tokenizer(nlp)

	# Load data
	print_status("Creating tokenized sentences from dataset...")
	for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
		if '.DS_Store' in files:
			files.remove('.DS_Store')
		for f in files:
			print(f)
			filepath = os.path.join(root, f)
			with open(filepath, 'rt', encoding='utf8') as file:
				text = file.read()

			# Clean XML tags
			cleantext = BeautifulSoup(text, "lxml").text

			# Split into sentences
			sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", cleantext)

			# Split into tokens
			for s in sentences:
				word_tokens = []
				for t in tokenizer(s):
					t = t.text.lower()
					if not is_other(t):
						word_tokens.append(t)
				tokenizedFile.append(word_tokens)

	return tokenizedFile
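# Illustrative behavior of the sentence-splitting regex above (hypothetical
# input): the negative lookbehinds keep abbreviations such as "Mr." glued to
# their sentence while still splitting after a sentence-final "." or "?".
#
#     re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s",
#              "Mr. Smith went home. Did he sleep? Yes.")
#     # -> ['Mr. Smith went home.', 'Did he sleep?', 'Yes.']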
def get_ngrams(self):
	# For model order n, collect the (n-1)-grams and n-grams over each
	# padded sentence
	unigrams = []
	bigrams = []
	trigrams = []
	print_status("Creating n-grams...")
	step = max(1, len(self.tokens_arr) // 10)  # progress roughly every 10%
	for j, sent in enumerate(self.tokens_arr):
		words = list(pad_sequence(sent, pad_left=True, left_pad_symbol="<s>",
		                          pad_right=True, right_pad_symbol="</s>", n=self.n))
		for ngram in everygrams(words, max_len=self.n):
			if len(ngram) == 1 and self.n == 2:
				unigrams.append(ngram)
			if len(ngram) == 2 and self.n <= 3:
				bigrams.append(ngram)
			if len(ngram) == 3 and self.n <= 4:
				trigrams.append(ngram)
		if j % step == 0:
			print(f"sentence {j} of {len(self.tokens_arr)}")
	return unigrams + bigrams + trigrams
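# The same padding idea at the word level (hypothetical sentence): <s>/</s>
# mark sentence boundaries, so boundary bigrams like ('<s>', 'el') are
# counted too.
#
#     words = list(pad_sequence(['el', 'gato'], pad_left=True,
#                               left_pad_symbol="<s>", pad_right=True,
#                               right_pad_symbol="</s>", n=2))
#     # words == ['<s>', 'el', 'gato', '</s>']
#     # bigrams: ('<s>', 'el'), ('el', 'gato'), ('gato', '</s>')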
from sklearn.metrics import f1_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
from tools.utils import save_predictions, print_status
import sys

PREDICTIONS_PATH = './results/predictions/'

# Get evaluation dataset from keyboard
if len(sys.argv) == 1:
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)

evaluation_dataset = sys.argv[1]

# Get predictions data
print_status("Getting predictions data...")
if evaluation_dataset == 'dev':
	predictionsFileName = PREDICTIONS_PATH + 'mBERT_predictions_dev.out'  # validation
if evaluation_dataset == 'test':
	predictionsFileName = PREDICTIONS_PATH + 'mBERT_predictions_test.out'  # test

# Get predictions
file = open(predictionsFileName, 'rt', encoding='utf8')
y = []
for line in file:
	# Skip empty lines and '# sent_enum' comment lines, then strip the
	# newline and take the predicted label from the second tab-separated column
	if line.strip() != '' and '# sent_enum' not in line:
		line = line.rstrip('\n')
		splits = line.split("\t")
		pred = splits[1]
		y.append(pred)
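# Format assumed by the loop above (my reading of the excerpt, not a spec):
# one "token<TAB>label" pair per line, with '# sent_enum' comment lines
# between sentences and blank lines ignored. For example:
#
#     # sent_enum = 1
#     yo      lang2
#     love    lang1
#     tacos   lang1
#
# Only the second column ('lang2', 'lang1', 'lang1') ends up in y.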
import sys
import os
import pandas as pd
from tools.utils import print_status

DICTIONARIES_PATH = "./dictionaries/word-level/"

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) < 4:
	print("Please give two letter language codes as arg, for example en es")
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)

lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

# Get dictionaries
print_status("Getting dictionaries...")
lang1_path = DICTIONARIES_PATH + 'probability_dict_' + lang1_code + '.csv'
lang2_path = DICTIONARIES_PATH + 'probability_dict_' + lang2_code + '.csv'
if os.path.exists(lang1_path) and os.path.exists(lang2_path):
	probability_lang1_df = pd.read_csv(lang1_path, encoding='utf-16')
	probability_lang1_dict = probability_lang1_df.set_index('word')['probability'].to_dict()
	probability_lang2_df = pd.read_csv(lang2_path, encoding='utf-16')
	probability_lang2_dict = probability_lang2_df.set_index('word')['probability'].to_dict()
else:
	print("Please run: python train_probability.py " + lang1_code + " " + lang2_code)
	exit(1)

# Get data
if len(sys.argv) < 5:
	print("Please give two letter language codes as arg, for example en es")
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	print("Please enter n value")
	exit(1)

lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]
n = int(sys.argv[4])
if n not in (2, 3, 4, 5, 6):
	print("n should be 2, 3, 4, 5 or 6")
	exit(1)

# Get dictionaries
print_status("Getting dictionaries...")
lang1_path = CHAR_LEVEL_DICTIONARIES_PATH + str(n) + '_grams_dict_' + lang1_code + '.csv'
lang2_path = CHAR_LEVEL_DICTIONARIES_PATH + str(n) + '_grams_dict_' + lang2_code + '.csv'
if os.path.exists(lang1_path) and os.path.exists(lang2_path):
	frequency_lang1_df = pd.read_csv(lang1_path, encoding='utf-16',
	                                 converters={"word": ast.literal_eval})
	frequency_lang1_dict = frequency_lang1_df.set_index('word')['frequency'].to_dict()
	frequency_lang2_df = pd.read_csv(lang2_path, encoding='utf-16',
	                                 converters={"word": ast.literal_eval})
	frequency_lang2_dict = frequency_lang2_df.set_index('word')['frequency'].to_dict()
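# Why the ast.literal_eval converter (illustrative): the n-gram dictionary
# keys are character tuples, which pandas serializes to CSV as their string
# repr; literal_eval turns them back into real tuples on load.
#
#     import ast
#     ast.literal_eval("('<w>', 's')")  # -> ('<w>', 's')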
# Lang 1
lang1 = sys.argv[1]
lang1_code = langs()[lang1]['code']
lang1_name = langs()[lang1]['name']

# Lang 2
lang2 = sys.argv[2]
lang2_code = langs()[lang2]['code']
lang2_name = langs()[lang2]['name']

# Also write probability dictionaries when 'probability' is passed as the
# third argument; otherwise only the frequency dictionaries are saved
fullTraining = sys.argv[3] == 'probability'

# Frequency dictionaries
frequency_lang1_dict = get_frequency_dict(lang1_code, lang1_name)
frequency_lang2_dict = get_frequency_dict(lang2_code, lang2_name)

# Probability dictionaries
probability_lang1_dict = get_probability_dict(frequency_lang1_dict)
if fullTraining:
	write_dict(WORD_LEVEL_DICTIONARIES_PATH,
	           frequency_lang1_dict, 'frequency_dict_' + lang1_code,
	           probability_lang1_dict, 'probability_dict_' + lang1_code)
else:
	write_dict(WORD_LEVEL_DICTIONARIES_PATH,
	           frequency_lang1_dict, 'frequency_dict_' + lang1_code)

probability_lang2_dict = get_probability_dict(frequency_lang2_dict)
if fullTraining:
	write_dict(WORD_LEVEL_DICTIONARIES_PATH,
	           frequency_lang2_dict, 'frequency_dict_' + lang2_code,
	           probability_lang2_dict, 'probability_dict_' + lang2_code)
else:
	write_dict(WORD_LEVEL_DICTIONARIES_PATH,
	           frequency_lang2_dict, 'frequency_dict_' + lang2_code)

print_status('Done!')
# sources:
#   https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
#   https://github.com/kapadias/mediumposts/blob/master/nlp/published_notebooks/Introduction%20to%20Topic%20Modeling.ipynb

WORD_LEVEL_DICTIONARIES_PATH = "./dictionaries/word-level/"

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) < 4:
	print("Please give two letter language codes as arg, for example en es")
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)

lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

print_status("Getting dictionaries...")
lang1_path = WORD_LEVEL_DICTIONARIES_PATH + 'probability_dict_' + lang1_code + '.csv'
lang2_path = WORD_LEVEL_DICTIONARIES_PATH + 'probability_dict_' + lang2_code + '.csv'
if os.path.exists(lang1_path) and os.path.exists(lang2_path):
	probability_lang1_df = pd.read_csv(lang1_path, encoding='utf-16')
	probability_lang1_dict = probability_lang1_df.set_index('word')['probability'].to_dict()
	probability_lang2_df = pd.read_csv(lang2_path, encoding='utf-16')
	probability_lang2_dict = probability_lang2_df.set_index('word')['probability'].to_dict()
	print_status("Dictionaries ready!")
else:
	print("Please run: python train_probability.py " + lang1_code + " " + lang2_code)
	exit(1)

# Get training dictionaries
print_status("Getting tokenized sentences...")
lang1_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang1_code + '.p'
from sklearn.linear_model import LogisticRegression
import pandas as pd
import sys
import os
from tools.utils import print_status

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) < 4:
	print("Please give two letter language codes as arg, for example en es")
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)

lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

# Get training dictionaries
print_status("Getting tokenized sentences...")
lang1_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang1_code + '.p'
lang2_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang2_code + '.p'
if os.path.exists(lang1_path_tokenized) and os.path.exists(lang2_path_tokenized):
	tokenized_sentences_lang1 = pd.read_pickle(lang1_path_tokenized)
	tokenized_sentences_lang2 = pd.read_pickle(lang2_path_tokenized)
else:
	print("Please run: python train_ngrams_word.py " + lang1_code + " " + lang2_code + " 2")
	exit(1)

# Flatten the sentence lists into one long array of words, capped at 100k
tokenized_sentences_lang1 = [
	item for sent in tokenized_sentences_lang1 for item in sent
][:100000]
# Lang 1
lang1 = sys.argv[1]
lang1_code = langs()[lang1]['code']
lang1_name = langs()[lang1]['name']

# Lang 2
lang2 = sys.argv[2]
lang2_code = langs()[lang2]['code']
lang2_name = langs()[lang2]['name']

# Get frequency dictionaries
lang1_path = WORD_LEVEL_DICTIONARIES_PATH + 'frequency_dict_' + lang1_code + '.csv'
lang2_path = WORD_LEVEL_DICTIONARIES_PATH + 'frequency_dict_' + lang2_code + '.csv'
if os.path.exists(lang1_path) and os.path.exists(lang2_path):
	print_status('Getting dictionaries...')
	frequency_lang1_df = pd.read_csv(lang1_path, encoding='utf-16')
	frequency_lang1_dict = frequency_lang1_df.set_index('word')['frequency'].to_dict()
	frequency_lang2_df = pd.read_csv(lang2_path, encoding='utf-16')
	frequency_lang2_dict = frequency_lang2_df.set_index('word')['frequency'].to_dict()
else:
	print("Please run: python train_probability.py " + lang1_code + " " + lang2_code)
	exit(1)

# Create n-gram frequency dictionaries
ns = [
	2,
	3,
	4,
	5,
else:
	tokenized_sentences_lang1 = get_tokenized_sentences(lang1_code, lang1_name)
	with open(lang1_path, 'wb') as fp:
		pickle.dump(tokenized_sentences_lang1, fp)

lang2_path = WORD_LEVEL_DICTIONARIES_PATH + 'tokenized_sentences_' + lang2_code + '.p'
if os.path.exists(lang2_path):
	tokenized_sentences_lang2 = pd.read_pickle(lang2_path)
else:
	tokenized_sentences_lang2 = get_tokenized_sentences(lang2_code, lang2_name)
	with open(lang2_path, 'wb') as fp:
		pickle.dump(tokenized_sentences_lang2, fp)

# Train n-gram models
ns = [2, 3]
for n in ns:
	print_status('Training word ngrams model... n=' + str(n))

	model_lang1 = NGramModel(n)
	model_lang1.train(tokenized_sentences_lang1)
	write_dict(WORD_LEVEL_DICTIONARIES_PATH, model_lang1.freq_dist,
	           str(n) + '_grams_word_dict_' + lang1_code)

	model_lang2 = NGramModel(n)
	model_lang2.train(tokenized_sentences_lang2)
	write_dict(WORD_LEVEL_DICTIONARIES_PATH, model_lang2.freq_dist,
	           str(n) + '_grams_word_dict_' + lang2_code)

print_status('Done!')
from sklearn.svm import LinearSVC
import pandas as pd
import sys
import os
from tools.utils import print_status

# Get language codes and evaluation dataset from keyboard
if len(sys.argv) < 4:
	print("Please give two letter language codes as arg, for example en es")
	print("Please enter evaluation dataset: 'dev', 'test' or 'test-original'")
	exit(1)

lang1_code = sys.argv[1]
lang2_code = sys.argv[2]
evaluation_dataset = sys.argv[3]

# Get training dictionaries
print_status("Getting tokenized sentences...")
lang1_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang1_code + '.p'
lang2_path_tokenized = './dictionaries/word-level/tokenized_sentences_' + lang2_code + '.p'
if os.path.exists(lang1_path_tokenized) and os.path.exists(lang2_path_tokenized):
	tokenized_sentences_lang1 = pd.read_pickle(lang1_path_tokenized)
	tokenized_sentences_lang2 = pd.read_pickle(lang2_path_tokenized)
else:
	print("Please run: python train_ngrams_word.py " + lang1_code + " " + lang2_code + " 2")
	exit(1)

# Flatten the sentence lists into one long array of words, capped at 100k
tokenized_sentences_lang1 = [
	item for sent in tokenized_sentences_lang1 for item in sent
][:100000]