Example No. 1
import importlib
import os

import spacy
from bs4 import BeautifulSoup

# print_status and is_other are helper functions defined elsewhere in the project.
def get_frequency_dict(lang_code, lang_name):
	print_status("Creating frequency dictionaries...")

	frequency_dict = dict()

	# Load data
	for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
		if ('.DS_Store' in files):
			files.remove('.DS_Store')
		for f in files:
			print(f)
			filepath = os.path.join(root, f)
			file = open(filepath, 'rt', encoding='utf8')
			text = file.read()
			file.close()

			# Clean XML tags
			cleantext = BeautifulSoup(text, "lxml").text

			module = importlib.import_module("spacy.lang." + lang_code)
			nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
			tokenizer = nlp.Defaults.create_tokenizer(nlp)
			tokens = list(tokenizer(cleantext))

			for word in tokens:
				word = word.text.lower()

				if is_other(word):
					continue
				if word in frequency_dict:
					frequency_dict[word] += 1
				else:
					frequency_dict[word] = 1
	return frequency_dict
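
A minimal usage sketch, assuming the spaCy 2.x API used above (Defaults.create_tokenizer), an installed spacy.lang.en package whose language class is English, and a datasets/monolingual-en folder on disk; the call below is illustrative, not part of the original module.

# Hypothetical call: build a word -> count table from the English monolingual corpus.
en_frequencies = get_frequency_dict('en', 'English')
print(en_frequencies.get('the', 0))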
Example No. 2

import importlib
import os
import re

import spacy
from bs4 import BeautifulSoup

def get_tokenized_sentences(lang_code, lang_name):

    tokenizedFile = []
    # Initialize tokenizer
    module = importlib.import_module("spacy.lang." + lang_code)
    nlp = getattr(module, lang_name)() if module is not None else spacy.language.Language()
    tokenizer = nlp.Defaults.create_tokenizer(nlp)

    # Load data
    print_status("Creating tokenized sentences from dataset...")
    for root, dirs, files in os.walk('datasets/monolingual-' + lang_code):
        if ('.DS_Store' in files):
            files.remove('.DS_Store')
        for f in files:
            print(f)
            filepath = os.path.join(root, f)
            file = open(filepath, 'rt', encoding='utf8')
            text = file.read()
            file.close()

            # Clean XML tags
            cleantext = BeautifulSoup(text, "lxml").text

            # Split in sentences
            sentences = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s",
                                 cleantext)

            # Split in tokens
            for s in sentences:
                word_tokens = []
                tokens = list(tokenizer(s))
                for t in tokens:
                    t = t.text.lower()
                    if (not is_other(t)):
                        word_tokens.append(t)

                tokenizedFile.append(word_tokens)

    return tokenizedFile
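
Another hedged usage sketch under the same assumptions about the corpus folder and the spaCy language package; each element of the returned list is one sentence given as a list of lowercased tokens with 'other' tokens already filtered out. The call is illustrative only.

# Hypothetical call: tokenized Spanish sentences, e.g. as training input for an n-gram or embedding model.
es_sentences = get_tokenized_sentences('es', 'Spanish')
print(len(es_sentences))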
Example No. 3
        word = word.lower()

        # Get lang1 prob
        if word in probability_lang1_dict:
            prob_lang1 = probability_lang1_dict[word]
        else:
            prob_lang1 = probability_lang1_dict['OOV']

        # Get lang2 prob
        if word in probability_lang2_dict:
            prob_lang2 = probability_lang2_dict[word]
        else:
            prob_lang2 = probability_lang2_dict['OOV']

        # Assign class based on regex or class with highest prob
        if (is_other(word)):
            lang = 'other'
        else:
            if (prob_lang1 >= prob_lang2):
                lang = 'lang1'
            else:
                lang = 'lang2'

        y.append(lang)
        predictions_dict[word] = lang
    else:
        y.append('')

if (evaluation_dataset == 'test-original'):
    save_predictions(
        y, './results/predictions/' + lang1_code + '-' + lang2_code +
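
The snippet above relies on per-language word-probability dictionaries that fall back to an 'OOV' entry for unseen words. A rough sketch of how such a dictionary could be derived from a frequency dictionary with add-one smoothing follows; the helper name and the use of log probabilities are assumptions for illustration, not taken from the original code. Since the snippet only compares the two values, raw and log probabilities lead to the same predicted labels.

import math

def build_probability_dict(frequency_dict):
    # Reserve one smoothed count for out-of-vocabulary words.
    total = sum(frequency_dict.values()) + len(frequency_dict) + 1
    probability_dict = {word: math.log((count + 1) / total)
                        for word, count in frequency_dict.items()}
    probability_dict['OOV'] = math.log(1 / total)
    return probability_dict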
Example No. 4
        if (line.strip() != ''):
            token = line.rstrip('\n')
            words.append(token.lower())
        else:
            words.append('')
file.close()

# Choose language with highest probability for each word based on ngrams
y = []
predictions_dict = dict()
counter = 0
print_status("Classifying...")
for word in words:
    if (word != ''):
        word = word.lower()
        if is_other(word):
            lang = 'other'
        else:
            prob_lang1 = model_lang1.get_word_log_prob(word)
            prob_lang2 = model_lang2.get_word_log_prob(word)
            if (prob_lang1 >= prob_lang2):
                lang = 'lang1'
            else:
                lang = 'lang2'

        y.append(lang)
        predictions_dict[word] = lang

        if counter % 10000 == 0:
            print(f"{counter} of {len(words)}")
        counter += 1
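
model_lang1 and model_lang2 are models defined elsewhere in the project; the only interface this snippet relies on is get_word_log_prob(word). As an illustration of that interface, here is a hedged sketch of a character-bigram model with add-one smoothing; the class and its internals are assumptions, not the project's actual implementation.

import math
from collections import Counter

class CharBigramModel:
    def __init__(self, words):
        # Count character bigrams and their single-character contexts, with word boundary markers.
        self.bigrams = Counter()
        self.unigrams = Counter()
        for w in words:
            padded = '<' + w + '>'
            for i in range(len(padded) - 1):
                self.bigrams[padded[i:i + 2]] += 1
                self.unigrams[padded[i]] += 1

    def get_word_log_prob(self, word):
        padded = '<' + word + '>'
        log_prob = 0.0
        for i in range(len(padded) - 1):
            bigram = padded[i:i + 2]
            context = padded[i]
            # Add-one smoothing over the observed character inventory.
            log_prob += math.log((self.bigrams[bigram] + 1) /
                                 (self.unigrams[context] + len(self.unigrams)))
        return log_prob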
Example No. 5

            s.append(token.lower())
        else:
            sentences.append(s)
            s = []

file.close()

y = []
predictions_dict = dict()
for tokens in sentences:
    if (len(tokens) > 0):
        # Separate 'lang' words from 'other' words
        lang_tokens = []
        other_indexes = []
        for i in range(len(tokens)):
            if (is_other(tokens[i])):
                other_indexes.append(i)
            else:
                lang_tokens.append(tokens[i])

        # For sentences with 'lang1', 'lang2' and 'other' words
        if (len(lang_tokens) > 0):
            y_sentence = identifier.identify(lang_tokens)
            for index in other_indexes:
                y_sentence.insert(index, 'other')

        # For sentences that are made up only of 'other' words
        else:
            y_sentence = []
            for index in other_indexes:
                y_sentence.append('other')
        for i in range(len(tokens)):
            predictions_dict[tokens[i]] = y_sentence[i]
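
The reinsertion of 'other' labels above only lines up because other_indexes is collected in increasing order. A tiny standalone check with made-up tokens and labels shows that ascending list.insert calls restore the original positions:

tokens = ['hola', ',', 'my', 'friend', '!']
other_indexes = [1, 4]                    # positions of the punctuation tokens
y_sentence = ['lang2', 'lang1', 'lang1']  # labels for the remaining words, in order
for index in other_indexes:
    y_sentence.insert(index, 'other')
print(y_sentence)  # ['lang2', 'other', 'lang1', 'lang1', 'other']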
Example No. 6

	# Original test set
	for line in file:
		# Remove empty lines, lines starting with # sent_enum, \n and split on tab
		if (line.strip() != ''):
			token = line.rstrip('\n')
			words.append(token.lower())
		else:
			words.append('')
file.close()


# Remove 'other' words
print_status("Removing 'other' data...")
words_not_other = []
for word in words:
	if(word != '' and not is_other(word)):
		words_not_other.append(word)


# Convert a collection of words to a matrix of token counts
print_status("Counting ngrams...")
# vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorized_train_data = vectorizer.fit_transform(X_train)
vectorized_dev_data = vectorizer.transform(words_not_other)


# Create and fit the LDA model
print_status("Training LDA...")
number_topics = 2
lda_model = LDA(n_components=number_topics, max_iter=100, random_state=123)
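
The snippet stops after constructing the model (LDA is presumably sklearn's LatentDirichletAllocation imported under that alias). A hedged sketch of the step that would normally follow, fitting on the training matrix and turning the per-word topic distributions into language labels; the topic-to-language assignment is an assumption made only for illustration.

lda_model.fit(vectorized_train_data)
topic_distributions = lda_model.transform(vectorized_dev_data)  # one row per word, two topic weights

# Hypothetical mapping: topic 0 -> lang1, topic 1 -> lang2.
predictions = ['lang1' if row[0] >= row[1] else 'lang2'
               for row in topic_distributions]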
Example No. 7
            token = line.rstrip('\n')
            s.append(token.lower())
        else:
            sentences.append(s)
            s = []
file.close()

# Choose language with highest probability for each word based on ngrams
y = []
predictions_dict = dict()
counter = 0
print_status("Classifying...")
for s in sentences:
    if (len(s) == 0):
        continue
    for word_index in range(len(s)):
        if is_other(s[word_index]):
            lang = 'other'
        else:
            prob_lang1 = model_lang1.get_word_log_prob(s, word_index)
            prob_lang2 = model_lang2.get_word_log_prob(s, word_index)
            if (prob_lang1 >= prob_lang2):
                lang = 'lang1'
            else:
                lang = 'lang2'
        y.append(lang)
        predictions_dict[s[word_index]] = lang

    if counter % 10000 == 0:
        print(f"{counter} of {len(sentences)}")
    counter += 1
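
A hedged sketch of how the flat prediction list could be scored afterwards against gold labels; the gold and predicted lists below are made up purely to show the scoring calls and are not results from the original script.

from sklearn.metrics import accuracy_score, f1_score

gold = ['lang1', 'lang2', 'other', 'lang1']
pred = ['lang1', 'lang2', 'other', 'lang2']
print("Accuracy:", accuracy_score(gold, pred))
print("Macro F1:", f1_score(gold, pred, average='macro'))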