def detect_language(comment):
    """
    To detect language we could compare a comment to stopwords from each language. The language that has most
    stopwords in common with the comment is likely to be the language in which the comment is written. This is obviously
    not waterproof, however, a well written comment would work way better than a comment written in slang or with poor
    grammar. Ultimately, this would likely result in comments that are more valuable because of their structure.
    In addition, languages that are easily distinguished from English could be detected, thus being able to compare the
    language of a comment to the actual content that is annotated in Hypothes.is, since most users won't understand
    comments in a different language anyway. 
    """

    # first we tokenize the comment
    tokens = wordpunct_tokenize(comment)
    words = [word.lower() for word in tokens]

    languages_ratios = {}

    # Then we compare the words to the most frequent stopwords per language
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        # Calculate the language score
        languages_ratios[language] = len(common_elements)

    # Get the key with the highest value
    most_rated_language = max(languages_ratios, key=languages_ratios.get)

    return most_rated_language
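
# Usage sketch (illustrative, not part of the original snippet): assumes NLTK is
# installed and the 'stopwords' corpus has been downloaded via nltk.download('stopwords').
if __name__ == "__main__":
    from nltk.corpus import stopwords
    from nltk.tokenize import wordpunct_tokenize

    print(detect_language("This is a short, well written English comment."))  # -> 'english'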
Example no. 2
def calculate_language_scores(text):
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary mapping each language to the number of unique stopwords seen in the analyzed text.
    :rtype: dict(str -> int)

    :raises: TypeError
    """
    if not isinstance(text, basestring):
        raise TypeError("Expected basestring, got '%s' instead" % type(text))
    if not text:
        return {}

    languages_ratios = {}

    # Split the text into separate tokens, using natural language punctuation signs.
    tokens = wordpunct_tokenize(text)
    tokenized_words = [word.lower() for word in tokens]

    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(tokenized_words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
Example no. 3
def generate(languages):
    """
    Generate a dict of language stopwords from nltk.corpus for the specified
    languages.

        language_stopwords:
            {
                'english': {'a', 'an'},
                'french': {'un', 'une'}
            }

    @param languages: languages specified, e.g. ['english', 'french']
    @type languages: list

    @return: a dict of stopwords corresponding to the specific languages
    @rtype: dict
    """

    language_stopwords = {}

    for language in stopwords.fileids():
        if language in languages:
            stopwords_set = set(stopwords.words(language))
            language_stopwords[language] = stopwords_set

    return language_stopwords
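
# Usage sketch (illustrative, not part of the original snippet): assumes the NLTK
# 'stopwords' corpus has been downloaded.
if __name__ == "__main__":
    from nltk.corpus import stopwords

    language_stopwords = generate(['english', 'french'])
    print(sorted(language_stopwords.keys()))       # ['english', 'french']
    print('the' in language_stopwords['english'])  # True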
def _calculate_languages_ratios(text): 
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of unique stopwords
        seen in the analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios
def hello_world():
    if request.method == 'POST':
        print "Request: ", request
        print "Form: ", request.form
        print "Files: ", request.files
        archive = zipfile.ZipFile(request.files.get("solution"))
        with archive.open("extra.txt") as solution:
            languages_ratios = {}
            tokens = nltk.wordpunct_tokenize(solution.read().decode('utf-8'))
            words_list = [word.lower() for word in tokens]
            words_set = set(words_list)
            print "Words_set: ", words_set
            for language in stopwords.fileids():
                stopwords_set = set(stopwords.words(language))
                common_elements = words_set.intersection(stopwords_set)
                if common_elements:
                    languages_ratios[language] = len(common_elements)

            print "Language ratios: ", languages_ratios
            # 50%
            mark = 50 if max(languages_ratios, key=languages_ratios.get) == 'english' else 0
            # 50%
            print "Mark for lang: ", mark
            words_count = len(words_list)
            print "Words count: ", words_count
            mark += (float(words_count) / 200) * 50 if words_count < 200 else 50
            print "Total Mark: ", mark

        req = requests.post(request.form["url"], data={"mark": int(mark)})

    return ''
def split_count(sentence):     # split the sentence and count which language each word comes from
    # idea: build a parallel list, naming the language and the word at the same index
    vocab_list = []
    languages_ratios = {}
    split = wordpunct_tokenize(sentence)              # tokenizes the input
    words = [word.lower() for word in split]          # lowercases every token in the list
    lang_dict = {} 
    for language in stopwords.fileids():              # iterate through a list of lang built in 
        stopwords_set = set(stopwords.words(language)) 
        words_set = set(words)                        # creates a set of words 
        vocab_list = words                            # good
        # print "this is word set: " ,words_set
        #print "this is vocablist: " , vocab_list 
        common_element = words_set.intersection(stopwords_set)
        
        languages_ratios[language] = len(common_element) # this will determine the score
        lang_dict[language] = common_element          # works as intended, but could be cleaner

        #main_language_set = 
        #secondary_lang    = lang_dict.intersection( secondary_lang) 

    # print "size of vocab: ",len(vocab_list)     #,"and lang ", len(lang_list)  ---Delete
    # for i in range(len(vocab_list)):
    #     print  lang_list[i],vocab_list[i]
    #     print "----------------------------"
    print "this is the set for main lang:", lang_dict.get(main_language), "\n"
    print "this is the set for second lang:", lang_dict.get(secondary_lang),"\n"
    # print "this lang. ratios", languages_ratios , "\n"
    # print "this is lang list: ",lang_list
    print "this is vocb_list: ", vocab_list , "\n" # check good 
    print "this is DICT: ", lang_dict
    print "ORIGINAL SENTENCE: " , sentence
def calcularValoresDeIdioma(contenido):
    languages_ratios = {}
    tokens = wordpunct_tokenize(contenido)
    words = [word.lower() for word in tokens]
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
    return languages_ratios
def calculate_languages_ratios(text):
	languages_ratios = {}
	tokens = wordpunct_tokenize(text)
	words = [word.lower() for word in tokens]
	for language in stopwords.fileids():
		stopwords_set = set(stopwords.words(language))
		words_set = set(words)
		common_elements = words_set.intersection(stopwords_set)
		languages_ratios[language] = len(common_elements) # language "score"
	return languages_ratios
Example no. 9
    def detectLanguage(self, text):
        languages_scores = {}
        tokens = word_tokenize(text)
        words = [word.lower() for word in tokens]
        # For each language included in NLTK, count the number of unique
        # stopwords appearing in the analyzed text
        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            words_set = set(words)
            common_elements = words_set.intersection(stopwords_set)
            languages_scores[language] = len(common_elements) # language "score"

        return max(languages_scores, key=languages_scores.get)
Example no. 10
	def check_language(self, word_list):
		""" source: http://blog.alejandronolla.com/2013/05/15/detecting-text-language-with-python-and-nltk/""" 
		languages_ratios = {}
		for language in stopwords.fileids():
			stopwords_set = set(stopwords.words(language))
			words_set = set(word_list)
			# Check similarity
			common_elements = words_set.intersection(stopwords_set)
			# Save as ratio
			languages_ratios[language] = len(common_elements)

		# Get language with most similarities
		most_rated_language = max(languages_ratios, key=languages_ratios.get)
		return most_rated_language
def _calculate_languages_ratios(text):
    text = str(text)  # ensure we receive a string
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios
def cal():
    text = sys.stdin.read()
    languages_ratios = {}
    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements)
        
    most = max(languages_ratios, key=languages_ratios.get)
    print(most)
def language_detector(string):

	tokens = wordpunct_tokenize(string)
	words = [word.lower() for word in tokens]

	# compute language scores
	languages_ratios = {}
	for language in stopwords.fileids():
		stopwords_set = set(stopwords.words(language))
		words_set = set(words)
		common_elements = words_set.intersection(stopwords_set)
		languages_ratios[language] = len(common_elements) # language "score"

	most_rated_language = max(languages_ratios, key=languages_ratios.get)
	return most_rated_language
Example no. 14
	def _calculate_languages_ratios(self, text):
		# Compute the probability that a text is written in one language or another and
		# return a dictionary that looks like {'french': 2, 'english': 4, 'dutch': 0}

		languages_ratios = {}
		tokens = self.getWords(text)

		# Count, for each language, how many stopwords appear.
		for language in stopwords.fileids():
			stopwords_set = set(stopwords.words(language))
			words_set = set(tokens)
			common_elements = words_set.intersection(stopwords_set)

			languages_ratios[language] = len(common_elements) # number of stopword occurrences per language

		return languages_ratios
Example no. 15
    def lang_likelihood(self, document):
        ''' This method computes the language likelihood using the stopword
        lists and tokenizer from NLTK.
        '''
        languages_likelihood = {}

        tokens = wordpunct_tokenize(document)
        words = [word.lower() for word in tokens]

        for language in stopwords.fileids():
            stopwords_set = set(stopwords.words(language))
            words_set = set(words)
            common_elements = words_set.intersection(stopwords_set)

            languages_likelihood[language] = len(common_elements) # language "score"
        
        return languages_likelihood
def _calculate_languages_ratios(text):
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    @param text: Text whose language is to be detected
    @type text: str

    @return: Dictionary mapping each language to the number of unique stopwords
        seen in the analyzed text
    @rtype: dict
    """

    languages_ratios = {}

    # nltk.wordpunct_tokenize() splits all punctuation into separate tokens:
    #
    # >>> wordpunct_tokenize("That's thirty minutes away. I'll be there in ten.")
    # ['That', "'", 's', 'thirty', 'minutes', 'away', '.', 'I', "'", 'll', 'be', 'there', 'in', 'ten', '.']

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        if (language == "portuguese"):
            lista=stopwords.words(language)
            lista.append('Fatec')
            lista.append('fatec')
            lista.append('Palmeiras')
            lista.append('palmeiras')
            lista.append('Dilma')
            lista.append('dilma')
            lista.append('Copa')
            lista.append('copa')
            stopwords_set=set(lista)
        else: 
            stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_elements) # language "score"

    return languages_ratios
def main(): 
# step 1: tokenize the words so we have clean words to match against the stopword lists

    print "\n -----------------------------\n"
    split = wordpunct_tokenize("hola como estas, espero que estes bien" )
    print split
    print "\n -----------------------------\n"
    #
    # Let's get serious
    #
    languages_ratios = {}
    tokens = wordpunct_tokenize("hola como estas?")
    words = [word.lower() for word in tokens]
    for language in stopwords.fileids(): 
        stopwords_set = set(stopwords.words(language))
        words_set = set(words) 
        common_element = words_set.intersection(stopwords_set)
        languages_ratios[language] = len(common_element) # this will determine the score

    print languages_ratios
Example no. 18
def calculate_language_scores(text):
    """
    Calculate the likelihood that the given text is written in each of several
    languages and return a dictionary that looks like {'french': 2, 'spanish': 4, 'english': 0}.

    :param text: Text to analyze.
    :type text: str

    :return: Dictionary mapping each language to the number of unique stopwords seen in the analyzed text.
    :rtype: dict(str -> int)
    """

    # Split the text into separate tokens, using natural language punctuation signs.
    words = {word.lower() for word in wordpunct_tokenize(text)}

    # Return the number of stopwords found per language.
    return {
        language: len(words.intersection(stopwords.words(language)))
        for language in stopwords.fileids()
    }
def identify_language(text):
    """
    Identify a language, given a text of that language.

    Parameters
    ----------
    text : str

    Returns
    -------
    list of tuples (ISO 639-3, score)

    Examples
    --------
    >>> identify_language('Ich gehe zur Schule.')
    [('deu', 0.8)]
    """
    languages_ratios = []

    tokens = wordpunct_tokenize(text)
    words = [word.lower() for word in tokens]

    # Check how many stopwords of the languages NLTK knows appear in the
    # provided text
    for language in stopwords.fileids():
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        score = len(common_elements)
        languages_ratios.append((language, score))

    # Normalize
    sum_scores = float(sum(el[1] for el in languages_ratios))
    languages_ratios = [(_nltk_to_iso369_3(el[0]), el[1])
                        for el in languages_ratios]
    if sum_scores > 0:
        languages_ratios = [(el[0], el[1] / sum_scores)
                            for el in languages_ratios]

    return sorted(languages_ratios, key=lambda n: n[1], reverse=True)
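
# The helper _nltk_to_iso369_3 is not shown in this snippet. A minimal sketch of what it
# presumably does (map NLTK stopword fileids to ISO 639-3 codes); the mapping below only
# covers a few languages and is an illustrative assumption, not the original implementation.
_NLTK_TO_ISO639_3 = {
    'english': 'eng', 'german': 'deu', 'french': 'fra', 'spanish': 'spa',
    'dutch': 'nld', 'italian': 'ita', 'portuguese': 'por', 'russian': 'rus',
}


def _nltk_to_iso369_3(language):
    # fall back to the NLTK corpus name when no ISO 639-3 code is listed
    return _NLTK_TO_ISO639_3.get(language, language)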
Example no. 20
	def detect_lang(self,text):
		""" Returns the detected language.

		Args:
		  text: input text 

		Returns:
		  the detected language string
		"""
		language_ratio = {}
		words = wordpunct_tokenize(text)

		for language in stopwords.fileids():
			stopwords_set = set(stopwords.words(language))
			words_set = set(words)
			common_words = words_set.intersection(stopwords_set)
			language_ratio[language] = len(common_words)

		detected_lang = max(language_ratio, key=language_ratio.get)

		return detected_lang
Example no. 21
def capitalize(text):
    """
    Text capitalizer for Python 2.
    """
    if isinstance(text, str):
        text = text.decode("utf-8")
    if set(text) & CYRILLIC_ALPHABET:
        language = "russian"
    else:
        words = set(wordpunct_tokenize(text.lower()))
        language = max(
            stopwords.fileids(),
            key=lambda lang: len(words & PRECALCULATED_LANGSETS[lang])
        )

    class_ = EnglishCapitalization
    if language == "russian":
        class_ = RussianCapitalization
    elif language == "spanish":
        class_ = SpanishCapitalization
    elif language == "dutch":
        class_ = DutchCapitalization
    return class_().capitalize(text)
Example no. 22
def _calculate_languages_ratios(words):
    """
    Calculate the likelihood that the given words come from each of several
    languages and return a dictionary that looks like
    {'french': 2, 'spanish': 4, 'english': 0}.

    @param words: Tokenized words whose language is to be detected
    @type words: list

    @return: Dictionary mapping each language to the number of unique stopwords
        seen in the analyzed words
    @rtype: dict
    """

    languages_ratios = {}

    # For each language included in NLTK, count the number of unique stopwords appearing in the analyzed text
    for language in stopwords.fileids():
        stopwords_set = set([i.encode("utf-8") for i in stopwords.words(language)])
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)

        languages_ratios[language] = len(common_elements)  # language "score"

    return languages_ratios
Example no. 23
    def _get_available_languages(self):
        """ Get available languages by listing nltk's stopwords files """
        return stopwords.fileids()
Example no. 24
    def construct_ui(self):
        self.setWindowTitle('TextAnalysis')

        self.mainWidget = QtWidgets.QWidget()
        self.setCentralWidget(self.mainWidget)

        self.statusBar = QtWidgets.QStatusBar()
        self.setStatusBar(self.statusBar)

        layout = QtWidgets.QGridLayout()
        self.mainWidget.setLayout(layout)

        # text input
        layout.addWidget(QtWidgets.QLabel("Paste text to analyze or "), 0, 0,
                         1, 2)
        self.button_file = QtWidgets.QPushButton("open file")

        layout.addWidget(self.button_file, 0, 2, 1, 1)
        self.textin = QtWidgets.QPlainTextEdit(
            "👍 and 📋 and 👌 and 😋 and 👍 again. In an interview on BBC Radio 4’s Today programme, Raab argued that leaving the EU without a deal would not be a problem, partly because the general agreement on tariffs and trade (Gatt) could be applied to create a standstill on tariffs with the EU. Mark Carney, the governor of the Bank of England, and Liam Fox, the trade secretary, have said it is not possible for the UK to trigger this unilaterally. But Raab said Carney was not a lawyer and claimed that legally it could be done and the question is whether there is the political will."
        )
        layout.addWidget(self.textin, 1, 0, 1, 4)

        self.separatorLine1 = QtWidgets.QFrame()
        self.separatorLine1.setFrameShape(QtWidgets.QFrame.HLine)
        self.separatorLine1.setFrameShadow(QtWidgets.QFrame.Plain)
        self.separatorLine1.setLineWidth(1)
        layout.addWidget(self.separatorLine1, 2, 0, 1, 4)

        # emoji interface
        layout.addWidget(QtWidgets.QLabel("Create emoji statistics:"), 3, 0, 1,
                         4)
        self.button_emoji = QtWidgets.QPushButton("analyze")
        layout.addWidget(self.button_emoji, 4, 0, 1, 1)
        self.textout_emoji = QtWidgets.QPlainTextEdit()
        self.textout_emoji.setPlainText(
            "output goes here (copy and paste into e.g. a spreadsheet)")
        layout.addWidget(self.textout_emoji, 5, 0, 1, 4)

        self.separatorLine2 = QtWidgets.QFrame()
        self.separatorLine2.setFrameShape(QtWidgets.QFrame.HLine)
        self.separatorLine2.setFrameShadow(QtWidgets.QFrame.Plain)
        self.separatorLine2.setLineWidth(1)
        layout.addWidget(self.separatorLine2, 6, 0, 1, 4)

        # bigram interface
        layout.addWidget(QtWidgets.QLabel("Create ngram statistics:"), 7, 0, 1,
                         4)
        layout.addWidget(QtWidgets.QLabel("Stopwords:"), 8, 0, 1, 1)
        self.languagelist = QtWidgets.QComboBox()
        self.languagelist.addItem("- none -")
        for item in stopwords.fileids():
            self.languagelist.addItem(item)
        layout.addWidget(self.languagelist, 8, 1, 1, 1)
        layout.addWidget(QtWidgets.QLabel("Windowsize:"), 8, 2, 1, 1)
        self.windowsize = QtWidgets.QLineEdit()
        self.windowsize.setMaxLength(1)
        self.windowsize.setText("2")
        layout.addWidget(self.windowsize, 8, 3, 1, 1)
        self.button_ngrams = QtWidgets.QPushButton("analyze")
        layout.addWidget(self.button_ngrams, 9, 0, 1, 1)
        self.textout_ngrams = QtWidgets.QPlainTextEdit()
        self.textout_ngrams.setPlainText(
            "output goes here (copy and paste into e.g. a spreadsheet)")
        layout.addWidget(self.textout_ngrams, 10, 0, 1, 4)

        # event binding
        self.button_file.clicked.connect(self.opentextfile)
        self.button_emoji.clicked.connect(self.emojistats)
        self.button_ngrams.clicked.connect(self.start_ngrams)
Example no. 25
#!/usr/bin/env python

import nltk
import numpy as np
import sys

from nltk.corpus import stopwords
from review_data import read_reviews

###############################################################################

languages = stopwords.fileids()

stopword_sets = [set(stopwords.words(lang)) for lang in languages]

target_languages = [u'english', u'spanish']

###############################################################################

def detect_language(tokens):
    token_set = set(tokens)

    lang_scores = []
    for stopword_set in stopword_sets:
        common_words = stopword_set & token_set
        lang_scores.append(len(common_words))

    best_index = np.argmax(lang_scores)
    best_lang = languages[best_index]
    return best_lang
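
# Usage sketch (illustrative, not part of the original snippet): the function expects an
# already tokenized (and lowercased) token list, e.g. from nltk.wordpunct_tokenize.
if __name__ == '__main__':
    sample = "Dies ist ein kurzer deutscher Satz."
    sample_tokens = [token.lower() for token in nltk.wordpunct_tokenize(sample)]
    print(detect_language(sample_tokens))  # -> 'german'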
from nltk.corpus import stopwords
english_stops=set(stopwords.words('english'))
words=["Can't", 'is','a','contraction']
print [word for word in words if word not in english_stops]

print stopwords.fileids()
import math
import operator
import re
from pprint import pprint
import numpy as np
from SequenceMining import GspSearch
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from nltk.corpus import stopwords

### LANGUAGE RECOGNITION ###

if __name__ == '__main__':
    # These are the available languages with stopwords from NLTK
    languages = stopwords.fileids()

    # Fill the dictionary of languages to avoid unnecessary function calls
    print("Loading stop words...", end='\r')
    try:
        dict_list = np.load('stopwords.npy').item()
    except:
        dict_list = {}
        for lang in languages:
            dict_list[lang] = {}
            for stop_word in stopwords.words(lang):
                dict_list[lang][stop_word] = 0
        np.save('stopwords.npy', dict_list)
    print("Loaded stop words.      ")

Example no. 28
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r'\s+', gaps=True)
english_stopwords = set(stopwords.words('english'))
words = tokenizer.tokenize("Can't is a contraction.")
print(words)
filtered = [word for word in words if word not in english_stopwords]
print(filtered)

# stopwords of all languages
# print(stopwords.words())

# languages
print(stopwords.fileids())
Example no. 29
    def run(self, table):
        # concatenate all cell's contents
        cell_content = ' '.join(
            map(
                attrgetter('content'),
                table.cells()
            )
        )
        input_string = cell_content
        # add text from additional fields
        for field_name in self.additional_fields:
            if field_name in table.table_data:
                input_string += table.table_data[field_name]

        # tokenize string and extract lower case words
        tokens = wordpunct_tokenize(input_string)
        words = [word.lower() for word in tokens]

        # iterate over all languages in nltk and match their stopwords
        # to the words in our input string.
        # larger intersection -> higher score
        languages_ratios = {}
        for language in stopwords.fileids():
            # the language's stopwords
            stopwords_set = set(stopwords.words(language))
            # our words
            words_set = set(words)
            # intersection between the sets
            common_elements = len(words_set.intersection(stopwords_set))
            if common_elements > 0:
                languages_ratios[language] = common_elements

        # get top <n> languages
        top_n_languages = sorted(
            languages_ratios.items(),
            key=itemgetter(1),
            reverse=True
        )[:self.top_n]

        # sum all language scores to normalize the individual scores
        language_score_sum = sum(
            map(itemgetter(1), top_n_languages)
        )

        limit_match = len(self.limit) == 0

        # add annotations for each identified language
        for language in top_n_languages:
            language_name, language_score = language
            normalized_score = language_score/language_score_sum

            if language_name in self.limit and normalized_score >= self.limit[language_name]:
                limit_match = True

            table.annotations.append({
                'source': 'preprocessing',
                'task': 'LanguageDetection',
                'language': language_name,
                'score': normalized_score,
            })

        return limit_match
Example no. 30
	A multi-class classifier chooses one of many possible labels.
	A multi-binary classifier choose zero or more labels by combining multiple
	binary classifiers, 1 for each label.''')
classifier_group.add_argument('--binary', action='store_true', default=False,
	help='train a binary classifier, or a multi-binary classifier if --multi is also given')
classifier_group.add_argument('--multi', action='store_true', default=False,
	help='train a multi-class classifier, or a multi-binary classifier if --binary is also given')

feat_group = parser.add_argument_group('Feature Extraction',
	'The default is to lowercase every word, strip punctuation, and use stopwords')
feat_group.add_argument('--bigrams', action='store_true', default=False,
	help='include bigrams as features')
feat_group.add_argument('--no-lowercase', action='store_true', default=False,
	help="don't lowercase every word")
feat_group.add_argument('--filter-stopwords', default='no',
	choices=['no']+stopwords.fileids(),
	help='language stopwords to filter, defaults to "no" to keep stopwords')
feat_group.add_argument('--punctuation', action='store_true', default=False,
	help="don't strip punctuation")

score_group = parser.add_argument_group('Feature Scoring',
	'The default is no scoring, all words are included as features')
score_group.add_argument('--score_fn', default='chi_sq',
	choices=[f for f in dir(BigramAssocMeasures) if not f.startswith('_')],
	help='scoring function for information gain and bigram collocations, defaults to chi_sq')
score_group.add_argument('--min_score', default=0, type=int,
	help='minimum score for a word to be included, default is 0 to include all words')
score_group.add_argument('--max_feats', default=0, type=int,
	help='maximum number of words to include, ordered by highest score, defaults is 0 to include all words')

eval_group = parser.add_argument_group('Classifier Evaluation',
Example no. 31
from nltk.corpus import stopwords
from six import text_type


###############################################################################


# This small hack (?) helps NLTK to find its files.
NLTK_DATA.path[0:0] = [NLTK_PATH]


###############################################################################


PRECALCULATED_LANGSETS = {}
for _language in stopwords.fileids():
    stopwords_set = set(
        text_type(wrd, "utf-8") for wrd in stopwords.words(_language)
    )
    stopwords_set = (wordpunct_tokenize(word) for word in stopwords_set)
    PRECALCULATED_LANGSETS[_language] = set(chain.from_iterable(stopwords_set))

CYRILLIC_ALPHABET = text_type("ЙЦУКЕНГШЩЗХЪФЫВАПРОЛДЖЭЯЧСМИТЬЬБЮ", "utf-8")
CYRILLIC_ALPHABET = frozenset(CYRILLIC_ALPHABET + CYRILLIC_ALPHABET.lower())


###############################################################################


class Capitalization(object):
    """
Example no. 32
feat_group = parser.add_argument_group(
    'Feature Extraction',
    'The default is to lowercase every word, strip punctuation, and use stopwords'
)
feat_group.add_argument('--ngrams',
                        nargs='+',
                        type=int,
                        help='use n-grams as features.')
feat_group.add_argument('--no-lowercase',
                        action='store_true',
                        default=False,
                        help="don't lowercase every word")
feat_group.add_argument(
    '--filter-stopwords',
    default='no',
    choices=['no'] + stopwords.fileids(),
    help='language stopwords to filter, defaults to "no" to keep stopwords')
feat_group.add_argument('--punctuation',
                        action='store_true',
                        default=False,
                        help="don't strip punctuation")
feat_group.add_argument(
    '--value-type',
    default='bool',
    choices=('bool', 'int', 'float'),
    help=
    '''Data type of values in featuresets. The default is bool, which ignores word counts.
	Use int to get word and/or ngram counts.''')

score_group = parser.add_argument_group(
    'Feature Scoring',
Example no. 33
for review in tokenized_docs:
    new_review = []
    for token in review:
        """re是regular expression的所写,表示正则表达式 sub是substitute的所写,表示替换"""
        """re.sub(pattern, repl, string, count=0, flags=0)"""
        """x.sub()"""
        new_token = x.sub(u'', token)  # 不匹配正则的用空字符(u'')替换掉
        if not new_token == u'':
            new_review.append(new_token)
        if len(new_review) == 0:  # skip blanks
            continue
        tokenized_docs_no_punctuation.append(new_review)  # breakpoint
print(tokenized_docs_no_punctuation)

print("分词 nltk_data\\corpora\\stopwords\\*")
lang = stopwords.fileids()
print("语言:", lang)
stops = set(stopwords.words("english"))
print("英语单词-停顿词:", stops)
words = ["Don't", 'hesitate', 'to', 'ask', 'questions']
out = [word for word in words if word not in stops]
print(out)
"""相似性度量(比较)算法 参考源码 distance.py"""
out = edit_distance('relate', 'relation')  # 动态规划方法(y=x映射)
print(out)
X = set([10, 20, 30, 40])
Y = set([20, 30, 60])
print(jaccard_distance(X, Y))  # set-intersection method: |intersection| / |union|
"""
史密斯-沃特曼算法 nlp/Smith-Waterman-Algorithm-Example.gif
基于生物信息学的知识来匹配蛋白序列或者DNA序列的算法  找出两个序列中具有高相似度的片段
Example no. 34
def topics(df, model="lda", language=False, save=False):
    """ Either executes LDA or NMF on a dutch document.
    This is a simple implementation and only used for
    "fun" purposes. It is not so much to find the very
    best topics, but topics that are good enough. 
    
    
    Parameters:
    -----------
    df : pandas dataframe
        Pandas dataframe that contains the raw messages
    mode : str, default "lda"
        Which model to use for topic modelling. 
        Either "lda" or "nmf" works for now
    stopwords : str, default None
        If you want to remove stopwords, provide a local 
        link to the text file (that includes a list of words)
        including the extension. 
    
    """
    if save:
        file = open(f"results/topic_{model}.txt", "a")
    else:
        file = None

    # Prepare stopwords
    try:
        stopwords = nltk_stopwords.words(language)
    except:
        languages = nltk_stopwords.fileids()
        raise Exception(
            f"Please select one of the following languages: {languages}")

    # Create Topics
    for user in df.User.unique():
        print("#" * len(user) + "########", file=file)
        print("### " + user + " ###", file=file)
        print("#" * len(user) + "########\n", file=file)

        data_samples = df[df.User == user].Message_Only_Text
        data_samples = data_samples.tolist()

        if model == "lda":
            # Extracting Features
            tf_vectorizer = CountVectorizer(max_df=0.95,
                                            min_df=2,
                                            stop_words=stopwords)
            tf = tf_vectorizer.fit_transform(data_samples)

            # Fitting LDA
            topic_model = LatentDirichletAllocation(n_components=5,
                                                    max_iter=5,
                                                    learning_method='online',
                                                    learning_offset=50.,
                                                    random_state=0)
            topic_model.fit(tf)
            feature_names = tf_vectorizer.get_feature_names()
        else:
            # MNF uses tfidf
            tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                               min_df=2,
                                               stop_words=stopwords)
            tfidf = tfidf_vectorizer.fit_transform(data_samples)
            feature_names = tfidf_vectorizer.get_feature_names()

            # Run NMF
            topic_model = NMF(n_components=5,
                              random_state=1,
                              alpha=.1,
                              l1_ratio=.5,
                              init='nndsvd')
            topic_model.fit(tfidf)

        print("\nTopics in {} model:".format(model), file=file)
        print_top_words(topic_model, feature_names, 7, file=file)
Example no. 35
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 11 21:39:24 2018

@author: Ahmad
"""
import nltk
from nltk.corpus import stopwords
sw = stopwords.words("english")
sw[0]
len(sw)
stopwords_count = set(stopwords.words("english"))
len(stopwords_count)
stopwords_count
stopwords.fileids()

sw_a = stopwords.words("spanish")
sw[0]
set(stopwords.words("spanish"))


# &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
import nltk
from nltk.stem.snowball import SnowballStemmer
import string
words = "Hi Everyone  If you can read this message youre properly using parseOutText  Please proceed to the next part of the project"

stemmer = SnowballStemmer("english")
stemmer.stem("Hi")
        
words_list = words.split()
Example no. 36
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import names, stopwords, words
print(words.fileids())
print(words.words('en'))  # doctest: +ELLIPSIS
print(stopwords.fileids())  # doctest: +ELLIPSIS
print(stopwords.words('portuguese'))  # doctest: +ELLIPSIS
print(names.fileids())
print(names.words('male.txt'))  # doctest: +ELLIPSIS
print(names.words('female.txt'))  # doctest: +ELLIPSIS

from nltk.corpus import cmudict
print(cmudict.entries()[653:659])  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
# Load the entire cmudict corpus into a Python dictionary:
transcr = cmudict.dict()
print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()])
Example no. 37
def preprocess(args):
    path = './dataset/'

    doc_df = pd.read_csv(path + 'documents.csv')
    doc_dict = dict()
    # Fill nan(float) as none(str)
    doc_df = doc_df.fillna('')
    Languages = list(stopwords.fileids())

    # Build remove terms
    remove_title = [
        '[Text]', 'Language:', '<F P=105>', '</F>', 'Article Type:BFN',
        '<F P=106>', 'Article Type:CSO', '[Excerpt]', '[Editorial Report]',
        '[passage omitted]', 'ONLY <F P=103>', '<F P=104>'
    ]
    remove_term = [
        '.', '"', '--', '\'s', '<', '>', '[', ']', '`', ',', ':', '/', '\\',
        '{', '}', '-', '(', ')'
    ]
    my_stopwords = [
        'mr', 'he\'d', 'also', 'every', 'would', 'without', 'per', 'yesterday',
        'however', 'could', 'since', 'many', 'must', 'well', 'still', 'today',
        'people', 'next'
    ]

    print('Stopwords removing processing\n')
    # Build dictionary
    for doc in doc_df.iloc:
        temp_str = doc['doc_text']
        # Choose the stopwords fileid for this document; default is 'english'
        Lang_tmp = 'english'
        Lang_flag = False
        if ('Language: <F P=105>' in temp_str):
            Lang_flag = True
            Lang_tmp = temp_str.split('Language: <F P=105>')[1].split()[0]
            if (not (Lang_tmp.lower() in Languages)):
                Lang_flag = False
        # Removing meaningless words
        for t in remove_title:
            if (t in temp_str):
                temp_str = temp_str.replace(t, '')
        for w in remove_term:
            if (w in temp_str):
                temp_str = temp_str.replace(w, '')
        # Removing stopwords
        temp = temp_str.split()
        tmp_len = len(temp)
        for t in range(tmp_len):
            temp[t] = temp[t].lower()
        temp = Counter(temp)
        for m in my_stopwords:  # My stopwords set for all doc
            if (m in temp):
                del temp[m]
        for s in stopwords.words('english'):  # english stopwords for all doc
            if (s in temp):
                del temp[s]
        if (Lang_flag and Lang_tmp != 'English'):
            for s in stopwords.words(Lang_tmp.lower()):
                if (s in temp):
                    del temp[s]
        # Save to dict
        doc_dict[doc['doc_id']] = temp

    print('.pkl file output\n')
    # File output
    pickle_out = open(args.doc_dict_path, 'wb')
    pickle.dump(doc_dict, pickle_out)
    pickle_out.close()

    print('Done\n')
    return doc_dict
Example no. 38
# Pulling the data. Each page contains 20 tweets
while (cnt <= lim):
    new_tweets = api.user_timeline(screen_name=handle, page=cnt)
    all_tweets.extend(new_tweets)
    cnt += 1

# Pull out only the Tweets from all the other data
op = [tweet.text.encode("utf-8") for tweet in all_tweets]

for tweets in op:
    # Language Filter
    text = wordpunct_tokenize(tweets)
    lang_count = {}
    words = [word.lower() for word in text]

    for language in stopwords.fileids():
        stop_set = set(stopwords.words(language))
        word_set = set(words)

        common = word_set.intersection(stop_set)
        lang_count[language] = len(common)

    lang = max(lang_count, key=lang_count.get)
    # Store (currently appended to a CSV file rather than a DB)
    if (lang == "english"):
        print tweets
        saveFile = open('final.csv', 'a')
        saveFile.write(tweets)
        saveFile.write('\n')
        saveFile.close()
Example no. 39
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import stopwords
##################################################################
## Quick test
print(type(stopwords))  # <class 'nltk.corpus.reader.wordlist.WordListCorpusReader'>
print(len(stopwords.fileids()))  # 21; 21 languages supported; stored in ~/nltk_data/corpora/stopwords/; Chinese is not supported in this version
print(stopwords.fileids())  # ['arabic', 'azerbaijani', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish', 'turkish']
print(len(stopwords.words('english')))  # 179 stopwords
print(stopwords.words('english')[:5])  # ['i', 'me', 'my', 'myself', 'we']
print(stopwords.words('chinese')[:5])  # raises an error here, since this stopwords version has no Chinese list
    help="""the group of words that represents a single training instance,
	the default is to use entire files""",
)
corpus_group.add_argument(
    "--fraction", default=1.0, type=float, help="""The fraction of the corpus to use for testing coverage"""
)

feat_group = parser.add_argument_group(
    "Feature Extraction", "The default is to lowercase every word, strip punctuation, and use stopwords"
)
feat_group.add_argument("--ngrams", nargs="+", type=int, help="use n-grams as features.")
feat_group.add_argument("--no-lowercase", action="store_true", default=False, help="don't lowercase every word")
feat_group.add_argument(
    "--filter-stopwords",
    default="no",
    choices=["no"] + stopwords.fileids(),
    help='language stopwords to filter, defaults to "no" to keep stopwords',
)
feat_group.add_argument("--punctuation", action="store_true", default=False, help="don't strip punctuation")

args = parser.parse_args()

###################
## corpus reader ##
###################

reader_args = []
reader_kwargs = {}

if args.cat_pattern:
    reader_args.append(args.cat_pattern)
Example no. 41
try:
    from nltk.tokenize import wordpunct_tokenize # RE-based tokenizer which splits text on whitespace and punctuation (except for underscore)
except ImportError:
    print('[!] You need to install nltk (http://nltk.org/index.html)')
    

test_tokens = wordpunct_tokenize(text)
test_tokens


#### stopwords#######

from nltk.corpus import stopwords
stopwords.readme().replace('\n', ' ') # Since this is raw text, we need to replace \n's with spaces for it to be readable.

stopwords.fileids() # Most corpora consist of a set of files, each containing a piece of text. A list of identifiers for these files is accessed via fileids().


stopwords.words('english')[:10]

len(stopwords.words('english'))

###
##We loop through the list of stop words in all languages and check how many stop words our test text contains in each language. The text is then classified to be in the language in which it has the most stop words.
language_ratios = {}

test_words = [word.lower() for word in test_tokens] # lowercase all tokens
test_words_set = set(test_words)

for language in stopwords.fileids():
    stopwords_set = set(stopwords.words(language)) # For some languages eg. Russian, it would be a wise idea to tokenize the stop words by punctuation too.
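    # The snippet breaks off here in the source; following the description above, the
    # remaining steps would look roughly like this (a sketch, not the original code):
    common_elements = test_words_set.intersection(stopwords_set)
    language_ratios[language] = len(common_elements)

most_rated_language = max(language_ratios, key=language_ratios.get)
print(most_rated_language)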
Example no. 43
from nltk.tokenize import word_tokenize
import random
from nltk.corpus import names
import nltk.data
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords
import nltk.corpus
# 1. Write a Python NLTK program to list down all the corpus names.
dir(nltk.corpus)
print("\nAvailable corpus names:")
print(dir(nltk.corpus))

# 2. Write a Python NLTK program to get a list of common stop words in various languages in Python.
print(stopwords.fileids())

# 3. Write a Python NLTK program to check the list of stopwords in various languages.
# From Wikipedia:
# In computing, stop words are words which are filtered out before or after processing of natural language data(text). Though "stop words" usually refers to the most common words in a language, there is no single universal list of stop words used by all natural language processing tools, and indeed not all tools even use such a list. Some tools specifically avoid removing these stop words to support phrase search.
# Any group of words can be chosen as the stop words for a given purpose. For some search engines, these are some of the most common, short function words, such as the, is, at, which, and on. In this case, stop words can cause problems when searching for phrases that include them, particularly in names such as "The Who", "The The", or "Take That". Other search engines remove some of the most common words-including lexical words, such as "want"-from a query in order to improve performance.
result = set(stopwords.words('english'))
print("List of stopwords in English:")
print(result)
print("\nList of stopwords in Arabic:")
result = set(stopwords.words('arabic'))
print(result)
print("\nList of stopwords in Azerbaijani:")
result = set(stopwords.words('azerbaijani'))
print(result)
print("\nList of stopwords in Danish:")
Example no. 44
import nltk
from nltk.corpus import stopwords

print stopwords.fileids()
stops = set(stopwords.words('english'))

print stops

words = "Don't hesitate to ask questions".split(" ")

print [word for word in words if word not in stops]


def para_fraction(text):
    s = set(stopwords.words('english'))
    para = [w for w in text if w.lower() not in s]
    return len(para) / float(len(text))  # avoid integer division under Python 2


print nltk.corpus.reuters.words()
print para_fraction(nltk.corpus.reuters.words())
print para_fraction(nltk.corpus.inaugural.words())
Example no. 45
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, PunktSentenceTokenizer
from nltk.corpus import webtext

## Using default (treebank) tokenizers
para = "Hello World. It's good to see you. Thanks for buying this book."
print(sent_tokenize(para))
print(word_tokenize(para))  ## punctuation is treated as a separate word

## Alternative word tokenizer
tokenizer = WordPunctTokenizer()  ## Punctuation is a separate word
print(tokenizer.tokenize(para))

## Make your sentence tokenizer - based on unsupervised learning
text = webtext.raw('overheard.txt')  ## Read corpus example
sent_tokenizer1 = PunktSentenceTokenizer(text)  ## Train tokenizer
sent1 = sent_tokenizer1.tokenize(text)  ## Use new tokenizer
sent = sent_tokenize(text)  ## Old tokenizer

#### check difference between tokenizers
print("Default tokenizer:\n",
      sent[678])  ## Fails to tokenize sentences properly
print("Learned tokenizer:\n", sent1[678])  ## Works well

#### STOPWORDS ######################################################################
from nltk.corpus import stopwords

## Find languages
stopwords.fileids()  ## Which languages
stopwords.words('english')[1:10]

## Filtering stop words
stopset = set(stopwords.words('english'))
words = ["Can't", 'is', 'a', 'contraction']
print([word for word in words if word not in stopset])