Example no. 1
def get_freq_dist(recv, send, fd=None, dcount_smile=None, classes=None):
    """
    Find word frequency distribution and count smile in the given text.

    Parameters
    ----------
    recv : multiprocessing.Connection
        Read only
    send : multiprocessing.Connection
        Write only
    fd : dict
        Word frequency distributions
    dcount_smile : dict
        Smile counters
    """
    stopwords = frozenset(nltk.corpus.stopwords.words('italian')).union(
        frozenset("[]':,(){}.?!*\"")).union(frozenset(("==", "--")))
    tokenizer = nltk.PunktWordTokenizer()

    if not classes:
        classes = ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal user',
                   'all')

    # prepare a dict of empty Counters, one for every class
    if not fd:
        fd = {cls: Counter() for cls in classes}
    if not dcount_smile:
        dcount_smile = {cls: Counter() for cls in classes}

    while 1:
        try:
            cls, msg = recv.recv()
        except TypeError:  # end
            for cls in set(classes).difference(('all', )):
                fd['all'].update(fd[cls])
                dcount_smile['all'].update(dcount_smile[cls])

            # send the word counters (top 1000 per class) to the main process
            send.send([(cls, freq.most_common(1000))
                       for cls, freq in fd.iteritems()])
            # send smile counters to the main process
            send.send([(cls, counters.items())
                       for cls, counters in dcount_smile.iteritems()])

            return

        msg = remove_templates(msg.encode('utf-8'))

        count_smile = find_smiles(msg)
        dcount_smile[cls].update(count_smile)

        tokens = tokenizer.tokenize(nltk.clean_html(msg.lower()))

        tokens = [t for t in tokens if t not in stopwords]
        fd[cls].update(tokens)
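
A minimal sketch of how a worker like get_freq_dist might be driven from the parent process, following the multiprocessing setup implied by the docstring; the channel names and the None sentinel are assumptions (unpacking None raises the TypeError the worker treats as end of input).

from multiprocessing import Process, Pipe

# Hypothetical driver for the worker above.
msg_recv, msg_send = Pipe(duplex=False)   # worker reads messages from msg_recv
res_recv, res_send = Pipe(duplex=False)   # worker writes results to res_send

worker = Process(target=get_freq_dist, args=(msg_recv, res_send))
worker.start()

msg_send.send(('normal user', u'Ciao, come stai? :-)'))
msg_send.send(None)            # sentinel: unpacking fails, worker flushes counters

word_freqs = res_recv.recv()   # [(class, up to 1000 most common (word, count) pairs), ...]
smile_freqs = res_recv.recv()  # [(class, smiley counts), ...]
worker.join()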
Example no. 2
def processOneIngredient(ing, ingDict, allIng, measures):
    # Parse one ingredient line into quantity, units, name, description and
    # preparation, then record it in the shared allIng dict.
    tokens = nltk.PunktWordTokenizer().tokenize(ing)
    if tokens == []:
        return allIng
    num, units = extractQM(tokens, ingDict, measures)
    desc, name, prep = extractIngredient(tokens, ingDict, units, num)
    weight = calculateWeight(name, ingDict, num, units)
    units = stripChars(units)
    desc = stripChars(desc)
    name = stripChars(name)
    prep = stripChars(prep)
    allIng[name] = {
        "name": name,
        "weight": weight,
        "quantity": num,
        "measurement": units,
        "description": desc,
        "preparation": prep
    }
    return allIng
Example no. 3
def main(query, lang):
    langMap = {'es': 'Spanish', 'en': 'English'}
    stemmer = nltk.stem.snowball.SnowballStemmer(langMap[lang].lower())
    j = wikiApi.get_article(query, lang)
    wordDict = {}
    for page in j:
        t = wikiParser(j[page]['content'])
        for header in t.headers:
            try:
                stemmedHeader = stemmer.stem(header)
            except Exception, e:
                print str(e)
                header = unidecode(header)
                stemmedHeader = stemmer.stem(header)
            if stemmedHeader in wordDict:
                wordDict[stemmedHeader]['count'] += 1
            else:
                wordDict[stemmedHeader] = {'count': 1, 'form': stemmedHeader}
        text = t.text
        print type(text)
        tokens = [
            k.split('|')[0] for k in nltk.PunktWordTokenizer().tokenize(text)
            if re.match('[a-zA-Z]', k)
        ]
        words = [
            w.lower() for w in tokens if w.encode('utf-8').lower() not in
            nltk.corpus.stopwords.words(langMap[lang].lower())
        ]
        print len(words)
        for w in words:
            try:
                st = stemmer.stem(w)
            except Exception:
                # fall back to a transliterated copy instead of dropping the word
                w = unidecode(w)
                st = stemmer.stem(w)
            if st in wordDict:
                wordDict[st]['count'] += 1
            else:
                wordDict[st] = {'count': 1, 'form': w}
Example no. 4
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import nltk

stopwords = nltk.corpus.stopwords.words('english')
tokenizer = nltk.PunktWordTokenizer()
stemmer = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()


def process_text(text):
    text = text.lower()
    # Tokenizing
    tokens = [
        token for token in tokenizer.tokenize(text) if token not in stopwords
    ]
    # Stemming
    tokens = map(stemmer.stem, tokens)
    # # Lemmatizing
    # tokens = map(lemmatizer.lemmatize, tokens)

    return tokens


if __name__ == '__main__':

    df = pd.read_csv('dataset.csv', nrows=80000, error_bad_lines=False)

    tagged_tokens = []
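
For reference, a minimal sketch of how the snippet might continue; the 'text' column name and the nltk.pos_tag step are assumptions, since the original example stops right after initializing tagged_tokens.

# Hypothetical continuation: the column name and the tagging step are assumptions.
for text in df['text'].dropna():
    tokens = process_text(text)
    tagged_tokens.append(nltk.pos_tag(tokens))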
Example no. 5
    def __init__(self, **kwargs):
        super(HistoryWordsPageProcessor, self).__init__(**kwargs)
        self.tokenizer = nltk.PunktWordTokenizer()
        self.stopwords = frozenset(nltk.corpus.stopwords.words('italian'))

        self.counter_desired_words = nltk.FreqDist()
Example no. 6
File: pos.py Project: javipus/NLP
LOG_FORMAT = "%(asctime).19s %(levelname)s %(filename)s: %(lineno)s %(message)s"

#tagger = stanford.StanfordTagger('/media/data/NER/stanford/pos/models/left3words-wsj-0-18.tagger',
#                                 '/media/data/NER/stanford/pos/stanford-postagger.jar',
#                                 encoding='utf-8')
tagger = senna.SennaTagger('/media/petra/NER/senna-v2.0', encoding='utf-8')

i = 0
size = 0
samples = []
lock = Lock()

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tree_tokenizer = nltk.TreebankWordTokenizer()
word_punct_tokenizer = nltk.WordPunctTokenizer()
punkt_word_tokenizer = nltk.PunktWordTokenizer()
whitespace_tokenizer = nltk.WhitespaceTokenizer()


def tokenize(text):
    sentences = filter(lambda x: x, sent_tokenizer.tokenize(text.strip()))
    tokens = [
        punkt_word_tokenizer.tokenize(sentence) for sentence in sentences
    ]
    return tokens


def process(labeled_comments):
    global i
    ids, comments, langs, users, page_ids, page_titles, times, levels = zip(
        *labeled_comments)
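
A portability note, since every example above depends on nltk.PunktWordTokenizer: that class shipped with NLTK 2.x and is no longer available in NLTK 3. On a current install the closest substitutes are nltk.WordPunctTokenizer (regex-based, splits on punctuation runs) or nltk.word_tokenize (requires the 'punkt' data); a minimal sketch of the swap, assuming NLTK 3:

import nltk

# Approximate replacements for the removed PunktWordTokenizer; token
# boundaries differ slightly, so downstream counts may change.
tokens = nltk.WordPunctTokenizer().tokenize("Don't count the smileys :-) twice.")
# or, using the default word tokenizer (needs the 'punkt' resource):
tokens = nltk.word_tokenize("Don't count the smileys :-) twice.")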