Example #1
def getTokens(texts):
    #global corpusTokens
    #global docsTokens
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()

    allTokens = []
    #tokens=[]
    if type(texts) != type([]):
        texts = [texts]
    for s in texts:
        #toks = nltk.word_tokenize(s.lower())
        toks = tokenizer.tokenize(s)
        allTokens.extend(toks)
        #corpusTokens.extend(toks)
        #docsTokens.append(toks)

    allTokens_2 = [
        t.lower() for t in allTokens
        if len(t) > 2 and t.isalnum() and t not in stopwordsList
    ]
    #allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    #allTokens_stw = [t3 for t3 in allTokens if t3 not in stopwordsList]
    #allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word)
                      for word in allTokens_2]  #allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
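
A minimal usage sketch for getTokens above. The example does not show its imports or how stopwordsList is defined, so the lines below assume the NLTK English stop word list and the usual NLTK imports:

# Assumed setup for the example above (not part of the original).
import nltk                                   # may need nltk.download('stopwords') once
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer

stopwordsList = nltk.corpus.stopwords.words('english')

print(getTokens("The quick brown foxes are jumping over the lazy dogs."))
# roughly: ['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']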
Example #2
 def get_words(klass, text):
     tokenizer = WordPunctTokenizer()
     words = Counter()
     for word in tokenizer.tokenize(text):
         word = word.lower()
         if len(word) > 2 and word not in STOPWORDS:
             words[word] += 1
     return words
Example #3
 def getEntity(self, word, originalcase):
     tokenizer = WordPunctTokenizer()
     for sentence in self.raw_sentences:
         words = tokenizer.tokenize(sentence.strip())
         if originalcase in words:
             entity = self.getEntityFromSentence(originalcase, words)
             if len(entity) > 1:
                 return " ".join(entity)
     return originalcase
Example #4
def getDocTokens(docText):
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    docTokens = tokenizer.tokenize(docText)
    
    allTokens_2 = [t.lower() for t in docTokens if len(t)>2]
    allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
Example #5
def getTokens(doc): 
    #global stemmer
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
      
    webpage = open(doc,'r').read()
    text = nltk.clean_html(webpage)  # note: clean_html() was removed in NLTK 3.x; see the BeautifulSoup sketch below
    #tokens = nltk.word_tokenize(text)
    tokens = tokenizer.tokenize(text)
    clean = [token.lower() for token in tokens if token.lower() not in stopwords]
    final = [stemmer.stem(word) for word in clean]
    return final
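
Note that nltk.clean_html() was removed in NLTK 3.x and now only raises an error pointing to BeautifulSoup. A rough stand-in for the HTML-stripping step, assuming the beautifulsoup4 package is available:

# Sketch of a clean_html() replacement; assumes beautifulsoup4 is installed.
from bs4 import BeautifulSoup

def html_to_text(path):
    # strip tags and keep the visible text, roughly what nltk.clean_html() did
    with open(path, 'r') as f:
        return BeautifulSoup(f.read(), 'html.parser').get_text()

# in getTokens() above, text = html_to_text(doc) would replace the clean_html call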
Example #6
def getTokenizedDocs(docs):
    docs_tokens=[]
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    for text in docs:
        tokens = tokenizer.tokenize(text)
        clean = [token for token in tokens if token.isalnum()]
        clean = [token.lower() for token in clean if token.lower() not in stopwords] 
        clean = [token for token in clean if len(token) > 2]
        final = [stemmer.stem(word) for word in clean]
        docs_tokens.append(final)
    return docs_tokens
Example #7
class TrollAnalyzer:
    def __init__(self, model):
        self.model = model
        self.tokenizer = WordPunctTokenizer()
        self.scale_score = lambda k: k * 2 - 1

    def analyze(self, sentence):
        words = self.tokenizer.tokenize(sentence)
        predictions = {
            word: self.scale_score(self.model.predict_proba([word])[0][1])
            for word in words
        }
        total = self.scale_score(self.model.predict_proba([sentence])[0][1])
        return {"master": total, "tokenized": predictions}
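
The analyzer above expects a model whose predict_proba accepts raw strings. The original does not show how that model is built; a minimal sketch using a scikit-learn pipeline on toy data (hypothetical setup, not the author's model):

# Hypothetical model for TrollAnalyzer; assumes scikit-learn is installed.
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

train_texts = ["you are great", "you are an idiot"]   # toy training data
train_labels = [0, 1]                                  # 1 = troll-like

model = make_pipeline(CountVectorizer(), LogisticRegression())
model.fit(train_texts, train_labels)

analyzer = TrollAnalyzer(model)
print(analyzer.analyze("you are an idiot"))
# {'master': <score in [-1, 1]>, 'tokenized': {word: <score in [-1, 1]>, ...}}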
Example #8
 def convertDoctoTFIDF(self,doc):
     stemmer = PorterStemmer()
     tokenizer = WordPunctTokenizer()
     stopwords_e = stopwords.words('english')
     tokens = tokenizer.tokenize(doc)
     clean = [token for token in tokens if token.isalnum()]
     clean = [token.lower() for token in clean if token.lower() not in stopwords_e] 
     clean = [token for token in clean if len(token) > 2]
     final_doc = [stemmer.stem(word) for word in clean]
     doc_tfidf=[]
     words = self.model[1]
     for i in range(0,len(words)):            
         tf = final_doc.count(words[i])  
         doc_tfidf.append((tf,words[i]))
     return doc_tfidf
Example #9
 def convertDoctoTFIDF(self,doc):
     stemmer = PorterStemmer()
     tokenizer = WordPunctTokenizer()
     stopwords_e = stopwords.words('english')
     tokens = tokenizer.tokenize(doc)
     clean = [token for token in tokens if token.isalnum()]
     clean = [token.lower() for token in clean if token.lower() not in stopwords_e] 
     clean = [token for token in clean if len(token) > 2]
     final_doc = [stemmer.stem(word) for word in clean]
     doc_tfidf=[]
     words = self.model[1]
     for i in range(0,len(words)):            
         tf = final_doc.count(words[i])  
         doc_tfidf.append((tf,words[i]))
     return doc_tfidf
Example #10
def getTokenizedDoc(doc):
    #docs_tokens=[]
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    #stopwords.extend(["last","time","week","favorite","home","search","follow","year","account","update","com","video","close","http","retweet","tweet","twitter","news","people","said","comment","comments","share","email","new","would","one","world"])
    stopwords.extend(["com","http","retweet","tweet","twitter"])
    tokens = tokenizer.tokenize(doc)
    tokens = [token for token in tokens if len(token) > 2]
    clean = [token for token in tokens if token.isalnum()]
    clean = [token.lower() for token in clean if token.lower() not in stopwords] 
    #clean = [token for token in clean if len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    final = [t for t in final if t not in stopwords]
    #docs_tokens.append(final)
    return final
Example #11
    def get_texts(self):
        """
        Parse documents from the .cor file provided in the constructor. Lowercase
        each document and ignore some stopwords.

        .cor format: one document per line, words separated by whitespace.
        """
        tokenizer = WordPunctTokenizer()
        #print CorpusCSV.stoplist
        #return self.getstream()
        for doc in self.getstream():
            #yield [word for word in doc.lower().split()]
                    #if word not in CorpusMiislita.stoplist]
            #yield doc
            yield [word for word in tokenizer.tokenize(doc.lower())
                   if word_ok(word)]
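
word_ok() is not defined in this example. A plausible stand-in, consistent with the filtering used in the other examples on this page (hypothetical, not the original helper):

# Hypothetical word_ok(); the original implementation is not shown.
from nltk.corpus import stopwords

STOPLIST = set(stopwords.words('english'))

def word_ok(word):
    # keep alphanumeric tokens longer than 2 characters that are not stop words
    return len(word) > 2 and word.isalnum() and word not in STOPLIST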
Example #12
 def extract_words(self, wid):
     """Updates db with previously unseen words and lemmas, and page unigrams"""
     words_file = gzip.open(self.words_file, 'a')
     page_file = gzip.open(os.path.join(self.wiki_filepath, '%i.gz' % wid), 'w')
     w = WordPunctTokenizer()
     qi = QueryIterator(get_config(), {'query': 'wid:%s AND iscontent:true' % str(wid), 'fields': 'id, wid, pageid, html_en', 'sort': 'id asc'})
     print('starting extraction for wiki %s...' % str(wid))
     for doc in qi:
         print('extracting words for %s...' % doc['id'])
         page_file.write('\t%s\n' % doc['pageid'])
         for word in w.tokenize(doc.get('html_en', '').lower()):
             if word not in self.words:
                 self.words[word] = self.counter
                 words_file.write('%i\t%s\n' % (self.counter, word.encode('utf-8')))
                 self.counter += 1
             page_file.write('%i\n' % self.words.get(word, 0))
     page_file.close()
     words_file.close()
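
The method writes the vocabulary as tab-separated id/word lines to a gzip file. A small, hypothetical reader that rebuilds the id-to-word mapping from that format, assuming Python 3:

# Sketch: rebuild the word-id mapping written by extract_words() above.
import gzip

def load_words(words_path):
    mapping = {}
    with gzip.open(words_path, 'rt', encoding='utf-8') as f:
        for line in f:
            idx, _, word = line.rstrip('\n').partition('\t')
            mapping[int(idx)] = word
    return mapping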
Example #13
def getTokenizedDoc(doc):
    #docs_tokens=[]
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    stopwords = nltk.corpus.stopwords.words('english')
    #stopwords.extend(["last","time","week","favorite","home","search","follow","year","account","update","com","video","close","http","retweet","tweet","twitter","news","people","said","comment","comments","share","email","new","would","one","world"])
    stopwords.extend(["com", "http", "retweet", "tweet", "twitter"])
    tokens = tokenizer.tokenize(doc)
    tokens = [token for token in tokens if len(token) > 2]
    clean = [token for token in tokens if token.isalnum()]
    clean = [
        token.lower() for token in clean if token.lower() not in stopwords
    ]
    #clean = [token for token in clean if len(token) > 2]
    final = [stemmer.stem(word) for word in clean]
    final = [t for t in final if t not in stopwords]
    #docs_tokens.append(final)
    return final
Example #14
def rouge(candidate, reference, n=2, verbose=False):
    """This is a basic implementation of ROUGE-N.  It calculates the
    n-gram recall of a candidate summary against a reference
    summary.

    """
    tokenizer = WordPunctTokenizer()
    candidate = tokenizer.tokenize(candidate.lower())
    reference = tokenizer.tokenize(reference.lower())
    c_ngrams = set(ngrams(candidate, n))
    r_ngrams = set(ngrams(reference, n))
    cr_ngrams = [g for g in c_ngrams if g in r_ngrams]
    rouge_n = len(cr_ngrams) / len(r_ngrams)
    if verbose:
        print("{:d} matching {:d}-grams out of {:d} total.".format(
            len(cr_ngrams), n, len(r_ngrams)))
        print(cr_ngrams)
        print("ROUGE-{:d}: {:0.3f}".format(n, rouge_n))
    return rouge_n
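
A quick check of rouge() on a toy pair of summaries; it assumes the function above was defined with WordPunctTokenizer and nltk.util.ngrams in scope (the imports are not shown in the example):

from nltk.util import ngrams                  # assumed import for the example above
from nltk.tokenize import WordPunctTokenizer  # assumed import for the example above

candidate = "the cat sat on the mat"
reference = "the cat lay on the mat"
print(rouge(candidate, reference, n=2))
# reference bigrams: (the,cat), (cat,lay), (lay,on), (on,the), (the,mat);
# three of the five also appear in the candidate, so ROUGE-2 recall is 0.6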
Example #15
class Scorer(object):
    #docs =[]
    #docs_length=[]
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.keywords = []
        self.score = 0

    def cleanDoc(self, doc):

        tokens = self.tokenizer.tokenize(doc)
        clean = [
            token.lower() for token in tokens
            if token.lower() not in self.stopwords and len(token) > 2
        ]
        final = [self.stemmer.stem(word) for word in clean]
        return final

    #def __init__(self,seedUrls):
    def __init__(self, keywords):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.score = 0
        self.keywords = keywords
        #self.keywords = []
        '''for url in seedUrls:
            page = Webpage(url)
            data = self.cleanDoc(page.text)
            for d in data:
                self.keywords.append(d)'''
# This function checks whether the url's words contain the keywords.
# The score is the fraction of the keywords that occur in the url's text.

    def calculate_score(self, url):
        words = url.getAllText().split()
        for w in self.keywords:
            if w in words:
                self.score += 1
        self.score = self.score / float(len(self.keywords))
        return self.score
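
A minimal usage sketch for the scorer above. url.getAllText() belongs to whatever page abstraction the crawler uses, so a tiny hypothetical stand-in is used here:

# Hypothetical page object; only getAllText() is needed by calculate_score().
class FakePage:
    def __init__(self, text):
        self._text = text
    def getAllText(self):
        return self._text

scorer = Scorer(["python", "nltk", "tokenizer"])
page = FakePage("an nltk tokenizer tutorial in python")
print(scorer.calculate_score(page))   # 3 of 3 keywords found -> 1.0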
Example #16
class Scorer(object):
    #docs =[]
    #docs_length=[]
    def __init__(self):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.keywords = []
        self.score = 0

    def cleanDoc(self,doc):
        
        tokens = self.tokenizer.tokenize(doc)
        clean = [token.lower() for token in tokens if token.lower() not in self.stopwords and len(token) > 2]
        final = [self.stemmer.stem(word) for word in clean]
        return final
    
    #def __init__(self,seedUrls):
    def __init__(self,keywords):
        self.stemmer = PorterStemmer()
        self.stopwords = nltk.corpus.stopwords.words('english')
        self.tokenizer = WordPunctTokenizer()
        self.score = 0
        self.keywords = keywords
        #self.keywords = []
        '''for url in seedUrls:
            page = Webpage(url)
            data = self.cleanDoc(page.text)
            for d in data:
                self.keywords.append(d)'''
# This function checks whether the url's words contain the keywords.
# The score is the fraction of the keywords that occur in the url's text.
    def calculate_score(self,url):
        words = url.getAllText().split()        
        for w in self.keywords:
            if w in words:
                self.score +=1
        self.score = self.score / float(len(self.keywords))
        return self.score
Example #17
def getTokens(texts):
    #global corpusTokens
    #global docsTokens
    stemmer = PorterStemmer()
    tokenizer = WordPunctTokenizer()
    
    allTokens=[]
    #tokens=[]
    if type(texts) != type([]):
        texts = [texts]
    for s in texts:
        #toks = nltk.word_tokenize(s.lower())
        toks = tokenizer.tokenize(s)
        allTokens.extend(toks)
        #corpusTokens.extend(toks)
        #docsTokens.append(toks)
   
    allTokens_2 = [t.lower() for t in allTokens if len(t)>2]
    allTokens_an = [t2 for t2 in allTokens_2 if t2.isalnum()]
    allTokens_stw = [t3 for t3 in allTokens_an if t3 not in stopwordsList]
    allTokens_stem = [stemmer.stem(word) for word in allTokens_stw]
    final = [t for t in allTokens_stem if t not in stopwordsList]
    return final
Example #18
def document_as_words(document):
    """
    There is probably an NLTK function somewhere that does this already, but I
    couldn't find it.
    
    So this just converts a single document into a list of words which you
    can then use with the rest of these functions, to get a feature list
    which you can then classify.
    """ 
    stringbuf = StringIO.StringIO(document)
    
    word_tokenizer = WordPunctTokenizer()
    para_tokenizer = read_blankline_block
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
    words = []
    
    for para in para_tokenizer(stringbuf):
        for sent in sent_tokenizer.tokenize(para):
            for word in word_tokenizer.tokenize(sent):
                words.append(word)
                
    return words
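
A Python 3 sketch of the same idea (the original relies on the Python 2 StringIO module); note that read_blankline_block returns one paragraph per call, so it has to be called in a loop to cover the whole document:

# Python 3 sketch; assumes the punkt model is available (nltk.download('punkt')).
import io
import nltk
from nltk.corpus.reader.util import read_blankline_block
from nltk.tokenize import WordPunctTokenizer

def document_as_words_py3(document):
    word_tokenizer = WordPunctTokenizer()
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    stream = io.StringIO(document)
    words = []
    while True:
        block = read_blankline_block(stream)   # one paragraph per call, [] at end
        if not block:
            break
        for sent in sent_tokenizer.tokenize(block[0]):
            words.extend(word_tokenizer.tokenize(sent))
    return words

print(document_as_words_py3("First paragraph. Two sentences.\n\nSecond paragraph."))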
Example #19
def stem_text(text):
    tokenizer = WordPunctTokenizer()
    stemmer = SnowballStemmer('french')
    liste_racines = [stemmer.stem(token) for token in tokenizer.tokenize(text)]
    return ' '.join(liste_racines)
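
A quick usage example for the French stemming helper above; it assumes the imports the function relies on (WordPunctTokenizer from nltk.tokenize, SnowballStemmer from nltk.stem.snowball):

racines = stem_text("Les chanteuses chantaient des chansons")
print(racines)   # a space-joined string of French word stems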
Example #20
    def handle(self, *args, **options):
        super(Command, self).handle(self, *args, **options)

        for database in self.selected_dbs:

            print("Processing database " + database)
            cursor = connections[database].cursor()

            get_authors_query = """
                SELECT name, long_name
                  FROM author
            """

            get_comments_query = """
                SELECT id, content FROM comment
            """

            cursor = connections[database].cursor()
            tokenizer = WordPunctTokenizer()

            self.stdout.write("Grabbing authors...")
            authors = []
            cursor.execute(get_authors_query)
            self.pbar_setup(maxval=cursor.rowcount)
            for row in dictfetch(cursor):
                authors.append(row['name'])
                if row['long_name'] != "":
                    authors.append(row['long_name'])
                self.pbar_increment()
            self.pbar_destroy()

            self.stdout.write("Sorting authors...")
            authors.sort()

            self.stdout.write("Determining real parents...")
            real_parents = []
            cursor1 = connections[database].cursor()
            cursor1.execute(get_comments_query)
            self.pbar_setup(maxval=cursor1.rowcount)
            for row in dictfetch(cursor1):
                tokens = tokenizer.tokenize(row['content'])
                if len(tokens) > 0:
                    if tokens[0] == '@':
                        real_parents.append(int(row['id']))
                    else:
                        i = bisect_left(authors, tokens[0])
                        if i != len(authors) and authors[i] == tokens[0]:
                            real_parents.append(int(row['id']))

                self.pbar_increment()
            self.pbar_destroy()

            self.stdout.write("Non-Real-parents found: {}".format(
                len(real_parents)))

            cursor2 = connections[database].cursor()
            update_query = """
                UPDATE comment 
                   SET real_parent = (CASE WHEN id in ({}) THEN 0 ELSE 1 END)            
            """.format(('%s,' * len(real_parents)).rstrip(','))
            cursor2.execute(update_query, real_parents)

        self.stdout.write(self.style.SUCCESS('Command executed successfully'))
Example #21
#!/usr/bin/env python
import sys

from nltk.tokenize.regexp import WordPunctTokenizer

filename = sys.argv[1]

with open(filename, 'r') as f:
    contents = f.read()

wptk = WordPunctTokenizer()
tokenized = wptk.tokenize(contents)

lower_list = []
for word in tokenized:
    l = word.lower()
    lower_list.append(l)

from collections import Counter
counts = Counter(lower_list)
#counts_list = sorted(lower_list, key=lambda x: (counts[x], x), reverse=True)

print(counts)
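
The commented-out line above hints at wanting the tokens ordered by frequency; Counter.most_common() gives that directly:

# Sorted view of the same counts, most frequent tokens first.
for token, count in counts.most_common(20):
    print(token, count)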