Example #1
 def test_words(self):
     # Assert the word split algorithm (by default, newlines are treated as spaces and numbers are ignored).
     s = "The cat sat on the\nmat. 1 11."
     v = vector.words(s, filter=lambda w: w.isalpha())
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
     # Assert custom word filter.
     v = vector.words(s, filter=lambda w: True)
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
     print("pattern.vector.words()")
Example #3
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(" ".join(comments)))
     sorted_keywords = sorted(keywords.items(), key=operator.itemgetter(1), reverse=True)
     return sorted_keywords
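A standalone sketch of the same keyword count outside the class; the sample comment list is made up for illustration:

from pattern.vector import words, count
import operator

comments = ["Great write-up, thanks!", "Thanks, this helped a lot."]  # made-up sample comments
keywords = count(words(" ".join(comments)))
print(sorted(keywords.items(), key=operator.itemgetter(1), reverse=True))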
Example #4
def build_model(results=None):
    results = results or []
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
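A smaller, self-contained sketch of the same idea, a binary sentence-by-feature matrix over a pattern.vector Model; the two sample documents and the naive split on '.' are assumptions made for illustration:

import numpy as np
from pattern.vector import Document, Model, TFIDF, LEMMA, words, stem

results = [  # made-up sample documents
    {'url': 'doc1', 'index': 0, 'text': 'Cats chase mice. Mice hide from cats.'},
    {'url': 'doc2', 'index': 1, 'text': 'Dogs chase cats. Dogs dig holes.'},
]
m = Model([Document(r['text'], name=r['url'], description=r['index'], stemmer=LEMMA)
           for r in results], weight=TFIDF)

rows = []
for r in results:
    for sentence in r['text'].lower().split('.'):              # crude sentence split
        s_words = {stem(w, LEMMA) for w in words(sentence)}     # lemmatized word set
        if s_words:
            rows.append([1 if w in s_words else 0 for w in m.features])

matrix = np.array(rows)  # one row per sentence, one column per model feature
print(matrix.shape, len(m.features))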
Example #5
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # iterate over the cloud's list of links
             # print(cloud.graph.node[n]['link'])
             pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Example #6
 def __init__(self, data, url=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
Example #7
def roots_and_lemmas():

    print(stem('cars', PORTER))  #Root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # Root
    print(stem('studies', LEMMA))

    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
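For reference, with pattern's bundled stemmers the four stem() calls above are expected to print the following (worth re-checking against the installed pattern version):

print(stem('cars', PORTER))     # 'car'
print(stem('cars', LEMMA))      # 'car'
print(stem('studies', PORTER))  # 'studi' (a root, not a dictionary word)
print(stem('studies', LEMMA))   # 'study' (a lemma, a dictionary word)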
Example #8
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # iterate over the cloud's list of links
             print(cloud.graph.node[n]['link'])
             pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(
                 count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Example #9
 def start(self):
     cloudSize = dameCloudSize(self.id_request)
     cloudSize = cloudSize[0][0]
     searchKey = dameSerchKey(self.id_request)
     searchKey = searchKey[0][0]
     step = 0
     while step <= 5:  # set get_stop later; this indicates the number of levels
         for id_cloud in dameIdCloud(self.id_request):  # gets the IDs of the clouds that belong to the project
             print("Id Cloud: " + str(id_cloud[0]))
             cloud = self.generar_cloud(dameNodo(id_cloud[0]))
             true_nodes = self.trueNodesSelection(cloud)
             for n in true_nodes:
                 try:
                     cloud.graph.node[n]['select'] = False
                     crawler = SimpleCrawler1(n, delay=0.1)
                     crawler.newStructure(cloud.graph)
                     time = 0
                 except:
                     continue
                 while len(crawler.visited) < cloudSize:
                     print("Cloudsize = " + str(cloudSize) +
                           " Crawler Visited = " + str(len(crawler.visited)) +
                           " Level = " + str(step))
                     print('Exploring ...')
                     crawler.crawl(method=None)
                     time += 1
                     if time > cloudSize * 10:
                         break
                 actualizarSelect(cloud.graph.node[n]['ID'],
                                  cloud.graph.node[n]['select'])
                 print()
                 print('##### Generating documents #####')
                 # Creation of the minePackage
                 clouds = list()
                 clouds.append(cloud)
                 minePackage = dict()
                 minePackage['clouds'] = clouds
                 minePackage['searchKey'] = searchKey
                 minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))),
                                                         stemmer=PORTER)
                 self.IRController.start(minePackage)  # retrieves information
                 # SCRAPER CONTROLLER STILL MISSING
             # Set to None so they do not take up unnecessary space; everything has already been saved to the DB
             minePackage = None
             cloud = None
             gc.collect()
         step += 1
         print("Exploring level no. " + str(step))
         # Controls the levels to expand, in this case 10
     print("Process finished")
Example #10
 def __init__(self, data, url="", contenidoBd=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(
             url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         if (contenidoBd != ""):
             self.contenidoConEtiquetas = contenidoBd
             self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
         else:
             self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))),
                       stemmer=PORTER)
Example #11
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs  # this is cheap
    return dict
Example #12
 def tokenizer(self, url):
     # text = 'The black cat was spying on the white cat.'
     # stemmer=None, stemmer=LEMMA, stemmer=PORTER
     # print(count(words(pageContent), stemmer=PORTER))
     # print(count(words(pageContent), stemmer=LEMMA))

     # url_content = UrlToplainTextConverter()
     # page_content = url_content.plainTextConverter(url)
     page_content = url
     s = Sentence(parse(page_content))
     tokenized_file = count(words(s), stemmer=PORTER)
     print()
     print(tokenized_file)
     print()
Example #14
 def processor(self, minePackage):
     # print('####SEARCH_KEY:', minePackage['searchKey'])
     s = Sentence(parse(minePackage['searchKey']))
     minePackage['searchKey'] = count(words(s), stemmer=PORTER)
     return minePackage['searchKey']
Example #15
while len(links) > 0:

    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True

        # Parse links from article.

        for link in article.links:

            if link not in seen:
                links.add(link)

        # Parse words from article. Count words.

        for word in words(article.string):

            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
        print(sum(frequency.values()), article.title)

    except:
        pass

    # Collect a reliable amount of words (e.g., 1M).

    if sum(frequency.values()) > 1000000:
        break

#top = sorted((count, word) for word, count in frequency.items())
Example #16
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from pattern.vector import words, count, stem, PORTER, LEMMA, chngrams, Document, Vector, distance, Model, TFIDF,\
    HIERARCHICAL, Cluster, NB, kfoldcv, KNN, EUCLIDEAN, TF, SVM, RADIAL, gridsearch, GA
from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.db import csv
from random import choice

# word count
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`'
                       '\"@#$^&*+-|=~_')
    # returns a list of words by splitting the string on spaces.
    freq_dic = count(  # takes a list of words and returns a dictionary of (word, count)-items.
        words=words_list,
        top=None,  # Filter words not in the top most frequent (int).
        threshold=0,  # Filter words whose count <= threshold.
        stemmer=None,  # PORTER | LEMMA | function | None
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
for k, v in freq_dic.items():
    print(k, v)
# stop words and stemming
print(stem('spies', stemmer=PORTER))
print(stem('spies', stemmer=LEMMA))
s = 'The black cat was spying on the white cat.'
print(count(words(s), stemmer=PORTER))
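A possible follow-up, not in the original script: listing the ten most frequent words from freq_dic using only the standard library.

top10 = sorted(freq_dic.items(), key=lambda kv: kv[1], reverse=True)[:10]
for word, n in top10:
    print(word, n)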
Example #17
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
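One way this helper might be used, comparing two short strings by cosine similarity; Vector and distance come from pattern.vector, and the sample strings are made up:

from pattern.vector import Vector, distance

v1 = Vector(v("the black cat sat on the mat"))   # made-up sample strings
v2 = Vector(v("a white cat sat on a black mat"))
print(1 - distance(v1, v2))  # distance() returns cosine distance, so this is cosine similarity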
Example #19
def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))
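A note on the output: count() lowercases tokens and, with its default stopwords=False, drops common stop words, so the two calls below differ; the sentence is the one used above.

tokens = words("My new car is better than my new bed")
print(count(tokens))                  # lowercased, stop words ('my', 'is', ...) dropped
print(count(tokens, stopwords=True))  # stop words kept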
Example #20
        for row in all_q:
            row = list(filter(None, row))  # remove nulls
    
            def fluency(questions):
                return len(questions)
            
            def elaboration(questions):
                return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)
                
            def variance(cluster):
                return avg([distance(centroid(cluster), v) for v in cluster])
    
            vectors = []
                
            for q in all_q:
                v = count(words(q), stemmer='lemma') 
                v = Vector(v)
                vectors.append(v)
                
            clusters = hierarchical(vectors, k=250, distance='cosine')
            clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters] 
            clusters = sorted(clusters, key=variance)
            
            categories = {}
            
            for i, cluster in enumerate(clusters):
                for v in cluster: 
                    categories[row[vectors.index(v)]] = i

            def flex(questions):
                ml_categories = []
Example #21
 def countWords(self):
     wordDict = count(
         words(plaintext(self.content),
               filter=lambda w: w.strip("'").isalpha()))
     return Counter(wordDict)
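A standalone sketch of the same counting step applied to an inline HTML string; plaintext() is pattern.web's HTML stripper and the sample markup is made up:

from collections import Counter
from pattern.web import plaintext
from pattern.vector import words, count

html = "<p>The <b>black</b> cat was spying on the white cat.</p>"  # made-up sample markup
wordDict = count(words(plaintext(html), filter=lambda w: w.strip("'").isalpha()))
print(Counter(wordDict).most_common(3))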
Example #22
def getWords(text):
    return words(
        text, stemmer=LEMMA, exclude=[], stopwords=False,
        language='en')  # seeing same results with stemmer.stem, LEMMA, PORTER
Example #24
 def processor(self, minePackage):
     print('####SEARCH_KEY:', minePackage['searchKey'])
     var = minePackage['searchKey']
     s = Sentence(parse(var))
     return count(words(s), stemmer=PORTER)  # returns a dictionary {word: count}
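A minimal standalone version of the same step for a made-up search phrase; the exact stems depend on pattern's Porter implementation:

from pattern.vector import words, count, PORTER
from pattern.en import parse, Sentence

s = Sentence(parse("spying black cats"))  # made-up search phrase
print(count(words(s), stemmer=PORTER))    # a dictionary of {stemmed word: count}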