Code example #1
 def test_words(self):
     # Assert word split algorithm (default treats lines as spaces and ignores numbers).
     s = "The cat sat on the\nmat. 1 11."
     v = vector.words(s, filter=lambda w: w.isalpha())
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
     # Assert custom word filter.
     v = vector.words(s, filter=lambda w: True)
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
     print("pattern.vector.words()")
Code example #2
File: test_vector.py Project: andres-root/pattern
 def test_words(self):
     # Assert word split algorithm (default treats lines as spaces and ignores numbers).
     s = "The cat sat on the\nmat. 1 11."
     v = vector.words(s, filter=lambda w: w.isalpha())
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
     # Assert custom word filter.
     v = vector.words(s, filter=lambda w: True)
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
     print("pattern.vector.words()")
Code example #3
File: stalkerbot.py Project: sigras/stalkerbot
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(comments.__str__()))
     sorted_keywords = sorted(keywords.iteritems(),
                              key=operator.itemgetter(1),
                              reverse=True)
     return sorted_keywords
Code example #4
File: parse2.py Project: samdimmortal/be-project
def build_model(results=[]):
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
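
For context, here is a minimal standalone sketch of the Document/Model API the snippet above relies on. The sample texts and names are made up for illustration; Document, Model, TFIDF, LEMMA, the features list and the per-document vector come from pattern.vector.

from pattern.vector import Document, Model, TFIDF, LEMMA

# Two toy documents (hypothetical texts), lemmatised like the snippet above.
docs = [
    Document("The cat sat on the mat.", name="doc-a", stemmer=LEMMA),
    Document("The dog sat on the log.", name="doc-b", stemmer=LEMMA),
]
m = Model(docs, weight=TFIDF)

print(m.features)      # all distinct (lemmatised) words across the model
print(docs[0].vector)  # sparse {word: weight} mapping for the first document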
Code example #5
File: algorithmTools.py Project: luislezcair/gisiaws
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # Iterates over the list of links in the cloud
             # print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Code example #6
File: algorithmTools.py Project: luislezcair/gisiaws
 def __init__(self, data, url=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
Code example #7
def roots_and_lemmas():

    print(stem('cars', PORTER))  #Root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # Root
    print(stem('studies', LEMMA))

    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
Code example #8
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # Iterates over the list of links in the cloud
             print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(
                 cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(
                 count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Code example #9
File: controller.py Project: matigrojas/pruebaWsd
 def start(self):
     cloudSize = dameCloudSize(self.id_request)
     cloudSize = cloudSize[0][0]
     searchKey = dameSerchKey(self.id_request)
     searchKey = searchKey[0][0]
     step = 0
     while step <= 5:  # Set get_stop later; this indicates the number of levels
         for id_cloud in dameIdCloud(self.id_request):  # Gets the IDs of the clouds that belong to the project
             print "Id Cloud: " + str(id_cloud[0])
             cloud = self.generar_cloud(dameNodo(id_cloud[0]))
             true_nodes = self.trueNodesSelection(cloud)
             for n in true_nodes:
                 try:
                     cloud.graph.node[n]['select'] = False
                     crawler = SimpleCrawler1(n, delay=0.1)
                     crawler.newStructure(cloud.graph)
                     time = 0
                 except:
                     continue
                 while len(crawler.visited) < cloudSize:
                     print "Cloudsize = " + str(
                         cloudSize) + " Crawler Visited = " + str(
                             len(crawler.visited)) + " Nivel =  " + str(
                                 step)
                     print 'Explorando ...'
                     crawler.crawl(method=None)
                     time += 1
                     if time > cloudSize * 10:
                         break
                 actualizarSelect(cloud.graph.node[n]['ID'],
                                  cloud.graph.node[n]['select'])
                 print
                 print '#####Generando documentos#####'
                 # Creation of the minePackage
                 clouds = list()
                 clouds.append(cloud)
                 minePackage = dict()
                 minePackage['clouds'] = clouds
                 minePackage['searchKey'] = searchKey
                 minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))), stemmer=PORTER)
                 self.IRController.start(minePackage)  # Retrieves information
                 # SCRAPER CONTROLLER STILL MISSING
             # Set to None so they do not take up unnecessary space; everything has already been saved to the DB
             minePackage = None
             cloud = None
             gc.collect()
         step += 1
         print "Explorando nivel nro: " + str(step)
         # Controls the levels to expand, in this case 10
     print "Proceso Finalizado"
Code example #10
 def __init__(self, data, url="", contenidoBd=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(
             url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         if (contenidoBd != ""):
             self.contenidoConEtiquetas = contenidoBd
             self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
         else:
             self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))),
                       stemmer=PORTER)
Code example #11
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs  # this is cheap
    return dict
Code example #12
File: processor.py Project: luislezcair/gisiaws
 def tokenizer(self, url):
     #text = 'The black cat was spying on the white cat.'
     #stemmer=None, stemmer=LEMMA, stemmer=PORTER
     #print count(words(pageContent), stemmer=PORTER)
     #print count(words(pageContent), stemmer=LEMMA)

     #url_content = UrlToplainTextConverter()
     #page_content = url_content.plainTextConverter(url)
     page_content = url
     s = Sentence(parse(page_content))
     tokenized_file = count(words(s), stemmer=PORTER)
     print
     print tokenized_file
     print
Code example #13
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs # this is cheap
    return dict
Code example #14
File: algorithmTools.py Project: luislezcair/gisiaws
 def processor(self, minePackage):
     # print '####SEARCH_KEY:', minePackage['searchKey']
     s = Sentence(parse(minePackage['searchKey']))
     minePackage['searchKey'] = count(words(s), stemmer=PORTER)
     return minePackage['searchKey']
Code example #15
while len(links) > 0:

    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True

        # Parse links from article.

        for link in article.links:

            if link not in seen:
                links.add(link)

        # Parse words from article. Count words.

        for word in words(article.string):

            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
        print sum(frequency.values()), article.title

    except:
        pass

    # Collect a reliable amount of words (e.g., 1M).

    if sum(frequency.values()) > 1000000:
        break

#top = sorted((count, word) for word, count in frequency.items())
Code example #16
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from pattern.vector import words, count, stem, PORTER, LEMMA, chngrams, Document, Vector, distance, Model, TFIDF,\
    HIERARCHICAL, Cluster, NB, kfoldcv, KNN, EUCLIDEAN, TF, SVM, RADIAL, gridsearch, GA
from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.db import csv
from random import choice

# word count
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`'
                       '\"@#$^&*+-|=~_')
    # returns a list of words by splitting the string on spaces.
    freq_dic = count(  # takes a list of words and returns a dictionary of (word, count)-items.
        words=words_list,
        top=None,  # Filter words not in the top most frequent (int).
        threshold=0,  # Filter words whose count <= threshold.
        stemmer=None,  # PORTER | LEMMA | function | None
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
for k, v in freq_dic.iteritems():
    print k, v
# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
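
The snippet above targets Python 2.7 (print statements, dict.iteritems()). A rough Python 3 sketch of the same word-count flow might look as follows; the assumption that pattern 3.x is installed and that the corpus path from the snippet exists is mine, not part of the original example.

from pattern.vector import words, count, stem, PORTER, LEMMA

with open('data/input/corpus.txt', 'r') as fp:
    # Split the text into words, then count them (no stemming, stop words excluded).
    words_list = words(fp.read(), filter=lambda w: w.strip("'").isalnum())
    freq_dic = count(words_list, stemmer=None, stopwords=False)

for k, v in freq_dic.items():   # items() replaces iteritems() on Python 3
    print(k, v)

print(stem('spies', stemmer=PORTER))
print(stem('spies', stemmer=LEMMA))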
Code example #17
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
Code example #18
File: stalkerbot.py Project: sigras/stalkerbot
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(comments.__str__()))
     sorted_keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1), reverse=True)
     return sorted_keywords
Code example #19
def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))
Code example #20
        for row in all_q:
            row = filter(None, row)  #remove nulls
    
            def fluency(questions):
                return len(questions)
            
            def elaboration(questions):
                return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)
                
            def variance(cluster):
                return avg([distance(centroid(cluster), v) for v in cluster])
    
            vectors = []
                
            for q in all_q:
                v = count(words(q), stemmer='lemma') 
                v = Vector(v)
                vectors.append(v)
                
            clusters = hierarchical(vectors, k=250, distance='cosine')
            clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters] 
            clusters = sorted(clusters, key=variance)
            
            categories = {}
            
            for i, cluster in enumerate(clusters):
                for v in cluster: 
                    categories[row[vectors.index(v)]] = i

            def flex(questions):
                ml_categories = []
Code example #21
 def countWords(self):
     wordDict = count(
         words(plaintext(self.content),
               filter=lambda w: w.strip("'").isalpha()))
     return Counter(wordDict)
Code example #22
def getWords(text):
    return words(
        text, stemmer=LEMMA, exclude=[], stopwords=False,
        language='en')  # seeing same results with stemmer.stem, LEMMA, PORTER
Code example #23
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
Code example #24
 def processor(self, minePackage):
     print '####SEARCH_KEY:', minePackage['searchKey']
     var = minePackage['searchKey']
     s = Sentence(parse(var))
     return count(words(s), stemmer=PORTER)  # Returns a dictionary {word: count}
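
Several of the snippets above (examples #5, #6, #8, #9, #12, #14 and #24) use the same parse -> Sentence -> words -> count pipeline. A minimal standalone sketch of that pipeline, reusing the sample sentence from examples #12 and #16, might look like this; Python 3 syntax is an assumption, the quoted projects themselves use Python 2.

from pattern.en import parse, Sentence
from pattern.vector import words, count, PORTER

s = Sentence(parse("The black cat was spying on the white cat."))
freq = count(words(s), stemmer=PORTER)  # {stemmed word: frequency} dictionary
print(freq)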