Code example #1
 def test_words(self):
     # Assert word split algorithm (default treats lines as spaces and ignores numbers).
     s = "The cat sat on the\nmat. 1 11."
     v = vector.words(s, filter=lambda w: w.isalpha())
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
     # Assert custom word filter.
     v = vector.words(s, filter=lambda w: True)
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
     print("pattern.vector.words()")
Code example #2
File: test_vector.py Project: andres-root/pattern
 def test_words(self):
     # Assert word split algorithm (default treats lines as spaces and ignores numbers).
     s = "The cat sat on the\nmat. 1 11."
     v = vector.words(s, filter=lambda w: w.isalpha())
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat"])
     # Assert custom word filter.
     v = vector.words(s, filter=lambda w: True)
     self.assertEqual(v, ["The", "cat", "sat", "on", "the", "mat", "1", "11"])
     print("pattern.vector.words()")
Code example #3
File: stalkerbot.py Project: sigras/stalkerbot
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(comments.__str__()))
     sorted_keywords = sorted(keywords.iteritems(),
                              key=operator.itemgetter(1),
                              reverse=True)
     return sorted_keywords
Code example #4
File: parse2.py Project: samdimmortal/be-project
def build_model(results=[]):
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
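
For context, here is a minimal standalone sketch of the Document/Model API the snippet above relies on. The sample texts and names are made up for illustration; Document, Model, TFIDF, LEMMA, the features list and the per-document vector come from pattern.vector.

from pattern.vector import Document, Model, TFIDF, LEMMA

# Two toy documents (hypothetical texts), lemmatised like the snippet above.
docs = [
    Document("The cat sat on the mat.", name="doc-a", stemmer=LEMMA),
    Document("The dog sat on the log.", name="doc-b", stemmer=LEMMA),
]
m = Model(docs, weight=TFIDF)

print(m.features)      # all distinct (lemmatised) words across the model
print(docs[0].vector)  # sparse {word: weight} mapping for the first document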
Code example #5
File: algorithmTools.py Project: luislezcair/gisiaws
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # Iterates over the list of links in the cloud
             # print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Code example #6
File: algorithmTools.py Project: luislezcair/gisiaws
 def __init__(self, data, url=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))), stemmer=PORTER)
Code example #7
def roots_and_lemmas():

    print(stem('cars', PORTER))  #Root
    print(stem('cars', LEMMA))
    print(stem('studies', PORTER))  # Root
    print(stem('studies', LEMMA))

    text = "People who teach find teaching very rewarding."
    tokens = words(text)
    print(count(tokens, stopwords=True, stemmer=PORTER))
    print(count(tokens, stopwords=True, stemmer=LEMMA))
Code example #8
 def run(self, minePackage):
     clouds = minePackage['clouds']
     urlContent = UrlToPlainText()
     for cloud in clouds:
         for n in cloud.graph.nodes():  # Iterates over the list of links in the cloud
             print cloud.graph.node[n]['link']
             pageContent = urlContent.plainTextConverter(
                 cloud.graph.node[n]['link'])
             cloud.graph.node[n]['methodData'] = MethodData(
                 count(words(Sentence(parse(pageContent))), stemmer=PORTER))
Code example #9
File: controller.py Project: matigrojas/pruebaWsd
 def start(self):
     cloudSize = dameCloudSize(self.id_request)
     cloudSize = cloudSize[0][0]
     searchKey = dameSerchKey(self.id_request)
     searchKey = searchKey[0][0]
     step = 0
     while step <= 5:  # Set get_stop later; this indicates the number of levels
         for id_cloud in dameIdCloud(self.id_request):  # Gets the IDs of the clouds that belong to the project
             print "Id Cloud: " + str(id_cloud[0])
             cloud = self.generar_cloud(dameNodo(id_cloud[0]))
             true_nodes = self.trueNodesSelection(cloud)
             for n in true_nodes:
                 try:
                     cloud.graph.node[n]['select'] = False
                     crawler = SimpleCrawler1(n, delay=0.1)
                     crawler.newStructure(cloud.graph)
                     time = 0
                 except:
                     continue
                 while len(crawler.visited) < cloudSize:
                     print "Cloudsize = " + str(
                         cloudSize) + " Crawler Visited = " + str(
                             len(crawler.visited)) + " Nivel =  " + str(
                                 step)
                     print 'Explorando ...'
                     crawler.crawl(method=None)
                     time += 1
                     if time > cloudSize * 10:
                         break
                 actualizarSelect(cloud.graph.node[n]['ID'],
                                  cloud.graph.node[n]['select'])
                 print
                 print '#####Generando documentos#####'
                 # Creation of the minePackage
                 clouds = list()
                 clouds.append(cloud)
                 minePackage = dict()
                 minePackage['clouds'] = clouds
                 minePackage['searchKey'] = searchKey
                 minePackage['searchKeyStemmer'] = count(words(Sentence(parse(searchKey))), stemmer=PORTER)
                 self.IRController.start(minePackage)  # Retrieves information
                 # SCRAPER CONTROLLER STILL MISSING
             # Set to None so they do not take up unnecessary space; everything has already been saved to the DB
             minePackage = None
             cloud = None
             gc.collect()
         step += 1
         print "Explorando nivel nro: " + str(step)
         # Controls the levels to expand, in this case 10
     print "Proceso Finalizado"
Code example #10
 def __init__(self, data, url="", contenidoBd=""):
     if url != "":
         urlContent = UrlToPlainText()
         self.contenidoConEtiquetas = urlContent.plainTextConverter(
             url, "mantenerEtiquetas")
         self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
     else:
         if (contenidoBd != ""):
             self.contenidoConEtiquetas = contenidoBd
             self.contenido = plaintext(self.contenidoConEtiquetas, keep={})
         else:
             self.contenido = ""
     self.data = count(words(Sentence(parse(self.contenido))),
                       stemmer=PORTER)
Code example #11
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs  # this is cheap
    return dict
Code example #12
File: processor.py Project: luislezcair/gisiaws
 def tokenizer(self, url):
     #text = 'The black cat was spying on the white cat.'
     #stemmer=None, stemmer=LEMMA, stemmer=PORTER
     #print count(words(pageContent), stemmer=PORTER)
     #print count(words(pageContent), stemmer=LEMMA)

     #url_content = UrlToplainTextConverter()
     #page_content = url_content.plainTextConverter(url)
     page_content = url
     s = Sentence(parse(page_content))
     tokenized_file = count(words(s), stemmer=PORTER)
     print
     print tokenized_file
     print
Code example #13
def count_one_artist(name, bad_words):

    # ok, this is a bad way to get number of songs for that artist, so we can average out
    # the words per song
    default_dir = basedir + name
    num_songs = len(os.listdir(default_dir))

    # we need the number of songs, this is so annoying
    dict = {}
    docs = vec.count(vec.words(get_artist_docs(name)))
    for w in bad_words:
        if w in docs:
            dict[w] = docs[w]
    dict['num_songs'] = num_songs # this is cheap
    return dict
Code example #14
File: algorithmTools.py Project: luislezcair/gisiaws
 def processor(self, minePackage):
     # print '####SEARCH_KEY:', minePackage['searchKey']
     s = Sentence(parse(minePackage['searchKey']))
     minePackage['searchKey'] = count(words(s), stemmer=PORTER)
     return minePackage['searchKey']
Code example #15
while len(links) > 0:

    try:
        article = Wikipedia(language="it").search(links.pop(), throttle=10)
        seen[article.title] = True

        # Parse links from article.

        for link in article.links:

            if link not in seen:
                links.add(link)

        # Parse words from article. Count words.

        for word in words(article.string):

            if word not in frequency:
                frequency[word] = 0
            frequency[word] += 1
        print sum(frequency.values()), article.title

    except:
        pass

    # Collect a reliable amount of words (e.g., 1M).

    if sum(frequency.values()) > 1000000:
        break

#top = sorted((count, word) for word, count in frequency.items())
Code example #16
#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from pattern.vector import words, count, stem, PORTER, LEMMA, chngrams, Document, Vector, distance, Model, TFIDF,\
    HIERARCHICAL, Cluster, NB, kfoldcv, KNN, EUCLIDEAN, TF, SVM, RADIAL, gridsearch, GA
from pattern.en import parse, Sentence, parsetree, lexicon
from pattern.db import csv
from random import choice

# word count
freq_dic = {}
with open('data/input/corpus.txt', 'r') as fp:
    words_list = words(fp.read(),
                       filter=lambda w: w.strip("'").isalnum(),
                       punctuation='.,;:!?()[]{}`'
                       '\"@#$^&*+-|=~_')
    # returns a list of words by splitting the string on spaces.
    freq_dic = count(  # takes a list of words and returns a dictionary of (word, count)-items.
        words=words_list,
        top=None,  # Filter words not in the top most frequent (int).
        threshold=0,  # Filter words whose count <= threshold.
        stemmer=None,  # PORTER | LEMMA | function | None
        exclude=[],  # Filter words in the exclude list.
        stopwords=False,  # Include stop words?
        language='en')  # en, es, de, fr, it, nl
for k, v in freq_dic.iteritems():
    print k, v
# stop words and stemming
print stem('spies', stemmer=PORTER)
print stem('spies', stemmer=LEMMA)
s = 'The black cat was spying on the white cat.'
print count(words(s), stemmer=PORTER)
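
The snippet above targets Python 2.7 (print statements, dict.iteritems()). A rough Python 3 sketch of the same word-count flow might look as follows; the assumption that pattern 3.x is installed and that the corpus path from the snippet exists is mine, not part of the original example.

from pattern.vector import words, count, stem, PORTER, LEMMA

with open('data/input/corpus.txt', 'r') as fp:
    # Split the text into words, then count them (no stemming, stop words excluded).
    words_list = words(fp.read(), filter=lambda w: w.strip("'").isalnum())
    freq_dic = count(words_list, stemmer=None, stopwords=False)

for k, v in freq_dic.items():   # items() replaces iteritems() on Python 3
    print(k, v)

print(stem('spies', stemmer=PORTER))
print(stem('spies', stemmer=LEMMA))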
Code example #17
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
Code example #18
File: stalkerbot.py Project: sigras/stalkerbot
 def get_keywords(self, comment_history):
     comments = [str(x) for x in comment_history]
     keywords = count(words(comments.__str__()))
     sorted_keywords = sorted(keywords.iteritems(), key=operator.itemgetter(1), reverse=True)
     return sorted_keywords
Code example #19
def tokenization():
    text = "My new car is better than my new bed"
    tokens = words(text)
    print(tokens)
    print(count(tokens))
Code example #20
        for row in all_q:
            row = filter(None, row)  #remove nulls
    
            def fluency(questions):
                return len(questions)
            
            def elaboration(questions):
                return sum(min(len(parsetree(a)[0].pnp), 2) for a in questions)
                
            def variance(cluster):
                return avg([distance(centroid(cluster), v) for v in cluster])
    
            vectors = []
                
            for q in all_q:
                v = count(words(q), stemmer='lemma') 
                v = Vector(v)
                vectors.append(v)
                
            clusters = hierarchical(vectors, k=250, distance='cosine')
            clusters = [isinstance(v, Vector) and [v] or v.flatten() for v in clusters] 
            clusters = sorted(clusters, key=variance)
            
            categories = {}
            
            for i, cluster in enumerate(clusters):
                for v in cluster: 
                    categories[row[vectors.index(v)]] = i

            def flex(questions):
                ml_categories = []
Code example #21
 def countWords(self):
     wordDict = count(
         words(plaintext(self.content),
               filter=lambda w: w.strip("'").isalpha()))
     return Counter(wordDict)
Code example #22
def getWords(text):
    return words(
        text, stemmer=LEMMA, exclude=[], stopwords=False,
        language='en')  # seeing same results with stemmer.stem, LEMMA, PORTER
Code example #23
def v(s):
    """ Returns a bag-of-words vector for the given string.
    """
    v = {}
    v.update(count(words(s)))
    return v
Code example #24
 def processor(self, minePackage):
     print '####SEARCH_KEY:', minePackage['searchKey']
     var = minePackage['searchKey']
     s = Sentence(parse(var))
     return count(words(s), stemmer=PORTER)  # Returns a dictionary {word: count}
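
Several of the snippets above (examples #5, #6, #8, #9, #12, #14 and #24) use the same parse -> Sentence -> words -> count pipeline. A minimal standalone sketch of that pipeline, reusing the sample sentence from examples #12 and #16, might look like this; Python 3 syntax is an assumption, the quoted projects themselves use Python 2.

from pattern.en import parse, Sentence
from pattern.vector import words, count, PORTER

s = Sentence(parse("The black cat was spying on the white cat."))
freq = count(words(s), stemmer=PORTER)  # {stemmed word: frequency} dictionary
print(freq)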