Python Document Examples, pattern.vector.Document Python Examples

Example #1

0

Show file

File: theogobag.py Project: heyaqiong123/literature

def nnps_and_keywords(text):
    """Map groups of proper nouns to the keywords they occur with.

    ``text`` is parsed with ``parsetree(text, relations=True, lemmata=True)``
    and every element yielded by the result is wrapped in a ``Document``.
    For each element:

    * every keyword whose type is ``'NNP'`` contributes one string: the
      ``'NNP'`` words of its phrase joined with ``-``;
    * the non-``'NNP'`` entries among the top five keywords contribute their
      lemma, or their surface string when the lemma is empty.

    Only elements with more than one proper-noun string and more than one
    keyword are recorded.

    Returns a dict mapping a tuple of proper-noun strings to a set of
    keyword strings.  The tuple is sorted so that the same group of proper
    nouns always yields the same key; building the key straight from a set
    would make it depend on set iteration order.
    """
    nnp_kw = {}
    for element in parsetree(text, relations=True, lemmata=True):
        doc = Document(element)

        # Proper-noun strings found among all keywords of this element.
        nnp = set()
        for entry in doc.keywords():
            word = entry[1]
            if word.type == 'NNP':
                nnp.add("-".join(
                    part.string for part in word.phrase.words
                    if part.type == 'NNP'))

        # Lemmas (or raw strings) of the non-NNP entries in the top five.
        words = set()
        for entry in doc.keywords(top=5):
            word = entry[1]
            if word.type != 'NNP':
                words.add(word.lemma or word.string)

        if len(nnp) > 1 and len(words) > 1:
            nnp_kw.setdefault(tuple(sorted(nnp)), set()).update(words)

    return nnp_kw

Example #2

0

Show file

File: Tagger.py Project: ChristophMr/PolArg

 def get_keywords_article(article):
     """Store the five top keywords of an article's nouns on the article.

     Collects ``Word`` from every item of ``article.tagged_content`` whose
     ``Tag`` starts with ``'NN'``, builds a ``Document`` from that list and
     assigns its ``keywords(top=5)`` to ``article.keywords``.  Returns None.
     """
     nouns = []
     for token in article.tagged_content:
         if token.Tag.startswith('NN'):
             nouns.append(token.Word)
     article.keywords = Document(nouns).keywords(top=5)

Example #3

0

Show file

File: algorithmTools.py Project: luislezcair/gisiaws

 def run(self,minePackage):
     """Store a weighted-average score under 'weight_WA' on every graph node.

     ``minePackage`` is read at ``'clouds'`` (iterable of objects with a
     ``graph``) and ``'searchKeyStemmer'`` (container tested with ``in``).
     For each node, the normalized top-500 keywords of the node's
     ``methodData.getContent()`` are classified against the query and
     against the words of ``dictionary.txt`` (located next to this module),
     and their weights are added to one of three accumulators.  The node
     weight is the alpha/beta/gamma weighted mean of the accumulators, or 0
     when all of them are zero.  Returns None.
     """
     ac = 0.0  # key hit: keyword is in the query and in the dictionary
     ap = 0.0  # positive hit: keyword is in the dictionary only
     an = 0.0  # negative hit: keyword is in the query only
     alpha = 1.00
     beta = 0.75
     gamma = 0.25
     # os.path.join keeps the path relative when dirname(__file__) is empty;
     # plain concatenation with "/dictionary.txt" would point at the root.
     dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary.txt")
     # Context manager so the file handle is closed deterministically.
     with open(dictionary_path, 'r') as handle:
         dictionary = Document(handle.read(), stemmer=PORTER)
     clouds = minePackage['clouds']
     query = minePackage['searchKeyStemmer']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             node = cloud.graph.node[n]
             content = Document(node['methodData'].getContent(), stemmer=PORTER)
             for weight, keyword in content.keywords(top=500, normalized=True):
                 in_query = keyword in query
                 in_dictionary = keyword in dictionary.words
                 if in_query and in_dictionary:
                     ac += weight
                 elif in_dictionary:
                     ap += weight
                 elif in_query:
                     an += weight
             # NOTE(review): ac/ap/an are never reset, so each node's score
             # also includes the hits of all nodes processed before it.
             # Confirm whether a per-node reset was intended.
             total = ac + ap + an
             if total > 0:
                 node['weight_WA'] = ((ac * alpha) + (ap * beta) + (an * gamma)) / total
             else:
                 node['weight_WA'] = 0

Example #4

0

Show file

File: theogobag.py Project: 2od/literature

def nnps_and_keywords(text):
    """Map groups of proper nouns to the keywords they occur with.

    ``text`` is parsed with ``parsetree(text, relations=True, lemmata=True)``
    and every element yielded by the result is wrapped in a ``Document``.
    For each element:

    * every keyword whose type is ``'NNP'`` contributes one string: the
      ``'NNP'`` words of its phrase joined with ``-``;
    * the non-``'NNP'`` entries among the top five keywords contribute their
      lemma, or their surface string when the lemma is empty.

    Only elements with more than one proper-noun string and more than one
    keyword are recorded.

    Returns a dict mapping a tuple of proper-noun strings to a set of
    keyword strings.  The tuple is sorted so that the same group of proper
    nouns always yields the same key; building the key straight from a set
    would make it depend on set iteration order.
    """
    nnp_kw = {}
    for element in parsetree(text, relations=True, lemmata=True):
        doc = Document(element)

        # Proper-noun strings found among all keywords of this element.
        nnp = set()
        for entry in doc.keywords():
            word = entry[1]
            if word.type == 'NNP':
                nnp.add("-".join(
                    part.string for part in word.phrase.words
                    if part.type == 'NNP'))

        # Lemmas (or raw strings) of the non-NNP entries in the top five.
        words = set()
        for entry in doc.keywords(top=5):
            word = entry[1]
            if word.type != 'NNP':
                words.add(word.lemma or word.string)

        if len(nnp) > 1 and len(words) > 1:
            nnp_kw.setdefault(tuple(sorted(nnp)), set()).update(words)

    return nnp_kw

Example #5

0

Show file

File: machinelearning.py Project: bradofclark/docassemble

 def confusion_matrix(self, key=None, output_format=None, split=False):
     """Returns a confusion matrix for the model based on splitting the data set randomly into two pieces, training on one and testing on the other

     Each record from self.classified_entries(key=key) is assigned at random
     (random.random() < 0.5) either to the training data of a fresh learner
     obtained from self._learner() or to the testing set.

     Parameters:
       key -- passed through to self.dependent_in_use() and
         self.classified_entries().
       output_format -- 'html' returns an HTML string with one table per
         matrix; 'json' and 'yaml' return the result serialized with
         json.dumps / yaml.safe_dump; any other value (including None)
         returns the result object itself.
       split -- when true, one matrix is built per value returned by
         self.dependent_in_use(key=key), with labels reduced to
         str(record.dependent == value), and the non-HTML result is a dict
         of matrices keyed by that value.  When false, a single matrix is
         built from record.dependent as-is.
     """
     if split:
         list_of_dependent = self.dependent_in_use(key=key)
     else:
         # Single pass; None is the placeholder key for the only matrix.
         list_of_dependent = [None]
     output = ''
     matrices = dict()
     for current_dep in list_of_dependent:
         testing_set = list()
         # A new learner per dependent value, so the passes do not share training.
         model = self._learner()
         for record in self.classified_entries(key=key):
             if split:
                 # One-vs-rest label: 'True' / 'False' as strings.
                 dep_result = str(record.dependent == current_dep)
             else:
                 dep_result = record.dependent
             # Random split: roughly half the records train, the rest test.
             if random.random() < 0.5:
                 model.train(Document(record.independent.lower(), stemmer=PORTER), dep_result)
             else:
                 testing_set.append((Document(record.independent.lower(), stemmer=PORTER), dep_result))
         matrix = model.confusion_matrix(documents=testing_set)
         matrices[current_dep] = matrix
         if output_format == 'html':
             if split:
                 output += '<h4>' + current_dep + "</h4>"
             vals = matrix.keys()
             # Header: a spanning "Actual" cell, then one column per label.
             output += '<table class="table table-bordered"><thead><tr><td></td><td></td><td style="text-align: center" colspan="' + str(len(vals)) + '">Actual</td></tr><tr><th></th><th></th>'
             # 'first' marks the row that carries the row-spanning "Predicted" cell.
             first = True
             for val in vals:
                 output += '<th>' + val + '</th>'
             output += '</tr></thead><tbody>'
             for val_a in vals:
                 output += '<tr>'
                 if first:
                     output += '<td style="text-align: right; vertical-align: middle;" rowspan="' + str(len(vals)) + '">Predicted</td>'
                     first = False
                 output += '<th>' + val_a + '</th>'
                 for val_b in vals:
                     # Cell value: matrix[val_b] entry for val_a, 0 when absent.
                     output += '<td>' + str(matrix[val_b].get(val_a, 0)) + '</td>'
                 output += '</tr>'
             output += '</tbody></table>'
             #output += "\n\n`" + str(matrix) + "`"
             # output += '<ul>'
             # for document, actual in testing_set:
             #     predicted = model.classify(document)
             #     output += '<li>Predicted: ' + predicted + '; Actual: ' + actual + '</li>'
             # output += '</ul>'
     if output_format == 'html':
         return output
     if split:
         ret_val = matrices
     else:
         ret_val = matrices[None]
     if output_format == 'json':
         return json.dumps(ret_val, sort_keys=True, indent=4)
     if output_format == 'yaml':
         return yaml.safe_dump(ret_val, default_flow_style=False)
     # Both remaining paths return the raw result; unknown formats fall through.
     if output_format is None:
         return ret_val
     return ret_val

Example #6

0

Show file

def setup():
    """Populate the module-level lookup tables and train the KNN classifier.

    Rebinds the globals ``pages``, ``urlalias``, ``revurlalias`` and ``knn``.

    From MySQL it reads the URL aliases, the taxonomy terms (training on the
    description and name of terms whose ``vid`` is 3), the node bodies and
    titles joined to practice-area terms, and the titles of published nodes.
    From the module-level ``conn`` connection it reads query/target pairs
    from ``website_queries`` and trains on each query once per target.

    The MySQL connection and the ``conn`` cursor are closed even when a
    query or a training step raises.
    """
    global pages
    global urlalias
    global revurlalias
    global knn
    pages = dict()
    urlalias = dict()
    revurlalias = dict()
    knn = KNN()
    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from configuration instead.
    db = MySQLdb.connect(host="192.168.200.26",
                         user="******",
                         passwd="xxxsecretxxx",
                         db="pla")
    try:
        cur = db.cursor()
        cur.execute("select source, alias from url_alias")
        for row in cur.fetchall():
            urlalias[row[1]] = row[0]
            revurlalias[row[0]] = row[1]
        cur.execute("select tid, name, description, vid from taxonomy_term_data;")
        for row in cur.fetchall():
            url = 'taxonomy/term/' + str(row[0])
            pages[url] = row[1]
            if url in revurlalias:
                # Register the page under its alias too, and train on the alias.
                pages[revurlalias[url]] = row[1]
                url = revurlalias[url]
            if row[3] == 3:
                soup = bs4.BeautifulSoup(row[2])
                the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
                knn.train(Document(the_text, stemmer=PORTER), url)
                knn.train(Document(row[1].lower()), url)
        cur.execute(
            "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);"
        )
        for row in cur.fetchall():
            url = 'taxonomy/term/' + str(row[0])
            if url in revurlalias:
                url = revurlalias[url]
            soup = bs4.BeautifulSoup(row[1])
            the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
            knn.train(Document(the_text, stemmer=PORTER), url)
            knn.train(Document(row[2].lower()), url)
        cur.execute("select nid, title from node where status=1;")
        for row in cur.fetchall():
            url = 'node/' + str(row[0])
            pages[url] = row[1]
            if url in revurlalias:
                pages[revurlalias[url]] = row[1]
    finally:
        db.close()
    pgcur = conn.cursor()
    try:
        pgcur.execute(
            "select query, target from website_queries where target is not null group by query, target"
        )
        for row in pgcur.fetchall():
            # A target field may list several labels separated by newlines,
            # commas or semicolons.
            words = re.split(r'[\n\r,;]+ *', row[1])
            for word in words:
                print("training on " + row[0].lower() + " for " + word)
                knn.train(Document(row[0].lower()), word)
        conn.commit()
    finally:
        pgcur.close()

Example #7

0

Show file

File: keyword_wrapper.py Project: birkin/nlp_app

 def load_text( self, text ):
   """Record the text, three Document views of it, and its MD5 hash.

   Sets on ``self``: ``time_start`` (current time), ``document_raw``
   (threshold=0) and its ``count`` as ``document_raw_count``,
   ``document_thresh_stemmed`` (PORTER stemmer, threshold=1),
   ``document_thresh_unstemmed`` (threshold=1), ``original_text`` and
   ``original_text_md5_hash``.
   """
   self.time_start = datetime.datetime.now()
   raw_document = Document( text, threshold=0 )
   self.document_raw = raw_document
   self.document_raw_count = self.document_raw.count
   self.document_thresh_stemmed = Document( text, stemmer=PORTER, threshold=1 )
   self.document_thresh_unstemmed = Document( text, threshold=1 )
   self.original_text = text
   # unicode text -> utf-8 bytes -> hex digest -> decoded hex digest
   utf8_bytes = self.original_text.encode(u'utf-8', u'replace')
   hex_digest = hashlib.md5(utf8_bytes).hexdigest()
   self.original_text_md5_hash = hex_digest.decode(u'utf-8', u'replace')

Example #8

0

Show file

File: preprocesamientoController.py Project: Leanwit/TesisFinal

 def insertarDocumento(self, url, contenido):
     """Create a MongoDB record and a saved Pattern Document file.

     Builds a ``Document`` named ``url`` from ``contenido``, hands it to
     ``self.mongodb.crearDocumento`` and, when that returns a truthy result,
     saves the document under ``DocumentoPattern/<inserted_id>``.

     Returns the ``Document`` whether or not it was saved.
     """
     # 'stemmer' is the keyword the sibling crearDocumentoPattern passes for
     # PORTER; the previous 'stemming' spelling presumably left the text
     # unstemmed.
     # NOTE(review): 'weigth' looks like a misspelling of 'weight', and
     # weighting appears to be a Model option rather than a Document one.
     # It is kept as-is; confirm against the pattern.vector documentation.
     unDocumento = Document(contenido,
                            name=url,
                            stopwords=True,
                            stemmer=PORTER,
                            weigth=TFIDF)
     result = self.mongodb.crearDocumento(unDocumento)
     if result:
         unDocumento.save("DocumentoPattern/" + str(result.inserted_id))
     return unDocumento

Example #9

0

Show file

def resolve_certainty(certainty_info):
    '''Classify certainty text with a freshly trained Naive Bayes model.

    Returns 'No certainty info.' for an empty string.  Otherwise trains an
    NB classifier on every (observation, certainty) row of
    library/templatetags/c_training_data.csv, using int(certainty) as the
    document type, and returns its classification of certainty_info.
    '''
    if certainty_info == '':
        return 'No certainty info.'

    classifier = NB()
    training_rows = csv('library/templatetags/c_training_data.csv')
    for observation, certainty in training_rows:
        classifier.train(
            Document(observation, type=int(certainty), stopwords=True))
    return classifier.classify(Document(certainty_info))

Example #10

0

Show file

def evaluate_query(query):
    """Return the URLs predicted for ``query``, most probable first.

    Asks the module-level ``knn`` for per-label probabilities; when that
    yields nothing, the single discrete classification is used with
    probability 1.0.  Labels are sorted by descending probability, passed
    through ``fixurl`` together with a shared ``seen`` set, and labels for
    which ``fixurl`` returns None are dropped.
    """
    probs = dict(knn.classify(Document(query), discrete=False).iteritems())
    if not len(probs):
        probs[knn.classify(Document(query))] = 1.0

    seen = set()
    ranked_labels = sorted(probs, key=probs.get, reverse=True)
    # Resolve every label first, then filter, so fixurl sees all of them.
    resolved = [fixurl(label, seen) for label in ranked_labels]
    return [url for url in resolved if url is not None]

Example #11

0

Show file

File: preprocesamientoController.py Project: Leanwit/TesisFinal

 def crearDocumentoPattern(self, contenido, name=""):
     '''Build a Pattern Document from contenido.

     The Document is created with the given name, the PORTER stemmer,
     stopwords=True and a 'weigth' keyword set to TFIDF.

     NOTE(review): in pattern.vector, stopwords=True appears to keep stop
     words rather than remove them, and 'weigth' looks like a misspelling
     that Document may ignore. Confirm against the library documentation.
     '''
     opciones = dict(name=name,
                     stemmer=PORTER,
                     stopwords=True,
                     weigth=TFIDF)
     return Document(contenido, **opciones)

Example #12

0

Show file

def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract

    Returns the selected sentences, in their original order, concatenated
    without a separator. Returns an empty string when there is nothing to
    rank (no sentences, or an empty selection such as n=0) instead of
    failing while unpacking an empty ranking.
    """
    # tokenize text to sentences list
    sentences = tokenize(text)
    if not sentences:
        return ''

    # create documents list; each document is named after its sentence index
    docs = [Document(sentence, name=i) for i, sentence in enumerate(sentences)]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # TextRank ranking computed from the model's documents and distance function
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences (empty list when nothing was ranked)
    top_sents_idx = [idx for idx, _ in ranking.most_common(n)]

    # reordering to original sentence order
    return ''.join(sentences[i] for i in sorted(top_sents_idx))

Example #13

0

Show file

File: articles.py Project: news-ai/trends

def articles_to_trends(articles):
    """Print the top ten keywords for each day of article text.

    Every item of ``articles`` with a truthy ``'added_at'`` has its text
    fetched with ``get_article_text(story['url'])`` and is grouped by the
    date returned from ``timestamptext``.  Each day's texts are joined,
    lower-cased and added to a ``Model`` as one ``Document``; the name and
    ``keywords(top=10)`` of every document are then printed.  Returns None.
    """
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)

            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # hash(s) of the returned text is used as the id to avoid
            # duplicate content.
            news.setdefault(d, {})[hash(s)] = s

    m = Model()
    for date, stories in news.items():
        s = ' '.join(stories.values()).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'],
                          name=date))

    for document in m:
        # Parenthesized form prints the same on Python 2 and is valid on Python 3.
        print(document.name)
        print(document.keywords(top=10))

Example #14

0

Show file

 def _train(self, indep, depend):
     """Trains the machine learner given an independent variable and a corresponding dependent variable."""
     if indep is None:
         return
     the_text = re.sub(r'[\n\r]+', r'  ', indep).lower()
     learners[self.group_id].train(
         Document(the_text.lower(), stemmer=PORTER), depend)

Example #15

0

Show file

File: summarize.py Project: xmonkee/Shards

def summarize(text_to_summarize):
    """Return the two top-ranked sentences of the text, in original order.

    Sentences from ``tokenize`` that split into more than 7 space-separated
    pieces become ``Document`` objects named after their position.  Every
    ordered pair of distinct documents whose keyword lists have a Jaccard
    distance below 1 becomes a weighted edge (weight = 1 - distance) of a
    directed graph, which is ranked with PageRank.  The two highest-ranked
    sentences are joined with a space.
    """
    stokens = tokenize(text_to_summarize)

    # STEP 1
    # pattern.vector's Document is a nifty bag-o-words structure,
    # with a TF weighting scheme
    docs = [Document(string= s, name=e,stemmer=LEMMA)
            for e,s in enumerate(stokens) if len(s.split(" ")) > 7]

    # Keyword lists are extracted once per document rather than once per
    # pair, which removes a quadratic number of keywords() calls.
    keyword_lists = [(doc.name, [kw[1] for kw in doc.keywords()])
                     for doc in docs]

    linkgraph = []
    # STEP 2 and 3 happen interwovenly
    for name_a, wordset_a in keyword_lists:
        for name_b, wordset_b in keyword_lists:
            if name_a == name_b:
                continue
            # STEP 2 happens here
            jacc_dist = distance.jaccard(wordset_a, wordset_b)
            if jacc_dist < 1:
                linkgraph.append((str(name_a),  # index to sentence
                                  str(name_b), 1 - jacc_dist))  # dist. score
    # By the time we reach here, we'd have completed STEP 3

    # STEP 4
    # PageRank over weighted edges, see
    # http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D = nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)
    # Ascending sort followed by reverse() keeps the original tie ordering.
    sort_pagerank = sorted(pagerank.items(), key=operator.itemgetter(1))
    sort_pagerank.reverse()
    orderedtop2 = sorted(int(node) for node, _ in sort_pagerank[:2])
    return " ".join([stokens[i] for i in orderedtop2])

Example #16

0

Show file

File: publications.py Project: news-ai/trends

def feeds_to_trends(feeds):
    """Print the top ten keywords per day for every feed in ``feeds``.

    Each item of ``feeds`` is read at ``'feed_url'``.  The stories returned
    by ``Newsfeed().search(url, cached=False)`` are grouped by the date
    returned from ``datetext``; each day's texts are joined, lower-cased and
    added to a ``Model`` as one ``Document``, whose name and
    ``keywords(top=10)`` are printed.  When ``HTTP404NotFound`` is raised
    the feed URL is printed and processing continues with the next feed.
    Returns None.
    """
    for feed in feeds:
        url = feed['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # hash(s) of the returned text is used as the id to avoid
                # duplicate content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for date, stories in news.items():
                s = ' '.join(stories.values()).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'],
                                  name=date))

            for document in m:
                # Parenthesized form prints the same on Python 2 and is
                # valid on Python 3.
                print(document.name)
                print(document.keywords(top=10))
        except HTTP404NotFound:
            print(url)

Example #17

0

Show file

def doclist_from_feeds(feeds):
    """Return one LEMMA-stemmed Document per key of ``gettitles(feeds)``.

    The values stored under each key are joined with single spaces to form
    the document text; ``threshold=0`` is passed to every Document.
    """
    titles = gettitles(feeds)
    return [
        Document(" ".join(titles[key]), stemmer=LEMMA, threshold=0)
        for key in titles
    ]

Example #18

0

Show file

File: machinelearning.py Project: bradofclark/docassemble

 def predict(self, indep, probabilities=False):
     """Return the predicted dependent variables for ``indep``, best first.

     Returns an empty list when ``self._train_from_db()`` is falsy.
     Otherwise the learner for ``self.group_id`` is asked for per-label
     probabilities; if it gives none, its single discrete answer (when not
     None) is used with probability 1.0.  With ``probabilities`` true the
     result is a list of (label, probability) tuples, otherwise a list of
     labels; both are sorted by descending probability.
     """
     indep = re.sub(r'[\n\r]+', r'  ', indep).lower()
     if not self._train_from_db():
         return list()
     learner = learners[self.group_id]
     scored = learner.classify(Document(indep.lower(), stemmer=PORTER), discrete=False)
     probs = dict(scored.iteritems())
     if not len(probs):
         single_result = learner.classify(Document(indep.lower(), stemmer=PORTER))
         if single_result is not None:
             probs[single_result] = 1.0
     ranked = sorted(probs.keys(), key=probs.get, reverse=True)
     if probabilities:
         return [(label, probs[label]) for label in ranked]
     return ranked

Example #19

0

Show file

def word_ranking(text, n='L2'):
    """
    Score the terms of text with an LSA-based "cross" method.

    steps:
    1. tokenize text by sentences, one Document per sentence
    2. build a TFIDF-weighted Model
    3. reduce the model with LSA (m.reduce(dimensions=n))
    4. score terms with the cross-method
       (source: http://www.aclweb.org/anthology/C10-1098.pdf)

    - text: string consisting of a few sentences
    - n: value passed as `dimensions` to the model reduction

    Returns a Counter mapping each LSA term to its score.
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # one document per sentence, named after the sentence index
    docs = [Document(sentence, name=position)
            for position, sentence in enumerate(sentences)]

    # model initialize
    m = Model(docs, weight=TFIDF)
    m.reduce(dimensions=n)

    # cross-method, source:
    # http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # presumably a topics(rows) x terms(cols) matrix; verify against pattern's LSA
    V = np.array(m.lsa.vt)

    # per-row average, kept as a column so it broadcasts across each row
    row_average = V.mean(axis=1, keepdims=True)

    # cell values which are less than or equal to the row average are set to zero
    V = np.where(V <= row_average, 0.0, V)

    # sigma values as a column vector, one per row of V
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # sigma-weighted column sums: one score per term
    length = (V * S).sum(axis=0)

    # ranking words by length score
    return Counter(dict(zip(m.lsa.terms, length)))

Example #20

0

Show file

File: parse2.py Project: samdimmortal/be-project

def build_model(results=None):
    """Build a TF-IDF model plus a binary sentence-by-feature matrix.

    ``results`` is a sequence of dict-like items read with ``.get('text')``,
    ``.get('url')`` and ``.get('index')``; it defaults to no items.

    Each text is lower-cased and split with ``sent_tokenize``.  A sentence is
    kept when at least five distinct words remain after dropping the words
    found in ``stopwords_hash``.

    Returns a tuple ``(model, m, model_sentences, sentence_dict)``:

    * ``model``: float numpy array with one row per kept sentence and one
      column per entry of ``m.features``; a cell is 1 when that feature is
      among the sentence's words, else 0.
    * ``m``: the ``Model`` built from one ``Document`` per result.
    * ``model_sentences``: the kept sentences, in row order.
    * ``sentence_dict``: maps a row number to the name of the document the
      sentence came from.  Row numbers start at 1, as they always have,
      while ``model`` and ``model_sentences`` are 0-based.
      NOTE(review): confirm callers expect this offset.
    """
    # None instead of a mutable default list shared between calls.
    results = [] if results is None else results
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)
    features = m.features

    # Rows are collected in a list and converted once; growing the array
    # with np.append per sentence copied the whole matrix every time.
    rows = []
    sentence_dict = {}
    model_sentences = []
    for result, document in zip(results, documents):
        for s in sent_tokenize(result.get('text').lower()):
            s_words = set(
                w for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w))
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            rows.append([1 if w in s_words else 0 for w in features])
            sentence_dict[len(rows)] = document.name

    # reshape keeps the (rows, features) shape even when no sentence is kept.
    model = np.array(rows, dtype=float).reshape(len(rows), len(features))

    return model, m, model_sentences, sentence_dict

Example #21

0

Show file

 def get_labeled_feats(self, data):
     """Return one labelled Document per (word, tag) pair in ``data``.

     Each document is built from the ``binary_features()`` of a
     ``FeatExtract`` for the word, whose ``ArtOrDet`` flag is true when
     ``self.error_tag`` equals 'ArtOrDet'.  The pair's tag becomes the
     document type.
     """
     return [
         Document(
             FeatExtract(
                 word,
                 ArtOrDet=(self.error_tag == 'ArtOrDet')).binary_features(),
             type=tag,
             stopwords=True)
         for (word, tag) in data
     ]

Example #22

0

Show file

def getMod():
    """Return the 5-dimension reduction of a model of the original essays.

    Every file returned by ``fio.recGetTextFiles`` for ``essays/original/``
    is read as UTF-8, parsed with ``PageParser.parse`` and wrapped in a
    ``TextBlob``.  Words found in ``cachedStopWords`` are dropped and the
    rest become a ``Document`` (``top=40``) named after the file.  The
    result of ``Model(docs).reduce(5)`` is returned.

    The report-writing code that used to follow the ``return`` could never
    run and has been removed.
    """
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
        text = ' '.join([
            word for word in text.words if word not in cachedStopWords
        ]).lstrip()
        docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa

Example #23

0

Show file

def asDocumentClass(data, classification):
    '''
    Convert a list of reviews to Documents to be used by Pattern.

    Every item of data is read at 'review/text'; each resulting Document
    gets str(classification) as its type and is built with stopwords=True.
    '''
    labelled = []
    for record in data:
        labelled.append((record['review/text'], str(classification)))
    return [
        Document(text, type=label, stopwords=True)
        for text, label in labelled
    ]

Example #24

0

Show file

def asDocumentReview(data):
    '''
    Convert a list of reviews to Documents to be used by Pattern.

    Every item of data is read at 'review/text' and 'review/score'; the
    score, converted to float, becomes the Document type.  Documents are
    built with stopwords=True.
    '''
    scored = []
    for record in data:
        scored.append((record['review/text'], float(record['review/score'])))
    return [
        Document(text, type=score, stopwords=True)
        for text, score in scored
    ]

Example #25

0

Show file

 def run(self, minePackage):
     """Store a weighted-average score under 'weight_WA' on every graph node.

     ``minePackage`` is read at ``'clouds'`` (iterable of objects with a
     ``graph``) and ``'searchKeyStemmer'`` (container tested with ``in``).
     For each node, the normalized top-200 keywords of the node's
     ``methodData.getContent()`` add their weight to one of three
     accumulators: query hits, hits on the words of ``dictionary.txt``
     (located next to this module), or neither.  The node weight is the
     alpha/beta/gamma weighted mean of the accumulators, or 0 when all of
     them are zero.  Returns None.
     """
     ac = 0.0  # key hit: keyword is in the query
     ap = 0.0  # positive hit: keyword is in the dictionary but not the query
     an = 0.0  # negative hit: keyword is in neither
     alpha = 1.00
     beta = 0.75
     gamma = 0.25
     # os.path.join keeps the path relative when dirname(__file__) is empty;
     # plain concatenation with "/dictionary.txt" would point at the root.
     dictionary_path = os.path.join(os.path.dirname(__file__), "dictionary.txt")
     # Context manager so the file handle is closed deterministically.
     with open(dictionary_path, 'r') as handle:
         dictionary = Document(handle.read(), stemmer=PORTER)
     clouds = minePackage['clouds']
     query = minePackage['searchKeyStemmer']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             node = cloud.graph.node[n]
             content = Document(node['methodData'].getContent(), stemmer=PORTER)
             for weight, keyword in content.keywords(top=200, normalized=True):
                 if keyword in query:
                     ac += weight
                 elif keyword in dictionary.words:
                     ap += weight
                 else:
                     an += weight
             # NOTE(review): ac/ap/an are never reset, so each node's score
             # also includes the hits of all nodes processed before it.
             # Confirm whether a per-node reset was intended.
             total = ac + ap + an
             if total > 0:
                 node['weight_WA'] = (
                     (ac * alpha) + (ap * beta) + (an * gamma)) / total
             else:
                 node['weight_WA'] = 0

Example #26

0

Show file

 def calculate(self, minePackage):
     """Store a vector-space similarity under 'weight_VSM' on every node.

     ``minePackage`` is read at ``'searchKey'`` (turned into the query
     Document) and ``'clouds'``.  A TFIDF-weighted Model is built from a
     Document of every node's ``methodData.getData()``.  Each node then
     receives ``m.similarity`` between a fresh Document of its data and the
     query.  Returns None.
     """
     query = Document((minePackage['searchKey']))
     clouds = minePackage['clouds']
     # These calls are kept from the original although their results are
     # unused here. NOTE(review): confirm whether they are still needed.
     UnPack().total(clouds)
     UrlToPlainText()

     # First pass: collect one Document per node to build the model.
     webDocuments = []
     for cloud in clouds:
         for n in cloud.graph.nodes():
             nodeData = cloud.graph.node[n]['methodData']
             webDocuments.append(Document(nodeData.getData()))
     m = Model(documents=webDocuments, weight=TFIDF)

     # Second pass: set the VSM value on each node of the cloud.
     for cloud in clouds:
         for n in cloud.graph.nodes():
             attributes = cloud.graph.node[n]
             vector = Document(attributes['methodData'].getData())
             attributes['weight_VSM'] = m.similarity(vector, query)

Example #27

0

Show file

File: summarizer.py Project: Radahika/Persimmon

def summarize(raw_text):
    """Return up to three top-ranked sentences of ``raw_text``.

    The stripped text is split into sentences with NLTK's English punkt
    tokenizer.  Sentences that split into more than 5 space-separated pieces
    and produce at least one feature become ``Document`` objects named after
    their position.  Every ordered pair of distinct documents with a
    positive ``jaccard_similarity`` between their features becomes a
    weighted edge of a directed graph, which is ranked with PageRank.

    Returns the (at most) three highest-ranked sentences in original order,
    joined with a space.  When nothing is ranked the first sentence is
    returned, and when the text is empty or contains no sentence an empty
    string is returned.
    """
    if not raw_text:
        return ""

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokens = sentence_tokenizer.tokenize(raw_text.strip())
    if not tokens:
        # Whitespace-only input: the fallback to tokens[0] below would raise.
        return ""

    documents = []
    for position, sentence in enumerate(tokens):
        if len(sentence.split(" ")) > 5:
            document = Document(string=sentence, name=position, stemmer=LEMMA)
            if len(document.features) > 0:
                documents.append(document)

    edges = []
    for document in documents:
        for other_document in documents:
            if document.name == other_document.name:
                continue
            similarity = jaccard_similarity(document.features,
                                            other_document.features)
            if similarity > 0:
                edges.append((document.name, other_document.name, similarity))

    graph = networkx.DiGraph()
    graph.add_weighted_edges_from(edges)
    page_rank = networkx.pagerank(graph)

    sorted_ranks = sorted(page_rank.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

    num_sentences = 3
    # Node names are sentence positions; sort them to restore text order.
    sentence_numbers = sorted(
        node for node, _ in sorted_ranks[:num_sentences])

    summary = [tokens[sentence_number] for sentence_number in sentence_numbers]
    if not summary:
        summary.append(tokens[0])

    return " ".join(summary)

Example #28

0

Show file

File: nlp.py Project: jogsdjf/NLP-Project

def extractSentiment(characterSentences):
    """
    Train a Naive Bayes classifier on reviews.csv and classify sentences.

    Every (review, rating) row of reviews.csv trains the classifier with
    int(rating) as the document type.  For each key of characterSentences,
    every item of its value is converted with str() and classified.

    Returns a defaultdict(list) mapping each key that had at least one item
    to the list of classifications, in item order.
    """
    classifier = NB()
    for review, rating in csv("reviews.csv"):
        training_doc = Document(review, type=int(rating), stopwords=True)
        classifier.train(training_doc)

    characterTones = defaultdict(list)
    for character, sentences in characterSentences.items():
        for sentence in sentences:
            tone = classifier.classify(str(sentence))
            characterTones[character].append(tone)
    return characterTones

Example #29

0

Show file

File: analytics.py Project: enjoylife/StupidMonkey

 def sync_corpus(self):
     """Create the user's corpus, or refresh it when a sync is needed.

     Three cases, decided by the 'corpus' flag stored on the user record
     and by self.need_sync:

     * flag set and sync needed: the notes listed by self.resync_db() are
       rebuilt as Documents (top=50), removed from the loaded corpus, added
       back, and the corpus is saved with update=True;
     * flag set and no sync needed: nothing happens;
     * flag not set: a Corpus is built from all notes of this user
       (top=30), saved, and the 'corpus' flag is set on the user record.

     TODO:
         Store other data in the corpus besides basic text content, ie,
         extracted image, attribute note data, etc...
         catch corpus not found file error?
     """
     docs =[]
     # Only the 'corpus' field of the user record is fetched.
     corpus_check =  self.mongo.users.find_one({'_id':self.user_id},
             {'corpus':1}).get('corpus')
     # make sure we already created corpus
     if corpus_check and self.need_sync:
         update_guids = self.resync_db()
         corpus = self.load_corpus()
         # only those that need to be updated from the update_guids
         for x in self.mongo.notes.find(
                 {'_id':{'$in':update_guids}},{'tokens_content':1,'str_title':1}):
             # create the updated doc
             d =  Document(x['tokens_content'],name=x['str_title'],top=50)
             # set the id to what we want
             d._id = x['_id']
             docs.append(d)
             # remove old doc because corpus will still have old content
             # NOTE(review): presumably remove() matches the old document by
             # the id just assigned; confirm against the Corpus implementation
             corpus.remove(d)
         corpus.extend(docs)
         self.save_corpus(corpus,update=True)
     # dont need the sync, do nothing
     elif corpus_check:
         return
     # corpus sync has not been done before
     else:
         for x in self.mongo.notes.find( # all notes of this user
                     {'_id_user':self.user_id},{'tokens_content':1,'str_title':1}):
                 d =  Document(x['tokens_content'],name=x['str_title'],top=30)
                 d._id = x['_id']
                 docs.append(d)
         corpus = Corpus(docs)
         self.save_corpus(corpus)
         # Remember on the user record that a corpus now exists.
         self.mongo.users.update({'_id':self.user_id},{'$set':{'corpus':True}})

Example #30

0

Show file

def create_doc_list(df):
    '''
    Build a list of Pattern.Vector Documents from a dataframe of reviews.

    The dataframe must provide an 'id' column and a 'review' column. Each row
    becomes one Document built from the review text with threshold=1. Because
    of how the data is formatted in the dataframe, every id carries one extra
    quote character at each end; both are sliced off before the id is used as
    the document name.

    Returns the documents in row order (an empty list for an empty dataframe).
    '''
    # A parenthesised single-argument print is valid as a Python 2 statement
    # and as a Python 3 function call, and prints the same text on both.
    print("Creating a list of {} documents".format(len(df)))
    return [
        Document(row['review'], threshold=1, name=row['id'][1:-1])
        for _, row in df.iterrows()
    ]

Example #31

0

Show file

    def classify(text):
        """Run each classifier held on `Classifications` over `text`.

        The category and rating classifiers receive a fresh `Document(text)`;
        the NLP rating and sentiment classifiers receive the output of
        `Classifications.selectWords(text)`.

        Returns a dict with the keys 'text', 'rate', 'category', 'rate_nlp'
        and 'positivity' (a bool).
        """
        category = Classifications._category.classify(Document(text),
                                                      discrete=True)
        rate = Classifications._rating.classify(Document(text),
                                                discrete=True)
        rate_nlp = Classifications._rating_nlp.classify(
            Classifications.selectWords(text), discrete=True)
        # discrete=False yields a mapping (it is read through .items());
        # presumably label -> score -- confirm against the classifier docs.
        sentiment_scores = Classifications._sentiment.classify(
            Classifications.selectWords(text), discrete=False)

        # Rank the (label, score) pairs from highest to lowest score.
        ranked = sorted(sentiment_scores.items(),
                        key=lambda pair: pair[1],
                        reverse=True)
        # NOTE(review): index 1 picks the SECOND-ranked label, not the top
        # one, and raises IndexError when fewer than two labels come back.
        # Kept as-is; confirm whether index 0 was intended.
        chosen_label = str(ranked[1][0])
        is_positive = chosen_label in ['True', '3.0', '4.0', '5.0']

        return {
            'text': text,
            'rate': rate,
            'category': category,
            'rate_nlp': rate_nlp,
            'positivity': is_positive
        }

Example #32

0

Show file

File: extractFeatures.py Project: man-dsc/Resesarch2018

def extract():
    """Extract clustered features from every app-description CSV.

    OUTPUT_PATH is wiped and recreated. For each non-hidden sub-directory of
    INPUT_PATH a directory of the same name is created under OUTPUT_PATH, and
    each CSV file inside it is rewritten under the same file name with one
    row per app: [name, description, features, alterTokens]. The first row of
    every input file is skipped (presumably a header -- confirm).
    """
    print('Extracting features from app descriptions...\n')

    # Start from an empty output tree.
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for category in os.listdir(INPUT_PATH):
        if category.startswith('.'):
            # Hidden entries are not processed.
            continue
        os.makedirs("{}/{}".format(OUTPUT_PATH, category))

        for csv_name in os.listdir('{}/{}'.format(INPUT_PATH, category)):
            source_path = '{}/{}/{}'.format(INPUT_PATH, category, csv_name)
            target_path = '{}/{}/{}'.format(OUTPUT_PATH, category, csv_name)

            with open(source_path, 'rb') as source:
                rows = csv.reader(source)
                # Discard the first row before the output file is created.
                next(rows)

                with open(target_path, 'wb') as target:
                    out = csv.writer(target)
                    for app in rows:
                        name, description = app[0], app[2]

                        # Prepare an app description string for NLTK and
                        # LDA processing.
                        prepared = prepare_description(description)

                        # Each featurelet is a 3-word feature; join the
                        # words and use the text as the document name too.
                        labels = [
                            '{} {} {}'.format(parts[0], parts[1], parts[2])
                            for parts in featurelet_extraction(prepared)
                        ]
                        documents = [Document(label, name=label)
                                     for label in labels]

                        # Perform hierarchical clustering.
                        clusters = Model(documents).cluster(
                            method=HIERARCHICAL,
                            k=3,
                            iterations=1000,
                            distance=COSINE)

                        # Organize clusters into features and alternative
                        # tokens.
                        features, alterTokens = group(clusters, [], [], [])

                        out.writerow(
                            [name, description, features, alterTokens])

Example #33

0

Show file

def get_top_freq_words_in_text(txt_string, top_count, filter_method = lambda w: w.lstrip("'").isalnum(),
                               exclude_len = 0):
    """ Return the top keywords of a text, singularised and de-duplicated.

        Args:
            txt_string (str): Input string.
            top_count (int): maximum number of words to return.

        Kwargs:
            filter_method (method): predicate handed to Document as its word
                                    filter. The default keeps words that are
                                    alphanumeric once leading apostrophes are
                                    stripped.

            exclude_len (int): drop keywords whose length is less than or
                                equal to this value.
                                Default 0, which disables the length check.

        Returns:
            (list): at most top_count words.

    """
    document = Document(txt_string, threshold=1, filter = filter_method)

    ## Request five extra keywords so that top_count words can still be
    ## returned after exclusion and duplicate removal below.
    ranked_pairs = document.keywords(top = top_count + 5)

    ## Apply the optional length cut-off; the word is element 1 of each pair.
    if exclude_len == 0:
        kept_pairs = ranked_pairs
    else:
        kept_pairs = [pair for pair in ranked_pairs if len(pair[1]) > exclude_len]

    ## encode for unicode handling
    encoded_words = [pair[1].encode() for pair in kept_pairs]

    ## reduce every word to the same (singular) form
    singular_words = [get_singular_form_of_word(word) for word in encoded_words]

    ## remove duplicates, then trim to the requested size
    unique_words = rm_duplicate_keywords(singular_words)
    return unique_words[:top_count]

Example #34

0

Show file

File: Pattern_Parsing.py Project: nakamichikun/google_search_module_alt

def get_top_freq_words_in_text(txt_string, top_count, filter_method=lambda w: w.lstrip("'").isalnum(), exclude_len=0):
    """ Get the most prominent words of a text.

        Args:
            txt_string (str): Input string.
            top_count (int): upper bound on the number of words returned.

        Kwargs:
            filter_method (method): word filter passed through to Document.
                                    By default a word is accepted when it is
                                    alphanumeric after leading apostrophes
                                    have been stripped.

            exclude_len (int): keywords of this length or shorter are
                                dropped. Default 0 switches the check off.

        Returns:
            (list): list of top words, singular form, without duplicates

    """
    ## Ask for a small surplus (5) of keywords: some may be removed by the
    ## length check or merged by de-duplication further down.
    candidates = Document(txt_string, threshold=1, filter=filter_method).keywords(top=top_count + 5)

    ## Each candidate is a pair whose second element is the word itself.
    ## Encode for unicode handling, skipping words that are too short.
    encoded = [
        candidate[1].encode()
        for candidate in candidates
        if exclude_len == 0 or not len(candidate[1]) <= exclude_len
    ]

    ## Reduce all words to the same form, then remove duplicates.
    deduplicated = rm_duplicate_keywords([get_singular_form_of_word(word) for word in encoded])

    return deduplicated[:top_count]

Example #35

0

Show file

File: Util.py Project: jayrod/fluentanki

    def text_to_database(self, full_text, sha224, title, fluent_anki_session, target_language):
        """Parse `full_text` and store its words and sentences via the session.

        A SourceText row is created for the text. Every unique parsed word
        that is not black-listed, contains no digits and is non-empty gets a
        Word_SourceText_Frequency entry (frequency, text position, tf-idf)
        linked to a new ExoticWord. Each suggested sentence for the word is
        fetched or created and linked to that word. The session is committed
        once at the end.
        """
        language = DBWrapper().get_or_create_language(fluent_anki_session, target_language)

        # Parse the text into word/sentence objects and build the document
        # used for tf-idf lookups.
        parsed = Parser().parse(full_text)
        document = Document(full_text)

        # Database object representing the source text itself.
        source_text = SourceText(title=title,
                                 hash_text=sha224,
                                 text_length=len(parsed.tagged_words))

        blocked_words = DBWrapper().get_black_list_word_set(fluent_anki_session)

        for word in tqdm(parsed.unique_words):
            # Skip black-listed words and words containing numbers.
            if word in blocked_words or Util().has_numbers(word):
                continue
            # Skip empty words.
            if not word:
                continue

            frequency_link = Word_SourceText_Frequency(frequency=parsed.word_frequency_dict[word])
            frequency_link.text_pos = parsed.unique_parse_words_dict[word].text_pos
            frequency_link.tfidf = document.tfidf(word)

            word_type = parsed.unique_parse_words_dict[word][0].type
            exotic_word = ExoticWord(text=word, word_type=word_type, lang=language.id)
            frequency_link.words = exotic_word
            source_text.words.append(frequency_link)

            # Attach the word to each of its suggested example sentences.
            for sentence in parsed.unique_parse_words_dict[word].suggested_sentences:
                stored_sentence = DBWrapper().get_or_create_sentence(
                    fluent_anki_session, sentence.string, target_language)
                stored_sentence.words.append(exotic_word)
                fluent_anki_session.add(stored_sentence)

        # Write the parsed words to the database.
        fluent_anki_session.add(source_text)
        fluent_anki_session.commit()

Example #36

0

Show file

File: some-subtitle-text-data-pattern-tfidf-top10-features.py Project: exo-mer/2019-some-subtitle-text-mining-with-pattern-clips

def get_model_from_documents(path='./*/*.txt'):
    '''Return (documents, model) built from the files under ./*/*.*

    Every file matching the glob pattern './*/*.*' is read in full and
    wrapped in a pattern.vector Document; the documents are then combined
    into a Model weighted with TFIDF.

    NOTE(review): `path` is accepted but not used -- the glob pattern is
    hard-coded, and it is broader than the parameter's default ('*.*' versus
    '*.txt'). It is left unchanged here because callers may rely on the
    current set of files; confirm the intended pattern before wiring `path`
    in.

    Returns a tuple: the list of Documents and the Model built from them.
    '''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    for filename in glob.glob('./*/*.*'):
        # The context manager closes each handle as soon as it has been
        # read, instead of leaving one open file per document.
        with codecs.open(filename, 'r') as handle:
            documents.append(Document(handle.read()))

    model = Model(documents=documents, weight=TFIDF)
    return documents, model

Example #37

0

Show file

File: keyword_wrapper.py Project: birkin/nlp_app

class KeywordWrapper( object ):
  '''
  Plain helper (not a django model) wrapping the pattern.vector keyword functions.

  The methods depend on state set by earlier ones: load_text() creates the
  documents, the make_*() methods fill the keyword lists from them, and the
  build_*_json_string() methods serialise the results.
  NOTE(review): confirm the exact call order against views.keywords().
  '''

  def __init__(self):
    # Request / timing state.
    self.time_start = None
    self.params = {}
    # Source text and its md5 fingerprint.
    self.original_text = None
    self.original_text_md5_hash = None
    # Documents built by load_text().
    self.document_raw = None
    self.document_raw_count = None
    self.document_thresh_stemmed = None
    self.document_thresh_unstemmed = None
    # Number of keywords requested; raised by set_top_num().
    self.top_num = 10
    # Keyword results.
    self.keywords_stemmed = None
    self.keywords_unstemmed = None
    self.keywords_unstemmed_additional = None
    self.keywords_stemmed_simple = []
    # Serialised output.
    self.explore_json_string = None
    self.simple_json_string = None

  def get_params( self, dj_request ):
    '''Copy the request's GET items (or POST items for any other method) into self.params.'''
    assert type(dj_request) == django.core.handlers.wsgi.WSGIRequest
    if dj_request.method == u'GET':
      submitted = dj_request.GET
    else:  # POST
      submitted = dj_request.POST
    for pair in submitted.items():
      self.params[ pair[0] ] = pair[1]

  def load_text( self, text ):
    '''Start the timer, build the three documents for `text` and store its md5 hash.'''
    self.time_start = datetime.datetime.now()
    raw_document = Document( text, threshold=0 )
    self.document_raw = raw_document
    self.document_raw_count = raw_document.count
    self.document_thresh_stemmed = Document( text, stemmer=PORTER, threshold=1 )
    self.document_thresh_unstemmed = Document( text, threshold=1 )
    self.original_text = text
    # source-u-string -> source-string -> hash-string -> hash-u-string
    encoded_text = self.original_text.encode( u'utf-8', u'replace' )
    hex_digest = hashlib.md5( encoded_text ).hexdigest()
    self.original_text_md5_hash = hex_digest.decode( u'utf-8', u'replace' )

  def set_top_num( self ):
    '''Add one to top_num per 1000-step of the raw document count, stopping once it reaches 50.'''
    assert type(self.document_raw) == pattern.vector.Document
    for _step in range( 1, self.document_raw.count, 1000 ):
      self.top_num = self.top_num + 1
      if self.top_num == 50:
        break

  def make_keywords_stemmed_simple( self ):
    '''Compute stemmed keywords and append just the words to keywords_stemmed_simple.

    The simple list is appended to, not reset, so repeated calls accumulate.
    '''
    assert type(self.document_thresh_stemmed) == pattern.vector.Document
    self.keywords_stemmed = self.document_thresh_stemmed.keywords( top=self.top_num )
    for scored_word in self.keywords_stemmed:
      # each entry is a (score, word) tuple; only the word is kept
      self.keywords_stemmed_simple.append( scored_word[1] )

  def make_default_keywords( self ):
    '''keywords stemmed & unstemmed'''
    stemmed_document = self.document_thresh_stemmed
    unstemmed_document = self.document_thresh_unstemmed
    assert type(stemmed_document) == pattern.vector.Document
    assert type(unstemmed_document) == pattern.vector.Document
    self.keywords_stemmed = stemmed_document.keywords( top=self.top_num )
    self.keywords_unstemmed = unstemmed_document.keywords( top=self.top_num )

  def make_additional_keywords( self ):
    '''unstemmed words not in stemmed list'''
    ## both keyword lists must be lists whose first entry (if any) is a tuple
    for keyword_list in ( self.keywords_stemmed, self.keywords_unstemmed ):
      assert type(keyword_list) == list
      if len( keyword_list ) > 0:
        assert type(keyword_list[0]) == tuple
    ## make simple stemmed keyword list from (score, word) tuple
    stemmed_words = [ scored_word[1] for scored_word in self.keywords_stemmed ]
    ## add any additional unstemmed keywords (whose stems aren't in stemmed_words)
    self.keywords_unstemmed_additional = []
    for scored_word in self.keywords_unstemmed:
      candidate = scored_word[1]
      # TODO: time using sets here instead
      if candidate not in stemmed_words and stem( candidate, stemmer=PORTER ) not in stemmed_words:
        self.keywords_unstemmed_additional.append( scored_word )

  def build_explore_json_string( self ):
    '''Serialise counts, keyword lists, hash and timing into explore_json_string.'''
    payload = {
      u'count_words_raw': len( self.original_text.split() ),
      u'count_words_analyzed': self.document_raw.count,
      u'count_words_repeating_stemmed': self.document_thresh_stemmed.count,
      u'count_words_repeating_unstemmed': self.document_thresh_unstemmed.count,
      u'count_keywords_stemmed': len( self.keywords_stemmed ),
      u'count_keywords_unstemmed': len( self.keywords_unstemmed ),
      u'count_keywords_unstemmed_additional': len( self.keywords_unstemmed_additional ),
      u'hash_md5': self.original_text_md5_hash,
      u'keywords_stemmed': self.keywords_stemmed,
      u'keywords_unstemmed': self.keywords_unstemmed,
      u'keywords_unstemmed_additional': self.keywords_unstemmed_additional,
      u'repeating_words_unstemmed': self.document_thresh_unstemmed.terms,
      u'time_start': unicode( self.time_start ),
      u'time_taken': unicode( datetime.datetime.now() - self.time_start ),
      u'docs': app_settings.DOCS_URL
      }
    self.explore_json_string = json.dumps( payload, sort_keys=True, indent=2 )

  def build_simple_json_string( self ):
    '''Serialise the stemmed keyword words, hash and timing into simple_json_string.'''
    payload = {
      u'count_keywords_stemmed': len( self.keywords_stemmed ),
      u'keywords_stemmed': self.keywords_stemmed_simple,
      u'hash_md5': self.original_text_md5_hash,
      u'time_start': unicode( self.time_start ),
      u'time_taken': unicode( datetime.datetime.now() - self.time_start ),
      u'docs': app_settings.DOCS_URL
      }
    self.simple_json_string = json.dumps( payload, sort_keys=True, indent=2 )

Example #38

0

Show file

File: document_test.py Project: tweettu/TweetGraph

# coding=utf-8

from pattern.vector import Document


# Sample news paragraph used to exercise pattern.vector.Document below.
s = '''
    The shuttle Discovery, already delayed three times by technical problems
    and bad weather, was grounded again Friday, this time by a potentially
    dangerous gaseous hydrogen leak in a vent line attached to the shipʼs
    external tank. The Discovery was initially scheduled to make its 39th
    and final flight last Monday, bearing fresh supplies and an intelligent
    robot for the International Space Station. But complications delayed the
    flight from Monday to Friday,  when the hydrogen leak led NASA to conclude
    that the shuttle would not be ready to launch before its flight window
    closed this Monday.
'''

# Build a Document from the sample text using the default options.
d = Document(s)
# Print up to ten keywords for the document.
print d.keywords(top=10)
# Store an extra attribute on the instance and read it back.
# NOTE(review): `_description` looks like an ad-hoc attribute rather than
# part of Document's own API -- confirm.
d._description = 'sample corpus'
print d._description
# Per-word statistics for the word 'flight'.
print d.term_frequency('flight')
print d.tfidf('flight')
# Dump the document's features, words and vector attributes.
print d.features
print d.words
print 'vector = ', d.vector

Example #39

0

Show file

File: 01-document.py Project: DataBranner/pattern

# e.g., "conspiracies" => "conspiracy", "conspired" => "conspire".

# Sample news paragraph that the Document below is built from.
s = """
The shuttle Discovery, already delayed three times by technical problems and bad weather, 
was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak 
in a vent line attached to the ship's external tank.
The Discovery was initially scheduled to make its 39th and final flight last Monday, 
bearing fresh supplies and an intelligent robot for the International Space Station. 
But complications delayed the flight from Monday to Friday, 
when the hydrogen leak led NASA to conclude that the shuttle would not be ready to launch 
before its flight window closed this Monday.
"""

# With threshold=1, only words that occur more than once are counted.
# With stopwords=False, words like "the", "and", "I", "is" are ignored.
document = Document(s, threshold=1, stopwords=False)
# Print the document's `words` attribute, followed by an empty line.
print(document.words)
print()

# The /corpus folder contains texts mined from Wikipedia.
# Below is the mining script (we already executed it for you):

#import os, codecs
#from pattern.web import Wikipedia
#
#w = Wikipedia()
# for q in (
#  "badger", "bear", "dog", "dolphin", "lion", "parakeet",
#  "rabbit", "shark", "sparrow", "tiger", "wolf"):
#    s = w.search(q, cached=True)
#    s = s.plaintext()

Example #40

0

Show file

File: map_keywords.py Project: muguira-james/news_heatmap

###
### to cmd line test this:
###   echo "{ \"title\" : \"james muguira\", \"link\" : \"http://rss.cnn.com/rss/cnn_topstories.rss\", \"source\" : \"hello world\", \"data\" : \"{ json }\" }"
###
### for example
###
###   insert into table cnn_top select transform (text) 
###     using 'python map_strm.py' as (title, link, source, data) 
###     from test;

###

import sys
import json
from   pattern.vector import Document
from   pattern.web import plaintext
import urllib2


# Streaming mapper: one JSON record per stdin line in, one tab-separated
# row (title, link, source, keywords-as-JSON) per record out.
for raw_line in sys.stdin:
    record = json.loads(raw_line.strip())
    # Download the page behind the record's link and reduce it to plain text.
    page_source = urllib2.urlopen(record['link']).read()
    page_document = Document(plaintext(page_source))
    # Serialise the document's keywords so they fit into a single column.
    keywords_json = json.dumps(page_document.keywords())
    print("%s\t%s\t%s\t%s" % (record['title'], record['link'], record['source'], keywords_json))

Example #41

0

Show file

File: 01-document.py Project: Dirklectisch/cityment

# "conspiracy" and "conspired" are both reduced to "conspir".

# Sample news paragraph that the Document below is built from.
s = """
The shuttle Discovery, already delayed three times by technical problems and bad weather, 
was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak 
in a vent line attached to the ship's external tank.
The Discovery was initially scheduled to make its 39th and final flight last Monday, 
bearing fresh supplies and an intelligent robot for the International Space Station. 
But complications delayed the flight from Monday to Friday, 
when the hydrogen leak led NASA to conclude that the shuttle would not be ready to launch 
before its flight window closed this Monday.
"""

# With threshold=1 (default), only words that occur more than once are counted.
# Some stop words like "the", "and", "I", "is" are always ignored.
document = Document(s, threshold=1)
# Print the document's `terms` attribute, then an empty line
# (Python 2 print statements).
print document.terms
print

# The corpus/ folder contains some texts retrieved from Wikipedia.
# Here is the code (we already executed it for you):

#from pattern.web import Wikipedia
#
#wp = Wikipedia()
#for q in (
#  "badger", "bear", "dog", "dolphin", "lion", "parakeet", 
#  "rabbit", "shark", "sparrow", "tiger", "wolf"):
#    s = wp.search(q, cached=True)
#    s = s.plaintext()
#    f = codecs.open(os.path.join("corpus", q+".txt"), "w", encoding="utf-8")