Example #1
def nnps_and_keywords(text):
    s = parsetree(text, relations=True, lemmata=True)

    nnp_kw = {}
    for e in s:
        d = Document(e)
        kw = d.keywords()

        nnp = set()
        for w in kw:
            if w[1].type == 'NNP':
                wdstr = []
                for wd in w[1].phrase.words:
                    if wd.type == 'NNP':
                        wdstr.append(wd.string)
                nnp.add("-".join(wdstr))


        kw = d.keywords(top=5)
        words = set()
        for w in kw:
            if w[1].type != 'NNP':
                if w[1].lemma:
                    words.add(w[1].lemma)
                else:
                    words.add(w[1].string)

        if len(nnp)>1 and len(words)>1:
            if tuple(nnp) in nnp_kw:
                nnp_kw[tuple(nnp)].update(words)
            else:
                nnp_kw[tuple(nnp)]=words

    return nnp_kw
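
A minimal usage sketch for nnps_and_keywords() above, assuming the original module's imports (parsetree from pattern.en, Document from pattern.vector) are in scope; the sample text is made up.

text = ("NASA grounded the shuttle Discovery again on Friday. "
        "The hydrogen leak delayed the final flight to the International Space Station.")
for names, keywords in nnps_and_keywords(text).items():
    print((names, sorted(keywords)))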
Example #2
 def get_keywords_article(article):
     tagged_content_words = ([
         i.Word for i in article.tagged_content if i.Tag.startswith('NN')
     ])
     d = Document(tagged_content_words)
     k = d.keywords(top=5)
     article.keywords = k
Example #3
 def run(self,minePackage):
     ac=0.0 #key hits (term in query and dictionary)
     ap=0.0 #positive hits (term in dictionary only)
     an=0.0 #negative hits (term in query only)
     alpha=1.00
     beta=0.75
     gamma=0.25
     dictionary= open(os.path.dirname(__file__) + "/dictionary.txt",'r').read()
     dictionary = Document(dictionary, stemmer = PORTER)
     clouds=minePackage['clouds']
     query=minePackage['searchKeyStemmer']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData=cloud.graph.node[n]['methodData']
             content = Document(methodData.getContent(),stemmer = PORTER)
             for doc in content.keywords(top=500,normalized=True):
                 if doc[1] in query and doc[1] in dictionary.words:
                     ac += doc[0]
                 elif doc[1] in dictionary.words:
                     ap += doc[0]
                 elif doc[1] in query:
                     an += doc[0]
             if ac+ap+an > 0:
                 cloud.graph.node[n]['weight_WA']=((ac*alpha)+(ap*beta)+(an*gamma))/(ac+ap+an)
             else:
                 cloud.graph.node[n]['weight_WA']=0
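
For reference, a small standalone sketch of the weight_WA formula used above, with made-up hit totals:

# Hypothetical totals: ac = keyword weight hitting query and dictionary,
# ap = dictionary only, an = query only.
ac, ap, an = 0.6, 0.3, 0.1
alpha, beta, gamma = 1.00, 0.75, 0.25
weight_WA = ((ac * alpha) + (ap * beta) + (an * gamma)) / (ac + ap + an)
print(weight_WA)  # 0.85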
Example #5
 def confusion_matrix(self, key=None, output_format=None, split=False):
     """Returns a confusion matrix for the model based on splitting the data set randomly into two pieces, training on one and testing on the other"""
     if split:
         list_of_dependent = self.dependent_in_use(key=key)
     else:
         list_of_dependent = [None]
     output = ''
     matrices = dict()
     for current_dep in list_of_dependent:
         testing_set = list()
         model = self._learner()
         for record in self.classified_entries(key=key):
             if split:
                 dep_result = str(record.dependent == current_dep)
             else:
                 dep_result = record.dependent
             if random.random() < 0.5:
                 model.train(Document(record.independent.lower(), stemmer=PORTER), dep_result)
             else:
                 testing_set.append((Document(record.independent.lower(), stemmer=PORTER), dep_result))
         matrix = model.confusion_matrix(documents=testing_set)
         matrices[current_dep] = matrix
         if output_format == 'html':
             if split:
                 output += '<h4>' + current_dep + "</h4>"
             vals = matrix.keys()
             output += '<table class="table table-bordered"><thead><tr><td></td><td></td><td style="text-align: center" colspan="' + str(len(vals)) + '">Actual</td></tr><tr><th></th><th></th>'
             first = True
             for val in vals:
                 output += '<th>' + val + '</th>'
             output += '</tr></thead><tbody>'
             for val_a in vals:
                 output += '<tr>'                
                 if first:
                     output += '<td style="text-align: right; vertical-align: middle;" rowspan="' + str(len(vals)) + '">Predicted</td>'
                     first = False
                 output += '<th>' + val_a + '</th>'
                 for val_b in vals:
                     output += '<td>' + str(matrix[val_b].get(val_a, 0)) + '</td>'
                 output += '</tr>'
             output += '</tbody></table>'
             #output += "\n\n`" + str(matrix) + "`"
             # output += '<ul>'
             # for document, actual in testing_set:
             #     predicted = model.classify(document)
             #     output += '<li>Predicted: ' + predicted + '; Actual: ' + actual + '</li>'
             # output += '</ul>'
     if output_format == 'html':
         return output
     if split:
         ret_val = matrices
     else:
         ret_val = matrices[None]
     if output_format == 'json':
         return json.dumps(ret_val, sort_keys=True, indent=4)
     if output_format == 'yaml':
         return yaml.safe_dump(ret_val, default_flow_style=False)
     if output_format is None:
         return ret_val
     return ret_val
Example #6
def setup():
    global pages
    global urlalias
    global revurlalias
    global knn
    pages = dict()
    urlalias = dict()
    revurlalias = dict()
    knn = KNN()
    db = MySQLdb.connect(host="192.168.200.26",
                         user="******",
                         passwd="xxxsecretxxx",
                         db="pla")
    cur = db.cursor()
    cur.execute("select source, alias from url_alias")
    for row in cur.fetchall():
        urlalias[row[1]] = row[0]
        revurlalias[row[0]] = row[1]
    cur.execute("select tid, name, description, vid from taxonomy_term_data;")
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
            url = revurlalias[url]
        if row[3] == 3:
            soup = bs4.BeautifulSoup(row[2])
            the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
            knn.train(Document(the_text, stemmer=PORTER), url)
            knn.train(Document(row[1].lower()), url)
    cur.execute(
        "select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);"
    )
    for row in cur.fetchall():
        url = 'taxonomy/term/' + str(row[0])
        if url in revurlalias:
            url = revurlalias[url]
        soup = bs4.BeautifulSoup(row[1])
        the_text = re.sub(r'[\n\r]+', r'  ', soup.get_text(' ')).lower()
        knn.train(Document(the_text, stemmer=PORTER), url)
        knn.train(Document(row[2].lower()), url)
    cur.execute("select nid, title from node where status=1;")
    for row in cur.fetchall():
        url = 'node/' + str(row[0])
        pages[url] = row[1]
        if url in revurlalias:
            pages[revurlalias[url]] = row[1]
    db.close()
    pgcur = conn.cursor()
    pgcur.execute(
        "select query, target from website_queries where target is not null group by query, target"
    )
    for row in pgcur.fetchall():
        words = re.split(r'[\n\r,;]+ *', row[1])
        for word in words:
            print("training on " + row[0].lower() + " for " + word)
            knn.train(Document(row[0].lower()), word)
    conn.commit()
    pgcur.close()
Example #7
 def load_text( self, text ):
   self.time_start = datetime.datetime.now()
   self.document_raw = Document( text, threshold=0 )
   self.document_raw_count = self.document_raw.count
   self.document_thresh_stemmed = Document( text, stemmer=PORTER, threshold=1 )
   self.document_thresh_unstemmed = Document( text, threshold=1 )
   self.original_text = text
   self.original_text_md5_hash = hashlib.md5(self.original_text.encode(u'utf-8', u'replace')).hexdigest().decode(u'utf-8', u'replace')  # takes source-u-string, makes source-string, gets hash-string, makes hash-u-string
Example #8
 def insertarDocumento(self, url, contenido):
     """Creates a record in mongodb and a Pattern Document file"""
     unDocumento = Document(contenido,
                            name=url,
                            stopwords=True,
                            stemmer=PORTER,
                            weight=TFIDF)
     result = self.mongodb.crearDocumento(unDocumento)
     if result:
         unDocumento.save("DocumentoPattern/" + str(result.inserted_id))
     return unDocumento
Example #9
def resolve_certainty(certainty_info):
    '''Resolve certainty with Naive Bayes'''
    if certainty_info == '':
        return 'No certainty info.'
    else:
        nb = NB()
        for observation, certainty in csv(
                'library/templatetags/c_training_data.csv'):
            v = Document(observation, type=int(certainty), stopwords=True)
            nb.train(v)
        return nb.classify(Document(certainty_info))
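
The same train/classify pattern as a self-contained sketch, with two made-up training rows in place of the CSV file:

from pattern.vector import Document, NB

nb = NB()
for observation, certainty in (("results may possibly indicate", 0),
                               ("results clearly demonstrate", 1)):
    nb.train(Document(observation, type=certainty, stopwords=True))
print(nb.classify(Document("the data clearly demonstrate a trend")))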
Example #10
def evaluate_query(query):
    probs = dict()
    for key, value in knn.classify(Document(query),
                                   discrete=False).iteritems():
        probs[key] = value
    if not len(probs):
        probs[knn.classify(Document(query))] = 1.0
    seen = set()
    probs = map(lambda x: fixurl(x, seen),
                sorted(probs, key=probs.get, reverse=True))
    probs = [prob for prob in probs if prob is not None]
    return probs
Example #11
 def crearDocumentoPattern(self, contenido, name=""):
     '''Creates documents removing stopwords, applying stemming and TFIDF frequency weighting'''
     return Document(contenido,
                     name=name,
                     stemmer=PORTER,
                     stopwords=True,
                     weight=TFIDF)
Example #12
def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dict of TextRank ranking of cosine similarity matrix
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences
    top_sents_idx, _ = list(zip(*ranking.most_common(n)))

    # reordering
    output = [sentences[i] for i in sorted(top_sents_idx)]

    return ''.join(output)
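
A hedged usage sketch for summarize() above, assuming tokenize() and utils.textrank from the original module are importable; the sample text is made up.

text = ("The shuttle Discovery was grounded again on Friday. "
        "A hydrogen leak in a vent line delayed the final flight. "
        "NASA concluded the shuttle would not be ready before its launch window closed.")
print(summarize(text, n=2))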
Example #13
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)

            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use hash(story['summary']) as a unique id to avoid duplicate
            # content.
            news.setdefault(d, {})[hash(s)] = s

    m = Model()
    for date, stories in news.items():
        s = stories.values()
        s = ' '.join(s).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=[
                 'news', 'day'], name=date))

    for document in m:
        print document.name
        print document.keywords(top=10)
Example #14
 def _train(self, indep, depend):
     """Trains the machine learner given an independent variable and a corresponding dependent variable."""
     if indep is None:
         return
     the_text = re.sub(r'[\n\r]+', r'  ', indep).lower()
     learners[self.group_id].train(
         Document(the_text.lower(), stemmer=PORTER), depend)
Example #15
def summarize(text_to_summarize):
    stokens = tokenize(text_to_summarize)
 
    # STEP 1
    # pattern.vector's Document is a nifty bag-o-words structure,
    # with a TF weighting scheme
    docs = [Document(string= s, name=e,stemmer=LEMMA)
            for e,s in enumerate(stokens) if len(s.split(" ")) > 7]
    
    linkgraph = []
    # STEP 2 and 3 happen interwovenly
    for doc in docs:
        for doc_copy in docs:
            if doc.name != doc_copy.name:
                # STEP 2 happens here
                wordset_a = [x[1] for x in doc.keywords()]
                wordset_b = [y[1] for y in doc_copy.keywords()]
                jacc_dist = distance.jaccard(wordset_a, wordset_b)
                if jacc_dist < 1:
                    linkgraph.append((str(doc.name), #index to sentence
                                      str(doc_copy.name),1-jacc_dist)) #dist. score
    # By the time we reach here, we'd have completed STEP 3
    
    # STEP 4
    #I referenced this SO post for help with pagerank'ing
    #http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D=nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)
    sort_pagerank = sorted(pagerank.items(),key=operator.itemgetter(1))
    sort_pagerank.reverse()
    top2 = sort_pagerank[:2]
    orderedtop2 = [int(x[0]) for x in top2]
    orderedtop2 = sorted(orderedtop2)
    return " ".join([ stokens[i] for i in orderedtop2 ])
Example #16
def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate
                # content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=[
                         'news', 'day'], name=date))

            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
            pass
Example #17
def doclist_from_feeds(feeds):
    titles = gettitles(feeds)
    documents = []
    for key in titles:
        doc = Document(" ".join(titles[key]), stemmer=LEMMA, threshold=0)
        documents.append(doc)
    return documents
Example #18
 def predict(self, indep, probabilities=False):
     """Returns a list of predicted dependent variables for a given independent variable."""
     indep = re.sub(r'[\n\r]+', r'  ', indep).lower()
     if not self._train_from_db():
         return list()
     probs = dict()
     for key, value in learners[self.group_id].classify(Document(indep.lower(), stemmer=PORTER), discrete=False).iteritems():
         probs[key] = value
     if not len(probs):
         single_result = learners[self.group_id].classify(Document(indep.lower(), stemmer=PORTER))
         if single_result is not None:
             probs[single_result] = 1.0
     if probabilities:
         return [(x, probs[x]) for x in sorted(probs.keys(), key=probs.get, reverse=True)]
     else:
         return sorted(probs.keys(), key=probs.get, reverse=True)
Example #19
def word_ranking(text, n='L2'):
    """
    extract most relevant sentences from text according to LSA algorithm
    steps:    
    1. tokenize text by sentences
    2. compute tfidf matrix
    3. applying SVD of tfidf matrix (reduce to n-dimensions) 
    4. ranking sentences according to cross-method (source: http://www.aclweb.org/anthology/C10-1098.pdf)
        
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    #==============================================================================
    #     # syntactic filter
    #     exclude_list = []
    #     for sent in sentences:
    #         for word, pos in tag(sent):
    #             if pos != "JJ" or pos != 'NN': # Retrieve all adjectives and nouns.
    #                 exclude_list.append(word.lower())
    #==============================================================================

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dimensions number equal to euclidean norm of singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions=int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # sentences selection according to cross-method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # topic(rows) x tokens(cols) matrix(tfidf)
    V = np.array(m.lsa.vt)

    # average sentence score for each concept/topic by the rows of the Vt matrix
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma matrix after performing SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # total length of each sentence vector
    length = np.sum(V * S, axis=0)

    # ranking words by length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  #.most_common(n)

    #words, score =  list(zip(*ranking))

    return ranking
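
A hedged usage sketch for word_ranking() above, assuming the function and its imports are in scope; it reduces the model to two LSA dimensions and prints the ten highest-scoring terms of a made-up text.

text = ("The shuttle Discovery was grounded again on Friday. "
        "A hydrogen leak in a vent line delayed the final flight. "
        "NASA concluded the shuttle would not be ready before its launch window closed.")
ranking = word_ranking(text, n=2)
for term, score in ranking.most_common(10):
    print((term, score))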
Example #20
def build_model(results=[]):
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
Example #21
 def get_labeled_feats(self, data):
     labeled_binary = []
     for (word, tag) in data:
         feat = FeatExtract(
             word,
             ArtOrDet=(self.error_tag == 'ArtOrDet')).binary_features()
         d = Document(feat, type=tag, stopwords=True)
         labeled_binary.append(d)
     return labeled_binary
Example #22
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([
                word for word in text.words if word not in cachedStopWords
            ]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #	write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show top only first 5 features we come across
            for feature, weight in m.lsa.concepts[i].items():
                if abs(weight) > 0.2:
                    print(feature),
                    w.write(feature + " ")
                    count += 1

                if count > 5:
                    break
            w.write(unicode('\n'))
        #print

        cat_docs = []
        for d in m.documents:
            cat = (0, 0, {})
            #print d.name.split('\\')[-1]
            for idx, weight in m.lsa.vectors[d.id].items():
                print "\tCat {0}: {1}".format(idx, weight)
                if abs(weight) > abs(cat[1]) or cat[1] == 0:
                    cat = (idx, weight, d)

            if cat[0] == i:
                cat_docs.append(cat)
                #print "\t{0}".format(d.name.split('\\')[-1])

        cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
        for cat, weight, d in cat_docs:
            f = d.name.split('\\')[-1]
            w.write(
                unicode("\t{0} - {1}\n").format(
                    filter(lambda x: x in string.printable, f), weight))
Example #23
def asDocumentClass(data, classification):
    '''
    a function that converts list of reviews to Documents to be used by Pattern
    '''
    data = [(r['review/text'], str(classification)) for r in data]
    data = [
        Document(review, type=classification, stopwords=True)
        for review, classification in data
    ]
    return data
Example #24
def asDocumentReview(data):
    '''
    a function that converts list of reviews to Documents to be used by Pattern
    '''
    data = [(r['review/text'], float(r['review/score'])) for r in data]
    data = [
        Document(review, type=rating, stopwords=True)
        for review, rating in data
    ]
    return data
Example #25
 def run(self, minePackage):
     ac = 0.0  #key hits (term in query)
     ap = 0.0  #positive hits (term in dictionary)
     an = 0.0  #negative hits (term in neither)
     alpha = 1.00
     beta = 0.75
     gamma = 0.25
     dictionary = open(os.path.dirname(__file__) + "/dictionary.txt",
                       'r').read()
     dictionary = Document(dictionary, stemmer=PORTER)
     clouds = minePackage['clouds']
     query = minePackage['searchKeyStemmer']
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData = cloud.graph.node[n]['methodData']
             # document=methodData.getData()
             # for t in document:
             #     tf=document[t]
             #     if t in query:
             #         print "entered"
             #         ac+=tf
             #     else:
             #         if t in dictionary:  # I think I forgot to stem the dictionary words
             #             ap+=tf
             #         else:
             #             an+=tf
             content = Document(methodData.getContent(), stemmer=PORTER)
             for doc in content.keywords(top=200, normalized=True):
                 if doc[1] in query:
                     ac += doc[0]
                 else:
                     if doc[1] in dictionary.words:
                         ap += doc[0]
                     else:
                         an += doc[0]
             if ac + ap + an > 0:
                 cloud.graph.node[n]['weight_WA'] = (
                     (ac * alpha) + (ap * beta) +
                     (an * gamma)) / (ac + ap + an)
             else:
                 cloud.graph.node[n]['weight_WA'] = 0
Example #26
 def calculate(self, minePackage):
     webDocuments = []
     query = Document((minePackage['searchKey']))
     clouds = minePackage['clouds']
     count = UnPack()
     totalLinks = count.total(clouds)
     urlContent = UrlToPlainText()
     step = 0
     for cloud in clouds:
         for n in cloud.graph.nodes():
             doc = cloud.graph.node[n]['methodData']
             webDocuments.append(Document(doc.getData()))
             step += 1
     m = Model(documents=webDocuments, weight=TFIDF)
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData = cloud.graph.node[n]['methodData']
             vector = Document(methodData.getData())
             cloud.graph.node[n]['weight_VSM'] = m.similarity(
                 vector,
                 query)  # sets the VSM value on the cloud
Example #27
def summarize(raw_text):
    if len(raw_text) == 0:
        return ""

    sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    tokens = sentence_tokenizer.tokenize(raw_text.strip())

    documents = []
    for position, sentence in enumerate(tokens):
        if len(sentence.split(" ")) > 5:
            document = Document(string=sentence, name=position, stemmer=LEMMA)
            if len(document.features) > 0:
                documents.append(document)

    edges = []
    for document in documents:
        for other_document in documents:
            if document.name == other_document.name:
                continue
            doc_words = document.features
            other_doc_words = other_document.features
            similarity = jaccard_similarity(doc_words, other_doc_words)
            if similarity > 0:
                edges.append((document.name, other_document.name, similarity))

    graph = networkx.DiGraph()
    graph.add_weighted_edges_from(edges)
    page_rank = networkx.pagerank(graph)

    sorted_ranks = sorted(page_rank.items(),
                          key=operator.itemgetter(1),
                          reverse=True)

    summary = []
    sentence_numbers = []

    num_sentences = 3
    for i in range(num_sentences):
        if i < len(sorted_ranks):
            node = sorted_ranks[i]
            sentence_numbers.append(node[0])

    sentence_numbers = sorted(sentence_numbers)

    for sentence_number in sentence_numbers:
        sentence = tokens[sentence_number]
        summary.append(sentence)

    if len(summary) == 0:
        summary.append(tokens[0])

    return " ".join(summary)
Example #28
def extractSentiment(characterSentences):
    """
    Trains a Naive Bayes classifier object with the reviews.csv file, analyzes
    the sentence, and returns the tone.
    """
    nb = NB()
    characterTones = defaultdict(list)
    for review, rating in csv("reviews.csv"):
        nb.train(Document(review, type=int(rating), stopwords=True))
    for key, value in characterSentences.items():
        for x in value:
            characterTones[key].append(nb.classify(str(x)))
    return characterTones
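
A hedged usage sketch for extractSentiment() above, assuming the reviews.csv training file and the original imports are available; the character sentences are made up.

sentences = {"Alice": ["She smiled warmly at everyone.",
                       "She slammed the door and left."]}
tones = extractSentiment(sentences)
print(tones["Alice"])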
Example #29
 def sync_corpus(self):
     """Creates  a new corpus on all notes if we already have synced before
     TODO:
         Store other data in the corpus besides basic text content, ie,
         extracted image, attribute note data, etc...
         catch corpus not found file error?
     """
     docs =[]
     corpus_check =  self.mongo.users.find_one({'_id':self.user_id},
             {'corpus':1}).get('corpus')
     # make sure we already created corpus
     if corpus_check and self.need_sync:
         update_guids = self.resync_db()
         corpus = self.load_corpus()
         # only those that need to be updated from the update_guids
         for x in self.mongo.notes.find(
                 {'_id':{'$in':update_guids}},{'tokens_content':1,'str_title':1}):
             # create the updated doc
             d =  Document(x['tokens_content'],name=x['str_title'],top=50)
             # set the id to what we want
             d._id = x['_id']
             docs.append(d)
             # remove old doc because corpus will still have old content
             corpus.remove(d)
         corpus.extend(docs)
         self.save_corpus(corpus,update=True)
     # dont need the sync, do nothing
     elif corpus_check:
         return
     # corpus sync has not been done before
     else: 
         for x in self.mongo.notes.find( # all notes of this user
                     {'_id_user':self.user_id},{'tokens_content':1,'str_title':1}):
                 d =  Document(x['tokens_content'],name=x['str_title'],top=30)
                 d._id = x['_id']
                 docs.append(d)
         corpus = Corpus(docs)
         self.save_corpus(corpus)
         self.mongo.users.update({'_id':self.user_id},{'$set':{'corpus':True}})
Example #30
def create_doc_list(df):
    '''
    Given a dataframe containing an 'id' column and a 'review' column, create a
    list of documents in Pattern.Vector Document format. Because of how the data
    is formatted in the dataframe, the id contains an extra quote at the beginning
    and end of the id which need to be stripped away.
    '''
    print "Creating a list of {} documents".format(len(df))
    doc_list = []
    for index, row in df.iterrows():
        d = Document(row['review'], threshold=1, name=row['id'][1:-1])
        doc_list.append(d)
    return doc_list
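
A hedged sketch exercising create_doc_list() above with a hypothetical two-row DataFrame; it assumes pandas is available and that the ids carry the extra surrounding quotes described in the docstring.

import pandas as pd

df = pd.DataFrame({'id': ['"1001"', '"1002"'],
                   'review': ['great plot and strong acting',
                              'dull plot and weak acting']})
for doc in create_doc_list(df):
    print((doc.name, doc.words))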
Example #31
    def classify(text):
        predicted_category = Classifications._category.classify(Document(text),
                                                                discrete=True)
        predicted_rate = Classifications._rating.classify(Document(text),
                                                          discrete=True)
        predicted_rate_nlp = Classifications._rating_nlp.classify(
            Classifications.selectWords(text), discrete=True)
        predicted_sentiment_dict = Classifications._sentiment.classify(
            Classifications.selectWords(text), discrete=False)
        predicted_sentiment = True if str(
            sorted(predicted_sentiment_dict.items(),
                   key=operator.itemgetter(1),
                   reverse=True)[1][0]) in ['True', '3.0', '4.0', '5.0'
                                            ] else False

        return {
            'text': text,
            'rate': predicted_rate,
            'category': predicted_category,
            'rate_nlp': predicted_rate_nlp,
            'positivity': predicted_sentiment
        }
Example #32
def extract():
    print 'Extracting features from app descriptions...\n'
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for dir in os.listdir(INPUT_PATH):
        if not dir.startswith('.'):
            os.makedirs("{}/{}".format(OUTPUT_PATH, dir))
            for file in os.listdir('{}/'.format(INPUT_PATH) + dir):
                with open('{}/{}/{}'.format(INPUT_PATH, dir, file), 'rb') as f:
                    reader = csv.reader(f)
                    next(reader)
                    with open('{}/{}/{}'.format(OUTPUT_PATH, dir, file),
                              'wb') as r:
                        writer = csv.writer(r)
                        for app in reader:
                            name = app[0]
                            description = app[2]

                            # Prepare an app description string for NLTK and LDA processing
                            preparedDescription = prepare_description(
                                description)

                            # Extract 3 word featurlets from the description
                            featurelets = featurelet_extraction(
                                preparedDescription)

                            list = []
                            for feature in featurelets:
                                featurelet = '{} {} {}'.format(
                                    feature[0], feature[1], feature[2])
                                list.append(
                                    Document(featurelet, name=featurelet))

                            # Perform hierarchical clustering
                            m = Model(list)
                            cluster = m.cluster(method=HIERARCHICAL,
                                                k=3,
                                                iterations=1000,
                                                distance=COSINE)

                            # Organize clusters into features and alternative tokens
                            (features,
                             alterTokens) = group(cluster, [], [], [])

                            # Write results to file
                            writer.writerow(
                                [name, description, features, alterTokens])
                        r.close()
                    f.close()
Example #33
def get_top_freq_words_in_text(txt_string, top_count, filter_method = lambda w: w.lstrip("'").isalnum(),
                               exclude_len = 0):
    """ Method to get the top frequency of words in text.
        Args:
            txt_string (str): Input string.
            top_count (int): number of top words to be returned.

        Kwargs:
            filter_method (method): filter for special characters to ignore; in some cases numbers
                                    may also need to be ignored. Pass in a lambda function.
                                    Default accepts only alphanumeric tokens.

            exclude_len (int): exclude keywords whose length is less than or equal to this value.
                                Default 0, which has no effect.

        Returns:
            (list): list of top words

    """
    docu = Document(txt_string, threshold=1, filter = filter_method)

    ## Provide extra buffer if there is word exclusion
    ## Allow an additional buffer of top keywords so the result can still meet the requested top count after later elimination.
    freq_keyword_tuples = docu.keywords(top = top_count + 5 )
    
    ## encode for unicode handling
    if exclude_len  == 0:
        freq_keyword_list = [n[1].encode() for n in freq_keyword_tuples]
    else:
        freq_keyword_list = [n[1].encode() for n in freq_keyword_tuples if not len(n[1])<=exclude_len]

    ## reduce all words to the same form
    freq_keyword_list = [get_singular_form_of_word(n) for n in freq_keyword_list]

    ## remove duplicates
    freq_keyword_list = rm_duplicate_keywords(freq_keyword_list)

    return freq_keyword_list[:top_count]
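
A hedged usage sketch for get_top_freq_words_in_text() above, assuming the module helpers get_singular_form_of_word() and rm_duplicate_keywords() are importable; the sample text is made up.

text = ("The hydrogen leak delayed the flight. "
        "The flight window closed after the leak was found.")
print(get_top_freq_words_in_text(text, 3))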
Example #35
    def text_to_database(self, full_text, sha224, title, fluent_anki_session, target_language):

        language = DBWrapper().get_or_create_language(fluent_anki_session, target_language)
        #parse text into objects
        pobj = Parser().parse(full_text)
        #import pdb; pdb.set_trace()

        doc = Document(full_text)

        #create the source text database object
        source_text_dbobj = SourceText(title = title, 
                                       hash_text=sha224, 
                                       text_length=len(pobj.tagged_words))

        black_list = DBWrapper().get_black_list_word_set(fluent_anki_session)

        for word in tqdm(pobj.unique_words):
            #see if it's on the black list
            if (word not in black_list) and (not Util().has_numbers(word)):
                #length of the word should not be null
                if word:
                    wstf = Word_SourceText_Frequency(frequency = pobj.word_frequency_dict[word])
                    wstf.text_pos = pobj.unique_parse_words_dict[word].text_pos
                    wstf.tfidf = doc.tfidf(word)
                    wtype = pobj.unique_parse_words_dict[word][0].type
                    w = ExoticWord(text=word, word_type=wtype, lang=language.id)
                    wstf.words = w
                    source_text_dbobj.words.append(wstf)
                    for sentence in pobj.unique_parse_words_dict[word].suggested_sentences:
                        sent = DBWrapper().get_or_create_sentence(fluent_anki_session, sentence.string, target_language)
                        sent.words.append(w)
                        fluent_anki_session.add(sent)

        #write parsed words to database
        fluent_anki_session.add(source_text_dbobj)
        fluent_anki_session.commit()
Example #36
def get_model_from_documents(path='./*/*.txt'):
    '''return model from given txt files'''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    files = glob.glob('./*/*.*')
    for file in files:
        f = codecs.open(file, 'r')
        data = f.read()
        document = Document(data)
        documents.append(document)

    model = Model(documents=documents, weight=TFIDF)
    return documents, model
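
As a hedged follow-up, once the TF-IDF model above exists, the cosine similarity between any two of its documents can be read off with Model.similarity():

documents, model = get_model_from_documents()
if len(documents) >= 2:
    print(model.similarity(documents[0], documents[1]))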
Example #37
class KeywordWrapper( object ):
  '''
  Non-django model; wrapper around pattern.vector keyword functions.
  See views.keywords() for usage.
  '''

  def __init__(self):
    self.time_start = None
    self.params = {}
    self.original_text = None
    self.original_text_md5_hash = None
    self.document_raw = None
    self.document_raw_count = None
    self.document_thresh_stemmed = None
    self.document_thresh_unstemmed = None
    self.top_num = 10
    self.keywords_stemmed = None
    self.keywords_unstemmed = None
    self.keywords_unstemmed_additional = None
    self.keywords_stemmed_simple = []
    self.explore_json_string = None
    self.simple_json_string = None

  def get_params( self, dj_request ):
    assert type(dj_request) == django.core.handlers.wsgi.WSGIRequest
    if dj_request.method == u'GET':
      for item in dj_request.GET.items():
        key = item[0]; value = item[1]
        self.params[key] = value
    else:  # POST
      for item in dj_request.POST.items():
        key = item[0]; value = item[1]
        self.params[key] = value

  def load_text( self, text ):
    self.time_start = datetime.datetime.now()
    self.document_raw = Document( text, threshold=0 )
    self.document_raw_count = self.document_raw.count
    self.document_thresh_stemmed = Document( text, stemmer=PORTER, threshold=1 )
    self.document_thresh_unstemmed = Document( text, threshold=1 )
    self.original_text = text
    self.original_text_md5_hash = hashlib.md5(self.original_text.encode(u'utf-8', u'replace')).hexdigest().decode(u'utf-8', u'replace')  # takes source-u-string, makes source-string, gets hash-string, makes hash-u-string

  def set_top_num( self ):
    assert type(self.document_raw) == pattern.vector.Document
    for i in range( 1, self.document_raw.count, 1000 ):
      self.top_num += 1
      if self.top_num == 50:
        break

  def make_keywords_stemmed_simple( self ):
    assert type(self.document_thresh_stemmed) == pattern.vector.Document
    self.keywords_stemmed = self.document_thresh_stemmed.keywords( top=self.top_num )
    for kw_tuple in self.keywords_stemmed:
      score = kw_tuple[0]; word = kw_tuple[1]
      self.keywords_stemmed_simple.append( word )

  def make_default_keywords( self ):
    '''keywords stemmed & unstemmed'''
    assert type(self.document_thresh_stemmed) == pattern.vector.Document
    assert type(self.document_thresh_unstemmed) == pattern.vector.Document
    self.keywords_stemmed = self.document_thresh_stemmed.keywords( top=self.top_num )
    self.keywords_unstemmed = self.document_thresh_unstemmed.keywords( top=self.top_num )
    
  def make_additional_keywords( self ):
    '''unstemmed words not in stemmed list'''
    assert type(self.keywords_stemmed) == list
    if len( self.keywords_stemmed ) > 0:
      assert type(self.keywords_stemmed[0]) == tuple
    assert type(self.keywords_unstemmed) == list
    if len( self.keywords_unstemmed ) > 0:
      assert type(self.keywords_unstemmed[0]) == tuple
    ## make simple stemmed keyword list from (score, word) tuple
    temp_simple_stemmed = []
    for kw_tuple in self.keywords_stemmed:
      score = kw_tuple[0]; word = kw_tuple[1]
      temp_simple_stemmed.append( word )
    ## add any additional unstemmed keywords (whose stems aren't in temp_simple_stemmed )
    self.keywords_unstemmed_additional = []
    for kw_tuple in self.keywords_unstemmed:
      score = kw_tuple[0]; word = kw_tuple[1]
      if word not in temp_simple_stemmed:  # TODO: time using sets here instead
        if stem( word, stemmer=PORTER ) not in temp_simple_stemmed:
          self.keywords_unstemmed_additional.append( kw_tuple )

  def build_explore_json_string( self ):
    import hashlib
    d = {
      u'count_words_raw': len( self.original_text.split() ),
      u'count_words_analyzed': self.document_raw.count,
      u'count_words_repeating_stemmed': self.document_thresh_stemmed.count,
      u'count_words_repeating_unstemmed': self.document_thresh_unstemmed.count,
      u'count_keywords_stemmed': len( self.keywords_stemmed ),
      u'count_keywords_unstemmed': len( self.keywords_unstemmed ),
      u'count_keywords_unstemmed_additional': len( self.keywords_unstemmed_additional ),
      u'hash_md5': self.original_text_md5_hash,
      u'keywords_stemmed': self.keywords_stemmed,
      u'keywords_unstemmed': self.keywords_unstemmed,
      u'keywords_unstemmed_additional': self.keywords_unstemmed_additional,
      u'repeating_words_unstemmed': self.document_thresh_unstemmed.terms,
      u'time_start': unicode( self.time_start ),
      u'time_taken': unicode( datetime.datetime.now() - self.time_start ),
      u'docs': app_settings.DOCS_URL
      }
    self.explore_json_string = json.dumps( d, sort_keys=True, indent=2 )

  def build_simple_json_string( self ):
    d = {
      u'count_keywords_stemmed': len( self.keywords_stemmed ),
      u'keywords_stemmed': self.keywords_stemmed_simple,
      u'hash_md5': self.original_text_md5_hash,
      u'time_start': unicode( self.time_start ),
      u'time_taken': unicode( datetime.datetime.now() - self.time_start ),
      u'docs': app_settings.DOCS_URL
      }
    self.simple_json_string = json.dumps( d, sort_keys=True, indent=2 )
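
A hedged usage sketch for KeywordWrapper above, skipping the Django request handling and running the keyword pipeline directly on a made-up string:

wrapper = KeywordWrapper()
wrapper.load_text(u"The shuttle Discovery was grounded again by a hydrogen leak. "
                  u"The hydrogen leak delayed the shuttle.")
wrapper.set_top_num()
wrapper.make_default_keywords()
wrapper.make_additional_keywords()
print(wrapper.keywords_stemmed)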
Example #38
# coding=utf-8

from pattern.vector import Document


s = '''
    The shuttle Discovery, already delayed three times by technical problems
    and bad weather, was grounded again Friday, this time by a potentially
    dangerous gaseous hydrogen leak in a vent line attached to the shipʼs
    external tank. The Discovery was initially scheduled to make its 39th
    and final flight last Monday, bearing fresh supplies and an intelligent
    robot for the International Space Station. But complications delayed the
    flight from Monday to Friday,  when the hydrogen leak led NASA to conclude
    that the shuttle would not be ready to launch before its flight window
    closed this Monday.
'''

d = Document(s)
print d.keywords(top=10)
d._description = 'sample corpus'
print d._description
print d.term_frequency('flight')
print d.tfidf('flight')
print d.features
print d.words
print 'vector = ', d.vector
Example #39
# e.g., "conspiracies" => "conspiracy", "conspired" => "conspire".

s = """
The shuttle Discovery, already delayed three times by technical problems and bad weather, 
was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak 
in a vent line attached to the ship's external tank.
The Discovery was initially scheduled to make its 39th and final flight last Monday, 
bearing fresh supplies and an intelligent robot for the International Space Station. 
But complications delayed the flight from Monday to Friday, 
when the hydrogen leak led NASA to conclude that the shuttle would not be ready to launch 
before its flight window closed this Monday.
"""

# With threshold=1, only words that occur more than once are counted.
# With stopwords=False, words like "the", "and", "I", "is" are ignored.
document = Document(s, threshold=1, stopwords=False)
print(document.words)
print()

# The /corpus folder contains texts mined from Wikipedia.
# Below is the mining script (we already executed it for you):

#import os, codecs
#from pattern.web import Wikipedia
#
#w = Wikipedia()
# for q in (
#  "badger", "bear", "dog", "dolphin", "lion", "parakeet",
#  "rabbit", "shark", "sparrow", "tiger", "wolf"):
#    s = w.search(q, cached=True)
#    s = s.plaintext()
Example #40
###
### to cmd line test this:
###   echo "{ \"title\" : \"james muguira\", \"link\" : \"http://rss.cnn.com/rss/cnn_topstories.rss\", \"source\" : \"hello world\", \"data\" : \"{ json }\" }"
###
### for example
###
###   insert into table cnn_top select transform (text) 
###     using 'python map_strm.py' as (title, link, source, data) 
###     from test;

###

import sys
import json
from   pattern.vector import Document
from   pattern.web import plaintext
import urllib2


for line in sys.stdin:
	line = line.strip()
	ljs = json.loads(line)
	fjs = urllib2.urlopen(ljs['link']).read()
	st = plaintext(fjs)
	d = Document(st)
	w = json.dumps(d.keywords())
	print "%s\t%s\t%s\t%s" % (ljs['title'], ljs['link'], ljs['source'], w)
	


Example #41
# "conspiracy" and "conspired" are both reduced to "conspir".

s = """
The shuttle Discovery, already delayed three times by technical problems and bad weather, 
was grounded again Friday, this time by a potentially dangerous gaseous hydrogen leak 
in a vent line attached to the ship's external tank.
The Discovery was initially scheduled to make its 39th and final flight last Monday, 
bearing fresh supplies and an intelligent robot for the International Space Station. 
But complications delayed the flight from Monday to Friday, 
when the hydrogen leak led NASA to conclude that the shuttle would not be ready to launch 
before its flight window closed this Monday.
"""

# With threshold=1 (default), only words that occur more than once are counted.
# Some stop words like "the", "and", "I", "is" are always ignored.
document = Document(s, threshold=1)
print document.terms
print

# The corpus/ folder contains some texts retrieved from Wikipedia.
# Here is the code (we already executed it for you):

#from pattern.web import Wikipedia
#
#wp = Wikipedia()
#for q in (
#  "badger", "bear", "dog", "dolphin", "lion", "parakeet", 
#  "rabbit", "shark", "sparrow", "tiger", "wolf"):
#    s = wp.search(q, cached=True)
#    s = s.plaintext()
#    f = codecs.open(os.path.join("corpus", q+".txt"), "w", encoding="utf-8")